def __init__(self, _model_path = paths.SEG_MODEL_PATH, _model_file = None, _scale_model_file = None,
                 verbose=False, _name="segmenter", dependencies = False):
        """Wire up the segmenter's three components.

        Construction order is kept: classifier, then feature writer, then
        syntax parser (the constructors may load models / start processes).
        """
        # Binary libsvm classifier scoring segment boundaries; its raw
        # textual output is converted to a float via output_filter.
        self.svm_classifier = SVMClassifier(
            class_type = 'bin',
            software = 'libsvm',
            model_path = _model_path,
            bin_model_file = _model_file,
            bin_scale_model_file = _scale_model_file,
            output_filter = float,
            name = _name,
            verbose = verbose)

        # Turns lexicalized trees into per-token feature vectors.
        self.feature_writer = SegFeatureWriter(verbose = verbose)

        # Supplies the syntax trees (optionally with dependencies) that the
        # feature writer consumes.
        self.syntax_parser = SyntaxParser(verbose = verbose, dependencies = dependencies)
# Пример #2 (Example #2)
# 0
 def get_classifier(self, classifier_name):
     """Return a fresh classifier instance for *classifier_name*.

     Supported names: "naive", "svm", "random".  Raises ValueError (a
     subclass of Exception, so existing ``except Exception`` handlers
     still match) for an unknown name.
     """
     # Map names to classes, not instances: the previous version eagerly
     # constructed every classifier just to return (at most) one of them.
     modelmap = {
         "naive": NaiveBayesClassifier,
         "svm": SVMClassifier,
         "random": RandomForestClf,
         # "grid": GridSearchClassifier,
     }
     try:
         return modelmap[classifier_name]()
     except KeyError:
         raise ValueError("Unrecognized model: {}".format(classifier_name))
 def __init__(self, _model_path = paths.MODEL_PATH, _bin_model_file = None, _bin_scale_model_file = None,
              _mc_model_file = None, _mc_scale_model_file = None, 
              _name = "FengHirst", verbose = False, use_contextual_features = False):
     """Set up the feature writer and SVM classifiers for one of two parsers.

     _name == 'hilda':     one libsvm multiclass + one liblinear binary
                           classifier; the model-file arguments are single
                           file names.
     _name == 'FengHirst': two binary (svm_perf) and two multiclass
                           (svm_multiclass) classifiers; _bin_model_file and
                           _mc_model_file are 2-element sequences (index 0 /
                           index 1 select the two model variants).
     Any other _name prints a message and raises a bare Exception.
     """
     self.name = _name
     self.verbose = verbose
     self.use_contextual_features = use_contextual_features
     
     if self.name == 'hilda':
         self.feature_writer = features.tree_feature_writer_hilda.TreeFeatureWriter(verbose)
         # Multiclass relation labeler; raw libsvm output decoded by treat_mc_output.
         self.svm_mc_classifier = SVMClassifier(class_type = 'mc', software = 'libsvm', 
                                      model_path = _model_path, mc_model_file = _mc_model_file, mc_scale_model_file = _mc_scale_model_file,
                                      output_filter = self.treat_mc_output,
                                      name= _name + "_mc", verbose = verbose)
         # Binary attachment scorer; liblinear output decoded by treat_liblinear_output.
         self.svm_bin_classifier = SVMClassifier(class_type = 'bin', software = 'liblinear', 
                                      model_path = _model_path, bin_model_file = _bin_model_file, bin_scale_model_file = _bin_scale_model_file,
                                      output_filter = self.treat_liblinear_output,
                                      name= _name + "_bin", verbose = verbose)
     elif self.name == 'FengHirst':
         self.feature_writer = features.tree_feature_writer_Feng_Hirst.TreeFeatureWriter(verbose,
                                                                                         use_contextual_features = use_contextual_features,
                                                                                         subdir = '')
         
         # Two binary/multiclass pairs; presumably pair 1 handles
         # within-sentence stumps and pair 2 the rest -- TODO confirm
         # against the class's classify_pair in the full source.
         self.svm_bin_classifier1 = SVMClassifier(class_type = 'bin', software = 'svm_perf_classify',  
                                      model_path = _model_path, bin_model_file = _bin_model_file[0], bin_scale_model_file = _bin_scale_model_file,
                                      output_filter = float,
                                      name= _name + "_bin", verbose = verbose)
         self.svm_bin_classifier2 = SVMClassifier(class_type = 'bin', software = 'svm_perf_classify',  
                                      model_path = _model_path, bin_model_file = _bin_model_file[1], bin_scale_model_file = _bin_scale_model_file,
                                      output_filter = float,
                                      name= _name + "_bin", verbose = verbose)
         self.svm_mc_classifier1 = SVMClassifier(class_type = 'mc', software = 'svm_multiclass_classify', 
                                      model_path = _model_path, mc_model_file = _mc_model_file[0], mc_scale_model_file = _mc_scale_model_file,
                                      output_filter = self.treat_mc_output,
                                      name= _name + "_mc", verbose = verbose)
         self.svm_mc_classifier2 = SVMClassifier(class_type = 'mc', software = 'svm_multiclass_classify', 
                                      model_path = _model_path, mc_model_file = _mc_model_file[1], mc_scale_model_file = _mc_scale_model_file,
                                      output_filter = self.treat_mc_output,
                                      name= _name + "_mc", verbose = verbose)
         
     else:
         # Python 2 print statement; the raise carries no message.
         print 'Unrecognized tree_builder name: %s' % self.name
         raise Exception
class TreeBuilder:
    """Greedy bottom-up discourse-tree builder (Python 2 code).

    Maintains a list of subtree "stumps" plus parallel lists of binary
    attachment scores and multiclass relation scores for each adjacent pair;
    connect_stumps merges one pair and repairs the neighbouring scores.
    """

    
    def __init__(self, _model_path = paths.MODEL_PATH, _bin_model_file = None, _bin_scale_model_file = None,
                 _mc_model_file = None, _mc_scale_model_file = None, 
                 _name = "FengHirst", verbose = False, use_contextual_features = False):
        """Set up the feature writer and SVM classifiers.

        _name == 'hilda':     one libsvm multiclass + one liblinear binary
                              classifier (single model files).
        _name == 'FengHirst': two binary (svm_perf) and two multiclass
                              (svm_multiclass) classifiers; the model-file
                              arguments are 2-element sequences -- index 0 is
                              used for within-sentence pairs, index 1 for the
                              rest (see classify_pair).
        Any other _name prints a message and raises a bare Exception.
        """
        self.name = _name
        self.verbose = verbose
        self.use_contextual_features = use_contextual_features
        
        if self.name == 'hilda':
            self.feature_writer = features.tree_feature_writer_hilda.TreeFeatureWriter(verbose)
            # Multiclass relation labeler; raw output decoded by treat_mc_output.
            self.svm_mc_classifier = SVMClassifier(class_type = 'mc', software = 'libsvm', 
                                         model_path = _model_path, mc_model_file = _mc_model_file, mc_scale_model_file = _mc_scale_model_file,
                                         output_filter = self.treat_mc_output,
                                         name= _name + "_mc", verbose = verbose)
            # Binary attachment scorer; decoded by treat_liblinear_output.
            self.svm_bin_classifier = SVMClassifier(class_type = 'bin', software = 'liblinear', 
                                         model_path = _model_path, bin_model_file = _bin_model_file, bin_scale_model_file = _bin_scale_model_file,
                                         output_filter = self.treat_liblinear_output,
                                         name= _name + "_bin", verbose = verbose)
        elif self.name == 'FengHirst':
            self.feature_writer = features.tree_feature_writer_Feng_Hirst.TreeFeatureWriter(verbose,
                                                                                            use_contextual_features = use_contextual_features,
                                                                                            subdir = '')
            
            # Pair 1 = within-sentence model, pair 2 = cross-sentence model;
            # classify_pair picks between them based on the pair's scope.
            self.svm_bin_classifier1 = SVMClassifier(class_type = 'bin', software = 'svm_perf_classify',  
                                         model_path = _model_path, bin_model_file = _bin_model_file[0], bin_scale_model_file = _bin_scale_model_file,
                                         output_filter = float,
                                         name= _name + "_bin", verbose = verbose)
            self.svm_bin_classifier2 = SVMClassifier(class_type = 'bin', software = 'svm_perf_classify',  
                                         model_path = _model_path, bin_model_file = _bin_model_file[1], bin_scale_model_file = _bin_scale_model_file,
                                         output_filter = float,
                                         name= _name + "_bin", verbose = verbose)
            self.svm_mc_classifier1 = SVMClassifier(class_type = 'mc', software = 'svm_multiclass_classify', 
                                         model_path = _model_path, mc_model_file = _mc_model_file[0], mc_scale_model_file = _mc_scale_model_file,
                                         output_filter = self.treat_mc_output,
                                         name= _name + "_mc", verbose = verbose)
            self.svm_mc_classifier2 = SVMClassifier(class_type = 'mc', software = 'svm_multiclass_classify', 
                                         model_path = _model_path, mc_model_file = _mc_model_file[1], mc_scale_model_file = _mc_scale_model_file,
                                         output_filter = self.treat_mc_output,
                                         name= _name + "_mc", verbose = verbose)
            
        else:
            # Python 2 print statement; the raise carries no message.
            print 'Unrecognized tree_builder name: %s' % self.name
            raise Exception
        
        
    
    def get_list_hash(self, L):
        """Recursively build a '|'-separated hash string over the elements of L.

        ParseTree elements contribute their own hash, anything else the
        length of its first element; note the result ends with a trailing '|'.
        """
        if not len(L):
            return ''
        if isinstance(L[0], ParseTree):
            ret_str = L[0].get_hash()
        else:
            ret_str = str(len(L[0]))
        return ret_str + '|' + self.get_list_hash(L[1:])
            
    
    def center_norm(self, V):
        """Rescale V so its midrange maps to 0 and its max to 1 (Py2: returns a list).

        NOTE(review): raises ZeroDivisionError when all values are equal
        (max == midrange) -- confirm inputs always vary.
        """
        M = max(V)
        d = float(M+min(V))/2
        return map(lambda x: float(x-d)/(M-d), V)
    
    
    def treat_mc_output(self, output):
        """Decode one svm_multiclass output line 'label_id score score ...'.

        Returns (label_id, relation_name, [scores]); (-1, 'Error', fields)
        when the line has fewer than two fields, and the name 'Unknown' when
        the id is not in the feature writer's inverse relation map.
        """
        fields = output.split()
        if len(fields) < 2:
            return (-1, 'Error', fields)
        
        
        if int(fields[0]) in self.feature_writer.inv_relations:
            label = self.feature_writer.inv_relations[int(fields[0])]
        else: 
            label = 'Unknown'
        
        #print "####\n", int(fields[0]), label, map(float, fields[1:]), "\n###\n"
        return (int(fields[0]), label, map(float, fields[1:]))
    
    
    
    def treat_liblinear_output(self, output):
        """Decode a liblinear prediction line; the score is the second field.

        Exits the process (exit(-1)) on malformed output instead of raising.
        """
        fields = output.split()
        if len(fields) < 3:
            print "Error in liblinear output: ", fields
            exit(-1)
        return float(fields[1])
     

    def classify_pair(self, stumps, pair, offsets, i, stumps_mc_scores = None, stumps_bin_scores = None):
        """Score one adjacent stump pair: returns (bin_score, mc_score).

        For FengHirst, within-sentence pairs go to classifier pair 1 and the
        rest to pair 2.  Relies on self.prev_tree, self.syntax_trees,
        self.sent2deps_list and self.breaks being set elsewhere (not visible
        in this snippet) -- confirm against the full class.  The two score
        list parameters are accepted but unused here.
        """
        if self.use_contextual_features and self.prev_tree is not None:
            (prev_stump, next_stump) = utils.utils.get_context_stumps(self.prev_tree, stumps, pair, i)
#            print 'L:', stumps[i]
#            print 'R:', stumps[i+1]
#            print pair
#            print 'prev:', prev_stump
#            print 'next:', next_stump
#            print
        else:
            prev_stump = None
            next_stump = None
            
        # scope is truthy when the pair lies inside a single sentence.
        scope = utils.utils.is_within_sentence(pair)
        if self.name == 'FengHirst':
            # Contextual features are reset at sentence boundaries.
            (bin_inst, mc_inst) = self.feature_writer.write_instance(None, None, pair, prev_stump, next_stump, None, 
                                                                         self.syntax_trees, self.sent2deps_list, 
                                                                         self.breaks, offsets[i],
                                                                         reset_contextual_features = not scope)
            
        else:
            (bin_inst, mc_inst) = self.feature_writer.write_instance(None, None, pair, None, self.syntax_trees, self.breaks, 
                                                                         offsets[i])

            
        if self.name == 'FengHirst':
            if scope:
                bin_score = self.svm_bin_classifier1.classify(bin_inst)
                mc_score = self.svm_mc_classifier1.classify(mc_inst)
            else:
                bin_score = self.svm_bin_classifier2.classify(bin_inst)
                mc_score = self.svm_mc_classifier2.classify(mc_inst)
        else:
            bin_score = self.svm_bin_classifier.classify(bin_inst)
            mc_score = self.svm_mc_classifier.classify(mc_inst)
        return (bin_score, mc_score)
    
                
    def connect_stumps(self, i, (stumps, stumps_mc_scores, stumps_bin_scores, offsets, tree_score)):
        """Merge stumps i and i+1 into one subtree (Python 2 tuple parameter).

        Re-scores the new left and right neighbour pairs, then shrinks the
        score/offset lists in place so they stay parallel to `stumps`.
        tree_score is a 2-element accumulator: [summed bin score, merge count].
        """
        #print stumps_mc_scores[i]
        
        # The merged stump takes the predicted relation label and the
        # center-normalized multiclass scores of the winning pair.
        new_stump = utils.utils.make_new_stump(stumps_mc_scores[i][1], stumps[i], stumps[i+1])
        new_stump.probs = self.center_norm(stumps_mc_scores[i][2])
        tree_score[0] = tree_score[0] + stumps_bin_scores[i] # + max(new_stump.probs)
        tree_score[1] += 1
        
        
        if i > 0: # left pair
            #print "\n### Erasing score [%d]: %f" %(i-1, stumps_bin_scores[i-1])
            pair = utils.utils.make_new_stump('n/a', stumps[i-1], new_stump)
            (bin_score, mc_score) = self.classify_pair(stumps, pair, offsets, i - 1, stumps_mc_scores, stumps_bin_scores)
                
            #print i, bin_inst
            stumps_bin_scores[i-1:i] = [bin_score]
            #print stumps_bin_scores[i - 1]
            stumps_mc_scores[i-1:i] = [mc_score]
            
        if i+2 < len(stumps): # right pair
            pair = utils.utils.make_new_stump('n/a', new_stump, stumps[i+2])
            (bin_score, mc_score) = self.classify_pair(stumps, pair, offsets, i, stumps_mc_scores, stumps_bin_scores)
                
            #print i, bin_inst
            stumps_bin_scores[i+1:i+2] = [bin_score]
            #print stumps_bin_scores[i - 1]
            stumps_mc_scores[i+1:i+2] = [mc_score]
    
        #print "\n### Erasing score [%d]: %f" %(i, stumps_bin_scores[i])
        # Collapse the merged pair: one fewer stump, score and offset entry.
        stumps_bin_scores[i:i+1] = []
        stumps_mc_scores[i:i+1] = []
        stumps[i:i+2] = [new_stump, ]
        offsets[i+1:i+2] = []
# Пример #5 (Example #5)
# 0
from classifiers.naive_bayes_classifier import NaiveBayesClassifier
from classifiers.svm_classifier import SVMClassifier
import argparse

# CLI name -> dataset instance.  NOTE: instances are built eagerly at import
# time; AgNews etc. are defined elsewhere in the project.
datasets = {
    'ag_news': AgNews(),
    'yahoo_answers': YahooAnswers(),
    'yelp_review_polarity': YelpReviewPolarity(),
}

# CLI name -> classifier instance (also constructed eagerly at import time).
classifiers = {
    'cnn': CNNClassifier(),
    'lstm': LSTMClassifier(),
    'character_level_cnn': CharacterLevelCNNClassifier(),
    'naive_bayes': NaiveBayesClassifier(),
    'svm': SVMClassifier()
}


def main(classifier_str, dataset_str):
    """Resolve the requested dataset and classifier, then train and evaluate."""
    chosen_dataset = datasets[dataset_str]
    model = classifiers[classifier_str]
    # Pipeline: load the corpus into the model, fit, then report metrics.
    model.load(chosen_dataset)
    model.fit()
    model.evaluate()
    # TODO grid search


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # NOTE(review): this snippet appears truncated -- no arguments are added
    # and main() is never invoked; confirm against the full source.
class Segmenter: 
    """Discourse segmenter (Python 2 code): classifies each token of a
    lexicalized syntax tree as a segment boundary or not, using a binary
    libsvm model over features from SegFeatureWriter."""

    # Penn Treebank escape tokens -> the literal characters they encode.
    penn_special_chars = {'-LRB-': '(', '-RRB-': ')', '-LAB-': '<', '-RAB-': '>',
                        '-LCB-': '{', '-RCB-': '}', '-LSB-': '[', '-RSB-':']',
                      '\\/' : '/', '\\*' : '*', '``' : '"', "''" : '"', "`" : "'"}
    
    def __init__(self, _model_path = paths.SEG_MODEL_PATH, _model_file = None, _scale_model_file = None,
                 verbose=False, _name="segmenter", dependencies = False):
        """Construct the boundary classifier, feature writer and syntax parser."""
        # Binary libsvm classifier; raw output is converted to float.
        self.svm_classifier = SVMClassifier(class_type = 'bin', software = 'libsvm', 
                                         model_path = _model_path, bin_model_file = _model_file, bin_scale_model_file = _scale_model_file,
                                         output_filter = float,
                                         name= _name, verbose = verbose)

        self.feature_writer = SegFeatureWriter(verbose = verbose)
        
        self.syntax_parser = SyntaxParser(verbose = verbose, dependencies = dependencies)
    

    def create_lexicalized_tree(self, mrg, heads):
        """
        Creates a lexicalized syntax tree given a MRG-style parse and a Penn2Malt style heads file. 
        """
        # leaf_pattern keeps everything after whitespace that is not a paren.
        t = LexicalizedTree.parse(mrg, leaf_pattern = '(?<=\\s)[^\)\(]+')  # Vanessa's modification
        t.lexicalize(heads, from_string = True)
        
        return t
    
    
    def split_by_sentence(self, text):
        """
        Takes a text and returns a list of (sentence, is_paragraph_boundary) elements
        Assumes that the text is pre-processed such that end of sentences are marked with <s>, end of paragraphs with <p>
        """
        # NOTE(review): the second tuple element is actually the marker token
        # itself ("<s>" or "<p>"), not a boolean -- confirm callers expect that.
        result = []
        
        text = text.replace("\n", "")
        
        parse_pos = 0
        prev_pos = 0
        
        # Linear scan: emit a sentence every time a 3-char marker is found.
        while parse_pos < len(text):
            
            next_tok = text[parse_pos:parse_pos + 3]
            
            if next_tok == "<s>" or next_tok == "<p>":
                result.append((text[prev_pos:parse_pos].strip(), next_tok))
                parse_pos = parse_pos + 3
                prev_pos = parse_pos
            else:
                parse_pos = parse_pos + 1
            
        return result
    
    def segment_tree(self, t):
        """
        Segments a text represented as a lexicalized syntax trees
        Returns a list of class labels for each token of the tree
        """
        data_to_classify = self.feature_writer.extract_features([t])
        
        results = []
        for d in data_to_classify:
            #print d
            results.append(self.svm_classifier.classify(d))
        
        return results
    
    def get_parsed_trees_from_string(self, tree_strings):
        # tree_strings separated by "\n"
        """Parse an iterable of tree strings (blank lines skipped) into LexicalizedTrees."""
        parsed_trees = []
        for line in tree_strings:
            line = line.strip()
            if line != '':
                parsed_trees.append(LexicalizedTree.parse(line, leaf_pattern = '(?<=\\s)[^\)\(]+'))
       
        return parsed_trees


    def get_deps(self, deps_filename):
        """Read a dependency file: blank lines separate per-sentence blocks.

        Returns a list with one '\\n'-joined string per sentence.
        NOTE(review): each block starts with a leading newline, and a final
        block not followed by a blank line is dropped -- confirm intended.
        """
        try:
            dep_f = open(deps_filename, 'r')
            deps = []
            sent_dep_str = ''
            
            # `started` collapses runs of blank lines into one sentence break.
            started = True
            for line in dep_f.readlines():
                line = line.strip()
                if line == '' and started:
                    started = False
                    deps.append(sent_dep_str)       
                    sent_dep_str = ''
                else:
                    started = True
                    sent_dep_str += '\n' + line
            dep_f.close()
            return deps
        except Exception, e:
            # Python 2 except syntax; error is logged then re-raised.
            print "*** Could not read the input file..."
            raise
# Пример #7 (Example #7)
# 0
def main():
    """Train/evaluate one model family selected on the command line.

    sys.argv[1] -- model family ('logistic_regression', 'random_forest',
                   'svm', 'adaboost', 'neural_network', 'linear_discriminant')
    sys.argv[2] -- metric: 'accuracy' or 'confusion_matrix'
    sys.argv[3] -- 'approch' value forwarded to the classifier wrapper
    Each branch trains, evaluates on train and test, then runs a
    hyperparameter search (setting_model) with 5-fold CV.
    """
    if sys.argv[1] == 'logistic_regression':
        if sys.argv[2] in ['accuracy', 'confusion_matrix']:
            kfold = 5
            model = LogisticRegressionClassifier(approch=sys.argv[3])
            model.train()
            model.evaluate(label="Training", metrics=sys.argv[2])
            model.evaluate(label="Testing", metrics=sys.argv[2])
            hyperparameters = {
                'penalty': ['l1', 'l2'],
                'C': np.logspace(0, 4, 10)
            }
            model.setting_model(hyperparameters, kfold, sys.argv[2])

    elif sys.argv[1] == 'random_forest':
        if sys.argv[2] in ['accuracy', 'confusion_matrix']:
            kfold = 5
            model = RandomForestAlgorithmClassifier(approch=sys.argv[3])
            model.train()
            model.evaluate(label="Training", metrics=sys.argv[2])
            model.evaluate(label="Testing", metrics=sys.argv[2])
            hyperparameters = {
                'max_depth': np.linspace(10, 100, num=10),  # best=50
                'n_estimators': range(88, 91)
            }
            model.setting_model(hyperparameters, kfold, sys.argv[2])
    elif sys.argv[1] == 'svm':
        if sys.argv[2] in ['accuracy', 'confusion_matrix']:
            kfold = 5
            model = SVMClassifier(approch=sys.argv[3])
            model.train()
            model.evaluate(label="Training", metrics=sys.argv[2])
            model.evaluate(label="Testing", metrics=sys.argv[2])
            hyperparameters = {
                'C': [0.1, 1, 10, 100, 1000],
                'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                'kernel': ['rbf', 'sigmoid']
            }
            model.setting_model(hyperparameters, kfold, sys.argv[2])
    elif sys.argv[1] == 'adaboost':
        if sys.argv[2] in ['accuracy', 'confusion_matrix']:
            kfold = 5
            model = AdaBoostAlgorithmClassifier(approch=sys.argv[3])
            model.train()
            model.evaluate(label="Training", metrics=sys.argv[2])
            model.evaluate(label="Testing", metrics=sys.argv[2])
            hyperparameters = {
                'base_estimator': [
                    tree.DecisionTreeClassifier(max_depth=n)
                    for n in range(13, 14)
                ],
                'n_estimators': [50, 55, 60, 65, 70, 80, 85],
                'algorithm': ['SAMME.R', 'SAMME'],
                'learning_rate':
                np.geomspace(0.01, 1, num=3)
            }
            model.setting_model(hyperparameters, kfold, sys.argv[2])
    elif sys.argv[1] == 'neural_network':
        if sys.argv[2] in ['accuracy', 'confusion_matrix']:
            kfold = 5
            model = NeuralnetworkClassifier(approch=sys.argv[3])
            model.train()
            model.evaluate(label="Training", metrics=sys.argv[2])
            model.evaluate(label="Testing", metrics=sys.argv[2])
            hyperparameters = {
                'hidden_layer_sizes': [(5, ), (5, 5)],
                'activation': ['relu'],
                # NOTE(review): 'identity'/'relu'/'tanh' are activation names,
                # not MLP solvers -- this list looks misplaced; confirm.
                'solver': ['identity', 'relu', 'tanh'],
                'alpha': [1e-5, 3e-4],
                'learning_rate_init': [1e-2, 1e-3]
            }
            model.setting_model(hyperparameters, kfold, sys.argv[2])
    elif sys.argv[1] == 'linear_discriminant':
        if sys.argv[2] in ['accuracy', 'confusion_matrix']:
            kfold = 5
            model = LinearDiscriminantClassifier(approch=sys.argv[3])
            model.train()
            model.evaluate(label="Training", metrics=sys.argv[2])
            model.evaluate(label="Testing", metrics=sys.argv[2])
            hyperparameters = {
                'penalty': ['l1', 'l2'],
                'multi_class': ['auto'],
                # NOTE(review): 'identity'/'relu'/'tanh' are not solver names
                # for a linear model -- likely copied from another grid; confirm.
                'solver': ['liblinear', 'identity', 'relu', 'tanh'],
                'max_iter': [10000]
            }
            model.setting_model(hyperparameters, kfold, sys.argv[2])