def __init__(self, _model_path = paths.SEG_MODEL_PATH, _model_file = None, _scale_model_file = None, verbose=False, _name="segmenter", dependencies = False):
    """Wire up the segmenter's SVM classifier, feature writer and syntax parser."""
    # Binary libsvm classifier; raw classifier output is converted to float scores.
    self.svm_classifier = SVMClassifier(class_type='bin',
                                        software='libsvm',
                                        model_path=_model_path,
                                        bin_model_file=_model_file,
                                        bin_scale_model_file=_scale_model_file,
                                        output_filter=float,
                                        name=_name,
                                        verbose=verbose)
    # Feature extraction and (optionally dependency-based) syntactic parsing.
    self.feature_writer = SegFeatureWriter(verbose=verbose)
    self.syntax_parser = SyntaxParser(verbose=verbose, dependencies=dependencies)
def get_classifier(self, classifier_name):
    """Return a new classifier instance for the given name.

    Args:
        classifier_name: one of "naive", "svm", "random".

    Returns:
        A freshly constructed classifier object.

    Raises:
        Exception: if classifier_name is not a recognized model name.
    """
    # Map names to factories (classes), not instances, so only the requested
    # classifier is actually constructed -- the original built all of them
    # on every call just to return one.
    modelmap = {
        "naive": NaiveBayesClassifier,
        "svm": SVMClassifier,
        'random': RandomForestClf,
        # "grid": GridSearchClassifier,
    }
    if classifier_name in modelmap:
        return modelmap[classifier_name]()
    else:
        raise Exception("Unrecognized model: {}".format(classifier_name))
def __init__(self, _model_path = paths.MODEL_PATH, _bin_model_file = None, _bin_scale_model_file = None, _mc_model_file = None, _mc_scale_model_file = None, _name = "FengHirst", verbose = False, use_contextual_features = False):
    """Configure the feature writer and SVM classifiers for a tree builder.

    _name selects the pipeline: 'hilda' uses a single binary + multi-class
    classifier pair; 'FengHirst' uses two of each (the *_model_file
    arguments must then be two-element sequences: index 0 and index 1).

    Raises:
        Exception: if _name is neither 'hilda' nor 'FengHirst'.
    """
    self.name = _name
    self.verbose = verbose
    self.use_contextual_features = use_contextual_features
    if self.name == 'hilda':
        self.feature_writer = features.tree_feature_writer_hilda.TreeFeatureWriter(verbose)
        self.svm_mc_classifier = SVMClassifier(class_type='mc', software='libsvm',
                                               model_path=_model_path,
                                               mc_model_file=_mc_model_file,
                                               mc_scale_model_file=_mc_scale_model_file,
                                               output_filter=self.treat_mc_output,
                                               name=_name + "_mc", verbose=verbose)
        self.svm_bin_classifier = SVMClassifier(class_type='bin', software='liblinear',
                                                model_path=_model_path,
                                                bin_model_file=_bin_model_file,
                                                bin_scale_model_file=_bin_scale_model_file,
                                                output_filter=self.treat_liblinear_output,
                                                name=_name + "_bin", verbose=verbose)
    elif self.name == 'FengHirst':
        self.feature_writer = features.tree_feature_writer_Feng_Hirst.TreeFeatureWriter(
            verbose, use_contextual_features=use_contextual_features, subdir='')
        self.svm_bin_classifier1 = SVMClassifier(class_type='bin', software='svm_perf_classify',
                                                 model_path=_model_path,
                                                 bin_model_file=_bin_model_file[0],
                                                 bin_scale_model_file=_bin_scale_model_file,
                                                 output_filter=float,
                                                 name=_name + "_bin", verbose=verbose)
        self.svm_bin_classifier2 = SVMClassifier(class_type='bin', software='svm_perf_classify',
                                                 model_path=_model_path,
                                                 bin_model_file=_bin_model_file[1],
                                                 bin_scale_model_file=_bin_scale_model_file,
                                                 output_filter=float,
                                                 name=_name + "_bin", verbose=verbose)
        self.svm_mc_classifier1 = SVMClassifier(class_type='mc', software='svm_multiclass_classify',
                                                model_path=_model_path,
                                                mc_model_file=_mc_model_file[0],
                                                mc_scale_model_file=_mc_scale_model_file,
                                                output_filter=self.treat_mc_output,
                                                name=_name + "_mc", verbose=verbose)
        self.svm_mc_classifier2 = SVMClassifier(class_type='mc', software='svm_multiclass_classify',
                                                model_path=_model_path,
                                                mc_model_file=_mc_model_file[1],
                                                mc_scale_model_file=_mc_scale_model_file,
                                                output_filter=self.treat_mc_output,
                                                name=_name + "_mc", verbose=verbose)
    else:
        # Carry the reason in the exception itself rather than the original
        # `print` followed by a bare `raise Exception` with no message.
        raise Exception('Unrecognized tree_builder name: %s' % self.name)
class TreeBuilder: def __init__(self, _model_path = paths.MODEL_PATH, _bin_model_file = None, _bin_scale_model_file = None, _mc_model_file = None, _mc_scale_model_file = None, _name = "FengHirst", verbose = False, use_contextual_features = False): self.name = _name self.verbose = verbose self.use_contextual_features = use_contextual_features if self.name == 'hilda': self.feature_writer = features.tree_feature_writer_hilda.TreeFeatureWriter(verbose) self.svm_mc_classifier = SVMClassifier(class_type = 'mc', software = 'libsvm', model_path = _model_path, mc_model_file = _mc_model_file, mc_scale_model_file = _mc_scale_model_file, output_filter = self.treat_mc_output, name= _name + "_mc", verbose = verbose) self.svm_bin_classifier = SVMClassifier(class_type = 'bin', software = 'liblinear', model_path = _model_path, bin_model_file = _bin_model_file, bin_scale_model_file = _bin_scale_model_file, output_filter = self.treat_liblinear_output, name= _name + "_bin", verbose = verbose) elif self.name == 'FengHirst': self.feature_writer = features.tree_feature_writer_Feng_Hirst.TreeFeatureWriter(verbose, use_contextual_features = use_contextual_features, subdir = '') self.svm_bin_classifier1 = SVMClassifier(class_type = 'bin', software = 'svm_perf_classify', model_path = _model_path, bin_model_file = _bin_model_file[0], bin_scale_model_file = _bin_scale_model_file, output_filter = float, name= _name + "_bin", verbose = verbose) self.svm_bin_classifier2 = SVMClassifier(class_type = 'bin', software = 'svm_perf_classify', model_path = _model_path, bin_model_file = _bin_model_file[1], bin_scale_model_file = _bin_scale_model_file, output_filter = float, name= _name + "_bin", verbose = verbose) self.svm_mc_classifier1 = SVMClassifier(class_type = 'mc', software = 'svm_multiclass_classify', model_path = _model_path, mc_model_file = _mc_model_file[0], mc_scale_model_file = _mc_scale_model_file, output_filter = self.treat_mc_output, name= _name + "_mc", verbose = verbose) 
self.svm_mc_classifier2 = SVMClassifier(class_type = 'mc', software = 'svm_multiclass_classify', model_path = _model_path, mc_model_file = _mc_model_file[1], mc_scale_model_file = _mc_scale_model_file, output_filter = self.treat_mc_output, name= _name + "_mc", verbose = verbose) else: print 'Unrecognized tree_builder name: %s' % self.name raise Exception def get_list_hash(self, L): if not len(L): return '' if isinstance(L[0], ParseTree): ret_str = L[0].get_hash() else: ret_str = str(len(L[0])) return ret_str + '|' + self.get_list_hash(L[1:]) def center_norm(self, V): M = max(V) d = float(M+min(V))/2 return map(lambda x: float(x-d)/(M-d), V) def treat_mc_output(self, output): fields = output.split() if len(fields) < 2: return (-1, 'Error', fields) if int(fields[0]) in self.feature_writer.inv_relations: label = self.feature_writer.inv_relations[int(fields[0])] else: label = 'Unknown' #print "####\n", int(fields[0]), label, map(float, fields[1:]), "\n###\n" return (int(fields[0]), label, map(float, fields[1:])) def treat_liblinear_output(self, output): fields = output.split() if len(fields) < 3: print "Error in liblinear output: ", fields exit(-1) return float(fields[1]) def classify_pair(self, stumps, pair, offsets, i, stumps_mc_scores = None, stumps_bin_scores = None): if self.use_contextual_features and self.prev_tree is not None: (prev_stump, next_stump) = utils.utils.get_context_stumps(self.prev_tree, stumps, pair, i) # print 'L:', stumps[i] # print 'R:', stumps[i+1] # print pair # print 'prev:', prev_stump # print 'next:', next_stump # print else: prev_stump = None next_stump = None scope = utils.utils.is_within_sentence(pair) if self.name == 'FengHirst': (bin_inst, mc_inst) = self.feature_writer.write_instance(None, None, pair, prev_stump, next_stump, None, self.syntax_trees, self.sent2deps_list, self.breaks, offsets[i], reset_contextual_features = not scope) else: (bin_inst, mc_inst) = self.feature_writer.write_instance(None, None, pair, None, 
self.syntax_trees, self.breaks, offsets[i]) if self.name == 'FengHirst': if scope: bin_score = self.svm_bin_classifier1.classify(bin_inst) mc_score = self.svm_mc_classifier1.classify(mc_inst) else: bin_score = self.svm_bin_classifier2.classify(bin_inst) mc_score = self.svm_mc_classifier2.classify(mc_inst) else: bin_score = self.svm_bin_classifier.classify(bin_inst) mc_score = self.svm_mc_classifier.classify(mc_inst) return (bin_score, mc_score) def connect_stumps(self, i, (stumps, stumps_mc_scores, stumps_bin_scores, offsets, tree_score)): #print stumps_mc_scores[i] new_stump = utils.utils.make_new_stump(stumps_mc_scores[i][1], stumps[i], stumps[i+1]) new_stump.probs = self.center_norm(stumps_mc_scores[i][2]) tree_score[0] = tree_score[0] + stumps_bin_scores[i] # + max(new_stump.probs) tree_score[1] += 1 if i > 0: # left pair #print "\n### Erasing score [%d]: %f" %(i-1, stumps_bin_scores[i-1]) pair = utils.utils.make_new_stump('n/a', stumps[i-1], new_stump) (bin_score, mc_score) = self.classify_pair(stumps, pair, offsets, i - 1, stumps_mc_scores, stumps_bin_scores) #print i, bin_inst stumps_bin_scores[i-1:i] = [bin_score] #print stumps_bin_scores[i - 1] stumps_mc_scores[i-1:i] = [mc_score] if i+2 < len(stumps): # right pair pair = utils.utils.make_new_stump('n/a', new_stump, stumps[i+2]) (bin_score, mc_score) = self.classify_pair(stumps, pair, offsets, i, stumps_mc_scores, stumps_bin_scores) #print i, bin_inst stumps_bin_scores[i+1:i+2] = [bin_score] #print stumps_bin_scores[i - 1] stumps_mc_scores[i+1:i+2] = [mc_score] #print "\n### Erasing score [%d]: %f" %(i, stumps_bin_scores[i]) stumps_bin_scores[i:i+1] = [] stumps_mc_scores[i:i+1] = [] stumps[i:i+2] = [new_stump, ] offsets[i+1:i+2] = []
from classifiers.naive_bayes_classifier import NaiveBayesClassifier
from classifiers.svm_classifier import SVMClassifier
import argparse

# Registry of dataset loaders, keyed by their command-line names.
datasets = {
    'ag_news': AgNews(),
    'yahoo_answers': YahooAnswers(),
    'yelp_review_polarity': YelpReviewPolarity(),
}

# Registry of classifier implementations, keyed by their command-line names.
classifiers = {
    'cnn': CNNClassifier(),
    'lstm': LSTMClassifier(),
    'character_level_cnn': CharacterLevelCNNClassifier(),
    'naive_bayes': NaiveBayesClassifier(),
    'svm': SVMClassifier()
}


def main(classifier_str, dataset_str):
    """Load the named dataset into the named classifier, train it, and report."""
    chosen_dataset = datasets[dataset_str]
    chosen_classifier = classifiers[classifier_str]
    chosen_classifier.load(chosen_dataset)
    chosen_classifier.fit()
    chosen_classifier.evaluate()
    # TODO grid search


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
class Segmenter: penn_special_chars = {'-LRB-': '(', '-RRB-': ')', '-LAB-': '<', '-RAB-': '>', '-LCB-': '{', '-RCB-': '}', '-LSB-': '[', '-RSB-':']', '\\/' : '/', '\\*' : '*', '``' : '"', "''" : '"', "`" : "'"} def __init__(self, _model_path = paths.SEG_MODEL_PATH, _model_file = None, _scale_model_file = None, verbose=False, _name="segmenter", dependencies = False): self.svm_classifier = SVMClassifier(class_type = 'bin', software = 'libsvm', model_path = _model_path, bin_model_file = _model_file, bin_scale_model_file = _scale_model_file, output_filter = float, name= _name, verbose = verbose) self.feature_writer = SegFeatureWriter(verbose = verbose) self.syntax_parser = SyntaxParser(verbose = verbose, dependencies = dependencies) def create_lexicalized_tree(self, mrg, heads): """ Creates a lexicalized syntax tree given a MRG-style parse and a Penn2Malt style heads file. """ t = LexicalizedTree.parse(mrg, leaf_pattern = '(?<=\\s)[^\)\(]+') # Vanessa's modification t.lexicalize(heads, from_string = True) return t def split_by_sentence(self, text): """ Takes a text and returns a list of (sentence, is_paragraph_boundary) elements Assumes that the text is pre-processed such that end of sentences are marked with <s>, end of paragraphs with <p> """ result = [] text = text.replace("\n", "") parse_pos = 0 prev_pos = 0 while parse_pos < len(text): next_tok = text[parse_pos:parse_pos + 3] if next_tok == "<s>" or next_tok == "<p>": result.append((text[prev_pos:parse_pos].strip(), next_tok)) parse_pos = parse_pos + 3 prev_pos = parse_pos else: parse_pos = parse_pos + 1 return result def segment_tree(self, t): """ Segments a text represented as a lexicalized syntax trees Returns a list of class labels for each token of the tree """ data_to_classify = self.feature_writer.extract_features([t]) results = [] for d in data_to_classify: #print d results.append(self.svm_classifier.classify(d)) return results def get_parsed_trees_from_string(self, tree_strings): # tree_strings separated 
by "\n" parsed_trees = [] for line in tree_strings: line = line.strip() if line != '': parsed_trees.append(LexicalizedTree.parse(line, leaf_pattern = '(?<=\\s)[^\)\(]+')) return parsed_trees def get_deps(self, deps_filename): try: dep_f = open(deps_filename, 'r') deps = [] sent_dep_str = '' started = True for line in dep_f.readlines(): line = line.strip() if line == '' and started: started = False deps.append(sent_dep_str) sent_dep_str = '' else: started = True sent_dep_str += '\n' + line dep_f.close() return deps except Exception, e: print "*** Could not read the input file..." raise
def main():
    """Train, evaluate and hyperparameter-tune the classifier named on the
    command line.

    argv[1]: model key (see _MODELS below);
    argv[2]: metric, 'accuracy' or 'confusion_matrix';
    argv[3]: approach passed to the classifier constructor as `approch`.

    Unknown model keys or metrics are silently ignored, matching the
    original branch-per-model behavior.
    """
    # One (constructor, hyperparameter-grid factory) entry per model replaces
    # the original six copy-pasted branches. The grids are built lazily (via
    # lambdas) only for the selected model, as before.
    _MODELS = {
        'logistic_regression': (LogisticRegressionClassifier, lambda: {
            'penalty': ['l1', 'l2'],
            'C': np.logspace(0, 4, 10)
        }),
        'random_forest': (RandomForestAlgorithmClassifier, lambda: {
            'max_depth': np.linspace(10, 100, num=10),  # best=50
            'n_estimators': range(88, 91)
        }),
        'svm': (SVMClassifier, lambda: {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf', 'sigmoid']
        }),
        'adaboost': (AdaBoostAlgorithmClassifier, lambda: {
            'base_estimator': [
                tree.DecisionTreeClassifier(max_depth=n) for n in range(13, 14)
            ],
            'n_estimators': [50, 55, 60, 65, 70, 80, 85],
            'algorithm': ['SAMME.R', 'SAMME'],
            'learning_rate': np.geomspace(0.01, 1, num=3)
        }),
        'neural_network': (NeuralnetworkClassifier, lambda: {
            'hidden_layer_sizes': [(5, ), (5, 5)],
            'activation': ['relu'],
            'solver': ['identity', 'relu', 'tanh'],
            'alpha': [1e-5, 3e-4],
            'learning_rate_init': [1e-2, 1e-3]
        }),
        'linear_discriminant': (LinearDiscriminantClassifier, lambda: {
            'penalty': ['l1', 'l2'],
            'multi_class': ['auto'],
            'solver': ['liblinear', 'identity', 'relu', 'tanh'],
            'max_iter': [10000]
        }),
    }
    if sys.argv[1] not in _MODELS:
        return
    # argv[2] is only read once a known model key matched, as in the original.
    metrics = sys.argv[2]
    if metrics not in ['accuracy', 'confusion_matrix']:
        return
    kfold = 5
    model_cls, make_hyperparameters = _MODELS[sys.argv[1]]
    model = model_cls(approch=sys.argv[3])
    model.train()
    model.evaluate(label="Training", metrics=metrics)
    model.evaluate(label="Testing", metrics=metrics)
    model.setting_model(make_hyperparameters(), kfold, metrics)