def make_links(train_path, test_path, out_path, index_filter, tag_class, link_name, id_prefix):
    """Insert one link tag per test extent and save every touched document.

    For each extent yielded by the test corpus, a tag named ``link_name`` is
    inserted into the extent's document. The tag's ``trigger``, ``fromID`` and
    ``toID`` are all the extent tag's ``id``, and ``fromText`` / ``toText`` are
    the extent tag's ``text``. Link ids are ``id_prefix`` followed by a counter
    that restarts at 0 for every document.

    Args:
        train_path: Unused; kept so existing callers keep working.
        test_path: Path handed to ``HypotheticalCorpus``.
        out_path: Directory the modified documents are saved into, each under
            its own ``basename``.
        index_filter: First argument passed to ``corpus.extents``.
        tag_class: Second argument passed to ``corpus.extents``.
        link_name: Value of the ``name`` field of every inserted tag.
        id_prefix: Prefix of every generated link id.

    Returns:
        None. When the corpus yields no extents nothing is written.
    """
    from itertools import groupby

    # make olink
    corpus = HypotheticalCorpus(test_path)
    # Materialized up front, as the original did; presumably this keeps the
    # tag insertions below from interfering with extent iteration -- TODO confirm.
    extents = list(corpus.extents(index_filter, tag_class))

    # parse into XML tags
    # Consecutive extents sharing a document basename form one group; an empty
    # extent list simply produces no groups (the original raised IndexError).
    for doc_name, doc_extents in groupby(extents, key=lambda ext: ext.document.basename):
        first_doc = None
        for id_number, extent in enumerate(doc_extents):
            if first_doc is None:
                # The document of the group's first extent is the one saved.
                first_doc = extent.document
            source_tag = extent.tag
            tag = {
                'name': link_name,
                'id': '{}{}'.format(id_prefix, id_number),
                'trigger': source_tag['id'],
                'fromID': source_tag['id'],
                'fromText': source_tag['text'],
                'toID': source_tag['id'],
                'toText': source_tag['text'],
            }
            extent.document.insert_tag(tag)
        first_doc.save_xml(os.path.join(out_path, doc_name))
def run_demo(self, verbose=0):
    """Load data, train a classifier, classify the test extents and evaluate.

    Training extents come from ``Corpus(self.train_path)``. When
    ``self.test_path`` is set, test extents come from a ``HypotheticalCorpus``
    at that path and all training extents are used for training; otherwise the
    training extents are split at ``int(len(extents) * self.split)``.

    Predictions are scored against extents from ``Corpus(self.gold_path)`` when
    ``self.gold_path`` is set, otherwise against the test extents themselves.

    Args:
        verbose: ``>= 1`` prints progress messages; ``>= 2`` additionally
            prints a label -> count table for the loaded training extents.

    Returns:
        None; the result of ``clf.evaluate`` is not returned.
    """
    def extent_key(extent):
        # One key format shared by predictions and gold labels so they line up.
        return "{a},{b},{c}".format(
            a=extent.basename, b=extent.lex[0].begin, c=extent.lex[-1].end)

    # load training data
    train_corpus = Corpus(self.train_path)
    extents = list(train_corpus.extents(self.indices_function, self.extent_class))

    # load test data
    if self.test_path:
        train_data = extents
        test_corpus = HypotheticalCorpus(self.test_path)
        test_data = list(test_corpus.extents(self.indices_function, self.extent_class))
    else:
        i = int(len(extents) * self.split)
        train_data = extents[:i]
        test_data = extents[i:]

    # verbosity functionality
    # Single-argument print(...) emits the same text under Python 2 and 3.
    if verbose >= 1:
        print("data loaded")
    # Labels are taken from every loaded training extent, including any that
    # fall on the test side of the split.
    labels = [self.label_function(x) for x in extents]
    if verbose >= 2:
        fd = {}
        for label in labels:
            # Start from 0 so the table holds true counts (was off by one).
            fd[label] = fd.get(label, 0) + 1
        print(fd)

    # train model
    clf = SKClassifier(LogisticRegression(), self.label_function, self.feature_functions)
    clf.add_labels(set(labels))
    clf.train(train_data)
    if verbose >= 1:
        print("model trained")

    # classify
    pred = clf.classify(test_data, keys=[extent_key(extent) for extent in test_data])

    # evaluate
    if self.gold_path:
        gold_corpus = Corpus(self.gold_path)
        gold_data = list(gold_corpus.extents(self.indices_function, self.extent_class))
    else:
        gold_data = test_data
    gold_labels = {extent_key(extent): self.label_function(extent) for extent in gold_data}
    clf.evaluate(pred, gold_labels)
def generate_test_train(self):
    """Build the training and test extent lists from the configured paths.

    Extents are always read from ``Corpus(self.train_path)``. With
    ``self.test_path`` set, those extents are the training data and the test
    data is read from ``HypotheticalCorpus(self.test_path)``. Without it, the
    training extents are cut at ``int(len(extents) * self.split)``: the head is
    the training data and the tail is the test data.

    Returns:
        A ``(train_data, test_data)`` tuple of lists.
    """
    source = Corpus(self.train_path)
    all_extents = list(source.extents(self.indices_function, self.extent_class))

    if not self.test_path:
        # No separate test corpus: split the training extents instead.
        cut = int(len(all_extents) * self.split)
        return all_extents[:cut], all_extents[cut:]

    # load test data
    held_out = HypotheticalCorpus(self.test_path)
    held_out_extents = list(held_out.extents(self.indices_function, self.extent_class))
    return all_extents, held_out_extents