Exemplo n.º 1
0
def make_links(train_path, test_path, out_path,
               index_filter, tag_class, link_name, id_prefix):
    """Insert one link tag per test extent and save each touched document.

    Every extent produced by ``HypotheticalCorpus(test_path)`` gets a tag
    named ``link_name`` whose ``trigger``, ``fromID`` and ``toID`` all point
    at the extent's own tag id, and whose ``fromText``/``toText`` are the
    extent's own text. Tag ids are ``id_prefix`` followed by a counter that
    restarts at 0 whenever the document basename changes.

    Args:
        train_path: Unused by this function; kept for interface
            compatibility with callers.
        test_path: Location handed to ``HypotheticalCorpus``.
        out_path: Directory the modified documents are saved into.
        index_filter: First argument forwarded to ``corpus.extents``.
        tag_class: Second argument forwarded to ``corpus.extents``.
        link_name: Value stored under the ``'name'`` key of each new tag.
        id_prefix: Prefix of every generated tag id.

    Returns:
        None. Documents are written with ``save_xml`` as a side effect.
    """
    corpus = HypotheticalCorpus(test_path)
    extents = list(corpus.extents(index_filter, tag_class))
    if not extents:
        # Nothing to link. Indexing extents[0] below would raise IndexError.
        return

    id_number = 0
    curr_doc = extents[0].document
    doc_name = curr_doc.basename

    # NOTE(review): assumes extents arrive grouped by document; interleaved
    # documents would be saved more than once and their ids would restart.
    for extent in extents:
        if doc_name != extent.document.basename:
            # Finished with the previous document: persist it and start a
            # fresh id sequence for the next one.
            curr_doc.save_xml(os.path.join(out_path, doc_name))
            id_number = 0
            curr_doc = extent.document
            doc_name = curr_doc.basename
        source_tag = extent.tag
        tag = {'name': link_name,
               'id': '{}{}'.format(id_prefix, id_number),
               'trigger': source_tag['id'],
               'fromID': source_tag['id'],
               'fromText': source_tag['text'],
               'toID': source_tag['id'],
               'toText': source_tag['text'],
               }
        extent.document.insert_tag(tag)
        id_number += 1
    # The loop only saves on a document change, so flush the last one here.
    curr_doc.save_xml(os.path.join(out_path, doc_name))
Exemplo n.º 2
0
 def run_demo(self, verbose=0):
     """Load data, train a logistic-regression classifier and evaluate it.

     Training extents come from ``Corpus(self.train_path)``. When
     ``self.test_path`` is set, the test extents come from
     ``HypotheticalCorpus(self.test_path)``; otherwise the training extents
     are split at ``int(len(extents) * self.split)``. Predictions are
     compared with labels taken from ``self.gold_path`` when it is set, or
     from the test extents themselves otherwise.

     Args:
         verbose: ``>= 1`` prints progress messages; ``>= 2`` additionally
             prints a label -> count dictionary.

     Returns:
         None. The result of ``clf.evaluate`` is not returned.
     """
     def extent_key(extent):
         # Key format shared by predictions and gold labels:
         # "<basename>,<begin of first lex>,<end of last lex>".
         return "{a},{b},{c}".format(a=extent.basename,
                                     b=extent.lex[0].begin,
                                     c=extent.lex[-1].end)

     # load training data
     train_corpus = Corpus(self.train_path)
     extents = list(train_corpus.extents(self.indices_function,
                                         self.extent_class))
     # load test data
     if self.test_path:
         train_data = extents
         test_corpus = HypotheticalCorpus(self.test_path)
         test_data = list(test_corpus.extents(self.indices_function,
                                              self.extent_class))
     else:
         i = int(len(extents) * self.split)
         train_data = extents[:i]
         test_data = extents[i:]
     # Single-argument print(...) prints the same text under Python 2 and
     # is also valid Python 3.
     if verbose >= 1:
         print("data loaded")
     # Labels are collected over all loaded extents, including any part
     # that was split off as test data.
     labels = [self.label_function(x) for x in extents]
     if verbose >= 2:
         fd = {}
         for label in labels:
             # Start from 0 so the printed value is the real occurrence count.
             fd[label] = fd.get(label, 0) + 1
         print(fd)
     # train model
     clf = SKClassifier(LogisticRegression(),
                        self.label_function,
                        self.feature_functions)
     clf.add_labels(set(labels))
     clf.train(train_data)
     if verbose >= 1:
         print("model trained")
     # classify
     pred = clf.classify(test_data,
                         keys=[extent_key(extent) for extent in test_data])
     # evaluate
     if self.gold_path:
         gold_corpus = Corpus(self.gold_path)
         gold_data = list(gold_corpus.extents(self.indices_function,
                                              self.extent_class))
     else:
         gold_data = test_data
     gold_labels = dict((extent_key(extent), self.label_function(extent))
                        for extent in gold_data)
     clf.evaluate(pred, gold_labels)
Exemplo n.º 3
0
    def generate_test_train(self):
        """Build the (train_data, test_data) pair from the configured paths.

        All extents of ``Corpus(self.train_path)`` are loaded first. With a
        truthy ``self.test_path`` they are returned whole as training data
        and the test data is read from ``HypotheticalCorpus(self.test_path)``.
        Without one, the loaded extents are cut at
        ``int(len(extents) * self.split)``: the head is training data and
        the tail is test data.
        """
        loaded = list(Corpus(self.train_path).extents(self.indices_function,
                                                      self.extent_class))

        # No separate test corpus: carve the test set out of the loaded data.
        if not self.test_path:
            cut = int(len(loaded) * self.split)
            return loaded[:cut], loaded[cut:]

        # Separate test corpus: every loaded extent is used for training.
        held_out = HypotheticalCorpus(self.test_path)
        return loaded, list(held_out.extents(self.indices_function,
                                             self.extent_class))
Exemplo n.º 4
0
def generate_tags(train_path, test_path, clean_path, out_path):
    """Predict tags for every tag type and write them into the clean documents.

    For each ``(type_name, type_fields)`` entry of the module-level
    ``tag_types`` mapping, a ``TypesClassifier`` is run. Every test extent
    whose prediction is the string ``'True'`` gets a tag inserted into the
    clean document with the same basename as the extent's document. All
    clean documents are then saved under ``out_path`` and reloaded from
    there, so the next tag type is added to the already-written files.

    Clean documents are matched to extents by basename rather than by
    position. A document without any extents therefore cannot shift tags
    into the wrong file, and every document is saved for every tag type.

    Args:
        train_path: Training data location passed to ``TypesClassifier``.
        test_path: Test data location passed to ``TypesClassifier``.
        clean_path: Location of the documents the tags are written into.
        out_path: Output directory; created when it does not exist.

    Raises:
        ValueError: If a positively predicted extent belongs to a document
            whose basename is not among the clean documents.
    """
    # make outpath
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    # clean data to write tags to
    clean_corpus = HypotheticalCorpus(clean_path)
    clean_data = list(clean_corpus.documents())

    for type_name, type_fields in tag_types.items():
        # generate labels
        demo = TypesClassifier(type_name, train_path, test_path, gold_path='')
        pred, test_data = demo.generate_labels()

        # labels -> tagged docs
        docs_by_name = dict((doc.basename, doc) for doc in clean_data)
        next_id = {}  # basename -> next tag number within that document
        for extent in test_data:
            offsets = "{a},{b},{c}".format(a=extent.basename,
                                           b=extent.lex[0].begin,
                                           c=extent.lex[-1].end)
            # Predictions are compared as strings, not booleans.
            if pred[offsets] != 'True':
                continue

            doc_name = extent.document.basename
            if doc_name not in docs_by_name:
                raise ValueError(
                    "no clean document named {!r} to receive {} tags".format(
                        doc_name, type_name))

            id_number = next_id.get(doc_name, 0)
            tag = {'name': type_name,
                   'start': extent.tag['start'],
                   'end': extent.tag['end'],
                   'text': extent.tag['text'],
                   'id': '{}{}'.format(type_keys[type_name][0], id_number),
                   }
            tag.update(type_fields)
            docs_by_name[doc_name].insert_tag(tag)
            next_id[doc_name] = id_number + 1

        # Save every document, tagged or not, so none is missing when the
        # corpus is reloaded from out_path below.
        for doc in clean_data:
            doc.save_xml(os.path.join(out_path, doc.basename))

        # switch to established files
        clean_corpus = HypotheticalCorpus(out_path)
        clean_data = list(clean_corpus.documents())