Example #1
0
    def test_a2kb(self):
        import pickle
        print('REL-RW, A2KB')
        metric = Metrics()
        import codecs
        out = codecs.open('results.tsv', 'w', 'utf-8')
        try:
            pickle_dict = pickle.load(open('related_uris.pcl', 'rb'))
        except (IOError, EOFError):  # no cached URIs yet, start empty
            pickle_dict = {}
        for datafile in self.microposts_data.data:
            syntactic_subsumption(datafile.candidates)
            closeness_pruning(datafile.candidates, pickle_dict)
            #for candidate in datafile:
            #    candidate.init_context_types(ngram_predictor)
            for candidate in datafile:
                if candidate.truth_data['uri'] and candidate.truth_data['uri'].startswith('NIL'):
                    continue
                uri = None
                if candidate.context is not None:
                    if not uri:
                        uri = candidate.get_max_uri()
                    if uri != candidate.truth_data['uri']:
                        print(uri, candidate.truth_data, candidate.cand_string)
                if uri:
                    uri = uri.decode('utf-8')
                else:
                    uri = unicode(uri)
                metric.evaluate(candidate.truth_data, uri)
                out.write(u'\t'.join([datafile.filename, candidate.cand_string,
                                     unicode(candidate.start_i), unicode(candidate.end_i), uri, datafile.text]) + u'\n')

        metric.print_metrics()
        out.close()
        pickle.dump(pickle_dict, open('related_uris.pcl', 'wb'))
Example #2
0
def d2kb_prior():
    result = request.get_json(force=True)
    candidates = DataSet(result['text'], result['mentions'], ner).candidates
    syntactic_subsumption(candidates)
    new_mentions = []
    for mention, candidate in zip(result['mentions'], candidates):
        mention['uri'] = candidate.get_max_uri()
        if mention['uri'] is None:
            mention['uri'] = 'http://unknown.org/unknown/' + mention['name'].replace(' ', '_')
        else:
            mention['uri'] = 'http://dbpedia.org/resource/' + mention['uri']
        print mention
        new_mentions.append(mention)
    return jsonify({'mentions': new_mentions})
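Example #2 above is a Flask-style handler: it reads a JSON body with 'text' and a list of 'mentions' (each carrying at least a 'name'), links every mention, and returns the mentions with a 'uri' field added, falling back to an http://unknown.org/unknown/... placeholder. A minimal client sketch follows, assuming the handler is exposed at a hypothetical /d2kb_prior route on a local server; the route, port, and the exact mention schema expected by DataSet are assumptions, not confirmed by the snippet.

# Hypothetical client for the d2kb_prior endpoint; URL and route are assumed.
import requests

payload = {
    'text': 'Barack Obama visited Berlin.',
    'mentions': [{'name': 'Barack Obama'}, {'name': 'Berlin'}],
}
resp = requests.post('http://localhost:5000/d2kb_prior', json=payload)
for mention in resp.json()['mentions']:
    # Each mention comes back with either a DBpedia resource URI or an
    # http://unknown.org/unknown/... placeholder.
    print(mention['name'] + ' -> ' + mention['uri'])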
Example #3
0
 def test_d2kb(self):
     print 'REL-RW, D2KB'
     metric = Metrics()
     for datafile in self.microposts_data.data:
         syntactic_subsumption(datafile.candidates)
         graph = SemanticGraph(datafile.candidates)
         graph.do_linking()
         for candidate in datafile:
             candidate.init_context_types(ngram_predictor)
         for candidate in datafile:
             # D2KB condition
             if candidate.truth_data['uri'] is None:
                 continue
             uri = candidate.resolved_true_entity
             metric.evaluate(candidate.truth_data, uri)
     metric.print_metrics()
Example #4
0
 def test_d2kb(self):
     print 'REL-RW, D2KB'
     metric = Metrics()
     for datafile in self.microposts_data.data:
         syntactic_subsumption(datafile.candidates)
         graph = SemanticGraph(datafile.candidates)
         graph.do_linking()
         for candidate in datafile:
             candidate.init_context_types(ngram_predictor)
         for candidate in datafile:
             # D2KB condition
             if candidate.truth_data['uri'] is None:
                 continue
             uri = candidate.resolved_true_entity
             metric.evaluate(candidate.truth_data, uri)
     metric.print_metrics()
Example #5
0
def d2kb_prior_types():
    result = request.get_json(force=True)
    candidates = DataSet(result['text'], result['mentions'], ner).candidates
    syntactic_subsumption(candidates)
    for candidate in candidates:
        candidate.init_context_types(ngram_predictor)
    new_mentions = []
    for mention, candidate in zip(result['mentions'], candidates):
        context_types_list = [c.context_types for c in candidates
                              if c.context_types and c.cand_string == candidate.cand_string]
        if context_types_list:
            mention['uri'] = candidate.get_max_typed_uri(context_types_list)
        else:
            mention['uri'] = candidate.get_max_uri()
        new_mentions.append(mention)
    return jsonify({'mentions': new_mentions})
Example #6
0
 def test_d2kb(self):
     print('Prior prob, D2KB')
     for data_col in self.datas:
         print(data_col.data_dir)
         metric = Metrics()
         for datafile in data_col.data:
             syntactic_subsumption(datafile.candidates)
             for candidate in datafile:
                 # D2KB condition
                 if candidate.truth_data['uri'] is None:
                     continue
                 max_uri = candidate.get_max_uri()
                 if max_uri != candidate.truth_data['uri'] and max_uri is not None:
                     print('cand', candidate.cand_string,'max_uri:', max_uri, 'true_uri', candidate.truth_data['uri'], candidate.context)
                 metric.evaluate(candidate.truth_data, max_uri)
         metric.print_metrics()
         print()
Example #7
0
 def test_d2kb(self):
     print('Prior prob, D2KB')
     for data_col in self.datas:
         print(data_col.data_dir)
         metric = Metrics()
         for datafile in data_col.data:
             syntactic_subsumption(datafile.candidates)
             for candidate in datafile:
                 # D2KB condition
                 if candidate.truth_data['uri'] is None:
                     continue
                 max_uri = candidate.get_max_uri()
                 if max_uri != candidate.truth_data[
                         'uri'] and max_uri is not None:
                     print('cand', candidate.cand_string, 'max_uri:',
                           max_uri, 'true_uri', candidate.truth_data['uri'],
                           candidate.context)
                 metric.evaluate(candidate.truth_data, max_uri)
         metric.print_metrics()
         print()
Example #8
0
    def test_d2kb_typed(self):
        print('Prior prob + type improvements, D2KB')
        for data_col in self.datas:
            metric = Metrics()
            for datafile in data_col.data:
                syntactic_subsumption(datafile.candidates)
                for candidate in datafile:
                    candidate.init_context_types(ngram_predictor)
                for candidate in datafile:
                    # D2KB
                    if candidate.truth_data['uri'] is None:
                        continue
                    context_types_list = [c.context_types for c in datafile if c.context_types and c.cand_string == candidate.cand_string]
                    if context_types_list:
                        uri = candidate.get_max_typed_uri(context_types_list)
                    else:
                        uri = candidate.get_max_uri()

                    metric.evaluate(candidate.truth_data, uri)
            metric.print_metrics()
            print()
Example #9
0
 def test_d2kb(self):
     print 'MENTION-RW, D2KB'
     feature_file = open('features.txt', 'w')
     metric = Metrics()
     feature_file.write(Feature.header()+'\n')
     for datafile in self.msnbc_data.data:
         syntactic_subsumption(datafile.candidates)
         graph = SemanticGraph(datafile.candidates)
         #import pickle
         #pickle.dump(graph.G, open(datafile.filename+'.pcl', 'wb'))
         for candidate in datafile:
             candidate.init_context_types(ngram_predictor)
         graph.do_linking()
         for candidate in datafile:
             # D2KB condition
             if candidate.truth_data['uri'] is None:
                 continue
             uri = candidate.resolved_true_entity
             metric.evaluate(candidate.truth_data, uri)
     feature_file.close()
     metric.print_metrics()
Example #10
0
 def test_d2kb(self):
     print 'MENTION-RW, D2KB'
     feature_file = open('features.txt', 'w')
     metric = Metrics()
     feature_file.write(Feature.header() + '\n')
     for datafile in self.msnbc_data.data:
         syntactic_subsumption(datafile.candidates)
         graph = SemanticGraph(datafile.candidates)
         #import pickle
         #pickle.dump(graph.G, open(datafile.filename+'.pcl', 'wb'))
         for candidate in datafile:
             candidate.init_context_types(ngram_predictor)
         graph.do_linking()
         for candidate in datafile:
             # D2KB condition
             if candidate.truth_data['uri'] is None:
                 continue
             uri = candidate.resolved_true_entity
             metric.evaluate(candidate.truth_data, uri)
     feature_file.close()
     metric.print_metrics()
Example #11
0
    def test_a2kb(self):
        import pickle
        print('REL-RW, A2KB')
        metric = Metrics()
        import codecs
        out = codecs.open('results.tsv', 'w', 'utf-8')
        try:
            pickle_dict = pickle.load(open('related_uris.pcl', 'rb'))
        except (IOError, EOFError):  # no cached URIs yet, start empty
            pickle_dict = {}
        for datafile in self.microposts_data.data:
            syntactic_subsumption(datafile.candidates)
            closeness_pruning(datafile.candidates, pickle_dict)
            #for candidate in datafile:
            #    candidate.init_context_types(ngram_predictor)
            for candidate in datafile:
                if candidate.truth_data['uri'] and candidate.truth_data[
                        'uri'].startswith('NIL'):
                    continue
                uri = None
                if candidate.context is not None:
                    if not uri:
                        uri = candidate.get_max_uri()
                    if uri != candidate.truth_data['uri']:
                        print(uri, candidate.truth_data, candidate.cand_string)
                if uri:
                    uri = uri.decode('utf-8')
                else:
                    uri = unicode(uri)
                metric.evaluate(candidate.truth_data, uri)
                out.write(u'\t'.join([
                    datafile.filename, candidate.cand_string,
                    unicode(candidate.start_i),
                    unicode(candidate.end_i), uri, datafile.text
                ]) + u'\n')

        metric.print_metrics()
        out.close()
        pickle.dump(pickle_dict, open('related_uris.pcl', 'wb'))
Example #12
0
 def test_d2kb(self):
     print 'REL-RW, D2KB'
     feature_file = open('features.txt', 'w')
     metric = Metrics()
     feature_file.write(Feature.header() + '\n')
     for datafile in self.msnbc_data.data:
         syntactic_subsumption(datafile.candidates)
         graph = SemanticGraph(datafile.candidates)
         #graph.do_linking()
         for candidate in datafile:
             candidate.init_context_types(ngram_predictor)
         features = graph.do_features()
         for f_list in features.values():
             feature_file.write('#CANDIDATE\n')
             for f in f_list:
                 feature_file.write(str(f) + '\n')
         for candidate in datafile:
             # D2KB condition
             if candidate.truth_data['uri'] is None:
                 continue
             uri = candidate.resolved_true_entity
             metric.evaluate(candidate.truth_data, uri)
     feature_file.close()
     metric.print_metrics()
Example #13
0
    def test_d2kb_typed(self):
        print('Prior prob + type improvements, D2KB')
        for data_col in self.datas:
            metric = Metrics()
            for datafile in data_col.data:
                syntactic_subsumption(datafile.candidates)
                for candidate in datafile:
                    candidate.init_context_types(ngram_predictor)
                for candidate in datafile:
                    # D2KB
                    if candidate.truth_data['uri'] is None:
                        continue
                    context_types_list = [
                        c.context_types for c in datafile if c.context_types
                        and c.cand_string == candidate.cand_string
                    ]
                    if context_types_list:
                        uri = candidate.get_max_typed_uri(context_types_list)
                    else:
                        uri = candidate.get_max_uri()

                    metric.evaluate(candidate.truth_data, uri)
            metric.print_metrics()
            print()
Example #14
0
 def test_d2kb(self):
     print 'REL-RW, D2KB'
     feature_file = open('features.txt', 'w')
     metric = Metrics()
     feature_file.write(Feature.header()+'\n')
     for datafile in self.msnbc_data.data:
         syntactic_subsumption(datafile.candidates)
         graph = SemanticGraph(datafile.candidates)
         #graph.do_linking()
         for candidate in datafile:
             candidate.init_context_types(ngram_predictor)
         features = graph.do_features()
         for f_list in features.values():
             feature_file.write('#CANDIDATE\n')
             for f in f_list:
                 feature_file.write(str(f) + '\n')
         for candidate in datafile:
             # D2KB condition
             if candidate.truth_data['uri'] is None:
                 continue
             uri = candidate.resolved_true_entity
             metric.evaluate(candidate.truth_data, uri)
     feature_file.close()
     metric.print_metrics()
Example #15
0
import sys
import codecs
out = codecs.open('results.tsv', 'w', 'utf-8')
for line in sys.stdin:
    line = line.decode('utf-8')
    tweet_id, timestamp, user_id, user_name, user_description, tweet, tweet_ner = line.strip().split('\t')
    datafile = DataFile(tweet_id, tweet)

    tweet_ne_list = parse_tweet_entities(datafile.text)
    tweet_ne_names = set([x['text'] for x in tweet_ne_list])
    ner_list = parse_entities_from_xml(datafile.text, tweet_ner)
    ner_list = [x for x in ner_list if x['text'] not in tweet_ne_names] + tweet_ne_list

    for values in ner_list:
        cand_string = values['text']
        candidate = CandidateEntity(values['start_i'], values['end_i'], cand_string, e_type=values['type'],
                                    context=values['context'], ner=ner)
        astr_uri = DataSet._astrology_uri(candidate.cand_string)
        if astr_uri:
            candidate.entities = [Entity(astr_uri, 1, ner)]
        datafile.candidates.append(candidate)

    syntactic_subsumption(datafile.candidates)
    for candidate in datafile:
        uri = candidate.get_max_uri()
        if uri:
            uri = uri.decode('utf-8')
            out.write(u'\t'.join([tweet_id, candidate.cand_string, unicode(candidate.start_i),
                                  unicode(candidate.end_i), uri, tweet]) + u'\n')

out.close()
Example #16
0
    tweet_ne_names = set([x['text'] for x in tweet_ne_list])
    ner_list = parse_entities_from_xml(datafile.text, tweet_ner)
    ner_list = [x for x in ner_list if x['text'] not in tweet_ne_names
                ] + tweet_ne_list

    for values in ner_list:
        cand_string = values['text']
        candidate = CandidateEntity(values['start_i'],
                                    values['end_i'],
                                    cand_string,
                                    e_type=values['type'],
                                    context=values['context'],
                                    ner=ner)
        astr_uri = DataSet._astrology_uri(candidate.cand_string)
        if astr_uri:
            candidate.entities = [Entity(astr_uri, 1, ner)]
        datafile.candidates.append(candidate)

    syntactic_subsumption(datafile.candidates)
    for candidate in datafile:
        uri = candidate.get_max_uri()
        if uri:
            uri = uri.decode('utf-8')
            out.write(u'\t'.join([
                tweet_id, candidate.cand_string,
                unicode(candidate.start_i),
                unicode(candidate.end_i), uri, tweet
            ]) + u'\n')

out.close()