def test_a2kb(self):
    import pickle
    import codecs
    print('REL-RW, A2KB')
    metric = Metrics()
    out = codecs.open('results.tsv', 'w', 'utf-8')
    # Reuse cached relatedness scores when available, otherwise start fresh.
    try:
        pickle_dict = pickle.load(open('related_uris.pcl'))
    except (IOError, EOFError):
        pickle_dict = {}
    for datafile in self.microposts_data.data:
        syntactic_subsumption(datafile.candidates)
        closeness_pruning(datafile.candidates, pickle_dict)
        #for candidate in datafile:
        #    candidate.init_context_types(ngram_predictor)
        for candidate in datafile:
            # Skip NIL-annotated mentions: they have no target entity in the KB.
            if candidate.truth_data['uri'] and candidate.truth_data['uri'].startswith('NIL'):
                continue
            uri = None
            if candidate.context is not None:
                uri = candidate.get_max_uri()
                if uri != candidate.truth_data['uri']:
                    print uri, candidate.truth_data, candidate.cand_string
            # Normalize to unicode for the TSV output (u'None' when unresolved).
            if uri:
                uri = uri.decode('utf-8')
            else:
                uri = unicode(uri)
            metric.evaluate(candidate.truth_data, uri)
            out.write(u'\t'.join([datafile.filename, candidate.cand_string,
                                  unicode(candidate.start_i), unicode(candidate.end_i),
                                  uri, datafile.text]) + u'\n')
    metric.print_metrics()
    out.close()
    pickle.dump(pickle_dict, open('related_uris.pcl', 'w'))
def d2kb_prior():
    result = request.get_json(force=True)
    candidates = DataSet(result['text'], result['mentions'], ner).candidates
    syntactic_subsumption(candidates)
    new_mentions = []
    for mention, candidate in zip(result['mentions'], candidates):
        mention['uri'] = candidate.get_max_uri()
        if mention['uri'] is None:
            # No candidate entity found: emit a synthetic "unknown" URI.
            mention['uri'] = 'http://unknown.org/unknown/' + mention['name'].replace(' ', '_')
        else:
            mention['uri'] = 'http://dbpedia.org/resource/' + mention['uri']
        print mention
        new_mentions.append(mention)
    return jsonify({'mentions': new_mentions})
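# Example client call for the endpoint above: a hedged sketch, not part of the
# service itself. The route path, host/port, and the exact mention fields that
# DataSet expects are assumptions; 'name' is the only key the handler reads.
import requests

resp = requests.post('http://localhost:5000/d2kb_prior',
                     json={'text': 'Barack Obama visited Berlin.',
                           'mentions': [{'name': 'Barack Obama'}]})
print resp.json()  # {'mentions': [{'name': 'Barack Obama', 'uri': 'http://dbpedia.org/resource/...'}]}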
def test_d2kb(self):
    print 'REL-RW, D2KB'
    metric = Metrics()
    for datafile in self.microposts_data.data:
        syntactic_subsumption(datafile.candidates)
        graph = SemanticGraph(datafile.candidates)
        graph.do_linking()
        for candidate in datafile:
            candidate.init_context_types(ngram_predictor)
        for candidate in datafile:
            # D2KB condition: only evaluate mentions with a ground-truth URI.
            if candidate.truth_data['uri'] is None:
                continue
            uri = candidate.resolved_true_entity
            metric.evaluate(candidate.truth_data, uri)
    metric.print_metrics()
def d2kb_prior_types():
    result = request.get_json(force=True)
    candidates = DataSet(result['text'], result['mentions'], ner).candidates
    syntactic_subsumption(candidates)
    for candidate in candidates:
        candidate.init_context_types(ngram_predictor)
    new_mentions = []
    for mention, candidate in zip(result['mentions'], candidates):
        # Collect context types from all candidates sharing this surface string.
        context_types_list = [c.context_types for c in candidates
                              if c.context_types and c.cand_string == candidate.cand_string]
        if context_types_list:
            mention['uri'] = candidate.get_max_typed_uri(context_types_list)
        else:
            mention['uri'] = candidate.get_max_uri()
        new_mentions.append(mention)
    return jsonify({'mentions': new_mentions})
def test_d2kb(self):
    print('Prior prob, D2KB')
    for data_col in self.datas:
        print data_col.data_dir
        metric = Metrics()
        for datafile in data_col.data:
            syntactic_subsumption(datafile.candidates)
            for candidate in datafile:
                # D2KB condition: only evaluate mentions with a ground-truth URI.
                if candidate.truth_data['uri'] is None:
                    continue
                max_uri = candidate.get_max_uri()
                if max_uri != candidate.truth_data['uri'] and max_uri is not None:
                    print 'cand', candidate.cand_string, 'max_uri:', max_uri, 'true_uri', candidate.truth_data['uri'], candidate.context
                metric.evaluate(candidate.truth_data, max_uri)
        metric.print_metrics()
        print
def test_d2kb_typed(self):
    print('Prior prob + type improvements, D2KB')
    for data_col in self.datas:
        metric = Metrics()
        for datafile in data_col.data:
            syntactic_subsumption(datafile.candidates)
            for candidate in datafile:
                candidate.init_context_types(ngram_predictor)
            for candidate in datafile:
                # D2KB condition: only evaluate mentions with a ground-truth URI.
                if candidate.truth_data['uri'] is None:
                    continue
                # Collect context types from all candidates sharing this surface string.
                context_types_list = [c.context_types for c in datafile
                                      if c.context_types and c.cand_string == candidate.cand_string]
                if context_types_list:
                    uri = candidate.get_max_typed_uri(context_types_list)
                else:
                    uri = candidate.get_max_uri()
                metric.evaluate(candidate.truth_data, uri)
        metric.print_metrics()
        print
def test_d2kb(self):
    print 'MENTION-RW, D2KB'
    feature_file = open('features.txt', 'w')
    metric = Metrics()
    feature_file.write(Feature.header() + '\n')
    for datafile in self.msnbc_data.data:
        syntactic_subsumption(datafile.candidates)
        graph = SemanticGraph(datafile.candidates)
        #import pickle
        #pickle.dump(graph.G, open(datafile.filename + '.pcl', 'wb'))
        for candidate in datafile:
            candidate.init_context_types(ngram_predictor)
        graph.do_linking()
        for candidate in datafile:
            # D2KB condition: only evaluate mentions with a ground-truth URI.
            if candidate.truth_data['uri'] is None:
                continue
            uri = candidate.resolved_true_entity
            metric.evaluate(candidate.truth_data, uri)
    feature_file.close()
    metric.print_metrics()
def test_d2kb(self):
    print 'REL-RW, D2KB'
    feature_file = open('features.txt', 'w')
    metric = Metrics()
    feature_file.write(Feature.header() + '\n')
    for datafile in self.msnbc_data.data:
        syntactic_subsumption(datafile.candidates)
        graph = SemanticGraph(datafile.candidates)
        #graph.do_linking()
        for candidate in datafile:
            candidate.init_context_types(ngram_predictor)
        # Write the per-candidate feature vectors computed on the semantic graph.
        features = graph.do_features()
        for f_list in features.values():
            feature_file.write('#CANDIDATE\n')
            for f in f_list:
                feature_file.write(str(f) + '\n')
        for candidate in datafile:
            # D2KB condition: only evaluate mentions with a ground-truth URI.
            if candidate.truth_data['uri'] is None:
                continue
            uri = candidate.resolved_true_entity
            metric.evaluate(candidate.truth_data, uri)
    feature_file.close()
    metric.print_metrics()
import codecs
import sys

out = codecs.open('results.tsv', 'w', 'utf-8')
for line in sys.stdin:
    line = line.decode('utf-8')
    # One tweet per input line, seven tab-separated fields.
    tweet_id, timestamp, user_id, user_name, user_description, tweet, tweet_ner = line.strip().split('\t')
    datafile = DataFile(tweet_id, tweet)
    # Merge entities detected on the tweet text with those from the pre-computed
    # NER XML, preferring the tweet-level annotations when surface strings overlap.
    tweet_ne_list = parse_tweet_entities(datafile.text)
    tweet_ne_names = set([x['text'] for x in tweet_ne_list])
    ner_list = parse_entities_from_xml(datafile.text, tweet_ner)
    ner_list = [x for x in ner_list if x['text'] not in tweet_ne_names] + tweet_ne_list
    for values in ner_list:
        cand_string = values['text']
        candidate = CandidateEntity(values['start_i'], values['end_i'], cand_string,
                                    e_type=values['type'], context=values['context'], ner=ner)
        # Special-case lookup for astrology-related surface strings.
        astr_uri = DataSet._astrology_uri(candidate.cand_string)
        if astr_uri:
            candidate.entities = [Entity(astr_uri, 1, ner)]
        datafile.candidates.append(candidate)
    syntactic_subsumption(datafile.candidates)
    for candidate in datafile:
        uri = candidate.get_max_uri()
        if uri:
            uri = uri.decode('utf-8')
            out.write(u'\t'.join([tweet_id, candidate.cand_string,
                                  unicode(candidate.start_i), unicode(candidate.end_i),
                                  uri, tweet]) + u'\n')
out.close()
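# Hedged usage sketch for the script above; the file names are hypothetical.
# Each input line must carry exactly seven tab-separated fields:
#   tweet_id, timestamp, user_id, user_name, user_description, tweet, tweet_ner
# Linked mentions are written to results.tsv as:
#   tweet_id, mention, start_i, end_i, uri, tweet
import subprocess

with open('tweets.tsv', 'rb') as tweets:  # hypothetical input file
    subprocess.check_call(['python', 'link_tweets.py'], stdin=tweets)  # hypothetical script name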