def computed_every_grounded_graph_f1_webq_name(input_file, answer_file, mid_to_names_file):
    """Score every grounded graph against WebQuestions gold answers, by name.

    Reads gold answers and a mid->names map from disk, then for every
    structure file under ``input_file`` converts each grounded graph's
    denotation (a collection of mids) into a set of display names, computes
    SEMPRE-style F1 against the gold answer names, stores the score on
    ``grounded_graph.f1_score``, and rewrites the structure file in place.

    :param input_file: directory containing structure files. NOTE(review):
        paths are built by string concatenation (``input_file + structure_path``),
        so this must end with a path separator — confirm at call sites.
    :param answer_file: TAB-separated file; column 0 is the qid, column 2 a
        Python literal holding the gold answers (parsed via ``eval``).
    :param mid_to_names_file: TAB-separated file; column 1 is the mid,
        column 2 a Python literal holding its names (parsed via ``eval``).
    """
    # from datasets_interface.freebase import webquestions_interface
    # from evaluation.webq_denotation import webq_mid_to_names_process
    #------------------------------------------------
    #read qid-to-answers
    # NOTE(review): eval() trusts the file contents entirely — acceptable only
    # for locally generated dumps, never for untrusted data.
    qid_to_answers_dict = dict()
    lines = read_list(answer_file)
    for line in lines:
        cols = line.split('\t')
        qid_to_answers_dict[cols[0]] = eval(cols[2])
    #------------------------------------------------
    # mid to names dict
    mid_to_names_dict = dict()
    lines = read_list(mid_to_names_file)
    for line in lines:
        cols = line.split('\t')
        mid = cols[1]
        names = list(eval(cols[2]))
        mid_to_names_dict[mid] = names
    #------------------------------------------------
    # Walk every structure file and score each grounded graph it contains.
    all_structure_path = os.listdir(input_file)
    for structure_path in all_structure_path:
        structure_with_grounded_graphq_file = input_file + structure_path
        structure_list = read_structure_file(
            structure_with_grounded_graphq_file)
        for structure in structure_list:
            qid = structure.qid
            gold_answer_names_set = evaluation_utils.search_for_answers_by_id(
                qid, qid_to_answers_dict)
            print(structure_path, '#gold:\t', gold_answer_names_set)
            for ungrounded_graph in structure.ungrounded_graph_forest:
                for grounded_graph in ungrounded_graph.get_grounded_graph_forest(
                ):
                    # Map each denotation mid to a display name; mids with no
                    # known name are logged and dropped from the system set.
                    system_denotation_names_set = set()
                    for denotation_mid in grounded_graph.denotation:
                        denotation_name = evaluation_utils.get_name_by_mid(
                            denotation_mid, mid_to_names_dict)
                        print('###denotation:\t', denotation_mid, denotation_name)
                        if denotation_name is not None:
                            system_denotation_names_set.add(denotation_name)
                        else:
                            print(denotation_mid, '#####error!!!', denotation_name)
                    print('#gold:\t', gold_answer_names_set, '#system:\t',
                          system_denotation_names_set)
                    recall, precision, f1 = sempre_evaluation.computeF1(
                        gold_answer_names_set, system_denotation_names_set)
                    if f1 > 0.0:
                        print('#result:\t', f1)
                    # Persist the score on the graph; written back to disk below.
                    grounded_graph.f1_score = f1
        write_structure_file(structure_list, input_file + structure_path)
def get_all_relation_domain_range():
    """Dump every Freebase relation with its domain and range classes.

    Reads the relation list from disk, looks up domain and range classes for
    each relation, and prints one TAB-separated ``relation\\tdomains\\tranges``
    row per relation. Stdout is tee'd into a dated log file via Logger.
    Relations whose lookup raises are collected and printed at the end
    (best-effort dump).
    """
    import sys
    from parsing.logger_test import Logger

    relations = read_list(
        '../dataset/resources_cwq/dataset_freebase_latest/freebase_relations')
    failed_relations = []
    # Redirect stdout so the dump is persisted to the log file.
    sys.stdout = Logger('./2019.03.20_freebase_relation_domain_range.txt',
                        sys.stdout)
    for relation in relations:
        try:
            domains = get_domain(relation)
            ranges = get_range(relation)
            print('%s\t%s\t%s' % (relation, list(domains), list(ranges)))
        except Exception:
            # Remember the failing relation and keep going.
            failed_relations.append(relation)
    print(failed_relations)
def get_freebase_schema():
    """Print the Freebase schema as TAB-separated (type, attr, relation, range) rows.

    For every type, determines whether it is a mediator (``'mediator'``) or a
    regular class (``'main'``), finds all relations whose domain list contains
    the type, and prints one row per (type, relation) pair together with the
    relation's single range class (empty string when it has none). Relations
    carrying more than one range are flagged with an ``error!!!`` line first.
    Stdout is tee'd into a log file via Logger.
    """
    import sys
    from parsing.logger_test import Logger
    sys.stdout = Logger('./2019.05.13_freebase_schema.txt', sys.stdout)
    # types_list = read_list('../dataset/resources_cwq/dataset_freebase_latest/freebase_types')
    # mediators = read_list('../dataset/resources_cwq/dataset_freebase_latest/mediators.tsv')
    # relation_domain_range_list = read_list('../dataset/resources_cwq/dataset_freebase_latest/freebase_relations_domain_range')
    types_list = read_list('./20190512_ywsun/75_all_classes.txt')
    mediators = read_list('./20190512_ywsun/mediators.tsv')
    relation_domain_range_list = read_list(
        './20190512_ywsun/2019.05.12_properties_with_domain_range.txt')
    # Parse "relation \t [domains] \t [ranges]" rows.
    # NOTE(review): eval() trusts the dump file entirely — local data only.
    relation_domain_range_tuple_list = []
    for relation_domain_range in relation_domain_range_list:
        cols = relation_domain_range.split('\t')
        relation = cols[0]
        domains_list = eval(cols[1])
        ranges_list = eval(cols[2])
        relation_domain_range_tuple_list.append(
            (relation, domains_list, ranges_list))
    for type_ in types_list:
        attr = 'main'
        if type_ in mediators:
            attr = 'mediator'
        # Collect all relations that accept this type as (part of) their domain.
        related_relation_range_list = []
        for relation, domains_list, ranges_list in relation_domain_range_tuple_list:
            if type_ in domains_list:
                related_relation_range_list.append((relation, ranges_list))
        for related_relation_range in related_relation_range_list:
            # Renamed from `range`, which shadowed the builtin range().
            range_name = ''
            if len(related_relation_range[1]) > 0:
                range_name = related_relation_range[1][0]
            if len(related_relation_range[1]) > 1:
                # More than one range class: schema dump expects exactly one.
                print('error!!!', related_relation_range)
            print(('%s\t%s\t%s\t%s') % (type_, attr, related_relation_range[0],
                                        range_name))
def instance_to_types():
    """Invert the type->instance dump into instance->{types} and print it.

    Reads TAB-separated ``type \\t instance`` rows, accumulates the set of
    types per instance, and prints one ``instance \\t {types}`` row per
    instance. Stdout is tee'd into a log file via Logger.
    """
    import sys
    from parsing.logger_test import Logger
    instance_to_types_dict = dict()
    types_instance_list = read_list('./2019_03_15_freebase_instance_type_1')
    sys.stdout = Logger('./2019.03.20_freebase_instance_type_1_reverse.txt',
                        sys.stdout)
    for line in types_instance_list:
        terms = line.split('\t')
        type_str = terms[0]
        instance = terms[1]
        # setdefault replaces the `in dict.keys()` test-then-insert dance
        # with a single lookup.
        instance_to_types_dict.setdefault(instance, set()).add(type_str)
    for instance, types in instance_to_types_dict.items():
        print(('%s\t%s') % (instance, str(types)))
def notable_type_to_instances():
    """Print a "notable_type \\t instance" row for every notable-type instance.

    Stdout is tee'd into a log file via Logger; notable types whose instance
    lookup raises are collected and printed at the end (best-effort dump).
    """
    import sys
    from parsing.logger_test import Logger
    sys.stdout = Logger('./2019.03.15_freebase_instance_notable_1.txt',
                        sys.stdout)
    failed_types = []
    notable_types = read_list(
        '../dataset/resources_cwq/dataset_freebase_latest/freebase_notable_types'
    )
    for notable_type in notable_types:
        try:
            for instance in get_instance_by_class_notable_type(notable_type):
                print(('%s\t%s') % (notable_type, instance))
        except Exception:
            # Remember the failing type and continue with the rest.
            failed_types.append(notable_type)
    print(failed_types)
def mediator_to_instances():
    """Dump (subject, predicate, instance) triples for every mediator class.

    For each mediator class — minus a hard-coded skip list (presumably
    classes too large or slow to query; TODO confirm) — runs a SPARQL query
    selecting all triples whose object is an instance of the class and hands
    the results to ``execute_sparql_three_args``. Classes whose query raises
    are collected and printed at the end. Stdout is tee'd into a log file.
    """
    import sys
    from parsing.logger_test import Logger
    mediators_list = read_list(
        '../dataset/resources_cwq/dataset_freebase_latest/mediators.tsv')
    sys.stdout = Logger('./2019.04.10_freebase_mediators_instance_sp.txt',
                        sys.stdout)
    # Hoisted out of the loop and converted to a set: one O(1) membership
    # test per class instead of scanning a list literal every iteration.
    skip_classes = {
        'common.notable_for', 'medicine.drug_label_section',
        'location.geocode', 'film.performance',
        'measurement_unit.dated_percentage',
        'base.schemastaging.nutrition_information', 'common.webpage',
        'music.track_contribution', 'measurement_unit.dated_integer'
    }
    error_qid_list = []
    for line in mediators_list:
        if line in skip_classes:
            continue
        try:
            # NOTE(review): query built by string concatenation — safe only
            # because class names come from a local file, not user input.
            sparql = '''SELECT DISTINCT ?s ?p ?instance WHERE { ?s ?p ?instance . ?instance :type.object.type :''' + line + ''' }'''
            execute_sparql_three_args(sparql, line)
            # instances = get_instance_by_class(line)
            # for instance in instances:
            #     p_o_set, _, _ = get_p_o_by_entity(instance)
            #     for p_o in p_o_set:
            #         print(('%d\t%s\t%s') % (i, instance, p_o))
        except Exception:
            error_qid_list.append(line)
    print(error_qid_list)
def get_all_class_names():
    """Build a reverse map from lowercase display name to Freebase classes.

    For every type in the type list, fetches its name(s), lowercases one,
    and accumulates a ``name -> {class: 1.0}`` dictionary (several classes
    may share a name). The result is written to ``./types_reverse.txt`` as
    TAB-separated ``name \\t {class: weight}`` rows. Types whose name lookup
    raises are collected and reported on stdout.
    """

    def write_dict(mapping, write_file):
        # One "key \t value" row per entry. Parameter renamed from `dict`,
        # which shadowed the builtin; `with` guarantees the handle is closed.
        with open(write_file, "w", encoding="utf-8") as fi:
            # fi.write(str(len(mapping)))
            # fi.write("\n")
            for key in mapping:
                fi.write(str(key))
                fi.write("\t")
                fi.write(str(mapping[key]))
                fi.write("\n")

    # human_types_list = read_list('../dataset/resources_cwq/dataset_freebase_latest/freebase_types')
    human_types_list = read_list('./freebase_types')
    error_qid_list = []
    name_to_class_dict = OrderedDict()
    for i, line in enumerate(human_types_list):
        try:
            names = get_names(line)
            if len(names) > 0:
                name = names.pop().lower()
                # setdefault keeps first-seen insertion order for each name.
                name_to_class_dict.setdefault(name, dict())[line] = 1.0
                print(i, name)
        except Exception:
            error_qid_list.append(line)
    print('#error:\t', error_qid_list)
    write_dict(name_to_class_dict, './types_reverse.txt')
for gold in goldList: if gold in predictedList: truePositives += 1 else: falseNegatives += 1 for predicted in predictedList: if predicted not in goldList: falsePositives += 1 return EvaluationCounts(truePositives=truePositives, falsePositives=falsePositives, falseNegatives=falseNegatives) if __name__ == '__main__': lines = read_list('./sample_el_q_result.txt') q_to_system_answer_dict = dict() for line in lines: cols = line.split('\t') q_to_system_answer_dict[cols[1]] = eval(cols[3]) gold_lines = read_list('./sample_gold_q_result.txt') q_to_gold_answer_dict = dict() for gold_line in gold_lines: cols = gold_line.split('\t') q_to_gold_answer_dict[cols[0]] = eval(cols[1]) counts = [] for q, goldList in q_to_gold_answer_dict.items(): predictedList = [] if q in q_to_system_answer_dict:
def type_to_instances():
    """Print a "type \\t instance" row for every non-filtered Freebase type.

    Reads the full type list, skips types in three hard-coded filter lists
    (presumably types that are huge, meta/bookkeeping, or junk user
    namespaces — TODO confirm the original selection criteria), and prints
    one row per instance of each remaining type. Types whose instance lookup
    raises are collected and printed at the end. Stdout is tee'd to a log.
    """
    # mediators_list = read_list('../dataset/resources_cwq/dataset_freebase_latest/mediators.tsv')
    human_types_list = read_list(
        '../dataset/resources_cwq/dataset_freebase_latest/freebase_types')
    import sys
    from parsing.logger_test import Logger
    sys.stdout = Logger('./2019.03.15_freebase_instance_type_1.txt',
                        sys.stdout)
    error_qid_list = []
    filter_list_3 = [
        'music.recording', 'music.release_track',
        'base.type_ontology.abstract', 'base.type_ontology.non_agent',
        'common.notable_for', 'common.topic'
    ]
    filter_list_2 = [
        'type.content_import', 'type.content', 'type.namespace',
        'common.document', 'base.type_ontology.agent',
        'base.type_ontology.inanimate', 'base.type_ontology.animate'
    ]
    # Junk user-namespace types; names are $-escaped Freebase MQL keys.
    filter_list_4 = [
        'user.joram.environmental_science_$0026_technology.water_quality',
        'user.rogopag.www$002ecittadiivrea$002ecom.topic',
        'user.player.player_entertainment_group_inc$002e.branding',
        'user.sankeerth.http$003a$002f$002fwebisee$002ecom.topic',
        'user.player.player_entertainment_group_inc$002e.televisions_production',
        'user.player.player_entertainment_group_inc$002e.visual_art',
        'user.robert.world$0027s_tallest.topic',
        'user.rial13.dre_$0022rial$0022_porcher.topic',
        'user.ray315.$0432$0430$043b$044e$0442$0430.topic',
        'user.bluenorthernmusic.musical_artist$002c_music_lessons.topic',
        'user.mad_god.$0418$0441$043a$0443$0441$0441$0442$0432$0435$043d$043d$044b$0439_$0438$043d$0442$0435$043b$043b$0435$043a$0442.topic',
        'user.player.player_entertainment_group_inc$002e.games',
        'user.dreig.web_3$002e0.topic',
        'user.beatyourprice.http$003a$002f$002fwww$002ebeatyourprice$002ecom.topic',
        'user.brabblejr.www$002ebrabble$002ccom.topic',
        'user.player.player_entertainment_group_inc$002e.concerts',
        'user.player.player_entertainment_group_inc$002e.media_common',
        'user.shomoa.magic$003a_the_gathering.subtype',
        'user.mad_god.$0418$0441$043a$0443$0441$0441$0442$0432$0435$043d$043d$044b$0439_$0438$043d$0442$0435$043b$043b$0435$043a$0442.ai',
        'user.shomoa.magic$003a_the_gathering.color',
        'user.gadgetsgalore.www$002er4us$002ecom$002ftrophy.topic',
        'user.player.player_entertainment_group_inc$002e.film',
        'user.robert.world$0027s_tallest.building',
        'user.shomoa.magic$003a_the_gathering.x_type',
        'user.xiongy.$4e2d$56fd.x',
        'user.hsetty.web2$002e0.topic',
        'user.rogopag.robanostra$002ehomeftp$002enet.topic',
        'user.freedom2002.$00e2$1ea11ea1c.topic',
        'user.integrity19.taxation_and_pornography$003a_designing_system_to_survive_constitutional_challenges.topic',
        'user.visha.$0645$062d$0645$062f_$062d$0645$06cc$062f_$0634$0627$06be$062f.topic',
        'user.shomoa.magic$003a_the_gathering.card',
        'user.player.player_entertainment_group_inc$002e.entertainment_company',
        'user.rrhobbs.location_scouting$002c_location_management_and_locations_for_film$002c_tv$002c_photo_and_events.topic',
        'user.player.player_entertainment_group_inc$002e.topic',
        'user.shomoa.magic$003a_the_gathering.supertype',
        'user.paulsipot.www$002eunnamedservice$002ecom.topic',
        'user.shomoa.magic$003a_the_gathering.topic',
        'user.zameen.ringtones$002emobi.topic',
        'user.archbishopderrickyoung.archbishop_derrick_l$002e_young_d$002ed$002e$002c_d$002emin$002e.topic',
        'user.player.player_entertainment_group_inc$002e.computer_game_designer',
        'user.xiongy.$4e2d$56fd.topic',
        'user.shomoa.magic$003a_the_gathering.zone',
        'user.player.player_entertainment_group_inc$002e.product_integration',
        'user.saranshsehgal.www$002emcllo$002ecom.topic',
        'user.funkyflash.www$002edujdc$002eorg.topic',
        'user.player.player_entertainment_group_inc$002e.game_development',
        'user.player.player_entertainment_group_inc$002e.tv_program',
        'user.chiliteslegacy.default_domain.the_chi_lites_bass_singer_creadel_jones_had_a_son_darren_in_which_played_a_important_role_in_helping_protect_his_legacy_against_fraud_exploition_and_embelzelments_to_creadel_jones_singer_legacy_and_his_music_his_son_darren_cubie_has_been_a_force_of_truth_and_guidence_for_iconic_legacies_an_thier_futher_darren_has_made_wed_sites_for_the_news_of_legacy_through_out_the_entertainment_field_that_mistreated_by_abuse_and_for_news_related_and_music_to_legendary_artist_icons_and_music_called_http_www_chilites_ning_com_and_http_www_chilites_net_all_are_real_disscussion_stating_information_music_abuse_and_news_and_music_creadel_jones_family_includes_wife_deborah_jones_and_two_sisters',
        'user.joram.environmental_science_$0026_technology.topic',
        'user.player.player_entertainment_group_inc$002e.computer_games',
        'user.mirzak2.www$002emirzak2$002ewebs$002ecom.topic',
        'user.pasidor.pasidor$002ecom.topic',
        'user.imteam1.http$003a$002f$002fwww$002egreenconservationproducts$002ecom$002f.topic',
        'user.player.player_entertainment_group_inc$002e.arts_entertainment',
        'user.rogopag.www$002enastypixel$002ecom.topic',
        'user.kunninmindzradio.http$003a$002f$002fkunninmindz$002ecom.topic'
    ]
    # Single hoisted set: one O(1) membership test per type instead of
    # scanning three lists on every iteration.
    skip_types = set(filter_list_2) | set(filter_list_3) | set(filter_list_4)
    for line in human_types_list:
        if line in skip_types:
            continue
        try:
            instances = get_instance_by_class(line)
            for instance in instances:
                print(('%s\t%s') % (line, instance))
        except Exception:
            error_qid_list.append(line)
    print(error_qid_list)
from common import hand_files q_mode = globals_args.argument_parser.q_mode # 2.2 args if q_mode == 'cwq': oracle_file_root = globals_args.fn_cwq_file.grounded_graph_file + 'result/' oracle_all_files_path_names = os.listdir(oracle_file_root) literal_to_id_map = grounding_utils.read_literal_to_id_map( file_root=globals_args.fn_cwq_file.grounded_graph_file) kb_relations = hand_files.read_set( globals_args.kb_freebase_latest_file.freebase_relations_file) mediators_instances_set = hand_files.read_set( globals_args.kb_freebase_latest_file.mediators_instances_file) schema_lines_list = hand_files.read_list( globals_args.kb_freebase_latest_file.schema_file) property_reverse_dict = hand_files.read_dict( globals_args.kb_freebase_latest_file.freebase_reverse_property) literal_property_dict = hand_files.read_dict( globals_args.kb_freebase_latest_file.freebase_literal_property) elif q_mode == 'graphq': oracle_file_root = globals_args.fn_graph_file.grounded_graph_file + 'result/' oracle_all_files_path_names = os.listdir(oracle_file_root) literal_to_id_map = grounding_utils.read_literal_to_id_map( file_root=globals_args.fn_graph_file.grounded_graph_file) kb_relations = hand_files.read_set( globals_args.kb_freebase_en_2013.freebase_relations_file) mediators_instances_set = hand_files.read_set( globals_args.kb_freebase_en_2013.mediators_instances_file)
from parsing.nltk_nlp_utils import NLTK_NLP
from common import globals_args
from common import hand_files

# Module-level NLP resources and phrase lexicons shared by the parser.
parser_mode = globals_args.parser_mode
# Question words used to locate the wh-phrase of a question.
wh_words_set = {
    "what", "which", "whom", "who", "when", "where", "why", "how",
    "how many", "how large", "how big"
}
bert_args = BertArgs(globals_args.root, globals_args.q_mode)
nltk_nlp = NLTK_NLP(globals_args.argument_parser.ip_port)
# SUTime temporal tagger; jar locations come from the argument-parser config.
sutime = SUTime(jars=globals_args.argument_parser.sutime_jar_files,
                mark_time_ranges=True)
unimportantwords = hand_files.read_set(
    globals_args.argument_parser.unimportantwords)
unimportantphrases = hand_files.read_list(
    globals_args.argument_parser.unimportantphrases)
stopwords_dict = hand_files.read_set(
    globals_args.argument_parser.stopwords_dir)
ordinal_lines_dict = hand_files.read_ordinal_file(
    globals_args.argument_parser.ordinal_fengli
)  #2 {'second', '2ndis_equal_wh_word'}
# Phrases signalling a COUNT-aggregation question.
count_phrases = [
    'Count', 'How many', 'how many', 'the number of', 'the count of',
    'the amount of', 'total number of', 'count'
]
count_ner_tags = ['count']
# "dayu" is presumably pinyin for "greater than" (>) — comparative phrases.
dayu_phrases = [
    'more', 'more than', 'greater', 'higher', 'longer than', 'taller than'
]  #'over',
# "dayu_dengyu" = "greater than or equal" (>=) phrases.
dayu_dengyu_phrases = ['at least', 'not less than', 'or more']