import argparse
import logging
import os

from tqdm import tqdm

# Project-internal helpers (generate_url, generate_url_spec, get_properties,
# fetch_ranks, check_query, rank_check, paraphrase_questions,
# pick_final_sentence, get_pretrained_model, set_seed, prepare_model, const)
# are assumed to be importable from the project's own modules.


def generate_templates(label, project_name, depth=1,
                       output_file="sentence_and_template_generator",
                       paraphraser=False):
    """
    Wrapper that drives the whole template-generation pipeline: it resolves
    the ontology URL for `label`, fetches its properties, and generates
    natural-language questions plus SPARQL templates (optionally
    paraphrased) up to `depth` levels of property nesting.
    """
    val = generate_url(label)
    url = val[0]
    about = val[1]
    count = 0
    vessel = []
    depth = int(depth)
    diction = fetch_ranks("../utility/part-r-00000")
    if not os.path.isdir(project_name):
        os.makedirs(project_name)
    output_file = open(project_name + "/" + output_file, 'w')
    test_set = open(project_name + "/" + "test.csv", 'w')
    if paraphraser:
        expand_set = open(project_name + "/" + "expand.csv", 'w')
    prop_dic = {}
    for iterator in range(depth):
        prop_dic[iterator] = []

    # Create and configure a logger that writes into the project folder.
    logger = logging.getLogger()
    logging.basicConfig(filename=project_name + "/logfile.log",
                        format='%(filename)s: %(message)s',
                        filemode='w')
    logger.setLevel(logging.WARNING)
    logger.info("This is a log file.")

    if paraphraser:
        # Load the pretrained paraphrasing model once, up front.
        folder_path = get_pretrained_model(const.URL)
        set_seed(42)
        tokenizer, device, model = prepare_model(folder_path)

    list_of_property_information = get_properties(
        url=url, project_name=project_name, output_file="get_properties.csv")
    for property_line in list_of_property_information:
        count += 1
        prop = property_line.split(',')
        print("**************\n" + str(prop))
        if paraphraser:
            sentence_and_template_generator(
                original_count=depth, prop_dic=prop_dic, test_set=test_set,
                log=logger, diction=diction, output_file=output_file,
                mother_ontology=about.strip().replace(
                    "http://dbpedia.org/ontology/", "dbo:"),
                vessel=vessel, project_name=project_name, prop=prop,
                suffix=" of <A> ?", count=depth, expand_set=expand_set,
                tokenizer=tokenizer, device=device, model=model)
        else:
            sentence_and_template_generator(
                original_count=depth, prop_dic=prop_dic, test_set=test_set,
                log=logger, diction=diction, output_file=output_file,
                mother_ontology=about.strip().replace(
                    "http://dbpedia.org/ontology/", "dbo:"),
                vessel=vessel, project_name=project_name, prop=prop,
                suffix=" of <A> ?", count=depth)
    output_file.close()
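# Example usage of the paraphrasing variant (a minimal sketch; the class
# label and project name are illustrative placeholders, not values from
# the repository). With paraphraser=True the pretrained paraphrasing model
# is fetched via const.URL, and expand.csv / test.csv are written next to
# the template file:
#
#     generate_templates(label="Person", project_name="person_project",
#                        depth=2, paraphraser=True)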
def generate_templates(label, project_name, depth=1,
                       output_file="sentence_and_template_generator"):
    """
    Function to generate templates; a wrapper for the rest of the
    functions (plain variant, without paraphrasing support).
    """
    val = generate_url(label)
    url = val[0]
    about = val[1]
    count = 0
    vessel = []
    diction = fetch_ranks("../utility/part-r-00000")
    if not os.path.isdir(project_name):
        os.makedirs(project_name)
    output_file = open(project_name + "/" + output_file, 'w')

    # Create and configure a logger that writes into the project folder.
    logger = logging.getLogger()
    logging.basicConfig(filename=project_name + "/logfile.log",
                        format='%(filename)s: %(message)s',
                        filemode='w')
    logger.setLevel(logging.DEBUG)
    logger.info("This is a log file.")

    list_of_property_information = get_properties(
        url=url, project_name=project_name, output_file="get_properties.csv")
    for property_line in list_of_property_information:
        count += 1
        prop = property_line.split(',')
        print("**************\n" + str(prop))
        sentence_and_template_generator(
            log=logger, diction=diction, output_file=output_file,
            mother_ontology=about.strip().replace(
                "http://dbpedia.org/ontology/", "dbo:"),
            vessel=vessel, project_name=project_name, prop=prop,
            suffix=" of <A> ?", count=2)
    output_file.close()
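# The plain variant above writes only the template file and the log; a
# corresponding call (same illustrative placeholders as before) would be:
#
#     generate_templates(label="Person", project_name="person_project")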
def sentence_and_template_generator(prop_dic, test_set, log, mother_ontology,
                                    vessel, prop, project_name, output_file,
                                    diction, expand_set=None, tokenizer=None,
                                    device=None, model=None, original_count=0,
                                    count=0, suffix=" of <A> ?",
                                    query_suffix=""):
    """
    Generates natural-language question / SPARQL template pairs for one
    property, optionally paraphrasing them, and recurses into the
    property's range class while count > 0.
    """
    if isinstance(prop, str):
        prop = prop.split(',')
    natural_language_question = []
    sparql_query = []
    expanded_nl_question = []
    expanded_sparql_query = []

    # question_form.csv holds three parallel comma-separated rows:
    # question openers, SPARQL query openers and SPARQL query closers.
    question_form = open("../utility/question_form.csv", 'r').readlines()
    question_starts_with = question_form[0].split(',')
    query_starts_with = question_form[1].split(',')
    query_ends_with = question_form[2].split(',')

    # Choose the phrasings that match the range type of the property.
    question_number = [2]
    if prop[3] == "owl:Thing" or prop[3] == "xsd:string":
        question_number = [2, 4]
    elif prop[3] == "Place":
        question_number = [3, 4]
    elif prop[3] == "Person":
        question_number = [1, 4]
    elif (prop[3] == "xsd:date" or "date" in prop[3]
          or "year" in prop[3] or "time" in prop[3]):
        question_number = [0, 4, 5]
    elif prop[3] == "xsd:nonNegativeInteger" or "negative" in prop[3].lower():
        question_number = [2, 6]
    elif prop[3] == "xsd:integer" or "integer" in prop[3].lower():
        question_number = [2, 6]

    val = generate_url_spec(prop[0])
    prop_link = val[0]
    if prop_link == "None" or prop_link is None:
        return
    derived = val[1]
    prop_link = "dbo:" + prop_link.strip().split(
        'http://dbpedia.org/ontology/')[-1]

    for number in question_number:
        original_question = question_starts_with[number] + prop[1] + suffix
        original_sparql = (query_starts_with[number] + "where { <A> "
                           + query_suffix + prop_link + " ?x "
                           + query_ends_with[number])
        natural_language_question.append(original_question)
        sparql_query.append(original_sparql)

    if query_suffix == "":
        query_answer = ("select distinct(?a) where { ?a "
                        + prop_link + " [] } ")
    else:
        query_answer = ("select distinct(?a) where { ?a "
                        + query_suffix.split(" ")[0] + " [] . ?a "
                        + query_suffix + " " + prop_link + " ?x } ")
    # Ask the endpoint whether the template has at least one answer.
    flag = check_query(log=log, query=query_answer.replace(
        "select distinct(?a)", "ask"))
    if not flag:
        return
    rank = rank_check(diction=diction, count=count, query=query_answer,
                      original_count=original_count)
    count = count - 1
    if count == 0:
        variable = "?x"
    else:
        variable = "?x" + str(count)
    query_suffix = prop_link + " " + variable + " . " + variable + " "

    if prop[0] not in prop_dic[original_count - count - 1]:
        # Unseen property: write the pairs to the training template file,
        # adding a paraphrased copy when a paraphrasing model is available.
        for number in range(len(natural_language_question)):
            if count == original_count - 1 and device:
                final_candidates = paraphrase_questions(
                    tokenizer, device, model, original_question)
                final_question = pick_final_sentence(original_question,
                                                     final_candidates)
                expanded_nl_question.append(final_question)
                expanded_sparql_query.append(original_sparql)
            if expanded_sparql_query:
                expand_line = [mother_ontology, "", "",
                               expanded_nl_question[number],
                               expanded_sparql_query[number], query_answer]
                expand_set.write(
                    (';'.join(expand_line) + ";" + str(rank) + ";"
                     + "Paraphrased" + "\n").replace("  ", " "))
            vessel.append([mother_ontology, "", "",
                           natural_language_question[number],
                           sparql_query[number], query_answer])
            output_file.write(
                (';'.join(vessel[-1]) + ";" + str(rank) + ";"
                 + "Original" + "\n").replace("  ", " "))
            log.info(';'.join(vessel[-1]) + str(rank) + "\n")
    else:
        # Property already seen at this depth: route the pairs to the
        # test set instead of the training templates.
        for number in range(len(natural_language_question)):
            if expanded_sparql_query:
                expand_line = [mother_ontology, "", "",
                               expanded_nl_question[number],
                               expanded_sparql_query[number], query_answer]
                expand_set.write(
                    (';'.join(expand_line) + ";" + str(rank)
                     + "\n").replace("  ", " "))
            vessel.append([mother_ontology, "", "",
                           natural_language_question[number],
                           sparql_query[number], query_answer])
            test_set.write((';'.join(vessel[-1]) + ";" + str(rank)
                            + "\n").replace("  ", " "))
            print("++++++++++++++++++++", vessel[-1], "+++++++++++++++")
            log.info("Test: " + ';'.join(vessel[-1]) + str(rank) + "\n")
    prop_dic[original_count - count - 1].append(prop[0])

    suffix = " of " + prop[1] + " of <A> ?"
    if count > 0:
        # Recurse one level deeper into the range class of this property.
        print(prop[3].split(":")[-1])
        val = generate_url(prop[3].split(":")[-1])
        url = val[0]
        if not url.startswith("http://mappings.dbpedia.org"):
            return
        list_of_property_information = get_properties(
            url=url, project_name=project_name, output_file=prop[1] + ".csv")
        for property_line in tqdm(list_of_property_information):
            prop_inside = property_line.split(',')
            sentence_and_template_generator(
                expand_set=expand_set, prop_dic=prop_dic, test_set=test_set,
                log=log, original_count=original_count, diction=diction,
                output_file=output_file, mother_ontology=mother_ontology,
                vessel=vessel, prop=prop_inside, suffix=suffix, count=count,
                project_name=project_name, query_suffix=query_suffix,
                tokenizer=tokenizer, device=device, model=model)
    # (Tail of the plain, non-paraphrasing sentence_and_template_generator.)
    for number in range(len(natural_language_question)):
        vessel.append([mother_ontology, "", "",
                       natural_language_question[number],
                       sparql_query[number], query_answer])
        output_file.write((';'.join(vessel[-1]) + ";" + str(rank)
                           + "\n").replace("  ", " "))
        log.info(';'.join(vessel[-1]) + str(rank) + "\n")

    suffix = " of " + prop[1] + " of <A> ?"
    if count > 0:
        # Recurse one level deeper into the range class of this property.
        print(prop[3].split(":")[-1])
        val = generate_url(prop[3].split(":")[-1])
        url = val[0]
        if not url.startswith("http://mappings.dbpedia.org"):
            return
        list_of_property_information = get_properties(
            url=url, project_name=project_name, output_file=prop[1] + ".csv")
        for property_line in tqdm(list_of_property_information):
            prop_inside = property_line.split(',')
            sentence_and_template_generator(
                log=log, original_count=original_count, diction=diction,
                output_file=output_file, mother_ontology=mother_ontology,
                vessel=vessel, prop=prop_inside, suffix=suffix, count=count,
                project_name=project_name, query_suffix=query_suffix)


if __name__ == "__main__":
    """
    Section to parse the command-line arguments.
    """
    parser = argparse.ArgumentParser()
    requiredNamed = parser.add_argument_group('Required Arguments')
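# The argument parsing is truncated at this point; a plausible completion,
# given generate_templates' signature (the flag names are hypothetical and
# not taken from the repository):
#
#     requiredNamed.add_argument("--label", dest="label", required=True,
#                                help="Ontology class to generate templates for")
#     requiredNamed.add_argument("--project_name", dest="project_name",
#                                required=True, help="Output folder")
#     parser.add_argument("--depth", dest="depth", default=1,
#                         help="Recursion depth for nested properties")
#     args = parser.parse_args()
#     generate_templates(label=args.label, project_name=args.project_name,
#                        depth=args.depth)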
def integrate(namespace, uri_file, output_file="integrate.csv",
              project_name="test_project", url="Enter a valid URL",
              input_file="Please enter a valid file name"):
    print("Reading the TSV file: ")
    open_tsv = open(uri_file, 'r')
    read_tsv = open_tsv.readlines()
    diction = {}
    for line in tqdm(read_tsv):
        line = line.strip().split('\t')
        if line[0].split('/')[-2] != namespace:
            continue
        diction[line[0].split('/')[-1]] = line[1]
    open_tsv.close()

    """
    Processing the input file:
    - The input file (the output of get_properties.py) is read line by line.
    - For every line, the property name is taken from the first field.
    - If that name is in the dictionary built above, the matching URI and
      its frequency are appended to the row entry; otherwise empty strings
      are appended.
    - The fields are re-joined with commas, a newline is added, and the row
      is accumulated into the string `final`.
    """
    # When run as a script the input file is read from disk; when imported,
    # the properties are fetched directly from the mappings wiki.
    if __name__ == "__main__":
        print("Reading the input file: ")
        open_inp = open(input_file, 'r')
        line_inp = open_inp.readlines()
    else:
        line_inp = get_properties(url=url, output_file="get_properties.csv",
                                  project_name=project_name)
    cnt, tot = 0, 0
    final = ""
    accum = []
    for in_line in tqdm(line_inp):
        line = in_line.strip().split(',')
        in_line = line[0]
        tot += 1
        if in_line in diction:
            cnt += 1
            line.append("http://dbpedia.org/" + namespace + "/" + in_line)
            line.append(diction[in_line])
        else:
            line.append('')
            line.append('')
        final += ",".join(line)
        accum.append(",".join(line))
        final += '\n'

    # The accumulated string is written to the output file named in the
    # command-line arguments.
    f = open(project_name + "/" + output_file, 'w')
    f.write(final)
    print("**************************************")
    print("Total number of entities whose URI was found: " + str(cnt)
          + "\nTotal number of entities present: " + str(tot))
    return accum
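# Example usage of integrate (a minimal sketch; the namespace, file names
# and project folder are illustrative placeholders). The TSV is expected
# to hold "http://dbpedia.org/<namespace>/<name>\t<frequency>" pairs, one
# per line:
#
#     rows = integrate(namespace="property",
#                      uri_file="../utility/uri_counts.tsv",
#                      input_file="test_project/get_properties.csv",
#                      project_name="test_project")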