Пример #1
0
def generate_templates(label,project_name,depth=1,output_file="sentence_and_template_generator", paraphraser=False):
    """
    Wrapper driving the whole template-generation pipeline.

    Resolves the DBpedia mappings URL for ``label``, loads the rank
    dictionary, then generates question/SPARQL templates for every
    property of the class, recursing ``depth`` levels deep.

    Parameters
    ----------
    label : str
        Ontology class label used to resolve the mappings URL.
    project_name : str
        Output directory; created if it does not exist.
    depth : int, optional
        Recursion depth for template generation (coerced to int).
    output_file : str, optional
        Name of the main output file created inside ``project_name``.
    paraphraser : bool, optional
        When True, additionally writes paraphrased questions to
        ``expand.csv`` using a pretrained paraphrasing model.
    """
    val = generate_url(label)
    url = val[0]
    about = val[1]
    count = 0
    vessel = []
    depth = int(depth)
    # Pre-computed rank dictionary used to score generated templates.
    diction = fetch_ranks("../utility/part-r-00000")
    if not os.path.isdir(project_name):
        os.makedirs(project_name)
    output_file = open(project_name + "/" + output_file, 'w')
    test_set = open(project_name + "/" + "test.csv", 'w')
    expand_set = None
    if paraphraser:
        expand_set = open(project_name + "/" + "expand.csv", 'w')
    # One bucket per recursion level; records properties already handled.
    prop_dic = {}
    for iterator in range(depth):
        prop_dic[iterator] = []

    # Configure the root logger to write into the project directory.
    logger = logging.getLogger()
    logging.basicConfig(filename=project_name + "/logfile.log",
                        format='%(filename)s: %(message)s', filemode='w')
    logger.setLevel(logging.WARNING)
    logger.info("This is a log file.")

    tokenizer = device = model = None
    if paraphraser:
        # Load the pretrained paraphrasing model once, up front.
        folder_path = get_pretrained_model(const.URL)
        set_seed(42)
        tokenizer, device, model = prepare_model(folder_path)

    try:
        list_of_property_information = get_properties(url=url, project_name=project_name, output_file="get_properties.csv")
        for property_line in list_of_property_information:
            count += 1
            prop = property_line.split(',')
            print("**************\n" + str(prop))
            if paraphraser:
                sentence_and_template_generator(
                    original_count=depth, prop_dic=prop_dic, test_set=test_set,
                    log=logger, diction=diction, output_file=output_file,
                    mother_ontology=about.strip().replace("http://dbpedia.org/ontology/", "dbo:"),
                    vessel=vessel, project_name=project_name, prop=prop,
                    suffix=" of <A> ?", count=depth, expand_set=expand_set,
                    tokenizer=tokenizer, device=device, model=model)
            else:
                sentence_and_template_generator(
                    original_count=depth, prop_dic=prop_dic, test_set=test_set,
                    log=logger, diction=diction, output_file=output_file,
                    mother_ontology=about.strip().replace("http://dbpedia.org/ontology/", "dbo:"),
                    vessel=vessel, project_name=project_name, prop=prop,
                    suffix=" of <A> ?", count=depth)
    finally:
        # Close every handle we opened: the original leaked test_set and
        # expand_set, risking unflushed data on exit.
        output_file.close()
        test_set.close()
        if expand_set is not None:
            expand_set.close()
Пример #2
0
def generate_templates(label,project_name,depth=1,output_file="sentence_and_template_generator"):
    """
    Generate question/SPARQL templates for the ontology class *label*.

    Thin wrapper: resolves the mappings URL, loads the rank dictionary,
    then feeds every property of the class to
    ``sentence_and_template_generator``.

    NOTE(review): the ``depth`` parameter is accepted but never used;
    the recursion depth forwarded below is hard-coded to 2.
    """
    generated = generate_url(label)
    url = generated[0]
    about = generated[1]
    count = 0
    vessel = []

    # Pre-computed rank dictionary used to score generated templates.
    diction = fetch_ranks("../utility/part-r-00000")
    if not os.path.isdir(project_name):
        os.makedirs(project_name)
    out_handle = open(project_name + "/" + output_file, 'w')

    # Configure the root logger to write into the project directory.
    logger = logging.getLogger()
    logging.basicConfig(filename=project_name + "/logfile.log",
                        format='%(filename)s: %(message)s', filemode='w')
    logger.setLevel(logging.DEBUG)
    logger.info("This is a log file.")

    mother = about.strip().replace("http://dbpedia.org/ontology/", "dbo:")
    for property_line in get_properties(url=url,
                                        project_name=project_name,
                                        output_file="get_properties.csv"):
        count += 1
        prop = property_line.split(',')
        print("**************\n" + str(prop))
        sentence_and_template_generator(log=logger, diction=diction,
                                        output_file=out_handle,
                                        mother_ontology=mother,
                                        vessel=vessel,
                                        project_name=project_name,
                                        prop=prop, suffix=" of <A> ?",
                                        count=2)
    out_handle.close()
Пример #3
0
def sentence_and_template_generator(prop_dic,
                                    test_set,
                                    log,
                                    mother_ontology,
                                    vessel,
                                    prop,
                                    project_name,
                                    output_file,
                                    diction,
                                    expand_set=None,
                                    tokenizer=None,
                                    device=None,
                                    model=None,
                                    original_count=0,
                                    count=0,
                                    suffix=" of <A> ?",
                                    query_suffix=""):
    """
    Recursively generate natural-language questions and SPARQL templates
    for one property, then recurse into the property's range class.

    Rows are written to ``output_file`` ("Original"), ``test_set`` (repeat
    properties) and ``expand_set`` ("Paraphrased", only when *device* is
    set).  All rows are also accumulated into ``vessel``.

    Parameters
    ----------
    prop_dic : dict[int, list]
        Per-recursion-level record of properties already processed.
    test_set : file
        Open handle receiving test-set rows.
    log : logging.Logger
        Logger for generated rows.
    mother_ontology : str
        ``dbo:``-prefixed class the property belongs to.
    vessel : list
        Accumulator of every generated row.
    prop : str | list
        Property record ``[name, label, ..., range]``; split on ',' if str.
    expand_set : file, optional
        Open handle for paraphrased rows; required when *device* is set.
        (The original used a mutable default ``[]`` — now ``None``.)
    tokenizer, device, model : optional
        Paraphrasing model pieces; paraphrasing runs only if *device* is
        truthy.
    original_count : int
        Recursion depth requested at the top-level call.
    count : int
        Remaining recursion depth; recursion stops at 0.
    suffix : str
        Natural-language question suffix; grows with each recursion.
    query_suffix : str
        SPARQL property-path prefix; grows with each recursion.
    """
    if type(prop) == str:
        prop = prop.split(',')
    natural_language_question = []
    sparql_query = []
    expanded_nl_question = []
    expanded_sparql_query = []
    # Question/query scaffolding: row 0 = question openers, row 1 = query
    # openers, row 2 = query closers.  (Original leaked this file handle.)
    with open("../utility/question_form.csv", 'r') as form_file:
        question_form = form_file.readlines()
    question_starts_with = question_form[0].split(',')
    query_starts_with = question_form[1].split(',')
    query_ends_with = question_form[2].split(',')
    # Pick question phrasings based on the property's range type.
    if prop[3] == "owl:Thing" or prop[3] == "xsd:string":
        question_number = [2, 4]
    elif prop[3] == "Place":
        question_number = [3, 4]
    elif prop[3] == "Person":
        question_number = [1, 4]
    elif (prop[3] == "xsd:date" or "date" in prop[3] or "year" in prop[3]
          or "time" in prop[3]):
        # (Original tested `"date" in prop[3]` twice; duplicate removed.)
        question_number = [0, 4, 5]
    elif (prop[3] == "xsd:nonNegativeInteger"
          or "negative" in prop[3].lower()):
        question_number = [2, 6]
    elif prop[3] == "xsd:integer" or "integer" in prop[3].lower():
        question_number = [2, 6]
    else:
        question_number = [2]

    val = generate_url_spec(prop[0])
    prop_link = val[0]
    if prop_link == "None" or prop_link is None:
        return
    derived = val[1]
    prop_link = "dbo:" + prop_link.strip().split(
        'http://dbpedia.org/ontology/')[-1]

    # Build one question/query pair per selected phrasing.
    for number in question_number:
        original_question = question_starts_with[number] + prop[1] + suffix
        original_sparql = query_starts_with[
            number] + "where { <A>  " + query_suffix + prop_link + " ?x " + query_ends_with[
                number]
        natural_language_question.append(original_question)
        sparql_query.append(original_sparql)

    if query_suffix == "":
        query_answer = ("select distinct(?a) where { ?a " + prop_link +
                        " []  } ")
    else:
        query_answer = ("select distinct(?a) where { ?a " +
                        query_suffix.split(" ")[0] + " [] . ?a  " +
                        query_suffix + " " + prop_link + " ?x } ")

    # Skip this property if the ASK form of the query has no answer.
    # (Original had an if/else here whose two branches were identical.)
    flag = check_query(log=log,
                       query=query_answer.replace("select distinct(?a)",
                                                  "ask"))
    if not flag:
        return

    rank = rank_check(diction=diction,
                      count=count,
                      query=query_answer,
                      original_count=original_count)

    count = count - 1
    if count == 0:
        variable = "?x"
    else:
        variable = "?x" + str(count)
    query_suffix = prop_link + " " + variable + " . " + variable + " "
    if not prop[0] in prop_dic[original_count - count - 1]:
        # First time we see this property at this level: emit "Original"
        # rows (plus paraphrases at the top level when a model is given).
        for number in range(len(natural_language_question)):
            if count == original_count - 1 and device:
                # NOTE(review): original_question/original_sparql here are
                # the values left over from the LAST iteration of the
                # template-building loop above, not element `number` —
                # behavior preserved from the original; confirm intent.
                final_candidates = paraphrase_questions(
                    tokenizer, device, model, original_question)
                final_quesiton = pick_final_sentence(original_question,
                                                     final_candidates)

                expanded_nl_question.append(final_quesiton)
                expanded_sparql_query.append(original_sparql)
            if expanded_sparql_query:
                expand_line = [
                    mother_ontology, "", "", expanded_nl_question[number],
                    expanded_sparql_query[number], query_answer
                ]
                expand_set.write(
                    (';'.join(expand_line) + ";" + str(rank) + ";" +
                     "Paraphrased" + "\n").replace("  ", " "))
            vessel.append([
                mother_ontology, "", "", natural_language_question[number],
                sparql_query[number], query_answer
            ])
            output_file.write((';'.join(vessel[-1]) + ";" + str(rank) + ";" +
                               "Original" + "\n").replace("  ", " "))
            log.info(';'.join(vessel[-1]) + str(rank) + "\n")

    else:
        # Property already seen at this level: route rows to the test set.
        for number in range(len(natural_language_question)):
            # NOTE(review): expanded_sparql_query is always empty on this
            # path (it is only appended to in the branch above), so this
            # write never fires — dead code preserved from the original.
            if expanded_sparql_query:
                expand_line = [
                    mother_ontology, "", "", expanded_sparql_query[number],
                    expanded_sparql_query[number], query_answer
                ]
                expand_set.write(
                    (';'.join(expand_line) + ";" + str(rank) + "\n").replace(
                        "  ", " "))
            vessel.append([
                mother_ontology, "", "", natural_language_question[number],
                sparql_query[number], query_answer
            ])
            test_set.write(
                (';'.join(vessel[-1]) + ";" + str(rank) + "\n").replace(
                    "  ", " "))
            print("++++++++++++++++++++", vessel[-1], "+++++++++++++++")
            log.info("Test: " + ';'.join(vessel[-1]) + str(rank) + "\n")

    prop_dic[original_count - count - 1].append(prop[0])

    # Deepen the question suffix for the next recursion level.
    suffix = " of " + prop[1] + " of <A> ?"

    if count > 0:
        print(prop[3].split(":")[-1])
        val = generate_url(prop[3].split(":")[-1])
        url = val[0]
        # Only recurse into classes that resolve to a DBpedia mappings page.
        if not url.startswith("http://mappings.dbpedia.org"):
            return
        list_of_property_information = get_properties(
            url=url, project_name=project_name, output_file=prop[1] + ".csv")
        for property_line in tqdm(list_of_property_information):
            prop_inside = property_line.split(',')
            sentence_and_template_generator(expand_set=expand_set,
                                            prop_dic=prop_dic,
                                            test_set=test_set,
                                            log=log,
                                            original_count=original_count,
                                            diction=diction,
                                            output_file=output_file,
                                            mother_ontology=mother_ontology,
                                            vessel=vessel,
                                            prop=prop_inside,
                                            suffix=suffix,
                                            count=count,
                                            project_name=project_name,
                                            query_suffix=query_suffix,
                                            tokenizer=tokenizer,
                                            device=device,
                                            model=model)
Пример #4
0
    for number in range(len(natural_language_question)):
        vessel.append([mother_ontology, "", "", natural_language_question[number], sparql_query[number], query_answer])
        output_file.write((';'.join(vessel[-1]) + ";" + str(rank) + "\n").replace("  ", " "))
        log.info(';'.join(vessel[-1]) + str(rank) + "\n")
    # print(str(natural_language_question)+"\n"+str(sparql_query)+"\n"+query_answer+"\n*************")

    suffix = " of " + prop[1] + " of <A> ?"

    if (count > 0):
        print(prop[3].split(":")[-1])
        val = generate_url(prop[3].split(":")[-1])
        url = val[0]
        if (not url.startswith("http://mappings.dbpedia.org")):
            return
        list_of_property_information = get_properties(url=url, project_name=project_name, output_file=prop[1] + ".csv")
        for property_line in tqdm(list_of_property_information):
            prop_inside = property_line.split(',')
            sentence_and_template_generator(log=log, original_count=original_count, diction=diction,
                                            output_file=output_file, mother_ontology=mother_ontology, vessel=vessel,
                                            prop=prop_inside, suffix=suffix, count=count, project_name=project_name,
                                            query_suffix=query_suffix)


if __name__ == "__main__":
    """
    Section to parse the command line arguments.
    """
    parser = argparse.ArgumentParser()
    requiredNamed = parser.add_argument_group('Required Arguments')
Пример #5
0
def integrate(namespace,
              uri_file,
              output_file="integrate.csv",
              project_name="test_project",
              url="Enter a valid URL",
              input_file="Pleaes enter a valid file name"):
    """
    Join URI/frequency data from a TSV dump onto property rows.

    Reads ``uri_file`` (tab-separated ``<uri>\\t<frequency>`` lines),
    keeping only entries whose second-to-last URI path segment equals
    *namespace*.  Each input row (from ``input_file`` when run as a
    script, otherwise fetched via ``get_properties``) is annotated with
    the rebuilt URI and its frequency, or two empty fields when the name
    is not in the dictionary.  The annotated rows are written to
    ``project_name/output_file`` and returned.

    Returns
    -------
    list[str]
        The comma-joined annotated rows, one per input line.
    """
    print("Reading the TSV file: ")
    diction = {}
    with open(uri_file, 'r') as open_tsv:
        for raw in tqdm(open_tsv.readlines()):
            parts = raw.strip().split('\t')
            # Keep only URIs that live in the requested namespace.
            if parts[0].split('/')[-2] != namespace:
                continue
            diction[parts[0].split('/')[-1]] = parts[1]

    # Script mode reads rows from disk; library mode fetches them live.
    if __name__ == "__main__":
        print("Reading the input file: ")
        with open(input_file, 'r') as open_inp:
            line_inp = open_inp.readlines()
    else:
        line_inp = get_properties(url=url,
                                  output_file="get_properties.csv",
                                  project_name=project_name)

    cnt, tot = 0, 0
    accum = []
    for in_line in tqdm(line_inp):
        line = in_line.strip().split(',')
        name = line[0]
        tot += 1
        if name in diction:
            cnt += 1
            line.append("http://dbpedia.org/" + namespace + "/" + name)
            line.append(diction[name])
        else:
            # Unknown entity: keep the column count stable with empty fields.
            line.append('')
            line.append('')
        accum.append(",".join(line))

    # Write each annotated row on its own line.  The original built one
    # giant string with += (quadratic) and never closed this handle,
    # risking unflushed output.
    with open(project_name + "/" + output_file, 'w') as out_handle:
        out_handle.writelines(row + "\n" for row in accum)
    print("**************************************")
    print("Total number of entity whose URI was found: " + str(cnt) +
          "\nTotal number of entities present: " + str(tot))
    return accum