示例#1
0
def extract_org_data(bio):
    """Re-type organizations found in ``bio`` and link them to mapped terms.

    For every ``POLITICALAFFILIATION``, ``DENOMINATION`` and ``SCHOOL`` tag in
    ``bio`` the organizations returned by ``get_org`` are updated in the
    module-level ``uber_graph``: the matching ``cwrc`` organization class is
    added as ``RDF.type`` and the generic ``org`` ``Organization`` type is
    removed.  For political and religious organizations, the term returned by
    ``culturalForm.get_mapped_term`` is additionally connected to the
    organization through ``cwrc`` ``hasOrganization``, but only when that term
    is exactly an ``rdflib.term.URIRef``.

    Args:
        bio: Parsed document exposing ``find_all(tag_name)``.
    """
    import culturalForm as cf
    global uber_graph

    # (tag name, cwrc class name, culturalForm mapping category or None when
    # no hasOrganization relation is produced for that tag)
    tag_rules = (
        ("POLITICALAFFILIATION", "PoliticalOrganization",
         "PoliticalAffiliation"),
        ("DENOMINATION", "ReligiousOrganization", "Religion"),
        ("SCHOOL", "EducationalOrganization", None),
    )

    for tag_name, class_name, category in tag_rules:
        for instance in bio.find_all(tag_name):
            org = get_org(instance)
            if not org:
                continue

            org_type = getattr(utilities.NS_DICT["cwrc"], class_name)

            for member in org:
                org_uri = get_org_uri(member)
                uber_graph.add((org_uri, RDF.type, org_type))
                uber_graph.remove(
                    (org_uri, RDF.type,
                     utilities.NS_DICT["org"].Organization))

                # Adding the hasOrganization relation
                if category is None:
                    continue

                # Looked up once per organization, exactly as often as the
                # term mapping was consulted before.
                mapped_value = cf.get_mapped_term(
                    category, cf.get_value(instance))
                if type(mapped_value) is rdflib.term.URIRef:
                    relation = utilities.NS_DICT["cwrc"].hasOrganization
                    uber_graph.add((mapped_value, relation, org_uri))
示例#2
0
def main():
    """Run the occupation extraction over every file chosen by ``parse_args``.

    Each file is parsed with BeautifulSoup, wrapped in a ``Biography`` and
    handed to ``extract_occupation_data``.  The person is passed to
    ``utilities.create_extracted_file`` with a per-person ``.ttl`` path and
    its graph is merged into one combined graph.  The combined graph is then
    passed to ``utilities.create_extracted_uberfile`` twice: once with the
    default format and once with ``"pretty-xml"``.  Finally, if the
    module-level ``fail_dict`` holds an entry for the cwrc Occupation URI,
    its per-term counts and their total are logged.
    """
    from bs4 import BeautifulSoup
    import culturalForm

    file_dict = utilities.parse_args(__file__, "Occupation")
    uber_graph = utilities.create_graph()

    for filename, file_info in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The id is the first six characters of the file's base name.
        person_id = filename.rsplit("/", 1)[-1][:6]

        print(filename)
        print(file_info)
        print(person_id)
        print("*" * 55)

        gender = culturalForm.get_mapped_term("Gender",
                                              utilities.get_sex(soup))
        person = Biography(person_id, soup, gender)
        extract_occupation_data(soup, person)

        person_graph = person.to_graph()

        utilities.create_extracted_file(
            "extracted_triples/occupation_turtle/" + person_id +
            "_occupations.ttl",
            person)

        uber_graph += person_graph

    print("UberGraph is size:", len(uber_graph))
    utilities.create_extracted_uberfile(
        "extracted_triples/occupations.ttl", uber_graph)
    utilities.create_extracted_uberfile(
        "extracted_triples/occupations.rdf", uber_graph, "pretty-xml")

    occupation_key = 'http://sparql.cwrc.ca/ontologies/cwrc#Occupation'
    if occupation_key not in fail_dict:
        return

    # Report every recorded occupation term together with its count.
    job_fail_dict = fail_dict[occupation_key]
    logger.info("Missed Terms: " + str(len(job_fail_dict.keys())))
    count = 0
    for term, misses in job_fail_dict.items():
        logger.info(term + " : " + str(misses))
        count += misses
    logger.info("Total Terms: " + str(count))
示例#3
0
def main():
    """Run the birth/death extraction over every file chosen by ``parse_args``.

    Each file is parsed with BeautifulSoup and wrapped in a ``Biography``;
    ``extract_birth_data`` and ``extract_death_data`` are applied, and the
    person's ``name`` is set from ``utilities.get_readable_name``.  The result
    of ``person.to_file()`` is printed, the person is passed to
    ``utilities.create_extracted_file`` with a per-person ``.ttl`` path, and
    its graph is merged into one combined graph.  The combined graph is then
    passed to ``utilities.create_extracted_uberfile`` twice: once with the
    default format and once with ``"pretty-xml"``.
    """
    from bs4 import BeautifulSoup
    import culturalForm
    from biography import Biography

    file_dict = utilities.parse_args(__file__, "Birth/Death")
    print("-" * 200)

    uber_graph = utilities.create_graph()

    for filename, file_info in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The id is the first six characters of the file's base name.
        person_id = filename.rsplit("/", 1)[-1][:6]

        print(filename)
        print(file_info)
        print(person_id)
        print("*" * 55)

        gender = culturalForm.get_mapped_term("Gender",
                                              utilities.get_sex(soup))
        person = Biography(person_id, soup, gender)
        extract_birth_data(soup, person)
        extract_death_data(soup, person)
        person.name = utilities.get_readable_name(soup)
        print(person.to_file())

        utilities.create_extracted_file(
            "extracted_triples/birthdeath_turtle/" + person_id +
            "_birthdeath.ttl",
            person)

        uber_graph += person.to_graph()
        print("=" * 55)

    print("UberGraph is size:", len(uber_graph))
    utilities.create_extracted_uberfile(
        "extracted_triples/birthdeath.ttl", uber_graph)
    utilities.create_extracted_uberfile(
        "extracted_triples/birthdeath.rdf", uber_graph, "pretty-xml")
示例#4
0
def main():
    """Run the "other contexts" extraction over every file from ``parse_args``.

    Each file is parsed with BeautifulSoup, wrapped in a ``Biography`` and
    handed to ``extract_other_contexts_data``.  The person is passed to
    ``utilities.create_extracted_file`` with a per-person ``.ttl`` path and
    its graph is merged into one combined graph.  The combined graph is then
    passed to ``utilities.create_extracted_uberfile`` twice: once with the
    default format and once with ``"pretty-xml"``.
    """
    from bs4 import BeautifulSoup
    import culturalForm

    ext_type = "Violence, Wealth, Leisure and Society, Other Life Event, Health contexts"
    file_dict = utilities.parse_args(__file__, ext_type)

    uber_graph = utilities.create_graph()

    for filename, file_info in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The id is the first six characters of the file's base name.
        person_id = filename.rsplit("/", 1)[-1][:6]

        print(filename)
        print(file_info)
        print(person_id)
        print("*" * 55)

        gender = culturalForm.get_mapped_term("Gender",
                                              utilities.get_sex(soup))
        person = Biography(person_id, soup, gender)
        extract_other_contexts_data(soup, person)

        person_graph = person.to_graph()

        utilities.create_extracted_file(
            "extracted_triples/other_contexts_turtle/" + person_id +
            "_other_contexts.ttl",
            person)

        uber_graph += person_graph

    print("UberGraph is size:", len(uber_graph))
    utilities.create_extracted_uberfile(
        "extracted_triples/other_contexts.ttl", uber_graph)
    utilities.create_extracted_uberfile(
        "extracted_triples/other_contexts.rdf", uber_graph, "pretty-xml")
示例#5
0
def main():
    """Run the enabled biography extractors over every file from ``parse_args``.

    Each file is parsed with BeautifulSoup and wrapped in a ``Biography``; the
    cultural-form, other-contexts, location, occupation, education and birth
    extractors are then applied in that order.  The person is passed to
    ``utilities.create_extracted_file`` with a per-person ``.ttl`` path and
    its graph is merged into one combined graph, which is written out at the
    end.  Start and end times, the total triple count, and the files that
    produced the most and the fewest triples are logged.
    """
    file_dict = utilities.parse_args(__file__,
                                     "Majority of biography related data")

    uber_graph = utilities.create_graph()

    highest_triples = 0
    # None (not 0) marks "no file seen yet": with 0 as the sentinel, a
    # biography that really yields zero triples would be overwritten as the
    # minimum by whichever file came next.
    least_triples = None
    smallest_person = None
    largest_person = None
    logger.info("Time started: " + utilities.get_current_time() + "\n")

    for filename, file_info in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(person_id)
        print(file_info)
        print("*" * 55)
        person = Biography(
            person_id, soup,
            cf.get_mapped_term("Gender", utilities.get_sex(soup)))
        cf.extract_cf_data(soup, person)
        other_contexts.extract_other_contexts_data(soup, person)
        location.extract_location_data(soup, person)
        occupation.extract_occupation_data(soup, person)
        education.extract_education_data(soup, person)

        # personname.extract_person_name(soup, person)
        birthDeath.extract_birth_data(soup, person)
        # birthDeath.extract_death(soup, person)
        # lifeInfo.extract_cohabitants(soup, person)
        # lifeInfo.extract_family(soup, person)
        # lifeInfo.extract_friends_associates(soup, person)
        # lifeInfo.extract_intimate_relationships(soup, person)
        # lifeInfo.extract_childlessness(soup, person)
        # lifeInfo.extract_children(soup, person)

        graph = person.to_graph()
        triple_count = len(graph)

        if triple_count > highest_triples:
            highest_triples = triple_count
            largest_person = filename
        if least_triples is None or triple_count < least_triples:
            least_triples = triple_count
            smallest_person = filename

        # triples to files
        temp_path = "extracted_triples/biography_turtle/" + person_id + "_biography.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    temp_path = "extracted_triples/biography_triples.ttl"
    # NOTE(review): called unqualified here, unlike the other utilities.*
    # helpers in this function - presumably imported at module level; confirm.
    create_extracted_uberfile(temp_path, uber_graph)

    cf.log_mapping_fails()
    logger.info(str(len(uber_graph)) + " total triples created")
    logger.info(
        str(largest_person) + " produces the most triples(" +
        str(highest_triples) + ")")
    # With no input files the minimum is still reported as 0, as before.
    reported_least = 0 if least_triples is None else least_triples
    logger.info(
        str(smallest_person) + " produces the least triples(" +
        str(reported_least) + ")")

    logger.info("Time completed: " + utilities.get_current_time())