Example #1
from collections import defaultdict

import pandas

def get_location_spotlight(n_lines=10):
    """
    Gets the location according to DBpedia Spotlight from the first `n_lines`
    lines of the one-page PDFs, or from the lines preceding the first
    occurrence of "résumé" (the French abstract header).
    """

    def filter_locations(list_annotations):
        # Keep only annotations whose DBpedia types mark a place or an
        # educational institution.
        dbpedia_types = ["Place", "Education"]
        return [v for v in list_annotations
                if any(t in v["types"] for t in dbpedia_types)]

    # egc_df = get_EGC_articles(load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    dict_locations = defaultdict(list)
    for page_idx, doc in one_pages.items():
        first_lines = [l.lower() for l in doc.split("\n")[:n_lines]]
        # Stop at the line that starts the abstract, if there is one.
        resume_in = ["résumé" in t for t in first_lines]
        if any(resume_in):
            first_lines = first_lines[:resume_in.index(True)]
        text = " ".join(first_lines)
        result = get_spotlight_annotation(text)
        if not result:
            continue
        result = filter_locations(result)
        dict_locations[page_idx].extend({r["surfaceForm"] for r in result})
        print(page_idx)  # progress
    df = pandas.DataFrame(list(dict_locations.items()),
                          columns=["id", "1page_location"])
    df.to_csv("../../input/location_1pagePDF.csv", index=False)
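The helper get_spotlight_annotation is not defined anywhere on this page. A minimal sketch of what it might look like against the public DBpedia Spotlight REST endpoint (the URL including its language segment, the confidence threshold, and the key normalisation are all assumptions); it returns the surfaceForm/types dicts that filter_locations above expects:

import requests

def get_spotlight_annotation(text, confidence=0.4):
    """Hypothetical helper: annotate `text` with DBpedia Spotlight."""
    resp = requests.post(
        "https://api.dbpedia-spotlight.org/en/annotate",
        data={"text": text, "confidence": confidence},
        headers={"Accept": "application/json"},
        timeout=30,
    )
    if resp.status_code != 200:
        return []
    # Spotlight prefixes JSON keys with "@"; expose the plain names used above.
    return [{"surfaceForm": r.get("@surfaceForm", ""),
             "types": r.get("@types", "")}
            for r in resp.json().get("Resources", [])]

Note that "types" arrives as a comma-separated string such as "Schema:Place,DBpedia:Place", so the substring test in filter_locations works on it directly.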
Example #2
import pprint
from collections import defaultdict

def create_nmf_graph():
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    dict_topics = nmf_clustering(one_pages)
    pprint.pprint(dict_topics)
    d3js_dict = defaultdict(list)  # placeholder for the d3.js graph payload
    # The topic-graph construction (e.g. with snap) was left unfinished:
    # import snap as sn
    # G = sn.TUNGraph.New()
    # for topic_idx, topic in dict_topics.items():
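nmf_clustering is likewise external to these snippets. Below is a minimal sketch of what such a helper could look like with scikit-learn, under the assumption that it returns the (topic → top words, document → ranked topics) pair unpacked in Example #3 below (Example #2 binds the result to a single name instead; the component count, vectorizer settings, and stop-word language are all guesses):

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

def nmf_clustering(docs, n_topics=10, n_top_words=10):
    """Hypothetical stand-in for the project's nmf_clustering helper."""
    doc_ids = list(docs)
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english")
    tfidf = vectorizer.fit_transform(docs[d] for d in doc_ids)
    nmf = NMF(n_components=n_topics, random_state=0)
    doc_topic = nmf.fit_transform(tfidf)  # shape: (n_docs, n_topics)
    vocab = vectorizer.get_feature_names_out()
    # Top words per topic: the largest entries of each topic's component vector.
    dict_topic_top_words = {
        t: [vocab[i] for i in comp.argsort()[::-1][:n_top_words]]
        for t, comp in enumerate(nmf.components_)
    }
    # Topics per document, ranked by decreasing weight.
    dict_doc_top_topics = {
        doc_ids[i]: doc_topic[i].argsort()[::-1].tolist()
        for i in range(len(doc_ids))
    }
    return dict_topic_top_words, dict_doc_top_topics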
Example #3
def run_models():
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    dict_topic_top_words, dict_doc_top_topics = nmf_clustering(one_pages)
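Every example begins with load_text_data, which is also not shown. A plausible reconstruction (the stem-as-id naming convention and the UTF-8 encoding are assumptions; the stems must be numeric ids, since the examples call int(page_idx)):

from pathlib import Path

def load_text_data(folder, extension):
    """Hypothetical helper: map each file's stem (the article id)
    to the file's text content."""
    return {
        path.stem: path.read_text(encoding="utf-8")
        for path in sorted(Path(folder).glob("*." + extension))
    }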
Example #4
            # finally, if we found one of the possible location markers, we keep the line
            if re.findall(possible_location, l):
                location_list.append(l)
        return location_list

    def treat_asterisks(text_list, asterisks_idx):
        # Strip footnote markers (ASCII "*" and U+2217) from the flagged lines.
        asterisks_idx = np.array(asterisks_idx)
        text_list = np.array(text_list)
        return [re.sub(r"[\u2217*]+", "", l) for l in text_list[asterisks_idx]]

    egc_df = get_EGC_articles(
        load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    locations_dict = defaultdict(list)
    for page_idx, doc in one_pages.items():
        authors = [n.lower() for n in
                   egc_df[egc_df["id"] == int(page_idx)]["authors"].values[0].split(",")]
        doc_split = [l for l in doc.split("\n") if len(l.strip()) > 2]
        first_lines = doc_split[2:10]
        text_lines = "\n".join(first_lines)
        # Lines opening with a footnote marker usually carry the affiliation;
        # `asterisks` is defined in the part of the function truncated above.
        asterisk_idx = [l[0] in asterisks for l in first_lines]
        if any(asterisk_idx):
            info_location = treat_asterisks(first_lines, asterisk_idx)
            # print(info_location)
            # print(text_lines)
Example #5
def get_1page_topics():
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    return nmf_clustering(one_pages)
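As a quick sanity check of the footnote-marker stripping in Example #4 (the affiliation string below is invented):

import re

line = "\u2217Université de Lyon, LIRIS UMR 5205"
# The character class covers both the ASCII "*" and the math asterisk U+2217.
print(re.sub(r"[\u2217*]+", "", line))  # Université de Lyon, LIRIS UMR 5205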