# -*- coding: utf-8 -*-
from collections import defaultdict

import pandas


def get_location_spotlight(n_lines=10):
    """
    Extracts locations with DBpedia Spotlight from the first `n_lines` lines
    of each one-page PDF, stopping early at the first line that contains
    "résumé" (the start of the abstract).
    :return: None; writes the results to a CSV file.
    """
    def filter_locations(list_annotations):
        # Keep only annotations typed as a place or an educational institution.
        dbpedia_types = ["Place", "Education"]
        return [v for v in list_annotations
                if any(t in v["types"] for t in dbpedia_types)]

    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    dict_locations = defaultdict(list)
    for page_idx, doc in one_pages.iteritems():
        first_lines = [l.lower() for l in doc.split("\n")[:n_lines]]
        # Truncate at the line where the abstract ("résumé") begins, if any.
        resume_in = [u"résumé" in t for t in first_lines]
        if any(resume_in):
            first_lines = first_lines[:resume_in.index(True)]
        text = " ".join(first_lines)
        result = get_spotlight_annotation(text)
        if not result:
            continue
        result = filter_locations(result)
        dict_locations[page_idx].extend(set(r["surfaceForm"] for r in result))
        print page_idx
    df = pandas.DataFrame(dict_locations.items(),
                          columns=["id", "1page_location"])
    df.to_csv("../../input/location_1pagePDF.csv",
              index_label=False, index=False)
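# `get_spotlight_annotation` is defined elsewhere in the project. A minimal
# sketch, assuming the `pyspotlight` client; the endpoint URL and the
# confidence/support thresholds are placeholders:
import spotlight


def get_spotlight_annotation(text,
                             endpoint="http://localhost:2222/rest/annotate"):
    """Return Spotlight annotations for `text`, or None on failure."""
    try:
        # pyspotlight returns a list of dicts with "surfaceForm" and "types"
        # keys, matching the accesses in get_location_spotlight above.
        return spotlight.annotate(endpoint, text, confidence=0.4, support=20)
    except Exception:
        # Spotlight raises when nothing is found or the request fails;
        # treat both as "no annotation".
        return None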
def create_nmf_graph():
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    dict_topics = nmf_clustering(one_pages)
    import pprint
    pprint.pprint(dict_topics)
    # Unfinished: the snap-based topic graph was never built.
    d3js_dict = defaultdict(list)
    # import snap as sn
    # G = sn.TUNGraph.New()
    # for topic_idx, topic in dict_topics.iteritems():
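# `load_text_data` is also defined elsewhere in the project. A plausible
# sketch, assuming it maps each file's basename (the article id) to its
# full text; the UTF-8 encoding is an assumption:
import io
import os


def load_text_data(directory, extension):
    """Read every *.extension file in `directory` into {basename: text}."""
    texts = {}
    for fname in os.listdir(directory):
        if fname.endswith("." + extension):
            path = os.path.join(directory, fname)
            with io.open(path, encoding="utf-8") as f:
                texts[os.path.splitext(fname)[0]] = f.read()
    return texts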
def run_models():
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    dict_topic_top_words, dict_doc_top_topics = nmf_clustering(one_pages)
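# `nmf_clustering` is external to this file as well. A minimal scikit-learn
# sketch matching the two-value unpacking above (topic -> top words,
# document -> top topics); the vectorizer settings and topic count are
# assumptions:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer


def nmf_clustering(docs, n_topics=20, n_top_words=10, n_top_topics=3):
    ids = list(docs.keys())
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
    tfidf = vectorizer.fit_transform(docs[i] for i in ids)
    model = NMF(n_components=n_topics, random_state=1)
    doc_topic = model.fit_transform(tfidf)  # document-topic weights
    vocab = vectorizer.get_feature_names()
    dict_topic_top_words = {
        t: [vocab[i] for i in comp.argsort()[:-n_top_words - 1:-1]]
        for t, comp in enumerate(model.components_)
    }
    dict_doc_top_topics = {
        doc_id: doc_topic[row].argsort()[::-1][:n_top_topics].tolist()
        for row, doc_id in enumerate(ids)
    }
    return dict_topic_top_words, dict_doc_top_topics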
import re

import numpy as np


# Fragment: the beginning of this function is missing from the source.
    # Finally, if the line matches one of the possible location markers,
    # keep it.
    if re.findall(possible_location, l):
        location_list.append(l)
    return location_list


def treat_asterisks(text_list, asterisks_idx):
    asterisks_idx = np.array(asterisks_idx)
    text_list = np.array(text_list)
    # Strip the asterisk markers (ASCII "*" and U+2217) from the flagged
    # lines. re.sub's fourth positional argument is `count`, so the flag
    # must be passed by keyword.
    return [re.sub(ur"[\u2217\*]+", "", l, flags=re.UNICODE)
            for l in text_list[asterisks_idx]]


egc_df = get_EGC_articles(
    load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))
one_pages = load_text_data("../../input/pdfs/1page/", "txt")
locations_dict = defaultdict(list)
for page_idx, doc in one_pages.iteritems():
    authors = [n.lower() for n in
               egc_df[egc_df["id"] == int(page_idx)]
               ["authors"].values[0].split(",")]
    # Drop near-empty lines, then look at lines 3-10, after the title and
    # author block.
    doc_split = [l for l in doc.split("\n") if len(l.strip()) > 2]
    first_lines = doc_split[2:10]
    text_lines = "\n".join(first_lines)
    # A leading asterisk marks an affiliation footnote.
    asterisk_idx = [l[0] in asterisks for l in first_lines]
    if any(asterisk_idx):
        info_location = treat_asterisks(first_lines, asterisk_idx)
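# The fragment above uses two module-level names that are never defined in
# this file; in the original module they would precede that code. Plausible
# placeholder definitions, purely as assumptions:
import re

# Characters that mark an affiliation footnote (ASCII "*" and U+2217).
asterisks = (u"*", u"\u2217")
# Hypothetical pattern of affiliation keywords used to spot location lines.
possible_location = re.compile(ur"universit|laboratoire|institut|cnrs|inria",
                               re.IGNORECASE | re.UNICODE)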
def get_1page_topics():
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    return nmf_clustering(one_pages)