def get_associated_words(pages, words, sections=None):
    """Count the words that appear alongside each query word.

    A sentence (an item produced by ``clean_pdf_page``) is considered a hit
    for a query word when its lower-cased text contains the word as a
    substring. Every hit contributes its lower-cased, whitespace-separated
    tokens to the tally for that word, except tokens equal to the query word
    itself and tokens found in ``stop`` (expected to be defined elsewhere in
    this module).

    Arguments:
        pages {list} -- pages to search; each one is passed to clean_pdf_page
        words {list} -- query words; stripped and lower-cased before matching

    Keyword Arguments:
        sections -- not used by this function (default: {None})

    Returns:
        dict -- {word: pandas.Series}, where the Series is the
                ``value_counts()`` of the co-occurring tokens for that word
    """
    words = [w.strip().lower() for w in words]

    # Clean and tokenise the pages once up front. Doing this inside the
    # per-word loop repeats identical work for every query word and silently
    # yields nothing after the first word when `pages` is a one-shot iterator.
    tokenised = []
    for page in pages:
        for sentance in clean_pdf_page(page):
            lowered = sentance.lower()
            tokenised.append((lowered, lowered.split()))

    outputs = {}
    for word in words:
        all_matches = []
        for lowered, tokens in tokenised:
            if word in lowered:
                all_matches.extend(
                    token
                    for token in tokens
                    if token != word and token not in stop
                )
        outputs[word] = pd.Series(all_matches).value_counts()
    return outputs
def get_words_in_sentances(pages, words, sections=None):
    """Find every sentence that mentions each query word.

    A sentence (an item produced by ``clean_pdf_page``) matches a query word
    when its lower-cased text contains the word as a substring.

    Arguments:
        pages {list} -- pages to search; each one is passed to clean_pdf_page
        words {list} -- query words; stripped and lower-cased before matching

    Keyword Arguments:
        sections -- optional lookup indexed by 1-based page number; when
                    given, its value for the page is stored in a "Section"
                    column (default: {None})

    Returns:
        dict -- {word: pandas.DataFrame} with one row per matching sentence
                and the columns "Sentance", "Page" (1-based) and, when
                sections is given, "Section"
    """
    words = [w.strip().lower() for w in words]

    # Clean the pages once up front. Doing this inside the per-word loop
    # repeats identical work for every query word and silently yields
    # nothing after the first word when `pages` is a one-shot iterator.
    located = []
    for page_ind, page in enumerate(pages):
        for sentance in clean_pdf_page(page):
            located.append((page_ind + 1, sentance, sentance.lower()))

    outputs = {}
    for word in words:
        all_matches = []
        for page_num, sentance, lowered in located:
            if word not in lowered:
                continue
            d = {"Sentance": sentance, "Page": page_num}
            if sections is not None:
                d["Section"] = sections[page_num]
            all_matches.append(d)
        outputs[word] = pd.DataFrame(all_matches)
    return outputs
def get_headers(pages):
    """Find section headers and numbered sub-headers in the given pages.

    A line (an item produced by ``clean_pdf_page``) is a header candidate if
    it either starts with "Section" and does not contain "page", or begins
    with a dotted section id (3.2.1, 1.1, etc) and contains no "..." run.

    Candidates are then filtered so that numbered headers follow a running
    major section number: a numbered header is kept only when its leading
    number equals the current major number or is exactly one greater (which
    advances the current number). Lines containing "section" are always kept,
    except a bare "section." line.

    Arguments:
        pages {list} -- list of pages to extract headers from

    Returns:
        pandas.DataFrame -- one row per kept header, with the columns
                            "Page Number" (1-based) and "Header"
    """
    # Raw strings: "\d" and "\." in a normal string literal are invalid
    # escape sequences and raise a SyntaxWarning on recent Python versions.
    numbered_header = re.compile(r"^(\d+\.\d+\.*)(?![\d\.])")
    dot_run = re.compile(r"\.\.\.")

    candidates = []
    for page_num, page in enumerate(pages, start=1):
        for line in clean_pdf_page(page):
            starts_section = line.startswith("Section") and "page" not in line
            if starts_section or (
                numbered_header.search(line) and not dot_run.search(line)
            ):
                candidates.append((page_num, line))

    last_num = 1
    cleaned_results = []
    cleaned_page_nums = []
    for page_num, header in candidates:
        lowered = str(header).lower()
        if lowered == "section.":
            continue
        if "section" not in lowered:
            # Numbered header: only accept the current major section number
            # or the next one in sequence; anything else is dropped.
            major = int(header.split(".")[0])
            if major == last_num + 1:
                last_num = major
            elif major != last_num:
                continue
        cleaned_results.append(header)
        cleaned_page_nums.append(page_num)

    df = pd.DataFrame(
        [cleaned_page_nums, cleaned_results], index=["Page Number", "Header"]
    ).T
    return df
def get_money(pages, sections=None):
    """Collect every sentence that mentions money.

    A sentence (an item produced by ``clean_pdf_page``) matches when its
    lower-cased text contains any of "$", "dollar" or "money" as a substring.
    Each matching sentence is reported once, in page order.

    Arguments:
        pages {list} -- pages to search; each one is passed to clean_pdf_page

    Keyword Arguments:
        sections -- optional lookup indexed by 1-based page number; when
                    given, its value for the page is stored in a "Section"
                    column (default: {None})

    Returns:
        pandas.DataFrame -- one row per matching sentence with the columns
                            "Sentance", "Page" (1-based) and, when sections
                            is given, "Section"
    """
    money_terms = ("$", "dollar", "money")

    # All terms are checked in a single pass over the pages. The result list
    # must not be reset per term, otherwise only the matches for the last
    # term ("money") would survive.
    all_matches = []
    for page_ind, page in enumerate(pages):
        page_num = page_ind + 1
        for sentance in clean_pdf_page(page):
            lowered = sentance.lower()
            if not any(term in lowered for term in money_terms):
                continue
            d = {"Sentance": sentance, "Page": page_num}
            if sections is not None:
                d["Section"] = sections[page_num]
            all_matches.append(d)
    return pd.DataFrame(all_matches)
def run_query(pages, sections, all_queries):
    """Score sections by the weighted query terms found in their sentences.

    Every sentence (an item produced by ``clean_pdf_page``) is checked
    against every ``(term, weight)`` pair; a case-insensitive substring hit
    adds the weight to the total of the page's section and to that section's
    per-term tally.

    Arguments:
        pages {list} -- pages to search; each one is passed to clean_pdf_page
        sections -- lookup indexed by 1-based page number giving the section
                    each page belongs to
        all_queries -- iterable of (term, weight) pairs

    Returns:
        tuple -- (pandas.DataFrame with a single "Weight" column indexed by
                 section, containing only sections with at least one hit;
                 dict of {section: {term: accumulated weight}} with an entry
                 for every section visited)
    """
    totals = {}
    per_term = {}
    for page_number, page in enumerate(pages, start=1):
        section = sections[page_number]
        if section not in per_term:
            per_term[section] = {}
        term_scores = per_term[section]

        for sentance in clean_pdf_page(page):
            for term, weight in all_queries:
                if term.lower() not in sentance.lower():
                    continue
                term_scores[term] = term_scores.get(term, 0) + weight
                totals[section] = totals.get(section, 0) + weight

    weights = pd.Series(totals).to_frame("Weight")
    return weights, per_term
def get_figures_tables(pages, sections=None):
    """Find sentences that look like table or figure captions.

    A sentence (an item produced by ``clean_pdf_page``) matches a keyword
    when its lower-cased text contains the keyword followed by a space and a
    digit ("table 3", "figure 12") and contains none of " in ", " refer ",
    " according " or " to ".

    Arguments:
        pages {list} -- pages to search; each one is passed to clean_pdf_page

    Keyword Arguments:
        sections -- optional lookup indexed by 1-based page number; when
                    given, its value for the page is stored in a "Section"
                    column (default: {None})

    Returns:
        dict -- {"Table": DataFrame, "Figure": DataFrame}; each frame has one
                row per matching sentence with the columns "Sentance", "Page"
                (1-based) and, when sections is given, "Section"
    """
    excluded_phrases = (" in ", " refer ", " according ", " to ")

    # Clean the pages once rather than once per keyword; this also keeps the
    # second keyword working when `pages` is a one-shot iterator.
    located = []
    for page_ind, page in enumerate(pages):
        for sentance in clean_pdf_page(page):
            located.append((page_ind + 1, sentance, sentance.lower()))

    outputs = {}
    for word in ["Table", "Figure"]:
        # Raw f-string: "\d" in a normal string literal is an invalid escape
        # sequence and raises a SyntaxWarning on recent Python versions.
        caption = re.compile(rf"{word.lower()} \d")
        all_matches = []
        for page_num, sentance, lowered in located:
            if not caption.search(lowered):
                continue
            if any(phrase in lowered for phrase in excluded_phrases):
                continue
            d = {"Sentance": sentance, "Page": page_num}
            if sections is not None:
                d["Section"] = sections[page_num]
            all_matches.append(d)
        outputs[word] = pd.DataFrame(all_matches)
    return outputs