def compute_top_n_tokens_for_each_doc(top_n, first_id, last_id):
    """Print the top_n most frequent tokens for each paper in [first_id, last_id].

    Builds a dict mapping pdf name -> list of (token, count) pairs, sorted by
    descending frequency, and prints it with Printer.print_dict.
    """
    models.connect_to_db(conf.DATABASE_FILENAME)
    cleaner = Cleaner()
    # BUG FIX: `increments` was referenced but never defined (NameError).
    # 10 matches the batch size used by the other read-only passes in this file.
    increments = 10
    top_n_tokens_per_paper = {}
    for i in range(first_id, last_id + 1, increments):
        papers_to_process = ids_to_query(i, increments, last_id)
        for paper_id in papers_to_process:
            # BUG FIX: the filter used models.Papers.id against a Papers_NR
            # select; filter on the table actually being queried.
            paper_query = models.Papers_NR.select().where(
                models.Papers_NR.id == paper_id)
            if DEBUG:
                print(paper_query)
                print(len(paper_query))
            if len(paper_query) > 0:
                paper_content = paper_query[0].paper_text
                # Consistency fix: every other block reads `.pdf_name`;
                # `.paper_name` is not a Papers_NR field anywhere else here.
                pdf_name = paper_query[0].pdf_name
                tokens = cleaner.tokenize(paper_content)
                token_frequencies = {}
                for token in tokens:
                    token_frequencies[token] = token_frequencies.get(token, 0) + 1
                # Stable sort: ties keep first-seen order, same as the original.
                sorted_tokens = sorted(token_frequencies.items(),
                                       key=lambda kv: kv[1], reverse=True)
                top_n_tokens_per_paper[pdf_name] = sorted_tokens[:top_n]
    models.close_connection()
    printer = Printer()
    printer.print_dict(top_n_tokens_per_paper)
def insert_title_column_to_nr_nsw_table():
    """Backfill Papers_NR_NSW rows with a human-readable title per paper.

    The title is derived from the pdf filename: strip the ".pdf" suffix and
    replace dashes with spaces. Processes papers in batches, sleeping one
    second between batches to avoid hammering the database.
    """
    # BUG FIX: was a bare DATABASE_FILENAME (NameError); siblings use conf.
    models.connect_to_db(conf.DATABASE_FILENAME)
    last_id_query = models.Papers_NR.select().order_by(
        models.Papers_NR.id.desc()).limit(1)
    first_id = 1
    last_id = last_id_query[0].id
    # BUG FIX: removed debug leftover `last_id = 100` that silently capped
    # the run at the first 100 papers.
    increments = 50
    counter = 0  # BUG FIX: was incremented without ever being initialized.
    for i in range(first_id, last_id + 1, increments):
        papers_to_process = create_list_of_ids(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers_NR.select().where(
                models.Papers_NR.id == paper_id)
            if len(paper_query) > 0:
                paper_pdf_name = paper_query[0].pdf_name
                # BUG FIX: `cleaned_content` was undefined (NameError);
                # carry the source row's text through unchanged.
                paper_content = paper_query[0].paper_text
                title = paper_pdf_name.split(".pdf")[0]
                print("Title before replace: {0}".format(title))
                title = title.replace("-", " ")
                print("Title after replace: {0}".format(title))
                # BUG FIX: the computed title was never written; include it,
                # matching the paper_title field used by clean_papers().
                new_entry = models.Papers_NR_NSW.create(
                    id=paper_id, pdf_name=paper_pdf_name,
                    paper_text=paper_content, paper_title=title)
                new_entry.save()
        counter += increments
        print("Number of documents cleaned: {0}".format(counter))
        print("Sleeping for one second ...")
        time.sleep(1)
    models.close_connection()
def clean_papers_from_db():
    """Copy each Papers row into Papers_NR with its reference section removed.

    Iterates over all paper ids in batches of 10, strips the reference section
    from each paper_text via remove_reference_section, and creates a matching
    Papers_NR row. Sleeps one second between batches.
    """
    # BUG FIX: was a bare DATABASE_FILENAME; every sibling uses conf.
    models.connect_to_db(conf.DATABASE_FILENAME)
    last_id_query = models.Papers.select().order_by(
        models.Papers.id.desc()).limit(1)
    first_id = 1
    last_id = last_id_query[0].id
    increments = 10
    for i in range(first_id, last_id + 1, increments):
        papers_to_process = create_list_of_ids(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers.select().where(
                models.Papers.id == paper_id)
            print(paper_query)
            print(len(paper_query))
            if len(paper_query) > 0:
                paper_content = paper_query[0].paper_text
                paper_pdf_name = paper_query[0].pdf_name
                print("Removing reference section from paper id: {0}".format(
                    paper_id))
                new_content = remove_reference_section(paper_content)
                print("Saving new paper_text into papers_for_index")
                new_entry = models.Papers_NR.create(id=paper_id,
                                                    pdf_name=paper_pdf_name,
                                                    paper_text=new_content)
                print("Number of rows modified: {0}".format(new_entry.save()))
        print("Sleeping for one second ...")
        time.sleep(1)
    models.close_connection()
def save_cleaned_files(input_directory):
    """Load every manually-cleaned file in input_directory into Papers_NR.

    Each filename encodes the paper id and its original pdf name; the file's
    rows become the stored paper_text.
    """
    models.connect_to_db(conf.DATABASE_FILENAME)
    for name in io.list_files_in_dir(input_directory):
        print("Saving {} into DB".format(name))
        row = models.Papers_NR.create(
            id=extract_id_from_filename(name),
            pdf_name=create_pdf_filename(name),
            paper_text=io.load_file_rows(
                conf.IRREGULAR_PAPERS_DIRECTORY + "/" + name))
        print("Number of rows modified: {0}".format(row.save()))
    models.close_connection()
def clean_papers():
    """Clean every Papers_NR row and store the result in Papers_NR_NSW.

    For each paper: derive a title from the pdf name, clean the text line by
    line with text_cleaner.Cleaner, drop lines that clean down to nothing,
    and insert the result with its title. Batches of 50 with a one-second
    pause between batches.
    """
    models.connect_to_db(conf.DATABASE_FILENAME)
    last_id_query = models.Papers_NR.select().order_by(
        models.Papers_NR.id.desc()).limit(1)
    first_id = 1
    last_id = last_id_query[0].id
    increments = 50
    # Single Cleaner instance: the original built an unused one here and then
    # re-created another per paper inside the loop.
    cleaner = text_cleaner.Cleaner()
    counter = 0
    for i in range(first_id, last_id + 1, increments):
        papers_to_process = create_list_of_ids(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers_NR.select().where(
                models.Papers_NR.id == paper_id)
            if len(paper_query) > 0:
                paper_content = paper_query[0].paper_text
                paper_pdf_name = paper_query[0].pdf_name
                title = paper_pdf_name.split(".pdf")[0].replace("-", " ")
                # join() instead of repeated += (quadratic string build).
                cleaned_rows = []
                for row in paper_content.split("\n"):
                    cleaned_row = cleaner.clean_text(row)
                    if len(cleaned_row) > 0:
                        cleaned_rows.append(cleaned_row + "\n")
                cleaned_content = "".join(cleaned_rows)
                new_entry = models.Papers_NR_NSW.create(
                    id=paper_id, pdf_name=paper_pdf_name,
                    paper_text=cleaned_content, paper_title=title)
                new_entry.save()
        counter += increments
        print("Number of documents cleaned: {0}".format(counter))
        print("Sleeping for one second ...")
        time.sleep(1)
    models.close_connection()
def add_data(text):
    """Overwrite the paper_text of every Papers_NR_NSW row with *text*.

    NOTE(review): the stemming step is commented out in the original, so the
    raw *text* argument is written as-is until stemming is restored — confirm
    that is the intent.
    """
    # BUG FIX: `new_paper_text` was undefined because the stemming call is
    # commented out; bind it explicitly to the input.
    new_paper_text = text
    # new_paper_text = stemming(text)
    models.connect_to_db(conf.DATABASE_FILENAME)
    last_id_query = models.Papers_NR_NSW_STE.select().order_by(
        models.Papers_NR_NSW_STE.id.desc()).limit(1)
    first_id = 1
    last_id = last_id_query[0].id
    increments = 10
    for i in range(first_id, last_id + 1, increments):
        papers_to_process = create_list_of_ids(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers_NR_NSW_STE.select().where(
                models.Papers_NR_NSW_STE.id == paper_id)
            # BUG FIX: guard the [0] access — the original raised IndexError
            # on any missing id.
            if len(paper_query) == 0:
                continue
            paper_pdf_name = paper_query[0].pdf_name
            # BUG FIX: Model.update(...) builds an update query; it needs a
            # where() clause and execute() — calling .save() on it (and
            # setting the primary key in the update) was wrong.
            rows_modified = models.Papers_NR_NSW.update(
                pdf_name=paper_pdf_name,
                paper_text=new_paper_text).where(
                    models.Papers_NR_NSW.id == paper_id).execute()
            print("Number of rows modified: {0}".format(rows_modified))
    models.close_connection()
def compute_document_frequencies():
    """Print how many documents each token appears in (document frequency).

    Each paper contributes at most 1 per token (tokens are lowercased and
    deduplicated per paper). Results are printed sorted by ascending count.
    """
    models.connect_to_db(conf.DATABASE_FILENAME)
    first_id = 1
    # BUG FIX: `papers` was an undefined name; the other frequency passes in
    # this file read from models.Papers_NR — TODO confirm intended table.
    last_id_query = models.Papers_NR.select().order_by(
        models.Papers_NR.id.desc()).limit(1)
    last_id = last_id_query[0].id
    increments = 10
    token_frequencies = {}
    for i in range(first_id, last_id + 1, increments):
        papers_to_process = ids_to_query(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers_NR.select().where(
                models.Papers_NR.id == paper_id)
            unique_tokens = set()
            if DEBUG:
                print(paper_query)
                print(len(paper_query))
            if len(paper_query) > 0:
                paper_content = paper_query[0].paper_text
                for token in paper_content.strip().split():
                    unique_tokens.add(token.lower())
                # BUG FIX: the original `for i, token in enumerate(...)`
                # shadowed the outer batch index `i`; the index was unused.
                for token in unique_tokens:
                    token_frequencies[token] = token_frequencies.get(token, 0) + 1
    models.close_connection()
    sorted_tokens = [(k, token_frequencies[k])
                     for k in sorted(token_frequencies,
                                     key=token_frequencies.get)]
    printer = Printer()
    printer.print_token_frequency(sorted_tokens)
def compute_top_n_tokens_for_collection(top_n):
    """Print the top_n most frequent tokens across the whole Papers_NR corpus."""
    models.connect_to_db(conf.DATABASE_FILENAME)
    first_id = 1
    last_id_query = models.Papers_NR.select().order_by(
        models.Papers_NR.id.desc()).limit(1)
    last_id = last_id_query[0].id
    increments = 10
    cleaner = Cleaner()
    token_frequencies = {}
    for i in range(first_id, last_id + 1, increments):
        papers_to_process = ids_to_query(i, increments, last_id)
        for paper_id in papers_to_process:
            # CONSISTENCY FIX: last_id comes from Papers_NR but the rows were
            # fetched from models.Papers; query the same table throughout.
            paper_query = models.Papers_NR.select().where(
                models.Papers_NR.id == paper_id)
            if DEBUG:
                print(paper_query)
                print(len(paper_query))
            if len(paper_query) > 0:
                paper_content = paper_query[0].paper_text
                tokens = cleaner.tokenize(paper_content)
                for token in tokens:
                    token_frequencies[token] = token_frequencies.get(token, 0) + 1
    models.close_connection()
    sorted_tokens = [(k, token_frequencies[k]) for k in sorted(
        token_frequencies, key=token_frequencies.get, reverse=True)]
    top_n_tokens = sorted_tokens[:top_n]
    printer = Printer()
    printer.print_token_frequency(top_n_tokens)
def retrieve_papers():
    """Export both categories of irregular papers to text files.

    Papers whose reference section is not separated, and papers whose
    reference section is poorly defined, are dumped into
    conf.IRREGULAR_PAPERS_DIRECTORY for manual cleaning.
    """
    io.create_directory(conf.IRREGULAR_PAPERS_DIRECTORY)
    models.connect_to_db(conf.DATABASE_FILENAME)
    # The two categories are handled identically — one helper, two calls
    # (the original duplicated the query/save stanza verbatim).
    _export_papers_by_ids(conf.ids_reference_not_separated)
    _export_papers_by_ids(conf.ids_poorly_defined_reference)
    models.close_connection()


def _export_papers_by_ids(ids):
    """Save each Papers row whose id is in *ids* as a file in the irregular dir."""
    query = models.Papers.select().where(models.Papers.id.in_(ids))
    for paper in query:
        filename = create_paper_newname(conf.IRREGULAR_PAPERS_DIRECTORY,
                                        paper.pdf_name)
        io.save_file(paper.paper_text, filename)
def drop_papers_nr_nsw_table():
    """Drop the Papers_NR_NSW_STE table."""
    # NOTE(review): the function name says "nr_nsw" but the table dropped is
    # Papers_NR_NSW_STE — confirm which side is intended before relying on this.
    models.connect_to_db(conf.DATABASE_FILENAME)
    models.Papers_NR_NSW_STE.drop_table()
    models.close_connection()
def drop_papers_nr_table():
    """Drop the Papers_NR table."""
    # BUG FIX: was a bare DATABASE_FILENAME (NameError); the sibling drop
    # function uses conf.DATABASE_FILENAME.
    models.connect_to_db(conf.DATABASE_FILENAME)
    models.Papers_NR.drop_table()
    models.close_connection()