def count_words(run_once=False, txt_directory=None):
    """Feed the lines of every file in a directory to a ``PDFParser``.

    Args:
        run_once: When true, stop after the first file has been processed.
        txt_directory: Directory whose entries are opened as text files.
            When ``None``, the original hard-coded Lab5 ``txts`` directory
            is used, so existing callers are unaffected.

    Returns:
        The tuple ``(parser.word_counts, parser.total_words)`` read from the
        parser after all lines have been passed to ``parser.parse``.
    """
    if txt_directory is None:
        txt_directory = '/Users/samuelkaeser/Documents/University/Classes/EE_460J/Homework/Lab5/txts'

    parser = PDFParser()
    # os.listdir() yields entries in arbitrary order; sorting makes the file
    # chosen by run_once reproducible between runs.
    for txt in sorted(os.listdir(txt_directory)):
        txt_path = os.path.join(txt_directory, txt)
        with open(txt_path, 'r') as f:
            for line in f:
                parser.parse(line)
        # NOTE(review): the source indentation was lost; this break is placed
        # at file level (one file per run_once), not line level -- confirm.
        if run_once:
            break
    return parser.word_counts, parser.total_words
def parse_PDF():
    """List IOP articles that mention a search term, then exit.

    Every ``.json`` file in the IOP_JSON directory is loaded and its
    ``body_text`` sentences are split on whitespace and lowercased. If any
    of the first three search terms appears as a token, the file name (with
    ``.json`` swapped for ``.pdf``) is mapped to the article's ``uid``. The
    collected pairs are appended to ``iop_filtered_list.txt`` as
    ``<pdf> -> https://doi.org/<uid>`` lines, and the interpreter is then
    terminated with ``sys.exit()``.

    The PDF-walking code after ``sys.exit()`` is currently unreachable; it
    is kept so the debugging section can be removed to re-enable it.
    """
    pdf_p = PDFParser()

    # Disabled run over the RSC archive:
    # dir = "/home/gpark/corpus_web/tdm/archive/RSC"
    # for filename in os.listdir(dir):
    #     if filename.endswith(".pdf"):
    #         print(filename)
    #         pdf_p.parse(os.path.join(dir, filename))
    #         input("Press Enter to continue...")

    cnt_article_w_keyword = 0
    num_of_files = 0
    # check_point_found = False

    # Search terms are compared in lowercase.
    terms = [
        term.lower()
        for term in ['EXAFS', 'XANES', 'NEXAFS', 'pair distribution function']
    ]
    corpus_dir = "/home/gpark/corpus_web/tdm/archive/IOP_JSON"

    # ---- debugging: build the filtered list from the JSON files ----
    # Only the first three terms are used here; sentences are split into
    # whitespace tokens, so the multi-word fourth term is left out.
    token_terms = terms[:3]
    file_doi = {}
    for json_name in os.listdir(corpus_dir):
        if not json_name.endswith(".json"):
            continue

        with open(os.path.join(corpus_dir, json_name), "r") as read_file:
            data = json.load(read_file)

        for sent in data['body_text']:
            tokens = [token.lower() for token in sent['sent'].split()]
            if any(term in tokens for term in token_terms):
                # The first matching sentence is enough to keep the article.
                file_doi[json_name.replace('.json', '.pdf')] = data['uid']
                break

    with open("iop_filtered_list.txt", 'a') as out_file:
        for pdf_name, doi in file_doi.items():
            out_file.write(pdf_name + ' -> https://doi.org/' + doi + '\n')

    sys.exit()
    # ---- end debugging: nothing below runs while sys.exit() is above ----

    for root, dirs, files in os.walk(corpus_dir):
        # Sorting dirs in place makes os.walk visit the subdirectories in
        # reverse lexicographic order of their names.
        dirs.sort(reverse=True)

        for pdf_name in files:
            if not pdf_name.endswith(".pdf"):
                continue

            # Disabled checkpoint: when an error occurs, restart after the
            # last processed file.
            # if check_point_found == False:
            #     if pdf_name == 'epl_38_6_453.pdf':
            #         check_point_found = True
            #         continue
            #     else:
            #         continue

            iop_meta_file = os.path.join(root, '.article')
            if not os.path.exists(iop_meta_file):
                continue

            pdf_path = os.path.join(root, pdf_name)
            pdf_p.parse(pdf_path, terms, iop_meta_file)
            num_of_files += 1
            print('>> file: ', pdf_path, ' / num_of_files: ', num_of_files)

            # input("Press Enter to continue...")
            # if pdf_name in ['jpmater_1_1_01LT02.pdf', 'jpmater_1_1_015010.pdf', 'jpmater_1_1_015006.pdf', 'mfm_1_1_015005.pdf']:
            # if pdf_name in ['jpco_3_1_015002.pdf']:
            #     input("Press Enter to continue...")
            # if pdf_p.parse(os.path.join(root, pdf_name), terms) == True:  # len(body_text) == 0 -> True
            #     cnt_article_w_keyword += 1

    print(cnt_article_w_keyword)