예제 #1
0
def count_words(run_once=False, txt_directory=None):
    """Feed every line of each text file in a directory to a ``PDFParser``.

    Args:
        run_once: When true, stop after the first regular file has been
            parsed instead of walking the whole directory.
        txt_directory: Directory containing the text files. When ``None``
            the original hard-coded Lab5 ``txts`` folder is used, so
            existing callers are unaffected.

    Returns:
        The tuple ``(parser.word_counts, parser.total_words)`` read from the
        parser once all lines have been passed to ``parser.parse``.
    """
    if txt_directory is None:
        txt_directory = '/Users/samuelkaeser/Documents/University/Classes/EE_460J/Homework/Lab5/txts'

    parser = PDFParser()
    # os.listdir() returns entries in arbitrary order; sorting makes the file
    # chosen by ``run_once`` (and the parse order in general) reproducible.
    for name in sorted(os.listdir(txt_directory)):
        txt_path = os.path.join(txt_directory, name)
        if not os.path.isfile(txt_path):
            # Sub-directories cannot be opened as text; skip them instead of
            # crashing on open().
            continue
        with open(txt_path, 'r') as f:
            for line in f:
                parser.parse(line)
        if run_once:
            break
    return parser.word_counts, parser.total_words
def _mentions_any_term(sentence, wanted):
    """Return True if any whitespace-separated token of *sentence* is in *wanted*.

    Tokens are lowercased and stripped of leading/trailing punctuation so
    that forms such as "EXAFS," or "(XANES)" still count as a mention.
    *wanted* must be a set/frozenset of lowercase terms.
    """
    import string  # function-scope import keeps this block self-contained

    tokens = {tok.strip(string.punctuation).lower() for tok in sentence.split()}
    return not wanted.isdisjoint(tokens)


def parse_PDF():
    """Write the list of IOP articles that mention a search term, then exit.

    Current (debugging) behaviour:

    1. Every ``*.json`` file directly inside the IOP_JSON directory is
       loaded; its ``body_text`` entries are scanned via their ``sent``
       field for any of the first three search terms.
    2. For each matching article the line
       ``<name>.pdf -> https://doi.org/<uid>`` is appended to
       ``iop_filtered_list.txt`` in the current working directory.
    3. ``sys.exit()`` terminates the process.

    NOTE(review): because of step 3 the PDF-parsing loop further down is
    unreachable. It is kept untouched in spirit so it can be re-enabled by
    removing the debugging section.
    """
    pdf_p = PDFParser()

    # Earlier variant, kept for reference:
    # rsc_dir = "/home/gpark/corpus_web/tdm/archive/RSC"
    # for filename in os.listdir(rsc_dir):
    #     if filename.endswith(".pdf"):
    #         print(filename)
    #         pdf_p.parse(os.path.join(rsc_dir, filename))
    #         input("Press Enter to continue...")

    cnt_article_w_keyword = 0
    terms = ['EXAFS', 'XANES', 'NEXAFS', 'pair distribution function']
    terms = [x.lower() for x in terms]  # lowercase
    # Only the first three terms take part in the token-level filter; a
    # single whitespace token can never equal the multi-word phrase.
    token_terms = frozenset(terms[:3])

    num_of_files = 0

    # check_point_found = False

    corpus_dir = "/home/gpark/corpus_web/tdm/archive/IOP_JSON"

    # debugging
    file_doi = {}
    # Sorted so the output list has a reproducible order (os.listdir() order
    # is arbitrary).
    for file_name in sorted(os.listdir(corpus_dir)):
        if not file_name.endswith(".json"):
            continue

        # Load and close the file before scanning its contents.
        with open(os.path.join(corpus_dir, file_name), "r") as read_file:
            data = json.load(read_file)

        body_text = data['body_text']
        if any(_mentions_any_term(sent['sent'], token_terms) for sent in body_text):
            pdf_file = file_name.replace('.json', '.pdf')
            file_doi[pdf_file] = data['uid']

    with open("iop_filtered_list.txt", 'a') as out_file:
        for pdf_file, doi in file_doi.items():
            out_file.write(pdf_file + ' -> https://doi.org/' + doi + '\n')

    sys.exit()
    # debugging

    # NOTE(review): unreachable while the sys.exit() above is in place.
    for root, dirs, files in os.walk(corpus_dir):
        # In-place sort makes os.walk traverse the subdirectories in reverse
        # lexicographic order of their names.
        dirs.sort(reverse=True)
        for file_name in files:
            if not file_name.endswith(".pdf"):
                continue

            # When an error occurs, to start after the last processed file:
            # if check_point_found == False:
            #     if file_name == 'epl_38_6_453.pdf':
            #         check_point_found = True
            #         continue
            #     else:
            #         continue

            iop_meta_file = os.path.join(root, '.article')

            # PDFs without a sibling '.article' metadata file are skipped.
            if not os.path.exists(iop_meta_file):
                continue

            pdf_path = os.path.join(root, file_name)
            pdf_p.parse(pdf_path, terms, iop_meta_file)

            num_of_files += 1

            print('>> file: ', pdf_path,
                  ' / num_of_files: ', num_of_files)

            # input("Press Enter to continue...")

            # if file_name in ['jpmater_1_1_01LT02.pdf', 'jpmater_1_1_015010.pdf', 'jpmater_1_1_015006.pdf', 'mfm_1_1_015005.pdf']:
            # if file_name in ['jpco_3_1_015002.pdf']:
            #     input("Press Enter to continue...")

            # if pdf_p.parse(pdf_path, terms) == True:  # len(body_text) == 0 -> True
            #     cnt_article_w_keyword += 1

    # NOTE(review): the counter is never incremented in the active code (the
    # only increment is commented out above), so this prints 0 if reached.
    print(cnt_article_w_keyword)