def article_transform(article):
    """Build a "bundle" PDF for one article: a generated title page + the article.

    The article is read from ``pdf_papers/<article>`` next to this source file.
    A title page is rendered through the module-level ``first_page`` object into
    ``title pages/<name>_title_page.pdf`` and the combined document is written
    to ``<output_folder_dest>/Bundles/<name>_bundle.pdf``.

    Module-level state touched:
        * ``all_words`` / ``all_pages`` are increased by this article's counts.
        * ``pdf_names`` gets ``(bundle_path, 'rb')`` appended on success.
        * ``bundles_produced`` is incremented on success.
        * ``minutes_per_page``, ``output_folder_dest``, ``folder_index`` and
          ``first_page`` are read.

    Args:
        article: File name of the article inside ``pdf_papers`` (converted
            with ``str()``); a trailing ``.pdf`` is stripped to form the
            display name and output file names.
    """
    # Only names that are re-bound need a ``global`` declaration; the other
    # module-level names are merely read or mutated in place.
    global bundles_produced, all_pages, all_words

    current_directory = os.path.dirname(os.path.realpath(__file__))
    source_path = os.path.join(current_directory, 'pdf_papers', str(article))
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    name_without_pdf = re.sub(r'\.pdf$', '', str(article))
    title_page_path = os.path.join(
        'title pages', name_without_pdf + '_title_page.pdf')

    this_bundle = PyPDF2.PdfFileWriter()

    # The reader streams must stay open until the writer has produced its
    # output, so the whole assembly happens inside the ``with`` blocks; they
    # also guarantee the handles are closed on every exit path.
    with open(source_path, 'rb') as this_book:
        read_book = PyPDF2.PdfFileReader(this_book)

        # Gather statistics and render the title page.
        numPages = read_book.getNumPages()
        how_many_words = word_counter.count_words(read_book, numPages)
        all_words += how_many_words
        all_pages += numPages

        first_page.Title = 'Article: ' + name_without_pdf
        first_page.SubTitle = ' Week ' + str(folder_index) + ' Reading Material'
        first_page.Name = str(numPages) + ' Pages (' + str(
            minutes_per_page * numPages) + ' Minutes)'
        first_page.go(title_page_path)

        with open(title_page_path, 'rb') as title_page_file:
            read_title_page = PyPDF2.PdfFileReader(title_page_file)

            # Cover first, then every page of the article.
            # NOTE: bookmark creation (addBookmark) is currently disabled.
            this_bundle.addPage(read_title_page.getPage(0))
            this_bundle.setPageMode("/UseOutlines")
            for page in range(numPages):
                this_bundle.addPage(read_book.getPage(page))

            bundle_path = os.path.join(
                output_folder_dest, 'Bundles', name_without_pdf + '_bundle.pdf')
            with open(bundle_path, 'wb') as output_stream:
                # A single write attempt; a failure is reported and skipped
                # so the caller can carry on with the next article.
                try:
                    this_bundle.write(output_stream)
                except PyPDF2.utils.PdfReadError:
                    print('CANT WRITE PDF')
                else:
                    pdf_names.append((bundle_path, 'rb'))
                    bundles_produced += 1
def main(argv):
    """Count the words of a file and print the result.

    Args:
        argv: Command-line style argument list. ``argv[1]``, when present, is
            the path of the file to read; otherwise ``"words.txt"`` is used.
    """
    # ``> 1`` (not ``== 1``) so that an empty argv also falls back to the
    # default file instead of raising IndexError on ``argv[1]``.
    file_name = argv[1] if len(argv) > 1 else "words.txt"

    # The file is read as bytes and handed to the counter unchanged.
    with open(file_name, "rb") as fp:
        counted = word_counter.count_words(fp.read())

    word_counter.print_counted_words(counted)
def test_nasty(self):
    # Passage full of apostrophes, hyphens and punctuation.
    passage = (
        "you haint no objections to sharing a harpooneer's blanket, have ye? "
        "I s'pose you are goin' a-whalin', so you'd better get used to "
        "that sort of thing."
    )
    word_total = count_words(passage)
    # `_____` is an unfilled placeholder for the expected count.
    assert word_total == _____
def test_biggest(self):
    """An entire book works."""
    # Context manager so the file handle is closed even if reading fails.
    with open('____mobydick_full.txt') as book:
        text = book.read()
    assert count_words(text) > 200000
def test_wrong_input(self):
    """Non-string fails with a specific error"""
    not_a_string = 777
    # `_____` is an unfilled placeholder for the expected exception type.
    with pytest.raises(_____):
        count_words(not_a_string)
def test_empty(self):
    """Empty input works"""
    # `_____` is an unfilled placeholder for the expected count.
    assert count_words('') == _____
def test_words():
    """example with lots of special characters works"""
    passage = (
        "you haint no objections to sharing a harpooneer's blanket, have ye? "
        "I s'pose you are goin' a-whalin',so you'd better get used to "
        "that sort of thing."
    )
    word_total = count_words(passage)
    assert word_total == 32
def test_count_words_tabs():
    """words are separated by tabs as well"""
    assert count_words("the\twhite\twhale") == 3
def test_count_words():
    """Count words in a short sentence"""
    sentence = "Call me Ishmael"
    assert count_words(sentence) == 3
import os

import PDFtoTXTConvert
import word_counter


def _remove_txt_files(directory):
    """Delete every ``*.txt`` entry found directly inside *directory*.

    An empty *directory* string means the current working directory.
    """
    if directory == "":
        directory = os.getcwd()
    for entry in os.listdir(directory):
        # Match on the real ".txt" suffix; splitting on "." and comparing the
        # last piece would also hit a file literally named "txt".
        if entry.endswith(".txt"):
            os.remove(os.path.join(directory, entry))


# Containers handed to the word_counter helpers.
d = {}
word_freq = []
tf_idf_freq = []

pdf_dir = "./articles/"
txt_dir = "./txt_formats/"

# Pipeline: PDF -> text, stop-word filtering, word counts, TF-IDF.
PDFtoTXTConvert.convert_multiple(pdf_dir, txt_dir)
word_counter.get_rid_of_stopword(txt_dir)
word_counter.count_words(word_freq, d)
word_counter.tf_idf_cal(d, txt_dir, tf_idf_freq)

# Clean up the intermediate text files of both working directories.
_remove_txt_files(txt_dir)

txt_dir = "./filtered_txt/"
_remove_txt_files(txt_dir)