def create_contract_graph(directory, title):
    """
    Generates a bar graph with the labels as the folders in the directory and
    the bar heights as the number of documents in each folder

    Params
    directory: str where the folders are located
    title: str title for the bar graph
    """
    ds = DirectoryAssistor()
    folder_lst = ds.create_content_list(directory)
    folder_dic = {}
    # Count the documents in each folder
    for folder in folder_lst:
        folder_dic[folder] = len(ds.create_content_list(directory + folder))
    # Sort the folders by document count, largest first
    sort_folder_dic = sorted(folder_dic.items(), key=lambda x: x[1],
                             reverse=True)
    cols = []
    labels = []
    for i in sort_folder_dic:
        labels.append(i[0])
        cols.append(i[1])
    fig, ax = plt.subplots(figsize=(10, 10))
    make_sns_bar_plot(ax, cols, labels, title=title)
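# Example usage (a sketch): the path below is hypothetical and assumes each
# contract type has its own folder of documents inside it. The directory
# string needs a trailing '/' because folder names are concatenated onto it,
# and create_contract_graph relies on DirectoryAssistor, matplotlib (plt),
# and make_sns_bar_plot already being imported in this module.
create_contract_graph('/path/to/contract_folders/',
                      'Documents per Contract Type')
plt.show()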
class ImageConverter():

    def __init__(self, directory):
        self.directory = directory
        self.ds = DirectoryAssistor()

    def convert_image(self, file_name, file_type='.pdf', dpi=500):
        '''
        Converts a file of type file_type to .txt using OCR

        Params
        file_name: str name of the file that needs to be read in
        file_type: str type of file being read in
        dpi: int dots per inch of the file being read in
        '''
        pages = convert_from_path(self.directory + file_name, dpi=dpi)
        image_counter = 1
        image_names = []
        # Save each page of the document as a JPEG image
        for page in pages:
            image_name = 'page_' + str(image_counter) + '.jpg'
            image_names.append(image_name)
            page.save(image_name, 'JPEG')
            image_counter += 1
        new_file_name = file_name.replace(file_type, '.txt')
        filelimit = image_counter - 1
        outfile = self.directory + new_file_name
        f = open(outfile, 'a')
        # Run OCR on each page image and append the text to the output file
        for i in range(1, filelimit + 1):
            image_name = "page_" + str(i) + ".jpg"
            text = str(pytesseract.image_to_string(Image.open(image_name)))
            text = text.replace('-\n', '')
            f.write(text)
        f.close()
        # Clean up the intermediate images and the original document
        for img in image_names:
            self.ds.delete_files(img)
        self.ds.delete_files(self.directory + file_name)

    def convert_txt_to_doc(self, text_file):
        '''
        Converts a .txt document to a .doc format

        Params
        text_file: name of .txt file stored in the directory for the object
        '''
        document = Document()
        new_name = text_file.replace('.txt', '')
        document.add_heading(new_name, 0)
        myfile = open(self.directory + text_file).read()
        # Strip non-ASCII characters and form feeds left over from OCR
        myfile = re.sub(r'[^\x00-\x7F]+|\x0c', ' ', myfile)
        p = document.add_paragraph(myfile)
        document.save(self.directory + new_name + '.doc')
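# Example usage (a sketch): the directory and file name are hypothetical.
# convert_image OCRs every page of hypothetical_contract.pdf into
# hypothetical_contract.txt in the same directory (and deletes the source
# pdf), then convert_txt_to_doc wraps that text file in a .doc document.
ic = ImageConverter('/path/to/pdfs/')
ic.convert_image('hypothetical_contract.pdf')
ic.convert_txt_to_doc('hypothetical_contract.txt')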
def main(directory, file_name, out_dir, file_type='.pdf', dpi=500,
         verbose=True):
    """
    Walks through all the pages in a file, converts each page to an image,
    and uses OCR to convert the images to .txt files. It then deletes the
    image files, merges the individual page .txt files, and afterwards
    deletes the individual page .txt files.

    Params:
    directory: str where the files are stored
    file_name: str name of file
    out_dir: str directory where the text files are going
    file_type: str default '.pdf'
    dpi: int dots per inch
    verbose: bool default is True
    """
    da = DirectoryAssistor()
    image_list = convert_to_img(directory=directory, file_name=file_name)
    page_lst = []
    # OCR the page images in parallel
    with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
        for img_path, out_file in zip(image_list,
                                      executor.map(ocr, image_list)):
            print(img_path.split("\\")[-1], ',', out_file, ', processed')
            page_lst.append(out_file)
    # Delete the intermediate page images
    for img in image_list:
        da.delete_files(img)
    # Merge the per-page .txt files, then delete the individual pages
    merge_pages(page_lst=page_lst, out_dir=out_dir, file_name=file_name)
    for page in page_lst:
        da.delete_files(out_dir + page)
    def scrape_pdf(self, directory, contract_dic):
        """
        Creates a directory from each key in contract_dic and scrapes the
        documents from the urls contained in the value list

        Params:
        directory: str directory where the files will be stored
        contract_dic: dict with the key as the contract class and the value
        as a list of urls
        """
        ds = DirectoryAssistor()
        count = 0
        for key, values in contract_dic.items():
            ds.make_directory(directory, key)
            contract_dir = directory + key
            # Configure Chrome to download pdfs straight into the folder
            options = webdriver.ChromeOptions()
            prefs = {"download.default_directory": contract_dir,
                     "plugins.always_open_pdf_externally": True}
            options.add_experimental_option('prefs', prefs)
            for val in values:
                driver = webdriver.Chrome(options=options)
                driver.get(val)
                time.sleep(5)
                # Pull the pdf url out of the embedded viewer frame
                elm = driver.find_element_by_xpath("//*[@id='pdfFrame']")
                url = elm.get_attribute('src')
                driver.get(url)
                time.sleep(7)
                driver.quit()
                ds.rename_directory(contract_dir, 'DPSWebDocumentViewer.pdf',
                                    f'document{count}.pdf')
                count += 1
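# Example usage (a sketch): the class that owns scrape_pdf is not shown in
# this snippet, so ContractScraper below is a hypothetical name for it, and
# the directory and urls are placeholders. Each key becomes a folder and
# each url in its value list is downloaded into that folder through a
# Chrome webdriver configured to save pdfs automatically.
contract_dic = {'employment_agreements': ['https://example.com/doc1',
                                          'https://example.com/doc2']}
scraper = ContractScraper()  # hypothetical owning class
scraper.scrape_pdf('/path/to/downloads/', contract_dic)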
class ContractPipeline():

    def __init__(self, directory, stop_words):
        """
        Instantiates a ContractPipeline object

        Params
        directory: str for the main directory where the folders for the
        documents are stored
        stop_words: list of words that will be removed from the documents
        """
        self.directory = directory
        self.stop_words = stop_words
        self.ds = DirectoryAssistor()
        self.individual_bag_o_words = {}

    def get_list_of_docs(self):
        """
        Creates
        target_lst: list that has the type for each document in the
        corresponding index of doc_text_lst
        doc_text_lst: list of lowercased, cleaned strings for the text in
        each document
        """
        print('Converting to txt lists')
        start_time = time.time()
        folder_lst = self.ds.create_content_list(self.directory)
        doc_lst = []
        self.target_lst = []
        self.doc_text_lst = []
        for i in range(len(folder_lst)):
            doc_lst.append(
                self.ds.create_content_list(self.directory + folder_lst[i]))
            self.individual_bag_o_words[folder_lst[i]] = []
            for j in range(len(doc_lst[i])):
                text = textract.process(self.directory + folder_lst[i] + '/'
                                        + doc_lst[i][j])
                # convert to str
                text = text.decode('utf-8')
                # lowercase all text
                text = text.lower()
                # remove all punctuation
                text = re.sub(r'\W+', ' ', text)
                # remove underscores
                text = text.replace("_", "")
                self.doc_text_lst.append(text)
                self.target_lst.append(folder_lst[i])
                self.individual_bag_o_words[folder_lst[i]].append(text.split())
        lst = []
        for val in self.target_lst:
            lst.append(val.replace('_', ' '))
        self.target_lst = lst
        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def get_list_of_txts(self):
        """
        Creates
        target_lst: list that has the type for each document in the
        corresponding index of doc_text_lst
        doc_text_lst: list of lowercased, cleaned strings for the text in
        each document
        """
        print('Converting to txt lists')
        start_time = time.time()
        folder_lst = self.ds.create_content_list(self.directory)
        doc_lst = []
        self.target_lst = []
        self.doc_text_lst = []
        for i in range(len(folder_lst)):
            doc_lst.append(
                self.ds.create_content_list(self.directory + folder_lst[i]))
            self.individual_bag_o_words[folder_lst[i]] = []
            for j in range(len(doc_lst[i])):
                # read in file as str
                try:
                    with open(self.directory + folder_lst[i] + '/'
                              + doc_lst[i][j], 'r') as f:
                        text = f.read().replace('\n', '')
                except:
                    continue
                # lowercase all text
                text = text.lower()
                # remove all punctuation
                text = re.sub(r'\W+', ' ', text)
                # remove underscores
                text = text.replace("_", "")
                self.doc_text_lst.append(text)
                self.target_lst.append(folder_lst[i])
                self.individual_bag_o_words[folder_lst[i]].append(text.split())
        lst = []
        for val in self.target_lst:
            lst.append(val.replace('_', ' '))
        self.target_lst = lst
        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def bag_o_words(self):
        """
        Builds word counts per folder and for the full corpus

        Creates
        individual_bag_o_words: dict mapping each folder to a Counter of its
        words
        total_bag_o_words: Counter of the words across all documents
        """
        print('Creating bag o words')
        start_time = time.time()
        for key in self.individual_bag_o_words.keys():
            lst = []
            for val in self.individual_bag_o_words[key]:
                for word in val:
                    lst.append(word)
            self.individual_bag_o_words[key] = Counter(lst)
        total_word_lst = []
        for i in self.doc_text_lst:
            lst = i.split()
            for j in lst:
                total_word_lst.append(j)
        self.total_bag_o_words = Counter(total_word_lst)
        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def join_list_of_strings(self, lst):
        """
        Joins each inner list of words into a single string

        Params
        lst: list of lists of words
        """
        return [" ".join(x) for x in lst]

    def remove_stop_words(self):
        """
        Creates new lists with stop words and numbers removed

        stops_removed_str: list of strings with stop words removed
        stops_removed_lst: list of lists containing words with stops removed
        """
        print('Removing stop words')
        start_time = time.time()
        split_lst = [txt.split() for txt in self.doc_text_lst]
        self.stops_removed_lst = []
        for split in split_lst:
            stops = [w for w in split if w not in self.stop_words]
            # Also drop plain integers and negative integers
            stop_num = [w for w in stops
                        if not (w.isdigit()
                                or w[0] == '-' and w[1:].isdigit())]
            self.stops_removed_lst.append(stop_num)
        self.stops_removed_str = self.join_list_of_strings(
            self.stops_removed_lst)
        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def word_condenser(self):
        """
        Stems or lemmatizes the stop-word-free documents with each technique

        self.porter_str: list of strings with the Porter stemmer applied
        self.snowball_str: list of strings with the Snowball stemmer applied
        self.wordnet_str: list of strings with the WordNet lemmatizer applied
        """
        print('Condensing')
        start_time = time.time()
        porter = PorterStemmer()
        snowball = SnowballStemmer('english')
        wordnet = WordNetLemmatizer()
        porter_lst = [[porter.stem(w) for w in words]
                      for words in self.stops_removed_lst]
        snowball_lst = [[snowball.stem(w) for w in words]
                        for words in self.stops_removed_lst]
        wordnet_lst = [[wordnet.lemmatize(w) for w in words]
                       for words in self.stops_removed_lst]
        self.porter_str = self.join_list_of_strings(porter_lst)
        self.snowball_str = self.join_list_of_strings(snowball_lst)
        self.wordnet_str = self.join_list_of_strings(wordnet_lst)
        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def count_vectorizer(self, max_features=None, ngram_range=(1, 1)):
        """
        Fits a CountVectorizer on the stop-word-free documents

        Params
        max_features: int maximum vocabulary size
        ngram_range: tuple ngram range to use

        Creates
        self.cv: fitted CountVectorizer object
        self.tf_matrix: matrix of word counts for the documents
        """
        print('Generating tfidf and count matrix')
        start_time = time.time()
        self.cv = CountVectorizer(max_features=max_features,
                                  ngram_range=ngram_range)
        self.tf_matrix = self.cv.fit_transform(self.stops_removed_str)
        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def tf_vect(self, documents, max_features=None, ngram_range=(1, 1)):
        """
        Fits a TfidfVectorizer and creates a tf-idf matrix from documents

        Params
        documents: list of strings
        """
        print('Generating tfidf')
        start_time = time.time()
        self.vect = TfidfVectorizer(max_features=max_features,
                                    ngram_range=ngram_range)
        self.tfidf = self.vect.fit_transform(documents)
        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')
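# Example usage (a sketch): the directory path is hypothetical and is
# expected to end with a '/' and to contain one folder of .txt files per
# contract type (for example, the output of the OCR step above). The
# stop-word list here comes from nltk; any list of strings works.
from nltk.corpus import stopwords

pipe = ContractPipeline('/path/to/contract_txt_folders/',
                        stopwords.words('english'))
pipe.get_list_of_txts()      # build target_lst and doc_text_lst
pipe.remove_stop_words()     # build stops_removed_lst / stops_removed_str
pipe.word_condenser()        # build porter_str, snowball_str, wordnet_str
pipe.count_vectorizer(max_features=5000)             # word-count matrix
pipe.tf_vect(pipe.snowball_str, max_features=5000)   # tf-idf matrix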
class ContractSifter():

    def __init__(self, directory, stop_words):
        self.directory = directory
        self.ds = DirectoryAssistor()
        self.stop_words = stop_words
        self.porter_dic = {}
        self.snowball_dic = {}
        self.wordnet_dic = {}
        self.combined_dic = {}

    def create_dic(self):
        """
        Returns a dictionary with the folder names in the directory as the
        keys and an empty list as each value

        Returns
        Dictionary
        """
        lst = self.ds.create_content_list(self.directory)
        word_dic = {key: [] for key in lst}
        return word_dic

    def remove_stop_words(self, lst):
        return [w for w in lst if w not in self.stop_words]

    def add_words(self):
        """
        Adds words from the files in the directories that are associated with
        the keys in self.word_dic

        Returns
        self.word_dic with a list of words with the following removed from
        each file in the folder for that key:
        1. Stop words
        2. Punctuation
        3. Underscores
        """
        self.word_dic = self.create_dic()
        for key in self.word_dic.keys():
            lst = self.ds.create_content_list(self.directory + key)
            for file in lst:
                full_text = textract.process(self.directory + key + '/' + file)
                str_full_text = full_text.decode('utf-8')
                lower_full_text = str_full_text.lower()
                edited_text = re.sub(r'\W+', ' ', lower_full_text)
                edited_text = edited_text.replace("_", "")
                tokens = word_tokenize(edited_text)
                stop_lst = self.remove_stop_words(tokens)
                self.word_dic[key].append(stop_lst)

    def combine_word_lists(self):
        """
        Combines all of the lists for a key in self.word_dic into one list in
        self.combined_dic
        """
        for key in self.word_dic.keys():
            result = []
            for lst in self.word_dic[key]:
                result.extend(lst)
            self.combined_dic[key] = result

    def word_condenser(self):
        """
        Stems and lemmatizes the combined word lists into porter_dic,
        snowball_dic, and wordnet_dic
        """
        porter = PorterStemmer()
        snowball = SnowballStemmer('english')
        wordnet = WordNetLemmatizer()
        for key in self.combined_dic.keys():
            porter_lst = []
            snowball_lst = []
            wordnet_lst = []
            for word in self.combined_dic[key]:
                porter_lst.append(porter.stem(word))
                snowball_lst.append(snowball.stem(word))
                wordnet_lst.append(wordnet.lemmatize(word))
            self.porter_dic[key] = porter_lst
            self.snowball_dic[key] = snowball_lst
            self.wordnet_dic[key] = wordnet_lst

    def word_count(self, dic):
        """
        Returns the count of the words in each key of the dictionary

        Params
        dic: dict for which the words will be counted

        Returns
        new_dic: dict with a word count for each key
        """
        word_count_dic = {}
        for key, val in dic.items():
            word_count_dic[key] = Counter(val)
        new_dic = dict(word_count_dic)
        return new_dic

    def word_cloud(self, dic):
        """
        Generates a word cloud for each key in the dic

        Params
        dic: dict for which the word clouds will be generated

        Returns
        Plot with a word cloud for each key in dic
        """
        word_cloud_dic = {}
        for key, val in dic.items():
            word_cloud_dic[key] = ' '.join(val)
        wc_lst = []
        for val in word_cloud_dic.values():
            wc = WordCloud(width=1000, height=1000, background_color='white',
                           min_font_size=9)
            wc_lst.append(wc.generate(val))
        fig, axs = plt.subplots(3, 3, figsize=(15, 12))
        titles = list(dic.keys())
        for cloud, title, ax in zip(wc_lst, titles, axs.flatten()):
            chartBox = ax.get_position()
            ax.set_position(pos=[chartBox.x0, chartBox.y0,
                                 chartBox.width * 1.05,
                                 chartBox.height * 1.05])
            ax.imshow(cloud)
            ax.set_title(title, fontsize=16, weight='bold')
            ax.axis("off")
        # The 3x3 grid is sized for seven categories; hide the unused axes
        axs[2, 1].set_axis_off()
        axs[2, 2].set_axis_off()
        chartBox = axs[2, 0].get_position()
        axs[2, 0].set_position(pos=[chartBox.x0 * 2.8, chartBox.y0 * .9,
                                    chartBox.width * 1.05,
                                    chartBox.height * 1.05])
        plt.show()
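# Example usage (a sketch): the directory is hypothetical and the stop-word
# list comes from nltk. add_words reads every document with textract, so the
# folders are expected to hold the original contract files; the word_cloud
# grid assumes roughly seven contract categories.
from nltk.corpus import stopwords

cs = ContractSifter('/path/to/contract_folders/', stopwords.words('english'))
cs.add_words()
cs.combine_word_lists()
cs.word_condenser()
counts = cs.word_count(cs.snowball_dic)   # Counter of stemmed words per key
cs.word_cloud(cs.snowball_dic)            # one word cloud per contract type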
if __name__ == '__main__':
    directory = ('/Users/justinlansdale/Documents/Galvanize/Capstone3/'
                 'EC2Data/changeFolder/pdfs/Original/')
    out_dir = ('/Users/justinlansdale/Documents/Galvanize/Capstone3/'
               'EC2Data/changeFolder/Amended/')
    da = DirectoryAssistor()
    doc_lst = da.create_content_list(directory)
    print(doc_lst)
    start = time.time()
    # OCR every document in the directory, skipping any file that fails
    for i in range(len(doc_lst)):
        file_name = doc_lst[i]
        try:
            main(directory, file_name, out_dir)
        except:
            continue
    end = time.time()
    print(end - start)
class AmendmentDetector():

    def __init__(self, original_doc_dir, amended_doc_dir, original_txt_dir,
                 amended_txt_dir):
        """
        Instantiates the amendment detection class

        Params:
        original_doc_dir: str directory where the original document is located
        amended_doc_dir: str directory where the amended document is located
        original_txt_dir: str directory where the original txt file will be
        stored
        amended_txt_dir: str directory where the amended txt file will be
        stored
        """
        self.original_doc_dir = original_doc_dir
        self.amended_doc_dir = amended_doc_dir
        self.original_txt_dir = original_txt_dir
        self.amended_txt_dir = amended_txt_dir
        self.da = DirectoryAssistor()

    def convert_original_to_txt(self):
        """
        Converts the original doc to a .txt file
        """
        doc_lst = self.da.create_content_list(self.original_doc_dir)
        for i in range(len(doc_lst)):
            file_name = doc_lst[i]
            try:
                main(self.original_doc_dir, file_name, self.original_txt_dir)
            except:
                continue

    def convert_amended_to_txt(self):
        """
        Converts the amended doc to a .txt file
        """
        doc_lst = self.da.create_content_list(self.amended_doc_dir)
        for i in range(len(doc_lst)):
            file_name = doc_lst[i]
            try:
                main(self.amended_doc_dir, file_name, self.amended_txt_dir)
            except:
                continue

    def read_in_files(self):
        """
        Reads in the txt files
        """
        original_lst = self.da.create_content_list(self.original_txt_dir)
        amended_lst = self.da.create_content_list(self.amended_txt_dir)
        for doc in original_lst:
            try:
                with open(self.original_txt_dir + doc, 'r') as f:
                    self.original = f.read().replace('\n', ' ')
            except:
                continue
        for doc in amended_lst:
            try:
                with open(self.amended_txt_dir + doc, 'r') as f:
                    self.amended = f.read().replace('\n', ' ')
            except:
                continue

    def print_changes(self):
        """
        Prints the changes made in the amended document
        """
        original_lst = self.original.split()
        amended_lst = self.amended.split()
        original_value = []
        amended_value = []
        change_ref = []
        # Compare word by word; stop at the shorter document so a trailing
        # insertion or deletion cannot raise an IndexError
        for i in range(min(len(original_lst), len(amended_lst))):
            if original_lst[i] != amended_lst[i]:
                original_value.append(original_lst[i])
                change_ref.append(list(original_lst[i - 40:i + 40]))
                amended_value.append(amended_lst[i])
        for i in range(len(original_value)):
            print(f'\n Change # {i+1}: {original_value[i]} changed to '
                  f'{amended_value[i]} \n \n Reference text: '
                  f'{" ".join(change_ref[i])} \n')
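# Example usage (a sketch): the four directories are hypothetical. The two
# convert_* calls run the OCR main() pipeline above on the original and
# amended pdfs, read_in_files loads the resulting txt files, and
# print_changes does a word-by-word comparison of the two versions.
ad = AmendmentDetector('/path/to/original_pdfs/', '/path/to/amended_pdfs/',
                       '/path/to/original_txt/', '/path/to/amended_txt/')
ad.convert_original_to_txt()
ad.convert_amended_to_txt()
ad.read_in_files()
ad.print_changes()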