Example #1
def create_contract_graph(directory, title):
    """
    Generates a bar graph with the labels as the folders in the directory and the
    bar heights as the number of documents in each folder

    Params
    directory: str where the folders are located
    title: str title for bar graph
    """
    ds = DirectoryAssistor()
    folder_lst = ds.create_content_list(directory)
    folder_dic = {}
    for folder in folder_lst:
        folder_dic[folder] = len(ds.create_content_list(directory + folder))
    sort_folder_dic = sorted(folder_dic.items(),
                             key=lambda x: x[1],
                             reverse=True)
    cols = []
    labels = []
    for i in sort_folder_dic:
        labels.append(i[0])
        cols.append(i[1])
    fig, ax = plt.subplots(figsize=(10, 10))
    make_sns_bar_plot(ax, cols, labels, title=title)
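# A minimal usage sketch, assuming DirectoryAssistor, make_sns_bar_plot, and
# matplotlib.pyplot (plt) are imported as the function above requires; the
# directory path and title below are hypothetical.
if __name__ == '__main__':
    # Each subfolder becomes one labeled bar; bar height is the number of
    # documents inside that subfolder, sorted in descending order.
    create_contract_graph('/path/to/contracts/', 'Documents per contract type')
    plt.show()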
Example #3
class ImageConverter():
    def __init__(self, directory):
        self.directory = directory
        self.ds = DirectoryAssistor()

    def convert_image(self, file_name, file_type='.pdf', dpi=500):
        '''
        Converts a file of type file_type to .txt using OCR

        Params
        file_name: string that is the name of the file that needs to be read in
        file_type: str that is the type of file being read in
        dpi: int that is the dots per inch of the file being read in

        '''
        pages = convert_from_path(self.directory + file_name, dpi=dpi)
        image_counter = 1
        image_names = []

        for page in pages:
            image_name = 'page_' + str(image_counter) + '.jpg'
            image_names.append(image_name)
            page.save(image_name, 'JPEG')
            image_counter += 1

        new_file_name = file_name.replace(file_type, '.txt')
        filelimit = image_counter - 1
        outfile = self.directory + new_file_name

        # Append each page's OCR output to the .txt file
        with open(outfile, 'a') as f:
            for i in range(1, filelimit + 1):
                image_name = "page_" + str(i) + ".jpg"
                text = str(pytesseract.image_to_string(Image.open(image_name)))
                # Rejoin words that were hyphenated across line breaks
                text = text.replace('-\n', '')
                f.write(text)

        for img in image_names:
            self.ds.delete_files(img)
        self.ds.delete_files(self.directory + file_name)

    def convert_txt_to_doc(self, text_file):
        '''
        Converts a .txt document to a .doc format

        Params
        text_file: name of .txt file stored in the directory for the object

        '''

        document = Document()
        new_name = text_file.replace('.txt', '')
        document.add_heading(new_name, 0)

        with open(self.directory + text_file) as f:
            myfile = f.read()
        # Remove non-ASCII characters and form feeds left over from OCR
        myfile = re.sub(r'[^\x00-\x7F]+|\x0c', ' ', myfile)
        p = document.add_paragraph(myfile)
        document.save(self.directory + new_name + '.doc')
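# A minimal usage sketch, assuming pdf2image's convert_from_path, pytesseract,
# PIL's Image, python-docx's Document, re, and DirectoryAssistor are imported
# as the class above requires; the directory and file names are hypothetical.
if __name__ == '__main__':
    ic = ImageConverter('/path/to/contracts/')
    ic.convert_image('contract.pdf')        # OCR each page into contract.txt
    ic.convert_txt_to_doc('contract.txt')   # wrap the extracted text in a .doc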
Example #4
def main(directory, file_name, out_dir, file_type='.pdf', dpi=500, verbose=True):
    """
    Walks through all the pages in a file, converts each page to an image, and
    uses OCR to convert each image to a .txt file. It then deletes the image
    files and merges the individual page .txt files. Afterwards, it deletes
    the individual page .txt files.

    Params:
    directory: str where the files are stored
    file_name: str name of file
    out_dir: str the directory where the text files are going
    file_type: str default '.pdf'
    dpi: int dots per inch
    verbose: bool default is True
    """
    
    da = DirectoryAssistor()
    image_list = convert_to_img(directory=directory, file_name=file_name)
    page_lst = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
        for img_path, out_file in zip(image_list, executor.map(ocr, image_list)):
            print(img_path.split("\\")[-1], ',', out_file, ', processed')
            page_lst.append(out_file)

    for img in image_list:
        da.delete_files(img)

    merge_pages(page_lst=page_lst, out_dir=out_dir, file_name=file_name)
    
    for page in page_lst:
        da.delete_files(out_dir + page)
Example #7
    def scrape_pdf(self, directory, contract_dic):
        """
        Creates a directory from the key in the contract_dic and scrapes the 
        documents from urls contained in the value list

        Params:
        directory: str directory where the files will be stored
        contract_dic: dic with the key as the contract class and the value as a 
        list of urls
        """

        ds = DirectoryAssistor()
        count = 0
        for key, values in contract_dic.items():
            ds.make_directory(directory, key)
            contract_dir = directory + key
            options = webdriver.ChromeOptions()
            prefs = {"download.default_directory": contract_dir,
                     "plugins.always_open_pdf_externally": True}
            options.add_experimental_option('prefs', prefs)
            for val in values:
                driver = webdriver.Chrome(options=options)
                driver.get(val)
                time.sleep(5)
                elm = driver.find_element_by_xpath("//*[@id='pdfFrame']")
                url = elm.get_attribute('src')
                driver.get(url)
                time.sleep(7)
                driver.quit()
                ds.rename_directory(contract_dir, 'DPSWebDocumentViewer.pdf',
                                    f'document{count}.pdf')
                count += 1
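# A minimal usage sketch. The class that owns scrape_pdf is not shown in this
# snippet, so the instantiation below is a hypothetical stand-in, and the
# directory and urls are placeholders. Each key becomes a new folder, and
# every url in its value list is downloaded into that folder via Chrome.
#
#   contract_dic = {'lease_agreement': ['https://example.com/contract1',
#                                       'https://example.com/contract2']}
#   scraper = ContractScraper()  # hypothetical enclosing class
#   scraper.scrape_pdf('/path/to/contracts/', contract_dic)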
Example #9
class ContractPipeline():
    def __init__(self, directory, stop_words):
        """
        Instantiates a ContractPipeline Object

        Params
        directory: str for main directory where the folders for the documents are stored
        stop_words: list of words that will be removed from the documents
        """
        self.directory = directory
        self.stop_words = stop_words
        self.ds = DirectoryAssistor()
        self.individual_bag_o_words = {}

    def get_list_of_docs(self):
        """
        Creates
        target_lst: list that has all of the types for each document in the
        corresponding index of doc_text_lst
        doc_text_lst: list of lowercased cleaned strings for the text in each document
        """
        print('Converting to txt lists')
        start_time = time.time()

        folder_lst = self.ds.create_content_list(self.directory)
        doc_lst = []
        self.target_lst = []
        self.doc_text_lst = []

        for i in range(len(folder_lst)):
            doc_lst.append(
                self.ds.create_content_list(self.directory + folder_lst[i]))
            self.individual_bag_o_words[folder_lst[i]] = []
            for j in range(len(doc_lst[i])):
                text = textract.process(self.directory + folder_lst[i] + '/' +
                                        doc_lst[i][j])
                # convert to str
                text = text.decode('utf-8')
                # lowercase all text
                text = text.lower()
                # remove all punctuation
                text = re.sub(r'\W+', ' ', text)
                # remove underscores
                text = text.replace("_", "")
                self.doc_text_lst.append(text)
                self.target_lst.append(folder_lst[i])
                self.individual_bag_o_words[folder_lst[i]].append(text.split())
        lst = []
        for val in self.target_lst:
            lst.append(val.replace('_', ' '))
        self.target_lst = lst

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def get_list_of_txts(self):
        """
        Creates
        target_lst: list that has all of the types for each document in the
        corresponding index of doc_text_lst
        doc_text_lst: list of lowercased cleaned strings for the text in each document
        """
        print('Converting to txt lists')
        start_time = time.time()

        folder_lst = self.ds.create_content_list(self.directory)
        doc_lst = []
        self.target_lst = []
        self.doc_text_lst = []

        for i in range(len(folder_lst)):
            doc_lst.append(
                self.ds.create_content_list(self.directory + folder_lst[i]))
            self.individual_bag_o_words[folder_lst[i]] = []
            for j in range(len(doc_lst[i])):
                # read in file as str
                try:
                    with open(
                            self.directory + folder_lst[i] + '/' +
                            doc_lst[i][j], 'r') as f:
                        text = f.read().replace('\n', '')
                except Exception:
                    # Skip files that cannot be opened or decoded
                    continue

                # lowercase all text
                text = text.lower()

                # remove all punctuation
                text = re.sub(r'\W+', ' ', text)

                # remove underscores
                text = text.replace("_", "")

                self.doc_text_lst.append(text)
                self.target_lst.append(folder_lst[i])
                self.individual_bag_o_words[folder_lst[i]].append(text.split())
        lst = []
        for val in self.target_lst:
            lst.append(val.replace('_', ' '))
        self.target_lst = lst

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def bag_o_words(self):
        """
        Builds a word-count Counter for each folder in
        self.individual_bag_o_words and one for the whole corpus in
        self.total_bag_o_words
        """
        print('Creating bag o words')
        start_time = time.time()

        for key in self.individual_bag_o_words.keys():
            lst = []
            for val in self.individual_bag_o_words[key]:
                for word in val:
                    lst.append(word)
            self.individual_bag_o_words[key] = Counter(lst)
        total_word_lst = []
        for i in self.doc_text_lst:
            lst = i.split()
            for j in lst:
                total_word_lst.append(j)
        self.total_bag_o_words = Counter(total_word_lst)

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def join_list_of_strings(self, lst):
        """
        Joins the list into a string
        
        Params
        lst: list of words
        """
        return [" ".join(x) for x in lst]

    def remove_stop_words(self):
        """
        Creates new lists of strings with stop words and standalone numbers removed

        stops_removed_str: list of strings with stop words removed
        stops_removed_lst: list of lists containing words with stops removed
        """
        print('Removing stop words')
        start_time = time.time()

        split_lst = [txt.split() for txt in self.doc_text_lst]
        self.stops_removed_lst = []

        for split in split_lst:
            stops = [w for w in split if w not in self.stop_words]
            stop_num = [
                w for w in stops
                if not (w.isdigit() or w[0] == '-' and w[1:].isdigit())
            ]
            self.stops_removed_lst.append(stop_num)

        self.stops_removed_str = self.join_list_of_strings(
            self.stops_removed_lst)

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def word_condenser(self):
        """
        Stems and lemmatizes the stop-word-free word lists with the Porter,
        Snowball, and WordNet techniques

        self.porter_str: list of strings with porter stem technique used
        self.snowball_str: list of strings with snowball stem technique used
        self.wordnet_str: list of strings with wordnet lemmatize technique used
        """
        print('Condensing')
        start_time = time.time()

        porter = PorterStemmer()
        snowball = SnowballStemmer('english')
        wordnet = WordNetLemmatizer()

        porter_lst = [[porter.stem(w) for w in words]
                      for words in self.stops_removed_lst]
        snowball_lst = [[snowball.stem(w) for w in words]
                        for words in self.stops_removed_lst]
        wordnet_lst = [[wordnet.lemmatize(w) for w in words]
                       for words in self.stops_removed_lst]

        self.porter_str = self.join_list_of_strings(porter_lst)
        self.snowball_str = self.join_list_of_strings(snowball_lst)
        self.wordnet_str = self.join_list_of_strings(wordnet_lst)

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def count_vectorizer(self, max_features=None, ngram_range=(1, 1)):
        """
        Sets up a word count matrix, a tfidf matrix, and a CountVectorizer for
        the documents in the directory

        Params
        documents: list of strings to be vectorized

        Returns
        count_matrix: matrix with word counts
        tfidf_matrix: a tfidf matrix of the documents
        cv: CountVectorizer object for the documents
        """
        print('Generating tfidf and count matrix')
        start_time = time.time()

        self.cv = CountVectorizer(max_features=max_features,
                                  ngram_range=ngram_range)
        self.tf_matrix = self.cv.fit_transform(self.stops_removed_str)

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')

    def tf_vect(self, documents, max_features=None, ngram_range=(1, 1)):
        """
        Creates a tf-idf matrix (self.tfidf) from documents

        Params
        documents: list of strings
        """
        print('Generating tfidf')
        start_time = time.time()

        self.vect = TfidfVectorizer(max_features=max_features,
                                    ngram_range=ngram_range)
        self.tfidf = self.vect.fit_transform(documents)

        end_time = time.time()
        print(f'This took {end_time-start_time:.2f} seconds')
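# A minimal usage sketch of the pipeline above, calling the methods in the
# order they depend on one another. It assumes DirectoryAssistor, textract,
# the nltk stemmers/lemmatizer, and the sklearn vectorizers are imported as
# the class requires; the directory and stop word list are hypothetical.
if __name__ == '__main__':
    cp = ContractPipeline('/path/to/contracts/', stop_words=['the', 'and', 'of'])
    cp.get_list_of_txts()              # builds doc_text_lst and target_lst
    cp.bag_o_words()                   # per-folder and corpus-wide Counters
    cp.remove_stop_words()             # builds stops_removed_str / _lst
    cp.word_condenser()                # porter_str, snowball_str, wordnet_str
    cp.tf_vect(cp.stops_removed_str)   # tf-idf matrix stored in cp.tfidf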
class ContractSifter():

    def __init__(self, directory, stop_words):
        self.directory = directory
        self.ds = DirectoryAssistor()
        self.stop_words = stop_words
        self.porter_dic = {}
        self.snowball_dic = {}
        self.wordnet_dic = {}
        self.combined_dic = {}

    def create_dic(self):
        """
        Returns a dictionary with folder names as the keys and an empty lst 
        as values

        params

        folder_names: list of folder names in the directory

        Returns 
        Dictionary
        """
        lst = self.ds.create_content_list(self.directory)
        word_dic = {key: [] for key in lst}
        return word_dic
    
    def remove_stop_words(self, lst):
        return [w for w in lst if w not in self.stop_words]
    
    def add_words(self):
        """
        Adds words from the files in the directories that are associated with
        the keys in the self.word_dic

        Returns
        self.word_dic with a list of words with the following removed from each 
        file in the folder for that key:
            1. Stop words
            2. Punctuation
            3. Underscores
        """
        self.word_dic = self.create_dic()
        for key in self.word_dic.keys():
            lst = self.ds.create_content_list(self.directory+key)
            for file in lst:
                full_text = textract.process(self.directory+key+'/'+file)
                str_full_text = full_text.decode('utf-8')
                lower_full_text = str_full_text.lower()
                edited_text = re.sub(r'\W+', ' ', lower_full_text)
                edited_text = edited_text.replace("_","")
                tokens = word_tokenize(edited_text)
                stop_lst = self.remove_stop_words(tokens)
                self.word_dic[key].append(stop_lst)
    
    def combine_word_lists(self):
        """
        Combine all of the lists for a key into one list from the Pipeline
        word_dic attribute
        """
        for key in self.word_dic.keys():
            result = []
            for lst in self.word_dic[key]:
                result.extend(lst)
            self.combined_dic[key] = result
    
    def word_condenser(self):
        """
        Stems and lemmatizes the combined word list for each key, filling
        self.porter_dic, self.snowball_dic, and self.wordnet_dic
        """
        porter = PorterStemmer()
        snowball = SnowballStemmer('english')
        wordnet = WordNetLemmatizer()
        for key in self.combined_dic.keys():
            porter_lst = []
            snowball_lst = []
            wordnet_lst = []
            for word in self.combined_dic[key]:
                porter_lst.append(porter.stem(word))
                snowball_lst.append(snowball.stem(word))
                wordnet_lst.append(wordnet.lemmatize(word))
            self.porter_dic[key] = porter_lst
            self.snowball_dic[key] = snowball_lst
            self.wordnet_dic[key] = wordnet_lst
    
    def word_count(self, dic):
        """
        Returns the count of the words in each key of the dictionary

        Params
        dic: dict for which the words will be counted

        Returns
        word_count_dic: dict with a word Counter for each key
        """
        word_count_dic = {}
        for key, val in dic.items():
            word_count_dic[key] = Counter(val)
        return word_count_dic

    def word_cloud(self, dic):
        """
        Generates a word cloud for each key in the dic

        Params
        dic: dict for which the word cloud will be generated

        Returns
        Plot with a word cloud for each key in dic
        """

        word_cloud_dic = {}
        for key, val in dic.items():
            word_cloud_dic[key] = ' '.join(val)
        wc_lst = []
        for val in word_cloud_dic.values():
            wc = WordCloud(width=1000, height=1000, background_color='white',
                           min_font_size=9)
            wc_lst.append(wc.generate(val))
        fig, axs = plt.subplots(3, 3, figsize=(15, 12))
        titles = list(dic.keys())
        for cloud, title, ax in zip(wc_lst, titles, axs.flatten()):
            chartBox = ax.get_position()
            ax.set_position(pos=[chartBox.x0, chartBox.y0,
                                 chartBox.width * 1.05, chartBox.height * 1.05])
            ax.imshow(cloud)
            ax.set_title(title, fontsize=16, weight='bold')
            ax.axis("off")
        axs[2, 1].set_axis_off()
        axs[2, 2].set_axis_off()
        chartBox = axs[2, 0].get_position()
        axs[2, 0].set_position(pos=[chartBox.x0 * 2.8, chartBox.y0 * .9,
                                    chartBox.width * 1.05, chartBox.height * 1.05])
        plt.show()
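# A minimal usage sketch of the sifter above. It assumes DirectoryAssistor,
# textract, nltk's word_tokenize and stemmers, wordcloud's WordCloud, and
# matplotlib are imported as the class requires; the directory and stop word
# list are hypothetical, and word_cloud's 3x3 grid is laid out for seven
# folders (two of the nine axes are switched off).
if __name__ == '__main__':
    cs = ContractSifter('/path/to/contracts/', stop_words=['the', 'and', 'of'])
    cs.add_words()                          # tokenize every file per folder
    cs.combine_word_lists()                 # flatten each folder's word lists
    cs.word_condenser()                     # stem/lemmatize each word list
    counts = cs.word_count(cs.wordnet_dic)  # Counter of words per folder
    cs.word_cloud(cs.combined_dic)          # one word cloud per folder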
    
if __name__ == '__main__':
    directory='/Users/justinlansdale/Documents/Galvanize/Capstone3/EC2Data/\
changeFolder/pdfs/Original/'
    out_dir = '/Users/justinlansdale/Documents/Galvanize/Capstone3/EC2Data/\
changeFolder/Amended/'

    da = DirectoryAssistor()
    doc_lst = da.create_content_list(directory)
    print(doc_lst)
    start = time.time()
    
    for file_name in doc_lst:
        try:
            main(directory, file_name, out_dir)
        except Exception:
            # Skip files that fail OCR conversion and keep processing the rest
            continue
    end = time.time()
    print(end-start)
class AmendmentDetector():
    def __init__(self, original_doc_dir, amended_doc_dir, original_txt_dir,
                 amended_txt_dir):
        """
        Instantiates the amendment detection class
        
        Params:

        original_doc_dir: str directory where the original document is located
        amended_doc_dir: str directory where the amended document is located
        original_txt_dir: str directory where the original txt file will be
        stored
        amended_txt_dir: str directory where the amended txt file will be 
        stored
        """

        self.original_doc_dir = original_doc_dir
        self.amended_doc_dir = amended_doc_dir
        self.original_txt_dir = original_txt_dir
        self.amended_txt_dir = amended_txt_dir
        self.da = DirectoryAssistor()

    def convert_original_to_txt(self):
        """
        Converts the original doc to a .txt file
        """

        doc_lst = self.da.create_content_list(self.original_doc_dir)

        for file_name in doc_lst:
            try:
                main(self.original_doc_dir, file_name, self.original_txt_dir)
            except Exception:
                # Skip documents that fail conversion
                continue

    def convert_amended_to_txt(self):
        """
        Converts amended doc to .txt file
        """
        doc_lst = self.da.create_content_list(self.amended_doc_dir)

        for file_name in doc_lst:
            try:
                main(self.amended_doc_dir, file_name, self.amended_txt_dir)
            except Exception:
                # Skip documents that fail conversion
                continue

    def read_in_files(self):
        """
        Reads in txt files
        """

        original_lst = self.da.create_content_list(self.original_txt_dir)
        amended_lst = self.da.create_content_list(self.amended_txt_dir)
        for doc in original_lst:
            try:
                with open(self.original_txt_dir + doc, 'r') as f:
                    self.original = f.read().replace('\n', ' ')
            except Exception:
                # Skip unreadable files; the last readable file wins
                continue

        for doc in amended_lst:
            try:
                with open(self.amended_txt_dir + doc, 'r') as f:
                    self.amended = f.read().replace('\n', ' ')
            except Exception:
                # Skip unreadable files; the last readable file wins
                continue

    def print_changes(self):
        """
        Prints changes made in the amended document
        """

        original_lst = self.original.split()
        amended_lst = self.amended.split()
        original_value = []
        amended_value = []
        change_ref = []
        # Compare word-by-word up to the shorter document's length
        for i in range(min(len(original_lst), len(amended_lst))):
            if original_lst[i] != amended_lst[i]:
                original_value.append(original_lst[i])
                # Keep up to 40 words of context on each side of the change
                change_ref.append(original_lst[max(i - 40, 0):i + 40])
                amended_value.append(amended_lst[i])

        for i in range(len(original_value)):
            print(f'\n Change # {i+1}: {original_value[i]} changed to \
{amended_value[i]} \n \n Reference text: {" ".join(change_ref[i])} \n')
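# A minimal usage sketch of the detector above. It assumes the OCR driver
# main() defined earlier and DirectoryAssistor are importable; the four
# directories are hypothetical.
if __name__ == '__main__':
    ad = AmendmentDetector('/path/to/original/docs/', '/path/to/amended/docs/',
                           '/path/to/original/txt/', '/path/to/amended/txt/')
    ad.convert_original_to_txt()   # OCR the original pdf(s) into .txt
    ad.convert_amended_to_txt()    # OCR the amended pdf(s) into .txt
    ad.read_in_files()             # load both .txt files into memory
    ad.print_changes()             # word-level diff with surrounding context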