import os
import re
import zipfile
from io import StringIO
from xml.etree.ElementTree import XML

import nltk
import pytesseract
from ftfy import fix_text
from nltk.tokenize import word_tokenize
from pdf2image import convert_from_path
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from PIL import Image
from pptx import Presentation

# Standard WordprocessingML namespace: paragraph and text tags in word/document.xml
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'

# requirements(), clean_text() and concat_str_list() are helpers defined
# elsewhere in this module.


def ppt_extractor(path):
    # Dictionary holding the pptx content, indexed per slide
    paragraph_repo = {}
    # Running string of all concatenated text, for NLP purposes
    s = ''
    slide_nb = 0
    with open(path, "rb") as f:
        # Parse the file with Presentation from python-pptx
        prs = Presentation(f)
        for slide in prs.slides:
            slide_nb += 1
            # Temporary container for this slide's text
            temp_text = ''
            for shape in slide.shapes:
                # Keep the shape's text only if it is non-empty
                if hasattr(shape, "text") and shape.text.strip():
                    temp_text += shape.text
            # Index the slide's text by slide number
            paragraph_repo[str(slide_nb)] = fix_text(temp_text)
            # Append to the running string for NLP purposes
            s += fix_text(temp_text)
    return paragraph_repo, s
def txt_extractor(path):
    # Dictionary holding the txt file content, indexed per paragraph
    doc = {}
    paragraph_nb = 1
    # Running string of all concatenated text, for NLP purposes
    s = ""
    with open(path) as f:
        lines = f.read()
    # Split the document on blank lines (paragraph breaks)
    texts = lines.strip().split("\n\n")
    for text in texts:
        # Index the paragraph text by paragraph number
        doc[str(paragraph_nb)] = fix_text(text)
        # Append to the running string for NLP purposes
        s += fix_text(text)
        paragraph_nb += 1
    return doc, s
def pdf_extractor(path):
    # PDFResourceManager shares resources across pages
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    # TextConverter renders PDF page layout into plain text
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # A password may be required to open the PDF
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    current_page_number = 1
    # Dictionary holding the PDF content, indexed per page
    paragraph_repo = {}
    # Running string of all concatenated text, for NLP purposes
    s = ''
    # Loop over every page PDFPage.get_pages identifies with the options above
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
        # Retrieve the text rendered so far, then reset the buffer
        text = retstr.getvalue()
        retstr.truncate(0)
        retstr.seek(0)
        # Strip null characters left over from encoding issues
        text = re.sub(u'\u0000', "", text)
        # Index the page text by page number
        paragraph_repo[str(current_page_number)] = fix_text(text)
        # Append to the running string for NLP purposes
        s += fix_text(text)
        current_page_number += 1
    fp.close()
    device.close()
    retstr.close()
    return paragraph_repo, s
def docxExtractor(path):
    # Extract content from .docx files; expects a path ending in .docx
    # A .docx file is a zip archive, so read it with zipfile
    document = zipfile.ZipFile(path)
    # The body text lives in word/document.xml
    xml_content = document.read('word/document.xml')
    document.close()
    # Build an XML tree from the document body
    tree = XML(xml_content)
    # Dictionary holding the content, indexed per paragraph
    doc = {}
    # Running string of all concatenated text, for NLP purposes
    s = ''
    paragraph_nb = 1
    # Iterate over every paragraph element of the XML tree
    for paragraph in tree.iter(PARA):
        # Collect this paragraph's non-empty text nodes
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            # Concatenate the text runs into one paragraph string
            text = ''.join(texts)
            # Index the paragraph text by paragraph number
            doc[str(paragraph_nb)] = fix_text(text)
            # Append to the running string for NLP purposes
            s += fix_text(text)
            paragraph_nb += 1
    return doc, s
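
# The four extractors above share the same (dict, str) return shape, so a
# small dispatcher can route a file by extension. A minimal sketch, assuming
# nothing beyond the functions above; the `extract` name and the extension
# checks are illustrative, not part of the original module.
def extract(path):
    if path.endswith('.pptx'):
        return ppt_extractor(path)
    if path.endswith('.txt'):
        return txt_extractor(path)
    if path.endswith('.docx'):
        return docxExtractor(path)
    if path.endswith('.pdf'):
        return pdf_extractor(path)
    raise ValueError("Unsupported file type: " + path)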
def preprocessing(text):
    stopword_list, lemmatizer, transformation_sc_dict = requirements()
    # Tokenization; fetch the punkt model on a missing-resource error
    try:
        tokens = word_tokenize(text)
    except LookupError as e:
        print(e)
        print("downloading for you!")
        nltk.download('punkt')
        tokens = word_tokenize(text)
    # Drop tokens of two characters or fewer
    tokens = [token for token in tokens if len(token) > 2]
    # Lowercase and remove stopwords
    tokens = [
        token.lower() for token in tokens
        if token.lower() not in stopword_list
    ]
    # Strip specific characters via the transformation table
    tokens = [
        token.translate(str.maketrans(transformation_sc_dict))
        for token in tokens
    ]
    # Lemmatize each token as adjective, then verb, then noun
    try:
        tokens = [
            lemmatizer.lemmatize(lemmatizer.lemmatize(
                lemmatizer.lemmatize(token, pos='a'), pos='v'), pos='n')
            for token in tokens
        ]
    except LookupError as e:
        print(e)
        print("downloading for you!")
        nltk.download('wordnet')
        tokens = [
            lemmatizer.lemmatize(lemmatizer.lemmatize(
                lemmatizer.lemmatize(token, pos='a'), pos='v'), pos='n')
            for token in tokens
        ]
    # Final cleaning of additional characters
    tokens = [fix_text(clean_text(token)) for token in tokens]
    return concat_str_list(tokens)
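
# Usage sketch chaining extraction and preprocessing; the demo function
# name and sample path are hypothetical, not part of the original module.
def demo_preprocess(path="sample.pdf"):
    paragraph_repo, s = pdf_extractor(path)
    # s is the full concatenated text; preprocessing() returns it tokenized,
    # lowercased, stopword-free and lemmatized
    return preprocessing(s)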
def scan_extractor(path):
    # Dictionary holding the content of a scanned document that the
    # pdfminer-based extractor failed to handle, indexed per page
    paragraph_repo = {}
    # Running string of all concatenated text, for NLP purposes
    s = ''
    # Part 1 - render every page of the PDF to an image at 500 DPI
    pages = convert_from_path(path, 500)
    # Counter used to name each page image: page_1.jpg, page_2.jpg, ...
    image_counter = 1
    photos = []
    for page in pages:
        filename = "page_" + str(image_counter) + ".jpg"
        photos.append(filename)
        # Write the page image to disk
        page.save(filename, 'JPEG')
        image_counter += 1
    # Part 2 - recognize text from the images using OCR
    filelimit = image_counter - 1
    for i in range(1, filelimit + 1):
        filename = "page_" + str(i) + ".jpg"
        # Recognize the text in the image with pytesseract
        text = str(pytesseract.image_to_string(Image.open(filename)))
        # Many PDFs hyphenate a word split across lines ('GeeksF-\norGeeks'),
        # so rejoin those words by replacing every '-\n' with ''
        text = fix_text(text.replace('-\n', ''))
        paragraph_repo[str(i)] = text
        s += text
    # Remove the temporary page images
    for file in photos:
        os.remove(file)
    return paragraph_repo, s
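
# scan_extractor() exists for PDFs whose pages are images, where
# pdf_extractor() returns only whitespace. One plausible fallback policy,
# sketched under the assumption that an empty concatenated string means
# the PDF has no embedded text layer; the function name is illustrative.
def robust_pdf_extract(path):
    paragraph_repo, s = pdf_extractor(path)
    if not s.strip():
        # No text layer found: rasterize the pages and OCR them instead
        paragraph_repo, s = scan_extractor(path)
    return paragraph_repo, s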