def test_missing_filename_python(self):
    """A path that no longer exists on disk must raise MissingFileError."""
    import textract
    from textract.exceptions import MissingFileError

    filename = self.get_temp_filename()
    os.remove(filename)
    with self.assertRaises(MissingFileError):
        textract.process(filename)
def test_unsupported_extension_python(self):
    """A file with an unknown extension must raise ExtensionNotSupported."""
    import textract
    from textract.exceptions import ExtensionNotSupported

    filename = self.get_temp_filename(extension="extension")
    with self.assertRaises(ExtensionNotSupported):
        textract.process(filename)
    os.remove(filename)
def annotate_doc(pdf_file_path, ontologies):
    """Extract text from a document and store Bioportal annotations for it.

    Sends the cleaned text to the Bioportal Annotator service and inserts
    the resulting annotation records into the database.  Failures (empty
    text, bad service response) are written to the log table instead.

    :param pdf_file_path: path to a .pdf/.PDF, .html/.htm or .txt file.
        NOTE(review): any other extension leaves ``text`` unbound and the
        later ``text.isspace()`` call raises NameError — confirm callers
        only pass these extensions.
    :param ontologies: iterable of ontology acronyms to restrict matching.
    """
    if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
        text = textract.process(pdf_file_path, method="pdfminer")
    elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
        text = textract.process(pdf_file_path, method="beautifulsoup4")
    elif pdf_file_path.endswith('txt'):
        with open(pdf_file_path, 'r') as file:
            text = file.read()
    db = DBConnect()
    # Whitespace-only extraction means the PDF-to-text step failed.
    if text.isspace():
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Failed PDF to text transformation in annotation process',
            'exception': '',
            'data': ''
        }
        db.insert_log(log)
        return
    ontologies = ",".join(ontologies)
    annotations = []
    # Transliterate to ASCII and collapse all runs of whitespace.
    text = unidecode(text.decode('utf8'))
    text = ' '.join(text.split())
    post_data = dict(apikey=settings.BIOPORTAL_API_KEY, text=text,
                     display_links='true', display_context='false',
                     minimum_match_length='3', exclude_numbers='true',
                     longest_only='true', ontologies=ontologies,
                     exclude_synonyms='true')
    try:
        response = requests.post(settings.ANNOTATOR_URL, post_data)
        json_results = json.loads(response.text)
        for result in json_results:
            for annotation in result['annotations']:
                # Grab up to 40 characters of context on either side of
                # the match, clamped to the bounds of the text.
                context_begin = annotation['from'] if annotation['from'] - 40 < 1 else annotation['from'] - 40
                context_end = annotation['to'] if annotation['to'] + 40 > len(text) else annotation['to'] + 40
                record = {
                    'file_name': pdf_file_path.encode('utf-8'),
                    'bio_class_id': result['annotatedClass']['@id'],
                    'bio_ontology_id': result['annotatedClass']['links']['ontology'],
                    'text': u'' + annotation['text'].encode('utf-8'),
                    'match_type': annotation['matchType'],
                    'context': u'' + text[context_begin:context_end]
                }
                annotations.append(record)
        db.insert_annotations(annotations)
        return
    except (ValueError, IndexError, KeyError) as e:
        # Malformed / unexpected JSON from the annotator service.
        print e
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Bad response from Bioportal Annotator',
            'exception': str(e),
            'data': ''
        }
        db.insert_log(log)
        return
def pdftotext_any(myfile):
    """Extract text from a PDF supplied as raw bytes.

    Tries the fast ``pdftotext`` backend first; if it yields almost no
    text the document is assumed to be a scanned image, and OCR
    (``tesseract``) is used as a fallback.

    :param myfile: PDF file content as a byte string.
    :return: extracted text as returned by textract.
    """
    import tempfile
    # A unique temp file replaces the original hard-coded
    # '/tmp/infile.pdf', which raced between concurrent callers and
    # was never removed (the old TODO asked for exactly this).
    with tempfile.NamedTemporaryFile(suffix='.pdf') as f:
        f.write(myfile)
        f.flush()
        text = textract.process(f.name, method='pdftotext')
        if len(text) < 5:
            # No text found: probably an image scan, so run OCR.
            text = textract.process(f.name, method='tesseract')
    return text
def build_indexes(files_list, index_file):
    """Build a token -> {file: occurrence_count} index and dump it as JSON.

    Extracts text from every file in *files_list*, tokenizes it on
    punctuation/whitespace, and writes the nested count index to
    *index_file*.  A simple progress bar is drawn on stdout.

    :param files_list: iterable of file paths to index.
    :param index_file: path of the JSON output file.
    """
    toolbar_width = len(files_list)
    print(toolbar_width)
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['
    hash_index = {}
    for item in files_list:
        text = textract.process(item)
        details = re.split("[, \t\n\\t:;]", text)
        for token in details:
            if token == "":
                continue
            # dict.has_key() was removed in Python 3; setdefault + get
            # collapses the original duplicated new-token/new-file
            # branches into one equivalent path.
            file_counts = hash_index.setdefault(token, {})
            file_counts[item] = file_counts.get(item, 0) + 1
        # update the bar
        sys.stdout.write("-")
        sys.stdout.flush()
    sys.stdout.write("\n")
    # Context manager guarantees the index file is closed on error.
    with open(index_file, "w") as fp:
        json.dump(hash_index, fp)
def get_text_from_file(self, file):
    """Download *file*, extract its text, remove the local copy, return the text."""
    local_pdf = '{}.pdf'.format(file['id'])
    self._download_file(file, local_pdf)
    extracted = textract.process(local_pdf)
    os.remove(local_pdf)
    return extracted
def extract_text_from_lectureDocuments(self):
    """Extract text for every un-processed lectureDocuments row.

    For each document: run textract, rebuild its tag set from the
    TextBlob noun phrases of the extracted text, store the text on the
    model, and mark it as extracted.
    """
    # pull files from database (only rows not yet processed)
    lectureDocumentsObjects = lectureDocuments.objects.filter(extracted=False)
    # loop through modules and pull all text
    for lectureDocumentsObject in lectureDocumentsObjects:
        if lectureDocumentsObject.document:
            print lectureDocumentsObject.document
            path_to_file = MEDIA_ROOT + '/' + str(lectureDocumentsObject.document)
            document_contents = textract.process(path_to_file, encoding='ascii')
            # create tags from noun_phrases; de-duplicate and drop empties
            blobbed = TextBlob(document_contents)
            np = blobbed.noun_phrases
            np = list(set(np))
            np = [s for s in np if s]
            # Existing tags are discarded and rebuilt from scratch.
            lectureDocumentsObject.tags.clear()
            for item in np:
                # Strip characters listed in the module-level `exclude` set
                # (defined outside this view — presumably punctuation).
                s = ''.join(ch for ch in item if ch not in exclude)
                print s
                lectureDocumentsObject.tags.add(s)
            # save the extracted text and flag the row as done
            lectureDocumentsObject.document_contents = document_contents
            lectureDocumentsObject.extracted = True
            lectureDocumentsObject.save()
def indexing():
    """Download documents listed in doc_links.txt and index them with Whoosh.

    Each URL's file is fetched (skipping anything larger than ~2.4 MB),
    its text is extracted with textract, appended to doc_content.txt,
    and added to the Whoosh index under data/pdf_data.
    """
    ana = analysis.StemmingAnalyzer()
    schema = Schema(title=TEXT(analyzer=ana, spelling=True),
                    path=ID(stored=True), content=TEXT)
    ix = create_in("data/pdf_data", schema)
    writer = ix.writer()
    count = 0
    with open('Final_Links/doc_links.txt') as fp, \
            open('data/pdf_data/mytemp/doc_content.txt', 'w+') as f:
        for line in fp:
            count += 1
            url = line
            # Last path component of the URL becomes the local file name.
            doc_name = re.search('.*/(.*)', url).group(1)
            try:
                response = urllib2.urlopen(url, timeout=3)
                # Skip overly large downloads.
                if int(response.headers['content-length']) > 2475248:
                    continue
                fil = open("data/pdf_data/mytemp/" + doc_name, 'w+')
                fil.write(response.read())
                fil.close()
                content_text = textract.process('data/pdf_data/mytemp/' + doc_name,
                                                encoding='ascii')
                f.write(content_text)
                writer.add_document(title=unicode(url, "utf-8"),
                                    path=unicode(url, "utf-8"),
                                    content=unicode(content_text))
                # NOTE(review): this second add_document indexes the URL
                # string itself as content for the same title/path —
                # looks like leftover debugging; confirm it is intended.
                writer.add_document(title=unicode(url, "utf-8"),
                                    path=unicode(url, "utf-8"),
                                    content=unicode(url))
            except Exception as e:
                print "Caught exception e at " + '' + str(e)
                continue
            print str(count) + " in " + " URL:" + url
    writer.commit()
    print "Indexing Completed !"
def save(self, *args, **kwargs):
    """Persist the document, then derive word-count statistics from its text.

    Saved twice on purpose: the first save makes the uploaded file
    available (so ``source_file.url`` is usable by textract), the second
    stores the computed counts.
    """
    super(Document, self).save(*args, **kwargs)
    extracted = textract.process(self.source_file.url)
    stems = self.get_filtered_stems(extracted)
    self.total_word_count = len(stems)
    self.count_target_words(stems)
    super(Document, self).save(*args, **kwargs)
def extract(path):
    '''
    Extract full text from PDFs.

    :param path: [String] Path to a pdf file downloaded via {fetch}, or another way.

    :return: [str] a string of text

    Usage::

        from pyminer import miner

        # a pdf
        url = "http://www.banglajol.info/index.php/AJMBR/article/viewFile/25509/17126"
        out = miner.fetch(url)
        out.parse()

        # search first, then pass links to fetch
        res = miner.search(filter = {'has_full_text': True, 'license_url': "http://creativecommons.org/licenses/by/4.0"})
        # url = res.links_pdf()[0]
        url = 'http://www.nepjol.info/index.php/JSAN/article/viewFile/13527/10928'
        x = miner.fetch(url)
        miner.extract(x.path)
    '''
    return textract.process(path)
def process_text_file(file_path):
    """Normalise an input file to a plain-text path.

    - ``.txt``: returned unchanged.
    - ``.epub``: converted via textract to a sibling ``.txt`` file whose
      path is returned.
    - no extension: returned unchanged if it can be read as text.
    - anything else: unsupported, returns None.

    :return: a path to a text file, or None on failure.
    """
    file_name, extension = os.path.splitext(file_path)
    print file_name, extension
    if (extension == ".txt"):
        return file_path
    elif (extension == '.epub'):
        print "Trying epub"
        try:
            text = textract.process(file_path)
            print "Processed epub: ", file_path
            output_path = file_name + '.txt'
            # NOTE(review): output_file is never closed — rely on GC or
            # switch to a `with` block.
            output_file = open(output_path, 'w')
            output_file.write(text)
            print "Converted epub: ", output_path
            return output_path
        except Exception as error:
            # TODO: textract raises own error so none isn't returned on try failure
            print error
            print 'Failed to convert epub: ', file_path
            return None
    elif (extension == ""):
        text_content = None
        try:
            with open(file_path) as input_file:
                text_content = input_file.read()
            if text_content:
                print "Managed to read file: ", file_path
                return file_path
        except IOError:
            print "Failed to read file: ", file_path
            return None
    else:
        print 'Unsupported file type: ', file_path
        return None
def get_path_details(cls, temp_path, image_path):
    """Return the byte sequence and the full text for a given path.

    Maps the file's MIME type to a textract extension and extracts the
    text; any extraction failure is logged and replaced with "N/A".

    :param temp_path: path of the temporary file to analyse.
    :param image_path: original image path, used only for log context.
    :return: tuple of (ByteSequence, full text string).
    """
    byte_sequence = ByteSequence.from_path(temp_path)
    extension = map_mime_to_ext(byte_sequence.mime_type, cls.mime_map)
    logging.debug("Assessing MIME: %s EXTENSION %s SHA1:%s",
                  byte_sequence.mime_type, extension, byte_sequence.sha1)
    full_text = ""
    if extension is not None:
        try:
            logging.debug("Textract for SHA1 %s, extension map val %s",
                          byte_sequence.sha1, extension)
            full_text = process(temp_path, extension=extension,
                                encoding='ascii', preserveLineBreaks=True)
        except ExtensionNotSupported:
            logging.exception("Textract extension not supported for ext %s",
                              extension)
            logging.debug("Image path for file is %s, temp file at %s",
                          image_path, temp_path)
            full_text = "N/A"
        except LookupError:
            logging.exception("Lookup error for encoding.")
            logging.debug("Image path for file is %s, temp file at %s",
                          image_path, temp_path)
            full_text = "N/A"
        except UnicodeDecodeError:
            logging.exception("UnicodeDecodeError, problem with file encoding")
            logging.debug("Image path for file is %s, temp file at %s",
                          image_path, temp_path)
            full_text = "N/A"
        except Exception:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; Exception keeps the catch-all for
            # unexpected textract failures without trapping interpreter
            # exits.
            logging.exception("Textract UNEXPECTEDLY failed for temp_file.")
            logging.debug("Image path for file is %s, temp file at %s",
                          image_path, temp_path)
            full_text = "N/A"
    return byte_sequence, full_text
def get_text_from_files(files_to_process):
    """Extracts text from each file given a list of file_names"""
    return {name: textract.process(name) for name in files_to_process}
def test_standardized_text_python(self):
    """Make sure standardized text matches from python"""
    import textract

    extracted = textract.process(self.standardized_text_filename)
    joined = ''.join(extracted.split())
    self.assertEqual(joined, self.get_standardized_text())
def parse_sentences(pdf):
    """Split the extracted text of *pdf* on sentence terminators,
    discarding any piece containing a literal '\\x' escape sequence."""
    extracted = textract.process(pdf)
    pieces = re.split("[.?!]", extracted)
    return [piece for piece in pieces if "\\x" not in piece]
def compare_python_output(self, filename, expected_filename=None, **kwargs):
    """Assert textract's output for *filename* equals the expected file's content."""
    import textract

    if expected_filename is None:
        expected_filename = self.get_expected_filename(filename, **kwargs)
    actual = textract.process(filename, **kwargs)
    with open(expected_filename) as stream:
        self.assertEqual(actual, stream.read())
def detectar(f):
    """Detect the language of document *f*; returns None when detection fails."""
    raw = textract.process(f).decode('utf-8')
    blob = textblob.TextBlob(raw)
    try:
        return blob.detect_language()
    except TranslatorError:
        return None
def test_standardized_text_python(self):
    """Make sure standardized text matches from python"""
    import textract

    extracted = textract.process(self.standardized_text_filename)
    joined = six.b('').join(extracted.split())
    self.assertEqual(
        joined,
        self.get_standardized_text(),
        "standardized text fails for %s" % self.extension,
    )
def get_recommendations_file(pdf_file_path):
    """Ask the Bioportal Recommender for ontologies matching a document.

    Extracts the document text, takes ~500 characters starting near the
    abstract, and posts it to the recommender service.

    :param pdf_file_path: path to a .pdf/.PDF, .html/.htm or .txt file.
        NOTE(review): any other extension leaves ``text`` unbound and the
        ``text.isspace()`` call below raises NameError.
    :return: list of {'acronym', 'id'} dicts, or [] on failure.
    """
    if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
        text = textract.process(pdf_file_path, method="pdfminer")
    elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
        text = textract.process(pdf_file_path, method="beautifulsoup4")
    elif pdf_file_path.endswith('txt'):
        with open(pdf_file_path, 'r') as file:
            text = file.read()
    if text.isspace():
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Failed PDF to text transformation in recommendation process',
            'exception': '',
            'data': ''
        }
        db = DBConnect()
        db.insert_log(log)
        return []
    # NOTE(review): summing three find() results adds -1 for every
    # variant that is absent, shifting the computed offset; presumably
    # the intent was "first match of any casing" — confirm.
    abstract_index = text.find('abstract')
    abstract_index += text.find('ABSTRACT')
    abstract_index += text.find('Abstract')
    abstract_index = 0 if abstract_index < 0 else abstract_index
    text = unidecode(text.decode('utf8'))
    text = ' '.join(text.split())
    # Only a ~500 character window is sent to the recommender.
    text = text[abstract_index:abstract_index + 500] if len(text) > 500 else text
    post_data = dict(apikey=settings.BIOPORTAL_API_KEY, input=text,
                     include='ontologies', display_links='false',
                     output_type='2', display_context='false', wc='0.15',
                     ws='1.0', wa='1.0', wd='0.5')
    try:
        response = requests.post(settings.RECOMMENDER_URL, post_data)
        json_results = json.loads(response.text)
        best_ontology_set = json_results[0]['ontologies'] if len(json_results) > 0 else []
        return [{'acronym': ontology['acronym'], 'id': ontology['@id']}
                for ontology in best_ontology_set]
    except (ValueError, IndexError, KeyError) as e:
        log = {
            'file_name': '',
            'error': 'Bad response from Bioportal Recommender:',
            'exception': str(e),
            'data': ''
        }
        db = DBConnect()
        db.insert_log(log)
        return []
def quickQuotes(fileName): fileText="" try: fileText=textract.process(fileName) except textract.exceptions.ExtensionNotSupported: pass except Exception as e: print e pass return fileText
def extract_all(self, src, maxpages=0):
    """Extract text from *src*, timing the attempt.

    Tries the instance's own extractor first and falls back to textract
    on any failure.  NOTE(review): sources without '.pdf' in the name
    fall through with no return value (None), and no value is returned
    even on success — the function may be truncated in this view.
    """
    if '.pdf' in src:
        try:
            start = time()
            text = self.extract(src, maxpages=maxpages)
            print "case 1 elapsed_time {}s".format(time() - start)
        except Exception, e:
            # Fallback path: plain textract extraction.
            start = time()
            text = textract.process(src)
            print "case 2 elapsed_time {}s".format(time() - start)
def compare_python_output(self, filename, expected_filename=None, **kwargs):
    """Assert that cleaned textract output matches the cleaned expected file."""
    import textract

    if expected_filename is None:
        expected_filename = self.get_expected_filename(filename, **kwargs)
    actual = self.clean_text(textract.process(filename, **kwargs))
    with open(expected_filename, "rb") as stream:
        wanted = self.clean_text(stream.read())
    assert actual == wanted
    self.assertEqual(actual, wanted)
def source():
    """Read source.pdf into a word list and pair it with a difficulty rating.

    The interactive file/difficulty prompts were disabled; difficulty is
    fixed at 0 (most explanatory).
    """
    words = textract.process("source.pdf").split()
    difficulty = int(0)
    return (words, difficulty)
def get_text(self, file):
    """Extract and whitespace-normalise the text of a PDF.

    :param file: path to the document, with or without the .pdf suffix.
    :return: decoded UTF-8 text on success, 0 on failure (return value
        kept for caller compatibility).
    """
    # BUG FIX: the original pattern r'.pdf' matched ANY character
    # followed by 'pdf' anywhere in the path (e.g. 'xpdf-notes'), so
    # such paths never got the suffix appended.  Anchor and escape to
    # test for a real .pdf extension.
    if not re.search(r'\.pdf$', file):
        file = file + ".pdf"
    try:
        text = textract.process(file)
        text = text.strip()
        # Collapse newlines, then all remaining whitespace runs.
        text = re.sub(b'\n+', b" ", text)
        # rb'...' avoids the invalid-escape warning of b'\s+'.
        text = re.sub(rb'\s+', b" ", text)
        return text.decode("utf-8")
    except Exception as e:
        print("file: {} not found\ninformation from textract:\n\t{}".format(file, e))
        return 0
def do_fetch_article(input_payload):
    """Fetch and extract the text of every article linked from a social post.

    For each URL in ``input_payload["urls"]``: download the page, extract
    its text with textract, best-effort collect metadata, delete the temp
    file, and build a result payload dict.

    :param input_payload: dict with an "urls" list of link dicts
        (expected keys: expanded_url, display_url, url).
    :return: list of result payload dicts.
    """
    logging.info("Fetching article from social post")
    result_payloads = []
    for link in input_payload["urls"]:
        url = link.get("expanded_url")
        display_url = link.get("display_url")
        shortened_url = link.get("url")
        file_name = download_file(url)
        text = textract.process(file_name)
        logging.info("Extracted article text ({} characters)".format(len(text)))
        metadata = {}
        # Metadata extraction is best-effort: failures are logged and an
        # empty dict is kept.
        try:
            metadata = extract_metadata(file_name)
        except:
            logging.exception("Failed to extract metadata from {}".format(url))
        delete_downloaded_file(file_name)
        logging.info("Deleted temp file: {}".format(file_name))
        result_payloads.append(
            {
                "contentType": "article-text",
                "key": url,
                "picture": get_favicon(url),
                "summary": {
                    "url": url,
                    "display_url": display_url,
                    "shortened_url": shortened_url,
                    "metadata": metadata
                },
                "raw": {
                    "text": text
                },
                "payload": {
                    "url": url,
                    "display_url": display_url,
                    "shortened_url": shortened_url,
                    "raw_text_size": len(text)
                },
                # Placement flags consumed downstream — semantics not
                # visible from this file.
                "placement_hints": {
                    "new_bldg": True,
                    "same_flr": False,
                    "flr_above": True,
                    "location_by_index": False,
                    "same_location": True,
                }
            }
        )
    return result_payloads
def crawl(self):
    """Extract the document's text and split it into lowercase paragraphs.

    Stores the raw text on ``self.content`` and the non-empty paragraph
    list on ``self.data``.  For "txt" documents it first tries CRLF
    paragraph breaks and falls back to LF breaks.
    """
    self.content = textract.process(self.document)
    if self.type == "txt":
        try:
            self.data = self.content.lower().split("\r\n\r\n")
            self.data = filter(partial(ne, ""), self.data)
        except:
            # NOTE(review): split/filter on a string don't normally
            # raise, so this fallback looks unreachable — confirm what
            # failure it was meant to catch.
            self.data = self.content.lower().split("\n\n")
            self.data = filter(partial(ne, ""), self.data)
    else:
        self.data = self.content.lower().split("\n\n")
        self.data = filter(partial(ne, ""), self.data)
def _convert_rtf_to_text(self, index):
    """Extract plain text from the CV file (``self.cvFile``) via textract.

    :param index: unused; kept for interface compatibility with callers.
    :return: the extracted text.
    """
    # The original also computed inputPath, input_filename and
    # input_parts and never used any of them; textract only needs the
    # path itself, so that dead code is removed.
    return textract.process(self.cvFile)
def convert(uploaded_file):
    """Extract text from an uploaded file and write it to result.txt.

    ``xls`` uploads get an extra XML-to-text conversion pass before
    being written out.
    """
    extension = uploaded_file.split('.')[-1]
    extracted = textract.process(path_to_files + uploaded_file)
    if extension == 'xls':
        extracted = convert_from_xml(extracted)
    with open(path_to_result + 'result.txt', "w") as text_file:
        text_file.write(extracted)
def get_features(**kwargs):
    """Yield one feature dict (meta-features plus extracted text) per RTF
    file found under ``kwargs['directory']``; import errors are delegated
    to HandleDatasetImportException."""
    directory = kwargs['directory']
    for rtf_path in RTFReader.get_file_list(directory, 'rtf'):
        try:
            feature_dict = RTFReader.get_meta_features(file_path=rtf_path)
            feature_dict['text'] = textract.process(rtf_path).decode('utf8')
            feature_dict['_texta_id'] = rtf_path
            yield feature_dict
        except Exception as err:
            HandleDatasetImportException(kwargs, err, file_path=rtf_path)
def word_count(pdf_file_path):
    """Return a Counter of upper-cased word tokens in the document.

    :param pdf_file_path: path to a .pdf/.PDF, .html/.htm or .txt file.
        NOTE(review): any other extension leaves ``text`` unbound and the
        ``text.isspace()`` call raises NameError.
    :return: collections.Counter of tokens, or [] when extraction yields
        only whitespace (note the inconsistent empty-list return type).
    """
    if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
        text = textract.process(pdf_file_path, method="pdfminer")
    elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
        text = textract.process(pdf_file_path, method="beautifulsoup4")
    elif pdf_file_path.endswith('txt'):
        with open(pdf_file_path, 'r') as file:
            text = file.read()
    if text.isspace():
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Failed PDF to text transformation in recommendation process',
            'exception': '',
            'data': ''
        }
        db = DBConnect()
        db.insert_log(log)
        return []
    text = unicode(text, 'utf-8')
    words = word_tokenize(text.upper())
    c = Counter()
    c.update(words)
    return c
def parsePDF_textract(path):
    """Extract and return the text of the PDF at *path* using textract.

    The original assigned the extracted text to a local and fell off the
    end, returning None; the text is now returned (backward-compatible
    for callers that ignored the result).
    """
    return textract.process(path)
date = (input('Input Date (mm/dd/yyyy)\n')) return date # In[20]: dic = {} cont = 1 n = 0 while cont == 1: input('Choose pdf to import \n (press enter when ready)') Tk().withdraw( ) # we don't want a full GUI, so keep the root window from appearing file = askopenfilename( ) # show an "Open" dialog box and return the path to the selected file text = textract.process(file) text = str(text) text = text.replace(':', '') text = text.replace('\\n', ' \n ') print(text) title = input('What is the Title of the Activity: \n ') os.system('clear') print(text) credit = find_credits(text) print(credit) os.system('clear') print(text) date = find_date(text) print(date) dic.update({title: [date, credit]}) n = n + 1
def matches(self, path: Path) -> Any:
    """Run the compiled expression against the extracted text of *path*."""
    extracted = textract.process(str(path)).decode("utf-8")
    return self.expr.search(extracted)
def get_site(url_string):
    """
    :param url_string: A url to be accessed
    :return: site data for a given urlString. Performs all necessary low
        level socket-http stuff.  The returned dict contains "url",
        "content_type" ("html" or "pdf"), "text", and "urlqueue" (links
        to crawl next); None is returned on any failure.
    """
    print("get_site(" + str(url_string) + ")")
    url = urlparse(url_string)
    return_value = {
        "url": url_string,
    }
    try:
        response = requests.get(url_string)
        if "text/html" in response.headers['content-type']:
            add_to_queue = []
            soup = BeautifulSoup(response.content, "html.parser")
            return_value["content_type"] = "html"
            return_value["text"] = soup.body.text
            print("body.text = " + return_value["text"])
            for link in soup.findAll('a'):
                try:
                    href = link.get('href')
                    current_scheme_prefix = url.scheme + "://"
                    parsed_href = urlparse(href)
                    # Relative links: prepend the current host, then the
                    # current scheme, re-parsing after each fix-up.
                    if not parsed_href.netloc:
                        href = url.netloc + href
                        parsed_href = urlparse(href)
                    if not parsed_href.scheme:
                        href = current_scheme_prefix + href
                        parsed_href = urlparse(href)
                    # Only crawl Singapore education domains.
                    if ".edu.sg" in parsed_href.netloc or "moe.gov.sg" in parsed_href.netloc:
                        add_to_queue.append(href)
                except Exception as e:
                    print("HTML Parse failed: ", e, flush=True)
                    pass
            return_value["urlqueue"] = add_to_queue
        elif "application/pdf" in response.headers['content-type']:
            # NOTE(review): fixed temp path — concurrent calls would
            # clobber each other's download.
            fileurl = "./tmp.pdf"
            with open(fileurl, "wb") as f:
                f.write(response.content)
            with open(fileurl, "rb") as f:
                pdf_reader = PyPDF2.PdfFileReader(f)
                num_pages = pdf_reader.numPages
                count = 0
                text = ""
                # Read each page's text in order.
                while count < num_pages:
                    page_object = pdf_reader.getPage(count)
                    count += 1
                    text += page_object.extractText()
                # PyPDF2 cannot read scanned files: if it returned no
                # words, fall back to OCR via textract/tesseract.
                if text != "":
                    text = text
                else:
                    text = textract.process(fileurl, method='tesseract', language='eng')
                # Pull any absolute/relative URLs out of the PDF text.
                all_web_or_relative_urls_regex = r'(?:(?:http|https):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,' \
                                                 r'.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,' \
                                                 r'.]*\)|[A-Z0-9+&@#\/%=~_|$])'
                urls_on_pdf = re.findall(all_web_or_relative_urls_regex, text,
                                         re.IGNORECASE | re.MULTILINE)
                return_value["text"] = text
                return_value["urlqueue"] = urls_on_pdf
                return_value["content_type"] = "pdf"
    except Exception as e:
        print("get_site failed: ", e, flush=True)
        return None
    return return_value
other_index = match[i] experience_resume = '' while (skill_index < other_index): experience_resume += text[skill_index] skill_index += 1 rest_name = experience_resume return rest_name else: return ("no experience detected") data = {} cvs = [] for doc in docs: cv = {} text = textract.process(doc).decode('utf-8') nlp = spacy.load('en_core_web_lg') doct = nlp(text.replace('\n\n', '\n')) cv['file'] = doc print(doc) cv['name'] = get_name(text, nlp) cv['email'] = get_email(text) cv['phone'] = get_phone(text) cv['location'] = get_location(text, doct) cv['language'] = get_language(text, doct) cv['skills'] = get_skills(text) cv['experience'] = get_experience(text) print("==============") cvs.append(cv) data['data'] = cvs with open('data.json', 'w') as f:
found that \xe2\x80\x94 indicates empty cell cmd line args: python pdfReader.py fileName rows cols ''' args = sys.argv PREFIX = args[5] SPLIT = int(sys.argv[4]) ROW_NUM = int(args[2]) COL_NUM = int(args[3]) MAX_LENGTH = 16 ###the right way to make a matrix using lists matrix = [["-" for i in range(COL_NUM)] for j in range(ROW_NUM)] text = textract.process(args[1]) text = str(text) splits = text.split('\\r\\n') column_count = 0 row_count = 0 for i in splits: if (row_count == ROW_NUM): break if (len(i) < MAX_LENGTH): if i == "\\xe2\\x80\\x94": matrix[row_count][column_count] = "-" column_count += 1 if column_count >= COL_NUM: column_count = 0 row_count += 1
def getInfo2(strPath):
    """Extract and return the text of the PDF at *strPath* via pdfminer.

    The original assigned the extracted text to a local and fell off the
    end, returning None; the text is now returned (backward-compatible
    for callers that ignored the result).
    """
    import textract
    return textract.process(strPath, method='pdfminer')
def xls_pw(f):
    """Rename *f* with an .xlsx suffix and extract its text.

    :return: stripped extracted text, or '' on any extraction failure
        (best-effort contract preserved).
    """
    os.rename(f, f + '.xlsx')
    try:
        return process(f + '.xlsx').strip()
    except Exception:
        # Was a bare `except:`; Exception keeps the best-effort fallback
        # without trapping SystemExit/KeyboardInterrupt.
        return ''
def doc_pw(f):
    """Rename *f* with a .docx suffix and extract its text.

    :return: extracted text, or '' on any extraction failure
        (best-effort contract preserved).
    """
    os.rename(f, f + '.docx')
    try:
        return process(f + '.docx')
    except Exception:
        # Was a bare `except:`; Exception keeps the best-effort fallback
        # without trapping SystemExit/KeyboardInterrupt.
        return ''
import sys import json from os import listdir from os.path import isfile, join import textract if __name__ == "__main__": try: my_dir = sys.argv[1] except: my_dir = "raw_files/" print "Extracting text from:", my_dir file_list = [f for f in listdir(my_dir) if isfile(join(my_dir, f))] text = {} for f in file_list: print "\tProcessing file:", f text[f] = textract.process(join(my_dir, f), encoding="acsii") everything = {'input': text} with open("corpus_data.json", "w") as file_name: json.dump(everything, file_name) print "All Done for: ", text.keys()
pdfReader = PyPDF2.PdfFileReader(pdfFileObj) num_pages = pdfReader.numPages count = 0 text = "" while count < num_pages: pageObj = pdfReader.getPage(count) count += 1 text += pageObj.extractText() if text != "": text = text else: text = textract.process(fileurl, method='tesseract', language='eng') tokens = word_tokenize(text) punctuations = ['(', ')', ';', ':', '[', ']', ','] stop_words = stopwords.words('english') #Remove stop words and punctuation from the parse pdf keywords = [ word for word in tokens if not word in stop_words and not word in string.punctuation ] #Make lowerCase from keywords # lowerKeyword =[stripped.lower() for word in keywords] print(keywords[:100])
#textract is installed in server. Veried by pip install textrect and message returned "Requirement already satisfied: pytz in /usr/lib/python2.7/site-packages (from tzlocal==1.5.1->extract-msg==0.23.1->textract) (2019.3)" filepath = '/var/www/vhosts/tomedes.com/pro.tomedes.com/WordCount/REST/Input1/' + sys.argv[ 1] filename = sys.argv[1] filenamenoext = os.path.splitext(filename)[0] # filepath = 'sample.pdf' docfilename = '428f535ccf95770cdc0147ce7d2b01f0.doc' # if docfilename[-4:] == ".doc": # filepath = './uploaded_files/428f535ccf95770cdc0147ce7d2b01f0.doc' # text = extract('./uploaded_files/428f535ccf95770cdc0147ce7d2b01f0.doc') # else: try: text = textract.process(filepath) # text = textract.process(str("sample.doc")) # text = textract.process("sample.doc").decode('utf-8') # text = textract.process("/var/www/vhosts/clone.tomedes.com/clone/wcnew/sample.doc") # print(text) except: print('textract error') # def extract(self, filepath): # print('extract') # stdout, stderr = self.run(['antiword', filepath]) # return stdout try: f = open( '/var/www/vhosts/tomedes.com/pro.tomedes.com/WordCount/REST/Input1/' +
from tabula import read_pdf # Only do once, then pickle for faster import # cdcdf_full = pd.read_sas('/Users/alex/Documents/ML/cdc/data/LLCP2017.XPT',encoding='utf-8') # Fix import rounding errors (e.g. 0.7e-80 instead of 0) # cdcdf_full = cdcdf_full.round(5) # pickle.dump(cdcdf_full, open('cdcdf_full.pickle', 'wb')) # Import from saved pickle.dump cdcdf_full = pickle.load( open( "cdcdf_full.pickle", "rb" ) ) # Get Variable Summary (varnames and question for each) text = textract.process("information/codebook17_llcp-v2-508.pdf", encoding='utf-8') text = text.splitlines() text = [i.decode('utf-8') for i in text] v_and_q = pd.DataFrame({ 'Variable': [i.split(':')[1].strip() for i in text if 'SAS Variable Name' in i], 'Question': [i.split(':')[1].strip() for i in text if 'Question:' in i] }) v_and_q.drop_duplicates(inplace=True) v_and_q.reset_index(drop=True,inplace=True) # Make label dictionaries # alltables = read_pdf('information/codebook17_llcp-v2-508.pdf',multiple_tables=True, # pages='2-195') # pickle.dump(alltables, open('alltables.pickle', 'wb'))
import textract import re #Faccio una lista con le paroli presenti nel glossario glossx = textract.process("Glossario.pdf", method='pdftotext') glossario = textract.process("Glossario.pdf", method='pdftotext').decode("utf-8") index = glossario.index("Android") - 10 glossario = glossario[index:] glosswords = re.findall(r'\n[\w -]{1,32}[^.]\r\n', glossario) glosswords += re.findall(r'\n[\w]{1,10}[.][\w]{1,10}\r\n', glossario) glosswords += re.findall(r'\n\x0c[\w -]{1,32}[^.]\r\n', glossario) glosswords = list(dict.fromkeys(glosswords)) gloss = [] for glossword in glosswords: gloss.append(glossword.lower().strip()) #tutte le parole sono presenti nella lista gloss docs = [ "AnalisiDeiRequisiti.pdf", "PianoDiProgetto.pdf", "Glossario.pdf", "NormeDiProgetto.pdf", "StudioDiFattibilità.pdf", "PianoDiQualifica.pdf" ] verbali = [ "VI_2020_10_27.pdf", "VI_2020_11_10.pdf", "VI_2020_11_26.pdf", "VI_2020_12_14.pdf", "VI_2020_12_20.pdf", "VI_2021_01_07.pdf", "VE_2020_12_17.pdf" ] scelta = input( "Scegli tra \n1: analizza tutti i documenti\n2: analizza tutti i verbali\n3: analizza verbali e documenti\n4: analizza un solo file\nScelta:" ) parole = [] if scelta == "1":
def parseCV2():
    """Flask POST handler: parse an uploaded CV into a structured JSON payload.

    Saves the uploaded file, runs ResumeParser plus several regex/NLP
    helpers over the textract-extracted text, and returns a nested
    JSON document (personal info, contacts, socials, education,
    experience, skills, ...).  Any exception, or a non-POST request,
    yields ``{"success": False}`` with HTTP 400.
    """
    try:
        if request.method == 'POST':
            f = request.files['file']
            f.save(UPLOAD_FOLDER + f.filename)
            data = ResumeParser(UPLOAD_FOLDER + f.filename).get_extracted_data()
            text = textract.process(UPLOAD_FOLDER + f.filename)
            text = text.decode("utf-8")
            # Field extractors that need the raw (pre-ASCII) text.
            data['birth_date'] = extractDOB(text)
            data['marital_status'] = marital(text)
            data['nationality'] = nation(text)
            data['gender'] = gen(text)
            # Force ASCII and flatten newlines for the section parsers.
            text = text.encode("utf-8").decode("ascii", "ignore").replace("\n", " ").strip()
            exp, edu = getExpEdu(text)
            stopwords = set(nltk.corpus.stopwords.words('english'))
            stopwords.update(['resume', 'curriculum', 'vitae'])
            # Replace ':' with spaces, then drop stopwords.
            filters = ':'
            translate_dict = dict((c, " ") for c in filters)
            translate_map = str.maketrans(translate_dict)
            text = text.translate(translate_map)
            text = ' '.join([w for w in text.split() if w not in stopwords])
            ent = en.extract_entity_sections_grad(text)
            # Optional CV sections: join their lines or store None.
            if 'certifications' in ent.keys():
                a = ent['certifications']
                cert = " ".join(a)
                data['certifications'] = cert
            else:
                data['certifications'] = None
            pro = ['project', 'projects', 'project profile']
            if len([p for p in pro if p in ent.keys()]) > 0:
                a = ent[[p for p in pro if p in ent.keys()][0]]
                proj = " ".join(a)
                data['projects'] = proj
            else:
                data['projects'] = None
            if 'hobbies' in ent.keys():
                a = ent['hobbies']
                hob = " ".join(a)
                data['hobbies'] = hob
            else:
                data['hobbies'] = None
            if 'summary' in ent.keys():
                a = ent['summary']
                res = " ".join(a)
                data['summary'] = res
            else:
                data['summary'] = None
            if 'objective' in ent.keys():
                b = ent['objective']
                rest = " ".join(b)
                data['objective'] = rest
            else:
                data['objective'] = None
            links = extractor.find_urls(text)
            if (len(links) > 0):
                # NOTE(review): each of these loops overwrites the field
                # on EVERY iteration, so only the LAST link decides the
                # value (a late non-github link resets data['github'] to
                # None even if an earlier github link matched) — confirm
                # intended.
                for link in links:
                    if "github" in link:
                        data['github'] = link
                    else:
                        data['github'] = None
                for link in links:
                    if "linkedin" in link:
                        data['linkedin'] = link
                    else:
                        data['linkedin'] = None
                for link in links:
                    if "skype" in link:
                        data['skype'] = link
                    else:
                        data['skype'] = None
            else:
                data['linkedin'] = None
                data['github'] = None
                data['skype'] = None
            # Blank out social links, keep the rest as generic webpages.
            url = ['linkedin', 'github', 'skype']
            for ur in url:
                for i in range(len(links)):
                    if ur in links[i]:
                        links[i] = links[i].replace(links[i], "")
            links = list(filter(None, links))
            links = list(set(links))
            data['webpage'] = links
            data["skills"] = [i.upper() for i in data["skills"]]
            # Normalise ResumeParser's education output into
            # qualification records.
            if ((isinstance(data["education"], list)) and (len(data["education"]) > 0)):
                data["qualification"] = []
                for ed in data["education"]:
                    if isinstance(ed, tuple):
                        data["qualification"].append({
                            "educationDegree": ed[0],
                            "year": ed[1],
                            "university": None,
                            "currentIndicator": None
                        })
                    else:
                        data["qualification"].append({
                            "educationDegree": ed,
                            "year": None,
                            "university": None,
                            "currentIndicator": None
                        })
            else:
                data['qualification'] = None
            print(data['qualification'])
            # Prefer getExpEdu's education when it found a degree.
            if edu[0]['educationDegree'] == None:
                data["education"] = data["qualification"]
            else:
                data["education"] = edu
            data["experience"] = exp
            data["success"] = True
            # Split the full name into first/middle/last components.
            data['first_name'] = None
            data['middle_name'] = None
            data['last_name'] = None
            if len(data['name'].split()) > 0:
                data['first_name'] = data['name'].split()[0]
            if len(data['name'].split()) > 2:
                data['middle_name'] = data['name'].split()[1]
                data['last_name'] = data['name'].split()[-1]
            elif len(data['name'].split()) == 2:
                data['last_name'] = data['name'].split()[-1]
            else:
                pass
            output_data = {
                "status": True,
                "message": "Cv Parsed Successfully",
                "inputFile": f.filename,
                "data": {
                    "objective": data['objective'],
                    "summary": data['summary'],
                    "personalInfo": {
                        "fullName": data["name"],
                        "firstName": data["first_name"],
                        "middleName": data['middle_name'],
                        "lastName": data["last_name"],
                        "maritialStatus": data['marital_status'],
                        "dateOfBirth": data['birth_date'],
                        "nationality": data['nationality'],
                        "gender": data['gender'],
                        "language": None,
                        "address": None,
                        "hobbies": data['hobbies'],
                        "passportNumber": None
                    },
                    "contactInfo": {
                        "email": data["email"],
                        "telephone": data["mobile_number"],
                        "currentLocation": None,
                        "webpage": data['webpage'],
                    },
                    "socials": {
                        "githubURL": data['github'],
                        "linkedinURL": data['linkedin'],
                        "skype": data['skype']
                    },
                    "education": data["education"],
                    "experience": data["experience"],
                    "skills": data["skills"],
                    "projects": {
                        "name": data['projects'],
                        "detail": None
                    },
                    "certification": {
                        "subject": data['certifications'],
                        "provider": None,
                    },
                    "publications": {
                        "title": None,
                        "publisher": None,
                        "monthYear": None
                    },
                    "achievements": {
                        "name": None,
                        "detail": None
                    }
                }
            }
            #os.remove(UPLOAD_FOLDER+f.filename)
            return jsonify(output_data)
        else:
            return jsonify({"success": False}), 400
    except Exception as e:
        return jsonify({"success": False}), 400
def _pdf_to_txt(prefix):
    """Convert every PDF under /root/Django/file to a text file named
    '<prefix><pdf-base-name>.txt' (extracted with textract/pdfminer) and
    return the list of PDF base names (file name minus '.pdf')."""
    data_dir = '/root/Django/file'
    names = []
    for filename in os.listdir(data_dir):
        if filename[-4:] == ".pdf":
            names.append(filename.replace(".pdf", ""))
            text = textract.process(data_dir + '/' + filename,
                                    method='pdfminer')
            out = open(data_dir + '/' + prefix + filename.replace(".pdf", ".txt"), 'w')
            out.write(text)
            out.close()
    return names


def _extract_section(section, start_fn, end_fn, start_len, end_len):
    """Write '<pdf>_<section>.txt' for every PDF under /root/Django/file,
    keeping only the lines between the section's start marker and (when
    given) its end marker.

    start_len/end_len are the prefix lengths the marker predicates are
    also tried against (the squashed line may carry trailing text).
    end_fn may be None ('reference' runs to end of file).
    """
    names = _pdf_to_txt(section + '_all_')
    for name in names:
        src = open('/root/Django/file/' + section + '_all_' + name + ".txt")
        inside = 0
        picked = []
        for line in src.readlines():
            # Markers are matched on the line with all whitespace removed.
            key = str(line.strip().replace(" ", ""))
            if start_fn(key) or start_fn(key[0:start_len]):
                inside = 1
            elif end_fn is not None and (end_fn(key) or end_fn(key[0:end_len])):
                inside = 0
            if inside == 1:
                picked.append(str(line))
        src.close()
        out = open('/root/Django/file/' + name + '_' + section + '.txt', 'w')
        for line in picked:
            out.write(line)
        out.close()


def _matching_sentences(lines, terms):
    """Join *lines*, split on '.', and keep the sentences that contain at
    least one of *terms* (case-insensitive substring match)."""
    hits = []
    for sentence in ''.join(lines).split('.'):
        for term in terms:
            if str(term).upper() in str(sentence).upper():
                hits.append(sentence)
                break
    return hits


def add(request):
    """Django view with two modes, selected by the 'search' GET param.

    Browse mode (?search= empty): for each requested section flag
    (abstract/introduction/method/result/discussion/reference present in
    request.GET) extract that section from every PDF under
    /root/Django/file, then expand the 'start' placeholder line of
    templates/home3.html with the extracted text into home4.html.

    Search mode (?search= non-empty): extract all six sections from every
    PDF, keep only the sentences matching the search terms, write per-PDF
    marker files ('@@@' wrapping, '!!!' between sections, '_' after each
    sentence) under calc/templates, then aggregate every marker file into
    home4.html with section headings.

    The original repeated the extraction logic once per section; it is
    collapsed here into a spec table.  File handles are now closed
    explicitly — the original leaked them, so home4.html could be
    rendered before its buffers were flushed.
    """
    # (name, start predicate, end predicate, start prefix len, end prefix
    # len).  Note the asymmetries kept from the original: method's end
    # marker is probed with 7 chars, result's start with 7, and
    # 'reference' has no end marker at all.
    specs = [
        ('abstract', abstract_start, abstract_end, 8, 8),
        ('introduction', introduction_start, introduction_end, 8, 8),
        ('method', method_start, method_end, 8, 7),
        ('result', result_start, result_end, 7, 8),
        ('discussion', discussion_start, discussion_end, 8, 8),
        ('reference', reference_start, None, 8, None),
    ]
    if len(request.GET['search']) == 0:
        # ---- browse mode ----
        for section, start_fn, end_fn, s_len, e_len in specs:
            if section in request.GET:
                _extract_section(section, start_fn, end_fn, s_len, e_len)
        pdf_names = []
        for filename in os.listdir('/root/Django/file'):
            if filename[-4:] == ".pdf":
                pdf_names.append(filename.replace(".pdf", ""))
        src = open('/root/Django/calc/templates/home3.html')
        dst = open('/root/Django/calc/templates/home4.html', 'w')
        for raw in src.readlines():
            if str(raw.strip().replace(" ", "")) == "start":
                # Replace the placeholder with every requested section.
                for spec in specs:
                    section = spec[0]
                    if section in request.GET:
                        for name in pdf_names:
                            part = open('/root/Django/file/' + name + '_' + section + '.txt')
                            for line in part.readlines():
                                dst.write(str(line))
                                dst.write("<br>")
                            part.close()
                            dst.write("<br>--------------------------------------<br>")
            else:
                dst.write(raw)
        src.close()
        dst.close()
        return render(request, 'home4.html')
    else:
        # ---- search mode ----
        search = delect_special(request.GET['search']).split(" ")
        pdf_names = _pdf_to_txt('all_')
        for name in pdf_names:
            src = open('/root/Django/file/all_' + name + '.txt', 'r')
            collected = {}
            active = {}
            for spec in specs:
                collected[spec[0]] = []
                active[spec[0]] = 0
            for line in src.readlines():
                key = str(line.strip()).replace(" ", "")
                for section, start_fn, end_fn, s_len, e_len in specs:
                    if start_fn(key) or start_fn(key[0:s_len]):
                        active[section] += 1
                    elif end_fn is not None and (end_fn(key) or end_fn(key[0:e_len])):
                        active[section] = 0
                    if active[section] > 0:
                        collected[section].append(line)
            src.close()
            # Keep only the sentences that mention a search term.
            filtered = []
            for spec in specs:
                filtered.append(_matching_sentences(collected[spec[0]], search))
            tpl = open('/root/Django/calc/templates/home3.html', 'r')
            marks = open('/root/Django/calc/templates/' + name + '.txt', 'w')
            for raw in tpl.readlines():
                # As in the original, markers are only written when the
                # abstract produced at least one hit.
                if str(raw.strip().replace(" ", "")) == "start" and len(filtered[0]) > 0:
                    marks.write("@@@")
                    for idx in xrange(len(filtered)):
                        if idx:
                            marks.write("!!!")
                        for hit in filtered[idx]:
                            marks.write(str(hit) + "_")
                    marks.write("@@@")
            tpl.close()
            marks.close()
        # Aggregate every per-PDF marker file into home4.html.
        templates_dir = '/root/Django/calc/templates'
        tpl = open(templates_dir + '/home3.html', 'r')
        page = open(templates_dir + '/home4.html', 'w')
        headings = ['Abstract', 'Introduction', 'Method', 'Result',
                    'Discussion', 'Reference']
        for raw in tpl.readlines():
            if str(raw.strip().replace(" ", "")) == "start":
                for filename in os.listdir(templates_dir):
                    if filename[-4:] == ".txt":
                        f1 = open(templates_dir + '/' + filename, 'r')
                        body = "".join(f1.readlines()).replace("<", "").replace(
                            ">", "").replace("\'", "")
                        f1.close()
                        for block in body.split('@@@')[1:]:
                            groups = block.split('!!!')
                            for g in xrange(len(groups)):
                                page.write(str(headings[g]) + "<br><br>")
                                for piece in groups[g].split('_'):
                                    page.write(str(piece) + "<br><br>")
                                page.write("----------------------<br><br>")
            else:
                page.write(raw)
        tpl.close()
        page.close()
        return render(request, 'home4.html')
punctuations = ['(',')',';',':','[',']',',','The','the']#inspite of that the keeps popping up def inspector(stg): if len(stg)<3:#filters out many other unwanted things return False if stg.isalpha()== False:#filters out alpha numeric and numeric strings return False if stg in punctuations: return False if stg in stop_words: return False if stg == '.':#filters out stops(added measure) return False return True filename = "JavaBasics-notes.pdf" #whatever file you want to scan text = textract.process(filename, method='tesseract', encoding='ascii') if text != "": text = text #If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text else: text = textract.process(filename, method='tesseract', language='eng', encoding='ascii') #The word_tokenize() function will break our text phrases into individual words newt = text.decode("ascii") tokens = word_tokenize(newt) #the keywords
def parse_norm(self, response): meta_date = self.extract_with_css( response, 'span.meta-date::text').extract_first() today = date.today().strftime('%Y-%m-%d') # print(meta_date) def date_from_en_to_es(m): split = m.split() def translate(arg): arg = arg.lower() if arg == 'enero': return 'jan' elif arg == 'febrero': return 'feb' elif arg == 'marzo': return 'mar' elif arg == 'abril': return 'apr' elif arg == 'mayo': return 'may' elif arg == 'junio': return 'jun' elif arg == 'julio': return 'jul' elif arg == 'agosto': return 'aug' elif (arg == 'septiembre') | (arg == 'setiembre'): return 'sep' elif arg == 'octubre': return 'oct' elif arg == 'noviembre': return 'nov' elif arg == 'diciembre': return 'dec' else: return 'None' split[0] = translate(split[0]) date = ' '.join(split) return date meta_date = parser.parse(date_from_en_to_es(meta_date)) meta_date = meta_date.strftime('%Y-%m-%d') # print(meta_date) if meta_date == today: # crawl new norm # print('Entered parse_norm') type = self.extract_with_css( response, 'div.main-content h1.entry-title::text').extract_first() pdf_link = self.extract_with_css(response, 'p.embed_download a::attr(href)') if len(pdf_link) == 1: # extract text from PDF # print('\nExtract text from PDF...') res_name = os.getenv( 'NORMATIVES_MUNICIPAL_PATH' ) + '/datasets/pdf/' + response.meta['link'].rsplit( '/', 2)[-2] + '.pdf' # print('res_name', res_name) pdf_name = pdf_link.extract_first() pdf_name = iri_to_uri(pdf_name) # print('pdf_name', pdf_name) urllib.request.urlretrieve(pdf_name, res_name) text = textract.process(res_name).decode("utf-8") # print('Done!\n') else: # extract plain-text # print('\nExtract text from HTML...') html = self.extract_with_css( response, 'div.main-content').extract_first() soup = BeautifulSoup(html, 'html.parser') text = soup.get_text() # print('Done!\n') yield Norm({ 'published_at': meta_date, 'type': dict(full=type), 'text': text, 'link': response.meta['link'], 'html': response.text })
def get_definitions_dict(input_file):
    '''
    Build a {TERM: first-defining-sentence} dict from a PDF and dump it
    to definitions.csv.

    The input PDF file is scraped with the help of the textract library
    for image recognition, one page at a time (each page is written out
    as '<n>.pdf' so textract can process it in isolation).  The
    definitions are in capital and appear in the header of the text, so
    ALL-CAPS lines are initialized as keys of the definitions dict.
    Then, the first sentence to mention a definition becomes its value.
    Sentences are recognized by full stops, with common abbreviations
    that include full stops excluded, and a sentence starting with
    'Thus'/'Because' also pulls in the preceding sentence for context.

    Side effects: writes definitions.csv and removes every *.pdf file in
    the current working directory (including the temporary per-page
    files).  Returns None.
    '''
    # Read the input file
    pdf_reader = PdfFileReader(open(input_file, "rb"))
    # The final definitions dictionary
    definitions_dict = {}
    for page_counter in range(
            pdf_reader.getNumPages()):  # Loop through each page
        # Write this single page out as '<page>.pdf' for textract.
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf_reader.getPage(page_counter))
        with open('{0}.pdf'.format(page_counter), 'wb') as f:
            pdf_writer.write(f)
            f.close()  # redundant — the `with` block closes f anyway
        text = textract.process(
            '{0}.pdf'.format(page_counter)
        )  # Use textract's image recognition to convert the pdf to text
        text = text.decode("utf8")
        lines = list(filter(lambda x: x != '',
                            text.split('\n')))  # Split text into lines
        passed_page_number = 0  # NOTE: unused
        regex = re.compile('.\..')  # NOTE: unused
        # ALL-CAPS lines are definition headers: register them as keys.
        for line in lines:
            if line.isupper():
                definitions_dict[line] = ''
        abbreviations_set = {
            'vs', 'etc', 'est', 'bc'
        }  # common abbreviations to ignore when deciding sentences
        # Split text into sentences.
        sentences = list(filter(lambda x: x != '', text.split('.')))
        # For each sentence, check if a definition key appears in it; if
        # so, update the definitions dictionary.  (The final sentence is
        # only ever a merge target, never scanned itself, because the
        # range stops at len - 1.)
        for sentence_index in range(len(sentences) - 1):
            sentence = sentences[sentence_index]
            words = sentence.split(' ')
            if words[-1] in abbreviations_set or len(words) < 5:
                # Abbreviation or short fragment: it was not a real
                # sentence break, so glue it onto the next sentence.
                sentences[sentence_index + 1] = sentence + sentences[sentence_index + 1]
            else:
                for word in words:
                    word = re.sub(r'[^\w\s]', '', word)  # strip punctuation
                    if word.upper() in definitions_dict:
                        # Only the FIRST mentioning sentence is kept.
                        if definitions_dict[word.upper()] == '':
                            # A sentence opening with a connective leans
                            # on the previous sentence; include it.
                            if ('Thus' in sentence or 'Because' in sentence) and sentence_index > 0:
                                sentence = sentences[sentence_index - 1] + sentence
                            definitions_dict[word.upper()] = sentence.replace(
                                '\n', ' ')
    # Save the definitions dict as a csv.
    csv_data = []
    for value in definitions_dict:
        csv_data.append([value, definitions_dict[value]])
    with open("definitions.csv", "wt") as fp:
        writer = csv.writer(fp, delimiter=",")
        writer.writerows(csv_data)
    # Clean up: removes ALL PDFs in the current directory, not just the
    # temporary per-page ones.
    for file in os.listdir("."):
        if os.path.isfile(file) and file.endswith(".pdf"):
            os.remove(file)
def get_docs(self): if bulk_collect_location_policy.is_allowed(self.path) is False: raise ValueError('Bulk collect path is illegal ' + self.path) source = self.sources[0] host = source['host'] start_path = source['start_path'] target_element = source['target_element'] render_type = source['render_type'] # find the 2nd nested tbody. folder_name = self.country.replace(' ', '-').lower() root_path = self.path + '/' + folder_name page_url = host + start_path results_response = requests.request('GET', page_url) results_html = results_response.content results_soup = BeautifulSoup(results_html, 'html.parser') tables = results_soup.find_all('table') results_table_index = 7 results_table = tables[results_table_index] paragraphs = results_table.find_all('p') for i in range(0, len(paragraphs)-1, 2): p = paragraphs[i] p_next = paragraphs[i+1] if ("Press Release" in p.get_text()) == False: continue date_str = p.get_text().split(' - ')[0].strip() tmp = dateparser.parse(date_str) date = datetime.date(tmp.year, tmp.month, tmp.day) if gdpr_retention_specification.is_satisfied_by(date) is False: continue # try another result_link document_folder = p.get_text() document_folder_md5 = hashlib.md5(document_folder.encode()).hexdigest() language_code = 'en' document_link = links_from_soup_service(p_next)[0] document_url = document_link[1] document_response = requests.request('GET', document_url) document_content = document_response.content dirpath = root_path + '/' + document_folder_md5 try: os.makedirs(dirpath) except FileExistsError: print('Directory path already exists, continue.') document_word_path = dirpath + '/' + language_code + '.doc' with open(document_word_path, 'wb') as f: f.write(document_content) document_text = textract.process(document_word_path) with open(dirpath + '/' + language_code + '.txt', 'wb') as f: f.write(document_text) return True
def add_local_definitions(input_file, page_counter, intermediate_file_name,
                          definitions_dict):
    '''
    Add a definition sidebar to one page of a PDF.

    Parses the text of page *page_counter* of *input_file*, finds terms
    (phrases of 1-3 words) that exist in *definitions_dict*, and writes a
    new PDF named 'processed_<intermediate_file_name>' containing the
    original page on the left and a sidebar listing the definitions used
    on that page on the right.

    Side effects: creates and then deletes the temporary files
    background.pdf, modified1.pdf and insert.pdf in the current
    directory, and removes *intermediate_file_name* when done.
    '''
    # Get the input page and its dimensions.
    input1 = PdfFileReader(open(input_file, 'rb')).getPage(page_counter)
    page_length = input1.mediaBox[3]
    page_width = input1.mediaBox[2]
    page_width = float(page_width)
    # Create the background canvas, 1.5x the page width so there is room
    # for the sidebar; the white dummy string just forces content onto it.
    background_canvas = Canvas("background.pdf",
                               pagesize=(page_width * 1.5, page_length))
    background_canvas.setFont("Times-Roman", 12)
    background_canvas.setFillColor(white)
    background_canvas.drawString(1 * inch, 10 * inch, "White text")
    background_canvas.save()
    # Attach the original page to the left of the background canvas.
    with open("background.pdf", "rb") as inFile, open(input_file, "rb") as overlay:
        original = pypdf.PdfFileReader(inFile)
        background = original.getPage(0)
        foreground = pypdf.PdfFileReader(overlay).getPage(page_counter)
        background.mergePage(foreground)
        writer = pypdf.PdfFileWriter()
        for i in range(original.getNumPages()):
            page = original.getPage(i)
            writer.addPage(page)
        with open("modified1.pdf", "wb") as outFile:
            writer.write(outFile)
    # Convert the merged page to text.
    text = textract.process("modified1.pdf")
    text = text.decode("utf8")
    abbreviations_set = {'vs', 'etc', 'est', 'bc'}
    local_dict = {}
    text = ''.join(text.splitlines())
    # For each phrase of 1-3 words in the text, check if the same phrase
    # is in the dictionary.  If so, attach the definition and phrase to
    # the local dict object.  The deque is a 3-word sliding window,
    # seeded with dummy '0' entries.
    sentences = list(filter(lambda x: x != '', text.split('.')))
    words_queue = collections.deque(3 * ['0'], 3)
    for sentence_index in range(len(sentences) - 1):
        sentence = sentences[sentence_index]
        words = sentence.split(' ')
        for word in words:
            if word != '':
                words_queue.append(word)
                for word_sample in [
                        words_queue[0].upper(),
                        (words_queue[0] + ' ' + words_queue[1]).upper(),
                        (words_queue[0] + ' ' + words_queue[1] + ' ' +
                         words_queue[2]).upper()
                ]:
                    if word_sample in definitions_dict:
                        local_dict[word_sample] = definitions_dict[word_sample]
    # Create the definitions sidebar, and set relevant properties.
    insert_canvas = Canvas("insert.pdf",
                           pagesize=(page_width * 0.5, page_length))
    insert_canvas.setFillColor(HexColor("#D3D3D3"))  # light-grey backdrop
    insert_canvas.rect(5, 5, page_width * 0.5, page_length, fill=1)
    insert_canvas.setFillColor(black)
    insert_canvas.setFont('Times-Bold', 16)
    insert_canvas.drawString(page_width * 0.16, page_length - 0.25 * inch,
                             "DEFINITIONS")
    lines = 4  # running line counter used to position each entry
    # Write the dictionary of definitions used into the sidebar, wrapped
    # to 52 characters per line.
    for item in local_dict:
        insert_canvas.setFont("Times-Roman", 12)
        textobject = insert_canvas.beginText(
            10, page_length - (0.17 * inch * lines))
        my_text = f"{item} : {local_dict[item]}"
        my_text = textwrap.fill(my_text, 52) + '\n'
        for line in my_text.splitlines(False):
            textobject.textLine(line.rstrip())
        insert_canvas.drawText(textobject)
        lines += my_text.count('\n') + 2  # leave a gap before the next entry
    insert_canvas.save()
    # Combine the current page on the background canvas and the sidebar.
    with open("modified1.pdf", "rb") as inFile, open("insert.pdf", "rb") as overlay:
        original = pypdf.PdfFileReader(inFile)
        background = original.getPage(0)
        foreground = pypdf.PdfFileReader(overlay).getPage(0)
        # merge the first two pages; 620 shifts the sidebar to the right
        background.mergeTranslatedPage(foreground, 620, 0)
        # add all pages to a writer
        writer = pypdf.PdfFileWriter()
        for i in range(original.getNumPages()):
            page = original.getPage(i)
            writer.addPage(page)
    final_name = 'processed_' + intermediate_file_name
    # write everything in the writer to a file
    with open(final_name, "wb") as outFile:
        writer.write(outFile)
    # Delete all intermediate files.
    os.remove('background.pdf')
    os.remove('modified1.pdf')
    os.remove('insert.pdf')
    os.remove(intermediate_file_name)
def text_for_epub(path): return textract.process(path).decode('utf-8')
# classify file type and remove metadata clean_fnames.pop(0) clean_fnames.pop(0) data = [] pat1 = re.compile(r"% \S+") pat2 = re.compile(r"\n+") i = 0 # For each sermon for fname in clean_fnames: print("file {}".format(i)) i += 1 filetype = fname.split(".")[-1] # Select correct filetype if filetype == "doc": text = textract.process(fname).decode('utf-8') elif filetype == "docx": text = docx2txt.process(fname) else: print(fname) # TODO read odt # remove metadata text = pat1.sub("", text) text = pat2.sub("\n", text) #append to dataframe data.append([fname.split("/")[-1].split(".")[0], text]) # save df, keeping only sermon_ID and text content df = pd.DataFrame(data) df.columns = ["id", "content"] outpath = os.path.join("data", "content.dat") df.to_csv(outpath, encoding='utf-8')
def res(jobfile):
    """Rank every resume under ./Original_Resumes against the job text.

    Parses each PDF/DOC/DOCX resume into plain text, scores it against
    *jobfile* with a longest-common-substring measure (after spell
    correction of the query), and returns a list of
    ``ResultElement(rank, filename)`` ordered best score first.

    Side effects: temporarily chdirs into ./Original_Resumes (and back),
    mutates ``app.config``, and prints per-resume scores.
    """
    Final_Array = []  # score of every resume, in parse order

    def lcs(X, Y):
        """Length of the longest common substring of X and Y (classic DP
        table); -9999 on any unexpected failure so one bad resume never
        aborts the whole run.  (Empty inputs now score 0, where the
        original's IndexError made them -9999.)"""
        try:
            mat = []
            for i in range(len(X)):
                row = []
                for j in range(len(Y)):
                    if X[i] == Y[j]:
                        # Extend the diagonal run; first row/col start at 1.
                        row.append(1 if i == 0 or j == 0
                                   else 1 + int(mat[i - 1][j - 1]))
                    else:
                        row.append(0)
                mat.append(row)
            # The overall maximum cell is the longest-run length.
            best = 0
            for row in mat:
                for cell in row:
                    if cell > best:
                        best = cell
            return best
        except Exception:  # narrowed from the original bare except
            return -9999

    def spellCorrect(string):
        """Spell-correct every whitespace-separated word of the query."""
        return " ".join(spell(word) for word in string.split(" "))

    def semanticSearch(searchString, searchSentencesList):
        """Score every candidate and return the best-scoring one.
        Side effect: appends each score to Final_Array, in order."""
        result = None
        searchString = spellCorrect(searchString)
        bestScore = 0
        for candidate in searchSentencesList:
            score = lcs(searchString, candidate)
            print(score, candidate[0:100])
            print("")
            Final_Array.append(score)
            if score > bestScore:
                bestScore = score
                result = candidate
        return result

    app.config['UPLOAD_FOLDER'] = 'Original_Resumes/'
    app.config['ALLOWED_EXTENSIONS'] = set(
        ['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])

    def allowed_file(filename):
        """True when *filename* carries an extension from ALLOWED_EXTENSIONS."""
        return '.' in filename and \
            filename.rsplit('.', 1)[1] in app.config['ALLOWED_EXTENSIONS']

    Resumes_File_Names = []
    Resumes = []

    os.chdir('./Original_Resumes')
    # DOC first, then DOCX, then PDF — preserves the original ordering.
    LIST_OF_FILES = (glob.glob('**/*.doc', recursive=True)
                     + glob.glob('**/*.docx', recursive=True)
                     + glob.glob('**/*.pdf', recursive=True))
    print("This is LIST OF FILES")
    print(LIST_OF_FILES)
    print("####### PARSING ########")

    def _extract_pdf(path):
        """All pages of the PDF at *path*, newlines flattened to spaces."""
        chunks = []
        with open(path, 'rb') as pdf_file:
            reader = PyPDF2.PdfFileReader(pdf_file)
            for page_number in range(reader.getNumPages()):
                page_content = reader.getPage(page_number).extractText()
                chunks.append(str(page_content.replace('\n', ' ')))
        return ''.join(chunks)

    def _extract_word(path):
        """A .doc/.docx at *path* as one single-line string, via textract.
        (The original duplicated this block for each extension.)"""
        raw = textract.process(path)
        raw = raw.replace(b'\n', b' ').replace(b'\r', b' ')
        return str(raw)

    for nooo, i in enumerate(LIST_OF_FILES):
        # rsplit tolerates filenames with extra dots — the original's
        # split('.')[1] picked the wrong piece for those — and .lower()
        # replaces the pdf/Pdf/PDF comparison chains.
        ext = i.rsplit('.', 1)[-1].lower()
        if ext == "pdf":
            try:
                print("This is PDF", nooo)
                Resumes.append(_extract_pdf(i))
                Resumes_File_Names.append(i)
            except Exception as e:
                print(e)
        elif ext in ("doc", "docx"):
            print("This is DOC" if ext == "doc" else "This is DOCX", i)
            try:
                Resumes.append(_extract_word(i))
                Resumes_File_Names.append(i)
            except Exception as e:
                print(e)
        elif ext == "exe":
            pass  # skip executables silently

    # Tokenise + normalise both sides so they are compared in the same form.
    for m in range(len(Resumes)):
        tokens = normalize(nltk.word_tokenize(Resumes[m]))
        Resumes[m] = ' '.join(map(str, tokens))
    jobfile = ' '.join(map(str, normalize(nltk.word_tokenize(jobfile))))

    print("This is len Resumes : ", len(Resumes))
    os.chdir('../')
    print("Printing Scores of all Resumes...")
    print("\n")
    result = semanticSearch(jobfile, Resumes)
    print("\n")
    print("Printing 1 Best Result.....")
    print("\n")
    print(result)
    print(Final_Array)
    print("This is len Final_Array : ", len(Final_Array))
    print(Resumes_File_Names)
    print("This is len Ordered_list_Resume : ", len(Resumes_File_Names))

    # Final_Array is aligned with Resumes_File_Names (both filled in parse
    # order); pair them and sort by score, best first.
    Z = [
        x for _, x in sorted(zip(Final_Array, Resumes_File_Names),
                             reverse=True)
    ]
    flask_return = []
    for n, i in enumerate(Z):
        name = getfilepath(i)
        flask_return.append(ResultElement(n, name))
    return flask_return
def pdf_to_text_textract(pdf_file_path): page_text = textract.process(pdf_file_path) #, encoding='ascii' return page_text
def LoadDoc(path): words = textract.process(path) words = words.decode('utf-8') return words
def readWord(path_to_Doc): document_text = textract.process(path_to_Doc) return document_text
def read_pdf_as_text(path): return textract.process(path)
def process_text(file, extension=None): if not extension: text = textract.process(file) else: text = textract.process(file, extension=extension) return text.decode()
import textract ############################# #convert the files(5articles) ############################# text = textract.process("test.pdf", m='pdfminer') search = "Predicting" #????? ################### #ignore the useless character ################### def delect_special(a): #delete all non-meaningful words b = a.replace(".", "").replace("!", "").replace("@", "").replace( "#", "").replace("~", "").replace(",", "") return b search1 = delect_special(search).split(" ") #split article #print(search1) ####################### #open and write the txt file ####################### xml = open( 'Cobelli1979_Identifiability_of_compartmental_systems_and_related_structural_properties.txt', 'w') for i in xrange(len(text)): xml.write(text[i])