def parsing_jd(jd_file_name):
    """Load a job-description text file, summarize it, and normalize it.

    Parameters
    ----------
    jd_file_name : str
        Base name (without extension) of the job-description ``.txt`` file,
        resolved against the module-level ``file_loc`` prefix.

    Returns
    -------
    pandas.DataFrame
        Single-row frame with columns ``['Path', 'File Name', 'Text']``.

    Side effects
    ------------
    Appends any newly matched path to the module-level ``job_desc_files``
    list.
    """
    path = file_loc + jd_file_name + '.txt'

    # Register the file in the global list if we haven't seen it yet.
    # NOTE(review): the glob result is used only for this bookkeeping;
    # the content is read from `path` directly below.
    for matched in glob.glob(path, recursive=True):
        if matched not in job_desc_files:
            job_desc_files.append(matched)

    # The `with` statement closes the handle automatically (the original
    # also called file.close() redundantly inside the with-block).
    with open(path, 'rt') as fh:
        jd = fh.read()

    # Extractive summary capped at ~200 words, then project-specific
    # normalization. `summarize` / `text_process` are module-level names
    # defined elsewhere in this project.
    jd = summarize(jd, word_count=200)
    jd = text_process.normalize(jd)

    df = pd.DataFrame(columns=['Path', 'File Name', 'Text'])
    df.loc[0] = [path, jd_file_name, jd]
    return df
def extract_text_from_pdf(files_list):
    """Extract, flatten, and normalize the text of each PDF in *files_list*.

    Parameters
    ----------
    files_list : list[str]
        Paths of the PDF resumes to process.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Path', 'File Name', 'Text']`` built from the
        module-level ``resume_list`` / ``file_names`` plus the extracted
        text. Also written to ``out.csv`` as a side effect.

    Side effects
    ------------
    Appends derived file stems to the module-level ``file_names`` list and
    writes ``out.csv`` in the current working directory.
    """
    import os  # local import: only needed for path-stem extraction below

    resumes = []  # normalized text, one entry per input PDF
    for pdf_path in files_list:
        text_parts = []
        with open(pdf_path, 'rb') as fh:
            # Build one resource manager / converter / interpreter per
            # document. (The original recreated all of them for every
            # page, which is wasteful — the same set handles all pages.)
            resource_manager = PDFResourceManager()
            fake_file_handle = StringIO()
            converter = TextConverter(
                resource_manager,
                fake_file_handle,
                codec='utf-8',
                laparams=LAParams()
            )
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            try:
                # Iterate over every page of the PDF document.
                for page in PDFPage.get_pages(fh, caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
                text_parts.append(fake_file_handle.getvalue())
            finally:
                # Close handles even if a page fails to parse (the
                # original leaked them on error).
                converter.close()
                fake_file_handle.close()

        # Join once and flatten newlines once, instead of re-scanning the
        # whole accumulated string after every page (quadratic before).
        text = ''.join(text_parts).replace('\n', ' ')
        resumes.append(text_process.normalize(text))

    # Derive display names from the module-level resume_list.
    # BUGFIX: the original used name.split('/')[1], which returns the
    # wrong component for nested paths (e.g. 'a/b/c.pdf' -> 'b') and
    # raises IndexError for bare filenames; os.path handles both.
    for name in resume_list:
        stem = os.path.splitext(os.path.basename(name))[0]
        file_names.append(stem)

    data = pd.DataFrame({
        'Path': resume_list,
        'File Name': file_names,
        'Text': resumes,
    })
    data.to_csv('out.csv')
    return data