def extract():
    """Walk the configured resume directory and load parsable files.

    Returns a DataFrame with one row per candidate file, carrying its
    path, extension, and extracted raw text.
    """
    logging.info('Begin extract')

    # Gather the full path of every file under the resume directory
    candidate_file_agg = []
    for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):
        candidate_file_agg.extend(os.path.join(root, filename) for filename in files)

    # Wrap the candidate paths in a DataFrame for column-wise operations
    observations = pandas.DataFrame(data=candidate_file_agg, columns=['file_path'])
    logging.info('Found {} candidate files'.format(len(observations.index)))

    # Keep only files whose extension has an available parser
    observations['extension'] = observations['file_path'].apply(
        lambda path: os.path.splitext(path)[1])
    supported = observations['extension'].isin(lib.AVAILABLE_EXTENSIONS)
    observations = observations[supported]
    logging.info(
        'Subset candidate files to extensions w/ available parsers. {} files remain'
        .format(len(observations.index)))

    # Pull raw text out of each remaining file
    observations['text'] = observations['file_path'].apply(lib.convert_pdf)

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
def load(observations, nlp):
    """Write the observations DataFrame to the configured summary CSV.

    Parameters
    ----------
    observations : pandas.DataFrame
        Parsed resume data to persist.
    nlp : object
        Not used in this step; presumably kept for pipeline signature
        consistency — confirm against the caller.
    """
    logging.info('Begin load')
    output_path = os.path.join(
        lib.get_conf('summary_output_directory'), 'resume_summary.csv')
    logging.info('Results being output to {}'.format(output_path))
    print('Results output to {}'.format(output_path))
    observations.to_csv(path_or_buf=output_path, index_label='index')
    # BUG FIX: previously logged 'End transform', mislabeling this step.
    logging.info('End load')
def save(observations):
    """Persist observations to the summary CSV, minus the bulky raw text.

    Parameters
    ----------
    observations : pandas.DataFrame
        Parsed resume data; must contain a 'text' column, which is
        excluded from the written file.
    """
    # BUG FIX: previously logged 'Begin load' / 'End transform',
    # mislabeling this step in the logs.
    logging.info('Begin save')
    output_path = os.path.join(
        lib.get_conf('summary_output_directory'), 'resume_summary.csv')
    # drop() returns a copy, so the caller's DataFrame keeps its 'text'
    # column (same semantics as the old copy() + del idiom).
    tmp = observations.drop(columns=['text'])
    tmp.to_csv(path_or_buf=output_path, index_label='index')
    logging.info('End save')
def extract_fields(df):
    """Add one column per configured extractor, populated by scanning
    each row's raw text with extract_skills.

    Mutates and returns *df*.
    """
    extractor_conf = lib.get_conf('extractors')
    for extractor, items_of_interest in extractor_conf.items():
        # Bind the loop variables as defaults; apply() runs eagerly, so
        # this is purely defensive and does not change behavior.
        def _run_extractor(text, name=extractor, items=items_of_interest):
            return extract_skills(text, name, items)
        df[extractor] = df['text'].apply(_run_extractor)
    return df