def route_compfileread(filepaths):
    """Decompress every .gz archive in *filepaths* into PCRatesFileFolder.

    For each path the gzip payload is extracted and written out under the
    original file name (the name with the '.gz' suffix stripped).  Any
    extraction failure is logged via
    utility.log_exception_file_and_filepath, and the source archive is
    deleted afterwards either way.

    :param filepaths: iterable of paths to .gz archives
    """
    for filepath in filepaths:
        try:
            # extracting data from .gz file; the context manager guarantees
            # the handle is closed even when read() raises (the original
            # leaked it on error because close() lived inside the try body)
            with gzip.GzipFile(filepath, 'rb') as gzipfile:
                gzipdata = gzipfile.read()
            # getting complete file name of the .gz file
            compfilename = utility.filename_from_filepath(filepath)
            # extracting the original file name
            filename = compfilename.split('.gz')[0]
            print(filename)
            # creating file and writing data
            with open(config.ConfigManager().PCRatesFileFolder + '/' + filename,
                      'wb') as uncompfile:
                uncompfile.write(gzipdata)
        except BaseException as ex:
            # NOTE(review): BaseException kept deliberately broad to match
            # the original catch-everything-and-log behaviour
            utility.log_exception_file_and_filepath(
                ex, config.ConfigManager().PromptcloudLogFile, filepath)
        # source archive is removed whether or not extraction succeeded
        # (presumed placement -- original indentation was ambiguous; confirm)
        os.remove(filepath)
def route_compfileread(filepaths):
    """Decompress every .gz archive in *filepaths* into PCFileFolder.

    For each path the gzip payload is extracted and written out under the
    original file name.  Archives that cannot be extracted are logged and
    recorded in PCDataAnalysisResultsFile.  The source archive is deleted
    afterwards either way.

    :param filepaths: iterable of paths to .gz archives
    """
    for filepath in filepaths:
        # FIX: resolve names before the try block.  The except handler
        # references compfilename, so in the original a failure during the
        # gzip open/read raised NameError inside the handler.  These are
        # pure string operations and cannot themselves fail on a str path.
        compfilename = utility.filename_from_filepath(filepath)
        # extracting the original file name
        filename = compfilename.split('.gz')[0]
        try:
            # extracting data from .gz file; 'with' guarantees the handle
            # is closed even when read() raises (original leaked it)
            with gzip.GzipFile(filepath, 'rb') as gzipfile:
                gzipdata = gzipfile.read()
            print(filename)
            # creating file and writing data
            with open(config.ConfigManager().PCFileFolder + '/' + filename,
                      'wb') as uncompfile:
                uncompfile.write(gzipdata)
        except BaseException as ex:
            utility.log_exception_with_filepath(ex, filepath)
            # writing to file the file names that cannot be extracted using
            # gzip
            utility.write_to_file(
                config.ConfigManager().PCDataAnalysisResultsFile, 'a',
                compfilename + ' cannot be extracted')
        # source archive is removed whether or not extraction succeeded
        # (presumed placement -- original indentation was ambiguous; confirm)
        os.remove(filepath)
def fill_job_by_site(filepath):
    """Tally one job against the site encoded in *filepath*'s name.

    The site name is the portion of the file name before the first '_'.
    Increments the module-level ``totaljobsdict`` counter for that site
    (creating the key on first sight) and returns the dictionary.
    """
    global totaljobsdict
    filename = utility.filename_from_filepath(filepath)
    site = filename.split('_')[0]
    # one-line upsert: default to 0 for a brand-new site, then add this job
    totaljobsdict[site] = totaljobsdict.get(site, 0) + 1
    return totaljobsdict
def route_dataread(filepaths):
    """Extract text from each supported file and insert it into MongoDB.

    Dispatches on file extension (.txt/.pdf/.docx/.doc/.csv/.odt read into
    data_text; .xls/.xlsx/.xml are routed to custom row/XML processors that
    advance docid_count themselves).  Each non-empty text gets a new doc_id
    and is inserted via custom.insert_data_to_DB; finally the run counter
    and the persisted docid_count are written back.

    :param filepaths: iterable of file paths to ingest
    """
    # persistent execution counter, incremented once per invocation below
    data_read_count = int(utility.read_from_file(
        config.ConfigManager().ExecutioncountFile, 'r'))
    file_read_count = 0
    file_path_count = 0
    # config document holds the last doc_id handed out across runs
    configdocs = custom.retrieve_data_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().ConfigCollection)
    docid_count = int(configdocs[0]['docid_count'])
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                          'dataread running')
    for filepath in filepaths:
        data_text = ''
        try:
            file_path_count += 1
            print('File number: ' + str(file_path_count))
            print('Processing file..' + filepath)
            # dispatch on the (case-insensitive) file extension
            if filepath[-4:].lower() == ".txt":
                data_text = datareadfiletypes.read_text_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".pdf":
                data_text = datareadfiletypes.read_pdf_text(
                    filepath, data_text)
            elif filepath[-5:].lower() == ".docx":
                data_text = datareadfiletypes.read_docx_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".doc":
                data_text = datareadfiletypes.read_doc_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".xls":
                # spreadsheets are processed row-by-row instead of as one
                # text blob; process_excel_rowdata advances docid_count
                # data_text = datareadfiletypes.read_excel_text(
                #     filepath, data_text)
                docid_count = custom.process_excel_rowdata(
                    filepath, docid_count)
            elif filepath[-5:].lower() == ".xlsx":
                # data_text = datareadfiletypes.read_excel_text(
                #     filepath, data_text)
                docid_count = custom.process_excel_rowdata(
                    filepath, docid_count)
            elif filepath[-4:].lower() == ".csv":
                data_text = datareadfiletypes.read_csv_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".odt":
                data_text = datareadfiletypes.read_odt_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".xml":
                docid_count = custom.process_xml_data(filepath, docid_count)
            # only files whose text was read inline reach the insert path;
            # .xls/.xlsx/.xml leave data_text empty on purpose
            if not data_text == '':
                docid_count += 1
                file_read_count += 1
                # noun-phrase extraction currently disabled
                # dcrnlp.extract_nounphrases_sentences(data_text)
                noun_phrases = ''
                # NOTE(review): DataProperties is a module-level shared dict;
                # fields are overwritten per file -- confirm single-threaded use
                dictionaries.DataProperties['description'] = data_text
                dictionaries.DataProperties['nounPhrases'] = noun_phrases
                dictionaries.DataProperties[
                    'documentType'] = utility.filefolder_from_filepath(filepath)
                dictionaries.DataProperties[
                    'dataSource'] = config.ConfigManager().Misc  # config.ConfigManager().JobPortal
                dictionaries.DataProperties['doc_id'] = docid_count
                dictionaries.DataProperties[
                    'documentTitle'] = utility.filename_from_filepath(filepath)
                # short preview: first 200 chars of the description
                dictionaries.DataProperties['documentDesc'] = (
                    dictionaries.DataProperties['description'])[0:200]
                jsonfordatastore = custom.prepare_json_for_datastore(
                    dictionaries.DataProperties)
                jsonfordatastore_deserialized = utility.jsonstring_deserialize(
                    jsonfordatastore)
                custom.insert_data_to_DB(
                    jsonfordatastore_deserialized, connection)
                phrases_file_data = custom.prepare_phrases_file_data(
                    noun_phrases, data_read_count, file_read_count)
                utility.write_to_file(
                    config.ConfigManager().PhraseFile, 'a', phrases_file_data)
        except BaseException as ex:
            # per-file failures are logged and the loop continues
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + filepath + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(
                config.ConfigManager().LogFile, 'a', exception_message)
    # persist the bumped run counter and the new doc_id high-water mark
    data_read_count += 1
    utility.write_to_file(config.ConfigManager(
    ).ExecutioncountFile, 'w', str(data_read_count))
    dictionaries.UpdateTemplateWhere['_id'] = configdocs[0]['_id']
    dictionaries.UpdateTemplateSet['docid_count'] = docid_count
    dictionaries.DBSet['$set'] = dictionaries.UpdateTemplateSet
    custom.update_data_to_Db_noupsert(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().ConfigCollection,
        dictionaries.UpdateTemplateWhere, dictionaries.DBSet, connection)
# NOTE(review): this __main__ driver is truncated in the available source --
# it ends mid-branch at "if flagsDetailsBoolean:" with no body before the
# next def.  Kept byte-identical; reformat and complete once the full
# source is available.  TODO confirm the missing tail against the original.
if __name__ == "__main__": filepaths = [] directory_list = [] # print(config.ConfigManager().fileDirectory) directory_list = utility.string_to_array( config.ConfigManager().fileDirectory, ',', directory_list) filepaths = filemanager.directory_iterate(directory_list) detection_dict_list = [] detection_dict_list_all = [] for filepath in filepaths: strtimestamp = str(datetime.datetime.now()) data_text = '' # flags = [] filepath_mod = filepath.replace('\\', '!@#$%') file_batchId_name = utility.filename_from_filepath(filepath_mod) batchIdsupplierId = (file_batchId_name[1:]).split('_')[0] batchId = (batchIdsupplierId).split('-')[0] supplierId = int((batchIdsupplierId).split('-')[1]) file_name = (file_batchId_name[1:]).split('_')[1] file_name = file_name.replace('!@#$%', '\\') try: # file_count += 1 # print(filepath) if filepath[-4:].lower() == ".txt": flagsInfo = component.read_text_text(filepath, supplierId) print(flagsInfo) flags = flagsInfo[0] flagsDetails = flagsInfo[1] flagsDetailsBoolean = bool(flagsDetails) if flagsDetailsBoolean:
def route_dataread(filepaths):
    """Read resume text from each file and attach it to the matching
    Smart Track candidate document in MongoDB.

    The candidate id is parsed from the file name (text before the first
    '_', skipping the leading character).  .doc files that antiword could
    not read are retried with catdoc; files that still yield no text are
    logged.  Per-file exceptions are logged and processing continues.

    :param filepaths: iterable of resume file paths
    """
    file_count = 0
    docreadcount = 0
    antiwordemptycount = 0
    # FIX: docid_count was referenced in the .xml branch but never assigned
    # in this function (copy-paste from the document-ingest route_dataread),
    # so any .xml input raised NameError.  Initialise it here.
    docid_count = 0
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'resumeread running' + ' ' + str(datetime.datetime.now()))
    for filepath in filepaths:
        strtimestamp = str(datetime.datetime.now())
        data_text = ''
        try:
            file_count += 1
            # dispatch on the (case-insensitive) file extension
            if filepath[-4:].lower() == ".txt":
                data_text = datareadfiletypes.read_text_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".pdf":
                data_text = datareadfiletypes.read_pdf_text(
                    filepath, data_text)
            elif filepath[-5:].lower() == ".docx":
                data_text = datareadfiletypes.read_docx_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".doc":
                data_text = datareadfiletypes.read_doc_text(
                    filepath, data_text)
                if data_text == '':
                    # antiword produced nothing -- fall back to catdoc
                    antiwordemptycount += 1
                    data_text = datareadfiletypes.read_doc_text_catdoc(
                        filepath, data_text)
                    if data_text == '':
                        utility.write_to_file(config.ConfigManager().LogFile,
                                              'a', 'Filepath is: ' + filepath)
            elif filepath[-4:].lower() == ".xls":
                data_text = datareadfiletypes.read_excel_text(
                    filepath, data_text)
            elif filepath[-5:].lower() == ".xlsx":
                data_text = datareadfiletypes.read_excel_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".csv":
                data_text = datareadfiletypes.read_csv_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".odt":
                data_text = datareadfiletypes.read_odt_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".xml":
                docid_count = custom.process_xml_data(filepath, docid_count)
            strtimestamp += ' ' + str(datetime.datetime.now())
            if not data_text == '':
                filepath = filepath.replace('\\', '')
                file_name = utility.filename_from_filepath(filepath)
                # candidate id: text before the first '_', leading char dropped
                candidateId = (file_name[1:]).split('_')[0]
                print(candidateId)
                UpdateTemplateWhere = utility.clean_dict()
                UpdateTemplateSet = utility.clean_dict()
                UpdateTemplateWhere['candidateid'] = int(candidateId)
                # UpdateTemplateWhere['documentType'] = 'candidate details'
                UpdateTemplateWhere['dataSource'] = 'Smart Track'
                UpdateTemplateSet['resumeText'] = data_text
                UpdateTemplateSet['isResumeTextNew'] = 1
                DBSet = utility.clean_dict()
                DBSet['$set'] = UpdateTemplateSet
                custom.update_data_to_Db_noupsert(
                    int(config.ConfigManager().MongoDBPort),
                    config.ConfigManager().DataCollectionDB,
                    config.ConfigManager().STCandidateCollection,
                    UpdateTemplateWhere, DBSet, connection)
                strtimestamp += ' ' + str(datetime.datetime.now())
            if filepath[-4:].lower() == ".doc":
                docreadcount += 1
                print('Total doc read count:' + str(docreadcount))
            # per-file progress trace (placement presumed from the collapsed
            # source -- original indentation was ambiguous; confirm)
            print('File : ' + str(file_count) + ' ' + strtimestamp)
            print('Antiword empty count:' + str(antiwordemptycount))
        except BaseException as ex:
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            # FIX: the filepath was dropped from this message in the original
            # ('File: ' + '\n'); include it as the sibling route_dataread does
            exception_message += 'File: ' + filepath + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            # .encode('utf8'))
            utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                  exception_message)
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'Number of resumes read - ' + str(file_count) + ' ' +
        str(datetime.datetime.now()))