def automate_processes():
    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                          'Master automation script running')
    try:
        utility.update_config_coll_process_started_date()
        # Supplier master list load
        exec(open('st_master_supplier_data_read.py').read(), globals())
        # Client master list load
        exec(open('stclientsdataread.py').read(), globals())
        # Currency master list load
        exec(open('currencydataread.py').read(), globals())
        # Industry master list load
        exec(open('industrydataread.py').read(), globals())
        # MSP master list load
        exec(open('stmspdataread.py').read(), globals())
        # Rates information transfer from Smart Track
        exec(open('stratesdataread.py').read(), globals())
        # PromptCloud data load automation
        exec(open('prompt_cloud_automation.py').read(), globals())
        # Transferring files from staging collection to masters collection
        exec(open('staging_data_read.py').read(), globals())
        # Generating master integer graph
        exec(open('gen_docintgraph_from_db.py').read(), globals())
        # Transferring file to webserver
        exec(open('master_int_graph_transfer.py').read(), globals())
        # Learning automation
        exec(open('knowledge_build_automation.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(ex, config.ConfigManager().LogFile)
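# Illustrative sketch (not part of the live pipeline): the exec(open(...))
# calls above could be driven from a single list, so adding a stage only
# means appending a filename. It reuses the same exec-with-globals() pattern
# as automate_processes(); the script names in the usage note are the ones
# already run above.
def run_pipeline_scripts(script_paths):
    """Execute each pipeline script in order, sharing this module's globals."""
    for path in script_paths:
        # Each script runs as if its source were pasted here, which is what
        # the exec(open(...).read(), globals()) calls above rely on.
        exec(open(path).read(), globals())

# Example: run_pipeline_scripts(['st_master_supplier_data_read.py',
#                                'stclientsdataread.py'])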
def modifygeodata():
    ratesData = mastercoll.find({})
    for row in ratesData:
        try:
            if row['cityLocationFlag'] == 1:
                cityGeoLocation = []
                cityGeoLocation.append(float(row['cityLongitude']))
                cityGeoLocation.append(float(row['cityLatitude']))
                row['coordinates'] = cityGeoLocation
                mastercoll.update({"doc_id": row['doc_id']},
                                  {"$set": {"coordinates": cityGeoLocation}})
            # if row['stateLocationFlag'] == 1:
            #     stateGeoLocation = []
            #     stateGeoLocation.append(float(row['stateLatitude']))
            #     stateGeoLocation.append(float(row['stateLongitude']))
            #     row['stateGeoLocation'] = stateGeoLocation
            #     # print(row, "\n")
            #     mastercoll.update({"doc_id": row['doc_id']},
            #                       {"$set": {"stateGeoLocation": stateGeoLocation}})
        except BaseException as ex:
            utility.log_exception_file(ex, config.ConfigManager().LogFile)
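# modifygeodata() writes coordinates as [longitude, latitude], which is the
# GeoJSON field order MongoDB's geospatial queries expect. A hedged sketch of
# how that field could then be queried; the 2dsphere index is an assumption,
# since nothing in this code base shows one being created:
def find_nearby_jobs(lng, lat, max_metres=50000):
    """Illustrative only: radius query against the coordinates field above.
    Assumes a 2dsphere index exists, e.g.
    mastercoll.create_index([("coordinates", "2dsphere")])  # hypothetical
    """
    return mastercoll.find({
        "coordinates": {
            "$near": {
                "$geometry": {"type": "Point", "coordinates": [lng, lat]},
                "$maxDistance": max_metres,  # metres
            }
        }
    })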
def pc_rates_data_storage(page_dict_object, filepath, dbrecordcount):
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict
    dict_object_record_list = []
    try:
        page_object_list = page_dict_object['page']
        if isinstance(page_object_list['record'], list):
            for record_object in page_object_list['record']:
                record_object = pc_rates_add_fields(record_object, filepath)
                # Skip oversized descriptions (13 MB guard, presumably to
                # stay under MongoDB's 16 MB document limit)
                if sys.getsizeof(record_object['description']) < 13000000:
                    dict_object_record_list.append(record_object)
                    dbrecordcount += 1
        else:
            record_object = page_object_list['record']
            record_object = pc_rates_add_fields(record_object, filepath)
            if sys.getsizeof(record_object['description']) < 13000000:
                dict_object_record_list.append(record_object)
                dbrecordcount += 1
    except BaseException as ex:
        utility.log_exception_file(ex,
                                   config.ConfigManager().PromptcloudLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)  # updating doc_id in config table
    return dbrecordcount
def job_info_analysis(page, filepath, dbrecordcount):
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict
    dict_object_record_list = []
    for jobinfo in page.findall('record'):
        try:
            # creating dictionary from xml tag contents
            dict_object = utility.xml_to_dict(ET.tostring(jobinfo))
            # totaljobsdict = fill_job_by_site(filepath)
            # totalrecords += 1
            # outer if checks whether the jobdescription tag is in the xml
            if 'jobdescription' in dict_object['record']:
                # checking if job description is none
                if dict_object['record']['jobdescription'] is not None:
                    incorrectjobdescription = 0
                    if dict_object['record']['jobdescription'].strip() == '':
                        incorrectjobdescription = 1
                    if len(dict_object['record']['jobdescription']) < 20:
                        incorrectjobdescription = 1
                    if dict_object['record']['jobdescription'].strip()[-3:] == '...':
                        incorrectjobdescription = 1
                    if incorrectjobdescription == 0:
                        dict_object['record']['dateCreated'] = datetime.datetime.now()
                        dict_object['record']['dateModified'] = datetime.datetime.now()
                        dict_object['record']['createdUser'] = '******'
                        dict_object['record']['modifiedUser'] = '******'
                        dict_object['record']['source'] = 'PromptCloud'
                        # dict_object['record']['Url'] = page['pageurl']
                        dict_object_record_list.append(dict_object['record'])
                        dbrecordcount += 1
        except BaseException as ex:
            utility.log_exception_file(
                ex, dcrconfig.ConfigManager().SemanticGraphLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)  # updating doc_id in config table
    return dbrecordcount
def job_info_analysis_storage(page_dict_object, filepath, dbrecordcount):
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict
    dict_object_record_list = []
    try:
        dict_object = page_dict_object['page']
        # outer if checks whether the jobdescription tag is in the xml
        if 'jobdescription' in dict_object['record']:
            # checking if job description is none
            if dict_object['record']['jobdescription'] is not None:
                incorrectjobdescription = 0
                if dict_object['record']['jobdescription'].strip() == '':
                    incorrectjobdescription = 1
                if len(dict_object['record']['jobdescription']) < 20:
                    incorrectjobdescription = 1
                if dict_object['record']['jobdescription'].strip()[-3:] == '...':
                    incorrectjobdescription = 1
                if incorrectjobdescription == 0:
                    dict_object['record']['dateCreated'] = datetime.datetime.now()
                    dict_object['record']['dateModified'] = datetime.datetime.now()
                    dict_object['record']['createdUser'] = '******'
                    dict_object['record']['modifiedUser'] = '******'
                    dict_object['record']['source'] = 'PromptCloud'
                    dict_object['record']['Url'] = dict_object['pageurl']
                    dict_object['record']['fileName'] = filepath.replace(
                        config.ConfigManager().PCFileFolder + '/', '')
                    dict_object_record_list.append(dict_object['record'])
                    dbrecordcount += 1
    except BaseException as ex:
        utility.log_exception_file(
            ex, dcrconfig.ConfigManager().SemanticGraphLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)  # updating doc_id in config table
    return dbrecordcount
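# The three rejection rules above (blank after strip, fewer than 20
# characters, truncated with a trailing '...') are repeated verbatim in
# job_info_analysis(). A small refactor sketch both call sites could share;
# the function name is ours, not part of the existing code base:
def is_usable_jobdescription(description):
    """Return True when a scraped job description passes the checks above."""
    if description is None:
        return False
    text = description.strip()
    if text == '' or len(description) < 20:
        return False
    if text[-3:] == '...':  # counted as an incomplete description above
        return False
    return True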
def sendflagdetailstoexternalsystem():
    flagdetailsdata = flagdetailscollection.find({"isSent": 0},
                                                 {"_id": 0, "isSent": 0})
    flagdetailsList = []
    for data in flagdetailsdata:
        flagdetailsList.append(data)
    if flagdetailsList:
        headers = {"Content-Type": "application/json"}
        flagdetailsList = json.dumps(flagdetailsList).encode('utf8')
        conn = http.client.HTTPConnection(config.ConfigManager().STwebApiHost,
                                          "80")
        conn.request(config.ConfigManager().JobServerMethod,
                     config.ConfigManager().stWebApiSendData, flagdetailsList,
                     headers)
        # conn = http.client.HTTPConnection("localhost", "4400")
        # conn.request(config.ConfigManager().JobServerMethod,
        #              config.ConfigManager().stWebApiSendData,
        #              flagdetailsList, headers)
        response = conn.getresponse()
        try:
            if response.status == 200:
                data = response.read()
                result = json.loads(data.decode('utf8'))
                # resumeIds = result[0]["resumeID"].split(',')
                resumeIds = [str(x.strip()) for x in result]
                if resumeIds:
                    flagdetailscollection.update(
                        {"batchId": {"$in": resumeIds}},
                        {"$set": {"isSent": 1}}, multi=True)
                    utility.write_to_file(
                        config.ConfigManager().LogFile, 'a',
                        'ST candidate resume screening, ' +
                        str(len(resumeIds)) +
                        ' resume detection details sent successfully' + ' ' +
                        str(datetime.datetime.now()))
                else:
                    utility.write_to_file(
                        config.ConfigManager().LogFile, 'a',
                        'ST candidate resume screening, no resume detection update done'
                        + ' ' + str(datetime.datetime.now()))
            else:
                ex = str(response.status) + "--" + str(response.reason)
                utility.log_exception_file(ex, file)
                utility.write_to_file(
                    config.ConfigManager().LogFile, 'a',
                    'ST candidate resume screening, API down time' + ' ' +
                    str(datetime.datetime.now()))
        except BaseException as ex:
            utility.log_exception_file(ex, file)
def automate_processes():
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Knowledge build automation running! ' + str(datetime.datetime.now()))
    try:
        # Copies files from the previous cycle
        exec(open('filecopy.py').read(), globals())
        # Copy the noun phrase text from Mongo DB
        exec(open('dbtophrasefile.py').read(), globals())
        # Remove any ngram of 3 or more words
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save them in a new distinct phrase file
        exec(open('duplicatefinder.py').read(), globals())
        # Check if there is an existing semantic graph; if yes, load and
        # update it with the new documents, else create a new semantic graph
        # and store it. Normally run after ngram removal and duplicate
        # removal.
        exec(open('dcrgraphgenerator.py').read(), globals())
        # Read the semantic graph saved by dcrgraphgenerator.py, read the
        # document phrase file, and create an optimized integer semantic
        # edge file.
        exec(open('dcrgraphcompactor.py').read(), globals())
        # Save the node dictionary to file using pickle. The programs above
        # use it for finding node ids.
        exec(open('savenodes.py').read(), globals())
        # Generate document integer graph and store. This will be used for
        # searching the documents.
        # exec(open('dcrdocumentintgraphgenerator.py').read(), globals())
        # Copy the noun phrase text from Mongo DB (Intelligence collection)
        exec(open('stdbtophrasefile.py').read(), globals())
        # Remove any ngram of 3 or more words
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save them in a new distinct phrase file
        exec(open('duplicatefinder.py').read(), globals())
        # Same graph generation, compaction and node save as above, run
        # against the Intelligence collection
        exec(open('stdcrgraphgenerator.py').read(), globals())
        exec(open('stdcrgraphcompactor.py').read(), globals())
        exec(open('savenodes.py').read(), globals())
        # Transfer generated intelligence files
        exec(open('filetransfer.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(
            ex, dcrconfig.ConfigManager().SemanticGraphLogFile)
def sendtoexternalsystem():
    resumeData = collection.find({"isSent": 0}, {"_id": 0, "isSent": 0})
    candidateFlags = []
    for data in resumeData:
        data["fileName"] = " "
        candidateFlags.append(data)
    if candidateFlags:
        headers = {"Content-Type": "application/json"}
        candidateFlags = json.dumps(candidateFlags).encode('utf8')
        conn = http.client.HTTPConnection(config.ConfigManager().STwebApiHost,
                                          "80")
        conn.request(config.ConfigManager().JobServerMethod,
                     config.ConfigManager().STwebApiUrl, candidateFlags,
                     headers)
        response = conn.getresponse()
        try:
            if response.status == 200:
                data = response.read()
                result = json.loads(data.decode('utf8'))
                resumeIds = result[0]["resumeID"].split(',')
                resumeIds = [str(x.strip()) for x in resumeIds]
                print(resumeIds)
                if resumeIds:
                    collection.update({"batchId": {"$in": resumeIds}},
                                      {"$set": {"isSent": 1}}, multi=True)
                    utility.write_to_file(
                        config.ConfigManager().LogFile, 'a',
                        'ST candidate resume screening, ' +
                        str(len(resumeIds)) + ' flag(s) sent successfully' +
                        ' ' + str(datetime.datetime.now()))
                else:
                    utility.write_to_file(
                        config.ConfigManager().LogFile, 'a',
                        'ST candidate resume screening, no candidates found' +
                        ' ' + str(datetime.datetime.now()))
            else:
                ex = str(response.status) + "--" + str(response.reason)
                utility.log_exception_file(ex, file)
                utility.write_to_file(
                    config.ConfigManager().LogFile, 'a',
                    'ST candidate resume screening, API down time' + ' ' +
                    str(datetime.datetime.now()))
        except BaseException as ex:
            utility.log_exception_file(ex, file)
def process_staging_row(row):
    try:
        global dataList
        # Step 1: data scrubbing for email, phone, url and candidate name
        row = dataclean(row)
        # Step 2: noun phrases generation
        row = generatenounphrases(row)
        # Step 3: signature generation
        row = signaturegraph(row)
        # Step 4: rates calculation (must run before the availability check)
        row = rates_calculation.billratescalculation(row)
        # Step 5: verification of rate availability
        row = rate_available(row)
        # geographical data check and additions
        row = custom.geo_data_check(row, row['countryDictList'], 'country')
        row = custom.geo_data_check(row, row['stateDictList'], 'state')
        row = custom.geo_data_check(row, row['cityDictList'], 'city')
        row = custom.geo_data_check(row, row['zipCodeDictList'], 'zipCode')
        del row['countryDictList']
        del row['stateDictList']
        del row['cityDictList']
        del row['zipCodeDictList']
        dataList.append(row)
        if row['i'] % int(
                config.ConfigManager().StagingMasterTransferStep) == 0:
            stagingDateModified = row['stagingDateModified']
            del row['stagingDateModified']
            objectid = row['objectid']
            del row['objectid']
            del row['i']
            # Step 6: insert data to db
            mastercoll.insert(dataList)
            dataList = []
            docid = row['doc_id']
            # Step 7: update config collection with doc_id and datetime
            updateconfigcollection(docid, stagingDateModified, objectid)
    except BaseException as ex:
        utility.log_exception_file(ex, config.ConfigManager().LogFile)
def update_graph():
    '''Load the existing graph and update it with the new set of job
    descriptions from predefined locations based on the application.ini
    file.'''
    semantic_graph = load_graph()
    phrase_file = open(dcrconfig.ConfigManager().DistinctPhraseFile, 'r')
    # Get the config values
    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight
    graph_filter_weight = dcrconfig.ConfigManager().FilterGraphEdgeWeight
    print("weight:%d filter weight: %d" % (graph_weight, graph_filter_weight))
    # graph_collection = []
    jdcount = 0
    for line in phrase_file:
        try:
            line = line.strip()
            if not (line.startswith('--') or len(line.strip()) < 1):
                graph = dcrgraph.create_graph_distant_neighbors(line,
                                                                graph_weight)
                dcrgraph.union_graph(semantic_graph, graph, graph_weight)
                jdcount += 1
            elif line.startswith('--'):
                # A line starting with '--' marks the beginning of a job
                # description, so print a dot to indicate progress.
                print('.', end='')
                if jdcount % 1000 == 0:
                    print('%d' % jdcount)
                sys.stdout.flush()
        except BaseException as ex:
            utility.log_exception_file(
                ex, dcrconfig.ConfigManager().SemanticGraphLogFile)
    count = list(d['weight']
                 for u, v, d in semantic_graph.edges_iter(data=True)
                 if d['weight'] > graph_filter_weight)
    # nx.write_gexf(semantic_graph,
    #               dcrconfig.ConfigManager().SemanticGraphFile)
    mx = max(count)
    print('mx : %d, total jd processed : %d ' % (mx, jdcount))
    print('Semantic Graph Info: %s' % nx.info(semantic_graph))
    return semantic_graph
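# The distinct phrase file parsed above interleaves separator lines with noun
# phrases. The phrase-export step writes each separator as ---<doc_id>---
# (see the jobUniqueId built in the export snippet further down), which is
# why update_graph() skips lines starting with '--'. An illustration of the
# expected layout (the phrases themselves are invented for the example):
#
#   ---1024---
#   senior java developer
#   spring boot
#   ---1025---
#   data engineer
#   apache spark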
def automate_processes():
    utility.write_to_file(config.ConfigManager().PromptcloudLogFile, 'a',
                          'PromptCloud automation script running')
    try:
        # Download files into PCCompData within mnt/nlpdata, xml format
        exec(
            open('rates_pc_download_crawldata_threading.py').read(),
            globals())
        # Compress the PCCompData folder
        exec(open('compress.py').read(), globals())
        # Unzip files created in the PCData folder; time stored in
        # dataloadconfig
        exec(open('pc_rates_unzip_gz.py').read(), globals())
        # Download data into pcdataanalysisresults.ods
        exec(open('pc_rates_dataload.py').read(), globals())
        # For automatically sending emails
        # exec(open('mailsend.py').read(), globals())
        # Store analysis file in s3 backup
        # exec(open('pcdataanalysisbackup.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(ex,
                                   config.ConfigManager().PromptcloudLogFile)
def datamasking(row):
    maskingText = makingjsondata(row)
    maskingText = json.dumps(maskingText)
    headers = {"Content-Type": "application/json"}
    conn = http.client.HTTPConnection(config.ConfigManager().Host,
                                      config.ConfigManager().Port)
    conn.request(config.ConfigManager().JobServerMethod,
                 config.ConfigManager().API, maskingText, headers)
    response = conn.getresponse()
    data = response.read()
    result = json.loads(data.decode('utf8'))
    try:
        row['supplierName'] = result['supplierName']
        row['clientId'] = result['clientId']
        row['mspId'] = result['mspId']
        row['dataSource'] = result['dataSource']
        # row['source'] = result['source']
    except BaseException as ex:
        print(ex)
        utility.log_exception_file(ex, file)
    conn.close()
    return row
import utility
import datetime
import dcrgraph
import config
from pymongo import MongoClient

utility.write_to_file(
    config.ConfigManager().LogFile, 'a',
    'Document integer graph from DB! (gen_docintgraph_from_db.py) ' +
    str(datetime.datetime.now()))
cl = MongoClient(config.ConfigManager().MongoClient.replace(
    "##host##", config.ConfigManager().mongoDBHost))
db = cl[config.ConfigManager().RatesDB]
mastercoll = db[config.ConfigManager().masterCollection]
ratesConfig = db[config.ConfigManager().RatesConfigCollection]
ratesConfigValues = ratesConfig.find({})
masterDateCreated = ratesConfigValues[0]['masterDateModified']
masterDateCreatedList = []
for doc in mastercoll.find({"dateCreated": {"$gt": masterDateCreated}}):
    try:
        if not doc["signGraph"] == "":
            dcrgraph.generate_document_integer_graph_fromdb(
                doc["signGraph"], doc['doc_id'], 'a',
                config.ConfigManager().masterDocumentIntegerFile)
            masterDateCreatedList.append(doc["dateCreated"])
    except BaseException as ex:
        utility.log_exception_file(str(ex), config.ConfigManager().LogFile)
try:
    if masterDateCreatedList:
        masterDateCreatedLatest = max(masterDateCreatedList)
        ratesConfig.update(
            {"_id": ratesConfigValues[0]['_id']},
            {"$set": {"masterDateModified": masterDateCreatedLatest}})
except BaseException as ex:
    utility.log_exception_file(str(ex), config.ConfigManager().LogFile)
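# The script above advances a watermark: it only processes documents created
# after masterDateModified, then stores the newest dateCreated it saw so the
# next run picks up just the delta. The same pattern in isolation (a sketch;
# the function and its arguments are ours, not part of the code base):
def process_delta(coll, config_coll, config_doc, watermark_field, handle):
    """Process documents newer than the stored watermark, then advance it."""
    watermark = config_doc[watermark_field]
    seen = []
    for doc in coll.find({"dateCreated": {"$gt": watermark}}):
        handle(doc)
        seen.append(doc["dateCreated"])
    if seen:
        # Advance the watermark only after at least one document succeeded.
        config_coll.update({"_id": config_doc["_id"]},
                           {"$set": {watermark_field: max(seen)}})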
def readstagingdata():
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'Staging dataread running' + ' ' + str(datetime.datetime.now()))
    ratesConfigValues = ratesConfig.find({})
    ratesDate = ratesConfigValues[0]['stagingDateModified']
    ratesData = stagingcoll.find({'dateModified': {"$gt": ratesDate}},
                                 no_cursor_timeout=True)
    doc_id = ratesConfigValues[0]['masterDocId']
    objectid = ratesConfigValues[0]['_id']
    dataList = []
    dateModifiedList = []
    geoCountryQuery = "select distinct iso_alpha2, name, iso_alpha3, fips_code from geo_country order by name"
    geoStateQuery = "select ga1.code, ga1.name, gn.admin1, gn.latitude, gn.longitude from geo_admin1 ga1 inner join geo_name gn on ga1.geonameid = gn.geonameid"
    geoCityQuery = "select distinct sAdminName1, sAdminCode1, sCountryCode, sPlaceName, fLatitude, fLongitude from GeoPostal order by sPlaceName"
    geoZipCodeQuery = "select distinct sAdminName1, sAdminCode1, sCountryCode, sPostalCode, fLatitude, fLongitude from GeoPostal order by sPostalCode"
    countryDictList = custom.create_sql_dict_list(
        geoCountryQuery, config.ConfigManager().geographicalDataConnstr)
    stateDictList = custom.create_sql_dict_list(
        geoStateQuery, config.ConfigManager().geographicalDataConnstr)
    cityDictList = custom.create_sql_dict_list(
        geoCityQuery, config.ConfigManager().geographicalDataConnstr)
    zipCodeDictList = custom.create_sql_dict_list(
        geoZipCodeQuery, config.ConfigManager().geographicalDataConnstr)
    i = 0
    for row in ratesData:
        try:
            dateModifiedList.append(row['dateModified'])
            i += 1
            del row['_id']
            doc_id += 1
            row['doc_id'] = doc_id
            # Step 1: data scrubbing for email, phone, url and candidate name
            row = dataclean(row)
            # Step 2: noun phrases generation
            row = generatenounphrases(row)
            # Step 3: signature generation
            row = signaturegraph(row)
            # Step 4: rates calculation (must run before the availability
            # check)
            row = rates_calculation.billratescalculation(row)
            # Step 5: verification of rate availability
            row = rate_available(row)
            # Step 5b: verification of location/city availability
            # row = location_available(row)
            # Step 5c: get lat/long of city
            # row = get_lat_long_of_city(row)
            # geographical data check and additions; sentinel placeholder
            # values used during the geo checks
            row['iso_alpha2_value'] = ')(*&^'
            row['admin1_value'] = ')(*&^'
            row['state_name'] = ')(*&^'
            row = custom.geo_data_check(row, countryDictList, 'country')
            row = custom.geo_data_check(row, stateDictList, 'state')
            row = custom.geo_data_check(row, cityDictList, 'city')
            row = custom.geo_data_check(row, zipCodeDictList, 'zipCode')
            del row['iso_alpha2_value']
            del row['admin1_value']
            del row['state_name']
            dataList.append(row)
            if i >= int(config.ConfigManager().StagingMasterTransferStep):
                # Step 6: insert data to db
                mastercoll.insert(dataList)
                dataList = []
                i = 0
                docid = row['doc_id']
                stagingDateModified = max(dateModifiedList)
                # Step 7: update config collection with doc_id and datetime
                updateconfigcollection(docid, stagingDateModified, objectid)
        except BaseException as ex:
            print(ex)
            utility.log_exception_file(ex, config.ConfigManager().LogFile)
            # utility.log_exception_file(row, config.ConfigManager().LogFile)
    ratesData.close()
    del ratesData
"$ne": "Smart Track" } }] }).sort([("doc_id", 1)]).limit(20000): try: allphrases = '' phrases = doc['nounPhrases'] docId = int(doc['doc_id']) docIdList.append(docId) jobUniqueId = '-' * 3 + str(docId) + '-' * 3 allphrases += '\n' + jobUniqueId + '\n' + phrases print(allphrases, file=jobDescPhrasesFile) jcount += 1 # Print status print('.', end='') sys.stdout.flush() except BaseException as ex: utility.log_exception_file( ex, dcrconfig.ConfigManager().SemanticGraphLogFile) # Updating maximum value in order to take delta in next cycle if docIdList: configcol.update({"_id": configdocs[0]['_id']}, {"$set": { "dataDbToPhraseDocId": max(docIdList) }}) print("total documents processed: " + str(jcount))
def staging_data_load(filepath):
    dataList = []
    errorList = []
    errorString = ""
    errorFinalString = ""
    excel_file = open_workbook(filepath)
    sheet = excel_file.sheets()[0]
    header_keys = [
        sheet.cell(0, colindex).value for colindex in range(sheet.ncols)
    ]
    stTypeofServiceRows = [x for x in stTypeofServiceCollection.find({})]
    stLaborCategoryRows = [x for x in stLaborCategoryCollection.find({})]
    industryRows = [x for x in industryCollection.find({})]
    currencyRows = [x for x in currencyCollection.find({})]
    stClientRows = [x for x in stClientsCollection.find({})]
    stMspRows = [x for x in stMspCollection.find({})]
    yesNoRows = [{'yesNo': 'Yes'}, {'yesNo': 'No'}]
    geoCountryQuery = "select distinct name from geo_country order by name"
    geoStateQuery = "select distinct name from geo_admin1 order by name"
    geoCityQuery = "select distinct sPlaceName from GeoPostal order by sPlaceName"
    geoZipCodeQuery = "select distinct sPostalCode from GeoPostal order by sPostalCode"
    countryList = create_sql_data_list(
        geoCountryQuery, config.ConfigManager().geographicalDataConnstr,
        'name')
    stateList = create_sql_data_list(
        geoStateQuery, config.ConfigManager().geographicalDataConnstr, 'name')
    cityList = create_sql_data_list(
        geoCityQuery, config.ConfigManager().geographicalDataConnstr,
        'sPlaceName')
    zipCodeList = create_sql_data_list(
        geoZipCodeQuery, config.ConfigManager().geographicalDataConnstr,
        'sPostalCode')
    for rowindex in range(1, sheet.nrows):
        try:
            errorString = ""
            row_dict = {
                header_keys[colindex]: sheet.cell(rowindex, colindex).value
                for colindex in range(sheet.ncols)
            }
            row_dict = add_fields(row_dict)
            mandatoryFieldsPresent = mandatory_fields_check(
                row_dict, mandatoryFields)
            if mandatoryFieldsPresent:
                mandatoryFieldsValuePresent = mandatory_fields_value_presence_check(
                    row_dict, mandatoryFields)
                if mandatoryFieldsValuePresent:
                    stTypeofServiceValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'typeOfService', stTypeofServiceRows,
                        "VMSTypeofService")
                    stLaborCategoryValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'laborCategory', stLaborCategoryRows,
                        "LaborCategory")
                    industryValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'industry', industryRows, "IndustryName")
                    currencyValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'currency', currencyRows, "currencyCode")
                    clientValueAccuracyCheck = numerical_value_accuracy_check(
                        row_dict, 'clientId', stClientRows, "clientID")
                    mspValueAccuracyCheck = numerical_value_accuracy_check(
                        row_dict, 'mspId', stMspRows, "mspID")
                    rVFlagValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'remoteOrVirtualFlag', yesNoRows, "yesNo")
                    fpTFlagValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'fullTime', yesNoRows, "yesNo")
                    geoCountryAccuracyCheck = geo_data_check(
                        row_dict, countryList, 'country')
                    geoStateAccuracyCheck = geo_data_check(
                        row_dict, stateList, 'state')
                    geoCityAccuracyCheck = geo_data_check(
                        row_dict, cityList, 'city')
                    geoZipCodeAccuracyCheck = geo_data_check(
                        row_dict, zipCodeList, 'zipCode')
                    numericalValidationList = numerical_validation(
                        row_dict, errorString)
                    errorString = numericalValidationList[1]
                    numericalValidation = numericalValidationList[0]
                    dateFormatValidation1 = post_date_format_check(
                        row_dict, errorString, excel_file)
                    dateFormatValidation2 = post_date_format_check_two(
                        row_dict, errorString)
                    if not dateFormatValidation1 and not dateFormatValidation2:
                        dateFormatValidation = False
                        errorString += 'Post date is not in the right format; '
                    else:
                        dateFormatValidation = True
                    if dateFormatValidation1:
                        # Excel serial date -> ISO string
                        row_dict['postDate'] = (datetime.datetime(
                            *xlrd.xldate_as_tuple(
                                row_dict['postDate'],
                                excel_file.datemode))).date().isoformat()
                    if dateFormatValidation2:
                        # '%Y-%m-%d' string -> ISO string
                        row_dict['postDate'] = (datetime.datetime.strptime(
                            str(row_dict['postDate']),
                            '%Y-%m-%d')).date().isoformat()
                    if stTypeofServiceValueAccuracyCheck and stLaborCategoryValueAccuracyCheck \
                            and industryValueAccuracyCheck and currencyValueAccuracyCheck \
                            and geoCountryAccuracyCheck and geoStateAccuracyCheck \
                            and geoCityAccuracyCheck and geoZipCodeAccuracyCheck \
                            and clientValueAccuracyCheck and mspValueAccuracyCheck \
                            and numericalValidation and rVFlagValueAccuracyCheck \
                            and fpTFlagValueAccuracyCheck and dateFormatValidation:
                        dataList.append(row_dict)
                    else:
                        errorString = master_list_mismatch_message_composition(
                            errorString, stTypeofServiceValueAccuracyCheck,
                            stLaborCategoryValueAccuracyCheck,
                            industryValueAccuracyCheck,
                            currencyValueAccuracyCheck,
                            clientValueAccuracyCheck, mspValueAccuracyCheck,
                            rVFlagValueAccuracyCheck,
                            fpTFlagValueAccuracyCheck, geoCountryAccuracyCheck,
                            geoStateAccuracyCheck, geoCityAccuracyCheck,
                            geoZipCodeAccuracyCheck)
                else:
                    errorString += 'Mandatory fields are empty; '
            else:
                errorString += 'Mandatory fields are absent; '
            errorString = error_string_clean(errorString)
            if not errorString == "":
                errorString = ('Errors in row ' + str(rowindex + 1) + ' - ' +
                               errorString)
                errorList.append(errorString)
        except BaseException as ex:
            errorString = ('Errors in row ' + str(rowindex + 1) + ' - ' +
                           'exception!!; ' + errorString)
            errorList.append(errorString)
            utility.log_exception_file(ex, config.ConfigManager().LogFile)
    try:
        # file_back_up_and_removal(filepath)
        pass
    except BaseException as ex:
        utility.log_exception_file(ex, config.ConfigManager().LogFile)
    try:
        if dataList:
            insert_to_db(dataList, stagingCollection)
        if errorList:
            errorList.insert(
                0,
                'Data submitted successfully! Please upload a brand new file after correcting the following errors.'
            )
        else:
            errorList.insert(0, 'Data submitted successfully!')
        errorFinalString = '|!@#$%|'.join(errorList)
        print(errorFinalString)
    except BaseException as ex:
        print('Exception during data load!')
        utility.log_exception_file(ex, config.ConfigManager().LogFile)
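# staging_data_load() accepts post dates in two shapes: an Excel serial
# number (converted via xlrd.xldate_as_tuple with the workbook's datemode) or
# a '%Y-%m-%d' string. A minimal demonstration of both conversions, using the
# datetime and xlrd modules this file already relies on (the sample values
# are invented):
def demo_post_date_conversions():
    # Excel serial 43466.0 is 2019-01-01 in the default 1900 date system
    # (datemode 0); xldate_as_tuple returns (y, m, d, h, m, s).
    iso_from_serial = datetime.datetime(
        *xlrd.xldate_as_tuple(43466.0, 0)).date().isoformat()
    # A string date is normalised through strptime the same way as above.
    iso_from_string = datetime.datetime.strptime(
        '2019-01-01', '%Y-%m-%d').date().isoformat()
    return iso_from_serial, iso_from_string  # both '2019-01-01'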
def job_info_analysis(page, filepath, dbrecordcount):
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict
    # Fetching current config parameters
    configdocs = custom.retrieve_data_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().IntelligenceDb,
        config.ConfigManager().ConfigCollection)
    docid_count = int(configdocs[0]['docid_count'])
    dict_object_record_list = []
    for jobinfo in page.findall('record'):
        try:
            # creating dictionary from xml tag contents
            dict_object = utility.xml_to_dict(ET.tostring(jobinfo))
            totaljobsdict = fill_job_by_site(filepath)
            totalrecords += 1
            # outer if checks whether the jobdescription tag is in the xml
            if 'jobdescription' in dict_object['record']:
                # checking if job description is none
                if dict_object['record']['jobdescription'] is not None:
                    # variable to determine if record needs to be updated
                    # in DB
                    incorrectjobdescription = 0
                    # checking if job description is empty
                    if dict_object['record']['jobdescription'].strip() == '':
                        write_fileinfo(filepath, dict_object)
                        invalidrecords += 1
                        emptydesc += 1
                        incorrectjobdescription = 1
                        jobsitedict = fill_job_site_data(filepath)
                    # checking if job desc has less than 20 chars
                    if len(dict_object['record']['jobdescription']) < 20:
                        incorrectjobdescription = 1
                        # eliminating the incomplete desc case, which is
                        # counted separately below
                        if dict_object['record']['jobdescription'].strip()[-3:] == '...':
                            pass
                        else:
                            write_fileinfo(filepath, dict_object)
                            invalidrecords += 1
                            smalldesc += 1
                            jobsitedict = fill_job_site_data(filepath)
                    # checking the incomplete desc case
                    if dict_object['record']['jobdescription'].strip()[-3:] == '...':
                        incorrectjobdescription = 1
                        write_fileinfo(filepath, dict_object)
                        invalidrecords += 1
                        incompletedesc += 1
                        jobsitedict = fill_job_site_data(filepath)
                    if incorrectjobdescription == 0:
                        docid_count += 1
                        dict_object['record']['doc_id'] = docid_count
                        dict_object['record']['description'] = \
                            dict_object['record']['jobdescription']
                        dict_object['record']['nounPhrases'] = ""
                        dict_object_record_list.append(dict_object['record'])
                        dbrecordcount += 1
                # checking if job description is none
                if dict_object['record']['jobdescription'] is None:
                    write_fileinfo(filepath, dict_object)
                    invalidrecords += 1
                    nonedesc += 1
                    jobsitedict = fill_job_site_data(filepath)
            else:
                write_fileinfo(filepath, dict_object)
                invalidrecords += 1
                nodesc += 1
                jobsitedict = fill_job_site_data(filepath)
        except BaseException as ex:
            utility.log_exception_file(
                ex, dcrconfig.ConfigManager().SemanticGraphLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)
    # updating doc_id in config table
    UpdateTemplateWhere = utility.clean_dict()
    UpdateTemplateSet = utility.clean_dict()
    UpdateTemplateWhere['_id'] = configdocs[0]['_id']
    UpdateTemplateSet['docid_count'] = docid_count
    DBSet = utility.clean_dict()
    DBSet['$set'] = UpdateTemplateSet
    custom.update_data_to_Db_noupsert(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().IntelligenceDb,
        config.ConfigManager().ConfigCollection, UpdateTemplateWhere, DBSet,
        connection)
    return dbrecordcount
def readstagingdata():
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'Staging dataread running' + ' ' + str(datetime.datetime.now()))
    ratesConfigValues = ratesConfig.find({})
    ratesDate = ratesConfigValues[0]['stagingDateModified']
    # ratesDataDateMax = ((stagingcoll.find().sort([('dateModified', -1)])
    #                      .limit(1))[0])['dateModified']
    ratesDataCount = stagingcoll.count({'dateModified': {"$gt": ratesDate}})
    geoCountryQuery = "select distinct iso_alpha2, name, iso_alpha3, fips_code from geo_country order by name"
    # geoStateQuery = "select ga1.code, ga1.name, gn.admin1, gn.latitude, gn.longitude from geo_admin1 ga1 inner join geo_name gn on ga1.geonameid = gn.geonameid"
    geoStateQuery = "select gc.iso_alpha2, ga1.code, ga1.name, gn.admin1, gn.latitude, gn.longitude from geo_admin1 ga1 inner join geo_name gn on ga1.geonameid = gn.geonameid inner join geo_country gc on ltrim(rtrim(ga1.code)) like '%'+ ltrim(rtrim(gc.iso_alpha2))+'.' + '%'"
    geoCityQuery = "select distinct sAdminName1, sAdminCode1, sCountryCode, sPlaceName, fLatitude, fLongitude from GeoPostal order by sPlaceName"
    geoZipCodeQuery = "select distinct sAdminName1, sAdminCode1, sCountryCode, sPostalCode, fLatitude, fLongitude from GeoPostal order by sPostalCode"
    # countryDictList = custom.create_sql_dict_list(geoCountryQuery, config.ConfigManager().geographicalDataConnstr)
    # stateDictList = custom.create_sql_dict_list(geoStateQuery, config.ConfigManager().geographicalDataConnstr)
    # cityDictList = custom.create_sql_dict_list(geoCityQuery, config.ConfigManager().geographicalDataConnstr)
    # zipCodeDictList = custom.create_sql_dict_list(geoZipCodeQuery, config.ConfigManager().geographicalDataConnstr)
    geoCountryDict = custom.create_geo_dict(
        geoCountryQuery, config.ConfigManager().geographicalDataConnstr,
        'Country')
    geoStateDict = custom.create_geo_dict(
        geoStateQuery, config.ConfigManager().geographicalDataConnstr,
        'State')
    geoCityDict = custom.create_geo_dict(
        geoCityQuery, config.ConfigManager().geographicalDataConnstr, 'City')
    geoZipCodeDict = custom.create_geo_dict(
        geoZipCodeQuery, config.ConfigManager().geographicalDataConnstr,
        'zipCode')
    cleanUpListDict = data_cleanup_lists()
    ratesConfigValues = ratesConfig.find({})
    ratesDate = ratesConfigValues[0]['stagingDateModified']
    objectid = ratesConfigValues[0]['_id']
    lastDateTime = ratesConfigValues[0]['masterAutomationStartDate']
    oldDate = datetime.datetime.strptime(lastDateTime, '%Y-%m-%d')
    mapping_dict = dcrgraphcompactor.load_node_dict()
    edge_int_dict = dcrgraphcompactor.get_normalized_dictionary_from_int_edges()
    neighborCount = dcrgraph.neighbor_count_for_edge_weight()
    diminition_percent = dcrconfig.ConfigManager().DiminitionPercentage
    # while ratesDate < ratesDataDateMax:
    while ratesDataCount > 0:
        ratesConfigValues = ratesConfig.find({})
        ratesDate = ratesConfigValues[0]['stagingDateModified']
        # countTotalRecords = stagingcoll.count({'dateModified': {"$gt": ratesDate}})
        stepSize = int(config.ConfigManager().StagingMasterTransferStep)
        # if countTotalRecords < stepSize:
        #     stepSize = countTotalRecords
        if ratesDataCount < stepSize:
            stepSize = ratesDataCount
        ratesDataCount = ratesDataCount - stepSize
        ratesData = stagingcoll.find(
            {'dateModified': {"$gt": ratesDate}},
            no_cursor_timeout=True).sort([('dateModified', 1)]).limit(
                int(config.ConfigManager().StagingMasterTransferStep))
        doc_id = ratesConfigValues[0]['masterDocId']
        dataList = []
        dateModifiedList = []
        i = 0
        for row in ratesData:
            try:
                dateModifiedList.append(row['dateModified'])
                i += 1
                print(i)
                del row['_id']
                doc_id += 1
                row['doc_id'] = doc_id
                # Step 1: data scrubbing for email, phone, url and candidate
                # name
                row = dataclean(row, cleanUpListDict)
                # Step 2: noun phrases generation
                row = generatenounphrases(row)
                # Step 3: signature generation
                row = signaturegraph(row, mapping_dict, edge_int_dict,
                                     neighborCount, diminition_percent)
                # Step 4: rates calculation (must run before the availability
                # check)
                row = rates_calculation.billratescalculation(row)
                # Step 5: verification of rate availability
                row = rate_available(row)
                # geographical data check and additions
                row['iso_alpha2_value'] = ')(*&^'
                row['admin1_value'] = ')(*&^'
                row['state_name'] = ')(*&^'
                row = custom.geo_data_verify(row, geoCountryDict, 'country')
                row = custom.geo_data_verify(row, geoStateDict, 'state')
                row = custom.geo_data_verify(row, geoCityDict, 'city')
                row = custom.geo_data_verify(row, geoZipCodeDict, 'zipCode')
                del row['iso_alpha2_value']
                del row['admin1_value']
                del row['state_name']
                dataList.append(row)
            except BaseException as ex:
                utility.log_exception_file(ex, config.ConfigManager().LogFile)
        ratesData.close()
        del ratesData
        if dataList:
            # Step 6: insert data to db
            mastercoll.insert(dataList)
            doc_id = row['doc_id']
        todayDate = datetime.date.today()
        todayDate = datetime.datetime.strptime(str(todayDate), '%Y-%m-%d')
        delta = todayDate - oldDate
        days = delta.days
        if dateModifiedList:
            ratesDate = max(dateModifiedList)
            # Step 7: update config collection with doc_id and datetime
            updateconfigcollection(doc_id, ratesDate, objectid)
        if days >= 5:
            break
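# The while-loop above drains the staging backlog in fixed-size batches of
# StagingMasterTransferStep records, shrinking the final batch to whatever
# remains. The batch bookkeeping in isolation (a sketch with no database
# access; the function name is ours):
def batch_sizes(total_records, step_size):
    """Yield the size of each batch the loop above would process."""
    remaining = total_records
    while remaining > 0:
        size = min(step_size, remaining)
        remaining -= size
        yield size

# Example: list(batch_sizes(2500, 1000)) -> [1000, 1000, 500]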
def stcandidate_update():
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'ST candidates currency code update running!' + ' ' +
        str(datetime.datetime.now()))
    recordnumber = 0
    query = custom.fetch_query(
        config.ConfigManager().STCandidateCurrencyQueryId)
    print(query)
    query = custom.query_variable_replace(
        query, config.ConfigManager().STCandidateCurrencyDetails,
        config.ConfigManager().ST)
    print(query)
    cursor = dbmanager.cursor_odbc_connection(config.ConfigManager().STConnStr)
    db_data_dict = dbmanager.cursor_execute(cursor, query)
    db_data = db_data_dict['dbdata']
    db_data_cursorexec = db_data_dict['cursor_exec']
    cursor_description = db_data_cursorexec.description
    column_headers = [column[0] for column in cursor_description]
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    data_dict1 = {}
    req_list = []
    candidateDatesList = []
    for row1 in db_data:
        try:
            print(data_dict1)
            strtimestamp = str(datetime.datetime.now())
            recordnumber += 1
            print(recordnumber)
            data_dict1 = dict(utility.zip_list(column_headers, row1))
            STCandidateCollection.update(
                {
                    "$and": [{
                        "candidateid": data_dict1['candidateid']
                    }, {
                        "requirementRateStatusList": {
                            "$elemMatch": {
                                "requirementId": data_dict1['requirementid']
                            }
                        }
                    }]
                },
                {
                    "$set": {
                        "requirementRateStatusList.$.currencyCode":
                        data_dict1['currencycode'],
                        "requirementRateStatusList.$.SupplierCurrencyCode":
                        data_dict1['SupplierCurrencyCode'],
                        "requirementRateStatusList.$.supplierRegBillRateEX":
                        str(data_dict1['supplierRegBillRateEX'])
                    }
                })
            candidateDatesList.append(data_dict1['dateCreated'])
        except BaseException as ex:
            print(ex)
            utility.log_exception_file(ex, config.ConfigManager().LogFile)
    if 'dateCreated' in data_dict1:
        maxCandDate = max(candidateDatesList)
        UpdateTemplateWhere = utility.clean_dict()
        UpdateTemplateSet = utility.clean_dict()
        UpdateTemplateWhere['_id'] = configdocs[0]['_id']
        print(maxCandDate)
        print(str(maxCandDate))
        UpdateTemplateSet['STCandidateCurrencyCodeLastDate'] = str(maxCandDate)
        DBSet = utility.clean_dict()
        DBSet['$set'] = UpdateTemplateSet
        custom.update_data_to_Db_noupsert(
            int(config.ConfigManager().MongoDBPort),
            config.ConfigManager().DataCollectionDB,
            config.ConfigManager().ConfigCollection, UpdateTemplateWhere,
            DBSet, connection)
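# stcandidate_update() pairs $elemMatch in the filter with the positional "$"
# operator in the update, so only the requirementRateStatusList entry whose
# requirementId matched is rewritten. The same pattern in a minimal sketch
# (the function and field values are ours, for illustration only):
def update_matched_requirement(coll, candidate_id, requirement_id, currency):
    coll.update(
        {"candidateid": candidate_id,
         "requirementRateStatusList": {
             "$elemMatch": {"requirementId": requirement_id}}},
        # "$" resolves to the first array element matched by the filter.
        {"$set": {"requirementRateStatusList.$.currencyCode": currency}})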