def nounphrase_generate():
    c = MongoClient(dcrconfig.ConfigManager().Datadb)
    db = c[config.ConfigManager().IntelligenceDb]
    col = db[config.ConfigManager().IntelligenceDataCollection]
    docs = col.find({'nounPhrases': ""},
                    {"description": 1, "doc_id": 1, "_id": 1})
    mongoport = int(config.ConfigManager().MongoDBPort)
    connection = dbmanager.mongoDB_connection(mongoport)
    for doc in docs:
        try:
            data = {}
            data['desc'] = doc['description']
            data['_id'] = doc['_id']
            data['doc_id'] = doc['doc_id']
            data['connection'] = connection
            q.put(data)
        except BaseException as ex:
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(
                dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
                exception_message)

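# A minimal sketch of how the work queue consumed above might be wired up.
# The queue name `q`, the worker count, and using
# generate_nounphrase_insert_into_db (defined later in this file) as the
# consumer are assumptions inferred from this code, not confirmed wiring.
import queue
import threading

q = queue.Queue()


def _nounphrase_worker():
    while True:
        data = q.get()
        generate_nounphrase_insert_into_db(data)
        q.task_done()


for _ in range(4):  # assumed thread-pool size
    threading.Thread(target=_nounphrase_worker, daemon=True).start()
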
def nounphrase_generate():
    docs = custom.retrieve_rowdata_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().DataCollectionDBCollection,
        dictionaries.DBWhereConditon)
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    description = ''
    for doc in docs:
        try:
            description = doc['description']
            noun_phrases = dcrnlp.extract_nounphrases_sentences(description)
            dictionaries.UpdateTemplateSet['nounPhrases'] = noun_phrases
            dictionaries.UpdateTemplateWhere['_id'] = doc['_id']
            dictionaries.DBSet['$set'] = dictionaries.UpdateTemplateSet
            custom.update_data_to_Db_con(
                int(config.ConfigManager().MongoDBPort),
                config.ConfigManager().DataCollectionDB,
                config.ConfigManager().DataCollectionDBCollection,
                dictionaries.UpdateTemplateWhere, dictionaries.DBSet,
                connection)
        except BaseException as ex:
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                  exception_message)

def route_compfileread(filepaths):
    for filepath in filepaths:
        # Get the complete file name of the .gz file up front so it is also
        # available for error reporting.
        compfilename = utility.filename_from_filepath(filepath)
        try:
            # Extract the data from the .gz file.
            gzipfile = gzip.GzipFile(filepath, 'rb')
            gzipdata = gzipfile.read()
            gzipfile.close()
            # Recover the original file name.
            filename = compfilename.split('.gz')[0]
            print(filename)
            # Create the file and write the uncompressed data.
            uncompfile = open(
                config.ConfigManager().PCFileFolder + '/' + filename, 'wb')
            uncompfile.write(gzipdata)
            uncompfile.close()
        except BaseException as ex:
            utility.log_exception_with_filepath(ex, filepath)
            # Record file names that could not be extracted with gzip, then
            # drop the unreadable archive.
            utility.write_to_file(
                config.ConfigManager().PCDataAnalysisResultsFile, 'a',
                compfilename + ' cannot be extracted')
            os.remove(filepath)

def main():
    """Load the word features and cluster them by linkage.

    Record the full cluster tree and dump it to file.
    """
    words_set = load_model(GOOGLE_WORD_FEATURE)
    features = []
    words = []
    count = 0
    for word in words_set:
        v = words_set[word]
        if v is not None:
            features.append(v)
            words.append(word)
            count += 1
    clusters, maxcluster = hierarchical_cluster(features, words)
    # The tree is not JSON-serializable, so json.dump cannot be used;
    # fall back to str() and normalize the quotes.
    txt = str(clusters[maxcluster])
    txt = txt.replace("'", '"')
    write_to_file(GOOGLE_CLUSTER_PATH,
                  txt.encode(encoding='utf_8', errors='strict'),
                  mode='wb+')

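# write_to_file is imported from elsewhere; a minimal sketch consistent with
# how it is called in this file (path first, then data, with an optional
# mode defaulting to text append). This is an assumption about the helper,
# not its actual implementation.
def write_to_file(path, data, mode='a+'):
    # The mode string decides text vs. binary, so callers encode explicitly
    # when passing bytes (e.g. mode='wb+').
    with open(path, mode) as f:
        f.write(data)
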
def automate_processes():
    utility.write_to_file(config.ConfigManager().PromptcloudLogFile, 'a',
                          'PromptCloudautomationscript running')
    try:
        # Download files in XML format into PCCompData under mnt/nlpdata.
        exec(open('pc_download_crawldata_threading.py').read(), globals())
        # Compress the PCCompData folder.
        exec(open('compress.py').read(), globals())
        # Unzip files created in the PCData folder; the time is stored in
        # dataloadconfig.
        exec(open('pc_unzip_gz.py').read(), globals())
        # Write analysis data into pcdataanalysisresults.ods.
        exec(open('analyze_crawldata.py').read(), globals())
        # For automatically sending emails:
        # exec(open('mailsend.py').read(), globals())
        # Store the analysis file in the S3 backup.
        exec(open('pcdataanalysisbackup.py').read(), globals())
    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + \
            str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        utility.write_to_file(config.ConfigManager().PromptcloudLogFile, 'a',
                              exception_message)

def print_table(self):
    tmp = (self.efficacy.iloc[:50:3] * 100).astype(int)
    tmp.columns = [x + " (%)" for x in tmp.columns]
    write_to_file(ANALYSIS_NOTES, "EfficacyTable", tmp.to_markdown())
    tmp.plot()
    imgPath = OUT_FOLDER / "img" / "Efficacy.png"
    plt.savefig(imgPath)
    caption = "Estimated efficacy after n days"
    write_img_to_file(ANALYSIS_NOTES, "EfficacyFigure", imgPath, caption)

def valid_records():
    global totaljobsdict
    global jobsitedict
    # Subtract per-site invalid counts from the totals to get valid records
    # per site; sites missing from jobsitedict contribute zero invalid
    # records.
    validjobsdict = {key: totaljobsdict[key] - jobsitedict.get(key, 0)
                     for key in totaljobsdict.keys()}
    utility.write_to_file(config.ConfigManager().PCDataAnalysisResultsFile,
                          'a', 'Total valid records per site: ')
    utility.write_to_file(config.ConfigManager().PCDataAnalysisResultsFile,
                          'a', str(validjobsdict))

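# A tiny self-contained check of the subtraction pattern above, using
# hypothetical counts: sites absent from the invalid-records dict keep
# their full total via .get(key, 0).
totals = {'siteA': 10, 'siteB': 5}
invalid = {'siteA': 3}
assert {k: totals[k] - invalid.get(k, 0) for k in totals} == \
    {'siteA': 7, 'siteB': 5}
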
def automate_processes():
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Knowledge build automation running..! ' +
        str(datetime.datetime.now()))
    try:
        # Copy files from the previous cycle.
        exec(open('filecopy.py').read(), globals())
        # Copy the noun phrase text from Mongo DB.
        exec(open('dbtophrasefile.py').read(), globals())
        # Remove any n-gram of 3 or more words.
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save the result in a new distinct phrase
        # file.
        exec(open('duplicatefinder.py').read(), globals())
        # If an existing semantic graph is found, load it and update it with
        # the new documents; otherwise create a new semantic graph and store
        # it. Normally this runs after n-gram removal and duplicate removal.
        exec(open('dcrgraphgenerator.py').read(), globals())
        # Read the semantic graph saved by dcrgraphgenerator.py and the
        # document phrase file, then create the optimized integer semantic
        # edge file.
        exec(open('dcrgraphcompactor.py').read(), globals())
        # Save the node dictionary to file using pickle. The programs above
        # use it to look up node ids.
        exec(open('savenodes.py').read(), globals())
        # Generate and store the document integer graph used for searching
        # the documents:
        # exec(open('dcrdocumentintgraphgenerator.py').read(), globals())
        # Copy the noun phrase text from Mongo DB (Intelligence collection).
        exec(open('stdbtophrasefile.py').read(), globals())
        # Remove any n-gram of 3 or more words.
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save the result in a new distinct phrase
        # file.
        exec(open('duplicatefinder.py').read(), globals())
        # Load and update an existing semantic graph, or create a new one
        # (as above, for the Intelligence collection).
        exec(open('stdcrgraphgenerator.py').read(), globals())
        # Create the optimized integer semantic edge file for this graph.
        exec(open('stdcrgraphcompactor.py').read(), globals())
        # Save the node dictionary to file using pickle.
        exec(open('savenodes.py').read(), globals())
        # Transfer the generated intelligence files.
        exec(open('filetransfer.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(
            ex, dcrconfig.ConfigManager().SemanticGraphLogFile)

def display_scores(vectorizer, tfidf_result, savesubfolder):
    # http://stackoverflow.com/questions/16078015/
    vocab_path = os.path.join(
        STORAGE, '{}/webpage.vocabulary.txt'.format(savesubfolder))
    write_to_file(vocab_path, b'', 'wb+')
    for fea_name in vectorizer.get_feature_names():
        fea_name = fea_name + '\n'
        write_to_file(vocab_path, fea_name.encode('utf-8'))
    scores = zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    index = 0
    top_path = os.path.join(
        STORAGE, '{}/webpage.vocabulary_top.txt'.format(savesubfolder))
    write_to_file(top_path, b'', 'wb+')
    for item in sorted_scores:
        index += 1
        txt = "{0} {1:50} Score: {2}\n".format(index, repr(item[0]), item[1])
        write_to_file(top_path, txt.encode('utf-8'))

def test_json_to_txt():
    folder = os.path.join(STORAGE, 'test_json')
    save_apicalls = os.path.join(STORAGE, 'test_apicall.txt')
    save_rtvalue = os.path.join(STORAGE, 'test_rtvalue.txt')
    write_to_file(save_apicalls, '', mode='w+')
    write_to_file(save_rtvalue, '', mode='w+')
    for subfolder in os.listdir(folder):
        fullsubfolder = os.path.join(folder, subfolder)
        for fname in os.listdir(fullsubfolder):
            fullname = os.path.join(fullsubfolder, fname)
            with open(fullname) as f:
                jsondata = json.load(f)
            common = [str(jsondata['file_id'])]
            apis = []
            rtns = []
            for tid in jsondata['threads']:
                api_calls = jsondata['threads'][tid]['api_calls']
                apis += [apicall[1] for apicall in api_calls]
                rtns += [apicall[2] for apicall in api_calls]
                apis += ['.']
                rtns += ['.']
            txt = ' '.join(apis)
            txt = ','.join(common + [txt])
            write_to_file(save_apicalls, txt + '\n', mode='a+')
            txt = ' '.join(rtns)
            txt = ','.join(common + [txt])
            write_to_file(save_rtvalue, txt + '\n', mode='a+')

def automate_processes():
    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                          'stautomationscript running')
    try:
        # Read requirement and candidate data from ST.
        exec(open('stdataread.py').read(), globals())
        # Extract candidate resumes.
        exec(open('resume_extract.py').read(), globals())
        # Read extracted resumes and update the 'resumeText' field.
        exec(open('resumeread.py').read(), globals())
        # Append 'resumeText' to the description field.
        exec(open('resume_append.py').read(), globals())
        # Generate noun phrases for the candidate table.
        exec(open('stnounphrase_generate.py').read(), globals())
        # Update requirements and rates for candidates.
        exec(open('requirement_update_fastest.py').read(), globals())
        # Update candidate statuses that changed.
        exec(open('submission_status_update.py').read(), globals())
        # Extract requirement description files.
        exec(open('req_desc_file_extract.py').read(), globals())
        # Read extracted description files and update the 'reqFileDesc'
        # field.
        exec(open('req_desc_file_read.py').read(), globals())
        # Append 'reqFileDesc' to the description field.
        exec(open('req_desc_file_append.py').read(), globals())
        # Generate noun phrases for the requirement tables.
        exec(open('streqnounphrase_generate.py').read(), globals())
        # Get supplier info.
        exec(open('stsupplierdataread.py').read(), globals())
        # Candidate resume screening.
        exec(open('contactinfodetect.py').read(), globals())
        # Client master list load:
        # exec(open('stclientsdataread.py').read(), globals())
        # Currency master list load:
        # exec(open('currencydataread.py').read(), globals())
        # Industry master list load:
        # exec(open('industrydataread.py').read(), globals())
        # MSP master list load:
        # exec(open('stmspdataread.py').read(), globals())
        # Currency code update.
        exec(open('stcandidateCurrency_update_fastest.py').read(), globals())
    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + \
            str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        utility.write_to_file(config.ConfigManager().LogFile, 'a',
                              exception_message)

def create_account(self, account, card_file, initial_balance):
    # The initial balance must be at least 10.
    if Decimal(initial_balance) < 10:
        raise SystemExit(255)
    # Fail if card_file already exists.
    if os.path.isfile(card_file):
        raise SystemExit(255)
    card_secret = generate_random_secret()
    self.send_request('create_account', account, card_secret,
                      initial_balance)
    # Save the new card file.
    write_to_file(card_file, card_secret)

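# generate_random_secret is defined elsewhere; a minimal sketch using the
# standard-library secrets module. The token length is an assumption, not
# the project's actual choice.
import secrets


def generate_random_secret(nbytes=32):
    # 32 random bytes hex-encoded -> a 64-character secret string.
    return secrets.token_hex(nbytes)
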
def automate_processes():
    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                          'pcanalysisautomationscript running')
    try:
        exec(open('download_crawldata_threading.py').read(), globals())
        exec(open('unzip_gz.py').read(), globals())
        exec(open('analyze_crawldata.py').read(), globals())
    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + \
            str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        utility.write_to_file(config.ConfigManager().LogFile, 'a',
                              exception_message)

def readstagingdata():
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'Staging dataread running' + ' ' + str(datetime.datetime.now()))
    ratesConfigValues = ratesConfig.find({})
    ratesDate = ratesConfigValues[0]['stagingDateModified']
    ratesData = stagingcoll.find({'dateModified': {"$gt": ratesDate}},
                                 no_cursor_timeout=True)
    doc_id = ratesConfigValues[0]['masterDocId']
    objectid = ratesConfigValues[0]['_id']
    dateModifiedList = []
    geoCountryQuery = ("select distinct name, iso_alpha3, fips_code "
                       "from geo_country order by name")
    geoStateQuery = ("select ga1.name, gn.admin1, gn.latitude, gn.longitude "
                     "from geo_admin1 ga1 inner join geo_name gn "
                     "on ga1.geonameid = gn.geonameid")
    geoCityQuery = ("select distinct sPlaceName, fLatitude, fLongitude "
                    "from GeoPostal order by sPlaceName")
    geoZipCodeQuery = ("select distinct sPostalCode, fLatitude, fLongitude "
                       "from GeoPostal order by sPostalCode")
    countryDictList = custom.create_sql_dict_list(
        geoCountryQuery, config.ConfigManager().geographicalDataConnstr)
    stateDictList = custom.create_sql_dict_list(
        geoStateQuery, config.ConfigManager().geographicalDataConnstr)
    cityDictList = custom.create_sql_dict_list(
        geoCityQuery, config.ConfigManager().geographicalDataConnstr)
    zipCodeDictList = custom.create_sql_dict_list(
        geoZipCodeQuery, config.ConfigManager().geographicalDataConnstr)
    i = 0
    for row in ratesData:
        dateModifiedList.append(row['dateModified'])
        i += 1
        del row['_id']
        doc_id += 1
        row['doc_id'] = doc_id
        row['stagingDateModified'] = max(dateModifiedList)
        row['i'] = i
        row['objectid'] = objectid
        row['countryDictList'] = countryDictList
        row['stateDictList'] = stateDictList
        row['cityDictList'] = cityDictList
        row['zipCodeDictList'] = zipCodeDictList
        q.put(row)
    ratesData.close()
    del ratesData

def __init__(self, ip, port, auth_file):
    # All account data is stored in memory.
    self._accounts = {}
    # Fail if auth_file already exists.
    if os.path.isfile(auth_file):
        raise SystemExit(255)
    # Create the server on a TCP/IP socket.
    self._server = socket.socket()
    self._server.bind((ip, port))
    self._server.listen(3)
    # Generate a new auth token and write it to file.
    self._auth_token = generate_random_secret()
    write_to_file(auth_file, self._auth_token)
    print('created', flush=True)

def model_build_main(storage, datasetpath, featureheaders, targetheaders):
    name, clf, modelinfo = model_build(datasetpath, featureheaders,
                                       targetheaders)
    summarypath = os.path.join(storage, 'model/lightgbm.model.esimate')
    modelsavepath = os.path.join(storage, 'model/lightgbm.model')
    modelinfosavepath = os.path.join(storage, 'model/lightgbm.modelinfo')
    txt = json.dumps(modelinfo, indent=4)
    write_to_file(modelinfosavepath, txt.encode('utf-8'), mode='wb+')
    save_model(clf, modelsavepath)
    print('model summary:')
    print('save model summary ->', summarypath)
    write_to_file(summarypath, txt.encode('utf-8'), mode='wb+')

def requirement_update():
    dictionaries.DBWhereCondition1['documentType'] = 'candidate details'
    dictionaries.DBWhereCondition1['dataSource'] = 'Smart Track'
    docs = custom.retrieve_rowdata_from_DB_notimeout(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().DataCollectionDBCollection,
        dictionaries.DBWhereCondition1)
    recordnumber = 0
    for doc in docs:
        recordnumber += 1
        requirementIDList = []
        query = custom.fetch_query(
            config.ConfigManager().STCandidateSubmissionsQueryId)
        cursor = dbmanager.cursor_odbc_connection(
            config.ConfigManager().STConnStr)
        db_data_dict = dbmanager.cursor_execute(cursor, query)
        db_data = db_data_dict['dbdata']
        db_data_cursorexec = db_data_dict['cursor_exec']
        cursor_description = db_data_cursorexec.description
        column_headers = [column[0] for column in cursor_description]
        for row in db_data:
            try:
                data_dict = dict(utility.zip_list(column_headers, row))
                if data_dict['CandidateID'] == doc['candidateid']:
                    requirementIDList.append(data_dict['requirementID'])
            except BaseException as ex:
                exception_message = '\n' + 'Exception:' + \
                    str(datetime.datetime.now()) + '\n'
                exception_message += 'File: ' + '\n'
                exception_message += '\n' + str(ex) + '\n'
                exception_message += '-' * 100
                utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                      exception_message)
        dictionaries.UpdateTemplateSet['requirementIDList'] = \
            requirementIDList
        dictionaries.UpdateTemplateWhere['candidateid'] = doc['candidateid']
        dictionaries.DBSet['$set'] = dictionaries.UpdateTemplateSet
        print(recordnumber, doc['candidateid'], requirementIDList)
        custom.update_data_to_Db_noupsert(
            int(config.ConfigManager().MongoDBPort),
            config.ConfigManager().DataCollectionDB,
            config.ConfigManager().DataCollectionDBCollection,
            dictionaries.UpdateTemplateWhere, dictionaries.DBSet)

def dump_flat_result(oidpath, flatsavepath):
    write_to_file(flatsavepath,
                  ''.encode(encoding='utf_8', errors='strict'),
                  mode='wb+')
    for filename in os.listdir(oidpath):
        fullname = os.path.join(oidpath, filename)
        if filename.endswith('.json'):
            print(fullname)
            with open(fullname, encoding='utf-8') as f:
                data = json.load(f)
            v = []
            get_children(data, v)
            txt = ' '.join(v)
            txt += '\n'
            txt = filename + "\t" + str(len(v)) + "\t" + txt
            write_to_file(flatsavepath,
                          txt.encode(encoding='utf_8', errors='strict'))
            fullname = fullname.replace('.json', '')
            draw_cluster_tree(v, fullname + '.pdf')

def automate_processes():
    utility.write_to_file(config.ConfigManager().PromptcloudLogFile, 'a',
                          'PromptCloudautomationscript running')
    try:
        # Download files in XML format into PCCompData under mnt/nlpdata.
        exec(open('rates_pc_download_crawldata_threading.py').read(),
             globals())
        # Compress the PCCompData folder.
        exec(open('compress.py').read(), globals())
        # Unzip files created in the PCData folder; the time is stored in
        # dataloadconfig.
        exec(open('pc_rates_unzip_gz.py').read(), globals())
        # Load data into pcdataanalysisresults.ods.
        exec(open('pc_rates_dataload.py').read(), globals())
        # For automatically sending emails:
        # exec(open('mailsend.py').read(), globals())
        # Store the analysis file in the S3 backup:
        # exec(open('pcdataanalysisbackup.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(
            ex, config.ConfigManager().PromptcloudLogFile)

def train_file_split():
    """Split the train file into many files keyed by file id. Rows with the
    same file id are saved in one file."""
    count = 0
    with open(trainpath, 'r') as f:
        for line in f:
            count += 1
            if count == 1:  # skip the header row
                continue
            splits = line.split(',')
            if len(splits) >= 2:
                label = splits[1]
                fileid = splits[0]
                subfolder = label
                savefolder = os.path.join(STORAGE, 'train_flat', subfolder)
                os.makedirs(savefolder, exist_ok=True)
                savefile = os.path.join(savefolder, str(fileid) + '.txt')
                write_to_file(savefile, line)
            if count % 10000 == 0:
                print(count)

def remove_ngram_from_allphrasefile():
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Semantic graph Generation Step 5..! (ngramremoval.py) ' +
        str(datetime.datetime.now()))
    # Loop through the phrase file and write the n-gram-filtered phrase
    # file.
    phrase_file = open(dcrconfig.ConfigManager().PhraseFile, 'r')
    ng_phrase_file = open(dcrconfig.ConfigManager().NGramFilteredPhraseFile,
                          'w')
    for line in phrase_file:
        line = line.strip()
        if line.startswith('--'):
            # A line starting with -- marks the beginning of a job
            # description, so print a dot to indicate progress.
            print('.', end='')
            sys.stdout.flush()
            print(line, file=ng_phrase_file)
        # Lines that don't start with -- and aren't only whitespace are
        # phrases to filter.
        if not (line.startswith('--') or len(line.strip()) < 1):
            print(remove_ngram(line), file=ng_phrase_file)

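# remove_ngram is defined elsewhere; the pipeline comments say n-grams of 3
# or more words are removed. A minimal sketch of one plausible reading, in
# which a phrase line is kept only when it is short enough (the word limit
# and the empty-string fallback are assumptions, not the project's actual
# implementation):
def remove_ngram(line, max_words=2):
    return line if len(line.split()) <= max_words else ''
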
def folder_to_json(folder, fileparser, save_folder):
    stime = time.time()
    count = 0
    for subfolder in os.listdir(folder):
        fullsubfolder = os.path.join(folder, subfolder)
        save_subfolder = os.path.join(save_folder, subfolder)
        os.makedirs(save_subfolder, exist_ok=True)
        for fname in os.listdir(fullsubfolder):
            try:
                # Parse the source file and dump the parsed document as JSON
                # into the matching save subfolder.
                fullname = os.path.join(fullsubfolder, fname)
                document = fileparser(fullname)
                txt = json.dumps(document)
                savename = os.path.join(save_subfolder, fname)
                write_to_file(savename, txt, mode='a+')
                count += 1
                if count % 1000 == 0:
                    print(count)
            except Exception:
                traceback.print_exc()
                print(fullname)
    print(time.time() - stime, count)

def lda_similarity_main(dictionary, lda_save_path, corpora_path, index_path):
    questions = get_question()
    corpus = load_corpora(corpora_path)
    ldamodel = load_lda_model(lda_save_path)
    if os.path.exists(index_path):
        index_sim = similarities.MatrixSimilarity.load(index_path)
    else:
        index_sim = similarities.MatrixSimilarity(ldamodel[corpus])
        index_sim.save(index_path)
    write_to_file('../data/query.lda.txt', ''.encode('utf-8'), mode='wb+')
    for i in range(1000):
        querydoc = questions[i]
        vec_bow = dictionary.doc2bow(jieba.lcut(querydoc))
        vec_lda = ldamodel[vec_bow]
        sims = index_sim[vec_lda]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        for sim in sims[:10]:
            index = sim[0]
            distance = sim[1]
            txt = '{} {} {} {} {}\n'.format(i, querydoc, index,
                                            questions[index], distance)
            write_to_file('../data/query.lda.txt', txt.encode('utf-8'))
        write_to_file('../data/query.lda.txt', '\n'.encode('utf-8'))

def nounphrase_generate():
    dictionaries.DBWhereConditon['documentType'] = 'candidate details'
    dictionaries.DBWhereConditon['dataSource'] = 'Smart Track'
    docs = custom.retrieve_rowdata_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().DataCollectionDBCollection,
        dictionaries.DBWhereConditon)
    description = ''
    for doc in docs:
        try:
            if doc['descriptionOld'] is not None:
                print('Inside if')
                description = doc['descriptionOld'] + '. ' + \
                    doc['resumeText']
            else:
                print('Inside else')
                description = doc['resumeText']
            noun_phrases = dcrnlp.extract_nounphrases_sentences(description)
            dictionaries.UpdateTemplateSet['description'] = description
            dictionaries.UpdateTemplateSet['nounPhrases'] = noun_phrases
            dictionaries.UpdateTemplateWhere['_id'] = doc['_id']
            dictionaries.DBSet['$set'] = dictionaries.UpdateTemplateSet
            custom.update_data_to_Db(
                int(config.ConfigManager().MongoDBPort),
                config.ConfigManager().DataCollectionDB,
                config.ConfigManager().DataCollectionDBCollection,
                dictionaries.UpdateTemplateWhere, dictionaries.DBSet)
        except BaseException as ex:
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                  exception_message)

def rp_similarity_main(dictionary, tfidf_save_path, rp_save_path,
                       corpora_path, index_path):
    questions = get_question()
    corpus = load_corpora(corpora_path)
    rpmodel = load_rp_model(rp_save_path)
    tfidfmodel = load_tfidf_model(tfidf_save_path)
    if os.path.exists(index_path):
        index_sim = similarities.MatrixSimilarity.load(index_path)
    else:
        print('build matrix similarity')
        corpus_tfidf = tfidfmodel[corpus]
        index_sim = similarities.MatrixSimilarity(rpmodel[corpus_tfidf])
        index_sim.save(index_path)
    write_to_file('../data/query.rp.txt', ''.encode('utf-8'), mode='wb+')
    for i in range(1000):
        querydoc = questions[i]
        vec_bow = dictionary.doc2bow(jieba.lcut(querydoc))
        vectfidf = tfidfmodel[vec_bow]
        vec_rp = rpmodel[vectfidf]
        sims = index_sim[vec_rp]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        for sim in sims[:10]:
            index = sim[0]
            distance = sim[1]
            txt = '{} {} {} {} {}\n'.format(i, querydoc, index,
                                            questions[index], distance)
            write_to_file('../data/query.rp.txt', txt.encode('utf-8'))
        write_to_file('../data/query.rp.txt', '\n'.encode('utf-8'))

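# The loader helpers used above (load_lda_model, load_rp_model,
# load_tfidf_model) are defined elsewhere; minimal sketches assuming they
# simply wrap gensim's own model persistence API:
from gensim import models


def load_lda_model(path):
    return models.LdaModel.load(path)


def load_rp_model(path):
    return models.RpModel.load(path)


def load_tfidf_model(path):
    return models.TfidfModel.load(path)
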
def test_file_split(mode, left):
    """Split the test file into many files keyed by file id. Rows with the
    same file id are saved in one file."""
    count = 0
    with open(testpath, 'r') as f:
        for line in f:
            count += 1
            if count == 1:  # skip the header row
                continue
            splits = line.split(',')
            if len(splits) >= 2:
                fileid = splits[0]
                moderesult = int(fileid) % mode
                if moderesult == left:
                    savefolder = os.path.join(STORAGE, 'test_flat')
                    savefolder = os.path.join(savefolder, str(left))
                    os.makedirs(savefolder, exist_ok=True)
                    savefile = os.path.join(savefolder,
                                            str(fileid) + '.txt')
                    write_to_file(savefile, line, mode='a+')
            if count % 10000 == 0:
                print(count, fileid, mode, left)

def generate_nounphrase_insert_into_db(data):
    global count
    try:
        status = "{:<8}".format(str(count)) + " :"
        status += str(datetime.datetime.now())
        count += 1
        mongoport = int(config.ConfigManager().MongoDBPort)
        col = config.ConfigManager().IntelligenceDataCollection
        desc = data['desc']
        noun_phrases = dcrnlp.extract_nounphrases_sentences(desc)
        UpdateTemplateWhere = utility.clean_dict()
        UpdateTemplateSet = utility.clean_dict()
        DBSet = utility.clean_dict()
        UpdateTemplateWhere['_id'] = data['_id']
        UpdateTemplateSet['nounPhrases'] = noun_phrases
        UpdateTemplateSet['description'] = desc
        DBSet['$set'] = UpdateTemplateSet
        status += " |" + str(datetime.datetime.now())
        custom.update_data_to_Db_con(mongoport,
                                     config.ConfigManager().IntelligenceDb,
                                     col, UpdateTemplateWhere, DBSet,
                                     data['connection'])
        status += " |" + str(datetime.datetime.now())
        status += " :" + "{:<9}".format(str(data['doc_id']))
        print(status)
    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + \
            str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        utility.write_to_file(
            dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
            exception_message)

def split_tree(root, clsid):
    if type(root) is not dict:
        return
    count = root['count']
    if clsid in knowoidgroup:
        txt = json.dumps(root, indent=4)
        write_to_file(os.path.join(CHECKED_OID_SAVE_FOLDER,
                                   '{}.json'.format(clsid)),
                      txt.encode(encoding='utf_8', errors='strict'),
                      mode='wb+')
        return
    if count < OID_NODE_MAX_COUNT:
        # Dump the subtree to file once it is small enough, but skip tiny
        # clusters of fewer than 4 nodes.
        if count < 4:
            return
        txt = json.dumps(root, indent=4)
        write_to_file(os.path.join(OID_SAVE_FOLDER,
                                   '{}.json'.format(clsid)),
                      txt.encode(encoding='utf_8', errors='strict'),
                      mode='wb+')
        return
    left, right = root['children']
    oid = '{}1'.format(clsid)
    split_tree(left, oid)
    oid = '{}2'.format(clsid)
    split_tree(right, oid)

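# A hedged usage sketch: split_tree recurses from the root of the cluster
# tree, extending the cluster id with '1' (left child) or '2' (right child)
# at each split. Seeding it with the tree built in main() and the id '1' is
# an assumption:
# split_tree(clusters[maxcluster], '1')
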
def folder_to_basic_feature(folder, feature_save_path):
    featureheader = [
        'file_id', 'label', 'threadnum', 'totalapicall', 'maxapicall',
        'minapicall', 'meanapicallperthread'
    ]
    stime = time.time()
    count = 0
    write_to_file(feature_save_path, '', mode='w+')
    for subfolder in os.listdir(folder):
        fullsubfolder = os.path.join(folder, subfolder)
        for fname in os.listdir(fullsubfolder):
            fullname = os.path.join(fullsubfolder, fname)
            try:
                feature = basic_feature(fullname)
                attr = []
                for head in featureheader:
                    attr.append(str(feature.get(head, 0)))
                txt = ','.join(attr)
                write_to_file(feature_save_path, txt + '\n', mode='a+')
            except Exception:
                traceback.print_exc()

def automate_processes():
    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                          'staging automationscript running')
    try:
        # Industry data read.
        exec(open('industrydataread.py').read(), globals())
        # Currency data read.
        exec(open('currencydataread.py').read(), globals())
        # ST MSP users data read.
        exec(open('stmspdataread.py').read(), globals())
        # ST clients data read.
        exec(open('stclientsdataread.py').read(), globals())
        # Move data from staging to master.
        exec(open('stagingdataread.py').read(), globals())
    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + \
            str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        utility.write_to_file(config.ConfigManager().LogFile, 'a',
                              exception_message)