def printSuggRankedDocs(smoothness, usedTestFiles):
    """Rank suggested-Google-news test docs by smoothness and persist the ranks.

    smoothness: 2-D score matrix whose rows align with usedTestFiles; each
        row is summed to a single per-document score.
    usedTestFiles: list of document identifiers, one per row of `smoothness`.
    Returns True on success, False on any failure (failure is logged).
    """
    result = False
    # Pre-bind so the except handler can always reference it safely.
    write_directory = None
    try:
        # Collapse per-row scores into one score per test file.
        testScore = smoothness.sum(axis=1)
        testMapping = {}
        for idx in range(len(usedTestFiles)):
            testMapping[usedTestFiles[idx]] = testScore[idx]
        # Highest smoothness first.
        sorted_x = sorted(testMapping.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        todayDateFolder = util.getTodayDateFolder()
        write_directory = os.path.join(Constants.ROOT_FOLDER,
                                       Constants.RECOMMENDATION_DIR,
                                       Constants.ENGINE_DIR,
                                       todayDateFolder,
                                       Constants.SUGG_GOOGLENEWS)
        if not os.path.exists(write_directory):
            os.makedirs(write_directory)
        outfile = open(
            os.path.join(write_directory, Constants.SMOOTHNESS_FILE), 'w')
        # Persist rank positions (1 = best), not the raw scores.
        json_write = {}
        count = 1
        for (key, val) in sorted_x:
            json_write[key] = count
            count = count + 1
        json.dump(json_write, outfile)
        outfile.close()
        result = True
    except Exception:
        # FIX: was util.logger.eror (typo -> AttributeError inside the
        # handler) and Py2-only `except Exception, e` syntax.
        util.logger.error(
            "Exception at printing Smoothness GoogleSugg docs for data : %s"
            % write_directory)
    # FIX: original never returned, so callers checking the status always
    # saw None.
    return result
def printSuggRankedDocs(testMapping):
    """Write relevance ranks (1 = most relevant) for suggested-Google-news docs.

    testMapping: dict mapping document id -> relevance score.
    Returns True on success, False on any failure (failure is logged).
    """
    result = False
    # Pre-bind so the except handler can always reference it safely.
    write_directory = None
    try:
        # Highest relevance first.
        sorted_x = sorted(testMapping.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        todayDateFolder = util.getTodayDateFolder()
        write_directory = os.path.join(Constants.ROOT_FOLDER,
                                       Constants.RECOMMENDATION_DIR,
                                       Constants.ENGINE_DIR,
                                       todayDateFolder,
                                       Constants.SUGG_GOOGLENEWS)
        if not os.path.exists(write_directory):
            os.makedirs(write_directory)
        outfile = open(
            os.path.join(write_directory, Constants.RELEVANCE_FILE), 'w')
        # Persist rank positions (1 = best), not the raw scores.
        json_write = {}
        count = 1
        for (key, val) in sorted_x:
            json_write[key] = count
            count = count + 1
        json.dump(json_write, outfile)
        outfile.close()
        result = True
    except Exception:
        # FIX: Py2-only `except Exception, e` replaced with portable form.
        util.logger.error(
            "Exception at printing Relevance docs for GoogleSugg : %s"
            % write_directory)
    # FIX: original never returned, so callers checking `== True` always failed.
    return result
def RemoveBoiler():
    """Boilerplate-removal driver for google news, suggested news and training data.

    NOTE(review): this definition appears TRUNCATED in the source — it ends
    with a dangling `else:` after the lastDataBoiled lookup; the remainder of
    the body (the data-boiling loop and return) is missing from this view.
    """
    todayDate = util.getTodayDateFolder()
    # Last-completed dates for each of the three boiling stages.
    lastSuggNewsBoiled = util.loadSettings(
        Constants.LAST_BOILER_SUGGGOOGLENEWS)
    lastNewsBoiled = util.loadSettings(Constants.LAST_BOILER_GOOGLENEWS)
    lastDataBoiled = util.loadSettings(Constants.LAST_BOILER_DATA_DIR)
    if lastNewsBoiled:
        util.logger.info("Google news last boiled for =" + lastNewsBoiled)
    else:
        util.logger.info("Google news last boiled for None")
    if lastSuggNewsBoiled:
        util.logger.info("Sugg Google news last boiled for =" + lastSuggNewsBoiled)
    else:
        util.logger.info("Sugg Google news last boiled for None")
    if lastDataBoiled:
        util.logger.info("data last boiled for =" + lastDataBoiled)
    else:
        util.logger.info("data last boiled for = None")
    boilerNewsStatus = True
    boilerSuggNewsStatus = True
    boilerDataStatus = True
    #Check whether today links have been extracted or not
    if lastNewsBoiled != todayDate:
        boilerNewsStatus = BoilerNews(todayDate)
    else:
        util.logger.info("Boiler news already done for today :" + todayDate)
    #Check whether today sugg links have been extracted or not
    if lastSuggNewsBoiled != todayDate:
        boilerSuggNewsStatus = BoilerSuggNews(todayDate)
    else:
        util.logger.info("Sugg Boiler news already done for today :" + todayDate)
    trainingFolders = util.findTrainingDays()
    anyTrainingFolderBoiled = True
    trainingDoneForDays = 0
    # trainingFolders[0] is presumably the most recent day — TODO confirm.
    if lastDataBoiled == trainingFolders[0]:
        util.logger.info("Boiler data already done for today :" + lastDataBoiled)
    else:
        anyTrainingFolderBoiled = False
        folderIndex = 0
        if lastDataBoiled != None:
            try:
                # Position of the last-boiled folder within the window;
                # ValueError (folder rolled out of the window) falls through
                # to the full-window default below.
                folderIndex = trainingFolders.index(lastDataBoiled)
                anyTrainingFolderBoiled = True
            except Exception, e:
                folderIndex = Constants.MAX_PREVIOUS_DAYS
        else:
def printRecommendedDocs(recDocs, downloadDate):
    """Write the final baseline recommendation list for suggested Google news.

    recDocs: dict mapping html-file id -> combined score; sorted ascending,
        so lower scores are recommended first.
    downloadDate: date folder whose links JSON is read and augmented.
    Returns True on success, False on failure (failure is logged).
    """
    jsonData = readLinksJson(downloadDate)
    if jsonData is None:
        return False
    result = False
    # Pre-bind so the except handler can always reference it safely.
    write_directory = None
    jsonData['suggestGoogle'][Constants.BASELINE] = []
    # Index successfully-downloaded link objects by their html-file id.
    recommInfo = {}
    googleLinks = jsonData['suggestGoogle'][Constants.GOOGLE]
    for linkObj in googleLinks:
        download = linkObj['download']
        htmlFile = linkObj['id']
        if download == "yes":
            recommInfo[htmlFile] = linkObj
    try:
        sorted_x = sorted(recDocs.items(), key=operator.itemgetter(1))
        todayDateFolder = util.getTodayDateFolder()
        write_directory = os.path.join(Constants.ROOT_FOLDER,
                                       Constants.FINAL_DIR, todayDateFolder)
        if not os.path.exists(write_directory):
            os.makedirs(write_directory)
        outfile = open(
            os.path.join(write_directory, Constants.ULTIMATE_FILE), 'w')
        feedback_directory = os.path.join(Constants.ROOT_FOLDER,
                                          Constants.FEEDBACK_DIR,
                                          todayDateFolder)
        if not os.path.exists(feedback_directory):
            os.makedirs(feedback_directory)
        feedbackfile = open(
            os.path.join(feedback_directory, Constants.ULTIMATE_FILE), 'w')
        count = 1
        for (key, val) in sorted_x:
            if key in recommInfo:
                linkObj = recommInfo[key]
                # rank is reset to -1 for every baseline entry.
                linkObj['rank'] = -1
                jsonData['suggestGoogle'][Constants.BASELINE].append(linkObj)
                count = count + 1
                # NOTE(review): count starts at 1 and is tested after the
                # increment, so only RECOMMENDED_LINKS - 1 docs are appended;
                # confirm that is intended before changing.
                if count >= Constants.RECOMMENDED_LINKS:
                    break
            else:
                util.logger.error(
                    "Key not found in baseline suggestgoogle news = " + key)
        json.dump(jsonData, outfile)
        outfile.close()
        json.dump(jsonData, feedbackfile)
        feedbackfile.close()
        result = True
    except Exception:
        # FIX: Py2-only `except Exception, e` replaced with portable form.
        util.logger.error(
            "Exception at writing final GoogleSugg Recommendation docs for data : %s"
            % write_directory)
    # FIX: original never returned its success flag.
    return result
def RecommendationMetric():
    """Merge today's relevance, smoothness and clarity rank files and emit the
    final recommended docs for suggested Google news.

    Returns True when recommendations were already produced today or when
    writing succeeded; False on failure (failure is logged with traceback).
    """
    todayDateFolder = util.getTodayDateFolder()
    lastRecommended = util.loadSettings(Constants.LAST_RECOMMENDATION_DONE)
    if todayDateFolder == lastRecommended:
        return True
    result = False
    try:
        def _loadRanks(fileName):
            # Rank files are optional: a missing signal contributes {}.
            path = os.path.join(Constants.ROOT_FOLDER,
                                Constants.RECOMMENDATION_DIR,
                                Constants.ENGINE_DIR,
                                todayDateFolder, fileName)
            if os.path.isfile(path):
                with open(path) as json_data:
                    return json.loads(json_data.read())
            return {}

        relevance_json = _loadRanks(Constants.RELEVANCE_FILE)
        smoothness_json = _loadRanks(Constants.SMOOTHNESS_FILE)
        clarity_json = _loadRanks(Constants.CLARITY_FILE)
        # No linear weighting for now: Counter.update sums ranks key-wise.
        cou = Counter()
        cou.update(relevance_json)
        cou.update(smoothness_json)
        cou.update(clarity_json)
        final_json = dict(cou)
        result = printRecommendedDocs(final_json, todayDateFolder)
        if result == True:
            util.saveSettings(Constants.LAST_RECOMMENDATION_DONE,
                              todayDateFolder)
            util.logger.info("Recommended links done for =" + todayDateFolder)
    except Exception:
        # FIX: traceback.print_exc was passed UNCALLED, logging the function
        # object instead of the traceback; format_exc() yields the text.
        # Also replaced Py2-only `except Exception, e`.
        util.logger.error(
            "Exception at recommending links for : %s Exception = %s"
            % (todayDateFolder, traceback.format_exc()))
    return result
def RecommendationMetric():
    """Merge today's relevance, smoothness and clarity rank files for GOOGLE
    news and emit the final recommended docs.

    Returns True when recommendations were already produced today or when
    writing succeeded; False on failure (failure is logged with traceback).
    """
    todayDateFolder = util.getTodayDateFolder()
    lastRecommended = util.loadSettings(Constants.LAST_RECOMMENDATION_DONE)
    if todayDateFolder == lastRecommended:
        return True
    result = False
    try:
        def _loadRanks(fileName):
            # Rank files are optional: a missing signal contributes {}.
            path = os.path.join(Constants.ROOT_FOLDER,
                                Constants.RECOMMENDATION_DIR,
                                Constants.ENGINE_DIR,
                                todayDateFolder,
                                Constants.GOOGLENEWS, fileName)
            if os.path.isfile(path):
                with open(path) as json_data:
                    return json.loads(json_data.read())
            return {}

        relevance_json = _loadRanks(Constants.RELEVANCE_FILE)
        smoothness_json = _loadRanks(Constants.SMOOTHNESS_FILE)
        clarity_json = _loadRanks(Constants.CLARITY_FILE)
        # No linear weighting for now: Counter.update sums ranks key-wise.
        cou = Counter()
        cou.update(relevance_json)
        cou.update(smoothness_json)
        cou.update(clarity_json)
        final_json = dict(cou)
        result = printRecommendedDocs(final_json, todayDateFolder)
        if result == True:
            util.saveSettings(Constants.LAST_RECOMMENDATION_DONE,
                              todayDateFolder)
            util.logger.info("Recommended Google links done for ="
                             + todayDateFolder)
    except Exception:
        # FIX: traceback.print_exc was passed UNCALLED, logging the function
        # object instead of the traceback; format_exc() yields the text.
        # Also replaced Py2-only `except Exception, e`.
        util.logger.error(
            "Exception at recommending google links for : %s Exception = %s"
            % (todayDateFolder, traceback.format_exc()))
    return result
def ConnectionClarity():
    """Compute and persist today's clarity signal.

    Skips work when the signal was already produced today; otherwise builds
    train/test corpora, scores clarity, writes the ranked docs, and records
    completion. Returns the write status (True when done).
    """
    today = util.getTodayDateFolder()
    # Idempotence guard: one clarity run per day.
    if util.loadSettings(Constants.LAST_CLARITY_DIR) == today:
        util.logger.info("Clarity signal done for today =" + today)
        return True
    trainCorpus, usedTrainFiles = util.findCorpus(util.findTrainingFiles())
    testCorpus, usedTestFiles = util.findCorpus(util.findTestFiles())
    score = Clarity(trainCorpus, testCorpus).ClarityScore()
    ret = printRankedDocs(score, usedTestFiles)
    if ret == True:
        util.saveSettings(Constants.LAST_CLARITY_DIR, today)
        util.logger.info("Clarity info just completed for =" + today)
    return ret
def printRankedDocs(smoothness, usedTestFiles):
    """Rank test docs by row-summed smoothness score and persist the ranks.

    smoothness: 2-D score matrix whose rows align with usedTestFiles.
    usedTestFiles: list of document identifiers, one per row.
    Returns True on success, False on failure (failure is logged).
    """
    # FIX: original had no error handling and no return value, unlike the
    # sibling printRankedDocs variants whose callers check `ret == True`.
    result = False
    write_directory = None
    try:
        testScore = smoothness.sum(axis=1)
        testMapping = {}
        for idx in range(len(usedTestFiles)):
            testMapping[usedTestFiles[idx]] = testScore[idx]
        # Highest smoothness first.
        sorted_x = sorted(testMapping.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        todayDateFolder = util.getTodayDateFolder()
        write_directory = os.path.join(Constants.ROOT_FOLDER,
                                       Constants.RECOMMENDATION_DIR,
                                       Constants.ENGINE_DIR,
                                       todayDateFolder)
        if not os.path.exists(write_directory):
            os.makedirs(write_directory)
        outfile = open(
            os.path.join(write_directory, Constants.SMOOTHNESS_FILE), 'w')
        # Persist rank positions (1 = best), not the raw scores.
        json_write = {}
        count = 1
        for (key, val) in sorted_x:
            json_write[key] = count
            count = count + 1
        json.dump(json_write, outfile)
        outfile.close()
        result = True
    except Exception:
        util.logger.error(
            "Exception at printing Smoothness docs for data : %s"
            % write_directory)
    return result
def printRankedDocs(testMapping):
    """Write relevance ranks (1 = most relevant) for the scored test docs.

    testMapping: dict mapping document id -> relevance score.
    Returns True on success, False on failure (failure is logged).
    """
    result = False
    # Pre-bind so the except handler can always reference it safely.
    write_directory = None
    try:
        sorted_x = sorted(testMapping.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        todayDateFolder = util.getTodayDateFolder()
        write_directory = os.path.join(Constants.ROOT_FOLDER,
                                       Constants.RECOMMENDATION_DIR,
                                       Constants.ENGINE_DIR,
                                       todayDateFolder)
        if not os.path.exists(write_directory):
            os.makedirs(write_directory)
        outfile = open(
            os.path.join(write_directory, Constants.RELEVANCE_FILE), 'w')
        # Persist rank positions (1 = best), not the raw scores.
        json_write = {}
        count = 1
        for (key, val) in sorted_x:
            json_write[key] = count
            count = count + 1
        json.dump(json_write, outfile)
        outfile.close()
        result = True
    except Exception:
        # FIX: Py2-only `except Exception, e` replaced with portable form.
        util.logger.error(
            "Exception at printing Relevance docs for data : %s"
            % write_directory)
    # FIX: original never returned, but callers check `ret == True`.
    return result
def GoogleNews():
    """Fetch today's Google links and news unless already downloaded.

    Both steps are idempotent per date folder; returns True only when both
    the links step and the news step succeeded (or were already done).
    """
    downloadedLinks = []
    todayDate = util.getTodayDateFolder()
    lastNewsDownloaded = util.loadSettings(Constants.LAST_GOOGLENEWS_DOWNLOAD)
    lastLinksDownloaded = util.loadSettings(Constants.LAST_GOOGLELINKS_DOWNLOAD)
    linksOk = True
    newsOk = True
    # Links step: only fetch when today's batch isn't recorded yet.
    if todayDate != lastLinksDownloaded:
        linksOk = getGoogleLinks(todayDate)
    else:
        util.logger.info("Google links downloaded successfully for = " + todayDate)
    # News step: same idempotence guard.
    if lastNewsDownloaded != todayDate:
        newsOk = downloadGoogleNews(todayDate)
    else:
        util.logger.info("Google news already downloaded successfully for = " + todayDate)
    return linksOk & newsOk
def Relevance():
    """Compute today's relevance signal: TF-IDF similarity of each test doc
    to the training corpus.

    Builds a gensim dictionary/TF-IDF index over the training texts (with
    hapax tokens removed), scores every test doc as the sum of its
    similarities to all training docs, and writes the ranked result.
    Returns the write status (True when done or already done today).
    """
    todayDate = util.getTodayDateFolder()
    lastRelevanceDate = util.loadSettings(Constants.LAST_RELEVANCE_DIR)
    if todayDate == lastRelevanceDate:
        util.logger.info("Relevance signal already done for today :" + todayDate)
        return True
    trainFiles = util.findTrainingFiles()
    testFiles = util.findTestFiles()
    trainCorpus, usedTrainFiles = util.findCorpus(trainFiles)
    testCorpus, usedTestFiles = util.findCorpus(testFiles)
    all_tokens = sum(trainCorpus, [])
    # FIX: was `all_tokens.count(word)` for every unique word -> O(n^2);
    # a single counting pass is equivalent and linear.
    freq = {}
    for word in all_tokens:
        freq[word] = freq.get(word, 0) + 1
    tokens_once = set(word for word in freq if freq[word] == 1)
    # Drop words that occur exactly once before building the dictionary.
    texts = [[word for word in text if word not in tokens_once]
             for text in trainCorpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus=corpus, id2word=dictionary,
                              normalize=True)
    index = similarities.SparseMatrixSimilarity(
        tfidf[corpus], num_features=len(dictionary))
    # Score each test document against the whole training index.
    testJson = {}
    for pos, text in enumerate(testCorpus):
        vec = dictionary.doc2bow(text)
        sims = index[tfidf[vec]]
        testJson[usedTestFiles[pos]] = sum(sims)
    ret = printRankedDocs(testJson)
    if ret == True:
        util.saveSettings(Constants.LAST_RELEVANCE_DIR, todayDate)
        util.logger.info("Relevance info just completed for =" + todayDate)
    return ret
def printRankedDocs(clarityScore, usedTestFiles):
    """Rank test docs by row-summed clarity score and persist the ranks.

    clarityScore: 2-D score matrix whose rows align with usedTestFiles.
    usedTestFiles: list of document identifiers, one per row.
    Returns True on success, False on failure (failure is logged).
    """
    result = False
    # Pre-bind so the except handler can always reference it safely.
    write_directory = None
    try:
        testScore = clarityScore.sum(axis=1)
        testMapping = {}
        for idx in range(len(usedTestFiles)):
            testMapping[usedTestFiles[idx]] = testScore[idx]
        # Highest clarity first.
        sorted_x = sorted(testMapping.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        todayDateFolder = util.getTodayDateFolder()
        write_directory = os.path.join(Constants.ROOT_FOLDER,
                                       Constants.RECOMMENDATION_DIR,
                                       Constants.ENGINE_DIR,
                                       todayDateFolder)
        if not os.path.exists(write_directory):
            os.makedirs(write_directory)
        outfile = open(
            os.path.join(write_directory, Constants.CLARITY_FILE), 'w')
        # Persist rank positions (1 = best), not the raw scores.
        json_write = {}
        count = 1
        for (key, val) in sorted_x:
            json_write[key] = count
            count = count + 1
        json.dump(json_write, outfile)
        outfile.close()
        result = True
    except Exception:
        # FIX: Py2-only `except Exception, e` replaced with portable form.
        util.logger.error(
            "Exception at printing Clarity docs for data : %s"
            % write_directory)
    # FIX: original never returned, but callers check `ret == True`.
    return result
def RemoveBoiler():
    """Run boilerplate removal over the recent training-data window.

    Determines how many of the last training days still need boiling (based
    on the last-boiled marker) and boils each missing day, newest-last.
    Returns True when at least one training folder is boiled (news boiling is
    currently disabled and always reports success).
    """
    todayDate = util.getTodayDateFolder()
    lastNewsBoiled = util.loadSettings(Constants.LAST_BOILER_GOOGLENEWS)
    lastDataBoiled = util.loadSettings(Constants.LAST_BOILER_DATA_DIR)
    boilerNewsStatus = True
    boilerDataStatus = True
    # News boiling is intentionally disabled here; the guarded call used to be:
    #   if lastNewsBoiled != todayDate: boilerNewsStatus = BoilerNews(todayDate)
    trainingFolders = util.findTrainingDays()
    anyTrainingFolderBoiled = True
    # trainingFolders[0] is presumably the most recent day — TODO confirm.
    if lastDataBoiled == trainingFolders[0]:
        util.logger.info("Boiler data already done for today :" + lastDataBoiled)
    else:
        anyTrainingFolderBoiled = False
        folderIndex = 0
        if lastDataBoiled != None:
            # FIX: list.index raises ValueError when the saved folder has
            # rolled out of the training window; fall back to the full window
            # (matches the sibling RemoveBoiler variant) instead of crashing.
            try:
                folderIndex = trainingFolders.index(lastDataBoiled)
            except ValueError:
                folderIndex = Constants.TRAINING_DAY
        else:
            folderIndex = Constants.TRAINING_DAY
        if folderIndex < 0:
            folderIndex = 0
            util.logger.info(
                "Boiler data for none of the last %d days have been downloaded"
                % Constants.TRAINING_DAY)
        # Boil every missing day strictly older than the last-boiled one,
        # working towards the most recent folder (index 0).
        for folder in range(folderIndex - 1, -1, -1):
            boilerDataStatus = BoilerData(trainingFolders[folder])
            if boilerDataStatus == False:
                util.logger.error("Boiler data not done for today :"
                                  + trainingFolders[folder])
            else:
                anyTrainingFolderBoiled = True
    return boilerNewsStatus & anyTrainingFolderBoiled
def printRecommendedDocs(recDocs, downloadDate):
    """Write the top recommended links ('algo1') for the given download date.

    recDocs: dict mapping html-file id -> score; sorted ascending so lower
        scores are recommended first.
    downloadDate: date folder whose links JSON supplies url/content/title.
    Returns True on success, False on failure (failure is logged).
    """
    jsonData = readLinksJson(downloadDate)
    if jsonData is None:
        return False
    result = False
    # Pre-bind so the except handler can always reference it safely.
    write_directory = None
    final_json = {}
    algo1List = []
    try:
        sorted_x = sorted(recDocs.items(), key=operator.itemgetter(1))
        todayDateFolder = util.getTodayDateFolder()
        write_directory = os.path.join(Constants.ROOT_FOLDER,
                                       Constants.FINAL_DIR, todayDateFolder)
        if not os.path.exists(write_directory):
            os.makedirs(write_directory)
        outfile = open(
            os.path.join(write_directory, Constants.ULTIMATE_FILE), 'w')
        count = 0
        for (key, val) in sorted_x:
            # Skip scored docs whose metadata is missing from the links JSON.
            if key not in jsonData:
                continue
            linkObj = {}
            linkObj['url'] = jsonData[key]['url']
            linkObj['content'] = jsonData[key]['content']
            linkObj['title'] = jsonData[key]['title']
            linkObj['id'] = key
            linkObj['use'] = 'NA'
            algo1List.append(linkObj)
            count = count + 1
            if count >= Constants.RECOMMENDED_LINKS:
                break
        final_json['algo1'] = algo1List
        json.dump(final_json, outfile)
        outfile.close()
        result = True
    except Exception:
        # FIX: Py2-only `except Exception, e` replaced with portable form.
        util.logger.error(
            "Exception at writing final Recommendation docs for data : %s"
            % write_directory)
    # FIX: original never returned its success flag.
    return result
def GoogleNews():
    """Idempotent daily driver: pull Google links, then Google news, for
    today's date folder.

    Each step runs only if its last-download stamp differs from today.
    Returns True iff both steps are complete for today.
    """
    downloadedLinks = []
    today = util.getTodayDateFolder()
    newsStamp = util.loadSettings(Constants.LAST_GOOGLENEWS_DOWNLOAD)
    linksStamp = util.loadSettings(Constants.LAST_GOOGLELINKS_DOWNLOAD)
    statusLinks = True
    statusNews = True
    # Step 1: link extraction.
    if linksStamp == today:
        util.logger.info("Google links downloaded successfully for = " + today)
    else:
        statusLinks = getGoogleLinks(today)
    # Step 2: news download.
    if newsStamp == today:
        util.logger.info("Google news already downloaded successfully for = " + today)
    else:
        statusNews = downloadGoogleNews(today)
    return statusLinks & statusNews