import json
import operator
import os

import networkx as nx
import numpy as np
from numpy import zeros

from gensim import corpora, models, similarities

# Project-local dependencies; the exact import paths below are assumptions.
import Constants
import graphConstants
import graphSeedNodes
import graphUtils
from Bm25 import Bm25

# Helpers such as get_init_R, normalize_edge_Weights, personalizedPageRank,
# writeNewR, readLinksJson, readFromFile and NERFunc are expected to be
# defined elsewhere in this module / project.


def PPR():
    todayDate = graphUtils.getTodayDateFolder()
    lastRecommendationDate = graphUtils.loadSettings(graphConstants.LAST_GRAPH_RECOMM_DONE)
    #lastRecommendationDate = None
    if todayDate == lastRecommendationDate:
        graphUtils.logger.info("Simple Graph recommendation PPR done for today")
        return
    graphUtils.logger.info("Simple graph recommendation PPR last done for =" + str(lastRecommendationDate))
    # Get the current version of the stored main graph
    graph_path = os.path.join(graphConstants.ROOT_FOLDER, graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR, graphConstants.TYPE_MAIN)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    G = nx.read_gexf(graph_file)
    list_nodes = {x: i for i, x in enumerate(G.nodes())}
    R = get_init_R(G, list_nodes)
    # Normalize edge transition weights
    M = normalize_edge_Weights(list_nodes, G)
    S, list_seednode_names = graphSeedNodes.findSeedNodes(G, list_nodes)
    for idx, node in enumerate(list_seednode_names):
        graphUtils.logger.info(str(idx) + " seed node for simple graph today = " + node)
    newR = personalizedPageRank(R, M, S)
    printGraphRecommendedDocs(G, list_nodes, newR)
    writeNewR(G, list_nodes, newR, graph_file)
    graphUtils.saveSettings(graphConstants.LAST_GRAPH_RECOMM_DONE, todayDate)
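
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): PPR() above delegates
# to personalizedPageRank(R, M, S), which is defined elsewhere in this project.
# Assuming R is the initial rank vector from get_init_R(), M a row-stochastic
# transition matrix from normalize_edge_Weights() (M[i][j] = probability of
# stepping from node i to node j), and S the seed/restart distribution from
# graphSeedNodes.findSeedNodes(), a standard power-iteration version of
# personalized PageRank could look like this hypothetical helper.
def personalizedPageRankSketch(R, M, S, alpha=0.85, max_iter=100, tol=1e-6):
    R = np.asarray(R, dtype=float)
    S = np.asarray(S, dtype=float)
    M = np.asarray(M, dtype=float)
    for _ in range(max_iter):
        # Follow graph edges with probability alpha; restart at a seed node otherwise
        newR = alpha * M.T.dot(R) + (1.0 - alpha) * S
        if np.abs(newR - R).sum() < tol:  # L1 convergence check
            return newR
        R = newR
    return R
# ---------------------------------------------------------------------------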
def printGraphRecommendedDocs(G, list_nodes, R):
    todayDateFolder = graphUtils.getTodayDateFolder()
    jsonData = readLinksJson(todayDateFolder)
    if jsonData is None:
        return False
    result = False
    jsonData['GoogleNews'][Constants.NERGRAPH] = []
    recommInfo = {}
    graphDocs = {}
    googleLinks = jsonData['GoogleNews'][Constants.GOOGLE]
    for linkObj in googleLinks:
        download = linkObj['download']
        htmlFile = graphConstants.TYPE_GOOGLE + "_" + linkObj['id'] + "_" + todayDateFolder
        if download == "yes" and htmlFile in list_nodes:
            recommInfo[htmlFile] = linkObj
            htmlFile_idx = list_nodes[htmlFile]
            graphDocs[htmlFile] = R[htmlFile_idx]
    write_directory = os.path.join(graphConstants.ROOT_FOLDER, graphConstants.FINAL_DIR, todayDateFolder)
    try:
        # Rank candidate documents by their PPR score, highest first
        sorted_x = sorted(graphDocs.items(), key=operator.itemgetter(1), reverse=True)
        if not os.path.exists(write_directory):
            os.makedirs(write_directory)
        outfile = open(os.path.join(write_directory, graphConstants.ULTIMATE_FILE), 'w')
        count = 1
        for (key, val) in sorted_x:
            if key in recommInfo:
                linkObj = recommInfo[key]
                linkObj['rank'] = -1
                jsonData['GoogleNews'][Constants.NERGRAPH].append(linkObj)
                count = count + 1
                if count >= graphConstants.RECOMMENDED_LINKS:
                    break
            else:
                graphUtils.logger.error("NER Graph normalGoogle key not found = " + key)
        json.dump(jsonData, outfile)
        outfile.close()
        result = True
    except Exception as e:
        graphUtils.logger.error("Exception = %s" % e)
        graphUtils.logger.error("Exception at writing final Graph Recommendation docs for data : %s" % write_directory)
    return result
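
# ---------------------------------------------------------------------------
# Illustrative note (assumption, inferred from the key accesses above, not
# taken from the original sources): printGraphRecommendedDocs() expects the
# per-day links JSON returned by readLinksJson() to look roughly like the
# structure below, where each link object carries at least the 'id',
# 'download' and 'rank' fields used above.
#
#   {
#       "GoogleNews": {
#           Constants.GOOGLE:   [{"id": "3", "download": "yes", "rank": 3, ...}, ...],
#           Constants.NERGRAPH: []   # filled in by printGraphRecommendedDocs()
#       }
#   }
# ---------------------------------------------------------------------------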
def Relevance():
    todayDate = graphUtils.getTodayDateFolder()
    lastRelevanceDate = graphUtils.loadSettings(graphConstants.LAST_GRAPH_RELEVANCE_DIR)
    lastSuggRelevanceDate = graphUtils.loadSettings(graphConstants.LAST_GRAPH_SUGG_RELEVANCE_DIR)
    if lastRelevanceDate:
        graphUtils.logger.info("Graph Relevance done last for =" + lastRelevanceDate)
    else:
        graphUtils.logger.info("Graph Relevance done last for None")
    if lastSuggRelevanceDate:
        graphUtils.logger.info("GraphSugg Relevance done last for =" + lastSuggRelevanceDate)
    else:
        graphUtils.logger.info("GraphSugg Relevance done last for None")
    if todayDate == lastRelevanceDate and todayDate == lastSuggRelevanceDate:
        graphUtils.logger.info("Graph Relevance signal already done for today :" + todayDate)
        return True
    graph_path = os.path.join(graphConstants.ROOT_FOLDER, graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR, graphConstants.TYPE_MAIN)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    write_graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    if not os.path.exists(graph_path):
        os.makedirs(graph_path)
    G = nx.read_gexf(graph_file)
    trainFiles, trainFileNames = graphUtils.findRecommTrainGraphFiles()
    trainCorpus = graphUtils.findCorpus(trainFiles)
    # Drop words that occur only once across the whole training corpus
    all_tokens = sum(trainCorpus, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once] for text in trainCorpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus=corpus, id2word=dictionary, normalize=True)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    if todayDate != lastRelevanceDate:
        testFiles, testFileName = graphUtils.findRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        mini = 100
        maxi = -1
        count = 0
        for idx, text in enumerate(testCorpus):
            # Add this recommendation node
            recomm_nodename = testFileName[idx]
            if recomm_nodename not in G.nodes():
                G.add_node(recomm_nodename)
                G.node[recomm_nodename]['type'] = graphConstants.TYPE_GOOGLE
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            for idxsim, prob in enumerate(sims):
                if prob < 0.1:
                    continue
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    G.add_edge(recomm_nodename, trainNode, weight=prob)
                    G.add_edge(trainNode, recomm_nodename, weight=prob)
                    count = count + 1
            text = readFromFile(testFiles[idx])
            #NERFunc(text, G, recomm_nodename)
        graphUtils.logger.info("Simple graph relevance completed for today. Stats follow")
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Relevance count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPH_RELEVANCE_DIR, todayDate)
    if todayDate != lastSuggRelevanceDate:
        testFiles, testFileName = graphUtils.findSuggRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        mini = 100
        maxi = -1
        count = 0
        for idx, text in enumerate(testCorpus):
            # Add this recommendation node
            recomm_nodename = testFileName[idx]
            if recomm_nodename not in G.nodes():
                G.add_node(recomm_nodename)
                G.node[recomm_nodename]['type'] = graphConstants.TYPE_SUGG
            vec = dictionary.doc2bow(text)
            sims = index[tfidf[vec]]
            for idxsim, prob in enumerate(sims):
                if prob < 0.1:
                    continue
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    G.add_edge(recomm_nodename, trainNode, weight=prob)
                    G.add_edge(trainNode, recomm_nodename, weight=prob)
                    count = count + 1
            text = readFromFile(testFiles[idx])
            #NERFunc(text, G, recomm_nodename)
        graphUtils.logger.info("Simple graph relevance completed for suggestGoogle today. Stats follow")
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Relevance count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPH_SUGG_RELEVANCE_DIR, todayDate)
def Smoothness():
    todayDate = graphUtils.getTodayDateFolder()
    lastSmoothnessDate = graphUtils.loadSettings(graphConstants.LAST_GRAPHNER_SMOOTHNESS_DIR)
    lastSuggSmoothnessDate = graphUtils.loadSettings(graphConstants.LAST_GRAPHNER_SUGG_SMOOTHNESS_DIR)
    if lastSmoothnessDate:
        graphUtils.logger.info("NERGraph Smoothness done last for =" + lastSmoothnessDate)
    else:
        graphUtils.logger.info("NERGraph Smoothness done last for None")
    if lastSuggSmoothnessDate:
        graphUtils.logger.info("NERGraphSugg Smoothness done last for =" + lastSuggSmoothnessDate)
    else:
        graphUtils.logger.info("NERGraphSugg Smoothness done last for None")
    if todayDate == lastSmoothnessDate and todayDate == lastSuggSmoothnessDate:
        graphUtils.logger.info("NERGraph Smoothness signal already done for today :" + todayDate)
        return True
    graph_path = os.path.join(graphConstants.ROOT_FOLDER, graphConstants.GRAPH_DIR,
                              graphConstants.GRAPH_DIR, graphConstants.TYPE_NER)
    graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    write_graph_file = os.path.join(graph_path, graphConstants.GRAPH_FILE)
    if not os.path.exists(graph_path):
        os.makedirs(graph_path)
    G = nx.read_gexf(graph_file)
    trainFiles, trainFileNames = graphUtils.findRecommTrainGraphNerFiles()
    trainCorpus = graphUtils.findCorpus(trainFiles)
    bm25obj = Bm25(trainCorpus)
    trainUniqueWords = []
    for trainText in trainCorpus:
        trainUniqueWords.append(set(trainText))
    if todayDate != lastSmoothnessDate:
        testFiles, testFileName = graphUtils.findRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        testUniqueWords = []
        mini = 100
        maxi = -1
        count = 0
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testText in testCorpus:
            testUniqueWords.append(set(testText))
        for testDoc in range(len(testCorpus)):
            recomm_nodename = testFileName[testDoc]
            uniqueTest = testUniqueWords[testDoc]
            SminusDcontext = zeros(bm25obj.N)
            DminusScontext = zeros(bm25obj.N)
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                SminusD = [word for word in trainCorpus[trainDoc] if word not in uniqueTest]
                DminusS = [word for word in testCorpus[testDoc] if word not in uniqueTrain]
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext, DminusScontext)
            # Keep only the strongest smoothness links for this test document
            dict_arr = {key: value for (key, value) in enumerate(smoothness[testDoc])}
            sorted_x = sorted(dict_arr.items(), key=operator.itemgetter(1), reverse=True)
            sorted_x = sorted_x[:graphConstants.MAX_SMOOTHNESS_EDGE]
            total = sum([pair[1] for pair in sorted_x])
            for (idxsim, val) in sorted_x:
                prob = val / total
                if recomm_nodename not in G.nodes():
                    G.add_node(recomm_nodename)
                    G.node[recomm_nodename]['type'] = graphConstants.TYPE_GOOGLE
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    if not G.has_edge(recomm_nodename, trainNode):
                        G.add_edge(recomm_nodename, trainNode,
                                   weight=prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[recomm_nodename][trainNode]['weight'] += prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT
                    if not G.has_edge(trainNode, recomm_nodename):
                        G.add_edge(trainNode, recomm_nodename,
                                   weight=prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[trainNode][recomm_nodename]['weight'] += prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT
                    count = count + 1
            #print smoothness[testDoc]
        graphUtils.logger.info("ner graph Smoothness completed for normalGoogle today. Stats follow")
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Smoothness edges count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPHNER_SMOOTHNESS_DIR, todayDate)
    if todayDate != lastSuggSmoothnessDate:
        testFiles, testFileName = graphUtils.findSuggRecommFiles()
        testCorpus = graphUtils.findCorpus(testFiles)
        testUniqueWords = []
        mini = 100
        maxi = -1
        count = 0
        smoothness = zeros((len(testCorpus), len(trainCorpus)))
        for testText in testCorpus:
            testUniqueWords.append(set(testText))
        for testDoc in range(len(testCorpus)):
            recomm_nodename = testFileName[testDoc]
            uniqueTest = testUniqueWords[testDoc]
            SminusDcontext = zeros(bm25obj.N)
            DminusScontext = zeros(bm25obj.N)
            for trainDoc in range(len(trainCorpus)):
                uniqueTrain = trainUniqueWords[trainDoc]
                SminusD = [word for word in trainCorpus[trainDoc] if word not in uniqueTest]
                DminusS = [word for word in testCorpus[testDoc] if word not in uniqueTrain]
                SminusDcontext = bm25obj.BM25Score(SminusD)
                DminusScontext = bm25obj.BM25Score(DminusS)
                smoothness[testDoc][trainDoc] = np.dot(SminusDcontext, DminusScontext)
            # Keep only the strongest smoothness links for this test document
            dict_arr = {key: value for (key, value) in enumerate(smoothness[testDoc])}
            sorted_x = sorted(dict_arr.items(), key=operator.itemgetter(1), reverse=True)
            sorted_x = sorted_x[:graphConstants.MAX_SMOOTHNESS_EDGE]
            total = sum([pair[1] for pair in sorted_x])
            for (idxsim, val) in sorted_x:
                prob = val / total
                if recomm_nodename not in G.nodes():
                    G.add_node(recomm_nodename)
                    G.node[recomm_nodename]['type'] = graphConstants.TYPE_SUGG
                trainNode = trainFileNames[idxsim]
                if trainNode in G.nodes():
                    if prob < mini:
                        mini = prob
                    if prob > maxi:
                        maxi = prob
                    if not G.has_edge(recomm_nodename, trainNode):
                        G.add_edge(recomm_nodename, trainNode,
                                   weight=prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[recomm_nodename][trainNode]['weight'] += prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT
                    if not G.has_edge(trainNode, recomm_nodename):
                        G.add_edge(trainNode, recomm_nodename,
                                   weight=prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT)
                    else:
                        G[trainNode][recomm_nodename]['weight'] += prob * graphConstants.SMOOTHNESS_EDGE_WEIGHT
                    count = count + 1
            #print smoothness[testDoc]
        graphUtils.logger.info("ner graph Smoothness completed for suggestGoogle today. Stats follow")
        graphUtils.logger.info("mini =" + str(mini))
        graphUtils.logger.info("maxi =" + str(maxi))
        graphUtils.logger.info("Smoothness edges count =" + str(count))
        nx.write_gexf(G, write_graph_file)
        graphUtils.saveSettings(graphConstants.LAST_GRAPHNER_SUGG_SMOOTHNESS_DIR, todayDate)
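
# ---------------------------------------------------------------------------
# Illustrative sketch (assumption, not part of the original module): the three
# daily passes above are presumably run in dependency order, adding relevance
# and smoothness edges to the stored graphs before the PPR ranking pass writes
# out the recommended documents. A minimal driver might look like this:
if __name__ == '__main__':
    Relevance()
    Smoothness()
    PPR()
# ---------------------------------------------------------------------------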