def test():
    G = NX.DiGraph()
    edges = [(1, 2), (1, 3), (3, 1), (3, 2), (3, 5),
             (4, 5), (4, 6), (5, 4), (5, 6), (6, 4)]
    G.add_edges_from(edges)
    M = page_rank.google_matrix(G, alpha=0.9)
    # The dominant left eigenvector of the Google matrix is the exact
    # PageRank vector; compare it against the iterative implementations.
    e, ev = numpy.linalg.eig(M.T)
    p = numpy.array(ev[:, 0] / ev[:, 0].sum())[:, 0]
    print "exact   ", p
    pr = page_rank.page_rank(G, alpha=0.9, tol=1.0e-8)
    print "networkx", pr.values()
    np = page_rank.page_rank_numpy(G, alpha=0.9)
    print "numpy   ", np
    try:
        ns = page_rank.page_rank_scipy(G, alpha=0.9)
        print "scipy   ", ns
    except Exception:  # `Error` was undefined; catch any failure from scipy
        print "scipy not working"
def pagerank_calc(self):
    ranklist = page_rank.page_rank(self._from_to_link, 20, 1)
    con = lite.connect("dbFile.db")
    cur = con.cursor()
    for i in ranklist:
        cur.execute('INSERT OR REPLACE INTO PageRank VALUES(?,?)',
                    [int(i), ranklist[i]])
    con.commit()
    con.close()  # `con.close` without parentheses never closed the connection
def question_1():
    b = 0.7
    M = numpy.matrix([[0,   1, 0, 0],
                      [1/2, 0, 0, 0],
                      [1/2, 0, 0, 1],
                      [0,   0, 1, 0]])
    r = numpy.matrix([[1/4, 1/4, 1/4, 1/4]]).T
    S = numpy.matrix([[0.2, 0.1, 0, 0]]).T
    e = 1 / 10000
    r = page_rank(b, M, r, S, e)
    print(r)
def question_2():
    b = 0.85
    M = numpy.matrix([[0,   0, 1],
                      [1/2, 0, 0],
                      [1/2, 1, 0]])
    r = numpy.matrix([1/3, 1/3, 1/3]).T
    S = numpy.matrix([(1 - b)/3, (1 - b)/3, (1 - b)/3]).T
    e = 1 / 10000
    r = page_rank(b, M, r, S, e)
    a = r.flat[0]
    b = r.flat[1]  # note: reuses `b`, shadowing the damping factor above
    c = r.flat[2]
    print('c = .9b + .475a: {0}'.format(round(c, 3) == round(0.9 * b + 0.475 * a, 3)))
    print('.95c = .9b + .475a: {0}'.format(round(0.95 * c, 3) == round(0.9 * b + 0.475 * a, 3)))
    print('a = c + .15b: {0}'.format(round(a, 3) == round(c + 0.15 * b, 3)))
    print('.85a = c + .15b: {0}'.format(round(0.85 * a, 3) == round(c + 0.15 * b, 3)))
def question_1():
    b = 0.7
    M = numpy.matrix([[0,   0, 0],
                      [1/2, 0, 0],
                      [1/2, 1, 1]])
    r = numpy.matrix([1/3, 1/3, 1/3]).T
    S = numpy.matrix([(1 - b)/3, (1 - b)/3, (1 - b)/3]).T
    e = 1 / 10000
    r = page_rank(b, M, r, S, e)
    r = 3 * r  # scale so the three components sum to 3
    a = r.flat[0]
    b = r.flat[1]
    c = r.flat[2]
    print('a + c = 2.035: {0}'.format(round(a + c, 3) == 2.035))
    print('b + c = 2.5: {0}'.format(round(b + c, 3) == 2.5))
    print('b + c = 2.7: {0}'.format(round(b + c, 3) == 2.7))
    print('a + b = 0.55: {0}'.format(round(a + b, 3) == 0.55))
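# The question_* functions above call a page_rank(b, M, r, S, e) helper that is
# not included in these snippets. A minimal sketch of what it is assumed to do:
# damped power iteration r <- b*M*r + S, stopping once an update changes r by
# less than e. The loop body and stopping rule here are assumptions, not the
# original implementation.
import numpy

def page_rank(b, M, r, S, e):
    while True:
        r_next = b * M * r + S  # one damped step plus the teleport/taxation term
        if numpy.abs(r_next - r).sum() < e:
            return r_next
        r = r_next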
def summarize(text, sentences_count=20, cosine_similarity=True):
    LOGGER.info("Summarizing text")
    plain_sentences = text_to_sentences(text)
    sentences = tokenize_sentences(plain_sentences)
    sentences = remove_punctuation(sentences)
    sentences = get_tagged_sentences(sentences, cosine_similarity)
    sentences, plain_sentences = get_long_sentences(sentences, plain_sentences)
    LOGGER.debug("All word tags: %s",
                 str(set(tag for sentence in sentences for word, tag in sentence)))
    graph = create_sentences_similarity_graph(sentences, cosine_similarity)

    # Debug dump: each sentence, its most similar sentence, and that similarity.
    with open('testfile.txt', 'w') as dump:
        for idx, row in enumerate(graph):
            dump.write("Sentence 1: %s\n" % sentences[idx])
            dump.write("Sentence 2: %s\n" % sentences[np.argmax(row)])
            dump.write(str(np.max(row)) + '\n')

    LOGGER.info('Calculating scores')
    scores = page_rank(graph)
    sorted_scores = sorted(enumerate(scores), key=lambda item: item[1],
                           reverse=True)[:sentences_count]
    LOGGER.info('Top scores: %s', str(sorted_scores))
    # Re-sort the selected sentences back into document order.
    summary = [plain_sentences[idx] for idx, _ in sorted(sorted_scores)]

    # Debug dump: every sentence with its rank and score, ascending by score.
    with open('testfile2.txt', 'w') as dump:
        for i, (idx, score) in enumerate(
                sorted(enumerate(scores), key=lambda item: item[1])):
            dump.write("\tRank: %d, Score: %f\nSentence: %s\n"
                       % (len(sentences) - i, score, sentences[idx]))

    LOGGER.info("Summarizing completed")
    return summary
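# summarize() above and extract_keywords() below call a page_rank(graph) helper
# that is not shown. A minimal sketch, assuming `graph` is a dense square numpy
# array of non-negative edge weights and that the helper returns one score per
# node; the damping factor and tolerance are assumptions, not the original code.
import numpy as np

def page_rank(graph, damping=0.85, tol=1.0e-6):
    n = len(graph)
    out_weights = graph.sum(axis=1)
    out_weights[out_weights == 0] = 1  # sink nodes: avoid division by zero
    transition = graph / out_weights[:, np.newaxis]  # row-normalize
    scores = np.full(n, 1.0 / n)
    while True:
        new_scores = (1 - damping) / n + damping * transition.T.dot(scores)
        if np.abs(new_scores - scores).sum() < tol:
            return new_scores
        scores = new_scores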
print "Finished reading the dataset --> Number of nodes of dataset = %s" % len(G.nodes()) #print G.edges() print "Do you want to run numpy.linalg.eig? (y/n) [It might be very heavy]" answer=sys.stdin.readline()[:-1] if answer == "y": import numpy M=NX_future_page_rank.google_matrix(G,alpha=0.9) e,ev=numpy.linalg.eig(M.T) p=numpy.array(ev[:,0]/ev[:,0].sum())[:,0] print "exact ", p print "Do you want to run page_rank? (y/n) [It might be very heavy]" answer=sys.stdin.readline()[:-1] if answer == "y": pr = NX_future_page_rank.page_rank(G,alpha=0.9,tol=1.0e-8) print "networkx", pr.values() print "Do you want to run page_rank numpy? (y/n) [It might be very heavy]" answer=sys.stdin.readline()[:-1] if answer == "y": np=NX_future_page_rank.page_rank_numpy(G,alpha=0.9) print "numpy ", np print "Do you want to run page_rank scipy? (y/n) [It might be very heavy]" answer=sys.stdin.readline()[:-1] if answer == "y": ns=NX_future_page_rank.page_rank_scipy(G,alpha=0.9) print "scipy ", ns
#The first part of this script creates a graph of words as per the exercise instructions.
print('1. Creating the graph.')
tokenizedFolder = tokenizer.FolderTokenizer(
    './www/abstracts',
    pathToStopWords='./stopwords.txt',
    wordsToKeep=['NN', 'NNS', 'NNP', 'NNPS', 'JJ'],
    stemmer=PorterStemmer())
folderTokenizer = tokenizedFolder.ngrams

#2
#The second part of this script calculates the PageRank score for each word.
print('2. Calculating Page Rank.')
document_scores = {}
convergence = 20
for graph in folderTokenizer:
    document_scores[graph] = page_rank.page_rank(folderTokenizer[graph], 0.85,
                                                 convergence)

#3
#The third part of this script retrieves ngrams and calculates their score.
#Retrieving 1-grams, 2-grams, 3-grams from the original text.
print('3. Building ngrams and calculating score.')
document_multingrams = {}
for i in range(1, 4):
    document_multingrams[i] = tokenizer.nGramTokenizer('./www/abstracts',
                                                       stemmer=PorterStemmer(),
                                                       n=i).ngrams

#Joining the ngrams into one dictionary with their summed score.
document_ngrams = {}
for n in document_multingrams:
    for file in document_multingrams[n]:
def extract_keywords(text, keywords_count=10):
    LOGGER.info("Extracting keywords")
    text_sentences = text_to_sentences(text)
    tokenized_sentences = tokenize_sentences(text_sentences)
    stemmed_words = words_to_stemmed_words(tokenized_sentences)
    tagged_words = get_tagged_words(tokenized_sentences)
    tagged_sentences = get_tagged_sentences(tokenized_sentences)
    words_for_graph = _get_words_for_graph(tagged_words)
    indexed_words = words_to_indexed_words(words_for_graph)

    # Build an undirected co-occurrence graph: connect stemmed words that
    # appear within WORD_DISTANCE tokens of each other in a sentence.
    graph = np.zeros((len(indexed_words), len(indexed_words)))
    for sentence in tagged_sentences:
        for idx in range(len(sentence) - WORD_DISTANCE + 1):
            if sentence[idx][1] not in TAG_CLASSES:
                continue
            # TODO try only filtered words (nouns and adjectives)
            word1 = ps.stem(sentence[idx][0])
            for i in range(1, WORD_DISTANCE):
                if i >= len(sentence):
                    break
                word2 = ps.stem(sentence[idx + i][0])
                if word1 in indexed_words and word2 in indexed_words:
                    graph[indexed_words[word1]][indexed_words[word2]] = 1
                    graph[indexed_words[word2]][indexed_words[word1]] = 1

    scores = page_rank(graph)
    sorted_scores = sorted(enumerate(scores), key=lambda item: item[1],
                           reverse=True)
    ranked_words = [
        stemmed_words[words_for_graph[idx]] for idx, score in sorted_scores
    ]
    LOGGER.debug('Top ranked: %s', ranked_words[:keywords_count])

    # Find collocations among the top-ranked words: keep pairs whose average
    # distance and deviation fall within the configured boundaries.
    collocations = match_collocations(set(ranked_words[:keywords_count]),
                                      set(ranked_words[:2 * keywords_count]),
                                      tokenized_sentences)
    sorted_average = sorted(collocations[0].items(), key=lambda x: abs(x[1]))
    sorted_deviation = sorted(collocations[1].items(), key=lambda x: abs(x[1]))
    LOGGER.debug('average: %s', sorted_average)
    LOGGER.debug('deviation: %s', sorted_deviation)
    top_averages = list(
        filter(lambda average: abs(average[1]) <= AVERAGE_BOUNDARY,
               sorted_average))
    top_deviation = list(
        filter(lambda deviation: deviation[1] <= DEVIATION_BOUNDARY,
               sorted_deviation))
    LOGGER.debug('top averages: %s', top_averages)
    LOGGER.debug('top deviation: %s', top_deviation)
    matched_words = list(
        set([x[0] for x in top_averages
             ]).intersection(set([x[0] for x in top_deviation])))
    LOGGER.debug('matched pairs: %s', matched_words)
    paired_words = list(sum(matched_words, ()))  # flatten the word pairs
    pairs_count = len(paired_words)
    keywords = [
        ' '.join(pair)
        for pair in matched_words[:min(keywords_count, pairs_count)]
    ]
    # Pad with top-ranked single nouns that are not already part of a pair.
    keywords = keywords + list(
        filter(
            lambda word: word not in paired_words and word in tagged_words and
            tagged_words[word] in ['NN'],
            ranked_words))[:keywords_count - len(keywords)]
    LOGGER.info("Extracting keywords completed")
    LOGGER.debug(keywords)
    return keywords
from graph_generator import create_graph
from matrix_generator import matrix_from_graph
from page_rank import page_rank
from services import get_page_links_v2

if __name__ == '__main__':
    URL = "https://makingfun.com/"
    # d = get_page_links_v2(URL)
    # create_graph(d)
    matrix, height, G_LIST = matrix_from_graph()
    page_rank(matrix, height, G_LIST)
        if ilk in graph:
            graph[docid]["inlinks"].append(ilk)
    # Count outlinks: every appearance in another page's inlink list adds one.
    count = 0
    for item in graph:
        if count % 2000 == 0:
            print count
        for ilk in graph[item]['inlinks']:
            if graph.get(ilk):
                graph[ilk]['outlinks'] += 1
        count += 1
    print "Finish building graph"
    return graph


if __name__ == '__main__':
    wg = build_graph()
    rank = page_rank(wg)
    sorted_rank = sorted(rank.items(), key=operator.itemgetter(1), reverse=True)
    outf = open("/Users/Sun/Dropbox/CS6200_W_Sun/HW4/data/crawl_pagerank.txt", "a")
    for i in range(500):
        res = sorted_rank[i][0] + " " + str(sorted_rank[i][1]) + "\n"
        outf.write(res)
    outf.close()  # the original never closed the output file
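# build_graph() returns {docid: {"inlinks": [...], "outlinks": count}} and hands
# it to a page_rank() helper that is not shown in this snippet. A minimal sketch
# of such a helper, assuming the standard damped formulation over that dict
# shape (the damping factor and fixed iteration count are assumptions):
def page_rank(graph, damping=0.85, iterations=50):
    n = len(graph)
    rank = dict.fromkeys(graph, 1.0 / n)
    for _ in range(iterations):
        new_rank = {}
        for page in graph:
            # Each in-linking page contributes its rank spread over its out-links.
            incoming = sum(rank[ilk] / graph[ilk]["outlinks"]
                           for ilk in graph[page]["inlinks"]
                           if ilk in graph and graph[ilk]["outlinks"] > 0)
            new_rank[page] = (1 - damping) / n + damping * incoming
        rank = new_rank
    return rank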
def NXPR(G):
    pr = page_rank.page_rank(G, alpha=0.9, tol=1.0e-8)
    return pr.values()