def test():
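    # Assumes: import networkx as NX, import numpy, and a local page_rank module.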
    G = NX.DiGraph()
    edges = [(1,2), (1,3),
             (3,1), (3,2), (3,5),
             (4,5), (4,6),
             (5,4), (5,6),
             (6,4)]
    G.add_edges_from(edges)

    M = page_rank.google_matrix(G, alpha=0.9)
    e, ev = numpy.linalg.eig(M.T)
    p = numpy.array(ev[:,0] / ev[:,0].sum())[:,0]
    print "exact  ", p

    pr = page_rank.page_rank(G, alpha=0.9, tol=1.0e-8)
    print "networkx", pr.values()

    np = page_rank.page_rank_numpy(G, alpha=0.9)
    print "numpy  ", np

    try:
        ns = page_rank.page_rank_scipy(G, alpha=0.9)
        print "scipy  ", ns
    except Exception:
        print "scipy not working"
Example #2
    def pagerank_calc(self):
        ranklist = page_rank.page_rank(self._from_to_link, 20, 1)
        con = lite.connect("dbFile.db")
        cur = con.cursor()

        for i in ranklist:
            cur.execute('INSERT OR REPLACE INTO PageRank VALUES(?,?)',
                        [int(i), ranklist[i]])
            con.commit()
        con.close()
Example #3
    def pagerank_calc(self):
        ranklist = page_rank.page_rank(self._from_to_link, 20, 1)
        con = lite.connect("dbFile.db")
        cur = con.cursor()

        for i in ranklist:
            cur.execute('INSERT OR REPLACE INTO PageRank VALUES(?,?)',
                        [int(i), ranklist[i]])
            con.commit()
        con.close()
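Both versions of pagerank_calc assume a two-column PageRank table already exists in dbFile.db. A plausible one-time setup, sketched with an assumed schema (the column names are not taken from the project):

import sqlite3 as lite

con = lite.connect("dbFile.db")
# Hypothetical schema matching the two-value INSERT above.
con.execute("CREATE TABLE IF NOT EXISTS PageRank(doc_id INTEGER PRIMARY KEY, score REAL)")
con.commit()
con.close()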
Example #4
File: basic.py Project: vmous/jazzy-moocs
def question_1():
    b = 0.7
    M = numpy.matrix([
        [0, 1, 0, 0],
        [1/2, 0, 0, 0],
        [1/2, 0, 0, 1],
        [0, 0, 1, 0]
    ])
    r = numpy.matrix([[1/4, 1/4, 1/4, 1/4]]).T
    S = numpy.matrix([[0.2, 0.1, 0, 0]]).T
    e = 1 / 10000

    r = page_rank(b, M, r, S, e)

    print(r)
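The page_rank helper called in these basic.py exercises is not shown on this page. Judging from the call sites, one plausible reading (an assumption, not the project's code) is the damped iteration r <- b*M*r + S, repeated until the update falls below e:

import numpy

def page_rank(b, M, r, S, e):
    # Hypothetical reconstruction from the call sites above:
    # iterate r <- b*M*r + S until the L1 change drops below e.
    while True:
        r_next = b * (M * r) + S
        if numpy.abs(r_next - r).sum() < e:
            return r_next
        r = r_next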
Example #5
File: basic.py Project: vmous/jazzy-moocs
def question_2():
    b = 0.85
    M = numpy.matrix([[0, 0, 1], [1/2, 0, 0], [1/2, 1, 0]])
    r = numpy.matrix([1/3, 1/3, 1/3]).T
    S = numpy.matrix([(1 - b)/3, (1 - b)/3, (1 - b)/3]).T
    e = 1 / 10000

    r = page_rank(b, M, r, S, e)

    a = r.flat[0]
    b = r.flat[1]
    c = r.flat[2]
    print('c = .9b + .475a: {0}'.format(round(c, 3) == round(0.9 * b + 0.475 * a, 3)))
    print('.95c = .9b + .475a: {0}'.format(round(0.95 * c, 3) == round(0.9 * b + 0.475 * a, 3)))
    print('a = c + .15b: {0}'.format(round(a, 3) == round(c + 0.15 * b, 3)))
    print('.85a = c + .15b: {0}'.format(round(0.85 * a, 3) == round(c + 0.15 * b, 3)))
Example #6
File: basic.py Project: vmous/jazzy-moocs
def question_1():
    b = 0.7
    M = numpy.matrix([[0, 0, 0], [1/2, 0, 0], [1/2, 1, 1]])
    r = numpy.matrix([1/3, 1/3, 1/3]).T
    S = numpy.matrix([(1 - b)/3, (1 - b)/3, (1 - b)/3]).T
    e = 1 / 10000

    r = page_rank(b, M, r, S, e)

    r = 3 * r

    a = r.flat[0]
    b = r.flat[1]
    c = r.flat[2]
    print('a + c = 2.035: {0}'.format(round(a + c, 3) == 2.035))
    print('b + c = 2.5: {0}'.format(round(b + c, 3) == 2.5))
    print('b + c = 2.7: {0}'.format(round(b + c, 3) == 2.7))
    print('a + b = 0.55:  {0}'.format(round(a + b, 3) == 0.55))
Example #7
def test():
    G = NX.DiGraph()
    edges = [(1, 2), (1, 3), (3, 1), (3, 2), (3, 5), (4, 5), (4, 6), (5, 4),
             (5, 6), (6, 4)]
    G.add_edges_from(edges)

    M = page_rank.google_matrix(G, alpha=0.9)
    e, ev = numpy.linalg.eig(M.T)
    p = numpy.array(ev[:, 0] / ev[:, 0].sum())[:, 0]
    print "exact  ", p

    pr = page_rank.page_rank(G, alpha=0.9, tol=1.0e-8)
    print "networkx", pr.values()

    np = page_rank.page_rank_numpy(G, alpha=0.9)
    print "numpy  ", np

    try:
        ns = page_rank.page_rank_scipy(G, alpha=0.9)
        print "scipy  ", ns
    except Exception:
        print "scipy not working"
Example #8
def summarize(text, sentences_count=20, cosine_similarity=True):
    LOGGER.info("Summarizing text")
    plain_sentences = text_to_sentences(text)
    sentences = tokenize_sentences(plain_sentences)
    sentences = remove_punctuation(sentences)
    sentences = get_tagged_sentences(sentences, cosine_similarity)
    sentences, plain_sentences = get_long_sentences(sentences, plain_sentences)
    LOGGER.debug(
        "All word tags: %s",
        str(set([tag for sentence in sentences for word, tag in sentence])))
    graph = create_sentences_similarity_graph(sentences, cosine_similarity)

    with open('testfile.txt', 'w') as file:
        for idx, row in enumerate(graph):
            file.write("Sentence 1: %s\n" % sentences[idx])
            file.write("Sentence 2: %s\n" % sentences[np.argmax(row)])
            file.write(str(np.max(row)) + '\n')

    LOGGER.info('Calculating scores')
    scores = page_rank(graph)
    sorted_scores = sorted(enumerate(scores),
                           key=lambda item: item[1],
                           reverse=True)[:sentences_count]
    LOGGER.info('Top scores: %s', str(sorted_scores))
    summary = [plain_sentences[idx] for idx, _ in sorted(sorted_scores)]
    with open('testfile2.txt', 'w') as file:
        for i, (idx, score) in enumerate(
                sorted(enumerate(scores), key=lambda item: item[1])):
            file.write("\tRank: %d, Score: %f\nSentence: %s\n" %
                       (len(sentences) - i, score, sentences[idx]))
    LOGGER.info("Summarizing completed")
    return summary
    LOGGER.info("Summarizing completed")
    return summary
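A minimal driver for this summarizer, assuming the module is run directly (the input file name is illustrative):

if __name__ == '__main__':
    with open('article.txt') as f:
        for sentence in summarize(f.read(), sentences_count=5):
            print(sentence)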
Example #9
print "Finished reading the dataset --> Number of nodes of dataset = %s" % len(G.nodes())

#print G.edges()

print "Do you want to run numpy.linalg.eig? (y/n) [It might be very heavy]"
answer=sys.stdin.readline()[:-1]
if answer == "y":
    import numpy
    M=NX_future_page_rank.google_matrix(G,alpha=0.9)
    e,ev=numpy.linalg.eig(M.T)
    p=numpy.array(ev[:,0]/ev[:,0].sum())[:,0]
    print "exact  ", p

print "Do you want to run page_rank? (y/n) [It might be very heavy]"
answer=sys.stdin.readline()[:-1]
if answer == "y":
    pr = NX_future_page_rank.page_rank(G,alpha=0.9,tol=1.0e-8)
    print "networkx", pr.values()

print "Do you want to run page_rank numpy? (y/n) [It might be very heavy]"
answer=sys.stdin.readline()[:-1]
if answer == "y":
    np=NX_future_page_rank.page_rank_numpy(G,alpha=0.9)
    print "numpy  ", np

print "Do you want to run page_rank scipy? (y/n) [It might be very heavy]"
answer=sys.stdin.readline()[:-1]
if answer == "y":
    ns=NX_future_page_rank.page_rank_scipy(G,alpha=0.9)
    print "scipy  ", ns
Example #10
# The first part of this script creates a graph of words, as per the exercise instructions.
print('1. Creating the graph.')
tokenizedFolder = tokenizer.FolderTokenizer(
    './www/abstracts',
    pathToStopWords='./stopwords.txt',
    wordsToKeep=['NN', 'NNS', 'NNP', 'NNPS', 'JJ'],
    stemmer=PorterStemmer())
folderTokenizer = tokenizedFolder.ngrams

#2
# The second part of this script calculates the PageRank score for each word.
print('2. Calculating Page Rank.')
document_scores = {}
convergence = 20
for graph in folderTokenizer:
    document_scores[graph] = page_rank.page_rank(folderTokenizer[graph], 0.85,
                                                 convergence)

#3
# The third part of this script retrieves n-grams and calculates their scores.
# Retrieve 1-grams, 2-grams, and 3-grams from the original text.
print('3. Building ngrams and calculating score.')
document_multingrams = {}
for i in range(1, 4):
    document_multingrams[i] = tokenizer.nGramTokenizer('./www/abstracts',
                                                       stemmer=PorterStemmer(),
                                                       n=i).ngrams

# Join the n-grams into one dictionary with their summed scores.
document_ngrams = {}
for n in document_multingrams:
    for file in document_multingrams[n]:
Example #11
def extract_keywords(text, keywords_count=10):
    LOGGER.info("Extracting keywords")
    text_sentences = text_to_sentences(text)
    tokenized_sentences = tokenize_sentences(text_sentences)
    stemmed_words = words_to_stemmed_words(tokenized_sentences)
    tagged_words = get_tagged_words(tokenized_sentences)
    tagged_sentences = get_tagged_sentences(tokenized_sentences)
    words_for_graph = _get_words_for_graph(tagged_words)
    indexed_words = words_to_indexed_words(words_for_graph)
    graph = np.zeros((len(indexed_words), len(indexed_words)))
    for sentence in tagged_sentences:
        for idx in range(len(sentence) - WORD_DISTANCE + 1):
            if sentence[idx][1] not in TAG_CLASSES:
                continue
            # TODO: try only filtered words (nouns and adjectives)

            word1 = ps.stem(sentence[idx][0])
            for i in range(1, WORD_DISTANCE):
                if i >= len(sentence):
                    break
                word2 = ps.stem(sentence[idx + i][0])

                if word1 in indexed_words and word2 in indexed_words:
                    graph[indexed_words[word1]][indexed_words[word2]] = 1
                    graph[indexed_words[word2]][indexed_words[word1]] = 1

    scores = page_rank(graph)
    sorted_scores = sorted(enumerate(scores),
                           key=lambda item: item[1],
                           reverse=True)
    ranked_words = [
        stemmed_words[words_for_graph[idx]] for idx, score in sorted_scores
    ]
    LOGGER.debug('Top ranked: %s', ranked_words[:keywords_count])

    collocations = match_collocations(set(ranked_words[:keywords_count]),
                                      set(ranked_words[:2 * keywords_count]),
                                      tokenized_sentences)
    sorted_average = sorted(collocations[0].items(), key=lambda x: abs(x[1]))
    sorted_deviation = sorted(collocations[1].items(), key=lambda x: abs(x[1]))
    LOGGER.debug('Sorted averages: %s', sorted_average)
    LOGGER.debug('Sorted deviations: %s', sorted_deviation)
    top_averages = list(
        filter(lambda average: abs(average[1]) <= AVERAGE_BOUNDARY,
               sorted_average))
    top_deviation = list(
        filter(lambda deviation: deviation[1] <= DEVIATION_BOUNDARY,
               sorted_deviation))
    LOGGER.debug('Top averages: %s', top_averages)
    LOGGER.debug('Top deviations: %s', top_deviation)
    matched_words = list(
        set(x[0] for x in top_averages).intersection(
            x[0] for x in top_deviation))
    LOGGER.debug('Matched word pairs: %s', matched_words)
    paired_words = list(sum(matched_words, ()))
    pairs_count = len(paired_words) // 2
    keywords = [
        ' '.join(pair)
        for pair in matched_words[:min(keywords_count, pairs_count)]
    ]
    keywords = keywords + list(
        filter(
            lambda word: word not in paired_words and word in tagged_words and
            tagged_words[word] in ['NN'],
            ranked_words))[:keywords_count - len(keywords)]

    LOGGER.info("Extracting keywords completed")
    LOGGER.debug(keywords)
    return keywords
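A minimal driver for the keyword extractor, with an illustrative input file name:

if __name__ == '__main__':
    with open('article.txt') as f:
        print(extract_keywords(f.read(), keywords_count=10))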
Example #12
from graph_generator import create_graph
from matrix_generator import matrix_from_graph
from page_rank import page_rank
from services import get_page_links_v2

if __name__ == '__main__':
    URL = "https://makingfun.com/"

    # d = get_page_links_v2(URL)

    # create_graph(d)

    matrix, height, G_LIST = matrix_from_graph()

    page_rank(matrix, height, G_LIST)
Example #13
File: crawl.py Project: Tsgzj/CS6200
                if ilk in graph:
                    graph[docid]["inlinks"].append(ilk)

    count = 0
    for item in graph:
        if count % 2000 == 0:
            print count
        for ilk in graph[item]['inlinks']:
            if graph.get(ilk):
                graph[ilk]['outlinks'] += 1
        count += 1

    print "Finish building graph"
    return graph


if __name__ == '__main__':
    wg = build_graph()
    rank = page_rank(wg)

    sorted_rank = sorted(rank.items(),
                         key=operator.itemgetter(1),
                         reverse=True)

    # Append the top 500 ranked pages to the output file.
    with open("/Users/Sun/Dropbox/CS6200_W_Sun/HW4/data/crawl_pagerank.txt",
              "a") as outf:
        for i in range(500):
            res = sorted_rank[i][0] + " " + str(sorted_rank[i][1]) + "\n"
            outf.write(res)
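The page_rank function that crawl.py imports is not shown here. A plausible sketch over the inlink/outlink dict this script builds (the function name, damping factor, and iteration count are assumptions):

def page_rank(graph, d=0.85, iterations=50):
    # Hypothetical sketch matching the graph shape built above:
    # graph[doc] = {"inlinks": [...], "outlinks": <count>}.
    n = len(graph)
    rank = dict.fromkeys(graph, 1.0 / n)
    for _ in range(iterations):
        new_rank = {}
        for doc in graph:
            incoming = sum(rank[ilk] / graph[ilk]['outlinks']
                           for ilk in graph[doc]['inlinks']
                           if graph[ilk]['outlinks'])
            new_rank[doc] = (1.0 - d) / n + d * incoming
        rank = new_rank
    return rank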
Example #14
print "Finished reading the dataset --> Number of nodes of dataset = %s" % len(
    G.nodes())

#print G.edges()

print "Do you want to run numpy.linalg.eig? (y/n) [It might be very heavy]"
answer = sys.stdin.readline()[:-1]
if answer == "y":
    import numpy
    M = NX_future_page_rank.google_matrix(G, alpha=0.9)
    e, ev = numpy.linalg.eig(M.T)
    p = numpy.array(ev[:, 0] / ev[:, 0].sum())[:, 0]
    print "exact  ", p

print "Do you want to run page_rank? (y/n) [It might be very heavy]"
answer = sys.stdin.readline()[:-1]
if answer == "y":
    pr = NX_future_page_rank.page_rank(G, alpha=0.9, tol=1.0e-8)
    print "networkx", pr.values()

print "Do you want to run page_rank numpy? (y/n) [It might be very heavy]"
answer = sys.stdin.readline()[:-1]
if answer == "y":
    np = NX_future_page_rank.page_rank_numpy(G, alpha=0.9)
    print "numpy  ", np

print "Do you want to run page_rank scipy? (y/n) [It might be very heavy]"
answer = sys.stdin.readline()[:-1]
if answer == "y":
    ns = NX_future_page_rank.page_rank_scipy(G, alpha=0.9)
    print "scipy  ", ns
Example #15
def NXPR(G):
    pr = page_rank.page_rank(G, alpha=0.9, tol=1.0e-8)
    return pr.values()
Example #16
def NXPR(G):
    pr = page_rank.page_rank(G, alpha=0.9, tol=1.0e-8)
    return pr.values()