Пример #1
0
def test_graph_score():
    g = Graph()
    g.insert_edge(0, 1)
    g.insert_edge(0, 4)
    g.insert_edge(1, 2)
    g.insert_edge(1, 3)
    g.insert_edge(2, 0)
    g.insert_edge(3, 2)
    g.save_graph()
    p = PageRank(g)
    p.iterate(max_iter=None)
    p.save_score()

    return True
Пример #2
0
def main(content_path='Contents', graph_path='Data/graph.p', bir_name=None):
    if not bir_name:
        # Create BIR
        bir = BIR(normalization_factor=2)

    else:
        with open("{}/{}".format('Data', bir_name), 'rb') as f:
            bir = p.load(f)
    n_file = 0
    for filename in os.listdir(content_path):
        if not filename.startswith('.'):
            idx = int(filename[:-4])  # Content file must be stored as doc_id.txt
            with open("{}/{}".format(content_path, filename), 'r') as f:
                if n_file % 100 == 0:
                    print("Have Parsed {} documents".format(n_file))
                kw = extract_keywords(f.read())
                bir.insert_document(doc=kw, idx=idx)
                n_file += 1
    # Calculate and save tf-idf table
    print("Create BIR and tf-idf for all documents...")
    bir.create_and_save_tf_idf(filename='tf_idx.p', path=os.getcwd())
    # Save the BIR to a pickle file
    print("Saving Inverted Index Table...")
    bir.save(path=os.getcwd())

    # Upload the web graph
    with open(graph_path, 'rb') as f:
        graph = p.load(f)

    print("Calculate PageRank ...")
    # Build PageRank using the web graph
    pagerank = PageRank(graph, prev_path=None, damping_factor=0.32, epsilon=0.0000001, default_weight=None)
    # Iterate until converge
    pagerank.iterate(max_iter=100000)

    # Save the PageRank score
    pagerank.save_score(filename=None, path=os.getcwd())
    print("Finished!!!")

    return True