예제 #1
0
def main(follow, followed, test_file, submission_file, data_file,
         validation_file, max_suggestion):
    """Train a ranker, suggest friends for each test node, write them out.

    Steps, in order:
      1. read the test nodes from ``test_file``;
      2. train a classifier with ``rank.train(data_file, validation_file)``;
      3. build the popular-people list from ``followed``;
      4. call ``suggest_friends`` once per test node;
      5. write all suggestions to ``submission_file``.

    ``follow`` and ``followed`` are only forwarded to the helpers
    (presumably the forward and reverse follow graphs -- confirm with the
    caller).  Progress is reported on stdout every 100 processed nodes.
    """
    print('Reading graph...')
    test_nodes = utilities.read_nodes_list(test_file)

    print('Training with logistic regression...')
    clf = rank.train(data_file, validation_file)

    print('Getting popular people...')
    popular_people = get_popular_people(followed, max_suggestion)

    print('Predicting...')
    predictions = []
    # Start counting at 1 so the progress message fires after node 100, 200...
    for done, node in enumerate(test_nodes, 1):
        predictions.append(
            suggest_friends(follow, followed, clf, node, popular_people,
                            max_suggestion))
        if done % 100 == 0:
            print('Suggested %d friends.' % done)

    print('Writing submission files...')
    utilities.write_submission_file(submission_file, test_nodes, predictions)
예제 #2
0
파일: main.py 프로젝트: FindBoat/Kaggle
def main(follow, followed, test_file, submission_file, data_file,
         validation_file, max_suggestion):
    """Run the full pipeline: train, predict per test node, write results.

    The test nodes come from ``test_file``; the classifier is produced by
    ``rank.train(data_file, validation_file)``; the popular-people list is
    computed by ``get_popular_people(followed, max_suggestion)``.  Each test
    node is then handed to ``suggest_friends`` and the collected results are
    written to ``submission_file``.  A progress line is printed after every
    100 nodes.
    """
    print('Reading graph...')
    nodes = utilities.read_nodes_list(test_file)

    print('Training with logistic regression...')
    model = rank.train(data_file, validation_file)

    print('Getting popular people...')
    popular = get_popular_people(followed, max_suggestion)

    print('Predicting...')
    results = []
    for node in nodes:
        friends = suggest_friends(follow, followed, model, node, popular,
                                  max_suggestion)
        results.append(friends)

        # len(results) is the number of nodes handled so far.
        handled = len(results)
        if handled % 100 == 0:
            print('Suggested %d friends.' % handled)

    print('Writing submission files...')
    utilities.write_submission_file(submission_file, nodes, results)
def top_k_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Run the top-k benchmark.

    Asks ``get_top_k_nodes`` for ``num_predictions`` nodes from
    ``train_file`` and submits that same result for every node listed in
    ``test_file``.
    """
    popular = get_top_k_nodes(train_file, num_predictions)
    targets = utilities.read_nodes_list(test_file)

    # One entry per target; every entry is the very same object.
    per_target = []
    for _ in targets:
        per_target.append(popular)

    utilities.write_submission_file(submission_file, targets, per_target)
def communicative_basic(train_file, test_file, submission_file, num_predictions):
    '''
    main function
    '''
    
    print ">>> reading the graph from file ...",
    graph = {}
    graph = utilities.read_graph(train_file)
    print " done!"
    print ">> the graph contains %d ndoes" % len(graph)
    
    print ">>> building the edge set ...",
    edgeSet = set()
    nodeCredit = {}
    for node in graph.keys():
        nodeCredit[node] = 0
        for frdNode in graph[node]:
            edgeSet.add((node,frdNode))
    print "done!"
    
    def compareCredit(key):
        '''
        utility function to comapre the two credits given the key
        '''
        return nodeCredit[key]
    
    
    missingEdgeSet = set()
    print ">>> reversing the edge set, computing the credicts of each node and finding missing edges ...",
    for edge in edgeSet:
        if (edge[1], edge[0]) not in edgeSet:
            missingEdgeSet.add((edge[1], edge[0]))
        nodeCredit[edge[1]]+=1
    print " done!"
    
    testResult = {}
    testNodeList = utilities.read_nodes_list(test_file)
    testNodeSet = set(testNodeList)
    print ">> %d test Nodes read." % len(testNodeList)
    print ">>> making the missing edge dictionary for test nodes ...",
    for testNode in testNodeList: # pre-build the dictionary
        testResult[testNode] = []
    
    for edge in missingEdgeSet:
        if (edge[0] in testNodeSet):
            testResult[edge[0]].append(edge[1])
    print " done!"
    
    print ">>> sorting the final results according to node credits ...",
    for testNode in testNodeList:
        testResult[testNode].sort(key=compareCredit, reverse=True)
    print " done!"
    
    print ">>> outputing the final result ...",
    utilities.write_submission_file(submission_file, testNodeList, [testResult[testNode] for testNode in testNodeList])
    print " done!"
def top_k_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Run the top-k benchmark.

    The result of ``get_top_k_nodes(train_file, num_predictions)`` is used
    as the prediction for each node read from ``test_file``; the output goes
    to ``submission_file``.
    """
    top_nodes = get_top_k_nodes(train_file, num_predictions)
    nodes_to_predict = utilities.read_nodes_list(test_file)

    # The same top-k object is repeated once per test node.
    repeated = [top_nodes for _ in nodes_to_predict]

    utilities.write_submission_file(
        submission_file, nodes_to_predict, repeated)
예제 #6
0
def random_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Run the random benchmark.

    For each node in ``test_file`` draws ``num_predictions`` nodes with
    ``random.choice`` (so repeats are possible) from the nodes returned by
    ``read_nodes_from_training(train_file)``, then writes the draws to
    ``submission_file``.
    """
    pool_of_nodes = read_nodes_from_training(train_file)
    targets = utilities.read_nodes_list(test_file)

    guesses = []
    for _target in targets:
        picks = []
        for _draw in range(num_predictions):
            picks.append(random.choice(pool_of_nodes))
        guesses.append(picks)

    utilities.write_submission_file(submission_file, targets, guesses)
def random_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Run the random benchmark.

    Every test node receives ``num_predictions`` independent
    ``random.choice`` draws from the training nodes; the same node may be
    drawn more than once.  Results are written to ``submission_file``.
    """
    candidates = read_nodes_from_training(train_file)
    nodes_to_predict = utilities.read_nodes_list(test_file)

    def draw_sample():
        # num_predictions independent draws, with replacement
        return [random.choice(candidates) for _ in range(num_predictions)]

    samples = []
    for _node in nodes_to_predict:
        samples.append(draw_sample())

    utilities.write_submission_file(
        submission_file, nodes_to_predict, samples)
def bfs_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Run the breadth-first search benchmark.

    Reads the graph from ``train_file`` and, for each node in ``test_file``,
    stores the result of ``breadth_first_search(graph, node,
    num_predictions)``.  All results are written to ``submission_file``.
    """
    graph = utilities.read_graph(train_file)
    targets = utilities.read_nodes_list(test_file)

    found = []
    for start in targets:
        found.append(breadth_first_search(graph, start, num_predictions))

    utilities.write_submission_file(submission_file, targets, found)
예제 #9
0
def bfs_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Run the breadth-first search benchmark.

    Each test node is passed, together with the training graph and
    ``num_predictions``, to ``breadth_first_search``; the per-node results
    are written to ``submission_file`` in test-node order.
    """
    training_graph = utilities.read_graph(train_file)
    nodes_to_predict = utilities.read_nodes_list(test_file)

    def search_from(node):
        return breadth_first_search(training_graph, node, num_predictions)

    results = [search_from(node) for node in nodes_to_predict]

    utilities.write_submission_file(
        submission_file, nodes_to_predict, results)
예제 #10
0
def generate_test_set(follow, followed, test_file, validation_file,
                      solution_file, num, max_remove_num):
    """Generate a validation node list and its solution, and save both.

    The nodes listed in ``test_file`` are passed to ``generate_test_nodes``
    as the nodes to exclude.  The generated nodes are written one per row
    to ``validation_file``; the output of ``generate_solution`` is written
    to ``solution_file``.
    """
    excluded = utilities.read_nodes_list(test_file)

    print('Generating test nodes...')
    chosen = generate_test_nodes(follow, excluded, num)

    # write_file receives rows, so wrap each node in a single-item list.
    rows = []
    for node in chosen:
        rows.append([node])

    answers = generate_solution(follow, followed, chosen, max_remove_num)

    utilities.write_file(validation_file, rows)
    utilities.write_file(solution_file, answers)
예제 #11
0
def generate_test_set(follow, followed, test_file, validation_file,
                      solution_file, num, max_remove_num):
    """Build a validation set plus its solution and write both to disk.

    ``generate_test_nodes(follow, <nodes of test_file>, num)`` supplies the
    validation nodes; ``generate_solution`` supplies the matching solution.
    Validation nodes go to ``validation_file`` (each wrapped in its own
    single-element list), the solution to ``solution_file``.
    """
    skip_nodes = utilities.read_nodes_list(test_file)

    print('Generating test nodes...')
    validation_nodes = generate_test_nodes(follow, skip_nodes, num)
    validation_rows = [[node] for node in validation_nodes]
    expected = generate_solution(
        follow, followed, validation_nodes, max_remove_num)

    for path, payload in ((validation_file, validation_rows),
                          (solution_file, expected)):
        utilities.write_file(path, payload)
예제 #12
0
def run_recs(train_file, test_file, submission_file, num_workers=8,
             chunksize=10000):
    """Compute recommendations for every test node in parallel and save them.

    Loads the graph and its inverse into the module-level globals ``graph``
    and ``graph_inverse``, maps ``make_recs`` over the test nodes with a
    process pool, and writes the recommendations to ``submission_file`` in
    the order of the test nodes.

    Args:
        train_file: path passed to ``utilities.read_graph_and_inverse``.
        test_file: path passed to ``utilities.read_nodes_list``.
        submission_file: path passed to ``utilities.write_submission_file``.
        num_workers: number of worker processes.  Tune to the available
            CPU/memory; allow for roughly 1.2G of RAM per worker, since
            swapping will kill performance.
        chunksize: number of test nodes handed to a worker at a time.
    """
    # The globals are assigned before the pool is created; presumably
    # make_recs reads them inside the worker processes -- confirm.
    global graph, graph_inverse
    graph, graph_inverse = utilities.read_graph_and_inverse(train_file)
    test_nodes = utilities.read_nodes_list(test_file)

    pool = multiprocessing.Pool(num_workers)
    try:
        predictions = {}
        # Results arrive in arbitrary order, so key them by target node.
        for target_node, recs in pool.imap_unordered(make_recs, test_nodes,
                                                     chunksize=chunksize):
            predictions[target_node] = recs
    finally:
        # Always release the worker processes.  On success every result has
        # already been consumed; on failure there is nothing worth waiting for.
        pool.terminate()
        pool.join()

    test_predictions = [predictions[node] for node in test_nodes]
    utilities.write_submission_file(submission_file,
                                    test_nodes,
                                    test_predictions)
예제 #13
0
def jaccard_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the breadth-first search benchmark.
    """

    start_time = time.time()
    (graph, reversegraph) = utilities.read_graph(train_file)
    print "Graph forming time = ", time.time() - start_time, "seconds"
    start_time = time.time()
    test_nodes = utilities.read_nodes_list(test_file)
    test_predictions = [jaccard_search(graph, reversegraph, node, num_predictions) for node in test_nodes]

    print "Prediction time = ", time.time() - start_time, "seconds"

    utilities.write_submission_file(submission_file, test_nodes, test_predictions)
def main_entrance(train_data_file, test_data_file, submit_data_file):
    '''
    the main entrance of the program
    '''
    ###############Configs#################
    minMutualFrd = 2
    ############End of Configs#############
    
    print ">>> reading the graph from file ...",
    following_graph = utilities.read_graph(train_data_file)
    print " done!"
    print ">> the graph contains %d ndoes" % len(following_graph)
    
    print ">>> reading test nodes ...",
    testNodeList = utilities.read_nodes_list(test_data_file)
    print " done!"
    
    edgeSet = get_edge_set(following_graph)
    nodeCredit = get_node_credit(edgeSet, following_graph.keys())
    commu_missingEdgeDict = get_commu_missing_edge(edgeSet, testNodeList)
    mutual_missingEdgeDict = get_mutual_missing_edge(following_graph, testNodeList, edgeSet, following_graph.keys(), minMutualFrd)
    
    # union two edge dicts
    finalPrediction = {}
    for node in testNodeList:
        finalPrediction[node] = list(set(mutual_missingEdgeDict[node]) | set(commu_missingEdgeDict[node]))
    
    # customized comparator for final prediction
    def compareCredit(key):
        '''
        utility function to comapre the two credits given the key
        '''
        return nodeCredit[key]
    
    # rank the predictions
    print ">>> sorting the final results according to node credits ...",
    for testNode in testNodeList:
        finalPrediction[testNode].sort(key=compareCredit, reverse=True)
    print " done!"
    
    # write prediction to file
    print ">>> outputing the final result ...",
    utilities.write_submission_file(submit_data_file, testNodeList, [finalPrediction[testNode] for testNode in testNodeList])
    print " done!"
예제 #15
0
def jaccard_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the breadth-first search benchmark.
    """

    start_time = time.time()
    (graph, reversegraph) = utilities.read_graph(train_file)
    print "Graph forming time = ", time.time() - start_time, "seconds"
    start_time = time.time()
    test_nodes = utilities.read_nodes_list(test_file)
    test_predictions = [
        jaccard_search(graph, reversegraph, node, num_predictions)
        for node in test_nodes
    ]

    print "Prediction time = ", time.time() - start_time, "seconds"

    utilities.write_submission_file(submission_file, test_nodes,
                                    test_predictions)
def communicative_basic(train_file, test_file, submission_file, num_predictions):
    '''
    main function
    '''
    
    print ">>> reading the graph from file ...",
    graph = {}
    graph = utilities.read_graph(train_file)
    print " done!"
    print ">> the graph contains %d ndoes" % len(graph)
    
    print ">>> building the edge set ...",
    edgeSet = set()
    for node in graph.keys():
        for frdNode in graph[node]:
            edgeSet.add((node,frdNode))
    print "done!"
    
    missingEdgeSet = set()
    print ">>> reversing the edge set, finding missing edges ...",
    for edge in edgeSet:
        if (edge[1], edge[0]) not in edgeSet:
            missingEdgeSet.add((edge[1], edge[0]))
    print " done!"
    
    testResult = {}
    testNodeList = utilities.read_nodes_list(test_file)
    testNodeSet = set(testNodeList)
    print ">> %d test Nodes read." % len(testNodeList)
    print ">>> making the missing edge dictionary for test nodes ...",
    for testNode in testNodeList: # pre-build the dictionary
        testResult[testNode] = []
    
    for edge in missingEdgeSet:
        if (edge[0] in testNodeSet):
            testResult[edge[0]].append(edge[1])
    print " done!"
    
    print ">>> outputing the final result ...",
    utilities.write_submission_file(submission_file, testNodeList, [testResult[testNode] for testNode in testNodeList])
    print " done!"