def get_features(reviews, is_train):
    """Build the feature dictionary for an example from its first review.

    Only ``reviews[0]`` is consulted.  The cutoff date is 2012-01-01 when
    ``is_train`` is truthy and 2013-01-01 otherwise.

    Returns:
        A dict with the keys ``"age"``, ``"age_0.5"``, ``"age_0.2"``,
        ``"stars"``, ``"liked"`` and ``"bias"``.
    """
    if is_train:
        end_date = datetime.date(2012, 1, 1)
    else:
        end_date = datetime.date(2013, 1, 1)

    first_review = reviews[0]
    # Days from the review date to the cutoff, offset by 30.
    age_days = (end_date - get_date(first_review)).days + 30

    # The constant numerators are a hacky way of normalizing the features.
    features = {}
    features["age"] = 50.0 / age_days
    features["age_0.5"] = 10.0 / (age_days ** 0.5)
    features["age_0.2"] = 3.0 / (age_days ** 0.2)

    star_count = int(first_review["stars"])
    features["stars"] = star_count / 5.0
    features["liked"] = 1 if star_count > 3 else 0
    features["bias"] = 1.0
    return features
def get_features(reviews, is_train):
    """Return the features derived from ``reviews[0]``.

    The review's age is measured against 1 January 2012 when ``is_train``
    is truthy and against 1 January 2013 otherwise.  The result maps the
    names ``"age"``, ``"age_0.5"``, ``"age_0.2"``, ``"stars"``, ``"liked"``
    and ``"bias"`` to numbers.
    """
    cutoff = datetime.date(2012 if is_train else 2013, 1, 1)
    review = reviews[0]
    days = (cutoff - get_date(review)).days + 30

    # Multiplying by constants is a hacky way of normalizing the features.
    feats = {
        "age": 50.0 / days,
        "age_0.5": 10.0 / (days ** 0.5),
        "age_0.2": 3.0 / (days ** 0.2),
    }

    rating = int(review["stars"])
    feats.update({
        "stars": rating / 5.0,
        "liked": int(rating > 3),
        "bias": 1.0,
    })
    return feats
def run_random_walks(data_dir, weight_edges=False):
    """Score every example pair with a random walk and write the result as JSON.

    Loads ``examples.json`` and ``graph.txt`` from ``./data/<data_dir>/``,
    builds a row-normalized transition matrix from the graph, and replaces
    each ``examples[u][b]`` value with entry ``b`` of the dense result of
    ``run_random_walk(transition_matrix, int(u), 10)``.

    Args:
        data_dir: Name of the sub-directory of ``./data/`` to read from and
            write to.  ``'train'`` selects the 2012-01-01 cutoff for edge
            weights; any other value selects 2013-01-01.
        weight_edges: When true, ``review.json`` is loaded and each edge
            gets the weight ``1 / (days from review date to cutoff + 90)``,
            and the output file is ``weighted_random_walks.json`` instead
            of ``random_walks.json``.
    """
    # Parenthesised single-string print behaves the same on Python 2 and 3.
    print("Loading data and building transition matrix...")
    base_dir = './data/' + data_dir
    examples = util.load_json(base_dir + '/examples.json')
    G = nx.read_edgelist(base_dir + '/graph.txt', nodetype=int)
    if weight_edges:
        reviews = util.load_json(base_dir + '/review.json')
        end_date = datetime.date(2012, 1, 1) if data_dir == 'train' else datetime.date(2013, 1, 1)
        edges = G.edges()
        for e in util.logged_loop(edges, util.LoopLogger(20000, len(edges), True)):
            n1, n2 = str(e[0]), str(e[1])
            # The review may be stored under either endpoint; fall back to
            # the reversed key order when it is not found as (n1, n2).
            if n1 not in reviews or n2 not in reviews[n1]:
                n1, n2 = n2, n1
            G[e[0]][e[1]]['weight'] = 1.0 / ((end_date - get_date(reviews[n1][n2][0])).days + 90)
        del reviews  # save some memory

    # The walk below starts at row int(u) and reads column int(b), so matrix
    # index i has to correspond to node id i.  Fix the ordering explicitly
    # rather than relying on the iteration order of G.nodes().
    # NOTE(review): this still assumes node ids are the contiguous integers
    # 0..n-1 -- confirm against graph.txt.
    adjacency_matrix = nx.adjacency_matrix(G, nodelist=sorted(G.nodes()))
    inverse_degree_matrix = sparse.diags([[1.0 / adjacency_matrix.getrow(i).sum()
                                           for i in range(adjacency_matrix.shape[0])]], [0])
    # Scaling each row by the inverse of its sum makes every row sum to one.
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print("Running random walks...")
    for u in util.logged_loop(examples, util.LoopLogger(10, len(examples), True)):
        p = run_random_walk(transition_matrix, int(u), 10).todense()
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]

    util.write_json(examples, base_dir
                    + ('/weighted_random_walks.json' if weight_edges else '/random_walks.json'))
# Example #4
# 0
def _weight_edges_by_review_age(G, data_dir):
    """Set the ``'weight'`` of every edge of ``G`` from its review date, in place.

    Loads ``./data/<data_dir>/review.json`` and assigns each edge the weight
    ``1 / (days from review date to cutoff + 90)``.  The cutoff is
    2012-01-01 when ``data_dir`` is ``'train'`` and 2013-01-01 otherwise.
    """
    reviews = util.load_json('./data/' + data_dir + '/review.json')
    if data_dir == 'train':
        end_date = datetime.date(2012, 1, 1)
    else:
        end_date = datetime.date(2013, 1, 1)

    edges = G.edges()
    progress = util.LoopLogger(20000, len(edges), True)
    for edge in util.logged_loop(edges, progress):
        src, dst = str(edge[0]), str(edge[1])
        # The review may be keyed under either endpoint; use the reversed
        # key order when (src, dst) is not present.
        if src not in reviews or dst not in reviews[src]:
            src, dst = dst, src
        age_days = (end_date - get_date(reviews[src][dst][0])).days
        G[edge[0]][edge[1]]['weight'] = 1.0 / (age_days + 90)


def run_random_walks(data_dir, weight_edges=False):
    """Run a random walk for each example source and save the pair scores.

    Reads ``examples.json`` and ``graph.txt`` under ``./data/<data_dir>/``,
    row-normalizes the graph's adjacency matrix into a transition matrix,
    and overwrites every ``examples[u][b]`` with entry ``b`` of the dense
    result of ``run_random_walk(transition_matrix, int(u), 10)``.

    Args:
        data_dir: Sub-directory of ``./data/`` used for all input and
            output files.
        weight_edges: When true, edges are weighted from ``review.json``
            before the matrix is built and the output is written to
            ``weighted_random_walks.json``; otherwise the graph is used as
            loaded and the output goes to ``random_walks.json``.
    """
    # Parenthesised single-string print behaves the same on Python 2 and 3.
    print("Loading data and building transition matrix...")
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)
    if weight_edges:
        # The loaded reviews are released when the helper returns.
        _weight_edges_by_review_age(G, data_dir)

    # Rows are later addressed by int(u) and columns by int(b), so matrix
    # index i must be node id i.  Order the matrix by sorted node id instead
    # of depending on whatever order G.nodes() happens to iterate in.
    # NOTE(review): assumes node ids are exactly 0..n-1 -- confirm against
    # the edge list.
    node_order = sorted(G.nodes())
    adjacency_matrix = nx.adjacency_matrix(G, nodelist=node_order)
    row_sums = [
        adjacency_matrix.getrow(i).sum()
        for i in range(adjacency_matrix.shape[0])
    ]
    inverse_degree_matrix = sparse.diags([[1.0 / total for total in row_sums]],
                                         [0])
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print("Running random walks...")
    for u in util.logged_loop(examples,
                              util.LoopLogger(10, len(examples), True)):
        p = run_random_walk(transition_matrix, int(u), 10).todense()
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]

    if weight_edges:
        out_name = '/weighted_random_walks.json'
    else:
        out_name = '/random_walks.json'
    util.write_json(examples, './data/' + data_dir + out_name)