예제 #1
0
    def getInfo(products):
        """Yield one info dict per product, looked up on Amazon in batches.

        ``products`` is split into batches of five with ``groupBy`` (the
        bulk-lookup limit noted by the original author). A batch is retried
        every 5 seconds for as long as ``amazon.lookup_bulk`` raises
        ``HTTPError``; the number of retries is unbounded.

        Each yielded dict has the keys ``id``, ``asin``, ``category``,
        ``price`` and ``rank``. Batch entries are paired with the lookup
        results purely by position.
        """
        batches = groupBy(products, 5)
        for done, batch in enumerate(batches):
            dprint('Progress... %s' % (done / len(batches) * 100))

            # Retry the same batch until the lookup stops raising HTTPError
            # (which the original author treats as throttling).
            while True:
                try:
                    item_ids = ', '.join([asin for (_, asin) in batch])
                    found = amazon.lookup_bulk(ItemId=item_ids)
                except HTTPError as err:
                    dprint('Throttling... waiting...', err)
                    time.sleep(5)
                else:
                    break

            # Pair every (id, asin) of the batch with its lookup result
            for (pid, asin), item in zip(batch, found):
                yield {
                    'id': pid,
                    'asin': asin,
                    'category': item.product_type_name,
                    'price': item.list_price[0],
                    'rank': item.sales_rank
                }
예제 #2
0
        def runForK(k):
            """Predict buyers for the test products at cut-off ``k``.

            The prediction call runs inside the shared ``timer`` context;
            the validation scores are returned as a list.
            """
            dprint("Running for k: ", k)
            with timer:
                cut_weights = weights.cutOffK(w, k)
                buyers_per_product = predict_buyers_for_products(
                    B, testProducts, cut_weights, m)

            # NOTE(review): if predict_buyers_for_products is a generator,
            # the timer above only covers its creation - confirm.
            return list(
                validate_buyers_for_products(B_test, buyers_per_product,
                                             all_c))
예제 #3
0
def predict_buyers_mining(testProducts,
                          krange=(0,),
                          method=linear_model.LinearRegression):
    """Predict potential buyers of each test product with a fitted model.

    For every product in ``testProducts`` a model is fitted on the training
    purchase counts (module-level ``buyer_product_count``) with that
    product's column as the target and all other product columns as
    features. The model then predicts on the test counts (module-level
    ``buyer_product_test_count``).

    Args:
        testProducts: Sized iterable of products; each one must be present
            in the module-level ``products`` sequence.
        krange: Thresholds applied to the predictions. It is copied into a
            list up front, so one-shot iterators are supported as well.
        method: Model selector. ``KNeighborsRegressor`` is built with
            ``n_neighbors=3``, the value ``1`` selects a degree-5
            polynomial ``Ridge`` pipeline, and anything else is called
            with no arguments.

    Yields:
        ``(k, product, potentialBuyers)`` for every product and every
        ``k``, where ``potentialBuyers`` holds the test buyers whose
        prediction is ``>= k``. With ``LogisticRegression`` and no positive
        training target, an empty list is yielded for each ``k`` instead.
    """
    # krange is walked once per product, so it must be re-iterable.
    krange = list(krange)

    # One row per buyer, one column per entry of the global ``products``.
    Xy = np.array([[int(prts[p]) for p in products]
                   for prts in buyer_product_count.values()])
    Xytest = np.array([[int(prts[p]) for p in products]
                       for prts in buyer_product_test_count.values()])

    # Same order as the rows of Xytest; built once instead of once per
    # (product, k) pair.
    test_buyers = np.array(list(buyer_product_test_count.keys()))

    total = len(testProducts)
    for i, product in enumerate(testProducts):
        dprint(i / total)

        # Train: the product's own column is the target, the rest are
        # the features.
        idx = products.index(product)
        X = np.delete(Xy, idx, 1)
        y = Xy[:, idx]

        if KNeighborsRegressor == method:
            model = method(n_neighbors=3)
        elif method == 1:
            model = make_pipeline(PolynomialFeatures(5), Ridge())
        else:
            model = method()

        if method == LogisticRegression:
            bought = y > 0
            if not bought.any():
                # No buyer bought this product in training: nothing to fit.
                for k in krange:
                    yield (k, product, [])
                continue
            model.fit(X, bought)
        else:
            model.fit(X, y)

        predicted = model.predict(np.delete(Xytest, idx, 1))

        for k in krange:
            yield (k, product, test_buyers[predicted >= k])
예제 #4
0
def construct_relation_graph(B,
                             set=0,
                             weight_fn=lambda i1, i2, w: len(w),
                             name=""):
    """Construct a weighted relation graph over one side of bipartite ``B``.

    Every pair of nodes from the selected node set is passed to
    ``weight_fn`` together with the list of their common neighbours in
    ``B``; pairs for which ``weight_fn`` returns ``None`` get no edge.

    Args:
        B: Bipartite networkx graph.
        set: Index into ``nx.bipartite.sets(B)`` (0 - buyers, 1 - products).
        weight_fn: Callable ``(i1, i2, common_neighbours) -> weight``.
        name: Suffix used in the cache file name.

    Returns:
        The constructed graph, which is also pickled to
        ``cache/graph_<md5>_<name>.pkl``.
    """
    digest = hashlib.md5(nx.info(B).encode('utf-8')).hexdigest()
    filename = 'graph_' + digest + '_' + name + '.pkl'

    # Reading from the cache is switched off by the trailing ``and False``;
    # that is kept as found. NOTE(review): if it is re-enabled, only load
    # pickles this program wrote itself - unpickling runs arbitrary code.
    if os.path.isfile('cache/' + filename) and False:
        with open('cache/' + filename, 'rb') as f:
            dprint("Relation graph loaded...")
            return pickle.load(f)

    dprint("Constructing relation graph...")
    # Get buyers and products
    sets = nx.bipartite.sets(B)
    members = sets[set]

    # Get all combinations between (0 - buyers, 1 - products) set items
    combinations = itertools.combinations(members, 2)

    # NOTE(review): an integer argument presumably creates nodes
    # 0..n-1 rather than the members themselves - confirm this is intended.
    G = nx.empty_graph(len(members))

    # Construct edges with weights
    edges = [(i1, i2, weight_fn(i1, i2, list(nx.common_neighbors(B, i1, i2))))
             for (i1, i2) in combinations]

    # Add edges to graph, skipping pairs without a weight
    G.add_weighted_edges_from([edge for edge in edges if edge[2] is not None])

    dprint("Relation graph constructed")

    # Save to cache; create the directory first so the write cannot fail
    # just because ``cache/`` does not exist yet.
    os.makedirs('cache', exist_ok=True)
    with open('cache/' + filename, 'wb') as f:
        pickle.dump(G, f)
    return G
예제 #5
0
def construct_bi_graph_buyer_product(orders, name='Bipartite'):
    """Build the weighted buyer/product graph described by ``orders``.

    Buyers become nodes tagged ``bipartite=0`` and products nodes tagged
    ``bipartite=1``. The weight of a buyer-product edge is the summed
    ``quantity`` of all orders for that pair. The process exits with
    status -1 if the resulting graph is not bipartite.
    """
    graph = nx.Graph()
    graph.name = name
    dprint("Constructing bipartite graph...")

    # Register both node sets
    graph.add_nodes_from({order['buyer'] for order in orders}, bipartite=0)
    graph.add_nodes_from({order['product'] for order in orders}, bipartite=1)

    # Sum the ordered quantity per (buyer, product) pair
    quantity_by_pair = defaultdict(int)
    for order in orders:
        pair = (order['buyer'], order['product'])
        quantity_by_pair[pair] += order['quantity']

    graph.add_weighted_edges_from(
        (buyer, product, quantity)
        for (buyer, product), quantity in quantity_by_pair.items())

    if not nx.is_bipartite(graph):
        dprint('Error not bipartite graph')
        exit(-1)

    dprint('Bipartite is constructed')
    return graph
예제 #6
0
def load_orders_from_db():
    """Load ordered items of frequent buyers from MySQL.

    Buyers are the customer e-mails that appear on more than 7 orders.
    Orders whose credential belongs to ``user_amazon_credentials`` rows
    with ``marketplace_id != 1`` are excluded.

    Returns:
        A list of dicts with the keys ``seller``, ``buyer`` (MD5 hex digest
        of the customer e-mail), ``product``, ``rating``, ``promotion`` and
        ``quantity``. The list is empty when no buyer qualifies.
    """
    # Create mysql connection
    db = pymysql.connect(
        host=config.MYSQL_CONFIG['host'],  # your host, usually localhost
        user=config.MYSQL_CONFIG['user'],  # your username
        passwd=config.MYSQL_CONFIG['password'],  # your password
        db=config.MYSQL_CONFIG['dbname'])  # name of the data base

    try:
        with db.cursor() as cur:
            dprint("Fetching buyers...")
            # Buyers with more than 7 orders
            cur.execute(
                "SELECT mb.orders.customer_email FROM mb.orders WHERE orders.customer_email IS NOT NULL GROUP BY orders.customer_email HAVING COUNT(*) > 7;"
            )

            # Get all distinct buyers
            buyers = {row[0] for row in cur.fetchall()}
            if not buyers:
                # An empty ``IN ()`` list is not valid SQL, and no order
                # could match anyway.
                dprint("Orders count: ", 0)
                return []

            dprint("Querying orders...")
            cur.execute(
                "SELECT id FROM user_amazon_credentials WHERE marketplace_id != 1")
            credentials = [row[0] for row in cur.fetchall()]

            # Every value is passed as a bound parameter; only the
            # placeholder lists are formatted into the statement.
            query = (
                "SELECT orders.credential_id, customer_email, order_items.product_id, AVG(reviews.rating), promotion_id, IFNULL(quantity_ordered, 1) FROM orders INNER JOIN order_items ON orders.id = order_items.order_id LEFT JOIN mb.order_item_promotion ON mb.order_items.id = mb.order_item_promotion.order_item_id LEFT JOIN reviews ON reviews.product_id = order_items.product_id WHERE orders.customer_email IN (%s)"
                % ','.join(['%s'] * len(buyers)))
            params = list(buyers)
            if credentials:
                # Leave the clause out when nothing is excluded, so the
                # statement never contains an empty ``NOT IN ()``.
                query += (" AND orders.credential_id NOT IN (%s)" %
                          ','.join(['%s'] * len(credentials)))
                params.extend(credentials)
            query += " GROUP BY order_items.id ORDER BY orders.status_shipped_at"
            cur.execute(query, tuple(params))

            # Orders to list; rows without an e-mail are dropped and the
            # e-mail itself is replaced by its MD5 hex digest.
            orders = [{
                'seller': row[0],
                'buyer': hashlib.md5(row[1].encode('utf-8')).hexdigest(),
                'product': row[2],
                'rating': row[3],
                'promotion': row[4],
                'quantity': row[5]
            } for row in cur.fetchall() if row[1] is not None]

            # Print orders stat info
            dprint("Orders count: ", len(orders))

            return orders
    finally:
        # The connection was previously never closed.
        db.close()
예제 #7
0
# Build the buyer/product graph for the test orders and split it into its
# two node sets (presumably buyers first, products second - verify the
# order returned by nx.bipartite.sets).
B_test = graph.construct_bi_graph_buyer_product(test)
testBuyers, testProducts = nx.bipartite.sets(B_test)

# Shared timer; used as a context manager around each prediction run.
timer = MeasureTimer()


def predict_random_buyers(testProducts, k):
    """Yield ``(product, sampled_buyers)`` for every test product.

    Random baseline: for each product, ``int(all_c * k / 100)`` buyers are
    drawn without replacement from the module-level ``buyers``.

    Args:
        testProducts: Iterable of products to predict for.
        k: Percentage of ``all_c`` that determines the sample size.
    """
    # Both values are the same for every product, so compute them once.
    pool = list(buyers)
    sample_size = int(all_c * k / 100)

    for product in testProducts:
        yield (product, np.random.choice(pool, sample_size, replace=False))


# Evaluate the random baseline for K percentages from 100 down to 0.
results = {}
for k in map(int, np.linspace(100, 0, K)):
    dprint("Running for k: ", k)
    with timer:
        # predict_random_buyers is a generator function: materialise it
        # here so the sampling itself runs inside the timed block instead
        # of later, during validation.
        predicted = list(predict_random_buyers(testProducts, k))

    scores = validate_buyers_for_products(B_test, predicted, all_c)
    results[k] = list(scores)

# Optionally persist the raw scores together with the timer
if saveto:
    with open(saveto, 'wb') as f:
        pickle.dump(Result(results, timer), f)

printResults(results)

print("Average time %s" % timer.getAverage())
예제 #8
0
        yield (buyer, set(products))


# One result dict per combination method (presumably indexed by ``mi``;
# the visible part of the script never writes to it - TODO confirm).
results = [{}, {}]
for mi, m in enumerate([methodIntersect, methodUnion]):
    for k2 in range(0, K):
        # Everything below, including the inner loop over k, runs inside
        # a single timed block per k2.
        with timer:
            # Predicted products per test buyer, cut off at k2
            predictedProducts = predict_products_for_buyers(
                B, testBuyers, weights.cutOffK(weights.simple_weight(), k2))
            buyer_products = {
                buyer: products
                for buyer, products in predictedProducts
            }

            for k in map(int, np.linspace(230, 240, K)):
                dprint("Running for k: ", k, k2)

                # Predicted buyers per test product, cut off at k
                predictedBuyers = predict_buyers_for_products(
                    B, testProducts,
                    weights.cutOffK(weights.bipartite_products_weights(B), k))

                product_buyers = {
                    product: buyers
                    for product, buyers in predictedBuyers
                }

                # Combine both predictions with the current method
                predicted = list(m(buyer_products, product_buyers))

                #scores = validate_buyers_for_products(B_test, predicted, all_c)
                scores = validate_products_for_buyers(B_test, predicted, all_c)
                #results[(k, k2)] = tuple(np.average(list(scores), axis=0))
예제 #9
0
def get_additional_data_from_amz(orders):
    """Enrich orders with Amazon product data (category, price, rank).

    The ASIN of every ordered product is read from the ``products`` table,
    the products are looked up on Amazon in batches of five, and each
    order is merged with the data found for its product.

    Args:
        orders: Re-iterable collection of order dicts with a ``product``
            key.

    Returns:
        A list of order dicts extended with ``asin``, ``category``,
        ``price`` and ``rank``. Orders whose product yielded no data are
        dropped. An empty ``orders`` gives an empty list without touching
        the database.
    """
    # Distinct product ids in first-seen order: the IN list then holds one
    # entry per product instead of one per order.
    product_ids = list(dict.fromkeys(order['product'] for order in orders))
    if not product_ids:
        # An empty ``IN ()`` list is not valid SQL; nothing to enrich.
        dprint('Progress... 100%')
        return []

    # Create mysql connection
    db = pymysql.connect(
        host=config.MYSQL_CONFIG['host'],  # your host, usually localhost
        user=config.MYSQL_CONFIG['user'],  # your username
        passwd=config.MYSQL_CONFIG['password'],  # your password
        db=config.MYSQL_CONFIG['dbname'])  # name of the data base

    try:
        with db.cursor() as cur:
            # Ids are passed as bound parameters rather than formatted
            # into the statement.
            placeholders = ','.join(['%s'] * len(product_ids))
            cur.execute(
                "SELECT id, asin FROM products WHERE id IN (%s)" %
                placeholders, tuple(product_ids))
            products = list(cur.fetchall())
    finally:
        # The connection was previously never closed.
        db.close()

    # Setup amazon api
    amazon = AmazonAPI(config.AWS_CONFIG['key'], config.AWS_CONFIG['secret'],
                       config.AWS_CONFIG['tag'])

    def groupBy(input, n):
        """Split ``input`` into consecutive slices of at most ``n`` items."""
        return [input[i:i + n] for i in range(0, len(input), n)]

    def getInfo(products):
        """Yield one info dict per (id, asin) pair, looked up in batches."""
        # Group products by 5 (limit of bulk action on amazon api)
        groups = groupBy(products, 5)
        for p, group in enumerate(groups):
            dprint('Progress... %s' % (p / len(groups) * 100))

            # Retry until the lookup succeeds; an HTTPError is treated as
            # throttling. The number of retries is deliberately unbounded.
            while True:
                try:
                    res = amazon.lookup_bulk(
                        ItemId=', '.join([asin for (_, asin) in group]))
                    break
                except HTTPError as e:
                    dprint('Throttling... waiting...', e)
                    time.sleep(5)

            # Results are paired with the batch entries by position
            for ((id, asin), r) in zip(group, res):
                yield {
                    'id': id,
                    'asin': asin,
                    'category': r.product_type_name,
                    'price': r.list_price[0],
                    'rank': r.sales_rank
                }

    # Product id -> additional data (everything but the id itself)
    resdict = {
        info['id']: {k: v
                     for k, v in info.items() if k != 'id'}
        for info in getInfo(products)
    }

    # Merge old orders with new products
    orders_adi = [{
        **order,
        **resdict[order['product']]
    } for order in orders if order['product'] in resdict]

    dprint('Progress... 100%')
    return orders_adi