def runForK(k):
    dprint("Running for k: ", k)
    with timer:
        predicted = predict_buyers_for_products(B, testProducts,
                                                weights.cutOffK(w, k), m)
        scores = validate_buyers_for_products(B_test, predicted, all_c)
        #return tuple(np.average(list(scores), axis=0))
        return list(scores)
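# A minimal usage sketch (an assumption, not from the source): sweep runForK
# over a range of k values the way the other experiments in this module do.
# K and printResults are assumed module-level names.
results = {k: runForK(k) for k in map(int, np.linspace(100, 0, K))}
printResults(results)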
def predict_buyers_mining(testProducts, krange=[0], method=linear_model.LinearRegression):
    i = 0
    # Build the buyer x product count matrices for train and test sets
    Xy = []
    for buyer, prts in buyer_product_count.items():
        products_count = [int(prts[p]) for p in products]
        Xy.append(products_count)
    Xytest = []
    for buyer, prts in buyer_product_test_count.items():
        products_count = [int(prts[p]) for p in products]
        Xytest.append(products_count)
    Xy = np.array(Xy)
    Xytest = np.array(Xytest)
    for product in testProducts:
        dprint(i / len(testProducts))
        i += 1
        # Train: predict the target product's column from all other columns
        idx = products.index(product)
        X = np.delete(Xy, idx, 1)
        y = Xy[:, idx]
        if method == KNeighborsRegressor:
            l = method(n_neighbors=3)
        elif method == 1:
            # Sentinel value 1 selects the polynomial-ridge pipeline
            l = make_pipeline(PolynomialFeatures(5), Ridge())
        else:
            l = method()
        if method == LogisticRegression:
            # No positive samples: nothing to learn, yield empty predictions
            if sum(y > 0) <= 0:
                for k in krange:
                    yield (k, product, [])
                continue
            l.fit(X, y > 0)
        else:
            l.fit(X, y)
        Xtest = np.delete(Xytest, idx, 1)
        predicted = l.predict(Xtest)
        # Threshold predictions at every requested cut-off k
        for k in krange:
            potentialBuyers = np.array(list(
                buyer_product_test_count.keys()))[predicted >= k]
            yield (k, product, potentialBuyers)
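# Usage sketch (assumed surrounding names, not from the source): threshold the
# regressed per-buyer quantities at several cut-offs in one pass.
for k, product, potential in predict_buyers_mining(testProducts, krange=[1, 2, 3]):
    dprint(product, 'k=%d' % k, len(potential), 'candidate buyers')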
def construct_relation_graph(B, set=0, weight_fn=lambda i1, i2, w: len(w), name=""):
    """ Constructs a one-mode relation graph over one side of bipartite graph B"""
    # Cache key derived from the graph's info string
    hash = hashlib.md5(nx.info(B).encode('utf-8')).hexdigest()
    filename = 'graph_' + hash + '_' + name + '.pkl'
    # Cache loading is currently disabled by the trailing `and False`
    if os.path.isfile('cache/' + filename) and False:
        with open('cache/' + filename, 'rb') as f:
            dprint("Relation graph loaded...")
            return pickle.load(f)
    dprint("Constructing relation graph...")
    # Get buyers and products
    sets = nx.bipartite.sets(B)
    # Get all combinations between (0 - buyers, 1 - products) set items
    combinations = itertools.combinations(sets[set], 2)
    # Start from the chosen side's actual nodes; nx.empty_graph(n) would
    # create spurious integer nodes 0..n-1 instead of the real node labels
    G = nx.Graph()
    G.add_nodes_from(sets[set])
    # Construct edges weighted by the common neighbors of each pair in B
    edges = [(i1, i2, weight_fn(i1, i2, list(nx.common_neighbors(B, i1, i2))))
             for (i1, i2) in combinations]
    # Add edges to graph, skipping pairs the weight function rejected
    G.add_weighted_edges_from([edge for edge in edges if edge[2] is not None])
    dprint("Relation graph constructed")
    # Save to cache
    with open('cache/' + filename, 'wb') as f:
        pickle.dump(G, f)
    return G
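# Hypothetical alternative weight function for construct_relation_graph
# (illustrative, not from the source): the default counts common neighbors;
# this Jaccard-style variant normalizes by the combined neighborhood size.
def jaccard_weight(B):
    def weight(i1, i2, common):
        union = set(B[i1]) | set(B[i2])
        return len(common) / len(union) if union else None
    return weight

# G_products = construct_relation_graph(B, set=1, weight_fn=jaccard_weight(B), name='jaccard')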
def construct_bi_graph_buyer_product(orders, name='Bipartite'):
    """ Constructs bipartite graph of buyers and products"""
    G = nx.Graph()
    G.name = name
    dprint("Constructing bipartite graph...")
    buyers = set([order['buyer'] for order in orders])
    products = set([order['product'] for order in orders])
    # Add nodes to bipartite graph
    G.add_nodes_from(buyers, bipartite=0)
    G.add_nodes_from(products, bipartite=1)
    # Loop through orders and accumulate edge weights by ordered quantity
    edges = defaultdict(int)
    for idx, order in enumerate(orders):
        edges[(order['buyer'], order['product'])] += order['quantity']  #* (idx / len(orders))
    G.add_weighted_edges_from([(b, p, w) for ((b, p), w) in edges.items()])
    if nx.is_bipartite(G):
        dprint('Bipartite is constructed')
    else:
        dprint('Error: constructed graph is not bipartite')
        exit(-1)
    return G
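# Minimal self-contained check of the construction (toy data, assuming dprint
# is this module's debug-print helper; field names match those used above).
toy_orders = [
    {'buyer': 'b1', 'product': 'p1', 'quantity': 2},
    {'buyer': 'b2', 'product': 'p1', 'quantity': 1},
    {'buyer': 'b2', 'product': 'p1', 'quantity': 3},  # repeat orders accumulate
]
T = construct_bi_graph_buyer_product(toy_orders, name='Toy')
assert T['b2']['p1']['weight'] == 4  # quantities summed over repeat orders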
def load_orders_from_db():
    """ Loads orders """
    # Create mysql connection
    db = pymysql.connect(
        host=config.MYSQL_CONFIG['host'],  # your host, usually localhost
        user=config.MYSQL_CONFIG['user'],  # your username
        passwd=config.MYSQL_CONFIG['password'],  # your password
        db=config.MYSQL_CONFIG['dbname'])  # name of the database
    with db.cursor() as cur:
        dprint("Fetching buyers...")
        # Execute sql query which gets all buyers with more than 7 orders
        cur.execute(
            "SELECT mb.orders.customer_email FROM mb.orders WHERE orders.customer_email IS NOT NULL GROUP BY orders.customer_email HAVING COUNT(*) > 7;"
        )
        # Get all distinct buyers
        buyers = set(map(lambda x: x[0], cur.fetchall()))
        dprint("Querying orders...")
        # Build the IN (...) placeholder list for the buyers
        format_strings = ','.join(['%s'] * len(buyers))
        # Credentials to exclude (everything outside marketplace 1)
        cur.execute(
            "SELECT id FROM user_amazon_credentials WHERE marketplace_id != 1")
        credentials = ','.join(
            map(str, np.ndarray.flatten(np.array(cur.fetchall()))))
        # Get all ordered products with buyer, seller, rating, promotion, quantity
        #AVG(IF(ISNULL(feedback.rating),0,feedback.rating)) GROUP_BY order_items.id
        cur.execute(
            "SELECT orders.credential_id, customer_email, order_items.product_id, AVG(reviews.rating), promotion_id, IFNULL(quantity_ordered, 1) "
            "FROM orders "
            "INNER JOIN order_items ON orders.id = order_items.order_id "
            "LEFT JOIN mb.order_item_promotion ON mb.order_items.id = mb.order_item_promotion.order_item_id "
            "LEFT JOIN reviews ON reviews.product_id = order_items.product_id "
            "WHERE orders.customer_email IN (%s) AND orders.credential_id NOT IN (%s) "
            "GROUP BY order_items.id ORDER BY orders.status_shipped_at"
            % (format_strings, credentials), tuple(buyers))
        # Orders to list; buyer emails are hashed before leaving this layer
        orders = list(
            map(
                lambda order: {
                    'seller': order[0],
                    'buyer': hashlib.md5(order[1].encode('utf-8')).hexdigest(),
                    'product': order[2],
                    'rating': order[3],
                    'promotion': order[4],
                    'quantity': order[5]
                },
                filter(lambda order: order[1] is not None, cur.fetchall())))
        # Print orders stat info
        dprint("Orders count: ", len(orders))
        #dprint("Buyers count: ", len(buyers))
    return orders
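# Buyer emails never leave the database layer in the clear: each is replaced
# by its MD5 digest. A standalone illustration (example address, not real data):
import hashlib
buyer_id = hashlib.md5('buyer@example.com'.encode('utf-8')).hexdigest()
# The same input always yields the same 32-hex-character pseudonym.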
B_test = graph.construct_bi_graph_buyer_product(test)
testBuyers, testProducts = nx.bipartite.sets(B_test)
timer = MeasureTimer()


def predict_random_buyers(testProducts, k):
    # Random baseline: pick k% of all buyers uniformly for every product
    for product in testProducts:
        by = np.random.choice(list(buyers), int(all_c * k / 100), replace=False)
        yield (product, by)


results = {}
for k in map(int, np.linspace(100, 0, K)):
    dprint("Running for k: ", k)
    with timer:
        predicted = predict_random_buyers(testProducts, k)
        scores = validate_buyers_for_products(B_test, predicted, all_c)
        #results[k] = tuple(np.average(list(scores), axis=0))
        results[k] = list(scores)
if saveto:
    with open(saveto, 'wb') as f:
        pickle.dump(Result(results, timer), f)
printResults(results)
print("Average time %s" % timer.getAverage())
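# For reproducible baseline runs the sampling could be seeded (an assumption,
# not in the source):
rng = np.random.default_rng(42)


def predict_random_buyers_seeded(testProducts, k):
    for product in testProducts:
        yield (product, rng.choice(list(buyers), int(all_c * k / 100), replace=False))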
        yield (buyer, set(products))


results = [{}, {}]
for mi, m in enumerate([methodIntersect, methodUnion]):
    for k2 in range(0, K):
        with timer:
            predictedProducts = predict_products_for_buyers(
                B, testBuyers, weights.cutOffK(weights.simple_weight(), k2))
            buyer_products = {
                buyer: products
                for buyer, products in predictedProducts
            }
            for k in map(int, np.linspace(230, 240, K)):
                dprint("Running for k: ", k, k2)
                predictedBuyers = predict_buyers_for_products(
                    B, testProducts,
                    weights.cutOffK(weights.bipartite_products_weights(B), k))
                product_buyers = {
                    product: buyers
                    for product, buyers in predictedBuyers
                }
                # Combine the two prediction directions with the current strategy
                predicted = list(m(buyer_products, product_buyers))
                #scores = validate_buyers_for_products(B_test, predicted, all_c)
                scores = validate_products_for_buyers(B_test, predicted, all_c)
                #results[(k, k2)] = tuple(np.average(list(scores), axis=0))
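# methodIntersect / methodUnion are referenced above but not shown in this
# snippet; hypothetical sketches of the two combination strategies
# (assumptions, not the source's code):
def methodIntersect(buyer_products, product_buyers):
    for buyer, prods in buyer_products.items():
        # keep a product only if its buyer prediction also names this buyer
        yield (buyer, {p for p in prods if buyer in product_buyers.get(p, ())})


def methodUnion(buyer_products, product_buyers):
    for buyer, prods in buyer_products.items():
        extra = {p for p, bs in product_buyers.items() if buyer in bs}
        yield (buyer, set(prods) | extra)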
def get_additional_data_from_amz(orders):
    # Create mysql connection
    db = pymysql.connect(
        host=config.MYSQL_CONFIG['host'],  # your host, usually localhost
        user=config.MYSQL_CONFIG['user'],  # your username
        passwd=config.MYSQL_CONFIG['password'],  # your password
        db=config.MYSQL_CONFIG['dbname'])  # name of the database

    with db.cursor() as cur:
        # Fetch (id, asin) pairs for every product referenced by the orders
        querylist = ','.join([str(order['product']) for order in orders])
        cur.execute("SELECT id, asin FROM products WHERE id IN (%s)" % querylist)
        #products = {id: asin for (id, asin) in cur.fetchall()}
        products = list(cur.fetchall())

    # Setup amazon api
    amazon = AmazonAPI(config.AWS_CONFIG['key'], config.AWS_CONFIG['secret'],
                       config.AWS_CONFIG['tag'])

    def groupBy(input, n):
        return [input[i:i + n] for i in range(0, len(input), n)]

    def getInfo(products):
        # Group products by 5 (limit of bulk action on amazon api)
        groups = groupBy(products, 5)
        p = 0
        for group in groups:
            dprint('Progress... %s' % (p / len(groups) * 100))
            ok = False
            # Retry loop (try until the lookup succeeds, i.e. no more throttling)
            while not ok:
                try:
                    # Create product lookup
                    res = amazon.lookup_bulk(
                        ItemId=', '.join([asin for (_, asin) in group]))
                    ok = True
                except HTTPError as e:
                    dprint('Throttling... waiting...', e)
                    time.sleep(5)
            p += 1
            # Yield returned data
            for ((id, asin), r) in zip(group, res):
                yield {
                    'id': id,
                    'asin': asin,
                    'category': r.product_type_name,
                    'price': r.list_price[0],
                    'rank': r.sales_rank
                }

    # Get additional information for products
    productsinfo = getInfo(products)
    # Create dictionary for easier merging of orders
    resdict = {
        pi['id']: {k: p for k, p in pi.items() if k != 'id'}
        for pi in list(productsinfo)
    }
    # Merge old orders with the new product data
    orders_adi = [{
        **order,
        **resdict[order['product']]
    } for order in orders if order['product'] in resdict]
    dprint('Progress... 100%')
    return orders_adi
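# The retry loop above sleeps a fixed 5 s whenever Amazon throttles. An
# exponential-backoff variant (an alternative sketch, not the source's
# approach; uses the same lookup_bulk and HTTPError as above):
def lookup_with_backoff(amazon, item_ids, max_retries=6, base_delay=1.0):
    delay = base_delay
    for attempt in range(max_retries):
        try:
            return amazon.lookup_bulk(ItemId=','.join(item_ids))
        except HTTPError as e:
            dprint('Throttled (attempt %d), waiting %.1fs' % (attempt + 1, delay), e)
            time.sleep(delay)
            delay *= 2  # double the wait after each throttled attempt
    raise RuntimeError('Amazon lookup still throttled after %d retries' % max_retries)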