def find_observed_prices(self, listings_frame):
    """
    Create 'observed' prices from sold listings that contain one product.

    Every row of `listings_frame` that is marked as sold (``sold == 1.0``)
    and whose ``products`` entry has exactly one element is converted
    into one price record. All other listings are ignored.

    Parameters
    ----------
    listings_frame : pd.DataFrame
        Listings to search. The columns "products", "sold", "price",
        "currency", "condition", "time" and "id" are read.

    Returns
    -------
    price_frame : pd.DataFrame
        One row per observed price, indexed by the price ID (the "id"
        column is kept as a regular column too). If no listing
        qualifies, an empty frame with the price columns is returned.

    Raises
    ------
    ValueError
        If two generated prices have the same ID
        (``verify_integrity=True``).
    """
    price_columns = [
        "price", "currency", "condition", "time", "product",
        "listing", "type", "avg_period", "avg_num_listings", "id",
    ]

    # Price data is first collected in a list of dicts, that is later
    # converted to a ``DataFrame``. Each dict is a row of the ``DataFrame``.
    price_data = []
    for _, listing in listings_frame.iterrows():
        # Select sold listings with only one product
        curr_prods = listing["products"]
        if len(curr_prods) != 1:
            continue
        if listing["sold"] != 1.0:
            continue

        single_price_data = {
            "price": listing["price"],
            "currency": listing["currency"],
            "condition": listing["condition"],
            "time": listing["time"],
            "product": curr_prods[0],
            "listing": listing["id"],
            "type": "observed",
            "avg_period": "none",
            "avg_num_listings": 1,
        }
        # The ID is computed from the record before it contains an "id".
        single_price_data["id"] = make_price_id(single_price_data)
        price_data.append(single_price_data)

    if price_data:
        price_frame = pd.DataFrame(price_data)
    else:
        # ``pd.DataFrame([])`` has no columns at all, so ``set_index("id")``
        # would raise ``KeyError``. Create the columns explicitly instead.
        price_frame = pd.DataFrame(columns=price_columns)

    price_frame.set_index("id", drop=False, inplace=True,
                          verify_integrity=True)
    return price_frame
def create_prices_lstsq_soln(
    self, matrix, listing_prices, listing_ids, product_prices, product_ids,
    good_rows, good_cols, listings=None
):
    """
    Create product prices from the results of the linear least square
    algorithm.

    Two kinds of prices are created:

    * One "average" price for every product whose price could be
      computed (see `good_cols`).
    * One price for each such product in each listing. The listing's
      price is distributed among its products in proportion to
      ``condition * average price``. These prices have type "observed"
      when the listing contains a single product, "estimated" otherwise.

    Parameters
    ----------
    matrix : np.array[float]
        System matrix of linear least square problem. Each row represents
        one listing. Each column represents one product. Each entry
        represents the condition of a product in a listing.
        Conditions range from 1...0.; 1: new, 0.7: used, 0: unusable.

    listing_prices : np.array[float]
        Prices of listings, constant (known) term of equation system.

    listing_ids : np.array[basestring]
        Listing ID of each matrix's row.

    product_prices : np.array[float]
        Average price of each product. The solution of the equation system.

    product_ids : np.array[basestring]
        IDs of the products, represented by elements of `product_prices`
        and columns of `matrix`.

    good_rows : np.array[bool]
        Where `True` listings contain only products whose prices could be
        computed by the solution algorithm.
        NOTE: this argument is currently not read by this method.

    good_cols : np.array[bool]
        Where `True` prices could be computed by least square algorithm.

    listings : pd.DataFrame
        The listings from which the system of equations was generated.
        Will usually contain additional listings. Currency and time of
        each listing are looked up here by listing ID. Can be `None`
        (for easier testing); then placeholder values are used.

    Returns
    -------
    prices : pd.DataFrame
        The computed prices as a `pd.DataFrame`, indexed by price ID.

    Raises
    ------
    AssertionError
        If the dimensions of `matrix` don't match the lengths of the
        listing or product arrays.
    ValueError
        If two generated prices have the same ID.
    """
    assert matrix.shape[0] == len(listing_prices) == len(listing_ids)
    assert matrix.shape[1] == len(product_prices) == len(product_ids)

    # Indexes of products with usable prices. A set of Python ints gives
    # O(1) membership tests in the loops below.
    good_prod_idxs = set(np.argwhere(good_cols)[:, 0].tolist())

    prices = make_price_frame(0)

    # Create the average prices ------------------------------------------
    # Price data is first collected in a list of dicts, that is later
    # converted to a ``DataFrame``. Each dict is a row of the ``DataFrame``.
    price_data = []
    for iprod, product_price in enumerate(product_prices):
        if iprod not in good_prod_idxs:
            continue

        single_price_data = {}
        # Multiply with condition, solver returns prices for condition "new".
        single_price_data["price"] = product_price * self.default_condition
        single_price_data["currency"] = self.default_currency
        single_price_data["condition"] = self.default_condition
        single_price_data["time"] = self.average_mid_time
        single_price_data["product"] = product_ids[iprod]
        single_price_data["listing"] = u"{}-average".format(
            self.average_mid_time)
        single_price_data["type"] = "average"
        single_price_data["avg_period"] = self.avg_period
        # Get number of listings that were used for this average price, from
        # the system matrix. Count the positive entries in the price's
        # column; the comparison is `False` for NaN, so NaNs are not counted.
        prod_col = np.where(matrix[:, iprod] > 0, 1, 0)
        single_price_data["avg_num_listings"] = np.sum(prod_col)
        single_price_data["id"] = make_price_id(single_price_data)
        price_data.append(single_price_data)

    avg_prices = pd.DataFrame(price_data)
    # `pd.concat` replaces `DataFrame.append`, which was removed from pandas.
    prices = pd.concat([prices, avg_prices], ignore_index=True)

    # Create prices for each item of each listing ------------------------
    # Protect against prices that are NaN
    good_prod_prices = np.where(np.isnan(product_prices), 0, product_prices)

    # Price data is first collected in list of dicts. Each dict is a price.
    price_data = []
    for ilist, listing_price in enumerate(listing_prices):
        # Each row of `matrix` represents a listing
        row = matrix[ilist, :]

        # Compute percentage of each product on total listing price
        # from average prices.
        # NOTE(review): if the virtual prices sum to 0 (or contain NaN)
        # the percentages are NaN/inf - confirm this can't reach the output.
        virt_prod_prices = row * good_prod_prices
        list_prod_percent = virt_prod_prices / virt_prod_prices.sum()
        # Compute price of each item in listing based on these percentages
        list_prod_prices = list_prod_percent * listing_price

        list_id = listing_ids[ilist]
        # `listings` data frame can be `None` for more easy testing.
        if listings is not None:
            # Label based lookup; `.loc` replaces the removed `.ix`.
            list_currency = listings.loc[list_id, "currency"]
            list_time = listings.loc[list_id, "time"]
        else:
            list_currency = "Unknown Currency"
            list_time = datetime(2000, 1, 1)

        # A listing with a single product directly shows the product's price.
        prod_idxs = np.argwhere(row > 0)[:, 0]
        if len(prod_idxs) == 1:
            price_type = "observed"
            avg_period = "none"
        else:
            price_type = "estimated"
            avg_period = self.avg_period

        # Create a price record for each of the estimated product prices
        for iprod in prod_idxs:
            if iprod not in good_prod_idxs:
                continue

            single_price_data = {}
            single_price_data["price"] = list_prod_prices[iprod]
            single_price_data["currency"] = list_currency
            single_price_data["condition"] = row[iprod]
            single_price_data["time"] = list_time
            single_price_data["product"] = product_ids[iprod]
            single_price_data["listing"] = list_id
            single_price_data["type"] = price_type
            single_price_data["avg_period"] = avg_period
            # TODO: Better algorithm, analogous to algorithm above for
            #       average prices.
            single_price_data["avg_num_listings"] = len(listing_prices)
            single_price_data["id"] = make_price_id(single_price_data)
            price_data.append(single_price_data)

    list_prices = pd.DataFrame(price_data)
    prices = pd.concat([prices, list_prices], ignore_index=True)

    prices.set_index("id", drop=False, inplace=True, verify_integrity=True)
    return prices