Пример #1
0
 def prices_single_prod(prod_id, base_val):
     """
     Create a frame of synthetic prices for a single product.

     The frame contains ``n_prices`` randomly generated prices, followed by
     monthly mean prices of the records that count as sold.

     Parameters
     ----------
     prod_id : str
         Product ID. Stored in column "product" and appended to every
         record ID.
     base_val : number
         Price level around which the random prices are scattered.

     Returns
     -------
     The price frame, indexed by column "id" (the column itself is kept).
     """
     noise = np.random.normal(size=(n_prices,))

     #One time stamp per day, each one shifted by a random amount.
     day_grid = pd.date_range(start="2000-1-1", periods=n_prices, freq="d")
     jitter = np.array([timedelta(r * 5) for r in noise])
     #The shift is applied to an object array of time stamps, not to the
     #``DatetimeIndex`` itself.
     #TODO: Report Pandas bug "Very odd behavior" ``times + times == times``
     times = np.array([pd.Timestamp(t) for t in day_grid]) + jitter

     #Record IDs are built from time stamp and product ID.
     ids = [str(t) + "-" + prod_id for t in times]

     #Linear term plus a sine term: the prices are random, but they
     #don't follow the normal distribution.
     linear_part = 7 * noise + base_val
     sine_part = np.sin(noise * 3) * 3
     prices = linear_part + sine_part

     #Price types: 20% "observed", 40% "estimated", the rest "notsold".
     draw = np.random.uniform(size=n_prices)
     types = np.empty(n_prices, dtype=object)
     types[:] = "notsold"
     types[(draw >= 0.2) & (draw < 0.6)] = "estimated"
     types[draw < 0.2] = "observed"

     #Guessed prices sit exactly on the base value.
     guessed_rows = [0, 50]
     types[guessed_rows] = "guessed"
     prices[guessed_rows] = base_val

     price_frame = make_price_frame(index=ids)
     columns = [("id", ids), ("time", times), ("price", prices),
                ("type", types), ("product", prod_id)]
     for col_name, col_values in columns:
         price_frame[col_name] = col_values

     #Monthly mean prices; only records of actually sold items count.
     not_sold = price_frame["type"].isin(["notsold", "guessed"])
     sold = price_frame[~not_sold]
     by_month = sold.set_index("time").groupby(pd.TimeGrouper(freq="M"))
     stats = by_month.aggregate(np.mean)
     #Move the time stamp of each mean 15 days back from the group label.
     stats["time"] = stats.index - timedelta(days=15)
     stats["id"] = [str(t) + "-mean-" + prod_id for t in stats["time"]]
     stats["product"] = prod_id
     stats["type"] = "average"

     combined = price_frame.append(stats, ignore_index=True)
     combined.set_index("id", drop=False, inplace=True)
     return combined
Пример #2
0
    def compute_prices(self, listings, products, time_start=None, time_end=None, avg_period="week"):
        """
        Compute prices from listings.

        The listings are chopped into consecutive averaging intervals, and
        each interval is processed separately. A linear least square method
        is used to compute the prices of items that are sold together with
        other items. This is equivalent to averaging the prices over the
        listings that were used to compute them.

        As a side effect the method stores the averaging period in
        ``self.avg_period`` and, for each interval, the middle of the
        interval in ``self.average_mid_time``.

        TODO: Delete old prices.

        Parameters
        ----------
        listings : pd.DataFrame
            The listings. The column "time" is used to sort the listings
            and to assign them to intervals.

        products : iterable
            Objects with a string attribute ``id``. Products whose ID starts
            with "xxx-unknown" are place holders, they are ignored.

        time_start : time stamp or None
            Start of the first interval. If ``None``: time of the earliest
            listing, rolled back with the interval's offset.

        time_end : time stamp or None
            End of the last interval. If ``None``: time of the latest
            listing, rolled forward with the interval's offset.

        avg_period : str
            Length of the averaging intervals. Only "week" is implemented.

        Returns
        -------
        The price frame created by ``make_price_frame``, with the prices of
        all intervals appended. The frame is returned without any appended
        prices if the product list is empty.

        Raises
        ------
        NotImplementedError
            If `avg_period` is not "week".
        """
        logging.info("Starting to compute prices...")

        if avg_period != "week":
            raise NotImplementedError(
                "Unsupported averaging period: {!r}".format(avg_period))
        offset = pd.datetools.Week(weekday=0)
        self.avg_period = "week"

        # If no start- or end-points are given, use start or end of the
        # listing sequence. The points are moved outwards, to include the
        # listings in incomplete periods at start and end of the sequence.
        if time_start is None:
            time_start = offset.rollback(listings["time"].min())
        if time_end is None:
            time_end = offset.rollforward(listings["time"].max())

        # Create start and end of desired intervals.
        intervals = pd.date_range(time_start, time_end, freq=offset)
        prices = make_price_frame(0)

        # Create list of product IDs. Exclude the place holders, they have
        # IDs that start with "xxx-unknown".
        product_ids = [p.id for p in products if not p.id.startswith("xxx-unknown")]
        if not product_ids:
            logging.error("Empty product list.")
            return prices

        # Chop listings into intervals and loop over them.
        listings = listings.sort("time")
        for intv_start, intv_end in zip(intervals[:-1], intervals[1:]):
            offset_mid = timedelta(seconds=(intv_end - intv_start).total_seconds() / 2)
            self.average_mid_time = intv_start + offset_mid

            in_interval = (listings["time"] >= intv_start) & (listings["time"] < intv_end)
            intv_listings = listings.ix[in_interval]
            logging.debug(
                "Interval start: {s}, end: {e}, n listings: {n}.".format(
                    s=intv_start, e=intv_end, n=len(intv_listings)
                )
            )
            if len(intv_listings) == 0:
                continue

            # The product IDs that are returned here describe the columns of
            # `matrix`. They get their own name, so that `product_ids` is
            # not overwritten: every interval must start from the complete
            # product list, not from the result of the previous interval.
            matrix, listing_prices, listing_ids, intv_product_ids = self.compute_product_occurrence_matrix(
                intv_listings, product_ids
            )
            if matrix.shape[0] == 0:
                logging.debug("No valid listing prices.")
                continue
            # The fourth return value (problematic products) is not needed.
            product_prices, good_rows, good_cols, _ = self.solve_prices_lstsq(
                matrix, listing_prices, listing_ids, intv_product_ids
            )
            intv_prices = self.create_prices_lstsq_soln(
                matrix, listing_prices, listing_ids, product_prices, intv_product_ids, good_rows, good_cols, listings
            )
            prices = prices.append(intv_prices)

        return prices
Пример #3
0
def create_models():
    """
    Create Qt model-view models for listings, products, tasks and prices.
    Additionally returns the related ``DataStore``.

    The ``DataStore`` is filled with a small set of example data: three
    listings, two search tasks, two products and three prices. Each model
    is connected to this ``DataStore``.

    A model that can't be created is replaced by ``None``; an error message
    and the traceback are printed. This way a single broken model doesn't
    break the tests of the other models.

    Returns
    -------
    listings_model, product_model, task_model, price_model, data_store
    """
    import traceback
    from clair.qtgui import TaskModel, ProductModel, ListingsModel, PriceModel
    from clair.coredata import Product, SearchTask, DataStore, \
                               make_listing_frame, make_price_frame

    fr = make_listing_frame(3)
    #All listings need unique ids
    fr["id"] = ["eb-110685959294", "eb-111014122908", "eb-457"]

    fr["training_sample"] = [1.0, 0.0, nan]
    fr["search_tasks"] = ["s-nikon-d90", "s-nikon-d70", "s-nikon-d90"]

    #NOTE(review): The single-cell assignments below use chained indexing
    #(``fr[column][row] = value``). This relies on ``fr[column]`` not being
    #a copy - confirm for the Pandas version in use.
    fr["expected_products"][0] = ["nikon-d90", "nikon-sb-24"]
    fr["expected_products"][1] = ["nikon-d70"]
    fr["expected_products"][2] = ["nikon-d90", "nikon-sb-24"]
    fr["products"] = [["nikon-d90"], ["nikon-d70"], ["nikon-d90"]]
    fr["products_absent"][0] = ["nikon-sb-24"]

    fr["thumbnail"][0] = "www.some.site/dir/to/thumb.pg"
    fr["image"][0] = "www.some.site/dir/to/img.pg"
    fr["title"] = [u"Nikon D90 super duper!", u"Süper Camera", None]
    fr["description"][0] = "Buy my old Nikon D90 camera <b>now</b>!"
    fr["prod_spec"][0] = {"Marke":"Nikon", "Modell":"D90"}
    fr["active"][0] = False
    fr["sold"] = [1., 1., 0.]
    fr["currency"][0] = "EUR"
    fr["price"] = [400., 150, 300]
    fr["shipping"][0] = 12.
    fr["type"][0] = "auction"
    fr["time"] = [datetime(2013,1,10), datetime(2013,2,2), datetime(2013,2,3)]
    fr["location"][0] = u"Köln"
    fr["postcode"][0] = u"50667"
    fr["country"][0] = "DE"
    fr["condition"][0] = 0.7
    fr["server"][0] = "Ebay-Germany"
    fr["server_id"] = ["110685959294", "111014122908", "457"] #ID of listing on server
    fr["final_price"][0] = True
    fr["url_webui"][0] = "www.some.site/dir/to/web-page.html"
    #The columns "data_directory" and "server_repr" are not set here.
    #Put our IDs into index
    fr.set_index("id", drop=False, inplace=True, verify_integrity=True)

    tasks = [SearchTask("s-nikon-d90", datetime(2000, 1, 1), "ebay-de",
                        "Nikon D90", "daily", 100, 150, 300, "EUR",
                        ["nikon-d90", "nikon-18-105-f/3.5-5.6--1"]),
             SearchTask("s-nikon-d70", datetime(2000, 1, 1), "ebay-de",
                        "Nikon D70", "daily", 100, 75, 150, "EUR",
                        ["nikon-d70", "nikon-18-105-f/3.5-5.6--1"]),]

    products = [Product("nikon-d90", "Nikon D90", "Nikon D90 DSLR camera.",
                        ["Nikon", "D 90"], ["photo.system.nikon.camera",
                                            "photo.camera.system.nikon"]),
                Product("nikon-d70", "Nikon D70", "Nikon D70 DSLR camera.",
                        ["Nikon", "D 70"], ["photo.system.nikon.camera",
                                            "photo.camera.system.nikon"])]

    pri = make_price_frame(3)
    pri["id"] = ["pri-123", "pri-456", "pri-457"]
    pri["price"] = [310., 150., 300.]
    pri["currency"] = ["EUR", "EUR", "EUR"]
    pri["condition"] = [0.7, 0.7, 0.7]
    pri["time"] = [datetime(2013,1,10), datetime(2013,2,2), datetime(2013,2,3)]
    pri["product"] = ["nikon-d90", "nikon-d70", "nikon-d90"]
    pri["listing"] = ["eb-123", "eb-456", "eb-457"]
    pri["type"] = ["observed", "observed", "observed"]
    pri["avg_period"] = None
    pri["avg_num_listings"] = None

    conf_dir = relative("../../example-data")
    data_store = DataStore(conf_dir, None)
    data_store.merge_listings(fr)
    data_store.set_products(products)
    data_store.add_tasks(tasks)
    data_store.merge_prices(pri)

    def init_model(model_class, name):
        """
        Create a model and connect it to ``data_store``.
        Returns ``None`` if the model can't be created or connected.
        """
        #The models are tested here, creating them may fail.
        #Don't break all tests, because a single model is broken.
        #``Exception`` is caught instead of everything, so that
        #``KeyboardInterrupt`` and ``SystemExit`` still stop the program.
        try:
            model = model_class()
            model.setDataStore(data_store)
        except Exception: #IGNORE:W0703
            #Single-argument call form: works as Python 2 statement too.
            print("Error! ``{}`` could not be initialized!".format(name))
            traceback.print_exc()
            return None
        return model

    listings_model = init_model(ListingsModel, "listings_model")
    task_model = init_model(TaskModel, "task_model")
    product_model = init_model(ProductModel, "product_model")
    price_model = init_model(PriceModel, "price_model")

    return listings_model, product_model, task_model, price_model, data_store
Пример #4
0
    def create_prices_lstsq_soln(
        self, matrix, listing_prices, listing_ids, product_prices, product_ids, good_rows, good_cols, listings=None
    ):
        """
        Create product prices from the results of the linear least
        square algorithm.

        Two kinds of prices are created:

        * One average price for each product whose price could be computed.
        * One price for each such product in each listing. The listing's
          price is divided among its products, in proportion to their
          average prices.

        The method reads ``self.default_condition``,
        ``self.default_currency``, ``self.average_mid_time`` and
        ``self.avg_period``.

        Parameters
        ----------
        matrix : np.array[float]
            System matrix of linear least square problem. Each row represents
            one listing. Each column represents one product. Each entry
            represents the condition of a product in a listing. Conditions
            range from 1...0.; 1: new, 0.7: used, 0: unusable.

        listing_prices : np.array[float]
            Prices of listings, constant (known) term of equation system

        listing_ids : np.array[basestring]
            Listing ID of each matrix's row.

        product_prices : np.array[float]
            Average price of each product. The solution of the equation system.

        product_ids : np.array[basestring]
            IDs of the products, represented by elements of `product_prices`
            and columns of `matrix`.

        good_rows : np.array[bool]
            Where `True` listings contain only products whose prices could be
            computed by the solution algorithm.
            NOTE(review): This argument is currently not read by the method.

        good_cols : np.array[bool]
            Where `True` prices could be computed by least square algorithm.

        listings : pd.DataFrame
            The listings from which the system of equations was generated.
            Will usually contain additional listings. Currency and time of
            each listing are looked up with the listing's ID, in the
            columns "currency" and "time". Can be `None` for more easy
            testing; place holder values are used in this case.

        Returns
        -------
        prices : pd.DataFrame
            The computed prices as a `pd.DataFrame`, indexed by column "id".
        """
        assert matrix.shape[0] == len(listing_prices) == len(listing_ids)
        assert matrix.shape[1] == len(product_prices) == len(product_ids)

        # Column indexes of the products whose prices could be computed.
        # A set is used because membership is tested in the loops below.
        good_prod_idxs = set(np.argwhere(good_cols)[:, 0].tolist())

        # Create the average prices
        # Price data is first collected in list of dicts, that is later
        # converted to a ``DataFrame``. Each dict is a row of the ``DataFrame``.
        prices = make_price_frame(0)
        price_data = []
        for iprod in range(len(product_prices)):
            if iprod not in good_prod_idxs:
                continue
            single_price_data = {
                # Multiply with condition, solver returns prices for
                # condition "new".
                "price": product_prices[iprod] * self.default_condition,
                "currency": self.default_currency,
                "condition": self.default_condition,
                "time": self.average_mid_time,
                "product": product_ids[iprod],
                "listing": u"{}-average".format(self.average_mid_time),
                "type": "average",
                "avg_period": self.avg_period,
                # Number of listings that were used for this average price:
                # Count the positive entries in the product's column of the
                # system matrix. NaNs compare as `False`, they don't count.
                "avg_num_listings": np.sum(matrix[:, iprod] > 0),
            }
            # The ID is computed from the otherwise complete record.
            single_price_data["id"] = make_price_id(single_price_data)
            price_data.append(single_price_data)

        avg_prices = pd.DataFrame(price_data)
        prices = prices.append(avg_prices, ignore_index=True, verify_integrity=False)

        # Create prices for each item of each listing
        # Protect against prices that are NaN
        # NOTE(review): A product whose price is NaN contributes 0 here, so
        # the other products of its listing share the whole listing price.
        # Confirm that such listings shouldn't be skipped with `good_rows`.
        good_prod_prices = np.where(np.isnan(product_prices), 0, product_prices)
        n_listings = len(listing_prices)
        # Price data is first collected in list of dicts. Each dict is a price.
        price_data = []
        for ilist, listing_price in enumerate(listing_prices):
            # Each row of `matrix` represents a listing
            row = matrix[ilist, :]

            # compute percentage of each product on total listing price
            # from average prices.
            virt_prod_prices = row * good_prod_prices
            list_prod_percent = virt_prod_prices / virt_prod_prices.sum()
            # compute price of each item in listing based on these percentages
            list_prod_prices = list_prod_percent * listing_price

            list_id = listing_ids[ilist]
            # `listings` data frame can be `None` for more easy testing.
            if listings is not None:
                list_currency = listings.ix[list_id, "currency"]
                list_time = listings.ix[list_id, "time"]
            else:
                list_currency = "Unknown Currency"
                list_time = datetime(2000, 1, 1)

            # The price of a listing with a single product is that
            # product's observed price, no estimation is involved.
            prod_idxs = np.argwhere(row > 0)[:, 0]
            if len(prod_idxs) == 1:
                price_type, avg_period = "observed", "none"
            else:
                price_type, avg_period = "estimated", self.avg_period

            # Create a price record for each of the estimated product prices
            for iprod in prod_idxs:
                if iprod not in good_prod_idxs:
                    continue
                single_price_data = {
                    "price": list_prod_prices[iprod],
                    "currency": list_currency,
                    "condition": row[iprod],
                    "time": list_time,
                    "product": product_ids[iprod],
                    "listing": list_id,
                    "type": price_type,
                    "avg_period": avg_period,
                    # TODO: Better algorithm, analogous to algorithm above
                    #       for average prices.
                    "avg_num_listings": n_listings,
                }
                single_price_data["id"] = make_price_id(single_price_data)
                price_data.append(single_price_data)

        list_prices = pd.DataFrame(price_data)
        prices = prices.append(list_prices, ignore_index=True, verify_integrity=False)
        prices.set_index("id", drop=False, inplace=True, verify_integrity=True)
        return prices