Пример #1
0
def ZooplaPriceChanges():
    """Track Zoopla sale-listing price changes and plot their distribution.

    Two passes are made:

    1. Daily data (``ds.ZooplaMatchedDaily``): rows with MARKET == "SALE"
       and PRICE > 0 are fed, per listing id, into a ``PriceCalc`` object.
       The first observation of a listing creates the object; every later
       observation is passed to ``PriceCalc.add`` and the returned
       (startDay, endDay, percent) triple is recorded in ``distribution``.
    2. Collated data (``ds.ZooplaMatchedCollated``): for every sale listing
       already seen in pass 1, the DELETED value is passed to
       ``PriceCalc.add`` with a price of 0, the result is recorded, and the
       listing is removed from the map.

    Prints the number of listings never matched by a deletion row, then
    ``pSame total pSame/total`` where ``pSame`` counts listings whose current
    price equalled their initial market price when the deletion row was
    processed.  Finally calls ``plotProbability(distribution.dist)``.

    Returns None.
    """
    total = 0
    pSame = 0
    priceMap = {}
    # NOTE(review): `distribution` is not created in this function (its local
    # construction was commented out in the original), so it is resolved as a
    # module-level name -- confirm that it exists before calling.
    data = ds.ZooplaMatchedDaily()
    # Only a single read(1000) is processed (presumably the first 1000 rows --
    # confirm against the ds reader), not the whole data set.
    chunk = data.read(1000)
    # Strip the UTF-8 byte-order mark that ends up glued to the first header.
    chunk.rename(columns={'\xef\xbb\xbfLISTING ID': 'LISTING ID'},
                 inplace=True)
    # Build one mask on the unfiltered frame; the original applied the PRICE
    # mask to an already filtered frame, forcing pandas to re-align it.
    isPricedSale = (chunk["MARKET"] == "SALE") & (chunk['PRICE'] > 0)
    filteredchunk = chunk.loc[isPricedSale, ['LISTING ID', 'DAY', 'PRICE']]
    for listingId, day, price in filteredchunk.values:
        currentState = priceMap.get(listingId)
        if currentState is None:
            priceMap[listingId] = PriceCalc(day, price)
        else:
            startDay, endDay, percent = currentState.add(day, price)
            distribution.add(startDay, endDay, percent)

    # now get deletion dates
    delData = ds.ZooplaMatchedCollated()
    chunk = delData.read(1000)
    chunk.rename(columns={'\xef\xbb\xbfLISTING ID': 'LISTING ID'},
                 inplace=True)
    filteredchunk = chunk.loc[chunk["MARKET"] == "SALE",
                              ['LISTING ID', 'DELETED']]
    for listingId, deleted in filteredchunk.values:
        currentState = priceMap.get(listingId)
        if currentState is None:
            # Listing was never seen in the daily data; nothing to close.
            continue
        if currentState.currentprice == currentState.initialmarketprice:
            pSame += 1
        total += 1
        # A price of 0 is what this code passes to add() for the deletion row.
        startDay, endDay, percent = currentState.add(deleted, 0)
        distribution.add(startDay, endDay, percent)
        priceMap.pop(listingId)

    # Single-argument / pre-formatted prints emit the same text on Python 2
    # and Python 3.
    print(len(priceMap))
    # Guard against an empty match set instead of raising ZeroDivisionError.
    ratio = pSame * 1.0 / total if total else float('nan')
    print("%s %s %s" % (pSame, total, ratio))
    plotProbability(distribution.dist)
Пример #2
0
def ZooplaPriceChanges():
    """Measure how often Zoopla sale listings change price, by time on market.

    Pass 1 walks the daily data (``ds.ZooplaMatchedDaily``) restricted to
    MARKET == "SALE" and PRICE > 0.  The first observation of a listing
    creates a ``PriceCalc``; each later observation is classified as
    "no change" or "change" relative to the listing's current price, keyed by
    ``daysonmarket // 30``, and then passed to ``PriceCalc.add`` whose result
    is recorded in ``distribution``.

    Pass 2 walks the collated data (``ds.ZooplaMatchedCollated``): it first
    counts matched listings whose current price equals the initial market
    price and prints ``pSame total pSame/total``, then passes each DELETED
    value to ``PriceCalc.add`` with price 0 and removes the listing.  The
    number of listings left unmatched is printed afterwards.

    Side effects: sets the module globals ``savedOutput1`` (no-change
    values), ``savedOutput2`` (change values), ``savedOutput3``
    ([value, percent change] pairs), ``n``, ``n1`` and ``n2``; calls
    ``plotProbability(distribution.dist)``; draws two histograms via
    ``pyl.hist``.

    Returns:
        (n, n1, n2): ``n1`` and ``n2`` are the per-bin counts returned by
        ``pyl.hist`` for the no-change and change observations, computed on
        the same bin edges, and ``n = n2 / (n1 + n2)`` per bin (NaN where a
        bin is empty in both).

    Raises:
        ValueError: if no listing was observed more than once, so there is
            nothing to histogram.
    """
    global savedOutput1, savedOutput2, savedOutput3
    global n, n1, n2

    total = 0
    pSame = 0
    priceMap = {}
    # NOTE(review): `distribution` is not created in this function (its local
    # construction was commented out in the original), so it is resolved as a
    # module-level name -- confirm that it exists before calling.
    data = ds.ZooplaMatchedDaily()
    chunk = data.read(10000000)
    # Strip the UTF-8 byte-order mark that ends up glued to the first header.
    chunk.rename(columns={'\xef\xbb\xbfLISTING ID': 'LISTING ID'},
                 inplace=True)
    # Build one mask on the unfiltered frame; the original applied the PRICE
    # mask to an already filtered frame, forcing pandas to re-align it.
    isPricedSale = (chunk["MARKET"] == "SALE") & (chunk['PRICE'] > 0)
    filteredchunk = chunk.loc[isPricedSale, ['LISTING ID', 'DAY', 'PRICE']]

    change = []
    changeprice = []
    nochange = []
    for listingId, day, price in filteredchunk.values:
        state = priceMap.get(listingId)
        if state is None:
            priceMap[listingId] = PriceCalc(day, price)
            continue
        # Floor division keeps the value integral on Python 3 as well; the
        # values are later used as integer histogram bin edges.
        monthsOnMarket = state.daysonmarket // 30
        if state.currentprice == price:
            nochange.append(monthsOnMarket)
        else:
            change.append(monthsOnMarket)
            # float() forces true division: with integer prices Python 2
            # would otherwise floor the relative change to 0 or -1.
            percentChange = -float(state.currentprice - price) / price * 100
            changeprice.append([monthsOnMarket, percentChange])
        startDay, endDay, percent = state.add(day, price)
        distribution.add(startDay, endDay, percent)

    # now get deletion dates
    delData = ds.ZooplaMatchedCollated()
    chunk = delData.read(10000000)
    chunk.rename(columns={'\xef\xbb\xbfLISTING ID': 'LISTING ID'},
                 inplace=True)
    filteredchunk = chunk.loc[chunk["MARKET"] == "SALE",
                              ['LISTING ID', 'DELETED']]
    deletions = filteredchunk.values

    # Count first, before any add() call below mutates the PriceCalc state.
    for listingId, deleted in deletions:
        state = priceMap.get(listingId)
        if state is not None:
            if state.currentprice == state.initialmarketprice:
                pSame += 1
            total += 1

    # Guard against an empty match set instead of raising ZeroDivisionError;
    # the pre-formatted print emits the same text on Python 2 and Python 3.
    ratio = pSame * 1.0 / total if total else float('nan')
    print("%s %s %s" % (pSame, total, ratio))

    for listingId, deleted in deletions:
        state = priceMap.get(listingId)
        if state is not None:
            # A price of 0 is what this code passes to add() for a deletion.
            startDay, endDay, percent = state.add(deleted, 0)
            distribution.add(startDay, endDay, percent)
            priceMap.pop(listingId)

    print(len(priceMap))

    savedOutput1 = nochange
    savedOutput2 = change
    savedOutput3 = changeprice
    plotProbability(distribution.dist)

    months = nochange + change
    if not months:
        raise ValueError(
            "no listing was observed more than once; nothing to histogram")

    # Both histograms must share bin edges, otherwise n1 and n2 differ in
    # length (or are misaligned) and the per-bin ratio below is meaningless.
    # NOTE(review): the edges stop at max(months), so observations equal to
    # the maximum share the last bin with the value below it -- confirm this
    # is intended.
    bins = list(range(min(months), max(months) + 1))
    n1, bins1, patches1 = pyl.hist(nochange, bins=bins)
    n2, bins2, patches2 = pyl.hist(change, bins=bins)

    # 2-D histogram of (time-on-market value, percent price change) for price
    # cuts of up to 30%; only needed for the optional plt.imshow(dist)
    # visualisation and otherwise unused.
    dist, binsa, binsb = np.histogram2d(
        [pair[0] for pair in changeprice],
        [pair[1] for pair in changeprice],
        range=[[0, 30], [-30, 0]],
        bins=[30, 20])

    # Per-bin share of observations that were price changes; "* 1.0" keeps
    # this a true division even for integer count arrays.
    n = n2 * 1.0 / (n1 + n2)

    return (n, n1, n2)