def determineUpperRangeForTemporalDistances(timeRange, outputFolder): i = 1 temporalDistancesForAllLattices = [] for latticeObject in FileIO.iterateJsonFromFile(hashtagsLatticeGraphFile%(mrOutputFolder,'%s_%s'%timeRange)): print i, latticeObject['id']; i+=1 for neighborLattice, neighborHashtags in latticeObject['links'].iteritems(): temporalDistancesForAllLattices+=zip(*filterOutNeighborHashtagsOutside1_5IQROfTemporalDistance(latticeObject['hashtags'], neighborHashtags).iteritems())[1] # if i==10: break print getOutliersRangeUsingIRQ(temporalDistancesForAllLattices)[1]
def estimateUpperRangeForTimePeriod(timeRange, outputFolder): dataX = [] i=1 for hashtagObject in FileIO.iterateJsonFromFile(hashtagsWithoutEndingWindowFile%(outputFolder,'%s_%s'%timeRange)): print i;i+=1 occuranesInHighestActiveRegion = getOccuranesInHighestActiveRegion(hashtagObject) dataX.append((occuranesInHighestActiveRegion[-1][1]-occuranesInHighestActiveRegion[0][1])/TIME_UNIT_IN_SECONDS) print getOutliersRangeUsingIRQ(dataX) plt.hist(dataX, bins=10) plt.show()
def plotTemporalLocality(): distances = Locality._getDistances() dataToPlot, dataX, dataY = defaultdict(list), [], [] ax = plt.gca() for _, data in distances.iteritems(): dataToPlot[int(data['geoDistance'])/100*100+100].append(data['temporalDistance']/(60*60)) for k in sorted(dataToPlot): _, upperRange = getOutliersRangeUsingIRQ(dataToPlot[k]) points = filter(lambda i:i<upperRange, dataToPlot[k]) if len(points)>50: dataX.append(k), dataY.append(np.mean(points)) pearsonCoeff, p_value = scipy.stats.pearsonr(dataX, dataY) print round(pearsonCoeff,2), round(p_value, 2) plt.scatter(dataX, dataY, c='r', lw = 0) # plt.title('Temporal distance between lattices ' + getLatexForString('( \\rho = %0.2f, p-value = %0.2f )'%(pearsonCoeff, p_value))) plt.xlabel('Spatial affinity (miles)', fontsize=20), plt.ylabel('Temporal affinity (hours)', fontsize=20) # plt.show() plt.savefig('../images/temporalLocality.png')
def timePeriods(timeRange, folderType): distribution = defaultdict(list) # i = 1 # for h in FileIO.iterateJsonFromFile(hashtagsWithoutEndingWindowFile%(folderType,'%s_%s'%timeRange)): ## if h['h']=='jartic': # classId = HashtagsClassifier.classify(h) # if classId: # print i, unicode(h['h']).encode('utf-8'), classId;i+=1 # occs = getOccuranesInHighestActiveRegion(h) # distribution[classId].append((occs[-1][1]-occs[0][1])/TIME_UNIT_IN_SECONDS) # for k,v in distribution.iteritems(): # FileIO.writeToFileAsJson({'id':k, 'dist': v}, '../data/hashtagsClassTimePeriods.txt') i = 1 for data in FileIO.iterateJsonFromFile('../data/hashtagsClassTimePeriods.txt'): # print data.keys() plt.subplot(220+i);i+=1 plt.hist(data['dist'], bins=100) boundary = getOutliersRangeUsingIRQ(data['dist'])[1] actualHashtags = filter(lambda t:t<=boundary, data['dist']) meanTimePeriod = np.mean(actualHashtags) print {data['id'] : {'meanTimePeriod': meanTimePeriod, 'outlierBoundary': boundary}} plt.title(data['id']+' %0.2f %0.2f %d'%(meanTimePeriod, boundary, len(actualHashtags))) plt.xlim(xmax=200) plt.show()
def getRadius(locations):
    '''Return the mean distance from the locations' center of mass to each
    location, after discarding distances above the 1.5*IQR upper outlier
    boundary.
    '''
    center = getCenterOfMass(locations,accuracy=LATTICE_ACCURACY)
    distances = [getHaversineDistance(center, point) for point in locations]
    _, upperBound = getOutliersRangeUsingIRQ(distances)
    inlierDistances = [d for d in distances if d<=upperBound]
    return np.mean(inlierDistances)