def removeRetweetsRelative():
    """Show, as a pie chart, which share of each document's tweets are retweets.

    Documents without tweets are skipped.  For every remaining document the
    fraction (all tweets - non-retweets) / all tweets is computed.  These
    fractions are passed to pieData together with four [predicate, label]
    bins, and the bins/labels it returns are drawn with plt.pie.
    """
    tweetCounts = []
    for document in SimpleDoc.getall():
        if len(document.tweets) < 1:
            continue
        originals = len([t for t in document.tweets if not t.isRetweet()])
        retweets = len([t for t in document.tweets if t.isRetweet()])
        # (tweets before removal, tweets left after removing retweets)
        tweetCounts.append((originals + retweets, originals))

    # total is always >= 1 here, so the division cannot hit zero.
    removedShares = [
        float(total - kept) / total for total, kept in tweetCounts
    ]

    shareBins = [
        [lambda share: share == 0.0, "0%"],
        [lambda share: 0.0 < share <= 0.3, "0%<d<=30%"],
        [lambda share: 0.3 < share <= 0.5, "30%<d<=50%"],
        [lambda share: share > 0.5, ">50%"],
    ]
    relBins, labels = pieData(shareBins, removedShares)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', startangle=90, labels=labels)
    plt.title("Differenzen in Prozent, Retweets entfernt werden")
    plt.show()
def removeRetweetsAbsolute():
    """Show, as a pie chart, how many tweets each document loses without retweets.

    Documents without tweets are skipped.  For every remaining document the
    absolute difference (all tweets - non-retweets) is computed, i.e. the
    number of tweets that disappear when retweets are removed.  The
    differences are passed to pieData with [predicate, label] bins covering
    6-10, 11-50 and more than 50, and the result is drawn with plt.pie.
    """
    counts = []
    for doc in SimpleDoc.getall():
        if len(doc.tweets) >= 1:
            numNormalTweets = sum(1 for tweet in doc.tweets if not tweet.isRetweet())
            numRetweets = sum(1 for tweet in doc.tweets if tweet.isRetweet())
            # Store [before removal, after removal], matching
            # removeRetweetsRelative.  Storing [normal, retweets] would plot
            # "normal minus retweets", which is not the number of tweets
            # lost and can even be negative.
            counts.append([numNormalTweets + numRetweets, numNormalTweets])

    diffs = [before - after for before, after in counts]

    # Alternative coarse binning, kept for reference:
    # relBins, labels = pieData([
    #     [lambda x: x == 0, "diff: 0"],
    #     [lambda x: x == 1, "diff: 1"],
    #     [lambda x: x >= 2 and x <= 5, "diff: 2-5"],
    #     [lambda x: x > 5, "diff: >5"]
    # ], diffs)

    # Break-down of the "> 5" range.  The last bin starts directly after 50
    # so that a difference of exactly 51 is not silently dropped.
    relBins, labels = pieData([
        [lambda x: x >= 6 and x <= 10, "diff: 6-10"],
        [lambda x: x >= 11 and x <= 50, "diff: 11-50"],
        [lambda x: x > 50, "diff: >50"],
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', startangle=90, labels=labels)
    plt.title("Differenzen in Anzahl Tweets zu Dokument, wenn Retweets entfernt werden.")
    plt.show()
def removeDoubleUsersAbsolute():
    """Show, as a pie chart, how many tweets each document loses per duplicate user.

    For every document with at least one tweet, the difference between the
    number of tweets and the number of distinct tweet users is computed.
    The differences are passed to pieData with [predicate, label] bins for
    6-10, 11-50 and 51-300, and the result is drawn with plt.pie.
    """
    counts = []
    for doc in SimpleDoc.getall():
        # A real list (not a lazy map object) so that len() always works.
        tweetUsers = [tweet.user for tweet in doc.tweets]
        if len(tweetUsers) >= 1:
            counts.append([len(tweetUsers), len(set(tweetUsers))])

    diffs = [numTweets - numUsers for numTweets, numUsers in counts]

    # Alternative coarse binning, kept for reference:
    # relBins, labels = pieData([
    #     [lambda x: x == 0, "diff: 0"],
    #     [lambda x: x == 1, "diff: 1"],
    #     [lambda x: x >= 2 and x <= 5, "diff: 2-5"],
    #     [lambda x: x > 5, "diff: >5"]
    # ], diffs)

    # Break-down of the "> 5" range.  The first label now states 6-10, which
    # is what its predicate actually selects.
    # NOTE(review): differences above 300 match no bin - presumably an
    # intentional outlier cut-off; confirm.
    relBins, labels = pieData([
        [lambda x: x >= 6 and x <= 10, "diff: 6-10"],
        [lambda x: x >= 11 and x <= 50, "diff: 11-50"],
        [lambda x: x >= 51 and x <= 300, "diff: 51-300"],
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', startangle=90, labels=labels)
    plt.title("Differenzen in Anzahl Tweets zu Dokument, wenn doppelte Benutzer entfernt werden.")
    plt.show()
def alteringTweetStreamAfterFirstPeak():
    """Show, as a pie chart, total tweets relative to tweets in the first week.

    Only documents whose numTweetsBetweenRelative(None, one week in seconds)
    is at least 5 are considered.  For each of them the ratio
    numTweets() / numTweetsBetweenRelative(None, one week) is computed.  The
    ratios are passed to pieData with [predicate, label] bins in 10% steps,
    and the result is drawn with plt.pie.
    """
    oneWeek = 60 * 60 * 24 * 7  # seconds

    # Select first, then compute the ratios, in two separate passes.
    eligibleDocs = [
        doc for doc in SimpleDoc.getall()
        if doc.numTweetsBetweenRelative(None, oneWeek) >= 5
    ]
    growthRatios = [
        float(doc.numTweets()) / doc.numTweetsBetweenRelative(None, oneWeek)
        for doc in eligibleDocs
    ]

    growthBins = [
        [lambda ratio: ratio == 1.0, "+0%"],
        [lambda ratio: 1.0 < ratio <= 1.1, "+0-10%"],
        [lambda ratio: 1.1 < ratio <= 1.2, "+10-20%"],
        [lambda ratio: 1.2 < ratio <= 1.3, "+20-30%"],
        [lambda ratio: 1.3 < ratio <= 1.4, "+30-40%"],
        [lambda ratio: 1.4 < ratio <= 1.5, "+40-50%"],
        [lambda ratio: ratio > 1.5, ">50%"],
    ]
    relBins, labels = pieData(growthBins, growthRatios)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', startangle=90, labels=labels)
    plt.title("Relative Anzahl Tweets im Vergleich zu Anzahl Tweets nach einer Woche")
    plt.show()
def removeDoubleUsersRelative():
    """Show, as a pie chart, which share of each document's tweets are from repeat users.

    For every document with at least one tweet, the fraction
    (tweets - distinct tweet users) / tweets is computed.  The fractions are
    passed to pieData with [predicate, label] bins for 0%, up to 10%, 10-30%
    and above 30%, and the result is drawn with plt.pie.
    """
    counts = []
    for doc in SimpleDoc.getall():
        # A real list (not a lazy map object) so that len() always works.
        tweetUsers = [tweet.user for tweet in doc.tweets]
        if len(tweetUsers) >= 1:
            counts.append([len(tweetUsers), len(set(tweetUsers))])

    # numTweets is always >= 1 here, so the division cannot hit zero.
    diffs = [
        float(numTweets - numUsers) / numTweets
        for numTweets, numUsers in counts
    ]

    # The second bin must end at 0.1 (10%).  An upper bound of 10.0 would
    # match every non-zero fraction and overlap the two bins that follow.
    relBins, labels = pieData([
        [lambda x: x == 0.0, "0%"],
        [lambda x: x > 0.0 and x <= 0.1, "0%<d<=10%"],
        [lambda x: x > 0.1 and x <= 0.3, "10%<d<=30%"],
        [lambda x: x > 0.3, ">30%"],
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', startangle=90, labels=labels)
    plt.title("Differenzen in Anzahl Tweets zu Dokument, wenn doppelte Benutzer entfernt werden.")
    plt.show()
def getBins(beta, binConditions):
    """Bin 10000 integer-truncated exponential samples with pieData.

    beta is passed as the first argument to numpy.random.exponential, and
    binConditions is forwarded unchanged to pieData.  Only the first of the
    two values returned by pieData (the bins) is returned; the labels are
    discarded.
    """
    sampleCount = 10000
    truncatedSamples = [
        int(value) for value in numpy.random.exponential(beta, sampleCount)
    ]
    bins, _unusedLabels = pieData(binConditions, truncatedSamples)
    return bins
def distFirstTweetToDoc(): allDocs = list(SimpleDoc.getall()) for param in range(20, 100): diffs = [] maximumTweetAge = 60*60*24*param minimumTweetAge = 60*60*24*10 for doc in filter(lambda doc: len(doc.tweets) != 0 and doc.age() >= maximumTweetAge, allDocs): pubTimestamp = doc.publicationTimestamp # firstTweetTimestamp = max([tweet.timestamp for tweet in doc.tweets]) diffs.extend([tweet.timestamp-pubTimestamp for tweet in filter( lambda tweet: (tweet.timestamp-doc.publicationTimestamp) < maximumTweetAge and (tweet.timestamp-doc.publicationTimestamp) > minimumTweetAge, doc.tweets ) ]) maxBins = 30 timeslot = (float(maximumTweetAge)-float(minimumTweetAge))/maxBins def binNr2Bound(binNr): return minimumTweetAge+(binNr*timeslot) binConditions = map( lambda binNr: [lambda x: x>binNr2Bound(binNr) and x<=binNr2Bound(binNr+1), str(binNr) + "X"], range(0, maxBins) ) # binConditions.append([lambda x: x>binNr2Bound(maxBins), ">" + str(maxBins-1) + str("X")]) diffBins, diffLabels = pieData(binConditions, diffs) distBinConditions = map( lambda binNr: [lambda x: x==binNr, "X=" + str(binNr)], range(0, maxBins) ) def getBins(beta, binConditions): s = map(lambda x: int(x), numpy.random.exponential(beta, 10000)) bins, labels = pieData(binConditions, s) return bins def binDiffs(bins1, bins2): return sum(map((lambda (a, b): abs(a-b)), zip(bins1, bins2))) def searchInRangeRec(minBeta, maxBeta, steps, depth, maxDepth): minError = min( map(lambda beta: [beta, binDiffs(getBins(beta, distBinConditions), diffBins)], numpy.arange(minBeta, maxBeta, steps)), key = lambda x: x[1] ) errorBelow = binDiffs(getBins(minError[0]-(float(steps)/2), distBinConditions), diffBins) errorAbove = binDiffs(getBins(minError[0]+(float(steps)/2), distBinConditions), diffBins) if depth==maxDepth: return minError elif errorBelow <= errorAbove: x = searchInRangeRec(minError[0]-steps, minError[0], float(steps)/10, depth+1, maxDepth) return x[0], x[1] else: x = searchInRangeRec(minError[0], minError[0]+steps, 
float(steps)/10, depth+1, maxDepth) return x[0], x[1] beta, error = searchInRangeRec(1, 10, 1, 0, 3) print param, (error/maxBins) # s = numpy.random.poisson(1.2, 10000) # s = numpy.random.zipf(1.5, 10000) # f = 3.0 # s = map(lambda x: (float(x)-(1+(random.random()/f)))*f, s) # s.extend([0] * (100*60)) #binConditions2.append([lambda x: x>maxBins, ">" + str(maxBins-1) + str("X")]) """expDistData = map(lambda x: int(x), numpy.random.exponential(beta, 10000))