def removeRetweetsAbsolute():
    l = []
    for doc in SimpleDoc.getall():
        if len(doc.tweets) >= 1:
            numNormalTweets = sum(1 for tweet in doc.tweets if not tweet.isRetweet())
            numRetweets = sum(1 for tweet in doc.tweets if tweet.isRetweet())
            l.append([numNormalTweets, numRetweets])

    diffs = map(lambda x: x[0]-x[1], l)

    """relBins, labels = pieData([
        [lambda x: x==0, "diff: 0"],
        [lambda x: x==1, "diff: 1"],
        [lambda x: x>=2 and x<=5, "diff: 2-5"],
        [lambda x: x>5, "diff: >5"]
    ], diffs)"""

    relBins, labels = pieData([
        [lambda x: x>=6 and x<=10, "diff: 6-10"],
        [lambda x: x>=11 and x<=50, "diff: 11-50"],
        [lambda x: x>50, "diff: >50"]
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', startangle=90, labels=labels)
    plt.title("Difference in the number of tweets per document when retweets are removed.")
    plt.show()

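# pieData(...) is used throughout this module but is not defined here. A minimal,
# hypothetical sketch of what it is assumed to do: each entry in `conditions` is a
# [predicate, label] pair; values matching a predicate are counted, and the counts
# are returned as relative shares together with the labels.
def pieData(conditions, values):
    counts = [sum(1 for v in values if predicate(v)) for predicate, _ in conditions]
    total = sum(counts)
    relBins = [float(c) / total if total > 0 else 0.0 for c in counts]
    labels = [label for _, label in conditions]
    return relBins, labels
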
def removeRetweetsRelative():
    l = []
    for doc in SimpleDoc.getall():
        if len(doc.tweets) >= 1:
            numNormalTweets = sum(1 for tweet in doc.tweets if not tweet.isRetweet())
            numRetweets = sum(1 for tweet in doc.tweets if tweet.isRetweet())
            l.append([numNormalTweets+numRetweets, numNormalTweets])

    diffs = map(lambda x: float(x[0]-x[1])/x[0], l)

    relBins, labels = pieData([
        [lambda x: x==0.0, "0%"],
        [lambda x: x>0.0 and x<=0.3, "0%<d<=30%"],
        [lambda x: x>0.3 and x<=0.5, "30%<d<=50%"],
        [lambda x: x>0.5, ">50%"]
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', startangle=90, labels=labels)
    plt.title("Relative difference in the number of tweets per document when retweets are removed.")
    plt.show()

def publication_years():
    plt.figure(num=None, figsize=(8, 4), dpi=80, facecolor='w', edgecolor='k')
    publicationYears = list(simpleDoc.publicationDatetime().year for simpleDoc in SimpleDoc.getallBetween(None, None))
    histDisc(plt, publicationYears, width=0.5)
    # plt.savefig(figurePath("publication_years.png"))
    plt.tight_layout()
    plt.show()

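# histDisc(...) is assumed to draw a discrete histogram: one bar per distinct value
# (publication years above, (year, month) pairs in twitterHist()). A minimal,
# hypothetical sketch of such a helper:
from collections import Counter

def histDisc(plt, values, width=0.5):
    counts = sorted(Counter(values).items())
    xs = range(len(counts))
    plt.bar([x - width/2.0 for x in xs], [c for _, c in counts], width=width)
    plt.xticks(xs, [str(v) for v, _ in counts], rotation=90)
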
def tweetVsMendeleyReaders(yearBounds=[None, None], maxTweets=300, maxReaders=300):
    tweetVsMendeleyReaderList = []
    totalDocs = 0
    for doc in filter(lambda doc: (doc.mendeleyReaders != None and doc.mendeleyReaders <= maxReaders)
                                  and (doc.tweets != None and len(doc.tweets) <= maxTweets)
                                  and (not yearBounds[0] or doc.publicationDatetime().year >= yearBounds[0])
                                  and (not yearBounds[1] or doc.publicationDatetime().year <= yearBounds[1]),
                      SimpleDoc.getall()):
        tweetVsMendeleyReaderList.append([len(doc.tweets), doc.mendeleyReaders])
        totalDocs += 1

    x, y = zip(*tweetVsMendeleyReaderList)

    plt.figure()
    plt.scatter(x, y)
    plt.title("Correlation between tweets and Mendeley readers (papers between " + str(yearBounds[0]) + " and " + str(yearBounds[1]) + "; #Docs: " + str(totalDocs) + ")")
    # x holds the tweet counts, y the Mendeley reader counts.
    plt.xlabel("#Tweets (1-" + str(maxTweets) + ")")
    plt.ylabel("#Readers (1-" + str(maxReaders) + ")")

    p = numpy.polyfit(x, y, 1)
    xTrend = range(min(x), max(x)+1)
    yTrend = map(lambda x: numpy.polyval(p, x), xTrend)
    plt.plot(xTrend, yTrend, color='r')
    plt.figtext(0.80, 0.05, 'correlation coefficient: ' + str(korrelationskoeffizient(x, y)))
    plt.show()

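# korrelationskoeffizient(...) is used in several places but not defined here.
# A minimal, hypothetical sketch, assuming it is the plain Pearson correlation
# coefficient computed in pure Python (which would also explain the
# ZeroDivisionError handling in groupByJournalAndVolume() for constant inputs):
import math

def korrelationskoeffizient(x, y):
    n = len(x)
    mx, my = float(sum(x)) / n, float(sum(y)) / n
    cov = sum((a - mx) * (b - my) for a, b in zip(x, y))
    sx = math.sqrt(sum((a - mx) ** 2 for a in x))
    sy = math.sqrt(sum((b - my) ** 2 for b in y))
    return cov / (sx * sy)
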
def numTweets():
    plt.figure()
    numTweets = list(len(simpleDoc.tweets) for simpleDoc in SimpleDoc.getall())
    labels, values = hist(numTweets, [1, 2, 5, 10, 20, 50, 100, 500, 1000])
    barPlot(plt, labels, values)
    plt.show()

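# hist(...) and barPlot(...) are assumed helpers: hist bins the values into the
# intervals implied by the given bounds, barPlot draws the labelled bars.
# Minimal, hypothetical sketches:
def hist(values, bounds):
    labels, counts = [], []
    for lower, upper in zip(bounds[:-1], bounds[1:]):
        labels.append(str(lower) + "-" + str(upper))
        counts.append(sum(1 for v in values if lower <= v < upper))
    return labels, counts

def barPlot(plt, labels, values):
    xs = range(len(values))
    plt.bar(xs, values, align='center')
    plt.xticks(xs, labels, rotation=45)
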
def removeDoubleUsersAbsolute():
    l = []
    for doc in SimpleDoc.getall():
        tweetUsers = map(lambda x: x.user, doc.tweets)
        if len(tweetUsers) >= 1:
            l.append([len(tweetUsers), len(set(tweetUsers))])

    diffs = map(lambda x: x[0]-x[1], l)

    """relBins, labels = pieData([
        [lambda x: x==0, "diff: 0"],
        [lambda x: x==1, "diff: 1"],
        [lambda x: x>=2 and x<=5, "diff: 2-5"],
        [lambda x: x>5, "diff: >5"]
    ], diffs)"""

    relBins, labels = pieData([
        [lambda x: x>=6 and x<=10, "diff: 6-10"],
        [lambda x: x>=11 and x<=50, "diff: 11-50"],
        [lambda x: x>=51 and x<=300, "diff: 51-300"]
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', startangle=90, labels=labels)
    plt.title("Difference in the number of tweets per document when duplicate users are removed.")
    plt.show()

def mendeleyDisciplines():
    def generalCondition(doc):
        time = doc.publicationDatetime()
        return ((time.year == 2012 and time.month >= 6 and time.month <= 8) or (time.year > 2012)) \
            and doc.mendeleyDisciplines != None

    def domainDocs(domain):
        return [doc for doc in consideredDocs if generalCondition(doc) and domain in doc.mendeleyDisciplines]

    consideredDocs = list(filter(lambda doc: generalCondition(doc), SimpleDoc.getall()))
    totalDocs = len(consideredDocs)

    distinctDomains = set()
    for doc in consideredDocs:
        distinctDomains |= set(doc.mendeleyDisciplines)

    domainData = []
    for domain in distinctDomains:
        d = domainDocs(domain)
        numDocs = len(d)
        meanTweets = numpy.mean([doc.numTweets() for doc in d])
        domainData.append((
            domain,
            numDocs,
            ("%2.2f" % (float(numDocs)*100 / totalDocs)) + "\\%",
            "%2.2f" % meanTweets
        ))

    domainDataSorted = sorted(domainData, key=lambda x: x[1], reverse=True)

    compileTex(
        simpleTabular(["Discipline", "\\#Documents", "Share", "AvgTweets"], domainDataSorted, orientation="lrrr"),
        figurePath("mendeleyDisciplines2.pdf")
    )

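# compileTex(...) and simpleTabular(...) are assumed helpers that render a LaTeX
# table and compile it to the given PDF path. A minimal, hypothetical sketch of
# simpleTabular (one column-alignment character per column in `orientation`):
def simpleTabular(headers, rows, orientation="l"):
    lines = ["\\begin{tabular}{" + orientation + "}"]
    lines.append(" & ".join(headers) + " \\\\ \\hline")
    for row in rows:
        lines.append(" & ".join(str(cell) for cell in row) + " \\\\")
    lines.append("\\end{tabular}")
    return "\n".join(lines)
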
def twitterHist():
    plt.figure(num=None, figsize=(8, 4), dpi=80, facecolor='w', edgecolor='k')
    tweetTime = [(tweet.datetime().year, tweet.datetime().month) for doc in SimpleDoc.getall() for tweet in doc.tweets]
    histDisc(plt, tweetTime, width=0.5)
    # plt.title("Distribution of tweets by year and month")
    plt.tight_layout()
    plt.show()

def crossrefVsTwitter(yearBounds=[None, None], minTweetAge=None, maxTweetAge=None):
    tweetVsCrossrefList = []
    # Note: the parameters above are overridden with fixed values here.
    minTweetAge = 60*60*24*0
    maxTweetAge = 60*60*24*100
    totalDocs = 0
    totalTweets = 0
    nullWeights = 0
    nonNullWeights = 0

    for doc in filter(
            lambda doc: (doc.publicationDatetime().year == 2012
                         and doc.publicationDatetime().month >= 6
                         and doc.publicationDatetime().month <= 8),
            SimpleDoc.getall()):
        docsTweets = filter(lambda tweet: (not minTweetAge or (tweet.timestamp-doc.publicationTimestamp) >= minTweetAge)
                                          and (not maxTweetAge or (tweet.timestamp-doc.publicationTimestamp) <= maxTweetAge),
                            doc.tweets)

        def userWeight(tweet):
            user = tweet.user()
            return None if user is None else user.weight()

        userWeights = map(lambda tweet: userWeight(tweet), docsTweets)
        nullWeights += sum((1 for weight in userWeights if weight is None))
        nonNullWeights += sum((1 for weight in userWeights if not weight is None))

        tweetVsCrossrefList.append([
            doc.numCrossrefs(),
            0 if len(userWeights) == 0 else sum(filter(lambda weight: weight != None, userWeights))
        ])
        totalDocs += 1
        totalTweets += len(docsTweets)

    print totalDocs
    print totalTweets
    print float(nullWeights) / (nullWeights+nonNullWeights)

    # tweetVsCrossrefList = sorted(tweetVsCrossrefList, key=lambda tc: tc[1], reverse=True)[:100]
    x, y = zip(*tweetVsCrossrefList)

    paperFigure(plt)
    plt.scatter(x, y)
    # plt.title("Correlation between tweets and citations (papers between " + str(yearBounds[0]) + " and " + str(yearBounds[1]) + "; #Docs: " + str(totalDocs) + ")")
    # x holds the crossref counts, y the (user-weighted) tweet counts.
    plt.xlabel("#Crossrefs")
    plt.ylabel("#Tweets")
    # plt.xlim((0, 200))
    # plt.ylim((0, 30))

    p = numpy.polyfit(x, y, 1)
    xTrend = range(min(x), max(x)+1)
    yTrend = map(lambda x: numpy.polyval(p, x), xTrend)
    plt.plot(xTrend, yTrend, color='r')
    # plt.figtext(0.80, 0.05, 'correlation coefficient: ' + str(korrelationskoeffizient(x, y)))
    print 'correlation coefficient: ' + str(korrelationskoeffizient(x, y))
    plt.tight_layout()
    plt.show()

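# paperFigure(...) is assumed to open a new figure with print-friendly sizing,
# mirroring the explicit plt.figure(...) calls used elsewhere in this module.
# A minimal, hypothetical sketch:
def paperFigure(plt):
    plt.figure(num=None, figsize=(8, 4), dpi=80, facecolor='w', edgecolor='k')
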
def groupByJournalAndVolume():
    issns = {}
    docs = list(SimpleDoc.getall())
    for doc in docs:
        issns[doc.issn] = issns.get(doc.issn, 0) + 1

    validIssns = map(lambda kv: kv[0], filter(lambda item: item[1] > 5 and item[0] != None, issns.items()))

    groups = {}
    for doc in docs:
        if doc.issn in validIssns:
            groupList = groups.get((doc.issn, doc.volume), [])
            # groupList = groups.get(doc.issn, [])
            groupList.append(doc)
            groups[(doc.issn, doc.volume)] = groupList
            # groups[doc.issn] = groupList

    validGroups = filter(lambda group: len(group[1]) > 5, groups.items())
    # validGroups = groups.items()

    correlationValues = []
    for ident, docs in validGroups:
        docTweets = map(lambda doc: doc.numTweets(), docs)
        docCrossrefs = map(lambda doc: doc.numCrossrefs(), docs)
        korr = None
        # docTweetCrossrefRatios = map(lambda doc: [float(doc.numTweets()) / doc.numCrossrefs() if doc.numCrossrefs() != 0 else float('nan')], docs)

        maxYear = max(map(lambda doc: doc.publicationDatetime().year, docs))
        minYear = min(map(lambda doc: doc.publicationDatetime().year, docs))
        yearRange = None
        if maxYear == minYear:
            yearRange = str(minYear)
        else:
            yearRange = str(minYear) + "-" + str(maxYear)

        try:
            korr = "%2.3f" % korrelationskoeffizient(docTweets, docCrossrefs)
        except ZeroDivisionError:
            korr = "NaN"

        # correlationValues.append([ident[0], ident[1], len(docs), "%2.2f" % numpy.mean(docTweets), "%2.2f" % numpy.std(docTweets), korr, yearRange])
        correlationValues.append([
            ident[0], ident[1], len(docs),
            "%2.2f" % numpy.mean(docTweets),
            "%2.2f" % numpy.mean(docCrossrefs),
            "%2.2f" % (float(numpy.sum(docTweets))/numpy.sum(docCrossrefs)),
            yearRange
        ])
        # correlationValues.append([ident, len(docs), "%2.2f" % numpy.mean(docTweets), "%2.2f" % numpy.std(docTweets), korr])

    correlationValues = sorted(correlationValues, key=lambda x: x[0])

    compileTex(
        # simpleTabular(["ISSN", "Volume", "\\#Docs", "AVG Tweets", "StdDev", "korr", "Years"], correlationValues, orientation="llrrrrl"),
        simpleTabular(["ISSN", "Volume", "\\#Docs", "AVG T", "AVG C", "T/C", "Years"], correlationValues, orientation="llrrrrl"),
        # simpleTabular(["ISSN", "\\#Docs", "AVG Tweets", "StdDev", "korr", "Years"], correlationValues, orientation="lrrrrl"),
        figurePath("correlationsInJournals2.pdf")
    )

def overviewData():
    # earliest tweet timestamp: 1337079632
    docs = list(SimpleDoc.getall())
    tweets = [tweet for doc in docs for tweet in doc.tweets]

    print "total Tweets: " + str(len(tweets))
    print "latest tweet: " + str(max(tweets, key=lambda x: x.timestamp).datetime())
    print "earliest tweet: " + str(min(tweets, key=lambda x: x.timestamp).datetime())
    print "total Documents: " + str(len(docs))
    print "latest document: " + str(max(docs, key=lambda doc: doc.publicationTimestamp).publicationDatetime())
    print "earliest document: " + str(min(docs, key=lambda doc: doc.publicationTimestamp).publicationDatetime())

def tweetsBySpecificUserCorrelations():
    docs = SimpleDoc.getallBetween((2012, 6), (2012, 8))
    pairs = []
    for doc in docs:
        numTweets = len(filter(lambda tweet: tweet.username == "ATP_CME", doc.tweets))
        citations = doc.averageCitations()
        pairs.append([numTweets, citations])

    x, y = zip(*pairs)
    print allCorrelations(x, y)

    plt.scatter(x, y)
    plt.show()

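# allCorrelations(...) is assumed to bundle the usual correlation measures for
# quick inspection. A minimal, hypothetical sketch based on scipy.stats (each
# entry is a (statistic, p-value) pair):
from scipy import stats

def allCorrelations(x, y):
    return {
        "pearson": stats.pearsonr(x, y),
        "spearman": stats.spearmanr(x, y),
        "kendall": stats.kendalltau(x, y),
    }
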
def userHist():
    plt.figure()
    users = []
    for doc in SimpleDoc.getall():
        for tweet in doc.tweets:
            users.append(tweet.user)

    userGroupCounts = sorted(groupCount(users), key=lambda x: x[1], reverse=True)
    filteredUserGroupCounts = filter(lambda x: x[1] >= 2, userGroupCounts)

    # Plot each user's tweet count against his rank (users with a single tweet are cut off).
    plt.plot(range(1, len(filteredUserGroupCounts)+1), map(lambda x: x[1], filteredUserGroupCounts))
    plt.title("Users sorted by number of tweets (single-tweet users cut off)")
    plt.xlabel("User rank")
    plt.ylabel("#Tweets")
    plt.show()

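# groupCount(...) is assumed to return (item, count) pairs for each distinct item
# in the input list. A minimal, hypothetical sketch:
def groupCount(items):
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return counts.items()
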
def tweetHist():
    docs = list(filter(
        lambda doc: (doc.publicationDatetime().year == 2012
                     and doc.publicationDatetime().month >= 6
                     and doc.publicationDatetime().month <= 8),
        SimpleDoc.getall()
    ))

    numTweets = [doc.numTweets() for doc in docs]
    plt.figure()
    plt.hist(numTweets, bins=xrange(0, 150, 5))

    plt.figure()
    numCite = [doc.numCrossrefs() for doc in docs]
    plt.hist(numCite, bins=xrange(0, 150, 5))
    plt.show()

def topUsers():
    plt.figure()
    users = []
    for doc in SimpleDoc.getall():
        for tweet in doc.tweets:
            users.append(tweet.user)

    userGroupCounts = sorted(groupCount(users), key=lambda x: x[1], reverse=True)
    topUsers = userGroupCounts[:10]
    users, values = zip(*topUsers)

    barPlot(plt, list(users), list(values))
    plt.title("Top 10 Users")
    plt.ylabel("#Tweets")
    plt.show()

def numRetweets():
    numNormalTweets = 0
    numRetweets = 0
    for doc in SimpleDoc.getall():
        for tweet in doc.tweets:
            if tweet.isRetweet():
                numRetweets += 1
            else:
                numNormalTweets += 1

    total = numNormalTweets + numRetweets
    relNormalTweets, relRetweets = float(numNormalTweets)*100 / total, float(numRetweets)*100 / total

    plt.figure()
    plt.pie([relNormalTweets, relRetweets], autopct='%1.1f%%', startangle=90, labels=[
        'normal tweets (' + str(numNormalTweets) + ')',
        'retweets (' + str(numRetweets) + ')'
    ], colors=['green', 'yellow'])
    plt.show()

def removeDoubleUsersRelative():
    l = []
    for doc in SimpleDoc.getall():
        tweetUsers = map(lambda x: x.user, doc.tweets)
        if len(tweetUsers) >= 1:
            l.append([len(tweetUsers), len(set(tweetUsers))])

    diffs = map(lambda x: float(x[0]-x[1])/x[0], l)

    relBins, labels = pieData([
        [lambda x: x==0.0, "0%"],
        [lambda x: x>0.0 and x<=0.1, "0%<d<=10%"],
        [lambda x: x>0.1 and x<=0.3, "10%<d<=30%"],
        [lambda x: x>0.3, ">30%"]
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', startangle=90, labels=labels)
    plt.title("Relative difference in the number of tweets per document when duplicate users are removed.")
    plt.show()

def alteringTweetStreamAfterFirstPeak():
    relativeTweetDiffAfter1WeekAndTotal = map(
        lambda doc: float(doc.numTweets()) / doc.numTweetsBetweenRelative(None, 60*60*24*7),
        filter(lambda doc: doc.numTweetsBetweenRelative(None, 60*60*24*7) >= 5, SimpleDoc.getall())
    )

    relBins, labels = pieData([
        [lambda x: x==1.0, "+0%"],
        [lambda x: x>1.0 and x<=1.1, "+0-10%"],
        [lambda x: x>1.1 and x<=1.2, "+10-20%"],
        [lambda x: x>1.2 and x<=1.3, "+20-30%"],
        [lambda x: x>1.3 and x<=1.4, "+30-40%"],
        [lambda x: x>1.4 and x<=1.5, "+40-50%"],
        [lambda x: x>1.5, ">50%"]
    ], relativeTweetDiffAfter1WeekAndTotal)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', startangle=90, labels=labels)
    plt.title("Total number of tweets relative to the number of tweets after one week")
    plt.show()

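# SimpleDoc.numTweetsBetweenRelative(...) is assumed to count tweets whose offset
# from the publication time lies inside the given bounds (in seconds), with None
# meaning "unbounded". A minimal, hypothetical sketch of such a method:
def numTweetsBetweenRelative(self, lowerBound, upperBound):
    return sum(1 for tweet in self.tweets
               if (lowerBound is None or (tweet.timestamp - self.publicationTimestamp) >= lowerBound)
               and (upperBound is None or (tweet.timestamp - self.publicationTimestamp) <= upperBound))
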
def correlationsForQuartals():
    quartals = [
        [3, 2008],
        [0, 2009], [1, 2009], [2, 2009], [3, 2009],
        [0, 2010], [1, 2010], [2, 2010], [3, 2010],
        [0, 2011], [1, 2011], [2, 2011], [3, 2011],
        [0, 2012], [1, 2012], [2, 2012], [3, 2012],
        [0, 2013], [1, 2013]
    ]

    def docInQuartal(doc, quartal):
        if quartal[1] != doc.publicationDatetime().year:
            return False
        elif quartal[0] == 0:
            return doc.publicationDatetime().month >= 1 and doc.publicationDatetime().month <= 3
        elif quartal[0] == 1:
            return doc.publicationDatetime().month >= 4 and doc.publicationDatetime().month <= 6
        elif quartal[0] == 2:
            return doc.publicationDatetime().month >= 7 and doc.publicationDatetime().month <= 9
        elif quartal[0] == 3:
            return doc.publicationDatetime().month >= 10 and doc.publicationDatetime().month <= 12
        else:
            raise ValueError("Argument quartal must be a tuple [quartal, year] where quartal is between 0 and 3")

    allDocs = list(SimpleDoc.getall())

    coefficients = []
    for quartal in quartals:
        docs = filter(lambda doc: docInQuartal(doc, quartal) and doc.mendeleyReaders != None, allDocs)
        print len(docs)
        x, y = zip(*map(lambda doc: [len(doc.tweets), doc.crossrefTimeline[0].totalCrossrefs], docs))
        coefficients.append(korrelationskoeffizient(x, y))

    plt.figure()
    plt.plot(range(0, len(quartals)), coefficients)
    plt.show()

# expertCategories = ['Medicine', 'Health']
wordExperts = getWordExperts(expertWords)
# patrickExperts = getPatrickExperts(expertCategories)

"""bioDocs = minimizedDocs(
    filter(
        lambda doc: doc.mendeleyDisciplines != None and 'Biological Sciences' in doc.mendeleyDisciplines,
        SimpleDoc.getallBetween((2012,6), (2012,8))
    ),
    metrics
)"""

docs = minimizedDocs(SimpleDoc.getallBetween((2012,6), (2012,8)), metrics)

usersInTimewindow = set((usr for doc in docs for usr in doc[0]))
totalNumTweets = sum((1 for doc in docs for u in doc[0]))

"""f = open("baselines", "w")
for numTweets in range(100, totalNumTweets, 100):
    print str(numTweets) + " / " + str(totalNumTweets)
    baseline = getBaseline(docs, metricNames, numTweets)
    f.write(json.dumps({"num-tweets": numTweets, "baseline": baseline}) + "\n")
    f.flush()
f.close()"""

def canBeEncoded(text):
    try:
        str(text)
        return True
    except UnicodeEncodeError:
        return False

def tweetsBetweenDay(documents, lowerBound, upperBound):
    return [[tweet.text, tweet.timestamp, tweet.username, doc.doi, doc.title, doc.publicationTimestamp]
            for doc in documents for tweet in doc.tweets
            if ((lowerBound*60*60*24) <= (tweet.timestamp - doc.publicationTimestamp) <= (upperBound*60*60*24))
            and canBeEncoded(tweet.text) and canBeEncoded(doc.title)]

relevantDocuments = SimpleDoc.getallBetween((2012, 6), (2012, 8))

tweets = []
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 0, 1), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 1, 3), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 3, 5), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 7, 30), 333))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 100, 300), 333))

tweetTexts = map(lambda tweetdata: "\t".join([str(tweetdata[0]), str(tweetdata[1]), tweetdata[2], tweetdata[3], tweetdata[4], str(tweetdata[5])]), tweets)
random.shuffle(tweetTexts)

f = open("tweetTexts_1.txt", "w")
for text in tweetTexts[0:333]:
    f.write(text.replace("\n", " ").replace("\"", "").replace("'", "") + "\n")
f.close()

import json
from main.util.db import openDb
from scipy import stats
from main.util.common import SimpleDoc, powerset, Log
import math
import itertools

expertTopics = list(map(lambda s: s.strip(), open("data/expert_topics", "r")))

l = Log(filename="foo", verbose=True)

docs = map(lambda doc: [map(lambda tweet: tweet.username, doc.tweets),
                        map(lambda metric: metric[1](doc), metrics)],
           SimpleDoc.getallBetween((2012,6), (2012,8)))

baseline = {}
for ind, metricName in zip(range(0, len(metricNames)), metricNames):
    pairs = []
    for doc in docs:
        numTweets = len(doc[0])
        metricScore = doc[1][ind]
        pairs.append([numTweets, metricScore])

    x, y = zip(*pairs)
    s, p = stats.spearmanr(x, y)
    baseline[metricName] = s

count = 0
count2 = 0
for ind, metricName in zip(range(0, len(metricNames)), metricNames):
    pairs = []

def userCorrelationToDiscipline():
    """
    First build user_disc_map:
    [
        user1 : [
            [mendDisc1_1, mendDisc1_2, ...],  // list of disciplines per tweet of the user
            [mendDisc2_1, mendDisc2_2, ...]
        ],
        user2: [
            ...
        ]
    ]
    """
    if not os.path.isfile(dataPath("user_disc_map.json")):
        userDiscList = []
        for doc in SimpleDoc.getall():
            twitterUsers = [tweet.user for tweet in doc.tweets]
            disciplines = doc.mendeleyDisciplines
            if len(twitterUsers) != 0 and disciplines != None and len(disciplines) != 0:
                for twitterUser in twitterUsers:
                    userDiscList.append([twitterUser, disciplines])

        userDiscMap = {}
        for item in userDiscList:
            discList = userDiscMap.get(item[0], [])
            discList.append(item[1])
            userDiscMap[item[0]] = discList

        writeJsonToData(userDiscMap, "user_disc_map.json")
    else:
        userDiscMap = readJsonFromData("user_disc_map.json")

    """
    Then build user_disc_count_map:
    [
        user1 : {
            "total_posts" : n,
            "user_posts_in_desc" : {
                "disc1" : n_1,
                "disc2" : n_2,
                ...
            }
        },
        user2: {
            ...
        }
    ]
    """
    if not os.path.isfile(dataPath("user_disc_count_map.json")):
        userDiscCountMap = {}
        for user, descListList in userDiscMap.items():
            totalPosts = len(descListList)

            allUsersDesc = set()
            for descList in descListList:
                allUsersDesc |= set(descList)

            userPostsInDesc = {}
            for desc in allUsersDesc:
                postsInDesc = sum(1 for descList in descListList if desc in descList)
                userPostsInDesc[desc] = postsInDesc

            userDiscCountMap[user] = {"total_posts": totalPosts, "user_posts_in_desc": userPostsInDesc}

        writeJsonToData(userDiscCountMap, "user_disc_count_map.json")
    else:
        userDiscCountMap = readJsonFromData("user_disc_count_map.json")

    for user, userdata in userDiscCountMap.items():
        totalPosts = userdata['total_posts']
        relCounts = []
        for desc, count in userdata['user_posts_in_desc'].items():
            relCounts.append([desc, float(count)/totalPosts])
        relCounts = sorted(relCounts, key=lambda x: x[1], reverse=True)

        if totalPosts > 50:
            print user
            print relCounts
            print "\n\n"

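# dataPath(...), writeJsonToData(...) and readJsonFromData(...) are assumed
# persistence helpers around a local data/ directory (the same directory the
# expert_topics file is read from above). Minimal, hypothetical sketches:
import json
import os

def dataPath(filename):
    return os.path.join("data", filename)

def writeJsonToData(obj, filename):
    with open(dataPath(filename), "w") as f:
        json.dump(obj, f)

def readJsonFromData(filename):
    with open(dataPath(filename), "r") as f:
        return json.load(f)
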
def cummulativeTwitterPlots():
    # twitterTimelines, publicationTimestamps = zip(*filter(lambda timelinePubTs: len(timelinePubTs[0]) != 0, map(lambda doc: [doc.cummulativeTwitterTimeline(), doc.publicationTimestamp], SimpleDoc.getall())))
    twitterTimelines = filter(lambda tl: len(tl) != 0,
                              map(lambda doc: map(lambda point: [point[0]-doc.publicationTimestamp, point[1]], doc.cummulativeTwitterTimeline()),
                                  SimpleDoc.getall()))
    # twitterTimelines = filter(lambda tl: len(tl) < 20, twitterTimelines)
    # twitterTimelines = filter(lambda tl: len(tl) > 50, twitterTimelines)

    plt.figure()
    for timeline in twitterTimelines:
        x, y = zip(*timeline)
        plt.plot(x, y)
    plt.show()

def correlationTimeTweets():
    x, y = zip(*map(lambda doc: [doc.publicationTimestamp, len(doc.tweets)], SimpleDoc.getall()))
    print korrelationskoeffizient(x, y)  # 0.082

def distFirstTweetToDoc():
    allDocs = list(SimpleDoc.getall())

    for param in range(20, 100):
        diffs = []
        maximumTweetAge = 60*60*24*param
        minimumTweetAge = 60*60*24*10

        for doc in filter(lambda doc: len(doc.tweets) != 0 and doc.age() >= maximumTweetAge, allDocs):
            pubTimestamp = doc.publicationTimestamp
            # firstTweetTimestamp = max([tweet.timestamp for tweet in doc.tweets])
            diffs.extend([tweet.timestamp-pubTimestamp for tweet in filter(
                lambda tweet: (tweet.timestamp-doc.publicationTimestamp) < maximumTweetAge
                              and (tweet.timestamp-doc.publicationTimestamp) > minimumTweetAge,
                doc.tweets
            )])

        maxBins = 30
        timeslot = (float(maximumTweetAge)-float(minimumTweetAge))/maxBins

        def binNr2Bound(binNr):
            return minimumTweetAge+(binNr*timeslot)

        binConditions = map(
            lambda binNr: [lambda x: x>binNr2Bound(binNr) and x<=binNr2Bound(binNr+1), str(binNr) + "X"],
            range(0, maxBins)
        )
        # binConditions.append([lambda x: x>binNr2Bound(maxBins), ">" + str(maxBins-1) + str("X")])

        diffBins, diffLabels = pieData(binConditions, diffs)

        distBinConditions = map(
            lambda binNr: [lambda x: x==binNr, "X=" + str(binNr)],
            range(0, maxBins)
        )

        def getBins(beta, binConditions):
            s = map(lambda x: int(x), numpy.random.exponential(beta, 10000))
            bins, labels = pieData(binConditions, s)
            return bins

        def binDiffs(bins1, bins2):
            return sum(map((lambda (a, b): abs(a-b)), zip(bins1, bins2)))

        def searchInRangeRec(minBeta, maxBeta, steps, depth, maxDepth):
            minError = min(
                map(lambda beta: [beta, binDiffs(getBins(beta, distBinConditions), diffBins)], numpy.arange(minBeta, maxBeta, steps)),
                key=lambda x: x[1]
            )
            errorBelow = binDiffs(getBins(minError[0]-(float(steps)/2), distBinConditions), diffBins)
            errorAbove = binDiffs(getBins(minError[0]+(float(steps)/2), distBinConditions), diffBins)

            if depth == maxDepth:
                return minError
            elif errorBelow <= errorAbove:
                x = searchInRangeRec(minError[0]-steps, minError[0], float(steps)/10, depth+1, maxDepth)
                return x[0], x[1]
            else:
                x = searchInRangeRec(minError[0], minError[0]+steps, float(steps)/10, depth+1, maxDepth)
                return x[0], x[1]

        beta, error = searchInRangeRec(1, 10, 1, 0, 3)
        print param, (error/maxBins)

        # s = numpy.random.poisson(1.2, 10000)
        # s = numpy.random.zipf(1.5, 10000)
        # f = 3.0
        # s = map(lambda x: (float(x)-(1+(random.random()/f)))*f, s)
        # s.extend([0] * (100*60))
        # binConditions2.append([lambda x: x>maxBins, ">" + str(maxBins-1) + str("X")])
        # expDistData = map(lambda x: int(x), numpy.random.exponential(beta, 10000))

(("pmcViews", ()), (2011, 6), "PMC views"), (("maxCitations", ()), (2009, 3), "Citations") ] attributeNames = map(lambda x: x[0][0], attributeList) attributePrintNames = map(lambda x: x[2], attributeList) calls = map(lambda x: x[0], attributeList) stats = [] for ind, attr in zip(range(0, len(attributeList)), attributeList): call = attr[0] lowerBound = attr[1] attName = attr[0][0] valuesForMetric = filter(lambda x: x != None, map(lambda doc: applyCall(doc, call), SimpleDoc.getallBetween(lowerBound, None) )) minV, maxV, meanV, std = min(valuesForMetric), max(valuesForMetric), np.mean(valuesForMetric), np.std(valuesForMetric) stats.append((attName, call, meanV, std, len(valuesForMetric))) print attName + "\t" + "\t".join(map(lambda x: str(x), [minV, maxV, meanV, std])) statValues = [] for stat in stats: name = stat[0] call = stat[1] mean = stat[2] std = stat[3] numValues = stat[4]
"""
valuesForMetric = filter(lambda x: x != None, map(lambda doc: applyCall(doc, call), SimpleDoc.getallBetween(lowerBound, None)))
minV, maxV, meanV, std = min(valuesForMetric), max(valuesForMetric), np.mean(valuesForMetric), np.std(valuesForMetric)
print attName + "\t" + "\t".join(map(lambda x: str(x), [minV, maxV, meanV, std]))
"""

cat = "Biological Sciences"
"""consideredDocs = filter(
    lambda doc: doc.mendeleyDisciplines != None and cat in doc.mendeleyDisciplines,
    SimpleDoc.getallBetween((2012,6), (2012,8))
)"""
consideredDocs = SimpleDoc.getallBetween((2012,6), (2012,8))
print len(consideredDocs)

matrix = getAttributeValueMatrix(consideredDocs, calls)
corrs = correlationBetweenEverything(matrix, attributeNames)

"""f = open("foo", "w")
for corr in corrs:
    f.write(corr.toJson() + "\n")
f.close()"""

# corrs = CorrelationItem.fromFile("stuff/pairwise_corr_2012-6_2012-8.json")

f = open("foo", "w")
m = []
for a1 in attributeNames:
    row = []