from main.util.common import doForEachSimpleDoc, writeJsonToData, dataPath, readJsonFromData, formatHist
from os import path

import numpy

numTweetsPerUserFilename = "num_tweets_per_user.json"

if not path.isfile(dataPath(numTweetsPerUserFilename)):
    numTweetsPerUser = {}

    def getRelevantData(doc):
        # doc is a simple-doc tuple; index 2 holds the twitter data
        twitterData = doc[2]
        for tweet in twitterData:
            user = tweet[1]
            usersTotalTweets = numTweetsPerUser.get(user, 0) + 1
            numTweetsPerUser[user] = usersTotalTweets

    doForEachSimpleDoc(getRelevantData)
    writeJsonToData(numTweetsPerUser, numTweetsPerUserFilename)
else:
    numTweetsPerUser = readJsonFromData(numTweetsPerUserFilename)

hist = numpy.histogram(list(numTweetsPerUser.itervalues()), [1, 2, 3, 4, 5, 10, 20, 100, 500, 1000])

print "\n" * 3

# tweet histogram
print "Tweet Histogram:"
formatHist(hist[0], hist[1], 6)
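# A minimal sketch of what the histogram call above computes, assuming only
# numpy (the import above is reused). numpy.histogram returns a pair of
# (counts, bin_edges); with explicit edges, counts[i] is the number of values
# falling in [edges[i], edges[i+1]). The input values here are hypothetical
# tweets-per-user counts, not data from the repo.
exampleCounts = [1, 1, 2, 3, 7, 150]
counts, edges = numpy.histogram(exampleCounts, [1, 2, 3, 4, 5, 10, 20, 100, 500, 1000])
print counts  # [2 1 1 0 1 0 0 1 0]
print edges   # the ten bin edges passed in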
- publication date
"""

from main.util.common import doForEachPlosDoc, dataPath, readJsonFromData, doForEachDocInPath

import re
import json
from dateutil.parser import parse
import calendar

file = open(dataPath("relevant_document_data.json"), "w")

def findRelevantData(doc):
    doi = doc['doi']
    title = doc['title']
    twitterData = None  # list of [ "tweet text", user, retweetUser (None if not a retweet), timestamp ]
    citations = None  # [ timestamp, totalCitations ]
    mendeleyDisciplineList = None
    pubDate = doc['publication_date']
    mendeleyReaders = None
    issn = None
    issue = None
    volume = None
    pdfViews = None
    htmlViews = None
    citeULikeShares = None
from scipy import interpolate

import json

from main.util.common import dataPath

def interpolateDatapoint(timeline, time):
    # return the exact value for a timestamp if present, otherwise interpolate
    exactMatches = filter(lambda x: x[0] == time, timeline)
    if len(exactMatches) > 1:
        raise Exception("more than one exact match for a timepoint: " + str(exactMatches))
    elif len(exactMatches) == 1:
        return float(exactMatches[0][1])
    else:
        f = interpolate.interp1d(map(lambda x: x[0], timeline), map(lambda x: x[1], timeline))
        return float(f(time))

def addDummyPointIfNoDataAvailable(timeline, publicationDate, delayFromPubDate):
    # prepend a zero datapoint if the timeline starts after publicationDate + delayFromPubDate
    if timeline[0][0] > publicationDate + delayFromPubDate:
        timeline.insert(0, [publicationDate + delayFromPubDate, 0])

lines = open(dataPath("points_filtered_2.json"), "r")

for line in lines:
    doc = json.loads(line)
    docId = doc['id']
    publicationDate = doc['publication_date']
    timeline = doc['twitter-data']
    addDummyPointIfNoDataAvailable(timeline, publicationDate, -10*24*60*60)

    normalizedTimeline = []
    for dayDiff in [-10, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150]:
        timePoint = publicationDate + (dayDiff * 24*60*60)
        value = interpolateDatapoint(timeline, timePoint)
        normalizedTimeline.append([dayDiff, value])
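# A small usage sketch of interpolateDatapoint on a toy timeline of
# [timestamp, value] pairs (hypothetical data, not from the repo). Between
# known points, scipy's interp1d interpolates linearly by default, which is
# what the day-grid normalization above relies on.
toyTimeline = [[0, 0.0], [100, 10.0], [200, 40.0]]
print interpolateDatapoint(toyTimeline, 100)  # exact match -> 10.0
print interpolateDatapoint(toyTimeline, 50)   # halfway between 0.0 and 10.0 -> 5.0
print interpolateDatapoint(toyTimeline, 150)  # halfway between 10.0 and 40.0 -> 25.0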
""" from main.util.common import doForEachPlosDoc, dataPath import re import json crossrefs = {} def findRelevantData(doc): doi = doc['doi'] refs = [] for source in doc['sources']: if source['name'] == 'crossref': events = source['events'] if len(events) != 0: for event in [event['event'] for event in events]: issn = event.get('issn', None) referencingDoi = event.get('doi', None) publicationType = event.get('publication_type', None) refs.append({'doi' : referencingDoi, 'issn' : issn, 'type' : publicationType}) crossrefs[doi] = refs file = open(dataPath("crossrefs.json"), "w") doForEachPlosDoc(findRelevantData, verbose=True) file.write(json.dumps(crossrefs)) file.close()
import os

# assumption: SimpleDoc and the JSON helpers live in main.util.common, like
# the helpers imported by the other scripts in this repo
from main.util.common import SimpleDoc, dataPath, writeJsonToData, readJsonFromData

def userCorrelationToDiscipline():
    """
    First build user_disc_map:
    [
        user1 : [
            [mendDisc1_1, mendDisc1_2, ...],  # list of disciplines per tweet by this user
            [mendDisc2_1, mendDisc2_2, ...]
        ],
        user2: [ ... ]
    ]
    """
    if not os.path.isfile(dataPath("user_disc_map.json")):
        userDiscList = []
        for doc in SimpleDoc.getall():
            twitterUsers = [tweet.user for tweet in doc.tweets]
            disciplines = doc.mendeleyDisciplines

            if len(twitterUsers) != 0 and disciplines != None and len(disciplines) != 0:
                for twitterUser in twitterUsers:
                    userDiscList.append([twitterUser, disciplines])

        userDiscMap = {}
        for item in userDiscList:
            discList = userDiscMap.get(item[0], [])
            discList.append(item[1])
            userDiscMap[item[0]] = discList

        writeJsonToData(userDiscMap, "user_disc_map.json")
    else:
        userDiscMap = readJsonFromData("user_disc_map.json")

    """
    Then build user_disc_count_map:
    [
        user1 : {
            "total_posts" : n,
            "user_posts_in_desc" : {
                "disc1" : n_1,
                "disc2" : n_2,
                ...
            }
        },
        user2: { ... }
    ]
    """
    if not os.path.isfile(dataPath("user_disc_count_map.json")):
        userDiscCountMap = {}
        for user, descListList in userDiscMap.items():
            totalPosts = len(descListList)

            allUsersDesc = set()
            for descList in descListList:
                allUsersDesc |= set(descList)

            userPostsInDesc = {}
            for desc in allUsersDesc:
                postsInDesc = sum(1 for descList in descListList if desc in descList)
                userPostsInDesc[desc] = postsInDesc

            userDiscCountMap[user] = {"total_posts": totalPosts, "user_posts_in_desc": userPostsInDesc}

        writeJsonToData(userDiscCountMap, "user_disc_count_map.json")
    else:
        userDiscCountMap = readJsonFromData("user_disc_count_map.json")

    for user, userdata in userDiscCountMap.items():
        totalPosts = userdata['total_posts']
        relCounts = []
        for desc, count in userdata['user_posts_in_desc'].items():
            relCounts.append([desc, float(count)/totalPosts])
        relCounts = sorted(relCounts, key=lambda x: x[1], reverse=True)

        if totalPosts > 50:
            print user
            print relCounts
            print "\n\n"
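# A toy walk-through of the relative-count step above, on hypothetical data:
# a user with 4 posts, 3 of which carry "Biology" among their Mendeley
# disciplines and 1 "Physics", yields relative counts of 0.75 and 0.25.
toyCountMap = {"user_a": {"total_posts": 4, "user_posts_in_desc": {"Biology": 3, "Physics": 1}}}
for user, userdata in toyCountMap.items():
    relCounts = sorted(
        [[desc, float(count) / userdata["total_posts"]] for desc, count in userdata["user_posts_in_desc"].items()],
        key=lambda x: x[1], reverse=True)
    print user, relCounts  # user_a [['Biology', 0.75], ['Physics', 0.25]]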
""" from main.util.common import doForEachPlosDoc, dataPath import re import json mendeleyPublicationOutlets = [] def findRelevantData(doc): for source in doc['sources']: if source['name'] == 'mendeley': events = source['events'] if len(events) != 0: publicationOutlet = None issn = None if 'publication_outlet' in events: publicationOutlet = events['publication_outlet'] if 'identifiers' in events: if 'issn' in events['identifiers']: issn = events['identifiers']['issn'] mendeleyPublicationOutlets.append([publicationOutlet, issn]) file = open(dataPath("publication_outlets.json"), "w") doForEachPlosDoc(findRelevantData, verbose=True) file.write(json.dumps(mendeleyPublicationOutlets)) file.close()
import json

from matplotlib.pyplot import show
from hcluster import pdist, linkage, dendrogram, fcluster
import numpy
from numpy.random import rand

from main.util.common import dataPath

# load distance matrix
# Z = linkage(distanceMatrix)
# numpy.save("dendrogram.npy", Z)
# dendrogram(Z)
# show()

Z = numpy.load(dataPath("dendrogram.npy"))
dendrogram(Z)
show()

# cut the dendrogram into flat clusters at distance threshold 2
clu = fcluster(Z, 2, depth=5000, criterion='distance')

# count how many observations fall into each cluster
cluInstances = {}
for i in clu:
    cluInstances[i] = cluInstances.get(i, 0) + 1

# numpy.save(dataPath("clusters.npy"), clu)
# clu = numpy.load(dataPath("clusters.npy"))

# hist1 = numpy.histogram(list(cluInstances.itervalues()))
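# An equivalent, more idiomatic way to count cluster sizes than the manual
# dict loop above, sketched with the standard library only. fcluster returns
# one flat cluster id per observation, so counting the ids gives the size of
# each cluster; the ids below are hypothetical output, not real data.
from collections import Counter

exampleIds = [1, 1, 2, 3, 3, 3]
print Counter(exampleIds)  # Counter({3: 3, 1: 2, 2: 1})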
from main.util.common import doForEachPlosDoc, dataPath, readJsonFromData

import re
import json
from dateutil.parser import parse
import calendar

def timestr2timestamp(timestr):
    # reconstruction of the helper used below, assuming the parse/calendar
    # imports above: parse a date string into a UTC unix timestamp
    return calendar.timegm(parse(timestr).utctimetuple())

file = open(dataPath("document_timelines.json"), "w")

def findRelevantData(doc):
    doi = doc["doi"]
    pubDate = timestr2timestamp(doc["publication_date"])

    citeULikeTimeline = None
    pubmedTimeline = None
    scopusTimeline = None
    counterTimeline = None
    counterEvents = None
    pmcTimeline = None
    pmcEvents = None
    facebookTimeline = None
    mendeleyTimeline = None
    crossrefTimeline = None
    relativemetricTimeline = None
    twitterTimeline = None

    for source in doc["sources"]:
        if source["name"] == "citeulike":
            citeULikeTimeline = map(
                lambda item: [timestr2timestamp(item["update_date"]), item["total"]],
                source["histories"]
            )
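# A quick check of the timestamp helper defined above (the helper itself is a
# reconstruction; the date string here is purely illustrative).
print timestr2timestamp("2012-01-01T00:00:00Z")  # 1325376000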