from main.util.common import doForEachSimpleDoc, writeJsonToData, dataPath, readJsonFromData, formatHist
from os import path
import numpy


numTweetsPerUserFilename = "num_tweets_per_user.json"
if not path.isfile(dataPath(numTweetsPerUserFilename)):
    numTweetsPerUser = {}

    def getRelevantData(doc):
        # count every tweet against its author; numTweetsPerUser is a
        # module-level dict that is only mutated, so no global is needed
        twitterData = doc[2]
        for tweet in twitterData:
            user = tweet[1]
            usersTotalTweets = numTweetsPerUser.get(user, 0) + 1
            numTweetsPerUser[user] = usersTotalTweets

    doForEachSimpleDoc(getRelevantData)

    writeJsonToData(numTweetsPerUser, numTweetsPerUserFilename)
else:
    numTweetsPerUser = readJsonFromData(numTweetsPerUserFilename)

# numpy.histogram returns (counts, bin_edges); with these explicit edges the
# buckets are [1,2), [2,3), [3,4), [4,5), [5,10), [10,20), [20,100), [100,500)
# and [500,1000] (the last bin is inclusive).
hist = numpy.histogram(list(numTweetsPerUser.itervalues()), [1, 2, 3, 4, 5, 10, 20, 100, 500, 1000])

print "\n" * 3

# Tweet Histogram
print "Tweet Histogram:"
formatHist(hist[0], hist[1], 6)
    - publication date

"""

from main.util.common import doForEachPlosDoc, dataPath, readJsonFromData, doForEachDocInPath
import re
import json
from dateutil.parser import parse
import calendar
from scipy import interpolate  # interp1d is used in interpolateDatapoint below

file = open(dataPath("relevant_document_data.json"), "w")

def findRelevantData(doc):
    doi = doc['doi']
    title = doc['title']
    twitterData = None # list of [ "tweet text", user, retweetUser (None if not a retweet), timestamp ]
    citations = None # [ timestamp, totalCitations ]
    mendeleyDisciplineList = None
    pubDate = doc['publication_date']
    mendeleyReaders = None
    issn = None
    issue = None
    volume = None
    pdfViews = None
    htmlViews = None
    citeULikeShares = None
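    # The function body is truncated in the source at this point. A plausible
    # completion (an assumption, mirroring the doc['sources'] iteration used by
    # the other extraction scripts in this repo) would fill the fields above
    # from the ALM source entries and write one JSON line per document, e.g.:
    #   for source in doc['sources']:
    #       if source['name'] == 'twitter':
    #           ...  # populate twitterData from source['events']
    #   file.write(json.dumps({'doi': doi, 'title': title}) + "\n")
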
def interpolateDatapoint(timeline, time):
    exactMatches = filter(lambda x: x[0] == time, timeline)
    if len(exactMatches) > 1:
        raise Exception("more than one exact match for a timepoint: " + str(exactMatches))
    elif len(exactMatches) == 1:
        return float(exactMatches[0][1])
    else:
        f = interpolate.interp1d(map(lambda x: x[0], timeline), map(lambda x: x[1], timeline))
        return float(f(time))
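
# Example (hypothetical data): with timeline = [[0, 0], [10, 100]],
# interpolateDatapoint(timeline, 5) linearly interpolates to 50.0, while
# interpolateDatapoint(timeline, 10) hits the exact point and returns 100.0.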

def addDummyPointIfNoDataAvailable(timeline, publicationDate, delayFromPubDate):
    if timeline[0][0] > publicationDate + delayFromPubDate:
        timeline.insert(0, [publicationDate + delayFromPubDate, 0])

lines = open(dataPath("points_filtered_2.json"), "r")

for line in lines:
    doc = json.loads(line)

    docId = doc['id']
    publicationDate = doc['publication_date']
    timeline = doc['twitter-data']

    addDummyPointIfNoDataAvailable(timeline, publicationDate, -10*24*60*60)

    normalizedTimeline = []
    for dayDiff in range(-10, 151, 10):
        timePoint = publicationDate + (dayDiff * 24*60*60)
        value = interpolateDatapoint(timeline, timePoint)
        normalizedTimeline.append([dayDiff, value])
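
    # The script is truncated here in the source; presumably each document's
    # normalizedTimeline would then be written out, e.g. (assumption):
    #   outFile.write(json.dumps([docId, normalizedTimeline]) + "\n")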
"""

from main.util.common import doForEachPlosDoc, dataPath
import re
import json

crossrefs = {}
def findRelevantData(doc):
    doi = doc['doi']
    refs = []

    for source in doc['sources']:
        if source['name'] == 'crossref':
            events = source['events']
            if len(events) != 0:
                for event in [e['event'] for e in events]:
                    issn = event.get('issn', None)
                    referencingDoi = event.get('doi', None)
                    publicationType = event.get('publication_type', None)
                    
                    refs.append({'doi' : referencingDoi, 'issn' : issn, 'type' : publicationType})

    crossrefs[doi] = refs


file = open(dataPath("crossrefs.json"), "w")
doForEachPlosDoc(findRelevantData, verbose=True)
file.write(json.dumps(crossrefs))
file.close()
import os

# SimpleDoc is assumed to live in main.util.common alongside the other helpers
# imported throughout this repo.
from main.util.common import SimpleDoc, writeJsonToData, readJsonFromData, dataPath


def userCorrelationToDiscipline():
    """
    first build user_disc_map:
    [ user1 : [
        [mendDisc1_1, mendDisc1_2, ...], // list of disciplines per tweet by this user
        [mendDisc2_1, mendDisc2_2, ...]
    ], user2: [
        ...
    ] ]
    """
    if not os.path.isfile(dataPath("user_disc_map.json")):
        userDiscList = []

        for doc in SimpleDoc.getall():
            twitterUsers = [tweet.user for tweet in doc.tweets]
            disciplines = doc.mendeleyDisciplines
            if len(twitterUsers) != 0 and disciplines is not None and len(disciplines) != 0:
                for twitterUser in twitterUsers:
                    userDiscList.append([twitterUser, disciplines])
        
        userDiscMap = {}
        for item in userDiscList:
            discList = userDiscMap.get(item[0], [])
            discList.append(item[1])
            userDiscMap[item[0]] = discList

        writeJsonToData(userDiscMap, "user_disc_map.json")
    else:
        userDiscMap = readJsonFromData("user_disc_map.json")


    """
    dann "user_disc_count_map" erstellen:
    [ user1 : { 
        "total_posts" : n,
        "user_posts_in_desc" : {
            "disc1" : n_1,
            "disc2" : n_2, 
            ...
        }
    }, user2: {
        ...
    } ]
    """
    if not os.path.isfile(dataPath("user_disc_count_map.json")):
        userDiscCountMap = { }
        for user, descListList in userDiscMap.items():
            totalPosts = len(descListList)
            allUsersDesc = set()
            for descList in descListList:
                allUsersDesc |= set(descList)

            userPostsInDesc = { }
            for desc in allUsersDesc:
                postsInDesc = sum(1 for descList in descListList if desc in descList)
                userPostsInDesc[desc] = postsInDesc

            userDiscCountMap[user] = { "total_posts" : totalPosts, "user_posts_in_desc" : userPostsInDesc }

        writeJsonToData(userDiscCountMap, "user_disc_count_map.json")
    else:
        userDiscCountMap = readJsonFromData("user_disc_count_map.json")

    for user, userdata in userDiscCountMap.items():
        totalPosts = userdata['total_posts']

        relCounts = []
        for desc, count in userdata['user_posts_in_desc'].items():
            relCounts.append([desc, float(count)/totalPosts])

        relCounts = sorted(relCounts, key=lambda x: x[1], reverse=True)

        if totalPosts > 50:
            print user
            print relCounts
            print "\n\n"
"""

from main.util.common import doForEachPlosDoc, dataPath
import re
import json

mendeleyPublicationOutlets = []
def findRelevantData(doc):
    for source in doc['sources']:
        if source['name'] == 'mendeley':
            events = source['events']
            if len(events) != 0:
                publicationOutlet = None
                issn = None

                if 'publication_outlet' in events:
                    publicationOutlet = events['publication_outlet']

                if 'identifiers' in events:
                    if 'issn' in events['identifiers']:
                        issn = events['identifiers']['issn']

                mendeleyPublicationOutlets.append([publicationOutlet, issn])


file = open(dataPath("publication_outlets.json"), "w")
doForEachPlosDoc(findRelevantData, verbose=True)
file.write(json.dumps(mendeleyPublicationOutlets))
file.close()
from matplotlib.pyplot import show

from hcluster import linkage, dendrogram, fcluster
import numpy
from main.util.common import dataPath

# The linkage matrix was computed once from the distance matrix and cached:
# Z = linkage(distanceMatrix)
# numpy.save("dendrogram.npy", Z)
# dendrogram(Z)
# show()

Z = numpy.load(dataPath("dendrogram.npy"))

dendrogram(Z)
show()

# Flat clusters from the cached linkage matrix; note that with
# criterion='distance' the threshold is 2 and the depth argument is ignored
# (depth only applies to the 'inconsistent' criterion).
clu = fcluster(Z, 2, depth=5000, criterion='distance')

cluInstances = {}  # cluster id -> number of members
for i in clu:
    cluInstances[i] = cluInstances.get(i, 0) + 1

# numpy.save(dataPath("clusters.npy"), clu)
# clu = numpy.load(dataPath("clusters.npy"))

"""hist1 = numpy.histogram(list(cluInstances.itervalues()))
from main.util.common import doForEachPlosDoc, dataPath, readJsonFromData
import re
import json
from dateutil.parser import parse
import calendar
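
# Helper assumed from context (not defined in this excerpt): convert an ISO
# date string into a UTC Unix timestamp using the imports above.
def timestr2timestamp(timestr):
    return calendar.timegm(parse(timestr).utctimetuple())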

file = open(dataPath("document_timelines.json"), "w")


def findRelevantData(doc):
    doi = doc["doi"]
    pubDate = timestr2timestamp(doc["publication_date"])
    citeULikeTimeline = None
    pubmedTimeline = None
    scopusTimeline = None
    counterTimeline = None
    counterEvents = None
    pmcTimeline = None
    pmcEvents = None
    facebookTimeline = None
    mendeleyTimeline = None
    crossrefTimeline = None
    relativemetricTimeline = None
    twitterTimeline = None

    for source in doc["sources"]:
        if source["name"] == "citeulike":
            citeULikeTimeline = map(
                lambda item: [timestr2timestamp(item["update_date"]), item["total"]], source["histories"]
            )