def buildCryptoChangepointEvents(fromTime, toTime, currency):
    """
    Writes the distinct changepoints of a currency to cryptoChangepoints.json.

    Queries ``cryptoposts.changepoints`` for documents of ``currency`` whose
    ``changepoint`` lies in ``[fromTime, toTime]`` (both bounds inclusive),
    keeps only the first document seen for every distinct ``changepoint``
    value, and dumps the result as JSON to
    ``cryptoApp/timelinePlotter/static/cryptoChangepoints.json``.

    Arguments:
        fromTime -- Lower bound (inclusive) for the ``changepoint`` field.
        toTime -- Upper bound (inclusive) for the ``changepoint`` field.
        currency -- Value matched against the ``coin`` field.
    """
    validateMongoEnvironment()
    client = getMongoClient()
    # try/finally so the client is released even if the query, the
    # de-duplication or the file write raises.
    try:
        collection = client.cryptoposts.changepoints
        query = {
            "coin": currency,
            "changepoint": {
                "$gte": fromTime,
                "$lte": toTime
            }
        }
        returnedData = queryDatabase(collection, query)

        seenChangepoints = set()
        filteredData = []
        for data in returnedData:
            changepoint = data["changepoint"]
            if changepoint in seenChangepoints:
                continue
            seenChangepoints.add(changepoint)
            # NOTE(review): "_id" is overwritten before dumping, presumably
            # because the database id is not JSON serializable -- confirm.
            data["_id"] = "null"
            filteredData.append(data)

        with open('cryptoApp/timelinePlotter/static/cryptoChangepoints.json', 'w+') as outputFile:
            json.dump(filteredData, outputFile)
    finally:
        client.close()
def buildTimeline(client, seriesId):
    """
    Dumps all aggregation documents of one series to timeline.json.

    Looks up every document in ``reddit_data.aggregation`` whose ``seriesId``
    equals ``seriesId``, overwrites each document's ``_id`` with the string
    ``"null"`` and writes the documents as JSON to
    ``cryptoApp/timelinePlotter/static/timeline.json``.

    Arguments:
        client -- Mongo client used for the lookup.
        seriesId -- Value matched against the ``seriesId`` field.
    """
    aggregationCollection = client.reddit_data.aggregation
    documents = queryDatabase(aggregationCollection, {"seriesId": seriesId})

    # NOTE(review): presumably done because the database id cannot be
    # serialized by json.dump -- confirm.
    for document in documents:
        document["_id"] = "null"

    with open('cryptoApp/timelinePlotter/static/timeline.json', 'w+') as outputFile:
        json.dump(documents, outputFile)
def getChangepoints(client, startTime, endTime, currency):
    """
    Fetches the changepoints of a currency inside a time window.

    Arguments:
        client -- Mongo client used for the lookup.
        startTime -- Lower bound (inclusive) for the ``changepoint`` field.
        endTime -- Upper bound (inclusive) for the ``changepoint`` field.
        currency -- Value matched against the ``coin`` field.

    Returns:
        Whatever ``queryDatabase`` yields for ``cryptoposts.changepoints``
        and the constructed filter.
    """
    timeWindow = {"$gte": startTime, "$lte": endTime}
    changepointFilter = {"coin": currency, "changepoint": timeWindow}
    return queryDatabase(client.cryptoposts.changepoints, changepointFilter)
def buildSocialMediaEvents(client, seriesId):
    """
    Dumps all timeline events of one series to media_events.json.

    Looks up every document in ``reddit_data.timeline_events`` whose
    ``seriesId`` equals ``seriesId``, overwrites each document's ``_id`` with
    the string ``"null"`` and writes the documents as JSON to
    ``cryptoApp/timelinePlotter/static/media_events.json``.

    Arguments:
        client -- Mongo client used for the lookup.
        seriesId -- Value matched against the ``seriesId`` field.
    """
    eventFilter = {"seriesId": seriesId}
    events = queryDatabase(client.reddit_data.timeline_events, eventFilter)

    # NOTE(review): presumably done because the database id cannot be
    # serialized by json.dump -- confirm.
    for event in events:
        event["_id"] = "null"

    with open('cryptoApp/timelinePlotter/static/media_events.json', 'w+') as outputFile:
        json.dump(events, outputFile)
def getMediaEvents(client, seriesId, startTime, endTime):
    """
    Fetches the timeline events of a series inside a time window.

    Arguments:
        client -- Mongo client used for the lookup.
        seriesId -- Value matched against the ``seriesId`` field.
        startTime -- Lower bound (inclusive) for the ``time`` field.
        endTime -- Upper bound (inclusive) for the ``time`` field.

    Returns:
        Whatever ``queryDatabase`` yields for ``reddit_data.timeline_events``
        and the constructed filter.
    """
    timeWindow = {"$gte": startTime, "$lte": endTime}
    eventFilter = {"seriesId": seriesId, "time": timeWindow}
    return queryDatabase(client.reddit_data.timeline_events, eventFilter)
def castChangepointValuesToNumbers(client):
    """
    Converts string-typed changepoint documents to integer values in place.

    Selects every document in ``cryptoposts.changepoints`` whose
    ``changepoint`` field is stored as a string and rewrites its
    ``changepoint``, ``end`` and ``start`` fields as ``int``.

    Arguments:
        client -- Mongo client used for the lookup and the updates.

    Raises:
        ValueError -- If one of the three fields cannot be parsed by ``int``.
        KeyError -- If a selected document lacks one of the three fields.
    """
    collection = client.cryptoposts.changepoints
    query = {"changepoint": {"$type": "string"}}

    for changepoint in queryDatabase(collection, query):
        # Only the converted fields are sent in "$set": pushing the whole
        # fetched document back (including the immutable "_id") would
        # overwrite any unrelated field changed since the document was read.
        convertedFields = {
            "changepoint": int(changepoint["changepoint"]),
            "end": int(changepoint["end"]),
            "start": int(changepoint["start"]),
        }
        collection.update_one(
            {'_id': changepoint["_id"]},
            {"$set": convertedFields},
            upsert=False)
def getTimeline(timelineId, client, startTime, endTime):
    """
    Fetches the aggregation slots of a series that lie inside a time window.

    A document matches when its ``startTime`` is at or after ``startTime``
    and its ``endTime`` is at or before ``endTime``.

    Arguments:
        timelineId -- Value matched against the ``seriesId`` field.
        client -- Mongo client used for the lookup.
        startTime -- Lower bound (inclusive) for the ``startTime`` field.
        endTime -- Upper bound (inclusive) for the ``endTime`` field.

    Returns:
        Whatever ``queryDatabase`` yields for ``reddit_data.aggregation``
        and the constructed filter.
    """
    slotFilter = {
        "seriesId": timelineId,
        "startTime": {"$gte": startTime},
        "endTime": {"$lte": endTime},
    }
    return queryDatabase(client.reddit_data.aggregation, slotFilter)
def buildCryptoDataSeries(fromTime, toTime, currency):
    """
    Writes the stored data series of a currency to crypto.json.

    Queries ``cryptoposts.crypto`` for documents of ``currency`` whose
    ``time`` lies in ``[fromTime, toTime)`` (upper bound exclusive),
    overwrites each document's ``_id`` with the string ``"null"`` and dumps
    the documents as JSON to
    ``cryptoApp/timelinePlotter/static/crypto.json``.

    Arguments:
        fromTime -- Lower bound (inclusive) for the ``time`` field.
        toTime -- Upper bound (exclusive) for the ``time`` field.
        currency -- Value matched against the ``coin`` field.
    """
    validateMongoEnvironment()
    client = getMongoClient()
    # try/finally so the client is released even if the query or the file
    # write raises.
    try:
        collection = client.cryptoposts.crypto
        query = {"coin": currency, "time": {"$gte": fromTime, "$lt": toTime}}
        returnedData = queryDatabase(collection, query)

        # NOTE(review): presumably done because the database id cannot be
        # serialized by json.dump -- confirm.
        for data in returnedData:
            data["_id"] = "null"

        with open('cryptoApp/timelinePlotter/static/crypto.json', 'w+') as outputFile:
            json.dump(returnedData, outputFile)
    finally:
        client.close()
def getAggregation(mongoClient,
                   startTime,
                   endTime,
                   tag,
                   granularity=HOUR,
                   submissionWeight=3,
                   submissionScoreWeight=1,
                   commentWeight=2,
                   commentScoreWeight=1):
    """
    Gets data from mongoDB and aggregates it based on the provided weights and granularity.

    The range is walked in steps of ``granularity`` starting at ``startTime``.
    Slot boundaries are not clamped, so the last slot ends at
    ``startTime + k * granularity`` and may extend past ``endTime``. When
    ``startTime >= endTime`` no slot is produced and the list is empty.

    Arguments:
        mongoClient {pymongo.client} -- A mongoDB client to use for querying
        startTime {float} -- Unix time, start of range to query.
        endTime {float} -- Unix time, end of range to query.
        tag {string} -- The tag of the data. E.g 'bitcoin'

    Keyword Arguments:
        granularity {int} -- The step size in the time dimension to aggregate on (default: {HOUR})
        submissionWeight {int} -- The weight given to the existence of a submission (default: {3})
        submissionScoreWeight {int} -- The weight given to the score of a submission (default: {1})
        commentWeight {int} -- The weight given to the existence of a comment (default: {2})
        commentScoreWeight {int} -- The weight given to the score of a comment (default: {1})

    Returns:
        (string, list)
        string -- The generated uuid used to tag the timeline, corresponding to the seriesId in mongoDB
        list -- The produced aggregated data

    Raises:
        ValueError -- If granularity is not strictly positive.
    """
    # A zero step divides by zero below and a negative step never advances
    # startTime towards endTime, so the loop would spin forever.
    if granularity <= 0:
        raise ValueError(
            "granularity must be positive, got {}".format(granularity))

    aggregations = []
    aggregationId = str(uuid.uuid1())
    # Only used for the progress message; truncated, so it can be one lower
    # than the number of slots actually processed.
    slots = int((endTime - startTime) / granularity)
    currentSlot = 1

    while startTime < endTime:
        if currentSlot % 10 == 0:
            print("Aggregating slot {} of {}".format(currentSlot, slots))
        currentSlot += 1

        queryFromTime = startTime
        queryToTime = startTime + granularity

        # Two separate query objects are built on purpose: whether
        # queryDatabase mutates its query argument is not visible here.
        submissionQuery = buildQuery(queryFromTime, queryToTime, tag)
        commentQuery = buildQuery(queryFromTime, queryToTime, tag)

        submissions = queryDatabase(mongoClient.reddit_data.submissions,
                                    submissionQuery)
        comments = queryDatabase(mongoClient.reddit_data.comments,
                                 commentQuery)

        commentAggregation = getCommentAggregation(comments, commentWeight)
        commentScoreAggregation = getCommentScoreAggregation(
            comments, commentScoreWeight)
        submissionAggregation = getSubmissionAggregation(
            submissions, submissionWeight)
        submissionScoreAggregation = getSubmissionScoreAggregation(
            submissions, submissionScoreWeight)

        weightedSum = (commentAggregation + commentScoreAggregation +
                       submissionAggregation + submissionScoreAggregation)

        aggregation = {
            "startTime": queryFromTime,
            "endTime": queryToTime,
            "timeGranularity": granularity,
            "tag": tag,
            "seriesId": aggregationId,
            "comments": commentAggregation,
            "commentScores": commentScoreAggregation,
            "submissions": submissionAggregation,
            "submissionScores": submissionScoreAggregation,
            "sum": weightedSum
        }
        aggregations.append(aggregation)

        startTime = queryToTime

    return aggregationId, aggregations
import pymongo
import sys

# Ad-hoc inspection script: optionally scrapes one day of data, then prints
# that day's submissions from the "Bitcoin" subreddit, highest score first.

#                              YYYY  MM DD HH MM SS
# mktime interprets the naive datetime in the machine's local timezone.
startTime = int(mktime(datetime(2017, 10, 1, 0, 0, 0).timetuple()))
endTime = int(mktime(datetime(2017, 10, 1, 23, 59, 59).timetuple()))

currency = BITCOIN
tag = cryptocurrencies[currency]["tag"]

# Scraping only runs when explicitly requested on the command line.
if "--scrape" in sys.argv:
    runScraper(currency, startTime, endTime, 200, 2)

client = getMongoClient()
# try/finally so the client is released even if the query, the sort (e.g. a
# document without "score") or the printing raises.
try:
    collection = client.reddit_data.submissions
    query = {
        "timestamp": {
            "$gte": startTime,
            "$lte": endTime
        },
        "subreddit": "Bitcoin"
    }
    results = queryDatabase(collection, query)

    # Sort on score:
    results = sorted(results, key=lambda k: k["score"], reverse=True)
    pprint(results)
finally:
    client.close()