def save_to_collection():
    """Copy raw documents for every pseudo-entity into its topic collection.

    For each key of ``entity_pseudos`` the topic id is looked up in
    ``entity_topic``, the matching topic collection is checked/created, and
    the documents of ``raw_collection`` whose ``ENTITIES`` field matches a
    pseudo-entity of that key are copied into it.

    Relies on module-level names defined outside this function:
    ``entity_pseudos``, ``entity_topic``, ``topic_db``, ``raw_collection``.

    Raises:
        KeyError: if a key with at least one pseudo-entity has no entry in
            ``entity_topic`` (assuming it is a plain dict -- TODO confirm).
    """
    for lower_entity, pseudos in entity_pseudos.items():
        if not pseudos:
            # Nothing to copy for this key: skip it without touching the
            # topic mapping or creating a collection.
            continue

        # Everything below depends only on the key, not on the individual
        # pseudo-entity, so resolve it once per key rather than once per
        # pseudo-entity.
        topic_id = entity_topic[lower_entity]
        coll_name = TOPIC_COLLECTION_NAME(topic_id)
        check_or_create_collection(TOPIC_TWEETS_DB_NAME, coll_name,
                                   Collection.TOPIC)
        coll = topic_db[coll_name]

        for entity in pseudos:
            copy_into_collection(raw_collection.find({ENTITIES: entity}), coll)
def aggregate_urls(topic_ids):
    """Run the URL map-reduce over the collection of each given topic.

    For every id in ``topic_ids`` the topic collection and its URL
    aggregation result collection are checked/created, then ``map_reduce``
    is invoked on the topic collection with ``MAP_FUNCTION`` and
    ``REDUCE_FUNCTION``, writing to the result collection.

    Args:
        topic_ids: iterable of topic ids accepted by
            ``TOPIC_COLLECTION_NAME`` / ``TOPIC_URL_AGGR_COLLECTION_NAME``.
    """
    for topic_id in topic_ids:
        source_name = TOPIC_COLLECTION_NAME(topic_id)
        output_name = TOPIC_URL_AGGR_COLLECTION_NAME(topic_id)

        # Both the input and the output collection are checked before the
        # aggregation is started.
        check_or_create_collection(
            TOPIC_TWEETS_DB_NAME, source_name, Collection.TOPIC)
        check_or_create_collection(
            TOPIC_TWEETS_DB_NAME, output_name, Collection.URL_RESULT)

        topics_db[source_name].map_reduce(
            MAP_FUNCTION, REDUCE_FUNCTION, output_name)
import pytz from pymongo import MongoClient from utilities.constants import * from utilities.entities.Collection import Collection from utilities.miscellaneous import display_percentage from utilities.mongo import check_or_create_collection, insert_many from utilities.time_management import datetime, start_timing, stop_timing from utilities.os_util import dirname, get_dir, get_files_in_dir ENGINE_ROOT = dirname(get_dir(__file__)) TEMP_PATH = join(ENGINE_ROOT, EXTRACTOR_DIR, TEMP_DIR) check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME, Collection.TEMP) client = MongoClient() db = client[RAW_TWEETS_DB_NAME] collection = db[TEMP_RAW_COLLECTION_NAME] def is_retweet(tweet): return RETWEETED_STATUS in tweet.keys() def extract_data(file_path): # load json file into a list of dictionaries tweets = [] tweets_file = open(file_path, 'r') for line in tweets_file: try:
from utilities.constants import * from utilities.entities.Collection import Collection from utilities.mongo import check_or_create_collection, copy_into_collection from utilities.os_util import get_dir from utilities.time_management import get_today, get_date_string, start_timing, stop_timing ROOT = get_dir(__file__) JAVASCRIPT_PATH = join(ROOT, JAVASCRIPT_DIR) TODAY = get_today() TODAY_STRING = get_date_string(TODAY) COLLECTION_NAME = RAW_COLLECTION_PREFIX + TODAY_STRING RESULTS_COLLECTION_NAME = ENTITY_RESULTS_COLLECTION_PREFIX + TODAY_STRING check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME, Collection.TEMP) check_or_create_collection(RAW_TWEETS_DB_NAME, COLLECTION_NAME, Collection.RAW) check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RESULTS_COLLECTION_NAME, Collection.ENTITY_RESULT) check_or_create_collection(RAW_TWEETS_DB_NAME, RESULTS_COLLECTION_NAME, Collection.ENTITY_RESULT) client = MongoClient() db = client[RAW_TWEETS_DB_NAME] coll = db[COLLECTION_NAME] temp_raw = db[TEMP_RAW_COLLECTION_NAME] temp_results = db[TEMP_RESULTS_COLLECTION_NAME] def execute(): print 'Started Entity Aggregation... ',
from utilities.entities.Collection import Collection from utilities.mongo import check_or_create_collection, copy_into_collection from utilities.os_util import get_dir from utilities.time_management import get_today, get_date_string, start_timing, stop_timing ROOT = get_dir(__file__) JAVASCRIPT_PATH = join(ROOT, JAVASCRIPT_DIR) TODAY = get_today() TODAY_STRING = get_date_string(TODAY) COLLECTION_NAME = RAW_COLLECTION_PREFIX + TODAY_STRING RESULTS_COLLECTION_NAME = ENTITY_RESULTS_COLLECTION_PREFIX + TODAY_STRING check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME, Collection.TEMP) check_or_create_collection(RAW_TWEETS_DB_NAME, COLLECTION_NAME, Collection.RAW) check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RESULTS_COLLECTION_NAME, Collection.ENTITY_RESULT) check_or_create_collection(RAW_TWEETS_DB_NAME, RESULTS_COLLECTION_NAME, Collection.ENTITY_RESULT) client = MongoClient() db = client[RAW_TWEETS_DB_NAME] coll = db[COLLECTION_NAME] temp_raw = db[TEMP_RAW_COLLECTION_NAME] temp_results = db[TEMP_RESULTS_COLLECTION_NAME] def execute(): print 'Started Entity Aggregation... ', start_timing()