"""
Apply the LDA algorithm using libraries.

Use the params in the config file.
Save the LDA model.
"""
from os.path import join

from gensim import corpora, models

from utilities.config import NUMBER_OF_TOPICS, NUMBER_OF_PASSES, ALPHA
from utilities.constants import *
from utilities.os_util import get_dir
from utilities.time_management import start_timing, stop_timing, get_time, get_today, get_date_string

# Every artefact name below embeds TODAY_STRING, so each path is tied to the
# date on which this module is imported.
TODAY = get_today()
TODAY_STRING = get_date_string(TODAY)

# NOTE(review): DATA_DIR, the *_PREFIX names and the DICT / MM / LDA suffixes
# are not defined here; presumably they come from the star import of
# utilities.constants -- confirm.
ROOT = get_dir(__file__)
DICTIONARY_PATH = join(ROOT, DATA_DIR, DICTIONARY_PREFIX + TODAY_STRING + DICT)
CORPUS_PATH = join(ROOT, DATA_DIR, CORPUS_PREFIX + TODAY_STRING + MM)
LDA_PATH = join(ROOT, DATA_DIR, LDA_MODEL_PREFIX + TODAY_STRING + LDA)

# Both inputs are opened at import time, so importing this module requires
# today's corpus and dictionary files to exist already.
CORPUS = corpora.MmCorpus(CORPUS_PATH)
DICTIONARY = corpora.Dictionary.load(DICTIONARY_PATH)


def execute():
    # NOTE(review): only the first statement of this function is visible in
    # this chunk; the rest of the body lies outside the view.
    # The trailing comma is the Python 2 print-statement form that suppresses
    # the newline, so later output continues on the same line.
    print 'Started LDA at ' + get_time() + '... ',
"""
# NOTE(review): the triple quote above closes a module docstring whose opening
# lies outside the visible chunk.
import os
from os.path import join

import pytz
from pymongo import MongoClient

from utilities.config import DAY_START, APPROXIMATION_RANGE, NUMBER_OF_TOPICS
from utilities.constants import *
from utilities.os_util import get_dir, get_files_in_dir
from utilities.time_management import get_today, get_prev_day, get_datetime_from_string, get_next_day, \
    get_date_time_string, convert_datetime_to_local, localize_datetime, get_differenced_day

# get_dir is applied three times to __file__; presumably this walks up to the
# project root -- confirm against utilities.os_util.
PROJECT_ROOT = get_dir(get_dir(get_dir(__file__)))
TSV_DIR_PATH = join(PROJECT_ROOT, WEBSITE_DIR, STATIC_DIR, TSV_DIR)

# NOTE(review): hard-coded offset of -16 (presumably days relative to today);
# the trailing comment preserves a previous-day alternative. The print below
# runs at import time and looks like leftover debugging output -- confirm
# before relying on either.
START_DATE = get_differenced_day(get_today(), -16)  #get_prev_day(get_today())
print START_DATE

# MongoClient() is called with no arguments, so pymongo's default host and
# port are used.
client = MongoClient()
topics_db = client['tweets']
entities = []


def remove_previous_data():
    """Delete every file that get_files_in_dir reports for TSV_DIR_PATH.

    Each returned entry is joined onto TSV_DIR_PATH before removal, so the
    entries are presumably bare file names (not full paths) -- confirm against
    utilities.os_util.get_files_in_dir. Errors from os.remove are not caught
    here and propagate to the caller.
    """
    tsv_files = get_files_in_dir(TSV_DIR_PATH, TSV)
    for tsv_file in tsv_files:
        os.remove(join(TSV_DIR_PATH, tsv_file))


def init_writer(tid):
    # NOTE(review): only the first statement of this function is visible in
    # this chunk; the rest of the body lies outside the view.
    # Builds the file name for topic `tid` from the shared prefix and suffix.
    filename = TOPIC_FILE_PREFIX + str(tid) + TSV
from os.path import join

from bson.code import Code
from pymongo import MongoClient

from utilities.constants import *
from utilities.entities.Collection import Collection
from utilities.mongo import check_or_create_collection, copy_into_collection
from utilities.os_util import get_dir
from utilities.time_management import get_today, get_date_string, start_timing, stop_timing

# JAVASCRIPT_DIR resolved relative to whatever get_dir returns for this file.
ROOT = get_dir(__file__)
JAVASCRIPT_PATH = join(ROOT, JAVASCRIPT_DIR)

# Collection names are stamped with today's date string.
TODAY = get_today()
TODAY_STRING = get_date_string(TODAY)
COLLECTION_NAME = RAW_COLLECTION_PREFIX + TODAY_STRING
RESULTS_COLLECTION_NAME = ENTITY_RESULTS_COLLECTION_PREFIX + TODAY_STRING

# Check-or-create each collection this module works with, in a fixed order:
# temporary raw, today's raw, temporary results, today's results.
for _collection_name, _collection_kind in (
        (TEMP_RAW_COLLECTION_NAME, Collection.TEMP),
        (COLLECTION_NAME, Collection.RAW),
        (TEMP_RESULTS_COLLECTION_NAME, Collection.ENTITY_RESULT),
        (RESULTS_COLLECTION_NAME, Collection.ENTITY_RESULT),
):
    check_or_create_collection(RAW_TWEETS_DB_NAME, _collection_name, _collection_kind)
# Drop the loop temporaries so the module namespace stays unchanged.
del _collection_name, _collection_kind

# Handles on the raw-tweets database and on today's raw collection.
client = MongoClient()
db = client[RAW_TWEETS_DB_NAME]
coll = db[COLLECTION_NAME]