Apply the LDA algorithm using libraries. Use the params in the config file. Save the LDA model. """ from os.path import join from gensim import corpora, models from utilities.config import NUMBER_OF_TOPICS, NUMBER_OF_PASSES, ALPHA from utilities.constants import * from utilities.os_util import get_dir from utilities.time_management import start_timing, stop_timing, get_time, get_today, get_date_string TODAY = get_today() TODAY_STRING = get_date_string(TODAY) ROOT = get_dir(__file__) DICTIONARY_PATH = join(ROOT, DATA_DIR, DICTIONARY_PREFIX + TODAY_STRING + DICT) CORPUS_PATH = join(ROOT, DATA_DIR, CORPUS_PREFIX + TODAY_STRING + MM) LDA_PATH = join(ROOT, DATA_DIR, LDA_MODEL_PREFIX + TODAY_STRING + LDA) CORPUS = corpora.MmCorpus(CORPUS_PATH) DICTIONARY = corpora.Dictionary.load(DICTIONARY_PATH) def execute(): print 'Started LDA at ' + get_time() + '... ', start_timing()
Perform cleaning Create and save dictionary and corpus """ from os.path import join from gensim import corpora from pymongo import MongoClient, DESCENDING from utilities.cleaner import clean from utilities.config import NUMBER_OF_TOP_ENTITIES, TWEET_POOLING_SIZE from utilities.constants import * from utilities.os_util import get_dir from utilities.time_management import get_prev_day, get_today, get_date_string, start_timing, stop_timing TODAY = get_today() TODAY_STRING = get_date_string(TODAY) COLLECTION_DAY = TODAY #get_prev_day(TODAY) COLLECTION_DAY_STRING = get_date_string(COLLECTION_DAY) ROOT = get_dir(__file__) DICTIONARY_PATH = join(ROOT, DATA_DIR, DICTIONARY_PREFIX + TODAY_STRING + DICT) CORPUS_PATH = join(ROOT, DATA_DIR, CORPUS_PREFIX + TODAY_STRING + MM) COLLECTION_NAME = RAW_COLLECTION_PREFIX + COLLECTION_DAY_STRING RESULTS_COLLECTION_NAME = ENTITY_RESULTS_COLLECTION_PREFIX + COLLECTION_DAY_STRING client = MongoClient() raw_db = client[RAW_TWEETS_DB_NAME] raw_collection = raw_db[COLLECTION_NAME] results_coll = raw_db[RESULTS_COLLECTION_NAME]
from os.path import join

from pymongo import MongoClient

from utilities.constants import *
from utilities.entities.Collection import Collection
from utilities.mongo import check_or_create_collection, copy_into_collection
from utilities.os_util import get_dir
from utilities.time_management import (
    get_today,
    get_date_string,
    start_timing,
    stop_timing,
)

ROOT = get_dir(__file__)
JAVASCRIPT_PATH = join(ROOT, JAVASCRIPT_DIR)

# Run date; its string form is the suffix of the per-day collection names.
TODAY = get_today()
TODAY_STRING = get_date_string(TODAY)

COLLECTION_NAME = RAW_COLLECTION_PREFIX + TODAY_STRING
RESULTS_COLLECTION_NAME = ENTITY_RESULTS_COLLECTION_PREFIX + TODAY_STRING

# Executed at import time, in this order, for the temp and per-day raw and
# entity-result collections.
# NOTE(review): presumably creates any collection that is missing - confirm
# against utilities.mongo.check_or_create_collection.
check_or_create_collection(
    RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME, Collection.TEMP)
check_or_create_collection(
    RAW_TWEETS_DB_NAME, COLLECTION_NAME, Collection.RAW)
check_or_create_collection(
    RAW_TWEETS_DB_NAME, TEMP_RESULTS_COLLECTION_NAME, Collection.ENTITY_RESULT)
check_or_create_collection(
    RAW_TWEETS_DB_NAME, RESULTS_COLLECTION_NAME, Collection.ENTITY_RESULT)

# Default MongoClient connection; handles on today's raw collection and the
# temporary raw collection.
client = MongoClient()
db = client[RAW_TWEETS_DB_NAME]
coll = db[COLLECTION_NAME]
temp_raw = db[TEMP_RAW_COLLECTION_NAME]
import io
from os.path import join

from gensim import models, corpora
from pymongo import MongoClient, DESCENDING

from utilities.cleaner import clean
from utilities.constants import *
from utilities.config import NUMBER_OF_TOP_ENTITIES, NUMBER_OF_TOPICS
from utilities.entities.Collection import Collection
from utilities.mongo import check_or_create_collection, copy_into_collection
from utilities.os_util import get_dir
from utilities.time_management import (
    get_prev_day,
    start_timing,
    stop_timing,
    get_today,
    get_date_string,
    get_time,
    get_differenced_day,
)

# Run date, used for the artefact file names.
TODAY = get_today()
TODAY_STRING = get_date_string(TODAY)

# Day whose collections are read; currently the same as TODAY.
COLLECTION_DAY = TODAY  # alternative left by the author: get_prev_day(TODAY)
COLLECTION_DAY_STRING = get_date_string(COLLECTION_DAY)

# Artefacts live under <ROOT>/<DATA_DIR>, named <prefix><date><suffix>.
ROOT = get_dir(__file__)
DICTIONARY_PATH = join(ROOT, DATA_DIR, DICTIONARY_PREFIX + TODAY_STRING + DICT)
CORPUS_PATH = join(ROOT, DATA_DIR, CORPUS_PREFIX + TODAY_STRING + MM)
LDA_PATH = join(ROOT, DATA_DIR, LDA_MODEL_PREFIX + TODAY_STRING + LDA)
MODEL_DATA_PATH = join(ROOT, MODEL_DATA_DIR)

# Loaded once at import time: corpus, dictionary and the saved LDA model.
CORPUS = corpora.MmCorpus(CORPUS_PATH)
DICTIONARY = corpora.Dictionary.load(DICTIONARY_PATH)
LDA_MODEL = models.LdaModel.load(LDA_PATH)

# Mongo collection names are keyed by the collection day, not the run date.
COLLECTION_NAME = RAW_COLLECTION_PREFIX + COLLECTION_DAY_STRING
RESULTS_COLLECTION_NAME = ENTITY_RESULTS_COLLECTION_PREFIX + COLLECTION_DAY_STRING