# --- LDA model training module (collapsed/truncated chunk) -------------------
# NOTE(review): this line is an entire Python 2 module (note the `print` statement)
# collapsed onto one physical line; the leading `Save the LDA model. """` is the
# tail of a module docstring whose opening quotes were cut off, and the chunk is
# TRUNCATED mid-call at `models.LdaModel(CORPUS, id2word=DICTIONARY,` — the
# remaining LdaModel kwargs (presumably num_topics=NUMBER_OF_TOPICS,
# passes=NUMBER_OF_PASSES, alpha=ALPHA, given the config imports) and the
# model-save step named in the docstring are not visible. TODO: recover the
# original line breaks before editing logic.
# Visible behavior: at import time it builds today's dated file paths
# (dictionary .dict, corpus .mm, model output) under <module dir>/DATA_DIR and
# eagerly loads the MmCorpus and Dictionary from disk; `execute()` prints a
# start timestamp, starts a timer, and begins constructing a gensim LdaModel
# from that corpus/dictionary.
Save the LDA model. """ from os.path import join from gensim import corpora, models from utilities.config import NUMBER_OF_TOPICS, NUMBER_OF_PASSES, ALPHA from utilities.constants import * from utilities.os_util import get_dir from utilities.time_management import start_timing, stop_timing, get_time, get_today, get_date_string TODAY = get_today() TODAY_STRING = get_date_string(TODAY) ROOT = get_dir(__file__) DICTIONARY_PATH = join(ROOT, DATA_DIR, DICTIONARY_PREFIX + TODAY_STRING + DICT) CORPUS_PATH = join(ROOT, DATA_DIR, CORPUS_PREFIX + TODAY_STRING + MM) LDA_PATH = join(ROOT, DATA_DIR, LDA_MODEL_PREFIX + TODAY_STRING + LDA) CORPUS = corpora.MmCorpus(CORPUS_PATH) DICTIONARY = corpora.Dictionary.load(DICTIONARY_PATH) def execute(): print 'Started LDA at ' + get_time() + '... ', start_timing() lda = models.LdaModel(CORPUS, id2word=DICTIONARY,
# --- Twitter stream collector module (collapsed/truncated chunk) -------------
# NOTE(review): another whole module collapsed onto one physical line and
# TRUNCATED mid-call inside `change_file()` at `get_filename(TEMP_PATH,` —
# the rename target arguments and the rest of the file-rotation logic (and the
# tweepy StreamListener subclass the imports suggest) are not visible.
# Visible behavior: sets up tweet counters and DATA_DIR/TEMP_DIR paths;
# `get_filename(directory, number)` builds a numbered JSON filename from the
# DATA_FILE_PREFIX / FILE_NAME_FORMATTER / JSON constants; `change_file()`
# closes the current output file (module-global `tweets_file`) and starts
# renaming it into TEMP_PATH — presumably rotating completed tweet batches for
# a downstream extractor; confirm against the missing tail.
# NOTE(review): `join` and `rename` are used but their imports (`os.path.join`,
# `os.rename`) are not visible here — likely pulled in via the star imports or
# an earlier truncated import line; verify against the original file.
from tweepy import OAuthHandler, Stream from tweepy.streaming import StreamListener from utilities.config import * from utilities.constants import * from utilities.miscellaneous import is_json from utilities.os_util import get_dir from utilities.time_management import get_time display_number = DISPLAY_COMPLETED_TWEETS_INTERVAL file_number = 1 tweets_cnt = 0 total_tweets_cnt = 0 ROOT = get_dir(__file__) FILE_PATH = join(ROOT, DATA_DIR) TEMP_PATH = join(ROOT, TEMP_DIR) def get_filename(directory, number): return join(directory, DATA_FILE_PREFIX + FILE_NAME_FORMATTER % number + JSON) def change_file(): global tweets_file, file_name, file_number tweets_file.close() rename(file_name, get_filename(TEMP_PATH,
# --- Raw-tweet extractor module (collapsed/truncated chunk) ------------------
# NOTE(review): whole module collapsed onto one physical line and TRUNCATED
# immediately after `tweets = []` inside `extract_data(file_path)` — the JSON
# parsing loop and the Mongo insert (the `insert_many` / `is_json` imports
# suggest both) are not visible.
# Visible behavior: at import time, ensures the temp raw-tweet collection
# exists (check_or_create_collection) and opens a module-level MongoClient on
# RAW_TWEETS_DB_NAME / TEMP_RAW_COLLECTION_NAME; TEMP_PATH points at the
# extractor's TEMP_DIR one directory above this module. `is_retweet(tweet)`
# reports whether a tweet dict carries the RETWEETED_STATUS key, i.e. is a
# retweet. `extract_data` begins by accumulating parsed tweets into a list.
# NOTE(review): module-level MongoClient() means importing this file opens a
# DB connection as a side effect — worth confirming that is intentional.
import json from os import remove from os.path import join import pytz from pymongo import MongoClient from utilities.constants import * from utilities.entities.Collection import Collection from utilities.miscellaneous import display_percentage from utilities.mongo import check_or_create_collection, insert_many from utilities.time_management import datetime, start_timing, stop_timing from utilities.os_util import dirname, get_dir, get_files_in_dir ENGINE_ROOT = dirname(get_dir(__file__)) TEMP_PATH = join(ENGINE_ROOT, EXTRACTOR_DIR, TEMP_DIR) check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME, Collection.TEMP) client = MongoClient() db = client[RAW_TWEETS_DB_NAME] collection = db[TEMP_RAW_COLLECTION_NAME] def is_retweet(tweet): return RETWEETED_STATUS in tweet.keys() def extract_data(file_path): # load json file into a list of dictionaries tweets = []
# --- Topic word-cloud generator module (collapsed/truncated chunk) -----------
# NOTE(review): whole module collapsed onto one physical line; the chunk ends
# exactly at the last statement of `normalize`'s second loop, so a trailing
# `return` (or further module code such as the pytagcloud rendering the imports
# imply) may have been cut off — TODO: recover the original line breaks before
# restructuring.
# Visible behavior: at import time, loads today's gensim LdaModel from
# DATA_DIR and computes output paths for model data and the website's
# word-cloud directory. `normalize(arr)` rescales the sequence IN PLACE so its
# elements sum to 1 (presumably topic/word weights for tag sizing).
# NOTE(review) defects to fix once the full file is available:
#   * `sum` shadows the builtin; use the builtin `sum(arr)` with a fresh name.
#   * Python 2 `/` floors for int inputs and raises ZeroDivisionError on an
#     all-zero array — convert the total to float and guard zero.
#   * `join` is used (LDA_PATH etc.) with no visible import — likely leaked in
#     via `from utilities.constants import *`; make the import explicit.
from string import Template from gensim import models from pytagcloud import create_tag_image, create_html_data, make_tags, \ LAYOUT_HORIZONTAL, LAYOUTS from pytagcloud.colors import COLOR_SCHEMES from pytagcloud.lang.counter import get_tag_counts from utilities.constants import * from utilities.os_util import get_dir from utilities.time_management import get_prev_day, start_timing, stop_timing, get_today, get_date_string, get_time, get_differenced_day TODAY = get_today() TODAY_STRING = get_date_string(TODAY) ROOT = get_dir(__file__) PROJECT_ROOT = get_dir(get_dir(ROOT)) LDA_PATH = join(ROOT, DATA_DIR, LDA_MODEL_PREFIX + TODAY_STRING + LDA) MODEL_DATA_PATH = join(ROOT, MODEL_DATA_DIR) WORDCLOUD_PATH = join(PROJECT_ROOT, WEBSITE_DIR, STATIC_DIR, WORDCLOUD_DIR) LDA_MODEL = models.LdaModel.load(LDA_PATH) def normalize(arr): sum = 0 for i in arr: sum += i for i in range(len(arr)): arr[i] = arr[i]/sum
# --- Topic time-series TSV exporter module (collapsed/truncated chunk) -------
# NOTE(review): whole module collapsed onto one physical line; the leading
# `Total 24*60 data points Stores in a .tsv file """` is the tail of a module
# docstring whose opening quotes were cut off (it promises one data point per
# minute of a day, written to .tsv). The chunk ends exactly at the last
# statement of `remove_previous_data`, so later module code (the export loop
# the imports imply) is not visible — recover line breaks before editing.
# Visible behavior: at import time, computes the website's static TSV output
# dir, sets START_DATE to 16 days before today, prints it (Python 2 `print`
# statement), and opens a module-level MongoClient against the 'tweets' DB.
# `remove_previous_data()` deletes every .tsv file in TSV_DIR_PATH so a fresh
# export can be written.
# NOTE(review): the magic `-16` (with the commented-out `get_prev_day`
# alternative left inline) looks like a temporary backfill setting — confirm,
# then promote to a named config constant; the hard-coded 'tweets' DB name
# bypasses the constants module the rest of the codebase imports.
Total 24*60 data points Stores in a .tsv file """ import os from os.path import join import pytz from pymongo import MongoClient from utilities.config import DAY_START, APPROXIMATION_RANGE, NUMBER_OF_TOPICS from utilities.constants import * from utilities.os_util import get_dir, get_files_in_dir from utilities.time_management import get_today, get_prev_day, get_datetime_from_string, get_next_day, \ get_date_time_string, convert_datetime_to_local, localize_datetime, get_differenced_day PROJECT_ROOT = get_dir(get_dir(get_dir(__file__))) TSV_DIR_PATH = join(PROJECT_ROOT, WEBSITE_DIR, STATIC_DIR, TSV_DIR) START_DATE = get_differenced_day(get_today(), -16) #get_prev_day(get_today()) print START_DATE client = MongoClient() topics_db = client['tweets'] entities = [] def remove_previous_data(): tsv_files = get_files_in_dir(TSV_DIR_PATH, TSV) for tsv_file in tsv_files: os.remove(join(TSV_DIR_PATH, tsv_file))