def get_nouns(clusters):
    cur = stats.get_main_cursor(DB_DIR)

    cl_nouns = get_cluster_nouns(clusters)
    nouns = stats.get_nouns(cur, cl_nouns)

    return nouns
def main(): logging.info("start") parser = util.get_dates_range_parser() args = parser.parse_args() ind = Indexer(DB_DIR) cur = stats.get_main_cursor(DB_DIR) cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db") used_nouns = get_used_nouns(cur) nouns = stats.get_nouns(cur, used_nouns) noun_trend = stats.get_noun_trend(cur) nouns[total_md5] = "__total__" noun_trend[total_md5] = 0.0 logging.info("nouns len %s" % len(nouns)) logging.info("get sim_dict") sim_dict = get_sims(cur) cl = get_clusters(sim_dict, nouns, noun_trend) json.dump(cl, open("./clusters_raw.json", "w"), indent=2) logging.info("Done")
def main(): logging.info("Start") parser = util.get_dates_range_parser() parser.add_argument("-i", "--in-file") args = parser.parse_args() ind = Indexer(DB_DIR) cur = stats.get_main_cursor(DB_DIR) stats.create_given_tables(cur, ["noun_similarity"]) cur.execute( "create table if not exists noun_sim_new as select * from noun_similarity limit 0" ) cur.execute("delete from noun_sim_new") in_file = open(args.in_file, 'r') sims = [] for line in in_file: sims.append(line.split(";")) if len(sims) > 20000: save_sims(cur, sims) sims = [] save_sims(cur, sims) cur.execute("begin transaction") cur.execute("delete from noun_similarity") cur.execute( "insert or ignore into noun_similarity select * from noun_sim_new") cur.execute("commit") logging.info("Done")
def main(): logging.info("start") parser = util.get_dates_range_parser() parser.add_argument("-i") args = parser.parse_args() ind = Indexer(DB_DIR) cur = stats.get_main_cursor(DB_DIR) cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db") words_db = DB_DIR + "/tweets_lemma.db" bigram_db = DB_DIR + "/tweets_bigram.db" used_nouns = get_used_nouns(cur) total_md5 = util.digest("__total__") nouns = stats.get_nouns(cur, used_nouns) noun_trend = stats.get_noun_trend(cur) nouns[total_md5] = "__total__" noun_trend["__total__"] = 0.0 logging.info("nouns len %s" % len(nouns)) post_cnt = stats.get_noun_cnt(cur_word_cnt) post_cnt[total_md5] = 0 logging.info("get sim_dict") sim_dict = get_sims(cur) cl = get_clusters(args, sim_dict, nouns, noun_trend, post_cnt) json.dump(cl, open("./clusters_raw.json","w"), indent=2) logging.info("Done")
def get_sources(bigram_stats):
    cur = stats.get_main_cursor(DB_DIR)

    source_ids = set()
    for item in bigram_stats:
        source_ids.add(item["source1"])
        source_ids.add(item["source2"])

    sources = stats.get_sources(cur, source_ids)

    return sources
def __init__(self, db_dir, headers, days_back=7, seconds_till_user_retry=3600):
    db_basename = "tweets"
    self.db_dir = db_dir
    self.db_basename = db_basename
    self.dates_db = {}
    self.days_back = days_back
    self.recent_users = {}
    self.seconds_till_user_retry = seconds_till_user_retry
    self.log = logging.getLogger('fetcher-' + db_basename)

    cur = stats.get_main_cursor(self.db_dir)
    self.main_db = cur
    stats.create_given_tables(cur, ["users"])

    self.client = TwitterClient(headers)
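# A hedged usage sketch for the constructor above. The enclosing class name
# (Fetcher) and the shape of `headers` are assumptions: the logger name
# 'fetcher-...' suggests the former, and TwitterClient presumably expects HTTP
# headers carrying the API credentials:
headers = {"Authorization": "Bearer <token>"}
fetcher = Fetcher(DB_DIR, headers, days_back=7, seconds_till_user_retry=3600)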
def parse_facts_file(tweet_index, facts, date):
    ind = Indexer(DB_DIR)

    cur = ind.get_db_for_date(date)
    cur_main = stats.get_main_cursor(DB_DIR)
    cur_bigram = stats.get_cursor(DB_DIR + "/tweets_bigram.db")
    mcur = stats.get_mysql_cursor(settings)
    word_time_cnt_table = "word_time_cnt_%s" % date
    word_hour_cnt_table = "word_hour_cnt_%s" % date
    word_mates_table = "word_mates_%s" % date
    bigram_table = "bigram_%s" % date
    stats.create_mysql_tables(
        mcur, {
            word_time_cnt_table: "word_time_cnt",
            word_hour_cnt_table: "word_hour_cnt",
            word_mates_table: "word_mates",
            bigram_table: "bigram_day"
        })

    stats.create_given_tables(
        cur, ["nouns", "tweets_nouns", "tweets_words", "lemma_word_pairs"])
    stats.create_given_tables(cur_bigram, ["lemma_word_pairs"])
    stats.create_given_tables(cur, {"sources": "nouns"})
    stats.create_given_tables(cur_main, ["nouns"])
    stats.create_given_tables(cur_main, {"sources": "nouns"})

    logging.info("Parse index: %s; facts: %s" % (tweet_index, facts))

    ids = []
    for l in open(tweet_index, 'r').read().split("\n"):
        if l is None or l == '':
            break
        tw_id, created_at = l.split("\t")
        ids.append((tw_id, created_at))

    logging.info("Got %s tweet ids" % (len(ids)))

    tree = ElementTree.iterparse(facts, events=('start', 'end'))

    # set larger cache, default 2000 * 1024, this 102400 * 1024
    #cur_bigram.execute("pragma cache_size = -102400")

    nouns_total = set()
    sources_total = set()
    noun_sources = []
    tweets_nouns = []
    lemma_word_pairs = []
    word_mates = []
    word_cnt = []

    match_type_cnt = MatchTypeCnt()

    for event, elem in tree:
        if event == 'end' and elem.tag == 'document':
            cur_doc = int(elem.attrib['di'])
            post_id, create_time = ids[cur_doc - 1]
            nouns_preps = get_nouns_preps(elem)
            match_type_cnt.add_cnt(nouns_preps)
            lemmas = []
            nouns = []
            for np in nouns_preps:
                try:
                    lemmas.append(util.digest(np.with_prep()))
                    nouns.append(util.digest(np.noun_lemma))
                    nouns_total.add(np.noun_lemma)
                    sources_total.add(np.with_prep())

                    noun_sources.append((post_id, util.digest(np.noun_lemma),
                                         util.digest(np.with_prep())))
                    word_cnt.append((util.digest(np.noun_lemma),
                                     cut_to_tenminute(create_time)))
                except Exception as e:
                    traceback.print_exc()
                    logging.error(e)

            lemma_word_pairs += make_lemma_word_pairs(
                nouns, lemmas, cut_to_tenminute(create_time))
            word_mates += make_word_pairs_with_time(nouns, create_time,
                                                    bag_size=BAG_SIZE)

            if len(noun_sources) > 10000:
                logging.info("seen %s docid" % (cur_doc))
                save_tweet_nouns(cur, noun_sources)
                save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
                save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
                noun_sources = []
                word_cnt = []

            if len(lemma_word_pairs) >= CHUNK_SIZE:
                save_bigram_day(mcur, lemma_word_pairs, bigram_table)
                lemma_word_pairs = []

            if len(word_mates) >= CHUNK_SIZE:
                logging.info("save %s word_mates" % len(word_mates))
                save_word_mates2(mcur, word_mates, word_mates_table)
                word_mates = []

            elem.clear()

    save_tweet_nouns(cur, noun_sources)
    save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
    save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
    save_bigram_day(mcur, lemma_word_pairs, bigram_table)
    save_word_mates2(mcur, word_mates, word_mates_table)

    save_nouns(cur, nouns_total)
    save_nouns(cur, sources_total, table="sources")
    save_nouns(cur_main, nouns_total)
    save_nouns(cur_main, sources_total, table="sources")

    logging.info(str(match_type_cnt))

    return
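# cut_to_tenminute() is referenced above but not shown in this listing; a
# minimal sketch, assuming create_time is a unix timestamp and the helper
# truncates it to a ten-minute bucket:
def cut_to_tenminute(create_time):
    ts = int(create_time)
    return ts - (ts % 600)  # 600 seconds = 10 minutes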