def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir")
    parser.add_argument("--num")
    parser.add_argument("--clusters")
    parser.add_argument("--clusters-out")
    args = parser.parse_args()

    f_out = codecs.open(args.clusters_out, 'w', encoding="utf8")

    today = date.today().strftime('%Y%m%d')
    ystd = (date.today() - timedelta(1)).strftime('%Y%m%d')

    cl = json.load(codecs.open(args.clusters, 'r', encoding="utf8"))

    today_time = (datetime.utcnow()).strftime("%Y%m%d%H%M%S")
    update_time = (datetime.now()).strftime("%Y-%m-%d %H:%M:%S")

    cur1 = stats.get_cursor("%s/tweets_%s.db" % (args.dir, today))
    cur2 = stats.get_cursor("%s/tweets_%s.db" % (args.dir, ystd))

    rel_tweets = []
    for x in cl:
        put_trend(x)

    # Keep only clusters with a positive trend, then take the 15 strongest.
    filtered_cl = [x for x in cl if x["trend"] > 0.0]
    logging.info("Filtered out %d of %d (trend > 0.0)" %
                 (len(cl) - len(filtered_cl), len(cl)))

    top_cl = sorted(filtered_cl, key=lambda x: x["trend"], reverse=True)[:15]

    tw_with_embed_cnt = 0
    for cluster in top_cl:
        r = get_relevant_tweets(cur1, cur2, cluster)
        rel_tweets.append(r)
        cluster["topic_density"] = r["density"]

    logging.info("Have %d topics with tweets embeds out of %d" %
                 (tw_with_embed_cnt, len(top_cl)))

    # Persist the relevant tweets and the final cluster snapshot.
    cur_rel = stats.get_cursor("%s/tweets_relevant.db" % args.dir)
    stats.create_given_tables(cur_rel, ["relevant"])
    save_relevant(cur_rel, today_time, rel_tweets)

    final_cl = {
        "clusters": top_cl,
        "update_time": update_time,
        "cluster_id": today_time
    }

    json.dump(final_cl, f_out)
    f_out.close()

    return
def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    cur = stats.get_main_cursor(DB_DIR)
    cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db")

    used_nouns = get_used_nouns(cur)

    nouns = stats.get_nouns(cur, used_nouns)
    noun_trend = stats.get_noun_trend(cur)

    # Add a synthetic "__total__" noun so the clustering sees the overall volume.
    total_md5 = util.digest("__total__")
    nouns[total_md5] = "__total__"
    noun_trend[total_md5] = 0.0

    logging.info("nouns len %s" % len(nouns))

    logging.info("get sim_dict")
    sim_dict = get_sims(cur)

    cl = get_clusters(sim_dict, nouns, noun_trend)

    json.dump(cl, open("./clusters_raw.json", "w"), indent=2)

    logging.info("Done")
def main2():
    parser = argparse.ArgumentParser()
    parser.add_argument("--db")
    parser.add_argument("--query")
    args = parser.parse_args()

    words = get_words_from_query(args.query.decode('utf8'))

    print_list(words)

    if args.db is None:
        print "Need --db"
        return

    cur = stats.get_cursor(args.db)
    fill_lemmas(cur, words)

    print_list(words)

    tweets = lookup_two_days(cur, cur, words)

    # len(tweets[x].words),
    #for t in sorted(tweets.keys(), key=lambda x: (tweets[x].created_at), reverse=True)[:10]:
    #    print tweets[t].__str__()

    return
def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-i")
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    cur = stats.get_main_cursor(DB_DIR)
    cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db")
    words_db = DB_DIR + "/tweets_lemma.db"
    bigram_db = DB_DIR + "/tweets_bigram.db"

    used_nouns = get_used_nouns(cur)

    total_md5 = util.digest("__total__")

    nouns = stats.get_nouns(cur, used_nouns)
    noun_trend = stats.get_noun_trend(cur)

    # Add a synthetic "__total__" noun, keyed by its md5 like every other noun.
    nouns[total_md5] = "__total__"
    noun_trend[total_md5] = 0.0

    logging.info("nouns len %s" % len(nouns))

    post_cnt = stats.get_noun_cnt(cur_word_cnt)
    post_cnt[total_md5] = 0

    logging.info("get sim_dict")
    sim_dict = get_sims(cur)

    cl = get_clusters(args, sim_dict, nouns, noun_trend, post_cnt)

    json.dump(cl, open("./clusters_raw.json", "w"), indent=2)

    logging.info("Done")
def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-c", "--clear", action="store_true")
    parser.add_argument("-p", "--profiles-table", default="post_reply_cnt")
    parser.add_argument("-o", "--out-file")
    args = parser.parse_args()

    cur = stats.get_cursor(DB_DIR + "/word_cnt.db")

    profiles_dict = stats.setup_noun_profiles(
        cur, {}, {},
        post_min_freq=POST_MIN_FREQ,
        blocked_nouns=BLOCKED_NOUNS,
        nouns_limit=NOUNS_LIMIT,
        db_dir=DB_DIR,
        profiles_table=args.profiles_table,
        trash_words=settings["trash_words"],
        swear_words=settings["swear_words"]
    )

    logging.info("profiles len %s" % len(profiles_dict))

    profiles_dump = {}
    for p in profiles_dict:
        profiles_dump[p] = profiles_dict[p].replys

    json.dump(profiles_dump, open(args.out_file, 'w'))
def get_clusters(self, skip, before, date):
    cur = stats.get_cursor(settings["db_dir"] + "/tweets_display.db")

    if date is not None:
        cur.execute("""
            select cluster from clusters
            where cluster_date <= '%(date)s'
            order by cluster_date desc
            limit 1
        """ % ({'date': date}))
    elif before is not None:
        cur.execute("""
            select cluster from clusters
            where cluster_date < '%(before)s'
            order by cluster_date desc
            limit 1
        """ % ({'before': before}))
    else:
        cur.execute("""
            select cluster from clusters
            order by cluster_date desc
            limit 1 offset %s
        """ % (skip))

    res = cur.fetchone()[0]
    return res
def get_trending_words(db_dir, word_cnt_tuples):
    cur = stats.get_cursor(db_dir + "/tweets_display.db")
    stats.create_given_tables(cur, ["noun_trend"])
    cur.execute("""
        select noun_md5, trend
        from noun_trend
        order by trend desc
        limit 2000
    """)
    word_trends = map(lambda x: (int(x[0]), float(x[1])), cur.fetchall())

    word_ranks = make_tf_idf_ranks(word_cnt_tuples)

    for w in word_trends:
        word, trend = w
        if word not in word_ranks:
            logging.warn("No such word_md5 at word_ranks %s" % word)
            continue
        word_ranks[word].trend.value = trend

    Rank.weight_ranks(map(lambda x: x.trend, word_ranks.values()))
    Rank.weight_ranks(map(lambda x: x.cnt, word_ranks.values()))

    words = []
    for word_rank in sorted(word_ranks.values(),
                            key=lambda x: x.cnt.rank + x.trend.rank)[:2000]:
        words.append(str(word_rank.word))

    return words
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--clusters")
    parser.add_argument("--out-bigram-stats")
    args = parser.parse_args()

    cur_display = stats.get_cursor(DB_DIR + "/tweets_display.db")

    cl = json.load(open(args.clusters, 'r'))

    word_stats = get_word_stats(cl)
    bigram_stats = get_bigram_stats(cl, word_stats)
    nouns = get_nouns(cl)
    sources = get_sources(bigram_stats)
    bigram_stats2 = apply_word_text(bigram_stats, nouns, sources)

    logging.info("Got stats for: %d words; %d bigrams" %
                 (len(word_stats), len(bigram_stats)))

    json.dump(bigram_stats2,
              codecs.open(args.out_bigram_stats, 'w', encoding="utf8"),
              indent=2, ensure_ascii=False)
def get_db_for_filename(self, filename):
    if filename in self.db_curs and self._check_cursor_alive(self.db_curs[filename]):
        return self.db_curs[filename]
    else:
        self.log.info("Setup db connection " + filename)
        cur = stats.get_cursor(filename)
        self.db_curs[filename] = cur
        return cur
def get_db_for_date(self, date):
    date = date[:8]  # assume date format %Y%m%d_%H%M%S

    if date in self.dates_db:
        return self.dates_db[date]
    else:
        self.log.info("Setup db connection for date " + date)
        cur = stats.get_cursor(self.db_dir + "/tweets_" + date + ".db")
        self.dates_db[date] = cur
        stats.create_given_tables(cur, ["tweets"])
        return cur
def get_relevant(self, date):
    cur = stats.get_cursor(settings["db_dir"] + "/tweets_relevant.db")

    if date is not None:
        cur.execute("""
            select relevant from relevant
            where cluster_date = '%(date)s'
        """ % ({'date': date}))

    res = cur.fetchone()[0]
    return res
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--filter-spam", action="store_true")
    parser.add_argument("--show-name", action="store_true")
    args = parser.parse_args()

    cur = stats.get_cursor(settings["db_dir"] + "/quality_marks.db")
    m = get_marks(cur, args.filter_spam, args.show_name)

    print m

    return
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--clusters")
    args = parser.parse_args()

    cur_display = stats.get_cursor(DB_DIR + "/tweets_display.db")

    final_cl_raw = codecs.open(args.clusters, 'r', encoding="utf8").read()
    final_cl = json.loads(final_cl_raw)

    cur_display.execute("""
        replace into clusters (cluster_date, cluster)
        values (?, ?)
    """, (final_cl["cluster_id"], final_cl_raw))
def post(self):
    req_data = None
    try:
        req_data = json.loads(self.request.body)

        if req_data is not None:
            cur = stats.get_cursor(settings["db_dir"] + "/quality_marks.db")
            stats.create_given_tables(cur, ["quality_marks"])

            username = ""
            if "username" in req_data and req_data["username"] is not None:
                username = req_data["username"]

            update_time = ""
            if "update_time" in req_data and req_data["update_time"] is not None:
                update_time = req_data["update_time"]
                update_time = int(re.sub('[-\s:]', '', update_time))

            exp_name = ""
            if "experiment_name" in req_data and req_data["experiment_name"] is not None:
                exp_name = req_data["experiment_name"]

            exp_descr = ""
            if "experiment_descr" in req_data and req_data["experiment_descr"] is not None:
                exp_descr = req_data["experiment_descr"]

            cur.execute("""
                insert into quality_marks (update_time, username, exp_name, exp_descr, marks)
                values (?, ?, ?, ?, ?)
            """, (update_time, username, exp_name, exp_descr,
                  json.dumps(req_data["marks"])))

    except Exception as e:
        logging.error(e)
        raise e

    self.write("")
def build_post_cnt(db_dir):
    utc_now = datetime.utcnow()

    word_cnt = stats.get_word_cnt(db_dir)
    word_cnt_tuples = map(lambda x: (int(x), int(word_cnt[x])), word_cnt.keys())

    f_tmp = db_dir + "/word_cnt.db.tmp"
    f = db_dir + "/word_cnt.db"
    util.delete_if_exists(f_tmp)

    cur = stats.get_cursor(f_tmp)
    stats.create_given_tables(cur, ["chains_nouns", "post_cnt", "post_reply_cnt"])

    save_word_cnt(cur, word_cnt_tuples)
    words = get_trending_words(db_dir, word_cnt_tuples)

    mcur = stats.get_mysql_cursor(settings)
    count_currents2(cur, mcur, utc_now, words)

    os.rename(f_tmp, f)
def parse_facts_file(tweet_index, facts, date):
    ind = Indexer(DB_DIR)

    cur = ind.get_db_for_date(date)
    cur_main = stats.get_main_cursor(DB_DIR)
    cur_bigram = stats.get_cursor(DB_DIR + "/tweets_bigram.db")

    mcur = stats.get_mysql_cursor(settings)
    word_time_cnt_table = "word_time_cnt_%s" % date
    word_hour_cnt_table = "word_hour_cnt_%s" % date
    word_mates_table = "word_mates_%s" % date
    bigram_table = "bigram_%s" % date
    stats.create_mysql_tables(mcur, {
        word_time_cnt_table: "word_time_cnt",
        word_hour_cnt_table: "word_hour_cnt",
        word_mates_table: "word_mates",
        bigram_table: "bigram_day"
    })

    stats.create_given_tables(
        cur, ["nouns", "tweets_nouns", "tweets_words", "lemma_word_pairs"])
    stats.create_given_tables(cur_bigram, ["lemma_word_pairs"])
    stats.create_given_tables(cur, {"sources": "nouns"})
    stats.create_given_tables(cur_main, ["nouns"])
    stats.create_given_tables(cur_main, {"sources": "nouns"})

    logging.info("Parse index: %s; facts: %s" % (tweet_index, facts))

    # The tweet index holds one "<tweet_id>\t<created_at>" line per document,
    # in the same order as the documents in the facts file.
    ids = []
    for l in open(tweet_index, 'r').read().split("\n"):
        if l is None or l == '':
            break
        tw_id, created_at = l.split("\t")
        ids.append((tw_id, created_at))

    logging.info("Got tweet %s ids" % (len(ids)))

    tree = ElementTree.iterparse(facts, events=('start', 'end'))

    # set larger cache, default 2000 * 1024, this 102400 * 1024
    #cur_bigram.execute("pragma cache_size = -102400")

    nouns_total = set()
    sources_total = set()
    noun_sources = []
    tweets_nouns = []
    lemma_word_pairs = []
    word_mates = []
    word_cnt = []

    match_type_cnt = MatchTypeCnt()

    # Stream the facts XML document by document, collecting nouns, sources
    # and co-occurrence pairs, and flushing to the databases in chunks.
    for event, elem in tree:
        if event == 'end' and elem.tag == 'document':
            cur_doc = int(elem.attrib['di'])
            post_id, create_time = ids[cur_doc - 1]
            nouns_preps = get_nouns_preps(elem)
            match_type_cnt.add_cnt(nouns_preps)
            lemmas = []
            nouns = []
            for np in nouns_preps:
                try:
                    lemmas.append(util.digest(np.with_prep()))
                    nouns.append(util.digest(np.noun_lemma))
                    nouns_total.add(np.noun_lemma)
                    sources_total.add(np.with_prep())
                    noun_sources.append((post_id, util.digest(np.noun_lemma),
                                         util.digest(np.with_prep())))
                    word_cnt.append((util.digest(np.noun_lemma),
                                     cut_to_tenminute(create_time)))
                except Exception as e:
                    traceback.print_exc()
                    logging.error(e)

            lemma_word_pairs += make_lemma_word_pairs(
                nouns, lemmas, cut_to_tenminute(create_time))
            word_mates += make_word_pairs_with_time(nouns, create_time,
                                                    bag_size=BAG_SIZE)

            # Flush accumulated rows in chunks to keep memory bounded.
            if len(noun_sources) > 10000:
                logging.info("seen %s docid" % (cur_doc))
                save_tweet_nouns(cur, noun_sources)
                save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
                save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
                noun_sources = []
                word_cnt = []

            if len(lemma_word_pairs) >= CHUNK_SIZE:
                save_bigram_day(mcur, lemma_word_pairs, bigram_table)
                lemma_word_pairs = []

            if len(word_mates) >= CHUNK_SIZE:
                logging.info("save %s word_mates" % len(word_mates))
                save_word_mates2(mcur, word_mates, word_mates_table)
                word_mates = []

            elem.clear()

    # Final flush of whatever is left, then persist the noun/source dictionaries.
    save_tweet_nouns(cur, noun_sources)
    save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
    save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
    save_bigram_day(mcur, lemma_word_pairs, bigram_table)
    save_word_mates2(mcur, word_mates, word_mates_table)

    save_nouns(cur, nouns_total)
    save_nouns(cur, sources_total, table="sources")
    save_nouns(cur_main, nouns_total)
    save_nouns(cur_main, sources_total, table="sources")

    logging.info(str(match_type_cnt))

    return
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--db-dir", default=DB_DIR)
    args = parser.parse_args()

    cur_display = stats.get_cursor(args.db_dir + "/tweets_display.db")
    cur_main = stats.get_cursor(args.db_dir + "/tweets.db")
    #cur_main = stats.get_cursor(args.db_dir + "/tweets_20150221.db")
    #nouns = stats.get_nouns(cur_main)
    #logging.info(type(nouns.keys()[0]))

    utc_now = datetime.utcnow()
    date_3day = (utc_now - timedelta(3)).strftime("%Y%m%d%H%M%S")
    date_3day_tenminute = date_3day[:11]
    logging.info("Time left bound: %s" % date_3day_tenminute)

    hour_word_cnt = {}
    word_cnt = {}

    # Aggregate hourly counts per word over the last three days
    # from the per-day MySQL word_time_cnt tables.
    for day in [3, 2, 1, 0]:
        date = (utc_now - timedelta(day)).strftime("%Y%m%d")
        word_time_cnt_table = "word_time_cnt_%s" % date
        mcur = stats.get_mysql_cursor(settings)
        stats.create_mysql_tables(mcur, {word_time_cnt_table: "word_time_cnt"})
        mcur.execute("""
            select word_md5, substr(tenminute, 1, 10) as hour, sum(cnt)
            from %s
            where tenminute > %s
            group by word_md5, hour
        """ % (word_time_cnt_table, date_3day_tenminute))

        row_cnt = 0
        while True:
            res = mcur.fetchone()
            if res is None:
                break
            word_md5, hour, cnt = map(int, res)
            if hour not in hour_word_cnt:
                hour_word_cnt[hour] = {}
            hour_word_cnt[hour][word_md5] = cnt
            if word_md5 not in word_cnt:
                word_cnt[word_md5] = 0
            word_cnt[word_md5] += cnt

            row_cnt += 1
            if row_cnt % 100000 == 0:
                logging.info('Seen %s rows' % row_cnt)

    # Build an hourly series per word, normalize it, and fit a line;
    # the slope of the fitted line is used as the trend score.
    word_series = []
    hours = sorted(hour_word_cnt.keys())
    for word in word_cnt.keys():
        series = []
        series_max = 0
        for hour in hours:
            if word in hour_word_cnt[hour]:
                series.append(hour_word_cnt[hour][word])
                if hour_word_cnt[hour][word] > series_max:
                    series_max = hour_word_cnt[hour][word]
            else:
                series.append(0)

        # normalize by max freq in series
        if series_max > 0:
            series = [(float(x) / series_max) * 100 for x in series]

        approx = least_squares(series)
        a, b, app_ser = approx
        word_series.append({
            "word_md5": word,
            "word_cnt": word_cnt[word],
            "line_c": a,
            "slope": b,
            "delta": app_ser[-1] - app_ser[0]
        })

    word_series = sorted(word_series, key=lambda x: x["slope"], reverse=True)[:2000]

    # Rebuild the noun_trend table in place and swap it in atomically.
    for cur in [cur_main, cur_display]:
        stats.create_given_tables(cur, {"noun_trend_new": "noun_trend"})
        cur.execute("begin transaction")
        for s in word_series:
            cur.execute("insert into noun_trend_new values (%s, %s)" %
                        (s["word_md5"], s["slope"]))
        cur.execute("drop table noun_trend")
        cur.execute("alter table noun_trend_new rename to noun_trend")
        cur.execute("commit")

    logging.info("Done")
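# The trend computation above relies on a least_squares(series) helper that is
# not shown in this section. From its call sites it is assumed to fit a straight
# line to the hourly series and return (intercept, slope, fitted_series), which
# feed "line_c", "slope" and "delta". The function below is only a minimal
# sketch of such a fit under that assumed signature, not the project's actual
# implementation.
def least_squares(series):
    # Fit y = a + b * x over x = 0..len(series)-1 with the closed-form
    # ordinary-least-squares solution, then also return the fitted values.
    n = len(series)
    xs = range(n)
    sum_x = float(sum(xs))
    sum_y = float(sum(series))
    sum_xx = float(sum(x * x for x in xs))
    sum_xy = float(sum(x * y for x, y in zip(xs, series)))

    denom = n * sum_xx - sum_x * sum_x
    if denom == 0:
        # Degenerate case (empty or single-point series): flat line through the mean.
        a = sum_y / n if n else 0.0
        b = 0.0
    else:
        b = (n * sum_xy - sum_x * sum_y) / denom
        a = (sum_y - b * sum_x) / n

    app_ser = [a + b * x for x in xs]
    return a, b, app_ser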