def get_clusters(args, sim_dict, nouns, noun_trend, post_cnt):
    trash_words_md5 = [util.digest(w) for w in settings["trash_words"]]
    total_md5 = util.digest("__total__")

    best_ratio = 10
    cl = []
    for k in [800, 900, 1000, 1100]:
        for i in range(0, int(args.i)):
            logging.info("get %s clusters, iteration %s" % (k, i))
            resp = KMeanCluster.get_clusters(sim_dict, int(k), nouns,
                                             trash_words=trash_words_md5,
                                             pre_clusters=[total_md5])
            # lower intra-cluster distance relative to inter-cluster distance
            # means tighter, better-separated clusters
            ratio = resp["intra_dist"] / resp["extra_dist"]
            if ratio < best_ratio:
                best_ratio = ratio
                cl = resp["clusters"]

    logging.info("Best ratio: %s" % best_ratio)
    logging.info("Best clusters size: %s" % len(cl))

    for c in cl:
        for m in c["members"]:
            try:
                m["post_cnt"] = post_cnt[m["id"]]
            except Exception as e:
                logging.info("No post_cnt for noun_md5 %s (%s)" % (m["id"], type(m["id"])))
                logging.error(e)
            trend = noun_trend[m["id"]] if m["id"] in noun_trend else 0
            m["trend"] = "%.3f" % trend

    return util.filter_trash_words_cluster(cl)
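# The model-selection step above keeps the clustering with the lowest
# intra/extra distance ratio. A minimal standalone sketch of that selection
# (hypothetical helper, assuming each candidate is shaped like the
# KMeanCluster.get_clusters() response):
def pick_best_clustering(candidates):
    best_ratio = float("inf")
    best = []
    for resp in candidates:
        ratio = resp["intra_dist"] / resp["extra_dist"]
        if ratio < best_ratio:
            best_ratio = ratio
            best = resp["clusters"]
    return best_ratio, best

# pick_best_clustering([{"intra_dist": 2.0, "extra_dist": 8.0, "clusters": ["a"]},
#                       {"intra_dist": 3.0, "extra_dist": 4.0, "clusters": ["b"]}])
# -> (0.25, ["a"])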
def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-i")
    args = parser.parse_args()

    ind = Indexer(DB_DIR)
    cur = stats.get_main_cursor(DB_DIR)
    cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db")
    words_db = DB_DIR + "/tweets_lemma.db"
    bigram_db = DB_DIR + "/tweets_bigram.db"

    used_nouns = get_used_nouns(cur)

    total_md5 = util.digest("__total__")

    nouns = stats.get_nouns(cur, used_nouns)
    noun_trend = stats.get_noun_trend(cur)
    nouns[total_md5] = "__total__"
    noun_trend[total_md5] = 0.0  # keyed by md5, like every other noun_trend entry
    logging.info("nouns len %s" % len(nouns))

    post_cnt = stats.get_noun_cnt(cur_word_cnt)
    post_cnt[total_md5] = 0

    logging.info("get sim_dict")
    sim_dict = get_sims(cur)

    cl = get_clusters(args, sim_dict, nouns, noun_trend, post_cnt)

    with open("./clusters_raw.json", "w") as f:
        json.dump(cl, f, indent=2)

    logging.info("Done")
def get(self):
    try:
        word = self.get_argument("word", default=None)
        time1 = self.get_argument("time1", default=None)
        time2 = self.get_argument("time2", default=None)
        logging.info("Request: %s, %s, %s" % (word, time1, time2))
        if word is None:
            return

        time1, time2 = self.parse_times(time1, time2)

        word_md5 = util.digest(word.strip())
        logging.info("Get time series for '%s' (%s)" % (word, word_md5))

        res = self.get_word_time_cnt(word_md5, time1, time2)
        res = sorted(res, key=lambda x: x[1])
        # materialize as a list so json.dumps works on Python 3 as well
        res = [{"hour": x[1], "count": x[2], "utc_unixtime": x[3]} for x in res]

        #mov_av = [0]
        #for i in range(1, len(res) - 1):
        #    ma = float(res[i-1]["count"] + res[i]["count"] + res[i+1]["count"]) / 3
        #    mov_av.append(ma)
        #mov_av.append(0)

        self.write(json.dumps({"word": word_md5, "dataSeries": res}))
    except Exception as e:
        logging.error(e)
        raise
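# The commented-out block above sketches a centered 3-point moving average
# over the "count" series. A minimal working version of that idea (an
# assumption based on the dead code, not an API the handler currently
# exposes; assumes at least two points):
def moving_average_3(counts):
    # zero-pad the edges so the output aligns with the input series
    mov_av = [0.0]
    for i in range(1, len(counts) - 1):
        mov_av.append((counts[i - 1] + counts[i] + counts[i + 1]) / 3.0)
    mov_av.append(0.0)
    return mov_av

# moving_average_3([0, 3, 6, 3, 0]) -> [0.0, 3.0, 4.0, 3.0, 0.0]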
def _save_nouns(cur, nouns, table="nouns"):
    cur.execute("begin transaction")
    for n in nouns:
        cur.execute(
            "insert or ignore into %s (noun_md5, noun) values (?, ?)" % table,
            (util.digest(n), n))
    cur.execute("commit")
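# Self-contained sketch of the pattern above (in-memory SQLite, hypothetical
# schema standing in for the real "nouns" table): "insert or ignore" makes
# re-saving the same noun a no-op, and one explicit transaction around the
# loop keeps the bulk insert fast.
import sqlite3

def demo_save_nouns():
    conn = sqlite3.connect(":memory:")
    conn.isolation_level = None  # autocommit; we manage transactions ourselves
    cur = conn.cursor()
    cur.execute("create table nouns (noun_md5 integer primary key, noun text)")
    cur.execute("begin transaction")
    for md5, noun in [(1, "a"), (2, "b"), (1, "a")]:  # the duplicate is ignored
        cur.execute("insert or ignore into nouns (noun_md5, noun) values (?, ?)",
                    (md5, noun))
    cur.execute("commit")
    return cur.execute("select count(*) from nouns").fetchone()[0]  # -> 2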
def get_words_from_query(query):
    tokens = re.split(r'\s+', query)
    words = []
    for t in tokens:
        w = Word(t)
        w.word_md5 = util.digest(t)
        words.append(w)
    return words
def filter_silly_spam(tw):
    # index tweets by the digest of their text: identical texts collapse
    # into a single tweet id
    tw_text = {}
    for tw_id in tw:
        tw_text[util.digest(tw[tw_id].text)] = tw_id

    tw2 = {}
    for tw_md5 in tw_text:
        tw_id = tw_text[tw_md5]
        tw2[tw_id] = tw[tw_id]

    return tw2
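# Behaviour sketch (hypothetical Tweet stand-in, since the real tweet class
# lives elsewhere in the repo): two tweets with identical text collapse into
# one entry, because both map to the same text digest.
class _FakeTweet(object):
    def __init__(self, text):
        self.text = text

# tw = {1: _FakeTweet("buy now"), 2: _FakeTweet("buy now"), 3: _FakeTweet("hi")}
# filter_silly_spam(tw) keeps one of {1, 2} plus 3; which duplicate survives
# depends on dict iteration order.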
def _add_total_to_profiles(profiles_dict, trash_words):
    trash_words_md5 = set(map(util.digest, trash_words))
    total_md5 = util.digest('__total__')
    total = NounProfile(total_md5, post_cnt=0)
    # the __total__ pseudo-profile aggregates reply counters over the
    # trash-word profiles only
    for p in profiles_dict:
        if p not in trash_words_md5:
            continue
        profile = profiles_dict[p]
        for reply in profile.replys:
            if reply not in total.replys:
                total.replys[reply] = 0
            total.replys[reply] += profile.replys[reply]
        total.post_cnt += profile.post_cnt

    profiles_dict[total_md5] = total
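# The core merge step of the aggregation above, isolated as a sketch
# (dict-based stand-in for NounProfile.replys; the helper name is
# illustrative, not part of the module):
def merge_replys(total_replys, profile_replys):
    for reply, cnt in profile_replys.items():
        total_replys[reply] = total_replys.get(reply, 0) + cnt
    return total_replys

# merge_replys({"a": 1}, {"a": 2, "b": 5}) -> {"a": 3, "b": 5}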
def get_word_time_cnt(self, word_md5, time1, time2):
    logging.info("Get word time cnt: %s, %s, %s" % (word_md5, time1, time2))
    utc_now = datetime.utcnow()
    res = []

    default_left_time_bound = (utc_now - timedelta(3)).strftime("%Y%m%d%H%M%S")[:10]
    time = ""
    if time1 is not None:
        time += " and hour >= " + str(time1)[:10]
    else:
        time += " and hour >= " + default_left_time_bound
    if time2 is not None:
        time += " and hour < " + str(time2)[:10]

    where = "word_md5 = %s" % word_md5
    if word_md5 == util.digest('0'):
        where = "1"

    mcur = stats.get_mysql_cursor(settings)
    try:
        for day in [3, 2, 1, 0]:
            date = (utc_now - timedelta(day)).strftime("%Y%m%d")
            #stats.create_mysql_tables(mcur, {"word_hour_cnt_"+date: "word_hour_cnt"})
            mcur.execute("""
                SELECT word_md5, hour, cnt
                FROM word_hour_cnt_%(date)s
                WHERE %(where)s %(time)s
            """ % {"where": where, "time": time, "date": date})
            while True:
                r = mcur.fetchone()
                if r is None:
                    break
                word, hour, cnt = r
                utctime = str(hour) + "0000"
                utc_unixtime = datetime.strptime(utctime, '%Y%m%d%H%M%S').strftime('%s')
                res.append((str(word), utc_to_local(utctime), int(cnt), utc_unixtime))
        logging.info("word time cnt: %s" % len(res))
    except Exception as e:
        logging.error(e)

    return res
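# Sketch of the day-sharding pattern above (assumes the word_hour_cnt_YYYYMMDD
# naming convention; the helper is illustrative): the query fans out over one
# MySQL table per day in the window and concatenates the rows.
from datetime import datetime, timedelta

def tables_for_last_days(days=4, prefix="word_hour_cnt_"):
    utc_now = datetime.utcnow()
    # oldest day first, matching the [3, 2, 1, 0] loop above
    return [prefix + (utc_now - timedelta(d)).strftime("%Y%m%d")
            for d in range(days - 1, -1, -1)]

# tables_for_last_days() -> e.g. ["word_hour_cnt_20240101", ..., "word_hour_cnt_20240104"]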
def dedup_tweets(tweets, all_words=True):
    dedup_tw = {}
    for tw_id in tweets:
        wordset = tweets[tw_id].all_words if all_words else tweets[tw_id].words
        words_str = [str(x) for x in sorted(wordset)]
        text_md5 = util.digest(",".join(words_str))
        # keep the most recent tweet for each unique word set
        if text_md5 not in dedup_tw:
            dedup_tw[text_md5] = tw_id
        elif tweets[dedup_tw[text_md5]].created_at < tweets[tw_id].created_at:
            dedup_tw[text_md5] = tw_id

    grouped_tw = {}
    for text_md5 in dedup_tw:
        grouped_tw[dedup_tw[text_md5]] = tweets[dedup_tw[text_md5]]

    return grouped_tw
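# Key-construction sketch: two tweets count as duplicates when the sorted set
# of their word digests is identical, so word order does not matter. The real
# code digests the joined string; small ints stand in for digests here for
# readability (hypothetical helper):
def dedup_key(word_md5s):
    return ",".join(str(x) for x in sorted(word_md5s))

# dedup_key([42, 7, 99]) == dedup_key([99, 42, 7])  # -> True ("7,42,99")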
import json
import logging
import logging.config
import os

from molva.Indexer import Indexer
import molva.util as util

logging.config.fileConfig("logging.conf")

POST_MIN_FREQ = 10

settings = {}
try:
    settings = json.load(open('global-settings.json', 'r'))
except Exception as e:
    logging.warning(e)

DB_DIR = settings["db_dir"] if "db_dir" in settings else os.environ["MOLVA_DIR"]

total_md5 = util.digest("__total__")
trash_words = [util.digest(x) for x in settings["trash_words"]]

def get_sims(cur):
    res = cur.execute("select post1_md5, post2_md5, sim from noun_similarity")
    sim_dict = {}
    while True:
        r = cur.fetchone()
        if r is None:
            break
        p1, p2, sim = r
        if p1 not in sim_dict:
            sim_dict[p1] = {}
        if p2 not in sim_dict:
            sim_dict[p2] = {}
        # store the similarity under both keys so either noun can be looked up
        sim_dict[p1][p2] = sim
        sim_dict[p2][p1] = sim
    return sim_dict
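# Shape sketch for the structure get_sims() builds, assuming each pair is
# stored symmetrically as the clustering step requires (illustrative helper
# over plain tuples, not the module's API):
def build_sim_dict(rows):
    # rows: iterable of (post1_md5, post2_md5, sim) tuples, as returned by
    # the noun_similarity query above
    sim_dict = {}
    for p1, p2, sim in rows:
        sim_dict.setdefault(p1, {})[p2] = sim
        sim_dict.setdefault(p2, {})[p1] = sim
    return sim_dict

# build_sim_dict([(1, 2, 0.5)]) -> {1: {2: 0.5}, 2: {1: 0.5}}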
logging.config.fileConfig("logging.conf")

settings = {}
try:
    settings = json.load(open('global-settings.json', 'r'))
except Exception as e:
    logging.warning(e)

POST_MIN_FREQ = settings["post_min_freq"] if "post_min_freq" in settings else 10
DB_DIR = settings["db_dir"] if "db_dir" in settings else os.environ["MOLVA_DIR"]

# single Cyrillic letters are blocked as nouns
BLOCKED_NOUNS_LIST = u"\n".join(list(u"абвгдеёжзиклмнопрстуфхцчшщыьъэюя"))
BLOCKED_NOUNS = ",".join(
    map(lambda x: str(util.digest(x)), BLOCKED_NOUNS_LIST.split("\n")))

NOUNS_LIMIT = 2000

def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-c", "--clear", action="store_true")
    parser.add_argument("-p", "--profiles-table", default="post_reply_cnt")
    parser.add_argument("-o", "--out-file")
    args = parser.parse_args()

    cur = stats.get_cursor(DB_DIR + "/word_cnt.db")
    profiles_dict = stats.setup_noun_profiles(
def parse_facts_file(tweet_index, facts, date):
    ind = Indexer(DB_DIR)

    cur = ind.get_db_for_date(date)
    cur_main = stats.get_main_cursor(DB_DIR)
    cur_bigram = stats.get_cursor(DB_DIR + "/tweets_bigram.db")
    mcur = stats.get_mysql_cursor(settings)

    word_time_cnt_table = "word_time_cnt_%s" % date
    word_hour_cnt_table = "word_hour_cnt_%s" % date
    word_mates_table = "word_mates_%s" % date
    bigram_table = "bigram_%s" % date
    stats.create_mysql_tables(mcur, {
        word_time_cnt_table: "word_time_cnt",
        word_hour_cnt_table: "word_hour_cnt",
        word_mates_table: "word_mates",
        bigram_table: "bigram_day"
    })

    stats.create_given_tables(
        cur, ["nouns", "tweets_nouns", "tweets_words", "lemma_word_pairs"])
    stats.create_given_tables(cur_bigram, ["lemma_word_pairs"])
    stats.create_given_tables(cur, {"sources": "nouns"})
    stats.create_given_tables(cur_main, ["nouns"])
    stats.create_given_tables(cur_main, {"sources": "nouns"})

    logging.info("Parse index: %s; facts: %s" % (tweet_index, facts))

    ids = []
    for l in open(tweet_index, 'r').read().split("\n"):
        if not l:
            break
        tw_id, created_at = l.split("\t")
        ids.append((tw_id, created_at))

    logging.info("Got %s tweet ids" % len(ids))

    tree = ElementTree.iterparse(facts, events=('start', 'end'))

    # set larger cache, default 2000 * 1024, this 102400*1024
    #cur_bigram.execute("pragma cache_size = -102400")

    nouns_total = set()
    sources_total = set()
    noun_sources = []
    tweets_nouns = []
    lemma_word_pairs = []
    word_mates = []
    word_cnt = []

    match_type_cnt = MatchTypeCnt()

    for event, elem in tree:
        if event == 'end' and elem.tag == 'document':
            cur_doc = int(elem.attrib['di'])
            post_id, create_time = ids[cur_doc - 1]
            nouns_preps = get_nouns_preps(elem)
            match_type_cnt.add_cnt(nouns_preps)
            lemmas = []
            nouns = []
            for np in nouns_preps:
                try:
                    lemmas.append(util.digest(np.with_prep()))
                    nouns.append(util.digest(np.noun_lemma))
                    nouns_total.add(np.noun_lemma)
                    sources_total.add(np.with_prep())
                    noun_sources.append((post_id, util.digest(np.noun_lemma),
                                         util.digest(np.with_prep())))
                    word_cnt.append((util.digest(np.noun_lemma),
                                     cut_to_tenminute(create_time)))
                except Exception as e:
                    traceback.print_exc()
                    logging.error(e)

            lemma_word_pairs += make_lemma_word_pairs(
                nouns, lemmas, cut_to_tenminute(create_time))
            word_mates += make_word_pairs_with_time(nouns, create_time,
                                                    bag_size=BAG_SIZE)

            if len(noun_sources) > 10000:
                logging.info("seen %s docid" % cur_doc)
                save_tweet_nouns(cur, noun_sources)
                save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
                save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
                noun_sources = []
                word_cnt = []

            if len(lemma_word_pairs) >= CHUNK_SIZE:
                save_bigram_day(mcur, lemma_word_pairs, bigram_table)
                lemma_word_pairs = []

            if len(word_mates) >= CHUNK_SIZE:
                logging.info("save %s word_mates" % len(word_mates))
                save_word_mates2(mcur, word_mates, word_mates_table)
                word_mates = []

            elem.clear()  # free the parsed element to keep memory bounded

    save_tweet_nouns(cur, noun_sources)
    save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
    save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
    save_bigram_day(mcur, lemma_word_pairs, bigram_table)
    save_word_mates2(mcur, word_mates, word_mates_table)

    save_nouns(cur, nouns_total)
    save_nouns(cur, sources_total, table="sources")
    save_nouns(cur_main, nouns_total)
    save_nouns(cur_main, sources_total, table="sources")

    logging.info(str(match_type_cnt))

    return
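# Minimal sketch of the streaming-parse pattern used above: iterparse walks
# the facts file incrementally, and elem.clear() after each processed
# <document> keeps memory flat on large inputs. (File name and attribute use
# are placeholders.)
from xml.etree import ElementTree

def stream_documents(path):
    for event, elem in ElementTree.iterparse(path, events=('end',)):
        if elem.tag == 'document':
            yield dict(elem.attrib)
            elem.clear()  # drop children already consumed

# for doc in stream_documents("facts.xml"):
#     print(doc.get("di"))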