def get_topics(
    df, dictionary, corpus, num_topics, name, method="GSDMM-Rust", alpha=0.1, beta=0.1
):
    assert len(df) == len(corpus)
    # NOTE: the LDA and GSDMM branches expect tokenized sentences, which are not
    # passed to this function; only the GSDMM-Rust path is wired up end to end.
    if method == "LDA":
        dictionary, topics, scores = sentences_to_topic_model(sentences, num_topics)
    elif method == "GSDMM-Rust":
        dictionary, topics, scores = sentences_to_gsdmm_rust(
            dictionary, corpus, num_topics, name, alpha=alpha, beta=beta
        )
    elif method == "GSDMM":
        dictionary, topics, scores = sentences_to_gsdmm(sentences, num_topics)

    print("exporting dictionary and nparray")
    # export everything
    dictionary.save(getFile(name, Datafile.DICTIONARY))
    np.save(getFile(name, Datafile.TOPIC_NDARRAY), topics)

    print("preparing scores")
    scores_df = scores
    assert (scores.index == df.index).all()
    scores_df.to_csv(getFile(name, Datafile.SCORES), sep="\t", index=False)

    scores_sums = scores_df.sum()
    scores_sums = scores_sums.sort_index()
    # Write this here because the graph algo step requires this info,
    # and analyze_topics.py as it is right now depends on the graph algo output.
    records = pd.DataFrame(scores_sums, columns=["size"]).to_dict(orient="index")
    with open(getFile(name, Datafile.TOPIC_JSON), "wt") as f:
        f.write(json.dumps(records))

    scores_df["dominant_topic"] = scores_df.idxmax(axis=1)
    scores_df["title"] = df["title"]
    assert scores_df["title"].notnull().all()
    scores_df["url"] = df["url"]
    scores_df["publish_date"] = df["publish_date"]
    media_names = df["media_name"].fillna("No Media Name")
    scores_df["media_name"] = media_names
    scores_df[["dominant_topic", "title", "media_name", "url", "publish_date"]].to_csv(
        getFile(name, Datafile.HEADLINES_TSV), sep="\t", index=False
    )
    assert len(scores_df) == len(corpus)
    return topics
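# Standalone sketch (toy data, not pipeline output): how the per-document topic-score
# matrix above yields the "dominant_topic" column written to HEADLINES_TSV.
import pandas as pd

toy_scores = pd.DataFrame(
    [[0.1, 0.9], [0.8, 0.2], [0.5, 0.5]],  # rows = documents, columns = topic ids
    columns=[0, 1],
)
toy_scores["dominant_topic"] = toy_scores[[0, 1]].idxmax(axis=1)
print(toy_scores["dominant_topic"].tolist())  # [1, 0, 0] -- ties resolve to the first topic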
def get_degrees(name, topics_json):
    # Maybe this code should live in cluster_topics?
    graph = nx.read_gpickle(getFile(name, Datafile.GRAPH_PICKLE))
    for topic in topics_json:
        try:
            topics_json[topic]["degree"] = graph.degree[topic]
        except:
            pdb.set_trace()
    return topics_json
def get_word_relevance(name, topics_json):
    print("Calculating word relevance")
    dictionary = corpora.Dictionary.load(getFile(name, Datafile.DICTIONARY))
    LAMBDA = 0.9
    topic_ndarray = np.load(getFile(name, Datafile.TOPIC_NDARRAY))
    # corpus-wide probability of each token
    ps_token_corpus = np.array(
        [dictionary.cfs[token] / dictionary.num_pos for token in dictionary.keys()]
    )
    # hand-curated junk tokens that should never be surfaced as topic words
    BLACKLIST = [
        "ma_zone_forecast",
        "lottery_state_by",
        "mobile_world",
        "ct_boston_norton",
        "richard_grenell",
        "east_africa",
        "credit_cards",
    ]
    for topic, row in enumerate(topic_ndarray):
        if topic not in topics_json:
            continue
        sum_topic = np.sum(row)
        # blend the in-topic probability with its lift over the corpus probability
        topic_word_relevance = (
            row / sum_topic * LAMBDA + (1 - LAMBDA) * row / sum_topic / ps_token_corpus
        )
        top_relevant_tokens = np.argsort(topic_word_relevance)[::-1][0:20]
        top_common_tokens = np.argsort(row)[::-1][0:20]
        topics_json[topic]["relevant_words"] = [
            [dictionary.id2token[tok], topic_word_relevance[tok]]
            for tok in top_relevant_tokens
            if dictionary.id2token[tok] not in BLACKLIST
        ]
        topics_json[topic]["common_words"] = [
            [dictionary.id2token[tok], topic_word_relevance[tok]]
            for tok in top_common_tokens
            if dictionary.id2token[tok] not in BLACKLIST
        ]
        # del topics_json[str(topic)]["words"]
    return topics_json
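# Standalone sketch of the relevance weighting above (toy numbers, not project data):
# relevance = LAMBDA * p(w|t) + (1 - LAMBDA) * p(w|t) / p(w). Lowering LAMBDA favors
# words that are distinctive to the topic rather than merely frequent in it.
import numpy as np

LAMBDA = 0.9
row = np.array([30.0, 5.0, 5.0])            # per-topic counts for tokens A, B, C
p_token_corpus = np.array([0.6, 0.3, 0.1])  # corpus-wide token probabilities
p_token_topic = row / row.sum()
relevance = LAMBDA * p_token_topic + (1 - LAMBDA) * p_token_topic / p_token_corpus
print(np.argsort(relevance)[::-1])  # [0 2 1]: C outranks B thanks to its higher lift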
def export(self):
    """
    Export to a format that we can pass to the JS frontend.
    We just dump the points, edges, and elevations; fetching the headlines
    may be slow enough that we eventually need a real backend.
    """
    vx_df = pd.DataFrame(self.pts, columns=["x", "y"])
    # df.to_csv(getFile(self.name, Datafile.POINTS_TSV), sep="\t")
    # df = pd.DataFrame(self.vor.vertices, columns=["x", "y"])
    # water_coordinates, water_regions = self.compress_water()
    df = pd.DataFrame()
    df["elevation"] = self.elevation
    # df["coordinates"] = self.vor.regions
    df["coordinates"] = [json.dumps(row.tolist()) for row in self.delaunay.simplices]
    df["headlines"] = self.headlines
    df["is_edge"] = self.edges  # ugly but i hate np
    df["topics"] = self.topics
    df["flux"] = self.flux_map
    df["moisture"] = self.moisture
    df["temperature"] = self.temperature
    df["shadow"] = self.shadow
    # df = df[~df.index.isin(water_regions)]
    df = df[df["elevation"] > 0]
    df.to_csv(getFile(self.name, Datafile.REGIONS_TSV), sep="\t")
    # only keep vertices that are actually referenced by an exported region
    used_vxs = set()
    for c in df["coordinates"]:
        used_vxs.update(json.loads(c))
    vx_df = vx_df.reset_index().iloc[list(used_vxs)]
    vx_df.to_csv(getFile(self.name, Datafile.VERTICES_TSV), sep="\t")
def add_lakes(self):
    """
    Add lakes if required given the input graph
    """
    graph = nx.read_gpickle(getFile(self.name, Datafile.GRAPH_PICKLE))
    # Basically, we want to examine each Voronoi ridge, and if its corresponding
    # edge doesn't exist in the input graph, we "kill" it with a lake.
    extra_points = []
    for i, pair in enumerate(self.rough_voronoi.ridge_points):
        topicA = self.get_rough_topic(pair[0])
        topicB = self.get_rough_topic(pair[1])
        if topicA == -1 or topicB == -1:
            # One of them is a water cell, so w/e
            continue
        if topicA == topicB:
            # bordering yourself is OK
            continue
        if topicB in graph[topicA]:
            # The edge exists in the graph, so it's OK.
            continue
        # If we reach this point, we have a Voronoi ridge that links two topics
        # that SHOULD NOT have an edge
        ridge_vertices = self.rough_voronoi.vertices[self.rough_voronoi.ridge_vertices[i]]
        extra_points.extend(self.create_lake_points(ridge_vertices))
    self.set_rough_points(
        self.rough_topic_points,
        pd.concat(
            (
                self.rough_points,
                pd.DataFrame(extra_points, columns=["x", "y"]),
            ),
            ignore_index=True,
        ),
    )
    plt_voronoi(self.rough_points, self.topic_df)
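# Standalone sketch (toy points) of the scipy structures used above: each entry of
# Voronoi.ridge_points is the pair of input points a ridge separates, and the matching
# entry of Voronoi.ridge_vertices indexes into Voronoi.vertices (-1 means the ridge
# extends to infinity).
import numpy as np
from scipy.spatial import Voronoi

toy_pts = np.array([[0, 0], [1, 0], [0, 1], [1, 1], [0.5, 0.5]])
vor = Voronoi(toy_pts)
for ridge_idx, (a, b) in enumerate(vor.ridge_points):
    vert_idx = vor.ridge_vertices[ridge_idx]
    if -1 in vert_idx:
        continue  # open ridge; no finite segment to place lake points along
    endpoints = vor.vertices[vert_idx]
    print(f"ridge between input points {a} and {b}: {endpoints.tolist()}")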
    required=False,
    help="if provided, a group number to restrict to (useful for debugging)",
)
parser.add_argument(
    "-matplotlib",
    dest="matplotlib",
    action="store_const",
    const=True,
    required=False,
    help="if provided, output intermediate matplotlib plots",
)
args = parser.parse_args()

name = names.getName(args.name, args.start, args.interval)
topic_df = pd.read_csv(getFile(name, Datafile.TOPIC_METADATA_TSV), sep="\t", index_col=0)

with open(getFile(name, Datafile.LAYOUT), "rt") as f:
    layout = json.load(f)["layouts"]
layout = sorted([item for sl in layout for item in sl], key=lambda x: x["id"])
layout_df = pd.DataFrame(layout)
layout_df.index = layout_df["id"]

# this should be moved somewhere else :P
assert (topic_df.index == layout_df.index).all()
topic_df["x"] = layout_df["x"]
topic_df["y"] = layout_df["y"]
topic_df["group"] = layout_df["group"]
def analyze_topics(name, headlines=None, scores=None):
    if headlines is None:
        headlines = pd.read_csv(getFile(name, Datafile.HEADLINES_TSV), sep="\t")
    if scores is None:
        scores = pd.read_csv(getFile(name, Datafile.SCORES), sep="\t")
    # rps = np.genfromtxt(getFile(name, Datafile.RUST_PROBABILITIES), delimiter=",")
    # headlines = headlines[headlines["title"].notnull()].reset_index()
    assert len(headlines) == len(scores)
    scores.columns = scores.columns.astype(int)
    scores_sums = scores.sum()
    scores_sums = scores_sums.sort_index()

    # sentiment scoring is currently disabled; keep a zero placeholder column
    sentiments = np.zeros(len(headlines))
    """
    for i, row in headlines.iterrows():
        try:
            sentiments[i] = score_sentiment(row["title"])
        except:
            pdb.set_trace()
    """
    headlines["subjectivity"] = sentiments
    subj_map = headlines.groupby("dominant_topic").mean("subjectivity")
    count_map = headlines.groupby("dominant_topic").count()["title"]

    # get normalized count by media_name: each outlet's counts are normalized to the
    # share of its coverage devoted to each topic, then gmean/mean per topic measures
    # how evenly outlets cover that topic
    media_diversity = (
        headlines.groupby(["dominant_topic", "media_name"])
        .count()["title"]
        .unstack()
        .fillna(0)
        .apply(lambda x: x / np.sum(x))
        .apply(lambda x: scipy.stats.mstats.gmean(x) / np.mean(x), axis=1)
    )

    assert (scores_sums.index == subj_map.index).all()
    subj_map["media_diversity"] = media_diversity
    subj_map["count"] = count_map
    subj_map["size"] = scores_sums
    subj_map.to_csv(getFile(name, Datafile.TOPIC_METADATA_TSV), sep="\t")

    records = subj_map.to_dict(orient="index")
    for topic in subj_map.index:
        recent_headlines = scores[topic][scores[topic] > 0.9999].iloc[::-1][0:100]
        try:
            hdf = headlines.iloc[recent_headlines.index][
                ["title", "url", "media_name", "publish_date"]
            ]
            records[topic]["articles"] = hdf.to_dict(orient="records")
        except:
            pdb.set_trace()
    records = get_degrees(name, records)
    records = get_word_relevance(name, records)
    for topic in records:
        records[topic]["region_name"] = get_name(records, topic)

    topic_json = json.load(open(getFile(name, Datafile.TOPIC_JSON)))
    for topic in topic_json:
        topic_json[topic].update(records[int(topic)])
    with open(getFile(name, Datafile.TOPIC_JSON), "wt") as f:
        f.write(json.dumps(topic_json))
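# Standalone sketch of the media_diversity metric above, on made-up headlines:
# outlet A splits its coverage 50/50 between two topics, while outlet B gives topic 1
# only 10% of its coverage, so topic 1 ends up with the lower diversity score.
import numpy as np
import pandas as pd
import scipy.stats

rows = [("A", 0)] * 5 + [("A", 1)] * 5 + [("B", 0)] * 9 + [("B", 1)] * 1
toy = pd.DataFrame(rows, columns=["media_name", "dominant_topic"])
toy["title"] = "headline"
diversity = (
    toy.groupby(["dominant_topic", "media_name"])
    .count()["title"]
    .unstack()
    .fillna(0)
    .apply(lambda x: x / np.sum(x))
    .apply(lambda x: scipy.stats.mstats.gmean(x) / np.mean(x), axis=1)
)
print(diversity)  # topic 0 scores higher (more even coverage) than topic 1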
def sentences_to_gsdmm_rust(dictionary, corpus, num_topics, name, alpha=0.1, beta=0.1):
    # prepare files for consumption by rust executable:
    # vocabfile: one token per line
    print("preparing input for gsdmm-rust")
    with open("data/vocabfile.txt", "wt") as f:
        for t in dictionary.itervalues():
            f.write(t)
            f.write("\n")
    index = []
    with open("data/sentences.txt", "wt") as f:
        for i, doc in enumerate(corpus):
            arr = []
            for tok in doc:
                for _ in range(tok[1]):
                    arr.append(dictionary.id2token[tok[0]])
            if arr:
                f.write(" ".join(arr))
                f.write("\n")
                index.append(i)

    print("spawning gsdmm-rust subprocess")
    # spawn the rust subprocess
    stream_p = subprocess.Popen(
        [
            "gsdmm-rust/target/release/gsdmm",
            "data/sentences.txt",
            "data/vocabfile.txt",
            f"data/{name}",
            "-k",
            str(num_topics),
            "-a",
            f"{alpha}",
            "-b",
            f"{beta}",
            "-m",
            "50",
        ],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    while True:
        output = stream_p.stdout.readline()
        if stream_p.poll() is not None:
            break
        if output:
            print(output.strip())

    # now read the cluster descriptions file into an ndarray
    print("retrieving gsdmm-rust output")
    mapping = dictionary.token2id
    topic_ndarray = np.zeros((num_topics, len(mapping)))
    with open(getFile(name, Datafile.RUST_CLUSTER_DESC), "rt") as f:
        while True:
            line = f.readline().strip()
            if not line:
                break
            line = line.split(" ")
            cluster_i = int(line[0])
            cluster_words = line[1:]
            for pair in cluster_words:
                comps = pair.split(":")
                token = comps[0]
                val = int(comps[1])
                tokid = mapping[token]
                topic_ndarray[cluster_i][tokid] = val

    scores = []
    with open(getFile(name, Datafile.RUST_LABELS), "rt") as f:
        while True:
            line = f.readline().strip()
            if not line:
                break
            comps = line.split(",")
            scores.append({int(comps[0]): float(comps[1])})
    scores = pd.DataFrame(scores).fillna(0)
    scores.index = index
    return dictionary, topic_ndarray, scores
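# Standalone sketch of the cluster-description parsing above. The line format is
# inferred from the parsing loop ("<cluster_id> <token>:<count> ..."); the vocabulary
# and values here are made up for illustration.
import numpy as np

vocab = {"economy": 0, "election": 1, "storm": 2}
desc_lines = ["0 economy:12 election:3", "1 storm:7"]
topic_ndarray = np.zeros((2, len(vocab)))
for line in desc_lines:
    cluster_id, *pairs = line.split(" ")
    for pair in pairs:
        token, count = pair.split(":")
        topic_ndarray[int(cluster_id)][vocab[token]] = int(count)
print(topic_ndarray)  # rows = clusters, columns = per-token counts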
    # drop empty documents so that df, corpus, and the topic scores stay aligned
    nonempty = [i for i, val in enumerate(corpus) if val != []]
    corpus = [doc for doc in corpus if doc != []]
    df = df.iloc[nonempty].reset_index()
    topics = get_topics(
        df,
        dictionary,
        corpus,
        args.num_topics,
        name,
        alpha=args.alpha,
        beta=args.beta,
    )
    print("saving topics and dictionaries...")
    dictionary.save(getFile(name, Datafile.DICTIONARY))
    np.save(getFile(name, Datafile.TOPIC_NDARRAY), topics)
elif args.load:
    basename = args.load
    name = names.getName(basename, args.start, args.interval)
    topics = np.load(getFile(name, Datafile.TOPIC_NDARRAY))
    dictionary = corpora.Dictionary.load(getFile(name, Datafile.DICTIONARY))

name2 = None
if args.step:
    print("calculating intertopic distances")
    name2 = names.getPrevName(basename, args.start, args.interval, args.step)
    topic2_filename = getFile(name2, Datafile.TOPIC_NDARRAY)