def get_input_stream(event, gold_probs, extractor="goose", thresh=.8,
                     delay=None, topk=20, use_2015F=False, truncate=False):
    """Assemble the input-stream DataFrame for ``event``.

    The frames returned by ``InputStreamResource.get_dataframes`` are
    concatenated, per-row nugget labels and probabilities are derived, and
    only the (stream id, timestamp) groups whose stream id is flagged
    ``match`` in the deduplicated-article stats are kept.

    Relies on a module-level ``matches_df`` with "query id", "nugget id"
    and "update id" columns.

    Args:
        event: Event object; ``query_num`` and ``query_id`` are read here.
        gold_probs: When ``True`` the "probs" column is a copy of
            "true probs"; otherwise it comes from
            ``NuggetRegressor().predict(event, df)``.
        extractor: Forwarded to the resource lookups.
        thresh: Forwarded to the resource lookups.
        delay: Forwarded to ``get_dataframes``.
        topk: Forwarded to ``get_dataframes``.
        use_2015F: When ``True`` and ``event.query_num > 25`` the corpus is
            replaced by ``cuttsum.corpora.FilteredTS2015()``.
        truncate: Accepted for interface compatibility; this version does
            not truncate documents (the truncation step is disabled).

    Returns:
        The concatenation of the kept per-document DataFrames.

    Raises:
        KeyError: If a nugget id is missing from ``matches_df`` or a stream
            id is missing from the dedup stats.
        ValueError: From ``pd.concat`` when no document is kept, or from
            ``min`` when a nugget id has no rows in ``matches_df``.
        AssertionError: If group timestamps are not non-decreasing.
    """
    max_nuggets = 3

    corpus = cuttsum.corpora.get_raw_corpus(event)
    if use_2015F is True and event.query_num > 25:
        corpus = cuttsum.corpora.FilteredTS2015()
    # Explicit formatting prints the same text as the old Python 2
    # ``print event, corpus`` statement while remaining valid on Python 3.
    print("%s %s" % (event, corpus))

    res = InputStreamResource()
    df = pd.concat(
        res.get_dataframes(event, corpus, extractor, thresh, delay, topk))

    # Rows with "n conf" == 1 and no nugget probabilities get probability 1
    # for each of their listed nuggets.
    selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(
        lambda x: {n: 1 for n in x})

    # Highest nugget probability per row; the appended 0 covers empty dicts.
    df["true probs"] = df["nugget probs"].apply(
        lambda x: np.max(list(x.values()) + [0]))
    df.loc[
        (df["n conf"] == 1) & (df["nuggets"].apply(len) == 0),
        "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    # Keep only the nuggets whose probability exceeds 0.9.
    df["nuggets"] = df["nugget probs"].apply(
        lambda x: {key for key, val in x.items() if val > .9})

    # Smallest integer prefix of the "update id" values seen for each nugget
    # id that occurs in this event's matches.
    event_matches = matches_df[matches_df["query id"] == event.query_id]
    nid2time = {}
    for nid in set(event_matches["nugget id"].tolist()):
        update_ids = matches_df[matches_df["nugget id"] == nid]["update id"]
        nid2time[nid] = min(int(uid.split("-")[0]) for uid in update_ids)

    # Drop nuggets whose earliest time is later than the row's timestamp.
    df["nuggets"] = [
        {nug for nug in nuggets if nid2time[nug] <= timestamp}
        for nuggets, timestamp in zip(df["nuggets"], df["timestamp"])
    ]
    # Rows left with more than ``max_nuggets`` nuggets are reset to none.
    df["nuggets"] = df["nuggets"].apply(
        lambda x: x if len(x) <= max_nuggets else set())

    from cuttsum.pipeline import DedupedArticlesResource
    ded = DedupedArticlesResource()
    stats_df = ded.get_stats_df(event, corpus, extractor, thresh)
    # NOTE(review): eval() executes whatever text is stored in "stream ids";
    # only safe while the stats data is trusted. Consider ast.literal_eval.
    stats_df["stream ids"] = stats_df["stream ids"].apply(
        lambda x: set(eval(x)))
    sid2match = {}
    for _, row in stats_df.iterrows():
        for sid in row["stream ids"]:
            sid2match[sid] = row["match"]

    last_ts = None
    n_docs = 0
    new_docs = []
    for (sid, ts), doc in df.groupby(["stream id", "timestamp"]):
        # ``truncate`` is intentionally not applied here; the step that kept
        # only the first 20 rows of each document is disabled.
        if last_ts is not None:
            assert ts >= last_ts
        last_ts = ts
        if sid2match[sid] is True:
            new_docs.append(doc)
        n_docs += 1

    df = pd.concat(new_docs)
    print("%d %d" % (n_docs, len(new_docs)))
    return df
# Resource handles and accumulators for the per-event dedup statistics.
chunk_res = SCChunkResource()
articles_res = ArticlesResource()
ded_articles_res = DedupedArticlesResource()

data = []
event2ids = defaultdict(set)
fltr_event2ids = defaultdict(set)

for event in cuttsum.events.get_events():
    corpus = cuttsum.corpora.get_raw_corpus(event)
    hours = event.list_event_hours()

    # Per-hour counts of deduplicated articles: all rows, and rows whose
    # "match" value equals True.
    hour2ded = defaultdict(int)
    hour2ded_fltr = defaultdict(int)

    ded_df = ded_articles_res.get_stats_df(event, corpus, "goose", 0.8)
    # NOTE(review): the nesting below was reconstructed from the statement
    # order; confirm that the per-row loop is meant to run for every event.
    if ded_df is not None:

        if event.query_num > 25:
            # NOTE(review): eval() runs the stored "stream ids" text as code;
            # only safe while the stats data is trusted.
            for ids in ded_df["stream ids"].apply(eval).tolist():
                for id1 in ids:
                    event2ids[event.fs_name()].add(id1)

        for _, row in ded_df.iterrows():
            dt = datetime.utcfromtimestamp(row["earliest"])
            # Truncate the timestamp to the start of its hour.
            hour = dt.replace(minute=0, second=0, microsecond=0)
            hour2ded[hour] += 1
            if row["match"] == True:
                hour2ded_fltr[hour] += 1
def get_input_stream(
    event, gold_probs, extractor="goose", thresh=0.8, delay=None, topk=20,
    use_2015F=False, truncate=False
):
    """Assemble the input-stream DataFrame for ``event``.

    The frames returned by ``InputStreamResource.get_dataframes`` are
    concatenated, per-row nugget labels and probabilities are derived, and
    only the (stream id, timestamp) groups whose stream id is flagged
    ``match`` in the deduplicated-article stats are kept.

    Relies on a module-level ``matches_df`` with "query id", "nugget id"
    and "update id" columns.

    Args:
        event: Event object; ``query_num`` and ``query_id`` are read here.
        gold_probs: When ``True`` the "probs" column is a copy of
            "true probs"; otherwise it comes from
            ``NuggetRegressor().predict(event, df)``.
        extractor: Forwarded to the resource lookups.
        thresh: Forwarded to the resource lookups.
        delay: Forwarded to ``get_dataframes``.
        topk: Forwarded to ``get_dataframes``.
        use_2015F: When ``True`` and ``event.query_num > 25`` the corpus is
            replaced by ``cuttsum.corpora.FilteredTS2015()``.
        truncate: When ``True`` only the first 5 rows of every
            (stream id, timestamp) group are kept.

    Returns:
        The concatenation of the kept per-document DataFrames.

    Raises:
        KeyError: If a nugget id is missing from ``matches_df`` or a stream
            id is missing from the dedup stats.
        ValueError: From ``pd.concat`` when no document is kept, or from
            ``min`` when a nugget id has no rows in ``matches_df``.
        AssertionError: If group timestamps are not non-decreasing.
    """
    max_nuggets = 3
    rows_per_doc = 5  # rows kept per document when ``truncate`` is True

    corpus = cuttsum.corpora.get_raw_corpus(event)
    if use_2015F is True and event.query_num > 25:
        corpus = cuttsum.corpora.FilteredTS2015()
    # Explicit formatting prints the same text as the old Python 2
    # ``print event, corpus`` statement while remaining valid on Python 3.
    print("%s %s" % (event, corpus))

    res = InputStreamResource()
    df = pd.concat(
        res.get_dataframes(event, corpus, extractor, thresh, delay, topk))

    # Rows with "n conf" == 1 and no nugget probabilities get probability 1
    # for each of their listed nuggets.
    selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(
        lambda x: {n: 1 for n in x})

    # Highest nugget probability per row; the appended 0 covers empty dicts.
    df["true probs"] = df["nugget probs"].apply(
        lambda x: np.max(list(x.values()) + [0]))
    df.loc[
        (df["n conf"] == 1) & (df["nuggets"].apply(len) == 0),
        "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    # Keep only the nuggets whose probability exceeds 0.9.
    df["nuggets"] = df["nugget probs"].apply(
        lambda x: {key for key, val in x.items() if val > 0.9})

    # Smallest integer prefix of the "update id" values seen for each nugget
    # id that occurs in this event's matches.
    event_matches = matches_df[matches_df["query id"] == event.query_id]
    nid2time = {}
    for nid in set(event_matches["nugget id"].tolist()):
        update_ids = matches_df[matches_df["nugget id"] == nid]["update id"]
        nid2time[nid] = min(int(uid.split("-")[0]) for uid in update_ids)

    # Drop nuggets whose earliest time is later than the row's timestamp.
    df["nuggets"] = [
        {nug for nug in nuggets if nid2time[nug] <= timestamp}
        for nuggets, timestamp in zip(df["nuggets"], df["timestamp"])
    ]
    # Rows left with more than ``max_nuggets`` nuggets are reset to none.
    df["nuggets"] = df["nuggets"].apply(
        lambda x: x if len(x) <= max_nuggets else set())

    from cuttsum.pipeline import DedupedArticlesResource
    ded = DedupedArticlesResource()
    stats_df = ded.get_stats_df(event, corpus, extractor, thresh)
    # NOTE(review): eval() executes whatever text is stored in "stream ids";
    # only safe while the stats data is trusted. Consider ast.literal_eval.
    stats_df["stream ids"] = stats_df["stream ids"].apply(
        lambda x: set(eval(x)))
    sid2match = {}
    for _, row in stats_df.iterrows():
        for sid in row["stream ids"]:
            sid2match[sid] = row["match"]

    last_ts = None
    n_docs = 0
    new_docs = []
    for (sid, ts), doc in df.groupby(["stream id", "timestamp"]):
        if truncate is True:
            doc = doc.iloc[0:rows_per_doc]
        if last_ts is not None:
            assert ts >= last_ts
        last_ts = ts
        if sid2match[sid] is True:
            new_docs.append(doc)
        n_docs += 1

    df = pd.concat(new_docs)
    print("%d %d" % (n_docs, len(new_docs)))
    return df
# Resource handles and accumulators for the per-event dedup statistics.
chunk_res = SCChunkResource()
articles_res = ArticlesResource()
ded_articles_res = DedupedArticlesResource()

data = []
event2ids = defaultdict(set)
fltr_event2ids = defaultdict(set)

for event in cuttsum.events.get_events():
    corpus = cuttsum.corpora.get_raw_corpus(event)
    hours = event.list_event_hours()

    # Per-hour counts of deduplicated articles: all rows, and rows whose
    # "match" value equals True.
    hour2ded = defaultdict(int)
    hour2ded_fltr = defaultdict(int)

    ded_df = ded_articles_res.get_stats_df(event, corpus, "goose", 0.8)
    # NOTE(review): the nesting below was reconstructed from the statement
    # order; confirm that the per-row loop is meant to run for every event.
    if ded_df is not None:

        if event.query_num > 25:
            # NOTE(review): eval() runs the stored "stream ids" text as code;
            # only safe while the stats data is trusted.
            for ids in ded_df["stream ids"].apply(eval).tolist():
                for id1 in ids:
                    event2ids[event.fs_name()].add(id1)

        for _, row in ded_df.iterrows():
            dt = datetime.utcfromtimestamp(row["earliest"])
            # Truncate the timestamp to the start of its hour.
            hour = dt.replace(minute=0, second=0, microsecond=0)
            hour2ded[hour] += 1
            if row["match"] == True:
                hour2ded_fltr[hour] += 1