예제 #1
0
def get_input_stream(event,
                     gold_probs,
                     extractor="goose",
                     thresh=.8,
                     delay=None,
                     topk=20,
                     use_2015F=False,
                     truncate=False):
    max_nuggets = 3

    corpus = cuttsum.corpora.get_raw_corpus(event)
    if use_2015F is True and event.query_num > 25:
        corpus = cuttsum.corpora.FilteredTS2015()
    print event, corpus

    res = InputStreamResource()
    df = pd.concat(
        res.get_dataframes(event, corpus, extractor, thresh, delay, topk))

    selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(
        lambda x: {n: 1
                   for n in x})

    df["true probs"] = df["nugget probs"].apply(
        lambda x: [val for key, val in x.items()] + [0])
    df["true probs"] = df["true probs"].apply(lambda x: np.max(x))
    df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0),
           "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(
        lambda x: set([key for key, val in x.items() if val > .9]))

    nid2time = {}
    nids = set(matches_df[matches_df["query id"] == event.query_id]
               ["nugget id"].tolist())
    for nid in nids:
        ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(
            lambda x: int(x.split("-")[0])).tolist()
        ts.sort()
        nid2time[nid] = ts[0]

    fltr_nuggets = []
    for name, row in df.iterrows():
        fltr_nuggets.append(
            set([
                nug for nug in row["nuggets"]
                if nid2time[nug] <= row["timestamp"]
            ]))
    #print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) #  datetime.utcfromtimestamp(int(y["timestamp"])))
    #print nids
    df["nuggets"] = fltr_nuggets

    df["nuggets"] = df["nuggets"].apply(lambda x: x
                                        if len(x) <= max_nuggets else set([]))

    from cuttsum.pipeline import DedupedArticlesResource
    ded = DedupedArticlesResource()
    stats_df = ded.get_stats_df(event, corpus, extractor, thresh)
    stats_df["stream ids"] = stats_df["stream ids"].apply(
        lambda x: set(eval(x)))
    sid2match = {}
    for _, row in stats_df.iterrows():
        for sid in row["stream ids"]:
            sid2match[sid] = row["match"]

    all_ts = []
    all_docs = []
    new_docs = []
    for (sid, ts), doc in df.groupby(["stream id", "timestamp"]):
        #if truncate is True:
        doc = doc.iloc[0:20]
        #            print sub_doc
        if len(all_ts) > 0:
            assert ts >= all_ts[-1]
        all_ts.append(ts)
        if sid2match[sid] is True:
            new_docs.append(doc)
        all_docs.append(doc)

    df = pd.concat(new_docs)
    print len(all_docs), len(new_docs)
    return df
예제 #2
0
# Per-event tally of deduplicated articles by hour, plus the set of stream
# ids seen for events whose query_num is above 25.
# Only ded_articles_res is used in the portion of the script visible here.
chunk_res = SCChunkResource()
articles_res = ArticlesResource()
ded_articles_res = DedupedArticlesResource()
data = []  # not appended to in the visible portion

# Stream ids per event, keyed by event.fs_name().
event2ids = defaultdict(set)
fltr_event2ids = defaultdict(set)  # not populated in the visible portion
for event in cuttsum.events.get_events():

    corpus = cuttsum.corpora.get_raw_corpus(event)
    hours = event.list_event_hours()  # not read in the visible portion

    # Row counts per hour for this event: every stats row, and only the
    # rows whose "match" value equals True.
    hour2ded = defaultdict(int)
    hour2ded_fltr = defaultdict(int)
    ded_df = ded_articles_res.get_stats_df(event, corpus, "goose", 0.8)

    if ded_df is not None:

        if event.query_num > 25:
            # Each "stream ids" cell is a string that is eval()'d into an
            # iterable of ids.
            # NOTE(review): eval() executes arbitrary expressions -- only
            # safe while the stats data is trusted.
            for ids in ded_df["stream ids"].apply(eval).tolist():
                for id1 in ids:
                    event2ids[event.fs_name()].add(id1)

        for _, row in ded_df.iterrows():
            # Bucket the row by the hour of its "earliest" value, read as
            # a UTC timestamp.
            dt = datetime.utcfromtimestamp(row["earliest"])
            hour = datetime(dt.year, dt.month, dt.day, dt.hour)
            hour2ded[hour] += 1
            if row["match"] == True:
                hour2ded_fltr[hour] += 1
예제 #3
0
파일: run.py 프로젝트: kedz/cuttsum
def get_input_stream(
    event, gold_probs, extractor="goose", thresh=0.8, delay=None, topk=20, use_2015F=False, truncate=False
):
    max_nuggets = 3

    corpus = cuttsum.corpora.get_raw_corpus(event)
    if use_2015F is True and event.query_num > 25:
        corpus = cuttsum.corpora.FilteredTS2015()
    print event, corpus

    res = InputStreamResource()
    df = pd.concat(res.get_dataframes(event, corpus, extractor, thresh, delay, topk))

    selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(lambda x: {n: 1 for n in x})

    df["true probs"] = df["nugget probs"].apply(lambda x: [val for key, val in x.items()] + [0])
    df["true probs"] = df["true probs"].apply(lambda x: np.max(x))
    df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(lambda x: set([key for key, val in x.items() if val > 0.9]))

    nid2time = {}
    nids = set(matches_df[matches_df["query id"] == event.query_id]["nugget id"].tolist())
    for nid in nids:
        ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(lambda x: int(x.split("-")[0])).tolist()
        ts.sort()
        nid2time[nid] = ts[0]

    fltr_nuggets = []
    for name, row in df.iterrows():
        fltr_nuggets.append(set([nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"]]))
    # print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) #  datetime.utcfromtimestamp(int(y["timestamp"])))
    # print nids
    df["nuggets"] = fltr_nuggets

    df["nuggets"] = df["nuggets"].apply(lambda x: x if len(x) <= max_nuggets else set([]))

    from cuttsum.pipeline import DedupedArticlesResource

    ded = DedupedArticlesResource()
    stats_df = ded.get_stats_df(event, corpus, extractor, thresh)
    stats_df["stream ids"] = stats_df["stream ids"].apply(lambda x: set(eval(x)))
    sid2match = {}
    for _, row in stats_df.iterrows():
        for sid in row["stream ids"]:
            sid2match[sid] = row["match"]

    all_ts = []
    all_docs = []
    new_docs = []
    for (sid, ts), doc in df.groupby(["stream id", "timestamp"]):
        if truncate is True:
            doc = doc.iloc[0:5]
        #            print sub_doc
        if len(all_ts) > 0:
            assert ts >= all_ts[-1]
        all_ts.append(ts)
        if sid2match[sid] is True:
            new_docs.append(doc)
        all_docs.append(doc)

    df = pd.concat(new_docs)
    print len(all_docs), len(new_docs)
    return df
예제 #4
0
# Per-event tally of deduplicated articles by hour, plus the set of stream
# ids seen for events whose query_num is above 25.
# Only ded_articles_res is used in the portion of the script visible here.
chunk_res = SCChunkResource()
articles_res = ArticlesResource()
ded_articles_res = DedupedArticlesResource()
data = []  # not appended to in the visible portion

# Stream ids per event, keyed by event.fs_name().
event2ids = defaultdict(set)
fltr_event2ids = defaultdict(set)  # not populated in the visible portion
for event in cuttsum.events.get_events():

    corpus = cuttsum.corpora.get_raw_corpus(event)
    hours = event.list_event_hours()  # not read in the visible portion

    # Row counts per hour for this event: every stats row, and only the
    # rows whose "match" value equals True.
    hour2ded = defaultdict(int)
    hour2ded_fltr = defaultdict(int)
    ded_df = ded_articles_res.get_stats_df(event, corpus, "goose", .8)

    if ded_df is not None:

        if event.query_num > 25:
            # Each "stream ids" cell is a string that is eval()'d into an
            # iterable of ids.
            # NOTE(review): eval() executes arbitrary expressions -- only
            # safe while the stats data is trusted.
            for ids in ded_df["stream ids"].apply(eval).tolist():
                for id1 in ids:
                    event2ids[event.fs_name()].add(id1)

        for _, row in ded_df.iterrows():
            # Bucket the row by the hour of its "earliest" value, read as
            # a UTC timestamp.
            dt = datetime.utcfromtimestamp(row["earliest"])
            hour = datetime(dt.year, dt.month, dt.day, dt.hour)
            hour2ded[hour] += 1
            if row["match"] == True:
                hour2ded_fltr[hour] += 1