Example #1
0
    def do_job_unit(self, event, corpus, unit, **kwargs):
        """Fit a nugget-probability regressor for *event* and persist it.

        Builds training data from the input streams of all other events,
        fits a GradientBoostingRegressor on feature columns ``self.cols``
        with the max nugget-match probability as target, and dumps the
        fitted model with joblib under this job's model directory.

        Only a single work unit (unit == 0) is supported.
        """
        assert unit == 0

        extractor = kwargs.get('extractor', "goose")
        thresh = kwargs.get('thresh', .8)
        delay = kwargs.get('delay', None)
        topk = kwargs.get('topk', 20)

        # Train on every event except the target one; event 7 is always
        # excluded as well (reason not visible here -- TODO confirm why).
        train_events = [e for e in cuttsum.events.get_events()
                        if e.query_num not in set([event.query_num, 7])]
        res = InputStreamResource()

        y = []
        X = []
        for train_event in train_events:

            y_e = []
            X_e = []

            istream = res.get_dataframes(
                train_event,
                cuttsum.corpora.get_raw_corpus(train_event), 
                extractor, thresh, delay, topk)
            for df in istream:

                # Fully confident sentences that lack per-nugget
                # probabilities get probability 1 for each matched nugget.
                selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
                df.loc[selector, "nugget probs"] = \
                    df.loc[selector, "nuggets"].apply(lambda x: {n:1 for n in x})


                # Regression target: max nugget probability per sentence
                # (the appended 0 keeps np.max safe on empty dicts).
                df["probs"] = df["nugget probs"].apply(lambda x: [val for key, val in x.items()] +[0])
                df["probs"] = df["probs"].apply(lambda x: np.max(x))
                # Fully confident sentences with no nuggets are true zeros.
                df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "probs"] = 0
                y_t = df["probs"].values
                y_t = y_t[:, np.newaxis]
                y_e.append(y_t)
                X_t = df[self.cols].values
                X_e.append(X_t)

            # Stack per-document arrays into one per-event matrix.
            y_e = np.vstack(y_e)
            y.append(y_e)
            X_e = np.vstack(X_e)
            X.append(X_e)

 #       print "WARNING NOT USING 2014 EVENTS"
        X = np.vstack(X)
        y = np.vstack(y)

        gbc = GradientBoostingRegressor(
            n_estimators=100, learning_rate=1.,
            max_depth=3, random_state=0)
        print "fitting", event
        gbc.fit(X, y.ravel())
        # Score on the training data itself -- a sanity check, not a
        # held-out evaluation.
        print event, "SCORE", gbc.score(X, y.ravel())
        
        model_dir = self.get_model_dir(event)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        joblib.dump(gbc, self.get_model_path(event), compress=9)
Example #2
0
def get_input_stream(event,
                     gold_probs,
                     extractor="goose",
                     thresh=.8,
                     delay=None,
                     topk=20):
    """Build the annotated sentence stream for *event* from the
    filtered TS 2015 corpus.

    Returns a single DataFrame with added columns:
      * "true probs" -- gold max nugget-match probability per sentence.
      * "probs"      -- "true probs" when gold_probs is True, otherwise
                        the NuggetRegressor prediction.
      * "nuggets"    -- high-confidence (> .9) nugget ids, restricted to
                        nuggets already observed by the sentence
                        timestamp, and emptied when more than 3 match.

    Relies on the module-level matches_df table for nugget timing.
    """
    max_nuggets = 3
    corpus = cuttsum.corpora.FilteredTS2015()
    res = InputStreamResource()
    df = pd.concat(
        res.get_dataframes(event, corpus, extractor, thresh, delay, topk))

    # Fully confident sentences without per-nugget probabilities get
    # probability 1 for each matched nugget.
    selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(
        lambda x: {n: 1
                   for n in x})

    # Gold probability: max over nugget probs (the appended 0 keeps
    # np.max safe when the dict is empty).
    df["true probs"] = df["nugget probs"].apply(
        lambda x: [val for key, val in x.items()] + [0])
    df["true probs"] = df["true probs"].apply(lambda x: np.max(x))
    df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0),
           "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(
        lambda x: set([key for key, val in x.items() if val > .9]))

    # Earliest timestamp at which each nugget was matched by any update.
    nid2time = {}
    nids = set(matches_df[matches_df["query id"] == event.query_id]
               ["nugget id"].tolist())
    for nid in nids:
        ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(
            lambda x: int(x.split("-")[0])).tolist()
        ts.sort()
        nid2time[nid] = ts[0]

    # Drop nuggets that had not yet appeared at the sentence timestamp.
    fltr_nuggets = []
    for name, row in df.iterrows():
        fltr_nuggets.append(
            set([
                nug for nug in row["nuggets"]
                if nid2time[nug] <= row["timestamp"]
            ]))
    #print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) #  datetime.utcfromtimestamp(int(y["timestamp"])))
    #print nids
    df["nuggets"] = fltr_nuggets

    # Sentences matching more than max_nuggets nuggets are treated as
    # matching none.
    df["nuggets"] = df["nuggets"].apply(lambda x: x
                                        if len(x) <= max_nuggets else set([]))

    return df
Example #3
0
def get_input_stream(event, gold_probs, extractor="goose", thresh=.8, delay=None, topk=20,
        max_nuggets=None, is_filter=False):
    """Build the annotated sentence stream for *event* from its raw corpus.

    Adds columns:
      * "true probs" -- gold max nugget-match probability per sentence.
      * "probs"      -- "true probs" when gold_probs is True, otherwise
                        the NuggetRegressor prediction.
      * "nuggets"    -- high-confidence (> .97) nugget ids.

    When max_nuggets is set, sentences matching more than that many
    nuggets are emptied.  When is_filter is True, nuggets that had not
    yet appeared at the sentence timestamp are dropped (uses the
    module-level matches_df table).
    """
    corpus = cuttsum.corpora.get_raw_corpus(event)
    res = InputStreamResource()
    df = pd.concat(
        res.get_dataframes(event, corpus, extractor, thresh, delay, topk))
    # Fully confident sentences without per-nugget probabilities get
    # probability 1 for each matched nugget.
    selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(lambda x: {n: 1 for n in x})

    # Gold probability: max over nugget probs (the appended 0 keeps
    # np.max safe when the dict is empty).
    df["true probs"] = df["nugget probs"].apply(lambda x: [val for key, val in x.items()] + [0])
    df["true probs"] = df["true probs"].apply(lambda x: np.max(x))
    df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(
        lambda x: set([key for key, val in x.items() if val > .97]))

    if max_nuggets is not None:

        def sortme(x):
            # Top max_nuggets nugget ids with prob > .5, best first.
            l = [(key, val) for key, val in x.items() if val > .5]
            # FIX: sorted() returns a new list; the original discarded the
            # result, so the truncation below ran on an unsorted list.
            l = sorted(l, key=lambda y: y[1], reverse=True)
            return [k for k, v in l[:max_nuggets]]

        df["nuggets"] = df["nuggets"].apply(lambda x: x if len(x) <= max_nuggets else set([]))
        #df["nuggets"] = df["nugget probs"].apply(sortme)

    if is_filter:
        # Earliest timestamp at which each nugget was matched by any update.
        nid2time = {}
        nids = set(matches_df[matches_df["query id"] == event.query_id]["nugget id"].tolist())
        for nid in nids:
            ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(lambda x: int(x.split("-")[0])).tolist()
            ts.sort()
            nid2time[nid] = ts[0]
        # Drop nuggets that had not yet appeared at the sentence timestamp.
        fltr_nuggets = []
        for name, row in df.iterrows():
            fltr_nuggets.append(
                set([nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"]]))
        df["nuggets"] = fltr_nuggets
    return df
Example #4
0
def get_input_stream(event, gold_probs, extractor="goose", thresh=.8, delay=None, topk=20):
    """Build the annotated sentence stream for *event* from its raw corpus.

    Adds "true probs" (gold max nugget-match probability), "probs"
    (gold or NuggetRegressor-predicted), and a high-confidence (> .9)
    "nuggets" id set restricted to nuggets already observed by the
    sentence timestamp; sentences matching more than 3 nuggets are
    emptied.  Relies on the module-level matches_df table.
    """
    max_nuggets = 3
    corpus = cuttsum.corpora.get_raw_corpus(event)
    res = InputStreamResource()
    df = pd.concat(
        res.get_dataframes(event, corpus, extractor, thresh, delay, topk))

    # Fully confident sentences without per-nugget probabilities get
    # probability 1 for each matched nugget.
    selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(lambda x: {n:1 for n in x})

    # Gold probability: max over nugget probs (appended 0 keeps np.max
    # safe when the dict is empty).
    df["true probs"] = df["nugget probs"].apply(lambda x: [val for key, val in x.items()] +[0])
    df["true probs"] = df["true probs"].apply(lambda x: np.max(x))
    df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)   
    
    df["nuggets"] = df["nugget probs"].apply(
        lambda x: set([key for key, val in x.items() if val > .9]))


    # Earliest timestamp at which each nugget was matched by any update.
    nid2time = {}
    nids = set(matches_df[matches_df["query id"] == event.query_id]["nugget id"].tolist())
    for nid in nids:
        ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(lambda x: int(x.split("-")[0])).tolist()
        ts.sort()
        nid2time[nid] = ts[0]

    # Drop nuggets that had not yet appeared at the sentence timestamp.
    fltr_nuggets = []
    for name, row in df.iterrows():
        fltr_nuggets.append(
            set([nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"]]))
    #print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) #  datetime.utcfromtimestamp(int(y["timestamp"])))
    #print nids
    df["nuggets"] = fltr_nuggets

    # Sentences matching more than max_nuggets nuggets are treated as
    # matching none.
    df["nuggets"] = df["nuggets"].apply(lambda x: x if len(x) <= max_nuggets else set([]))

    return df
Example #5
0
def get_input_stream(event,
                     gold_probs,
                     extractor="goose",
                     thresh=.8,
                     delay=None,
                     topk=20,
                     use_2015F=False,
                     truncate=False):
    """Build the annotated sentence stream for *event*, keeping only
    documents flagged as matches in the dedup statistics.

    Adds "true probs" (gold max nugget-match probability), "probs"
    (gold or NuggetRegressor-predicted), and a time-filtered,
    high-confidence (> .9) "nuggets" id set capped at 3 per sentence.
    With use_2015F, events after query 25 read the filtered TS 2015
    corpus instead of the raw corpus.

    NOTE(review): the truncate parameter is accepted but ignored -- the
    check is commented out below and every document is unconditionally
    cut to its first 20 sentences; confirm this is intended.
    """
    max_nuggets = 3

    corpus = cuttsum.corpora.get_raw_corpus(event)
    if use_2015F is True and event.query_num > 25:
        corpus = cuttsum.corpora.FilteredTS2015()
    print event, corpus

    res = InputStreamResource()
    df = pd.concat(
        res.get_dataframes(event, corpus, extractor, thresh, delay, topk))

    # Fully confident sentences without per-nugget probabilities get
    # probability 1 for each matched nugget.
    selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(
        lambda x: {n: 1
                   for n in x})

    # Gold probability: max over nugget probs (appended 0 keeps np.max
    # safe when the dict is empty).
    df["true probs"] = df["nugget probs"].apply(
        lambda x: [val for key, val in x.items()] + [0])
    df["true probs"] = df["true probs"].apply(lambda x: np.max(x))
    df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0),
           "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(
        lambda x: set([key for key, val in x.items() if val > .9]))

    # Earliest timestamp at which each nugget was matched by any update.
    nid2time = {}
    nids = set(matches_df[matches_df["query id"] == event.query_id]
               ["nugget id"].tolist())
    for nid in nids:
        ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(
            lambda x: int(x.split("-")[0])).tolist()
        ts.sort()
        nid2time[nid] = ts[0]

    # Drop nuggets that had not yet appeared at the sentence timestamp.
    fltr_nuggets = []
    for name, row in df.iterrows():
        fltr_nuggets.append(
            set([
                nug for nug in row["nuggets"]
                if nid2time[nug] <= row["timestamp"]
            ]))
    #print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) #  datetime.utcfromtimestamp(int(y["timestamp"])))
    #print nids
    df["nuggets"] = fltr_nuggets

    # Sentences matching more than max_nuggets nuggets are treated as
    # matching none.
    df["nuggets"] = df["nuggets"].apply(lambda x: x
                                        if len(x) <= max_nuggets else set([]))

    # Map each stream id to its dedup "match" flag so non-matching
    # documents can be dropped below.
    from cuttsum.pipeline import DedupedArticlesResource
    ded = DedupedArticlesResource()
    stats_df = ded.get_stats_df(event, corpus, extractor, thresh)
    stats_df["stream ids"] = stats_df["stream ids"].apply(
        lambda x: set(eval(x)))
    sid2match = {}
    for _, row in stats_df.iterrows():
        for sid in row["stream ids"]:
            sid2match[sid] = row["match"]

    all_ts = []
    all_docs = []
    new_docs = []
    for (sid, ts), doc in df.groupby(["stream id", "timestamp"]):
        #if truncate is True:
        doc = doc.iloc[0:20]
        #            print sub_doc
        # Documents must arrive in non-decreasing timestamp order.
        if len(all_ts) > 0:
            assert ts >= all_ts[-1]
        all_ts.append(ts)
        if sid2match[sid] is True:
            new_docs.append(doc)
        all_docs.append(doc)

    df = pd.concat(new_docs)
    print len(all_docs), len(new_docs)
    return df
Example #6
0
def get_input_stream(
    event, gold_probs, extractor="goose", thresh=0.8, delay=None, topk=20, use_2015F=False, truncate=False
):
    """Build the annotated sentence stream for *event*, keeping only
    documents flagged as matches in the dedup statistics.

    Adds "true probs" (gold max nugget-match probability), "probs"
    (gold or NuggetRegressor-predicted), and a time-filtered,
    high-confidence (> 0.9) "nuggets" id set capped at 3 per sentence.
    With use_2015F, events after query 25 read the filtered TS 2015
    corpus; with truncate, each document is cut to its first 5
    sentences.  Relies on the module-level matches_df table.
    """
    max_nuggets = 3

    corpus = cuttsum.corpora.get_raw_corpus(event)
    if use_2015F is True and event.query_num > 25:
        corpus = cuttsum.corpora.FilteredTS2015()
    print event, corpus

    res = InputStreamResource()
    df = pd.concat(res.get_dataframes(event, corpus, extractor, thresh, delay, topk))

    # Fully confident sentences without per-nugget probabilities get
    # probability 1 for each matched nugget.
    selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(lambda x: {n: 1 for n in x})

    # Gold probability: max over nugget probs (appended 0 keeps np.max
    # safe when the dict is empty).
    df["true probs"] = df["nugget probs"].apply(lambda x: [val for key, val in x.items()] + [0])
    df["true probs"] = df["true probs"].apply(lambda x: np.max(x))
    df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(lambda x: set([key for key, val in x.items() if val > 0.9]))

    # Earliest timestamp at which each nugget was matched by any update.
    nid2time = {}
    nids = set(matches_df[matches_df["query id"] == event.query_id]["nugget id"].tolist())
    for nid in nids:
        ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(lambda x: int(x.split("-")[0])).tolist()
        ts.sort()
        nid2time[nid] = ts[0]

    # Drop nuggets that had not yet appeared at the sentence timestamp.
    fltr_nuggets = []
    for name, row in df.iterrows():
        fltr_nuggets.append(set([nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"]]))
    # print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) #  datetime.utcfromtimestamp(int(y["timestamp"])))
    # print nids
    df["nuggets"] = fltr_nuggets

    # Sentences matching more than max_nuggets nuggets are treated as
    # matching none.
    df["nuggets"] = df["nuggets"].apply(lambda x: x if len(x) <= max_nuggets else set([]))

    # Map each stream id to its dedup "match" flag so non-matching
    # documents can be dropped below.
    from cuttsum.pipeline import DedupedArticlesResource

    ded = DedupedArticlesResource()
    stats_df = ded.get_stats_df(event, corpus, extractor, thresh)
    stats_df["stream ids"] = stats_df["stream ids"].apply(lambda x: set(eval(x)))
    sid2match = {}
    for _, row in stats_df.iterrows():
        for sid in row["stream ids"]:
            sid2match[sid] = row["match"]

    all_ts = []
    all_docs = []
    new_docs = []
    for (sid, ts), doc in df.groupby(["stream id", "timestamp"]):
        if truncate is True:
            doc = doc.iloc[0:5]
        #            print sub_doc
        # Documents must arrive in non-decreasing timestamp order.
        if len(all_ts) > 0:
            assert ts >= all_ts[-1]
        all_ts.append(ts)
        if sid2match[sid] is True:
            new_docs.append(doc)
        all_docs.append(doc)

    df = pd.concat(new_docs)
    print len(all_docs), len(new_docs)
    return df
Example #7
0
def get_input_stream(event, extractor="goose", thresh=0.8, delay=None, topk=20):
    """Return the input-stream dataframes for *event* from its raw corpus."""
    raw_corpus = cuttsum.corpora.get_raw_corpus(event)
    resource = InputStreamResource()
    return resource.get_dataframes(
        event, raw_corpus, extractor, thresh, delay, topk)
Example #8
0
def get_input_stream(event,
                     gold_probs,
                     extractor="goose",
                     thresh=.8,
                     delay=None,
                     topk=20,
                     max_nuggets=None,
                     is_filter=False):
    """Build the annotated sentence stream for *event* from its raw corpus.

    Adds columns:
      * "true probs" -- gold max nugget-match probability per sentence.
      * "probs"      -- "true probs" when gold_probs is True, otherwise
                        the NuggetRegressor prediction.
      * "nuggets"    -- high-confidence (> .97) nugget ids.

    When max_nuggets is set, sentences matching more than that many
    nuggets are emptied.  When is_filter is True, nuggets that had not
    yet appeared at the sentence timestamp are dropped (uses the
    module-level matches_df table).
    """
    corpus = cuttsum.corpora.get_raw_corpus(event)
    res = InputStreamResource()
    df = pd.concat(
        res.get_dataframes(event, corpus, extractor, thresh, delay, topk))
    # Fully confident sentences without per-nugget probabilities get
    # probability 1 for each matched nugget.
    selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(
        lambda x: {n: 1
                   for n in x})

    # Gold probability: max over nugget probs (the appended 0 keeps
    # np.max safe when the dict is empty).
    df["true probs"] = df["nugget probs"].apply(
        lambda x: [val for key, val in x.items()] + [0])
    df["true probs"] = df["true probs"].apply(lambda x: np.max(x))
    df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0),
           "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(
        lambda x: set([key for key, val in x.items() if val > .97]))

    if max_nuggets is not None:

        def sortme(x):
            # Top max_nuggets nugget ids with prob > .5, best first.
            l = [(key, val) for key, val in x.items() if val > .5]
            # FIX: sorted() returns a new list; the original discarded the
            # result, so the truncation below ran on an unsorted list.
            l = sorted(l, key=lambda y: y[1], reverse=True)
            return [k for k, v in l[:max_nuggets]]

        df["nuggets"] = df["nuggets"].apply(
            lambda x: x if len(x) <= max_nuggets else set([]))
        #df["nuggets"] = df["nugget probs"].apply(sortme)

    if is_filter:
        # Earliest timestamp at which each nugget was matched by any update.
        nid2time = {}
        nids = set(matches_df[matches_df["query id"] == event.query_id]
                   ["nugget id"].tolist())
        for nid in nids:
            ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(
                lambda x: int(x.split("-")[0])).tolist()
            ts.sort()
            nid2time[nid] = ts[0]
        # Drop nuggets that had not yet appeared at the sentence timestamp.
        fltr_nuggets = []
        for name, row in df.iterrows():
            fltr_nuggets.append(
                set([
                    nug for nug in row["nuggets"]
                    if nid2time[nug] <= row["timestamp"]
                ]))
        df["nuggets"] = fltr_nuggets
    return df
Example #9
0
def main(learner, training_ids, test_ids, sample_size, 
         n_iters, report_dir_base):
    """Train a vw learning-to-search summarizer and write reports.

    Loads the input stream for every training/test event, builds the
    search task named by *learner*, then for n_iters iterations: learns
    on a downsampled training set, dumps the model's feature weights,
    and writes prediction reports for all training and test events.

    NOTE(review): an unrecognized *learner* leaves ``task`` unbound and
    raises NameError below -- confirm whether that is intended.
    """

    extractor = "goose" 
    topk = 20
    delay = None
    threshold = .8
    res = InputStreamResource()

    events = [e for e in cuttsum.events.get_events()
              if e.query_num in training_ids or e.query_num in test_ids]
    training_insts = []
    test_insts = []
    for event in events:
        print "Loading event", event.fs_name()
        corpus = cuttsum.corpora.get_raw_corpus(event)

        # A list of dataframes. Each dataframe is a document with =< 20 sentences.
        # This is the events document stream.
        dataframes = res.get_dataframes(event, corpus, extractor, threshold,
                delay, topk)

        if event.query_num in training_ids:
            training_insts.append((event, dataframes))    
           
        if event.query_num in test_ids:
            test_insts.append((event, dataframes))    

    # Init l2s task.
    vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024  --quiet  --search_no_caching")

    # Dispatch on the learner name to the matching search-task class.
    #task = vw.init_search_task(UpdateSummarizer)
    if learner == "PerfectOracle":
        task = vw.init_search_task(PerfectOracle)
    elif learner == "LessPerfectOracle":
        task = vw.init_search_task(LessPerfectOracle)
    elif learner == "SelectLexNextOracle":
        task = vw.init_search_task(SelectLexNextOracle)
    elif learner == "SelectLexNextLex":
        task = vw.init_search_task(SelectLexNextLex)
    elif learner == "SelectLexNextLexCache":
        task = vw.init_search_task(SelectLexNextLexCache)
    elif learner == "SelectLexGenericNextOracle":
        task = vw.init_search_task(SelectLexGenericNextOracle)
    elif learner == "SelectBasicNextBias":
        task = vw.init_search_task(SelectBasicNextBias)
    elif learner == "SelectBasicNextBiasDocAvg":
        task = vw.init_search_task(SelectBasicNextBiasDocAvg)
    
    for n_iter in range(n_iters):
        print "iter", n_iter + 1
        # Learn on a random downsample, then snapshot feature weights.
        ds = downsample(training_insts, size=sample_size)
        task.learn(ds)
        all_train_df = [df for inst in training_insts for df in inst[1]]
        feature_weights = task.get_feature_weights(all_train_df)

        write_model(feature_weights, report_dir_base, n_iter)

        for event, dataframes in training_insts:
            # Predict a sequence for this training examples and see if it is sensible.
            print "PREDICTING", event.fs_name()
            sequence, scores = task.predict_with_scores((event, dataframes))
            print sequence
            make_report(event, dataframes, sequence, scores, "train", n_iter,
                report_dir_base)


        for event, dataframes in test_insts:
            # Predict a sequence for this training examples and see if it is sensible.
            print "PREDICTING", event.fs_name()
            sequence, scores = task.predict_with_scores((event, dataframes))
            print sequence
            make_report(event, dataframes, sequence, scores, "test", n_iter,
                report_dir_base)
Example #10
0
def main(learner, training_ids, test_ids, sample_size, n_iters,
         report_dir_base):
    """Train a vw learning-to-search summarizer and write reports.

    Loads the input stream for every training/test event, builds the
    search task named by *learner*, then for n_iters iterations: learns
    on a downsampled training set, dumps the model's feature weights,
    and writes prediction reports for all training and test events.

    NOTE(review): an unrecognized *learner* leaves ``task`` unbound and
    raises NameError below -- confirm whether that is intended.
    """

    extractor = "goose"
    topk = 20
    delay = None
    threshold = .8
    res = InputStreamResource()

    events = [
        e for e in cuttsum.events.get_events()
        if e.query_num in training_ids or e.query_num in test_ids
    ]
    training_insts = []
    test_insts = []
    for event in events:
        print "Loading event", event.fs_name()
        corpus = cuttsum.corpora.get_raw_corpus(event)

        # A list of dataframes. Each dataframe is a document with =< 20 sentences.
        # This is the events document stream.
        dataframes = res.get_dataframes(event, corpus, extractor, threshold,
                                        delay, topk)

        if event.query_num in training_ids:
            training_insts.append((event, dataframes))

        if event.query_num in test_ids:
            test_insts.append((event, dataframes))

    # Init l2s task.
    vw = pyvw.vw(
        "--search 0 --csoaa_ldf m --search_task hook --ring_size 1024  --quiet  --search_no_caching"
    )

    # Dispatch on the learner name to the matching search-task class.
    #task = vw.init_search_task(UpdateSummarizer)
    if learner == "PerfectOracle":
        task = vw.init_search_task(PerfectOracle)
    elif learner == "LessPerfectOracle":
        task = vw.init_search_task(LessPerfectOracle)
    elif learner == "SelectLexNextOracle":
        task = vw.init_search_task(SelectLexNextOracle)
    elif learner == "SelectLexNextLex":
        task = vw.init_search_task(SelectLexNextLex)
    elif learner == "SelectLexNextLexCache":
        task = vw.init_search_task(SelectLexNextLexCache)
    elif learner == "SelectLexGenericNextOracle":
        task = vw.init_search_task(SelectLexGenericNextOracle)
    elif learner == "SelectBasicNextBias":
        task = vw.init_search_task(SelectBasicNextBias)
    elif learner == "SelectBasicNextBiasDocAvg":
        task = vw.init_search_task(SelectBasicNextBiasDocAvg)

    for n_iter in range(n_iters):
        print "iter", n_iter + 1
        # Learn on a random downsample, then snapshot feature weights.
        ds = downsample(training_insts, size=sample_size)
        task.learn(ds)
        all_train_df = [df for inst in training_insts for df in inst[1]]
        feature_weights = task.get_feature_weights(all_train_df)

        write_model(feature_weights, report_dir_base, n_iter)

        for event, dataframes in training_insts:
            # Predict a sequence for this training examples and see if it is sensible.
            print "PREDICTING", event.fs_name()
            sequence, scores = task.predict_with_scores((event, dataframes))
            print sequence
            make_report(event, dataframes, sequence, scores, "train", n_iter,
                        report_dir_base)

        for event, dataframes in test_insts:
            # Predict a sequence for this training examples and see if it is sensible.
            print "PREDICTING", event.fs_name()
            sequence, scores = task.predict_with_scores((event, dataframes))
            print sequence
            make_report(event, dataframes, sequence, scores, "test", n_iter,
                        report_dir_base)
Example #11
0
def get_input_stream(event, extractor="goose", thresh=.8, delay=None, topk=20):
    """Return the input-stream dataframes for *event* from its raw corpus."""
    resource = InputStreamResource()
    return resource.get_dataframes(
        event, cuttsum.corpora.get_raw_corpus(event),
        extractor, thresh, delay, topk)