def main():
    results = []
    res = ArticlesResource()

    for ext in ["gold", "goose"]:
        for event in cuttsum.events.get_2013_events():
            if event.query_id.startswith("TS13"):
                corpus = cuttsum.corpora.EnglishAndUnknown2013()
            else:
                raise Exception(
                    "Unexpected query id: {}".format(event.query_id))
            # Sentinel bounds: any real article hour sorts before MAXYEAR and
            # after MINYEAR, so the first stream item replaces both.
            min_hour = datetime.datetime(datetime.MAXYEAR, 1, 1)
            max_hour = datetime.datetime(datetime.MINYEAR, 1, 1)
            total = 0

            for hour, path, si in res.streamitem_iter(event, corpus, ext):
                if hour < min_hour:
                    min_hour = hour
                if hour > max_hour:
                    max_hour = hour
                total += 1
            if total == 0:
                continue
            results.append({"event": event.fs_name(),
                            "event start": event.list_event_hours()[0],
                            "event stop": event.list_event_hours()[-1],
                            "article start": min_hour,
                            "article stop": max_hour,
                            "total": total,
                            "annotator": ext})
    df = pd.DataFrame(results,
        columns=["event", "annotator", "event start", "event stop",
                 "article start", "article stop", "total"])
    print df
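
The snippet above relies on the surrounding module's imports; a minimal sketch of what it assumes (the module that provides ArticlesResource is not shown here, so that import is left as a placeholder):

import datetime

import pandas as pd

import cuttsum.corpora
import cuttsum.events
# ArticlesResource is defined elsewhere in the package; import it from
# wherever the surrounding module does.
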
Example #3
    def do_job_unit(self, event, corpus, unit, **kwargs):
        if unit != 0:
            raise Exception("Job unit {} out of range".format(unit))
        
        res = ArticlesResource()
        thresh = kwargs.get("dedupe-sim-threshold", .8)
        extractor = kwargs.get("extractor", "goose")
        hasher = FeatureHasher(input_type="pair", non_negative=True)
        si_iter = res.streamitem_iter(
            event, corpus, extractor) 

        def to_df(all_ids, all_times, all_matches):
            # One row per deduplicated cluster: its stream ids, hit count,
            # query-match flag, and first/second/third/last timestamps.
            d = []
            for ids, times, match in izip(all_ids, all_times, all_matches):

                times.sort()
                d.append({
                    "stream ids": ids, "hits": len(ids), "match": match,
                    "earliest": times[0], "latest": times[-1], 
                    "second": times[1] if len(times) >= 2 else None,
                    "third": times[2] if len(times) >= 3 else None,
                })
            return pd.DataFrame(d, columns=["stream ids", "match", "hits", 
                                            "earliest", "latest", 
                                            "second", "third"])    

        def query_in_top20(event, df):
            # True if every query term appears (case-insensitively) somewhere
            # in the article's first 20 sentences.
            text = u"\n".join(df["sent text"].tolist()[:20])
            for query in event.query:
                if not re.search(query, text, flags=re.I|re.UNICODE):
                    return False
            return True

        def make_time(df):
            return df["timestamp"].tolist()[0]

        def make_counts(df, slimit=20):
            counts = defaultdict(int)
            for words in df["words"].tolist()[:slimit]:
                for word in words:
                    counts[word.lower()] += 1   
            return counts

        def next_chunk_file(chunk_file_num):
            deduped_path_fmt = self.get_deduped_path_fmt(
                event, corpus, extractor, threshold=thresh)
            deduped_path = deduped_path_fmt.format(
                chunk_file_num)
            deduped_dir = os.path.dirname(deduped_path)
            if not os.path.exists(deduped_dir):
                os.makedirs(deduped_dir)
            
            if os.path.exists(deduped_path):
                os.remove(deduped_path)

            return sc.Chunk(path=deduped_path, mode="wb", 
                message=corpus.sc_msg())



        # Per-cluster bookkeeping, initialized up front so an empty stream
        # iterator still produces an (empty) stats file below.
        X = None
        times = []
        ids = []
        matches = []

        chunk_file_num = 1
        chunk = next_chunk_file(chunk_file_num)

        for hour, path, si in si_iter:
            df = si2df(si, extractor=extractor)
            counts = make_counts(df)
            x = hasher.transform([counts.items()])
            x.shape = (1, hasher.n_features)
            
            if X is None:
                X = x
                times = [[make_time(df)]]
                ids = [[si.stream_id]]
                matches = [query_in_top20(event, df)]

                chunk.add(si)
                        
            else:
                # K has shape (n_kept, 1); its argmax is the row of the most
                # similar article kept so far.
                K = cosine_similarity(X, x)
                k_argmax = K.argmax()

                if K.max() < thresh:
                    
                    X = vstack([X, x])
                    times.append([make_time(df)])
                    ids.append([si.stream_id])
                    matches.append(query_in_top20(event, df))

                    if X.shape[0] % 1000 == 0:
                        chunk.close()
                        chunk_file_num += 1
                        chunk = next_chunk_file(chunk_file_num)

                    chunk.add(si)
                    
                else:
                    times[k_argmax].append(make_time(df))
                    ids[k_argmax].append(si.stream_id)
               
        chunk.close() 
     
        df = to_df(ids, times, matches)            
        print df

        stats_path = self.get_stats_path(
            event, corpus, extractor, thresh)
        with open(stats_path, "w") as f:
            df.to_csv(f, index=False, sep="\t")
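
The core of the job above is the similarity test: hash each article's word counts into a fixed-width vector and keep the article only if its cosine similarity to every previously kept article stays below the threshold. A self-contained sketch of that step on toy strings (independent of StreamItems and chunk files; plain signed hashing stands in for the non_negative hasher used above):

from collections import defaultdict

from scipy.sparse import vstack
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics.pairwise import cosine_similarity


def dedupe(docs, thresh=.8):
    hasher = FeatureHasher(input_type="pair")
    kept = []
    X = None
    for doc in docs:
        # Bag-of-words counts, hashed into a sparse (1, n_features) row.
        counts = defaultdict(int)
        for word in doc.lower().split():
            counts[word] += 1
        x = hasher.transform([counts.items()])
        # Keep the document only if nothing kept so far is too similar.
        if X is None or cosine_similarity(X, x).max() < thresh:
            X = x if X is None else vstack([X, x])
            kept.append(doc)
    return kept


print dedupe(["a storm hit the coast today",
              "a storm hit the coast again today",
              "completely unrelated sports report"])
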
Example #4
    def do_job_unit(self, event, corpus, unit, **kwargs):
        if unit != 0:
            raise Exception("unit of work out of bounds!")
        extractor = kwargs.get("extractor", "gold")
        soft_match = kwargs.get("soft_match", False)
        budget = kwargs.get("budget", 25)

        output_path_prefix = self.get_path_prefix(event, corpus, extractor,
                                                  budget, soft_match)

        ## Set up summarizer ###

        # This is the monotone submodular objective function (basically
        # nugget coverage).
        def f_of_A(system, A, V_min_A, e, input_df, ndarray_data):
            return len(
                set([
                    nugget for nuggets in input_df.ix[A, "nuggets"].tolist()
                    for nugget in nuggets
                ]))

        system = sumpy.system.MonotoneSubmodularBasic(f_of_A=f_of_A, k=budget)

        # Get gold matchings for oracle.
        articles = ArticlesResource()
        all_matches = cuttsum.judgements.get_merged_dataframe()
        matches = all_matches[all_matches["query id"] == event.query_id]

        # Set up soft matching if we are using it.
        if soft_match is True:
            from cuttsum.classifiers import NuggetClassifier
            classify_nuggets = NuggetClassifier().get_classifier(event)
            if event.query_id.startswith("TS13"):
                judged = cuttsum.judgements.get_2013_updates()
                judged = judged[judged["query id"] == event.query_id]
                judged_uids = set(judged["update id"].tolist())
            else:
                raise Exception("Bad corpus!")

        # All sentences containing nuggets will go in all_df.
        all_df = []
        # Pull out articles with nuggets.
        for hour, path, si in articles.streamitem_iter(event, corpus,
                                                       extractor):

            # Convert stream item to dataframe and add gold label nuggets.
            df = si2df(si, extractor=extractor)
            df["nuggets"] = df["update id"].apply(lambda x: set(matches[
                matches["update id"] == x]["nugget id"].tolist()))

            # Perform soft nugget matching on unjudged sentences.
            if soft_match is True:
                ### NOTA BENE: getting an array of indices to index unjudged
                # sentences so I can force pandas to return a view and not a
                # copy.
                I = np.where(
                    df["update id"].apply(lambda x: x not in judged_uids))[0]

                unjudged = df[df["update id"].apply(
                    lambda x: x not in judged_uids)]
                unjudged_sents = unjudged["sent text"].tolist()
                assert len(unjudged_sents) == I.shape[0]

                df.loc[I, "nuggets"] = classify_nuggets(unjudged_sents)

            # Add sentences with nuggets to the final set for summarizing.
            df = df[df["nuggets"].apply(len) > 0]
            all_df.append(df)

        # Collect all dataframes into one and reset index (ALWAYS RESET
        # THE INDEX because pandas hates me.)
        all_df = pd.concat(all_df)
        all_df.reset_index(inplace=True)

        summary = system.summarize(all_df)
        F_of_S = len(
            set(n for ns in summary._df["nuggets"].tolist() for n in ns))

        #print "F(S)", F_of_S
        #print "summary nuggets"
        sum_nuggets = list(
            set(n for ns in summary._df["nuggets"].tolist() for n in ns))
        sum_nuggets.sort()
        print sum_nuggets

        possible_nuggets = list(
            set(n for ns in all_df["nuggets"].tolist() for n in ns))
        possible_nuggets.sort()
        print possible_nuggets
        print len(possible_nuggets)

        event_nuggets = set(matches["nugget id"].tolist())
        total_nuggets = len(event_nuggets)
        timestamp = int(si.stream_id.split("-")[0])

        output_df = pd.DataFrame(
            [
                {
                    "Cum. F(S)": F_of_S,
                    "F(S)": F_of_S,
                    "UB no const.":
                    len(possible_nuggets),  #      total_nuggets, 
                    "budget": budget,
                    "Tot. Updates": len(summary._df),
                    "event title": event.fs_name(),
                    "timestamp": timestamp,
                    "query id": event.query_id
                },
            ],
            columns=[
                "timestamp",
                "query id",
                "event title",
                "Cum. F(S)",
                "F(S)",
                "UB no const.",
                "Tot. Updates",
                "budget",
            ])

        parent = os.path.dirname(output_path_prefix)
        if not os.path.exists(parent):
            os.makedirs(parent)

        stats_path = output_path_prefix + ".stats.tsv"
        updates_path = output_path_prefix + ".updates.tsv"

        with open(stats_path, "w") as f:
            output_df.to_csv(f, sep="\t", index=False)
        summary._df["sent text"] = summary._df["sent text"].apply(
            lambda x: x.encode("utf-8"))
        with open(updates_path, "w") as f:
            summary._df[["timestamp", "update id",
                         "sent text"]].sort(["update id"]).to_csv(f,
                                                                  sep="\t",
                                                                  index=False)
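
f_of_A above is plain nugget coverage, so MonotoneSubmodularBasic effectively runs greedy maximum coverage under a size budget. A minimal standalone sketch of that behavior (an illustration of the objective, not sumpy's implementation):

def greedy_coverage(candidates, budget):
    # candidates: list of (update id, set of nugget ids). Repeatedly pick the
    # update covering the most not-yet-covered nuggets until the budget runs
    # out or no update adds anything new.
    covered = set()
    selected = []
    for _ in range(budget):
        remaining = [c for c in candidates if c[0] not in selected]
        if not remaining:
            break
        uid, nuggets = max(remaining, key=lambda c: len(c[1] - covered))
        if not nuggets - covered:
            break
        selected.append(uid)
        covered |= nuggets
    return selected, covered


selected, covered = greedy_coverage(
    [("u1", {"n1", "n2"}), ("u2", {"n2"}), ("u3", {"n3"})], budget=2)
print selected, sorted(covered)   # ['u1', 'u3'] ['n1', 'n2', 'n3']
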
Example #5
    def do_job_unit(self, event, corpus, unit, **kwargs):

        if unit != 0:
            raise Exception("unit of work out of bounds!")
        extractor = kwargs.get("extractor", "gold")
        soft_match = kwargs.get("soft_match", False)
        budget = kwargs.get("budget", 25)

        output_path_prefix = self.get_path_prefix(event, corpus, extractor,
                                                  budget, soft_match)

        ## Set up summarizer ###
        def f_of_A(system, A, V_min_A, e, input_df, ndarray_data):
            return len(
                set([
                    nugget for nuggets in input_df.ix[A, "nuggets"].tolist()
                    for nugget in nuggets
                ]))

        system = sumpy.system.MonotoneSubmodularBasic(f_of_A=f_of_A, k=budget)

        # Collect all previously collected nuggets here.
        nugget_cache = set()

        # Get gold matchings for oracle.
        articles = ArticlesResource()
        all_matches = cuttsum.judgements.get_merged_dataframe()
        matches = all_matches[all_matches["query id"] == event.query_id]

        # Set up soft matching if we are using it.
        if soft_match is True:
            from cuttsum.classifiers import NuggetClassifier
            classify_nuggets = NuggetClassifier().get_classifier(event)
            if event.query_id.startswith("TS13"):
                judged = cuttsum.judgements.get_2013_updates()
                judged = judged[judged["query id"] == event.query_id]
                judged_uids = set(judged["update id"].tolist())
            else:
                raise Exception("Bad corpus!")

        # Collect stats for each document here.
        stats = []

        # Aggregate summaries in summary_df.
        summary_df = []
        cum_F_of_S = 0

        all_seen_nuggets = set()

        #        event_nuggets = set(matches["nugget id"].tolist())
        #        total_nuggets = len(event_nuggets)
        total_updates = 0

        # Pull out articles with nuggets.
        for hour, path, si in articles.streamitem_iter(event, corpus,
                                                       extractor):
            print hour, si.stream_id
            # Convert stream item to dataframe and add gold label nuggets.
            df = si2df(si, extractor=extractor)
            df["nuggets"] = df["update id"].apply(lambda x: set(matches[
                matches["update id"] == x]["nugget id"].tolist()))

            # Perform soft nugget matching on unjudged sentences.
            if soft_match is True:
                ### NOTA BENE: getting an array of indices to index unjudged
                # sentences so I can force pandas to return a view and not a
                # copy.
                I = np.where(
                    df["update id"].apply(lambda x: x not in judged_uids))[0]

                unjudged = df[df["update id"].apply(
                    lambda x: x not in judged_uids)]
                unjudged_sents = unjudged["sent text"].tolist()
                assert len(unjudged_sents) == I.shape[0]

                df.loc[I, "nuggets"] = classify_nuggets(unjudged_sents)

            # Remove nuggets from dataframe if we have already collected them in
            # the cache. The scoring function should ignore these.
            df = df[df["nuggets"].apply(len) > 0]
            all_seen_nuggets.update(
                set(n for ns in df["nuggets"].tolist() for n in ns))
            df["nuggets"] = df["nuggets"].apply(
                lambda x: x.difference(nugget_cache))
            if len(df) == 0:
                continue

            # Run summarizer on current document and update stats about it.
            summary = system.summarize(df)
            summary_nuggets = set(n for ns in summary._df["nuggets"].tolist()
                                  for n in ns)
            nugget_cache.update(summary_nuggets)
            system.k -= len(summary._df)

            F_of_S = len(summary_nuggets)
            cum_F_of_S += F_of_S
            total_updates += len(summary._df)
            timestamp = int(si.stream_id.split("-")[0])

            stats.append({
                "Cum. F(S)": cum_F_of_S,
                "F(S)": F_of_S,
                "UB no const.": len(all_seen_nuggets),
                "budget": budget,
                "Tot. Updates": total_updates,
                "event title": event.fs_name(),
                "timestamp": timestamp,
                "query id": event.query_id,
            })
            summary_df.append(summary._df)
            if system.k <= 0:
                print "Budget exceeded!"
                break

        output_df = pd.DataFrame(stats,
                                 columns=[
                                     "timestamp",
                                     "query id",
                                     "event title",
                                     "Cum. F(S)",
                                     "F(S)",
                                     "UB no const.",
                                     "Tot. Updates",
                                     "budget",
                                 ])

        # Write stats and updates to file.
        parent = os.path.dirname(output_path_prefix)
        if not os.path.exists(parent):
            os.makedirs(parent)

        stats_path = output_path_prefix + ".stats.tsv"
        updates_path = output_path_prefix + ".updates.tsv"

        with open(stats_path, "w") as f:
            output_df.to_csv(f, sep="\t", index=False)

        summary_df = pd.concat(summary_df)
        summary_df["sent text"] = summary_df["sent text"].apply(
            lambda x: x.encode("utf-8"))
        with open(updates_path, "w") as f:
            summary_df[["timestamp", "update id",
                        "sent text"]].sort(["update id"]).to_csv(f,
                                                                 sep="\t",
                                                                 index=False)
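
Relative to the previous example, summarization here happens per document: nuggets already covered are subtracted via nugget_cache before each call, and the remaining budget shrinks by the number of updates selected. A sketch of that bookkeeping, reusing the greedy_coverage sketch above (illustrative only):

def summarize_stream(doc_stream, budget):
    # doc_stream: iterable of documents, each a list of (update id, nugget set).
    nugget_cache = set()
    selected = []
    for doc in doc_stream:
        # Drop nuggets that are already covered, then drop empty sentences.
        doc = [(uid, nuggets - nugget_cache) for uid, nuggets in doc]
        doc = [(uid, nuggets) for uid, nuggets in doc if nuggets]
        if not doc:
            continue
        picks, covered = greedy_coverage(doc, budget)
        nugget_cache |= covered
        selected.extend(picks)
        budget -= len(picks)
        if budget <= 0:
            break
    return selected, nugget_cache


docs = [[("u1", {"n1", "n2"})],
        [("u2", {"n2"}), ("u3", {"n3"})]]
print summarize_stream(docs, budget=3)   # picks u1 then u3, covering n1-n3
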