Example #1
    def dataframe_iter(self, event, corpus, extractor, include_matches=None, threshold=.8):

        if include_matches is not None:

            all_matches = cuttsum.judgements.get_merged_dataframe()
            matches = all_matches[all_matches["query id"] == event.query_id]
        
        if include_matches == "soft":
            from cuttsum.classifiers import NuggetClassifier
            classify_nuggets = NuggetClassifier().get_classifier(event)
            if event.query_id.startswith("TS13"):
                judged = cuttsum.judgements.get_2013_updates() 
                judged = judged[judged["query id"] == event.query_id]
                judged_uids = set(judged["update id"].tolist())
            elif event.query_id.startswith("TS14"):
                judged = cuttsum.judgements.get_2014_sampled_updates()
                judged = judged[judged["query id"] == event.query_id]
                judged_uids = set(judged["update id"].tolist())
            else:
                
                raise Exception("Bad corpus!")

        for si in self.streamitem_iter(event, corpus, extractor, threshold):
            
            df = si2df(si, extractor=extractor)
                
            if include_matches is not None:
                df["nuggets"] = df["update id"].apply(
                    lambda x: set(
                        matches[
                            matches["update id"] == x]["nugget id"].tolist()))
                df["n conf"] = df["update id"].apply(lambda x: 1 if x in judged_uids else None)
                

            if include_matches == "soft":
                ### NOTA BENE: getting an array of indices to index unjudged
                # sentences so I can force pandas to return a view and not a
                # copy.
                I = np.where(
                    df["update id"].apply(lambda x: x not in judged_uids))[0]
                
                unjudged = df[
                    df["update id"].apply(lambda x: x not in judged_uids)]
                unjudged_sents = unjudged["sent text"].tolist()
                assert len(unjudged_sents) == I.shape[0]
                if I.shape[0] > 0:
                    nuggets, conf, nugget_probs = classify_nuggets(unjudged_sents)
                    df.loc[I, "nuggets"] = nuggets
                    df.loc[I, "n conf"] = conf
                    


            yield df
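
dataframe_iter yields one DataFrame per stream item, with a "nuggets" column holding the set of matched nugget ids for each sentence. Below is a self-contained sketch of the gold-match join it performs, using toy frames in place of the cuttsum judgements (column names follow the code above; the id values are made up):

    import pandas as pd

    # Toy stand-ins for the merged judgements frame and one article frame.
    matches = pd.DataFrame({
        "update id": ["doc1-0", "doc1-0", "doc1-2"],
        "nugget id": ["n001", "n002", "n003"],
    })
    df = pd.DataFrame({
        "update id": ["doc1-0", "doc1-1", "doc1-2"],
        "sent text": ["first sentence", "second sentence", "third sentence"],
    })

    # Same apply/lambda as in dataframe_iter: each sentence gets the set of
    # nugget ids judged to match its update id (empty set if unjudged).
    df["nuggets"] = df["update id"].apply(
        lambda x: set(matches[matches["update id"] == x]["nugget id"].tolist()))
    print(df["nuggets"].tolist())   # [{'n001', 'n002'}, set(), {'n003'}]
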
Example #2
    def dataframe_iter(self, event, corpus, extractor, include_matches=None):

        if include_matches is not None:

            all_matches = cuttsum.judgements.get_merged_dataframe()
            matches = all_matches[all_matches["query id"] == event.query_id]
        
        if include_matches == "soft":
            from cuttsum.classifiers import NuggetClassifier
            classify_nuggets = NuggetClassifier().get_classifier(event)
            if event.query_id.startswith("TS13"):
                judged = cuttsum.judgements.get_2013_updates() 
                judged = judged[judged["query id"] == event.query_id]
                judged_uids = set(judged["update id"].tolist())
            else:
                raise Exception("Bad corpus!")

        for hour, path, si in self.streamitem_iter(event, corpus, extractor):
            df = si2df(si, extractor=extractor)
                
            if include_matches is not None:
                df["nuggets"] = df["update id"].apply(
                    lambda x: set(
                        matches[
                            matches["update id"] == x]["nugget id"].tolist()))

            if include_matches == "soft":
                ### NOTA BENE: getting an array of indices to index unjudged
                # sentences so I can force pandas to return a view and not a
                # copy.
                I = np.where(
                    df["update id"].apply(lambda x: x not in judged_uids))[0]
                
                unjudged = df[
                    df["update id"].apply(lambda x: x not in judged_uids)]
                unjudged_sents = unjudged["sent text"].tolist()
                assert len(unjudged_sents) == I.shape[0]

                df.loc[I, "nuggets"] = classify_nuggets(unjudged_sents)


            yield df
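
Both variants rely on the positional-indexing trick flagged in the NOTA BENE comment: np.where turns the unjudged-row mask into integer positions, and df.loc[I, ...] then writes into the original frame rather than into a boolean-indexed copy. A minimal sketch of that pattern on toy data (it assumes the default integer index that the code above implicitly relies on):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "update id": ["u0", "u1", "u2", "u3"],
        "n conf": [1.0, None, 1.0, None],
    })
    judged_uids = {"u0", "u2"}

    # Integer positions of the unjudged rows.
    I = np.where(df["update id"].apply(lambda x: x not in judged_uids))[0]

    # Writing through .loc with those positions updates df in place; with a
    # default RangeIndex the positions double as labels.
    df.loc[I, "n conf"] = [0.7, 0.4]    # e.g. classifier confidences
    print(df["n conf"].tolist())        # [1.0, 0.7, 1.0, 0.4]
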
Example #3
    def do_job_unit(self, event, corpus, unit, **kwargs):
        if unit != 0:
            raise Exception("Job unit {} out of range".format(unit))
        
        res = ArticlesResource()
        thresh = kwargs.get("dedupe-sim-threshold", .8)
        extractor = kwargs.get("extractor", "goose")
        hasher = FeatureHasher(input_type="pair", non_negative=True)
        si_iter = res.streamitem_iter(
            event, corpus, extractor) 

        def to_df(all_ids, all_times, all_matches):
            d = []
            for ids, times, match in izip(all_ids, all_times, all_matches):

                times.sort()
                d.append({
                    "stream ids": ids, "hits": len(ids), "match": match,
                    "earliest": times[0], "latest": times[-1], 
                    "second": times[1] if len(times) >= 2 else None,
                    "third": times[2] if len(times) >= 3 else None,
                })
            return pd.DataFrame(d, columns=["stream ids", "match", "hits", 
                                            "earliest", "latest", 
                                            "second", "third"])    

        def query_in_top20(event, df):
            text = u"\n".join(df["sent text"].tolist()[:20]) 
            for query in event.query:
                if not re.search(query, text, flags=re.I|re.UNICODE):
                    return False
            return True

        def make_time(df):
            return df["timestamp"].tolist()[0]

        def make_counts(df, slimit=20):
            counts = defaultdict(int)
            for words in df["words"].tolist()[:slimit]:
                for word in words:
                    counts[word.lower()] += 1   
            return counts

        def next_chunk_file(chunk_file_num):
            deduped_path_fmt = self.get_deduped_path_fmt(
                event, corpus, extractor, threshold=thresh)
            deduped_path = deduped_path_fmt.format(
                chunk_file_num)
            deduped_dir = os.path.dirname(deduped_path)
            if not os.path.exists(deduped_dir):
                os.makedirs(deduped_dir)
            
            if os.path.exists(deduped_path):
                os.remove(deduped_path)

            return sc.Chunk(path=deduped_path, mode="wb", 
                message=corpus.sc_msg())



        X = None

        chunk_file_num = 1
        chunk = next_chunk_file(chunk_file_num)

        for hour, path, si in si_iter:
            df = si2df(si, extractor=extractor)
            counts = make_counts(df)
            x = hasher.transform([counts.items()])
            x.shape = (1, hasher.n_features)
            
            if X is None:
                X = x
                times = [[make_time(df)]]
                ids = [[si.stream_id]]
                matches = [query_in_top20(event, df)]

                chunk.add(si)
                        
            else:
                K = cosine_similarity(X, x)
                k_argmax = K.argmax()
                
                if K[k_argmax] < thresh:
                    
                    X = vstack([X, x])
                    times.append([make_time(df)])
                    ids.append([si.stream_id])
                    matches.append(query_in_top20(event, df))

                    if X.shape[0] % 1000 == 0:
                        chunk.close()
                        chunk_file_num += 1
                        chunk = next_chunk_file(chunk_file_num)

                    chunk.add(si)
                    
                else:
                    times[k_argmax].append(make_time(df))
                    ids[k_argmax].append(si.stream_id)
               
        chunk.close() 
     
        df = to_df(ids, times, matches)            
        print df

        stats_path = self.get_stats_path(
            event, corpus, extractor, thresh)
        with open(stats_path, "w") as f:
            df.to_csv(f, index=False, sep="\t")
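
The deduplication in this job hashes each article's token counts into a fixed-width vector and keeps the article only if its cosine similarity to everything kept so far stays below the threshold. A standalone sketch of that check on toy strings (same FeatureHasher and cosine_similarity calls as above; the original also passes non_negative=True, which only exists in older scikit-learn releases):

    from collections import Counter

    from scipy.sparse import vstack
    from sklearn.feature_extraction import FeatureHasher
    from sklearn.metrics.pairwise import cosine_similarity

    hasher = FeatureHasher(input_type="pair")
    thresh = .8
    docs = [
        "the storm hit the coast",
        "the storm hit the coast again",   # near-duplicate of the first
        "completely different report",
    ]

    X = None
    for doc in docs:
        counts = Counter(doc.lower().split())
        x = hasher.transform([counts.items()])
        if X is None or cosine_similarity(X, x).max() < thresh:
            # Keep as a new, non-duplicate document.
            X = x if X is None else vstack([X, x])
    print(X.shape[0])   # 2: the near-duplicate is folded into the first doc
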
Example #4
    def do_job_unit(self, event, corpus, unit, **kwargs):
        if unit != 0:
            raise Exception("unit of work out of bounds!")
        extractor = kwargs.get("extractor", "gold")
        soft_match = kwargs.get("soft_match", False)
        budget = kwargs.get("budget", 25)

        output_path_prefix = self.get_path_prefix(event, corpus, extractor,
                                                  budget, soft_match)

        ## Set up summarizer ###

        # This is the monotone submodular objective function (basically
        # nugget coverage).
        def f_of_A(system, A, V_min_A, e, input_df, ndarray_data):
            return len(
                set([
                    nugget for nuggets in input_df.ix[A, "nuggets"].tolist()
                    for nugget in nuggets
                ]))

        system = sumpy.system.MonotoneSubmodularBasic(f_of_A=f_of_A, k=budget)

        # Get gold matchings for oracle.
        articles = ArticlesResource()
        all_matches = cuttsum.judgements.get_merged_dataframe()
        matches = all_matches[all_matches["query id"] == event.query_id]

        # Set up soft matching if we are using it.
        if soft_match is True:
            from cuttsum.classifiers import NuggetClassifier
            classify_nuggets = NuggetClassifier().get_classifier(event)
            if event.query_id.startswith("TS13"):
                judged = cuttsum.judgements.get_2013_updates()
                judged = judged[judged["query id"] == event.query_id]
                judged_uids = set(judged["update id"].tolist())
            else:
                raise Exception("Bad corpus!")

        # All sentences containing nuggets will go in all_df.
        all_df = []
        # Pull out articles with nuggets.
        for hour, path, si in articles.streamitem_iter(event, corpus,
                                                       extractor):

            # Convert stream item to dataframe and add gold label nuggets.
            df = si2df(si, extractor=extractor)
            df["nuggets"] = df["update id"].apply(lambda x: set(matches[
                matches["update id"] == x]["nugget id"].tolist()))

            # Perform soft nugget matching on unjudged sentences.
            if soft_match is True:
                ### NOTA BENE: getting an array of indices to index unjudged
                # sentences so I can force pandas to return a view and not a
                # copy.
                I = np.where(
                    df["update id"].apply(lambda x: x not in judged_uids))[0]

                unjudged = df[df["update id"].apply(
                    lambda x: x not in judged_uids)]
                unjudged_sents = unjudged["sent text"].tolist()
                assert len(unjudged_sents) == I.shape[0]

                df.loc[I, "nuggets"] = classify_nuggets(unjudged_sents)

            # Add sentences with nuggets to the final set for summarizing.
            df = df[df["nuggets"].apply(len) > 0]
            all_df.append(df)

        # Collect all dataframes into one and reset index (ALWAYS RESET
        # THE INDEX because pandas hates me.)
        all_df = pd.concat(all_df)
        all_df.reset_index(inplace=True)

        summary = system.summarize(all_df)
        F_of_S = len(
            set(n for ns in summary._df["nuggets"].tolist() for n in ns))

        #print "F(S)", F_of_S
        #print "summary nuggets"
        sum_nuggets = list(
            set(n for ns in summary._df["nuggets"].tolist() for n in ns))
        sum_nuggets.sort()
        print sum_nuggets

        possible_nuggets = list(
            set(n for ns in all_df["nuggets"].tolist() for n in ns))
        possible_nuggets.sort()
        print possible_nuggets
        print len(possible_nuggets)

        event_nuggets = set(matches["nugget id"].tolist())
        total_nuggets = len(event_nuggets)
        timestamp = int(si.stream_id.split("-")[0])

        output_df = pd.DataFrame(
            [
                {
                    "Cum. F(S)": F_of_S,
                    "F(S)": F_of_S,
                    "UB no const.":
                    len(possible_nuggets),  #      total_nuggets, 
                    "budget": budget,
                    "Tot. Updates": len(summary._df),
                    "event title": event.fs_name(),
                    "timestamp": timestamp,
                    "query id": event.query_id
                },
            ],
            columns=[
                "timestamp",
                "query id",
                "event title",
                "Cum. F(S)",
                "F(S)",
                "UB no const.",
                "Tot. Updates",
                "budget",
            ])

        parent = os.path.dirname(output_path_prefix)
        if not os.path.exists(parent):
            os.makedirs(parent)

        stats_path = output_path_prefix + ".stats.tsv"
        updates_path = output_path_prefix + ".updates.tsv"

        with open(stats_path, "w") as f:
            output_df.to_csv(f, sep="\t", index=False)
        summary._df["sent text"] = summary._df["sent text"].apply(
            lambda x: x.encode("utf-8"))
        with open(updates_path, "w") as f:
            summary._df[["timestamp", "update id",
                         "sent text"]].sort(["update id"]).to_csv(f,
                                                                  sep="\t",
                                                                  index=False)
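
f_of_A above scores a candidate set A by how many distinct nuggets its sentences cover, and sumpy's MonotoneSubmodularBasic maximizes it under the sentence budget k. A toy sketch of the standard greedy selection for such a coverage objective (an illustration only, not the sumpy implementation):

    # Toy (sentence id, covered nugget ids) pairs and a budget of 2 updates.
    sentences = [
        ("s1", {"n1", "n2"}),
        ("s2", {"n2"}),
        ("s3", {"n3", "n4"}),
        ("s4", {"n1", "n4"}),
    ]
    budget = 2

    covered = set()
    summary = []
    while len(summary) < budget:
        # Greedy step: take the sentence that adds the most uncovered nuggets.
        best, gain = max(
            ((s, ns - covered) for s, ns in sentences if s not in summary),
            key=lambda pair: len(pair[1]))
        if not gain:
            break
        summary.append(best)
        covered |= gain
    print(summary)   # ['s1', 's3']: together they cover all four toy nuggets
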
Example #5
    def do_job_unit(self, event, corpus, unit, **kwargs):

        if unit != 0:
            raise Exception("unit of work out of bounds!")
        extractor = kwargs.get("extractor", "gold")
        soft_match = kwargs.get("soft_match", False)
        budget = kwargs.get("budget", 25)

        output_path_prefix = self.get_path_prefix(event, corpus, extractor,
                                                  budget, soft_match)

        ## Set up summarizer ###
        def f_of_A(system, A, V_min_A, e, input_df, ndarray_data):
            return len(
                set([
                    nugget for nuggets in input_df.ix[A, "nuggets"].tolist()
                    for nugget in nuggets
                ]))

        system = sumpy.system.MonotoneSubmodularBasic(f_of_A=f_of_A, k=budget)

        # Collect all previously collected nuggets here.
        nugget_cache = set()

        # Get gold matchings for oracle.
        articles = ArticlesResource()
        all_matches = cuttsum.judgements.get_merged_dataframe()
        matches = all_matches[all_matches["query id"] == event.query_id]

        # Set up soft matching if we are using it.
        if soft_match is True:
            from cuttsum.classifiers import NuggetClassifier
            classify_nuggets = NuggetClassifier().get_classifier(event)
            if event.query_id.startswith("TS13"):
                judged = cuttsum.judgements.get_2013_updates()
                judged = judged[judged["query id"] == event.query_id]
                judged_uids = set(judged["update id"].tolist())
            else:
                raise Exception("Bad corpus!")

        # Collect stats for each document here.
        stats = []

        # Aggregate summaries in summary_df.
        summary_df = []
        cum_F_of_S = 0

        all_seen_nuggets = set()

        #        event_nuggets = set(matches["nugget id"].tolist())
        #        total_nuggets = len(event_nuggets)
        total_updates = 0

        # Pull out articles with nuggets.
        for hour, path, si in articles.streamitem_iter(event, corpus,
                                                       extractor):
            print hour, si.stream_id
            # Convert stream item to dataframe and add gold label nuggets.
            df = si2df(si, extractor=extractor)
            df["nuggets"] = df["update id"].apply(lambda x: set(matches[
                matches["update id"] == x]["nugget id"].tolist()))

            # Perform soft nugget matching on unjudged sentences.
            if soft_match is True:
                ### NOTA BENE: getting an array of indices to index unjudged
                # sentences so I can force pandas to return a view and not a
                # copy.
                I = np.where(
                    df["update id"].apply(lambda x: x not in judged_uids))[0]

                unjudged = df[df["update id"].apply(
                    lambda x: x not in judged_uids)]
                unjudged_sents = unjudged["sent text"].tolist()
                assert len(unjudged_sents) == I.shape[0]

                df.loc[I, "nuggets"] = classify_nuggets(unjudged_sents)

            # Remove nuggets from dataframe if we have already collected them in
            # the cache. The scoring function should ignore these.
            df = df[df["nuggets"].apply(len) > 0]
            all_seen_nuggets.update(
                set(n for ns in df["nuggets"].tolist() for n in ns))
            df["nuggets"] = df["nuggets"].apply(
                lambda x: x.difference(nugget_cache))
            if len(df) == 0:
                continue

            # Run summarizer on current document and update stats about it.
            summary = system.summarize(df)
            summary_nuggets = set(n for ns in summary._df["nuggets"].tolist()
                                  for n in ns)
            nugget_cache.update(summary_nuggets)
            system.k -= len(summary._df)

            F_of_S = len(summary_nuggets)
            cum_F_of_S += F_of_S
            total_updates += len(summary._df)
            timestamp = int(si.stream_id.split("-")[0])

            stats.append({
                "Cum. F(S)": cum_F_of_S,
                "F(S)": F_of_S,
                "UB no const.": len(all_seen_nuggets),
                "budget": budget,
                "Tot. Updates": total_updates,
                "event title": event.fs_name(),
                "timestamp": timestamp,
                "query id": event.query_id,
            })
            summary_df.append(summary._df)
            if system.k <= 0:
                print "Budget exceeded!"
                break

        output_df = pd.DataFrame(stats,
                                 columns=[
                                     "timestamp",
                                     "query id",
                                     "event title",
                                     "Cum. F(S)",
                                     "F(S)",
                                     "UB no const.",
                                     "Tot. Updates",
                                     "budget",
                                 ])

        # Write stats and updates to file.
        parent = os.path.dirname(output_path_prefix)
        if not os.path.exists(parent):
            os.makedirs(parent)

        stats_path = output_path_prefix + ".stats.tsv"
        updates_path = output_path_prefix + ".updates.tsv"

        with open(stats_path, "w") as f:
            output_df.to_csv(f, sep="\t", index=False)

        summary_df = pd.concat(summary_df)
        summary_df["sent text"] = summary_df["sent text"].apply(
            lambda x: x.encode("utf-8"))
        with open(updates_path, "w") as f:
            summary_df[["timestamp", "update id",
                        "sent text"]].sort(["update id"]).to_csv(f,
                                                                 sep="\t",
                                                                 index=False)
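
Unlike the one-shot oracle, this variant summarizes each document in turn: nuggets already covered go into nugget_cache and are subtracted from later documents, and the shared budget k shrinks with every selected update. A toy sketch of that sequential bookkeeping:

    # Each document maps update ids to the nugget ids its sentences match.
    documents = [
        {"u1": {"n1", "n2"}, "u2": {"n2"}},
        {"u3": {"n2", "n3"}, "u4": {"n4"}},
    ]
    k = 3                      # overall update budget
    nugget_cache = set()       # nuggets covered by earlier summaries
    selected = []

    for doc in documents:
        # Drop already-covered nuggets, then greedily pick updates that still
        # add something, spending the shared budget as we go.
        remaining = {u: ns - nugget_cache
                     for u, ns in doc.items() if ns - nugget_cache}
        while k > 0 and remaining:
            best = max(remaining, key=lambda u: len(remaining[u] - nugget_cache))
            gain = remaining.pop(best) - nugget_cache
            if not gain:
                break
            selected.append(best)
            nugget_cache |= gain
            k -= 1
    print(selected)   # e.g. ['u1', 'u3', 'u4'], exhausting the budget of 3
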
Example #6
File: _oracle.py  Project: kedz/cuttsum
    def do_job_unit(self, event, corpus, unit, **kwargs):
        if unit != 0:
            raise Exception("unit of work out of bounds!") 
        extractor = kwargs.get("extractor", "gold")
        soft_match = kwargs.get("soft_match", False)
        budget = kwargs.get("budget", 25)
        
        output_path_prefix = self.get_path_prefix(
            event, corpus, extractor, budget, soft_match)

        ## Set up summarizer ###

        # This is the monotone submodular objective function (basically
        # nugget coverage).
        def f_of_A(system, A, V_min_A, e, input_df, ndarray_data):
            return len(
                set([nugget for nuggets in input_df.ix[A, "nuggets"].tolist()
                     for nugget in nuggets]))

        system = sumpy.system.MonotoneSubmodularBasic(f_of_A=f_of_A, k=budget)

        # Get gold matchings for oracle.
        articles = ArticlesResource()
        all_matches = cuttsum.judgements.get_merged_dataframe()
        matches = all_matches[all_matches["query id"] == event.query_id]
        
        # Set up soft matching if we are using it.
        if soft_match is True:
            from cuttsum.classifiers import NuggetClassifier
            classify_nuggets = NuggetClassifier().get_classifier(event)
            if event.query_id.startswith("TS13"):
                judged = cuttsum.judgements.get_2013_updates() 
                judged = judged[judged["query id"] == event.query_id]
                judged_uids = set(judged["update id"].tolist())
            else:
                raise Exception("Bad corpus!")
  
        # All sentences containing nuggets will go in all_df.
        all_df = [] 
        # Pull out articles with nuggets.
        for hour, path, si in articles.streamitem_iter(
                event, corpus, extractor):

            # Convert stream item to dataframe and add gold label nuggets.
            df = si2df(si, extractor=extractor)
            df["nuggets"] = df["update id"].apply(
                lambda x: set(
                    matches[matches["update id"] == x]["nugget id"].tolist()))

            # Perform soft nugget matching on unjudged sentences.
            if soft_match is True:
                ### NOTA BENE: getting an array of indices to index unjudged
                # sentences so I can force pandas to return a view and not a
                # copy.
                I = np.where(
                    df["update id"].apply(lambda x: x not in judged_uids))[0]
                
                unjudged = df[
                    df["update id"].apply(lambda x: x not in judged_uids)]
                unjudged_sents = unjudged["sent text"].tolist()
                assert len(unjudged_sents) == I.shape[0]

                df.loc[I, "nuggets"] = classify_nuggets(unjudged_sents)
            
            # Add sentences with nuggets to the final set for summarizing.
            df = df[df["nuggets"].apply(len) > 0]
            all_df.append(df)

        # Collect all dataframes into one and reset index (ALWAYS RESET 
        # THE INDEX because pandas hates me.)
        all_df = pd.concat(all_df)
        all_df.reset_index(inplace=True)

        summary =  system.summarize(all_df)
        F_of_S = len(
            set(n for ns in summary._df["nuggets"].tolist() for n in ns))

        #print "F(S)", F_of_S
        #print "summary nuggets" 
        sum_nuggets = list(set(
            n for ns in summary._df["nuggets"].tolist() for n in ns))
        sum_nuggets.sort()
        print sum_nuggets
 
        possible_nuggets = list(set(
            n for ns in all_df["nuggets"].tolist() for n in ns))
        possible_nuggets.sort()
        print possible_nuggets
        print len(possible_nuggets) 

        event_nuggets = set(matches["nugget id"].tolist())
        total_nuggets = len(event_nuggets)
        timestamp = int(si.stream_id.split("-")[0])

        output_df = pd.DataFrame(
            [{"Cum. F(S)": F_of_S, 
              "F(S)": F_of_S, 
              "UB no const.": len(possible_nuggets), #      total_nuggets, 
              "budget": budget, 
              "Tot. Updates": len(summary._df), 
              "event title": event.fs_name(), 
              "timestamp": timestamp, 
              "query id": event.query_id},],
            columns=["timestamp", "query id", "event title", "Cum. F(S)", 
                     "F(S)", "UB no const.",
                     "Tot. Updates", "budget",])   
        
        parent = os.path.dirname(output_path_prefix)
        if not os.path.exists(parent):
            os.makedirs(parent)

        stats_path = output_path_prefix + ".stats.tsv"
        updates_path = output_path_prefix + ".updates.tsv"
        
        with open(stats_path, "w") as f:
            output_df.to_csv(f, sep="\t", index=False)
        summary._df["sent text"] = summary._df["sent text"].apply(
            lambda x: x.encode("utf-8"))
        with open(updates_path, "w") as f:
            summary._df[["timestamp", "update id", "sent text"]].sort(
                ["update id"]).to_csv(f, sep="\t", index=False)
Example #7
File: _oracle.py  Project: kedz/cuttsum
    def do_job_unit(self, event, corpus, unit, **kwargs):

        if unit != 0:
            raise Exception("unit of work out of bounds!") 
        extractor = kwargs.get("extractor", "gold")
        soft_match = kwargs.get("soft_match", False)
        budget = kwargs.get("budget", 25)
        
        output_path_prefix = self.get_path_prefix(
            event, corpus, extractor, budget, soft_match)
       
        ## Set up summarizer ###
        def f_of_A(system, A, V_min_A, e, input_df, ndarray_data):
            return len(
                set([nugget for nuggets in input_df.ix[A, "nuggets"].tolist() 
                     for nugget in nuggets]))
        system = sumpy.system.MonotoneSubmodularBasic(f_of_A=f_of_A, k=budget)

        # Collect all previously collected nuggets here.
        nugget_cache = set()

        # Get gold matchings for oracle.
        articles = ArticlesResource()            
        all_matches = cuttsum.judgements.get_merged_dataframe()
        matches = all_matches[all_matches["query id"] == event.query_id]
        
        # Set up soft matching if we are using it.
        if soft_match is True:
            from cuttsum.classifiers import NuggetClassifier
            classify_nuggets = NuggetClassifier().get_classifier(event)
            if event.query_id.startswith("TS13"):
                judged = cuttsum.judgements.get_2013_updates() 
                judged = judged[judged["query id"] == event.query_id]
                judged_uids = set(judged["update id"].tolist())
            else:
                raise Exception("Bad corpus!")
 

        # Collect stats for each document here.
        stats = []
        
        # Aggregate summaries in summary_df.
        summary_df = []
        cum_F_of_S = 0

        all_seen_nuggets = set()

#        event_nuggets = set(matches["nugget id"].tolist())
#        total_nuggets = len(event_nuggets)
        total_updates = 0

        # Pull out articles with nuggets.
        for hour, path, si in articles.streamitem_iter(
                event, corpus, extractor):
            print hour, si.stream_id
            # Convert stream item to dataframe and add gold label nuggets.
            df = si2df(si, extractor=extractor)
            df["nuggets"] = df["update id"].apply(
                lambda x: set(
                    matches[matches["update id"] == x]["nugget id"].tolist()))

            # Perform soft nugget matching on unjudged sentences.
            if soft_match is True:
                ### NOTA BENE: getting an array of indices to index unjudged
                # sentences so I can force pandas to return a view and not a
                # copy.
                I = np.where(
                    df["update id"].apply(lambda x: x not in judged_uids))[0]
                
                unjudged = df[
                    df["update id"].apply(lambda x: x not in judged_uids)]
                unjudged_sents = unjudged["sent text"].tolist()
                assert len(unjudged_sents) == I.shape[0]

                df.loc[I, "nuggets"] = classify_nuggets(unjudged_sents)
            
            # Remove nuggets from dataframe if we have already collected them in
            # the cache. The scoring function should ignore these.
            df = df[df["nuggets"].apply(len) > 0]
            all_seen_nuggets.update(
                set(n for ns in df["nuggets"].tolist() for n in ns))
            df["nuggets"] = df["nuggets"].apply(
                lambda x: x.difference(nugget_cache))
            if len(df) == 0:
                continue

            # Run summarizer on current document and update stats about it.
            summary = system.summarize(df)
            summary_nuggets = set(n for ns in summary._df["nuggets"].tolist() 
                                  for n in ns)
            nugget_cache.update(summary_nuggets)
            system.k -= len(summary._df)

            F_of_S = len(summary_nuggets)
            cum_F_of_S += F_of_S
            total_updates += len(summary._df)
            timestamp = int(si.stream_id.split("-")[0])

            stats.append({
                "Cum. F(S)": cum_F_of_S, 
                "F(S)": F_of_S, 
                "UB no const.": len(all_seen_nuggets), 
                "budget": budget, 
                "Tot. Updates": total_updates, 
                "event title": event.fs_name(), 
                "timestamp": timestamp, 
                "query id": event.query_id,
            })
            summary_df.append(summary._df)
            if system.k <= 0:
                print "Budget exceeded!"
                break


        output_df = pd.DataFrame(stats,
            columns=["timestamp", "query id", "event title", 
                     "Cum. F(S)", "F(S)", "UB no const.",
                     "Tot. Updates", "budget",])   
       
        # Write stats and updates to file. 
        parent = os.path.dirname(output_path_prefix)
        if not os.path.exists(parent):
            os.makedirs(parent)

        stats_path = output_path_prefix + ".stats.tsv"
        updates_path = output_path_prefix + ".updates.tsv"

        with open(stats_path, "w") as f:
            output_df.to_csv(f, sep="\t", index=False)

        summary_df = pd.concat(summary_df)
        summary_df["sent text"] = summary_df["sent text"].apply(
            lambda x: x.encode("utf-8"))
        with open(updates_path, "w") as f:
            summary_df[["timestamp", "update id", "sent text"]].sort(
                ["update id"]).to_csv(f, sep="\t", index=False)
Example #8
    def do_job_unit(self, event, corpus, unit, **kwargs):
        
        extractor = kwargs.get("extractor", "gold")
        data_dir = os.path.join(self.dir_, extractor, event.fs_name())
        chunks_resource = SCChunkResource()

        if not os.path.exists(data_dir):
            try:
                os.makedirs(data_dir)
            except OSError as exc:
                if exc.errno == errno.EEXIST and os.path.isdir(data_dir):
                    pass
                else:
                    raise


        
        if extractor == "gold":
            import cuttsum.judgements
            all_matches = cuttsum.judgements.get_matches()
            matches = all_matches[all_matches["query id"] == event.query_id]
            stream_ids = set(
                matches["update id"].apply(
                    lambda x: "-".join(x.split("-")[:-1])).tolist())

            hours = set([datetime.utcfromtimestamp(
                            int(update_id.split("-")[0])).replace(
                             minute=0, second=0)
                         for update_id in matches["update id"].tolist()])
            hours = sorted(list(hours))
            hour = hours[unit]
            output_path = self.get_chunk_path(event, extractor, hour, corpus)
            gold_si = []
            for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
                with sc.Chunk(path=path, mode="rb", 
                        message=corpus.sc_msg()) as chunk:
                    for si in chunk:
                        if si.stream_id in stream_ids:
                            gold_si.append(si)

            gold_si.sort(key=lambda x: x.stream_id)
            for si in gold_si:
                print si.stream_id

            if os.path.exists(output_path):
                os.remove(output_path)
            with sc.Chunk(path=output_path, mode="wb", 
                    message=corpus.sc_msg()) as chunk:
                for si in gold_si:
                    chunk.add(si)

        elif extractor == "goose":

            import nltk
            from nltk.tokenize import WordPunctTokenizer
            sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')
            word_tok = WordPunctTokenizer()           
 
            from goose import Goose, Configuration
            config = Configuration()
            config.enable_image_fetching = False
            g = Goose(config)
            
            hour = event.list_event_hours()[unit]
            output_path_tmp = self.get_chunk_template(event, extractor, hour, corpus)
            good_si = []

            for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
                try:
                    with sc.Chunk(path=path, mode="rb", 
                            message=corpus.sc_msg()) as chunk:
                        
                        for si in chunk:

                            if si.body.clean_visible is None:
                                continue

                            article_text = self._get_goose_text(g, si)
                            if article_text is None:
                                continue

                            if not self._contains_query(event, article_text):
                                continue
                    
                            art_pretty = sent_tok.tokenize(article_text)
                            art_sents = [word_tok.tokenize(sent) 
                                         for sent in art_pretty]

                            df = si2df(si)
                            I = self._map_goose2streamitem(
                                art_sents, df["words"].tolist())
                                
                            if "serif" in si.body.sentences:
                                si_sentences = si.body.sentences["serif"]
                            elif "lingpipe" in si.body.sentences:
                                si_sentences = si.body.sentences["lingpipe"]
                            else:
                                raise Exception("Bad sentence annotator.")
                            
                            ann = sc.Annotator()
                            ann.annotator_id = "goose"
                            si.body.sentences["goose"] = [sc.Sentence() 
                                                          for _ in si_sentences]
                            for i_goose, i_si in enumerate(I):
                                #print art_pretty[i_goose]
                                #print df.loc[i_si, "sent text"]
                                #print
                                tokens = [sc.Token(token=token.encode("utf-8")) 
                                          for token in art_sents[i_goose]]
                                si.body.sentences["goose"][i_si].tokens.extend(
                                    tokens)
                            good_si.append(si)
                except TypeError:
                    continue
            #if len(good_si) == 0:
            #    print "Nothing in hour:", hour
            #    return 
            output_path = output_path_tmp.format(len(good_si))
            odir = os.path.dirname(output_path)
            if not os.path.exists(odir):
                os.makedirs(odir)
            if os.path.exists(output_path):
                os.remove(output_path)

            good_si.sort(key=lambda x: x.stream_id)
            for si in good_si:
                print si.stream_id

            if os.path.exists(output_path):
                os.remove(output_path)                

            print "Writing to", output_path
            with sc.Chunk(path=output_path, mode="wb", 
                    message=corpus.sc_msg()) as chunk:
                for si in good_si:
                    chunk.add(si)
        else:
            raise Exception("extractor: {} not implemented!".format(extractor))