Example #1
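        # Excerpt from a per-event reporting loop: articles_res, chunk_res,
        # hours, hour2ded, hour2ded_fltr, and data are defined by the
        # enclosing function.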
        path = articles_res.get_chunk_path(event, "goose", hour, corpus)
        if path is None:
            continue
        # print path
        fname = os.path.split(path)[1]
        num_goose = int(fname.split("-")[0])
        hour2goose[hour] = num_goose
    #    goose_df = articles_res.get_stats_df(event, "goose")
    #    if goose_df is not None:
    #        for _, row in goose_df.iterrows():
    #            dt = datetime.utcfromtimestamp(row["hour"])
    #            hour = datetime(dt.year, dt.month, dt.day, dt.hour)
    #            hour2goose[hour] = row["goose articles"]

    for hour in hours:
        raw_chunks = chunk_res.get_chunks_for_hour(hour, corpus, event)
        num_raw_si = 0

        for chunk in raw_chunks:
            fname = os.path.split(chunk)[1]
            num_raw_si += int(fname.split("-")[1])
        # num_fltr_si = len(articles_res.get_si(event, corpus, "goose", hour))
        data.append(
            {
                "event": event.query_id,
                "title": event.title,
                "hour": hour,
                "raw articles": num_raw_si,
                "goose articles": hour2goose[hour],
                "deduped articles": hour2ded[hour],
                "deduped match articles": hour2ded_fltr[hour],
Example #2
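    # Assumes module-level imports: os, errno, datetime (from datetime),
    # streamcorpus as sc, plus SCChunkResource and si2df from the
    # surrounding package.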
    def do_job_unit(self, event, corpus, unit, **kwargs):
        
        extractor = kwargs.get("extractor", "gold")
        data_dir = os.path.join(self.dir_, extractor, event.fs_name())
        chunks_resource = SCChunkResource()

        if not os.path.exists(data_dir):
            try:
                os.makedirs(data_dir)
            except OSError as exc:
                # Tolerate a race where another job created the directory
                # first; re-raise anything else.
                if exc.errno == errno.EEXIST and os.path.isdir(data_dir):
                    pass
                else:
                    raise

        if extractor == "gold":
            import cuttsum.judgements
            all_matches = cuttsum.judgements.get_matches()
            matches = all_matches[all_matches["query id"] == event.query_id]
            stream_ids = set(
                matches["update id"].apply(
                    lambda x: "-".join(x.split("-")[:-1])).tolist())

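            # Each update id is "<stream-id>-<sentence-id>"; the stream id
            # begins with an epoch timestamp, so truncating that timestamp
            # to the hour gives the hours that contain gold matches.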
            hours = set([datetime.utcfromtimestamp(
                            int(update_id.split("-")[0])).replace(
                             minute=0, second=0)
                         for update_id in matches["update id"].tolist()])
            hours = sorted(list(hours))
            hour = hours[unit]
            output_path = self.get_chunk_path(event, extractor, hour, corpus)
            gold_si = []
            for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
                with sc.Chunk(path=path, mode="rb", 
                        message=corpus.sc_msg()) as chunk:
                    for si in chunk:
                        if si.stream_id in stream_ids:
                            gold_si.append(si)

            gold_si.sort(key=lambda x: x.stream_id)
            for si in gold_si:
                print si.stream_id

            # Remove any stale output before rewriting it.
            if os.path.exists(output_path):
                os.remove(output_path)
            with sc.Chunk(path=output_path, mode="wb", 
                    message=corpus.sc_msg()) as chunk:
                for si in gold_si:
                    chunk.add(si)

        elif extractor == "goose":

            import nltk
            from nltk.tokenize import WordPunctTokenizer
            sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')
            word_tok = WordPunctTokenizer()           
 
            from goose import Goose, Configuration
            config = Configuration()
            config.enable_image_fetching = False
            g = Goose(config)
            
            hour = event.list_event_hours()[unit]
            output_path_tmp = self.get_chunk_template(event, extractor, hour, corpus)
            good_si = []

            for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
                try:
                    with sc.Chunk(path=path, mode="rb", 
                            message=corpus.sc_msg()) as chunk:
                        
                        for si in chunk:

                            if si.body.clean_visible is None:
                                continue

                            article_text = self._get_goose_text(g, si)
                            if article_text is None:
                                continue

                            if not self._contains_query(event, article_text):
                                continue
                    
                            art_pretty = sent_tok.tokenize(article_text)
                            art_sents = [word_tok.tokenize(sent) 
                                         for sent in art_pretty]

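                            # Align goose sentence indices with the stream
                            # item's sentence indices so goose tokens can be
                            # attached to the matching sentence slots below.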
                            df = si2df(si)
                            I = self._map_goose2streamitem(
                                art_sents, df["words"].tolist())
                                
                            if "serif" in si.body.sentences:
                                si_sentences = si.body.sentences["serif"]
                            elif "lingpipe" in si.body.sentences:
                                si_sentences = si.body.sentences["lingpipe"]
                            else:
                                raise Exception("Bad sentence annotator.")
                            
                            ann = sc.Annotator()
                            ann.annotator_id = "goose"
                            si.body.sentences["goose"] = [sc.Sentence() 
                                                          for _ in si_sentences]
                            for i_goose, i_si in enumerate(I):
                                #print art_pretty[i_goose]
                                #print df.loc[i_si, "sent text"]
                                #print
                                tokens = [sc.Token(token=token.encode("utf-8")) 
                                          for token in art_sents[i_goose]]
                                si.body.sentences["goose"][i_si].tokens.extend(
                                    tokens)
                            good_si.append(si)
                except TypeError:
                    # Skip the rest of this chunk if reading or extraction
                    # raises a TypeError.
                    continue
            #if len(good_si) == 0:
            #    print "Nothing in hour:", hour
            #    return 
            output_path = output_path_tmp.format(len(good_si))
            odir = os.path.dirname(output_path)
            if not os.path.exists(odir):
                os.makedirs(odir)
            # Remove any stale output before rewriting it.
            if os.path.exists(output_path):
                os.remove(output_path)

            good_si.sort(key=lambda x: x.stream_id)
            for si in good_si:
                print si.stream_id

            print "Writing to", output_path
            with sc.Chunk(path=output_path, mode="wb", 
                    message=corpus.sc_msg()) as chunk:
                for si in good_si:
                    chunk.add(si)
        else:
            raise Exception("extractor: {} not implemented!".format(extractor))
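The goose chunk filename round-trips the article count: the method above
fills a path template with len(good_si), and Examples #1 and #3 recover the
count with int(fname.split("-")[0]). A minimal sketch of that convention,
using a hypothetical template path (the real one comes from
get_chunk_template):

import os

# Hypothetical template; only the leading "{}-" count field matters here.
chunk_tmpl = "/tmp/goose/2013-02-14-03/{}-articles.sc.gz"

num_articles = 17                       # stands in for len(good_si)
path = chunk_tmpl.format(num_articles)  # ".../17-articles.sc.gz"

# Recover the count from the filename, as Examples #1 and #3 do.
fname = os.path.split(path)[1]
assert int(fname.split("-")[0]) == num_articles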
Example #3
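        # Same reporting loop as Example #1; article counts are parsed from
        # the chunk filenames rather than by opening the chunks.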
        path = articles_res.get_chunk_path(event, "goose", hour, corpus)
        if path is None:
            continue
        #print path
        fname = os.path.split(path)[1]
        num_goose = int(fname.split("-")[0])
        hour2goose[hour] = num_goose
#    goose_df = articles_res.get_stats_df(event, "goose")
#    if goose_df is not None:
#        for _, row in goose_df.iterrows():
#            dt = datetime.utcfromtimestamp(row["hour"])
#            hour = datetime(dt.year, dt.month, dt.day, dt.hour)
#            hour2goose[hour] = row["goose articles"]

    for hour in hours:
        raw_chunks = chunk_res.get_chunks_for_hour(hour, corpus, event)
        num_raw_si = 0

        for chunk in raw_chunks:
            fname = os.path.split(chunk)[1]
            num_raw_si += int(fname.split("-")[1])
        #num_fltr_si = len(articles_res.get_si(event, corpus, "goose", hour))
        data.append({
            "event": event.query_id,
            "title": event.title,
            "hour": hour,
            "raw articles": num_raw_si,
            "goose articles": hour2goose[hour],
            "deduped articles": hour2ded[hour],
            "deduped match articles": hour2ded_fltr[hour],
        })
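Each row appended to data is a plain dict, so the collected rows drop
straight into a pandas DataFrame (the commented-out get_stats_df path above
already consumes one via iterrows). A minimal sketch with made-up values:

import pandas as pd
from datetime import datetime

data = [{
    "event": "TS14.1",               # hypothetical query id
    "title": "example event",
    "hour": datetime(2013, 2, 14, 3),
    "raw articles": 120,
    "goose articles": 17,
    "deduped articles": 9,
    "deduped match articles": 4,
}]
df = pd.DataFrame(data)
print(df[["hour", "raw articles", "goose articles", "deduped articles"]])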