Example #1
File: trec-info.py, Project: kedz/cuttsum
def cmd_oracle(args, t, active_events):
    import cuttsum.judgements
    import cuttsum.corpora  # used for the corpus classes below
    print t.clear

    all_matches = cuttsum.judgements.get_merged_dataframe()
    for event in active_events:
        print event
        matches = all_matches[all_matches["query id"] == event.query_id]
        matching_doc_ids = set(matches["document id"].tolist())

        print matches[0:10]
        if event.query_id.startswith("TS13"):
            corpus = cuttsum.corpora.EnglishAndUnknown2013()
        elif event.query_id.startswith("TS14"):
            corpus = cuttsum.corpora.SerifOnly2014()
        else:
            raise Exception("Bad query id: {}".format(event.query_id))

        from cuttsum.trecdata import SCChunkResource

        chunks_res = SCChunkResource()
        for hour, path, si in chunks_res.streamitem_iter(event, corpus):
            print si.stream_id, hour
            if si.stream_id in matching_doc_ids:
                print si.stream_id, si.stream_time, hour
                for _, match in matches[matches["document id"] ==
                                        si.stream_id].iterrows():
                    print match[["nugget text", "update text"]]

                # pause for a keypress after printing a matching document
                with t.cbreak():
                    t.inkey()
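
A minimal driver sketch for the command above. The wiring here is an assumption: the `t.clear`/`t.cbreak()`/`t.inkey()` calls match the `blessed` Terminal API, and `active_events` is assumed to come from `cuttsum.events.get_events()` (used the same way in Example #5 below).

# Hypothetical driver; not part of the original source.
from blessed import Terminal
import cuttsum.events

t = Terminal()
active_events = cuttsum.events.get_events()
cmd_oracle([], t, active_events)  # prints matches hour by hour; any key advances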
Example #2
def cmd_readstream(args, t, active_events):

    import textwrap
    import cuttsum.corpora  # used for the corpus classes below
    from goose import Goose, Configuration
    config = Configuration()
    config.enable_image_fetching = False
    g = Goose(config)
    raw_stream = True

    for arg in args:
        if arg == "articles":
            raw_stream = False

    for event in active_events:
        print event
        if event.query_id.startswith("TS13"):
            corpus = cuttsum.corpora.EnglishAndUnknown2013()
        elif event.query_id.startswith("TS14"):
            corpus = cuttsum.corpora.SerifOnly2014()
        else:
            raise Exception("Bad query id: {}".format(event.query_id))

        if raw_stream:
            from cuttsum.trecdata import SCChunkResource
            si_iter = SCChunkResource().streamitem_iter(event, corpus)
        else:
            from cuttsum.pipeline import ArticlesResource
            si_iter = ArticlesResource().streamitem_iter(event, corpus)

        for hour, path, si in si_iter:
            if si.body.clean_visible is not None:
                print si.stream_id
                try:
                    text_height = t.height - 4
                    article = g.extract(raw_html=si.body.clean_html)
                    lines = textwrap.wrap(article.cleaned_text)
                    idx = 0
                    while True:
                        print t.clear
                        print "hour:", hour
                        print "title:", article.title
                        print "article:"
                        print "\n".join(lines[idx:idx + text_height])

                        with t.cbreak():
                            char = t.inkey()
                            if char == "i" and idx > 0:
                                idx -= 1  # scroll up one line
                            elif char == "k" and idx + text_height < len(lines):
                                idx += 1  # scroll down one line
                            elif char == "l":
                                break  # skip to the next article

                except Exception as e:
                    print e
                    continue
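
The TS13/TS14 corpus dispatch above also appears verbatim in Example #1; a small helper could centralize it. This is a sketch only, and the helper name `get_corpus` is mine, not from the source:

# Hypothetical helper; the TS13/TS14 mapping is taken from the examples above.
import cuttsum.corpora

def get_corpus(event):
    if event.query_id.startswith("TS13"):
        return cuttsum.corpora.EnglishAndUnknown2013()
    elif event.query_id.startswith("TS14"):
        return cuttsum.corpora.SerifOnly2014()
    raise Exception("Bad query id: {}".format(event.query_id))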
Example #3
    def get_job_units(self, event, corpus, **kwargs):

        extractor = kwargs.get("extractor", "gold")
        overwrite = kwargs.get("overwrite", False)
        data_dir = os.path.join(self.dir_, extractor, event.fs_name())
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        chunks_resource = SCChunkResource()

        if extractor == "gold":

            import cuttsum.judgements

            all_matches = cuttsum.judgements.get_matches()
            matches = all_matches[all_matches["query id"] == event.query_id]

            hours = set(
                datetime.utcfromtimestamp(int(update_id.split("-")[0])).replace(
                    minute=0, second=0)
                for update_id in matches["update id"].tolist())
            hours = sorted(hours)

            units = []
            for h, hour in enumerate(hours):
                output_path = self.get_chunk_path(event, extractor, hour, corpus)
                if overwrite is True or output_path is None or not os.path.exists(output_path):
                    units.append(h)
            return units

        elif extractor == "goose":
            hours = event.list_event_hours()
            units = []
            for h, hour in enumerate(hours):
                output_path = self.get_chunk_path(event, extractor, hour, corpus)
                if overwrite is True or output_path is None or not os.path.exists(output_path):
                    units.append(h)
            return units

        else:
            raise Exception("extractor: {} not implemented!".format(extractor))
Example #4
    def do_job_unit(self, event, corpus, unit, **kwargs):
        
        extractor = kwargs.get("extractor", "gold")
        data_dir = os.path.join(self.dir_, extractor, event.fs_name())
        chunks_resource = SCChunkResource()

        if not os.path.exists(data_dir):
            try:
                os.makedirs(data_dir)
            except OSError as exc:
                # Ignore a race where another worker created the directory.
                if not (exc.errno == errno.EEXIST and os.path.isdir(data_dir)):
                    raise

        if extractor == "gold":
            import cuttsum.judgements
            all_matches = cuttsum.judgements.get_matches()
            matches = all_matches[all_matches["query id"] == event.query_id]
            stream_ids = set(
                matches["update id"].apply(
                    lambda x: "-".join(x.split("-")[:-1])).tolist())

            hours = set(
                datetime.utcfromtimestamp(int(update_id.split("-")[0])).replace(
                    minute=0, second=0)
                for update_id in matches["update id"].tolist())
            hours = sorted(hours)
            hour = hours[unit]
            output_path = self.get_chunk_path(event, extractor, hour, corpus)
            gold_si = []
            for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
                with sc.Chunk(path=path, mode="rb", 
                        message=corpus.sc_msg()) as chunk:
                    for si in chunk:
                        if si.stream_id in stream_ids:
                            gold_si.append(si)

            gold_si.sort(key=lambda x: x.stream_id)
            for si in gold_si:
                print si.stream_id

            if os.path.exists(output_path):
                os.remove(output_path)
            with sc.Chunk(path=output_path, mode="wb", 
                    message=corpus.sc_msg()) as chunk:
                for si in gold_si:
                    chunk.add(si)

        elif extractor == "goose":

            import nltk
            from nltk.tokenize import WordPunctTokenizer
            sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')
            word_tok = WordPunctTokenizer()           
 
            from goose import Goose, Configuration
            config = Configuration()
            config.enable_image_fetching = False
            g = Goose(config)
            
            hour = event.list_event_hours()[unit]
            output_path_tmp = self.get_chunk_template(event, extractor, hour, corpus)
            good_si = []

            for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
                try:
                    with sc.Chunk(path=path, mode="rb", 
                            message=corpus.sc_msg()) as chunk:
                        
                        for si in chunk:

                            if si.body.clean_visible is None:
                                continue

                            article_text = self._get_goose_text(g, si)
                            if article_text is None:
                                continue

                            if not self._contains_query(event, article_text):
                                continue
                    
                            art_pretty = sent_tok.tokenize(article_text)
                            art_sents = [word_tok.tokenize(sent) 
                                         for sent in art_pretty]

                            df = si2df(si)
                            I = self._map_goose2streamitem(
                                art_sents, df["words"].tolist())
                                
                            if "serif" in si.body.sentences:
                                si_sentences = si.body.sentences["serif"]
                            elif "lingpipe" in si.body.sentences:
                                si_sentences = si.body.sentences["lingpipe"]
                            else:
                                raise Exception("Bad sentence annotator.")
                            
                            ann = sc.Annotator()
                            ann.annotator_id = "goose"
                            si.body.sentences["goose"] = [sc.Sentence() 
                                                          for _ in si_sentences]
                            for i_goose, i_si in enumerate(I):
                                tokens = [sc.Token(token=token.encode("utf-8"))
                                          for token in art_sents[i_goose]]
                                si.body.sentences["goose"][i_si].tokens.extend(
                                    tokens)
                            good_si.append(si)
                except TypeError:
                    continue
            output_path = output_path_tmp.format(len(good_si))
            odir = os.path.dirname(output_path)
            if not os.path.exists(odir):
                os.makedirs(odir)
            # Remove any stale chunk before rewriting it.
            if os.path.exists(output_path):
                os.remove(output_path)

            good_si.sort(key=lambda x: x.stream_id)
            for si in good_si:
                print si.stream_id

            print "Writing to", output_path
            with sc.Chunk(path=output_path, mode="wb", 
                    message=corpus.sc_msg()) as chunk:
                for si in good_si:
                    chunk.add(si)
        else:
            raise Exception("extractor: {} not implemented!".format(extractor))
Example #5
import locale
from collections import defaultdict
from datetime import datetime

import pandas as pd
import cuttsum.corpora
import cuttsum.events
from cuttsum.trecdata import SCChunkResource
# Module path for the pipeline resources is an assumption here.
from cuttsum.pipeline import ArticlesResource, DedupedArticlesResource

pd.set_option("display.width", 200)
locale.setlocale(locale.LC_ALL, "en_US.UTF8")

def format_int(x):
    return locale.format("%d", x, grouping=True)


def epoch(dt):
    return int((dt - datetime(1970, 1, 1)).total_seconds())
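
For reference, a quick check of the two helpers (the digit grouping depends on the en_US locale set above):

# locale-aware integer formatting and UTC epoch conversion
print format_int(1234567)          # -> 1,234,567
print epoch(datetime(2013, 2, 1))  # -> 1359676800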


chunk_res = SCChunkResource()
articles_res = ArticlesResource()
ded_articles_res = DedupedArticlesResource()
data = []

event2ids = defaultdict(set)
fltr_event2ids = defaultdict(set)
for event in cuttsum.events.get_events():

    corpus = cuttsum.corpora.get_raw_corpus(event)
    hours = event.list_event_hours()

    hour2ded = defaultdict(int)
    hour2ded_fltr = defaultdict(int)
    ded_df = ded_articles_res.get_stats_df(event, corpus, "goose", 0.8)