def cmd_oracle(args, t, active_events):
    import cuttsum.corpora
    import cuttsum.judgements
    print t.clear
    all_matches = cuttsum.judgements.get_merged_dataframe()
    for event in active_events:
        print event
        matches = all_matches[all_matches["query id"] == event.query_id]
        matching_doc_ids = set(matches["document id"].tolist())
        print matches[0:10]

        if event.query_id.startswith("TS13"):
            corpus = cuttsum.corpora.EnglishAndUnknown2013()
        elif event.query_id.startswith("TS14"):
            corpus = cuttsum.corpora.SerifOnly2014()
        else:
            raise Exception("Bad query id: {}".format(event.query_id))

        from cuttsum.trecdata import SCChunkResource
        chunks_res = SCChunkResource()
        for hour, path, si in chunks_res.streamitem_iter(event, corpus):
            print si.stream_id, hour
            if si.stream_id in matching_doc_ids:
                print si.stream_id, si.stream_time, hour
                # iterrows() yields (index, row) pairs; unpack the row
                # before selecting the nugget/update columns.
                doc_matches = matches[matches["document id"] == si.stream_id]
                for _, match in doc_matches.iterrows():
                    print match[["nugget text", "update text"]]
                # Pause until a key is pressed before moving on.
                with t.cbreak():
                    t.inkey()
def cmd_readstream(args, t, active_events):
    import textwrap
    import cuttsum.corpora
    from goose import Goose, Configuration

    config = Configuration()
    config.enable_image_fetching = False
    g = Goose(config)

    # Read the raw chunk stream unless "articles" was requested.
    raw_stream = "articles" not in args

    for event in active_events:
        print event
        if event.query_id.startswith("TS13"):
            corpus = cuttsum.corpora.EnglishAndUnknown2013()
        elif event.query_id.startswith("TS14"):
            corpus = cuttsum.corpora.SerifOnly2014()
        else:
            raise Exception("Bad query id: {}".format(event.query_id))

        if raw_stream:
            from cuttsum.trecdata import SCChunkResource
            si_iter = SCChunkResource().streamitem_iter(event, corpus)
        else:
            from cuttsum.pipeline import ArticlesResource
            si_iter = ArticlesResource().streamitem_iter(event, corpus)

        for hour, path, si in si_iter:
            if si.body.clean_visible is None:
                continue
            print si.stream_id
            try:
                # Leave room for the hour/title/article header lines.
                text_height = t.height - 4
                article = g.extract(raw_html=si.body.clean_html)
                lines = textwrap.wrap(article.cleaned_text)
                idx = 0
                # Simple pager: "i" scrolls up, "k" scrolls down,
                # "l" advances to the next article.
                while 1:
                    print t.clear
                    print "hour:", hour
                    print "title:", article.title
                    print "article:"
                    print "\n".join(lines[idx:idx + text_height])
                    with t.cbreak():
                        char = t.inkey()
                    if char == "i" and idx > 0:
                        idx -= 1
                    elif char == "k" and idx + text_height < len(lines):
                        idx += 1
                    elif char == "l":
                        break
            except Exception as e:
                print e
                continue
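# For context, a minimal driver sketch showing how the two commands above
# might be wired up. This entry point is hypothetical (it does not appear in
# this excerpt) and assumes the blessed package's Terminal, which provides
# the clear/height/cbreak/inkey API used above.
if __name__ == "__main__":
    import sys
    from blessed import Terminal
    import cuttsum.events

    t = Terminal()
    events = cuttsum.events.get_events()
    cmd, args = sys.argv[1], sys.argv[2:]
    if cmd == "oracle":
        cmd_oracle(args, t, events)
    elif cmd == "readstream":
        cmd_readstream(args, t, events)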
def get_job_units(self, event, corpus, **kwargs):
    extractor = kwargs.get("extractor", "gold")
    overwrite = kwargs.get("overwrite", False)
    data_dir = os.path.join(self.dir_, extractor, event.fs_name())
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    chunks_resource = SCChunkResource()

    if extractor == "gold":
        import cuttsum.judgements
        all_matches = cuttsum.judgements.get_matches()
        matches = all_matches[all_matches["query id"] == event.query_id]
        # Update ids start with an epoch timestamp; bucket them by hour.
        hours = set(
            datetime.utcfromtimestamp(int(update_id.split("-")[0])).replace(
                minute=0, second=0)
            for update_id in matches["update id"].tolist())
        hours = sorted(hours)
        units = []
        for h, hour in enumerate(hours):
            output_path = self.get_chunk_path(event, extractor, hour, corpus)
            if overwrite is True or output_path is None \
                    or not os.path.exists(output_path):
                units.append(h)
        return units
    elif extractor == "goose":
        hours = event.list_event_hours()
        units = []
        for h, hour in enumerate(hours):
            output_path = self.get_chunk_path(event, extractor, hour, corpus)
            if overwrite is True or output_path is None \
                    or not os.path.exists(output_path):
                units.append(h)
        return units
    else:
        raise Exception("extractor: {} not implemented!".format(extractor))
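# Usage sketch (hypothetical caller; assumes these methods live on the
# resource class whose dir_/get_chunk_path are used above):
#
#     res = ArticlesResource()
#     for unit in res.get_job_units(event, corpus, extractor="goose"):
#         res.do_job_unit(event, corpus, unit, extractor="goose")
#
# get_job_units returns the indices of hours whose output chunk is missing
# (or all hours when overwrite=True); do_job_unit processes one such hour.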
def do_job_unit(self, event, corpus, unit, **kwargs):
    extractor = kwargs.get("extractor", "gold")
    data_dir = os.path.join(self.dir_, extractor, event.fs_name())
    chunks_resource = SCChunkResource()

    if not os.path.exists(data_dir):
        try:
            os.makedirs(data_dir)
        except OSError as exc:
            # Another job may have created the directory concurrently.
            if exc.errno == errno.EEXIST and os.path.isdir(data_dir):
                pass

    if extractor == "gold":
        import cuttsum.judgements
        all_matches = cuttsum.judgements.get_matches()
        matches = all_matches[all_matches["query id"] == event.query_id]
        # Update ids are "<epoch>-<doc>-<sentence>"; strip the sentence
        # index to recover the stream id, and bucket updates by hour.
        stream_ids = set(
            matches["update id"].apply(
                lambda x: "-".join(x.split("-")[:-1])).tolist())
        hours = set([
            datetime.utcfromtimestamp(int(update_id.split("-")[0])).replace(
                minute=0, second=0)
            for update_id in matches["update id"].tolist()])
        hours = sorted(list(hours))
        hour = hours[unit]
        output_path = self.get_chunk_path(event, extractor, hour, corpus)

        gold_si = []
        for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
            with sc.Chunk(path=path, mode="rb",
                          message=corpus.sc_msg()) as chunk:
                for si in chunk:
                    if si.stream_id in stream_ids:
                        gold_si.append(si)

        gold_si.sort(key=lambda x: x.stream_id)
        for si in gold_si:
            print si.stream_id

        if os.path.exists(output_path):
            os.remove(output_path)
        with sc.Chunk(path=output_path, mode="wb",
                      message=corpus.sc_msg()) as chunk:
            for si in gold_si:
                chunk.add(si)

    elif extractor == "goose":
        import nltk
        from nltk.tokenize import WordPunctTokenizer
        sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')
        word_tok = WordPunctTokenizer()

        from goose import Goose, Configuration
        config = Configuration()
        config.enable_image_fetching = False
        g = Goose(config)

        hour = event.list_event_hours()[unit]
        output_path_tmp = self.get_chunk_template(event, extractor, hour,
                                                  corpus)
        good_si = []
        for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
            try:
                with sc.Chunk(path=path, mode="rb",
                              message=corpus.sc_msg()) as chunk:
                    for si in chunk:
                        if si.body.clean_visible is None:
                            continue
                        article_text = self._get_goose_text(g, si)
                        if article_text is None:
                            continue
                        if not self._contains_query(event, article_text):
                            continue
                        art_pretty = sent_tok.tokenize(article_text)
                        art_sents = [word_tok.tokenize(sent)
                                     for sent in art_pretty]
                        df = si2df(si)
                        # Align each Goose sentence with a stream-item
                        # sentence so the extracted tokens can be attached
                        # to the matching annotation slot below.
                        I = self._map_goose2streamitem(
                            art_sents, df["words"].tolist())

                        if "serif" in si.body.sentences:
                            si_sentences = si.body.sentences["serif"]
                        elif "lingpipe" in si.body.sentences:
                            si_sentences = si.body.sentences["lingpipe"]
                        else:
                            raise Exception("Bad sentence annotator.")

                        ann = sc.Annotator()
                        ann.annotator_id = "goose"
                        si.body.sentences["goose"] = [
                            sc.Sentence() for _ in si_sentences]
                        for i_goose, i_si in enumerate(I):
                            tokens = [sc.Token(token=token.encode("utf-8"))
                                      for token in art_sents[i_goose]]
                            si.body.sentences["goose"][i_si].tokens.extend(
                                tokens)
                        good_si.append(si)
            except TypeError:
                continue

        output_path = output_path_tmp.format(len(good_si))
        odir = os.path.dirname(output_path)
        if not os.path.exists(odir):
            os.makedirs(odir)

        good_si.sort(key=lambda x: x.stream_id)
        for si in good_si:
            print si.stream_id

        if os.path.exists(output_path):
            os.remove(output_path)
        print "Writing to", output_path
        with sc.Chunk(path=output_path, mode="wb",
                      message=corpus.sc_msg()) as chunk:
            for si in good_si:
                chunk.add(si)
    else:
        raise Exception("extractor: {} not implemented!".format(extractor))
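# _map_goose2streamitem is not shown in this excerpt. A minimal sketch of
# what such an alignment could look like, assuming it maps each Goose
# sentence to the stream-item sentence with the greatest token overlap (an
# assumption, not necessarily the project's actual method):
def map_goose2streamitem_sketch(goose_sents, si_sents):
    # Assumes si_sents is non-empty; each element is a list of tokens.
    si_sets = [set(w.lower() for w in words) for words in si_sents]
    mapping = []
    for sent in goose_sents:
        toks = set(w.lower() for w in sent)
        overlaps = [len(toks & s) for s in si_sets]
        mapping.append(overlaps.index(max(overlaps)))
    return mapping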
pd.set_option("display.width", 200) import locale locale.setlocale(locale.LC_ALL, "en_US.UTF8") def format_int(x): return locale.format("%d", x, grouping=True) def epoch(dt): return int((dt - datetime(1970, 1, 1)).total_seconds()) chunk_res = SCChunkResource() articles_res = ArticlesResource() ded_articles_res = DedupedArticlesResource() data = [] event2ids = defaultdict(set) fltr_event2ids = defaultdict(set) for event in cuttsum.events.get_events(): corpus = cuttsum.corpora.get_raw_corpus(event) hours = event.list_event_hours() hour2ded = defaultdict(int) hour2ded_fltr = defaultdict(int) ded_df = ded_articles_res.get_stats_df(event, corpus, "goose", 0.8)