def main():
    """Write cumulative per-hour summaries of an event's nuggets.

    Reads the event definition and its nuggets (sorted by timestamp —
    presumably guaranteed by read_nuggets_tsv; TODO confirm), writes all
    updates to <odir>/updates.txt, then for each hour of the event window
    writes <odir>/<hour>.txt containing every nugget sentence seen up to
    and including that hour. Any nuggets stamped after the final hour are
    appended and written to <odir>/final.txt.
    """
    event_file, event_title, nuggets_tsv, odir = parse_args()
    event = load_event(event_title, event_file)
    nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id)
    hours = list(gen_dates(event.start, event.end))

    updates_file = os.path.join(odir, u"updates.txt")
    write_updates(nuggets, updates_file)

    sum_sents = []
    # Index of the first unconsumed nugget. The original popped from the
    # front of the list (nuggets.pop(0)), which is O(n) per pop and O(n^2)
    # overall; an advancing index gives the same traversal in O(n).
    next_idx = 0
    for hour in hours:
        while next_idx < len(nuggets):
            nugget = nuggets[next_idx]
            # Hour keys are "%Y-%m-%d-%H" strings, so lexicographic <=
            # matches chronological order.
            if nugget.timestamp.strftime(u"%Y-%m-%d-%H") <= hour:
                sum_sents.append(nugget.text)
                next_idx += 1
            else:
                break
        if len(sum_sents) > 0:
            ofile = os.path.join(odir, u"{}.txt".format(hour))
            write_summary(sum_sents, ofile)

    # Nuggets timestamped after the last generated hour still belong in the
    # final summary.
    if next_idx < len(nuggets):
        sum_sents.extend([nugget.text for nugget in nuggets[next_idx:]])
        ofile = os.path.join(odir, u"final.txt")
        write_summary(sum_sents, ofile)
def main():
    """Emit cumulative hourly summary files for one event.

    Loads the event and its matching nuggets, records every update to
    <odir>/updates.txt, then walks the event's hours in order: nuggets
    whose hour key falls at or before the current hour are folded into a
    running list of sentences, and each hour with content gets its own
    <odir>/<hour>.txt snapshot. Leftover nuggets (after the last hour)
    are flushed into <odir>/final.txt.
    """
    event_file, event_title, nuggets_tsv, odir = parse_args()
    event = load_event(event_title, event_file)
    nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id)
    hours = [dth for dth in gen_dates(event.start, event.end)]

    write_updates(nuggets, os.path.join(odir, u'updates.txt'))

    collected = []
    for hour in hours:
        # Consume nuggets from the front while they fall within this hour;
        # hour keys are '%Y-%m-%d-%H' strings, so string comparison is
        # chronological.
        while nuggets and nuggets[0].timestamp.strftime(u'%Y-%m-%d-%H') <= hour:
            collected.append(nuggets.pop(0).text)
        if collected:
            write_summary(collected, os.path.join(odir, u'{}.txt'.format(hour)))

    if nuggets:
        # Anything stamped after the final hour still joins the final summary.
        collected.extend(nugget.text for nugget in nuggets)
        write_summary(collected, os.path.join(odir, u'final.txt'))
def main(): event_file, rc_dir, event_title, nuggets_tsv, ss_params, ofile = parse_args() ss_model, ss_vocab, ss_dims = ss_params event = load_event(event_title, event_file) nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id) hours = [dth for dth in gen_dates(event.start, event.end)] print u"Found", len(nuggets), u"nuggets." print u"Loading sentence-sim model..." wmat_model = cuttsum.wtmf.load_model(ss_model, ss_vocab, latent_dims=ss_dims) nugget_lvecs = wmat_model.factor_unicode([n.text for n in nuggets]) meta_data = [] unicodes = [] print u"Loading sentence data..." nhours = len(hours) for h, hour in enumerate(hours, 1): chunk = os.path.join(rc_dir, u'{}.sc.gz'.format(hour)) for si_idx, si in enumerate(sc.Chunk(path=chunk)): if u'article-clf' not in si.body.sentences: continue sent_idx_map = {} for idx, sent in enumerate(si.body.sentences[u'serif']): sent_idx_map[sentence_uni(sent)] = idx for sent in si.body.sentences[u'article-clf']: uni = sentence_uni(sent) meta_data.append((hour, si.stream_id, sent_idx_map[uni])) unicodes.append(uni) print u"Computing similarities..." sent_lvecs = wmat_model.factor_unicode(unicodes) S = cosine_similarity(sent_lvecs, nugget_lvecs) S = np.ma.masked_array(S, np.isnan(S)) Szmuv = (S - S.mean(axis=0)) / S.std(axis=0) M = np.amax(Szmuv, axis=1) m = np.amin(Szmuv, axis=1) U = np.mean(Szmuv, axis=1) T = np.sum(Szmuv, axis=1) ### WRITE TSV HEADER AND DATA ### print u"Writing to", ofile header = 'date-hour\tstream-id\tsent-id\tmax-sim\tmin-sim' + \ '\tmean-sim\ttotal-sim' for i in range(ss_dims): header += '\tlv{}'.format(i) with open(ofile, 'w') as f: f.write(header) f.write('\n') for idx, meta_datum in enumerate(meta_data): f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(meta_datum[0], meta_datum[1], meta_datum[2], M[idx], m[idx], U[idx])) for c in range(ss_dims): f.write('\t{}'.format(sent_lvecs[idx,c])) f.write('\n') f.flush()
def main(): args = parse_args() event_file, rc_dir, event_title, nuggets_tsv = args[0:4] doc_freqs, word_freqs = args[4:] print 'Generating Regression Features' print '==============================' event = load_event(event_title, event_file) nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id) hours = [dth for dth in gen_dates(event.start, event.end)] worker((rc_dir, nuggets, hours, event, doc_freqs, word_freqs))
def main(): args = parse_args() event_file, rc_dir, event_title, nuggets_tsv = args[0:4] doc_freqs, word_freqs = args[4:] print 'Generating Regression Features' print '==============================' event = load_event(event_title, event_file) nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id) hours = [dth for dth in gen_dates(event.start, event.end)] worker((rc_dir, nuggets, hours, event, doc_freqs, word_freqs))
def main(): event_file, rc_dir, event_title, nuggets_tsv, ss_params, ofile = parse_args( ) ss_model, ss_vocab, ss_dims = ss_params event = load_event(event_title, event_file) nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id) hours = [dth for dth in gen_dates(event.start, event.end)] print u"Found", len(nuggets), u"nuggets." print u"Loading sentence-sim model..." wmat_model = cuttsum.wtmf.load_model(ss_model, ss_vocab, latent_dims=ss_dims) nugget_lvecs = wmat_model.factor_unicode([n.text for n in nuggets]) meta_data = [] unicodes = [] print u"Loading sentence data..." nhours = len(hours) for h, hour in enumerate(hours, 1): chunk = os.path.join(rc_dir, u'{}.sc.gz'.format(hour)) for si_idx, si in enumerate(sc.Chunk(path=chunk)): if u'article-clf' not in si.body.sentences: continue sent_idx_map = {} for idx, sent in enumerate(si.body.sentences[u'serif']): sent_idx_map[sentence_uni(sent)] = idx for sent in si.body.sentences[u'article-clf']: uni = sentence_uni(sent) meta_data.append((hour, si.stream_id, sent_idx_map[uni])) unicodes.append(uni) print u"Computing similarities..." sent_lvecs = wmat_model.factor_unicode(unicodes) S = cosine_similarity(sent_lvecs, nugget_lvecs) S = np.ma.masked_array(S, np.isnan(S)) Szmuv = (S - S.mean(axis=0)) / S.std(axis=0) M = np.amax(Szmuv, axis=1) m = np.amin(Szmuv, axis=1) U = np.mean(Szmuv, axis=1) T = np.sum(Szmuv, axis=1) ### WRITE TSV HEADER AND DATA ### print u"Writing to", ofile header = 'date-hour\tstream-id\tsent-id\tmax-sim\tmin-sim' + \ '\tmean-sim\ttotal-sim' for i in range(ss_dims): header += '\tlv{}'.format(i) with open(ofile, 'w') as f: f.write(header) f.write('\n') for idx, meta_datum in enumerate(meta_data): f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}'.format( meta_datum[0], meta_datum[1], meta_datum[2], M[idx], m[idx], U[idx])) for c in range(ss_dims): f.write('\t{}'.format(sent_lvecs[idx, c])) f.write('\n') f.flush()