def main():
    event_file, event_title, nuggets_tsv, odir = parse_args()
    event = load_event(event_title, event_file)
    nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id)
    hours = [dth for dth in gen_dates(event.start, event.end)]

    updates_file = os.path.join(odir, u"updates.txt")
    write_updates(nuggets, updates_file)

    sum_sents = []
    for hour in hours:
        while len(nuggets) > 0:
            if nuggets[0].timestamp.strftime(u"%Y-%m-%d-%H") <= hour:
                sum_sents.append(nuggets[0].text)
                nuggets.pop(0)
            else:
                break
        if len(sum_sents) > 0:
            ofile = os.path.join(odir, u"{}.txt".format(hour))
            write_summary(sum_sents, ofile)

    if len(nuggets) > 0:
        sum_sents.extend([nugget.text for nugget in nuggets])
    ofile = os.path.join(odir, u"final.txt")
    write_summary(sum_sents, ofile)
def main():
    event_file, rc_dir, event_title, nuggets_tsv, ss_params, ofile = parse_args()
    ss_model, ss_vocab, ss_dims = ss_params
    event = load_event(event_title, event_file)
    nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id)
    hours = [dth for dth in gen_dates(event.start, event.end)]
    print u"Found", len(nuggets), u"nuggets."

    print u"Loading sentence-sim model..."
    wmat_model = cuttsum.wtmf.load_model(ss_model, ss_vocab, latent_dims=ss_dims)
    nugget_lvecs = wmat_model.factor_unicode([n.text for n in nuggets])

    meta_data = []
    unicodes = []
    print u"Loading sentence data..."
    nhours = len(hours)
    for h, hour in enumerate(hours, 1):
        chunk = os.path.join(rc_dir, u'{}.sc.gz'.format(hour))
        for si_idx, si in enumerate(sc.Chunk(path=chunk)):
            if u'article-clf' not in si.body.sentences:
                continue
            # Map each sentence string back to its index in the 'serif' annotation.
            sent_idx_map = {}
            for idx, sent in enumerate(si.body.sentences[u'serif']):
                sent_idx_map[sentence_uni(sent)] = idx
            for sent in si.body.sentences[u'article-clf']:
                uni = sentence_uni(sent)
                meta_data.append((hour, si.stream_id, sent_idx_map[uni]))
                unicodes.append(uni)

    print u"Computing similarities..."
    sent_lvecs = wmat_model.factor_unicode(unicodes)
    S = cosine_similarity(sent_lvecs, nugget_lvecs)
    S = np.ma.masked_array(S, np.isnan(S))
    # Z-score each nugget column, then summarize per sentence.
    Szmuv = (S - S.mean(axis=0)) / S.std(axis=0)
    M = np.amax(Szmuv, axis=1)
    m = np.amin(Szmuv, axis=1)
    U = np.mean(Szmuv, axis=1)
    T = np.sum(Szmuv, axis=1)

    ### WRITE TSV HEADER AND DATA ###
    print u"Writing to", ofile
    header = 'date-hour\tstream-id\tsent-id\tmax-sim\tmin-sim' + \
             '\tmean-sim\ttotal-sim'
    for i in range(ss_dims):
        header += '\tlv{}'.format(i)

    with open(ofile, 'w') as f:
        f.write(header)
        f.write('\n')
        for idx, meta_datum in enumerate(meta_data):
            # The header promises seven leading columns, so total-sim (T) is
            # written here as well.
            f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                meta_datum[0], meta_datum[1], meta_datum[2],
                M[idx], m[idx], U[idx], T[idx]))
            for c in range(ss_dims):
                f.write('\t{}'.format(sent_lvecs[idx, c]))
            f.write('\n')
            f.flush()
def main():
    event_file, rc_dir, title, out_dir, ad_dir, ldir, nprocs = parse_args()
    event = load_event(title, event_file)
    hours = [dth for dth in gen_dates(event.start, event.end)]
    nhours = len(hours)
    hours_per_proc = int(math.ceil(nhours / float(nprocs)))

    # Split the hour range into one job per process.
    jobs = []
    pid = 1
    for i in xrange(0, nhours, hours_per_proc):
        log_file = os.path.join(ldir, '.rel_extractor_{}.log'.format(pid))
        jobs.append((rc_dir, out_dir, hours[i:i + hours_per_proc],
                     event, ad_dir, log_file))
        pid += 1

    if nprocs == 1:
        for job in jobs:
            worker(job)
    else:
        import multiprocessing as mp
        pool = mp.Pool(nprocs)
        x = pool.map_async(worker, jobs)
        x.get()
        pool.close()
        pool.join()

    # COMPILE LOG INFO: merge the per-process logs, then remove them.
    log_data = []
    for fname in os.listdir(ldir):
        if re.search(r'\.rel_extractor', fname):
            with open(os.path.join(ldir, fname), 'r') as f:
                for line in f:
                    items = line.strip().split('\t')
                    if len(items) != 5:
                        continue
                    log_data.append(items)
            os.remove(os.path.join(ldir, fname))
    log_data.sort(key=lambda x: x[0])

    log_fname = os.path.join(ldir, 'rel_log.txt')
    with open(log_fname, 'w') as f:
        f.write("hour\tnum_docs\tnum_sents\tnum_relevant_docs")
        f.write("\tnum_relevant_sents\n")
        f.flush()
        for date, n_docs, n_sents, n_rel_docs, n_rel_sents in log_data:
            f.write('{}\t{}\t{}\t{}\t{}\n'.format(date, n_docs, n_sents,
                                                  n_rel_docs, n_rel_sents))
            f.flush()
def main():
    args = parse_args()
    event_file, rc_dir, event_title, nuggets_tsv = args[0:4]
    doc_freqs, word_freqs = args[4:]

    print 'Generating Regression Features'
    print '=============================='

    event = load_event(event_title, event_file)
    nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id)
    hours = [dth for dth in gen_dates(event.start, event.end)]

    worker((rc_dir, nuggets, hours, event, doc_freqs, word_freqs))
def main():
    event_xml, filter_list, hour_list = parse_args()
    events = read_events_xml(event_xml)

    valid_hours = set()
    for event in events:
        start_dt = event.start - timedelta(hours=5)
        end_dt = event.end
        for dth in gen_dates(start_dt, end_dt):
            valid_hours.add(dth)

    with open(hour_list, u'r') as f, open(filter_list, u'w') as o:
        for line in f:
            path = line.strip()
            dth, fname = path.split('/')
            if dth in valid_hours and re.search(r'news', fname, re.I):
                o.write(path)
                o.write('\n')
                o.flush()
def main():
    event_file, rc_dir, event_title, ofile = parse_args()
    event = load_event(event_title, event_file)
    hours = [dth for dth in gen_dates(event.start, event.end)]
    num_hours = len(hours)

    meta_data = []
    bow_dicts = []
    for h, hour in enumerate(hours, 1):
        path = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
        for si in sc.Chunk(path=path):
            # Map each sentence string back to its index in the 'serif' annotation.
            uni2id = {}
            for sid, sentence in enumerate(si.body.sentences[u'serif'], 0):
                uni2id[sentence_uni(sentence)] = sid
            for sent in si.body.sentences[u'article-clf']:
                # Binary bag-of-words representation of the sentence.
                bow_dict = {}
                for token in sent.tokens:
                    t = token.token.decode(u'utf-8').lower()
                    bow_dict[t] = 1
                bow_dicts.append(bow_dict)
                uni = sentence_uni(sent)
                sent_id = uni2id[uni]
                meta_data.append((hour, si.stream_id, sent_id, uni))

    vctr = DictVectorizer()
    X = vctr.fit_transform(bow_dicts)

    with codecs.open(ofile, 'w', 'utf-8') as f:
        for i, (hour, stream_id, sent_id, uni) in enumerate(meta_data):
            uni = uni.replace(u'\n', u' ').replace(u'\t', u' ')
            f.write(u'{}\t{}\t{}\t{}\t'.format(hour, stream_id, sent_id, uni))
            # Write the indices of the vocabulary entries active in this sentence.
            x = u' '.join([unicode(col) for col in X[i, :].indices])
            f.write(x)
            f.write(u'\n')
            f.flush()
def main():
    event_file, event_title, query, rel_dir, c_dir, cpus, log_dir = parse_args()
    event = load_event(event_title, event_file)
    hours = [hour for hour in gen_dates(event.start, event.end)]
    nhours = len(hours)
    # Guard against a zero step when there are more cpus than hours.
    hours_per_cpu = max(1, nhours / cpus)

    jobs = []
    pid = 0
    now = datetime.now().strftime('%Y-%m-%d-%H')
    print now
    for i in xrange(0, nhours, hours_per_cpu):
        jobs.append((event, event_title, query, hours[i:i + hours_per_cpu],
                     rel_dir, c_dir,
                     os.path.join(log_dir,
                                  '.rel_log_{}_{}_log.txt'.format(now, pid))))
        pid += 1

    print 'Query Match Relevance Extractor'
    print '==============================='
    print
    print 'SYSTEM INFO'
    print '==========='
    print 'KBA location:', c_dir
    print 'Relevant chunks:', rel_dir
    print 'n-threads:', cpus
    print
    print 'EVENT INFO'
    print '=========='
    print 'Event Title:', event.title
    print 'Event Type:', event.type
    print 'Date Range: {} -- {}'.format(event.start, event.end)
    print 'Spanning', nhours, 'hours.'
    print 'Query:', query
    print

    if cpus == 1:
        for job in jobs:
            worker(job)
    else:
        import multiprocessing as mp
        pool = mp.Pool(cpus)
        # Call get() so worker exceptions are not silently dropped.
        result = pool.map_async(worker, jobs)
        result.get()
        pool.close()
        pool.join()

    # Merge the per-process logs, then remove them.
    log_data = []
    for fname in os.listdir(log_dir):
        if re.search(r'\.rel_log_{}'.format(now), fname):
            with open(os.path.join(log_dir, fname), 'r') as f:
                for line in f:
                    items = line.strip().split('\t')
                    if len(items) != 3:
                        continue
                    log_data.append(items)
            os.remove(os.path.join(log_dir, fname))
    log_data.sort(key=lambda x: x[0])

    log_fname = os.path.join(log_dir, 'rel_log_{}.txt'.format(now))
    with open(log_fname, 'w') as f:
        for date, tot_rel, tot_docs in log_data:
            f.write('{}\t{}\t{}\n'.format(date, tot_rel, tot_docs))
            f.flush()
def main():
    event_file, rc_dir, event_title, ofile, ports, cnts_dirs = parse_args()
    wc_dir, dc_dir = cnts_dirs
    event = load_event(event_title, event_file)
    hours = [dth for dth in gen_dates(event.start, event.end)]

    print "Connecting lm clients..."
    dm_lm_score = lm_client_init(ports[0])
    bg_lm3_score = lm_client_init(ports[1][0])
    bg_lm4_score = lm_client_init(ports[1][1])
    bg_lm5_score = lm_client_init(ports[1][2])

    print "Query words:", event.query
    query_matcher = query_term_match_init(event.query)
    wn_terms = wn_synset_terms(event.type)
    print "WordNet synset terms:", wn_terms
    synset_matcher = query_term_match_init(wn_terms)

    # Pre-load tf-idf counters for the five hours before the event starts;
    # after the shift below, index 0 is the current hour and index i is i hours back.
    tfidfers = []
    preroll = [get_previous_hour(hours[0], i) for i in range(1, 6)]
    for hour in preroll:
        tfidfers.append(init_tfidfers(wc_dir, dc_dir, hour, lower=True))
    tfidfers.append(None)

    of = open(ofile, 'w')
    header = "hour\tstream-id\tsent-id\t" \
        + "avg-tfidf\tavg-tfidf-m1\tavg-tfidf-m5\t" \
        + "dm-logprob\tdm-avg-logprob\tbg3-logprob\tbg3-avg-logprob\t" \
        + "bg4-logprob\tbg4-avg-logprob\tbg5-logprob\tbg5-avg-logprob\t" \
        + "query-matches\tsynset-matches\tnum-tokens\tarticle-position\t" \
        + "article-position-rel\tcapsrate\n"
    of.write(header)
    of.flush()

    num_hours = len(hours)
    for h, hour in enumerate(hours, 1):
        # Shift the tf-idf window forward by one hour.
        tfidfers = [init_tfidfers(wc_dir, dc_dir, hour, lower=True)] \
            + tfidfers[0:-1]
        print "({}/{}) {}".format(h, num_hours, hour)
        path = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
        for si in sc.Chunk(path=path):
            ticks = float(si.stream_time.epoch_ticks)
            si_datetime = datetime.utcfromtimestamp(ticks)
            tdelta = si_datetime - event.start
            # Sentence-id lookup and document-level term counts.
            uni2id = {}
            doc_word_counts = defaultdict(int)
            for sid, sentence in enumerate(si.body.sentences[u'serif'], 0):
                uni2id[sentence_uni(sentence)] = sid
                for token in sentence.tokens:
                    t = token.token.decode(u'utf-8').lower()
                    doc_word_counts[t] += 1
            nsents = len(si.body.sentences[u'article-clf'])
            for apos, sent in enumerate(si.body.sentences[u'article-clf'], 1):
                tf_dict = {}
                for token in sent.tokens:
                    t = token.token.decode(u'utf-8').lower()
                    tf_dict[t] = doc_word_counts[t]
                tfidfs_now = tfidfers[0](tf_dict)
                tfidfs_m1 = tfidfers[1](tf_dict)
                tfidfs_m5 = tfidfers[5](tf_dict)
                scores = compute_tfidfs(tfidfs_now, tfidfs_m1, tfidfs_m5)
                avg_tfidf, avg_tfidf_m1, avg_tfidf_m5 = scores
                uni = sentence_uni(sent)
                sent_id = uni2id[uni]
                apos_rel = apos / float(nsents)
                num_tokens = len(sent.tokens)
                caps_rate = get_caps_rate(sent)
                dm_lp, dm_alp = dm_lm_score(uni)
                bg3_lp, bg3_alp = bg_lm3_score(uni)
                bg4_lp, bg4_alp = bg_lm4_score(uni)
                bg5_lp, bg5_alp = bg_lm5_score(uni)
                query_matches = query_matcher(uni)
                synset_matches = synset_matcher(uni)
                dstr = ('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'
                        + '\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n').format(
                    hour, si.stream_id, sent_id,
                    avg_tfidf, avg_tfidf_m1, avg_tfidf_m5,
                    dm_lp, dm_alp, bg3_lp, bg3_alp,
                    bg4_lp, bg4_alp, bg5_lp, bg5_alp,
                    query_matches, synset_matches, num_tokens,
                    apos, apos_rel, caps_rate)
                of.write(dstr)
                of.flush()
    of.close()