# NOTE: this fragment assumes module-level imports that are not shown here:
# os, re, gzip, signal, random, Queue, numpy as np, pandas as pd, joblib,
# fastcluster, scipy.cluster.hierarchy as hac, datetime/timedelta,
# sklearn's AffinityPropagation and cosine_similarity, and the project's
# helpers (get_resource_manager, SalienceModels, SaliencePredictions,
# SaliencePredictionAggregator, SentenceLatentVectorsResource, ProgressBar,
# passes_simple_filter).

def check_coverage(self, event, corpus, feature_set, prefix, model_events,
                   n_samples=100, **kwargs):
    """Return the fraction of (hour, model) prediction files already
    written, counted over hours whose sentence features exist."""
    feats = get_resource_manager(u'SentenceFeaturesResource')
    n_feats = 0
    n_covered = 0

    sm = SalienceModels()
    model_paths = []
    for model_event in model_events:
        model_paths.extend(
            sm.get_model_paths(
                model_event, feature_set, prefix, n_samples))

    for hour in event.list_event_hours():
        for model_path in model_paths:
            feat_tsv_path = feats.get_tsv_path(event, hour)
            sal_tsv_path = self.get_tsv_path(
                event, hour, prefix, feature_set, model_path)
            if os.path.exists(feat_tsv_path):
                n_feats += 1
                if os.path.exists(sal_tsv_path):
                    n_covered += 1

    if n_feats == 0:
        return 0
    return n_covered / float(n_feats)
def predict_salience(self, event, corpus, feature_set, prefix, model_events,
                     n_procs=1, n_samples=10, progress_bar=False, **kwargs):
    """Queue one prediction job per sampled model and run them with
    do_work across n_procs worker processes."""
    data_dir = self.get_tsv_dir(event, prefix, feature_set)
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    feats = get_resource_manager(u'SentenceFeaturesResource')
    sm = SalienceModels()
    model_paths = []
    for model_event in model_events:
        model_paths.extend(
            sm.get_model_paths(
                model_event, feature_set, prefix, n_samples))

    jobs = []
    # for hour in event.list_event_hours():
    for model_path in model_paths:
        # feat_tsv_path = feats.get_tsv_path(event, hour)
        # sal_tsv_path = self.get_tsv_path(
        #     event, hour, prefix, feature_set, model_path)
        # if os.path.exists(feat_tsv_path):
        jobs.append(model_path)

    self.do_work(salience_predict_worker_, jobs, n_procs, progress_bar,
                 event=event, feature_set=feature_set, prefix=prefix)
def get(self, event, corpus, feature_set, prefix, model_events,
        n_samples=10, **kwargs):
    """Merge the per-model salience predictions for each hour into one
    gzipped TSV with a column per model, keyed on stream/sentence id."""
    feats = get_resource_manager(u'SentenceFeaturesResource')
    sm = SalienceModels()
    sp = SaliencePredictions()
    model_paths = []
    for model_event in model_events:
        model_paths.extend(
            sm.get_model_paths(
                model_event, feature_set, prefix, n_samples))

    hours = event.list_event_hours()
    n_hours = len(hours)
    pb = ProgressBar(n_hours)
    for hour in hours:
        pb.update()
        tsv_paths = \
            [sp.get_tsv_path(event, hour, prefix, feature_set, model_path)
             for model_path in model_paths]
        tsv_paths = [path for path in tsv_paths if os.path.exists(path)]
        if len(tsv_paths) == 0:
            continue

        data = []
        for tsv_path in tsv_paths:
            with gzip.open(tsv_path, u'r') as f:
                df = pd.io.parsers.read_csv(
                    f, sep='\t', quoting=3, header=0)
                df.set_index([u'stream id', u'sentence id'], inplace=True)
                data.append(df)
        df = pd.concat(data, axis=1)

        agg_path = self.get_tsv_path(event, hour, prefix, feature_set)
        agg_dir = os.path.dirname(agg_path)
        if not os.path.exists(agg_dir):
            os.makedirs(agg_dir)
        # NOTE: this relabels the columns in sorted order; it assumes the
        # concatenated columns are already sorted by model key.
        df.columns = sorted(df.columns)
        with gzip.open(agg_path, u'w') as f:
            df.to_csv(f, sep='\t')
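# The get() method above merges one prediction TSV per model into a single
# table: each frame is keyed on (stream id, sentence id) and the frames are
# concatenated column-wise. A minimal, self-contained sketch of that merge
# step on in-memory frames (the frames and model column names here are
# illustrative, not part of the pipeline):
import pandas as pd


def _sketch_aggregate_predictions(frames):
    """Column-wise merge of per-model prediction frames.

    Each frame is assumed to have 'stream id' and 'sentence id' columns
    plus one score column named after its model.
    """
    indexed = [f.set_index(['stream id', 'sentence id']) for f in frames]
    agg = pd.concat(indexed, axis=1)   # align rows on the shared index
    return agg[sorted(agg.columns)]    # reorder columns for a stable layout

# Example:
# a = pd.DataFrame({'stream id': [1], 'sentence id': [0], 'model_a.0': [0.7]})
# b = pd.DataFrame({'stream id': [1], 'sentence id': [0], 'model_b.0': [0.4]})
# print(_sketch_aggregate_predictions([a, b]))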
def check_coverage(self, event, corpus, feature_set, prefix, model_events,
                   n_samples=10, **kwargs):
    feats = get_resource_manager(u'SentenceFeaturesResource')
    n_hours = 0
    n_covered = 0
    for hour in event.list_event_hours():
        feats_tsv_path = feats.get_tsv_path(event, hour)
        sal_tsv_path = self.get_tsv_path(event, hour, prefix, feature_set)
        if os.path.exists(feats_tsv_path):
            n_hours += 1
            if os.path.exists(sal_tsv_path):
                n_covered += 1

    if n_hours == 0:
        return 0
    else:
        return n_covered / float(n_hours)
def make_summary(self, event, corpus, prefix, feature_set):
    """Build an hourly update summary: filter noisy sentences, z-score the
    cosine-similarity matrix, cluster with affinity propagation, and keep
    one exemplar sentence per cluster."""
    string_res = get_resource_manager(u'SentenceStringsResource')
    lvec_res = get_resource_manager(u'SentenceLatentVectorsResource')
    tsv_path = self.get_tsv_path(event)
    updates = []
    epoch = datetime.utcfromtimestamp(0)
    for hour in event.list_event_hours():
        hp1 = hour + timedelta(hours=1)
        timestamp = str(int((hp1 - epoch).total_seconds()))

        string_df = string_res.get_dataframe(event, hour)
        lvec_df = lvec_res.get_dataframe(event, hour)
        if string_df is None or lvec_df is None:
            continue

        string_df = string_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])
        lvec_df = lvec_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])
        string_df.sort([u"stream id", u"sentence id"], inplace=True)
        lvec_df.sort([u"stream id", u"sentence id"], inplace=True)

        X = lvec_df.as_matrix()[:, 2:].astype(np.float64)
        good_rows = np.where(X.any(axis=1))[0]
        string_df = string_df.iloc[good_rows]
        lvec_df = lvec_df.iloc[good_rows]
        assert len(string_df) == len(lvec_df)

        n_sents = len(string_df)
        for i in xrange(n_sents):
            assert string_df[u'stream id'].iloc[i] == \
                lvec_df[u'stream id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                lvec_df[u'sentence id'].iloc[i]

        lvec_df.reset_index(drop=True, inplace=True)
        string_df.reset_index(drop=True, inplace=True)

        string_df.drop_duplicates(subset=['streamcorpus'], inplace=True)
        string_df['update id'] = string_df['stream id'].map(str) + "-" + \
            string_df['sentence id'].map(str)
        good_uids = set(string_df['update id'].tolist())
        lvec_df['update id'] = lvec_df['stream id'].map(str) + "-" + \
            lvec_df['sentence id'].map(str)
        lvec_df = lvec_df[lvec_df['update id'].isin(good_uids)].copy()
        #sal_df['update id'] = sal_df['stream id'].map(str) + "-" + \
        #    sal_df['sentence id'].map(str)
        #sal_df = sal_df[sal_df['update id'].isin(good_uids)].copy()

        string_df.sort([u"stream id", u"sentence id"], inplace=True)
        lvec_df.sort([u"stream id", u"sentence id"], inplace=True)
        #sal_df.sort([u"stream id", u"sentence id"], inplace=True)
        lvec_df.reset_index(drop=True, inplace=True)
        string_df.reset_index(drop=True, inplace=True)
        #sal_df.reset_index(drop=True, inplace=True)

        n_sents = len(string_df)
        for i in xrange(n_sents):
            assert string_df[u'stream id'].iloc[i] == \
                lvec_df[u'stream id'].iloc[i]
            #assert string_df[u'stream id'].iloc[i] == \
            #    sal_df[u'stream id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                lvec_df[u'sentence id'].iloc[i]
            #assert string_df[u'sentence id'].iloc[i] == \
            #    sal_df[u'sentence id'].iloc[i]

        good_rows = []
        for name, doc in string_df.groupby("stream id"):
            for rname, row in doc.iterrows():
                scstring = row["streamcorpus"]
                words = len(re.findall(r'\b[^\W\d_]+\b', scstring))
                socs = len(re.findall(
                    r'Digg|del\.icio\.us|Facebook|Kwoff|Myspace',
                    scstring))
                langs = len(re.findall(
                    r'Flash|JavaScript|CSS', scstring, re.I))

                assert lvec_df.loc[rname][u'sentence id'] == \
                    row[u'sentence id']
                assert lvec_df.loc[rname][u'stream id'] == \
                    row[u'stream id']

                if words > 9 and len(doc) < 200 \
                        and socs < 2 and langs < 2:
                    good_rows.append(rname)

        lvec_df = lvec_df.loc[good_rows]
        string_df = string_df.loc[good_rows]
        n_sents = len(string_df)
        if n_sents < 10:
            continue

        for i in xrange(n_sents):
            assert string_df[u'stream id'].iloc[i] == \
                lvec_df[u'stream id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                lvec_df[u'sentence id'].iloc[i]

        del lvec_df['update id']
        #del sal_df['update id']

        X = lvec_df.as_matrix()[:, 2:].astype(np.float64)
        A = cosine_similarity(X)
        Aupper = A[np.triu_indices_from(A, k=1)]
        Amu = np.mean(Aupper)
        Astd = np.std(Aupper)
        A = (A - Amu) / Astd
        A = A - np.max(A[np.triu_indices_from(A, k=1)])

        af = AffinityPropagation(
            preference=None, affinity='precomputed', max_iter=100,
            damping=.7, verbose=False).fit(A)
        II = np.arange(n_sents)
        if af.cluster_centers_indices_ is None:
            continue
        for cnum, cluster in enumerate(np.unique(af.labels_)):
            e = af.cluster_centers_indices_[cluster]
            cluster_size = II[cluster == af.labels_].shape[0]
            scstring = string_df.iloc[e][u'streamcorpus']
            stream_id = string_df.iloc[e][u'stream id']
            sentence_id = str(string_df.iloc[e][u'sentence id'])
            updates.append({
                "stream id": stream_id,
                "sentence id": sentence_id,
                "hour": hour,
                "timestamp": timestamp,
                "cluster size": cluster_size,
                "string": scstring})

    df = pd.DataFrame(
        updates,
        columns=["stream id", "sentence id", "hour", "timestamp",
                 "cluster size", "string"])
    with gzip.open(tsv_path, u'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False)
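# make_summary() above z-scores the pairwise cosine similarities, shifts them
# so the preferences stay low, and lets affinity propagation pick one exemplar
# sentence per cluster. A standalone sketch of just that clustering step on an
# arbitrary row matrix (function and variable names here are illustrative):
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity


def _sketch_ap_exemplars(X):
    """Cluster the rows of X and return the exemplar row indices."""
    A = cosine_similarity(X)
    upper = A[np.triu_indices_from(A, k=1)]
    A = (A - upper.mean()) / upper.std()              # z-score similarities
    A = A - np.max(A[np.triu_indices_from(A, k=1)])   # shift below zero
    af = AffinityPropagation(affinity='precomputed', max_iter=100,
                             damping=.7).fit(A)
    if af.cluster_centers_indices_ is None:           # did not converge
        return []
    return list(af.cluster_centers_indices_)

# X = np.random.RandomState(0).rand(20, 5)
# print(_sketch_ap_exemplars(X))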
def salience_predict_worker_(job_queue, result_queue, **kwargs):
    """Worker process: load one (rescaler, model) bundle per job and write
    salience predictions for every hour of the event."""
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    event = kwargs.get(u'event')
    fs = kwargs.get(u'feature_set')
    prefix = kwargs.get(u'prefix')

    feats = get_resource_manager(u'SentenceFeaturesResource')
    sp = SaliencePredictions()

    # model_paths = kwargs.get(u'model_paths')
    # model_paths.sort()
    # model_keys = []
    # key2models = {}
    # n_models = len(model_paths)
    # for i, model_path in enumerate(model_paths, 1):
    #     print i, "/", n_models
    #     if not os.path.exists(model_path):
    #         continue
    #     parent_dir, model_num = os.path.split(model_path)
    #     model_name = os.path.split(parent_dir)[-1]
    #     key = model_name + "." + model_num
    #     key2models[key] = joblib.load(model_path)
    #     model_keys.append(key)
    # n_model_paths = len(model_paths)

    fregex = fs.get_feature_regex() + "|stream id|sentence id"

    while not job_queue.empty():
        try:
            model_path = job_queue.get(block=False)
            parent_dir, model_num = os.path.split(model_path)
            model_name = os.path.split(parent_dir)[-1]
            model_key = model_name + "." + model_num
            xrescaler, yrescaler, m = joblib.load(model_path)

            for hour in event.list_event_hours():
                feat_tsv_path = feats.get_tsv_path(event, hour)
                sal_tsv_path = sp.get_tsv_path(
                    event, hour, prefix, fs, model_path)
                if not os.path.exists(feat_tsv_path):
                    continue
                if os.path.exists(sal_tsv_path):
                    continue

                with gzip.open(feat_tsv_path, u'r') as f:
                    feats_df = pd.io.parsers.read_csv(
                        f, sep='\t', quoting=3, header=0)
                feats_df = feats_df.filter(regex=fregex)
                feats_df = feats_df.sort([u'stream id', u'sentence id'])

                n_points = len(feats_df)
                sims = []
                X = []
                for i in xrange(n_points):
                    streamid = feats_df.iloc[i][u'stream id']
                    sentid = feats_df.iloc[i][u'sentence id']
                    x = feats_df.iloc[i].values[2:]
                    #print len(feats_df.columns), len(x), x
                    X.append(x)
                    sims.append({u'stream id': streamid,
                                 u'sentence id': sentid})
                X = np.array(X, dtype=np.float64)
                # print X.shape

                #for model_key, (xrescaler, yrescaler, m) in key2models.items():
                #    print model_key
                #for model_path in model_paths:
                #    parent_dir, model_num = os.path.split(model_path)
                #    model_name = os.path.split(parent_dir)[-1]
                #    model_key = model_name + "." + model_num
                #    if not os.path.exists(model_path):
                #        continue
                #    print model_path
                #    xrescaler, yrescaler, m = joblib.load(model_path)
                #    print xrescaler.mean_.shape

                Xscale = xrescaler.transform(X)
                result = m.predict(Xscale)
                #print len(result)
                ##print result[0].shape
                yp = result[0]
                for i, y in enumerate(yp):
                    sims[i][model_key] = y[0]

                sims_df = pd.DataFrame(
                    sims, columns=[u'stream id', u'sentence id', model_key])
                with gzip.open(sal_tsv_path, u'w') as f:
                    sims_df.to_csv(f, sep='\t',
                                   index=False, index_label=False)

            result_queue.put(None)
        except Queue.Empty:
            pass
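# salience_predict_worker_ follows a plain drain-the-queue protocol: ignore
# SIGINT so the parent handles interrupts, pop a job, write its outputs, and
# put None on the result queue once per finished job. A minimal sketch of
# that protocol with the real work elided (names are illustrative):
import signal
try:
    import Queue as queue   # Python 2, as used in this module
except ImportError:
    import queue            # Python 3


def _sketch_worker(job_queue, result_queue, **kwargs):
    """Drain job_queue, doing one unit of work per item."""
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    while not job_queue.empty():
        try:
            job = job_queue.get(block=False)
            # ... do the real per-job work here ...
            result_queue.put(None)   # signal one finished job
        except queue.Empty:
            pass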
def load_all_feats(event, fs, sample_size, random_seed):
    """Reservoir-sample `sample_size` feature rows across all event hours
    and pair each sampled row with its nugget-similarity row."""
    random.seed(random_seed)
    all_df = None
    features = get_resource_manager(u'SentenceFeaturesResource')
    fregex = fs.get_feature_regex() + "|stream id|sentence id"

    resevoir = None
    current_index = 0
    hours = []
    res_sims = None

    for hour in event.list_event_hours():
        path = features.get_tsv_path(event, hour)
        #print path, current_index
        if not os.path.exists(path):
            continue
        with gzip.open(path, u'r') as f:
            df = pd.io.parsers.read_csv(f, sep='\t', quoting=3, header=0)
        df = df.filter(regex=fregex)
        if resevoir is None:
            resevoir = pd.DataFrame(columns=df.columns)

        for _, row in df.iterrows():
            if current_index < sample_size:
                resevoir.loc[current_index] = row
                hours.append(hour)
            else:
                r = random.randint(0, current_index)
                if r < sample_size:
                    resevoir.loc[r] = row
                    hours[r] = hour
            current_index += 1

        # if resevoir is None:
        #     resevoir = df.iloc[range(0, sample_size)]
        #     current_index = sample_size
        #     paths = [path for i in range(0, sample_size)]

    s = get_resource_manager(u'NuggetSimilaritiesResource')
    for hour in set(hours):
        path = s.get_tsv_path(event, hour)
        with gzip.open(path, u'r') as f:
            df = pd.io.parsers.read_csv(f, sep='\t', quoting=3, header=0)
        if res_sims is None:
            res_sims = pd.DataFrame([{} for x in range(sample_size)],
                                    columns=df.columns)
        for idx, row_hour in enumerate(hours):
            if hour != row_hour:
                continue
            stream_id = resevoir.iloc[idx][u'stream id']
            sent_id = resevoir.iloc[idx][u'sentence id']
            res_sims.loc[idx] = df.loc[
                (df[u'stream id'] == stream_id) & \
                (df[u'sentence id'] == sent_id)].iloc[0]

    for i in range(sample_size):
        assert resevoir[u'sentence id'].iloc[i] == \
            res_sims[u'sentence id'].iloc[i]
        assert resevoir[u'stream id'].iloc[i] == \
            res_sims[u'stream id'].iloc[i]

    return resevoir, res_sims
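# load_all_feats() keeps a fixed-size uniform sample of feature rows while
# streaming over the event's hours: classic reservoir sampling. The same idea
# in isolation, on a plain iterable (a sketch, not the pipeline code):
import random


def _sketch_reservoir_sample(items, sample_size, random_seed=0):
    """Uniformly sample sample_size items from a stream in one pass."""
    random.seed(random_seed)
    reservoir = []
    for i, item in enumerate(items):
        if i < sample_size:
            reservoir.append(item)
        else:
            r = random.randint(0, i)   # inclusive bound, as in the code above
            if r < sample_size:
                reservoir[r] = item
    return reservoir

# print(_sketch_reservoir_sample(range(1000), 10))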
def make_summary(self, event, corpus, prefix, feature_set, cutoff):
    """Build an hourly update summary: filter noisy sentences, cluster the
    latent vectors with single-linkage HAC cut at `cutoff`, and keep the
    sentence closest to each cluster centroid."""
    string_res = get_resource_manager(u'SentenceStringsResource')
    lvec_res = get_resource_manager(u'SentenceLatentVectorsResource')
    spa = SaliencePredictionAggregator()
    tsv_path = self.get_tsv_path(event, cutoff)
    updates = []
    epoch = datetime.utcfromtimestamp(0)
    for hour in event.list_event_hours():
        hp1 = hour + timedelta(hours=1)
        timestamp = str(int((hp1 - epoch).total_seconds()))

        string_df = string_res.get_dataframe(event, hour)
        lvec_df = lvec_res.get_dataframe(event, hour)
        sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
        if string_df is None or lvec_df is None or sal_df is None:
            continue

        string_df = string_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])
        lvec_df = lvec_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])
        sal_df = sal_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])

        string_df.sort([u"stream id", u"sentence id"], inplace=True)
        lvec_df.sort([u"stream id", u"sentence id"], inplace=True)
        sal_df.sort([u"stream id", u"sentence id"], inplace=True)

        X = lvec_df.as_matrix()[:, 2:].astype(np.float64)
        good_rows = np.where(X.any(axis=1))[0]
        string_df = string_df.iloc[good_rows]
        lvec_df = lvec_df.iloc[good_rows]
        sal_df = sal_df.iloc[good_rows]
        assert len(string_df) == len(lvec_df)
        assert len(string_df) == len(sal_df)

        n_sents = len(string_df)
        for i in xrange(n_sents):
            assert string_df[u'stream id'].iloc[i] == \
                lvec_df[u'stream id'].iloc[i]
            assert string_df[u'stream id'].iloc[i] == \
                sal_df[u'stream id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                lvec_df[u'sentence id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                sal_df[u'sentence id'].iloc[i]

        lvec_df.reset_index(drop=True, inplace=True)
        string_df.reset_index(drop=True, inplace=True)
        sal_df.reset_index(drop=True, inplace=True)

        good_rows = []
        for name, doc in string_df.groupby("stream id"):
            for rname, row in doc.iterrows():
                scstring = row["streamcorpus"]
                words = len(re.findall(r'\b[^\W\d_]+\b', scstring))
                socs = len(re.findall(
                    r'Digg|del\.icio\.us|Facebook|Kwoff|Myspace',
                    scstring))
                langs = len(re.findall(
                    r'Flash|JavaScript|CSS', scstring, re.I))

                assert lvec_df.loc[rname][u'sentence id'] == \
                    row[u'sentence id']
                assert lvec_df.loc[rname][u'stream id'] == \
                    row[u'stream id']
                assert sal_df.loc[rname][u'sentence id'] == \
                    row[u'sentence id']
                assert sal_df.loc[rname][u'stream id'] == \
                    row[u'stream id']

                if words > 9 and len(doc) < 200 \
                        and socs < 2 and langs < 2:
                    good_rows.append(rname)

        lvec_df = lvec_df.loc[good_rows]
        string_df = string_df.loc[good_rows]
        sal_df = sal_df.loc[good_rows]
        n_sents = len(string_df)
        if n_sents < 10:
            continue

        for i in xrange(n_sents):
            assert string_df[u'stream id'].iloc[i] == \
                lvec_df[u'stream id'].iloc[i]
            assert string_df[u'stream id'].iloc[i] == \
                sal_df[u'stream id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                lvec_df[u'sentence id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                sal_df[u'sentence id'].iloc[i]

        X = lvec_df.as_matrix()[:, 2:].astype(np.float64)
        S = sal_df.as_matrix()[:, 2:].astype(np.float64)
        s = np.mean(S, axis=1)
        #Xn = Normalizer().fit_transform(X)
        z = fastcluster.linkage(X, method='single', metric='euclidean',
                                preserve_input=True)
        clusters = hac.fcluster(z, cutoff, 'distance')
        II = np.arange(n_sents)
        #print set(clusters)
        for cluster_id, cluster in enumerate(set(clusters)):
            # print cluster
            # print (clusters == cluster).shape
            # print II.shape
            ii = II[clusters == cluster]
            #print ii.shape
            C = X[clusters == cluster, :]
            u = np.mean(C, axis=0)
            dist_2 = np.sum((C - u)**2, axis=1)
            cidx = np.argmin(dist_2)
            #cidx = np.argmax(cosine_similarity(C, u))
            e = ii[cidx]
            # #Cs = s[clusters == cluster]
            # e = ii[np.argmax(Cs)],
            cluster_size = C.shape[0]
            scstring = string_df.iloc[e][u'streamcorpus']
            stream_id = string_df.iloc[e][u'stream id']
            sentence_id = str(string_df.iloc[e][u'sentence id'])
            updates.append({
                "stream id": stream_id,
                "sentence id": sentence_id,
                "hour": hour,
                "timestamp": timestamp,
                "cluster size": cluster_size,
                "string": scstring})

    df = pd.DataFrame(
        updates,
        columns=["stream id", "sentence id", "hour", "timestamp",
                 "cluster size", "string"])
    with gzip.open(tsv_path, u'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False)
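# The HAC variant of make_summary() cuts a single-linkage dendrogram at
# `cutoff` and represents each flat cluster by the member nearest the cluster
# mean. A self-contained sketch of that selection step (fastcluster and
# scipy.cluster.hierarchy are assumed importable, as in the code above):
import numpy as np
import fastcluster
import scipy.cluster.hierarchy as hac


def _sketch_hac_exemplars(X, cutoff):
    """Return one representative row index per flat cluster of X."""
    z = fastcluster.linkage(X, method='single', metric='euclidean',
                            preserve_input=True)
    clusters = hac.fcluster(z, cutoff, 'distance')
    indices = np.arange(X.shape[0])
    exemplars = []
    for cluster in set(clusters):
        members = indices[clusters == cluster]
        C = X[clusters == cluster, :]
        centroid = C.mean(axis=0)
        nearest = np.argmin(np.sum((C - centroid) ** 2, axis=1))
        exemplars.append(members[nearest])
    return exemplars

# X = np.random.RandomState(0).rand(30, 5)
# print(_sketch_hac_exemplars(X, cutoff=0.5))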
def make(self, event, prefix, feature_set, sal_cutoff, sim_cutoff):
    """Write a TREC-style run file: keep sentences whose z-scored mean
    salience exceeds sal_cutoff and whose maximum cosine similarity to
    previously selected updates stays below sim_cutoff."""
    tsv_path = self.get_tsv_path(event, sal_cutoff, sim_cutoff)
    lvecs = SentenceLatentVectorsResource()
    string_res = get_resource_manager(u'SentenceStringsResource')
    spa = SaliencePredictionAggregator()

    #cluster_df = hac.get_dataframe(event, dist_cutoff)
    #for _, row in cluster_df.iterrows():
    #    print row['hour'], datetime.utcfromtimestamp(row['timestamp'])

    epoch = datetime.utcfromtimestamp(0)
    updates = []
    Xcache = None
    #timestamps = sorted(list(cluster_df['timestamp'].unique()))
    hours = event.list_event_hours()
    for hour in hours:
        #hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
        hp1 = hour + timedelta(hours=1)
        timestamp = str(int((hp1 - epoch).total_seconds()))

        lvec_df = lvecs.get_dataframe(event, hour)
        sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
        str_df = string_res.get_dataframe(event, hour)
        if lvec_df is None or sal_df is None or str_df is None:
            continue

        str_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
        lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
        sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

        str_df.sort(['stream id', 'sentence id'], inplace=True)
        str_df.reset_index(drop=True, inplace=True)
        lvec_df.sort(['stream id', 'sentence id'], inplace=True)
        lvec_df.reset_index(drop=True, inplace=True)
        sal_df.sort(['stream id', 'sentence id'], inplace=True)
        sal_df.reset_index(drop=True, inplace=True)

        str_df = str_df.join(
            str_df.groupby('stream id')['sentence id'].agg('count'),
            on='stream id', rsuffix='_r').rename(
                columns={"sentence id_r": "document count"})
        good_sents = str_df.apply(
            lambda x: passes_simple_filter(
                x['streamcorpus'], x['document count']), axis=1)
        #good_sents = good_sents.reset_index()

        str_df = str_df[good_sents]
        lvec_df = lvec_df[good_sents]
        sal_df = sal_df[good_sents]

        n_rows = len(sal_df)
        for i in xrange(n_rows):
            assert sal_df['stream id'].iloc[i] == \
                lvec_df['stream id'].iloc[i]
            assert sal_df['sentence id'].iloc[i] == \
                lvec_df['sentence id'].iloc[i]
            assert str_df['stream id'].iloc[i] == \
                lvec_df['stream id'].iloc[i]
            assert str_df['sentence id'].iloc[i] == \
                lvec_df['sentence id'].iloc[i]
        if n_rows == 0:
            continue

        Xsal = sal_df.as_matrix()[:, 2:].astype(np.float64).mean(axis=1)
        mu_sal = np.mean(Xsal)
        sig_sal = np.std(Xsal)
        Xsal_norm = (Xsal - mu_sal) / sig_sal

        lvec_df = lvec_df[Xsal_norm > sal_cutoff]
        str_df = str_df[Xsal_norm > sal_cutoff]
        str_df = str_df.set_index(['stream id', 'sentence id'])
        lvec_df['salience'] = Xsal_norm[Xsal_norm > sal_cutoff]
        lvec_df.sort(['salience'], inplace=True, ascending=False)

        if Xcache is None:
            Xlvecs = lvec_df.as_matrix()[:, 2:-2].astype(np.float64)
            K = cosine_similarity(Xlvecs)
            K_ma = np.ma.array(K, mask=True)

            good_indexes = []
            for i, (_, row) in enumerate(lvec_df.iterrows()):
                sim = K_ma[i, :].max(fill_value=0.0)
                if not isinstance(sim, np.float64):
                    sim = 0
                if sim < sim_cutoff:
                    up_str = str_df.loc[
                        row['stream id'],
                        row['sentence id']]['streamcorpus']
                    updates.append({
                        'query id': event.query_num,
                        'system id': 'cunlp',
                        'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                            sal_cutoff, sim_cutoff),
                        'stream id': row['stream id'],
                        'sentence id': row['sentence id'],
                        'timestamp': timestamp,
                        'conf': row['salience'],
                        'string': up_str})
                    K_ma.mask[:, i] = False
                    good_indexes.append(i)
            Xcache = Xlvecs[good_indexes].copy()
        else:
            xtmp = lvec_df.as_matrix()[:, 2:-2].astype(np.float64)
            Xlvecs = np.vstack([Xcache, xtmp])
            start_index = Xcache.shape[0]
            K = cosine_similarity(Xlvecs)
            K_ma = np.ma.array(K, mask=True)
            K_ma.mask.T[np.arange(0, start_index)] = False

            good_indexes = []
            for i, (_, row) in enumerate(lvec_df.iterrows(), start_index):
                #print i
                sim = K_ma[i, :].max(fill_value=0.0)
                #print sim
                if sim < sim_cutoff:
                    up_str = str_df.loc[
                        row['stream id'],
                        row['sentence id']]['streamcorpus']
                    updates.append({
                        'query id': event.query_num,
                        'system id': 'cunlp',
                        'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                            sal_cutoff, sim_cutoff),
                        'stream id': row['stream id'],
                        'sentence id': row['sentence id'],
                        'timestamp': timestamp,
                        'conf': row['salience'],
                        'string': up_str})
                    K_ma.mask[:, i] = False
                    good_indexes.append(i)
            if len(good_indexes) > 0:
                Xcache = np.vstack([Xcache, Xlvecs[good_indexes].copy()])
            #Xcache = Xlvecs[good_indexes].copy()

    if len(updates) == 0:
        updates.append({
            'query id': event.query_num,
            'system id': 'cunlp',
            'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                sal_cutoff, sim_cutoff),
            'stream id': 1111,
            'sentence id': 1,
            'timestamp': timestamp,
            'conf': 0,
            'string': 'place holder'})

    df = pd.DataFrame(
        updates,
        columns=["query id", "system id", "run id", "stream id",
                 "sentence id", "timestamp", "conf", "string"])
    with open(tsv_path, u'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False,
                  header=False)
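# make() above z-scores the mean model salience, keeps sentences above
# sal_cutoff, and emits a sentence only if its cosine similarity to every
# previously emitted sentence stays below sim_cutoff (the masked array keeps
# earlier hours' picks in play). A compact sketch of that select-if-novel
# loop without the masked-array bookkeeping (illustrative only):
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def _sketch_salience_novelty_filter(X, salience, sal_cutoff, sim_cutoff):
    """Greedy selection: most salient first, skip near-duplicates."""
    z = (salience - salience.mean()) / salience.std()
    order = np.argsort(-z)                 # most salient first
    selected = []
    for i in order:
        if z[i] <= sal_cutoff:
            continue
        if selected:
            sims = cosine_similarity(X[i:i + 1], X[selected])
            if sims.max() >= sim_cutoff:
                continue                   # too close to an earlier pick
        selected.append(i)
    return selected

# X = np.random.RandomState(0).rand(50, 10)
# salience = np.random.RandomState(1).rand(50)
# print(_sketch_salience_novelty_filter(X, salience, 0.0, 0.8))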