from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# NOTE: this module targets Python 2 and a legacy pandas API
# (DataFrame.sort, DataFrame.as_matrix). Project-local names are assumed to
# be importable from the surrounding package: SentenceLatentVectorsResource,
# APSummarizer, HACSummarizer, SaliencePredictionAggregator,
# get_resource_manager, passes_simple_filter.


def make(self, event, min_cluster_size=2, sim_threshold=.2264):
    tsv_path = self.get_tsv_path(event, sim_threshold)
    lvecs = SentenceLatentVectorsResource()
    ap = APSummarizer()
    cluster_df = ap.get_dataframe(event)

    updates = []
    Xcache = None
    timestamps = sorted(list(cluster_df['timestamp'].unique()))
    for timestamp in timestamps:
        # Sentence vectors are stored under the hour preceding the timestamp.
        hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
        lvec_df = lvecs.get_dataframe(event, hour)
        lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

        clusters = cluster_df[cluster_df['timestamp'] == timestamp].copy()
        clusters.sort(['stream id', 'sentence id'], inplace=True)

        for _, row in clusters.iterrows():
            if row['cluster size'] < min_cluster_size:
                continue
            vec = lvec_df.loc[
                (lvec_df['stream id'] == row['stream id']) &
                (lvec_df['sentence id'] == row['sentence id'])
            ].as_matrix()[:, 2:].astype(np.float64)
            # Greedy novelty filter: emit the cluster exemplar only if it is
            # sufficiently dissimilar from every update emitted so far.
            if Xcache is None:
                Xcache = vec
            elif np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
                continue
            else:
                Xcache = np.vstack((Xcache, vec))
            updates.append({
                'query id': event.query_id[5:],
                'system id': 'cunlp',
                'run id': 'ap-sim_{}'.format(sim_threshold),
                'stream id': row['stream id'],
                'sentence id': row['sentence id'],
                'timestamp': timestamp,
                'conf': 1.0,
                'string': row['string'],
            })

    df = pd.DataFrame(updates, columns=[
        "query id", "system id", "run id", "stream id",
        "sentence id", "timestamp", "conf", "string"])
    with open(tsv_path, 'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False, header=False)
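# ---------------------------------------------------------------------------
# The `make` methods in this module all share the same greedy redundancy
# filter: keep a cache of the latent vectors of every update emitted so far,
# and emit a new sentence only when its maximum cosine similarity to that
# cache falls below a threshold. The helper below is a minimal,
# self-contained sketch of that pattern for reference; the name
# `greedy_novelty_filter` and its signature are illustrative only and are
# not used elsewhere in the pipeline.

def greedy_novelty_filter(vectors, sim_threshold=.2264):
    """Return indices of rows in `vectors` that survive the filter.

    `vectors` is an (n_sentences, n_dims) array of latent sentence vectors
    ordered by arrival time; `sim_threshold` mirrors the default used above.
    """
    selected = []
    Xcache = None
    for i, vec in enumerate(vectors):
        vec = vec.reshape(1, -1)
        if Xcache is None:
            Xcache = vec
        elif np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
            continue  # too similar to an already emitted update
        else:
            Xcache = np.vstack((Xcache, vec))
        selected.append(i)
    return selected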
def make(self, event, prefix, feature_set, min_cluster_size=2,
         sim_threshold=.2264, dist_cutoff=1.35):
    # `prefix` and `feature_set` are only needed by a disabled salience
    # re-ranking step (via SaliencePredictionAggregator) and are currently
    # unused.
    tsv_path = self.get_tsv_path(event, dist_cutoff, sim_threshold)
    lvecs = SentenceLatentVectorsResource()
    hac = HACSummarizer()
    cluster_df = hac.get_dataframe(event, dist_cutoff)

    updates = []
    Xcache = None
    timestamps = sorted(list(cluster_df['timestamp'].unique()))
    for timestamp in timestamps:
        hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
        lvec_df = lvecs.get_dataframe(event, hour)
        lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

        clusters = cluster_df[cluster_df['timestamp'] == timestamp].copy()
        clusters.sort(['stream id', 'sentence id'], inplace=True)

        for _, row in clusters.iterrows():
            if row['cluster size'] < min_cluster_size:
                continue
            vec = lvec_df.loc[
                (lvec_df['stream id'] == row['stream id']) &
                (lvec_df['sentence id'] == row['sentence id'])
            ].as_matrix()[:, 2:].astype(np.float64)
            # Same greedy novelty filter as the AP variant above.
            if Xcache is None:
                Xcache = vec
            elif np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
                continue
            else:
                Xcache = np.vstack((Xcache, vec))
            updates.append({
                'query id': event.query_id[5:],
                'system id': 'cunlp',
                'run id': 'hac-dist_{}-sim_{}'.format(
                    dist_cutoff, sim_threshold),
                'stream id': row['stream id'],
                'sentence id': row['sentence id'],
                'timestamp': timestamp,
                'conf': 1.0,
                'string': row['string'],
            })

    df = pd.DataFrame(updates, columns=[
        "query id", "system id", "run id", "stream id",
        "sentence id", "timestamp", "conf", "string"])
    with open(tsv_path, 'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False, header=False)
def make(self, event, prefix, feature_set, sal_cutoff, sim_cutoff):
    tsv_path = self.get_tsv_path(event, sal_cutoff, sim_cutoff)
    lvecs = SentenceLatentVectorsResource()
    string_res = get_resource_manager(u'SentenceStringsResource')
    spa = SaliencePredictionAggregator()

    epoch = datetime.utcfromtimestamp(0)
    updates = []
    Xcache = None
    hours = event.list_event_hours()
    for hour in hours:
        # Updates are stamped with the end of the hour they came from.
        hp1 = hour + timedelta(hours=1)
        timestamp = str(int((hp1 - epoch).total_seconds()))

        lvec_df = lvecs.get_dataframe(event, hour)
        sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
        str_df = string_res.get_dataframe(event, hour)
        if lvec_df is None or sal_df is None or str_df is None:
            continue

        str_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
        lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
        sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

        str_df.sort(['stream id', 'sentence id'], inplace=True)
        str_df.reset_index(drop=True, inplace=True)
        lvec_df.sort(['stream id', 'sentence id'], inplace=True)
        lvec_df.reset_index(drop=True, inplace=True)
        sal_df.sort(['stream id', 'sentence id'], inplace=True)
        sal_df.reset_index(drop=True, inplace=True)

        # Attach per-document sentence counts for the simple sentence filter.
        str_df = str_df.join(
            str_df.groupby('stream id')['sentence id'].agg('count'),
            on='stream id', rsuffix='_r').rename(
                columns={"sentence id_r": "document count"})
        good_sents = str_df.apply(
            lambda x: passes_simple_filter(
                x['streamcorpus'], x['document count']),
            axis=1)
        str_df = str_df[good_sents]
        lvec_df = lvec_df[good_sents]
        sal_df = sal_df[good_sents]

        # The three frames must stay row-aligned on (stream id, sentence id).
        n_rows = len(sal_df)
        for i in xrange(n_rows):
            assert sal_df['stream id'].iloc[i] == \
                lvec_df['stream id'].iloc[i]
            assert sal_df['sentence id'].iloc[i] == \
                lvec_df['sentence id'].iloc[i]
            assert str_df['stream id'].iloc[i] == \
                lvec_df['stream id'].iloc[i]
            assert str_df['sentence id'].iloc[i] == \
                lvec_df['sentence id'].iloc[i]
        if n_rows == 0:
            continue

        # z-score the mean salience predictions and keep only sentences
        # above the cutoff, ranked most salient first.
        Xsal = sal_df.as_matrix()[:, 2:].astype(np.float64).mean(axis=1)
        mu_sal = np.mean(Xsal)
        sig_sal = np.std(Xsal)
        Xsal_norm = (Xsal - mu_sal) / sig_sal
        lvec_df = lvec_df[Xsal_norm > sal_cutoff]
        str_df = str_df[Xsal_norm > sal_cutoff]
        str_df = str_df.set_index(['stream id', 'sentence id'])
        lvec_df['salience'] = Xsal_norm[Xsal_norm > sal_cutoff]
        lvec_df.sort(['salience'], inplace=True, ascending=False)

        if Xcache is None:
            # First batch: masked similarity kernel, so each candidate is
            # compared only against candidates accepted before it.
            Xlvecs = lvec_df.as_matrix()[:, 2:-2].astype(np.float64)
            K = cosine_similarity(Xlvecs)
            K_ma = np.ma.array(K, mask=True)
            good_indexes = []
            for i, (_, row) in enumerate(lvec_df.iterrows()):
                sim = K_ma[i, :].max(fill_value=0.0)
                if not isinstance(sim, np.float64):
                    sim = 0  # row fully masked: nothing accepted yet
                if sim < sim_cutoff:
                    up_str = str_df.loc[
                        row['stream id'],
                        row['sentence id']]['streamcorpus']
                    updates.append({
                        'query id': event.query_num,
                        'system id': 'cunlp',
                        'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                            sal_cutoff, sim_cutoff),
                        'stream id': row['stream id'],
                        'sentence id': row['sentence id'],
                        'timestamp': timestamp,
                        'conf': row['salience'],
                        'string': up_str,
                    })
                    # Unmask this column so later rows are compared to it.
                    K_ma.mask[:, i] = False
                    good_indexes.append(i)
            Xcache = Xlvecs[good_indexes].copy()
        else:
            # Later batches: stack the cache on top of the new vectors and
            # unmask the cached columns so new rows see prior selections.
            xtmp = lvec_df.as_matrix()[:, 2:-2].astype(np.float64)
            Xlvecs = np.vstack([Xcache, xtmp])
            start_index = Xcache.shape[0]
            K = cosine_similarity(Xlvecs)
            K_ma = np.ma.array(K, mask=True)
            K_ma.mask.T[np.arange(0, start_index)] = False
            good_indexes = []
            for i, (_, row) in enumerate(lvec_df.iterrows(), start_index):
                sim = K_ma[i, :].max(fill_value=0.0)
                if sim < sim_cutoff:
                    up_str = str_df.loc[
                        row['stream id'],
                        row['sentence id']]['streamcorpus']
                    updates.append({
                        'query id': event.query_num,
                        'system id': 'cunlp',
                        'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                            sal_cutoff, sim_cutoff),
                        'stream id': row['stream id'],
                        'sentence id': row['sentence id'],
                        'timestamp': timestamp,
                        'conf': row['salience'],
                        'string': up_str,
                    })
                    K_ma.mask[:, i] = False
                    good_indexes.append(i)
            if len(good_indexes) > 0:
                Xcache = np.vstack([Xcache, Xlvecs[good_indexes].copy()])

    if len(updates) == 0:
        # Emit a placeholder row so the run file is never empty.
        updates.append({
            'query id': event.query_num,
            'system id': 'cunlp',
            'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                sal_cutoff, sim_cutoff),
            'stream id': 1111,
            'sentence id': 1,
            'timestamp': timestamp,
            'conf': 0,
            'string': 'place holder',
        })

    df = pd.DataFrame(updates, columns=[
        "query id", "system id", "run id", "stream id",
        "sentence id", "timestamp", "conf", "string"])
    with open(tsv_path, 'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False, header=False)
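# ---------------------------------------------------------------------------
# The salience-ranked filter above vectorizes the greedy novelty test with a
# masked similarity kernel: compute cosine_similarity over the cached plus
# new vectors, mask everything, then unmask a column only when its row is
# accepted, so each candidate is compared just against prior selections. A
# minimal, self-contained sketch of that masking trick follows; the name
# `masked_novelty_selection` is illustrative and not part of this pipeline.

def masked_novelty_selection(Xcache, Xnew, sim_cutoff):
    """Return indices (into the stacked matrix) of rows of `Xnew` that are
    sufficiently dissimilar from `Xcache` rows and earlier accepted rows.

    `Xcache` may be None on the first batch, matching the code above.
    """
    if Xcache is None:
        Xall, start = Xnew, 0
    else:
        Xall, start = np.vstack([Xcache, Xnew]), Xcache.shape[0]
    K_ma = np.ma.array(cosine_similarity(Xall), mask=True)
    # Unmask the columns of already cached rows so new rows see them.
    K_ma.mask.T[np.arange(start)] = False
    good = []
    for i in xrange(start, Xall.shape[0]):
        sim = K_ma[i, :].max(fill_value=0.0)
        if not isinstance(sim, np.float64):
            sim = 0.0  # row fully masked: nothing accepted yet
        if sim < sim_cutoff:
            K_ma.mask[:, i] = False  # future rows compare against this one
            good.append(i)
    return good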