# Standard library and third-party imports needed by the methods below.
import re
import gzip
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import fastcluster
import scipy.cluster.hierarchy as hac
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AffinityPropagation

# Project-local helpers (SentenceLatentVectorsResource, SaliencePredictionAggregator,
# APSalienceSummarizer, get_resource_manager, passes_simple_filter) are assumed to be
# importable from the surrounding package.


def make(self, event, prefix, feature_set, min_cluster_size=2,
         sim_threshold=.2264, center_threshold=1.0):
    tsv_path = self.get_tsv_path(event, prefix, feature_set,
                                 center_threshold, sim_threshold)
    lvecs = SentenceLatentVectorsResource()
    spa = SaliencePredictionAggregator()
    apsal = APSalienceSummarizer()
    cluster_df = apsal.get_dataframe(event, prefix, feature_set)
    #for _, row in cluster_df.iterrows():
    #    print row['hour'], datetime.utcfromtimestamp(row['timestamp'])

    updates = []
    Xcache = None
    timestamps = sorted(list(cluster_df['timestamp'].unique()))
    for timestamp in timestamps:
        hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
        lvec_df = lvecs.get_dataframe(event, hour)
        lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
        sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
        sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

        clusters = cluster_df[cluster_df['timestamp'] == timestamp].copy()
        clusters.sort(['stream id', 'sentence id'], inplace=True)

        # Attach the mean salience prediction to every cluster exemplar.
        salience = []
        for _, row in clusters.iterrows():
            sal_pred = sal_df.loc[
                (sal_df['stream id'] == row['stream id']) & \
                (sal_df['sentence id'] == row['sentence id'])
                ].as_matrix()[:, 2:].astype(np.float64).mean()
            salience.append(sal_pred)
        clusters['salience'] = salience
        #clusters.sort(['salie'], inplace=True)
        sal_mean = np.mean(salience)
        sal_std = np.std(salience)
        #print clusters

        for _, row in clusters.iterrows():
            if row['cluster size'] < min_cluster_size:
                continue
            vec = lvec_df.loc[
                (lvec_df['stream id'] == row['stream id']) & \
                (lvec_df['sentence id'] == row['sentence id'])
                ].as_matrix()[:, 2:].astype(np.float64)
            # Only keep exemplars whose salience z-score clears the threshold.
            sal_norm = (row['salience'] - sal_mean) / sal_std
            if sal_norm < center_threshold:
                continue
            if Xcache is None:
                Xcache = vec
            else:
                # Skip exemplars too similar to anything already selected.
                if np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
                    continue
                else:
                    Xcache = np.vstack((Xcache, vec))
            updates.append({
                'query id': event.query_id[5:],
                'system id': 'cunlp',
                'run id': 'apsal-time-ranked-sal_{}-sim_{}'.format(
                    center_threshold, sim_threshold),
                'stream id': row['stream id'],
                'sentence id': row['sentence id'],
                'timestamp': timestamp,
                'conf': row['salience'],
                'string': row['string']
            })

    print "Writing", tsv_path, "for", center_threshold, sim_threshold
    df = pd.DataFrame(updates,
                      columns=["query id", "system id", "run id", "stream id",
                               "sentence id", "timestamp", "conf", "string"])
    with open(tsv_path, u'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False, header=False)
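
# A minimal, self-contained sketch of the greedy redundancy filter used in
# make() above: candidates are visited in rank order, and a candidate is kept
# only while its maximum cosine similarity to everything already kept stays
# below the threshold. The helper name _greedy_novelty_filter is illustrative
# only (not part of the pipeline); it reuses the module-level numpy and
# scikit-learn imports.
def _greedy_novelty_filter(vectors, sim_threshold=.2264):
    """Return the indexes of rows in `vectors` that pass the novelty check."""
    kept = []
    Xcache = None
    for i, vec in enumerate(vectors):
        vec = np.asarray(vec, dtype=np.float64).reshape(1, -1)
        if Xcache is None:
            # The first candidate is always kept.
            Xcache = vec
        elif np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
            continue
        else:
            Xcache = np.vstack((Xcache, vec))
        kept.append(i)
    return kept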
def make_summary(self, event, corpus, prefix, feature_set, cutoff):
    string_res = get_resource_manager(u'SentenceStringsResource')
    lvec_res = get_resource_manager(u'SentenceLatentVectorsResource')
    spa = SaliencePredictionAggregator()
    tsv_path = self.get_tsv_path(event, cutoff)

    updates = []
    epoch = datetime.utcfromtimestamp(0)
    for hour in event.list_event_hours():
        hp1 = hour + timedelta(hours=1)
        timestamp = str(int((hp1 - epoch).total_seconds()))

        string_df = string_res.get_dataframe(event, hour)
        lvec_df = lvec_res.get_dataframe(event, hour)
        sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
        if string_df is None or lvec_df is None or sal_df is None:
            continue

        string_df = string_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])
        lvec_df = lvec_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])
        sal_df = sal_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])

        string_df.sort([u"stream id", u"sentence id"], inplace=True)
        lvec_df.sort([u"stream id", u"sentence id"], inplace=True)
        sal_df.sort([u"stream id", u"sentence id"], inplace=True)

        X = lvec_df.as_matrix()[:, 2:].astype(np.float64)
        good_rows = np.where(X.any(axis=1))[0]
        string_df = string_df.iloc[good_rows]
        lvec_df = lvec_df.iloc[good_rows]
        sal_df = sal_df.iloc[good_rows]
        assert len(string_df) == len(lvec_df)
        assert len(string_df) == len(sal_df)

        n_sents = len(string_df)
        for i in xrange(n_sents):
            assert string_df[u'stream id'].iloc[i] == \
                lvec_df[u'stream id'].iloc[i]
            assert string_df[u'stream id'].iloc[i] == \
                sal_df[u'stream id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                lvec_df[u'sentence id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                sal_df[u'sentence id'].iloc[i]

        lvec_df.reset_index(drop=True, inplace=True)
        string_df.reset_index(drop=True, inplace=True)
        sal_df.reset_index(drop=True, inplace=True)

        good_rows = []
        for name, doc in string_df.groupby("stream id"):
            for rname, row in doc.iterrows():
                scstring = row["streamcorpus"]
                words = len(re.findall(r'\b[^\W\d_]+\b', scstring))
                socs = len(re.findall(
                    r'Digg|del\.icio\.us|Facebook|Kwoff|Myspace', scstring))
                langs = len(re.findall(
                    r'Flash|JavaScript|CSS', scstring, re.I))

                assert lvec_df.loc[rname][u'sentence id'] == \
                    row[u'sentence id']
                assert lvec_df.loc[rname][u'stream id'] == \
                    row[u'stream id']
                assert sal_df.loc[rname][u'sentence id'] == \
                    row[u'sentence id']
                assert sal_df.loc[rname][u'stream id'] == \
                    row[u'stream id']
                if words > 9 and len(doc) < 200 \
                        and socs < 2 and langs < 2:
                    good_rows.append(rname)

        lvec_df = lvec_df.loc[good_rows]
        string_df = string_df.loc[good_rows]
        sal_df = sal_df.loc[good_rows]
        n_sents = len(string_df)
        if n_sents < 10:
            continue

        for i in xrange(n_sents):
            assert string_df[u'stream id'].iloc[i] == \
                lvec_df[u'stream id'].iloc[i]
            assert string_df[u'stream id'].iloc[i] == \
                sal_df[u'stream id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                lvec_df[u'sentence id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                sal_df[u'sentence id'].iloc[i]

        X = lvec_df.as_matrix()[:, 2:].astype(np.float64)
        S = sal_df.as_matrix()[:, 2:].astype(np.float64)
        s = np.mean(S, axis=1)
        #Xn = Normalizer().fit_transform(X)

        z = fastcluster.linkage(X, method='single', metric='euclidean',
                                preserve_input=True)
        clusters = hac.fcluster(z, cutoff, 'distance')
        II = np.arange(n_sents)
        #print set(clusters)
        for cluster_id, cluster in enumerate(set(clusters)):
            # print cluster
            # print (clusters == cluster).shape
            # print II.shape
            ii = II[clusters == cluster]
            #print ii.shape
            C = X[clusters == cluster, :]
            u = np.mean(C, axis=0)
            dist_2 = np.sum((C - u)**2, axis=1)
            cidx = np.argmin(dist_2)
            #cidx = np.argmax(cosine_similarity(C, u))
            e = ii[cidx]
            # #Cs = s[clusters == cluster]
            # e = ii[np.argmax(Cs)],
            cluster_size = C.shape[0]

            scstring = string_df.iloc[e][u'streamcorpus']
            stream_id = string_df.iloc[e][u'stream id']
            sentence_id = str(string_df.iloc[e][u'sentence id'])
            updates.append({
                "stream id": stream_id,
                "sentence id": sentence_id,
                "hour": hour,
                "timestamp": timestamp,
                "cluster size": cluster_size,
                "string": scstring
            })

    df = pd.DataFrame(updates,
                      columns=["stream id", "sentence id", "hour",
                               "timestamp", "cluster size", "string"])
    with gzip.open(tsv_path, u'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False)
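
# A minimal sketch of the exemplar-selection step in make_summary() above:
# single-linkage clustering with a distance cutoff, then for each flat cluster
# the member closest to the cluster mean is taken as its exemplar. The helper
# name _hac_exemplars is illustrative only; it reuses the module-level
# numpy, fastcluster, and scipy.cluster.hierarchy imports.
def _hac_exemplars(X, cutoff):
    """Return (exemplar_index, cluster_size) pairs for each flat cluster."""
    z = fastcluster.linkage(X, method='single', metric='euclidean',
                            preserve_input=True)
    clusters = hac.fcluster(z, cutoff, 'distance')
    II = np.arange(X.shape[0])
    exemplars = []
    for cluster in set(clusters):
        ii = II[clusters == cluster]
        C = X[clusters == cluster, :]
        u = np.mean(C, axis=0)
        # Exemplar = member with smallest squared distance to the centroid.
        cidx = np.argmin(np.sum((C - u) ** 2, axis=1))
        exemplars.append((ii[cidx], C.shape[0]))
    return exemplars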
def make(self, event, prefix, feature_set, sal_cutoff, sim_cutoff):
    tsv_path = self.get_tsv_path(event, sal_cutoff, sim_cutoff)
    lvecs = SentenceLatentVectorsResource()
    string_res = get_resource_manager(u'SentenceStringsResource')
    spa = SaliencePredictionAggregator()
    #cluster_df = hac.get_dataframe(event, dist_cutoff)
    #for _, row in cluster_df.iterrows():
    #    print row['hour'], datetime.utcfromtimestamp(row['timestamp'])
    epoch = datetime.utcfromtimestamp(0)

    updates = []
    Xcache = None
    #timestamps = sorted(list(cluster_df['timestamp'].unique()))
    hours = event.list_event_hours()
    for hour in hours:
        #hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
        hp1 = hour + timedelta(hours=1)
        timestamp = str(int((hp1 - epoch).total_seconds()))

        lvec_df = lvecs.get_dataframe(event, hour)
        sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
        str_df = string_res.get_dataframe(event, hour)
        if lvec_df is None or sal_df is None or str_df is None:
            continue

        str_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
        lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
        sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

        str_df.sort(['stream id', 'sentence id'], inplace=True)
        str_df.reset_index(drop=True, inplace=True)
        lvec_df.sort(['stream id', 'sentence id'], inplace=True)
        lvec_df.reset_index(drop=True, inplace=True)
        sal_df.sort(['stream id', 'sentence id'], inplace=True)
        sal_df.reset_index(drop=True, inplace=True)

        # Count sentences per document and apply the simple sentence filter.
        str_df = str_df.join(
            str_df.groupby('stream id')['sentence id'].agg('count'),
            on='stream id', rsuffix='_r').rename(
                columns={"sentence id_r": "document count"})
        good_sents = str_df.apply(
            lambda x: passes_simple_filter(
                x['streamcorpus'], x['document count']), axis=1)
        #good_sents = good_sents.reset_index()

        str_df = str_df[good_sents]
        lvec_df = lvec_df[good_sents]
        sal_df = sal_df[good_sents]

        n_rows = len(sal_df)
        for i in xrange(n_rows):
            assert sal_df['stream id'].iloc[i] == \
                lvec_df['stream id'].iloc[i]
            assert sal_df['sentence id'].iloc[i] == \
                lvec_df['sentence id'].iloc[i]
            assert str_df['stream id'].iloc[i] == \
                lvec_df['stream id'].iloc[i]
            assert str_df['sentence id'].iloc[i] == \
                lvec_df['sentence id'].iloc[i]
        if n_rows == 0:
            continue

        # Standardize the mean salience predictions and keep sentences above
        # the salience cutoff, ranked from most to least salient.
        Xsal = sal_df.as_matrix()[:, 2:].astype(np.float64).mean(axis=1)
        mu_sal = np.mean(Xsal)
        sig_sal = np.std(Xsal)
        Xsal_norm = (Xsal - mu_sal) / sig_sal

        lvec_df = lvec_df[Xsal_norm > sal_cutoff]
        str_df = str_df[Xsal_norm > sal_cutoff]
        str_df = str_df.set_index(['stream id', 'sentence id'])
        lvec_df['salience'] = Xsal_norm[Xsal_norm > sal_cutoff]
        lvec_df.sort(['salience'], inplace=True, ascending=False)

        if Xcache is None:
            Xlvecs = lvec_df.as_matrix()[:, 2:-2].astype(np.float64)
            K = cosine_similarity(Xlvecs)
            K_ma = np.ma.array(K, mask=True)

            good_indexes = []
            for i, (_, row) in enumerate(lvec_df.iterrows()):
                sim = K_ma[i, :].max(fill_value=0.0)
                if not isinstance(sim, np.float64):
                    sim = 0
                if sim < sim_cutoff:
                    up_str = str_df.loc[
                        row['stream id'], row['sentence id']]['streamcorpus']
                    updates.append({
                        'query id': event.query_num,
                        'system id': 'cunlp',
                        'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                            sal_cutoff, sim_cutoff),
                        'stream id': row['stream id'],
                        'sentence id': row['sentence id'],
                        'timestamp': timestamp,
                        'conf': row['salience'],
                        'string': up_str})
                    K_ma.mask[:, i] = False
                    good_indexes.append(i)
            Xcache = Xlvecs[good_indexes].copy()
        else:
            xtmp = lvec_df.as_matrix()[:, 2:-2].astype(np.float64)
            Xlvecs = np.vstack([Xcache, xtmp])
            start_index = Xcache.shape[0]
            K = cosine_similarity(Xlvecs)
            K_ma = np.ma.array(K, mask=True)
            # Make similarities against the previously selected updates visible.
            K_ma.mask.T[np.arange(0, start_index)] = False

            good_indexes = []
            for i, (_, row) in enumerate(lvec_df.iterrows(), start_index):
                #print i
                sim = K_ma[i, :].max(fill_value=0.0)
                #print sim
                if sim < sim_cutoff:
                    up_str = str_df.loc[
                        row['stream id'], row['sentence id']]['streamcorpus']
                    updates.append({
                        'query id': event.query_num,
                        'system id': 'cunlp',
                        'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                            sal_cutoff, sim_cutoff),
                        'stream id': row['stream id'],
                        'sentence id': row['sentence id'],
                        'timestamp': timestamp,
                        'conf': row['salience'],
                        'string': up_str})
                    K_ma.mask[:, i] = False
                    good_indexes.append(i)
            if len(good_indexes) > 0:
                Xcache = np.vstack([Xcache, Xlvecs[good_indexes].copy()])
            #Xcache = Xlvecs[good_indexes].copy()

    # If nothing was selected for the whole event, emit a single placeholder.
    if len(updates) == 0:
        updates.append({
            'query id': event.query_num,
            'system id': 'cunlp',
            'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                sal_cutoff, sim_cutoff),
            'stream id': 1111,
            'sentence id': 1,
            'timestamp': timestamp,
            'conf': 0,
            'string': 'place holder'})

    df = pd.DataFrame(updates,
                      columns=["query id", "system id", "run id", "stream id",
                               "sentence id", "timestamp", "conf", "string"])
    with open(tsv_path, u'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False, header=False)
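
# A minimal sketch of the masked-similarity redundancy check in make() above:
# rows of the cosine-similarity matrix start fully masked, the columns of
# previously accepted vectors are unmasked, and a new row is accepted only if
# its maximum visible similarity stays below the cutoff (its own column is
# then unmasked so later rows can see it). The helper name _masked_sim_filter
# is illustrative only; it reuses the module-level numpy and scikit-learn
# imports.
def _masked_sim_filter(Xcache, Xnew, sim_cutoff):
    """Return indexes into Xnew of rows novel w.r.t. Xcache and earlier picks."""
    Xall = Xnew if Xcache is None else np.vstack([Xcache, Xnew])
    start = 0 if Xcache is None else Xcache.shape[0]
    K_ma = np.ma.array(cosine_similarity(Xall), mask=True)
    if start > 0:
        # Unmask the columns of the cached (already accepted) vectors.
        K_ma.mask.T[np.arange(0, start)] = False
    accepted = []
    for i in xrange(start, Xall.shape[0]):
        sim = K_ma[i, :].max(fill_value=0.0)
        if sim is np.ma.masked:
            sim = 0.0  # nothing visible yet, so nothing to be redundant with
        if sim < sim_cutoff:
            K_ma.mask[:, i] = False
            accepted.append(i - start)
    return accepted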
def make_summary(self, event, corpus, prefix, feature_set):
    string_res = get_resource_manager(u'SentenceStringsResource')
    lvec_res = get_resource_manager(u'SentenceLatentVectorsResource')
    spa = SaliencePredictionAggregator()
    tsv_path = self.get_tsv_path(event, prefix, feature_set)

    updates = []
    epoch = datetime.utcfromtimestamp(0)
    for hour in event.list_event_hours():
        hp1 = hour + timedelta(hours=1)
        timestamp = str(int((hp1 - epoch).total_seconds()))

        string_df = string_res.get_dataframe(event, hour)
        lvec_df = lvec_res.get_dataframe(event, hour)
        sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
        if string_df is None or lvec_df is None or sal_df is None:
            continue

        string_df = string_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])
        lvec_df = lvec_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])
        sal_df = sal_df.drop_duplicates(
            subset=[u'stream id', u'sentence id'])

        string_df.sort([u"stream id", u"sentence id"], inplace=True)
        lvec_df.sort([u"stream id", u"sentence id"], inplace=True)
        sal_df.sort([u"stream id", u"sentence id"], inplace=True)

        X = lvec_df.as_matrix()[:, 2:].astype(np.float64)
        good_rows = np.where(X.any(axis=1))[0]
        string_df = string_df.iloc[good_rows]
        lvec_df = lvec_df.iloc[good_rows]
        sal_df = sal_df.iloc[good_rows]
        assert len(string_df) == len(lvec_df)
        assert len(string_df) == len(sal_df)

        n_sents = len(string_df)
        for i in xrange(n_sents):
            assert string_df[u'stream id'].iloc[i] == \
                lvec_df[u'stream id'].iloc[i]
            assert string_df[u'stream id'].iloc[i] == \
                sal_df[u'stream id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                lvec_df[u'sentence id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                sal_df[u'sentence id'].iloc[i]

        lvec_df.reset_index(drop=True, inplace=True)
        string_df.reset_index(drop=True, inplace=True)
        sal_df.reset_index(drop=True, inplace=True)

        string_df.drop_duplicates(subset=['streamcorpus'], inplace=True)
        string_df['update id'] = string_df['stream id'].map(str) + "-" + \
            string_df['sentence id'].map(str)
        good_uids = set(string_df['update id'].tolist())

        lvec_df['update id'] = lvec_df['stream id'].map(str) + "-" + \
            lvec_df['sentence id'].map(str)
        lvec_df = lvec_df[lvec_df['update id'].isin(good_uids)].copy()

        sal_df['update id'] = sal_df['stream id'].map(str) + "-" + \
            sal_df['sentence id'].map(str)
        sal_df = sal_df[sal_df['update id'].isin(good_uids)].copy()

        string_df.sort([u"stream id", u"sentence id"], inplace=True)
        lvec_df.sort([u"stream id", u"sentence id"], inplace=True)
        sal_df.sort([u"stream id", u"sentence id"], inplace=True)

        lvec_df.reset_index(drop=True, inplace=True)
        string_df.reset_index(drop=True, inplace=True)
        sal_df.reset_index(drop=True, inplace=True)

        n_sents = len(string_df)
        for i in xrange(n_sents):
            assert string_df[u'stream id'].iloc[i] == \
                lvec_df[u'stream id'].iloc[i]
            assert string_df[u'stream id'].iloc[i] == \
                sal_df[u'stream id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                lvec_df[u'sentence id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                sal_df[u'sentence id'].iloc[i]

        good_rows = []
        for name, doc in string_df.groupby("stream id"):
            for rname, row in doc.iterrows():
                scstring = row["streamcorpus"]
                words = len(re.findall(r'\b[^\W\d_]+\b', scstring))
                socs = len(re.findall(
                    r'Digg|del\.icio\.us|Facebook|Kwoff|Myspace', scstring))
                langs = len(re.findall(
                    r'Flash|JavaScript|CSS', scstring, re.I))

                assert lvec_df.loc[rname][u'sentence id'] == \
                    row[u'sentence id']
                assert lvec_df.loc[rname][u'stream id'] == \
                    row[u'stream id']
                assert sal_df.loc[rname][u'sentence id'] == \
                    row[u'sentence id']
                assert sal_df.loc[rname][u'stream id'] == \
                    row[u'stream id']
                if words > 9 and len(doc) < 200 \
                        and socs < 2 and langs < 2:
                    good_rows.append(rname)

        lvec_df = lvec_df.loc[good_rows]
        string_df = string_df.loc[good_rows]
        sal_df = sal_df.loc[good_rows]
        n_sents = len(string_df)
        if n_sents < 10:
            continue

        for i in xrange(n_sents):
            assert string_df[u'stream id'].iloc[i] == \
                lvec_df[u'stream id'].iloc[i]
            assert string_df[u'stream id'].iloc[i] == \
                sal_df[u'stream id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                lvec_df[u'sentence id'].iloc[i]
            assert string_df[u'sentence id'].iloc[i] == \
                sal_df[u'sentence id'].iloc[i]

        del lvec_df['update id']
        del sal_df['update id']

        X = lvec_df.as_matrix()[:, 2:].astype(np.float64)
        S = sal_df.as_matrix()[:, 2:].astype(np.float64)
        s = np.mean(S, axis=1)

        A = cosine_similarity(X)
        Aupper = A[np.triu_indices_from(A, k=1)]
        Amu = np.mean(Aupper)
        Astd = np.std(Aupper)
        A = (A - Amu) / Astd

        max_sim = np.max(
            (np.max(A[np.triu_indices_from(A, k=1)]), np.max(s)))
        A = A - max_sim
        P = s - max_sim
        #P = MinMaxScaler(feature_range=(-9, -5)).fit_transform(s)
        #A = MinMaxScaler(feature_range=(-3, -1)).fit_transform(A)
        #std_s = StandardScaler().fit_transform(s)

        #assert X.shape[0] == s.shape[0]
        #period = (((hour + timedelta(hours=6)) - \
        #    event.start).total_seconds() // (6 * 3600))
        #cutoff = 2. * period / (1. + period)

        af = AffinityPropagation(
            preference=P, affinity='precomputed', max_iter=500,
            damping=.7, verbose=False).fit(A)
        if af.cluster_centers_indices_ is None:
            continue

        II = np.arange(n_sents)
        for cnum, cluster in enumerate(np.unique(af.labels_)):
            e = af.cluster_centers_indices_[cluster]
            cluster_size = II[cluster == af.labels_].shape[0]

            scstring = string_df.iloc[e][u'streamcorpus']
            stream_id = string_df.iloc[e][u'stream id']
            sentence_id = str(string_df.iloc[e][u'sentence id'])
            updates.append({
                "stream id": stream_id,
                "sentence id": sentence_id,
                "hour": hour,
                "timestamp": timestamp,
                "cluster size": cluster_size,
                "string": scstring})

    df = pd.DataFrame(updates,
                      columns=["stream id", "sentence id", "hour",
                               "timestamp", "cluster size", "string"])
    with gzip.open(tsv_path, u'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False)
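
# A minimal sketch of the exemplar selection in make_summary() above: the
# cosine-similarity matrix is z-scored using its upper triangle, both the
# similarities and the per-sentence salience preferences are shifted by a
# common offset, and affinity propagation with a precomputed affinity picks
# the exemplars. The helper name _ap_exemplars is illustrative only; it
# reuses the module-level numpy and scikit-learn imports.
def _ap_exemplars(X, salience):
    """Return (cluster_centers_indices_, labels_); centers may be None if AP fails."""
    A = cosine_similarity(X)
    upper = A[np.triu_indices_from(A, k=1)]
    # Standardize similarities by the off-diagonal mean and std.
    A = (A - np.mean(upper)) / np.std(upper)
    # Shift so that all similarities and preferences are non-positive.
    max_sim = np.max((np.max(A[np.triu_indices_from(A, k=1)]),
                      np.max(salience)))
    A = A - max_sim
    P = salience - max_sim
    af = AffinityPropagation(preference=P, affinity='precomputed',
                             max_iter=500, damping=.7, verbose=False).fit(A)
    return af.cluster_centers_indices_, af.labels_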