def rank_phrase(case_file):
    ph_dist_map = {}
    smoothing_factor = 0.0
    phrase_map, cell_map, cell_cnt = read_caseolap_result(case_file)
    unif = [1.0 / cell_cnt] * cell_cnt
    for ph in phrase_map:
        ph_vec = [x[1] for x in phrase_map[ph].items()]
        # Modified by MILI
        if len(ph_vec) < cell_cnt:
            ph_vec += [0] * (cell_cnt - len(ph_vec))
        # smoothing
        ph_vec = [x + smoothing_factor for x in ph_vec]
        ph_vec = utils.l1_normalize(ph_vec)
        ph_dist_map[ph] = utils.kl_divergence(ph_vec, unif)
    ranked_list = sorted(ph_dist_map.items(), key=operator.itemgetter(1), reverse=True)
    return ranked_list
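# `rank_phrase` relies on `utils.l1_normalize` and `utils.kl_divergence`, which
# are not part of this snippet. The sketches below show the behavior assumed by
# the ranking code (L1 normalization, and KL divergence of a phrase's cell
# distribution against a reference distribution); the names are illustrative
# and the real `utils` module may guard edge cases differently.
import math

def _l1_normalize_sketch(vec):
    """Scale a non-negative vector so that its entries sum to 1."""
    total = float(sum(vec))
    if total == 0.0:
        return list(vec)
    return [x / total for x in vec]

def _kl_divergence_sketch(p, q, eps=1e-12):
    """KL(p || q) for two equal-length distributions given as lists."""
    return sum(pi * math.log((pi + eps) / (qi + eps)) for pi, qi in zip(p, q))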
def scips_approximable_pi(lmdp,
                          gamma: float,
                          sigma: float,
                          time_horizon=100,
                          n_samples=1000) -> Policy:
    """Generate policy tensor under SCIPS assumption.

    Args:
        lmdp (FLMDP): FLMDP for which to make SCIPS approximable policy.
        gamma (float): Time discounting parameter used in the SCIPS, in [0.0, 1.0].
        sigma (float): Standard deviation of noise added to policy.
        time_horizon (int): Trajectory length.
        n_samples (int): Number of trajectories to simulate.

    Returns:
        policy (Distribution): Policy distribution over actions given the current history.
    """
    # Start with a random policy.
    random_policy = FLMDP.random_pi(lmdp=lmdp)

    # Simulate some trajectories.
    s_t, r_t, a_t = lmdp.simulate(policy=random_policy,
                                  time_horizon=time_horizon,
                                  n_samples=n_samples)

    # Fit the policy to the SCIPS.
    scips = sparsity_corrected_approx(states=s_t,
                                      actions=a_t,
                                      rewards=r_t,
                                      gamma=gamma,
                                      lmdp=lmdp)

    # Add noise.
    for history_action in scips:
        scips[history_action] += np.random.normal(loc=0, scale=sigma)

    # Normalize the next-action distribution.
    for history in history_tuples(lmdp.state_size, lmdp.history_length):
        scips[history] = l1_normalize(scips[history])

    return scips
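# `history_tuples` is called above but not defined in this snippet. A minimal
# sketch, assuming it enumerates every possible state history of length
# `history_length` over `state_size` discrete states, could look like this:
import itertools

def history_tuples_sketch(state_size, history_length):
    """Yield every tuple of `history_length` states drawn from range(state_size)."""
    return itertools.product(range(state_size), repeat=history_length)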
def expan(embs, l_prel_file, dp_file, lp_file, mode='EMB'):
    # the part to verify iterative expansion
    # mode = EMB: the similarity is learned from the embedding
    # mode = DIS: the similarity is from the L-P assignment
    target_type = 'p'
    source_type = 'l'
    multiplier = 5
    thre_softmax = 0.5
    ori_embs = embs
    agg_embs = copy.copy(embs)
    pd_map = load_dp(dp_file, reverse=True)
    dp_map = load_edge_map(dp_file)
    lp_map = load_edge_map(lp_file)
    dist_map = {x: 1 for x in embs[target_type]}
    vec_size = 0
    for d in ori_embs[target_type]:
        vec_size = len(ori_embs[target_type][d])
        break

    # Load the seed phrases for each label.
    seeds_map = {}  # label : seed set
    all_seeds = set()
    with open(l_prel_file, 'r') as f:
        for line in f:
            segs = line.strip('\r\n').split('\t')
            if segs[1] == '*':
                continue
            if segs[1] not in seeds_map:
                seeds_map[segs[1]] = set()
            seeds_map[segs[1]].add(segs[2].lower())
            all_seeds.add(segs[2].lower())

    print '*********** Direct Embedding'
    evaluate(ori_embs, true_file, target_dim)

    agg_embs[source_type] = weighted_avg_embedding(lp_map, agg_embs[target_type], dist_map, vec_size)
    agg_embs['d'] = weighted_avg_embedding(dp_map, agg_embs[target_type], dist_map, vec_size)
    print '*********** Aggregate without expansion'
    evaluate(agg_embs, true_file, target_dim)

    # Iteratively expand the seed set and re-aggregate the label embeddings.
    for i in range(2):
        print '======== iter ' + str(i) + ' of expansion.'
        extended_seeds = expan_round(agg_embs, seeds_map, all_seeds, 3, 1, mode=mode, pd_map=pd_map)
        print '============= seeds expanded'
        for seed in extended_seeds:
            label, phrase = seed.split('@')
            if label not in lp_map or phrase in lp_map[label]:
                print 'ERROR!!! unexpected seed: ' + seed
            all_seeds.add(phrase.lower())
            seeds_map[label].add(phrase.lower())
            lp_map[label][phrase] = 1
        agg_embs[source_type] = weighted_avg_embedding(lp_map, agg_embs[target_type], dist_map, vec_size)
        print '*********** Aggregate with expansion at iter ' + str(i)
        evaluate(agg_embs, true_file, target_dim)

    normal = False
    source_type = 'd'
    target_type = 'l'
    mid_type = 'p'
    # Iteratively reweight phrases by distinctness and re-aggregate the doc embeddings.
    for i in range(2):
        if i > 0:
            normal = True
        print '============= iter ' + str(i) + ' of dist started.'
        pred_label, doc_score = doc_assignment(agg_embs, 'd', 'l')
        top_labels = [w.path for w in hier.get_nodes_at_level(1)]
        print '============= docs assigned to labels'
        # # print meta stats
        # top_label_cnts = {}
        # for label in top_labels:
        #     top_label_cnts[label] = 0
        # for doc_pair in filtered_docs:
        #     l = pred_label[doc_pair[0]]
        #     top_label_cnts[l] += 1
        # print top_label_cnts
        # print 'top level labels: ' + str(top_labels)
        label_to_idx = {}
        for idx, label in enumerate(top_labels):
            label_to_idx[label] = idx
        uniform_vec = [1.0 / len(top_labels)] * len(top_labels)
        # print uniform_vec
        label_to_doc = {}
        for label in top_labels:
            label_to_doc[label] = set()
        docs_used = {}
        if normal:
            print 'used docs in reweighting: ' + str(len(pred_label))
            for doc, score in doc_score.iteritems():
                label_to_doc[pred_label[doc]].add(doc)
        else:
            for label in top_labels:
                p = label.lower()
                # idx = label_to_idx[label]
                for doc in pd_map[p]:
                    label_to_doc[label].add(doc)
                    if doc not in docs_used:
                        docs_used[doc] = set()
                    docs_used[doc].add(label)
            print 'docs used: %d' % len(docs_used)
        cnt_vec = [0.0] * len(top_labels)
        for label in label_to_doc:
            cnt_vec[label_to_idx[label]] = len(label_to_doc[label])
        comp_vec = utils.l1_normalize(cnt_vec)
        print cnt_vec
        # print comp_vec

        # Distinctness of a phrase = KL divergence of its label distribution
        # from a reference (uniform or background) distribution.
        distinct_map = {}
        if normal:
            for phrase in embs[mid_type]:
                p_vec = [0.0] * len(top_labels)
                # if len(pd_map[phrase]) < 100:
                #     continue
                for doc in pd_map[phrase]:
                    idx = label_to_idx[pred_label[doc]]
                    p_vec[idx] += 1.0
                if sum(p_vec) == 0:
                    print 'ERROR!!! phrase has no assigned docs: ' + phrase
                    continue
                p_vec = utils.l1_normalize(p_vec)
                # kl = 0.1 + 0.9 * utils.kl_divergence(p_vec, uniform_vec)
                kl = utils.kl_divergence(p_vec, uniform_vec)
                # kl = utils.kl_divergence(p_vec, comp_vec)
                distinct_map[phrase] = kl
        else:
            for phrase in embs[mid_type]:
                p_vec = [0.0] * len(top_labels)
                # if len(pd_map[phrase]) < 100:
                #     continue
                for doc in pd_map[phrase]:
                    if doc in docs_used:
                        for label in docs_used[doc]:
                            idx = label_to_idx[label]
                            p_vec[idx] += 1.0
                # print p_vec
                if sum(p_vec) == 0:
                    distinct_map[phrase] = 0
                    # print 'ERROR!!!'
                    continue
                # p_vec = [x / cnt_vec[i] for i, x in enumerate(p_vec)]
                p_vec = utils.l1_normalize(p_vec)
                # kl = 0.1 + 0.9 * utils.kl_divergence(p_vec, uniform_vec)
                # kl = utils.kl_divergence(p_vec, uniform_vec)
                kl = utils.kl_divergence(p_vec, comp_vec)
                distinct_map[phrase] = kl
        dist_map = distinct_map

        with open('focal_comp.txt', 'w+') as g:
            for (ph, score) in sorted(dist_map.items(), key=operator.itemgetter(1), reverse=True):
                g.write('%s,%f\t' % (ph, score))
        print '============= phrase distinctness computed.'

        agg_embs[source_type] = weighted_avg_embedding(dp_map, agg_embs[mid_type], dist_map, vec_size)
        print '============= doc embedding aggregated.'
        print '*********** Aggregate with distinct at iter ' + str(i)
        evaluate(agg_embs, true_file, target_dim)
    return
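# `weighted_avg_embedding` does the phrase-to-node aggregation used throughout
# `expan` but is not defined in this snippet. The sketch below captures the
# assumed behavior: each node embedding is the average of its linked phrases'
# embeddings, weighted by the per-phrase distinctness scores in `dist_map`.
# The function name and exact weighting are assumptions, not the original code.
def weighted_avg_embedding_sketch(edge_map, phrase_embs, dist_map, vec_size):
    node_embs = {}
    for node in edge_map:
        acc = [0.0] * vec_size
        weight_sum = 0.0
        for phrase in edge_map[node]:
            if phrase not in phrase_embs:
                continue
            w = dist_map.get(phrase, 1.0)
            acc = [a + w * e for a, e in zip(acc, phrase_embs[phrase])]
            weight_sum += w
        if weight_sum > 0:
            acc = [a / weight_sum for a in acc]
        node_embs[node] = acc
    return node_embs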
def reweight_test(embs, dp_file):
    source_type = 'd'
    target_type = 'l'
    target_embs = embs[target_type]
    pred_label = {}
    doc_score = {}
    ratio = 1
    for doc in embs[source_type]:
        doc_emb = embs[source_type][doc]
        sim_map = classify_doc(doc_emb, target_embs)
        pred_label[doc] = hier.get_node(sim_map[0][0]).get_ascendant(1).path
        doc_score[doc] = sim_map[0][1]
    doc_score = sorted(doc_score.items(), key=operator.itemgetter(1), reverse=True)
    filtered_docs = doc_score[:int(len(doc_score) * ratio)]
    top_labels = [w.path for w in hier.get_nodes_at_level(1)]

    # print meta stats
    top_label_cnts = {}
    for label in top_labels:
        top_label_cnts[label] = 0
    for doc_pair in filtered_docs:
        l = pred_label[doc_pair[0]]
        top_label_cnts[l] += 1
    print top_label_cnts
    print 'top level labels: ' + str(top_labels)
    # return

    label_to_idx = {}
    for idx, label in enumerate(top_labels):
        label_to_idx[label] = idx
    uniform_vec = [1.0 / len(top_labels)] * len(top_labels)
    print uniform_vec
    label_to_doc = {}
    # new_filter = []
    # new_pred_ls = {}
    # for (doc, score) in filtered_docs:
    #     if pred_label[doc] not in top_labels:
    #         continue
    #     new_filter.append((doc, score))
    #     new_pred_ls[doc] = pred_label[doc]
    # filtered_docs = new_filter
    # pred_label = new_pred_ls
    pd_map = load_dp(dp_file, reverse=True)
    for label in top_labels:
        label_to_doc[label] = set()
    print 'used docs in reweighting: ' + str(len(filtered_docs))
    for (doc, score) in filtered_docs:
        label_to_doc[pred_label[doc]].add(doc)

    distinct_map = {}
    cnt = 0
    for phrase in embs['p']:
        p_vec = [0.0] * len(top_labels)
        if len(pd_map[phrase]) < 100:
            continue
        for doc in pd_map[phrase]:
            if doc not in pred_label:
                continue
            idx = label_to_idx[pred_label[doc]]
            p_vec[idx] += 1.0
        if sum(p_vec) == 0:
            continue
        p_vec = utils.l1_normalize(p_vec)
        kl = utils.kl_divergence(p_vec, uniform_vec)
        distinct_map[phrase] = kl
    distinct_map = sorted(distinct_map.items(), key=operator.itemgetter(1), reverse=False)
    # 100 least distinctive phrases, then the 100 most distinctive ones.
    print distinct_map[:100]
    print
    print distinct_map[-100:]
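# `classify_doc` above is assumed to score a document embedding against every
# label embedding and return (label, similarity) pairs sorted from most to
# least similar. A minimal sketch under that assumption (cosine similarity;
# the helper name is illustrative):
import math

def classify_doc_sketch(doc_emb, target_embs):
    def cossim(u, v):
        dot = sum(a * b for a, b in zip(u, v))
        norm_u = math.sqrt(sum(a * a for a in u))
        norm_v = math.sqrt(sum(b * b for b in v))
        if norm_u == 0.0 or norm_v == 0.0:
            return 0.0
        return dot / (norm_u * norm_v)
    sims = [(label, cossim(doc_emb, emb)) for label, emb in target_embs.items()]
    return sorted(sims, key=lambda pair: pair[1], reverse=True)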
def reweight(embs, dp_file, lp_file):
    source_type = 'd'
    target_type = 'l'
    mid_type = 'p'
    ori_embs = embs
    agg_embs = copy.copy(embs)

    # Step 0: check original embedding's performance
    print '*********** Direct Embedding'
    evaluate(ori_embs, true_file, target_dim)

    pd_map = load_dp(dp_file, reverse=True)
    dp_map = load_edge_map(dp_file)
    lp_map = load_edge_map(lp_file)
    dist_map = {x: 1 for x in embs[mid_type]}
    vec_size = 0
    for d in ori_embs[mid_type]:
        vec_size = len(ori_embs[mid_type][d])
        break
    # print '============= dp, pd maps loaded'

    # Step 1: check with D weighted avg, what's the performance
    agg_embs[source_type] = weighted_avg_embedding(dp_map, agg_embs[mid_type], dist_map, vec_size)
    # optional L - embedding also aggregated from P
    normal = False
    if not normal:
        agg_embs[target_type] = weighted_avg_embedding(lp_map, agg_embs[mid_type], dist_map, vec_size)
    # print '============= doc embedding aggregated.'
    print '*********** Aggregate iter 0'
    evaluate(agg_embs, true_file, target_dim)

    for i in range(2):
        if i > 0:
            normal = True
        print '============= iter ' + str(i + 1) + ' of dist started.'
        pred_label, doc_score = doc_assignment(agg_embs, source_type, target_type)
        top_labels = [w.path for w in hier.get_nodes_at_level(1)]
        # print '============= docs assigned to labels'
        # # print meta stats
        # top_label_cnts = {}
        # for label in top_labels:
        #     top_label_cnts[label] = 0
        # for doc_pair in filtered_docs:
        #     l = pred_label[doc_pair[0]]
        #     top_label_cnts[l] += 1
        # print top_label_cnts
        # print 'top level labels: ' + str(top_labels)
        label_to_idx = {}
        for idx, label in enumerate(top_labels):
            label_to_idx[label] = idx
        uniform_vec = [1.0 / len(top_labels)] * len(top_labels)
        # print uniform_vec
        label_to_doc = {}
        for label in top_labels:
            label_to_doc[label] = set()
        docs_used = {}
        if normal:
            print 'used docs in reweighting: ' + str(len(pred_label))
            for doc, score in doc_score.iteritems():
                label_to_doc[pred_label[doc]].add(doc)
        else:
            for label in top_labels:
                p = label.lower()
                # idx = label_to_idx[label]
                for doc in pd_map[p]:
                    label_to_doc[label].add(doc)
                    if doc not in docs_used:
                        docs_used[doc] = set()
                    docs_used[doc].add(label)
            print 'docs used: %d' % len(docs_used)
        cnt_vec = [0.0] * len(top_labels)
        for label in label_to_doc:
            cnt_vec[label_to_idx[label]] = len(label_to_doc[label])
        comp_vec = utils.l1_normalize(cnt_vec)
        print cnt_vec
        # print comp_vec

        distinct_map = {}
        if normal:
            for phrase in embs[mid_type]:
                p_vec = [0.0] * len(top_labels)
                # if len(pd_map[phrase]) < 100:
                #     continue
                for doc in pd_map[phrase]:
                    idx = label_to_idx[pred_label[doc]]
                    p_vec[idx] += 1.0
                if sum(p_vec) == 0:
                    print 'ERROR!!!!!!!!!!'
                    continue
                p_vec = utils.l1_normalize(p_vec)
                # kl = 0.1 + 0.9 * utils.kl_divergence(p_vec, uniform_vec)
                kl = utils.kl_divergence(p_vec, uniform_vec)
                # kl = utils.kl_divergence(p_vec, comp_vec)
                distinct_map[phrase] = kl
        else:
            for phrase in embs[mid_type]:
                p_vec = [0.0] * len(top_labels)
                # if len(pd_map[phrase]) < 100:
                #     continue
                for doc in pd_map[phrase]:
                    if doc in docs_used:
                        for label in docs_used[doc]:
                            idx = label_to_idx[label]
                            p_vec[idx] += 1.0
                # print p_vec
                if sum(p_vec) == 0:
                    distinct_map[phrase] = 0
                    # print 'ERROR!!!!!!!!!!'
                    continue
                # p_vec = [x / cnt_vec[i] for i, x in enumerate(p_vec)]
                p_vec = utils.l1_normalize(p_vec)
                # kl = 0.1 + 0.9 * utils.kl_divergence(p_vec, uniform_vec)
                # kl = utils.kl_divergence(p_vec, uniform_vec)
                kl = utils.kl_divergence(p_vec, comp_vec)
                distinct_map[phrase] = kl
        dist_map = distinct_map
        # with open('focal_comp.txt', 'w+') as g:
        #     for (ph, score) in sorted(dist_map.items(), key=operator.itemgetter(1), reverse=True):
        #         g.write('%s,%f\t' % (ph, score))
        # print '============= phrase distinctness computed.'

        agg_embs[source_type] = weighted_avg_embedding(dp_map, agg_embs[mid_type], dist_map, vec_size)
        # print '============= doc embedding aggregated.'
        print '*********** Aggregate with distinct at iter ' + str(i + 1)
        evaluate(agg_embs, true_file, target_dim)
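# `doc_assignment` drives each reweighting iteration above but is not shown in
# this snippet. A rough sketch, assuming it assigns every document to its most
# similar label and reports that best similarity (reusing `classify_doc_sketch`
# from above and the same `hier` lookups as `reweight_test`):
def doc_assignment_sketch(embs, source_type, target_type):
    pred_label = {}
    doc_score = {}
    for doc, doc_emb in embs[source_type].items():
        ranked = classify_doc_sketch(doc_emb, embs[target_type])
        best_label, best_sim = ranked[0]
        pred_label[doc] = hier.get_node(best_label).get_ascendant(1).path
        doc_score[doc] = best_sim
    return pred_label, doc_score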
def expan_round(embs, seeds_map, all_seeds, limit, cate_lim, mode='EMB', pd_map=None):
    target_type = 'p'
    multiplier = 5
    thre_softmax = 0.5
    extended_seeds = set()
    candidates = {}
    if mode == 'EMB':
        for phrase in embs[target_type]:
            if phrase in all_seeds:
                continue
            t_emb = embs[target_type][phrase]
            rel_values = {}
            # flat comparison
            for label in seeds_map:
                max_sim = 0
                for seed in seeds_map[label]:
                    sim = multiplier * utils.cossim(t_emb, embs[target_type][seed])
                    if sim > max_sim:
                        max_sim = sim
                rel_values[label] = max_sim
            utils.softmax_for_map(rel_values)
            best_label = sorted(rel_values.items(), key=operator.itemgetter(1), reverse=True)[0][0]
            candidates[best_label + '@' + phrase] = rel_values[best_label]
    elif mode == 'DIS':
        pred_label, doc_score = doc_assignment(embs, 'd', 'l', mode='FLAT')
        top_labels = [w.path for w in hier.get_all_nodes()]
        print 'Doc Assignment done...'
        label_to_idx = {}
        for idx, label in enumerate(top_labels):
            label_to_idx[label] = idx
        # print uniform_vec
        label_to_doc = {}
        for label in top_labels:
            label_to_doc[label] = set()
        for doc, score in doc_score.iteritems():
            label_to_doc[pred_label[doc]].add(doc)
        cnt_vec = [0.0] * len(top_labels)
        for label in label_to_doc:
            cnt_vec[label_to_idx[label]] = len(label_to_doc[label])
        comp_vec = utils.l1_normalize(cnt_vec)
        uniform_vec = [1.0 / len(top_labels)] * len(top_labels)
        # print cnt_vec
        # print comp_vec
        for phrase in embs['p']:
            if phrase in all_seeds:
                continue
            p_vec = [0.0] * len(top_labels)
            for doc in pd_map[phrase]:
                idx = label_to_idx[pred_label[doc]]
                p_vec[idx] += 1.0
            max_label_value = 0
            best_label = ''
            best_cnt = 0
            for label in top_labels:
                idx = label_to_idx[label]
                if p_vec[idx] > 0:
                    norm_value = p_vec[idx] / cnt_vec[idx]
                    if norm_value > max_label_value:
                        max_label_value = norm_value
                        best_label = label
                        best_cnt = p_vec[idx]
            if sum(p_vec) == 0:
                print 'ERROR!!!!!!!!!!'
                continue
            p_vec = utils.l1_normalize(p_vec)
            # kl = 0.1 + 0.9 * utils.kl_divergence(p_vec, uniform_vec)
            # kl = utils.kl_divergence(p_vec, comp_vec)
            kl = utils.kl_divergence(p_vec, uniform_vec)
            # best_label = sorted(rel_values.items(), key=operator.itemgetter(1), reverse=True)[0][0]
            pop = max_label_value  # * (1 + math.log(1 + max_label_value))
            candidates[best_label + '@' + phrase] = kl * max_label_value
    candidates = sorted(candidates.items(), key=operator.itemgetter(1), reverse=True)
    # cands_by_label = {}
    # for cand in candidates:
    #     label, phrase = cand.split('@')
    #     if label not in cands_by_label:
    #         cands_by_label[label] = {}
    #     cands_by_label[label][phrase] = candidates[cand]
    # for label in cands_by_label:
    #     print '\n' + label
    #     cand_cate = cands_by_label[label]
    #     best_exps = sorted(cand_cate.items(), key=operator.itemgetter(1), reverse=True)[:10]
    #     # best_exps = sorted(candidates.items(), key=operator.itemgetter(1), reverse=True)[:30]
    #     print best_exps
    # exit(1)
    added = 0
    added_cates = {}
    for (cand, score) in candidates:
        label, phrase = cand.split('@')
        if label not in added_cates:
            added_cates[label] = 0
        if added_cates[label] >= cate_lim:
            continue
        if len(seeds_map[label]) >= 3:
            continue
        extended_seeds.add(cand)
        added_cates[label] += 1
        added += 1
        if added > limit:
            break
    print 'extended: ' + str(extended_seeds)
    return extended_seeds
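# The EMB branch of `expan_round` uses `utils.softmax_for_map`, which is not
# defined in this snippet. A minimal sketch of the assumed behavior (an
# in-place, numerically stable softmax over a dict's values):
import math

def softmax_for_map_sketch(value_map):
    """Replace each value in `value_map` with its softmax probability, in place."""
    max_v = max(value_map.values())
    exp_map = {k: math.exp(v - max_v) for k, v in value_map.items()}
    total = sum(exp_map.values())
    for k in value_map:
        value_map[k] = exp_map[k] / total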
def _build_model(self):
    """Build the core model within the graph."""
    with tf.variable_scope('im_dup'):
        # Duplicate images to get multiple draws from the DP label
        # distribution (each duplicate gets an independent noise draw
        # before going through the rest of the network).
        ones = tf.ones([len(self._images.get_shape()) - 1], dtype=tf.int32)
        x = tf.tile(self._images, tf.concat([[self.hps.n_draws], ones], axis=0))

    with tf.variable_scope('init'):
        with tf.variable_scope('init_conv'):
            filter_size = 3
            in_filters = 3
            out_filters = 16
            stride = 1
            strides = self._stride_arr(2)
            n = filter_size * filter_size * out_filters
            self.kernel = tf.get_variable(
                'DW',
                [filter_size, filter_size, in_filters, out_filters],
                tf.float32,
                initializer=tf.random_normal_initializer(
                    stddev=np.sqrt(2.0 / n)))

            if self.hps.noise_scheme == 'l2_l2_s1':
                # Parseval projection, see: https://arxiv.org/abs/1704.08847
                self._parseval_convs.append(self.kernel)
                k = stride * self.kernel / float(filter_size)
            elif self.hps.noise_scheme == 'l1_l2_s1':
                # Sensitivity 1 by L2 normalization
                k = tf.nn.l2_normalize(self.kernel, dim=[0, 1, 3])
            elif self.hps.noise_scheme == 'l1_l1_s1':
                # Sensitivity 1 by L1 normalization
                k = utils.l1_normalize(self.kernel, dim=[0, 1, 3])
            else:
                k = self.kernel

            x = tf.nn.conv2d(x, k, strides, padding='SAME')

    ############
    # DP noise #

    # This is a factor applied to the noise layer,
    # used to ramp up the noise at the beginning of training.
    self.noise_scale = tf.placeholder(tf.float32, shape=(), name='noise_scale')

    if self.hps.noise_scheme == 'l1_l2':
        sqr_sum = tf.reduce_sum(tf.square(x), [0, 1, 3], keep_dims=True)
        self.l2_norms = tf.sqrt(sqr_sum)

        dp_mult = self._dp_mult()
        epsilon = tf.random_normal(tf.shape(x), mean=0, stddev=1)

        self.sensitivity = tf.reduce_max(self.l2_norms)
        self.sigma = tf.multiply(dp_mult, self.sensitivity)
        self.noise_stddev = self.noise_scale * self.sigma
        self.noise = self.noise_stddev * epsilon
        x = x + self.noise
    elif self.hps.noise_scheme == 'l1_l2_s1':
        dp_mult = self._dp_mult()
        epsilon = tf.random_normal(tf.shape(x), mean=0, stddev=1)

        self.sensitivity = 1.0  # we bound it
        self.sigma = tf.multiply(dp_mult, self.sensitivity)
        self.noise_stddev = self.noise_scale * self.sigma
        self.noise = self.noise_stddev * epsilon
        x = x + self.noise
    elif self.hps.noise_scheme == 'l2_l2_s1':
        # Compute the actual sensitivity to rescale later
        shape = self.kernel.get_shape().as_list()
        w_t = tf.reshape(self.kernel, [-1, shape[-1]])
        w = tf.transpose(w_t)
        self.norms = tf.svd(w, compute_uv=False)
        self.sensitivity_multiplier = tf.reduce_max(self.norms)

        dp_mult = self._dp_mult()
        epsilon = tf.random_normal(tf.shape(x), mean=0, stddev=1)

        self.sensitivity = 1.0
        self.sigma = tf.multiply(dp_mult, self.sensitivity)
        self.noise_stddev = self.noise_scale * self.sigma
        self.noise = self.noise_stddev * epsilon
        x = x + self.noise
    elif self.hps.noise_scheme == 'l1_l1':
        self.l1_norms = tf.reduce_sum(tf.abs(x), [0, 1, 3], keep_dims=True)

        dp_mult = self._dp_mult()
        laplace_shape = tf.shape(x)
        loc = tf.zeros(laplace_shape, dtype=tf.float32)
        scale = tf.ones(laplace_shape, dtype=tf.float32)
        epsilon = tf.distributions.Laplace(loc, scale).sample()

        self.sensitivity = tf.reduce_max(self.l1_norms)
        self.b = self.noise_scale * dp_mult * self.sensitivity
        self.noise = self.b * epsilon
        x = x + self.noise
    elif self.hps.noise_scheme == 'l1_l1_s1':
        dp_mult = self._dp_mult()
        laplace_shape = tf.shape(x)
        loc = tf.zeros(laplace_shape, dtype=tf.float32)
        scale = tf.ones(laplace_shape, dtype=tf.float32)
        epsilon = tf.distributions.Laplace(loc, scale).sample()

        self.sensitivity = 1.0  # because we normalize
        self.b = self.noise_scale * dp_mult * self.sensitivity
        self.noise = self.b * epsilon
        x = x + self.noise

    # DP noise #
    ############

    strides = [1, 2, 2]
    activate_before_residual = [True, False, False]
    if self.hps.use_bottleneck:
        res_func = self._bottleneck_residual
        filters = [16, 64, 128, 256]
    else:
        res_func = self._residual
        # filters = [16, 16, 32, 64]
        # Uncomment the following codes to use w28-10 wide residual network.
        # It is more memory efficient than very deep residual network and has
        # comparably good performance.
        # https://arxiv.org/pdf/1605.07146v1.pdf
        filters = [out_filters, 160, 320, 640]
        # Update hps.num_residual_units to 4

    with tf.variable_scope('unit_1_0'):
        x = res_func(x, filters[0], filters[1],
                     self._stride_arr(strides[0]),
                     activate_before_residual[0])
    for i in six.moves.range(1, self.hps.num_residual_units):
        with tf.variable_scope('unit_1_%d' % i):
            x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)

    with tf.variable_scope('unit_2_0'):
        x = res_func(x, filters[1], filters[2],
                     self._stride_arr(strides[1]),
                     activate_before_residual[1])
    for i in six.moves.range(1, self.hps.num_residual_units):
        with tf.variable_scope('unit_2_%d' % i):
            x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)

    with tf.variable_scope('unit_3_0'):
        x = res_func(x, filters[2], filters[3],
                     self._stride_arr(strides[2]),
                     activate_before_residual[2])
    for i in six.moves.range(1, self.hps.num_residual_units):
        with tf.variable_scope('unit_3_%d' % i):
            x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)

    with tf.variable_scope('unit_last'):
        x = self._batch_norm('final_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)
        x = self._global_avg_pool(x)

    with tf.variable_scope('logit'):
        logits = self._fully_connected(x, self.hps.num_classes)
        self.pre_softmax = logits
        self.predictions = tf.nn.softmax(logits)

    with tf.variable_scope('label_dup'):
        ones = tf.ones([len(self.labels.get_shape()) - 1], dtype=tf.int32)
        labels = tf.tile(self.labels, tf.concat([[self.hps.n_draws], ones], axis=0))

    with tf.variable_scope('costs'):
        xent = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
        self.cost = tf.reduce_mean(xent, name='xent')
        self.cost += self._decay()
        tf.summary.scalar('cost', self.cost)
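# `self._dp_mult()` supplies the differential-privacy noise multiplier used in
# every noise branch above, but its definition is not part of this snippet.
# The sketch below follows the standard (epsilon, delta)-DP calibration of the
# Gaussian and Laplace mechanisms; the hyperparameter names (`dp_epsilon`,
# `dp_delta`, `attack_norm_bound`) and the exact form used by this model are
# assumptions.
def _dp_mult_sketch(hps):
    import math
    if hps.noise_scheme in ('l1_l2', 'l1_l2_s1', 'l2_l2_s1'):
        # Gaussian mechanism: sigma = sqrt(2 * ln(1.25 / delta)) * L / epsilon,
        # where L is the pre-noise sensitivity bound.
        return hps.attack_norm_bound * math.sqrt(2.0 * math.log(1.25 / hps.dp_delta)) / hps.dp_epsilon
    else:
        # Laplace mechanism: scale b = L / epsilon.
        return hps.attack_norm_bound / hps.dp_epsilon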