示例#1
0
文件: NGCF.py 项目: zzg2008/NeuRec
    def train_model(self):
        """Train on pairwise (user, positive item, negative item) batches.

        Every epoch regenerates the training triples, runs the optimizer once
        per batch index in ``int(num_instances / batch_size)``, prints the
        summed batch loss divided by the number of training instances, and
        calls ``Evaluate.test_model`` whenever ``epoch % self.verbose == 0``.
        """
        for epoch in range(self.num_epochs):
            # Fresh training triples for this epoch.
            triples = data_gen._get_pairwise_all_data(self.dataset)
            user_input, item_input_pos, item_input_neg = triples

            total_loss = 0.0
            training_start_time = time()
            num_training_instances = len(user_input)
            batch_count = int(num_training_instances / self.batch_size)
            for num_batch in np.arange(batch_count):
                batch = data_gen._get_pairwise_batch_data(
                    user_input, item_input_pos, item_input_neg,
                    num_batch, self.batch_size)
                bat_users, bat_items_pos, bat_items_neg = batch

                # Both dropout placeholders are fed a fixed rate of 0.1.
                feed_dict = {
                    self.users: bat_users,
                    self.pos_items: bat_items_pos,
                    self.node_dropout: [0.1],
                    self.mess_dropout: [0.1],
                    self.neg_items: bat_items_neg,
                }
                fetched = self.sess.run((self.loss, self.optimizer),
                                        feed_dict=feed_dict)
                total_loss += fetched[0]

            elapsed = time() - training_start_time
            mean_loss = total_loss / num_training_instances
            print("[iter %d : loss : %f, time: %f]" %
                  (epoch + 1, mean_loss, elapsed))
            if epoch % self.verbose == 0:
                Evaluate.test_model(self, self.dataset)
示例#2
0
文件: WRMF.py 项目: zzg2008/NeuRec
    def train_model(self):
        """Alternate per-user and per-item update ops for every epoch.

        For each user the ``update_user`` op is run with that user's row of
        ``self.Pui`` / ``self.Cui`` reshaped to a column; for each item the
        ``update_item`` op is run with the matching column. The model is
        evaluated whenever ``epoch % self.verbose == 0``.
        """
        for epoch in range(self.num_epochs):
            training_start_time = time()

            print('solving for user vectors...')
            for userid in range(self.num_users):
                preference_col = self.Pui[userid].T.reshape([-1, 1])
                confidence_col = self.Cui[userid].T.reshape([-1, 1])
                user_feed = {
                    self.user_id: [userid],
                    self.Pu: preference_col,
                    self.Cu: confidence_col,
                }
                self.sess.run(self.update_user, feed_dict=user_feed)

            print('solving for item vectors...')
            for itemid in range(self.num_items):
                preference_col = self.Pui[:, itemid].reshape([-1, 1])
                confidence_col = self.Cui[:, itemid].reshape([-1, 1])
                item_feed = {
                    self.item_id: [itemid],
                    self.Pi: preference_col,
                    self.Ci: confidence_col,
                }
                self.sess.run(self.update_item, feed_dict=item_feed)

            elapsed = time() - training_start_time
            print('iteration %i finished in %f seconds' % (epoch + 1, elapsed))
            if epoch % self.verbose == 0:
                Evaluate.test_model(self, self.dataset)
示例#3
0
    def train_model(self):
        """Alternate discriminator and generator training over user batches.

        Two independent random permutations of the user ids are drawn once:
        one slices discriminator batches, the other slices generator batches.
        The outer loop runs ``int(self.epochs / self.step_G)`` times; each
        iteration fetches fresh training matrices, runs ``step_D``
        discriminator passes and ``step_G`` generator passes, and evaluates
        the model after every generator pass.

        Fix over the previous version: generator batches were sliced from the
        discriminator permutation, leaving ``gen_batch_index`` shuffled but
        unused. They are now sliced from ``gen_batch_index``.
        """
        gen_batch_index = np.arange(self.num_users)
        np.random.shuffle(gen_batch_index)
        dis_batch_index = np.arange(self.num_users)
        np.random.shuffle(dis_batch_index)

        totalEpochs = int(self.epochs / self.step_G)
        for epoch in range(totalEpochs):
            train_matrix, ZR_matrix, PM_matrix = self.get_train_data()

            # training discriminator
            for d_epoch in range(self.step_D):
                for start in np.arange(0, self.num_users, step=self.batchSize_D):
                    batch_users = dis_batch_index[start:start + self.batchSize_D]
                    train_data = train_matrix[batch_users].toarray()
                    train_mask = PM_matrix[batch_users].toarray()
                    feed = {self.realData: train_data, self.mask: train_mask,
                            self.condition: train_data}
                    self.sess.run(self.trainer_D, feed_dict=feed)

            # training generator
            for g_epoch in range(self.step_G):
                for start in np.arange(0, self.num_users, step=self.batchSize_G):
                    # Use the generator's own permutation for its batches.
                    batch_users = gen_batch_index[start:start + self.batchSize_G]
                    train_data = train_matrix[batch_users].toarray()
                    train_z_mask = ZR_matrix[batch_users].toarray()
                    train_p_mask = PM_matrix[batch_users].toarray()
                    feed = {self.realData: train_data, self.condition: train_data,
                            self.mask: train_p_mask, self.G_ZR_dims: train_z_mask}
                    self.sess.run(self.trainer_G, feed_dict=feed)
                # Evaluate after every generator pass.
                self.eval_rating_matrix()
                Evaluate.test_model(self, self.dataset)
示例#4
0
文件: SBPR.py 项目: youngzw/NeuRec
    def train_model(self):
        """Train on batches of (user, positive, social, negative, suk) rows.

        Each epoch regenerates the five parallel training sequences via
        ``self._get_pairwise_all_data``, feeds consecutive slices of
        ``self.batch_size`` rows to the optimizer, prints the summed batch
        loss divided by the number of training instances, and evaluates the
        model whenever ``epoch % self.verbose == 0``.
        """
        for epoch in range(self.num_epochs):
            # Generate training instances
            (user_input, item_input_pos, item_input_social, item_input_neg,
             suk_input) = self._get_pairwise_all_data()
            total_loss = 0.0
            training_start_time = time()
            num_training_instances = len(user_input)
            batch_count = int(num_training_instances / self.batch_size)
            for num_batch in np.arange(batch_count):
                num_training_instances = len(user_input)
                # Clamp the slice end to the number of available rows.
                lower = num_batch * self.batch_size
                upper = min((num_batch + 1) * self.batch_size,
                            num_training_instances)
                rows = slice(lower, upper)
                feed_dict = {
                    self.user_input: user_input[rows],
                    self.item_input: item_input_pos[rows],
                    self.item_input_social: item_input_social[rows],
                    self.item_input_neg: item_input_neg[rows],
                    self.suk: suk_input[rows],
                }

                fetched = self.sess.run((self.loss, self.optimizer),
                                        feed_dict=feed_dict)
                total_loss += fetched[0]
            elapsed = time() - training_start_time
            print("[iter %d : loss : %f, time: %f]" %
                  (epoch + 1, total_loss / num_training_instances, elapsed))
            if epoch % self.verbose == 0:
                Evaluate.test_model(self, self.dataset)
示例#5
0
文件: DMF.py 项目: skriser/NeuRec
    def train_model(self):
        """Train on pointwise (user, item, label) batches.

        Each epoch regenerates the training data via
        ``self._get_input_all_data``, feeds consecutive slices of
        ``self.batch_size`` rows to the optimizer (users and items as lists,
        labels as an array), prints the summed batch loss divided by the
        number of training instances, and evaluates the model whenever
        ``epoch % self.verbose == 0``.
        """
        for epoch in range(self.num_epochs):
            # Generate training instances
            user_input, item_input, lables = self._get_input_all_data()

            total_loss = 0.0
            training_start_time = time()
            num_training_instances = len(user_input)
            batch_count = int(num_training_instances / self.batch_size)
            for num_batch in np.arange(batch_count):
                num_training_instances = len(user_input)
                # Clamp the slice end to the number of available rows.
                lower = num_batch * self.batch_size
                upper = min((num_batch + 1) * self.batch_size,
                            num_training_instances)
                rows = slice(lower, upper)
                feed_dict = {
                    self.one_hot_u: user_input[rows].tolist(),
                    self.one_hot_v: item_input[rows].tolist(),
                    self.lables: np.array(lables[rows]),
                }
                fetched = self.sess.run((self.loss, self.optimizer),
                                        feed_dict=feed_dict)
                total_loss += fetched[0]
            elapsed = time() - training_start_time
            print("[iter %d : loss : %f, time: %f]" % (
                epoch + 1, total_loss / num_training_instances, elapsed))
            if epoch % self.verbose == 0:
                Evaluate.test_model(self, self.dataset)
示例#6
0
文件: NeuMF.py 项目: skriser/NeuRec
    def train_model(self):
        """Train in pairwise or pointwise mode, chosen by ``self.ispairwise``.

        When ``self.ispairwise.lower() == "true"`` the epoch data are
        (user, positive item, negative item) triples; otherwise they are
        (user, item, label) rows built with ``self.num_negatives``. Each
        epoch prints the summed batch loss divided by the number of training
        instances and evaluates the model whenever
        ``epoch % self.verbose == 0``.
        """
        for epoch in range(self.num_epochs):
            pairwise = self.ispairwise.lower() == "true"

            # Generate training instances
            if pairwise:
                user_input, item_input_pos, item_input_neg = \
                    data_gen._get_pairwise_all_data(self.dataset)
            else:
                user_input, item_input, lables = \
                    data_gen._get_pointwise_all_data(self.dataset,
                                                     self.num_negatives)

            total_loss = 0.0
            training_start_time = time()
            num_training_instances = len(user_input)
            batch_count = int(num_training_instances / self.batch_size)
            for num_batch in np.arange(batch_count):
                if pairwise:
                    bat_users, bat_items_pos, bat_items_neg = \
                        data_gen._get_pairwise_batch_data(
                            user_input, item_input_pos, item_input_neg,
                            num_batch, self.batch_size)
                    feed_dict = {
                        self.user_input: bat_users,
                        self.item_input: bat_items_pos,
                        self.item_input_neg: bat_items_neg,
                    }
                else:
                    bat_users, bat_items, bat_lables = \
                        data_gen._get_pointwise_batch_data(
                            user_input, item_input, lables,
                            num_batch, self.batch_size)
                    feed_dict = {
                        self.user_input: bat_users,
                        self.item_input: bat_items,
                        self.lables: bat_lables,
                    }

                fetched = self.sess.run((self.loss, self.optimizer),
                                        feed_dict=feed_dict)
                total_loss += fetched[0]
            elapsed = time() - training_start_time
            print("[iter %d : loss : %f, time: %f]" % (
                epoch + 1, total_loss / num_training_instances, elapsed))
            if epoch % self.verbose == 0:
                Evaluate.test_model(self, self.dataset)
示例#7
0
文件: DeepICF.py 项目: skriser/NeuRec
    def train_model(self):
        """Train on the batches produced by ``self.shuffle`` each epoch.

        The number of batches is ``len(batches[1])``. Every batch is built
        with ``self.batch_gen`` and fed with ``is_train_phase`` set to True.
        Each epoch prints the summed loss divided by the number of batches
        and evaluates the model whenever ``epoch % self.verbose == 0``.
        """
        for epoch in range(self.num_epochs):
            batches = self.shuffle()
            num_batch = len(batches[1])
            training_start_time = time()
            total_loss = 0.0
            for index in np.arange(num_batch):
                batch = self.batch_gen(batches, index)
                user_input, num_idx, item_input, labels = batch
                feed_dict = {
                    self.user_input: user_input,
                    self.num_idx: num_idx,
                    self.item_input: item_input,
                    self.labels: labels,
                    self.is_train_phase: True,
                }
                fetched = self.sess.run([self.loss, self.optimizer], feed_dict)
                total_loss += fetched[0]
            elapsed = time() - training_start_time
            print("[iter %d : loss : %f, time: %f]" %
                  (epoch + 1, total_loss / num_batch, elapsed))
            if epoch % self.verbose == 0:
                Evaluate.test_model(self, self.dataset)
示例#8
0
文件: NPE.py 项目: zzg2008/NeuRec
    def train_model(self):
        """Train on pointwise rows that also carry recent-item inputs.

        Each epoch regenerates (user, item, recent items, label) data with
        ``self.high_order`` and ``self.num_negatives``, runs the optimizer
        once per batch index in ``int(num_instances / batch_size)``, prints
        the summed batch loss divided by the number of training instances,
        and evaluates the model whenever ``epoch % self.verbose == 0``.
        """
        for epoch in range(self.num_epochs):
            # Generate training instances
            epoch_data = data_gen._get_pointwise_all_highorder_data(
                self.dataset, self.high_order, self.num_negatives)
            user_input, item_input, item_input_recents, lables = epoch_data

            num_training_instances = len(user_input)
            total_loss = 0.0
            training_start_time = time()

            batch_count = int(num_training_instances / self.batch_size)
            for num_batch in np.arange(batch_count):
                batch = data_gen._get_pointwise_batch_seqdata(
                    user_input, item_input, item_input_recents, lables,
                    num_batch, self.batch_size)
                bat_users, bat_items, bat_items_recents, bat_lables = batch
                feed_dict = {
                    self.user_input: bat_users,
                    self.item_input: bat_items,
                    self.item_input_recents: bat_items_recents,
                    self.lables: bat_lables,
                }

                fetched = self.sess.run((self.loss, self.optimizer),
                                        feed_dict=feed_dict)
                total_loss += fetched[0]

            elapsed = time() - training_start_time
            print("[iter %d : loss : %f, time: %f]" %
                  (epoch + 1, total_loss / num_training_instances, elapsed))
            if epoch % self.verbose == 0:
                Evaluate.test_model(self, self.dataset)
示例#9
0
文件: IRGAN.py 项目: skriser/NeuRec
 def train_model(self):
     """Alternate discriminator and generator rounds for ``self.epochs`` epochs.

     Each epoch runs ``self.d_epoch`` discriminator rounds, each on freshly
     fetched training data, followed by ``self.g_epoch`` generator rounds.
     The model is evaluated after every generator round.
     """
     for _epoch in range(self.epochs):
         # Discriminator rounds: new training data for each round.
         for _d_round in range(self.d_epoch):
             users_list, items_list, labels_list = self.get_train_data()
             self.training_discriminator(
                 users_list, items_list, labels_list)

         # Generator rounds, each followed by an evaluation.
         for _g_round in range(self.g_epoch):
             self.training_generator()
             Evaluate.test_model(self, self.dataset)
示例#10
0
	def run(self, data_filename, do_eval=None, output_dir=None): 
		"""Run the ExreadCluster pipeline (no edge information).

		Loads the rows, clusters their initial embeddings, fine-tunes the
		embeddings with the embedding module, then clusters the fine-tuned
		embeddings again and post-processes the resulting labels.

		Parameters:
			data_filename: passed to FileLoader, which yields
				(row_header, rows).
			do_eval: when not None, it is forwarded to Evaluate as the third
				argument, and the embeddings are pickled before and after
				fine-tuning.
			output_dir: directory for the "emb_before" / "emb_after" pickle
				files; also forwarded to the embedding module.
				NOTE(review): with do_eval set and output_dir left as None,
				os.path.join would fail -- confirm callers pass both.
		Returns:
			Tuple (row_header, rows, labels, c_emb, emb_tensor, cls_loss,
			accuracies). labels, c_emb and cls_loss come from the final
			clustering pass (labels after post_processing), emb_tensor is
			the fine-tuned embedding, and accuracies holds the Evaluate
			results (empty when do_eval is None).
		"""
		accuracies = []
		# Get data.
		row_header, rows = FileLoader(data_filename)
		target_rows, aux_rows, gold, spans = self.get_basics(row_header, rows)

		# Generate batches.
		row_iter, aux_c_sizes, aux_weights = DataLoader(target_rows, aux_rows, 
			self.batch_size, self.w2v_model)

		# Get the initial embedding tensor as the average w2v embedding.
		emb_tensor = self.w2v_model.get_emb_matrix(target_rows)
		if do_eval is not None:
			# Snapshot the embeddings before fine-tuning.
			with open(os.path.join(output_dir, "emb_before"), "wb") as handle:
				pickle.dump((emb_tensor, spans, gold), handle, 
					protocol=pickle.HIGHEST_PROTOCOL)
		
		# Run the base cluster module
		print("Run 1st module: clustering algorithm")
		c_emb, labels, cls_loss = self.cluster_module.run(emb_tensor)
	
		# Run the embedding fine-tuning module
		# NOTE(review): emb_labels is not used afterwards in this method.
		print("Run 2nd module: refine embedding")
		enc, emb_loss, emb_labels = self.emb_module.run(aux_c_sizes, row_iter,
			LabelLoader(labels, self.batch_size), c_emb, spans, gold,
			output_dir, aux_weights=aux_weights)
		print("****cluster loss: %f; emb loss:%f****" % (cls_loss, emb_loss))
		
		# Update embedding tensor
		emb_tensor = enc.data

		if do_eval is not None:
			# Scores the labels from the first clustering pass (labels has
			# not been reassigned since then).
			accuracies.append(Evaluate(labels, gold, do_eval))	
		
		print("Run 3rd module: refinement by clustering algorithm")
		# Final refinement
		c_emb, labels, cls_loss = self.cluster_module.run(emb_tensor)		
		labels = self.post_processing(row_header, rows, labels)

		if do_eval is not None:
			# Score the final labels and snapshot the fine-tuned embeddings.
			accuracies.append(Evaluate(labels, gold, do_eval))	
			with open(os.path.join(output_dir, "emb_after"), "wb") as handle:
				pickle.dump((emb_tensor, spans, gold), handle, 
					protocol=pickle.HIGHEST_PROTOCOL)

		return row_header, rows, labels, c_emb, emb_tensor, cls_loss, accuracies
示例#11
0
    def train_model(self):
        """Train on user batches with an annealed weight fed to the graph.

        Each epoch permutes the user ids, builds a binary user-by-item matrix
        per batch from ``self.dataset.trainDict``, and feeds it together with
        a keep probability of 0.5 and the current anneal value. The anneal
        value is ``min(anneal_cap, update_count / total_anneal_steps)`` when
        ``total_anneal_steps`` is positive, otherwise ``anneal_cap``.
        ``update_count`` grows by one per optimizer step across all epochs.
        """
        update_count = 0.0
        for epoch in range(self.num_epochs):
            random_perm_doc_idx = np.random.permutation(self.num_users)
            self.total_batch = self.num_users
            total_loss = 0.0
            training_start_time = time()
            num_training_instances = self.num_users
            batch_count = int(num_training_instances / self.batch_size)
            for num_batch in np.arange(batch_count):
                start = num_batch * self.batch_size
                final_index = self.total_batch - 1
                if num_batch < final_index:
                    batch_set_idx = random_perm_doc_idx[
                        start:(num_batch + 1) * self.batch_size]
                elif num_batch == final_index:
                    batch_set_idx = random_perm_doc_idx[start:]

                if self.total_anneal_steps > 0:
                    ramp = 1. * update_count / self.total_anneal_steps
                    anneal = min(self.anneal_cap, ramp)
                else:
                    anneal = self.anneal_cap

                # One row per user in the batch, 1 for every training item.
                batch_matrix = np.zeros((len(batch_set_idx), self.num_items))
                trainDict = self.dataset.trainDict
                for row, userid in enumerate(batch_set_idx):
                    for itemid in trainDict[userid]:
                        batch_matrix[row, itemid] = 1

                feed_dict = {
                    self.input_ph: batch_matrix,
                    self.keep_prob_ph: 0.5,
                    self.anneal_ph: anneal,
                    self.is_training_ph: 1,
                }
                fetched = self.sess.run([self.optimizer, self.loss],
                                        feed_dict=feed_dict)
                total_loss += fetched[1]

                update_count += 1
            elapsed = time() - training_start_time
            print("[iter %d : loss : %f, time: %f]" %
                  (epoch + 1, total_loss / num_training_instances, elapsed))
            if epoch % self.verbose == 0:
                Evaluate.test_model(self, self.dataset)
示例#12
0
    def evaluate(self, verbose=0):
        """Fit an ``Evaluate`` instance and keep its results on ``self``.

        The evaluator is built from the model names, training data,
        predictions and the ``'evaluate'`` section of ``self.config``. After
        fitting, its ``metrics`` and ``boundary_points`` are stored on this
        object under the same names.
        """
        evaluator = Evaluate(
            self.model_names,
            self.X_train,
            self.y_preds,
            self.config['evaluate'],
            verbose=verbose,
        )
        evaluator.fit()

        self.metrics = evaluator.metrics
        self.boundary_points = evaluator.boundary_points


#             self.y_preds[model] = clustering(self.models[model], self.X_train, adjust_label = adjust_label, verbose=verbose)
示例#13
0
	def __init__(self, args, data, ckpt_path):
		"""Store the configuration, build the graph and create the evaluator.

		``data`` supplies ``word_size``, ``pos_size``, ``type_size`` and the
		``pretrained`` embedding matrix handed to ``_build_model``.
		"""
		self.opt = args
		# Fixed model dimensions.
		self.num_steps = 120
		self.num_class = 2
		# Sizes taken from the data object.
		self.word_num = data.word_size
		self.ckpt_path = ckpt_path
		self.pos_size = data.pos_size
		self.type_size = data.type_size
		self.util = Util()

		# Progress message is written around the graph construction.
		sys.stdout.write('Building Graph ')
		self._build_model(args, embedding_matrix=data.pretrained)
		sys.stdout.write('graph built\n')

		self.eval = Evaluate()
示例#14
0
    def test(self, data, labels):
        """Predict a label for every document and compute evaluation metrics.

        Args:
            data: iterable of documents; each one is passed to
                ``self.predict``, whose second return value is taken as the
                predicted label.
            labels: true labels, passed to ``Evaluate`` with the predictions.

        Returns:
            Tuple ``(predictions, metrics)`` where ``metrics`` is a dict with
            the keys ``'macro'`` and ``'micro'``.

        Changes over the previous version: the ``cnt_correct`` counter was
        computed but never used and has been removed, and the evaluator is
        no longer bound to a local named ``eval``, which shadowed the builtin.
        """
        predictions = [prediction for _, prediction in map(self.predict, data)]

        evaluator = Evaluate(predictions, labels, self.num_of_cls)
        metrics = {
            'macro': evaluator.calc_macro_metrics(),
            'micro': evaluator.calc_micro_metrics(),
        }
        return predictions, metrics
def make_table(ordered_algorithms, evaluation_functions):
    """Build a PrettyTable with one row per algorithm.

    The first column is 'Algorithm'; one further column is added per
    ``(name, rank)`` pair in ``evaluation_functions``, titled with
    ``Evaluate.str_mean(name, rank)``. Each entry of ``ordered_algorithms``
    provides the row values at index 0 and the algorithm object at index 1.
    """
    table = PrettyTable()
    table.add_column('Algorithm', [])

    column_titles = [
        Evaluate.str_mean(name, rank) for name, rank in evaluation_functions
    ]
    for title in column_titles:
        table.add_column(title, [])

    for info in ordered_algorithms:
        row = [info[1].get_name()]
        row.extend(info[0])
        table.add_row(row)
    return table
示例#16
0
    def train_model(self):
        """Train on user batches with a per-epoch random corruption mask.

        Each epoch draws a binomial mask of shape (num_users, num_items) with
        success probability ``1 - self.corruption_level`` and permutes the
        user ids. For every batch a binary user-by-item matrix is built from
        ``self.dataset.trainDict`` and fed with the matching mask rows.
        """
        for epoch in range(self.num_epochs):
            # Generate training instances
            mask_shape = (self.num_users, self.num_items)
            mask_corruption_np = np.random.binomial(
                1, 1 - self.corruption_level, mask_shape)
            random_perm_doc_idx = np.random.permutation(self.num_users)
            self.total_batch = self.num_users
            total_loss = 0.0
            training_start_time = time()
            num_training_instances = self.num_users
            batch_count = int(num_training_instances / self.batch_size)
            for num_batch in np.arange(batch_count):
                start = num_batch * self.batch_size
                final_index = self.total_batch - 1
                if num_batch < final_index:
                    batch_set_idx = random_perm_doc_idx[
                        start:(num_batch + 1) * self.batch_size]
                elif num_batch == final_index:
                    batch_set_idx = random_perm_doc_idx[start:]

                # One row per user in the batch, 1 for every training item.
                batch_matrix = np.zeros((len(batch_set_idx), self.num_items))
                trainDict = self.dataset.trainDict
                for row, userid in enumerate(batch_set_idx):
                    for itemid in trainDict[userid]:
                        batch_matrix[row, itemid] = 1

                feed_dict = {
                    self.mask_corruption: mask_corruption_np[batch_set_idx, :],
                    self.input_R: batch_matrix,
                }
                fetched = self.sess.run([self.optimizer, self.loss],
                                        feed_dict=feed_dict)
                total_loss += fetched[1]
            elapsed = time() - training_start_time
            print("[iter %d : loss : %f, time: %f]" %
                  (epoch + 1, total_loss / num_training_instances, elapsed))
            if epoch % self.verbose == 0:
                Evaluate.test_model(self, self.dataset)
示例#17
0
    def train_model(self):
        """Train on every (user batch, item batch) pair of the rating matrix.

        Each epoch permutes the user and item ids and walks all
        ``num_batch_U`` x ``num_batch_I`` combinations; the last batch on
        each axis takes all remaining ids. For every pair, positive and
        negative samples come from ``self.pairwise_neg_sampling`` and one
        optimizer step is run. The epoch's accumulated loss and elapsed time
        are printed, and the model is evaluated whenever
        ``epoch % self.verbose == 0``.

        Fixes over the previous version:
        * ``total_loss`` is accumulated after every optimizer step. It was
          previously updated once per user batch, outside the item loop, so
          only the last item batch counted (and ``loss`` was undefined when
          ``num_batch_I`` was 0).
        * The unused ``input_tmp`` sub-matrix is no longer computed.
        * The user-side slice of ``train_R`` is taken once per user batch.
        """
        for epoch in range(self.num_epochs):
            random_row_idx = np.random.permutation(self.num_users)  # randomly permute the rows
            random_col_idx = np.random.permutation(self.num_items)  # randomly permute the cols
            training_start_time = time()
            total_loss = 0.0
            for i in range(self.num_batch_U):  # iterate each user batch
                row_start = i * self.batch_size
                if i == self.num_batch_U - 1:
                    row_idx = random_row_idx[row_start:]
                else:
                    row_idx = random_row_idx[row_start:row_start + self.batch_size]

                # Does not depend on the item batch, so slice it once here.
                input_R_U = self.train_R[row_idx, :]
                row_idx_column = np.reshape(row_idx, (len(row_idx), 1))

                for j in range(self.num_batch_I):
                    # get the indices of the current item batch
                    col_start = j * self.batch_size
                    if j == self.num_batch_I - 1:
                        col_idx = random_col_idx[col_start:]
                    else:
                        col_idx = random_col_idx[col_start:col_start + self.batch_size]

                    p_input, n_input = self.pairwise_neg_sampling(row_idx, col_idx)
                    input_R_I = self.train_R[:, col_idx]
                    _, loss = self.sess.run(  # do the optimization by the minibatch
                        [self.optimizer, self.cost],
                        feed_dict={
                            self.input_R_U: input_R_U,
                            self.input_R_I: input_R_I,
                            self.input_OH_I: self.I_OH_mat[col_idx, :],
                            self.input_P_cor: p_input,
                            self.input_N_cor: n_input,
                            self.row_idx: row_idx_column,
                            self.col_idx: np.reshape(col_idx, (len(col_idx), 1))})
                    # Count every minibatch, not only the last one per user batch.
                    total_loss += loss
            print("[iter %d : total_loss : %f, time: %f]" % (epoch + 1, total_loss, time() - training_start_time))
            if epoch % self.verbose == 0:
                self.eval_rating_matrix()
                Evaluate.test_model(self, self.dataset)
示例#18
0
def run_evaluation(dataset_path, predictors, additional_roots=None, max_number_of_queries=None, folds_num=5,
                   evaluation_functions=(('precision', 1), ('precision', 3), ('precision', 5), ('ndcg', 1),
                                         ('ndcg', 3), ('ndcg', 5), ('dcg', 1), ('dcg', 3), ('dcg', 5))):
    """Score every predictor on every fold and average the metrics.

    For each fold yielded by ``load.load_dataset`` each predictor is trained
    and queried through ``learn_predict``, and ``Evaluate.mean`` is added up
    per ``(func_type, rank)`` pair in ``evaluation_functions``.

    Returns a list with one array per predictor, holding the accumulated
    metric values divided by ``folds_num``.
    """
    metric_count = len(evaluation_functions)
    totals = [np.zeros(metric_count) for _ in range(len(predictors))]

    folds = load.load_dataset(dataset_path, additional_roots, max_number_of_queries, folds_num)
    for fold in folds:
        (x_train, y_train, id_train), (x_test, y_test, id_test) = fold

        for predictor_pos, predictor in enumerate(predictors):
            y_pred = predictor.learn_predict(x_train, y_train, x_test)

            predictor_totals = totals[predictor_pos]
            for metric_pos, (func_type, rank) in enumerate(evaluation_functions):
                predictor_totals[metric_pos] += Evaluate.mean(
                    func_type, rank, y_test, y_pred, id_test)

    return [total / folds_num for total in totals]
示例#19
0
def algorithm_evaluation():
    """Compare the ISVT and NMF matrix-completion results and plot their MSE.

    Loads the ratings, fills the matrix with both algorithms, evaluates each
    once with ``Evaluate(5, 10)``, then repeats the evaluation with
    ``plot=False`` for hold-out values 5, 10, 15 and 20 and plots the
    resulting errors for both algorithms.
    """
    reader = ReadData(500000, 1000, 100)
    sparse_ratings, books_used, deleted_users = reader.load_ratings_data()

    # Obtain the filled matrix using iterative singular value thresholding and view mse per iteration
    algorithms = Algorithms(sparse_ratings)
    ratings_with_nan, filled_ratings_isvt = algorithms.isvt()
    baseline = Evaluate(5, 10)
    mse_isvt = baseline.performance_eval_isvt(ratings_with_nan,
                                              filled_ratings_isvt)

    # Obtain the filled matrix using non-negative matrix factorization and view mse per iteration
    filled_ratings_nmf = algorithms.nmf()
    mse_nmf = baseline.performance_eval_nmf(sparse_ratings, filled_ratings_nmf)

    # Vary hold-out set and find average mse for each algorithm
    hold_out_values = [5, 10, 15, 20]
    isvt_errors = []
    nmf_errors = []
    for hold_out in hold_out_values:
        sweep = Evaluate(5, hold_out)
        mse_isvt = sweep.performance_eval_isvt(
            ratings_with_nan, filled_ratings_isvt, plot=False)
        isvt_errors.append(mse_isvt)

        mse_nmf = sweep.performance_eval_nmf(
            sparse_ratings, filled_ratings_nmf, plot=False)
        nmf_errors.append(mse_nmf)

    plt.plot(hold_out_values, isvt_errors, label="Soft Impute")
    plt.plot(hold_out_values, nmf_errors, label="NMF")
    plt.xlabel("Hold-Out Set %")
    plt.ylabel("Mean Squared Error")
    plt.legend()
    plt.show()
def clusteringDCT(pred_true_txt_ind_prevPreds, wordVectorsDic, batchDocs,
                  maxPredLabel):
    """Re-cluster part of the items with Ward and spectral clustering.

    The input rows are split by extrcatLargeClusterItems into a part to
    re-cluster and a part to leave alone. The texts of the first part are
    stop-word filtered and vectorized, then clustered twice (agglomerative
    Ward and spectral). Each result is passed to Evaluate on its own and
    again combined with the relabelled "not cluster" rows.

    Columns 0-4 of each row are read as predicted label, true label, text,
    index and previous prediction. batchDocs and maxPredLabel are not used
    in the body, and nothing is returned.

    NOTE(review): despite the name, the active path uses
    generate_sent_vecs_toktextdata; the DCT variant appears only inside the
    disabled string blocks below -- confirm that this is intended.
    """
    print("#m-stream-cleaned")
    Evaluate(pred_true_txt_ind_prevPreds)

    pred_true_text_ind_prevPreds_to_cluster, pred_true_text_ind_prevPreds_to_not_cluster = extrcatLargeClusterItems(
        pred_true_txt_ind_prevPreds)
    print("3 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][3]))
    print("4 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][4]))
    # Disabled debugging code, kept as a bare string literal (never executed).
    '''minPredToC, maxPredToC, minTrueToC, maxTrueToC=findMinMaxLabel(pred_true_text_ind_prevPreds_to_cluster)
  print("minPred, maxPred, minTrue, maxTrue=(pred_true_text_ind_prevPreds_to_cluster)") 
  print(minPredToC, maxPredToC, minTrueToC, maxTrueToC)
  
  minPredToNC, maxPredToNC, minTrueToNC, maxTrueToNC=findMinMaxLabel(pred_true_text_ind_prevPreds_to_not_cluster)
  print("minPred, maxPred, minTrue, maxTrue=(pred_true_text_ind_prevPreds_to_not_cluster)") 
  print(minPredToNC, maxPredToNC, minTrueToNC, maxTrueToNC)'''

    # all_pred_clusters is computed but not used afterwards.
    all_pred_clusters = len(groupTxtByClass(pred_true_txt_ind_prevPreds,
                                            False))
    pred_clusters = len(
        groupTxtByClass(pred_true_text_ind_prevPreds_to_cluster, False))
    non_pred_clusters = len(
        groupTxtByClass(pred_true_text_ind_prevPreds_to_not_cluster, False))

    print("#clusters=" + str(pred_clusters))
    print("#not clusters=" + str(non_pred_clusters))
    print("this clustering with embedding DCT")
    # The target cluster count becomes the difference of the two group
    # counts. NOTE(review): this is not guaranteed to be positive from what
    # is visible here -- confirm the intended formula.
    pred_clusters = non_pred_clusters - pred_clusters
    print("#update clusters=" + str(pred_clusters))

    # Split the rows to re-cluster into their five columns.
    nparr = np.array(pred_true_text_ind_prevPreds_to_cluster)
    print("3 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][3]))
    print("4 rd=" + str(pred_true_text_ind_prevPreds_to_cluster[0][4]))
    preds = list(nparr[:, 0])
    trues = list(nparr[:, 1])
    texts = list(nparr[:, 2])
    inds = list(nparr[:, 3])
    prevPreds = list(nparr[:, 4])

    skStopWords = getScikitLearn_StopWords()
    texts = processTextsRemoveStopWordTokenized(texts, skStopWords)
    # The four string literals below are disabled experiments (DCT vectors,
    # LSA, k-means, Ward on DCT vectors); they have no runtime effect.
    '''dicDocFreq=getDocFreq(texts)
  dctCoffs=1
  X=generate_sent_vecs_toktextdata_DCT(texts, wordVectorsDic, 300,dctCoffs)  
  #vectorizer = TfidfVectorizer(tokenizer=stem_text,max_df=0.5,min_df=1)
  #vectorizer = TfidfVectorizer(max_df=0.5,min_df=2, stop_words='english')
  #X = vectorizer.fit_transform(texts)'''
    '''svd = TruncatedSVD(50)
  #svd = PCA(n_components=50)	
  normalizer = Normalizer(copy=False)
  lsa = make_pipeline(svd, normalizer)
  #X=X.toarray()	
  X = lsa.fit_transform(X)'''
    '''km = KMeans(n_clusters=pred_clusters, init='k-means++', max_iter=100,random_state=0)	
  km.fit(X)
  list_km_pred_true_text=combine_pred_true_txt_from_list(km.labels_, trues, texts)
  print("#k-means")	
  Evaluate(list_km_pred_true_text)'''
    '''ward = AgglomerativeClustering(n_clusters=pred_clusters, linkage='ward').fit(X)
  list_hr_pred_true_text=combine_pred_true_txt_from_list(ward.labels_, trues, texts)
  print("#hr-ward-DCT")
  print(min(ward.labels_), max(ward.labels_))
  pred_true_text_ind_prevPreds_to_not_cluster_hr=change_pred_label(pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters+1)  
  Evaluate(list_hr_pred_true_text)
  Evaluate(list_hr_pred_true_text+pred_true_text_ind_prevPreds_to_not_cluster_hr)
  '''

    # Active path: vectorize the texts, then run Ward clustering.
    X = generate_sent_vecs_toktextdata(texts, wordVectorsDic, 300)
    ward = AgglomerativeClustering(n_clusters=pred_clusters,
                                   linkage='ward').fit(X)
    list_hr_pred_true_text_ind_prevPred = np.column_stack(
        (ward.labels_, trues, texts, inds, prevPreds)).tolist()
    print("#hr-ward-AVG")
    pred_true_text_ind_prevPreds_to_not_cluster_hr = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    # Evaluate the re-clustered rows alone, then together with the rest.
    Evaluate(list_hr_pred_true_text_ind_prevPred)
    Evaluate(list_hr_pred_true_text_ind_prevPred +
             pred_true_text_ind_prevPreds_to_not_cluster_hr)
    #print_by_group(list_hr_pred_true_text+pred_true_text_ind_prevPreds_to_not_cluster_hr)

    # Same procedure with spectral clustering on the same vectors.
    print("#spectral-avg")
    clustering = SpectralClustering(n_clusters=pred_clusters,
                                    assign_labels="discretize",
                                    random_state=0).fit(X)
    list_sp_pred_true_text_ind_prevPred = np.column_stack(
        (clustering.labels_, trues, texts, inds, prevPreds)).tolist()
    pred_true_text_ind_prevPreds_to_not_cluster_spec = change_pred_label(
        pred_true_text_ind_prevPreds_to_not_cluster, pred_clusters + 1)
    Evaluate(list_sp_pred_true_text_ind_prevPred)
    Evaluate(list_sp_pred_true_text_ind_prevPred +
             pred_true_text_ind_prevPreds_to_not_cluster_spec)
示例#21
0
    def randomization_test(self, labels, y1, y2, epoch=1000):
        """Estimate a p-value for the micro-F1 gap between two prediction lists.

        The observed statistic is ``abs(f1(y1) - f1(y2))``. In each of
        ``epoch`` rounds the two predictions are swapped per position when a
        uniform draw exceeds 0.5, and the statistic is recomputed. The
        returned value is ``(count + 1) / (epoch + 1)``, where ``count`` is
        the number of rounds whose statistic is strictly larger than the
        observed one.
        """
        import random

        def micro_f1(predicted):
            # Micro-averaged F1 of one prediction list against the labels.
            scorer = Evaluate(labels, predicted, self.num_of_cls)
            return scorer.calc_micro_metrics()['f1']

        observed_first = micro_f1(y1)
        observed_second = micro_f1(y2)
        s = abs(observed_first - observed_second)

        cnt = 0
        for _ in range(0, epoch):
            temp_y1 = []
            temp_y2 = []
            for idx in range(len(labels)):
                swap = random.uniform(0, 1) > 0.5
                first, second = (y2, y1) if swap else (y1, y2)
                temp_y1.append(first[idx])
                temp_y2.append(second[idx])

            shuffled_first = micro_f1(temp_y1)
            shuffled_second = micro_f1(temp_y2)
            if abs(shuffled_first - shuffled_second) > s:
                cnt += 1

        return (cnt + 1) / (epoch + 1)
            
示例#22
0
class BiLstm(object):
	"""Bidirectional LSTM that emits a 2-class decision for every step of a sequence.

	The TensorFlow graph is built in the constructor. ``train`` fits it and
	checkpoints the epoch with the lowest validation loss; ``predict`` scores
	the test split, optionally post-processing the output with an ILP solver.
	"""

	def __init__(self,args,data,ckpt_path): #seq_len,xvocab_size, label_size,ckpt_path,pos_size,type_size,data
		"""Store the configuration and build the graph.

		Args:
			args: Option object. This class reads ``use_tree``, ``hidden_size``,
				``num_layers``, ``learn_rate``, ``epochs``, ``model_name`` and
				``use_ilp`` from it.
			data: Dataset object providing ``word_size``, ``pos_size``,
				``type_size`` and ``pretrained``.
			ckpt_path: Prefix prepended to ``model_name + '.ckpt'`` for checkpoints.
		"""
		self.opt = args
		self.num_steps = 120  # fixed (padded) sequence length of every placeholder
		self.num_class = 2
		self.word_num = data.word_size
		self.ckpt_path=ckpt_path
		self.pos_size=data.pos_size
		self.type_size=data.type_size
		self.util= Util()
		sys.stdout.write('Building Graph ')
		self._build_model(args,embedding_matrix=data.pretrained)
		sys.stdout.write('graph built\n')
		self.eval=Evaluate()

	def _build_model(self,flags,embedding_matrix):
		"""Create placeholders, the stacked bidirectional LSTM, the loss and the train op.

		Args:
			flags: Same option object as ``self.opt``.
			embedding_matrix: Pretrained word embeddings, or ``None`` to create a
				fresh ``emb`` variable.
		"""
		# NOTE(review): tf.concat(axis, values), tf.unpack and tf.nn.seq2seq are
		# pre-1.0 TensorFlow APIs - confirm the targeted TensorFlow version.
		tf.reset_default_graph()
		tf.set_random_seed(123)
		self.input=tf.placeholder(shape=[None,self.num_steps], dtype=tf.int64)
		self.length = tf.placeholder(shape=[None,], dtype=tf.int64)
		self.pos=tf.placeholder(shape=[None,self.num_steps], dtype=tf.int64)
		self.type=tf.placeholder(shape=[None,self.num_steps], dtype=tf.int64)
		# One target / weight placeholder per step, as sequence_loss expects lists.
		self.target = [tf.placeholder(shape=[None, ], dtype=tf.int64, name='li_{}'.format(t)) for t in range(self.num_steps)]
		self.weight = [tf.placeholder(shape=[None, ], dtype=tf.float32, name='wi_{}'.format(t)) for t in range(self.num_steps)]
		self.keep_prob = tf.placeholder(tf.float32)  # drop out

		if embedding_matrix is not None:
			self.embedding = tf.Variable(embedding_matrix, trainable=True, name="emb",dtype=tf.float32)#
		else:
			# NOTE(review): self.emb_dim is never assigned anywhere in this class,
			# so this branch raises AttributeError unless it is set elsewhere.
			self.embedding = tf.get_variable("emb", [self.word_num, self.emb_dim])
		self.inputs_emb = tf.nn.embedding_lookup(self.embedding, self.input)
		if flags.use_tree:
			# Append 40-dimensional POS and dependency-type embeddings to each word vector.
			pos_embedding = tf.get_variable('pos_embed', [self.pos_size, 40], dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=1e-4))
			type_embedding = tf.get_variable('type_embed', [self.type_size, 40], dtype=tf.float32,initializer=tf.truncated_normal_initializer(stddev=1e-4))
			pos_inputs = tf.nn.embedding_lookup(pos_embedding, self.pos)
			type_inputs = tf.nn.embedding_lookup(type_embedding, self.type)
			self.inputs_emb = tf.concat(2, [self.inputs_emb, pos_inputs,type_inputs])

		cell = tf.nn.rnn_cell.LSTMCell(num_units=flags.hidden_size, state_is_tuple=True)
		dropout_cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
		stacked_cell= tf.nn.rnn_cell.MultiRNNCell([dropout_cell] * self.opt.num_layers, state_is_tuple=True)
		outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw=stacked_cell,cell_bw=stacked_cell,dtype=tf.float32,sequence_length=self.length,inputs=self.inputs_emb)
		output_fw, output_bw = outputs
		output=	tf.concat(2, [output_fw,output_bw])
		soft_dim=self.opt.hidden_size*2
		self.softmax_w = tf.get_variable("softmax_w", [soft_dim, self.num_class])
		self.softmax_b = tf.get_variable("softmax_b", [self.num_class])
		# Flatten to (batch * steps, soft_dim) so one matmul scores every step.
		output=tf.reshape(output,[-1,soft_dim])
		self.logits = tf.matmul(output, self.softmax_w) + self.softmax_b
		self.decode_outputs_test = tf.nn.softmax(self.logits)
		self.decode_outputs_test=tf.reshape(self.decode_outputs_test,[-1,self.num_steps,self.num_class])
		#states_fw, states_bw = states
		self.classify_out=tf.reshape(self.logits,[-1,self.num_steps,self.num_class])
		# sequence_loss wants a per-step list, hence transpose to step-major and unpack.
		self.logits= tf.transpose(self.classify_out, [1, 0, 2])
		self.logits=tf.unpack(self.logits,axis=0)
		self.loss = tf.nn.seq2seq.sequence_loss(self.logits, self.target, self.weight, self.num_class)
		self.train_op = tf.train.AdamOptimizer(learning_rate=self.opt.learn_rate).minimize(self.loss)

	# Training and Evaluation
	def train(self, data, sess=None):
		"""Train for ``self.opt.epochs`` epochs, checkpointing the best validation loss.

		Args:
			data: Dataset object exposing ``train`` and ``valid`` splits.
			sess: Existing session to continue from; a new one is created and
				initialised when omitted.

		Returns:
			The session when training is interrupted with Ctrl-C, otherwise ``None``.
		"""
		saver = tf.train.Saver()
		if not sess:
			sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=2)) # create a session
			sess.run(tf.global_variables_initializer()) 		# init all variables
		sys.stdout.write('\n Training started ...\n')
		# Start at +inf so a checkpoint is written even when every validation
		# loss is large; a fixed starting value of 100 could leave no checkpoint.
		best_loss=float('inf')
		best_epoch=0
		t1=time.time()
		for i in range(self.opt.epochs):
			try:
				loss,_=self.run_epoch(sess,data,data.train,True)
				val_loss,pred= self.run_epoch(sess, data,data.valid,False)
				t2=time.time()
				print('epoch:%2d \t time:%.2f\tloss:%f\tvalid_loss:%f'%(i,t2-t1,loss,val_loss))
				t1=time.time()
				if val_loss<best_loss:
					saver.save(sess, self.ckpt_path + self.opt.model_name + '.ckpt')
					best_loss=val_loss
					best_epoch=i
				sys.stdout.flush()
			except KeyboardInterrupt:  # this will most definitely happen, so handle it
				print('Interrupted by user at iteration {}'.format(i))
				self.session = sess
				return sess
		# The tracked quantity is the validation loss, so label it as such.
		print('best valid loss:%f\tbest epoch:%d'%(best_loss,best_epoch))

	# prediction
	def predict(self, data, sess):
		"""Score ``data.test`` and print accuracy, F1 and the two ratios from ``self.eval``.

		Args:
			data: Dataset object exposing a ``test`` split indexable by
				``'target'``, ``'weight'``, ``'length'``, ``'dfather'``, ``'dtype'``.
			sess: Session holding the trained variables.
		"""
		_, predicts = self.run_epoch(sess, data, data.test, False)
		if self.opt.use_ilp:
			pred = self.ilp_solution(predicts , data.test['weight'], data.test['length'], data.test['dfather'], data.test['dtype'])
		else:
			pred= np.argmax(predicts, axis=2)

		acc, f1, pratio, gratio=self.eval.values(pred,data.test['target'],data.test['weight'])
		print('accuary:%f,f1:%f,pratio:%f,gratio:%f' %(acc,f1,pratio,gratio))

	def run_epoch(self, sess, data,data_type,is_train):
		"""Run one pass over a split.

		Args:
			sess: Session to execute in.
			data: Dataset object providing ``gen_batch_num`` and ``gen_batch``.
			data_type: The split handed through to the dataset object.
			is_train: When true, run the train op with keep_prob 0.8; otherwise
				evaluate only with keep_prob 1.

		Returns:
			``(mean batch loss, softmax outputs of all batches concatenated)``;
			the second element is ``None`` when the split yields no batch.
		"""
		losses = []
		batch_predictions = []
		keep_prob = 0.8 if is_train else 1.
		for i in range(data.gen_batch_num(data_type)):
			inputs, target, weight, length, pos, dtype,dfather,sent,compressed=data.gen_batch(data_type, i)
			feed_dict = self.get_feed(inputs, target, weight, length, pos, dtype, keep_prob=keep_prob)
			if is_train:
				_, loss_v, predict = sess.run([self.train_op, self.loss, self.decode_outputs_test], feed_dict)
			else:
				loss_v, predict= sess.run([self.loss, self.decode_outputs_test], feed_dict)
			losses.append(loss_v)
			batch_predictions.append(predict)
		# Concatenate once at the end instead of re-copying the growing array per batch.
		predicts = np.concatenate(batch_predictions) if batch_predictions else None
		return np.mean(losses),predicts

	def ilp_solution(self,predict,batchW,batchL,fathers,types):
		"""Post-process per-step class-1 scores with the ILP solver.

		Args:
			predict: Array indexed as ``[sentence, step, class]``.
			batchW: Per-sentence weights; entries equal to 1 are counted to get
				the sentence length.
			batchL: Unused.
			fathers: Per-sentence parent indices (a leading 0 is inserted here).
			types: Per-sentence dependency types.

		Returns:
			numpy array with one row per sentence, zero-padded to ``self.num_steps``.
		"""
		Myilp = Ilp()
		pred_label = predict[:, :, 1]
		pred = []
		batchW_temp = np.array(batchW,copy=True)
		for j in range(pred_label.shape[0]):
			size = sum(batchW_temp[j] == 1)  #
			curr_label = pred_label[j][:size]
			curr_fathers = [int(f) for f in fathers[j]]
			curr_fathers.insert(0, 0)
			dep_length = self.util.caculate_length(curr_fathers)
			dep_length=dep_length[1:]
			curr_types = types[j][:]
			saved_types = self.util.get_typelist(curr_fathers, curr_types)
			_, retained, values = Myilp.solve_ilp_problem(size, curr_label, dep_length=dep_length, parents=curr_fathers,saved=saved_types)
			# Pad to the graph's sequence length rather than a literal 120.
			values.extend([0] * (self.num_steps - len(values)))
			pred.append(values)
		pred = np.array(pred)
		return pred

	def restore_last_session(self):
		"""Open a new session and restore the checkpoint written by ``train``."""
		saver = tf.train.Saver()
		sess = tf.Session()  # create a session
		saver.restore(sess, self.ckpt_path + self.opt.model_name + '.ckpt')
		print('model restored')
		return sess

	def get_feed(self, input, target, weight, length, pos, dtype, keep_prob):
		"""Map one batch onto the graph placeholders.

		``target`` and ``weight`` are indexed by step (``target[t]`` feeds the
		t-th target placeholder), for every ``t`` below ``self.num_steps``.

		Returns:
			dict from placeholder to value, ready for ``sess.run``.
		"""
		feed_dict={self.input:input}
		feed_dict.update({self.target[t]: target[t] for t in range(self.num_steps)})
		feed_dict.update({self.weight[t]: weight[t] for t in range(self.num_steps)})
		feed_dict[self.pos]=pos
		feed_dict[self.type]=dtype
		feed_dict[self.length]=length
		feed_dict[self.keep_prob] = keep_prob  # dropout prob
		return feed_dict
def cluster_biterm(f,
                   list_pred_true_words_index,
                   c_bitermsFreqs=None,
                   c_totalBiterms=None,
                   c_wordsFreqs=None,
                   c_totalWords=None,
                   c_txtIds=None,
                   c_clusterVecs=None,
                   txtId_txt=None,
                   last_txtId=0,
                   max_c_id=0,
                   wordVectorsDic=None,
                   dic_clus__id=None,
                   dic_biterm__clusterId_Freq=None,
                   dic_biterm__allClusterFreq=None):
    """Assign each text to a cluster in one streaming pass and write the result.

    For every item the text's words are turned into biterms, the closest
    cluster is looked up with ``findCloseCluster`` and the cluster statistics
    are updated with ``populateClusterFeature``. Every 500 texts a subset of
    clusters is pruned; every 1000 texts an intermediate evaluation is printed.

    The module-level names ``embedDim`` and ``ignoreMinusOne`` are read here.

    Args:
        f: Writable text file; receives one ``cluster<TAB>true<TAB>words`` line
            per text (texts whose true label is ``'-1'`` are skipped when
            ``ignoreMinusOne`` is true).
        list_pred_true_words_index: Iterable of items where ``item[1]`` is the
            true label and ``item[2]`` the word list.
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs: Per-cluster state dictionaries keyed by cluster id.
        txtId_txt: Maps text id to its word list.
        last_txtId: Id of the last text processed by an earlier call.
        max_c_id: Largest cluster order id seen so far.
        wordVectorsDic: Unused.
        dic_clus__id: Maps cluster id to the order id recorded when the cluster
            was last assigned a text.
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq: Biterm
            statistics maintained by ``populateClusterFeature``.

    Returns:
        list: ``[c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords,
        c_txtIds, c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq]``.
    """
    # Dictionary defaults are created per call: a literal {} default would be
    # shared and mutated across every call that relies on it.
    c_bitermsFreqs = {} if c_bitermsFreqs is None else c_bitermsFreqs
    c_totalBiterms = {} if c_totalBiterms is None else c_totalBiterms
    c_wordsFreqs = {} if c_wordsFreqs is None else c_wordsFreqs
    c_totalWords = {} if c_totalWords is None else c_totalWords
    c_txtIds = {} if c_txtIds is None else c_txtIds
    c_clusterVecs = {} if c_clusterVecs is None else c_clusterVecs
    txtId_txt = {} if txtId_txt is None else txtId_txt
    dic_clus__id = {} if dic_clus__id is None else dic_clus__id
    if dic_biterm__clusterId_Freq is None:
        dic_biterm__clusterId_Freq = {}
    if dic_biterm__allClusterFreq is None:
        dic_biterm__allClusterFreq = {}

    print("cluster_bigram")

    current_txt_id = last_txtId
    eval_pred_treu_txt = []
    line_count = 0
    t11 = datetime.now()

    for item in list_pred_true_words_index:
        words = item[2]
        bi_terms = construct_biterms(words)

        current_txt_id += 1
        line_count += 1

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        # Sentence embeddings are disabled; a zero vector is passed instead.
        text_Vec = [0] * embedDim

        clusterId = findCloseCluster(c_bitermsFreqs, c_totalBiterms, c_txtIds,
                                     c_wordsFreqs, c_totalWords, c_clusterVecs,
                                     txtBitermsFreqs, bi_terms_len,
                                     txtWordsFreqs, words_len, max_c_id,
                                     text_Vec)

        max_c_id = max([max_c_id, clusterId, len(c_bitermsFreqs)])

        dic_clus__id[clusterId] = max_c_id
        txtId_txt[current_txt_id] = words

        (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords,
         c_clusterVecs, dic_biterm__clusterId_Freq,
         dic_biterm__allClusterFreq) = populateClusterFeature(
             c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
             c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
             txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
             dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq)

        eval_pred_treu_txt.append([clusterId, item[1], item[2]])
        # Skip only when filtering is switched on AND the true label is -1.
        if not (ignoreMinusOne == True and str(item[1]) == '-1'):
            f.write(
                str(clusterId) + "\t" + str(item[1]) + "\t" + str(item[2]) +
                "\n")

        if line_count % 500 == 0:
            print(len(dic_clus__id))
            # Delete old clusters whose size is an outlier.
            list_c_sizes = [len(txtIds) for txtIds in c_txtIds.values()]
            list_c_ids = [dic_clus__id[c_id] for c_id in c_txtIds]
            list_del_cids = []

            # statistics.stdev needs at least two data points; with fewer
            # clusters there is nothing meaningful to prune, so skip the step.
            if len(list_c_sizes) >= 2:
                mean_c_size = statistics.mean(list_c_sizes)
                std_c_size = statistics.stdev(list_c_sizes)

                mean_c_id = statistics.mean(list_c_ids)
                std_c_id = statistics.stdev(list_c_ids)

                print('preocess', line_count, 'texts', 'mean_c_size',
                      mean_c_size, 'std_c_size', std_c_size)
                print('preocess', line_count, 'texts', 'mean_c_id', mean_c_id,
                      'std_c_id', std_c_id)

                id_floor = float(abs(mean_c_id - std_c_id))
                size_floor = float(abs(mean_c_size - std_c_size))
                size_ceiling = mean_c_size + std_c_size

                for c_id, orderId in dic_clus__id.items():
                    if c_id not in c_txtIds:
                        continue
                    c_size = len(c_txtIds[c_id])
                    is_old = (float(c_id) <= id_floor
                              or float(orderId) <= id_floor)
                    is_size_outlier = (c_size <= 1
                                       or float(c_size) <= size_floor
                                       or float(c_size) >= size_ceiling)
                    if is_old and is_size_outlier:
                        list_del_cids.append(c_id)

            print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)',
                  len(c_bitermsFreqs))

            # c_clusterVecs and the biterm-level dictionaries are left untouched.
            for c_id in list_del_cids:
                del c_bitermsFreqs[c_id]
                del c_totalBiterms[c_id]
                del c_txtIds[c_id]
                del c_wordsFreqs[c_id]
                del c_totalWords[c_id]
                del dic_clus__id[c_id]

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_treu_txt',
                  len(eval_pred_treu_txt))
            Evaluate(eval_pred_treu_txt, ignoreMinusOne)

            t12 = datetime.now()
            t_diff = t12 - t11
            print("total time diff secs=", t_diff.seconds)

    last_txtId = current_txt_id
    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq
    ]
# Fresh clustering state handed to cluster_biterm below.
c_txtIds = {}
c_clusterVecs = {}
txtId_txt = {}
last_txtId = 0
max_c_id = 0
dic_clus__id = {}

dic_biterm__clusterId_Freq = {}
dic_biterm__allClusterFreq = {}

# The context manager closes the result file even when clustering raises,
# so buffered lines are flushed and the handle is not leaked.
with open(resultFile, 'w') as f:
    t11 = datetime.now()

    (c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
     c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
     dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq) = cluster_biterm(
         f, list_pred_true_words_index, c_bitermsFreqs, c_totalBiterms,
         c_wordsFreqs, c_totalWords, c_txtIds, c_clusterVecs, txtId_txt,
         last_txtId, max_c_id, wordVectorsDic, dic_clus__id,
         dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq)

    t12 = datetime.now()
    t_diff = t12 - t11
    print("total time diff secs=", t_diff.seconds)

# Re-read the file just written and score the final assignment.
listtuple_pred_true_text = ReadPredTrueText(resultFile, ignoreMinusOne)

print('result for', dataset)
Evaluate(listtuple_pred_true_text)
示例#25
0
class Inference(object):
    """Optimise variational parameters (and selected model parameters) for a model.

    The constructor unpacks ``data``, computes a MAP estimate of the latent
    trajectory with L-BFGS and uses it to initialise the variational parameters
    obtained from ``MeanFieldVI``. ``run`` then minimises the negated
    ``MeanFieldVI.forward`` value with SGD, printing and plotting progress.

    NOTE: this class uses Python 2 print statements.
    """

    def __init__(self,
                 data,
                 model,
                 model_params,
                 model_params_grad,
                 savedir,
                 num_obs_samples,
                 num_future_steps,
                 num_mc_samples,
                 ppc_window,
                 z_true=None,
                 true_model_params=None,
                 iters=1000):
        """Unpack the data, run the MAP initialisation and collect trainable params.

        Args:
            data: Indexable container. Entries 0-1 are used as training data,
                entry 2 as the test set (may be None), entries 4, 5 and 6 as
                ``y_future``, ``x_future`` and ``y_complete``.
            model: Object providing ``log_joint``; also passed to MeanFieldVI
                and Evaluate.
            model_params: Dict of model parameters, keyed like
                ``model_params_grad``.
            model_params_grad: Dict mapping parameter name to a flag; flagged
                parameters are added to the optimised set.
            savedir: Directory prefix for saved plots and text output.
            num_obs_samples: Forwarded to Evaluate.
            num_future_steps: Stored as ``self.num_future_steps``.
            num_mc_samples: Accepted but not used; see the note below.
            ppc_window: Stored as ``self.ppc_window``.
            z_true: Only read by the 'true' initialisation branch.
            true_model_params: Stored as ``self.true_model_params``.
            iters: Stored as ``self.iters`` (``optimize`` overwrites it).
        """
        self.data = data
        self.dim = self.data[1].size(2)
        self.T = self.data[1].size(0)
        self.model_params = model_params
        self.train_data = self.data[0:2]
        self.y_future = self.data[4]
        self.x_future = self.data[5]
        self.y_complete = self.data[6]

        # NOTE(review): this value is overwritten by the constructor argument a
        # few lines below, so the shape-derived count is never used.
        self.num_future_steps = self.y_future.shape[0]
        self.model = model
        self.savedir = savedir
        self.num_obs_samples = num_obs_samples
        self.num_future_steps = num_future_steps
        # NOTE(review): hard-coded to 1; the num_mc_samples argument is ignored.
        self.num_mc_samples = 1
        self.model_params_grad = model_params_grad
        self.true_model_params = true_model_params
        self.vi = MeanFieldVI(self.model, self.savedir, self.num_mc_samples)

        self.ppc_window = ppc_window
        self.isPPC = False

        init = 'map'  # 'true'
        # The MAP estimate is computed regardless of the init mode chosen above.
        self.init_z = self.map_estimate()
        if init == 'map':
            self.var_params = self.vi.init_var_params(self.T,
                                                      self.dim,
                                                      self.init_z,
                                                      grad=True)
        elif init == 'true':
            self.var_params = self.vi.init_var_params(self.T,
                                                      self.dim,
                                                      z_true,
                                                      grad=True)
        else:
            print 'specify valid init option.'
        self.iters = iters

        # Replaces the temporary list that map_estimate stored in opt_params.
        self.opt_params = {
            'var_mu': self.var_params[0],
            'var_log_scale': self.var_params[1]
        }
        # Model parameters flagged in model_params_grad are optimised too.
        for k, v in self.model_params_grad.items():
            if v == True:
                self.opt_params[k] = self.model_params[k]
        # self.var_params_model = self.vi.init_var_params_model()
        # self.opt_params['model_mu'] = self.var_params_model[0]
        # self.opt_params['model_log_scale'] = self.var_params_model[1]
        self.test = self.data[2]

        if self.test is None:
            # No test set: no evaluator; self.num_test is not set in this branch.
            self.ev = None
            self.num_train = self.data[0].shape[0]
        else:
            self.ev = Evaluate(self.data,
                               self.model,
                               savedir='',
                               num_obs_samples=self.num_obs_samples)
            self.num_test = self.data[2].shape[0]
            self.num_train = self.data[0].shape[0] - self.num_test

    def unpack_data(self, data):
        """Return ``(data[0], data[1])`` as ``(y, x)``."""
        y = data[0]
        x = data[1]
        return y, x

    def map_estimate(self):
        """Find a MAP estimate of the latent trajectory with L-BFGS.

        Runs 100 L-BFGS steps on the negated ``model.log_joint``, printing the
        objective and saving a plot of ``z`` to
        ``savedir + '/plots/curr_map_z.png'`` every fifth step. As a side
        effect ``self.opt_params`` is set to ``[z]`` and ``self.map_optimizer``
        to the optimiser.

        Returns:
            A detached clone of the optimised ``z`` (created with shape
            ``(self.T, self.dim)``).
        """
        # NOTE(review): an earlier remark here said "initialize to all ones =
        # smooth", but z is actually drawn with torch.rand - confirm intent.
        # `dtype` and `device` are module-level names defined outside this class.
        z = torch.tensor(torch.rand(self.T,
                                    self.dim,
                                    dtype=dtype,
                                    device=device),
                         requires_grad=True,
                         dtype=dtype,
                         device=device)
        y, x = self.unpack_data(self.data)
        self.map_iters = 100
        self.opt_params = [z]
        #self.map_optimizer =  torch.optim.Adam(self.opt_params, lr=1e-3)
        self.map_optimizer = torch.optim.LBFGS(self.opt_params)
        lbfgs = True
        for t in range(self.map_iters):

            # LBFGS.step needs a closure that re-evaluates the objective and
            # its gradients.
            def closure():
                self.map_optimizer.zero_grad()
                output = -self.model.log_joint(self.model_params, y, x, z)
                output.backward()
                return output

            if lbfgs:
                self.map_optimizer.step(closure)
                # Re-evaluate without gradients purely for the progress print.
                with torch.no_grad():
                    output = -self.model.log_joint(self.model_params, y, x, z)
            else:
                output = -self.model.log_joint(self.model_params, y, x, z)
                self.map_optimizer.zero_grad()
                output.backward()
                self.map_optimizer.step()
            if t % 5 == 0:
                print t, output.item()
            if t % 5 == 0:
                plt.cla()
                plt.plot(to_numpy(z))
                figure = plt.gcf()  # get current figure
                figure.set_size_inches(8, 6)
                plt.savefig(self.savedir + '/plots/curr_map_z.png')
        return self.opt_params[0].clone().detach()

    def run(self):
        """Create the SGD optimiser over ``self.opt_params`` and run 20000 iterations.

        Returns:
            Whatever ``optimize`` returns (the dict of detached parameters).
        """
        self.optimizer = torch.optim.SGD(self.opt_params.values(),
                                         momentum=0.99,
                                         lr=1e-10)  # .99, 1e-6
        #self.optimizer = torch.optim.Adam(self.opt_params.values(), lr=1e-3) # .99, 1e-6

        #return self.optimize(50000, False, 1000)
        return self.optimize(20000, False, 1000)

    def optimize(self, iters, lbfgs, print_every):
        """Minimise the negated ``vi.forward`` value with ``self.optimizer``.

        Args:
            iters: Number of optimisation steps; also stored in ``self.iters``.
            lbfgs: Not read anywhere in this method.
            print_every: Period (in steps) of the progress print, evaluation
                and plot refresh.

        Returns:
            ``self.opt_params`` with every entry replaced by a detached clone.
        """
        # y and x are only referenced by the commented-out code at the end.
        y, x = self.train_data[0], self.train_data[1]
        print 'optimizing...'
        outputs = []
        clip = 5.
        # History of every optimised model parameter, one flattened list per step.
        curr_model_params = {}
        for k, v in self.model_params_grad.items():
            if v == True:
                curr_model_params[k] = []

        self.iters = iters
        for t in range(self.iters):

            #torch.nn.utils.clip_grad_norm(self.opt_params.values(), clip)
            self.optimizer.zero_grad()
            output = -self.vi.forward(self.model_params, self.train_data,
                                      self.var_params,
                                      t)  #/ float(self.num_train)
            #output = -self.vi.forward_with_model_param_post(self.model_params, self.train_data, self.opt_params, t) #/ float(self.num_train)

            # The recorded loss is normalised by the training-set size; the
            # gradient step uses the unnormalised value.
            outputs.append((output.item() / float(self.num_train)))
            output.backward()
            self.optimizer.step()

            for k, v in curr_model_params.items():
                if k in self.opt_params:
                    curr_model_params[k].append(
                        [el.item() for el in self.opt_params[k].flatten()])

            if t % print_every == 0:
                # printing
                ox = output.item() / float(self.num_train)
                print 'iter: ', t, 'loss: %.2f ' % ox, 'scale: ',
                if 'var_log_scale' in self.opt_params:
                    print torch.mean(self.opt_params['var_log_scale'].clone().
                                     detach()).item(),
                if 'model_mu' in self.opt_params:
                    print self.opt_params['model_mu'].item(
                    ), self.opt_params['model_log_scale'].item()
                for k, v in self.model_params_grad.items():
                    if v == True:
                        if k in self.opt_params:
                            for el in self.opt_params[k].flatten():
                                print k, '%.3f ' % el.item(),
                print '\n'
                if self.ev is not None:
                    test_marginal = self.ev.valid_loss(self.opt_params)
                    #y_future, future_trajectories, avg_future_marginal_lh = self.ev.sample_future_trajectory(self.opt_params, self.num_future_steps)
                    train_acc, test_acc = self.ev.accuracy(self.opt_params)
                    print 'train acc: %.3f ' % train_acc.item(), 'test acc: %.3f ' % test_acc.item(), \
                        'test marginal likelihood: %.3f ' % test_marginal#, 'future marginal lh: %.3f' % avg_future_marginal_lh.item()

                # plotting
                plt.cla()
                plt.plot(outputs)
                figure = plt.gcf()  # get current figure
                figure.set_size_inches(8, 6)
                plt.savefig(self.savedir + '/loss.png')
                plt.cla()
                # One figure per tracked model parameter; 'beta' is passed
                # through sigmoid before plotting.
                for k, v in curr_model_params.items():
                    plt.cla()
                    if k == 'beta':
                        plt.plot(sigmoid(np.array(v)))
                    else:
                        plt.plot(v)
                        # if self.true_model_params:
                        #     for el in self.true_model_params[k]:
                        #         plt.axhline(y=el, color='r', linestyle='-')

                    figure = plt.gcf()  # get current figure
                    figure.set_size_inches(8, 6)
                    plt.savefig(self.savedir + '/plots/' + k + '.png')

                # Current variational mean, plotted column by column.
                zx = self.var_params[0]
                zx = to_numpy(zx)
                zx_scale = np.exp(to_numpy(self.var_params[1]))
                plt.cla()
                # Only referenced by the commented-out label= variants below.
                labels = [
                    'Bias', 'X1', 'X2', 'Choice t-1', 'RW Side t-1', 'X1 t-1',
                    'X2 t-1'
                ]
                for j in range(zx_scale.shape[1]):
                    #plt.plot(zx[:,j], label=labels[j], linewidth=.5)
                    plt.plot(zx[:, j], linewidth=.5)

                    # plt.fill_between(np.arange(zx.shape[0]), zx[:,j] - zx_scale[:,j],  zx[:,j] + zx_scale[:,j])
                figure = plt.gcf()  # get current figure
                figure.set_size_inches(12, 8)
                # plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
                plt.savefig(self.savedir + '/plots/curr_est_z.png')
                # presumably data[-4] holds the indices of the test rows - TODO confirm
                test_inds = self.data[-4].cpu().numpy()
                if self.ev is not None:
                    zx_test = zx[test_inds]
                    plt.cla()
                    for j in range(zx_scale.shape[1]):
                        #plt.plot(zx_test[:,j], label=labels[j], linewidth=.5)
                        plt.plot(zx_test[:, j], linewidth=.5)

                    plt.savefig(self.savedir + '/plots/curr_est_test_z.png')

        # Final evaluation, written to savedir + '/test_marginal.txt'.
        if self.ev is not None:
            test_marginal = self.ev.valid_loss(self.opt_params)
            np.savetxt(self.savedir + '/test_marginal.txt',
                       np.array([test_marginal.item()]))
            print 'final test marginal: ', test_marginal.item()
            # detach and clone all params
        for k in self.opt_params.keys():
            self.opt_params[k] = self.opt_params[k].clone().detach()

        # access learning and regularization components
        #learning, regularization = self.model.log_prior_relative_contrib(self.var_params[0], y, x)
        #torch.save(learning.clone().detach(), self.savedir+'/model_structs/learning_after_training.pth')
        #torch.save(regularization.clone().detach(), self.savedir+'/model_structs/regularization_after_training.pth')
        #plt.cla()
        #plt.plot(to_numpy(learning.clone().detach()))
        #plt.savefig(self.savedir+'/plots/learning_after_training.png')
        #plt.cla()
        #plt.plot(to_numpy(regularization.clone().detach()))
        #plt.savefig(self.savedir+'/plots/regularization_after_training.png')
        return self.opt_params
示例#26
0
文件: main.py 项目: reutapel/NLP_HW2
def _log_and_print(message):
    """Send ``message`` to stdout and to ``logging.info`` with a local-time prefix."""
    stamped = '{}: {}'.format(time.asctime(time.localtime(time.time())), message)
    print(stamped)
    logging.info(stamped)


def main(train_file_to_use, test_file_to_use, comp_file_to_use, test_type, features_combination_list, number_of_iter,
         comp, train_index=None, test_index=None, best_weights_list=None):
    """Train a structured perceptron per feature combination, then evaluate or infer.

    For every entry of ``features_combination_list`` a ``ParserModel`` and a
    ``StructPerceptron`` are built and trained. Unless ``test_type`` is
    ``'comp'``, every weight file found in the perceptron's directory is scored
    and the most accurate one is copied to ``best_weights_<name>.pkl``.
    The module-level names ``directory``, ``use_edges_existed_on_train`` and
    ``use_pos_edges_existed_on_train`` are read here.

    Args:
        train_file_to_use: Training file handed to ``ParserModel``.
        test_file_to_use: Test file handed to ``ParserModel``.
        comp_file_to_use: Competition file handed to ``ParserModel``.
        test_type: ``'comp'`` skips the accuracy evaluation; the value is also
            forwarded to the evaluator.
        features_combination_list: Feature combinations to train, one model each.
        number_of_iter: Number of perceptron iterations.
        comp: When true, run inference for every entry of ``best_weights_list``.
        train_index: Cross-validation train indices; when given, only the final
            weight vector is scored and its accuracy is returned.
        test_index: Cross-validation test indices.
        best_weights_list: Weights to run inference with when ``comp`` is true;
            ``None`` is treated as empty.

    Returns:
        The accuracy of ``final_weight_vec_<number_of_iter>`` for the first
        feature combination when ``train_index`` is given and ``test_type`` is
        not ``'comp'``; otherwise ``None``.
    """
    # start all combination of features
    for features_combination in features_combination_list:
        # Create features for train and test gold trees
        _log_and_print('Start creating parser model for features : {}'.format(features_combination))
        train_start_time = time.time()
        parser_model_obj = ParserModel(directory, train_file_to_use, test_file_to_use, comp_file_to_use,
                                       features_combination, use_edges_existed_on_train, use_pos_edges_existed_on_train,
                                       train_index=train_index, test_index=test_index)

        model_finish_time = time.time()
        model_run_time = (model_finish_time - train_start_time) / 60.0
        _log_and_print('Finish creating parser model for features : {} in {} minutes'
                       .format(features_combination, model_run_time))

        # Run perceptron to learn the best weights
        _log_and_print('Start Perceptron for features : {} and number of iterations: {}'
                       .format(features_combination, number_of_iter))
        perceptron_obj = StructPerceptron(model=parser_model_obj, directory=directory,
                                          feature_combination=features_combination)
        # The return value is not needed: the weight files are read back from
        # perceptron_obj.directory below.
        perceptron_obj.perceptron(num_of_iter=number_of_iter)
        train_run_time = (time.time() - model_finish_time) / 60.0
        _log_and_print('Finish Perceptron for features : {} and num_of_iter: {}. run time: {} minutes'
                       .format(features_combination, number_of_iter, train_run_time))

        evaluate_obj = Evaluate(parser_model_obj, perceptron_obj, directory)
        best_weights_name = str()
        if test_type != 'comp':
            weights_directory = perceptron_obj.directory
            weight_file_names = [f for f in listdir(weights_directory) if isfile(join(weights_directory, f))]
            accuracy = dict()
            mistakes_dict_names = dict()
            final_weights_name = 'final_weight_vec_{}'.format(number_of_iter)
            # Name of the weight vector scored most recently. Reporting by this
            # key (rather than the last file name seen) avoids a KeyError when
            # the last file in the directory was skipped in CV mode.
            last_evaluated = None
            for weight_file in weight_file_names:
                weights_name = weight_file[:-4]  # drop the 4-character extension
                if train_index is not None and weights_name != final_weights_name:
                    continue
                # NOTE: pickle.load executes arbitrary code; the directory must
                # only contain files written by this program.
                with open(os.path.join(weights_directory, weight_file), 'rb') as fp:
                    weight_vec = pickle.load(fp)
                accuracy[weights_name], mistakes_dict_names[weights_name] = evaluate_obj.calculate_accuracy(
                    weight_vec, weights_name, test_type)
                last_evaluated = weights_name
            print('{}: The model hyper parameters and results are: \n num_of_iter: {} \n test file: {} \n'
                  'train file: {} \n test type: {} \n features combination list: {} \n accuracy: {:%} \n'
                  'mistakes dict name: {}'
                  .format(time.asctime(time.localtime(time.time())), number_of_iter, test_file_to_use,
                          train_file_to_use, test_type, features_combination_list, accuracy[last_evaluated],
                          mistakes_dict_names[last_evaluated]))
            logging.info('{}: The model hyper parameters and results are: \n num_of_iter: {} \n test file: {}'
                         '\n train file: {} \n test type: {} \n features combination list: {} \n accuracy: {} \n'
                         'mistakes dict name: {}'
                         .format(time.asctime(time.localtime(time.time())), number_of_iter, test_file_to_use,
                                 train_file_to_use, test_type, features_combination_list, accuracy[last_evaluated],
                                 mistakes_dict_names[last_evaluated]))

            # get the weights that gave the best accuracy and save as best weights
            best_weights = max(accuracy, key=accuracy.get)
            with open(os.path.join(weights_directory, best_weights + '.pkl'), 'rb') as fp:
                best_weights_vec = pickle.load(fp)
            best_weights_name = os.path.join(weights_directory, "best_weights_" + best_weights + '.pkl')
            with open(best_weights_name, 'wb') as f:
                pickle.dump(best_weights_vec, f)

            if train_index is not None:  # running CV
                return accuracy[final_weights_name]

            # number_of_iter is the parameter; the name num_of_iter is not
            # defined in this function and raised NameError here.
            _log_and_print('best weights for {}, {}, {}, with accuracy {}, name is: {} '
                           .format(number_of_iter, test_type, features_combination_list,
                                   accuracy[best_weights], best_weights_name))
    if comp:
        # Uses the evaluator of the last feature combination trained above.
        for best_weights_vec_loaded in best_weights_list or []:
            inference_file_name = evaluate_obj.infer(best_weights_vec_loaded, test_type)
            _log_and_print('The inferred file name is: {} for weights: {} '
                           .format(inference_file_name, best_weights_vec_loaded))

    logging.info('-----------------------------------------------------------------------------------')

    return
def _extract_field_features(words, txt_id, dic_biterm__clusterIds,
                            dic_ngram__txtIds, min_gram, max_gram,
                            wordVectorsDic):
    """Compute the clustering features of one text field of a post.

    Side effect: ``txt_id`` is appended to ``dic_ngram__txtIds[gram]`` for
    every consecutive n-gram of ``words``, except for n-grams that are already
    linked to more than ``max_cposts`` distinct text ids.

    Returns:
        A 4-tuple ``(biterm_freqs, biterm_count, text_vec, target_cluster_ids)``
        where ``biterm_freqs`` is a ``Counter`` over the field's biterms,
        ``text_vec`` is the first row returned by
        ``generate_sent_vecs_toktextdata`` (``None`` unless the module-level
        ``isSemantic`` flag is set) and ``target_cluster_ids`` is whatever
        ``findTargetClusters`` returns for the biterm counter.
    """
    biterms = construct_biterms(words)

    for gram in generateGramsConsucetive(words, min_gram, max_gram):
        # Stop growing the posting list of n-grams that are already very common.
        if gram in dic_ngram__txtIds and len(
                set(dic_ngram__txtIds[gram])) > max_cposts:
            continue
        dic_ngram__txtIds.setdefault(gram, []).append(txt_id)

    biterm_freqs = Counter(biterms)
    target_cluster_ids = findTargetClusters(biterm_freqs,
                                            dic_biterm__clusterIds)

    text_vec = None
    if isSemantic:
        text_vec = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                                  embedDim)[0]

    return biterm_freqs, len(biterms), text_vec, target_cluster_ids


def cluster_biterm_framework(
        f, list_CPost, c_CFVector, max_c_id, dic_txtId__CPost, wordVectorsDic,
        dic_clus__id, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, min_gram, max_gram,
        oCSimilarityFlgas, c_itemsCount):
    """Assign every post in ``list_CPost`` to a cluster, one post at a time.

    For each post the enabled fields (tag / title / body, selected through
    ``oCSimilarityFlgas.isTagSim`` / ``isTitleSim`` / ``isBodySim``) are turned
    into biterm counters, the candidate clusters sharing those biterms are
    collected, and ``findCloseClusterByTargetClusters_framework`` picks the
    cluster id. The chosen cluster is then updated through
    ``populateClusterFeature_framework``.

    Args:
        f: Writable text stream; one tab-separated line
            ``clusterId, trueLabel, tag words, soPostId`` is written per post
            (posts labelled ``-1`` are skipped when ``ignoreMinusOne`` is set).
        list_CPost: Iterable of post objects exposing ``trueLabel``,
            ``tagWords``, ``titleWords``, ``bodyWords``, ``id`` and
            ``soPostId``.
        c_CFVector: Cluster feature store, passed to and replaced by the
            framework helpers.
        max_c_id: Largest cluster id seen so far.
        dic_txtId__CPost: Mapping updated in place with ``post.id -> post``.
        wordVectorsDic: Word-embedding lookup, used only when ``isSemantic``.
        dic_clus__id: Mapping updated in place with
            ``clusterId -> max_c_id`` at the time of the assignment.
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds: Per-field biterm -> cluster ids indexes.
        dic_ngram__txtIds: Mapping n-gram -> list of post ids, updated in place.
        min_gram, max_gram: n-gram size bounds handed to
            ``generateGramsConsucetive``.
        oCSimilarityFlgas: Flag object selecting the fields to use.
        c_itemsCount: Mapping cluster id -> number of assigned posts.

    Returns:
        ``[c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount]``.
    """
    eval_pred_true_txt = []

    for line_count, oCPost in enumerate(list_CPost, start=1):
        trueLabel = oCPost.trueLabel
        tagWords = oCPost.tagWords
        titleWords = oCPost.titleWords
        bodyWords = oCPost.bodyWords
        txt_id = oCPost.id
        soPostId = oCPost.soPostId

        print('id', txt_id, 'tagWords', tagWords, 'titleWords', titleWords,
              'bodyWords', bodyWords)

        dic_txtId__CPost[txt_id] = oCPost

        # Defaults used for every field that is switched off.
        txtBitermsFreqs_Tag, bi_terms_len_Tag, text_VecTag = None, 0, None
        txtBitermsFreqs_Title, bi_terms_len_Title, text_VecTitle = None, 0, None
        txtBitermsFreqs_Body, bi_terms_len_Body, text_VecBody = None, 0, None
        targetClusterIds = []

        if oCSimilarityFlgas.isTagSim:
            (txtBitermsFreqs_Tag, bi_terms_len_Tag, text_VecTag,
             tCIds) = _extract_field_features(
                 tagWords, txt_id, dic_bitermTag__clusterIds,
                 dic_ngram__txtIds, min_gram, max_gram, wordVectorsDic)
            targetClusterIds.extend(tCIds)

        if oCSimilarityFlgas.isTitleSim:
            (txtBitermsFreqs_Title, bi_terms_len_Title, text_VecTitle,
             tCIds) = _extract_field_features(
                 titleWords, txt_id, dic_bitermTitle__clusterIds,
                 dic_ngram__txtIds, min_gram, max_gram, wordVectorsDic)
            targetClusterIds.extend(tCIds)

        if oCSimilarityFlgas.isBodySim:
            (txtBitermsFreqs_Body, bi_terms_len_Body, text_VecBody,
             tCIds) = _extract_field_features(
                 bodyWords, txt_id, dic_bitermBody__clusterIds,
                 dic_ngram__txtIds, min_gram, max_gram, wordVectorsDic)
            targetClusterIds.extend(tCIds)

        oCPostProcessed = CPostProcessed(txtBitermsFreqs_Tag, bi_terms_len_Tag,
                                         txtBitermsFreqs_Title,
                                         bi_terms_len_Title,
                                         txtBitermsFreqs_Body,
                                         bi_terms_len_Body, text_VecTag,
                                         text_VecTitle, text_VecBody)

        clusterId = findCloseClusterByTargetClusters_framework(
            c_CFVector, oCPostProcessed, set(targetClusterIds), max_c_id,
            oCSimilarityFlgas)

        # Unlabelled posts ('-1') are left out of the result file on request.
        if not ignoreMinusOne or str(trueLabel) != '-1':
            f.write(
                str(clusterId) + "\t" + str(trueLabel) + "\t" +
                ' '.join(tagWords) + "\t" + str(soPostId) + "\n")

        eval_pred_true_txt.append([clusterId, trueLabel, tagWords])

        c_itemsCount[clusterId] = c_itemsCount.get(clusterId, 0) + 1

        max_c_id = max(max_c_id, clusterId, len(c_CFVector))

        # Remember the id high-water mark at the time this cluster was touched.
        dic_clus__id[clusterId] = max_c_id

        (c_CFVector, dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
         dic_bitermBody__clusterIds) = populateClusterFeature_framework(
             c_CFVector, oCPostProcessed, dic_bitermTag__clusterIds,
             dic_bitermTitle__clusterIds, dic_bitermBody__clusterIds,
             clusterId, txt_id, oCSimilarityFlgas)

        if line_count % DeleteInterval == 0:
            c_CFVector, c_itemsCount = deleteOldClusters_framework(
                c_CFVector, c_itemsCount, dic_clus__id)

        if line_count % 1000 == 0:
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

    return [
        c_CFVector, max_c_id, dic_txtId__CPost, dic_clus__id,
        dic_bitermTag__clusterIds, dic_bitermTitle__clusterIds,
        dic_bitermBody__clusterIds, dic_ngram__txtIds, c_itemsCount
    ]
示例#28
0
  print(len(sub_list_pred_true_words_index))
  #cluster_sd(sub_list_pred_true_words_index)
  
  dic_bitri_keys_selectedClusters_seenBatch=cluster_gram_freq(sub_list_pred_true_words_index, batchNo, dic_bitri_keys_selectedClusters_seenBatch,  list_pred_true_words_index[0:end])
  
  predsSeen_list_pred_true_words_index=evaluateByGram(dic_bitri_keys_selectedClusters_seenBatch, list_pred_true_words_index[0:end])
  not_clustered_inds_batch=extractSeenNotClustered(predsSeen_list_pred_true_words_index, sub_list_pred_true_words_index)
  
  #not_clustered_inds_seen_batch.extend(not_clustered_inds_batch)
  
  #not_clustered_inds_batch=assignToClusterSimDistribution(not_clustered_inds_batch, dic_bitri_keys_selectedClusters_seenBatch, list_pred_true_words_index[0:end], wordVectorsDic)
  globalList_clustered.extend(predsSeen_list_pred_true_words_index)
  globalList_not_clustered.extend(not_clustered_inds_batch)
  
  
  Evaluate(predsSeen_list_pred_true_words_index) #+not_clustered_inds_batch) 
  print("total texts=", len(predsSeen_list_pred_true_words_index)+len(not_clustered_inds_batch))
  
  
  
  #texts in cluster + texts not in cluster should be =2000
  '''dictri_keys_selectedClusters_currentBatch, dicbi_keys_selectedClusters_currentBatch, not_clustered_inds_currentBatch, dic_combined_keys_selectedClusters, new_sub_list_pred_true_words_index=filterClusters(dictri_keys_selectedClusters_currentBatch, dicbi_keys_selectedClusters_currentBatch, sub_list_pred_true_words_index, list_pred_true_words_index[0:end])
  
  not_clustered_inds_seen_batch.extend(not_clustered_inds_currentBatch)
  
  appendResultFile(new_sub_list_pred_true_words_index, fileName)
  
  if batchNo>=1: # and batchNo%2==0:
    dic_preds, new_not_clustered_inds_seen_batch=assignToClusterBySimilarity(not_clustered_inds_seen_batch, list_pred_true_words_index[0:end], dic_combined_keys_selectedClusters, wordVectorsDic)
	
    #appendResultFile(new_not_clustered_inds_seen_batch, fileName)
示例#29
0
    def __init__(self,
                 data,
                 model,
                 model_params,
                 model_params_grad,
                 savedir,
                 num_obs_samples,
                 num_future_steps,
                 num_mc_samples,
                 ppc_window,
                 z_true=None,
                 true_model_params=None,
                 iters=1000):
        """Prepare mean-field variational inference for ``model`` on ``data``.

        Args:
            data: Indexable container. ``data[0:2]`` is kept as the training
                pair, ``data[1]`` supplies ``T`` (``size(0)``) and ``dim``
                (``size(2)``), ``data[2]`` is the test split (may be ``None``)
                and ``data[4]``, ``data[5]``, ``data[6]`` are stored as
                ``y_future``, ``x_future`` and ``y_complete``.
            model: Model object handed to ``MeanFieldVI`` and ``Evaluate``.
            model_params: Mapping of model parameter name -> value.
            model_params_grad: Mapping of parameter name -> flag; parameters
                whose flag equals ``True`` are added to ``self.opt_params``.
            savedir: Output directory passed to ``MeanFieldVI``.
            num_obs_samples: Forwarded to ``Evaluate``.
            num_future_steps: Stored as ``self.num_future_steps``.
            num_mc_samples: Currently ignored, see the note in the body.
            ppc_window: Stored as ``self.ppc_window``.
            z_true: Latent values used only by the ``'true'`` initialisation.
            true_model_params: Stored as ``self.true_model_params``.
            iters: Stored as ``self.iters``.

        Raises:
            ValueError: If the hard-coded ``init`` option is neither ``'map'``
                nor ``'true'``.
        """
        self.data = data
        self.dim = self.data[1].size(2)
        self.T = self.data[1].size(0)
        self.model_params = model_params
        self.train_data = self.data[0:2]
        self.y_future = self.data[4]
        self.x_future = self.data[5]
        self.y_complete = self.data[6]

        self.model = model
        self.savedir = savedir
        self.num_obs_samples = num_obs_samples
        # The caller-supplied value is authoritative; the earlier assignment
        # from y_future.shape[0] was always overwritten and has been dropped.
        self.num_future_steps = num_future_steps
        # NOTE(review): the num_mc_samples argument is ignored and a single
        # Monte Carlo sample is always used - confirm this is intentional.
        self.num_mc_samples = 1
        self.model_params_grad = model_params_grad
        self.true_model_params = true_model_params
        self.vi = MeanFieldVI(self.model, self.savedir, self.num_mc_samples)

        self.ppc_window = ppc_window
        self.isPPC = False

        init = 'map'  # 'true'
        # The MAP estimate is computed and kept regardless of the init option.
        self.init_z = self.map_estimate()
        if init == 'map':
            self.var_params = self.vi.init_var_params(self.T,
                                                      self.dim,
                                                      self.init_z,
                                                      grad=True)
        elif init == 'true':
            self.var_params = self.vi.init_var_params(self.T,
                                                      self.dim,
                                                      z_true,
                                                      grad=True)
        else:
            # Fail here instead of printing and crashing later on the missing
            # self.var_params attribute.
            raise ValueError('specify valid init option.')
        self.iters = iters

        self.opt_params = {
            'var_mu': self.var_params[0],
            'var_log_scale': self.var_params[1]
        }
        for k, v in self.model_params_grad.items():
            # Equality with True is kept on purpose: other truthy flag values
            # were never treated as "optimise this parameter".
            if v == True:
                self.opt_params[k] = self.model_params[k]
        self.test = self.data[2]

        if self.test is None:
            self.ev = None
            self.num_train = self.data[0].shape[0]
        else:
            self.ev = Evaluate(self.data,
                               self.model,
                               savedir='',
                               num_obs_samples=self.num_obs_samples)
            self.num_test = self.data[2].shape[0]
            self.num_train = self.data[0].shape[0] - self.num_test
def _prune_outlier_clusters(line_count, c_bitermsFreqs, c_totalBiterms,
                            c_txtIds, c_wordsFreqs, c_totalWords,
                            c_clusterVecs, dic_clus__id):
    """Delete, in place, clusters whose size or id is a statistical outlier.

    A cluster is removed when its size is at most 1, when its size lies at or
    outside ``mean +/- stdev`` of all cluster sizes, or when its id lies at or
    outside ``mean +/- stdev`` of the values stored in ``dic_clus__id``. Means
    and deviations stay 0 while there are fewer than three clusters.
    """
    print(len(dic_clus__id))

    list_c_sizes = []
    list_c_ids = []
    for c_id, txtIds in c_txtIds.items():
        list_c_sizes.append(len(txtIds))
        list_c_ids.append(dic_clus__id[c_id])

    mean_c_size = std_c_size = 0
    if len(list_c_sizes) > 2:
        mean_c_size = statistics.mean(list_c_sizes)
        std_c_size = statistics.stdev(list_c_sizes)

    mean_c_id = std_c_id = 0
    if len(list_c_ids) > 2:
        mean_c_id = statistics.mean(list_c_ids)
        std_c_id = statistics.stdev(list_c_ids)

    print('preocess', line_count, 'texts', 'mean_c_size', mean_c_size,
          'std_c_size', std_c_size)
    print('preocess', line_count, 'texts', 'mean_c_id', mean_c_id,
          'std_c_id', std_c_id)

    list_del_cids = set()
    for c_id, txtIds in c_txtIds.items():
        c_size = len(txtIds)
        if (c_size <= 1
                or float(c_size) <= float(abs(mean_c_size - std_c_size))
                or float(c_size) >= mean_c_size + std_c_size
                or float(c_id) <= float(abs(mean_c_id - std_c_id))
                or float(c_id) >= float(abs(mean_c_id + std_c_id))):
            list_del_cids.add(c_id)

    print('#list_del_cids', len(list_del_cids), 'len(c_bitermsFreqs)',
          len(c_bitermsFreqs))

    per_cluster_stores = (c_bitermsFreqs, c_totalBiterms, c_txtIds,
                          c_wordsFreqs, c_totalWords, dic_clus__id)
    for c_id in list_del_cids:
        for store in per_cluster_stores:
            store.pop(c_id, None)
        if isSemantic:
            # pop() instead of del: a cluster without a stored vector must not
            # abort the pruning pass with a KeyError.
            c_clusterVecs.pop(c_id, None)


def cluster_biterm(f,
                   list_pred_true_words_index_postid_createtime,
                   c_bitermsFreqs=None,
                   c_totalBiterms=None,
                   c_wordsFreqs=None,
                   c_totalWords=None,
                   c_txtIds=None,
                   c_clusterVecs=None,
                   txtId_txt=None,
                   last_txtId=0,
                   max_c_id=0,
                   wordVectorsDic=None,
                   dic_clus__id=None,
                   dic_biterm__clusterId_Freq=None,
                   dic_biterm__allClusterFreq=None,
                   dic_biterm__clusterIds=None,
                   c_textItems=None,
                   dic_ngram__textItems=None,
                   min_gram=1,
                   max_gram=2,
                   isTagSim=True,
                   isTitleSim=False,
                   isBodySim=False):
    """Cluster a batch of texts incrementally using biterm statistics.

    Every item is assigned to the closest cluster among those that share a
    biterm with it (``findTargetClusters`` followed by
    ``findCloseClusterByTargetClusters``), the cluster statistics are updated
    through ``populateClusterFeature`` and a result line is written to ``f``.
    Every 500 items outlier clusters are pruned; every 1000 items the running
    predictions are passed to ``Evaluate``.

    Args:
        f: Writable text stream receiving one tab-separated line per item:
            cluster id, ``item[1]``, space-joined ``item[2]``, post id.
            Items whose ``item[1]`` is ``-1`` are skipped when the module
            flag ``ignoreMinusOne`` is set.
        list_pred_true_words_index_postid_createtime: Iterable of items where
            ``item[1]`` is the label to report (presumably the true label),
            ``item[2]`` the list of words, ``item[3]`` an int-convertible
            text id and ``item[4]`` the post id.
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs: Per-cluster state dictionaries keyed by cluster id.
        txtId_txt: Mapping text id -> words, updated in place.
        last_txtId: Text id returned unchanged when the batch is empty.
        max_c_id: Largest cluster id seen so far.
        wordVectorsDic: Word-embedding lookup, used only when ``isSemantic``.
        dic_clus__id: Mapping cluster id -> ``max_c_id`` at assignment time.
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds: Biterm indexes maintained by
            ``populateClusterFeature``.
        c_textItems: Mapping cluster id -> list of assigned items.
        dic_ngram__textItems: Mapping n-gram -> list of items containing it.
        min_gram, max_gram: n-gram size bounds.
        isTagSim, isTitleSim, isBodySim: Accepted for interface
            compatibility; not used by this function.

    Every dictionary argument defaults to a fresh empty dict per call.
    Dictionaries passed by the caller are updated in place and returned.

    Returns:
        ``[c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords,
        c_txtIds, c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds, c_textItems, dic_ngram__textItems]`` where
        ``last_txtId`` is the id of the last processed item.
    """
    # Mutable defaults would be shared between calls, so create them here.
    if c_bitermsFreqs is None:
        c_bitermsFreqs = {}
    if c_totalBiterms is None:
        c_totalBiterms = {}
    if c_wordsFreqs is None:
        c_wordsFreqs = {}
    if c_totalWords is None:
        c_totalWords = {}
    if c_txtIds is None:
        c_txtIds = {}
    if c_clusterVecs is None:
        c_clusterVecs = {}
    if txtId_txt is None:
        txtId_txt = {}
    if wordVectorsDic is None:
        wordVectorsDic = {}
    if dic_clus__id is None:
        dic_clus__id = {}
    if dic_biterm__clusterId_Freq is None:
        dic_biterm__clusterId_Freq = {}
    if dic_biterm__allClusterFreq is None:
        dic_biterm__allClusterFreq = {}
    if dic_biterm__clusterIds is None:
        dic_biterm__clusterIds = {}
    if c_textItems is None:
        c_textItems = {}
    if dic_ngram__textItems is None:
        dic_ngram__textItems = {}

    print("cluster_bigram")

    eval_pred_true_txt = []
    t11 = datetime.now()

    for line_count, item in enumerate(
            list_pred_true_words_index_postid_createtime, start=1):
        label = item[1]
        words = item[2]
        current_txt_id = int(item[3])
        postId = item[4]

        bi_terms = construct_biterms(words)
        for gram in generateGramsConsucetive(words, min_gram, max_gram):
            dic_ngram__textItems.setdefault(gram, []).append(item)

        txtBitermsFreqs = Counter(bi_terms)
        bi_terms_len = len(bi_terms)

        txtWordsFreqs = Counter(words)
        words_len = len(words)

        # A zero vector stands in for the embedding when semantics are off.
        text_Vec = [0] * embedDim
        if isSemantic:
            text_Vec = generate_sent_vecs_toktextdata([words], wordVectorsDic,
                                                      embedDim)[0]

        targetClusterIds = findTargetClusters(txtBitermsFreqs,
                                              dic_biterm__clusterIds)

        clusterId = findCloseClusterByTargetClusters(
            c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
            c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
            txtWordsFreqs, words_len, max_c_id, text_Vec,
            dic_biterm__clusterIds, targetClusterIds)

        c_textItems.setdefault(clusterId, []).append(item)

        max_c_id = max(max_c_id, clusterId, len(c_bitermsFreqs))
        dic_clus__id[clusterId] = max_c_id
        txtId_txt[current_txt_id] = words

        (c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs, c_totalWords,
         c_clusterVecs, dic_biterm__clusterId_Freq,
         dic_biterm__allClusterFreq,
         dic_biterm__clusterIds) = populateClusterFeature(
             c_bitermsFreqs, c_totalBiterms, c_txtIds, c_wordsFreqs,
             c_totalWords, c_clusterVecs, txtBitermsFreqs, bi_terms_len,
             txtWordsFreqs, words_len, clusterId, current_txt_id, text_Vec,
             dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
             dic_biterm__clusterIds)

        eval_pred_true_txt.append([clusterId, label, words])
        if not ignoreMinusOne or str(label) != '-1':
            f.write(
                str(clusterId) + "\t" + str(label) + "\t" +
                ' '.join(words) + "\t" + str(postId) + "\n")

        if line_count % 500 == 0:
            _prune_outlier_clusters(line_count, c_bitermsFreqs,
                                    c_totalBiterms, c_txtIds, c_wordsFreqs,
                                    c_totalWords, c_clusterVecs, dic_clus__id)

        if line_count % 1000 == 0:
            print('#######-personal-eval_pred_true_txt',
                  len(eval_pred_true_txt))
            Evaluate(eval_pred_true_txt, ignoreMinusOne)

            t_diff = datetime.now() - t11
            print("total time diff secs=", t_diff.seconds)

        # Tracked per item so that an empty batch keeps the caller's value.
        last_txtId = current_txt_id

    return [
        c_bitermsFreqs, c_totalBiterms, c_wordsFreqs, c_totalWords, c_txtIds,
        c_clusterVecs, txtId_txt, last_txtId, dic_clus__id,
        dic_biterm__clusterId_Freq, dic_biterm__allClusterFreq,
        dic_biterm__clusterIds, c_textItems, dic_ngram__textItems
    ]
示例#31
0
文件: main.py 项目: skriser/NeuRec
            model = TransRec(sess, dataset)

        elif recommender.lower() == "cdae":
            model = CDAE(sess, dataset)

        elif recommender.lower() == "dae":
            model = DAE(sess, dataset)

        elif recommender.lower() == "npe":
            model = NPE(sess, dataset)

        elif recommender.lower() == "multidae":
            model = MultiDAE(sess, dataset)

        elif recommender.lower() == "multivae":
            model = MultiVAE(sess, dataset)

        elif recommender.lower() == "irgan":
            model = IRGAN(sess, dataset)

        elif recommender.lower() == "cfgan":
            model = CFGAN(sess, dataset)

        elif recommender.lower() == "jca":
            model = JCA(sess, dataset)

        model.build_graph()
        sess.run(tf.global_variables_initializer())
        model.train_model()
        Evaluate.test_model(model, dataset, num_thread)
示例#32
0
# Directory containing this script, then the directory two levels above it.
fileDir = os.path.dirname(os.path.abspath(__file__))
#print(fileDir)
parentDir = os.path.dirname(fileDir)
#print(parentDir)
parentDir = os.path.dirname(parentDir)
#print(parentDir)

# Relative folder holding the result files; resolved against the current
# working directory, not against fileDir/parentDir.
outputPath = "result/"

trainingFile = outputPath + 'train_biterm_r.txt'

# Load the stored records from the training file.
# NOTE(review): presumably (prediction, true label, text, post id) rows, going
# by the helper's name - confirm against ReadPredTrueTextPostid.
trainList_pred_true_text_postid = ReadPredTrueTextPostid(
    trainingFile, ignoreMinusOne)

print('result for', trainingFile)
Evaluate(trainList_pred_true_text_postid)

# Collect the distinct space-separated tokens of every record's text (item[2]).
all_words = []
for item in trainList_pred_true_text_postid:
    all_words.extend(item[2].split(' '))
all_words = list(set(all_words))

# Embedding file and dimension; vectors are loaded only when the isSemantic
# flag is set, otherwise the lookup stays empty.
gloveFile = "glove.6B.50d.txt"
embedDim = 50
wordVectorsDic = {}
if isSemantic == True:
    wordVectorsDic = extractAllWordVecsPartialStemming(gloveFile, embedDim,
                                                       all_words)

# Empty dictionaries, presumably the initial cluster state for the clustering
# routine - TODO confirm against the rest of the script.
c_bitermsFreqs = {}
c_totalBiterms = {}