def load_args(snapshot_dir):
    ############################ model arguments settings ############################
    parser = argparse.ArgumentParser(description='Multi-label Classifier based on Multi-component')
    args = parser.parse_args()
    arg_dict = args.__dict__

    # load arguments from args.json
    print("Processing snapshot %s" % snapshot_dir, get_current_time())
    arg_json = load_json(os.path.join(snapshot_dir, "args.json"))
    for key, val in arg_json.items():
        try:
            if key == "device":
                val = -1
            if key != "model_selection":
                # most values are serialized as strings; recover their Python types
                val = eval(val)
        except Exception:
            pass
        finally:
            arg_dict[key] = val
    arg_dict["train"] = None
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Loaded args.", get_current_time())
    return args
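# Minimal usage sketch (the snapshot directory and parameter file name below are
# hypothetical, not taken from this repo):
#
#   snapshot_dir = "snapshot/2018-09-05_10-00-00"
#   args = load_args(snapshot_dir)
#   model = load_model(args, os.path.join(snapshot_dir, "best_steps.pt"))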
def load_model(args, param_fpath):
    # model selection
    if args.model_selection == 'all':
        from main.tag_rec.approaches.post2vec.models.model_all import MultiComp
        model = MultiComp(args)
    elif args.model_selection == 'title':
        from main.tag_rec.approaches.post2vec.models.model_title import MultiComp
        model = MultiComp(args)
    elif args.model_selection == 'title_desc_text':
        from main.tag_rec.approaches.post2vec.models.model_title_desc_text import MultiComp
        model = MultiComp(args)
    else:
        print("No such model!")
        exit()
    print("Initialized model %s with params from %s." % (args.model_selection, param_fpath), get_current_time())
    model.load_state_dict(torch.load(param_fpath))
    if args.cuda:
        torch.cuda.set_device(-1)  # a negative device id makes set_device a no-op
        model = model.cuda()
    print("Loaded model.", get_current_time())
    return model
def load_corpus_csv(corpus_fpath):
    """
    Return a list of Question instances loaded from a corpus csv.
    :param corpus_fpath: path to the corpus csv
    :return: list of Question
    """
    import pandas as pd
    print("Loading corpus %s" % corpus_fpath, get_current_time())
    data = list()
    line_count = 0
    # columns: ["id", "title", "desc_text", "desc_code", "creation_date", "tags"]
    for index, row in pd.read_csv(corpus_fpath).iterrows():
        qid = row["id"]
        title = ast.literal_eval(row["title"])
        desc_text = ast.literal_eval(row["desc_text"])
        desc_code = ast.literal_eval(row["desc_code"])
        creation_date = row["creation_date"]
        tags = ast.literal_eval(row["tags"])
        soq = Question(qid, title, desc_text, desc_code, creation_date, tags)
        data.append(soq)
        line_count += 1
        if line_count % 10000 == 0:
            print("Loaded %d instances..." % line_count, get_current_time())
    print("Processed %s lines." % line_count, get_current_time())
    return data
def build_len_dict(qlist):
    print("Building length dict...", get_current_time())
    title_len_list = list()
    desc_text_len_list = list()
    desc_code_len_list = list()
    sent_num = 0
    for q in qlist:
        title_len_list.append(len(q.title))
        desc_text_len_list.append(len(q.desc_text))
        desc_code_len_list.append(len(q.desc_code))
        sent_num += 1
        if sent_num % 10000 == 0:
            print("Processed %s questions..." % sent_num, get_current_time())
    # cap the maximum component lengths
    len_dict = dict()
    len_dict["max_title_len"] = min(max(title_len_list), 100)
    len_dict["max_desc_text_len"] = min(max(desc_text_len_list), 1000)
    len_dict["max_desc_code_len"] = min(max(desc_code_len_list), 1000)
    print("Processed %s questions." % sent_num, get_current_time())
    return len_dict
def train(train_iter, dev_iter, model, args, global_train_step):
    if args.cuda:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.BCELoss()
    steps = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        print("\n#epoch %s" % epoch, get_current_time())
        for batch in train_iter:
            # features
            t = np.array(get_specific_comp_list("title", batch))
            dt = np.array(get_specific_comp_list("desc_text", batch))
            dc = np.array(get_specific_comp_list("desc_code", batch))
            # label
            target = get_specific_comp_list("tags", batch)

            t = torch.tensor(t).long()
            dt = torch.tensor(dt).long()
            dc = torch.tensor(dc).long()
            target = torch.tensor(target).float()
            if args.cuda:
                t, dt, dc, target = t.cuda(), dt.cuda(), dc.cuda(), target.cuda()

            optimizer.zero_grad()
            logit = model(t, dt, dc)
            # flatten logits and multi-hot targets into one long binary vector
            loss = criterion(logit.reshape(-1), target.reshape(-1))
            loss.backward()
            optimizer.step()

            steps += 1
            global_train_step += 1
            if steps % args.log_interval == 0:
                sys.stdout.write('\rBatch[{}] - loss: {:.10f}'.format(steps, loss.item()))
            if global_train_step % args.save_interval == 0:
                print("\nglobal_train_step {} - step {} - loss {:.10f}".format(
                    global_train_step, steps, loss.item()), get_current_time())
                save(model, args.save_dir, 'snapshot', global_train_step)
    return model, global_train_step
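# Sketch of the batch layout train() assumes (shapes are assumptions inferred
# from the calls above, not confirmed by this excerpt): each batch is a list of
# padded, index-encoded questions, and get_specific_comp_list pulls one
# component across the batch, e.g.
#
#   t = get_specific_comp_list("title", batch)      # [batch_size, max_title_len] word indices
#   target = get_specific_comp_list("tags", batch)  # [batch_size, n_tags] multi-hot labels
#
# BCELoss over the flattened tensors then treats every (question, tag) pair as
# an independent binary decision, which is what makes the model multi-label.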
def build_tag_vacab(tag):
    print("Building vocabulary...", get_current_time())
    new_tags = list()
    for t in tag:
        new_tags += t
    new_tags = sorted(list(set(new_tags)))
    return new_tags
def build_w2v_model(corpus_fpath, model_fpath):
    print('training %s' % corpus_fpath, get_current_time())
    # size: dimensionality of the feature vectors.
    # window: maximum distance between the current and predicted word within a sentence.
    # min_count: ignore all words with total frequency lower than this.
    # workers: number of worker threads (faster training on multicore machines).
    sentences = LineSentence(corpus_fpath)
    model = Word2Vec(sentences, size=200, workers=10, min_count=1)
    model.save(model_fpath)
    print('end time : ', get_current_time())
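# Usage sketch (both paths are hypothetical). Note that Word2Vec(size=...) is
# the pre-4.0 gensim API; gensim >= 4.0 renamed the parameter to vector_size:
#
#   build_w2v_model("corpus/_3_all-title-text.txt", "models/w2v-200d.model")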
def build_tag_vocab(qlist):
    print("Building vocab...", get_current_time())
    tag_vocab = set()
    sent_num = 0
    for q in qlist:
        # tags; set membership makes an explicit "if t not in" check unnecessary
        for t in q.tags:
            tag_vocab.add(t)
        sent_num += 1
        if sent_num % 10000 == 0:
            print("Processed %s questions..." % sent_num, get_current_time())
    print("Processed %s questions." % sent_num, get_current_time())
    return tag_vocab
def build_corpus(all_fpath, rare_tags, corpus_fpath):
    import pandas as pd
    print("Building raw corpus and doing some preprocessing...")
    cnt = 0
    filter_cnt = 0
    df = pd.read_csv(all_fpath)
    q_list = list()
    for idx, row in df.iterrows():
        try:
            qid = row['id']
            title = ast.literal_eval(row['title'])
            desc_text = ast.literal_eval(row['desc_text'])
            desc_code = ast.literal_eval(row['desc_code'])
            creation_date = row['creation_date']
            tags = ast.literal_eval(row['tags'])

            # remove rare tags; drop questions left with no tags at all
            clean_tags = list(set(tags) - set(rare_tags))
            if len(clean_tags) == 0:
                filter_cnt += 1
                continue
            try:
                q_list.append(Question(qid, title, desc_text, desc_code, creation_date, clean_tags))
                cnt += 1
            except Exception as e:
                print("Skip id=%s" % qid)
                print("Error msg: %s" % e)
            if cnt % 10000 == 0:
                print("Writing %d instances, filtered %d instances..." % (cnt, filter_cnt), get_current_time())
        except Exception as e:
            print("Skip qid %s because %s" % (qid, e))
            filter_cnt += 1
    save_pickle(q_list, corpus_fpath)
    print("Wrote %s lines successfully." % cnt)
    print("Corpus built successfully! %s" % corpus_fpath, get_current_time() + '\n')
def load_tag_cnt(tag_cnt_dict_fpath):
    print("loading tag cnt dict...", get_current_time())
    tag_cnt_dict = {}
    with open(tag_cnt_dict_fpath) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip the header row
        for row in reader:
            tag = row[0]
            cnt = row[1]
            tag_cnt_dict[tag] = int(cnt)
    return tag_cnt_dict
def build_vocab(text):
    print("Building vocabulary...", get_current_time())
    new_text = list()
    for t in text:
        new_text += t
    new_text = sorted(list(set(new_text)))
    # map each word to its index; '<PAD>' gets the last index
    dictionary = {w: i for i, w in enumerate(new_text)}
    dictionary['<PAD>'] = len(new_text)
    return dictionary
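# Example: build_vocab over two tokenized sentences (illustrative values):
#
#   build_vocab([["how", "to", "sort"], ["sort", "a", "list"]])
#   # -> {'a': 0, 'how': 1, 'list': 2, 'sort': 3, 'to': 4, '<PAD>': 5}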
def build_corpus(qlist, comp_list, corpus_fpath):
    corpus_f = open(corpus_fpath, "w")
    skip_cnt = 0
    cnt = 0
    print("processing %s" % corpus_fpath, get_current_time())
    for q in qlist:
        try:
            for comp in comp_list:
                str_tmp = ' '.join(q.get_comp_by_name(comp)).strip()
                if str_tmp == '':
                    continue
                corpus_f.write(str_tmp + '\n')
            cnt += 1
            if cnt % 50000 == 0:
                print("Processed %s questions." % cnt, get_current_time())
        except Exception as e:
            skip_cnt += 1
            print("Skipped %s questions so far, last error: %s" % (skip_cnt, e))
    corpus_f.close()
    print("corpus %s building finished." % corpus_fpath)
def build_tf_idf_vocab(comp_list, qlist):
    # minimum term frequency a word needs to stay in the vocabulary, capped at 50
    min_count = len(qlist) / 100000
    if min_count > 50:
        min_count = 50
    print("comp list %s, min count %s" % (comp_list, min_count))
    word_dict = dict()
    sent_num = 0
    print("Computing tf-idf...", get_current_time())
    for q in qlist:
        sent_num += 1
        cur_word_set = set()
        comp_word_list = list()
        for comp in comp_list:
            comp_word_list += q.get_comp_by_name(comp)
        for w in comp_word_list:
            if w not in cur_word_set:
                # first occurrence in this question: bump both the term count and
                # the document count (stored under "idf" until converted below)
                cur_word_set.add(w)
                if w not in word_dict:
                    word_dict[w] = {"tf": 1, "idf": 1}
                else:
                    word_dict[w]["tf"] += 1
                    word_dict[w]["idf"] += 1
            else:
                word_dict[w]["tf"] += 1
        if sent_num % 10000 == 0:
            print("Processed %s questions for components %s." % (sent_num, comp_list), get_current_time())
    for w in word_dict.copy().keys():
        if word_dict[w]["tf"] >= min_count:
            tf = word_dict[w]["tf"]
            # convert the raw document count into an inverse document frequency
            word_dict[w]["idf"] = math.log(sent_num / float(word_dict[w]["idf"]))
            idf = word_dict[w]["idf"]
            word_dict[w]["tfidf"] = tf * idf
        else:
            word_dict.pop(w)
    print("# %s dict = %s" % (comp_list, len(word_dict)))
    return word_dict
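# Worked example of the weighting above (illustrative numbers): a word that
# occurs 5 times in total (tf = 5) across 3 of 10 questions (document count = 3)
# gets idf = log(10 / 3) ~= 1.204 and tfidf = 5 * 1.204 ~= 6.02.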
def identify_rare_tags(tag_dict, rare_tags_fpath, common_tags_fpath, ts):
    """
    A tag is considered rare if its number of appearances is less than or
    equal to a predefined threshold ts.
    :param tag_dict: tag -> count mapping
    :param ts: rarity threshold
    :return:
    """
    rare_tags = []
    common_tags = []
    for t in tag_dict:
        if tag_dict[t] <= ts:
            rare_tags.append(t)
        else:
            common_tags.append(t)
    header = ["tag"]
    write_list_to_csv(rare_tags, rare_tags_fpath, header)
    write_list_to_csv(common_tags, common_tags_fpath, header)
    print("#rare tags : %s" % len(rare_tags), get_current_time() + '\n')
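# Usage sketch (file paths hypothetical; ts = 50 matches the main script below):
# any tag seen 50 times or fewer goes into the rare list.
#
#   tag_cnt = load_tag_cnt("_0_tag-count-all.csv")
#   identify_rare_tags(tag_cnt, "_1_rareTags.csv", "_1_commonTags.csv", ts=50)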
def get_all_topk_qlist(corpus_dir):
    parallel_list = [
        "0-1000000", "1000000-2000000", "2000000-3000000", "3000000-4000000",
        "4000000-5000000", "5000000-6000000", "6000000-7000000", "7000000-8000000",
        "8000000-9000000", "9000000-10000000", "10000000-11000000", "11000000-12000000",
        "12000000-13000000", "13000000-14000000", "14000000-15000000", "15000000-16000000",
        "16000000-None"
    ]
    qlist = list()
    for p in parallel_list:
        target_corpus_fpath = corpus_dir + os.sep + "_2_corpus-without-Raretag-%s.pkl" % p
        for q in load_pickle(target_corpus_fpath):
            # keep only questions created in 2014 or later
            yy = int(q.creation_date.split('-')[0])
            if yy >= 2014:
                qlist.append(q)
    print("# all qlist = %s" % len(qlist), get_current_time())
    return qlist
def extract_by_id_list(id_list, table_name, rare_tags):
    q_list = list()
    con = mdb.connect('localhost', 'root', 'root', db_name)
    cur = con.cursor()
    batch_size = 500
    total_batch = int(len(id_list) / batch_size) + 1
    count = 0
    for batch_idx in range(total_batch):
        if count % 10000 == 0:
            print('reading %s question from Table %s' % (count, table_name), get_current_time())
        count += batch_size
        batch_sql = "SELECT * FROM %s WHERE PostTypeId = 1 AND Id IN(" % table_name
        if batch_idx < total_batch - 1:
            for i in range(batch_size):
                idx = batch_idx * batch_size + i
                batch_sql += ('%s,' % id_list[idx])
        elif batch_idx == total_batch - 1:
            for idx in range(batch_idx * batch_size, len(id_list)):
                batch_sql += ('%s,' % id_list[idx])
        # strip the trailing comma and close the IN clause
        batch_sql = batch_sql[:-1] + ')'
        try:
            cur.execute(batch_sql)
            results = cur.fetchall()
            for row in results:
                # columns used: id, title, body, tags
                id = row[0]
                title = row[11]
                desc = row[6]
                tags = row[12].replace('<', ' ').replace('>', ' ').strip().split()
                clean_tags = list(set(tags) - set(rare_tags))
                raw_desc_text, desc_code = separate_text_code(desc)
                clean_desc_text = clean_html_tags(raw_desc_text)
                q = Question(id=id, title=title, desc_text=clean_desc_text,
                             desc_code=desc_code, tags=clean_tags)
                q_list.append(q)
        except Exception as e:
            print(e)
    cur.close()
    con.close()
    return q_list
def train_step(x_batch, y_batch):
    """
    A single training step
    """
    sequence_length = [len(sample) for sample in x_batch]
    feed_dict = {
        rcnn.X: x_batch,
        rcnn.y: y_batch,
        rcnn.sequence_length: sequence_length,
        rcnn.dropout_keep_prob: FLAGS.dropout_keep_prob
    }
    _, step, loss = sess.run([train_op, global_step, rcnn.loss], feed_dict)
    print("x_batch len %s" % len(x_batch))
    print("y_batch len %s" % len(y_batch))
    print("x_batch[0] len %s" % len(x_batch[0]))
    print("y_batch[0] len %s" % len(y_batch[0]))
    time_str = datetime.datetime.now().isoformat()
    if math.isnan(loss):
        print("train step Loss is nan!", get_current_time())
        exit()
    print("{}: step {}, loss {:g}".format(time_str, step, loss))
def dev_step(x_batch, y_batch, writer=None, test=False):
    """
    Evaluates model on a dev set
    """
    sequence_length = [len(np.nonzero(sample)[0]) for sample in x_batch]
    feed_dict = {
        rcnn.X: x_batch,
        rcnn.y: y_batch,
        rcnn.sequence_length: sequence_length,
        rcnn.dropout_keep_prob: 1.0
    }
    step, loss = sess.run([global_step, rcnn.loss], feed_dict)
    time_str = datetime.datetime.now().isoformat()
    if math.isnan(loss):
        print("dev step Loss is nan! Exit!", get_current_time())
        # exit(0)
    print("{}: step {}, loss {:g}".format(time_str, step, loss))
# input
#   id-tags-all.csv: "id", "tags"
id_tags_csv_fpath = dataset_dir + os.sep + "id-tags-all.csv"
# output
#   _0_tag-count-all.csv: "tag", "count"
tag_cnt_all_csv_fpath = dataset_dir + os.sep + "_0_tag-count-all.csv"

tag_cnt_dict = {}
tag_cnt_header = ["tag", "count"]

# tag cnt
row_num = 0
with open(id_tags_csv_fpath, 'r') as id_tags_file:
    rd = csv.reader(id_tags_file, escapechar='\\')
    for row in rd:
        tags = row[1].replace('<', ' ').replace('>', ' ').strip().split()
        row_num += 1
        for t in tags:
            if t in tag_cnt_dict:
                tag_cnt_dict[t] += 1
            else:
                tag_cnt_dict[t] = 1
        if row_num % 10000 == 0:
            print("Processing line %s" % row_num, get_current_time())

write_dict_to_csv(tag_cnt_dict, tag_cnt_all_csv_fpath, tag_cnt_header)
print("# Tags = %s" % len(tag_cnt_dict))
os.mkdir(parallel_dir)
print("Setting:\ntask : %s\ndataset : %s\n" % (task, dataset))

st_row_num = 13000000
et_row_num = 14000000
print("start line num = %s, end line num = %s" % (st_row_num, et_row_num))

# input
#   all-with-Raretag.csv: "id", "title", "desc", "creation_date", "tags"
all_raw_csv_fpath = dataset_dir + os.sep + 'all-with-Raretag.csv'
# output
#   _0_all-clean-with-Raretag.csv: "id", "title", "desc_text", "desc_code", "creation_date", "tags"
all_clean_csv_fpath = parallel_dir + os.sep + '_0_all-clean-with-Raretag-%s-%s.csv' % (st_row_num, et_row_num)

print("Preprocessing corpus %s" % all_raw_csv_fpath, get_current_time())
with open(all_raw_csv_fpath, 'r', encoding='utf-8', errors='surrogatepass') as all_f:
    rd = csv.reader(all_f, escapechar='\\')
    row_num = st_row_num
    cnt = 0
    corpus_header = ["id", "title", "desc_text", "desc_code", "creation_date", "tags"]
    with open(all_clean_csv_fpath, 'w') as out:
        wr = csv.writer(out)
        wr.writerow(corpus_header)
        # process only this worker's slice of the csv
        for row in islice(rd, st_row_num, et_row_num):
            row_num += 1
            qid = row[0]
            title = row[1]
            desc = row[2]
            creation_date = row[3]
            tags = row[4].replace('<', ' ').replace('>', ' ').strip().split()
if __name__ == '__main__':
    task = 'tagRec'
    dataset = "SO-05-Sep-2018"
    dataset_dir = data_dir + os.sep + task + os.sep + dataset
    parallel_dir = dataset_dir + os.sep + "parallel"

    st_row_num = 16000000
    et_row_num = None

    # ts
    ts = 50
    ts_dir = dataset_dir + os.sep + "ts%s" % ts
    ts_corpus_dir = ts_dir + os.sep + "corpus"
    if not os.path.exists(ts_corpus_dir):
        os.mkdir(ts_corpus_dir)
    print("start line num = %s, end line num = %s" % (st_row_num, et_row_num))

    # Input:
    target_corpus_fpath = parallel_dir + os.sep + "_0_all-clean-with-Raretag-%s-%s.csv" % (st_row_num, et_row_num)
    rare_tags_fpath = ts_dir + os.sep + "_1_rareTags.csv"
    # Output:
    corpus_fpath = ts_corpus_dir + os.sep + "_2_corpus-without-Raretag-%s-%s.pkl" % (st_row_num, et_row_num)

    rare_tags = load_tags(rare_tags_fpath)
    build_corpus(target_corpus_fpath, rare_tags, corpus_fpath)
    print('Done.', get_current_time())
model = load_model(args, param_fpath)

# import eval from the module matching the selected model (shadows the builtin eval)
if args.model_selection == "all":
    from main.tag_rec.approaches.post2vec.models.model_all import eval
elif args.model_selection == "title":
    from main.tag_rec.approaches.post2vec.models.model_title import eval
elif args.model_selection == "title_desc_text":
    from main.tag_rec.approaches.post2vec.models.model_title_desc_text import eval

print("Loading test data...")
# get sample test data
sample_test_data_dir = os.path.join(sample_K_dir, "sample_test")
for f in os.listdir(sample_test_data_dir):
    sample_test_data_fpath = os.path.join(sample_test_data_dir, f)
    sample_test_data = load_pickle(sample_test_data_fpath)
    print("#test data = %s loaded!" % len(sample_test_data), get_current_time())

    processed_test_data = padding_and_indexing_qlist(sample_test_data, len_dict, title_vocab,
                                                     desc_text_vocab, desc_code_vocab, tag_vocab)
    print("random mini batch", get_current_time())
    batches_test = random_mini_batch(processed_test_data, args.batch_size)
    pre, rc, f1, cnt = eval(batches_test, model, args, topk_list)

    # normalize the accumulated metrics by cnt
    pre[:] = [x / cnt for x in pre]
    rc[:] = [x / cnt for x in rc]
    f1[:] = [x / cnt for x in f1]
    print("# test : %s" % cnt)
    print("Precision\t\t%s" % ("\t".join(str(x) for x in pre)))
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        device_count={"GPU": 1},
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        print("input_x...", get_current_time())
        # input_y = graph.get_operation_by_name("input_y").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
        print("dropout_keep_prob...", get_current_time())

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]
        print("prediction...", get_current_time())

        # prepare test data
        sample_test_data_dir = os.path.join(sample_K_dir, "sample_test")
        test_data_cnt = 0
        res_str = ''
        for f in os.listdir(sample_test_data_dir):
model.load_state_dict(torch.load(args.snapshot))
if args.cuda:
    torch.cuda.set_device(args.device)
    model = model.cuda()
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
save_args(args)

#################################################################################
try:
    global_train_step = 0
    train_cnt = 0
    for f in sorted(os.listdir(train_dir)):
        print("\n\n# train file = %s" % train_cnt, get_current_time())
        train_cnt += 1
        train_data_fpath = os.path.join(train_dir, f)
        train_data = load_pickle(train_data_fpath)

        print("padding and indexing train", get_current_time())
        processed_train_data = padding_and_indexing_qlist(
            train_data, len_dict, title_vocab, desc_text_vocab, desc_code_vocab, tag_vocab)

        print("random mini batch train", get_current_time())
        batches_train = random_mini_batch(processed_train_data, args.batch_size)

        print("Start train %s..." % f, get_current_time())
        model, global_train_step = train(train_iter=batches_train, dev_iter=None,
                                         model=model, args=args,
                                         global_train_step=global_train_step)
# Load data
print("Loading data...")
f_cnt = 0
for f in os.listdir(train_dir):
    fpath = os.path.join(train_dir, f)
    print("Processing #%s %s" % (f_cnt, f), get_current_time())
    f_cnt += 1
    train_data = load_pickle(fpath)
    x, y = load_data_and_labels(qlist=train_data, text_vocab=text_vocab,
                                max_len=FLAGS.max_len, tag_vocab=tag_vocab)

    # shuffle the data before splitting
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(
ts_dir = dataset_dir + os.sep + "ts%s" % ts
ts_corpus_dir = ts_dir + os.sep + "corpus"

sample_k = "test100000"
sample_k_dir = ts_dir + os.sep + "data-%s" % sample_k
if not os.path.exists(sample_k_dir):
    os.mkdir(sample_k_dir)
print("Setting:\ntask : %s\ndataset : %s\nts : %s\nsample k : %s" % (task, dataset, ts, sample_k))

all_qlist = get_all_topk_qlist(ts_corpus_dir)
if sample_k == "all" or sample_k == "test100000":
    qlist = all_qlist
else:
    qlist = sample(all_qlist, sample_k)

print("Sorting...", get_current_time())
sorted_qlist = sorted(qlist, key=operator.attrgetter('creation_date'))

# train:test = X:100000, split chronologically so the test set is the most recent
size_of_test = 100000
train_data = sorted_qlist[:int(len(sorted_qlist) - size_of_test)]
test_data = sorted_qlist[int(len(sorted_qlist) - size_of_test):]
print("#train = %s, #test = %s" % (len(train_data), len(test_data)))

print("shuffling...", get_current_time())
shuffle(train_data)
shuffle(test_data)

train_dir = sample_k_dir + os.sep + "train"
if not os.path.exists(train_dir):
    os.mkdir(train_dir)
def save_pickle(data, fpath):
    print("Saving %s..." % fpath, get_current_time())
    with open(fpath, 'wb') as handle:
        cPickle.dump(data, handle)
    print("Saved.", get_current_time())
def load_pickle(fpath):
    print("Loading %s..." % fpath, get_current_time())
    with open(fpath, 'rb') as handle:
        data = cPickle.load(handle)
    print("Loaded.", get_current_time())
    return data
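# Round-trip sketch (path hypothetical):
#
#   save_pickle(q_list, "corpus/_2_corpus-without-Raretag.pkl")
#   q_list = load_pickle("corpus/_2_corpus-without-Raretag.pkl")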
def __init__(self, sequence_length, num_classes, vocab_size,
             embedding_size, filter_sizes, num_filters, l2_reg_lambda):
    # Placeholders for input, output and dropout
    self.input_x = tf.placeholder(tf.int32, [None, None], name="input_x")
    self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # Keeping track of l2 regularization loss (optional)
    l2_loss = tf.constant(0.0)

    # Embedding layer (could be pinned to the CPU with tf.device('/cpu:0'))
    with tf.name_scope("embedding"):
        print("embedding_textcnn...", get_current_time())
        self.W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W")
        print(self.input_x.shape)
        self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
        self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

    # Create a convolution + maxpool layer for each filter size
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution layer
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv = tf.nn.conv2d(
                self.embedded_chars_expanded,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")
            # Apply nonlinearity (tanh instead of relu)
            h = tf.nn.tanh(tf.nn.bias_add(conv, b), name="tanh")
            # Maxpooling over the outputs
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, sequence_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_outputs.append(pooled)

    # Combine all the pooled features
    num_filters_total = num_filters * len(filter_sizes)
    self.h_pool = tf.concat(pooled_outputs, 3)
    self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

    # Add dropout. A dropout layer stochastically "disables" a fraction of its
    # neurons, which prevents neurons from co-adapting and forces them to learn
    # individually useful features. The fraction of neurons kept enabled is
    # defined by the dropout_keep_prob input: around 0.5 during training and
    # 1.0 (dropout disabled) during evaluation.
    with tf.name_scope("dropout"):
        print("dropout_textcnn...", get_current_time())
        self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob, name="post2vec")

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        print("output_textcnn...", get_current_time())
        W = tf.get_variable(
            "W",
            shape=[num_filters_total, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
        # sigmoid converts each score into a probability in [0, 1]
        self.predictions = tf.nn.sigmoid(self.scores, name="predictions")

    # Calculate mean cross-entropy loss
    with tf.name_scope("loss"):
        print("loss_textcnn...", get_current_time())
        cross_entropy = -tf.reduce_sum(
            (self.input_y * tf.log(self.predictions + 1e-9)) +
            (1 - self.input_y) * tf.log(1 - self.predictions + 1e-9),
            name="xentropy")
        # tf.nn.softmax_cross_entropy_with_logits suits multi-class, not multi-label,
        # classification, so binary cross-entropy is computed by hand instead.
        self.loss = tf.reduce_mean(cross_entropy + l2_reg_lambda * l2_loss)
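    # The hand-written loss above is binary cross-entropy summed over every
    # (sample, tag) pair, plus L2 regularization:
    #
    #   loss = -sum_{i,j} [ y_ij * log(p_ij) + (1 - y_ij) * log(1 - p_ij) ] + lambda * ||W||^2
    #
    # The reduce_mean wraps a scalar, so it is effectively a no-op here, and the
    # 1e-9 terms guard against log(0) when a sigmoid output saturates.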
input_x = graph.get_operation_by_name("input_X").outputs[0]
input_sequence_length = graph.get_operation_by_name("input_sequence_length").outputs[0]
dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

# post2vec feature tensor taken from the dropout layer
post2vec = graph.get_tensor_by_name("dropout/post2vec/mul:0")

# Tensors we want to evaluate
predictions = graph.get_operation_by_name("output/predictions").outputs[0]

# prepare test data
sample_test_data_dir = os.path.join(sample_K_dir, "sample_test")
test_data_cnt = 0
res_str = ''
for f in os.listdir(sample_test_data_dir):
    print("Processing #%s test data %s..." % (test_data_cnt, f), get_current_time())
    test_data_cnt += 1
    sample_test_data_fpath = os.path.join(sample_test_data_dir, f)
    sample_test_data = load_pickle(sample_test_data_fpath)
    print("# test data = %s" % len(sample_test_data), get_current_time())
    x_test, y_test = load_data_and_labels(qlist=sample_test_data, text_vocab=text_vocab,
                                          max_len=FLAGS.max_len, tag_vocab=tag_vocab)

    # Generate batches for one epoch
    batches = batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

    # Collect the predictions here
    all_predictions = list()