def build_query(data_path, w2v_path, vocab_path, k):
    '''Build the expanded query list.

    Args:
        data_path  str  path to the query file
        w2v_path   str  path to the word-vector model
        vocab_path str  path to the vocabulary file
        k          int  number of nearest neighbours to return per term
    Returns:
        query_list list expanded query list
    '''
    # Load the word-vector model and the vocabulary
    w2v_model = load_model(w2v_path)
    vocab = pickle_load(vocab_path)
    query_list = []
    # Parse the XML document
    query_dict = {'disease': [], 'gene': [], 'demographic': [], 'other': []}
    query_dict = xml_parse(data_path, query_dict, query_dict.keys())
    disease_field_list = query_dict['disease']
    gene_field_list = query_dict['gene']
    demographic_field_list = query_dict['demographic']
    other_field_list = query_dict['other']
    del query_dict
    # Iterate over the queries
    for i in range(len(disease_field_list)):
        query_tmp_list = []
        # Collect the terms of one query
        disease_field_list[i] = preprocess(disease_field_list[i])
        disease_list = disease_field_list[i].split(' ')
        gene_field_list[i] = preprocess(gene_field_list[i])
        gene_list = gene_field_list[i].split(' ')
        other_field_list[i] = preprocess(other_field_list[i])
        other_list = other_field_list[i].split(' ')
        demographic_list = demographic_split(demographic_field_list[i])
        # Lemmatize the raw query terms and remove stop words
        disease_clean_list = clean_data(disease_list)
        gene_clean_list = clean_data(gene_list)
        demographic_clean_list = clean_data(demographic_list)
        other_clean_list = clean_data(other_list)
        # Query expansion (includes stemming and stop-word removal)
        query_tmp_list.append(
            query_extension(disease_clean_list, w2v_model, vocab, k))
        query_tmp_list.append(
            query_extension(gene_clean_list, w2v_model, vocab, k))
        query_tmp_list.append(
            query_extension(demographic_clean_list, w2v_model, vocab, k))
        query_tmp_list.append(
            query_extension(other_clean_list, w2v_model, vocab, k))
        query_list.append(query_tmp_list)
    return query_list
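# query_extension is used above but not shown in this snippet. A minimal sketch
# of one plausible implementation, assuming w2v_model is a gensim Word2Vec model
# and vocab is a set/dict of allowed terms (both assumptions, not the original
# implementation): each term is expanded with up to k of its nearest neighbours
# that also appear in the vocabulary.
def query_extension_sketch(terms, w2v_model, vocab, k):
    expanded = list(terms)
    for term in terms:
        if term not in w2v_model.wv:
            continue
        for neighbour, _score in w2v_model.wv.most_similar(term, topn=k):
            if neighbour in vocab and neighbour not in expanded:
                expanded.append(neighbour)
    return expanded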
def generate_tags(dataset, title, body):
    # Data Preparation
    # ==================================================
    path = os.path.join('model', dataset)
    text = data_helpers.preprocess(title, body)
    x_text = [data_helpers.clean_str(text)]

    # Restore vocab file and map the input text to word ids
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(
        os.path.join(path, 'vocab'))
    x = np.array(list(vocab_processor.transform(x_text)))

    tags_df = pd.read_csv(os.path.join(path, 'tags_df.csv'),
                          encoding='utf8', index_col=0)
    tag_list = tags_df['TagName'].tolist()

    # Prediction
    # ==================================================
    with tf.Graph().as_default():
        session_conf = tf.compat.v1.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            intra_op_parallelism_threads=3,
            inter_op_parallelism_threads=3)
        sess = tf.compat.v1.Session(config=session_conf)
        with sess.as_default():
            rcnn = RCNN(
                num_classes=len(tag_list),
                vocab_size=len(vocab_processor.vocabulary_),
                embedding_size=100,
                hidden_units=100,
                context_size=50,
                max_sequence_length=x.shape[1])
                # l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.compat.v1.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(rcnn.loss)
            train_op = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)

            saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())

            # Loading checkpoint
            save_path = os.path.join(path, "model")
            saver.restore(sess, save_path)

            # Predict
            sequence_length = [len(sample) for sample in x]
            feed_dict = {
                rcnn.X: x,
                rcnn.sequence_length: sequence_length,
                # rcnn.max_sequence_length: max_sequence_length,
                rcnn.dropout_keep_prob: 1.0
            }
            prediction = sess.run([rcnn.predictions], feed_dict)[0][0]
            # Keep the five highest-scoring tags
            idx = prediction.argsort()[-5:][::-1]
            tags = [tag_list[i] for i in idx]
            return tags
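# Minimal usage sketch for generate_tags. The dataset name, title and body below
# are hypothetical; the function expects a trained checkpoint, a 'vocab' file and
# 'tags_df.csv' under model/<dataset>/ as loaded above.
if __name__ == '__main__':
    predicted = generate_tags(
        'stackoverflow',                                   # hypothetical model directory
        'How do I parse JSON in Python?',                  # hypothetical question title
        'I have a JSON string and want to turn it into a dict.')
    print(predicted)  # five highest-scoring tags, best first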
def main(argv=None):
    x_train, y_train, vocabulary, x_dev, y_dev = data_helpers.preprocess(did)
    train(x_train, y_train, vocabulary, x_dev, y_dev)
def main(_):
    x_train, y_train, vocabulary, x_dev, y_dev = data_helpers.preprocess(did)
    cell_types = ["gru"]
    for cell_type in cell_types:
        train(x_train, y_train, vocabulary, x_dev, y_dev, cell_type)
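# Entry-point sketch: the main(_) signature above matches what
# tf.compat.v1.app.run() expects; it parses any defined flags and then calls
# main. `did` is assumed to be set at module level (it is not defined in this
# snippet), and cell_types could be extended, e.g. ["gru", "lstm"], to train
# several cell types in one run.
if __name__ == '__main__':
    tf.compat.v1.app.run()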
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences


# ngram_range = 2 will add bi-grams features
ngram_range = 3
batch_size = 64
embedding_dim = 128
epochs = 100

print('Loading data...')
# get data
x_train, y_train, x_test, y_test, word2index = data_helpers.preprocess()
max_features = len(word2index)

print('get ngram feature...')
if ngram_range > 1:
    print(str(ngram_range) + '-gram')
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)
    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
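# create_ngram_set is called above but not defined in this snippet. A minimal
# sketch consistent with how it is used (returning the set of n-gram tuples of
# length ngram_value found in a token-id sequence):
def create_ngram_set_sketch(input_list, ngram_value=2):
    # e.g. [1, 4, 9, 4] with ngram_value=2 -> {(1, 4), (4, 9), (9, 4)}
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))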
def main(argv=None):
    x_train, y_train, vocab_processor, x_dev, y_dev = data_helpers.preprocess()
    # print(y_train)
    train(x_train, y_train, vocab_processor, x_dev, y_dev)