Example #1
def build_query(data_path, w2v_path, vocab_path, k):
    '''
    Build the expanded query list.

    Args:
        data_path str path to the query (topic) file
        w2v_path str path to the word-vector model
        vocab_path str path to the vocabulary file
        k int number of most similar words to return per term
    Returns:
        query_list list the expanded queries
    '''
    # Load the word-vector model and the vocabulary
    w2v_model = load_model(w2v_path)
    vocab = pickle_load(vocab_path)
    query_list = []
    # Parse the XML topic file
    query_dict = {'disease': [], 'gene': [], 'demographic': [], 'other': []}
    query_dict = xml_parse(data_path, query_dict, query_dict.keys())
    disease_field_list = query_dict['disease']
    gene_field_list = query_dict['gene']
    demographic_field_list = query_dict['demographic']
    other_field_list = query_dict['other']
    del query_dict
    # Iterate over the queries
    for i in range(len(disease_field_list)):
        query_tmp_list = []
        # Get the query terms of a single query
        disease_field_list[i] = preprocess(disease_field_list[i])
        disease_list = disease_field_list[i].split(' ')
        gene_field_list[i] = preprocess(gene_field_list[i])
        gene_list = gene_field_list[i].split(' ')
        other_field_list[i] = preprocess(other_field_list[i])
        other_list = other_field_list[i].split(' ')
        demographic_list = demographic_split(demographic_field_list[i])
        # Lemmatize the raw query terms and remove stop words
        disease_clean_list = clean_data(disease_list)
        gene_clean_list = clean_data(gene_list)
        demographic_clean_list = clean_data(demographic_list)
        other_clean_list = clean_data(other_list)
        # Query expansion (includes stemming and stop-word removal)
        query_tmp_list.append(
            query_extension(disease_clean_list, w2v_model, vocab, k))
        query_tmp_list.append(
            query_extension(gene_clean_list, w2v_model, vocab, k))
        query_tmp_list.append(
            query_extension(demographic_clean_list, w2v_model, vocab, k))
        query_tmp_list.append(
            query_extension(other_clean_list, w2v_model, vocab, k))
        query_list.append(query_tmp_list)
    return query_list
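A hedged usage sketch for the function above; the file paths and the value of k are placeholders, not values taken from the original project:

# Hypothetical file locations: a topics XML file, a trained word2vec model and a pickled vocabulary.
expanded_query_list = build_query('data/topics.xml', 'model/w2v.model', 'model/vocab.pkl', k=5)
# Each entry holds the expanded disease, gene, demographic and other term lists for one topic.
print(expanded_query_list[0])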
Example #2
def generate_tags(dataset, title, body):
    # Data Preparation
    # ==================================================

    path = os.path.join('model', dataset)
    text = data_helpers.preprocess(title, body)
    x_text = [data_helpers.clean_str(text)]

    # Restore vocab file
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(os.path.join(
        path, 'vocab'))

    # Use transform (not fit_transform) so the restored vocabulary is left unchanged at inference time.
    x = np.array(list(vocab_processor.transform(x_text)))
    tags_df = pd.read_csv(os.path.join(path, 'tags_df.csv'), encoding='utf8', index_col=0)
    tag_list = tags_df['TagName'].tolist()

    # prediction
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.compat.v1.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            intra_op_parallelism_threads=3,
            inter_op_parallelism_threads=3)
        sess = tf.compat.v1.Session(config=session_conf)
        with sess.as_default():
            rcnn = RCNN(
                num_classes=len(tag_list),
                vocab_size=len(vocab_processor.vocabulary_),
                embedding_size=100,
                hidden_units=100,
                context_size=50,
                max_sequence_length=x.shape[1])
                # l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.compat.v1.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(rcnn.loss)
            train_op = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())

            # Loading checkpoint
            save_path = os.path.join(path, "model")
            saver.restore(sess, save_path)

            # predict
            sequence_length = [len(sample) for sample in x]
            feed_dict = {
                rcnn.X: x,
                rcnn.sequence_length: sequence_length,
                # rcnn.max_sequence_length: max_sequence_length,
                rcnn.dropout_keep_prob: 1.0
            }
            prediction = sess.run([rcnn.predictions], feed_dict)[0][0]
            idx = prediction.argsort()[-5:][::-1]
            tags = [tag_list[i] for i in idx]
    return tags
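A hedged usage sketch; the dataset name and question text below are invented, and the dataset must correspond to a trained model directory under model/:

# Hypothetical inputs.
tags = generate_tags('stackoverflow',
                     'How do I parse an XML file in Python?',
                     'I need to read a few fields from an XML response and build a dictionary from them.')
print(tags)  # the five highest-scoring tags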
Example #3
def main(argv=None):
    x_train, y_train, vocabulary, x_dev, y_dev = data_helpers.preprocess(did)
    train(x_train, y_train, vocabulary, x_dev, y_dev)
Example #4
def main(_):
    x_train, y_train, vocabulary, x_dev, y_dev = data_helpers.preprocess(did)
    cell_types =["gru"]
    for cell_type in cell_types:
        train(x_train, y_train, vocabulary, x_dev, y_dev,cell_type)
Example #5
def add_ngram(sequences, token_indice, ngram_range=2):
    # The head of this function is missing from the snippet; reconstructed here,
    # assuming the standard fastText n-gram augmentation: append the id of every
    # known n-gram to the end of each sequence.
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for i in range(len(new_list) - ngram_range + 1):
            for ngram_value in range(2, ngram_range + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences


# ngram_range = 2 adds bi-gram features; 3 adds bi-grams and tri-grams
ngram_range = 3
batch_size = 64
embedding_dim = 128
epochs = 100

print('Loading data...')
# get data
x_train, y_train, x_test, y_test, word2index = data_helpers.preprocess()
max_features = len(word2index)
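
# create_ngram_set is not shown in this snippet; a minimal sketch of what it
# presumably does (the usual fastText helper): collect the set of n-gram tuples
# of a given size from one integer sequence.
def create_ngram_set(input_list, ngram_value=2):
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))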

print('get ngram feature...')
if ngram_range > 1:
    print(str(ngram_range) + '-gram')
    # Create the set of unique n-grams from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
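    # The original snippet is cut off here; a hedged sketch of the step the
    # comment above describes, following the common fastText recipe (the names
    # below mirror the ones already used in this example).
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}

    # max_features now has to cover the n-gram ids as well.
    max_features = max(token_indice.values()) + 1

    # Augment the integer sequences with the ids of their known n-grams.
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)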
Example #6
def main(argv=None):
    x_train, y_train, vocab_processor, x_dev, y_dev = data_helpers.preprocess()
    # print(y_train)
    train(x_train, y_train, vocab_processor, x_dev, y_dev)