Example #1
def get_sentences():
    data = load_description()
    ids = lmap(lambda x: x['icd10_code'].strip(), data)
    input2 = lmap(lambda x: x['short_desc'], data)
    desc_tokens = lmap(nltk.word_tokenize, input2)

    l = []
    for id_token, desc_token in zip(ids, desc_tokens):
        l.append([id_token] + desc_token)
    return l
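load_description and lmap are project helpers that are not shown in these snippets; lmap behaves like a list-returning map. Since each sentence returned by get_sentences is an ICD-10 code followed by its description tokens, the output can be fed straight into a word2vec trainer such as gensim's Word2Vec, which would yield the w2v model evaluated in Example #3. A minimal sketch, assuming gensim 4.x and placeholder hyperparameters:

from gensim.models import Word2Vec

def lmap(func, iterable):
    # list-returning map, as assumed for the helper used throughout these examples
    return list(map(func, iterable))

sentences = get_sentences()          # [[code, token_1, ..., token_n], ...]
model = Word2Vec(sentences, vector_size=100, window=10,
                 min_count=1, sg=1)  # hyperparameters are placeholders
w2v = model.wv                       # KeyedVectors exposing most_similar()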
Example #2
def manually_build_embedding():
    data = load_description()
    n_input_voca = data[-1]['order_number']
    dim = 100
    n_output_voca, word2idx = build_voca(data)
    icd10_codes, ids, train_tokens = extract_data(data, word2idx)
    print("n_output_voca", n_output_voca)
    W1, W2, code_id_to_code = build_embedding(dim, icd10_codes, ids,
                                              n_input_voca, n_output_voca,
                                              train_tokens)
    all_voca = export_embeddings(W1, W2, code_id_to_code, ids, word2idx)
    save_word2vec_format(all_voca, "manual_voca.txt")
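save_word2vec_format is not included in the snippet. Assuming all_voca is a list of (word, vector) pairs, writing it in the standard word2vec text format (a header line with the vocabulary size and dimension, then one word and its values per line) could look like this sketch, which is not necessarily the project's implementation:

def save_word2vec_format(all_voca, path):
    # all_voca: list of (word, numpy vector) pairs (assumed)
    dim = len(all_voca[0][1])
    with open(path, "w") as f:
        f.write("{} {}\n".format(len(all_voca), dim))
        for word, emb in all_voca:
            f.write(word + " " + " ".join("{:f}".format(v) for v in emb) + "\n")

A file in this format can be loaded back with gensim's KeyedVectors.load_word2vec_format("manual_voca.txt", binary=False).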
Example #3
File: eval.py  Project: clover3/Chair
def do_eval(w2v):
    data = load_description()

    ids = set(lmap(lambda x: x['icd10_code'].strip(), data))
    ap_list = []
    for e in data[:1000]:
        word = e['icd10_code'].strip()
        terms = nltk.word_tokenize(e['short_desc'])
        ranked_list = [w for w in w2v.most_similar(word, topn=50)
                       if w[0] not in ids]

        def is_correct(w_pair):
            return w_pair[0] in terms

        AP = AP_from_binary(lmap(is_correct, ranked_list), len(terms))
        ap_list.append(AP)

    print("MAP", sum(ap_list) / len(ap_list))
Example #4
File: n_model.py  Project: clover3/Chair
def train_loop(args):
    model_dir = os.path.join(model_path, "w2v_model")
    data = load_description()
    n_input_voca = data[-1]['order_number']
    input2 = lmap(lambda x: x['short_desc'], data)
    dim = 1000
    max_seq = 1
    batch_size = int(args.batch_size)
    epochs = int(args.epochs)
    lr = float(args.lr)
    print("learning rate", lr)
    print("Batch_size", batch_size)
    enc_text, tokenizer = fit_and_tokenize(input2)
    token_config = tokenizer.get_config()
    n_output_voca = len(token_config['word_index'])

    random.shuffle(data)
    train_size = int(0.1 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:]

    step_per_epoch = int(len(train_data) / batch_size)
    max_step = step_per_epoch * epochs
    config = tf.estimator.RunConfig().replace(
        keep_checkpoint_max=1,
        log_step_count_steps=10,
        save_checkpoints_steps=step_per_epoch)
    tf_logging = logging.getLogger('tensorflow')
    tf_logging.setLevel(logging.DEBUG)
    print("Building_estimator")
    estimator = tf.estimator.Estimator(model_dir=model_dir,
                                       model_fn=build_model_fn(
                                           lr, dim, max_seq, n_input_voca,
                                           n_output_voca),
                                       config=config)

    print("start training")
    # estimator.train(
    #     input_fn=lambda :input_fn(train_data, tokenizer, max_seq),
    #     steps=max_step
    # )
    # Estimator.predict() takes no steps argument and returns a lazy generator
    predictions = estimator.predict(
        input_fn=lambda: input_fn(train_data, tokenizer, max_seq))
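fit_and_tokenize, build_model_fn, and input_fn are project functions that are not included here. input_fn presumably returns a batched tf.data.Dataset (Example #5 passes its result straight to model.fit); one way it might be written, with hypothetical feature names and batch size, is:

import tensorflow as tf

def input_fn(data, tokenizer, max_seq, batch_size=64):
    # hypothetical sketch: pair each code id with its padded description tokens
    code_ids = [e['order_number'] for e in data]
    token_ids = tokenizer.texts_to_sequences([e['short_desc'] for e in data])
    token_ids = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, maxlen=max_seq, padding='post')
    dataset = tf.data.Dataset.from_tensor_slices(
        {"code_id": code_ids, "tokens": token_ids})
    return dataset.shuffle(10000).batch(batch_size)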
Example #5
File: code.py  Project: clover3/Chair
def work(args):
    data = load_description()
    n_input_voca = data[-1]['order_number']
    input2 = lmap(lambda x: x['short_desc'], data)
    dim = 1000
    max_seq = 1
    batch_size = int(args.batch_size)
    epochs = int(args.epochs)
    lr = float(args.lr)
    print("learning rate", lr)
    print("Batch_size", batch_size)
    enc_text, tokenizer = fit_and_tokenize(input2)

    random.shuffle(data)
    train_size = int(0.1 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:]

    step_per_epoch = int(len(train_data) / batch_size)

    train_dataset = input_fn(train_data, tokenizer, max_seq)
    val_dataset = input_fn(val_data, tokenizer, max_seq)
    token_config = tokenizer.get_config()
    n_output_voca = len(token_config['word_index'])
    loss, model = build_model(dim, max_seq, n_input_voca, n_output_voca)
    model.add_loss(loss)
    #model = multi_gpu_model(model, 4, cpu_relocation=True)
    optimizer = tf.keras.optimizers.Adam(lr=lr,
                                         beta_1=0.9,
                                         beta_2=0.999,
                                         amsgrad=False)

    model.compile(optimizer=optimizer)
    model.fit(
        train_dataset,
        # validation_data=val_dataset,
        # validation_steps=3000,
        epochs=epochs,
        steps_per_epoch=step_per_epoch,
        batch_size=batch_size)
    model.save('my_model.h5')
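work expects an object with batch_size, epochs, and lr attributes, so it is presumably driven by argparse; a minimal invocation sketch with placeholder defaults:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default="64")   # converted to int inside work()
    parser.add_argument("--epochs", default="10")
    parser.add_argument("--lr", default="0.001")
    work(parser.parse_args())

Note that in recent tf.keras versions the lr keyword of Adam is a deprecated alias for learning_rate, and batch_size is ignored or rejected by fit when the input is an already-batched tf.data.Dataset.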
Example #6
File: code.py  Project: clover3/Chair
def work2():
    data = load_description()
    n_input_voca = data[-1]['order_number']
    input2 = lmap(lambda x: x['short_desc'], data)
    dim = 100
    max_seq = 30

    all_text = tokenize(input2)
    voca = set(flatten(all_text))

    word2idx = {}
    for idx, word in enumerate(list(voca)):
        word2idx[word] = idx

    def tokens_to_idx(tokens):
        return [word2idx[t] for t in tokens]

    random.shuffle(data)
    train_size = int(0.9 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:]
    train_ids = lmap(lambda x: x['order_number'], train_data)
    icd10_codes = lmap(lambda x: x['icd10_code'], train_data)
    train_desc = lmap(lambda x: x['short_desc'], train_data)
    train_tokens = lmap(tokens_to_idx, train_desc)

    n_output_voca = len(voca)
    print("n_output_voca", n_output_voca)
    W2 = np.random.normal(0, 1, [n_output_voca + 1, dim])  # word (output) embeddings
    W1 = np.zeros([n_input_voca + 1, dim])  # code (input) embeddings, filled below

    icd10_codes = lmap(lambda x: x.strip(), icd10_codes)

    add_subword = False

    code_id_to_code = {}
    code_to_code_id = {}
    for code_id, icd10_code, text_seq in zip(train_ids, icd10_codes,
                                             train_tokens):
        # sum the word vectors of the description to form the code embedding
        for idx in text_seq:
            W1[code_id] += W2[idx]

        code_id_to_code[code_id] = icd10_code
        code_to_code_id[icd10_code] = code_id

        l = len(icd10_code)
        if add_subword:
            for j in range(1, l - 1):
                substr = icd10_code[:j]
                if substr in code_id_to_code:
                    W1[code_id] += W1[code_to_code_id[substr]]

    # length-normalize the word vectors for cosine similarity
    new_w2 = [W2[i] / np.linalg.norm(W2[i]) for i in range(n_output_voca)]

    all_voca = []
    for code_id in train_ids:
        icd10_code = code_id_to_code[code_id]
        word = icd10_code
        emb = W1[code_id]
        all_voca.append((word, emb))

    print("Testing")
    AP_list = []
    for code_id, text_seq in zip(train_ids, train_tokens):
        a = W1[code_id] / np.linalg.norm(W1[code_id])
        l = []
        for j in range(n_output_voca):
            b = new_w2[j]
            e = j, np.dot(a, b)
            l.append(e)

        l.sort(key=lambda x: x[1], reverse=True)
        ranked_list = l[:50]
        terms = text_seq

        def is_correct(w_pair):
            return w_pair[0] in terms

        AP = AP_from_binary(lmap(is_correct, ranked_list), len(terms))
        AP_list.append(AP)
        print("1")
        if len(AP_list) > 100:
            break

    print(sum(AP_list) / len(AP_list))
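The ranking loop above computes one dot product per vocabulary word in pure Python. The same scoring can be done with a single matrix multiplication over the arrays already built in this example (W1, new_w2, code_id), which is typically much faster; a sketch:

import numpy as np

W2_norm = np.stack(new_w2)                       # (n_output_voca, dim), unit-length rows
a = W1[code_id] / np.linalg.norm(W1[code_id])    # normalized code embedding
scores = W2_norm @ a                             # cosine similarity to every word
top = np.argsort(-scores)[:50]                   # indices of the 50 best-scoring words
ranked_list = [(int(j), float(scores[j])) for j in top]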