Example #1
class EHRTokenizer(object):
    """Runs end-to-end tokenization"""
    def __init__(self, data_dir, special_tokens=("[PAD]", "[CLS]", "[MASK]")):

        self.vocab = Voc()

        # special tokens
        self.vocab.add_sentence(special_tokens)

        self.rx_voc = self.add_vocab(os.path.join(data_dir, 'rx-vocab.txt'))
        self.dx_voc = self.add_vocab(os.path.join(data_dir, 'dx-vocab.txt'))

    def add_vocab(self, vocab_file):
        voc = self.vocab
        specific_voc = Voc()
        with open(vocab_file, 'r') as fin:
            for code in fin:
                voc.add_sentence([code.rstrip('\n')])
                specific_voc.add_sentence([code.rstrip('\n')])
        return specific_voc

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab.word2idx[token])
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.vocab.idx2word[i])
        return tokens
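None of the examples on this page include the Voc class itself. Below is a minimal sketch, assuming only the interface the examples rely on (add_sentence, word2idx, idx2word); the project's actual implementation may differ.

class Voc(object):
    """Minimal vocabulary sketch: maps tokens to contiguous integer ids."""
    def __init__(self):
        self.idx2word = {}
        self.word2idx = {}

    def add_sentence(self, sentence):
        # sentence is any iterable of tokens (codes or special tokens).
        for word in sentence:
            if word not in self.word2idx:
                self.idx2word[len(self.word2idx)] = word
                self.word2idx[word] = len(self.word2idx)


voc = Voc()
voc.add_sentence(["[PAD]", "[CLS]", "[MASK]"])
print(voc.word2idx["[CLS]"])  # 1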
Example #2
    def __init__(self, data_dir, special_tokens=("[PAD]", "[CLS]", "[MASK]")):

        self.vocab = Voc()

        # special tokens
        self.vocab.add_sentence(special_tokens)

        self.rx_voc = self.add_vocab(os.path.join(data_dir, 'rx-vocab.txt'))
        self.dx_voc = self.add_vocab(os.path.join(data_dir, 'dx-vocab.txt'))

        # code only in multi-visit data
        self.rx_voc_multi = Voc()
        self.dx_voc_multi = Voc()
        self.rx_voc_multi_pa = Voc()
        self.dx_voc_multi_pa = Voc()
        with open(os.path.join(data_dir, 'rx-vocab-multi.txt'), 'r') as fin:
            for code in fin:
                self.rx_voc_multi.add_sentence([code.rstrip('\n')])
        with open(os.path.join(data_dir, 'dx-vocab-multi.txt'), 'r') as fin:
            for code in fin:
                self.dx_voc_multi.add_sentence([code.rstrip('\n')])
        with open(os.path.join(data_dir, 'rx-vocab-multi-pa.txt'), 'r') as fin:
            for code in fin:
                self.rx_voc_multi_pa.add_sentence([code.rstrip('\n')])
        with open(os.path.join(data_dir, 'dx-vocab-multi-pa.txt'), 'r') as fin:
            for code in fin:
                self.dx_voc_multi_pa.add_sentence([code.rstrip('\n')])
Example #3
    def add_vocab(self, vocab_file):
        voc = self.vocab
        specific_voc = Voc()
        with open(vocab_file, 'r') as fin:
            for code in fin:
                voc.add_sentence([code.rstrip('\n')])
                specific_voc.add_sentence([code.rstrip('\n')])
        return specific_voc
Example #4
    def __init__(self, data_dir, special_tokens=("[PAD]", "[CLS]", "[MASK]")):

        self.vocab = Voc()

        # special tokens
        self.vocab.add_sentence(special_tokens)

        self.rx_voc = self.add_vocab(os.path.join(data_dir, 'rx-vocab.txt'))
        self.dx_voc = self.add_vocab(os.path.join(data_dir, 'dx-vocab.txt'))
Example #5
def build_atc_tree(unique_codes):
    res = []
    graph_voc = Voc()

    root_node = 'atc_root'
    for code in unique_codes:
        sample = [code] + [code[:i] for i in [4, 3, 1]] + [root_node]

        graph_voc.add_sentence(sample)
        res.append(sample)

    return res, graph_voc
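A worked call (standard 7-character ATC codes; Voc as sketched after Example #1): each sample holds the code and its prefix ancestors up to the root.

# 'A01AB02' -> prefixes of length 4, 3 and 1, then the shared root node.
res, graph_voc = build_atc_tree(['A01AB02', 'N02BE01'])
print(res[0])  # ['A01AB02', 'A01A', 'A01', 'A', 'atc_root']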
Example #6
def build_icd9_tree(unique_codes):
    res = []
    graph_voc = Voc()

    root_node = 'icd9_root'
    level3_dict = expand_level2()
    for code in unique_codes:
        level1 = code
        level2 = level1[:4] if level1[0] == 'E' else level1[:3]
        level3 = level3_dict[level2]
        level4 = root_node

        sample = [level1, level2, level3, level4]

        graph_voc.add_sentence(sample)
        res.append(sample)

    return res, graph_voc
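For a concrete code the levels are built as follows; expand_level2() is defined elsewhere in the project, so the chapter-level value is left symbolic here.

code = '4019'                                      # ICD-9 401.9, essential hypertension, unspecified
level2 = code[:4] if code[0] == 'E' else code[:3]  # -> '401' (E-codes keep four characters)
# sample == ['4019', '401', level3_dict['401'], 'icd9_root']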
Example #7
def corpus(input, output, suffix='sdf'):
    if suffix == 'sdf':
        inf = gzip.open(input)
        mols = Chem.ForwardSDMolSupplier(inf)
        # mols = [mol for mol in suppl]
    else:
        df = pd.read_table(input).Smiles.dropna()
        mols = [Chem.MolFromSmiles(s) for s in df]
    voc = Voc('data/voc_smiles.txt')
    charger = rdMolStandardize.Uncharger()
    chooser = rdMolStandardize.LargestFragmentChooser()
    disconnector = rdMolStandardize.MetalDisconnector()
    normalizer = rdMolStandardize.Normalizer()
    words = set()
    canons = []
    tokens = []
    smiles = set()
    for mol in tqdm(mols):
        try:
            mol = disconnector.Disconnect(mol)
            mol = normalizer.normalize(mol)
            mol = chooser.choose(mol)
            mol = charger.uncharge(mol)
            mol = disconnector.Disconnect(mol)
            mol = normalizer.normalize(mol)
            smileR = Chem.MolToSmiles(mol, 0)
            smiles.add(Chem.CanonSmiles(smileR))
        except Exception:
            print('Parsing Error:') #, Chem.MolToSmiles(mol))

    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redundant', smile)
            continue
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))
    with open(output + '_voc.txt', 'w') as log:
        log.write('\n'.join(sorted(words)))

    log = pd.DataFrame()
    log['Smiles'] = canons
    log['Token'] = tokens
    log = log.drop_duplicates(subset='Smiles')
    log.to_csv(output + '_corpus.txt', sep='\t', index=False)
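A hypothetical invocation; the paths are placeholders, and the function also expects RDKit, pandas, tqdm and a Voc built from data/voc_smiles.txt to be available.

# Hypothetical paths: a gzipped SDF as input, and an output prefix for the
# generated <prefix>_voc.txt and <prefix>_corpus.txt files.
corpus('data/chembl.sdf.gz', 'data/chembl', suffix='sdf')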
Example #8
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from utils import Voc

voc = Voc()
df = pd.read_csv("./data/char.train.csv")
voc.add(df["content"])
voc.dumps("./data/voc.json")
        self.num_layers = 2
        self.pool_kernal = 4
        self.dim_after_pool = int(
            np.ceil((self.hid_dim * 2 - self.pool_kernal) / self.pool_kernal) +
            1)
        self.aspect_dim = 64


filename = "./data/char.valid.csv"
# %%
configs = Configs1()
model = BilstmAspectAttPool(configs)
model.load_state_dict(torch.load("./model-zoo/bilstm_aspect_att_pool2.pt"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
voc = Voc()
voc.loads("./data/voc.json")
df = pd.read_csv("./data/char.test.csv")
columns = df.columns[-20:]
pred_df = df[columns].copy()
# %%
output_list = []
y_list = []

# %%
for i, content in tqdm(enumerate(df["content"])):
    seq = voc.sentence2idx(content.split(" "))
    seq_len = torch.LongTensor([len(seq)])
    seq = torch.LongTensor(seq)
    seq = seq.unsqueeze(-1)
    seq = seq.to(device)
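The Voc imported from utils in this example exposes yet another interface (add, dumps, loads, sentence2idx). A minimal sketch consistent with those calls, assuming JSON storage and "<pad>"/"<unk>" entries; the project's real class may differ.

import json

class Voc:
    def __init__(self):
        # The "<pad>"/"<unk>" entries are an assumption of this sketch.
        self.word2idx = {"<pad>": 0, "<unk>": 1}

    def add(self, contents):
        # contents: an iterable of space-separated strings, e.g. df["content"].
        for sentence in contents:
            for word in str(sentence).split(" "):
                self.word2idx.setdefault(word, len(self.word2idx))

    def dumps(self, path):
        with open(path, "w") as f:
            json.dump(self.word2idx, f)

    def loads(self, path):
        with open(path) as f:
            self.word2idx = json.load(f)

    def sentence2idx(self, words):
        return [self.word2idx.get(w, self.word2idx["<unk>"]) for w in words]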
Example #10
import pickle
import os
import re
import torch
import yaml
from utils import Voc, Config, dump_pickle

FILEPATH = '/home/lanco/zhaoliang/KB/en_concept_net_extracted.csv'
ROOTPATH = '/home/lanco/zhaoliang/KB/'
edgeList = []
errorList = []
nodeList = set()
relationList = []

config = Config(os.path.join(ROOTPATH, 'config.yml'))

voc = Voc(config)

try:
    with open(FILEPATH, 'r') as file:
        for index, line in enumerate(file):
            if index % 100000 == 0:
                print('processing %d' % index)
            lineSearch = re.search(
                r"/a/\[/r/(.+)/,/c/en/(.+?)/.*,/c/en/(.+)/\]", line)
            if (lineSearch is not None and lineSearch.group(1) is not None
                    and lineSearch.group(2) is not None):
                voc.addWord(lineSearch.group(3))
                voc.addWord(lineSearch.group(2))
                if lineSearch.group(1) not in relationList:
                    relationList.append(lineSearch.group(1))
            else:
Example #11
import time
import pickle
import unicodedata

import torch

# Voc, normalizeString and indexesFromSentence come from the project's own
# helper modules (not shown in this snippet).

with open('dic.pkl', 'rb') as f:
    dic = pickle.load(f)
loadFilename = "300000_checkpoint.tar"
USE_CUDA = torch.cuda.is_available()
#device = torch.device("cuda" if USE_CUDA else "cpu")
device = torch.device("cpu")
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

corpus_name = 'expand_abbr'
voc = Voc(corpus_name)
attn_model = 'dot'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 100
MAX_LENGTH = 200


def evaluate(sentence, max_length=MAX_LENGTH):
    time_start = time.time()
    sentence = normalizeString(sentence)
    sentence = unicodedata.normalize('NFD', sentence)
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])