Example #1
def train():
    """Driver function"""

    # assumes a module-level `cfg` (configparser) loaded elsewhere
    corpus = os.path.join(os.environ['DATA_ROOT'], cfg.get('args', 'train'))
    step = cfg.getint('args', 'step')
    maxlen = cfg.getint('args', 'maxlen')
    mintf = cfg.getint('args', 'mintf')

    dp = DatasetProvider(corpus, step, maxlen, min_tf=mintf)
    dp.memory_footprint()

    model = get_model(len(dp.token2int), maxlen)

    for x, y in dp.read_train_data_from_file():

        y = to_categorical(y, len(dp.token2int))
        print('x memory footprint:', hurry.filesize.size(x.nbytes))
        print('y memory footprint:', hurry.filesize.size(y.nbytes))
        print('x shape:', x.shape)
        print('y shape:', y.shape)

        model.fit(x,
                  y,
                  epochs=cfg.getint('nn', 'epochs'),
                  batch_size=cfg.getint('nn', 'batch'),
                  verbose=1,
                  validation_split=0.0)

    return model, dp
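These snippets read their settings from an INI file passed as sys.argv[1] and resolve data paths against the DATA_ROOT environment variable; Example #1 additionally assumes a module-level cfg. A minimal sketch of the kind of config it expects, with hypothetical values (the section and key names are taken from the cfg.get calls above):

import configparser

cfg = configparser.ConfigParser()
cfg.read_string("""
[args]
train = train_corpus.txt
step = 5
maxlen = 100
mintf = 2

[nn]
epochs = 10
batch = 32
""")
assert cfg.getint('args', 'step') == 5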
Example #2
def run_cross_validation(disease, judgement):
    """Run n-fold CV on training set"""

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    print_config(cfg)

    base = os.environ['DATA_ROOT']
    data_dir = os.path.join(base, cfg.get('data', 'train_data'))
    annot_xml = os.path.join(base, cfg.get('data', 'train_annot'))
    dataset = DatasetProvider(data_dir,
                              annot_xml,
                              disease,
                              judgement,
                              use_pickled_alphabet=False,
                              min_token_freq=cfg.getint(
                                  'args', 'min_token_freq'))
    x, y = dataset.load()

    classes = len(dataset.label2int)
    maxlen = max([len(seq) for seq in x])
    x = pad_sequences(x, maxlen=maxlen)
    y = to_categorical(y, classes)

    cv_scores = []
    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=100)
    for train_indices, test_indices in kf.split(x):

        train_x = x[train_indices]
        train_y = y[train_indices]
        test_x = x[test_indices]
        test_y = y[test_indices]

        model = get_model(cfg, dataset.token2int, maxlen, classes, 'softmax')
        optimizer = RMSprop(lr=cfg.getfloat('nn', 'learnrt'))
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
        model.fit(train_x,
                  train_y,
                  epochs=cfg.getint('nn', 'epochs'),
                  batch_size=cfg.getint('nn', 'batch'),
                  validation_split=0.0,
                  verbose=0)

        # probability for each class; (test size, num of classes)
        distribution = model.predict(test_x,
                                     batch_size=cfg.getint('nn', 'batch'))
        # class predictions; (test size,)
        predictions = np.argmax(distribution, axis=1)
        # gold labels; (test size,)
        gold = np.argmax(test_y, axis=1)

        # f1 scores
        f1 = f1_score(gold, predictions, average='macro')
        cv_scores.append(f1)

    print('average f1:', np.mean(cv_scores))
    print('standard deviation:', np.std(cv_scores))
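Because the config path comes from sys.argv[1], the script is meant to be invoked with an INI file as its first argument. A sketch of a driver (the script name, disease label, and judgement value here are hypothetical):

if __name__ == "__main__":

    # usage: python cv.py config.ini
    run_cross_validation(disease='Asthma', judgement='intuitive')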
Example #3
def data_sparse(category):
    """Get the data and the vectorizer (for pickling)"""

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    base = os.environ['DATA_ROOT']
    train_xml_dir = os.path.join(base, cfg.get('data', 'train_xml_dir'))
    train_cui_dir = os.path.join(base, cfg.get('data', 'train_cui_dir'))

    dataset = DatasetProvider(train_xml_dir,
                              train_cui_dir,
                              category,
                              use_pickled_alphabet=False,
                              alphabet_pickle=cfg.get('model',
                                                      'alphabet_pickle'))
    x, y = dataset.load_for_sklearn()

    vectorizer = TfidfVectorizer()
    x = vectorizer.fit_transform(x)

    # pickle to use on test set
    vectorizer_pickle = 'Model/%s.vec' % category
    pickle.dump(vectorizer, open(vectorizer_pickle, 'wb'))

    return x.toarray(), y
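The vectorizer pickled above is meant to be reloaded at test time so that test documents land in the same tf-idf space. A minimal sketch of that reload, assuming the same Model/<category>.vec path (test_documents is a hypothetical list of raw strings):

import pickle

vectorizer = pickle.load(open('Model/%s.vec' % category, 'rb'))
x_test = vectorizer.transform(test_documents).toarray()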
Example #4
def data_pretrained(category):
    """Run n-fold CV on training set"""

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    base = os.environ['DATA_ROOT']
    train_xml_dir = os.path.join(base, cfg.get('data', 'train_xml_dir'))
    train_cui_dir = os.path.join(base, cfg.get('data', 'train_cui_dir'))

    # load pre-trained model
    model = load_model(cfg.get('data', 'model_file'))
    interm_layer_model = Model(inputs=model.input,
                               outputs=model.get_layer('HL').output)

    dataset = DatasetProvider(train_xml_dir,
                              train_cui_dir,
                              category,
                              use_pickled_alphabet=True,
                              alphabet_pickle=cfg.get('data',
                                                      'alphabet_pickle'))
    x, y = dataset.load_as_one_hot()

    # make training vectors for target task
    x = interm_layer_model.predict(x)

    return x, y
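The vectors returned here are presumably consumed by a downstream cross-validation. A sketch of that step, reusing the LinearSVC setup the sparse examples use (the 5-fold setting and the category value are hypothetical):

from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

x, y = data_pretrained(category)
scores = cross_val_score(LinearSVC(class_weight='balanced'), x, y,
                         cv=5, scoring='f1_macro')
print('average f1: %.3f' % scores.mean())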
Example #5
def data_dense(category):
    """Run n-fold CV on training set"""

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    base = os.environ['DATA_ROOT']
    train_xml_dir = os.path.join(base, cfg.get('data', 'train_xml_dir'))
    train_cui_dir = os.path.join(base, cfg.get('data', 'train_cui_dir'))

    # load pre-trained model
    model = load_model(cfg.get('data', 'model_file'))
    interm_layer_model = Model(inputs=model.input,
                               outputs=model.get_layer('HL').output)
    maxlen = model.get_layer(name='EL').get_config()['input_length']

    dataset = DatasetProvider(train_xml_dir,
                              train_cui_dir,
                              category,
                              use_pickled_alphabet=True,
                              alphabet_pickle=cfg.get('data',
                                                      'alphabet_pickle'))
    x, y = dataset.load_for_keras()

    classes = len(set(y))
    x = pad_sequences(x, maxlen=maxlen)

    # make training vectors for target task
    x = interm_layer_model.predict(x)

    return x, y
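Unlike Example #4, the padding length here is read off the pre-trained model itself, since the intermediate-layer model inherits the original input shape. A toy sketch of the same lookup (tf.keras 2.x, where an Embedding layer still reports input_length in its config; the name 'EL' mirrors the snippet above):

from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

toy = Sequential([Embedding(input_dim=100, output_dim=8,
                            input_length=25, name='EL')])
print(toy.get_layer(name='EL').get_config()['input_length'])  # prints 25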
Example #6
def data_sparse(cfg, disease, judgement, use_svd=False):
    """Bag-of-cuis data for sparse evaluation"""

    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # handle training data first
    train_data_provider = DatasetProvider(train_data, train_annot, disease,
                                          judgement)
    x_train, y_train = train_data_provider.load_raw()
    print('train examples:', len(x_train))

    vectorizer = TfidfVectorizer(ngram_range=NGRAM_RANGE,
                                 stop_words='english',
                                 min_df=MIN_DF,
                                 vocabulary=None,
                                 binary=False)
    x_train = vectorizer.fit_transform(x_train)

    # write training examples in svmlight format for use by other tools
    dump_svmlight_file(x_train, y_train, disease + "_train.libsvm")

    # now handle the test set
    test_data_provider = DatasetProvider(test_data, test_annot, disease,
                                         judgement)
    x_test, y_test = test_data_provider.load_raw()
    print('test examples:', len(x_test))
    x_test = vectorizer.transform(x_test)

    return x_train.toarray(), y_train, x_test.toarray(), y_test
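Note that use_svd is accepted above but never consulted; Example #11 below contains the branch it presumably gates. A sketch of that reduction applied to the matrices returned here:

from sklearn.decomposition import TruncatedSVD

x_train, y_train, x_test, y_test = data_sparse(cfg, disease, judgement)

# reduce the tf-idf vectors to 300 dimensions
svd = TruncatedSVD(n_components=300)
x_train = svd.fit_transform(x_train)
x_test = svd.transform(x_test)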
Example #7
def run_evaluation_svd(disease, judgement):
    """Train on train set and evaluate on test set"""

    print('disease:', disease)
    print('judgement:', judgement)

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # handle training data first
    train_data_provider = DatasetProvider(train_data, train_annot, disease,
                                          judgement)
    x_train, y_train = train_data_provider.load_raw()
    print('train examples:', len(x_train))

    # load tfidf vectorizer model and transform xs into it
    vectorizer = pickle.load(open('../Svd/Model/tfidf.p', 'rb'))
    train_tfidf_matrix = vectorizer.transform(x_train)

    # now handle the test set
    test_data_provider = DatasetProvider(test_data, test_annot, disease,
                                         judgement)
    x_test, y_test = test_data_provider.load_raw()
    print('test examples:', len(x_test))
    test_tfidf_matrix = vectorizer.transform(x_test)

    # load svd model and map train/test to low dimensions
    print('input shape:', train_tfidf_matrix.shape)
    svd = pickle.load(open('../Svd/Model/svd.p', 'rb'))
    train_tfidf_matrix = svd.transform(train_tfidf_matrix)
    test_tfidf_matrix = svd.transform(test_tfidf_matrix)
    print('output shape:', train_tfidf_matrix.shape)

    classifier = LinearSVC(class_weight='balanced')
    classifier.fit(train_tfidf_matrix, y_train)
    predictions = classifier.predict(test_tfidf_matrix)

    p = precision_score(y_test, predictions, average='macro')
    r = recall_score(y_test, predictions, average='macro')
    f1 = f1_score(y_test, predictions, average='macro')
    print('unique labels in train:', len(set(y_train)))
    print('p = %.3f' % p)
    print('r = %.3f' % r)
    print('f1 = %.3f\n' % f1)

    print('%.3f & %.3f & %.3f\n' % (p, r, f1))

    return p, r, f1
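The tf-idf and SVD models loaded here from ../Svd/Model are assumed to exist already. A sketch of how they could have been produced, following the fit-then-pickle pattern and the 300-dimension setting used elsewhere in these examples (corpus is a hypothetical list of training documents):

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
svd = TruncatedSVD(n_components=300).fit(tfidf_matrix)

pickle.dump(vectorizer, open('../Svd/Model/tfidf.p', 'wb'))
pickle.dump(svd, open('../Svd/Model/svd.p', 'wb'))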
Example #8
def get_data(disease, judgement):
    """Sequences of tokens to feed into code prediction model"""

    # assumes a module-level `cfg` (configparser) loaded elsewhere
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # determine whether to treat input tokens as a sequence or set
    if cfg.get('data', 'model_type') == 'dan':
        use_cuis = True
        tokens_as_set = True
    else:
        use_cuis = False
        tokens_as_set = False

    # load training data
    train_data_provider = DatasetProvider(
        train_data,
        train_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
        min_token_freq=cfg.getint('args', 'min_token_freq'),
        use_cuis=use_cuis)
    x_train, y_train = train_data_provider.load(tokens_as_set=tokens_as_set)
    x_train = pad_sequences(x_train, maxlen=get_maxlen())

    # load the test set
    test_data_provider = DatasetProvider(
        test_data,
        test_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
        min_token_freq=cfg.getint('args', 'min_token_freq'),
        use_cuis=use_cuis)
    x_test, y_test = test_data_provider.load(tokens_as_set=tokens_as_set)
    x_test = pad_sequences(x_test, maxlen=get_maxlen())

    return x_train, y_train, x_test, y_test
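Both cfg and get_maxlen are assumed globals/helpers that this snippet does not define. A hypothetical sketch of get_maxlen, consistent with how Example #12 reads the same value from its config:

def get_maxlen():
    """Hypothetical helper: fixed sequence length from the config"""
    return cfg.getint('data', 'maxlen')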
Example #9
def run_joint_evaluation(exclude, judgement):
    """Predict all comorbidities in one pass"""

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    print_config(cfg)
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # load training data first
    train_data_provider = DatasetProvider(train_data,
                                          train_annot,
                                          disease=None,
                                          judgement=judgement,
                                          use_pickled_alphabet=False,
                                          min_token_freq=cfg.getint(
                                              'args', 'min_token_freq'))
    x_train, y_train = train_data_provider.load_vectorized(exclude)

    classes = len(y_train[0])
    maxlen = max([len(seq) for seq in x_train])
    x_train = pad_sequences(x_train, maxlen=maxlen)
    y_train = np.array(y_train)

    # now load the test set
    test_data_provider = DatasetProvider(test_data,
                                         test_annot,
                                         disease=None,
                                         judgement=judgement,
                                         use_pickled_alphabet=True,
                                         min_token_freq=cfg.getint(
                                             'args', 'min_token_freq'))
    x_test, y_test = test_data_provider.load_vectorized(exclude)  # pass maxlen
    x_test = pad_sequences(x_test, maxlen=maxlen)
    y_test = np.array(y_test)

    print('test shape:', x_test.shape, y_test.shape)
    print('train shape:', x_train.shape, y_train.shape)

    model = get_model(cfg, train_data_provider.token2int, maxlen, classes,
                      'sigmoid')
    optimizer = RMSprop(lr=cfg.getfloat('nn', 'learnrt'))
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              epochs=cfg.getint('nn', 'epochs'),
              batch_size=cfg.getint('nn', 'batch'),
              validation_split=0.0,
              verbose=1)

    # probability for each class; (test size, num of classes)
    distribution = model.predict(x_test, batch_size=cfg.getint('nn', 'batch'))

    # turn into an indicator matrix
    distribution[distribution < 0.5] = 0
    distribution[distribution >= 0.5] = 1

    f1 = f1_score(y_test, distribution, average='macro')
    precision = precision_score(y_test, distribution, average='macro')
    recall = recall_score(y_test, distribution, average='macro')
    print('macro average p =', precision)
    print('macro average r =', recall)
    print('macro average f1 =', f1)
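The two in-place assignments that binarize distribution can be collapsed into one vectorized comparison, which also leaves the predicted probabilities intact:

# equivalent indicator matrix, without mutating distribution
indicators = (distribution >= 0.5).astype('int32')
f1 = f1_score(y_test, indicators, average='macro')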
Example #10
def run_evaluation(disease, judgement):
    """Train on train set and evaluate on test set"""

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    print_config(cfg)
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # load training data first
    train_data_provider = DatasetProvider(train_data,
                                          train_annot,
                                          disease,
                                          judgement,
                                          use_pickled_alphabet=False,
                                          min_token_freq=cfg.getint(
                                              'args', 'min_token_freq'))
    x_train, y_train = train_data_provider.load()

    classes = len(train_data_provider.label2int)
    maxlen = max([len(seq) for seq in x_train])
    x_train = pad_sequences(x_train, maxlen=maxlen)
    y_train = to_categorical(y_train, classes)

    # now load the test set
    test_data_provider = DatasetProvider(test_data,
                                         test_annot,
                                         disease,
                                         judgement,
                                         use_pickled_alphabet=True,
                                         min_token_freq=cfg.getint(
                                             'args', 'min_token_freq'))
    x_test, y_test = test_data_provider.load()  # pass maxlen
    x_test = pad_sequences(x_test, maxlen=maxlen)
    y_test = to_categorical(y_test, classes)

    model = get_model(cfg, train_data_provider.token2int, maxlen, classes,
                      'softmax')
    optimizer = RMSprop(lr=cfg.getfloat('nn', 'learnrt'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              epochs=cfg.getint('nn', 'epochs'),
              batch_size=cfg.getint('nn', 'batch'),
              validation_split=0.0,
              verbose=0)

    # probability for each class; (test size, num of classes)
    distribution = model.predict(x_test, batch_size=cfg.getint('nn', 'batch'))
    # class predictions; (test size,)
    predictions = np.argmax(distribution, axis=1)
    # gold labels; (test size,)
    gold = np.argmax(y_test, axis=1)

    # f1 scores
    f1 = f1_score(gold, predictions, average='macro')
    print('%s: f1 = %.3f' % (disease, f1))

    return f1
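Since run_evaluation returns a per-disease f1, a thin driver can sweep targets and average, in the spirit of the CV reporting above (the disease list and judgement value are hypothetical):

diseases = ['Asthma', 'CAD', 'Diabetes']
f1s = [run_evaluation(d, 'intuitive') for d in diseases]
print('average f1 across diseases: %.3f' % np.mean(f1s))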
Example #11
def run_evaluation_sparse(disease, judgement, use_svd=False):
    """Train on train set and evaluate on test set"""

    print('disease:', disease)
    print('judgement:', judgement)

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # handle training data first
    train_data_provider = DatasetProvider(
        train_data,
        train_annot,
        disease,
        judgement,
        use_pickled_alphabet=False,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'))
    x_train, y_train = train_data_provider.load_raw()
    print('train examples:', len(x_train))

    vectorizer = CountVectorizer(
        ngram_range=NGRAM_RANGE,
        stop_words='english',
        min_df=MIN_DF,
        vocabulary=None,
        binary=False)
    train_count_matrix = vectorizer.fit_transform(x_train)

    tf = TfidfTransformer()
    train_tfidf_matrix = tf.fit_transform(train_count_matrix)

    # now handle the test set
    test_data_provider = DatasetProvider(
        test_data,
        test_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'))
    x_test, y_test = test_data_provider.load_raw()
    print('test examples:', len(x_test))

    test_count_matrix = vectorizer.transform(x_test)
    test_tfidf_matrix = tf.transform(test_count_matrix)

    if use_svd:
        # reduce sparse tf-idf vectors to 300 dimensions
        svd = TruncatedSVD(n_components=300)
        train_tfidf_matrix = svd.fit_transform(train_tfidf_matrix)
        test_tfidf_matrix = svd.transform(test_tfidf_matrix)

    classifier = LinearSVC(class_weight='balanced')
    classifier.fit(train_tfidf_matrix, y_train)
    predictions = classifier.predict(test_tfidf_matrix)

    p = precision_score(y_test, predictions, average='macro')
    r = recall_score(y_test, predictions, average='macro')
    f1 = f1_score(y_test, predictions, average='macro')
    print('unique labels in train:', len(set(y_train)))
    print('p = %.3f' % p)
    print('r = %.3f' % r)
    print('f1 = %.3f\n' % f1)

    return p, r, f1
Example #12
def run_evaluation_dense(disease, judgement):
    """Use pre-trained patient representations"""

    print('disease:', disease)
    print('judgement:', judgement)

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # load pre-trained model
    model = load_model(cfg.get('data', 'model_file'))
    interm_layer_model = Model(
        inputs=model.input,
        outputs=model.get_layer('HL').output)

    # load training data first
    train_data_provider = DatasetProvider(
        train_data,
        train_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
        min_token_freq=cfg.getint('args', 'min_token_freq'))
    x_train, y_train = train_data_provider.load()

    classes = len(set(y_train))
    print('unique labels in train:', classes)
    maxlen = cfg.getint('data', 'maxlen')
    x_train = pad_sequences(x_train, maxlen=maxlen)

    # make training vectors for target task
    print('original x_train shape:', x_train.shape)
    x_train = interm_layer_model.predict(x_train)
    print('new x_train shape:', x_train.shape)

    # now load the test set
    test_data_provider = DatasetProvider(
        test_data,
        test_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
        min_token_freq=cfg.getint('args', 'min_token_freq'))
    x_test, y_test = test_data_provider.load()
    x_test = pad_sequences(x_test, maxlen=maxlen)

    # make test vectors for target task
    print('original x_test shape:', x_test.shape)
    x_test = interm_layer_model.predict(x_test)
    print('new x_test shape:', x_test.shape)

    classifier = LinearSVC(class_weight='balanced')
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    p = precision_score(y_test, predictions, average='macro')
    r = recall_score(y_test, predictions, average='macro')
    f1 = f1_score(y_test, predictions, average='macro')
    print('p = %.3f' % p)
    print('r = %.3f' % r)
    print('f1 = %.3f\n' % f1)

    return p, r, f1
Example #13
import pandas as pd
import syft as sy
import torch
import syfertext
from syft.generic.string import String
from dataset import DatasetProvider
from util import send_text_data, send_label_data, generate_workers

# add hook
hook = sy.TorchHook(torch)
me = hook.local_worker

# no of workers
n_workers = 3

# generate workers
workers = generate_workers(n_workers)

# instantiate the dataset provider
data_provider = DatasetProvider(train_path='./data/train.csv',
                                test_path='./data/test.csv')

# get the data
train_data = data_provider.provide_data(dataset='train', splits=n_workers)
test_data = data_provider.provide_data(dataset='test', splits=n_workers)

# send the train data and get back list of (list of pointers)
train_text_worker_ptrs = send_text_data(train_data, workers)
train_label_worker_ptrs = send_label_data(train_data, workers)
Example #14
def data_dense(cfg, disease, judgement):
    """Data to feed into code prediction model"""

    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # load pre-trained model
    model = load_model(cfg.get('data', 'model_file'))
    interm_layer_model = Model(inputs=model.input,
                               outputs=model.get_layer(
                                   cfg.get('data', 'rep_layer')).output)
    maxlen = model.get_layer(name='EL').get_config()['input_length']

    # determine whether to treat input tokens as a sequence or set
    if cfg.get('data', 'model_type') == 'dan':
        use_cuis = True
        tokens_as_set = True
    else:
        use_cuis = False
        tokens_as_set = False

    # load training data first
    train_data_provider = DatasetProvider(
        train_data,
        train_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
        min_token_freq=cfg.getint('args', 'min_token_freq'),
        use_cuis=use_cuis)
    x_train, y_train = train_data_provider.load(tokens_as_set=tokens_as_set)

    classes = len(set(y_train))
    print('unique labels in train:', classes)
    x_train = pad_sequences(x_train, maxlen=maxlen)

    # make training vectors for target task
    print('original x_train shape:', x_train.shape)
    x_train = interm_layer_model.predict(x_train)
    print('new x_train shape:', x_train.shape)

    # now load the test set
    test_data_provider = DatasetProvider(
        test_data,
        test_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
        min_token_freq=cfg.getint('args', 'min_token_freq'),
        use_cuis=use_cuis)
    x_test, y_test = test_data_provider.load(tokens_as_set=tokens_as_set)
    x_test = pad_sequences(x_test, maxlen=maxlen)

    # make test vectors for target task
    print('original x_test shape:', x_test.shape)
    x_test = interm_layer_model.predict(x_test)
    print('new x_test shape:', x_test.shape)

    return x_train, y_train, x_test, y_test