def train(): """Driver function""" corpus = os.path.join(os.environ['DATA_ROOT'], cfg.get('args', 'train')) step = cfg.getint('args', 'step') maxlen = cfg.getint('args', 'maxlen') mintf = cfg.getint('args', 'mintf') dp = DatasetProvider(corpus, step, maxlen, min_tf=mintf) dp.memory_footprint() model = get_model(len(dp.token2int), maxlen) for x, y in dp.read_train_data_from_file(): y = to_categorical(y, len(dp.token2int)) print('x memory footprint:', hurry.filesize.size(x.nbytes)) print('y memory footprint', hurry.filesize.size(y.nbytes)) print('x shape:', x.shape) print('y shape:', y.shape) model.fit(x, y, epochs=cfg.getint('nn', 'epochs'), batch_size=cfg.getint('nn', 'batch'), verbose=1, validation_split=0.0) return model, dp
def run_cross_validation(disease, judgement):
  """Run n-fold CV on training set"""

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  print_config(cfg)

  base = os.environ['DATA_ROOT']
  data_dir = os.path.join(base, cfg.get('data', 'train_data'))
  annot_xml = os.path.join(base, cfg.get('data', 'train_annot'))

  dataset = DatasetProvider(
    data_dir,
    annot_xml,
    disease,
    judgement,
    use_pickled_alphabet=False,
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x, y = dataset.load()

  classes = len(dataset.label2int)
  maxlen = max([len(seq) for seq in x])
  x = pad_sequences(x, maxlen=maxlen)
  y = to_categorical(y, classes)

  cv_scores = []
  kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=100)
  for train_indices, test_indices in kf.split(x):
    train_x = x[train_indices]
    train_y = y[train_indices]
    test_x = x[test_indices]
    test_y = y[test_indices]

    model = get_model(cfg, dataset.token2int, maxlen, classes, 'softmax')
    optimizer = RMSprop(lr=cfg.getfloat('nn', 'learnrt'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(train_x,
              train_y,
              epochs=cfg.getint('nn', 'epochs'),
              batch_size=cfg.getint('nn', 'batch'),
              validation_split=0.0,
              verbose=0)

    # probability for each class; (test size, num of classes)
    distribution = model.predict(test_x, batch_size=cfg.getint('nn', 'batch'))
    # class predictions; (test size,)
    predictions = np.argmax(distribution, axis=1)
    # gold labels; (test size,)
    gold = np.argmax(test_y, axis=1)

    # f1 scores
    f1 = f1_score(gold, predictions, average='macro')
    cv_scores.append(f1)

  print('average f1:', np.mean(cv_scores))
  print('standard deviation:', np.std(cv_scores))
def data_sparse(category):
  """Get the data and the vectorizer (for pickling)"""

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']
  train_xml_dir = os.path.join(base, cfg.get('data', 'train_xml_dir'))
  train_cui_dir = os.path.join(base, cfg.get('data', 'train_cui_dir'))

  dataset = DatasetProvider(
    train_xml_dir,
    train_cui_dir,
    category,
    use_pickled_alphabet=False,
    alphabet_pickle=cfg.get('model', 'alphabet_pickle'))
  x, y = dataset.load_for_sklearn()

  vectorizer = TfidfVectorizer()
  x = vectorizer.fit_transform(x)

  # pickle to use on test set
  vectorizer_pickle = 'Model/%s.vec' % category
  pickle.dump(vectorizer, open(vectorizer_pickle, 'wb'))

  return x.toarray(), y
def data_pretrained(category):
  """Get the data as dense vectors from a pre-trained model's hidden layer"""

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']
  train_xml_dir = os.path.join(base, cfg.get('data', 'train_xml_dir'))
  train_cui_dir = os.path.join(base, cfg.get('data', 'train_cui_dir'))

  # load pre-trained model
  model = load_model(cfg.get('data', 'model_file'))
  interm_layer_model = Model(inputs=model.input,
                             outputs=model.get_layer('HL').output)

  dataset = DatasetProvider(
    train_xml_dir,
    train_cui_dir,
    category,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'))
  x, y = dataset.load_as_one_hot()

  # make training vectors for target task
  x = interm_layer_model.predict(x)

  return x, y
def data_dense(category):
  """Get the data as dense vectors from a pre-trained model (sequence input)"""

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']
  train_xml_dir = os.path.join(base, cfg.get('data', 'train_xml_dir'))
  train_cui_dir = os.path.join(base, cfg.get('data', 'train_cui_dir'))

  # load pre-trained model
  model = load_model(cfg.get('data', 'model_file'))
  interm_layer_model = Model(inputs=model.input,
                             outputs=model.get_layer('HL').output)
  maxlen = model.get_layer(name='EL').get_config()['input_length']

  dataset = DatasetProvider(
    train_xml_dir,
    train_cui_dir,
    category,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'))
  x, y = dataset.load_for_keras()

  classes = len(set(y))
  x = pad_sequences(x, maxlen=maxlen)

  # make training vectors for target task
  x = interm_layer_model.predict(x)

  return x, y
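# A hedged sketch of one way the arrays returned by data_sparse(),
# data_pretrained() and data_dense() above could be consumed for n-fold
# cross-validation. Not part of the original source; the category value,
# classifier choice and fold count are illustrative assumptions.
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

def run_cv_example(category):

  x, y = data_sparse(category)  # or data_pretrained(category) / data_dense(category)
  classifier = LinearSVC(class_weight='balanced')
  scores = cross_val_score(classifier, x, y, scoring='f1_macro', cv=5)

  print('%s: f1 = %.3f (+/- %.3f)' % (category, scores.mean(), scores.std()))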
def data_sparse(cfg, disease, judgement, use_svd=False):
  """Bag-of-cuis data for sparse evaluation"""

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # handle training data first
  train_data_provider = DatasetProvider(train_data, train_annot, disease, judgement)
  x_train, y_train = train_data_provider.load_raw()
  print('train examples:', len(x_train))

  vectorizer = TfidfVectorizer(ngram_range=NGRAM_RANGE,
                               stop_words='english',
                               min_df=MIN_DF,
                               vocabulary=None,
                               binary=False)
  x_train = vectorizer.fit_transform(x_train)

  # dump the training tf-idf matrix in svmlight format
  dump_svmlight_file(x_train, y_train, disease + '_train.libsvm')

  # now handle the test set
  test_data_provider = DatasetProvider(test_data, test_annot, disease, judgement)
  x_test, y_test = test_data_provider.load_raw()
  print('test examples:', len(x_test))
  x_test = vectorizer.transform(x_test)

  return x_train.toarray(), y_train, x_test.toarray(), y_test
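# A minimal sketch of how the arrays returned by the data_sparse() variant
# above could be consumed, mirroring the LinearSVC evaluation used elsewhere
# in this section. Not part of the original source; the disease and judgement
# values are illustrative placeholders.
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

def evaluate_sparse_example(cfg):

  x_train, y_train, x_test, y_test = data_sparse(cfg, 'GERD', 'intuitive')

  classifier = LinearSVC(class_weight='balanced')
  classifier.fit(x_train, y_train)
  predictions = classifier.predict(x_test)

  print('f1 = %.3f' % f1_score(y_test, predictions, average='macro'))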
def run_evaluation_svd(disease, judgement):
  """Train on train set and evaluate on test set"""

  print('disease:', disease)
  print('judgement:', judgement)

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # handle training data first
  train_data_provider = DatasetProvider(train_data, train_annot, disease, judgement)
  x_train, y_train = train_data_provider.load_raw()
  print('train examples:', len(x_train))

  # load tfidf vectorizer model and transform xs into it
  vectorizer = pickle.load(open('../Svd/Model/tfidf.p', 'rb'))
  train_tfidf_matrix = vectorizer.transform(x_train)

  # now handle the test set
  test_data_provider = DatasetProvider(test_data, test_annot, disease, judgement)
  x_test, y_test = test_data_provider.load_raw()
  print('test examples:', len(x_test))
  test_tfidf_matrix = vectorizer.transform(x_test)

  # load svd model and map train/test to low dimensions
  print('input shape:', train_tfidf_matrix.shape)
  svd = pickle.load(open('../Svd/Model/svd.p', 'rb'))
  train_tfidf_matrix = svd.transform(train_tfidf_matrix)
  test_tfidf_matrix = svd.transform(test_tfidf_matrix)
  print('output shape:', train_tfidf_matrix.shape)

  classifier = LinearSVC(class_weight='balanced')
  classifier.fit(train_tfidf_matrix, y_train)
  predictions = classifier.predict(test_tfidf_matrix)

  p = precision_score(y_test, predictions, average='macro')
  r = recall_score(y_test, predictions, average='macro')
  f1 = f1_score(y_test, predictions, average='macro')
  print('unique labels in train:', len(set(y_train)))
  print('p = %.3f' % p)
  print('r = %.3f' % r)
  print('f1 = %.3f\n' % f1)
  print('%.3f & %.3f & %.3f\n' % (p, r, f1))

  return p, r, f1
def get_data(disease, judgement):
  """Sequences of tokens to feed into code prediction model"""

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # determine whether to treat input tokens as a sequence or set
  if cfg.get('data', 'model_type') == 'dan':
    use_cuis = True
    tokens_as_set = True
  else:
    use_cuis = False
    tokens_as_set = False

  # load training data
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'),
    use_cuis=use_cuis)
  x_train, y_train = train_data_provider.load(tokens_as_set=tokens_as_set)
  x_train = pad_sequences(x_train, maxlen=get_maxlen())

  # load the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'),
    use_cuis=use_cuis)
  x_test, y_test = test_data_provider.load(tokens_as_set=tokens_as_set)
  x_test = pad_sequences(x_test, maxlen=get_maxlen())

  return x_train, y_train, x_test, y_test
def run_joint_evaluation(exclude, judgement):
  """Predict all comorbidities in one pass"""

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  print_config(cfg)

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # load training data first
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease=None,
    judgement=judgement,
    use_pickled_alphabet=False,
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_train, y_train = train_data_provider.load_vectorized(exclude)

  classes = len(y_train[0])
  maxlen = max([len(seq) for seq in x_train])
  x_train = pad_sequences(x_train, maxlen=maxlen)
  y_train = np.array(y_train)

  # now load the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease=None,
    judgement=judgement,
    use_pickled_alphabet=True,
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_test, y_test = test_data_provider.load_vectorized(exclude)

  # pass maxlen
  x_test = pad_sequences(x_test, maxlen=maxlen)
  y_test = np.array(y_test)

  print('test shape:', x_test.shape, y_test.shape)
  print('train shape:', x_train.shape, y_train.shape)

  model = get_model(cfg, train_data_provider.token2int, maxlen, classes, 'sigmoid')
  optimizer = RMSprop(lr=cfg.getfloat('nn', 'learnrt'))
  model.compile(loss='binary_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  model.fit(x_train,
            y_train,
            epochs=cfg.getint('nn', 'epochs'),
            batch_size=cfg.getint('nn', 'batch'),
            validation_split=0.0,
            verbose=1)

  # probability for each class; (test size, num of classes)
  distribution = model.predict(x_test, batch_size=cfg.getint('nn', 'batch'))

  # turn into an indicator matrix
  distribution[distribution < 0.5] = 0
  distribution[distribution >= 0.5] = 1

  f1 = f1_score(y_test, distribution, average='macro')
  precision = precision_score(y_test, distribution, average='macro')
  recall = recall_score(y_test, distribution, average='macro')
  print('macro average p =', precision)
  print('macro average r =', recall)
  print('macro average f1 =', f1)
def run_evaluation(disease, judgement):
  """Train on train set and evaluate on test set"""

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  print_config(cfg)

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # load training data first
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease,
    judgement,
    use_pickled_alphabet=False,
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_train, y_train = train_data_provider.load()

  classes = len(train_data_provider.label2int)
  maxlen = max([len(seq) for seq in x_train])
  x_train = pad_sequences(x_train, maxlen=maxlen)
  y_train = to_categorical(y_train, classes)

  # now load the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_test, y_test = test_data_provider.load()

  # pass maxlen
  x_test = pad_sequences(x_test, maxlen=maxlen)
  y_test = to_categorical(y_test, classes)

  model = get_model(cfg, train_data_provider.token2int, maxlen, classes, 'softmax')
  optimizer = RMSprop(lr=cfg.getfloat('nn', 'learnrt'))
  model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  model.fit(x_train,
            y_train,
            epochs=cfg.getint('nn', 'epochs'),
            batch_size=cfg.getint('nn', 'batch'),
            validation_split=0.0,
            verbose=0)

  # probability for each class; (test size, num of classes)
  distribution = model.predict(x_test, batch_size=cfg.getint('nn', 'batch'))
  # class predictions; (test size,)
  predictions = np.argmax(distribution, axis=1)
  # gold labels; (test size,)
  gold = np.argmax(y_test, axis=1)

  # f1 scores
  f1 = f1_score(gold, predictions, average='macro')
  print('%s: f1 = %.3f' % (disease, f1))

  return f1
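# A hedged sketch of a caller for run_evaluation(): the function returns one
# macro F1 per disease, which suggests a driver that loops over diseases and
# averages the scores. Not part of the original source; get_disease_names()
# is a hypothetical helper and 'intuitive' is an illustrative judgement value.
import numpy as np

def evaluate_all_diseases_example():

  f1s = []
  for disease in get_disease_names():  # hypothetical helper listing diseases
    f1s.append(run_evaluation(disease, 'intuitive'))

  print('average f1 = %.3f' % np.mean(f1s))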
def run_evaluation_sparse(disease, judgement, use_svd=False):
  """Train on train set and evaluate on test set"""

  print('disease:', disease)
  print('judgement:', judgement)

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # handle training data first
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease,
    judgement,
    use_pickled_alphabet=False,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'))
  x_train, y_train = train_data_provider.load_raw()
  print('train examples:', len(x_train))

  vectorizer = CountVectorizer(
    ngram_range=NGRAM_RANGE,
    stop_words='english',
    min_df=MIN_DF,
    vocabulary=None,
    binary=False)
  train_count_matrix = vectorizer.fit_transform(x_train)

  tf = TfidfTransformer()
  train_tfidf_matrix = tf.fit_transform(train_count_matrix)

  # now handle the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'))
  x_test, y_test = test_data_provider.load_raw()
  print('test examples:', len(x_test))

  test_count_matrix = vectorizer.transform(x_test)
  test_tfidf_matrix = tf.transform(test_count_matrix)

  if use_svd:
    # reduce sparse vectors to 300 dimensions
    svd = TruncatedSVD(n_components=300)
    train_tfidf_matrix = svd.fit_transform(train_tfidf_matrix)
    test_tfidf_matrix = svd.transform(test_tfidf_matrix)

  classifier = LinearSVC(class_weight='balanced')
  classifier.fit(train_tfidf_matrix, y_train)
  predictions = classifier.predict(test_tfidf_matrix)

  p = precision_score(y_test, predictions, average='macro')
  r = recall_score(y_test, predictions, average='macro')
  f1 = f1_score(y_test, predictions, average='macro')
  print('unique labels in train:', len(set(y_train)))
  print('p = %.3f' % p)
  print('r = %.3f' % r)
  print('f1 = %.3f\n' % f1)

  return p, r, f1
def run_evaluation_dense(disease, judgement):
  """Use pre-trained patient representations"""

  print('disease:', disease)
  print('judgement:', judgement)

  cfg = configparser.ConfigParser()
  cfg.read(sys.argv[1])
  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # load pre-trained model
  model = load_model(cfg.get('data', 'model_file'))
  interm_layer_model = Model(
    inputs=model.input,
    outputs=model.get_layer('HL').output)

  # load training data first
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_train, y_train = train_data_provider.load()

  classes = len(set(y_train))
  print('unique labels in train:', classes)
  maxlen = cfg.getint('data', 'maxlen')
  x_train = pad_sequences(x_train, maxlen=maxlen)

  # make training vectors for target task
  print('original x_train shape:', x_train.shape)
  x_train = interm_layer_model.predict(x_train)
  print('new x_train shape:', x_train.shape)

  # now load the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'))
  x_test, y_test = test_data_provider.load()
  x_test = pad_sequences(x_test, maxlen=maxlen)

  # make test vectors for target task
  print('original x_test shape:', x_test.shape)
  x_test = interm_layer_model.predict(x_test)
  print('new x_test shape:', x_test.shape)

  classifier = LinearSVC(class_weight='balanced')
  classifier.fit(x_train, y_train)
  predictions = classifier.predict(x_test)

  p = precision_score(y_test, predictions, average='macro')
  r = recall_score(y_test, predictions, average='macro')
  f1 = f1_score(y_test, predictions, average='macro')
  print('p = %.3f' % p)
  print('r = %.3f' % r)
  print('f1 = %.3f\n' % f1)

  return p, r, f1
import pandas as pd
import syft as sy
import torch
import syfertext
from syft.generic.string import String

from dataset import DatasetProvider
from util import send_text_data, send_label_data, generate_workers

# add hook
hook = sy.TorchHook(torch)
me = hook.local_worker

# number of workers
n_workers = 3

# generate workers
workers = generate_workers(n_workers)

# instantiate the dataset provider
data_provider = DatasetProvider(train_path='./data/train.csv',
                                test_path='./data/test.csv')

# get the data
train_data = data_provider.provide_data(dataset='train', splits=n_workers)
test_data = data_provider.provide_data(dataset='test', splits=n_workers)

# send the train data and get back a list of (list of pointers)
train_text_worker_ptrs = send_text_data(train_data, workers)
train_label_worker_ptrs = send_label_data(train_data, workers)
def data_dense(cfg, disease, judgement):
  """Data to feed into code prediction model"""

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train_data'))
  train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
  test_data = os.path.join(base, cfg.get('data', 'test_data'))
  test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

  # load pre-trained model
  model = load_model(cfg.get('data', 'model_file'))
  interm_layer_model = Model(
    inputs=model.input,
    outputs=model.get_layer(cfg.get('data', 'rep_layer')).output)
  maxlen = model.get_layer(name='EL').get_config()['input_length']

  # determine whether to treat input tokens as a sequence or set
  if cfg.get('data', 'model_type') == 'dan':
    use_cuis = True
    tokens_as_set = True
  else:
    use_cuis = False
    tokens_as_set = False

  # load training data first
  train_data_provider = DatasetProvider(
    train_data,
    train_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'),
    use_cuis=use_cuis)
  x_train, y_train = train_data_provider.load(tokens_as_set=tokens_as_set)
  classes = len(set(y_train))
  print('unique labels in train:', classes)
  x_train = pad_sequences(x_train, maxlen=maxlen)

  # make training vectors for target task
  print('original x_train shape:', x_train.shape)
  x_train = interm_layer_model.predict(x_train)
  print('new x_train shape:', x_train.shape)

  # now load the test set
  test_data_provider = DatasetProvider(
    test_data,
    test_annot,
    disease,
    judgement,
    use_pickled_alphabet=True,
    alphabet_pickle=cfg.get('data', 'alphabet_pickle'),
    min_token_freq=cfg.getint('args', 'min_token_freq'),
    use_cuis=use_cuis)
  x_test, y_test = test_data_provider.load(tokens_as_set=tokens_as_set)
  x_test = pad_sequences(x_test, maxlen=maxlen)

  # make test vectors for target task
  print('original x_test shape:', x_test.shape)
  x_test = interm_layer_model.predict(x_test)
  print('new x_test shape:', x_test.shape)

  return x_train, y_train, x_test, y_test