def init(args):
    """
    Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #load vocab and other lookups
    desc_embed = args.lmbda > 0
    print("loading lookups...")
    dicts = datasets.load_lookups(args, desc_embed=desc_embed)

    model = tools.pick_model(args, dicts)
    print(model)

    if not args.test_model:
        optimizer = optim.Adam(model.parameters(),
                               weight_decay=args.weight_decay,
                               lr=args.lr)
    else:
        optimizer = None

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, dicts
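# A minimal driver sketch (not from the source), assuming init() receives an
# argparse namespace. Only the attributes read above (lmbda, test_model,
# weight_decay, lr) are known, so the flag set below is an assumption;
# parse_known_args tolerates whatever extra flags tools.pick_model expects.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--lmbda", type=float, default=0,
                        help="> 0 enables description embeddings")
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--weight-decay", dest="weight_decay",
                        type=float, default=0)
    parser.add_argument("--test-model", dest="test_model",
                        type=str, default=None)
    cli_args, _ = parser.parse_known_args()
    args, model, optimizer, params, dicts = init(cli_args)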
def setup():
    """
    Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    args = {
        "data_path": "train_full.csv",
        "vocab": "./datafiles/vocab.csv",
        "model": "conv_attn",
        "filter_size": 10,
        "num_filter_maps": 50,
        "dropout": .2,
        "lr": .0001,
        "gpu": False,
        "test_model": "model.pth",
        "public_model": "true",
        "Y": "full",
        "n_epochs": 200
    }
    dicts = datasets.load_lookups(args)
    dicts["code_descs"] = datasets.load_code_descriptions()
    model = tools.pick_model(args, dicts)

    # with open('./datafiles/example_note.txt', 'r') as notefile:
    #     note = notefile.read()
    # test(model, False, note, dicts)  #testing

    @app.route("/", methods=['POST'])
    def hello():
        #use .get() so a request without a "note" key falls through to the
        #None check instead of raising a KeyError
        note = request.get_json().get("note")
        print(note)
        if note is None:
            return "Note not found"
        results = test(model, False, note, dicts)
        return jsonify(results)
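# A minimal client sketch for the POST route above; the host and port are
# assumptions (e.g. if the app is started with app.run(port=5000)), not from
# the source.
import requests

resp = requests.post("http://localhost:5000/",
                     json={"note": "Patient admitted with chest pain ..."})
print(resp.json())  # predicted codes for the submitted note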
def init(args):
    """
    Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    freq_params = None
    if args.samples or args.lmbda > 0:
        print("loading code frequencies...")
        code_freqs, n = datasets.load_code_freqs(args.data_path)
        print("code_freqs:",
              sorted(code_freqs.items(), key=operator.itemgetter(1),
                     reverse=True)[:10],
              "n:", n)
        freq_params = (code_freqs, n)

    #load vocab and other lookups
    desc_embed = args.lmbda > 0
    dicts = datasets.load_lookups(args.data_path, args.vocab,
                                  desc_embed=desc_embed)

    model = tools.pick_model(args, dicts)
    print(model)

    optimizer = optim.Adam(model.params_to_optimize(),
                           weight_decay=args.weight_decay,
                           lr=args.lr)

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, freq_params, dicts
def init(args):
    """
    Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    #load vocab and other lookups
    print("loading lookups...")
    dicts = datasets.load_lookups(args, hier=args.hier)

    model, optimizer = init_model(args, dicts)
    print(model)

    params = vars(args)

    return args, model, optimizer, params, dicts
def init(args):
    """
    Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #load vocab and other lookups
    desc_embed = args.lmbda > 0
    print("loading lookups...")
    dicts = datasets.load_lookups(args, desc_embed=desc_embed)

    META_TEST = args.test_model is not None
    model, start_epoch, optimizer = tools.pick_model(args, dicts, META_TEST)
    print(model)

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, dicts, start_epoch
def init(args):
    """
    Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    # need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    freq_params = None

    # load vocab and other lookups
    dicts = datasets.load_lookups(args.data_path, args.vocab)

    model = tools.pick_model(args, dicts)
    print(model)

    optimizer = optim.Adam(model.params_to_optimize(),
                           weight_decay=args.weight_decay,
                           lr=args.lr)
    # DataParallel variant kept for reference:
    # optimizer = optim.Adam(model.module.params_to_optimize(),
    #                        weight_decay=args.weight_decay, lr=args.lr)

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, freq_params, dicts
def init(args):
    """
    Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    # need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    # load vocab and other lookups
    desc_embed = args.lmbda > 0
    print("loading lookups...")
    dicts = datasets.load_lookups(args, desc_embed=desc_embed)

    model = transformer.TransformerAttn(args.Y, args.embed_file, dicts,
                                        args.lmbda, args.gpu, args.embed_size,
                                        args.num_layers, args.heads,
                                        args.d_ff, args.dropout,
                                        args.max_relative_positions)
    if args.gpu:
        model.cuda()
    print(model)

    if not args.test_model:
        # NoamOpt wraps its own Adam instance with a warmup-then-decay
        # learning-rate schedule; the original also built a plain Adam
        # optimizer first, but it was immediately overwritten, so it is
        # dropped here.
        optimizer = NoamOpt(
            100, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0,
                             betas=(0.9, 0.98), eps=1e-9))
    else:
        optimizer = None

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, dicts
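# NoamOpt is called above but not defined in this snippet. A minimal sketch,
# assuming the Annotated Transformer's implementation, whose
# (model_size, factor, warmup, optimizer) signature matches the call above;
# the zero_grad passthrough is an addition so it can stand in for a plain
# optimizer in a training loop.
class NoamOpt:
    """Wraps an optimizer with the Noam schedule:
    lr = factor * model_size**-0.5 * min(step**-0.5, step * warmup**-1.5)."""

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size

    def rate(self, step=None):
        # linear warmup, then inverse-square-root decay
        if step is None:
            step = self._step
        return self.factor * (self.model_size ** (-0.5) *
                              min(step ** (-0.5),
                                  step * self.warmup ** (-1.5)))

    def step(self):
        # set the learning rate for every param group, then step the optimizer
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()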
def main(Y, train_fname, dev_fname, vocab_file, version, n):
    n = int(n)

    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #get lookups from non-BOW data
    data_path = train_fname.replace('_bows', '') if "_bows" in train_fname else train_fname
    dicts = datasets.load_lookups(data_path, vocab_file=vocab_file, Y=Y,
                                  version=version)
    w2ind, ind2c, c2ind = dicts['w2ind'], dicts['ind2c'], dicts['c2ind']

    X, yy_tr, hids_tr = read_bows(Y, train_fname, c2ind, version)
    X_dv, yy_dv, hids_dv = read_bows(Y, dev_fname, c2ind, version)

    print("X.shape: " + str(X.shape))
    print("yy_tr.shape: " + str(yy_tr.shape))
    print("X_dv.shape: " + str(X_dv.shape))
    print("yy_dv.shape: " + str(yy_dv.shape))

    #deal with labels that don't have any positive examples
    #drop empty columns from yy. keep track of which columns kept
    #predict on test data with those columns. guess 0 on the others
    labels_with_examples = yy_tr.sum(axis=0).nonzero()[0]
    yy = yy_tr[:, labels_with_examples]

    # build the classifier
    clf = OneVsRestClassifier(LogisticRegression(C=C, max_iter=MAX_ITER,
                                                 solver='sag'),
                              n_jobs=-1)

    # train
    print("training...")
    clf.fit(X, yy)

    #predict
    print("predicting...")
    yhat = clf.predict(X_dv)
    yhat_raw = clf.predict_proba(X_dv)

    #deal with labels that don't have positive training examples
    print("reshaping output to deal with labels missing from train set")
    labels_with_examples = set(labels_with_examples)
    yhat_full = np.zeros(yy_dv.shape)
    yhat_full_raw = np.zeros(yy_dv.shape)
    j = 0
    for i in range(yhat_full.shape[1]):
        if i in labels_with_examples:
            yhat_full[:, i] = yhat[:, j]
            yhat_full_raw[:, i] = yhat_raw[:, j]
            j += 1

    #evaluate
    metrics, fpr, tpr = evaluation.all_metrics(yhat_full, yy_dv, k=[8, 15],
                                               yhat_raw=yhat_full_raw)
    evaluation.print_metrics(metrics)

    #save metric history, model, params
    print("saving predictions")
    model_dir = os.path.join(
        MODEL_DIR,
        '_'.join(["log_reg", time.strftime('%b_%d_%H:%M', time.localtime())]))
    os.mkdir(model_dir)
    preds_file = tools.write_preds(yhat_full, model_dir, hids_dv, 'test',
                                   yhat_full_raw)

    print("sanity check on train")
    yhat_tr = clf.predict(X)
    yhat_tr_raw = clf.predict_proba(X)

    #reshape output again
    yhat_tr_full = np.zeros(yy_tr.shape)
    yhat_tr_full_raw = np.zeros(yy_tr.shape)
    j = 0
    for i in range(yhat_tr_full.shape[1]):
        if i in labels_with_examples:
            yhat_tr_full[:, i] = yhat_tr[:, j]
            yhat_tr_full_raw[:, i] = yhat_tr_raw[:, j]
            j += 1

    #evaluate
    metrics_tr, fpr_tr, tpr_tr = evaluation.all_metrics(
        yhat_tr_full, yy_tr, k=[8, 15], yhat_raw=yhat_tr_full_raw)
    evaluation.print_metrics(metrics_tr)

    if n > 0:
        print("generating top important ngrams")
        if 'bows' in dev_fname:
            dev_fname = dev_fname.replace('_bows', '')
            print("calculating top ngrams using file %s" % dev_fname)
        calculate_top_ngrams(dev_fname, clf, c2ind, w2ind,
                             labels_with_examples, n)

    #Commenting this out because the models are huge (11G for mimic3 full)
    #print("saving model")
    #with open("%s/model.pkl" % model_dir, 'wb') as f:
    #    pickle.dump(clf, f)

    print("saving metrics")
    metrics_hist = defaultdict(lambda: [])
    metrics_hist_tr = defaultdict(lambda: [])
    for name in metrics.keys():
        metrics_hist[name].append(metrics[name])
    for name in metrics_tr.keys():
        metrics_hist_tr[name].append(metrics_tr[name])
    #dev metrics stand in for the test slot since no test set is scored here
    metrics_hist_all = (metrics_hist, metrics_hist, metrics_hist_tr)
    persistence.save_metrics(metrics_hist_all, model_dir)
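# An equivalent vectorized sketch of the column-remapping loops above, using
# numpy fancy indexing; sorted() reproduces the column order the classifier
# was trained on, since labels_with_examples came from an ascending nonzero()
# result before being turned into a set.
import numpy as np

def expand_to_full_label_space(yhat, labels_with_examples, full_shape):
    # scatter predictions for trained labels back into the full label space,
    # leaving zeros for labels with no positive training examples
    full = np.zeros(full_shape)
    cols = np.array(sorted(labels_with_examples))
    full[:, cols] = yhat
    return full

# e.g. yhat_full = expand_to_full_label_space(yhat, labels_with_examples, yy_dv.shape)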
# Set parameters:
maxlen = 200
embedding_dims = 200
nb_filter = 500
filter_length = 4
batch_size = 8
nb_epoch = 10
nb_labels = 50

train_data_path = "../mimicdata/mimic3/train_50.csv"
dev_data_path = "../mimicdata/mimic3/dev_50.csv"
test_data_path = "../mimicdata/mimic3/test_50.csv"
vocab = "../mimicdata/mimic3/vocab.csv"
embed_file = "../mimicdata/mimic3/processed_full.embed"

dicts = datasets.load_lookups(train_data_path, vocab, Y=nb_labels)
vocab_size = len(dicts[0])
embed_weight = extract_wvs.load_embeddings(embed_file)

# Load data
print('Loading data...')


def slim_data_generator(data_path):
    while 1:
        for batch_idx, tup in enumerate(
                datasets.data_generator(data_path, dicts,
                                        batch_size=batch_size,
                                        num_labels=nb_labels)):
            X, y, _, code_set, descs = tup
            # the source snippet ends at the unpacking above; yielding an
            # (inputs, targets) pair is an assumption so the generator is
            # usable with Keras-style training loops
            yield X, y
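# Hedged usage sketch: one way to wire the generator into a Keras CNN built
# from the hyperparameters above. The architecture and steps_per_epoch are
# assumptions, not the source's actual model; it also assumes batches arrive
# padded to maxlen and embed_weight has shape (vocab_size, embedding_dims).
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

model = Sequential([
    Embedding(vocab_size, embedding_dims, input_length=maxlen,
              weights=[embed_weight]),
    Conv1D(nb_filter, filter_length, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(nb_labels, activation='sigmoid'),  # multi-label sigmoid output
])
model.compile(loss='binary_crossentropy', optimizer='adam')
model.fit_generator(slim_data_generator(train_data_path),
                    steps_per_epoch=100,  # placeholder; set from dataset size
                    epochs=nb_epoch)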
def init(args):
    """
    Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #load vocab and other lookups
    desc_embed = args.lmbda > 0
    print("loading lookups...")
    dicts = datasets.load_lookups(args, desc_embed=desc_embed)

    model = tools.pick_model(args, dicts)
    print(model)

    if not args.test_model:
        if args.model in BERT_MODEL_LIST:
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            #note: both groups set weight_decay to 0.0, so the per-group
            #values override the global weight_decay passed to Adam below
            optimizer_grouped_parameters = [{
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }, {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }]
            optimizer = optim.Adam(optimizer_grouped_parameters,
                                   weight_decay=args.weight_decay,
                                   lr=args.lr)
            length = datasets.data_length(args.data_path, args.version)
            t_total = length // args.batch_size * args.n_epochs
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total)

            def get_label_distribution(filename, dicts):
                ind2w, w2ind, ind2c, c2ind, dv_dict = (dicts['ind2w'],
                                                       dicts['w2ind'],
                                                       dicts['ind2c'],
                                                       dicts['c2ind'],
                                                       dicts['dv'])
                if args.Y == 'full':
                    labels_idx = [1e-15] * 8921
                else:
                    labels_idx = [1e-15] * int(args.Y)
                with open(filename, 'r') as infile:
                    r = csv.reader(infile)
                    #header
                    next(r)
                    for row in r:
                        for l in row[3].split(';'):
                            if l in c2ind.keys():
                                code = int(c2ind[l])
                                labels_idx[code] += 1
                #weight each label by max count / its own count, so rare
                #codes receive larger weights
                max_val = max(labels_idx)
                return max_val / np.array(labels_idx)

            if args.pos:
                labels_weight = get_label_distribution(args.data_path, dicts)
            else:
                labels_weight = None
        else:
            optimizer = optim.Adam(model.parameters(),
                                   weight_decay=args.weight_decay,
                                   lr=args.lr)
            scheduler = None
            labels_weight = None
    else:
        optimizer = None
        scheduler = None
        labels_weight = None

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, dicts, scheduler, labels_weight
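# Hedged sketch of how labels_weight might be consumed downstream: feeding it
# to BCEWithLogitsLoss as pos_weight is an assumption, not shown in the source.
import torch
import torch.nn as nn

def make_criterion(labels_weight, gpu=False):
    # labels_weight holds max_count / count per label, so rare codes receive
    # larger positive-class weights; pos_weight expects one weight per label
    if labels_weight is None:
        return nn.BCEWithLogitsLoss()
    w = torch.as_tensor(labels_weight, dtype=torch.float)
    return nn.BCEWithLogitsLoss(pos_weight=w.cuda() if gpu else w)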