def train(self, xy_data, rnn=None):
    # Trains the single discriminative RNN on (expression, label) pairs
    self.rnn = rnn if rnn is not None else self.rnn
    xs = [np.array(self.encode_expr(self.scramble(x))) for x, y in xy_data]
    ys = [y for x, y in xy_data]
    # Dev set is loaded for printing/monitoring purposes only
    dev_data = get_data('data/neg_dev.txt')
    dev_xs = [np.array(self.encode_expr(self.scramble(x))) for x, y in dev_data]
    dev_ys = [y for x, y in dev_data]
    self.rnn.grad_check(dev_xs[0], dev_ys[0])
    for j in xrange(self.n_epochs):
        for x, y in zip(xs, ys):
            self.rnn.train_point_sgd(x, y, self.alpha)
        if j % 10 == 0:
            print 'dev loss', self.rnn.compute_loss(dev_xs[:100], dev_ys[:100]),
            print 'train loss', self.rnn.compute_loss(xs[:100], ys[:100])
    return self.rnn
def preprocess_data(train_file, asNumpy=True):
    """filename should be models/<whatever>.p
    Returns X and Y as lists (per data example) of lists (per character) of indices"""
    ## Get data
    raw_data = get_data(train_file)
    X, Y = labelize(raw_data, asNumpy=asNumpy)
    return X, Y
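# A minimal usage sketch for preprocess_data above. The file name is just an
# illustrative pick from the data/ paths used elsewhere in this repo, and
# get_data/labelize are assumed to come from this module's imports:
def _demo_preprocess():
    X, Y = preprocess_data('data/3dig_train.p')
    print len(X), 'examples'
    print 'first encoded input: ', X[0]
    print 'first encoded target:', Y[0]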
def discriminative_test():
    discr_add_train = get_data('data/d_add_train.p')
    discr_add_dev = get_data('data/d_add_dev.p')
    discr_subtr_train = get_data('data/d_subtr_train.p')
    discr_subtr_dev = get_data('data/d_subtr_dev.p')
    discr_mult_train = get_data('data/d_mult_train.p')
    discr_mult_dev = get_data('data/d_mult_dev.p')

    # Evaluate the discriminative RNN on each operation's train/dev split
    drnn = DRNN(len(run_helpers2.invocab), 50, 50, len(run_helpers2.outvocab))
    drnn.load_model('models/drnn_mult_full.p')
    drnn_fn = lambda x: run_helpers2.model_solve_discr(drnn, x)
    print 'drnn mult train', eval_model(drnn_fn, discr_mult_train, metric=strict_metric)
    print 'drnn mult dev', eval_model(drnn_fn, discr_mult_dev, metric=strict_metric)
    drnn.load_model('models/drnn_subtr_full.p')
    print 'drnn subtr train', eval_model(drnn_fn, discr_subtr_train, metric=strict_metric)
    print 'drnn subtr dev', eval_model(drnn_fn, discr_subtr_dev, metric=strict_metric)

    # Same evaluation for the discriminative GRU
    dgru = DGRU(len(run_helpers2.invocab), 50, 50, len(run_helpers2.outvocab))
    dgru.load_model('models/dgru_mult_full.p')
    dgru_fn = lambda x: run_helpers2.model_solve_discr(dgru, x)
    print 'dgru mult train', eval_model(dgru_fn, discr_mult_train, metric=strict_metric)
    print 'dgru mult dev', eval_model(dgru_fn, discr_mult_dev, metric=strict_metric)
    dgru.load_model('models/dgru_add_full.p')
    print 'dgru add train', eval_model(dgru_fn, discr_add_train, metric=strict_metric)
    print 'dgru add dev', eval_model(dgru_fn, discr_add_dev, metric=strict_metric)
    dgru.load_model('models/dgru_subtr_full.p')
    print 'dgru subtr train', eval_model(dgru_fn, discr_subtr_train, metric=strict_metric)
    print 'dgru subtr dev', eval_model(dgru_fn, discr_subtr_dev, metric=strict_metric)
def train(self, xy_data, rnns=None):
    # Trains one RNN for each possible output digit position
    self.rnns = rnns if rnns is not None else self.rnns
    # xs = [np.array(self.encode_expr(self.scramble_double(x))) for x, y in xy_data]
    ys = [self.encode_expr(lengthen(y, self.y_len)) for x, y in xy_data]
    # Dev set is loaded for printing/monitoring purposes only
    dev_data = get_data('data/dev.txt')
    for i, rnn_i in enumerate(self.rnns):
        # i is the output digit position this RNN predicts
        print 'i', i
        xs_i = [np.array(self.encode_expr(self.scramble(x, i))) for x, y in xy_data]
        ys_i = [y[i] for y in ys]
        dev_xs_i = [np.array(self.encode_expr(self.scramble(x, i))) for x, y in dev_data]
        dev_ys_i = [self.encode_expr(lengthen(y, self.y_len))[i] for x, y in dev_data]
        rnn_i.grad_check(dev_xs_i[0], dev_ys_i[0])
        for j in xrange(self.n_epochs):
            for x, y in zip(xs_i, ys_i):
                rnn_i.train_point_sgd(x, y, self.alpha)
            # print 'train loss', rnn_i.compute_loss(xs_i, ys_i)
            if j % 10 == 0:
                print 'dev loss', rnn_i.compute_loss(dev_xs_i, dev_ys_i)
            # # extra stuff to print
            # for x, y in zip(xs_i, ys_i)[:5]:
            #     yhat = rnn_i.predict(x)
            #     print x, yhat, np.argmax(yhat)
    return self.rnns
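# A toy sketch of the per-digit decomposition train() above implements: every
# target is padded to y_len characters (via lengthen) and rnns[i] is trained
# as a classifier over character i alone, so each RNN solves an independent
# single-position classification problem. Plain strings stand in for encoded
# data here, and the padding convention is assumed rather than checked:
def _demo_digit_slicing():
    ys = ['0123', '0045']  # as if already lengthened to y_len == 4
    for i in range(4):
        print 'position', i, 'targets', [y[i] for y in ys]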
def get_pattern():
    if app.debug:
        print request.json
    req = json.loads(request.json)
    # req = request.json
    global obj_cache
    fields = ['user', 'label']
    if not req or any([x not in req for x in fields]):
        abort(400)
    keys = fields + ['token', 'labels', 'topn', 'path']
    defaults = [''] * 2 + [None, None, 100, '']
    user, label, token, labels, topn, path = get_fields(keys, defaults, req)
    if token is None:
        if labels is None:
            abort(400)
        token = hashlib.md5(
            bencode.bencode([misc.convert(path), misc.convert(labels)])).hexdigest()
    aobj = None
    if user in obj_cache and token in obj_cache[user]:
        aobj = obj_cache[user][token]
    else:
        exp, _ = misc.get_data(user, None, path)
        aobj = mdata.Mdata(exp.binary, exp.names)
        aobj.addLabels(exp.genes, 'all')
        genes = set(exp.genes)
        for lbl in labels:  # distinct name so the requested 'label' isn't shadowed
            lst = list(genes.intersection(labels[lbl]))
            if len(lst) == 0:
                abort(400)
            aobj.addLabels(lst, lbl)
        aobj.findPatterns(2)
        aobj.findPScores(mode='ar', ratios=aobj.ratios)
        obj_cache.setdefault(user, {})[token] = aobj  # avoid KeyError for a new user
    ret = {'token': token, 'patterns': []}
    for ind, row in aobj.pScore.sort_values(
            by=[label], ascending=False).head(topn).iterrows():
        ret['patterns'].append({"pattern": list(ind), "score": row[label]})
    ret['tnames'] = aobj.mods
    return make_response(json.dumps(ret, ensure_ascii=False, indent=4), 200)
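# A hedged client-side sketch for the handler above. The '/pattern' route and
# host are assumptions (the route decorator is not shown here). Note that the
# handler calls json.loads(request.json), so the request body must be a JSON
# *string* of JSON, hence the json.dumps around the payload:
def _demo_get_pattern_request():
    import json
    import requests  # third-party; pip install requests
    payload = {
        'user': 'alice',                         # hypothetical user name
        'label': 'tumor',                        # score column to sort by
        'labels': {'tumor': ['BRCA1', 'TP53']},  # hypothetical gene lists
        'topn': 10,
        'path': '',
    }
    r = requests.post('http://localhost:5000/pattern',
                      json=json.dumps(payload))
    print r.json()['token']  # reuse this token to hit the cache next time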
def get_raw():
    if app.debug:
        print request.json
    req = json.loads(request.json)
    fields = ['user', 'glst', 'mlst', 'path']
    if not req or any([x not in req for x in fields]):
        abort(400)
    keys = fields + ['token']
    defaults = [''] * 4 + [None]
    user, glst, mlst, path, token = get_fields(keys, defaults, req)
    # misc.get_data caches per token, so repeated requests reuse parsed data
    exp, token = misc.get_data(user, token, path)
    ret = {}
    genes = set(exp.genes)
    for mod in mlst:
        tmp = {}
        subset = exp.raw[mod].loc[list(genes.intersection(glst))]
        for ind, row in subset.iterrows():
            tmp[ind] = row.tolist()
        ret[mod] = tmp
    ret['token'] = token
    return make_response(json.dumps(ret, ensure_ascii=False, indent=4), 200)
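# A matching sketch for get_raw above, with the same caveats: the '/raw' path
# is assumed, and the body is double-encoded to satisfy json.loads(request.json).
# The modality names in mlst are hypothetical keys into exp.raw:
def _demo_get_raw_request():
    import json
    import requests  # third-party; pip install requests
    payload = {
        'user': 'alice',
        'glst': ['BRCA1', 'TP53'],  # genes to fetch
        'mlst': ['rnaseq'],         # hypothetical modality key
        'path': '',
    }
    r = requests.post('http://localhost:5000/raw', json=json.dumps(payload))
    resp = r.json()
    print resp['token']  # cache token; pass it back on subsequent calls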
def get_data(k):
    cities.get_data(k)
    countries.get_data(k)
    states.get_data(k)
    misc.get_data(k)
def predict_one(self, x, rnns=None):
    rnns = rnns if rnns is not None else self.rnns
    if rnns is None:
        raise Exception('Model not trained!')
    x_encoded = lambda i: self.encode_expr(self.scramble(x, i))
    # rnns[i] predicts the character at output position i; concatenate them all
    return ''.join(self.decode([np.argmax(rnns[i].predict(x_encoded(i)))
                                for i in range(self.y_len)]))


if __name__ == '__main__':
    # Possible arguments are 'train', 'retrain'. Default mode is demo
    rnns_file = 'rnn_naive.txt'
    train_data = get_data('data/train.txt')
    nr = NaiveRnnlm(scramble_name='unscrambled_simple', bptt=1)

    should_retrain = 'retrain' in sys.argv[1:]
    should_train = 'train' in sys.argv[1:] or should_retrain
    if should_train:
        if should_retrain:
            # Binary mode: the file holds pickled model objects
            with open(rnns_file, 'rb') as g:
                nr.rnns = pickle.load(g)
        rnns = nr.train(train_data)
        with open(rnns_file, 'wb') as f:
            pickle.dump(rnns, f)
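# A sketch of the demo path the comment above alludes to ('demo' being the
# default when neither 'train' nor 'retrain' is passed). It assumes
# rnn_naive.txt already holds RNNs pickled by a previous 'train' run, and
# '12+34' is only a guess at the expression format in data/train.txt:
def _demo_predict():
    nr = NaiveRnnlm(scramble_name='unscrambled_simple', bptt=1)
    with open('rnn_naive.txt', 'rb') as f:
        nr.rnns = pickle.load(f)
    print '12+34 ->', nr.predict_one('12+34')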
def extract_discr_data(train_file, asNumpy=True):
    raw_data = get_data(train_file)
    Y = [y for (_, y) in raw_data]
    X = [encode(x, indico2, asNumpy=asNumpy) for x, y in raw_data]
    return X, Y
def get_data(k):
    misc.get_data(k)
if __name__ == '__main__':
    add_train = get_data('data/3dig_train.p')
    add_dev = get_data('data/3dig_dev.p')
    subtr_train = get_data('data/subtr_train.p')
    subtr_dev = get_data('data/subtr_dev.p')
    mult_train = get_data('data/mult_train.p')
    mult_dev = get_data('data/mult_dev.p')
    add4_train = get_data('data/4dig_train.p')
    add4_dev = get_data('data/4dig_dev.p')
    add5_train = get_data('data/5dig_train.p')
    add5_dev = get_data('data/5dig_dev.p')
    add6_train = get_data('data/6dig_train.p')
    add6_dev = get_data('data/6dig_dev.p')
    add7_train = get_data('data/7dig_train.p')
    add7_dev = get_data('data/7dig_dev.p')
def predict_one(self, x, rnn=None):
    rnn = rnn if rnn is not None else self.rnn
    if rnn is None:
        raise Exception('Model not trained!')
    x_encoded = self.encode_expr(self.scramble(x))
    return np.argmax(rnn.predict(x_encoded))


if __name__ == '__main__':
    # Possible arguments are 'train', 'retrain'. Default mode is demo
    rnn_file = 'rnn_naive_discr.txt'
    train_data = get_data('data/neg_train.txt')
    nr = NaiveRnnlmDiscr(scramble_name='noscramble', bptt=2)

    should_retrain = 'retrain' in sys.argv[1:]
    should_train = 'train' in sys.argv[1:] or should_retrain
    if should_train:
        if should_retrain:
            # Binary mode: the file holds a pickled model object
            with open(rnn_file, 'rb') as g:
                nr.rnn = pickle.load(g)
        rnn = nr.train(train_data)
        with open(rnn_file, 'wb') as f:
            pickle.dump(rnn, f)
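# The corresponding sketch for the discriminative model: here predict_one
# returns a class index (argmax over rnn.predict) rather than a digit string.
# The input '1+2=3' is only a guess at the format of data/neg_train.txt:
def _demo_predict_discr():
    nr = NaiveRnnlmDiscr(scramble_name='noscramble', bptt=2)
    with open('rnn_naive_discr.txt', 'rb') as f:
        nr.rnn = pickle.load(f)
    print nr.predict_one('1+2=3')  # predicted class index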