    def train(self, xy_data, rnn=None):
        # This function trains one RNN

        self.rnn = rnn if rnn is not None else self.rnn

        xs = [np.array(self.encode_expr(self.scramble(x))) for x, y in xy_data]
        ys = [y for x, y in xy_data]

        # for printing purposes only
        dev_data = get_data('data/neg_dev.txt')

        dev_xs = [
            np.array(self.encode_expr(self.scramble(x))) for x, y in dev_data
        ]
        dev_ys = [y for x, y in dev_data]

        self.rnn.grad_check(dev_xs[0], dev_ys[0])

        for j in xrange(self.n_epochs):
            for x, y in zip(xs, ys):
                self.rnn.train_point_sgd(x, y, self.alpha)
            if j % 10 == 0:
                dev_loss = self.rnn.compute_loss(dev_xs[:100], dev_ys[:100])
                train_loss = self.rnn.compute_loss(xs[:100], ys[:100])
                print 'dev loss', dev_loss, 'train loss', train_loss

        # extra stuff to print
        # for x, y in zip(xs, ys)[:5]:
        #     yhat = self.rnn.predict(x)
        #     print x, yhat, np.argmax(yhat)

        return self.rnn
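The epoch loop above leans on the surrounding NaiveRnnlm class. A self-contained sketch of the same per-example SGD pattern, using a hypothetical ToyModel stand-in (not part of the original code):

class ToyModel(object):
    # stand-in exposing the same train/loss interface used above
    def __init__(self):
        self.w = 0.0

    def train_point_sgd(self, x, y, alpha):
        # one SGD step on a single example (1-d least squares)
        self.w -= alpha * (self.w * x - y) * x

    def compute_loss(self, xs, ys):
        return sum((self.w * x - y) ** 2 for x, y in zip(xs, ys)) / len(xs)

xs = [1.0, 2.0, 3.0]
ys = [2.0, 4.0, 6.0]   # target function is y = 2x
model = ToyModel()
for j in range(100):                       # n_epochs
    for x, y in zip(xs, ys):
        model.train_point_sgd(x, y, 0.01)  # alpha
    if j % 10 == 0:
        print('epoch %d loss %.6f' % (j, model.compute_loss(xs, ys)))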
Example #2
def preprocess_data(train_file, asNumpy=True):
    """train_file should be models/<whatever>.p
    Returns X and Y as lists (per data example) of lists (per character) of indices"""

    ## Get data
    raw_data = get_data(train_file)
    X, Y = labelize(raw_data, asNumpy=asNumpy)

    return X, Y
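get_data and labelize are project helpers not shown on this page; as a rough illustration, the character-to-index encoding the docstring describes might look like this (the vocabulary below is hypothetical):

vocab = list('0123456789+-* ')
char_to_ix = {c: i for i, c in enumerate(vocab)}

def encode_chars(s):
    # one list of indices per character, as the docstring describes
    return [char_to_ix[c] for c in s]

X = [encode_chars(x) for x in ['12+34', '7*8']]
Y = [encode_chars(y) for y in ['46', '56']]
print(X[0])  # [1, 2, 10, 3, 4]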
Example #4
def discriminative_test():
    discr_add_train = get_data('data/d_add_train.p')
    discr_add_dev = get_data('data/d_add_dev.p')
    discr_subtr_train = get_data('data/d_subtr_train.p')
    discr_subtr_dev = get_data('data/d_subtr_dev.p')
    discr_mult_train = get_data('data/d_mult_train.p')
    discr_mult_dev = get_data('data/d_mult_dev.p')

    # evaluate the saved discriminative models on the train/dev splits

    drnn = DRNN(len(run_helpers2.invocab), 50, 50, len(run_helpers2.outvocab))
    drnn.load_model('models/drnn_mult_full.p')
    drnn_fn = lambda x: run_helpers2.model_solve_discr(drnn, x)
    print 'drnn mult train', eval_model(drnn_fn,
                                        discr_mult_train,
                                        metric=strict_metric)
    print 'drnn mult dev', eval_model(drnn_fn,
                                      discr_mult_dev,
                                      metric=strict_metric)

    drnn.load_model('models/drnn_subtr_full.p')
    print 'drnn subtr train', eval_model(drnn_fn,
                                         discr_subtr_train,
                                         metric=strict_metric)
    print 'drnn subtr dev', eval_model(drnn_fn,
                                       discr_subtr_dev,
                                       metric=strict_metric)

    dgru = DGRU(len(run_helpers2.invocab), 50, 50, len(run_helpers2.outvocab))
    dgru.load_model('models/dgru_mult_full.p')
    dgru_fn = lambda x: run_helpers2.model_solve_discr(dgru, x)
    print 'dgru mult train', eval_model(dgru_fn,
                                        discr_mult_train,
                                        metric=strict_metric)
    print 'dgru mult dev', eval_model(dgru_fn,
                                      discr_mult_dev,
                                      metric=strict_metric)

    dgru.load_model('models/dgru_add_full.p')
    print 'dgru add train', eval_model(dgru_fn,
                                       discr_add_train,
                                       metric=strict_metric)
    print 'dgru add dev', eval_model(dgru_fn,
                                     discr_add_dev,
                                     metric=strict_metric)

    dgru.load_model('models/dgru_subtr_full.p')
    print 'dgru subtr train', eval_model(dgru_fn,
                                         discr_subtr_train,
                                         metric=strict_metric)
    print 'dgru subtr dev', eval_model(dgru_fn,
                                       discr_subtr_dev,
                                       metric=strict_metric)
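eval_model and strict_metric are imported from the project's helpers and not shown; one plausible reading is exact-match accuracy, sketched here (both signatures are assumptions):

def strict_metric(y_hat, y):
    # credit only for an exact match
    return 1.0 if y_hat == y else 0.0

def eval_model(model_fn, data, metric=strict_metric):
    # average the metric of model_fn(x) against y over the dataset
    return sum(metric(model_fn(x), y) for x, y in data) / len(data)

# usage with a trivial stand-in model:
data = [('1+1', '2'), ('2+2', '5')]
solve_fn = lambda x: str(eval(x))  # toy solver for this sketch only
print(eval_model(solve_fn, data))  # 0.5: one of the two answers matches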
Example #5
    def train(self, xy_data, rnns=None):
        # This function trains one RNN for each possible output digit

        self.rnns = rnns if rnns is not None else self.rnns

        # xs = [np.array(self.encode_expr(self.scramble_double(x))) for x,y in xy_data]
        ys = [self.encode_expr(lengthen(y, self.y_len)) for x, y in xy_data]

        # for printing purposes only
        dev_data = get_data('data/dev.txt')

        for i, rnn_i in enumerate(self.rnns):
            # where i is the index of the rnn we're using
            print 'i', i

            xs_i = [
                np.array(self.encode_expr(self.scramble(x, i)))
                for x, y in xy_data
            ]
            ys_i = [y[i] for y in ys]
            dev_xs_i = [
                np.array(self.encode_expr(self.scramble(x, i)))
                for x, y in dev_data
            ]
            dev_ys_i = [
                self.encode_expr(lengthen(y, self.y_len))[i]
                for x, y in dev_data
            ]

            rnn_i.grad_check(dev_xs_i[0], dev_ys_i[0])

            for j in xrange(self.n_epochs):
                for x, y in zip(xs_i, ys_i):
                    rnn_i.train_point_sgd(x, y, self.alpha)
                # print 'train loss', rnn_i.compute_loss(xs_i, ys_i)
                if j % 10 == 0:
                    print 'dev loss', rnn_i.compute_loss(dev_xs_i, dev_ys_i)

            # extra stuff to print
            # for x,y in zip(xs_i,ys)[:5]:
            #     yhat = rnn_i.predict(x)
            #     print x, yhat, np.argmax(yhat)

        return self.rnns
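lengthen is not shown in these snippets; since ys[i] must exist for every digit position up to y_len, one plausible implementation pads the target string to a fixed length (left-padding and the '0' pad character are assumptions):

def lengthen(y, y_len, pad='0'):
    # pad so every target has exactly y_len character positions
    # (assumes len(str(y)) <= y_len)
    s = str(y)
    return pad * (y_len - len(s)) + s

print(lengthen('46', 4))  # '0046'
print(lengthen(789, 4))   # '0789'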
Example #6
def get_pattern():

    if app.debug:
        print request.json
    req = json.loads(request.json)
    #    req = request.json
    global obj_cache
    fields = ['user', 'label']
    if not req or any([x not in req for x in fields]):
        abort(400)

    keys = fields + ['token', 'labels', 'topn', 'path']
    defaults = [''] * 2 + [None, None, 100, '']
    user, label, token, labels, topn, path = get_fields(keys, defaults, req)
    if token is None:
        if labels is None:
            abort(400)
        token = hashlib.md5(
            bencode.bencode([misc.convert(path),
                             misc.convert(labels)])).hexdigest()
    aobj = None
    if user in obj_cache and token in obj_cache[user]:
        aobj = obj_cache[user][token]
    else:
        exp, _ = misc.get_data(user, None, path)
        aobj = mdata.Mdata(exp.binary, exp.names)
        aobj.addLabels(exp.genes, 'all')
        genes = set(exp.genes)
        # use a distinct loop variable so the requested sort label is not clobbered
        for lbl in labels:
            lst = list(genes.intersection(labels[lbl]))
            if len(lst) == 0:
                abort(400)
            aobj.addLabels(lst, lbl)

        aobj.findPatterns(2)
        aobj.findPScores(mode='ar', ratios=aobj.ratios)
        obj_cache.setdefault(user, {})[token] = aobj
    ret = {'token': token, 'patterns': [], 'tnames': aobj.mods}
    for ind, row in aobj.pScore.sort_values(
            by=[label], ascending=False).head(topn).iterrows():
        ret['patterns'].append({"pattern": list(ind), "score": row[label]})
    return make_response(json.dumps(ret, ensure_ascii=False, indent=4), 200)
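For reference, a request body that satisfies get_pattern's field checks might look like this (all values are illustrative):

import json

req = {
    'user': 'alice',           # required
    'label': 'ribosome',       # required: the pScore column to sort by
    'path': 'experiments/exp1',
    'labels': {'ribosome': ['RPL3', 'RPS6']},
    'topn': 20,
    # 'token' omitted: the server derives it from md5(bencode([path, labels]))
}
print(json.dumps(req))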
Example #8
def get_raw():
    if app.debug:
        print request.json
    req = json.loads(request.json)
    fields = ['user', 'glst', 'mlst', 'path']
    if not req or any(x not in req for x in fields):
        abort(400)
    keys = fields + ['token']
    defaults = [''] * 4 + [None]
    user, glst, mlst, path, token = get_fields(keys, defaults, req)

    # trying to get cache
    exp, token = misc.get_data(user, token, path)

    ret = {}
    genes = set(exp.genes)
    for mod in mlst:
        tmp = {}
        subset = exp.raw[mod].loc[list(genes.intersection(glst))]
        for ind, row in subset.iterrows():
            tmp[ind] = row.tolist()
        ret[mod] = tmp
    ret['token'] = token
    return make_response(json.dumps(ret, ensure_ascii=False, indent=4), 200)
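The response assembled above carries one block per requested modality, keyed by gene, plus the cache token; an illustrative shape (values made up):

ret = {
    'rna': {                   # one entry per mod in mlst
        'RPL3': [5.1, 4.8, 5.0],
        'RPS6': [7.2, 7.0, 6.9],
    },
    'token': 'd41d8cd98f00b204e9800998ecf8427e',  # md5 hex cache token
}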
Example #9
File: __init__.py Project: mpuels/zamia-ai
def get_data(k):

    cities.get_data(k)
    countries.get_data(k)
    states.get_data(k)
    misc.get_data(k)
Example #10
    def predict_one(self, x, rnns=None):
        rnns = rnns if rnns is not None else self.rnns
        if rnns is None:
            raise Exception('Model not trained!')

        # one RNN per output digit: predict each position, then decode
        x_encoded = lambda i: self.encode_expr(self.scramble(x, i))
        return ''.join(
            self.decode([
                np.argmax(rnns[i].predict(x_encoded(i)))
                for i in range(self.y_len)
            ]))


if __name__ == '__main__':
    # Possible arguments are 'train', 'retrain'. Default mode is demo

    rnns_file = 'rnn_naive.txt'

    train_data = get_data('data/train.txt')

    nr = NaiveRnnlm(scramble_name='unscrambled_simple', bptt=1)

    should_retrain = 'retrain' in sys.argv[1:]
    should_train = 'train' in sys.argv[1:] or should_retrain

    if should_train:
        if should_retrain:
            with open(rnns_file, 'rb') as g:
                nr.rnns = pickle.load(g)

        rnns = nr.train(train_data)
        
        with open(rnns_file, 'wb') as f:
            pickle.dump(rnns, f)
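pickle data is binary, so the dump/load files are opened in 'wb'/'rb' mode; a minimal round trip with a stand-in object:

import pickle

rnns = {'w': [0.1, 0.2]}  # stand-in for the trained rnns
with open('rnn_naive.txt', 'wb') as f:
    pickle.dump(rnns, f)
with open('rnn_naive.txt', 'rb') as g:
    print(pickle.load(g))  # {'w': [0.1, 0.2]}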
Example #12
def extract_discr_data(train_file, asNumpy=True):
    raw_data = get_data(train_file)
    Y = [y for (_, y) in raw_data]
    X = [encode(x, indico2, asNumpy=asNumpy) for (x, _) in raw_data]
    return X, Y
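encode and indico2 come from the project's helpers; a hypothetical sketch, assuming indico2 maps characters to vocabulary indices:

import numpy as np

indico2 = {c: i for i, c in enumerate('0123456789+-* ')}  # hypothetical vocabulary

def encode(x, index_map, asNumpy=True):
    # turn one expression string into an index sequence
    ixs = [index_map[c] for c in x]
    return np.array(ixs) if asNumpy else ixs

print(encode('12*3', indico2))  # [ 1  2 12  3]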
Example #13
def get_data(k):
    misc.get_data(k)
Example #14
if __name__ == '__main__':
    add_train = get_data('data/3dig_train.p')
    add_dev = get_data('data/3dig_dev.p')
    subtr_train = get_data('data/subtr_train.p')
    subtr_dev = get_data('data/subtr_dev.p')
    mult_train = get_data('data/mult_train.p')
    mult_dev = get_data('data/mult_dev.p')

    add4_train = get_data('data/4dig_train.p')
    add4_dev = get_data('data/4dig_dev.p')
    add5_train = get_data('data/5dig_train.p')
    add5_dev = get_data('data/5dig_dev.p')
    add6_train = get_data('data/6dig_train.p')
    add6_dev = get_data('data/6dig_dev.p')
    add7_train = get_data('data/7dig_train.p')
    add7_dev = get_data('data/7dig_dev.p')
Example #15
    def predict_one(self, x, rnn=None):
        rnn = rnn if rnn is not None else self.rnn
        if rnn is None:
            raise Exception('Model not trained!')

        x_encoded = self.encode_expr(self.scramble(x))
        return np.argmax(rnn.predict(x_encoded))


if __name__ == '__main__':
    # Possible arguments are 'train', 'retrain'. Default mode is demo

    rnn_file = 'rnn_naive_discr.txt'

    train_data = get_data('data/neg_train.txt')

    nr = NaiveRnnlmDiscr(scramble_name='noscramble', bptt=2)

    should_retrain = 'retrain' in sys.argv[1:]
    should_train = 'train' in sys.argv[1:] or should_retrain

    if should_train:
        if should_retrain:
            with open(rnn_file, 'rb') as g:
                nr.rnn = pickle.load(g)

        rnn = nr.train(train_data)

        with open(rnn_file, 'wb') as f:
            pickle.dump(rnn, f)
Example #16
File: __init__.py Project: gooofy/voxforge
def get_data(k):

    cities.get_data(k)
    countries.get_data(k)
    states.get_data(k)
    misc.get_data(k)