Example No. 1
File: BPAT.py Project: zhhrozhh/SSW
def update_db():
    today = pd.datetime.now(pytz.timezone('US/Eastern'))
    last = None
    with open('bpats_db/LASTUD', 'r') as f:
        last = pd.datetime(*map(int, f.read().split('-'))) + timedelta(1)
    if today.hour < 9 and today.hour > 4:
        today = today - timedelta(1)
    else:
        today = today - timedelta(2)
    today = pd.datetime(today.year, today.month, today.day)
    if today - last < timedelta(1):
        return -1
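    # recompute BPAT patterns for each symbol over the new date range and append them to bpats_db/<scode>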
    for scode in scodes:
        data = None
        print(scode, end=' ')
        try:
            data = tools.get_data(scode, start=last, end=today)
        except iexfinance.utils.exceptions.IEXSymbolError:
            continue
        except:
            print('update {} failed, {} -> {}'.format(
                scode, '{}-{}-{}'.format(last.year, last.month, last.day),
                '{}-{}-{}'.format(today.year, today.month, today.day)))
            continue
        o = data.Adj_Open
        c = data.Adj_Close
        h = data.Adj_High
        l = data.Adj_Low
        v = data.Adj_Volume
        zm = BPAT(o, c, h, l, v)
        with open('bpats_db/' + scode, 'a') as f:
            for b in zm:
                f.write(b + ' ')
    with open('bpats_db/LASTUD', 'w') as f:
        f.write('{}-{}-{}'.format(today.year, today.month, today.day))
Example No. 2
def main(args):
    scores, roc_aucs = [], []
    for i in range(5):
        seed = i
        set_seed(seed)

        ((_, _), X, (train_idx, train_y), (val_idx, val_y),
         (test_idx, test_y), names) = tools.get_data(args.__dict__, seed=seed)
        if X is None or not X.shape[1]:
            raise ValueError('No features')

        clf = GaussianNB()
        clf.fit(X[train_idx], train_y)
        # use class-1 probabilities so the AUC and the 0.5 threshold below are meaningful
        probs = clf.predict_proba(X[test_idx])[:, 1]

        roc_auc = roc_auc_score(test_y, probs)
        roc_aucs.append(roc_auc)

        preds = (probs > 0.5) * 1
        score = acc(preds, test_y)
        print('Score:', score)
        scores.append(score)

    print('Acc(all):', scores)
    print('Auc(all):', roc_aucs)
    print('Accuracy:', np.mean(scores))
    print('Auc:', np.mean(roc_aucs))

    return np.mean(roc_aucs), np.std(roc_aucs)
Example No. 3
def main(args):
    roc_aucs = []
    for i in range(args.n_runs):
        seed = i
        set_seed(seed)

        ((_, _), X, (train_idx, train_y), (val_idx, val_y),
         (test_idx, test_y), names) = tools.get_data(args.__dict__, seed=seed)
        if X is None or not X.shape[1]:
            raise ValueError('No features')

        clf = sklearn.svm.SVC(class_weight='balanced',
                              random_state=seed,
                              probability=True)
        clf.fit(X[train_idx], train_y)
        probs = clf.predict_proba(X[test_idx])[:, 1]
        roc_auc = roc_auc_score(test_y, probs)
        roc_aucs.append(roc_auc)

        p = np.stack([names[test_idx], probs], axis=1)
        save_preds(p, args, seed)

    print('Auc(all):', roc_aucs)
    print('Auc:', np.mean(roc_aucs))

    return np.mean(roc_aucs), np.std(roc_aucs)
Example No. 4
def train(param=PARAMS, sv=SOLVE, small=False):
    num_hidden = 4
    num_lstm_layer = 1
    batch_size = 1


    def sym_gen(seq_len):
        return lstm_unroll(num_lstm_layer, seq_len, num_hidden=num_hidden, num_label=1)

    init_c = [('l%d_init_c'%l, (batch_size, num_hidden, 256, 256)) for l in range(num_lstm_layer)]
    init_h = [('l%d_init_h'%l, (batch_size, num_hidden, 256, 256)) for l in range(num_lstm_layer)]
    init_states = init_c + init_h
    
    data_train, data_val = get_data('r', batch_size, 
                    init_states=init_states, small=small)
    
    #data = get(init_states, bs=batch_size, small=small)
    #data_train = data['train']
    #data_val   = data['val']
    param['eval_data'] = data_val

    num_time = data_train.data_list[0].shape[1]
    symbol = sym_gen(num_time)
    
    s = Solver(symbol, data_train, sv, **param)
    print 'Start Training...'
    s.train()
Example No. 5
def train(param=PARAMS, sv=SOLVE, small=False):
    num_hidden = 4
    num_lstm_layer = 1
    batch_size = 1

    def sym_gen(seq_len):
        return lstm_unroll(num_lstm_layer,
                           seq_len,
                           num_hidden=num_hidden,
                           num_label=1)

    init_c = [('l%d_init_c' % l, (batch_size, num_hidden, 256, 256))
              for l in range(num_lstm_layer)]
    init_h = [('l%d_init_h' % l, (batch_size, num_hidden, 256, 256))
              for l in range(num_lstm_layer)]
    init_states = init_c + init_h

    data_train, data_val = get_data('r',
                                    batch_size,
                                    init_states=init_states,
                                    small=small)

    #data = get(init_states, bs=batch_size, small=small)
    #data_train = data['train']
    #data_val   = data['val']
    param['eval_data'] = data_val

    num_time = data_train.data_list[0].shape[1]
    symbol = sym_gen(num_time)

    s = Solver(symbol, data_train, sv, **param)
    print 'Start Training...'
    s.train()
Example No. 6
def train_model(ui):
    
    # get filename from customer
    f = ui.get_file_name()
    if f is None:
        return None

    # validate data in file
    raw_data = tools.get_data(f)
    if not tools.raw_data_is_valid(raw_data):
        ui.print_error("Raw data file {0} not formatted correctly".format(f))
        return

    # derive new features, handle missing data, and clean
    clean_data = preprocessing.format_data(raw_data)

    # store new data into database?
    database.store_clean_data(clean_data)

    # run model
    model = mlalgorithms.logistic_regression(clean_data)

    # store model in database
    database.store_model(model)

    # display performance?
    ui.display_performance(model)
Example No. 7
def main(args):
    roc_aucs = []
    for i in range(args.n_runs):
        seed = i
        set_seed(seed)

        (_, X, (train_idx, train_y), (val_idx, val_y),
         (test_idx, test_y), names) = tools.get_data(args.__dict__, seed=seed)

        if X is None or not X.shape[1]:
            raise ValueError('No features')

        train_x = X[train_idx].cuda()
        val_x = X[val_idx].cuda()
        test_x = X[test_idx].cuda()
        print('train_x', train_x.mean())
        print('test_x', test_x.mean())

        probs = mlp_fit_predict(train_x, train_y, test_x, val=(val_x, val_y))
        roc_auc = roc_auc_score(test_y, probs)
        roc_aucs.append(roc_auc)

        p = np.concatenate(
            [names[test_idx].reshape(-1, 1), probs.reshape(-1, 1)], axis=1)
        save_preds(p, args, seed)

    print('Auc(all):', roc_aucs)
    print('Auc:', np.mean(roc_aucs))

    return np.mean(roc_aucs), np.std(roc_aucs)
Example No. 8
def request_asset_list():
    r = get_data(
        "asset?columns=ASSET_DATABASE_ID&columns=TYPE&columns=LABEL&columns=LAST_CLOSE_VALUE_IN_CURR",
        start_date='2013-06-14',
        end_date='2019-04-18')
    dic_asset = json.loads(r)
    return dic_asset
Example No. 9
def tsub(val):
    s = re.match(r'!([a-zA-Z][a-zA-Z0-9_]*)\s*([a-zA-Z][a-zA-Z0-9_]*)?(.*)',
                 t_dbg.text)
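    # text of the form '!<cmd> [<arg>] [<number>]' is treated as a debug command; anything else is eval'd/exec'd below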
    if s:
        cmd = s.groups()[0]
        arg = s.groups()[1]
        oth = s.groups()[2]

        if cmd == 'target':
            #G_var['DATAF'] = quandl.get('EOD/'+arg)[['Adj_Open','Adj_High','Adj_Low','Adj_Close','Adj_Volume']].iloc[-10:]
            G_var['DATAF'] = tools.get_data(arg).iloc[-10:]
            G_var['name'] = arg
            if oth:
                G_var['pat_N'] = int(oth)
                pat_proc(int(oth))
            else:
                pat_proc()
        elif cmd == 'go':
            go()
    else:
        try:
            print(eval(t_dbg.text))
        except:
            exec(t_dbg.text)
    t_dbg.set_val('')
Example No. 10
def main(args):
    scores, roc_aucs = [], []
    for i in range(5):
        seed = i
        set_seed(seed)

        ((_, _), X, (train_idx, train_y), (val_idx, val_y),
         (test_idx, test_y), names) = tools.get_data(args.__dict__, seed=seed)
        if X is None or not X.shape[1]:
            raise ValueError('No features')

        clf = sklearn.ensemble.RandomForestClassifier(class_weight='balanced',
                                                      random_state=seed,
                                                      n_estimators=500)
        clf.fit(X[train_idx], train_y)
        # use class-1 probabilities rather than hard labels for the AUC below
        probs = clf.predict_proba(X[test_idx])[:, 1]

        roc_auc = roc_auc_score(test_y, probs)
        roc_aucs.append(roc_auc)

        preds = (probs > 0.5) * 1
        score = acc(preds, test_y)
        print('Score:', score)
        scores.append(score)

    print('Acc(all):', scores)
    print('Auc(all):', roc_aucs)
    print('Accuracy:', np.mean(scores))
    print('Auc:', np.mean(roc_aucs))

    return np.mean(roc_aucs), np.std(roc_aucs)
Example No. 11
def run_classifiers(rabbit_list, rabbit_dict):

    print('===> Training and evaluating predictive models ... ')
    for i, r in enumerate(rabbit_list):
        print(f'=> Subject {r}: Generating training data ... ', end='')
        #### Generate the data for training and testing the classifier models ####
        train_feats, train_labels, test_feats, test_labels = tls.get_data(['ctd', 'max', 't2', 'adc'])

        temp_label = train_labels.copy()
        temp_feats = train_feats.copy()

        rabbit_dict[r]['test_labels'] = test_labels[i].clone()
        test_feats = test_feats[i].clone()

        _ = temp_label.pop(i)
        _ = temp_feats.pop(i)

        rabbit_dict[r]['train_labels'] = torch.cat(temp_label, 0)
        train_feats = torch.cat(temp_feats, 0)

        # Get the mean of the training features
        train_mean_feats = train_feats.mean(0, keepdim=True)
        train_std_feats = train_feats.std(0, keepdim=True)

        # Normalize the test and train features by the training mean
        # train_feats = train_feats / train_mean_feats
        # test_feats = test_feats / train_mean_feats
        train_feats = (train_feats - train_mean_feats) / train_std_feats
        test_feats = (test_feats - train_mean_feats) / train_std_feats

        # Store the data in the dictionary for later use
        rabbit_dict[r]['train_features'] = train_feats.reshape(train_feats.shape[0], -1).squeeze()
        rabbit_dict[r]['test_features'] = test_feats.reshape(test_feats.shape[0], -1).squeeze()
        print('done')

        print(f'=> Subject {r}: Training and evaluating logistic regression classifier ... ', end='')
        # Train and eval the logistic regression classifier
        rabbit_dict[r]['logistic_model'] = {}
        temp_train_proba, temp_test_proba = tls.logistic_regression(rabbit_dict[r]['train_features'],
                                                                   rabbit_dict[r]['train_labels'],
                                                                   rabbit_dict[r]['test_features'])
        rabbit_dict[r]['logistic_model']['train_proba'] = temp_train_proba
        rabbit_dict[r]['logistic_model']['test_proba'] = temp_test_proba
        rabbit_dict[r]['logistic_model']['test_proba_vol'] = recon_prediction(temp_test_proba, rabbit=r)
        print('done')

        print(f'=> Subject {r}: Training and evaluating random forest classifier ... ', end='')
        # Train and eval the random forest classifier
        rabbit_dict[r]['forest_model'] = {}
        temp_train_proba, temp_test_proba = tls.random_forest(rabbit_dict[r]['train_features'],
                                                             rabbit_dict[r]['train_labels'],
                                                             rabbit_dict[r]['test_features'])
        rabbit_dict[r]['forest_model']['train_proba'] = temp_train_proba
        rabbit_dict[r]['forest_model']['test_proba'] = temp_test_proba
        rabbit_dict[r]['forest_model']['test_proba_vol'] = recon_prediction(temp_test_proba, rabbit=r)
        print('done')
    print('===> Done training and evaluating predictive models.')
Example No. 12
def main():
    #(2560, 256, 123)
    X, Y = get_data()

    # model = DNN(X, Y)
    # model = DCNN(X, Y)
    # model = ANN(X, Y)
    model = CNN(X, Y)
    model.cnn()
    print('Done')
Example No. 13
def main(args):
    seed = np.random.randint(0, 1000000)
    _, X, (idx1, y1), (idx2, y2), (idx3, y3), names = tools.get_data(
        args.__dict__, seed=seed)
    idx = np.concatenate([idx1, idx2, idx3], 0)
    y = np.concatenate([y1, y2, y3], 0)
    X = X[idx]

    feats, cors = cor_selector(X, y, 50)
    print(cors)
Example No. 14
def get_company_info(company_name, index):
    dr = get_chrome(index)
    company_data = tools.get_data(
        "select company_name from crm_clue_qichacha where company_name = '%s'"
        % company_name)
    if company_data is not None and len(company_data) > 0:
        tools.update_sql(
            "update 1688_clue_new set 1688_consume =1 where company_name = '%s'"
            % company_name)
        print '%s already exists in db' % company_name
    else:
        infos = getShopInfo(dr, tools.str_process(company_name))
        insert_into_db(infos, company_name)
Example No. 15
def calculate_entropy(name):
    """ """
    historic_probability = get_data(name)
    historic_entropy = []
    for probability in historic_probability:
        s = 0
        N = int(probability[len(probability) - 1])
        for i in range(len(probability) - 1):
            s += shanon_entropy(float(probability[i]) / N)

        historic_entropy.append(s)

    return historic_entropy, math.log(N)
Example No. 16
File: BPAT.py Project: zhhrozhh/SSW
def load_to_f():
    i = 0
    for scode in scodes:
        if os.path.isfile('bpats_db/' + scode):
            i += 1
            continue
        data = tools.get_data(scode)
        o = data.Adj_Open
        c = data.Adj_Close
        h = data.Adj_High
        l = data.Adj_Low
        v = data.Adj_Volume
        save(BPAT(o, c, h, l, v), scode)
        i += 1
        print('{}/{}'.format(i, len(scodes)), end=' ')
Example No. 17
def cf_train(sv=SOLVE, param=PARAMS):

    train, val = get_data('c', 2, small=False)

    sv['name'] = 'CF'
    sv['is_rnn'] = False
    param['eval_data'] = val
    param['num_epoch'] = 20
    param['learning_rate'] = 0.1

    print 'SOLVE', sv
    print 'param', param
    s = Solver(net, train, sv, **param)
    s.train()
    # s.predict()

    return s
Example No. 18
def publish(request, partner_id, click_id, **kwargs):
    check = kwargs.get('check', False)
    logger = S2SFactory.get_logger()
    s2s_data = get_data(partner_id=partner_id,logger=logger,settings=settings)
    if not s2s_data:
        logger.info('PARTNER [%s] INFO NOT FOUND' % partner_id)
        return HttpResponse('KO')
    if check:
        task_id = PublishTask.delay(partner_id, s2s_data, click_id, MockSender, DictMessage, settings)
        logger.info('CHECK QUEUED [%s]' % task_id)
        task_id.wait(interval=0.1)
        logger.info('CHECK RESULT [%s]: %s' % (task_id, task_id.result))
        return HttpResponse(task_id.result)
    else:
        task_id = PublishTask.delay(partner_id, s2s_data, click_id, SMTPSender, MIMEBase64, settings)
        logger.info('NOTIFICATION QUEUED %s' % (task_id))
        return HttpResponse(task_id,status=202)
Example No. 19
def get_data():
    data = tools.get_data()
    data_old = data[1]
    data_new = data[0]
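    # the latest snapshot is data[0]; the "now_*_add" fields are deltas against the previous snapshot data[1]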
    return jsonify({
        "confirm": data_new[1],
        "confirm_add": data_new[2],
        "suspect": data_new[3],
        "suspect_add": data_new[4],
        "heal": data_new[5],
        "heal_add": data_new[6],
        "dead": data_new[7],
        "dead_add": data_new[8],
        "now_confirm": data_new[9],
        "now_confirm_add": data_new[9] - data_old[9],
        "now_severe": data_new[10],
        "now_severe_add": data_new[10] - data_old[10]
    })
Example No. 20
def train(batch_size, param=PARAMS, sv=SOLVE, small=False):
    num_lstm_layer = 1
    num_hidden = 1000
    
    # prepare data
    init_c = [('l%d_init_c'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
    init_h = [('l%d_init_h'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
    init_states = init_c + init_h
    data_train, data_val = get_data('r', batch_size, 
                    init_states=init_states, small=small, splite_rate=0.2)
    param['eval_data'] = data_val

    # prepare symbol
    num_time = data_train.data_list[0].shape[1]
    symbol = lstm_unroll(num_lstm_layer, num_time, num_hidden)
    
    s = Solver(symbol, data_train, sv, **param)
    print 'Start Training...'
    s.train()
Example No. 21
def from_1688_company():
    while True:
        index = 0
        chrome_len = len(list_dr)
        company_names = tools.get_data(
            "select company_name,1688_result,id from test.`1688_clue_new` where id not in (select p_id from crm_process_company where status = 0 and source = '%s') limit 1000"
            % source)
        for company_name in company_names:
            try:
                get_company_info(company_name[0], index % chrome_len)
                tools.update_sql(
                    "insert into crm_process_company(p_id,status,source,updated_at) values(%d,1,'%s','%s') ON DUPLICATE KEY UPDATE status = 1,updated_at='%s'"
                    % (int(company_name[2]), source,
                       datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                       datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                time.sleep(random.uniform(1, 10))
            except:
                print 'traceback.format_exc():\n%s' % traceback.format_exc()
            index += 1
        time.sleep(3600)
Example No. 22
def main():
    #conn = sqlite3.connect('../ping-analytics.db')
    output = get_data()

    ping = []
    ping = get_ms(output, ping)
    avg_ping = round(cal_average(ping), 1)

    est, mdt, pst = get_time_by_timezone()
    #db_main(est, mdt, pst, avg_ping, conn)
    insert_data(est, mdt, pst, avg_ping, ip)


    #logging
    print(f'Time: {est}')
    print(f'Data: \n {output}')
    print(f'\n\nPing: {avg_ping}')

    
    threading.Timer(wait_seconds, main).start()
Example No. 23
def main():
    orig = pd.read_csv(FILE_PATH, delimiter=DELIMITER)
    print(orig.shape)

    data_fixed = orig.copy()
    ratio_unknown = 0.5
    ages = range(0, 130, 10)
    fix_field(data_fixed, 'age', ['[{}-{})'.format(age, age + 10) for age in ages], [i for i in range(len(ages))])
    remove_unknown_with_ratio(data_fixed, ratio_unknown)
    fix_all_non_numeric(data_fixed)
    replace_unknown_with_average(data_fixed)

    class_key = 'gender'
    data_fixed = same_distribution(data_fixed, class_key)
    data_fixed = data_fixed.sample(n=2000)

    data, cluster_gt = get_data(data_fixed, to_keep, class_key)
    internal_clustering_loss = [metrics.silhouette_score]
    external_clustering_loss = [metrics.mutual_info_score]
    reduction_algorithms = [manifold.TSNE]
    pre_clustering_reduction_algorithm = decomposition.PCA(15)
    clusters_algos = CLUSTERING_ALGOS

    internal_clustering_loss = []
    reduction_algorithms = []

    if pre_clustering_reduction_algorithm:
        data = pre_clustering_reduction_algorithm.fit_transform(data)

    clusters_params_zip = [(algo, CLUSTERING_TO_STEPS_MAP_SMALL[algo]) for algo in clusters_algos]
    file_name, all_clustering_algorithms = run_full_flow(
        data, cluster_gt, clusters_params_zip, internal_clustering_loss, external_clustering_loss, reduction_algorithms
    )
    print(file_name)

    run_plot_by_loss(file_name)
Example No. 24
def train(batch_size, param=PARAMS, sv=SOLVE, small=False):
    num_lstm_layer = 1
    num_hidden = 1000

    # prepare data
    init_c = [('l%d_init_c' % l, (batch_size, num_hidden))
              for l in range(num_lstm_layer)]
    init_h = [('l%d_init_h' % l, (batch_size, num_hidden))
              for l in range(num_lstm_layer)]
    init_states = init_c + init_h
    data_train, data_val = get_data('r',
                                    batch_size,
                                    init_states=init_states,
                                    small=small,
                                    splite_rate=0.2)
    param['eval_data'] = data_val

    # prepare symbol
    num_time = data_train.data_list[0].shape[1]
    symbol = lstm_unroll(num_lstm_layer, num_time, num_hidden)

    s = Solver(symbol, data_train, sv, **param)
    print 'Start Training...'
    s.train()
Example No. 25
def cross_validation_demo():
    sub_sample = True
    y_train, x_train, ids_train, y_test_X, x_test_X, ids_test_X = get_data(
        sub_sample, large=True)
    seed = 1
    k_fold = 4
    lambdas = np.logspace(-4, 1, 30)
    k_indices = build_k_indices(y_train, k_fold, seed)
    # define lists to store the loss of training data and test data
    rmse_tr = []
    rmse_te = []
    W = []
    I = []
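    # for each lambda, run k-fold CV, keep the weights of the best fold (lowest test RMSE), and record the selected indices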
    for l in lambdas:
        M_rmse_tr = []
        M_rmse_te = []
        weight = []
        index1 = []
        for k in range(k_fold):
            loss_tr, loss_te, w, indices = cross_validation(
                y_train, x_train, k_indices, k, l)
            weight.append(w)
            index1.extend(indices)
            M_rmse_tr.append(loss_tr)
            M_rmse_te.append(loss_te)
        W.append(weight[np.argmin(M_rmse_te)])
        I.extend(index1)
        rmse_tr.append(np.mean(M_rmse_tr))
        rmse_te.append(np.mean(M_rmse_te))
    plt.hist(I, bins=np.arange(min(I), max(I) + 1))
    plt.title("Frequency diagram")
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.savefig("freq")
    plt.show()
Example No. 26
        logging.info("Epoch {} took {}".format(e, end - start_epoch))
        logging.info("Epoch:{} ValidAccPi    loss:{}".format(
            e, acc_ploss / it))
        logging.info("Epoch:{} ValidAccValue loss: {}".format(
            e, acc_vloss / it))
        logging.info("Epoch:{} ValidTotal    loss:{}".format(e, acc_loss / it))
        logging.info("Epoch:{} checkpoint".format(new_param_file))
        torch.save({
            'state_dict': model.state_dict(),
        }, new_param_file)

    # saving new parameters
    logging.info("Training took {}".format(end - start_train))
    logging.info("Saving model to {}".format(new_param_file))
    torch.save({
        'state_dict': model.state_dict(),
    }, new_param_file)
    logging.info("####################END#################")
    return True


if __name__ == "__main__":
    param_file = tools.get_params()
    new_param_file = tools.get_new_params()
    data_files = tools.get_data()
    if train(param_file, new_param_file, data_files):
        print("New model parameters were saved to {}".format(new_param_file))
    else:
        print("Training failed, check for errors {}/train_{}.log".format(
            LOG_PATH, new_param_file))
Example No. 27
def main():

    try:
        size = sys.argv[1]
    except Exception as e:
        print e
        print '\n\tusage: python %s <small|all>\n' % sys.argv[0]
        exit()

    # hyperparameter
    topN_tfidf_words = 20

    train_notes, train_outcomes = get_data('train', size)
    test_notes, test_outcomes = get_data('test', size)

    # max number of documents (this is used during vectorization)
    num_docs = max(map(len, train_notes.values()))

    # extract feature lists
    train_text_features, df = extract_features_from_notes(train_notes,
                                                          topN_tfidf_words,
                                                          'embeddings',
                                                          df=None)
    test_text_features, df_ = extract_features_from_notes(test_notes,
                                                          topN_tfidf_words,
                                                          'embeddings',
                                                          df=df)
    assert df == df_

    # Fit model for each prediction task
    tasks = [
        'ethnicity', 'age', 'admission_type', 'hosp_expire_flag', 'gender',
        'los', 'diagnosis'
    ]
    #tasks = ['diagnosis']
    for task in tasks:

        print 'task:', task

        ### Train model

        # extract appropriate data
        train_Y, criteria = filter_task(train_outcomes,
                                        task,
                                        per_task_criteria=None)
        train_ids = sorted(train_Y.keys())
        print 'train examples:', len(train_Y)

        # vectorize notes
        train_X = vectorize_X(train_ids,
                              train_text_features,
                              num_docs=num_docs)
        print 'num_features:  ', train_X.shape[1], '\n'

        train_Y = vectorize_Y(train_ids, train_Y, criteria)
        num_tags = train_Y.shape[1]

        # build model
        lstm_model = create_lstm_model(num_docs, num_tags, train_X, train_Y)
        lstm_model.summary()

        # test data
        test_labels, _ = filter_task(test_outcomes,
                                     task,
                                     per_task_criteria=criteria)
        test_ids = sorted(test_labels.keys())
        test_X = vectorize_X(test_ids, test_text_features, num_docs=num_docs)
        test_Y = vectorize_Y(test_ids, test_labels, criteria)

        # fit model
        filepath = "/tmp/weights-%d.best.hdf5" % random.randint(0, 10000)
        save_best = SaveBestCallback(filepath)
        lstm_model.fit(train_X,
                       train_Y,
                       epochs=100,
                       verbose=1,
                       batch_size=32,
                       validation_data=(test_X, test_Y),
                       callbacks=[save_best])
        lstm_model.load_weights(filepath)
        os.remove(filepath)

        model = (criteria, num_docs, lstm_model)

        ### Evaluation

        with io.StringIO() as out_f:
            # analysis
            pass

            # eval on test data
            results_onehot_keras(model, train_ids, train_X, train_Y, 'TRAIN',
                                 task, out_f)
            results_onehot_keras(model, test_ids, test_X, test_Y, 'TEST', task,
                                 out_f)

            output = out_f.getvalue()
        print output

        # error analysis
        error_analysis(model, test_ids, test_notes, test_text_features, test_X,
                       test_Y, 'TEST', task)

        # serialize trained model
        homedir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        modelname = '%s/models/rnn_%s_%s.model' % (homedir, size, task)
        M = {
            'criteria': criteria,
            'num_docs': num_docs,
            'model': lstm_pickle(lstm_model),
            'output': output
        }
        with open(modelname, 'wb') as f:
            pickle.dump(M, f)
Example No. 28
import os
import itertools
from keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score
import tools

training_data, training_label, validation_data, validation_label, validation_cate_label = tools.get_data(
)

kernel_size = [5]
num_layers = [10]
batch_size = [30]
learning_rate = [0.0001]
overflow_model = 0


def run(bs, path, lr, ks, num_layer):
    fold = 1
    for X_train, Y_train, X_val, Y_val, val_cat in zip(training_data,
                                                       training_label,
                                                       validation_data,
                                                       validation_label,
                                                       validation_cate_label):
        print("Fold " + str(fold))
        model = tools.create_model(lr, bs, ks, num_layer)
        inner_path = path + "/fold_" + str(fold)
        if not os.path.exists(inner_path):
            os.makedirs(inner_path)

        early_stop = EarlyStopping(patience=20)
        history = model.fit(x=X_train,
Example No. 29
import pyspark.sql.functions as f
import sys
import time  # used by time.time() below
import unidecode
from nltk.tokenize import wordpunct_tokenize
import tools
start_time = time.time()

slug = sys.argv[1]

data_folder = '../data/'

spark = tools.get_spark()
sparkContext = spark.sparkContext

data = tools.get_data(spark, slug)
ops = tools.get_opinions(spark, data)


# Transform 'word5' into 'word'
ops = ops.withColumn(
    "text",
    f.regexp_replace("text", "(\d+|\D+)", " $1")
)

# Transforms word 'the_' into 'the'
ops = ops.withColumn(
    "text",
    f.regexp_replace("text", "(\w+[^\\w{_}]|\w+)", " $1")
)
Example No. 30
dataset_id = str(sys.argv[7])
sigma = float(sys.argv[8])
plt_freqs = sys.argv[9]
dataset_id2 = str(sys.argv[10])
# A-Team positions
CasA=[350.866417,58.811778]
CygA=[299.868153,40.733916]
VirA=[187.705930,12.391123]

min_sep=0. # The absolute minimum allowed separation from the A-Team source, set to zero to enable the code to work independently

###################### MAIN SCRIPT ######################

# Extracting data from the TraP database into a text file
if not os.path.exists('ds_'+dataset_id+'_images.csv'):
    tools.get_data(database, username, password, host, port, databaseType, dataset_id, dataset_id2)

# Extract relevant data from dataset text file
image_info, frequencies, plt_ratios = tools.extract_data(dataset_id, CasA, CygA, VirA)

freq='all'
# RMS noise properties
noise_avg_log, noise_scatter_log, noise_threshold_log = tools.fit_hist([np.log10(image_info[n][4]*1e3) for n in range(len(image_info))], sigma, r'Observed RMS (mJy)', 'ds'+dataset_id+'_rms', freq)
noise_avg=10.**(noise_avg_log)
noise_max=10.**(noise_avg_log+noise_scatter_log)-10.**(noise_avg_log)
noise_min=10.**(noise_avg_log)-10.**(noise_avg_log-noise_scatter_log)
print 'Average RMS Noise in images (1 sigma range, frequency='+str(freq)+' MHz): '+str(round(noise_avg,1))+' (+'+str(round(noise_max,1))+',-'+str(round(noise_min,1))+') mJy'
if plt_ratios:
    # RMS/Theoretical limit for TraP
    ratio_avg_log, ratio_scatter_log, ratio_threshold_log = tools.fit_hist([np.log10(image_info[n][6]) for n in range(len(image_info))], sigma, r'Observed RMS / Theoretical Noise', 'ds'+dataset_id+'_ratio', freq)
    ratio_avg=10.**(ratio_avg_log)
Example No. 31
def validate_variable_content_multilevel(context, path, value):
    assert_somewhere_in(json.loads(value), get_data(context.response, path))
Example No. 32
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    # 'VisitorType',
    # 'Weekend',
    # 'Revenue'
}

data_fixed = same_distribution(data_fixed, class_key)

number_classes = data_fixed[class_key].nunique()

sampled_data = data_fixed.sample(n=3000)

data, cluster_gt = get_data(data_fixed, to_keep, class_key)

CLUSTERING_TO_STEPS_MAP = {
    cluster.KMeans: range(2, 10),
    cluster.DBSCAN: [float(i)/20 for i in range(1, 20)],
    mixture.GaussianMixture: range(2, 10),
    cluster.SpectralClustering: range(2, 10),
    cluster.AgglomerativeClustering: range(2, 10)
}


CLUSTERING_ALGOS = [
    cluster.KMeans, cluster.DBSCAN, mixture.GaussianMixture, cluster.SpectralClustering, cluster.AgglomerativeClustering
]

Example No. 33
def define_variable_from_result(context, variable, path):
    context.s[variable] = get_data(context.response, path)
Example No. 34
def validate_variable_content(context, path, value):
    assert_in(json.loads(value), get_data(context.response, path))