Example #1
File: main_app.py Project: toxaco/dit
    def start_app(self):
        data_origin = str(input('#----------------- > Hello, first, where is your file? (type 1 for Web, 2 for Local): '))
        print('Given origin (1 Web, 2 Local): {}'.format(data_origin))

        # Uses the web to find a file and return the data.
        if data_origin[0] == '1':
            data = Connect_web().start()
            if data == False:
                return False
            else:
                self.p_class = process_data(data)
                if self.p_class.group_month():
                    return True
                else:
                    return False

        # Uses a local file path to find a file and return the data.
        elif data_origin[0] == '2':
            data = Open_local().start()

            if data == False:
                return False
            else:
                self.p_class = process_data(data)
                if self.p_class.group_month():
                    return True
                else:
                    return False

        else:
            return True
Example #2
File: __main__.py Project: Wolff09/nap
def main(cmd_args):
	if len(cmd_args) < 3:
		print "Usage: processing path/to/nodes/file path/to/edge/file path/to/output/file [deletion_names...]"
	elif not os.path.isfile(cmd_args[0]):
		print "Node file is no file"
	elif not os.path.isfile(cmd_args[1]):
		print "Edge file is no file"
	else:
		process_data(*cmd_args)
		sys.exit(0)
Example #3
def get_data(rinv, N):
    df = h5py.File(path.parent / "data" / "jet_images" / f"LL-{rinv}.h5", "r")
    y = df["targets"][:N]
    X = df["features"][:N]
    X = process_data(X)
    return X, y
Example #4
def main():
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    centers, targets = next(batch_gen)
    word2vec(batch_gen)
Example #5
from Plotting.plot_all_third_parameters import plot_all_sensitivities_per_alg
from Plotting.plot_learning_curve import plot_learning_curve
from Plotting.plot_learning_for_two_lambdas import plot_learning_curve_for_lambdas
from Plotting.plot_sensitivity import plot_sensitivity_curve
from Plotting.plot_waterfall import plot_waterfall_scatter
from process_data import process_data

process_data()
plot_learning_curve()
plot_sensitivity_curve()
# plot_waterfall_scatter()
# plot_all_sensitivities_per_alg()
# plot_learning_curve_for_lambdas()
Example #6
def data():
    success = process_data()
    return {"success": success}
Example #7
File: main.py Project: smirreee/nnTrader
WINDOW = 30
STEP = 1
FORECAST = 1
ROLLING = 30
EMB_SIZE = 7

random.seed(42)

# Load stock data
eod_data = load_stock_data()

# Loop through all stocks returned
for key, data in eod_data.items():

    # Process data
    processed_data = process_data(data)

    X, Y = [], []
    for idx in range(0, len(processed_data)-WINDOW-FORECAST, STEP):

        # Get data from window
        hl = remap(np.array(processed_data['H-L'][idx:idx+WINDOW]), -1, 1)
        co = remap(np.array(processed_data['C-O'][idx:idx+WINDOW]), -1, 1)
        sma_3 = remap(np.array(processed_data['3day SMA'][idx:idx+WINDOW]), -1, 1)
        sma_10 = remap(np.array(processed_data['10day SMA'][idx:idx+WINDOW]), -1, 1)
        sma_30 = remap(np.array(processed_data['30day SMA'][idx:idx+WINDOW]), -1, 1)
        std_dev = remap(np.array(processed_data['Std_dev'][idx:idx+WINDOW]), -1, 1)
        rsi = remap(np.array(processed_data['RSI'][idx:idx+WINDOW]), -1, 1)

        # Stack in array
        x_i = np.column_stack((hl, co, sma_3, sma_10, sma_30, std_dev, rsi))
Example #8
        reader = csv.DictReader(csvfile)
        for row in reader:
            train_id = ast.literal_eval(row['train'])
            test_id = ast.literal_eval(row['test'])
            val_id = ast.literal_eval(row['val'])
    
    return train_id, test_id, val_id

if __name__=="__main__":

    parser = get_parser()
    args = parser.parse_args()    
    

    # training
    sents, W, word_index, vocab, labels, max_l, U, user_idx = process_data(args.input, False, args.vectors, args.user_vectors, args.tagField, args.textField, args.userField, idField=args.idField)    
    # set_trace()
    model = args.model
    # if args.static:
    #     print "model architecture: CNN-static"
    #     non_static = False
    # else:
    #     print "model architecture: CNN-non-static"
    #     non_static = True
    non_static = True
    if args.vectors:
        print "using: word2vec vectors"
    else:
        print "using: random vectors"

    classes = set(x["y"] for x in sents)
Example #9
#!/usr/bin/python
from __future__ import print_function

from argparse import ArgumentParser

from get_data import get_data
from setup_data import setup_data
from process_data import process_data
from run_walsh_alg import run

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('-s', '--settings', help='Settings file')
    parser.add_argument('-r', '--remove', help='Remove', action='store_true')
    args = parser.parse_args()

    if not args.settings:
        import settings.default as settings
    else:
        raise Exception('not impl')

    if settings.GET_DATA:
        get_data(settings)
    if settings.SETUP_DATA:
        setup_data(settings)
    if settings.PROCESS_DATA:
        process_data(settings)
    if settings.RUN_WALSH:
        run()
Example #10
def semisupervised_selection(data_dir, dest_dir, initial_pos_filename, initial_neg_filename, initial_pool_filename, w2v_file,
                             word_vectors="-rand", src_lan='en', trg_lan='de', non_static=True, n_iter=10, max_l=50, k=300,
                             test_batch=7000, instances_to_add=50000, debug=False):

    """
    Performs a semisupervised text selection over a pool of sentences based on initial positive/negative files.
    The steps it takes are:
        1. Classify the pool according to the positive/negative samples with a CNN
        2. Take the most positive and most negative sentences from the pool and add them to the positive/negative training samples
        3. Train another CNN on the extended positive/negative sets and go back to 1.

    :param data_dir: Directory where the data files are
    :param initial_pos_filename: Initial "in-domain" corpus
    :param initial_neg_filename: Initial "out-of-domain" corpus
    :param initial_pool_filename: Pool of sentences over which to perform the selection
    :param w2v_file: Word2vec file (for the CNN input)
    :param word_vectors: Whether to use word2vec vectors or random word vectors
    :param non_static: Non-static CNNs
    :param n_iter: Number of iterations carried out by the process
    :param test_batch: Classify the pool in batches of this size
    :param instances_to_add: Number of instances to add at each iteration
    :return:
    """
    pos_filename_src = data_dir + '/' + initial_pos_filename + '.' + src_lan
    in_domain_file = open(pos_filename_src, 'r')
    in_domain = in_domain_file.readlines()
    in_domain_file.close()
    pos_filename_trg = data_dir + '/' + initial_pos_filename + '.' + trg_lan

    neg_filename_src = data_dir + '/' + initial_neg_filename + '.' + src_lan

    pool_filename_src = data_dir + '/' + initial_pool_filename + '.' + src_lan
    pool_filename_trg = data_dir + '/' + initial_pool_filename + '.' + trg_lan

    for i in range(n_iter):
        print "------------------ Starting iteration", i, "------------------"
        new_pos_filename_src = dest_dir + '/' + initial_pos_filename + '_' + str(i) + '.' + src_lan
        new_pos_filename_trg = dest_dir + '/' + initial_pos_filename + '_' + str(i) + '.' + trg_lan

        new_pos_filename_src_tmp = dest_dir + '/' + initial_pos_filename + 'tmp' + '.' + src_lan
        if debug:
            new_neg_filename_src_tmp = dest_dir + '/' + initial_neg_filename + 'tmp' + '_' + str(i) + '.' + src_lan

        new_neg_filename_src = dest_dir + '/' + initial_neg_filename + '_' +  str(i) + '.' + src_lan

        new_pool_filename_src = dest_dir + '/' + initial_pool_filename + '_' + str(i) + '.' + src_lan
        new_pool_filename_trg = dest_dir + '/' + initial_pool_filename + '_' + str(i) + '.' + trg_lan

        if i > 0:
            copyfile(pos_filename_src, new_pos_filename_src_tmp)
            copyfile(pos_filename_src, new_pos_filename_src)
            copyfile(pos_filename_trg, new_pos_filename_trg)

        with open(new_pos_filename_src_tmp, "a") as f:
            for line in in_domain:
                f.write(line)

        copyfile(neg_filename_src, new_neg_filename_src)
        copyfile(pool_filename_src, new_pool_filename_src)
        copyfile(pool_filename_trg, new_pool_filename_trg)

        x = process_data(w2v_file, new_pos_filename_src_tmp, new_neg_filename_src, new_pool_filename_src, k=k)
        revs, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4]

        if word_vectors=="-rand":
            print "using: random vectors"
            U = W2
        elif word_vectors=="-word2vec":
            print "using: word2vec vectors"
            U = W
        else:
            raise NotImplementedError, "Choose between -rand or -word2vec options"

        results = []
        datasets = make_idx_data_holdout(revs, word_idx_map, max_l=max_l,k=k, filter_h=5)
        perf, predictions, prediction_probs = train_conv_net(datasets, U, img_w=k, lr_decay=0.95, filter_hs=[3,4,5],
                                                              conv_non_linear="relu", hidden_units=[200,100,2],
                                                              shuffle_batch=True, n_epochs=14, sqr_norm_lim=9,
                                                              non_static=non_static, batch_size=128, dropout_rate=[0.5],
                                                              test_batch=test_batch, savename="predictions_" + str(i),
                                                             savetofile=False)
        positive_lines_src, positive_lines_trg, negative_lines, neutral_lines_src, neutral_lines_trg = \
            process_prediction_probs(prediction_probs, instances_to_add, pool_filename_src, pool_filename_trg)

        print "Adding", len(positive_lines_src), "positive lines"
        print "Positive sample:", positive_lines_src[0], "---", positive_lines_trg[0]
        print "Adding", len(negative_lines), "negative lines"
        print "Negative sample:", negative_lines[0]

        print "Adding", len(neutral_lines_src), "neutral lines"
        print "Neutral sample:", neutral_lines_src[0], "---", neutral_lines_trg[0]

        new_pos_file_src = open(new_pos_filename_src, 'a')
        new_pos_file_trg = open(new_pos_filename_trg, 'a')

        new_neg_file = open(new_neg_filename_src, 'a')
        if debug:
            new_neg_file_tmp = open(new_neg_filename_src_tmp, 'a')

        new_pool_file_src = open(new_pool_filename_src, 'w')
        new_pool_file_trg = open(new_pool_filename_trg, 'w')

        for line in positive_lines_src:
            new_pos_file_src.write(line)
        for line in positive_lines_trg:
            new_pos_file_trg.write(line)

        for line in negative_lines:
            new_neg_file.write(line)
            if debug:
                new_neg_file_tmp.write(line)
        for line in neutral_lines_src:
            new_pool_file_src.write(line)
        for line in neutral_lines_trg:
            new_pool_file_trg.write(line)

        new_pos_file_src.close()
        new_pos_file_trg.close()

        new_neg_file.close()

        new_pool_file_src.close()
        new_pool_file_trg.close()

        if debug:
            new_neg_file_tmp.close()

        pos_filename_src = new_pos_filename_src
        pos_filename_trg = new_pos_filename_trg

        neg_filename_src = new_neg_filename_src

        pool_filename_src = new_pool_filename_src
        pool_filename_trg = new_pool_filename_trg

        print "perf: " + str(perf)
        results.append(perf)
        print str(np.mean(results))
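The docstring in this example describes a generic select-and-retrain loop. A minimal, self-contained sketch of that loop (not this project's CNN code; train_classifier and toy_scorer_trainer are hypothetical stand-ins) could look like:

from typing import Callable, List, Tuple

def select_and_retrain(pos: List[str], neg: List[str], pool: List[str],
                       train_classifier: Callable[[List[str], List[str]], Callable[[str], float]],
                       n_iter: int = 3, instances_to_add: int = 2) -> Tuple[List[str], List[str], List[str]]:
    # Iteratively grow the positive/negative sets from the pool (steps 1-3 of the docstring).
    for _ in range(n_iter):
        score = train_classifier(pos, neg)                  # train a scorer on the current sets
        ranked = sorted(pool, key=score)                    # rank the whole pool by the scorer
        neg = neg + ranked[:instances_to_add]               # most negative sentences extend neg
        pos = pos + ranked[-instances_to_add:]              # most positive sentences extend pos
        pool = ranked[instances_to_add:-instances_to_add]   # the rest stays in the pool
    return pos, neg, pool

def toy_scorer_trainer(pos: List[str], neg: List[str]) -> Callable[[str], float]:
    # Toy scorer: fraction of a sentence's words that also appear in the positive set.
    pos_words = {w for s in pos for w in s.split()}
    return lambda s: len(pos_words & set(s.split())) / max(len(s.split()), 1)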
Example #11
    def _import_data(self):
        return process_data(self.VOCAB_SIZE, self.BATCH_SIZE, self.SKIP_WINDOW)
Example #12
if __name__ == '__main__':
    global_start_time = time.time()

    seq_len = 10  # length of the training sequences
    split_rate = 0.1  # ratio for splitting the data into training and test sets

    # df = pd.read_csv(r'E:\data\data\test_env_12_1m_deal\data_proc1_sort_slot1.txt')
    # data = list(df['number'].values)

    df1 = pd.read_csv(r'E:\data\data\bsg_nova_1030_sort.csv')
    # df1 = df1.sort_values(by='time')
    df1 = df1.head(3000000)  # the full file raises a memory error on my machine, so only the leading rows are used

    data = list(df1['event'].values)

    X_train, y_train, X_test, y_test, row = process_data.process_data(
        data, seq_len, split_rate)  # reformat the data and split it into training and test sets

    y_train = np_utils.to_categorical(y_train)

    params = {
        'lstm_output_dim': 50,
        'activation_lstm': 'relu',
        'activation_dense': 'relu',
        'activation_last': 'softmax',
        'dense_layer': 1,
        'lstm_layer': 2,
        'nb_epoch': 1
    }
    obj_lstm = lstm_model.RNN_network(**params)

    obj_lstm.model(X_train,
Example #13
def run():
    sentences, pos, tag, enc_pos, enc_tag = process_data(DF_PATH)

    meta_data = {
        'enc_pos': enc_pos,
        'enc_tag': enc_tag
    }

    joblib.dump(meta_data, META_PATH)

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    (
        train_sentences,
        valid_sentences,
        train_pos,
        valid_pos,
        train_tag,
        valid_tag,
    ) = model_selection.train_test_split(sentences, pos, tag, random_state=2020, test_size=0.1)

    tokenizer = transformers.BertTokenizer.from_pretrained(TOKENIZER_PATH, do_lower_case=True)

    train_dataset = EntityDataset(
        words=train_sentences,
        pos=train_pos,
        tags=train_tag,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = EntityDataset(
        words=valid_sentences,
        pos=valid_pos,
        tags=valid_tag,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALIDATION_BATCH_SIZE,
        num_workers=4
    )

    model = MODEL_DISPATCHER[BASE_MODEL](bert_path=BERT_PATH,
                                         num_tag=num_tag,
                                         num_pos=num_pos
                                         )
    model.to(DEVICE)

    # parameters_optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    parameters_optimizer = [
        {
            'params': [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001,
        },
        {
            'params': [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0,
        }
    ]

    optimizer = AdamW(parameters_optimizer, lr=LR)
    num_training_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    best_loss = np.inf
    for epoch in range(EPOCHS):
        train_loss = train_loop_fn(train_dataloader, model, optimizer, DEVICE, scheduler)
        valid_loss = eval_loop_fn(valid_dataloader, model, DEVICE)

        print(f'Train_loss = {train_loss}, Valid_loss = {valid_loss}')

        if valid_loss < best_loss:
            torch.save(model.state_dict(), MODEL_PATH)
            best_loss = valid_loss
Example #14
import pandas as pd
import json
import os
from process_data import process_data
from daily_readings import daily_readings
from readings_by_title import readings_by_title

if __name__ == '__main__':
    df = pd.read_csv('../source_data/daily-log.csv')
    df['date'] = pd.to_datetime(df['date'])

    # Process raw readings into daily tracking by titles
    readings = process_data(df)

    # Filter logs for 2018 readings
    CY18 = df[df['date'] <= pd.to_datetime('2018-12-31')]

    # Combine titles and create daily readings frame
    CY18_daily_readings = daily_readings(readings, min(CY18['date']),
                                         max(CY18['date']))

    # List of dictionaries containing daily readings by title
    by_title = readings_by_title(readings)
Example #15
    class2_vectors = [(j[0], j[1]) for j in class2]
    class1_X = [i[0] for i in class1_vectors]
    class1_Y = [i[1] for i in class1_vectors]
    class2_X = [j[0] for j in class2_vectors]
    class2_Y = [j[1] for j in class2_vectors]
    sigmas = [get_sigma(class1_X, class1_Y, 0, mu1), \
              get_sigma(class2_X, class2_Y, 1, mu2)]
    s1 = sigmas[0]
    s2 = sigmas[1]
    pi_vector = [pi1, pi2]
    mu_vector = [mu1, mu2]
    return (pi_vector, mu_vector, sigmas)


pre_data = process_data.get_data()
data = process_data.process_data(pre_data)
features = data[0]
labels = data[1]
train_features = np.array(features[:80])
train_labels = np.array(labels[:80]).reshape(80, 1)
test_features = np.array(features[80:])
test_labels = np.array(labels[80:]).reshape(20, 1)
X = train_features
y = train_labels
X_ = test_features
y_ = test_labels
q_fit = QDA(X, y)
sigmas = q_fit[2]
qda_correct = 0

for i in range(len(X_)):
Example #16
def main():

    date = time.localtime()

    log_name = './data/logs/log_'
    log_name += str(date.tm_mon) + '_'
    log_name += str(date.tm_mday) + '_'
    log_name += str(date.tm_hour) + '_'
    log_name += str(date.tm_min) + '_'
    log_name += str(date.tm_sec) + '.txt'

    log_file = open(log_name, 'w')

    current_message = open('./data/starting_message.txt', 'r').read()
    log_file.write(current_message)
    print(current_message)

    beginnings_file = open('./data/starting_sentences.txt', 'r')

    beginnings = [
        word_tokenize(beginning) for beginning in beginnings_file.readlines()
    ]

    if DEBUG:

        print('RUNNING IN DEBUGGING MODE!')

        for look_back in [4, 8, 20]:

            current_message = '-' * 60 + '\n'
            current_message += 'Preparing data for look_back of %d' % look_back + '\n'
            log_file.write(current_message)
            print(current_message)

            start_time = time.time()

            data_train, data_val, data_test, emb_matrix, w2t, t2w, emb_model = process_data(
                log_file=log_file, look_back=look_back, debug=DEBUG)

            current_message = "Data took %.2f seconds to prepare." % (
                time.time() - start_time) + '\n'
            current_message += '\n' + '-' * 60 + '\n'
            log_file.write(current_message)
            print(current_message)

            #LSTM Euclid loss

            for nb_layers in [1, 2, 4]:

                start_time = time.time()

                tf.reset_default_graph()
                model = deep_LSTM_euclid.LSTMmodel(emb_matrix=emb_matrix,
                                                   look_back=look_back,
                                                   nb_layers=nb_layers)
                model.build_graph()
                model.train(data_train,
                            data_val,
                            nb_train_steps=1,
                            folder_to_save='results/LSTM_euclid_layers_' +
                            str(nb_layers) + '_look_back_' + str(look_back))

                current_message = "Model Euclid with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)

                for beginning in beginnings:
                    model.create_story(emb_model=emb_model,
                                       w2t=w2t,
                                       t2w=t2w,
                                       beginning=beginning)

            #LSTM Cross Entropy loss

            for nb_layers in [1, 2, 4]:

                tf.reset_default_graph()
                model = deep_LSTM_cross_entropy.LSTMmodel(
                    emb_matrix=emb_matrix,
                    look_back=look_back,
                    nb_layers=nb_layers)
                model.build_graph()
                model.train(
                    data_train,
                    data_val,
                    nb_train_steps=1,
                    folder_to_save='results/LSTM_cross_entropy_layers_' +
                    str(nb_layers) + ' look_back_' + str(look_back))

                current_message = "Model Entropy with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)

                for beginning in beginnings:
                    model.create_story(w2t=w2t, t2w=t2w, beginning=beginning)

            #LSTM NCE loss

            for nb_layers in [1, 2, 4]:

                tf.reset_default_graph()
                model = deep_LSTM_nce.LSTMmodel(emb_matrix=emb_matrix,
                                                look_back=look_back,
                                                nb_layers=nb_layers)
                model.build_graph()
                model.train(data_train,
                            data_val,
                            nb_train_steps=1,
                            folder_to_save='results/LSTM_nce_layers_' +
                            str(nb_layers) + ' look_back_' + str(look_back))

                current_message = "Model NCE with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)

                for beginning in beginnings:
                    model.create_story(w2t=w2t, t2w=t2w, beginning=beginning)

    else:

        for look_back in [4, 8, 20]:

            current_message = '-' * 60 + '\n'
            current_message += 'Preparing data for look_back of %d' % look_back
            log_file.write(current_message)
            print(current_message)

            start_time = time.time()

            data_train, data_val, data_test, emb_matrix, w2t, t2w = process_data(
                look_back=look_back, debug=DEBUG)

            current_message = "Data took %.2f seconds to prepare." % (
                time.time() - start_time) + '\n'
            current_message += '\n' + '-' * 60 + '\n'
            log_file.write(current_message)
            print(current_message)

            #LSTM Euclid loss

            for nb_layers in [1, 2, 4]:

                start_time = time.time()

                tf.reset_default_graph()
                model = deep_LSTM_euclid.LSTMmodel(emb_matrix=emb_matrix,
                                                   look_back=look_back,
                                                   nb_layers=nb_layers,
                                                   log_file=log_file)
                model.build_graph()
                model.train(data_train,
                            data_val,
                            nb_train_steps=5,
                            folder_to_save='results/LSTM_euclid_layers_' +
                            str(nb_layers) + '_look_back_' + str(look_back))

                current_message = "Model Euclid with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)

                for beginning in beginnings:
                    model.create_story(emb_model=emb_model,
                                       w2t=w2t,
                                       t2w=t2w,
                                       beginning=beginning)

            #LSTM Cross Entropy loss

            for nb_layers in [1, 2, 4]:

                tf.reset_default_graph()
                model = deep_LSTM_cross_entropy.LSTMmodel(
                    emb_matrix=emb_matrix,
                    look_back=look_back,
                    nb_layers=nb_layers)
                model.build_graph()
                model.train(
                    data_train,
                    data_val,
                    nb_train_steps=5,
                    folder_to_save='results/LSTM_cross_entropy_layers_' +
                    str(nb_layers) + ' look_back_' + str(look_back))

                current_message = "Model Entropy with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)

                for beginning in beginnings:
                    model.create_story(w2t=w2t, t2w=t2w, beginning=beginning)

            #LSTM NCE loss

            for nb_layers in [1, 2, 4]:

                tf.reset_default_graph()
                model = deep_LSTM_nce.LSTMmodel(emb_matrix=emb_matrix,
                                                look_back=look_back,
                                                nb_layers=nb_layers)
                model.build_graph()
                model.train(data_train,
                            data_val,
                            nb_train_steps=5,
                            folder_to_save='results/LSTM_nce_layers_' +
                            str(nb_layers) + ' look_back_' + str(look_back))

                current_message = "Model NCE with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)

                for beginning in beginnings:
                    model.create_story(w2t=w2t, t2w=t2w, beginning=beginning)
Example #17
def main():
    model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
    model.build_graph()
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    train_model(model, batch_gen, NUM_TRAIN_STEPS, WEIGHTS_FLD)
Example #18
    result = pd.Series(window_steps)

    return result


#Test the function
if __name__ == "__main__":

    from process_data import process_data

    ped_data_file = r"C:\Users\dwubu\Documents\mhc\pedometer_walk_dir\ee621e22-c7c2-45dc-b22d-3a9c59fe6e78\2648577.0"
    walk_data_file = r"C:\Users\dwubu\Desktop\accel_walk_dir\ee621e22-c7c2-45dc-b22d-3a9c59fe6e78\2648617.0"

    ped_data = pd.read_json(ped_data_file)
    walk_data = pd.read_json(walk_data_file)
    walk_data_windows = process_data(500, 99, walk_data_file)

    def compare_ped_predictions(df, idx):
        '''
        A plotting function for comparing the pedometer predictions for a window
        and the accelerometry of the window
        '''
        import matplotlib.pyplot as plt
        plt.figure()
        plt.plot(df['xwindows'].iloc[idx], label='x')
        plt.plot(df['ywindows'].iloc[idx], label='y')
        plt.plot(df['zwindows'].iloc[idx], label='z')
        plt.legend(loc='upper left')
        plt.title('Prediction : {}'.format(df['steps'].iloc[idx]))

    compare_ped_predictions(walk_data_windows, 3)
Example #19
import pandas as pd
import numpy as np
from sklearn import preprocessing
import xgboost as xgb
from process_data import process_data	

df, df_features = process_data()

le = preprocessing.LabelEncoder()
target = le.fit_transform(df[df['train']]['OutcomeType'].values)
features = df_features[df['train']].values
dtrain = xgb.DMatrix(features, label=target)

params = {'objective':'multi:softprob', 'eta':0.75, 'bst:max_depth':2, 'num_class':len(np.unique(target))}
grad = xgb.train(params, dtrain)

dtest = xgb.DMatrix(df_features[df['train'] == False].values)
result = pd.DataFrame(grad.predict(dtest), columns=le.classes_)
result.index += 1
result.to_csv('xg_boost_out_params1.csv', index_label='ID')
Example #20
        'hidden_layer_sizes' : [(500,), (1000,)],
        'max_iter' : [400]
        }
}

##############################################
#              Training data                 #
##############################################

## Loading and processing data

# Merge to process
data_train = pd.read_csv('../data/train.csv', header=0)
data_test = pd.read_csv('../data/test.csv', header=0)
data_merge = pd.concat([data_train, data_test], keys=['train', 'test'])
dataset = process_data(data_merge)

# Extracting train test
X = dataset.loc['train'].drop(columns=['PassengerId', 'Survived'])
y = dataset.loc['train'].Survived

# Extracting data test
id = dataset.loc['test'].PassengerId
X_test = dataset.loc['test'].drop(columns=['PassengerId', 'Survived'])

# # Dividing in training and cross validation set
# X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3)

# Table of estimators
helper = EstimatorSelectionHelper(models, params)
helper.fit(X, y)
Example #21
import torch
import torch.optim as optim

# %matplotlib inline

# getting the data
from load_data import load_data
train_loader, batch_size, num_workers = load_data()

# visualize the data
from load_data import visualize_data
images, labels = visualize_data(train_loader)

# pre-processing the data
from process_data import process_data
scaled_img = process_data(images)

# define the model
from model import Discriminator, Generator

# define hyperparamaters
conv_dim = 32
z_size = 100

# define discriminator and generator
D = Discriminator(conv_dim)
G = Generator(z_size=z_size, conv_dim=conv_dim)

print(D)
print()
print(G)
Example #22
def main():

    batch_gen = process_data(vocabulary_size, batch_size, skip_window)
    word2vec(batch_gen)
Example #23
        centers, targets = next(batch_generator)
        batch = [centers, targets]
        # Update one step
        loss_batch, summary = model.step(batch, sess)
        # Summary this step
        writer.add_summary(summary, train_step)
        # Update loss
        total_loss += loss_batch

        # Print out the loss every few steps
        if (train_step + 1) % FLAGS.skip_every == 0:
            print("Average loss at step {}: {:5.1f}".format(
                train_step + 1, total_loss / FLAGS.skip_every))
            total_loss = 0.0

    # Save the session
    saver.save(sess,
               "checkpoints/step{}".format(initial_step +
                                           FLAGS.num_train_steps),
               global_step=initial_step + FLAGS.num_train_steps)


if __name__ == '__main__':
    FLAGS = train_flags()
    model = SkipGramModel(FLAGS)

    batch_generator = process_data(FLAGS.vocab_size, FLAGS.batch_size,
                                   FLAGS.skip_window)
    with tf.Session() as sess:
        train(sess, model, batch_generator, FLAGS.num_train_steps)
        sess.close()
Example #24
def helper(proc_info):
    all_proc,proc_num=proc_info
    cities = []
    #cities.append("Bitola")
    #cities.append("Skopje-Petrovec")
    #cities.append("New York")
    ##cities.append("Anchorage")
    ##cities.append("Sidney")
    cities.append("Buffalo")
    ##cities.append("Nairobi")
    ##cities.append("Singapore")
    #cities.append("Seattle")

    countries = []
    ##countries.append("GR")
    countries.append("NO")
    ##countries.append("CA")
    ar = 1
    if   ar == 0:
        print("Removing images, Please wait .. .  .   .    .     .")
        for cc in countries:
            cities, country = weather_codes.get_cities(cc)
            for city in cities:
                rm_img(city[0],country)
    elif ar == 1:
        c_c_r = 0
        if   c_c_r == 0:
            cities = weather_codes.get_cities(countries)
        elif c_c_r == 1:
            cities = weather_codes.get_cities(cities)
        elif c_c_r == 2:
            cities = weather_codes.get_cities(40)
        for c in cities:
            c[0] = c[0].replace('/',' ')
            #print(c)
        #print(cities)
        get_data_multithread.fetch_data_multithread(cities,1900,2016)
        
        dt = datetime.now()
        start=(dt.minute*60+dt.second)*1000000+dt.microsecond
        paralel_time=0
        main_time=0
        qlen=0
        q = queue.Queue()
        print("Number of cities: ",len(cities))
        
        for city in cities:
            print(city)
        #for city in cities:
        bot=int(len(cities)/all_proc*(proc_num-1))
        top=int(len(cities)/all_proc*(proc_num))
        for xxx in range(bot,top):
            city=cities[xxx]
            dt = datetime.now()
            ct=(dt.minute*60+dt.second)*1000000+dt.microsecond
            country = city[2]
            #print(city[0])
            table = []
            flag = True
            count_to_flag = 0
            year=2015
            while flag:
                url="http://www.wunderground.com/history/airport/"+city[1]+"/"+str(year)+"/1/1/CustomHistory.html?dayend=31&monthend=12&yearend="+str(year)+"&req_city=&req_state=&req_statename=&reqdb.zip=&reqdb.magic=&reqdb.wmo=&format=1"

                temp = process_data.process_data(url,year,city[0],country)
                table.append(temp)
                #print(table[0][0])
                #print(len(temp))
                if len(temp) < 1:
                    count_to_flag+=1
                else:
                    count_to_flag = 0
                if  count_to_flag > 20:
                    flag = False
                year-=1
            #print("TABLE: ",len(table))

            ilo=[[None for j in range(0,367)] for k in range(0,len(table)-21)]
            ihi=[[None for j in range(0,367)] for k in range(0,len(table)-21)]
            ipr=[[None for j in range(0,367)] for k in range(0,len(table)-21)]
            icc=[[None for j in range(0,367)] for k in range(0,len(table)-21)]
            
            for l in range(0,len(table)):
                for m in range(0,len(table[l])):
                    #for n in range(0,len(table[l][m])):
                    dy = int(table[l][m][len(table[l][m])-1])
                    if len(table[l][m][5]) >= 1: ilo[l][dy]=int(table[l][m][5])
                    if len(table[l][m][3]) >= 1: ihi[l][dy]=int(table[l][m][3])
                    if len(table[l][m][21]) >= 1:
                        try:
                            pr=float(table[l][m][21])
                            pr=float(math.log10((pr+1))*76.4)
                            ipr[l][dy]=int(pr)
                        except:
                            excflg = True
                    if len(table[l][m][22]) >= 1: icc[l][dy]=int(table[l][m][22])
            

            dt = datetime.now()
            ct=((dt.minute*60+dt.second)*1000000+dt.microsecond-ct)/1000
            print("Main time: ", ct)
            main_time+=ct
            ############################################
            
            ############################################
            qlen+=3
            ##
            t = threading.Thread(target=draw_env, args = (q,ihi,1,-25,55,"hi",city[0],country,3))
            t.daemon = True
            t.start()
            ##
            t = threading.Thread(target=draw_env, args = (q,ilo,1,-40,40,"lo",city[0],country,3))
            t.daemon = True
            t.start()
            ##
            min_val, max_val = minmax.min_max2(ipr,"pr")
            if min_val != max_val:
                t = threading.Thread(target=draw_env, args = (q,ipr,1,min_val,max_val,"pr",city[0],country,3))
                t.daemon = True
                t.start()
                qlen+=1
            ##
            t = threading.Thread(target=draw_env, args = (q,icc,1,0,8,"cc",city[0],country,4))
            t.daemon = True
            t.start()
            ##
            #print(qlen)
            ############################################

            ###########################################
        #print("::::::::::::::::::::::::::::::::::",qlen)
        for i in range(0,qlen):
            s = q.get()
            paralel_time+=s
            #print(i,". ",s)
        ############################################
         
        ############################################
        dt = datetime.now()
        fin=float((dt.minute*60+dt.second)*1000000+dt.microsecond-start)/1000000
        print("Real Time: ",fin," sec")
        print("Paralel Time: ",paralel_time/1000," sec")
        print("Main Time: ",main_time/1000," sec")
    ##    
    ##    for city in cities:
    ##        print(city)
    ##        for x in ("HIGH","LOW","PRECIPITATION","CLOUD COVER"):
    ##            
    ##            file_in = "C:/Python34/Scripts/WeatherData/ALL_IMG/"+x+"/"+str(city[2])+"_"+str(city[0])+".bmp"
    ##            if os.path.isfile(file_in):
    ##                im = Image.open(file_in)
    ##                #print(im.size)
    ##                directory = "C:/Python34/Scripts/WeatherData/ALL_IMG/PNG/"+x+"/"
    ##                file_out = "C:/Python34/Scripts/WeatherData/ALL_IMG/PNG/"+x+"/"+str(city[2])+"_"+str(city[0])+".png"
    ##                #if not os.path.isfile(file_out):
    ##                if not os.path.exists(directory):
    ##                    os.makedirs(directory)
    ##                del_f=True
    ##                try:
    ##                    im.save(file_out,"png")
    ##                    #im.close()
    ##                except Exception as e:
    ##                    print (file_in," failed to convert to png.",e)
    ##                    #print(file_in," failed to convert to png.")
    ##                    del_f=False
    ##                if del_f:
    ##                    try:
    ##                        os.remove(file_in)
    ##                    except:
    ##                        print(file_in," failed to delete.")
    ##                    
    ##        
    ##        
    return fin
Example #25
# set up connection to the database
# edit this when working on the server
db.set_up_connection(db.db, 'bence_test', create_tables=True)

# insert stations
stations_df = station_names.get_stations_dataframe()
db.insert_into_table(stations_df, 'Station')

# get daily measurement data
#userpath = os.path.dirname(os.path.realpath(__file__))
userpath = '/local/data_dwd/october2018'
#get_data.get_data(userpath, historical=True, recent=True,
#                  hourly=False, verbose=True)

# insert measurement data
with db.porm.db_session:
    print('inserting measurement data into the database...')
    for i, s_id in enumerate(stations_df.index):
        try:
            mes = process_data.process_data(userpath, s_id, 'daily')
        except BaseException as e:
            print('something went wrong processing station: {}'.format(s_id))
            print(e)
        else:
            if not mes.empty:
                db.insert_into_table(mes, 'DailyMeasurement', overwrite=True)
                print('{}: {}'.format(i, s_id))
            else:
                print('{}: {} was empty'.format(i, s_id))
Example #26
predictlist = []
# remove stop words
for p in predictlist1:
    predictlist.append(p)

word2idx = dict((w, i) for i, w in enumerate(vocab))
print(word2idx)

print("predict_txt:"), print(predict_text)
print(predictlist)

# x = [word2idx.get(w[0].lower(), 1) for w in predictlist]
# print("x:"),print(x)

# length = len(x)
# x = pad_sequences([x], maxlen)  # left padding

str, length = process_data.process_data(predictlist, vocab)

model1.load_weights('without_crf.h5')
raw = model1.predict(str)[0][-length:]

print("raw:"), print(raw)
result = [np.argmax(row) for row in raw]
result_tags = [chunk_tags[i] for i in result]

# print(result_tags)
for s, t in zip(predictlist, result_tags):
    print("(" + s + "," + t + ")")
Example #27
File: w2v.py Project: thinkwee/ML_Learn
def main():
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    print('Start Word2Vec')
    word2vec(batch_gen)
Example #28
import os

# Instantiating our Streamlit dashboard
st.title('Robot Dance')
st.header('Simulador COVID-19')

# File upload interface
st.write("Faça o upload dos arquivos para realizar a simulação.")

day_14 = st.file_uploader("14DayWindow", type="csv")
reduced_mobility = st.file_uploader("reduced_mobility", type="csv")
# file3 = st.file_uploader("file3", type="csv")
# file4 = st.file_uploader("file4", type="csv")

if (day_14 is not None):
    # Simulation running
    if (st.button('Rodar simulação!')):
        with st.spinner('Processando dados'):
            process_data(day_14, reduced_mobility)
        st.success("Concluído")

        my_bar = st.progress(0)

        imgs = [i for i in os.listdir() if (".png" in i)]
        aux = []
        for idx, path in enumerate(imgs):
            aux.append(mpimg.imread(path))
            my_bar.progress((idx + 1) / len(imgs))

        st.image(aux, use_column_width=True)
Example #29
def gen_stats(args):
    """
    Perform k-fold validation.

    :param args: args for gen_stats
    :type args: Namespace
    """
    # pull args out
    training_dir = args.training_dir
    train_file = args.train_file
    splits = args.splits
    epochs = args.epochs
    test_file = args.test_file
    hyp = args.hyp
    vocab = args.vocab

    # do some checks
    try:
        assert os.path.exists(training_dir), "training_dir must exist"
        assert os.path.exists(train_file), "train_file must exist"
        if test_file is not None:
            assert os.path.exists(test_file), "test_file must exist"
        assert hyp >= 0 and hyp <= 1, "hyp must be between 0 and 1"
        assert epochs > 0, 'epochs must be positive'
        assert os.path.exists(vocab), "vocab file must exist"
        assert splits > 0, "splits must be positive"
    except AssertionError as err:
        logger.error("Failed check: {}".format(err))
        return

    # get the data together
    train_data_process, train_labels_process = process_data.process_data(
        train_file, vocab)
    data_split, data_lables = process_data.gen_splits(splits,
                                                      train_data_process,
                                                      train_labels_process)

    if test_file is not None:
        test_data_process, test_labels_process = process_data.process_data(
            test_file, vocab)
        test_data_split, test_data_lables = process_data.gen_splits(
            splits, test_data_process, test_labels_process)

    # delete all the data in this directory
    common.clean_dir_dir(training_dir)

    accuracy_per_session = []

    # run the tests
    for split in range(0, splits):
        # pull out the test data for this session
        eval_data = data_split[split]
        eval_labels = data_lables[split]

        start = 0
        end = len(eval_data)
        # just so we aren't testing and validating on the same data
        test_data = eval_data[int(end / 2):end]
        test_labels = eval_labels[int(end / 2):end]
        eval_data = eval_data[start:int(end / 2)]
        eval_labels = eval_labels[start:int(end / 2)]

        # the rest is now training
        indicies = list(range(0, splits))
        # remove the test data
        del indicies[split]
        train_data = data_split[indicies[0]]
        train_labels = data_lables[indicies[0]]
        # remove the first one
        del indicies[0]
        for i in indicies:
            train_data = np.append(train_data, data_split[i], axis=0)
            train_labels = np.append(train_labels, data_lables[i], axis=0)

        # logger.debug(train_data)

        # make the checkpoint directory
        checkpoint_dir = common.grab_next_session(training_dir)

        # train the model
        logger.debug("TRINING")
        history, model_summary = rnn.train_and_validate(
            train_data, train_labels, eval_data, eval_labels, epochs,
            checkpoint_dir)

        # get the result
        logger.debug("EVAL")
        metrics = rnn.eval(test_data,
                           test_labels,
                           checkpoint_dir,
                           show_results=False)
        common.write_file(str(model_summary),
                          checkpoint_dir + "/MODEL_SUMMARY")
        accuracy_per_session.append(metrics[1])
        logger.debug("accuracy so far: {}".format(accuracy_per_session))
        common.plot_graphs_val(history, 'categorical_accuracy', checkpoint_dir)
        common.plot_graphs_val(history, 'loss', checkpoint_dir)

    t, s, avg = process_data.get_stats(accuracy_per_session, hyp)
    data = "list of accs: {}".format(accuracy_per_session)
    data += "\n t-value:{}, std:{}, avg:{}".format(t, s, avg)
    data += "\n hyp 0: {}".format(hyp)
    common.write_file(data, training_dir + "/SESSION_INFO")
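For comparison, a sketch of the same k-fold bookkeeping using scikit-learn's KFold (assuming the data and labels are NumPy arrays; this is not the project's code):

import numpy as np
from sklearn.model_selection import KFold

def kfold_partitions(data: np.ndarray, labels: np.ndarray, splits: int = 5):
    # Yield (train, eval, test) partitions analogous to the manual split above:
    # each held-out fold is halved into an eval portion and a test portion.
    kf = KFold(n_splits=splits, shuffle=True, random_state=0)
    for train_idx, held_out_idx in kf.split(data):
        half = len(held_out_idx) // 2
        eval_idx, test_idx = held_out_idx[:half], held_out_idx[half:]
        yield (data[train_idx], labels[train_idx],
               data[eval_idx], labels[eval_idx],
               data[test_idx], labels[test_idx])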
Example #30
def worker(data, i):
    if len(np.where(data['segment_label'] == 2)[0]) < 1000:
        return None

    true_energy_fitted_voxels = []
    charge_fitted_voxels = []
    fitted_energy_fitted_voxels = []
    fitted_energy_recorded_voxels = []
    charge_recorded_voxels = []
    true_energy_recorded_voxels = []
    true_energy_all_voxels = []

    pi0_cos = []  # cos theta of gamma pair
    gamma_sep = np.array([])  # minimum backward separation of gammas
    chosen_particles = []  # selected particle indices

    d, true_shower_hits = process_data(data)

    energies = model(
        (torch.Tensor(d[:, :4]).cuda(),
         torch.Tensor(d[:, 4:-1]).cuda())).detach().cpu().numpy().flatten()
    em_primaries = data['em_primaries']
    input_true = data['input_true']
    group_label = data['group_label']

    # assemble group labels to from primaries
    distances = distance_matrix(em_primaries[:, :3], d[:, :3])
    min_indices = np.argmin(distances, axis=1)
    primary_groups = true_shower_hits[min_indices]

    # determine shower directions
    gamma_dir, gamma_pca_data, gamma_pca_nhits = gamma_direction.do_calculation(
        d[:, :3], em_primaries, radius=16.0, eps=7.6, min_samples=5.8)
    if not len(gamma_dir) or not any(gamma_dir[:, 4]):
        print('gamma_dir failure')
        return None

    # pair up gamma candidates
    selected_showers, sep_matrix = gamma2_selection.do_iterative_selection(
        gamma_dir, maximum_sep=10.)
    if not len(selected_showers):
        print('gamma2 failure')
        return None
    gamma_sep = sep_matrix[np.triu_indices(len(selected_showers), k=1)]

    # calculate pi0 parameters for gamma pairs
    paired_gammas_mask = selected_showers[:, -1] != 0
    gamma_pairs = np.unique(selected_showers[paired_gammas_mask, -1])
    if not len(gamma_pairs) == 1:
        print('pairlen failure')
        return None
    vtx_data = np.empty((len(gamma_pairs), 5))

    particles = em_primaries[:, -1]

    # find shower hits
    #     fitted_shower_hits = []
    #     fitted_shower_primary_labels = []

    fitted_shower_hits = cone_clusterer.cluster(
        d[:, :3],
        em_primaries[:, :3],
        params=[50.0, 32.538603965969806, 4.920066409426372, 9.34588243103269],
        inclusive=True)

    #     labels, hits = cone_clusterer.cluster(d[:, :3], em_primaries[:, :3], params=[50.0, 32.538603965969806, 4.920066409426372, 9.34588243103269])
    #     print('EM primary labels', labels)
    #     print('EM primary groups', primary_groups)
    #     fitted_shower_hits.append(hits)
    #     fitted_shower_primary_labels.append(labels)
    #     if not len(fitted_shower_hits[-1]):
    #         print('cone failure')
    #         return None

    #     fitted_shower_hits.append(spectral_clusterer.cluster(d[:, :3], em_primaries[:, :3], params=[46.37086851922889, -1.5574991699405842, 0.7537768189993856, 0.9695745937212652]))
    #     fitted_shower_primary_labels.append(fitted_shower_hits[-1][min_indices])
    #     if not len(fitted_shower_hits[-1]):
    #         print('spectral failure')
    #         return None

    for idx, label in enumerate(gamma_pairs):
        gamma_label_mask = selected_showers[:, -1] == label
        gamma_pair = gamma_dir[gamma_label_mask]

        gamma0_idx = int(np.argwhere(gamma_label_mask)[0])
        true_gamma0_hits = np.where(
            true_shower_hits == primary_groups[gamma0_idx])
        gamma1_idx = int(np.argwhere(gamma_label_mask)[1])
        true_gamma1_hits = np.where(
            true_shower_hits == primary_groups[gamma1_idx])
        print('gamma indices', gamma0_idx, gamma1_idx)

        if len(fitted_shower_hits[gamma0_idx]) == 0 or len(
                fitted_shower_hits[gamma1_idx]) == 0:
            continue

        cos_val = np.dot(
            gamma_pair[0, -3:], gamma_pair[1, -3:]) / np.linalg.norm(
                gamma_pair[0, -3:]) / np.linalg.norm(gamma_pair[1, -3:])
        if cos_val > 1:
            cos_val = 1.0
        pi0_cos += [cos_val]

        true_energy_fitted_voxels += [[
            np.sum(d[fitted_shower_hits[gamma0_idx], -1]),
            np.sum(d[fitted_shower_hits[gamma1_idx], -1])
        ]]
        charge_fitted_voxels += [[
            np.sum(d[:, 4][fitted_shower_hits[gamma0_idx]]),
            np.sum(d[:, 4][fitted_shower_hits[gamma1_idx]])
        ]]
        fitted_energy_fitted_voxels += [[
            np.sum(energies[fitted_shower_hits[gamma0_idx]]),
            np.sum(energies[fitted_shower_hits[gamma1_idx]])
        ]]

        fitted_energy_recorded_voxels += [[
            np.sum(energies[true_gamma0_hits]),
            np.sum(energies[true_gamma1_hits])
        ]]
        charge_recorded_voxels += [[
            np.sum(d[true_gamma0_hits, 4]),
            np.sum(d[true_gamma1_hits, 4])
        ]]
        true_energy_recorded_voxels += [[
            np.sum(d[true_gamma0_hits, -1]),
            np.sum(d[true_gamma1_hits, -1])
        ]]
        true_energy_all_voxels += [[
            np.sum(input_true[np.where(
                group_label[:, -1] == primary_groups[gamma0_idx]), -1]),
            np.sum(input_true[np.where(
                group_label[:, -1] == primary_groups[gamma1_idx]), -1])
        ]]

        chosen_particles.append([particles[gamma0_idx], particles[gamma1_idx]])

    if len(true_energy_fitted_voxels) == 0:
        return None

    all_energies = [
        true_energy_fitted_voxels, charge_fitted_voxels,
        fitted_energy_fitted_voxels, fitted_energy_recorded_voxels,
        charge_recorded_voxels, true_energy_recorded_voxels,
        true_energy_all_voxels
    ]
    return (all_energies, np.array(pi0_cos),
            np.array(chosen_particles).astype(int), gamma_sep,
            gamma_dir[paired_gammas_mask,
                      -3:], gamma_pca_data[paired_gammas_mask, -1],
            gamma_pca_nhits[paired_gammas_mask, -1])
Example #31
def main():
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    word2vec(batch_gen)
Example #32
SKIP_WINDOW = 5  # the context window
NUM_SAMPLED = 10    # Number of negative examples to sample.
LEARNING_RATE = 0.1

NPY_FILENAME  = 'text17.npy'
MAX_NPY_WORDS = 188333610 - 100 # max words in textNN.npy

VOCAB_FILENAME = 'vocab_50k.tsv'
VOCAB_SEP      = '|'

NUM_TRAIN_STEPS = MAX_NPY_WORDS * 5
SKIP_STEP = 10 # how many steps to skip before reporting the loss


from process_data import process_data
make_batch_gen = lambda: process_data(NPY_FILENAME, VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW, MAX_NPY_WORDS)


class SkipGramModel:
    """ Build the graph for word2vec model """
    def __init__(self,
                 vocab_size,
                 embed_size,
                 batch_size,
                 num_sampled,
                 learning_rate):
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
Example #33
def main():
    model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
    model.build_graph()
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    train_model(model, batch_gen, NUM_TRAIN_STEPS, WEIGHTS_FLD)
Example #34
        textField = args.textField
        test_set_x = read_corpus(args.input, word_index, max_l, pad, textField=textField)
        test_set_y_pred = cnn.predict(test_set_x)
        test_model = theano.function([cnn.x], test_set_y_pred, allow_input_downcast=True)
        results = test_model(test_set_x)
        # invert indices (from process_data.py)
        labels = ['negative', 'positive', 'neutral']
        for line, y in zip(open(args.input), results):
            tokens = line.split("\t")
            tokens[tagField] = labels[y]
            print "\t".join(tokens),
        sys.exit()

    # training
    print "loading data...",
    sents, U, word_index, vocab = process_data(args.input, args.clean, args.vectors,
                                               args.tagField, args.textField)

    # sents is a list of entries, where each entry is a dict:
    # {"y": 0/1, "text": , "num_words": , "split": cv fold}
    # vocab: dict of word doc freq
    print "data loaded!"
    filter_hs = [int(x) for x in args.filters.split(',')]
    model = args.model
    if args.static:
        print "model architecture: CNN-static"
        non_static = False
    else:
        print "model architecture: CNN-non-static"
        non_static = True
    if args.vectors:
        print "using: word2vec vectors"
Example #35
    parser.add_argument('-tagField', type=int, default=1,
                        help='label field in files (default %(default)s)')
    parser.add_argument('-textField', type=int, default=2,
                        help='text field in files (default %(default)s)')

    return parser
    

if __name__=="__main__":

    parser = get_parser()
    args = parser.parse_args()    

    userField = 0
    # training
    sents, W, word_index, vocab, labels, max_l, U, user_idx = process_data(args.input, args.clean, args.vectors, None, args.tagField, args.textField, userField)    
    
    model = args.model
    if args.static:
        print "model architecture: CNN-static"
        non_static = False
    else:
        print "model architecture: CNN-non-static"
        non_static = True
    if args.vectors:
        print "using: word2vec vectors"
    else:
        print "using: random vectors"

    classes = set(x["y"] for x in sents)
    width = W.shape[1]
Example #36
import bilsm_crf_model
import process_data
import numpy as np
import re

model, (vocab, chunk_tags) = bilsm_crf_model.create_model(train=False)
predict_text = '针对一些在生活中孩子遇到的不常见字,为了方便阅读,我们都加以拼音标注,这样就克服了小朋友自助阅读的障碍,有利于他们快速正确的阅读'
# strip all punctuation from the input text (keep only Chinese characters)
predict_text = re.sub("[^\u4e00-\u9fa5]+", "", predict_text)
print(predict_text)
str, length = process_data.process_data(predict_text, vocab)
model.load_weights('model/crf.h5')
raw = model.predict(str)[0][-length:]
result = [np.argmax(row) for row in raw]
result_tags = [chunk_tags[i] for i in result]

per, loc, org = '', '', ''

for s, t in zip(predict_text, result_tags):
    if t == 1:
        per = per + s + ','
    elif t == 2:
        per = per + s + '。'
    elif t == 3:
        per = per + s + '?'
    else:
        per = per + s
    #
    #     per += ' ' + s if (t == 0) else s
    # if t in (2, 1):
    #     org += ' ' + s if (t == 2) else s
Example #37
def main():
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    word2vec(batch_gen)
Example #38
    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    submission.to_csv("output/{}_{}.csv".format(current_time, extension),
                      float_format='%.4f',
                      index=None)


train_data = process_data.read_train()
test_data_raw = process_data.read_test()

train_label = train_data["TARGET"]
train_data.drop("TARGET", axis=1, inplace=True)

process_start_time = time.time()
train_data, test_data, cate_feats = process_data.process_data(
    train_data, test_data_raw, always_label_encode=True)
util.print_time(time.time() - process_start_time)

# pp = pprint.PrettyPrinter(width=200, compact=True)
# pp.pprint(list(train_data))
# scores = cross_val_score(create_classifier(), train_data, train_label, cv=5, scoring='roc_auc')
# print(scores)

# print(train_data.isnull().sum())
print(train_data.shape)

fold_importance_df = pd.DataFrame()
fold_importance_df["feature"] = list(train_data)

skf = StratifiedKFold(n_splits=5, shuffle=True)
Example #39
from __future__ import print_function

from argparse import ArgumentParser

from get_data import get_data
from setup_data import setup_data
from process_data import process_data
from run_walsh_alg import run

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('-s', '--settings', help='Settings file')
    parser.add_argument('-r', '--remove', help='Remove', action='store_true')
    args = parser.parse_args()

    if not args.settings:
        import settings.default as settings
    else:
        raise Exception('not impl')

    if settings.GET_DATA:
        get_data(settings)
    if settings.SETUP_DATA:
        setup_data(settings)
    if settings.PROCESS_DATA:
        process_data(settings)
    if settings.RUN_WALSH:
        run()


Example #40
def refresh_data():
    # call shell script function
    run(['./get_data.sh'], stdout=PIPE, stderr=PIPE)
    process_data()
    build_simple_model_df()
    return render_template('index.html')