Example #1
File: kepler.py Project: mtctr/kepler
def download_data(kic):
    try:
        folder_path = utils.download_files(kic)
        utils.process_data(folder_path)
    except Exception as e:
        print(e)
        return e
Example #2
def get(kic):
    try:
        folder_path = utils.download_files(kic)
        utils.process_data(folder_path)
        data = read_csv(kic)
        return data
    except Exception as e:
        print(e)
        return e
Example #3
    def prepare_data(self):
        test, train, val = utils.load_test_train_val(self.data_num) # df

        train_texts = list(train.posts)

        glove = Glove()
        glove.create_custom_embedding([word for text in train_texts for word in text.split()])

        self.train_tuple = utils.process_data(train, glove, self.max_words, self.max_posts)
        self.test_tuple = utils.process_data(test, glove, self.max_words, self.max_posts)
        self.val_tuple = utils.process_data(val, glove, self.max_words, self.max_posts)
Example #4
def train_specialists(pretrain):
    for setting in SPECIALIST_SETTINGS:
        cols = setting["columns"]
        X, y = process_data(TRAIN_PATH, cols)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_PROP)

        if pretrain:
            model = model_from_json(open(MODEL_PATH).read())
            model.load_weights(WEIGHTS_PATH)
        else:
            model = build_model()

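        # Swap the final Dense layer for a new output head sized to this specialist's subset of columns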
        model.layers.pop()
        model.outputs = [model.layers[-1].output]
        model.layers[-1].outbound_nodes = []
        model.add(Dense(len(cols), name="dense_3"))

        flipgen = FlippedImageDataGenerator()
        flipgen.flip_idxs = setting["flip_idxs"]
        sgd = SGD(lr=0.08, decay=1e-4, momentum=0.9, nesterov=True)
        model.compile(loss="mse", optimizer=sgd)
        early_stop = EarlyStopping(monitor="val_loss", patience=100, mode="min")
        print("Training {}...".format(cols[0]))
        model.fit_generator(flipgen.flow(X_train, y_train),
                            samples_per_epoch=X_train.shape[0],
                            nb_epoch=1000,
                            validation_data=(X_val, y_val),
                            callbacks=[early_stop])

        model_path = "data/model_{}.json".format(cols[0])
        weights_path = "data/weights_{}.h5".format(cols[0])
        print("Saving model to ", model_path)
        print("Saving weights to ", weights_path)
        open(model_path, 'w').write(model.to_json())
        model.save_weights(weights_path, overwrite=True)
Example #5
def evaluate(model, data_source, cuda=args.cuda):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    model.criterion.loss_type = 'full'

    eval_loss = 0
    total_length = 0

    t = tt = 0.0

    with torch.no_grad():
        for data_batch in tqdm(data_source):
            data, target, length = process_data(data_batch,
                                                cuda=cuda,
                                                sep_target=sep_target)

            l1, l2 = model.forward_normalized(data, target, length)
            cur_length = int(length.data.sum())
            eval_loss += l1.sum().item()

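            # accumulate the first and second moments of exp(l2 - l1) for the mean/variance estimate returned below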
            t += torch.exp(l2 - l1).sum().item()
            tt += (torch.exp(l2 - l1)**2).sum().item()

            total_length += cur_length

    mean = (t / total_length)
    variance = tt / total_length - mean * mean

    model.criterion.loss_type = args.loss

    return math.exp(eval_loss / total_length), mean, variance
Example #6
File: main.py Project: cfh3c/Pytorch-NCE
def train(model, data_source, lr=1.0, weight_decay=1e-5, momentum=0.9):
    params = model.parameters()
    optimizer = optim.SGD(params=params,
                          lr=lr,
                          momentum=momentum,
                          weight_decay=weight_decay)
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    for num_batch, data_batch in enumerate(corpus.train):
        optimizer.zero_grad()
        data, target, length = process_data(data_batch, cuda=args.cuda)
        loss = model(data, target, length)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(params, args.clip)
        optimizer.step()

        total_loss += loss.data[0]

        if num_batch % args.log_interval == 0 and num_batch > 0:
            if args.prof:
                break
            cur_loss = total_loss / args.log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches'
                  ' | lr {:02.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(epoch, num_batch,
                                                      len(corpus.train), lr,
                                                      cur_loss,
                                                      math.exp(cur_loss)))
            total_loss = 0
            print('-' * 87)
Example #7
def train_model(pretrain):
    X, y = process_data(TRAIN_PATH)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_PROP)

    if pretrain:
        model = model_from_json(open(MODEL_PATH).read())
        model.load_weights(WEIGHTS_PATH)
    else:
        model = build_model()

    flipgen = FlippedImageDataGenerator()
    sgd = SGD(lr=0.08, decay=1e-4, momentum=0.9, nesterov=True)
    model.compile(loss="mse", optimizer=sgd)
    early_stop = EarlyStopping(monitor="val_loss", patience=100, mode="min")
    model.fit_generator(flipgen.flow(X_train, y_train),
                        samples_per_epoch=X_train.shape[0],
                        nb_epoch=5000,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stop])

    print("Saving model to ", MODEL_PATH)
    print("Saving weights to ", WEIGHTS_PATH)
    open(MODEL_PATH, 'w').write(model.to_json())
    model.save_weights(WEIGHTS_PATH, overwrite=True)

    mse = model.evaluate(X_val, y_val, batch_size=BATCH_SIZE)
    print("MSE: ", mse)
    print("RMSE: ", np.sqrt(mse)*IMG_SIZE)
Example #8
def main():
    file_name = 'data/processed_digits.csv'
    df = create_dataframe(file_name)
    X_train, y_train, X_valid, y_valid, X_test, y_test = process_data(df)

    DigitNN = DigitNeuralNetwork(epochs=100, batch_size=32)
    DigitNN.fit(X_train, y_train, X_valid, y_valid, X_test, y_test)
Example #9
def main():
    """
        Main function
    """
    # Read the dataset
    raw_data = pd.read_csv(dataset_path, usecols=sel_cols)
    # Inspect the dataset
    utils.insepct_data(raw_data)

    # Process the dataset
    proc_data = utils.process_data(raw_data)

    # Visualise loan amount by category
    utils.visualise_loan_amnt(proc_data, col_name='term', title='Loan term vs loan amount',
                              xlabel='Loan term', save_path='./output/term_amnt.png')
    utils.visualise_loan_amnt(proc_data, col_name='loan_status', title='Loan status vs loan amount',
                              xlabel='Loan status', save_path='./output/status_amnt.png')
    utils.visualise_loan_amnt(proc_data, col_name='purpose', title='Loan purpose vs loan amount',
                              xlabel='Loan purpose', save_path='./output/purpose_amnt.png')
    utils.visualise_loan_amnt(proc_data, col_name='addr_state', title='State vs loan amount',
                              xlabel='State', save_path='./output/state_amnt.png')

    # Visualise the share of each loan purpose
    utils.visualise_loan_purpose_percent(proc_data['purpose'], './output/purpose_percent.png')

    # Visualise relationships between variables
    utils.visualise_relation(proc_data, './output/var_relation.png')
Example #10
 def __getitem__(self, index):
     file = h5py.File(
         self.rootdir + "train" + str(index // self.filelen) + ".hdf5", "r")
     im_a, im_b, label = self.getimhdf5(file, index % self.filelen)
     if type(label) == np.uint8:
         label = np.expand_dims(label, -1)
     return process_data(im_a, im_b, label, self.preprocess)
Example #11
    def run_evaluate(self, fold, seed):
        test_ = process_data(self.test)
        feature_cols = [c for c in test_.columns if c not in ['sig_id']]
        x_test = test_[feature_cols].values
        testdataset = TestDataset(x_test)
        testloader = torch.utils.data.DataLoader(
            testdataset, batch_size=self.cfg.batch_size, shuffle=False)
        target_cols = self.target.drop('sig_id',
                                       axis=1).columns.values.tolist()

        model = Model_old(
            num_features=len(feature_cols),
            num_targets=len(target_cols),
            hidden_size=self.cfg.hidden_size,
        )
        """ model.load_state_dict(torch.load(os.path.join(
            self.load_path, f"seed{seed}", f"FOLD{fold}_.pth"), map_location=torch.device(self.cfg.device))) """
        model.load_state_dict(
            torch.load(os.path.join(self.load_path,
                                    f"SEED{seed}_FOLD{fold}_scored.pth"),
                       map_location=torch.device(self.cfg.device)))
        model.to(self.cfg.device)

        predictions = np.zeros((len(test_), self.target.iloc[:, 1:].shape[1]))
        predictions = inference_fn(model, testloader, self.cfg.device)

        return predictions
Example #12
def train(model, data_source, epoch, lr=1.0, weight_decay=1e-5, momentum=0.9):
    optimizer = optim.SGD(params=model.parameters(),
                          lr=lr,
                          momentum=momentum,
                          weight_decay=weight_decay)
    # Turn on training mode which enables dropout.
    model.train()
    model.criterion.loss_type = args.loss
    total_loss = 0
    pbar = tqdm(data_source, desc='Training PPL: ....')
    for num_batch, data_batch in enumerate(pbar):
        optimizer.zero_grad()
        data, target, length = process_data(data_batch,
                                            cuda=args.cuda,
                                            sep_target=sep_target)
        loss = model(data, target, length)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += loss.item()

        if args.prof:
            break
        if num_batch % args.log_interval == 0 and num_batch > 0:
            cur_loss = total_loss / args.log_interval
            ppl = math.exp(cur_loss)
            logger.debug('| epoch {:3d} | {:5d}/{:5d} batches '
                         '| lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                             epoch, num_batch, len(corpus.train), lr, cur_loss,
                             ppl))
            pbar.set_description('Training PPL %.1f' % ppl)
            total_loss = 0
Example #13
def train(ENV, args):
    processed_train_data_path = os.path.join(ENV.processed_data_path, 'processed_train.pkl')
    processed_test_data_path = os.path.join(ENV.processed_data_path, 'processed_test.pkl')
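    # Reuse cached pickles when they exist; otherwise process the raw wav/phn data and cache it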
    if os.path.exists(processed_train_data_path) and os.path.exists(processed_test_data_path):
        processed_train_data = pickle.load(open(processed_train_data_path, 'r'))
        processed_test_data = pickle.load(open(processed_test_data_path, 'r'))
    else:
        train_wav_files, train_phn_files = load_data(ENV.train_data)
        print('Process train data...')
        processed_train_data = process_data(train_wav_files, train_phn_files)
        test_wav_files, test_phn_files = load_data(ENV.test_data)
        print('Process test data...')
        processed_test_data = process_data(test_wav_files, test_phn_files)
        pickle.dump(processed_train_data, open(processed_train_data_path, 'w'))
        pickle.dump(processed_test_data, open(processed_test_data_path, 'w'))
    # print(processed_train_data[0][1])
    print("Define graph...")
    train_model(ENV, processed_train_data, processed_test_data)
Example #14
 def __getitem__(self, index):
     file_index = str(index // 100000)
     if file_index != self.current_index:
         self.current_file.close()
         self.current_index = file_index
         self.current_file = tables.open_file(self.rootdir + "train" + file_index + ".hdf5", driver="H5FD_CORE")
     im_a, im_b, label = self.getimhdf5(index % 100000)
     if type(label) == np.uint8:
         label = np.expand_dims(label, -1)
     return process_data(im_a, im_b, label, self.preprocess)
Example #15
def main(unused_argv):
    # Load data
    data, _ = process_data(path_to_data=DATA_PATH, vocabulary=vocabulary)
    train_data, train_labels, validation_data, validation_labels = split_data(
        data, seq_size)

    # Create the Estimator
    classifier = tf.estimator.Estimator(model_fn=char_rnn_model_fn,
                                        model_dir=MODEL_DIR)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={'x': train_data},
                                                        y=train_labels,
                                                        batch_size=batch_size,
                                                        num_epochs=None,
                                                        shuffle=True)

    # Test the model and print results
    validate_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'x': validation_data},
        y=validation_labels,
        num_epochs=1,
        shuffle=False)

    best_model_path = None
    best_loss = 100.0
    degradation_block_cnt = 0
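    # Train in blocks of 100 steps, exporting a SavedModel whenever validation loss improves
    # and stopping once the loss has degraded for EARLY_STOPPING_THRESHOLD consecutive blocks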
    for _ in range(20):
        classifier.train(input_fn=train_input_fn, steps=100)

        intermediate_results = classifier.evaluate(input_fn=validate_input_fn)
        current_loss = intermediate_results['loss']
        if current_loss >= best_loss:
            degradation_block_cnt += 1
            print(
                '\nDegradation detected: last {} blocks loss increases. Best: {}, current: {}\n'
                .format(degradation_block_cnt, best_loss, current_loss))
        else:
            best_loss = current_loss
            print('\nLoss decreases: now best is {}\n'.format(best_loss))
            degradation_block_cnt = 0
            best_model_path = classifier.export_savedmodel(
                MODEL_DIR, serving_input_receiver_fn=serving_input_receiver_fn)
        if degradation_block_cnt >= EARLY_STOPPING_THRESHOLD:
            print(
                '\nEarly stopped because degradation block count exceeded threshold. Best model has loss {} and is located under {}\n'
                .format(best_loss, best_model_path))
            break

    final_results = generate_with_model_located_in(best_model_path)
    print('\nBest model located under {}. Generated text: \n {}'.format(
        best_model_path, final_results))
Example #16
def main():
    """Main block of code. Reads the data, constructs the tokeniser and trains the model"""
    args = parse_cli_args()
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TF warnings
    X, y = load_data(
        dataset_path=args.data_path,
        feature_field=args.features_field,
        target_field=args.target_field,
    )

    y = np.array(y)

    print("Dataset loaded {0} examples".format(len(X)))
    model_config = yaml.safe_load(open(args.model_config, "r"))
    X_wide, X_deep = process_data(text_feature=X,
                                  vec_path=args.vectoriser_path,
                                  vocab_size=model_config["vocab_size"])
    X_wide_train, X_wide_test, X_deep_train, X_deep_test, y_train, y_test =\
        train_test_split(X_wide, X_deep, y, test_size=0.2)

    print("Train data contains", y_train.shape[0],
          "examples and test data contains", y_test.shape[0], "examples")

    print("Constructing Keras model")
    model = get_wide_deep_model(
        num_wide_features=X_wide.shape[1],
        num_deep_features=X_deep_train.shape[1],
        **model_config,
    )

    print("Training...")
    model.fit(
        x=[X_wide_train, X_deep_train],
        y=y_train,
        epochs=model_config["epochs"],
        batch_size=model_config["batch_size"],
        verbose=1,
    )

    print("Evaluating...")
    mse = model.evaluate(x=[X_wide_test, X_deep_test],
                         y=y_test,
                         batch_size=model_config["batch_size"],
                         verbose=1)
    print("Evaluation MSE:", mse)

    print("Saving ML model")
    model.save_weights(args.model_path)
Example #17
File: main.py Project: cfh3c/Pytorch-NCE
def evaluate(model, data_source, cuda=args.cuda):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    eval_loss = 0
    total_length = 0

    data_source.batch_size = eval_batch_size
    for data_batch in data_source:
        data, target, length = process_data(data_batch, cuda=cuda, eval=True)

        loss = model(data, target, length)
        cur_length = length.sum()
        eval_loss += loss.data[0] * cur_length
        total_length += cur_length

    return math.exp(eval_loss / total_length)
Example #18
def generate_with_model_located_in(dir,
                                   init_seq='Разрешите мне присесть?',
                                   count=100):
    vocabulary = '\n !"(),-.0123456789:;?NАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШЩЬЭЯабвгдежзийклмнопрстуфхцчшщъыьэюя'
    text_encoded, int_to_vocab = process_data(vocabulary=vocabulary,
                                              content=init_seq)
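    # Reload the SavedModel each step and sample the next character from its predicted distribution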
    for _ in range(count):
        generate_fn = predictor.from_saved_model(dir)
        answer = generate_fn({'x': [text_encoded]})
        # symbol_code = np.argmax(answer['probabilities'][0])
        symbol_code = pick_top_n(answer['probabilities'][0], len(vocabulary))
        text_encoded = np.append(text_encoded, symbol_code)

    text = '\n===\n'
    for code in text_encoded:
        text += int_to_vocab[code]
    return text
Example #19
def evaluate(model, data_source, cuda=args.cuda):
    # Turn on evaluation mode which disables dropout.
    model.eval()

    # GRU does not support ce mode right now
    eval_loss = 0
    total_length = 0

    with torch.no_grad():
        for data_batch in data_source:
            data, target, length = process_data(data_batch, cuda=cuda, sep_target=sep_target)

            loss = model(data, target, length)
            cur_length = length.sum().item()
            eval_loss += loss.data.item() * cur_length
            total_length += cur_length

    return math.exp(eval_loss/total_length)
Example #20
def main():
    pwd = os.getcwd() # current working directory
    file_name = '0930-2_NOK_20200929114544.csv' # data file name

    """Build the dataset"""
    feature = utils.process_data(file_name)
    feature = torch.tensor(feature, dtype=torch.float32)
    """Load the model"""
    model_load = utils.SimpleNet()
    checkpoint = torch.load(pwd +'\model_save\model.pth.tar') # load the trained model
    model_load.load_state_dict(checkpoint['state_dict'])
    outputs = model_load(feature.reshape(1, 12, 8, 8))

    predict = torch.max(outputs, dim=1)[1]
    if predict == 0:
        print("Riveting result: NOK")
    else:
        print("Riveting result: OK")
Example #21
def train(model, data_source, epoch, lr=1.0, weight_decay=1e-5, momentum=0.9):
    # Turn on training mode which enables dropout.
    model.train()
    model.criterion.loss_type = args.loss
    total_loss = 0.0
    total_real_loss = 0.0
    pbar = tqdm(data_source, desc='Training PPL: ....')
    #    pbar = data_source
    total_num_words = 0.0
    for num_batch, data_batch in enumerate(pbar):
        progress = num_batch / len(pbar) + epoch - 1
        optimizer.zero_grad()
        data, target, length = process_data(data_batch,
                                            cuda=args.cuda,
                                            sep_target=sep_target)
        total_num_words += length.sum().item()
        loss, real_loss = model(data, target, length)  # / total_num_words
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.item()
        total_real_loss += real_loss.item()

        if args.prof:
            break
        if num_batch % args.log_interval == 0 and num_batch > 0:
            cur_loss = total_loss / total_num_words
            cur_real_loss = total_real_loss / total_num_words
            ppl = 100000
            if True or cur_real_loss < math.log(ppl):
                ppl = math.exp(cur_real_loss)
            logger.debug('| epoch {:3d} | {:5d}/{:5d} batches '
                         '| lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                             epoch, num_batch, len(corpus.train), lr, cur_loss,
                             ppl))
            info_str = ('Training loss %.4f, PPL %.4f' % (cur_loss, ppl))
            #            print('Progress %.4f, Training loss %.4f, PPL %.4f' % (progress, cur_loss, ppl))
            pbar.set_description(info_str)
            total_loss = 0.0
            total_real_loss = 0.0
            total_num_words = 0.0
Example #22
def main(argv):
    data = nlp.load_dataset("tiny_shakespeare")
    train_data = data["train"][0]["text"]
    valid_data = data["test"][0]["text"]

    tokenize = Tokenizer()
    vocabulary = Vocab()
    train_data, valid_data, vocab_size = process_data(train_data, valid_data,
                                                      tokenize, vocabulary,
                                                      FLAGS.batch_size)

    charnn = model.create_model(
        seed=FLAGS.seed,
        batch_size=FLAGS.batch_size,
        seq_len=FLAGS.batch_size,
        model_kwargs=dict(
            vocab_size=vocab_size,
            embedding_size=FLAGS.embedding_size,
            hidden_size=FLAGS.hidden_size,
            output_size=vocab_size,
        ),
    )

    trained_model = train_model(
        model=charnn,
        learning_rate=FLAGS.learning_rate,
        num_epochs=FLAGS.num_epochs,
        seed=FLAGS.seed,
        train_data=train_data,
        valid_data=valid_data,
        batch_size=FLAGS.batch_size,
    )

    generated_text = generate_text(
        trained_model,
        vocabulary,
        max_length=100,
        temperature=0.8,
        top_k=3,
        start_letter="T",
    )
    print("Hello Shakespeare: ", generated_text)
Example #23
def data_generator(batch_size, seed):
    # Our dataset is small, we can pack it as numpy, then load all data into memory
    # Line, Cond(label), Shade
    x_data, c_data, y_data = load_data('./data.npy')
    print('Load {} data pairs'.format(len(x_data)))

    counts = 0
    while True:
        np.random.seed(seed + counts)
        idx = np.random.randint(0, x_data.shape[0], batch_size)

        x_batch, c_batch, p_batch, y_batch = process_data(x_data[idx],
                                                          c_data[idx],
                                                          y_data[idx],
                                                          seed=(seed + counts))

        counts += batch_size

        # Line, Cond(label), Pos, Shade
        yield x_batch, c_batch, p_batch, y_batch
Example #24
def evaluate(model, data_source, cuda=args.cuda):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    model.criterion.loss_type = 'full'

    eval_loss = 0
    total_length = 0

    with torch.no_grad():
        for data_batch in data_source:
            data, target, length = process_data(data_batch, cuda=cuda, sep_target=sep_target)

            loss = model(data, target, length)
            cur_length = int(length.data.sum())
            eval_loss += loss.item() * cur_length
            total_length += cur_length

    model.criterion.loss_type = args.loss

    return math.exp(eval_loss/total_length)
Example #25
def main():
    data_train, data_val, data_test, char_to_index, index_to_char = process_data(
        look_back=30, batch_size=1024, split=[0.7, 0.2, 0.1], debug=DEBUG)

    vocab_size = len(char_to_index)

    model = LSTMModel(vocab_size,
                      look_back=30,
                      hidden_dim=400,
                      batch_size=1024,
                      lr=1,
                      nb_layers=3)

    model.build_graph()

    model.train(data_train, 1)

    model.create_story(
        index_to_char, char_to_index,
        "how are you my pretty Baobei, are you having a good day?")
Example #26
def train_model(pretrain):
    X, y = process_data(TRAIN_PATH)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_PROP)

    if pretrain:
        model = model_from_json(open(MODEL_PATH).read())
        model.load_weights(WEIGHTS_PATH)
    else:
        model = build_model()

    model.compile(loss="categorical_crossentropy", optimizer="adadelta", metrics=["accuracy"])
    early_stop = EarlyStopping(monitor="val_loss", patience=8, mode="min")
    model.fit(X, y, batch_size=BATCH_SIZE, nb_epoch=100, 
        validation_data=(X_val, y_val), callbacks=[early_stop])

    print("Saving model to ", MODEL_PATH)
    print("Saving weights to ", WEIGHTS_PATH)
    open(MODEL_PATH, "w").write(model.to_json())
    model.save_weights(WEIGHTS_PATH, overwrite=True)

    accuracy = model.evaluate(X_val, y_val, batch_size=BATCH_SIZE)
    print("Accuracy: ", accuracy)
Example #27
def plot_prediction(model, sensorId, startDate, endDate, mins_max):
    test_rows = generate_test_rows()
    test_processed_data, _ = process_data(test_rows)
    train_labels = model.predict(test_processed_data).flatten()
    days_of_week = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']
    x = np.empty(len(train_labels))
    for i in range(len(train_labels)):
        x[i] = i
    y = train_labels
    frequency = 96
    plt.ylabel('Total volume')
    plt.xlabel('Days of week')
    plt.xticks(x[48::frequency], days_of_week)
    # plt.yticks(np.arange(y.min(), y.max(), 0.005))
    plt.plot(x, y)
    plt.plot(x, [el[1] for el in mins_max])
    plt.plot(x, [el[2] for el in mins_max])
    plt.grid(axis='y', linestyle='-')
    plt.title('Sensor id {}, time period: {} to {}'.format(str(sensorId), '2016-04-11', '2016-04-17'))
    plt.savefig('one_week_volume_9005.png')
    plt.show()
    pass
Example #28
def main():
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TF warnings
    print("Loading data...")
    X, y = load_wine_data(DATA_PATH, "points")
    random_idx = (np.random.rand(5) * len(X)).astype(int)
    X = [X[idx] for idx in random_idx]
    y = [y[idx] for idx in random_idx]

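    # Vectorise the sampled reviews with each saved vectoriser, yielding a (wide, deep) feature pair per model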
    X_wide_deep = [
        process_data(X, count_vec=pickle.load(open(vectoriser, "rb")))
        for vectoriser in VEC_PATH
    ]

    print("Constructing Keras models...")
    prediction_models = [
        get_wide_deep_model(
            num_wide_features=X[0].shape[1],
            num_deep_features=X[1].shape[1],
            **yaml.safe_load(open(model_conf, "r")),
        ) for X, model_conf in zip(X_wide_deep, MODEL_CONFIG)
    ]

    for weights, model in zip(MODEL_PATH, prediction_models):
        model.load_weights(weights)

    print("Predicting...")
    predictions = [
            model.predict([X[0], X[1]], verbose=0)\
                for X, model in zip(X_wide_deep, prediction_models)
    ]

    for pred_idx, description, target in zip(range(len(X)), X, y):
        print("=" * 100)
        print("Wine review:\n", description)
        print("Reviewer score:", target)
        for model_idx in range(len(prediction_models)):
            print("Model", model_idx, "prediction:",
                  predictions[model_idx][pred_idx])
Example #29
def main():
    """Main block of code. Loads the data, model and vectoriser and shows a demo"""
    args = parse_cli_args()
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TF warnings

    print("Loading data...")
    X, y = load_data(
        dataset_path=args.data_path,
        feature_field=args.features_field,
        target_field=args.target_field,
    )
    # Choose five random examples to show
    random_idx = (np.random.rand(5) * len(X)).astype(int)
    X = [X[idx] for idx in random_idx]
    y = [y[idx] for idx in random_idx]

    X_wide_deep = process_data(
        text_feature=X,
        count_vec=pickle.load(open(args.vectoriser_path, "rb")),
    )

    print("Constructing Keras model")
    prediction_model = get_wide_deep_model(
        num_wide_features=X_wide_deep[0].shape[1],
        num_deep_features=X_wide_deep[1].shape[1],
        **yaml.safe_load(open(args.model_config, "r")),
    )
    prediction_model.load_weights(args.model_path)
    print("Predicting...")
    predictions = prediction_model.predict([X_wide_deep[0], X_wide_deep[1]],
                                           verbose=0)

    for prediction, text, target in zip(predictions, X, y):
        print("=" * 100)
        print("Text:\n", text)
        print("Target:", target)
        print("Model's prediction:", prediction)
Example #30
def preprocess():
    df = pd.read_csv("input/lish-moa/train_features.csv")
    df = utils.process_data(df)
    folds = pd.read_csv("input/folds/train_folds.csv")

    # Create aux target
    # `nsc_labels` means # of labels found in non-scored train set
    non_scored_df = pd.read_csv("input/lish-moa/train_targets_nonscored.csv")
    targets_non_scored = non_scored_df.drop("sig_id", axis=1).to_numpy().sum(axis=1)
    non_scored_df.loc[:, "nsc_labels"] = targets_non_scored
    drop_cols = [c for c in non_scored_df.columns if c not in ("nsc_labels", "sig_id")]
    non_scored_df = non_scored_df.drop(drop_cols, axis=1)
    folds = folds.merge(non_scored_df, on="sig_id", how="left")

    targets = folds.drop(["sig_id", "kfold"], axis=1).columns
    features = df.drop("sig_id", axis=1).columns
    df = df.merge(folds, on="sig_id", how="left")
    df.to_csv("input/folds/train.csv", index=False)

    # Serialize column names
    with open("input/folds/targets", "w") as f:
        f.write("\n".join(targets))
    with open("input/folds/features", "w") as f:
        f.write("\n".join(features))
Example #31
    grid_search.best_params_
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        logger.info(
            "random forest with grid search gave score \n %s for parameters %s"
            % (str(np.sqrt(-mean_score)), str(params))
        )

    feature_importances = grid_search.best_estimator_.feature_importances_
    sorted(zip(feature_importances, X.columns), reverse=True)

    final_model = grid_search.best_estimator_
    joblib.dump(
        final_model, os.path.join(MODEL_PATH, "random_forest_grid_search.pkl")
    )


if __name__ == "__main__":

    housing_prepared, housing_labels = process_data(is_train=True)

    logger = create_logger(LOGGING_PATH, "train.log")

    os.makedirs(MODEL_PATH, exist_ok=True)

    train_linear_regression(housing_prepared, housing_labels, logger)
    train_decision_trees(housing_prepared, housing_labels, logger)
    train_RFR_random_search(housing_prepared, housing_labels, logger)
    train_RFR_grid_search(housing_prepared, housing_labels, logger)
Example #32
            doc_text = doc_dict[doc_id]
            top_doc_word_list+=doc_text

        # find the most frequent words
        freq_dist = nltk.FreqDist(word for word in top_doc_word_list)
        best_words = freq_dist.keys()[:num_words]

        # add to the query
        new_query = query_text + best_words

        # recalculate tfidf score and add to score dictionary
        for doc_id, tfidf_score in calculate_tfidf(new_query, doc_dict, average_doc_length, k):
            score_dict[query_id, doc_id] = tfidf_score

    return score_dict


if __name__ == "__main__":
    query_dict = utils.process_data('data/qrys.txt')
    doc_dict = utils.process_data('data/docs.txt')

    standard_tfidf_scores = standard_tfidf(query_dict, doc_dict)

    with open('results/tfidf.top', 'w') as output_file:
        output_file = utils.write_result(standard_tfidf_scores, output_file)

    tfidf_with_prf_scores = tfidf_pseudo_relevance_feedback(query_dict, doc_dict)

    with open('results/best.top', 'w') as output_file:
        output_file = utils.write_result(tfidf_with_prf_scores, output_file)
Example #33
    if args.DB_NAME == "dbpedia":
        print("training model on dbpedia")
        DB_START, DB_END = [1, 141], [101, 166]
        base = 25
        skip_num = 40
        db_base = 0
    elif args.DB_NAME == "lmdb":
        print("training model on lmdb")
        DB_START, DB_END = [101, 166], [141, 176]
        base = 10
        skip_num = 25
        db_base = 100
    DB_DIR = path.join(DATADIR, args.DB_NAME)

    # load data
    data, _, label, _, _ = utils.process_data(args.DB_NAME, DB_START, DB_END,
                                              args.top_n, args.file_n)
    entity2vec, pred2vec, entity2ix, pred2ix = utils.load_transE(args.DB_NAME)
    pred2ix_size = len(pred2ix)
    hidden_size = args.transE_dim + args.pred_embedding_dim

    # train
    ## cuda
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("cuda or cpu: {}".format(device))

    ## loss function
    if args.loss_function == "BCE":
        criterion = torch.nn.BCELoss()
    elif args.loss_function == "MSE":
        criterion = torch.nn.MSELoss()
    else:
Example #34
    specialists = OrderedDict()
    for setting in SPECIALIST_SETTINGS:
        cols = setting["columns"]
        model_path = "data/model_{}.json".format(cols[0])
        model = model_from_json(open(model_path).read())
        weights_path = "data/weights_{}.h5".format(cols[0])
        model.load_weights(weights_path)
        specialists[cols] = model
    return specialists




if __name__ == "__main__":
    lookup, feature_index = parse_lookup_table(LOOKUP_PATH)
    X = process_data(TEST_PATH, mode="TEST")
    specialists = load_specialists()
    predictions = {}

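    # Each specialist model predicts its own column subset; outputs are rescaled from [-1, 1] back to image coordinates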
    for cols, model in specialists.items():
        spec_predictions = model.predict(X, batch_size=BATCH_SIZE)
        spec_predictions *= IMG_SIZE // 2
        spec_predictions += IMG_SIZE // 2
        for i, col in enumerate(cols):
            predictions[col] = spec_predictions[:,i]

    submission_values = []
    for i in range(len(lookup)):
        img_id = lookup["ImageId"][i] - 1
        feature = lookup["FeatureName"][i]
        submission_values.append(predictions[feature][img_id])