Example No. 1
def is_continuous_data(code, code_basic, last_exchange_day):
    print('entering is_continuous_data', code, code_basic['name'])
    isTrue = False
    try:
        data = pd.read_csv(data_const.My_Database_Dir + code + data_const.Suffix)
        date_index_list = [str(u_date) for u_date in data['date']]
        last_index = len(date_index_list) - 1
        last_date = date_index_list[last_index]
        if len(data) != len(data.dropna()):
            raise RuntimeError('data has NaN values or rows that should be dropped')
        if getYesterdayDate() in date_index_list:
            isTrue = True
            data = prep_d.preprocess_data(data, code_basic)
        else:
            last_date_tomorrow = getTomorrowDate(last_date)
            if len(ts.get_k_data(code, last_date_tomorrow, getYesterdayDate())) == 0:
                isTrue = True  # no rows between the two dates, so the data is continuous
                data = prep_d.preprocess_data(data, code_basic)
    except Exception as e:
        print(e, code, 'something wrong in is_continuous_data')
        print('leaving is_continuous_data', isTrue, 0, None)
        return isTrue, 0, None
    if data.loc[last_index]['date'] != last_exchange_day:
        last_index += 1
    print('leaving is_continuous_data', isTrue, last_index)
    return isTrue, last_index, data
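
The helpers getYesterdayDate and getTomorrowDate are defined elsewhere in this project and are not shown here. A minimal sketch of what they might look like, assuming dates are passed around as 'YYYY-MM-DD' strings, is:

from datetime import datetime, timedelta

def getYesterdayDate():
    # yesterday's calendar date as a 'YYYY-MM-DD' string (assumed format)
    return (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')

def getTomorrowDate(date_str):
    # the day after the given 'YYYY-MM-DD' date string
    day = datetime.strptime(date_str, '%Y-%m-%d')
    return (day + timedelta(days=1)).strftime('%Y-%m-%d')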
Example No. 2
def update_all():
    #last_exchange_day = getDatetimeToday().strftime("%Y-%m-%d")
    last_exchange_day = get_last_exchange_day()
    basic_env = pd.read_csv(data_const.My_Store_Dir + 'basic.csv', index_col='code')
    realtime_data = pd.read_csv(data_const.My_Store_Dir + 'realtime_data.csv')
    realtime_data = realtime_data.drop_duplicates(['code'])  # removing duplicates is essential
    realtime_data = realtime_data.set_index('code')
    # use the continuity of 601988.csv (Bank of China) as a proxy for the continuity of the whole csv database
    # new_code_exist_force forces a refresh of all data
    new_code_exist_force = True  # force update
    is_sh_continuous, _, _ = is_continuous_data('601988', basic_env.loc[601988], last_exchange_day)
    if (not new_code_exist_force
            and last_exchange_day != getDatetimeToday().strftime("%Y-%m-%d")
            and last_exchange_day != getDatetimeYesterday().strftime("%Y-%m-%d")
            and is_sh_continuous):
        print('neither today nor yesterday is a trading day and a forced full update is not required, so nothing needs updating')
        return realtime_data
    if not realtime_data.index.is_unique:
        print('duplicate index exception')
        raise RuntimeError('realtime_data index is not unique')
    i = 1
    codes = data_const.Whole_codes
    for code in codes:
        print(i, '/', len(codes), 'update_data', code)
        i += 1
        try:
            code_basic = basic_env.loc[int(code)]
            code_rt_dt = realtime_data.loc[int(code)]
        except Exception:
            print('code_basic = basic_env.loc[int(code)] error')
            continue
        if code_rt_dt['high'].item() == code_rt_dt['low'].item() and code_rt_dt['open'].item() == 0:  # handle suspended stocks
            try:
                data = pd.read_csv(data_const.My_Database_Dir + code + data_const.Suffix)
                data = prep_d.preprocess_data(data, code_basic)
                wanted = pd.DataFrame(data[data_const.Feature])
                wanted.to_csv(data_const.My_Database_Dir + code + data_const.Suffix, index=False)
            except Exception as e:
                print(e, code)
            continue
        isTrue, last_index, data = is_continuous_data(code, code_basic, last_exchange_day)
        Force_all_renew = True
        if isTrue and not Force_all_renew:
            try:
                #print('new_row')
                new_row = idx.new_data_line(last_exchange_day, data.loc[last_index - 1], code_rt_dt)
                data.loc[last_index - 1, 'review_pc'] = code_rt_dt['changepercent']
            except Exception:
                print('except in new_row')
                new_row = idx.new_data_line(last_exchange_day, data.loc[last_index], code_rt_dt)
                data.loc[last_index, 'review_pc'] = code_rt_dt['changepercent']
            data = data[data_const.Feature]
            data.loc[last_index] = new_row
            #print(data.tail()[['review_pc', 'date', 'p_change']])
        else:
            #data = ts.get_hist_data(code).sort_index().reset_index()  # some stocks occasionally come back with missing rows this way
            print('is_continuous_data error, falling back to data = ts.get_k_data(code):', code)
            data = ts.get_k_data(code)
            data = prep_d.preprocess_data(data, code_basic)
        wanted = pd.DataFrame(data[data_const.Feature])
        wanted.to_csv(data_const.My_Database_Dir + code + data_const.Suffix, index=False)
    return realtime_data
Example No. 3
def main():
    test_df, train_df = load_data()

    train_x, train_y, indices_for_masking, pca_instance, scaler_instance = preprocess_data(
        train_df)
    test_x = preprocess_data(test_df, indices_for_masking,
                             pca_instance, scaler_instance)

    model = create_model(train_x.shape[1], MODEL_TYPE)
    train_model(model, MODEL_TYPE, train_x, train_y)
    predictions = predict_data(model, MODEL_TYPE, test_x)

    submission_file_directory = write_output(predictions)
Example No. 4
def get_data(path):
    reviews = preprocess_data.preprocess_data(path)
    cnn_data_train, embedding_weights_train, cnn_data_test, embedding_weights_test = preprocess_data.split_and_tokenize(
        reviews)
    pos_data_train = combine_data_and_weights(
        cnn_data_train, embedding_weights_train)
    pos_data_test = combine_data_and_weights(
        cnn_data_test, embedding_weights_test)
    return pos_data_train, pos_data_test
Example No. 5
def train_model():
    """
    Trains the model.
    """
    # Ensure that the data has been processed already.
    preprocess_data()

    # Saves the checkpoint after every epoch.
    checkpoint_path = "checkpoints\\epoch={epoch:02d} acc={acc:.2f} loss={loss:.2f}" \
                      " val_acc={val_acc:.2f} val_loss={val_loss:.2f}.hdf5"

    checkpoints = ModelCheckpoint(checkpoint_path, verbose=True)

    # Saves the checkpoint with the smallest validation loss.
    val_checkpoint = ModelCheckpoint(Path.model,
                                     monitor='val_loss',
                                     save_best_only=True)

    # Creates the folder: checkpoints.
    if os.path.isdir(os.getcwd() + "\\checkpoints") is False:
        os.mkdir(os.getcwd() + "\\checkpoints")

    model = sentiment_analysis_model()

    steps_per_epoch = (1600000 * 0.8) / Config.batch_size

    # Train the model.
    history = model.fit_generator(generator=training_data_generator(),
                                  steps_per_epoch=steps_per_epoch,
                                  validation_data=testing_data_generator(),
                                  validation_steps=len(dataset),
                                  epochs=10,
                                  verbose=1,
                                  callbacks=[checkpoints, val_checkpoint])

    plot_graph(history)
    model.summary()
Example No. 6
def predict():
    model_filename = 'model_titanic_survival.pkl'
    interesting_columns = ['Pclass', 'Sex', 'Age', 'Cabin', 'Embarked']
    median_age = 28

    json_ = request.json
    df = pd.DataFrame(json_)

    processed_df = preprocess_data(df, interesting_columns, median_age)
    features = processed_df.select_dtypes(include='number')

    loaded_model = pickle.load(open(model_filename, 'rb'))
    prediction = loaded_model.predict(features)
    df['prediction'] = prediction
    return str(df.to_json(orient='records', force_ascii=False))
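
Assuming the predict() handler above is registered on a Flask route such as /predict and served locally, a client request might look like the following sketch; the URL and the passenger records are illustrative only.

# Hypothetical client call for the predict() handler above.
import requests

passengers = [
    {"Pclass": 3, "Sex": "male", "Age": 22.0, "Cabin": None, "Embarked": "S"},
    {"Pclass": 1, "Sex": "female", "Age": None, "Cabin": "C85", "Embarked": "C"},
]

response = requests.post("http://localhost:5000/predict", json=passengers)
print(response.text)  # JSON records, each extended with a 'prediction' field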
Example No. 7
    def train(self, epochs=200, lr=0.01, show_fig=False):

        X, Y, Y_one_hot = preprocess_data()

        self.lr = lr

        X_train, X_test = self.train_test_split(X)
        Y_train, Y_test = self.train_test_split(Y)
        Y_train_ohe, Y_test_ohe = self.train_test_split(Y_one_hot)

        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.phY,
                                                       logits=self.Y_))
        self.train = tf.train.AdamOptimizer(self.lr).minimize(self.cost)

        init = tf.global_variables_initializer()

        self.epochs = epochs
        self.cost_array = []
        with tf.Session() as sess:
            sess.run(init)
            for i in range(self.epochs):
                sess.run(self.train,
                         feed_dict={
                             self.phX: X_train,
                             self.phY: Y_train_ohe
                         })
                self.c = np.mean(
                    sess.run(self.cost,
                             feed_dict={
                                 self.phX: X_train,
                                 self.phY: Y_train_ohe
                             }))

                if i % 10 == 0:
                    self.predictions = sess.run(self.pred,
                                                feed_dict={self.phX: X_test})
                    self.acc = np.mean(self.predictions == Y_test)
                    print(
                        f"Iteration {i}. Cost: {self.c}. Accuracy: {self.acc}")

                self.cost_array.append(self.c)

            if show_fig:
                plt.plot(self.cost_array)
                plt.show()
            self.save_path = self.saver.save(sess, "/tmp/model.ckpt")
            print("Model saved in path: %s" % self.save_path)
Example No. 8
def train(attribute_names, input_csv, model_output_path, exp):
    x, y = preprocess_data(input_csv)
    num_classes = y.shape[-1]
    y = split_array(y, num_classes)
    x = split_array(x, x.shape[-1])

    monitor_values = [
        "val_output_class_0_precision", "val_output_class_0_recall",
        "val_output_class_1_precision", "val_output_class_1_recall",
        "val_output_class_2_precision", "val_output_class_2_recall",
        "val_output_class_3_precision", "val_output_class_3_recall",
        "val_output_class_4_precision", "val_output_class_4_recall"
    ]

    model = get_model(attribute_names=attribute_names,
                      lr=0.0001,
                      num_output_classes=num_classes)
    # early_stopper_callback = tf.keras.callbacks.EarlyStopping(
    #     monitor='val_loss', min_delta=0, patience=10, verbose=0,
    #     mode='min', baseline=None, restore_best_weights=True
    # )
    early_stopper_callback = EarlyStoppingModified(
        model_output_dir=model_output_path,
        exp=exp,
        monitor='val_loss',
        min_delta=0,
        patience=15,
        verbose=0,
        mode='min',
        baseline=None,
        restore_best_weights=True
    )  # adapted from TensorFlow's EarlyStopping: it accepts a list of monitored values and also acts as a model-saver callback

    model.fit(x,
              y,
              epochs=2000,
              shuffle=True,
              validation_split=0.5,
              batch_size=64,
              callbacks=[early_stopper_callback])
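
EarlyStoppingModified itself is project-specific and not shown above. A rough sketch of how such a callback could be built on tf.keras, averaging several monitored metrics and saving the model whenever that average improves, is given below; the class name and details are assumptions, not the original implementation.

import numpy as np
import tensorflow as tf

class MultiMetricEarlyStopping(tf.keras.callbacks.Callback):
    # Sketch only: averages the given metrics, saves the model on improvement,
    # and stops training after `patience` epochs without improvement.
    def __init__(self, monitors, model_path, patience=15):
        super().__init__()
        self.monitors = monitors
        self.model_path = model_path
        self.patience = patience
        self.best = np.inf
        self.wait = 0

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        values = [logs[m] for m in self.monitors if m in logs]
        if not values:
            return
        current = float(np.mean(values))
        if current < self.best:
            self.best, self.wait = current, 0
            self.model.save(self.model_path)  # acts as the model-saver part
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.model.stop_training = True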
Example No. 9
def main(argv):
    sys.path.append(app_dir)
    import rw_bat_data as rwd
    import preprocess_data as ppd
    import func as fc
    import scale_data as sd

    para_dict = init_data_para()
    para_dict = fc.deal_argv(argv, para_dict)
    mode = para_dict['run_mode']

    # read the required data, preprocess it, and save it to the target location
    print('starting processing the data...')
    bat_list = rwd.get_bat_list(para_dict, mode)
    regx, mask_filename = fc.get_filename_regx(para_dict['log_pro'],
                                               **para_dict)

    if bat_list is not None:
        for bat_name in bat_list:
            raw_data = rwd.read_bat_data(para_dict,
                                         mode,
                                         bat_name,
                                         limit=para_dict['data_limit'][mode])
            data = ppd.preprocess_data(bat_name, raw_data)
            rwd.save_bat_data(data, para_dict['log_pro'] + '_' + bat_name,
                              para_dict, mode)  # save the processed data
    else:
        print('there is no bat!')

    # split the processed data by work state and save it to the target location
    print('save the processed data...')

    result = fc.save_workstate_data(regx, mask_filename,
                                    para_dict['processed_data_dir'][mode],
                                    para_dict['processed_data_dir'][mode])
    if not result:
        print('no files contain data that needs to be scaled.')
        return
    else:
        # expand the data
        print('to be scaled...')

        for state in para_dict['states']:
            file_name = r'%s_' % state + mask_filename
            processed_data = sd.get_processed_data(
                os.path.join(para_dict['processed_data_dir'][mode], file_name))
            scale_data = sd.generate_data(processed_data, **para_dict)
            sd.save_scale_data(scale_data,
                               para_dict['log_scale'] + '_' + file_name,
                               para_dict['scale_data_dir'][mode])
            print('finished scaling the %s data' % state)

    # train the model
    print('starting training the model...')
    for state in para_dict['states']:
        file_name = r'%s_%s_%s' % (para_dict['log_scale'], state,
                                   mask_filename)
        para_dict['pkl_dir'] = {
            'run':
            os.path.normpath('/raid/data/processed_data/pkl/' + save_dir +
                             '/%s_pkl' % state),
            'debug':
            os.path.normpath(app_dir + '/%s_pkl' % state)
        }
        import build_model as bm
        bm.train_model(file_name, state, **para_dict)
Example No. 10
current_time = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.gmtime(time_start))
logger.info("Execution started at " + current_time)

# Get paths
dir_data, dir_output, dir_tmp, config_path = get_folder_structure(root_path=ROOT_PATH, \
                                                                  config_fname=CONFIG_NAME)

logger.info("Validation config file..")
# Load config
schema = get_schema()
config = read_config(config_path=config_path)
config = validate_config(config=config, schema=schema)

# Clean data
df_cleaned = clean_data(dir_tmp=dir_tmp, path_data=dir_data)
df_base = preprocess_data(df_cleaned, dir_tmp=dir_tmp, path_data=dir_data)

# Create trained model folder
dir_model = os.path.join(dir_output, "model_trained")
print(dir_model)
print(dir_output)
print(config['n_top_words'])

# If it's a training run
if config['train']:

    if not os.path.isdir(dir_model):
        os.makedirs(dir_model)

    logger.debug("Calculating best LDA model..")
    search_params = {
Example No. 11
# This is where we will be training and evaluating the model

import tensorflow as tf
import preprocess_data
from model import LeNet
from sklearn.utils import shuffle

X_train, y_train, X_validation, y_validation, X_test, y_test = (
    preprocess_data.preprocess_data())

# HYPERSSS!
EPOCHS = 10
BATCH_SIZE = 128
lr = 0.001

x = tf.placeholder(tf.float32, shape=(None, 32, 32, 1))
y = tf.placeholder(tf.int32, shape=(None))
one_hot_y = tf.one_hot(y, 10)

logits = LeNet(x)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_y,
                                                        logits=logits)
loss = tf.math.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
train_op = optimizer.minimize(loss)

correct_pred = tf.math.equal(tf.math.argmax(logits, 1),
                             tf.argmax(one_hot_y, 1))  # bool
accuracy_op = tf.reduce_mean(
    tf.cast(correct_pred,
            tf.float32))  # put all trues to 1, falses to 0, and find mean
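
The snippet ends after defining the graph; a hypothetical continuation of the training loop, reusing the placeholders and hyperparameters above (not part of the original example), could be:

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(EPOCHS):
        # shuffle once per epoch, then step through mini-batches
        X_train, y_train = shuffle(X_train, y_train)
        for start in range(0, len(X_train), BATCH_SIZE):
            batch_x = X_train[start:start + BATCH_SIZE]
            batch_y = y_train[start:start + BATCH_SIZE]
            sess.run(train_op, feed_dict={x: batch_x, y: batch_y})
        val_acc = sess.run(accuracy_op,
                           feed_dict={x: X_validation, y: y_validation})
        print("Epoch {}: validation accuracy {:.3f}".format(epoch + 1, val_acc))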
Example No. 12
# coding:utf-8
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

from preprocess_data import preprocess_data
from conf import conf


if __name__ == '__main__':
    train, train_target = preprocess_data(pd.read_csv(conf.train))

    X_train, X_val, Y_train, Y_val = train_test_split(train, train_target, \
                        test_size=conf.test_size)




    best_C = 0
    best_val_auc = 0
    best_val_score = 0
    for C in [1e-2, 3e-2, 1e-1, 3e-1, 1, 3, 10, 1e2, 3e2, 1e3, 3e3]:
    #for C in [1e-2, 3e-3, 1e-3, 3e-4, 1e-4, 3e-5, 1e-5, 3e-6, 1e-6]:
        print('C:', C)
        lr = LogisticRegression( penalty='l1', n_jobs=-1, C=C, random_state=conf.random_state)
        lr.fit(X_train, Y_train)
Example No. 13
def main(data_config,
         output_dir,
         num_epochs=10,
         batch_size=5,
         lr=0.001,
         target_fs=44100,
         audio_window_size=2048,
         patience=5,
         model_type='spectrogram',
         k_smoothing=1):
    """
    Train a deep beat tracker model
    """
    # Set up logger
    init_console_logger(LOGGER, verbose=True)

    with open(data_config, 'r') as f:
        data_config = json.load(f)

    sorted_train_datasets = sorted(data_config['train'].keys())
    train_dataset_desc = "train_" + "_".join(sorted_train_datasets)
    test_dataset_desc = "test_" + "_".join(sorted(data_config['test'].keys()))

    dataset_desc = train_dataset_desc + "-" + test_dataset_desc

    output_dir = os.path.join(output_dir, model_type, dataset_desc)
    LOGGER.info('Output will be saved to {}'.format(output_dir))

    feature_data_dir = os.path.join(output_dir, 'data')
    model_dir = os.path.join(output_dir, 'model',
                             datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    if not os.path.exists(feature_data_dir):
        os.makedirs(feature_data_dir)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    LOGGER.info('Saving configuration.')
    config = {
        'data_config': data_config,
        'output_dir': output_dir,
        'num_epochs': num_epochs,
        'batch_size': batch_size,
        'lr': lr,
        'patience': patience,
        'k_smoothing': k_smoothing,
        'target_fs': target_fs,
        'audio_window_size': audio_window_size,
        'model_type': model_type
    }

    config_path = os.path.join(model_dir, 'config.json')
    with open(config_path, 'w') as f:
        json.dump(config, f)

    LOGGER.info('Loading {} data.'.format(dataset_desc))
    train_data_path = os.path.join(feature_data_dir,
                                   '{}_train_data.npz').format(dataset_desc)
    valid_data_path = os.path.join(feature_data_dir,
                                   '{}_valid_data.npz').format(dataset_desc)
    test_data_path = os.path.join(feature_data_dir,
                                  '{}_test_data.npz').format(dataset_desc)


    data_exists = os.path.exists(train_data_path) \
        and os.path.exists(valid_data_path) \
        and os.path.exists(test_data_path)

    if model_type == 'spectrogram':
        assert target_fs == 44100

    hop_length = int(target_fs * HOP_SIZE)

    sorted_train_datasets = sorted(data_config['train'].keys())
    a_train = []
    r_train = []
    # Load audio and annotations
    for dataset in sorted_train_datasets:
        data_dir = data_config['train'][dataset]['data_dir']
        label_dir = data_config['train'][dataset]['label_dir']
        if dataset == 'hainsworth':
            a, r = prep_hainsworth_data(data_dir,
                                        label_dir,
                                        target_fs,
                                        load_audio=not data_exists)
        elif dataset == 'ballroom':
            a, r = prep_ballroom_data(data_dir,
                                      label_dir,
                                      target_fs,
                                      load_audio=not data_exists)

        a_train += a
        r_train += r

    a_test = []
    r_test = []
    for dataset, dataset_dirs in data_config['test'].items():
        data_dir = dataset_dirs['data_dir']
        label_dir = dataset_dirs['label_dir']
        if dataset == 'hainsworth':
            a, r = prep_hainsworth_data(data_dir,
                                        label_dir,
                                        target_fs,
                                        load_audio=not data_exists)
        elif dataset == 'ballroom':
            a, r = prep_ballroom_data(data_dir,
                                      label_dir,
                                      target_fs,
                                      load_audio=not data_exists)

        a_test += a
        r_test += r

    if not data_exists:
        # Create preprocessed data if it doesn't exist
        LOGGER.info(
            'Preprocessing data for model type "{}".'.format(model_type))
        # Get features and targets from data
        X_train, y_train = preprocess_data(a_train,
                                           r_train,
                                           mode=model_type,
                                           hop_size=hop_length,
                                           audio_window_size=audio_window_size,
                                           sr=target_fs)
        X_test, y_test = preprocess_data(a_test,
                                         r_test,
                                         mode=model_type,
                                         hop_size=hop_length,
                                         audio_window_size=audio_window_size,
                                         sr=target_fs)

        test_data = {
            'X': X_test,
            'y': y_test,
            'indices': np.arange(len(y_test))  # Hack
        }

        LOGGER.info('Creating data subsets.')
        train_data, valid_data = create_data_subsets(X_train, y_train)

        LOGGER.info('Saving data subsets to disk.')
        np.savez(train_data_path, **train_data)
        np.savez(valid_data_path, **valid_data)
        np.savez(test_data_path, **test_data)

    else:
        # Otherwise, just load existing data
        train_data = load_data(train_data_path, model_type)
        valid_data = load_data(valid_data_path, model_type)
        test_data = load_data(test_data_path, model_type)

    model_path = os.path.join(model_dir, 'model.hdf5')
    if not os.path.exists(model_path):
        # Only train model if we haven't done so already
        LOGGER.info('Training model.')
        # Create, train, and save model
        model_path = train_model(train_data,
                                 valid_data,
                                 model_type,
                                 model_path,
                                 lr=lr,
                                 batch_size=batch_size,
                                 num_epochs=num_epochs,
                                 audio_window_size=audio_window_size,
                                 patience=patience)

    # Evaluate model
    LOGGER.info('Evaluating model.')
    perform_evaluation(train_data,
                       valid_data,
                       test_data,
                       model_dir,
                       r_train,
                       r_test,
                       target_fs,
                       batch_size,
                       k_smoothing=k_smoothing)

    LOGGER.info('Done!')
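
For reference, the data_config JSON read at the top of main() is expected to map dataset names to data_dir/label_dir pairs under 'train' and 'test'; a hypothetical example, with placeholder paths, is:

# Hypothetical data_config contents matching the keys read above;
# the directory paths are placeholders.
example_config = {
    "train": {
        "ballroom": {"data_dir": "/data/ballroom/audio",
                     "label_dir": "/data/ballroom/annotations"},
        "hainsworth": {"data_dir": "/data/hainsworth/audio",
                       "label_dir": "/data/hainsworth/annotations"}
    },
    "test": {
        "hainsworth": {"data_dir": "/data/hainsworth/audio",
                       "label_dir": "/data/hainsworth/annotations"}
    }
}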
Example No. 14
# coding:utf-8
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

from preprocess_data import preprocess_data
from conf import conf

if __name__ == '__main__':
    train, train_target = preprocess_data(pd.read_csv(conf.train))

    X_train, X_val, Y_train, Y_val = train_test_split(train, train_target, \
                        test_size=conf.test_size)

    best_C = 0
    best_val_auc = 0
    best_val_score = 0
    for C in [1e-2, 3e-2, 1e-1, 3e-1, 1, 3, 10, 1e2, 3e2, 1e3, 3e3]:
        #for C in [1e-2, 3e-3, 1e-3, 3e-4, 1e-4, 3e-5, 1e-5, 3e-6, 1e-6]:
        print('C:', C)
        lr = LogisticRegression(penalty='l1',
                                n_jobs=-1,
                                C=C,
                                random_state=conf.random_state)
        lr.fit(X_train, Y_train)
Example No. 15
            model_checkpoint_callback,
            early_stopping_callback
        ]
    )

    return history


def make_prediction(model, img):
    img = np.array([img])
    res = model.predict(x=img)[0]
    # print(res)
    y_hat = np.argmax(res)  # convert from softmax
    return LABELS[y_hat], res[y_hat]



# ARCHITECTURE_PLOT_PATH = os.path.join(os.path.dirname(__file__), "model_architecture.png")
# print(ARCHITECTURE_PLOT_PATH)


if __name__ == "__main__":
    data = preprocess_data()

    model = get_model()
    model.summary()
    # plot_model(model, to_file=ARCHITECTURE_PLOT_PATH)
    
    history = train_model(model, data)
    print(history)
Example No. 16
import numpy as np
import pandas as pd
from preprocess_data import preprocess_data

df = pd.read_csv("data.csv")

data = preprocess_data("logGDP", "EmanzV", "year", df)


def dysymod(paramnr, var1, var2, chVar1, chVar2, mvar1, mvar2):
    nterms = 17
    nmodelterms = paramnr
    nmodels = 3

    # definition of polynomial terms
    term = [
        "", "/x", "/y", "x", "y", "/(x*y)", "x/y", "y/x", "x*y", "x^2", "/x^2",
        "y^2", "/y^2", "x^3", "y^3", "/x^3", "/y^3"
    ]
    print(term)

    # scaling terms with means
    scaling = []
    scaling.append(1)
    scaling.append(mvar1)
    scaling.append(mvar2)
    scaling.append(1 / mvar1)
    scaling.append(1 / mvar2)
    scaling.append(mvar1 * mvar2)
    scaling.append(mvar2 / mvar1)
    scaling.append(mvar1 / mvar2)
Example No. 17
import tensorflow as tf
import preprocess_data as pd
import utils as utl
import numpy as np

train_x, val_x, test_x, train_y, val_y, test_y, vocab_to_int = pd.preprocess_data(
)


# Model input pipes for data feed
def model_inputs():
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    keep_prob_ = tf.placeholder(tf.float32, name="keep_prob")
    return inputs_, labels_, keep_prob_


# Build, multi-dimensional vector of the current word.
def build_embedding_layer(inputs_, vocab_size, embed_size):
    embedding = tf.Variable(tf.random_uniform((vocab_size, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)
    return embed


# Create LSTM Layer
def build_lstm_layers(lstm_sizes, embed, keep_prob_, batch_size):
    lstms = [tf.contrib.rnn.BasicLSTMCell(size) for size in lstm_sizes]
    # Add dropout to the cell
    drops = [
        tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob_)
        for lstm in lstms
Example No. 18
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import models
from preprocess_data import preprocess_data
from get_input_args import get_input_args
import my_model
from my_model import model, model_classifier, model_criterion, model_optimizer, classifier_hyperparam

train_data, valid_data, test_data, trainloader, validloader, testloader = preprocess_data(
)
input_size, hidden_layers, output_size, dropout_prob = classifier_hyperparam()
input_args = get_input_args()

model.classifier = model_classifier()
optimizer = model_optimizer()
criterion = model_criterion()

# assign a device
if input_args.gpu_cpu:
    device = input_args.gpu
    if torch.cuda.is_available() and device == 'cuda':
        model = model.to(device)
    else:
        model = model.to('cpu')
else:
    device = 'cpu'
    model = model.to(device)

epochs = input_args.epochs
Example No. 19
from sklearn import neighbors


def build_tree(data):
    tree = neighbors.KDTree(data, leaf_size=2)
    return tree


if __name__ == '__main__':
    from preprocess_data import preprocess_data
    from normalize import normalize
    from numpy import array

    matrix, labels, categories = preprocess_data('datingTestSet.txt')
    normalized_matrix, ranges, min_vals, max_vals = normalize(matrix)

    labelsList = ['not at all', 'in small doses', 'in large doses']
    tree = build_tree(normalized_matrix)
    test = array([34.0, 4400.0, 0.3])
    dist, ind = tree.query([test], k=3)

    ind = ind[0]
    for i in ind:
        print(labelsList[labels[i]])
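
The example prints the class of each of the three nearest neighbours; a small follow-up sketch (not in the original) shows how a single prediction could be derived from them by majority vote:

from collections import Counter

neighbour_labels = [labels[i] for i in ind]
predicted = Counter(neighbour_labels).most_common(1)[0][0]
print('majority vote:', labelsList[predicted])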
Example No. 20
import pandas as pd
import pickle

preprocessing_override = pd.read_csv('preprocessing_override.csv')

dataset_X = pd.read_csv('train.csv')
dataset_y = dataset_X['Survived']

dataset_X_verify = pd.read_csv('test.csv')

del dataset_X['Survived']
del preprocessing_override['Survived']

import preprocess_data as prd

preprocessed_data = prd.preprocess_data(dataset_X, dataset_y,
                                        preprocessing_override,
                                        dataset_X_verify)

X = preprocessed_data["X"]
y = preprocessed_data["y"]

# import get_best_model as bfm
#import get_best_classification_model as bfm
import get_best_model as bfm

best_fit_model = pickle.loads(bfm.get_best_model(X, y))

print(best_fit_model.predict(X[0:25]))
print(y[0:25])
Example No. 21
def main(args):
    # Parse arguments
    file_name = args.input
    category = args.attribute
    hidden_layers = args.hiddennodes
    iterations = args.iterations
    repeat = args.repeat
    output_path = args.output

    running = True
    paused = False
    drawing = args.visualise

    # Set up PyGame
    if drawing:
        (width, height) = (1200, 500)
        screen = pygame.display.set_mode((width, height))
        pygame.font.init()

    a = 0  # Counter
    tests = []
    while running:
        # Test network and reset
        if a % iterations == 0:
            if a != 0:
                # Run test and save results
                test = test_network(nn, testing_data)
                tests.append(test)

            if a < iterations * repeat:
                # Preprocess and divide data into two sets, ratio 2:1
                training_data, testing_data, headings, categories = preprocess_data(file_name, category)
                # Initialise neural network
                keys, values = list(training_data.keys()), list(training_data.values())
                input_layers, output_layers = len(keys[0]), len(values[0])
                nn = NeuralNetwork(input_layers, hidden_layers, output_layers, args.learningrate)
                # Get current weights
                weights = nn.get_weights()
                weights_ih = weights['input-hidden'].data
                weights_oh = weights['hidden-output'].data
            else:
                # Save results
                path = output_path + 'results.csv'
                file = open(path, 'w')
                writer = csv.writer(file)
                writer.writerow(['successes', 'failures', 'success%'])
                t_successes = t_failures = 0
                for test in tests:
                    test = list(test)
                    test.append("{:.2f}".format((test[0] / (test[1] + test[0])) * 100))
                    writer.writerow(test)
                    t_successes += test[0]
                    t_failures += test[1]

                print("Total successes: " + str(t_successes) + "\t\tTotal fails: " + str(
                    t_failures) + "\t\tTotal success rate: " + "{:.2f}".format((t_successes / (t_failures + t_successes)) * 100) + "%")

                running = False

        # Create visualisation
        if drawing:
            for event in pygame.event.get():
                # Stop on close
                if event.type == pygame.QUIT:
                    running = False
                if event.type == pygame.KEYDOWN:
                    # Pause/unpause when space is pressed
                    if event.key == pygame.K_SPACE:
                        paused = not paused

            # Black background
            screen.fill((0, 0, 0))

            # Calculate node positions
            c = input_layers * 40 + (input_layers - 1) * 10
            offset_i = 20 + (height - c) // 2
            c = hidden_layers * 40 + (hidden_layers - 1) * 10
            offset_h = 20 + (height - c) // 2
            c = output_layers * 40 + (output_layers - 1) * 10
            offset_o = 20 + (height - c) // 2

            # Calculate min and max values for input-hidden weights
            min_weight = min([abs(item) for sublist in weights_ih for item in sublist])
            max_weight = max([abs(item) for sublist in weights_ih for item in sublist])
            # Draw input-hidden weights
            for i in range(input_layers):
                for j in range(hidden_layers):
                    weight = weights_ih[j][i]
                    if weight < 0:
                        weight = (abs(weight) - min_weight) / (max_weight - min_weight)
                        shade = int(255 * weight)
                        pygame.draw.aaline(screen, (0, shade, 0), (300, offset_i + 50 * i), (width / 2, offset_h + 50 * j))
                    else:
                        weight = (abs(weight) - min_weight) / (max_weight - min_weight)
                        shade = int(255 * weight)
                        pygame.draw.aaline(screen, (shade, 0, 0), (300, offset_i + 50 * i), (width / 2, offset_h + 50 * j))

            # Calculate min and max values for hidden-output weights
            min_weight = min([abs(item) for sublist in weights_oh for item in sublist])
            max_weight = max([abs(item) for sublist in weights_oh for item in sublist])
            # Draw hidden-output weights
            for i in range(hidden_layers):
                for j in range(output_layers):
                    weight = weights_oh[j][i]
                    if weight < 0:
                        weight = (abs(weight) - min_weight) / (max_weight - min_weight)
                        shade = int(255 * weight)
                        pygame.draw.aaline(screen, (0, shade, 0), (width / 2, offset_h + 50 * i), (width - 300, offset_o + 50 * j))
                    else:
                        weight = (abs(weight) - min_weight) / (max_weight - min_weight)
                        shade = int(255 * weight)
                        pygame.draw.aaline(screen, (shade, 0, 0), (width / 2, offset_h + 50 * i), (width - 300, offset_o + 50 * j))

            if a % iterations != 0:
                activations = nn.get_activations()

                # Draw input nodes
                activation = [item for sublist in activations['input'].data for item in sublist]
                for i in range(input_layers):
                    shade = int(activation[i] * 255)
                    pygame.draw.circle(screen, (70, 70, 70), (300, offset_i + 50 * i), 20)
                    pygame.draw.circle(screen, (shade, shade, shade), (300, offset_i + 50 * i), 15)
                    # Label nodes
                    myfont = pygame.font.SysFont('Consolas', 20)
                    textsurface = myfont.render(headings[i], False, (255, 255, 255))
                    rect = textsurface.get_rect()
                    rect.right = 270
                    rect.top = offset_i - 10 + 50 * i
                    screen.blit(textsurface, rect)

                # Draw hidden nodes
                activation = [item for sublist in activations['hidden'].data for item in sublist]
                for i in range(hidden_layers):
                    shade = int(activation[i] * 255)
                    pygame.draw.circle(screen, (100, 100, 100), (width // 2, offset_h + 50 * i), 20)
                    pygame.draw.circle(screen, (shade, shade, shade), (width // 2, offset_h + 50 * i), 15)

                # Draw output nodes
                activation = [item for sublist in activations['output'].data for item in sublist]
                for i in range(output_layers):
                    shade = int(activation[i] * 255)
                    pygame.draw.circle(screen, (100, 100, 100), (width - 300, offset_o + 50 * i), 20)
                    pygame.draw.circle(screen, (shade, shade, shade), (width - 300, offset_o + 50 * i), 15)
                    # Label nodes
                    myfont = pygame.font.SysFont('Consolas', 20)
                    textsurface = myfont.render(categories[i], False, (255, 255, 255))
                    screen.blit(textsurface, (width - 270, offset_o - 10 + 50 * i))

            # Show iterations
            myfont = pygame.font.SysFont('Consolas', 20)
            textsurface = myfont.render('Iterations: ' + str(a % iterations) + '\t Repeat: ' + str(a // iterations),
                                        False, (255, 255, 255))
            screen.blit(textsurface, (width - 500, 30))
            pygame.display.flip()

        # Train network
        inputs, target = random.choice(list(training_data.items()))
        if not paused:
            nn.train(list(inputs), target)
            a += 1
Example No. 22
parser.add_argument('--gpu', action='store_true',
                    default=True,
                    dest='gpu',
                    help='Use GPU for training, set a switch to true')

parse_results = parser.parse_args()


# parse  
data_dir = parse_results.data_directory
save_dir = parse_results.save_dir
arch = parse_results.arch
learning_rate = float(parse_results.learning_rate)
hidden_units = int(parse_results.hidden_units)
epochs = int(parse_results.epochs)
device = parse_results.gpu


# load and preprocess the data
image_datasets, train_loader, valid_loader, test_loader = preprocess_data(data_dir)
# build the pre-trained model structure
model_init, optimizer = load_pre_trained_model(arch, hidden_units)
# train the model
model, validation_accuracies = nn_classifer_train_valid(epochs, model_init, optimizer, train_loader, valid_loader, device)
# save the checkpoint
check_point = save_check_point(model, image_datasets['train'], save_dir)
 



Example No. 23
def main():

    ##################################################################################################################
    # Prepare data
    ##################################################################################################################

    LOG.info('=' * 50)
    LOG.info('# Prepare data..')
    prepare_data(LOG)

    ##################################################################################################################
    # Preprocessing
    ##################################################################################################################

    LOG.info('=' * 50)
    LOG.info('# Preprocessing data..')
    preprocess_data(LOG)

    ##################################################################################################################
    # Feature Engineering
    ##################################################################################################################

    LOG.info('=' * 50)
    LOG.info('# Feature Engineering..')
    trn_path = './input/trn.csv'
    tst_path = './input/tst.csv'
    trg_path = './input/target.csv'

    # load data
    trn = pd.read_csv(trn_path)
    tst = pd.read_csv(tst_path)
    trg = pd.read_csv(trg_path)

    target_cols = [
        'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
        'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
        'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
        'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
        'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
        'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
        'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
        'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'
    ]
    lags = ['_lag_one', '_lag_two', '_lag_thr', '_lag_fou', '_lag_fiv']
    diffs = [['fiv', 'fou'], ['fou', 'thr'], ['thr', 'two'], ['two', 'one']]

    LOG.info('# na_count')
    # null count per row
    trn['na_count'] = trn.isnull().sum(axis=1)
    tst['na_count'] = tst.isnull().sum(axis=1)

    LOG.info('# target_sum_lag')
    # total count of purchases per month
    for lag in lags:
        trn['target_sum' + lag] = (trn[[col + lag
                                        for col in target_cols]].sum(axis=1))
        tst['target_sum' + lag] = (tst[[col + lag
                                        for col in target_cols]].sum(axis=1))

    LOG.info('# avg of cols')
    # average of cols over past 5 months
    cols = ['ind_actividad_cliente', 'ult_fec_cli_1t']
    for col in cols:
        trn[col + '_avg'] = (trn[[col + lag
                                  for lag in lags]]).mean(axis=1)
        tst[col + '_avg'] = (tst[[col + lag
                                  for lag in lags]]).mean(axis=1)

    LOG.info('# target_sum over lag-5')
    # cumulative sum of target cols over past 5 months
    for col in target_cols:
        trn[col + '_sum'] = (trn[[col + lag for lag in lags]].sum(axis=1))
        tst[col + '_sum'] = (tst[[col + lag for lag in lags]].sum(axis=1))

    LOG.info('# target_sum_diff for each months')
    # change in count of purchases per month compared to its last month
    for diff in diffs:
        pre = diff[0]
        post = diff[1]
        trn['target_diff_' + post + '-' +
            pre] = trn['target_sum_lag_' + post] - trn['target_sum_lag_' + pre]
        tst['target_diff_' + post + '-' +
            pre] = tst['target_sum_lag_' + post] - tst['target_sum_lag_' + pre]

    LOG.info('# target_diff for each months')
    # change in individual purchases for each month compared to its last month
    for col in target_cols:
        for diff in diffs:
            pre = diff[0]
            post = diff[1]
            trn[col + '_label_lag_' +
                post] = trn[col + '_lag_' + post] - trn[col + '_lag_' + pre]
            tst[col + '_label_lag_' +
                post] = tst[col + '_lag_' + post] - tst[col + '_lag_' + pre]

    LOG.info('# unique target count')
    # unique count of purchased targets over 5 months
    trn['unique_target_count'] = (trn[[col + '_sum' for col in target_cols]] >
                                  0).astype(int).sum(axis=1)
    tst['unique_target_count'] = (tst[[col + '_sum' for col in target_cols]] >
                                  0).astype(int).sum(axis=1)

    LOG.info('# Drop infrequent targets..')
    rem_targets = [2, 23, 22, 21, 18, 17, 4, 12, 11, 9, 6, 13, 7, 19, 8]
    trn = trn[trg['0'].isin(rem_targets)]
    trg = trg[trg['0'].isin(rem_targets)]
    trg = LabelEncoder().fit_transform(trg)

    LOG.info('# trn : {} | trg : {} | tst : {}'.format(trn.shape, trg.shape,
                                                       tst.shape))

    # cache
    LOG.info('# Caching data as trn.csv / tst.csv ..')
    trn.to_csv('./input/trn_cache.csv', index=False)
    tst.to_csv('./input/tst_cache.csv', index=False)
    pd.DataFrame(trg).to_csv('./input/trg_cache.csv', index=False)

    ##################################################################################################################
    # CV Evaluation
    ##################################################################################################################

    # from cache
    trn = pd.read_csv('./input/trn_cache.csv')
    tst = pd.read_csv('./input/tst_cache.csv')
    trg = pd.read_csv('./input/trg_cache.csv')

    LOG.info('=' * 50)
    LOG.info('# Cross validation..')

    # XGB Model Param
    num_round = 500
    early_stop = 50
    xgb_params = {
        'booster': 'gbtree',
        'gamma': 1,
        'learning_rate': 0.1,
        'max_depth': 4,
        'min_child_weight': 3,
        'nthread': 4,
        'num_class': 15,
        'objective': 'multi:softprob',
        'silent': 1,
        'eval_metric': 'mlogloss',
        'seed': 777,
    }

    trn_scores = []
    vld_scores = []
    best_iters = []
    n_splits = 2
    sss = StratifiedShuffleSplit(n_splits=n_splits,
                                 test_size=0.05,
                                 random_state=777)
    for i, (t_ind, v_ind) in enumerate(sss.split(trn, trg)):
        LOG.info('# Iter {} / {}'.format(i + 1, n_splits))
        x_trn = np.asarray(trn)[t_ind]
        x_vld = np.asarray(trn)[v_ind]
        y_trn = np.asarray(trg)[t_ind]
        y_vld = np.asarray(trg)[v_ind]

        dtrn = xgb.DMatrix(x_trn, label=y_trn)
        dvld = xgb.DMatrix(x_vld, label=y_vld)
        watch_list = [(dtrn, 'train'), (dvld, 'eval')]

        # fit xgb
        bst = xgb.train(xgb_params,
                        dtrn,
                        num_round,
                        watch_list,
                        early_stopping_rounds=early_stop,
                        verbose_eval=True)

        # eval _ trn
        score = log_loss(y_trn, bst.predict(dtrn))
        trn_scores.append(score)

        # eval _ vld
        score = log_loss(y_vld, bst.predict(dvld))
        vld_scores.append(score)

        # best iters
        best_iters.append(bst.best_iteration)

    LOG.info('# TRN logloss: {}'.format(np.mean(trn_scores)))
    LOG.info('# VLD logloss: {}'.format(np.mean(vld_scores)))
    LOG.info('# Best Iters : {}'.format(np.mean(best_iters)))

    ##################################################################################################################
    # Model Fit
    ##################################################################################################################

    LOG.info('=' * 50)
    LOG.info('# Refit and predict on test data..')
    dtrn = xgb.DMatrix(trn, label=trg)
    num_round = int(np.mean(best_iters) / 0.9)
    bst = xgb.train(xgb_params, dtrn, num_round, verbose_eval=False)

    dtst = xgb.DMatrix(tst)
    preds = bst.predict(dtst)
    preds = np.fliplr(np.argsort(preds, axis=1))

    ##################################################################################################################
    # Submission
    ##################################################################################################################

    LOG.info('=' * 50)
    LOG.info('# Generating a submission..')
    submit_cols = [
        target_cols[i] for i, col in enumerate(target_cols) if i in rem_targets
    ]

    final_preds = []
    for pred in preds:
        top_products = []
        for i, product in enumerate(pred):
            top_products.append(submit_cols[product])
            if i == 6:
                break
        final_preds.append(' '.join(top_products))

    t_index = pd.read_csv('../root_input/test_ver2.csv', usecols=['ncodpers'])
    test_id = t_index['ncodpers']
    out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
    file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
    path = './output'
    if not os.path.exists(path):
        os.makedirs(path)
    out_df.to_csv(os.path.join(path, file_name), index=False)

    LOG.info('# Clean files')
    cmd = 'rm -rf ./input'
    os.system(cmd)

    LOG.info('=' * 50)
    LOG.info('# Finished!')
    LOG.info('=' * 50)
Example No. 24
# Author:   xiaoyi | 小一
# email:    [email protected]
# Date:     2020/3/27 16:09
# Description: 

import pandas as pd
import numpy as np

# show all columns
from explore_data import explore_area
from preprocess_data import preprocess_data
from read_data import read_data
from view_data import view_data

pd.set_option('display.max_columns', None)
# show all rows
# pd.set_option('display.max_rows', None)


if __name__ == '__main__':
    # to avoid reading from the database repeatedly, the data can be saved to a local file
    df_data = read_data()

    """数据预处理"""
    df_data = preprocess_data(df_data)

    """可视化分析"""
    df_data = view_data(df_data)

    """热力图探索"""
    explore_area(df_data)
Example No. 25
from sklearn.model_selection import RandomizedSearchCV

#load the data
df_train = pd.read_csv('data/verkehrsunfaelle_train.csv',
                       engine='python',
                       index_col=0)
df_test = pd.read_csv('data/verkehrsunfaelle_test.csv',
                      engine='python',
                      index_col=0)

#get features and the target variable
accidents = df_train.drop('Unfallschwere', axis=1)
accidents_labels = df_train['Unfallschwere'].copy()

#preprocess the training data
model_sel, X_train, X_test, y_train, y_test = preprocess_data(
    accidents, accidents_labels)

#preprocess the test set
prediction_data = preprocess_data_to_predict(df_test, model_sel)

#initialize the DeepNeuralNetwork
dnn = DeepNeuralNetClassifier(show_progress=None, random_state=42)

#set hyper parameters for RandomizedSearchCV
parameter_distributions = {
    'n_hidden_layers': [3, 4, 5],
    'n_neurons': [40, 50, 100],
    'batch_size': [64, 128],
    'learning_rate': [0.01, 0.005],
    'activation': [tf.nn.elu, tf.nn.relu],
    'max_checks_without_progress': [20, 30],