Example #1
    def __init__(self):
        self.test_data = load_data.LoadData().get_test_set()
        self.operator = calculate.Calculate()
        self.item_based = item_based.ItemBased()
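Example #2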
def get_input(sample_type,
              shuffle_documents,
              pad,
              trained_sent2vec_model=None):
    # Returns X, Y
    # X: Each row is a sample
    # Y: A 1-D vector for ground truth
    # Also pads the sample input to the configured INPUT_VECTOR_LENGTH where needed
    # (a short usage sketch follows this function)

    start = time.time()
    data_handler = DataHandler()

    print "==========================================="
    if sample_type == 1:
        # NOT SURE ABOUT THIS TYPE!
        sample_type, samples = data_handler.get_samples(
        )  # Get samples, each sample is a set of INPUT_VECTOR_LENGTH consecutive sentences. No document information captured
    elif sample_type == 2:
        ld = load_data.LoadData()
        sample_type, samples = ld.load_wikipedia_sequence()
    elif sample_type in (2, 3):  # NOTE: type 2 is caught by the branch above, so only type 3 reaches here
        # type2 : Get samples, each sample is a document (a set of sentences forming a sequence), i.e. (NUM_DOCUMENTS, NUM_SENTENCES, SENTENCE)
        # type3 : Same as type2, but merge the samples to drop the sequence information and treat it as a plain sentence-classification problem, i.e. (TOTAL_NUM_SENTENCES, SENTENCE)
        #         That merging is done in cnn_classifier.py itself.
        sample_type, samples = data_handler.get_sequence_samples(sample_type)
        #sample_type, samples = data_handler.get_sequence_samples_PARALLEL()  # Get samples, each sample is a document (a set of sentences resulting in a sequence)
    elif sample_type == 4:
        # type4: Clinical sequence of multiple samples
        # X.shape = (MULTIPLE_SAMPLES, TOTAL_SENTENCES)
        # Y.shape = (MULTIPLE_SAMPLES, TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_clinical_sequence()
    elif sample_type == 5:
        # type5: Biography sequence of a single sample
        # X.shape = (1, TOTAL_SENTENCES)
        # Y.shape = (TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_biography_sequence()
    elif sample_type == 6:
        # type6: Fiction sequence of multiple documents
        # X.shape = (NO_OF_BOOKS, TOTAL_SENTENCES)
        # Y.shape = (NO_OF_BOOKS, TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_fiction_sequence()
    elif sample_type == 7:
        # type7: Wiki sequence of multiple samples
        # Data format is the same as the clinical sequence: each line is a sentence
        # X.shape = (MULTIPLE_DOCUMENTS, TOTAL_SENTENCES)
        # Y.shape = (MULTIPLE_DOCUMENTS, TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_wikipedia_sequence()
    else:
        print "NOTE: INVALID SAMPLE_TYPE!"
        return None

    del data_handler
    print "Samples Loading took", time.time() - start, "seconds"

    model = trained_sent2vec_model
    if not trained_sent2vec_model:
        #model = TFIDF(samples)
        #model = MeanWord2vec()
        #model = TFIDFweightedMeanWord2vec(samples)
        model = CustomSent2vec()

    X, Y = [], []
    _total_samples, _start_time = len(samples), time.time()
    print len(samples)
    #pdb.set_trace()
    for _idx, sample in enumerate(samples):
        # Each sample is a document
        # Each sample is a list of tuples with each tuple as (sentence, groundTruth)
        sentences, groundTruths = zip(*sample)  # Unpack a sample

        ## Create Wikipedia test set
        CREATE_WIKI_TEST_SET = False
        if CREATE_WIKI_TEST_SET:
            wiki_prefix = "wiki_save/wiki_test"
            if _idx >= 300:
                break
            with open(wiki_prefix + "_" + str(_idx + 1) + ".ref", "a") as f:
                for (_s, _g) in sample:
                    if _g:
                        f.write("==========\r\n")
                    f.write(_s + "\r\n")
                f.write("==========\r\n")
        else:
            # Traditional code
            if not _idx % 50:
                progbar.simple_update("Converting doc to martices",
                                      _idx + 1,
                                      _total_samples,
                                      time_elapsed=(time.time() - _start_time))

            if sample_type == 1:
                # Correct groundtruth sync problem here
                sentences, groundTruths = model.convert_sample_to_vec(
                    sentences, groundTruths)
            elif sample_type in (2, 3, 4, 5, 6, 7):
                sentences, groundTruths = model.convert_sequence_sample_to_vec(
                    sentences, groundTruths)
            else:
                print "Wrong Sample TYPE"

            if sentences is None:
                continue
            X.append(sentences)  # X[0].shape = matrix([[1,2,3,4.....]])
            Y.append(np.asarray(
                groundTruths))  # Y[0] = [1, 0, 0, ..... 0, 1, 0, 1....]
    progbar.simple_update("Creating a standalone matrix for samples...", -1,
                          -1)
    X, Y = np.asarray(X), np.asarray(Y)
    progbar.end()

    print "Total samples: %d" % (len(X))
    if shuffle_documents:  # Shuffle the X's and Y's if required
        # Both of them have to be in unison
        X, Y = unison_shuffled_copies(X, Y)
        print "SHUFFLE: Shuffled input document order! (X:", X.shape, ", Y:", Y.shape, ")"

    if sample_type == 2 and not pad:
        print "NOTE: Sample type2 requires PADDING!"

    if pad:
        #### THIS PAD is messy!!!!
        ### Check once before padding
        if STATIC_PAD:
            max_len = AVERAGE_WORDS
        else:
            max_len = None  # Uses the max length of the sequences

        doc_lengths = [len(doc) for doc in X]
        print "Padding sequences. Doc-lengths: Mean=%d, Std=%d" % (
            np.mean(doc_lengths), np.std(doc_lengths))
        X = pad_sequences(X,
                          maxlen=max_len,
                          padding="post",
                          truncating="post",
                          value=0.0,
                          dtype=np.float32)
        Y = pad_sequences(Y,
                          maxlen=max_len,
                          padding="post",
                          truncating="post",
                          value=0.0,
                          dtype=np.float32)

        print "Size of new X(after padding):", X.shape

    return sample_type, X, Y, model
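
# A minimal usage sketch (not part of the original module): sample_type 4 loads the
# clinical sequence described above; the call assumes the module-level dependencies
# used by get_input (load_data, DataHandler, CustomSent2vec, progbar, pad_sequences,
# STATIC_PAD, AVERAGE_WORDS, ...) are available exactly as in the original project.
if __name__ == "__main__":
    result = get_input(sample_type=4, shuffle_documents=True, pad=True)
    if result is not None:
        sample_type, X, Y, sent2vec_model = result
        print "Loaded X:", X.shape, "Y:", Y.shape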
Example #3
def RunTest(
    params,
    model_name_template='models_3/{model}_{backbone}_{optimizer}_{augmented_image_size}-{padded_image_size}-{nn_image_size}_lrf{lrf}_{metric}_{CC}_f{test_fold_no}_{phash}'
):

    # # Params

    # In[ ]:

    DEV_MODE_RANGE = 0  # off

    # In[ ]:

    # In[ ]:

    def params_dict():
        return {
            x[0]: x[1]
            for x in vars(params).items() if not x[0].startswith('__')
        }

    def params_str():
        return '\n'.join([
            repr(x[0]) + ' : ' + repr(x[1]) + ','
            for x in vars(params).items() if not x[0].startswith('__')
        ])

    def params_hash(shrink_to=6):
        import hashlib
        import json
        return hashlib.sha1(
            json.dumps(params_dict(),
                       sort_keys=True).encode()).hexdigest()[:shrink_to]

    def params_save(fn, verbose=True):
        params_fn = fn + '.param.txt'
        with open(params_fn, 'w+') as f:
            s = params_str()
            hash = params_hash(shrink_to=1000)
            s = '{\n' + s + '\n}\nhash: ' + hash[:6] + ' ' + hash[6:]
            f.write(s)
            if verbose:
                print('params: ' + s + '\nsaved to ' + params_fn)

    # # Imports

    # In[ ]:

    import sys
    #sys.path.append(r'D:\Programming\3rd_party\keras')

    # In[ ]:

    import sys
    from imp import reload
    import numpy as np
    import keras
    import datetime
    import time

    from keras.models import Model, load_model
    from keras.layers import Input, Dropout, BatchNormalization, Activation, Add
    from keras.layers.core import Lambda
    from keras.layers.convolutional import Conv2D, Conv2DTranspose
    from keras.layers.pooling import MaxPooling2D
    from keras.layers.merge import concatenate
    from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
    from keras import backend as K

    import tensorflow as tf

    # # Load data

    # In[ ]:

    import load_data
    load_data = reload(load_data)
    import keras_unet_divrikwicky_model
    keras_unet_divrikwicky_model = reload(keras_unet_divrikwicky_model)

    # In[ ]:

    train_df = load_data.LoadData(train_data=True,
                                  DEV_MODE_RANGE=DEV_MODE_RANGE,
                                  to_gray=False)

    # In[ ]:

    train_df.images[0].shape

    # In[ ]:

    train_images, train_masks, validate_images, validate_masks = load_data.SplitTrainData(
        train_df, params.test_fold_no)
    train_images.shape, train_masks.shape, validate_images.shape, validate_masks.shape

    # # Reproducibility setup:

    # In[ ]:

    import random as rn

    import os
    os.environ['PYTHONHASHSEED'] = '0'

    np.random.seed(params.seed)
    rn.seed(params.seed)

    #session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    tf.set_random_seed(params.seed)
    #sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    sess = tf.Session(graph=tf.get_default_graph())
    K.set_session(sess)

    # # IOU metric

    # In[ ]:

    thresholds = np.array(
        [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95])

    def iou(img_true, img_pred):
        assert (img_true.shape[-1] == 1) and (len(img_true.shape) == 3) or (
            img_true.shape[-1] != 1) and (len(img_true.shape) == 2)
        i = np.sum((img_true * img_pred) > 0)
        u = np.sum((img_true + img_pred) > 0)
        if u == 0:
            return 1
        return i / u

    def iou_metric(img_true, img_pred):
        img_pred = img_pred > 0.5  # added by sgx 20180728
        if img_true.sum() == img_pred.sum() == 0:
            scores = 1
        else:
            scores = (thresholds <= iou(img_true, img_pred)).mean()
        return scores

    def iou_metric_batch(y_true_in, y_pred_in):
        batch_size = len(y_true_in)
        metric = []
        for batch in range(batch_size):
            value = iou_metric(y_true_in[batch], y_pred_in[batch])
            metric.append(value)
        #print("metric = ",metric)
        return np.mean(metric)

    # adapter for Keras
    def my_iou_metric(label, pred):
        metric_value = tf.py_func(iou_metric_batch, [label, pred], tf.float64)
        return metric_value
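
    # Worked example (illustrative numbers, not from the original notebook): if
    # iou(img_true, img_pred) returned 0.62, then (thresholds <= 0.62) is True for
    # 0.50, 0.55 and 0.60, i.e. 3 of the 10 thresholds, so iou_metric(...) returns
    # 0.3; an empty mask matched by an empty prediction scores 1.0.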

    # # Data generator

    # In[ ]:

    mean_val = np.mean(train_images.apply(np.mean))
    mean_std = np.mean(train_images.apply(np.std))
    mean_val, mean_std

    #####################################
    def FillCoordConvNumpy(imgs):
        print(imgs.shape)
        assert len(imgs.shape) == 4
        assert imgs.shape[3] == 3
        n = imgs.shape[2]
        hor_img = np.linspace(-1., 1., n).reshape((1, 1, n, 1))
        n = imgs.shape[1]
        ver_img = np.linspace(-1., 1., n).reshape((1, n, 1, 1))
        imgs[:, :, :, 0:1] = hor_img
        imgs[:, :, :, 2:3] = ver_img

    def FillCoordConvList(imgs):
        print(imgs.shape)
        assert len(imgs[0].shape) == 3
        assert imgs[0].shape[2] == 3
        for img in imgs:
            n = img.shape[1]
            hor_img = np.linspace(-1., 1., n).reshape((1, n, 1))
            n = img.shape[0]
            ver_img = np.linspace(-1., 1., n).reshape((n, 1, 1))
            img[:, :, 0:1] = hor_img
            img[:, :, 2:3] = ver_img

    if params.coord_conv:
        FillCoordConvList(train_images)
        FillCoordConvList(validate_images)
        print(train_images[0][0, 0, 0], train_images[0][0, 0, 2])
        assert train_images[0][0, 0, 0] == -1.
        assert train_images[0][0, 0, 2] == 1.

    ######################################

    from my_augs import AlbuDataGenerator

    # # model

    # In[ ]:

    sys.path.append('../3rd_party/segmentation_models')
    import segmentation_models
    segmentation_models = reload(segmentation_models)
    from segmentation_models.utils import set_trainable

    # In[ ]:
    if not hasattr(params, 'model_params'):
        params.model_params = {}

    if params.load_model_from:
        model = load_model(params.load_model_from,
                           custom_objects={'my_iou_metric': my_iou_metric})
        print('MODEL LOADED from: ' + params.load_model_from)
    else:
        model = None
        if params.model == 'FNN':
            model = segmentation_models.FPN(
                backbone_name=params.backbone,
                input_shape=(None, None, params.channels),
                encoder_weights=params.initial_weightns,
                freeze_encoder=True,
                dropout=params.dropout,
                **params.model_params)
        if params.model == 'FNNdrop':
            model = segmentation_models.FPNdrop(
                backbone_name=params.backbone,
                input_shape=(None, None, params.channels),
                encoder_weights=params.initial_weightns,
                freeze_encoder=True,
                dropout=params.dropout,
                **params.model_params)
        if params.model == 'Unet':
            model = segmentation_models.Unet(
                backbone_name=params.backbone,
                input_shape=(None, None, params.channels),
                encoder_weights=params.initial_weightns,
                freeze_encoder=True,
                **params.model_params)
        if params.model == 'Linknet':
            model = segmentation_models.Linknet(
                backbone_name=params.backbone,
                input_shape=(None, None, params.channels),
                encoder_weights=params.initial_weightns,
                freeze_encoder=True,
                **params.model_params)
        if params.model == 'divrikwicky':
            model = keras_unet_divrikwicky_model.CreateModel(
                params.nn_image_size, **params.model_params)
            params.backbone = ''
        assert model

    for l in model.layers:
        if isinstance(
                l, segmentation_models.fpn.layers.UpSampling2D) or isinstance(
                    l, keras.layers.UpSampling2D):
            print(l)
            if hasattr(l, 'interpolation'):
                print(l.interpolation)
                if hasattr(params, 'model_params'
                           ) and 'interpolation' in params.model_params:
                    l.interpolation = params.model_params['interpolation']
            else:
                print('qq')

    if hasattr(params,
               'kernel_constraint_norm') and params.kernel_constraint_norm:
        for l in model.layers:
            if hasattr(l, 'kernel_constraint'):
                print('kernel_constraint for ', l, ' is set to ',
                      params.kernel_constraint_norm)
                l.kernel_constraint = keras.constraints.get(
                    keras.constraints.max_norm(params.kernel_constraint_norm))

    # In[ ]:

    model_out_file = model_name_template.format(
        lrf=params.ReduceLROnPlateau['factor'],
        metric=params.monitor_metric[0],
        CC='CC' if params.coord_conv else '',
        **vars(params)) + '_f{test_fold_no}_{phash}'.format(
            test_fold_no=params.test_fold_no, phash=params_hash())
    now = datetime.datetime.now()
    print('model:   ' + model_out_file + '    started at ' +
          now.strftime("%Y.%m.%d %H:%M:%S"))

    assert not os.path.exists(model_out_file + '.model')

    params_save(model_out_file, verbose=True)
    log_out_file = model_out_file + '.log.csv'

    # In[ ]:

    #model = load_model(model1_file, ) #, 'lavazs_loss': lavazs_loss

    # # Train

    # In[ ]:

    optimizer = params.optimizer
    if optimizer == 'adam':
        optimizer = keras.optimizers.adam(**params.optimizer_params)
    elif optimizer == 'sgd':
        optimizer = keras.optimizers.sgd(**params.optimizer_params)

    model.compile(loss="binary_crossentropy",
                  optimizer=optimizer,
                  metrics=["acc", my_iou_metric])  #, my_iou_metric

    # In[ ]:

    if params.coord_conv:
        mean = ((0, mean_val, 0), (1, mean_std, 1))
    else:
        mean = (mean_val, mean_std)

    train_gen = AlbuDataGenerator(train_images,
                                  train_masks,
                                  batch_size=params.batch_size,
                                  nn_image_size=params.nn_image_size,
                                  mode=params.train_augmentation_mode,
                                  shuffle=True,
                                  params=params,
                                  mean=mean)
    val_gen = AlbuDataGenerator(validate_images,
                                validate_masks,
                                batch_size=params.test_batch_size,
                                nn_image_size=params.nn_image_size,
                                mode=params.test_augmentation_mode,
                                shuffle=False,
                                params=params,
                                mean=mean)

    # In[ ]:

    sys.path.append('../3rd_party/keras-tqdm')
    from keras_tqdm import TQDMCallback, TQDMNotebookCallback

    # In[ ]:

    start_t = time.clock()

    if params.epochs_warmup:
        history = model.fit_generator(
            train_gen,
            validation_data=None,
            epochs=params.epochs_warmup,
            callbacks=[TQDMNotebookCallback(leave_inner=True)],
            validation_steps=None,
            workers=5,
            use_multiprocessing=False,
            verbose=0)

    set_trainable(model)
    batches_per_epoch = len(train_images) // params.batch_size
    print("batches per epoch: ", batches_per_epoch)
    test_epochs = 30
    steps = test_epochs * batches_per_epoch
    val_period = steps // 1000
    print("steps: ", steps, " val_period", val_period)

    # EvalLrTest is assumed to be a learning-rate sweep callback defined/imported elsewhere in this project
    lr_sheduler = EvalLrTest(log_out_file,
                             val_gen,
                             val_period=val_period,
                             steps=steps)

    history = model.fit_generator(
        train_gen,
        validation_data=None,
        epochs=params.epochs,
        initial_epoch=params.epochs_warmup,
        callbacks=[TQDMNotebookCallback(leave_inner=True), lr_sheduler],
        validation_steps=None,
        workers=5,
        use_multiprocessing=False,
        verbose=0)

    # In[ ]:

    print(params_str())
    print('done:   ' + model_out_file)
    print('elapsed: {}s ({}s/iter)'.format(
        time.clock() - start_t, (time.clock() - start_t) / len(history.epoch)))

    return model
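
# A minimal invocation sketch (illustrative values only; every attribute below is read
# somewhere inside RunTest, but the concrete numbers are assumptions). The default
# model_name_template also expects a `phash` attribute on params, so this sketch passes
# a shorter template and lets RunTest append the fold/hash suffix itself:
#
#   from types import SimpleNamespace
#   params = SimpleNamespace(
#       model='Unet', backbone='resnet34', channels=3, initial_weightns='imagenet',
#       dropout=0.0, model_params={}, load_model_from=None, coord_conv=False,
#       seed=42, test_fold_no=0, optimizer='adam', optimizer_params={'lr': 1e-4},
#       ReduceLROnPlateau={'factor': 0.5}, monitor_metric=('val_my_iou_metric',),
#       batch_size=16, test_batch_size=16, nn_image_size=128,
#       augmented_image_size=128, padded_image_size=128,
#       train_augmentation_mode='train', test_augmentation_mode='val',
#       epochs_warmup=2, epochs=50)
#   model = RunTest(params,
#       model_name_template='models_3/{model}_{backbone}_{optimizer}_lrf{lrf}_{metric}_{CC}')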
Example #4
class ZjdxDataLinux(object):

    """Load Zhejiang Telecom data.

    Load  Zhejiang Telecom to redis.

    Attributes:
        base_dir: input data path.
    """

    def __init__(self):
        """inition for ZjdxDataLinux.

        Args:
            no
        """
        self.dir_list = ['/home/zjdx/', '/home/jsdx/today_data/']
        # self.dir_list = ['D:/home/zjdx/', 'D:/home/jsdx/']
        self.mysqlconn = MySQLdb.connect(host='120.55.189.211', user='******',
                                         passwd='hadoop', db='stock')
        self.cursor = self.mysqlconn.cursor()

    def main(self):

        """Main function.

        Args:
            no
        """
        for p in self.dir_list:
            self.just_do_it(p)
        self.cursor.close()
        self.mysqlconn.close()

    def just_do_it(self, Path):

        """just do it.

        Args:
            no
        """
        all_file = os.listdir(Path)
        file_list = []
        for file_name in all_file:
            result_1 = re.search(r'^kunyan_\d{10}$', file_name)
            result_2 = re.search(r'^jsdx_\d{10}$', file_name)
            result = result_1 or result_2
            if result:
                file_list.append(file_name)

        # old_file_list = []

        # new_file = open(dir+'files', 'a+')
        # new_file.close()
        #
        # file_old = open(dir+'files', 'r')
        # for lines in file_old:
        #     old_file_list.append(lines.strip('\n'))
        # file_old.close()

        try:
            self.cursor.execute("select * from unbacked_redis_data")
        except Exception, e:
            print e
        result = self.cursor.fetchall()
        old_file_list = []
        for line in result:
            old_file_list.append(line[0])

        tag = 0
        for line in file_list:
            if line not in old_file_list:
                tag = 1
                log = open(Path+'log', 'a+')
                log_time = time.strftime('%Y-%m-%d %H:%M:%S')
                begin_out = line + " begin_time: " + log_time + "\n"
                log.write(begin_out)
                print begin_out
                log.close()
                load = load_data.LoadData(Path+line)
                load.main()
                try:
                    self.cursor.execute("INSERT INTO unbacked_redis_data"
                                        "(unbacked_redis_file) VALUES ('%s')" % line)
                    self.mysqlconn.commit()
                except Exception, e:
                    print e
                isexists = os.path.exists(Path+"unbacked_redis_files/")
                if not isexists:
                    os.makedirs(Path+"unbacked_redis_files/")
                    print Path+"unbacked_redis_files" + u' 创建成功\n'
                isexists = os.path.exists(Path+"unbacked_redis_files/"+line)
                if isexists:
                    file_size_1 = os.path.getsize(Path+"unbacked_redis_files/"+line)
                    file_size_2 = os.path.getsize(Path+line)
                    if file_size_1 < file_size_2:
                        os.remove(Path+"unbacked_redis_files/"+line)
                        shutil.move(Path+line, Path+"unbacked_redis_files")
                else:
                    shutil.move(Path+line, Path+"unbacked_redis_files")
                # file_new = open(dir+'files', 'a+')
                # file_new.write(line)
                # file_new.write('\n')
                # file_new.close()
                log = open(Path+'log', 'a+')
                log_time = time.strftime('%Y-%m-%d %H:%M:%S')
                end_out = line + " end_time: " + log_time + "\n"
                log.write(end_out)
                print end_out
                log.close()
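
# A minimal usage sketch (assumes the MySQL server configured in __init__ is reachable
# and the directories in dir_list exist):
if __name__ == '__main__':
    ZjdxDataLinux().main()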
Example #5
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import linear_model, datasets
from sklearn.cross_validation import train_test_split
import load_data

print("===================")
d = load_data.LoadData()

### Generate Y
y = d.loc[:, "IsDefault"].values
count_default = sum(y)
print "bad sample: {0}".format(count_default)
print "Default Rate: {0}%".format(100.0 * count_default / len(y))
print("===================")

### Generate X
### (columns, in order: loan amount, loan term, interest rate, age, number of past
###  successful loans, and the phone / household-registration / video / education /
###  credit-report / Taobao verification flags)
x_data = d.loc[:, [
    u"借款金额", u"借款期限", u"借款利率", u"年龄", u"历史成功借款次数", u"手机认证", u"户口认证", u"视频认证",
    u"学历认证", u"征信认证", u"淘宝认证"
]]
#, u"初始评级"  (initial credit rating)
x_original = x_data.values
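Example #6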
# -*- coding: utf-8 -*-
import settings
import load_data
import json


class DumpData:
    def __init__(self, data_path, data):
        self.data_path = data_path
        self.data = data

    def dump_json(self):
        with open(self.data_path, 'w') as f:
            json.dump(self.data, f)


if __name__ == "__main__":
    json_train = load_data.LoadData(settings.DATA_TRAIN_PATH).get_json()
    DumpData(settings.DATA_TRAIN_JSON, json_train).dump_json()
    json_test = load_data.LoadData(settings.DATA_TEST_PATH).get_json()
    DumpData(settings.DATA_TEST_JSON, json_test).dump_json()
Example #7
# This uses a hard-vote-style method to flag mismatches: add up the number of mismatch
# predictions (1s), then divide by the number of models. If the ratio is below 0.5 the
# sample is not a mismatch, and vice versa. (A usage sketch follows the function.)

import learner_functions as lf
import load_data as ld

data = ld.LoadData()

mismatch_labels = data.mismatch['mismatch'].tolist()


def find_mismatch_indices_hard(models, data, labels, type="default"):

    predictionForEachModel = list()
    mismatchIndices = list()

    for model in models:
        predictionForEachModel.append(
            lf.make_test_prediction(model, data, labels, False))

    for index in range(len(predictionForEachModel[0])):
        predictionSum = 0
        for array in predictionForEachModel:
            predictionSum += array[index]

        finalPrediction = predictionSum / float(len(models))
        if finalPrediction > 0.5:
            mismatchIndices.append(index)

    return mismatchIndices
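
# A minimal usage sketch (hypothetical names: `trained_models` is a list of already-fitted
# models and `test_data` is the feature set passed through to lf.make_test_prediction,
# which is assumed to return one 0/1 prediction per sample, as the hard-vote logic expects):
# trained_models = [...]
# mismatch_indices = find_mismatch_indices_hard(trained_models, test_data, mismatch_labels)
Example #8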
import tensorflow as tf
from sklearn import cross_validation
import load_data as ld

data_X, data_Y = ld.LoadData()
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data_X, data_Y, test_size=0.2, random_state=0)

size = 100
sizes = [100,100]
iter = 200
batch_size = 50
epoch = int(X_train.shape[0]/batch_size)

def Layer(input, in_size, out_size, active_function = None):
    W = tf.Variable(tf.zeros([in_size, out_size]))
    b = tf.Variable(tf.zeros([1, out_size]))
    output = tf.add(tf.matmul(input, W), b)
    if active_function is not None:
        output = active_function(output)
    return output

sess = tf.Session()
with sess.as_default():
    x = tf.placeholder("float", shape=[None, X_train.shape[1]])
    y_ = tf.placeholder("float")

    # l1 = Layer(x, X_train.shape[1], size)
    # l1 = tf.nn.dropout(l1,0.7)
    # y = Layer(l1, size, 1)
    l1 = Layer(x, X_train.shape[1], sizes[0], active_function=tf.nn.relu)
    l2 = Layer(l1, sizes[0], sizes[1])
Example #9
def evaluate(args):
    # load test data
    data = DATA.LoadData(args.path, args.dataset, args.seed).data_test
    save_file = make_save_file(args)

    # load the graph
    weight_saver = tf.train.import_meta_graph(save_file + '.meta')
    pretrain_graph = tf.get_default_graph()

    # load tensors
    feature_embeddings = pretrain_graph.get_tensor_by_name(
        'feature_embeddings:0')
    nonzero_embeddings = pretrain_graph.get_tensor_by_name(
        'nonzero_embeddings:0')
    feature_bias = pretrain_graph.get_tensor_by_name('feature_bias:0')
    bias = pretrain_graph.get_tensor_by_name('bias:0')
    fm = pretrain_graph.get_tensor_by_name('fm:0')
    fm_out = pretrain_graph.get_tensor_by_name('fm_out:0')
    out = pretrain_graph.get_tensor_by_name('out:0')
    train_features = pretrain_graph.get_tensor_by_name('train_features_fm:0')
    train_labels = pretrain_graph.get_tensor_by_name('train_labels_fm:0')
    dropout_keep = pretrain_graph.get_tensor_by_name('dropout_keep_fm:0')
    train_phase = pretrain_graph.get_tensor_by_name('train_phase_fm:0')

    # restore session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    weight_saver.restore(sess, save_file)

    # start evaluation
    num_example = len(data['Y'])
    feed_dict = {
        train_features: data['X'],
        train_labels: [[y] for y in data['Y']],
        dropout_keep: 1.0,
        train_phase: False
    }
    ne, fe = sess.run((nonzero_embeddings, feature_embeddings),
                      feed_dict=feed_dict)
    _fm, _fm_out, predictions = sess.run((fm, fm_out, out),
                                         feed_dict=feed_dict)

    # calculate rmse
    y_pred = np.reshape(predictions, (num_example, ))
    y_true = np.reshape(data['Y'], (num_example, ))

    predictions_bounded = np.maximum(y_pred,
                                     np.ones(num_example) *
                                     min(y_true))  # bound the lower values
    predictions_bounded = np.minimum(predictions_bounded,
                                     np.ones(num_example) *
                                     max(y_true))  # bound the higher values
    RMSE = math.sqrt(mean_squared_error(y_true, predictions_bounded))

    print("Test RMSE: %.4f" % (RMSE))

    # Unify into dataframe
    y_df = pd.DataFrame({'label': y_true, 'pred': y_pred})

    # Write
    if not os.path.exists(args.path_output):
        os.makedirs(args.path_output)
    fullpath_output = args.path_output + '/predictions_%s_%d.csv' % (
        args.dataset, args.factor_k)
    y_df.to_csv(fullpath_output, index=False)
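
# Note: evaluate() reads args.path, args.dataset, args.seed, args.path_output and
# args.factor_k, plus whatever make_save_file(args) needs; these are assumed to come
# from the same argparse parser as the training script (not shown in this example).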
Example #10
def train(args):

    # Dictionary of arguments
    argv = vars(args)

    # Data loading
    data = DATA.LoadData(args.path, args.dataset, args.seed)

    # Get arguments from data
    argv['features_p'] = data.features_p
    argv['col_m'] = data.col_m

    if args.verbose > 0:
        print(
            "FM: dataset=%s, factors=%d, #epoch=%d, batch=%d, lr=%.4f, lambda=%.1e, keep=%.2f, optimizer=%s, batch_norm=%d"
            % (args.dataset, args.factor_k, args.epoch, args.batch_size,
               args.learning_rate, args.lamda_bilinear, args.dropout_keep_rate,
               args.optimizer_type, args.batch_norm))

    t1 = time()

    # Choose model
    if args.model_type == 'FM':
        model_class = model_classes.fm.FM
        model = model_class(features_p=data.features_p,
                            factor_k=args.factor_k,
                            col_m=data.col_m,
                            lamda_bilinear=args.lamda_bilinear,
                            dropout_keep_rate=args.dropout_keep_rate,
                            epoch=args.epoch,
                            batch_size=args.batch_size,
                            learning_rate=args.learning_rate,
                            optimizer_type=args.optimizer_type,
                            batch_norm=args.batch_norm,
                            pretrain_flag=args.pretrain,
                            save_file=make_save_file(args),
                            self_terminate=args.self_terminate,
                            verbose=args.verbose,
                            seed=1337)
    elif args.model_type == 'AFM':
        model_class = model_classes.afm.AFM
        model = model_class(features_p=data.features_p,
                            pretrain_flag=args.pretrain,
                            save_file=make_save_file(args),
                            attention=args.attention,
                            hidden_factor_1=args.hidden_factor_1,
                            hidden_factor_2=args.hidden_factor_2,
                            valid_dimension=data.col_m,
                            activation_function=args.activation,
                            freeze_fm=args.freeze_fm,
                            epoch=args.epoch,
                            batch_size=args.batch_size,
                            learning_rate=args.learning_rate,
                            lamda_attention=args.lamda_attention,
                            keep_1=args.keep_1,
                            keep_2=args.keep_2,
                            optimizer_type=args.optimizer_type,
                            batch_norm=args.batch_norm,
                            decay=args.decay,
                            verbose=args.verbose,
                            micro_level_analysis=args.mla,
                            random_seed=args.seed)
    else:
        print("=== Please select a model type.")
        return

    # Begin Training
    model.train(data.data_train, data.data_valid, data.data_test)

    # Find the best validation result across iterations
    best_valid_score = min(model.valid_rmse)
    best_epoch = model.valid_rmse.index(best_valid_score)
    print("Best Iter(validation)= %d\t train = %.4f, valid = %.4f [%.1f s]" %
          (best_epoch + 1, model.train_rmse[best_epoch],
           model.valid_rmse[best_epoch], time() - t1))
Example #11
import load_data
import numpy as np

if __name__ == "__main__":
    dataset = load_data.LoadData()
    (nf_slave_c_pos, nf_master_j_pos, nf_master_j_vel,
     nf_mcurr_load) = dataset.get_no_feedback()
    (tf_slave_c_pos, tf_master_j_pos, tf_master_j_vel,
     tf_mcurr_load) = dataset.get_torque_feedback()
    (pf_slave_c_pos, pf_master_j_pos, pf_master_j_vel,
     pf_mcurr_load) = dataset.get_position_feedback()
    print(len(nf_slave_c_pos))
    time = []
    for i in range(len(nf_slave_c_pos)):
        time.append(np.linspace(0, 1, len(nf_slave_c_pos[i])))

    aa = nf_slave_c_pos[0]
    # pos = np.array(aa['slave_c_pos'])
    # print(pos.shape)
Example #12
import tensorflow as tf
import cv2
from tensorflow.python.platform import gfile
import dlib
from imutils import face_utils
from PIL import Image
import numpy as np
import configuration as cfg
import load_data as ld
import matplotlib.pyplot as plt

configuration = cfg.Configuration()
load = ld.LoadData()
configuration.pickle_data_file = 'training_images.pickle'
load.data(configuration)
classes_n = configuration.data.classes_count
classes = configuration.data.classes
label_images = configuration.data.label_image

video_capture = cv2.VideoCapture(0)

frozen_graph_filename = 'model/train_model.pb'

with gfile.FastGFile(frozen_graph_filename, "rb") as f:
    graph_def = tf.GraphDef()
    byte = f.read()
    graph_def.ParseFromString(byte)

tf.import_graph_def(graph_def, name='')

# for node in graph_def.node:
Example #13
def main():
    # 1. Load the file and identify the date column, the item column to fill, and the target column
    data_name = input("Please enter the full name of the file to convert (e.g. all_in_one.csv):\n")
    date_column = input("Please enter the name of the date column:\n")
    target_column = input("Please enter the name of the prediction-target column:\n")
    type_column = input("Please enter the item column to fill (currently only one is supported):\n")
    loader = load_data.LoadData()
    origin_data = loader.load_data(data_name)

    # 2. Generate the complete set of timestamps from the date range in the data
    timer = GetTime(origin_data, date_column)
    data, first_day, last_day = timer.get_time()
    data[date_column] = pd.to_datetime(data[date_column])
    generate_date = GenerateDate(first_day, last_day)
    dates = generate_date.generate()

    # 3. Combine the timestamps and item names into DataFrame_1
    name_list = list(set(data[type_column]))
    if str(name_list[0]) == "nan":
        name_list.pop(0)
    else:
        pass
    merge_list = merge(dates, name_list)
    df_merge = pd.DataFrame(merge_list, columns=[date_column, type_column])

    # 4. Outer-join DataFrame_1 with the original DataFrame
    df_total = pd.merge(df_merge,
                        data,
                        how="outer",
                        on=[date_column, type_column])

    # 5. Optional extras
    df_total[date_column] = pd.to_datetime(df_total[date_column])
    week_day_flag = input("Generate a day-of-week column? y(Default)/n\n")
    generate_week_day(df_total, date_column, week_day_flag)
    open_flag = input("Generate an 'open' column? y(Default)/n\n")
    generate_open(df_total, target_column, open_flag)
    print("Filling N/A...")
    df_total = df_total.fillna(0)
    print(df_total.head())

    # 6. Feature engineering
    feature_flag = input("Add statistical features? y(Default)/n\n")
    if feature_flag == "y" or feature_flag == "":
        period = input("Please enter the window size for the statistical features (Default: 7):\n")
        factory_name = []
        for i in range(len(name_list)):
            factory_name.append(df_total[i::len(name_list)])
        if period == "":
            feature_process(factory_name, target_column)
        else:
            feature_process(factory_name, target_column, int(period))
        df_total = pd.concat(factory_name, axis=0)
        df_total = df_total.sort_values(by=[date_column, type_column])
        df_total = df_total.reset_index().drop(columns="index")
    else:
        pass

    # 7. Export
    export_name_first = data_name.split(".")[0] + "_out"
    export_name_end = data_name.split(".")[-1]
    export_name = export_name_first + "." + export_name_end
    if export_name_end == "xlsx":
        df_total.to_excel(export_name, index_label="ID")
    elif export_name_end == "csv":
        df_total.to_csv(export_name, index_label="ID")
    else:
        pass