Example #1
def run_job(spark, config):
    """
    This Function will execute the Analytics job 1
    :param spark: SparkSession Object
    :param config: Config file which contain all the parameters
    :return: None
    """
    load_data(
        analyse_data(
            spark,
            extract_data(
                spark,
                f"{config.get('source_data_path')}/Primary_Person_use.csv")),
        f"{config.get('target_data_path')}/job01")
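The helpers that run_job chains together (extract_data, analyse_data, load_data) are defined elsewhere in the project. A minimal sketch of plausible signatures, assuming standard PySpark DataFrame I/O; the analysis body is a placeholder:

def extract_data(spark, file_path):
    # Read a CSV file into a Spark DataFrame.
    return spark.read.csv(file_path, header=True, inferSchema=True)


def analyse_data(spark, df):
    # Hypothetical placeholder for the job's actual analysis logic.
    return df


def load_data(df, output_path):
    # Write the result out as CSV, overwriting any previous run.
    df.write.mode('overwrite').csv(output_path, header=True)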
Example #2
def test_dA(dataset,
            numFeatures,
            numHidden,
            minCorruptionLevel=0.0,
            maxCorruptionLevel=1.0,
            corruptionLevelStep=0.1,
            learning_rate=0.1,
            batch_size=20,
            output_folder='dA_plots',
            deltaCostStopThreshold=0.001,
            learningRateBoostFactor=1.5):
    '''
    Inputs:
        :dataset (str): path to the dataset in .pkl.gz format
        :numFeatures (int): number of input features
        :numHidden (int): number of hidden units
        :minCorruptionLevel (float): lowest corruption level to train with
        :maxCorruptionLevel (float): highest corruption level to train with
        :corruptionLevelStep (float): increment between corruption levels
        :learning_rate (float): learning rate used for training the denoising
                                autoencoder
        :batch_size (int): number of examples to use in each batch
        :output_folder (str): where to put the weights and biases
        :deltaCostStopThreshold (float): once the epoch-to-epoch change in
                                         cost falls below this value, the
                                         learning rate is boosted (with
                                         boosting disabled, training ends
                                         when the cost stops decreasing)
        :learningRateBoostFactor (float): factor to multiply the learning
                                          rate by when the improvement falls
                                          below deltaCostStopThreshold; use
                                          None to disable boosting

    '''

    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]

    # compute the number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    # Create output folder if it doesn't exist
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    origLearningRate = learning_rate

    for corruptionLevel in numpy.arange(
            minCorruptionLevel, maxCorruptionLevel + corruptionLevelStep,
            corruptionLevelStep):

        strCorruptionLevel = str(int(100 * corruptionLevel))
        learning_rate = origLearningRate

        ######################
        # BUILDING THE MODEL #
        ######################

        rng = numpy.random.RandomState(123)
        theano_rng = RandomStreams(rng.randint(2**30))

        da = dA(numpy_rng=rng,
                theano_rng=theano_rng,
                input=x,
                n_visible=numFeatures,
                n_hidden=numHidden)

        cost, updates = da.get_cost_updates(corruption_level=corruptionLevel,
                                            learning_rate=learning_rate)

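        # Compile the per-minibatch training step: `givens` substitutes the
        # slice of train_set_x selected by `index` for the symbolic input x.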
        train_da = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size]
            })

        start_time = time.clock()

        ############
        # TRAINING #
        ############

        # Go through training epochs until stopping criterion is met
        deltaCost = -1
        epoch = 0
        lastEpochCost = 1e6
        epochCost = 1e6

        # The loop exits only once the cost stops decreasing between epochs
        while deltaCost < 0:
            # Go through training set
            c = []

            # Snapshot weights/biases before this epoch, so the values saved
            # after the loop predate the cost increase that ends training
            weights = da.W.get_value(borrow=True).T
            biases = da.b.get_value(borrow=True)
            lastEpochCost = epochCost

            for batch_index in xrange(n_train_batches):
                c.append(train_da(batch_index))

            epochCost = numpy.mean(c)
            print 'Training epoch %d, cost ' % epoch, epochCost
            deltaCost = epochCost - lastEpochCost

            epoch += 1

            # Boost the learning rate once the improvement falls below the
            # threshold (skipped when boosting is disabled with None)
            if (abs(deltaCost) < deltaCostStopThreshold
                    and learningRateBoostFactor is not None):
                learning_rate *= learningRateBoostFactor
                print 'increasing learning rate to %0.2f...' % learning_rate
                cost, updates = da.get_cost_updates(
                    corruption_level=corruptionLevel,
                    learning_rate=learning_rate)
                train_da = theano.function(
                    [index],
                    cost,
                    updates=updates,
                    givens={
                        x: train_set_x[index * batch_size:(index + 1) *
                                       batch_size]
                    })

        # Save final weights and biases
        outputFnRoot = 'corruption_%s_nin_%s_nhdn_%s_basz_%s_lnrt_%s' \
                    % (strCorruptionLevel, str(numFeatures), str(numHidden), str(batch_size), str(origLearningRate))
        numpy.savetxt('weights_' + outputFnRoot + '.csv',
                      weights,
                      delimiter=',')
        numpy.savetxt('biases_' + outputFnRoot + '.csv', biases, delimiter=',')

        # calculate time taken for training
        end_time = time.clock()
        training_time = (end_time - start_time)

        # save a list of the training parameters
        if not os.path.exists('training_records.pkl'):
            trainingRecords = []
        else:
            with open('training_records.pkl', 'rb') as recordsFile:
                trainingRecords = cPickle.load(recordsFile)
        record = {
            'Corruption': corruptionLevel,
            '# Feature': numFeatures,
            '# Hidden Units': numHidden,
            '# Epochs': epoch,
            'Batch Size': batch_size,
            'Initial Learning Rate': origLearningRate,
            'Final Learning Rate': learning_rate,
            'Delta Cost Stopping Threshold': deltaCostStopThreshold,
            'Learning Rate Boost Factor': learningRateBoostFactor,
            'Final Cost': lastEpochCost
        }

        trainingRecords.append(record)
        with open('training_records.pkl', 'wb') as recordsFile:
            cPickle.dump(trainingRecords, recordsFile)

        # Print time taken for training
        print >> sys.stderr, ('The ' + strCorruptionLevel +
                              '% corruption code for file ' +
                              os.path.split(__file__)[1] + ' ran for %.2fm' %
                              (training_time / 60.))

    os.chdir('../')
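A hypothetical invocation; the dataset path and layer sizes below are assumptions (an MNIST-style pickled dataset with 784 input features):

if __name__ == '__main__':
    # Hypothetical call; adjust the path and sizes to the actual dataset.
    test_dA('mnist.pkl.gz',
            numFeatures=784,
            numHidden=500,
            minCorruptionLevel=0.0,
            maxCorruptionLevel=0.5,
            corruptionLevelStep=0.1)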
Example #3
def test_dA(dataset,
            numFeatures,
            numHidden,
            learning_rate=0.1,
            batch_size=20,
            output_folder='dA_plots',
            deltaCostStopThreshold=0.0005):
    '''
    Inputs:
        :type dataset: string
        :param dataset: path to the pickled dataset

        :type numFeatures: int
        :param numFeatures: number of input features

        :type numHidden: int
        :param numHidden: number of hidden units

        :type learning_rate: float
        :param learning_rate: learning rate used for training the denoising
                              autoencoder

        :type deltaCostStopThreshold: float
        :param deltaCostStopThreshold: stop training once the absolute
                                       epoch-to-epoch change in cost falls
                                       below this value

    '''

    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]

    # compute the number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    # Create output folder if it doesn't exist
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    for corruptionLevel in numpy.arange(0.0, 1.01, 0.1):

        strCorruptionLevel = str(int(100 * corruptionLevel))

        ######################
        # BUILDING THE MODEL #
        ######################

        rng = numpy.random.RandomState(123)
        theano_rng = RandomStreams(rng.randint(2**30))

        da = dA(numpy_rng=rng,
                theano_rng=theano_rng,
                input=x,
                n_visible=numFeatures,
                n_hidden=numHidden)

        cost, updates = da.get_cost_updates(corruption_level=corruptionLevel,
                                            learning_rate=learning_rate)

        train_da = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size]
            })

        start_time = time.clock()

        ############
        # TRAINING #
        ############

        # Go through training epochs until stopping criterion is met
        deltaCost = 1
        epoch = 0
        lastEpochCost = 1e6

        while abs(deltaCost) > deltaCostStopThreshold:
            # Go through training set
            c = []
            for batch_index in xrange(n_train_batches):
                c.append(train_da(batch_index))

            epochCost = numpy.mean(c)
            print 'Training epoch %d, cost ' % epoch, epochCost
            deltaCost = epochCost - lastEpochCost
            lastEpochCost = epochCost

            epoch += 1

        # Save final weights and biases
        outputFnRoot = 'corruption_%s_nin_%s_nhdn_%s_basz_%s_lnrt_%s' \
                    % (strCorruptionLevel, str(numFeatures), str(numHidden), str(batch_size), str(learning_rate))
        weights = da.W.get_value(borrow=True).T
        biases = da.b.get_value(borrow=True)
        numpy.savetxt('weights_' + outputFnRoot + '.csv',
                      weights,
                      delimiter=',')
        numpy.savetxt('biases_' + outputFnRoot + '.csv', biases, delimiter=',')

        # calculate time taken for training
        end_time = time.clock()
        training_time = (end_time - start_time)

        # save a list of the training parameters
        if not os.path.exists('training_records.pkl'):
            trainingRecords = []
        else:
            with open('training_records.pkl', 'rb') as recordsFile:
                trainingRecords = cPickle.load(recordsFile)
        record = {
            'Corruption': corruptionLevel,
            '# Feature': numFeatures,
            '# Hidden Units': numHidden,
            '# Epochs': epoch,
            'Batch Size': batch_size,
            'Learning Rate': learning_rate,
            'Delta Cost Stopping Threshold': deltaCostStopThreshold,
            'Final Cost': epochCost
        }

        trainingRecords.append(record)
        with open('training_records.pkl', 'wb') as recordsFile:
            cPickle.dump(trainingRecords, recordsFile)

        # Print time taken for training
        print >> sys.stderr, ('The ' + strCorruptionLevel +
                              '% corruption code for file ' +
                              os.path.split(__file__)[1] + ' ran for %.2fm' %
                              (training_time / 60.))

    os.chdir('../')
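Stripped of the Theano specifics, the stopping rule above reduces to the loop below; run_epoch is a hypothetical stand-in for one pass of train_da over all minibatches:

def train_until_converged(run_epoch, threshold=0.0005):
    # run_epoch() returns the mean reconstruction cost over one epoch.
    deltaCost = 1.0
    lastCost = 1e6
    epochs = 0
    # Stop once the cost changes by less than `threshold` between epochs
    while abs(deltaCost) > threshold:
        cost = run_epoch()
        deltaCost = cost - lastCost
        lastCost = cost
        epochs += 1
    return lastCost, epochs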
Example #4
def test_dA(dataset,
            numFeatures,
            numHidden,
            learning_rate=0.1,
            training_epochs=15,
            batch_size=20,
            output_folder='dA_plots'):
    '''
    Inputs:
        :type learning_rate: float
        :param learning_rate: learning rate used for training the denoising
                              autoencoder
    
        :type training_epochs: int
        :param training_epochs: number of epochs used for training
    
        :type dataset: string
        :param dataset: path to the pickled dataset

    '''

    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]

    # compute the number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    # Create output folder if it doesn't exist
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    for corruptionLevel in numpy.arange(0.0, 1.01, 0.1):

        strCorruptionLevel = str(int(100 * corruptionLevel))

        ######################
        # BUILDING THE MODEL #
        ######################

        rng = numpy.random.RandomState(123)
        theano_rng = RandomStreams(rng.randint(2**30))

        da = dA(numpy_rng=rng,
                theano_rng=theano_rng,
                input=x,
                n_visible=numFeatures,
                n_hidden=numHidden)

        cost, updates = da.get_cost_updates(corruption_level=corruptionLevel,
                                            learning_rate=learning_rate)

        train_da = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size]
            })

        start_time = time.clock()

        ############
        # TRAINING #
        ############

        # go through training epochs
        for epoch in xrange(training_epochs):
            # go through training set
            c = []
            for batch_index in xrange(n_train_batches):
                c.append(train_da(batch_index))

            print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

            outputFnRoot = 'weights_corruption_%s_epoch_%s_nin_%s_nhdn_%s_basz_%s_lnrt_%s' \
                        % (strCorruptionLevel, str(epoch), str(numFeatures), str(numHidden), str(batch_size), str(learning_rate))
            # save weights
            weights = da.W.get_value(borrow=True).T
            numpy.savetxt(outputFnRoot + '.csv', weights, delimiter=',')

            # save image of weights
            image = Image.fromarray(
                tile_raster_images(X=weights,
                                   img_shape=(1, numFeatures),
                                   tile_shape=(numHidden, 1),
                                   tile_spacing=(5, 5)))
            image.save(outputFnRoot + '.png')

        # calculate and print time taken for training
        end_time = time.clock()
        training_time = (end_time - start_time)

        print >> sys.stderr, ('The ' + strCorruptionLevel +
                              '% corruption code for file ' +
                              os.path.split(__file__)[1] + ' ran for %.2fm' %
                              (training_time / 60.))

    os.chdir('../')
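Because the weights are written with numpy.savetxt, they can be read back for inspection with numpy.loadtxt. The file name below is illustrative; the actual name encodes the parameters of the run:

# Illustrative file name from a hypothetical run with 784 inputs and
# 500 hidden units at 50% corruption.
weights = numpy.loadtxt(
    'weights_corruption_50_epoch_14_nin_784_nhdn_500_basz_20_lnrt_0.1.csv',
    delimiter=',')
print weights.shape  # (numHidden, numFeatures)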
Example #5
                        help='The number of epochs',
                        type=int,
                        default=1)
    parser.add_argument("-k",
                        '--clusters',
                        help='The number of clusters',
                        type=int,
                        default=2)
    args = parser.parse_args()
    if args.verbose > 0:
        verbose = args.verbose

    np.set_printoptions(suppress=True)
    """ Solution Start """
    print "Start"
    data = load_data(vsData)
    kmeans(data)

    print "Hac min"
    heights_min = hac(data)
    print heights_min
    heights_max = hac_max(data)
    print heights_max

    plt.figure()
    h_min = hierarchy.linkage(heights_min, 'single')
    hierarchy.dendrogram(h_min)
    plt.savefig("./docs/single_hac.png")

    plt.clf()
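    # Assumed continuation, mirroring the single-linkage block above:
    # plot and save the complete-linkage dendrogram as well.
    h_max = hierarchy.linkage(heights_max, 'complete')
    hierarchy.dendrogram(h_max)
    plt.savefig("./docs/complete_hac.png")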