示例#1
0
def firstStep(metaOptimizer, smallTrainFilename, smallTestFilename):
    """Run a 700-iteration random search on the small datasets and rank its scores.

    The train and test splits are read with ``loadDataset``, ``metaOptimizer``
    is initialised for a "random" search, and the object returned by
    ``metaOptimizer.algorithm()`` is stored on ``metaOptimizer.optimized``.

    Args:
        metaOptimizer: object exposing ``initialize_optimizer`` and
            ``algorithm``; its ``optimized`` attribute is overwritten.
        smallTrainFilename: path handed to ``loadDataset`` for the train split.
        smallTestFilename: path handed to ``loadDataset`` for the test split.
    """
    trainData, trainLabel = loadDataset(smallTrainFilename)
    testData, testLabel = loadDataset(smallTestFilename)
    metaOptimizer.initialize_optimizer(
        "random", None, trainData, trainLabel, testData, testLabel, jobs=-1, iterations=700, scoresCsvFilename=None
    )
    metaOptimizer.optimized = metaOptimizer.algorithm()

    # Read the scores from the search result stored just above; a bare
    # `optimized` is never bound inside this function.
    sortedScores = sorted(
        [
            (mean_score, scores.std() / 2, params, scores)
            for params, mean_score, scores in metaOptimizer.optimized.grid_scores_
        ],
        reverse=True,
    )
示例#2
0
def mix(gans, num_real, num_synth):
    """
    Build a loader that mixes real and synthetic images.

    When no real images are requested the synthetic loader is returned as is,
    and when no synthetic images are requested the real loader is returned as is.

    :return: a Dataloader with num_real real and num_synth synthetic images
    """
    # Only synthetic data requested.
    if num_real <= 0:
        return utils.gen_synth_data(gans, n_entries=num_synth)

    # Only real data requested: the loader is the second item returned.
    if num_synth <= 0:
        real_only = utils.loadDataset(train_size=num_real, batch_size=100)
        return real_only[1]

    # Both requested: concatenate the two underlying datasets and reshuffle.
    fake_loader = utils.gen_synth_data(gans, n_entries=num_synth)
    _, real_loader = utils.loadDataset(train_size=num_real, batch_size=100)
    combined = torch.cat((real_loader.dataset, fake_loader.dataset), 0)
    return torch.utils.data.DataLoader(combined, batch_size=100, shuffle=True)
示例#3
0
def main(outputName):
    """Load the tree datasets and train one model per hyperparameter combination.

    The grid is the cross product of the module-level ``miniBatchSize``,
    ``adagradResetNbIter``, ``learningRate`` and ``regularisationTerm``
    sequences; ``nbEpoch`` is shared by every run.

    Args:
        outputName: recording name, forwarded to ``train.train``.
    """
    print("Welcome into RNN implementation, (recording will be on ",
          outputName, ")")

    # Fixed seeds for replication
    random.seed("MetaMind")
    np.random.seed(7)

    print("Parsing dataset, creating dictionary...")
    vocabulary.initVocab()

    # Load the three splits in order, announcing each one as it completes
    splits = (
        ('training', "trees/train.txt", "Training loaded !"),
        ('testing', "trees/test.txt", "Testing loaded !"),
        ('validating', "trees/dev.txt", "Validation loaded !"),
    )
    datasets = {}
    for splitName, treeFile, doneMessage in splits:
        datasets[splitName] = utils.loadDataset(treeFile)
        print(doneMessage)

    print("Datasets loaded !")
    print("Nb of words", vocabulary.vocab.length())

    # Grid search on the hyperparameters (plain train/test, no k-fold)
    for mBS in miniBatchSize:
        for aRNI in adagradResetNbIter:
            for lR in learningRate:
                for rT in regularisationTerm:
                    params = {
                        "nbEpoch": nbEpoch,
                        "learningRate": lR,
                        "regularisationTerm": rT,
                        "adagradResetNbIter": aRNI,
                        "miniBatchSize": mBS,
                    }
                    model = train.train(outputName, datasets, params)

    # TODO: Plot the cross-validation curve
    # TODO: Plot a heat map of the hyperparameters cost to help tuning them ?
    # Validation on the last computed model is currently disabled.

    print("The End. Thank you for using this program!")
示例#4
0
def main(outputName):
    """Run the RNTN hyperparameter grid search on the tree datasets.

    Every combination of the module-level ``miniBatchSize``,
    ``adagradResetNbIter``, ``learningRate`` and ``regularisationTerm``
    values is trained once through ``train.train``.

    Args:
        outputName: recording name, forwarded to ``train.train``.
    """
    print("Welcome into RNTN implementation 0.6 (recording will be on ", outputName, ")")

    # Seeds are fixed so that a run can be replicated
    random.seed("MetaMind")
    np.random.seed(7)

    print("Parsing dataset, creating dictionary...")
    vocabulary.initVocab()

    # Each split is loaded and then announced, in this order
    datasets = {}
    for key, path, announcement in [
        ('training', "trees/train.txt", "Training loaded !"),
        ('testing', "trees/test.txt", "Testing loaded !"),
        ('validating', "trees/dev.txt", "Validation loaded !"),
    ]:
        datasets[key] = utils.loadDataset(path)
        print(announcement)

    print("Datasets loaded !")
    print("Nb of words", vocabulary.vocab.length())

    # Grid search over the hyperparameters (train/test only, no k-fold)
    for mBS in miniBatchSize:
        for aRNI in adagradResetNbIter:
            for lR in learningRate:
                for rT in regularisationTerm:
                    params = dict(
                        nbEpoch=nbEpoch,
                        learningRate=lR,
                        regularisationTerm=rT,
                        adagradResetNbIter=aRNI,
                        miniBatchSize=mBS,
                    )
                    model, errors = train.train(outputName, datasets, params)

    # TODO: Plot the cross-validation curve
    # TODO: Plot a heat map of the hyperparameters cost to help tuning them ?
    # Validation on the last computed model is currently disabled.

    print("The End. Thank you for using this program!")
示例#5
0
def main(args):
    """Extract features, train the chosen classifier, score it and pickle it.

    Args:
        args: parsed command-line namespace providing ``extractor`` (feature
            extraction method) and ``classifier`` (classifier name).
    """
    extractor = args.extractor
    classifier = args.classifier

    # Load dataset into memory
    dataset, labelset = loadDataset()
    # Feature extraction
    feature_vectors = featureExtraction(dataset=dataset, method=extractor)

    # Split dataset
    print("Step 3. Split dataset into training data and test data")
    print("\tSplitting data...")
    X_train, X_test, y_train, y_test = train_test_split(feature_vectors, labelset, test_size=TEST_SIZE)
    print("\tSplit dataset successfully !")
    print("\tThe size of training data: {}".format(len(X_train)))
    print("\tThe size of test data: {}".format(len(X_test)))

    # Train the model with the requested classifier
    model = trainModel(dataset=X_train, label=y_train, classifier=classifier)

    # Test model on test data.
    print("Step 5. Test model on test data")
    print("\tTesing model ...")
    accuracy = model.score(X_test, y_test)
    print("\tThe accuary of model is {} %".format(accuracy*100))

    # Save model
    print("Step 6. Save model into disk")
    print("\tSaving ...")
    with open('model/{}_{}.pkl'.format(extractor,classifier), 'wb') as fid:
        # Persist the model trained above (the previous code dumped the
        # undefined name `gnb`).
        cPickle.dump(model, fid)
    # The template has a single placeholder, so only the classifier name is passed.
    print("\tSave model successully with name {}".format(classifier))
def process_arguments():
    """Parse the command line, load both datasets and bundle everything.

    Returns:
        A ``ClassifierTestParams`` built from the search type, iteration
        count, model directory, evaluations filename, the loaded train/test
        data with their labels, and the job count.
    """
    cli = argparse.ArgumentParser(
        description='Optimize set of different classifiers with meta-parameter optimization')

    # Required positional arguments, registered in command-line order
    for positional, helpText in (
        ('train', 'Train dataset'),
        ('test', 'Test dataset'),
        ('modelDirectory', 'Directory to save best models'),
        ('evaluationsFilename', 'Filename to save  models result'),
    ):
        cli.add_argument(positional, help=helpText)

    # Optional tuning switches
    cli.add_argument('-t', '--type', default='grid', choices=['grid', 'random', 'pso'], help='Search type')
    cli.add_argument('-i', '--iterations', default=-1, type=int, help='Iterations amount for pso and random search')
    cli.add_argument('-j', '--jobs', default=-1, type=int, help='Processes amount for learning')

    parsed = cli.parse_args()
    trainData, trainLabel = loadDataset(parsed.train)
    testData, testLabel = loadDataset(parsed.test)

    return ClassifierTestParams(
        parsed.type,
        parsed.iterations,
        parsed.modelDirectory,
        parsed.evaluationsFilename,
        trainData,
        trainLabel,
        testData,
        testLabel,
        parsed.jobs,
    )
示例#7
0
文件: self_learn.py 项目: dliud/gan
def main():
    """Load ``real_size`` real training images and start self-learning on them."""
    # number of real images per Dataloader to use at the beginning
    real_size = 100
    mnist_files = {
        'image_path': './mnist/train-images-idx3-ubyte',
        'label_path': './mnist/train-labels-idx1-ubyte',
    }
    # Only the first of the two returned values is needed here.
    dataloaders, _ = utils.loadDataset(train_size=real_size,
                                       batch_size=25,
                                       **mnist_files)
    self_learn(dataloaders, train_size=real_size)
示例#8
0
def main(model_size=1, trial=2):
    """Load the default dataset and run ``repeatTrain`` with epoch_len=50, end=100.

    Args:
        model_size: forwarded to ``repeatTrain`` as ``alpha``.
        trial: forwarded to ``repeatTrain`` as ``trial``.
    """
    num_gans = 1  # not used in the visible part of this function
    dataloaders = utils.loadDataset()

    # Previous, longer configuration kept for reference:
    #repeatTrain(dataloaders, trial = trial, epoch_len = 500, end = 2000, alpha = model_size)
    repeatTrain(dataloaders,
                trial=trial,
                epoch_len=50,
                end=100,
                alpha=model_size)
    """
示例#9
0
def main(train_size, model_size, trial):
    """Load ``train_size`` training images and run ``repeatTrain`` on them.

    Args:
        train_size: forwarded to ``utils.loadDataset`` as ``train_size``.
        model_size: forwarded to ``repeatTrain`` as ``alpha``.
        trial: forwarded to ``repeatTrain`` as ``trial``.
    """
    num_gans = 10  # not used in the visible part of this function
    # Two values come back; only `dataloaders` is used in the visible code.
    dataloaders, labeledDataLoader = utils.loadDataset(
        train_size=train_size,
        batch_size=25,
        image_path='./mnist/train-images-idx3-ubyte',
        label_path='./mnist/train-labels-idx1-ubyte')

    repeatTrain(dataloaders,
                trial=trial,
                epoch_len=500,
                end=2000,
                alpha=model_size)
    """
示例#10
0
def main():
    """Load a saved model and report its error on the validation trees.

    The module-level ``inputModel`` is used both to initialise the vocabulary
    and to construct the model.
    """
    print("Welcome into RNTN implementation 0.1")

    # Dictionary initialisation
    print("Loading dictionary...")
    vocabulary.initVocab(inputModel)

    # Validation split
    devTrees = utils.loadDataset("trees/dev.txt")
    print("Validation loaded !")

    # Model construction
    rntn = rntnmodel.Model(inputModel)

    print("Computation validation...")
    print("Validation error: ", rntn.computeError(devTrees, True))
示例#11
0
 def train_step(test_xs):
     """Run the training loop for ``flags.num_epoch`` epochs.

     Relies on names from the enclosing scope that are not defined here:
     ``flags``, ``sess``, ``ae``, ``train_op``, ``global_step``,
     ``train_summary_op``, ``train_summary_writer``, ``saver``,
     ``checkpoint_prefix``, ``partition`` and ``corrupt``.

     Args:
         test_xs: data that is only converted with ``np.asarray`` and
             returned; the training loop itself never reads it.

     Returns:
         ``test_xs`` as a NumPy array.
     """
     batch_size = flags.batch_size
     # batch_size = 50
     n_epochs = flags.num_epoch
     # 0/1 mask drawn once and reused for every batch; each entry is 1 with
     # probability 1 - flags.corrupt_prob.
     mask = np.random.binomial(
         1, 1 - flags.corrupt_prob,
         (int(np.round(batch_size * flags.validation)) + 1, 225))
     # print(mask[:5])
     for epoch_i in range(n_epochs):
         # print dataset_train.shape[1] // batch_size
         # The dataset is reloaded at the start of every epoch.
         datasets = utils.loadDataset(batch_size=batch_size,
                                      max=flags.max,
                                      dataset_dir=flags.datasetPath)
         f = 0  # never read afterwards
         for dataset in datasets:
             dataset_train, dataset_test = partition(dataset,
                                                     shuffle=False)
             # Subtract the mean over axis 1 from every column of the batch.
             mean_img = np.mean(dataset_train, axis=1)
             dataset_train = np.array(
                 [img - mean_img for img in dataset_train.T])
             dataset_train = dataset_train.T
             # corrupt() returns two arrays: the first is fed to ae.x_, the
             # second to ae.x.
             dataset_train_, dataset_train = corrupt(dataset_train,
                                                     mask=mask)
             _, score, step, summaries = sess.run(
                 [train_op, ae.score, global_step, train_summary_op],
                 feed_dict={
                     ae.x: dataset_train,
                     ae.x_: dataset_train_
                 })
             current_step = tf.train.global_step(sess, global_step)
             # Progress line every 100 global steps.
             if current_step % 100 == 0:
                 print("epoch:{} step:{} score:{}".format(
                     epoch_i, step, score))
             train_summary_writer.add_summary(summaries, step)
             # Checkpoint every 1000 global steps.
             if current_step % 1000 == 0:
                 path = saver.save(sess,
                                   checkpoint_prefix,
                                   global_step=current_step)
                 print("Saved model checkpoint to {}\n".format(path))
         # Per-epoch evaluation on test_xs is currently disabled:
         # score, step, summaries, output, W= sess.run([ae.score, global_step, dev_summary_op, ae.output, ae.encoder], feed_dict={
         #         ae.x: test_xs,
         #         ae.x_: test_xs})
         # print("evaluation:\nscore:{}".format(score))
     test_xs = np.asarray(test_xs)
     print("Testxs : " + str(test_xs.shape))
     return test_xs
示例#12
0
def main():
    """Evaluate the model built from ``inputModel`` on the validation set.

    Initialises the vocabulary from the module-level ``inputModel``, loads
    "trees/dev.txt", builds the model and prints its validation error.
    """
    for banner in ("Welcome into RNTN implementation 0.1",
                   "Loading dictionary..."):
        print(banner)

    # Dictionary initialisation
    vocabulary.initVocab(inputModel)

    # Loading dataset
    devSet = utils.loadDataset("trees/dev.txt")
    print("Validation loaded !")

    # Creating the model
    net = rntnmodel.Model(inputModel)

    print("Computation validation...")
    devError = net.computeError(devSet, True)
    print("Validation error: ", devError)
def getFeatures(numWordsToUse):
    """Build sparse features and binary labels from the aggregated corpus.

    Only rows whose shifted sentiment score is 0 or 4 are kept; 4 becomes
    label 1 and 0 becomes label 0. The module-level ``popularWords`` is
    overwritten with the value returned by the feature engineering step.

    Args:
        numWordsToUse: forwarded to ``utils.nlpFeatureEngineering``.

    Returns:
        A ``(sparseFeatures, sentiment)`` pair.
    """
    global popularWords

    _, _, allRows = utils.loadDataset('twitterCorpus/aggregatedCorpusCleaned.csv',1,2)

    tweets = []
    sentiment = []
    for position, row in enumerate(allRows):
        # the first row is the header
        if position == 0:
            continue

        # the aggregatedCorpus has sentiment scores from 1 - 5, while STS has scores from 0 - 4
        stsScore = int(row[0]) - 1

        # only fairly extreme sentiment is kept
        if stsScore not in (0, 4):
            continue

        tweets.append(row[2])
        sentiment.append(1 if stsScore == 4 else 0)

    tokenizedTweets, cleanedSentiment = utils.tokenize(tweets, sentiment)

    formattedTweets, sentiment, popularWords = utils.nlpFeatureEngineering(
        tokenizedTweets, cleanedSentiment,0,numWordsToUse,'counts'
    )

    # the list of dictionaries becomes a sparse matrix
    return dv.fit_transform(formattedTweets), sentiment
def getFeatures(numWordsToUse):
    """Turn the aggregated Twitter corpus into sparse features plus labels.

    Rows are kept only when their sentiment, shifted down by one, is 0 or 4;
    those map to labels 0 and 1 respectively. As a side effect the
    module-level ``popularWords`` is replaced.

    Args:
        numWordsToUse: forwarded to ``utils.nlpFeatureEngineering``.

    Returns:
        A ``(sparseFeatures, sentiment)`` pair.
    """
    global popularWords

    loaded = utils.loadDataset(
        'twitterCorpus/aggregatedCorpusCleaned.csv', 1, 2)
    allTweets, allTweetsSentiment, allRows = loaded

    # Consume the header row up front so the loop only sees data rows.
    dataRows = iter(allRows)
    next(dataRows, None)

    # Labels for the two extreme shifted scores; everything else is dropped.
    labelForScore = {0: 0, 4: 1}

    tweets = []
    sentiment = []
    for row in dataRows:
        # the aggregatedCorpus has sentiment scores from 1 - 5, while STS has scores from 0 - 4
        shifted = int(row[0]) - 1
        if shifted in labelForScore:
            tweets.append(row[2])
            sentiment.append(labelForScore[shifted])

    tokenizedTweets, cleanedSentiment = utils.tokenize(tweets, sentiment)

    engineered = utils.nlpFeatureEngineering(
        tokenizedTweets, cleanedSentiment, 0, numWordsToUse, 'counts')
    formattedTweets, sentiment, popularWords = engineered

    # transform list of dictionaries into a sparse matrix
    sparseFeatures = dv.fit_transform(formattedTweets)
    return sparseFeatures, sentiment
示例#15
0
        model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.2))
    elif networkType == GRU_LABEL:
        model.add(GRU(150, dropout=0.2, recurrent_dropout=0.2))
    elif networkType == MLP_LABEL:
        model.add(Flatten())
    model.add(Dense(50, activation='relu'))
    model.add(Dense(27, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


if __name__ == '__main__':
    # Script entry point: load the dataset, encode it, split it and build the network.
    arguments = parseArgs()
    dataset = utils.loadDataset(arguments.reprocessDataset)
    # Select the tweet and label columns from the loaded dataset.
    xData, yData = dataset[constants.TWEET_COLUMN], dataset[
        constants.LABEL_COLUMN]

    # The same vocabulary size is given to the Tokenizer and to buildModel.
    vocabularySize = 13000
    xEncoder, yEncoder = Tokenizer(
        num_words=vocabularySize), preprocessing.OneHotEncoder()

    print('Encoding and splitting xData, yData')
    xDataEncoded, yDataEncoded = encodeX(xEncoder,
                                         xData), encodeY(yEncoder, yData)
    # No test_size is passed, so train_test_split uses its default split.
    xTrain, xValid, yTrain, yValid = model_selection.train_test_split(
        xDataEncoded, yDataEncoded)

    model = buildModel(vocabularySize, arguments.networkType)
    print(model.summary())
示例#16
0
    # Maximum number of historical records to keep for predicting
    time_window_threshold = 30

    refresh_time_interval = 15

    if args.mode == 'train':
        logging.info('Mode: Training')
        logging.info('Evaluation: ' +
                     str(True if args.evaluate in ('True', 'true') else False))

        # Initialization of training params
        # Time interval to resample data
        sample_time = '60S'

        logging.info('Loading train & validation dataset')
        df = loadDataset(trainset)

        # Resample every 15 sec, because prometheus & amari exporter have different timestamps
        df = df.resample('15S', closed='right', label='left').mean()

        # Fill NaNs in the train df with 0s (NOTE: fillna alone does not replace Infs)
        df.fillna(0, inplace=True)

        logging.info('Saving normalization values')
        normalization_stats = saveNormalizationStats(df, cols_to_normalize)

        saveDictJson(normalization_stats, stats_json)

        logging.info('Normalizing train & validation data')
        for col in cols_to_normalize:
            df[col + '_normalized'] = normalizeFeature(
示例#17
0
from Net import Net, test_model
from utils import confusion, F1_score, loadDataset, saveNNParas
import time

# Rebuild the network with the dimensions below and restore its saved weights.
feature_num = 11
hidden_num = 30
output_num = 3

load_net = Net(feature_num, hidden_num, output_num)
load_net.load_state_dict(torch.load('net_model_genre.pt'))
# Alternative checkpoint, currently disabled:
#load_net.load_state_dict(torch.load('net_model_subjective_rating.pt'))
# Switch the network to evaluation mode.
load_net.eval()

# Loading testing dataset to evaluate new network.
# NOTE(review): both the "train" and "test" variables are loaded from
# 'testing' -- confirm this is intentional.
x_train, y_train = loadDataset('testing')
x_test, y_test = loadDataset('testing')

# Loading the vector information: the first three spreadsheet columns are
# exposed as 'row', 'col' and 'vector'.
vectors = pd.read_excel('vector_angle_sample.xls', header=None)
raw_df = pd.DataFrame({
    'row': vectors.iloc[:, 0],
    'col': vectors.iloc[:, 1],
    'vector': vectors.iloc[:, 2]
})

# Sort by the vector value in ascending order, then collect the distinct
# row and column entries in that sorted order.
increase_res = raw_df.sort_values('vector', ascending=True)
unique_row = increase_res.row.unique()
unique_col = increase_res.col.unique()
示例#18
0
    for i in xrange(stateNum):
       for j in xrange(stateNum):
           fout.write(str(A[i,j]))
           fout.write(" ")
       fout.write("\n")
    
    

def dataCorrectRate(Set):
    correct=0
    totalNum=0
    for i in xrange(len(Set[0])):
        maxNum=0
        maxState=-1
        for j in xrange(len(Set[0][i])):    
            if(Set[0][i][j]>maxNum):
                maxNum=Set[0][i][j]
                maxState=j
        if(Set[1][i]==maxState):
            correct=correct+1
        totalNum+=1
    print "correct rate:",float(correct)/float(totalNum)
    
if __name__ == '__main__':
    # Script entry point: load the pickled dataset, report the correct rate
    # of its first element, then train the HMM on the whole dataset.
    dataSet=loadDataset("../../pkl/t26v31.pkl",3)
    
    # Alternative dataset location, currently disabled:
    #dataSet=loadDataset("/home/roylu/share/DNNResult/t23.7v29.95/t23.7v29.95.pkl",3)
    dataCorrectRate(dataSet[0])
    trainHMM(dataSet)
    # Result extraction is currently disabled:
    #get_result("A_with_error:0.281822988462")
示例#19
0
def run():
    """Build, optionally train, then evaluate the model selected in ``config``.

    Steps: load the datasets, wrap them in DataLoaders, instantiate the model
    named by ``config.modelName``, set up AdamW with a linear warmup
    schedule, train for ``config.EPOCHS`` epochs when ``config.dotrain`` is
    set (saving the weights afterwards), reload the saved weights, run the
    evaluation, pickle the outputs followed by the targets, and report the
    metrics.

    Raises:
        ValueError: if ``config.modelName`` is neither "BertBase" nor
            "SciBert".
    """
    trainDataset, testDataset, labelGenerator = utils.loadDataset()

    # Making DataLoaders
    trainDataLoader = torch.utils.data.DataLoader(
        trainDataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=4,
        pin_memory=True)
    testDataLoader = torch.utils.data.DataLoader(
        testDataset, batch_size=config.TEST_BATCH_SIZE, num_workers=1)

    totalNOsOfLabels = len(labelGenerator.classes_)

    device = torch.device(config.DEVICE)

    # Defining Model
    print("Making model:- ", config.modelName)
    if config.modelName == "BertBase":
        citemodel = model.BERTBaseUncased(numOfLabels=totalNOsOfLabels,
                                          dropout=config.DROPOUT)
    elif config.modelName == "SciBert":
        citemodel = model.SciBertUncased(numOfLabels=totalNOsOfLabels,
                                         dropout=config.DROPOUT)
    else:
        # Fail with a clear message instead of an UnboundLocalError on the
        # first use of the model below.
        raise ValueError(
            "Unsupported config.modelName: {!r}".format(config.modelName))
    citemodel.to(device)

    param_optimizer = list(citemodel.named_parameters())
    # Parameters whose name contains one of these substrings get no weight
    # decay; every other parameter uses a decay of 0.01.
    # NOTE(review): earlier comments said LayerNorm.weight should be decayed
    # and mentioned a decay of 0.1; neither matches the values used here --
    # confirm which is intended.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(trainDataLoader) * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_train_steps * config.WARMUP_PROPORTION,
        num_training_steps=num_train_steps)

    model_path = config.MODEL_SAVED.format(config.modelName)

    if config.dotrain:
        print('In Training')
        for epoch in range(config.EPOCHS):
            trainingLoss = engine.train(trainDataLoader, citemodel, optimizer,
                                        device, scheduler)
            print("Epoch: ", epoch, " Loss: ", trainingLoss, '\n')

        # Saving the model
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        torch.save(citemodel.state_dict(), model_path)
        print('Model is saved at: ', model_path)

    # Evaluating the model
    print("Loading the model")
    # map_location lets weights saved on another device load onto `device`.
    citemodel.load_state_dict(torch.load(model_path, map_location=device))
    outputs, targets = engine.eval(testDataLoader, citemodel, device)

    # Saving the results with corresponding targets
    predictions_path = config.PREDICTIONS_PATH.format(config.modelName)
    os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
    with open(predictions_path, 'wb') as f:
        pickle.dump(outputs, f)  # First saved the predicted outputs
        pickle.dump(targets, f)  # Then saved the corresponding targets

    print('Starting Evaluation...')
    utils.metric(outputs, targets)
示例#20
0
def optimize_parameters(
    parameters, dataset, train_steps=1000, pretrain_steps=250
):
    """
    Grid-search the hyperparameters of the ARL model with 5-fold
    cross-validation and return the best combination.

    args:
        parameters: a dictionary with the hyperparameters and their values,
                    e.g. {'batch_size': [32, 64, 256], [...]}. The values are
                    consumed positionally, in dictionary order, as
                    (batch_size, learner_lr, adversary_lr).
        dataset: name of the dataset ([toy_data, uci_adult, compas, law]).
        train_steps: forwarded to ``train_for_n_iters``.
        pretrain_steps: forwarded to ``train_for_n_iters``.

    returns:
        A tuple ``(results, best_auc, best_step)`` where ``results`` maps
        "batch_size", "lr_learner" and "lr_adversary" to the best values.

    raises:
        ValueError: if the parameter grid contains no combination.
    """

    # Load the training data.
    train_dataset = loadDataset(
        dataset=dataset,
        train_or_test="train",
        embedding_size=32,
    )

    # Create the default model parameters.
    model_params = {
        "embedding_size": train_dataset.categorical_embedding_sizes,
        "n_num_cols": len(train_dataset.mean_std.keys()),
        "learner_hidden_units": [64, 32],
        "adversary_hidden_units": [32],
        "batch_size": None,
    }

    lr_params = {"learner": None, "adversary": None}

    cross_val = KFold(n_splits=5)
    steps = None

    # Create a defaultdict for the results.
    params2aucs = defaultdict(list)

    # Build every combination once; the list is used both for its length and
    # for the iteration below.
    options = list(itertools.product(*parameters.values()))
    n_options = len(options)
    if not options:
        # Without this the failure surfaces later as an opaque argmax error.
        raise ValueError("parameters must yield at least one combination")

    for i, (batch_size, learner_lr, adversary_lr) in enumerate(options, 1):
        iter_start = time.time()
        model_params["batch_size"] = batch_size
        lr_params["learner"] = learner_lr
        lr_params["adversary"] = adversary_lr
        option_key = (batch_size, learner_lr, adversary_lr)

        print(
            f"--- ({i}/{n_options}) batch_size: {batch_size}, "
            f"learner_lr: {learner_lr}, adversary_lr: {adversary_lr}"
        )

        # 5-fold cross-validation
        for train_index, test_index in cross_val.split(train_dataset):

            # Get the performance of the model
            metrics = train_for_n_iters(
                train_dataset.get_split(train_index),
                train_dataset.get_split(test_index),
                model_params,
                lr_params,
                average_over=5,
                train_steps=train_steps,
                pretrain_steps=pretrain_steps,
                print_loss=False,
            )
            params2aucs[option_key].append(metrics.auc_avg)
            # The step axis of the most recent fold is reused for reporting.
            steps = metrics.steps
        iter_stop = time.time()

        # Mean AUC per step across the folds of this combination.
        mean_best_auc = np.mean(params2aucs[option_key], axis=0)
        best_auc_idx = np.argmax(mean_best_auc)

        print(
            f"\t took {iter_stop - iter_start:.0f} seconds | "
            f"best AUC is {mean_best_auc[best_auc_idx]:.3f} on step "
            f"{steps[best_auc_idx]}"
        )

    # Average the folds.
    params2aucs = {
        option: np.mean(aucs, axis=0) for option, aucs in params2aucs.items()
    }

    # Find the highest AUC over all (combination, step) pairs.
    params = list(params2aucs.keys())
    aucs = np.array(list(params2aucs.values()))

    param_idx, step_idx = np.unravel_index(np.argmax(aucs), aucs.shape)
    best_auc = aucs[param_idx, step_idx]
    best_params = params[param_idx]
    best_step = steps[step_idx]

    # Return the results.
    results = {
        "batch_size": best_params[0],
        "lr_learner": best_params[1],
        "lr_adversary": best_params[2],
    }
    return results, best_auc, best_step
示例#21
0
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split

import utils
import trainClassifiers

from sentimentCorpora import nltkMovieReviews
from sentimentCorpora import stsTwitterCorpus
from sentimentCorpora import atcTwitterCorpus
from sentimentCorpora import nltkTwitterCorpus
from sentimentCorpora import nltkTwitterNoEmoticonsCorpus


# Load the "training" corpus and tokenize it.
# NOTE(review): the meaning of the second loadDataset argument (10 here, 1 for
# the test file) is not visible in this file - confirm against utils.
trainingTweets, trainingSentiment, allRows = utils.loadDataset(
    'training.1600000.processed.noemoticon.csv', 10)
trainingTweets, trainingSentiment = utils.tokenize(
    trainingTweets, trainingSentiment)

# Load and tokenize the test corpus in the same way.
testTweets, testSentiment, testRows = utils.loadDataset(
    'testdata.manual.2009.06.14.csv', 1)
testTweets, testSentiment = utils.tokenize(testTweets, testSentiment)

# Collapse the raw labels into one binary target: the label '4' maps to 1 and
# every other label maps to 0, so the classifier sees a single 0/1 problem
# instead of two categories it does not know are mutually exclusive.
cleanedTrainingSentiment = []
for score in trainingSentiment:
    cleanedTrainingSentiment.append(1 if score == '4' else 0)
示例#22
0
    def pretrain(self, batch_size, num_epoch, sess):
        """Pre-train the network one layer at a time, then start fine-tuning.

        For each index ``i`` of ``self.dimensions`` an RMSProp optimizer is
        built that minimises ``self.scores[i]`` and is run for ``num_epoch``
        passes over the batches produced by ``utils.loadDataset``. Progress is
        printed every 100 steps; every 2000 steps a summary and a checkpoint
        are written below ``runs/<unix timestamp>/`` (relative to the current
        directory). When all layers are done ``self.finetuning`` is called
        with the same session.

        :param batch_size: batch size passed to ``utils.loadDataset`` and on
            to ``finetuning``.
        :param num_epoch: number of passes over the dataset per layer.
        :param sess: TensorFlow session in which every op is run.
        :return: None.
        """
        # The saver only tracks the variables that exist at this point; the
        # per-layer global_step / optimizer variables created below are not
        # part of the checkpoints.
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(
            os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        params_prefix = os.path.join(out_dir, 'params')
        os.makedirs(params_prefix)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # Build the path with os.path.join: a hard-coded backslash only works
        # on Windows and elsewhere yields a file literally named
        # "params\params.txt" outside the params directory.
        params_path = os.path.join(params_prefix, "params.txt")
        with open(params_path, 'w') as params_file:
            params_file.writelines(flags)

        sess.run(tf.global_variables_initializer())
        for i in range(len(self.dimensions)):
            # Each layer gets its own step counter and optimizer.
            global_step = tf.Variable(0, trainable=False, name="global_step")
            learning_rate = 0.01
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  momentum=self.momentum)
            grads_and_vars = optimizer.compute_gradients(self.scores[i])
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            # NOTE(review): this re-initialises *every* global variable, so
            # the weights trained for earlier layers are reset before each new
            # layer starts. Kept as in the original; confirm whether only the
            # newly created variables should be initialised here.
            sess.run(tf.global_variables_initializer())

            score_summary = tf.summary.scalar("score", self.scores[i])
            train_summary_op = tf.summary.merge([score_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            for j in range(num_epoch):
                for batch in utils.loadDataset(batch_size,
                                               max=flags.max,
                                               dataset_dir=flags.datasetPath):
                    # Bernoulli mask with keep probability
                    # 1 - flags.corrupt_prob, fed to self.mask.
                    mask_t = np.random.binomial(1, 1 - flags.corrupt_prob,
                                                batch.shape)
                    self.batch = batch
                    feed = {self.input_x: batch, self.mask: mask_t}

                    _, score, step = sess.run(
                        [train_op, self.scores[i], global_step],
                        feed_dict=feed)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % 100 == 0:
                        time_str = datetime.datetime.now().isoformat()
                        print("{}: traning Layer_{} ".format(time_str, i) +
                              "epoch:%d " % j + "step: %d" % step +
                              "  score: {}".format(score))

                    if current_step % 2000 == 0:
                        # NOTE(review): train_op is run a second time on the
                        # same batch here, so checkpoint steps perform an
                        # extra update. Kept as in the original.
                        _, score, step, summaries = sess.run(
                            [train_op, self.scores[i], global_step,
                             train_summary_op],
                            feed_dict=feed)
                        train_summary_writer.add_summary(summaries, step)
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))

        self.finetuning(batch_size, num_epoch, sess, saver, out_dir)
示例#23
0
    def finetuning(self, batch_size, num_epoch, sess, saver, out_dir):
        """Build the decoder stack, fine-tune end to end and fit a one-class SVM.

        Starting from ``self.layer_output[len(self.dimensions) - 1]`` a chain
        of sigmoid decoder layers is built whose weights are the transposes of
        ``self.Ws``. The squared reconstruction error against ``self.input_x``
        (plus an L2 term) is minimised with plain gradient descent for
        ``num_epoch`` passes over ``utils.loadDataset``. Afterwards the whole
        dataset is pushed through the network, a ``svm.OneClassSVM`` is fitted
        on ``encodes[3]`` and pickled to ``./svm.model``.

        :param batch_size: batch size passed to ``utils.loadDataset``.
        :param num_epoch: number of passes over the dataset.
        :param sess: TensorFlow session in which every op is run.
        :param saver: ignored - it is replaced by a freshly created
            ``tf.train.Saver`` before it is ever used.
        :param out_dir: directory below which checkpoints and summaries are
            written.
        :return: None.
        """

        current_input = self.layer_output[len(self.dimensions) - 1]
        self.ft_losses = [tf.constant(0.0) for _ in self.dimensions]
        # NOTE(review): the literal indices 2 and 3 and the size 225 below
        # assume exactly four layers and (presumably) a 15x15 = 225 input, as
        # in the commented-out plotting code further down - confirm.
        for layer_i, dimension in enumerate(self.dimensions):
            print(2 - layer_i)
            if layer_i == 3:
                n_output = 225
            else:
                n_output = self.dimensions[2 - layer_i]
                print(n_output)
            with tf.name_scope("finetuning_decoder_%i" % layer_i):
                # Decoder weights reuse the transposed encoder weights; only
                # the bias is a new variable.
                W = tf.transpose(self.Ws[3 - layer_i], name="W")
                b = tf.Variable(tf.constant(0.1, shape=[n_output]), name="b")
                self.out_put = tf.nn.sigmoid(tf.matmul(current_input, W) + b)
                self.ft_losses[layer_i] += tf.nn.l2_loss(W)
                current_input = self.out_put
        with tf.name_scope('fn_score'):
            loss = tf.pow(self.out_put - self.input_x, 2)
            # NOTE(review): layer_i is the loop variable left over from the
            # loop above, so only the LAST layer's L2 term enters the score.
            # Confirm whether the sum over all self.ft_losses was intended.
            self.score = tf.reduce_sum(
                loss,
                name="score") + self.ft_losses[layer_i] * flags.l2_reg_lambda
        global_step = tf.Variable(0, trainable=False, name="global_step")
        learning_rate = 0.001
        # learning_rate = tf.train.exponential_decay(0.01, global_step, flags.max / flags.batch_size, 0.98, staircase=True)
        # The saver argument is discarded here in favour of a new Saver that
        # also covers the variables created above.
        saver = tf.train.Saver(tf.global_variables())
        # optimizer = tf.train.RMSPropOptimizer(learning_rate, momentum=self.momentum)
        # optimizer = tf.train.AdamOptimizer(learning_rate)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(self.score)
        finetune_op = optimizer.apply_gradients(grads_and_vars,
                                                global_step=global_step)
        # NOTE(review): this (and the second initializer run further down)
        # initialises ALL global variables, which appears to discard the
        # weights learned during pretraining - confirm this is intended.
        sess.run(tf.initialize_all_variables())
        print("Writing to {}\n".format(out_dir))

        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # grad_summaries = []
        # for g, v in grads_and_vars:
        #     if g is not None:
        #         grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
        #         sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
        #         grad_summaries.append(grad_hist_summary)
        #         grad_summaries.append(sparsity_summary)
        # grad_summaries_merged = tf.summary.merge(grad_summaries)
        score_summary = tf.summary.scalar("score", self.score)

        finetune_summary_op = tf.summary.merge([score_summary])
        finetune_summary_dir = os.path.join(out_dir, "summaries", "finetune")
        finetune_summary_writer = tf.summary.FileWriter(
            finetune_summary_dir, sess.graph)
        # Second full initialisation; no training happens between the two.
        sess.run(tf.global_variables_initializer())
        print("Starting finetuning")
        for j in range(num_epoch):
            for batch in utils.loadDataset(batch_size,
                                           max=flags.max,
                                           dataset_dir=flags.datasetPath):
                # Bernoulli mask with keep probability 1 - flags.corrupt_prob.
                mask_t = np.random.binomial(1, 1 - flags.corrupt_prob,
                                            batch.shape)
                # mean_img = np.mean(batch, axis=1)
                # batch = np.array([img - mean_img for img in batch.T])
                # batch = batch.T
                _, score, step, summaries = sess.run([
                    finetune_op, self.score, global_step, finetune_summary_op
                ],
                                                     feed_dict={
                                                         self.input_x: batch,
                                                         self.mask: mask_t
                                                     })
                current_step = tf.train.global_step(sess, global_step)
                time_str = datetime.datetime.now().isoformat()
                # Progress line every 100 steps.
                if current_step % 100 == 0:
                    print("{}: finetuning  step: {}".format(time_str, step) +
                          "  score: {}".format(score))

                # Summary + checkpoint every 10000 steps.
                if current_step % 10000 == 0:
                    finetune_summary_writer.add_summary(summaries, step)
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
        # with graph.as_default():
        #     with sess.as_default():
        # n_examples is only referenced by the commented-out plotting code.
        n_examples = 15
        with tf.device('/cpu:0'):
            test_xs = utils.load_whole_dataset(1000000, flags.datasetPath)
            # binomial(1, 1, ...) is all ones, i.e. no input corruption here.
            mask = np.random.binomial(1, 1, test_xs.shape)
        score, recon, encodes = sess.run(
            [self.score, self.out_put, self.layer_output],
            feed_dict={
                self.input_x: test_xs,
                self.mask: mask
            })
        # fig, axs = plt.subplots(2, n_examples, figsize=(10, 2))
        # for example_i in range(n_examples):
        #     axs[0][example_i].imshow(
        #         # np.reshape(test_xs[example_i, :], (28, 28)))
        #         np.reshape(test_xs[example_i, :], (15, 15)))
        #     axs[1][example_i].imshow(
        #         # np.reshape([recon[example_i, :] + mean_img], (28, 28)))
        #         np.reshape([recon[example_i, :]], (15, 15)))
        # print ('Plot complete now showing...')
        # Fit the one-class SVM on the output of layer index 3 and persist it
        # to the current working directory.
        clf = svm.OneClassSVM(kernel='rbf', gamma='auto', nu=1e-3)
        clf.fit(encodes[3])
        with open('./svm.model', 'wb') as m:
            pickle.dump(clf, m)
示例#24
0
        'If specified, reads and processes the dataset again. ' +
        'Else reads an already processed dataset from ' +
        constants.CLASSIFICATION_DATA_PATH)
    return parser.parse_args(sys.argv[1:])


def printTopics(model):
    """Print the words of up to five topics of ``model``, five words each.

    ``model`` must provide ``print_topics(num_topics, num_words)`` returning
    ``(index, description)`` pairs; topics are shown numbered from 1.
    """
    for topic_index, description in model.print_topics(num_topics=5,
                                                       num_words=5):
        topic_number = topic_index + 1
        print('Words in Topic {}:\n {}'.format(topic_number, description))


if __name__ == '__main__':
    arguments = parseArgs()
    dataset = utils.loadDataset(arguments.reprocessDataset,
                                classification=False,
                                splitWords=True)

    # Creating dictionary from dataset, where each unique term is assigned an index
    dictionary = corpora.Dictionary(dataset)

    # Converting list of documents into Bag of Words using dictionary
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in dataset]

    # Training models on the document term matrix
    modelList = [
        LdaModel(doc_term_matrix, num_topics=10, id2word=dictionary, passes=2),
        LsiModel(doc_term_matrix, num_topics=10, id2word=dictionary)
    ]

    for model in modelList:
示例#25
0
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split

import utils
import trainClassifiers

from sentimentCorpora import nltkMovieReviews
from sentimentCorpora import stsTwitterCorpus
from sentimentCorpora import atcTwitterCorpus
from sentimentCorpora import nltkTwitterCorpus
from sentimentCorpora import nltkTwitterNoEmoticonsCorpus

# Read the "training" corpus and tokenize it.
# NOTE(review): the meaning of the second loadDataset argument (10 here, 1 for
# the test file) is not visible in this file - confirm against utils.
trainingTweets, trainingSentiment, allRows = utils.loadDataset(
    'training.1600000.processed.noemoticon.csv', 10)
trainingTweets, trainingSentiment = utils.tokenize(
    trainingTweets, trainingSentiment)

# Read and tokenize the test corpus.
testTweets, testSentiment, testRows = utils.loadDataset(
    'testdata.manual.2009.06.14.csv', 1)
testTweets, testSentiment = utils.tokenize(testTweets, testSentiment)

# Turn the raw labels into a single binary target: '4' becomes 1, anything
# else becomes 0, so the two categories are explicitly mutually exclusive.
cleanedTrainingSentiment = []
for score in trainingSentiment:
    cleanedTrainingSentiment.append(1 if score == '4' else 0)
示例#26
0

class Logger(object):
    """Tee stream: everything written goes to the terminal and to a log file.

    Meant to be assigned to ``sys.stdout``. It therefore offers ``flush`` in
    addition to ``write``, because ``print(..., flush=True)`` and explicit
    ``sys.stdout.flush()`` calls expect a stream to provide it.
    """

    def __init__(self, logFilename):
        # Remember the stream that is current at construction time so output
        # still reaches the console after sys.stdout has been replaced.
        self.terminal = sys.stdout
        # Append mode: earlier runs in the same log file are preserved.
        self.log = open(logFilename, "a")

    def write(self, message):
        """Write ``message`` to both the terminal and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams."""
        self.terminal.flush()
        self.log.flush()


if __name__ == '__main__':
    # Train a new DNN (or load a pickled one) and write validation / test
    # results plus train / validation probabilities to the files named in P.
    P = dnnUtils.Parameters(setting)
    # Call form of print: identical output on Python 2 for a single argument
    # and valid syntax on Python 3.
    print(P.outputFilename)
    datasets = utils.loadDataset(filename=P.datasetFilename, totalSetNum=3)

    if not USE_EXIST_MODEL:
        # Mirror everything printed during training into the log file.
        sys.stdout = Logger(P.logFilename)
        bestModel = dnn.trainDNN(datasets, P)
        # NOTE(review): this path is computed but never used - the model is
        # pickled to P.bestModelFilename instead. Confirm which is intended.
        bestModelFilename = '../model/' + P.outputFilename + '.model'
        utils.makePkl(bestModel, P.bestModelFilename)
    else:
        # TODO use filename to build P
        bestModelFilename = sys.argv[2]
        bestModel = utils.loadPkl(bestModelFilename)

    # datasets[0] / [1] / [2] are used as train / validation / test below.
    dnn.getResult(bestModel, datasets[1], P, 'valid', P.validResultFilename)
    dnn.getResult(bestModel, datasets[2], P, 'test', P.testResultFilename)
    dnn.getProb(bestModel, datasets[0], P.trainProbFilename, P)
    dnn.getProb(bestModel, datasets[1], P.validProbFilename, P)
示例#27
0
        cmdString = int(raw_input("\nDEC> "))

        # Exit
        if cmdString == 0:
            import sys
            sys.exit(0)

        # Image preprocessing and segmentation
        elif cmdString == 1:
            imgPath = str(raw_input('\nInsert the path of the folder images: '))
            imS.doSegmentation(imgPath)

        # Model Fitting
        elif cmdString == 2:
            datasetPath = str(raw_input('Insert the path of dataset: '))
            dataset, datasetfileNames = loadDataset(datasetPath)
            epochs = int(raw_input('Insert number of epochs: '))
            batchSize = int(raw_input('Insert the batch size: '))
            dec.fit('adam', 'mean_squared_error', 'mae', dataset, epochs, batchSize)
            dec.save_model()

        # Clustering
        elif cmdString == 3:
            dec.load_model()
            testSetPath = str(raw_input('Insert the path of the test set: '))
            testSet, testSetFileNames = loadDataset(testSetPath)

            print '\n', ' ' * 3, '0) KMeans'

            clustType = int(raw_input("\nDEC> "))
示例#28
0
def filler():
    """Load the 3-way split from ``../pkl/small_data.pkl`` and fill each part.

    :return: tuple ``(train, valid, test)`` holding the ``fillerCore`` result
        for elements 0, 1 and 2 of the loaded dataset, in that order.
    """
    splits = loadDataset("../pkl/small_data.pkl", 3)
    return tuple(fillerCore(splits[position]) for position in (0, 1, 2))
# Identifier of this model variant.
# NOTE(review): how MODEL_NAME is consumed is not visible here - confirm.
MODEL_NAME = 'base'
###########################################################

# Hyper-parameters. Only MAX_LENGTH (the length of both input sequences) is
# used in the visible part of build_model; the remaining names are presumably
# consumed further down - confirm before changing them.
REGULARIZE = 0.0001
GRU_REGULARIZE = 0.0005
MAX_LENGTH = 30
DROPOUT = 0.2
HIDDEN_RNN_UNITS = 192
HIDDEN_DENSE_UNITS = 2048
LEARNING_RATE = 0.001
EPOCHS = 100
BATCH_SIZE = 64

## Load datasets
# NOTE(review): the training part unpacks (..., features, y) while the
# validation part unpacks (..., y, features). Verify this matches the order
# in which loadDataset actually returns its values.
train_x1, train_x2, train_features, train_y, valid_x1, valid_x2, valid_y, valid_features = loadDataset(
)
print('Dataset Loaded')

# Wall-clock reference taken after the datasets are loaded.
start_time = time.time()

## Load embedding matrix (and the vocabulary size used by the Embedding layer)
(embedding_matrix, vocab_size) = load_embedding_matrix()


## Define Model
def build_model():
    input_1 = Input(shape=(MAX_LENGTH, ))
    input_2 = Input(shape=(MAX_LENGTH, ))

    e = Embedding(vocab_size,
                  300,
def _clean_eval_frame(df):
    """Back-fill missing values and replace +/-inf with 0.0, in place."""
    df.bfill(inplace=True)
    df.replace([np.inf, -np.inf], 0.0, inplace=True)


def _windowed_rmse(model, df, cols, window, n_steps):
    """Return one RMSE dict per sliding window of ``window`` rows of ``df``."""
    n_features = len(cols)
    rmse_dicts = []
    for sample_start in range(0, len(df) - window):
        sample = df.iloc[sample_start:sample_start + window]

        # Select required columns and turn the window into model sequences
        dataset = sample[cols].to_numpy()
        X_test, y_test = split_sequences(dataset, n_steps)
        X_test = X_test.reshape((len(X_test), n_steps, n_features))

        yhat = model.predict(X_test, verbose=0)
        rmse_dicts.append(printPredictionErrors(y_test, yhat))
    return rmse_dicts


def _save_rmse_plot(series, title, filename):
    """Plot ``(values, color, label)`` series against window index and save."""
    for values, color, label in series:
        plt.plot(list(range(len(values))), values, color=color, label=label)
    plt.title(title)
    plt.xlabel('# of Sequence')
    plt.ylabel('RMSE')
    plt.legend()
    plt.savefig(filename)
    plt.clf()


def evaluate(thresholds_file, cpu_testset, iperf_testset, trainset,
             time_window_threshold):
    """
    Plot the per-window RMSE of the trained autoencoder on three datasets.

    The model is loaded from 'model/5g_autoencoder.h5', every dataset is
    cleaned and normalized with the saved normalization statistics, and the
    prediction errors of each sliding window are written as PNG plots below
    'plots/'.

    param thresholds_file: Not used by this function; kept for interface compatibility.
    param cpu_testset: File containing dataset with CPU stress test.
    param iperf_testset: File containing dataset with iperf stress test.
    param trainset: File containing the dataset used for training.
    param time_window_threshold: Number of rows per sliding window. The value passed
        by the caller is honoured (it used to be overwritten with 30).

    return: None.
    """
    plt.clf()

    stats_json = data_prefix + 'normalization_stats.json'

    model = load_model('model/5g_autoencoder.h5')
    normalization_stats = loadDictJson(stats_json)

    cols_to_normalize = getFeatures()
    cols = [c + '_normalized' for c in cols_to_normalize]

    n_steps = 4

    logging.info('Loading evaluation datasets')
    val_df = loadDataset(trainset)
    cpu_df = loadDataset(cpu_testset)
    iperf_df = loadDataset(iperf_testset)

    frames = (cpu_df, iperf_df, val_df)
    for frame in frames:
        _clean_eval_frame(frame)

    logging.info('Normalizing evaluation data')
    for col in cols_to_normalize:
        col_min = normalization_stats[col + '_min']
        col_max = normalization_stats[col + '_max']
        for frame in frames:
            frame[col + '_normalized'] = normalizeFeature(
                frame, col, col_min, col_max)

    logging.info('Evaluating for CPU and memory metrics')
    cpu_errors = _windowed_rmse(model, cpu_df, cols, time_window_threshold,
                                n_steps)
    _save_rmse_plot(
        [([e['cpu_rmse'] for e in cpu_errors], 'blue',
          'CPU Percentage Rate (mode=user)')],
        'CPU Attack Dataset', 'plots/evaluate_cpu.png')

    logging.info('Evaluating for network and 5G metrics')
    iperf_errors = _windowed_rmse(model, iperf_df, cols,
                                  time_window_threshold, n_steps)
    _save_rmse_plot(
        [([e['net_up_rmse'] for e in iperf_errors], 'green',
          'Network Up Rate'),
         ([e['net_down_rmse'] for e in iperf_errors], 'purple',
          'Network Down Rate')],
        'iperf Attack Dataset', 'plots/evaluate_iperf_net.png')
    _save_rmse_plot(
        [([e['net_up_5g_rmse'] for e in iperf_errors], 'green',
          '5G Network Up Rate'),
         ([e['net_down_5g_rmse'] for e in iperf_errors], 'blue',
          '5G Network Down Rate')],
        'iperf Attack Dataset', 'plots/evaluate_iperf_5g.png')

    logging.info('Evaluating with training data')
    val_errors = _windowed_rmse(model, val_df, cols, time_window_threshold,
                                n_steps)
    _save_rmse_plot(
        [([e['cpu_rmse'] for e in val_errors], 'blue',
          'CPU Percentage Rate (mode=user)'),
         ([e['mem_rmse'] for e in val_errors], 'red',
          'Memory Percentage Rate'),
         ([e['net_up_rmse'] for e in val_errors], 'green',
          'Network Up Rate'),
         ([e['net_down_rmse'] for e in val_errors], 'purple',
          'Network Down Rate')],
        'Training Dataset (Edge Metrics)', 'plots/evaluate_val_1.png')
    _save_rmse_plot(
        [([e['net_up_5g_rmse'] for e in val_errors], 'orange',
          '5G Network Up Rate'),
         ([e['net_down_5g_rmse'] for e in val_errors], 'cyan',
          '5G Network Down Rate')],
        'Training Dataset (5G Metrics)', 'plots/evaluate_val_2.png')
from Net import Net, test_model
from utils import confusion, F1_score, loadDataset, saveNNParas
import time

# Rebuild the network with the layer sizes used below and restore its saved
# weights. (Alternative checkpoint: 'net_model_subjective_rating.pt'.)
feature_num, hidden_num, output_num = 11, 30, 3

load_net = Net(feature_num, hidden_num, output_num)
load_net.load_state_dict(torch.load('ann_net_model_genre.pt'))
load_net.eval()

# Test split used to evaluate the new network.
x_test, y_test = loadDataset('testing')

# Read the vector table (no header row); its first three columns are exposed
# as 'row', 'col' and 'vector'.
vectors = pd.read_excel('ann_vector_angle_sample.xls', header=None)
raw_df = pd.DataFrame({
    column_name: vectors.iloc[:, position]
    for position, column_name in enumerate(('row', 'col', 'vector'))
})

# Order the entries by the 'vector' column, smallest first, and collect the
# distinct row / col values in that order.
increase_res = raw_df.sort_values(by='vector')
unique_row = increase_res['row'].unique()
unique_col = increase_res['col'].unique()

# Initialize all the status parameters.
示例#32
0
def _rolling_rmse_dicts(df, window):
    """Return one RMSE dict per sliding window of ``window`` rows of ``df``.

    Uses the module-level ``model``, ``cols``, ``n_steps`` and ``n_features``.
    """
    rmse_dicts = []
    for sample_start in range(0, len(df) - window):
        sample = df.iloc[sample_start:sample_start + window]

        # Select required columns and turn the window into model sequences
        dataset = sample[cols].to_numpy()
        X_test, y_test = split_sequences(dataset, n_steps)
        X_test = X_test.reshape((len(X_test), n_steps, n_features))

        yhat = model.predict(X_test, verbose=0)
        rmse_dicts.append(printPredictionErrors(y_test, yhat))
    return rmse_dicts


def evaluate(thresholds_file, cpu_testset, iperf_testset, trainset,
             time_window_threshold):
    """
    Evaluate trained model. If user has not set all thresholds for anomalies, evaluation will also set
    the remaining thresholds. Evaluation uses a dataset that contains a CPU stress test, an iperf
    stress test and the data used for training. Thresholds are defined by calculating the
    RMSEs from actual values and taking the 99th percentile of these errors for each feature separately and overall.
    Thresholds already present in the thresholds file are kept unchanged.

    param thresholds_file: File where user-defined thresholds are saved. It is rewritten with the proposed thresholds added.
    param cpu_testset: File containing dataset with CPU stress test.
    param iperf_testset: File containing dataset with iperf stress test.
    param trainset: File containing the dataset used for training.
    param time_window_threshold: Time window for keeping the last-n records. In evaluation data are predicted in batches of n.

    return: None.
    """

    # Loading thresholds from file. Create an empty dict if no file exists
    thresholds_dict = {}
    if path.exists(thresholds_file):
        thresholds_dict = loadDictJson(thresholds_file)

    logging.info('Loading evaluation datasets')
    val_df = loadDataset(trainset)
    cpu_df = loadDataset(cpu_testset)
    iperf_df = loadDataset(iperf_testset)

    frames = (cpu_df, iperf_df, val_df)
    for frame in frames:
        frame.bfill(inplace=True)
        frame.replace([np.inf, -np.inf], 0.0, inplace=True)

    logging.info('Normalizing evaluation data')
    for col in cols_to_normalize:
        col_min = normalization_stats[col + '_min']
        col_max = normalization_stats[col + '_max']
        for frame in frames:
            frame[col + '_normalized'] = normalizeFeature(
                frame, col, col_min, col_max)

    # Which error series each dataset contributes to: the CPU stress test
    # feeds the CPU/memory series, the iperf stress test the network/memory
    # series, and the training data all of them.
    cpu_keys = ('rmse_total', 'cpu_rmse', 'cpu_rx_rmse', 'cpu_tx_rmse',
                'mem_rmse')
    iperf_keys = ('rmse_total', 'net_down_rmse', 'net_up_rmse',
                  'net_down_5g_rmse', 'net_up_5g_rmse', 'mem_rmse')
    all_keys = ('rmse_total', 'cpu_rmse', 'cpu_rx_rmse', 'cpu_tx_rmse',
                'mem_rmse', 'net_down_rmse', 'net_up_rmse',
                'net_down_5g_rmse', 'net_up_5g_rmse')
    errors = {key: [] for key in all_keys}

    logging.info('Evaluating for CPU and memory metrics')
    for rmse_dict in _rolling_rmse_dicts(cpu_df, time_window_threshold):
        for key in cpu_keys:
            errors[key].append(rmse_dict[key])

    logging.info('Evaluating for network and 5G metrics')
    # Each window is evaluated on its own slice (the whole iperf dataframe
    # used to be re-predicted on every iteration).
    for rmse_dict in _rolling_rmse_dicts(iperf_df, time_window_threshold):
        for key in iperf_keys:
            errors[key].append(rmse_dict[key])

    logging.info('Evaluating with training data')
    for rmse_dict in _rolling_rmse_dicts(val_df, time_window_threshold):
        for key in all_keys:
            errors[key].append(rmse_dict[key])

    # For thresholds that are not defined by user, use suggested values.
    # np.percentile expects q in [0, 100], so the 99th percentile is q=99.
    threshold_sources = (
        ('cpu_threshold', 'cpu_rmse'),
        ('mem_threshold', 'mem_rmse'),
        ('cpu_tx_threshold', 'cpu_tx_rmse'),
        ('cpu_rx_threshold', 'cpu_rx_rmse'),
        ('net_up_threshold', 'net_up_rmse'),
        ('net_down_threshold', 'net_down_rmse'),
        ('net_5g_up_threshold', 'net_up_5g_rmse'),
        ('net_5g_down_threshold', 'net_down_5g_rmse'),
        ('overall_threshold', 'rmse_total'),
    )
    for threshold_name, rmse_key in threshold_sources:
        if threshold_name not in thresholds_dict:
            thresholds_dict[threshold_name] = np.percentile(
                errors[rmse_key], 99)

    # Save new thresholds in same file
    saveDictJson(thresholds_dict, thresholds_file)
示例#33
0
from reducing_net import reduced_rnn_net
from utils import confusion, F1_score, loadDataset, saveNNParas
import time

# Script overview: restore a saved RNN, load the test set, build a per-sample
# sequence-length array, read a vector/angle spreadsheet, then fold three
# hidden units of `load_net` into three others, copy the remaining parameters
# into a smaller `Net`, and evaluate that smaller network.
# NOTE(review): `load_net`, `Net`, `test_model` and `RNN_model` are not defined
# or imported in the visible part of this script — presumably they come from
# code outside this excerpt; confirm.

# Hyper-parameters of the previously trained network being restored.
input_dim = 1
hidden_dim = 50
layer_dim = 1
# NOTE(review): the original comment said "Four kinds of genres within 12
# songs", but the value is 3 — confirm which is correct.
output_dim = 3

# Rebuild the model, restore its saved state and switch it to eval mode.
load_rnn = RNN_model(input_dim, hidden_dim, layer_dim, output_dim)
load_rnn.load_state_dict(torch.load('rnn_model.pt'))
load_rnn.eval()

# Load the testing dataset used to evaluate the new network.
x_test, y_test = loadDataset('testing_sequence')
# Append a trailing dimension of size 1 to the inputs.
flat_input_test = x_test.unsqueeze(-1)

# Sequence lengths used for sequence padding / packing in the RNN model.
# Each of the 12 lengths in `l` is repeated 4 times, filling all 48 entries.
l = [1104, 1028, 980, 964, 960, 956, 956, 932, 868, 840, 836, 808]
test_seq_lens = np.zeros((4 * 12))
for i in range(len(l)):
    test_seq_lens[i * 4:(i + 1) * 4] = l[i]

# Load the vector information from the spreadsheet (no header row) and label
# its first three columns as 'row', 'col' and 'angle'.
vectors = pd.read_excel('rnn_vector_angle_sample.xls', header=None)
raw_df = pd.DataFrame({
    'row': vectors.iloc[:, 0],
    'col': vectors.iloc[:, 1],
    'angle': vectors.iloc[:, 2]
})
# Merge step: add the hidden-layer weight rows 16, 23 and 22 onto rows 2, 5
# and 8 respectively. Rows 16, 22 and 23 are then left out of the copy below.
# NOTE(review): these are in-place writes to layer weights — presumably this
# runs with autograd disabled (e.g. under torch.no_grad()); confirm.
load_net.hidden.weight[2] += load_net.hidden.weight[16]
load_net.hidden.weight[5] += load_net.hidden.weight[23]
load_net.hidden.weight[8] += load_net.hidden.weight[22]

# Copy the remaining weights and biases into a new, smaller network.
# The slices [:16], [17:22] and [24:] skip source indices 16, 22 and 23.
# NOTE(review): a 27-unit target implies `load_net` has 30 hidden units — confirm.
new_net = Net(11, 27, 3)
new_net.hidden.weight[:16] = load_net.hidden.weight[:16]
new_net.hidden.weight[16:21] = load_net.hidden.weight[17:22]
new_net.hidden.weight[21:] = load_net.hidden.weight[24:]

# Hidden-layer biases follow the same index mapping as the weights above.
new_net.hidden.bias[:16] = load_net.hidden.bias[0:16]
new_net.hidden.bias[16:21] = load_net.hidden.bias[17:22]
new_net.hidden.bias[21:] = load_net.hidden.bias[24:]

# Output-layer weights: the same mapping applied along the column axis.
new_net.output.weight[:, :16] = load_net.output.weight[:, 0:16]
new_net.output.weight[:, 16:21] = load_net.output.weight[:, 17:22]
new_net.output.weight[:, 21:] = load_net.output.weight[:, 24:]

# The output bias is copied unchanged.
new_net.output.bias[:] = load_net.output.bias[:]
new_net.eval()

# Reload the test dataset and evaluate the shrunken network.
# NOTE(review): loadDataset() is called without an argument here but with
# 'testing_sequence' above — confirm which dataset the default refers to.
x_test, y_test = loadDataset()
acc, pred = test_model(new_net, x_test, y_test)

# Build and print the confusion matrix, then pass it to F1_score.
mat = confusion(x_test.size(0), 3, pred, y_test)
print("Confusion Matrix (after pruning):")
print(mat)
F1_score(mat)
示例#35
0
    label = pp.correctLabel(endIndxGroup = endIndxGroup, name = name, label = label)
    pp.writeFile(filename = smoothedFilename, name = name, label = label)

class Logger(object):
    """Tee-style stream that mirrors every write to stdout and to a log file.

    An instance is meant to be assigned to ``sys.stdout`` so that everything
    printed is both shown on the original stream and appended to
    ``logFilename``.

    Besides ``write`` it provides ``flush`` (code that calls
    ``sys.stdout.flush()`` would otherwise fail with ``AttributeError``) and
    ``close`` so the log file handle can be released deterministically.
    It can also be used as a context manager.
    """

    def __init__(self, logFilename):
        """Capture the current ``sys.stdout`` and open the log in append mode.

        :param logFilename: path of the log file; created if missing,
            appended to otherwise.
        """
        self.terminal = sys.stdout
        self.log = open(logFilename, "a")

    def write(self, message):
        """Write ``message`` to the original stream and to the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams."""
        self.terminal.flush()
        self.log.flush()

    def close(self):
        """Close the log file; the wrapped terminal stream is left open.

        Safe to call more than once.
        """
        if not self.log.closed:
            self.log.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Never suppress exceptions; only make sure the file is released.
        self.close()
        return False

if __name__ == '__main__':
    # Build the run parameters and load the dataset split into three sets.
    P = dnnUtils.Parameters(setting)
    print(P.outputFilename)
    datasets = utils.loadDataset(filename=P.datasetFilename, totalSetNum=3)

    if USE_EXIST_MODEL:
        # Reuse a previously pickled model whose path is the 2nd CLI argument.
        # TODO use filename to build P
        bestModelFilename = sys.argv[2]
        bestModel = utils.loadPkl(bestModelFilename)
    else:
        # Train from scratch, mirroring everything printed into the log file.
        sys.stdout = Logger(P.logFilename)
        bestModel = dnn.trainDNN(datasets, P)
        # NOTE(review): this local path is built but the pickle below is
        # written to P.bestModelFilename instead — confirm which is intended.
        bestModelFilename = '../model/' + P.outputFilename + '.model'
        utils.makePkl(bestModel, P.bestModelFilename)

    # Results for datasets[1] ('valid') and datasets[2] ('test').
    dnn.getResult(bestModel, datasets[1], P, 'valid', P.validResultFilename)
    dnn.getResult(bestModel, datasets[2], P, 'test', P.testResultFilename)
    # Probability outputs for datasets[0] and datasets[1].
    dnn.getProb(bestModel, datasets[0], P.trainProbFilename, P)
    dnn.getProb(bestModel, datasets[1], P.validProbFilename, P)
    dnn.getProb(bestModel, datasets[1], P.validProbFilename, P)
示例#36
0
def build_new_image(path,
                    k_means_instance,
                    input_dim,
                    encoder,
                    new_name='_new.png'):

    import sys

    print '\nSelect the distance you want for the choice of the best centroid'
    print '\n', ' ' * 3, '0) Mean Absolute Error compute on the encoded representation'
    print ' ' * 3, '1) Mean Squared Error compute on the encoded representation'
    print ' ' * 3, '2) Hausdorff distance on the binarized original images'

    distances = ['mae', 'mse', 'hausdorff']
    distance_type = int(raw_input("\nDEC> "))
    image = io.imread(path, as_grey=True)

    processed = image
    normalized_height = int(np.sqrt(input_dim))
    normalized_width = normalized_height

    processed = imageSegmentation.image_preproc(
        processed, binary_threshold=threshold_otsu(processed))
    labeled_image, num_features, max_width, max_height, max_label = imageSegmentation.find_connected_components(
        processed)

    centroidSet, centroidNames = loadDataset('Centroids')

    centroidImages = centroidSet

    # Get the encoded representation of the centroids
    centroidSet = encoder.predict(centroidSet)

    # Here I create a new image all white that will host each replaced char
    base_image = 255 * np.ones(image.shape)

    predictes = list()
    for i in range(1, max_label):

        r_s, c_s = np.where(labeled_image == i)
        if len(r_s) > 1 and len(c_s) > 1:

            # get the char to be replaced
            to_replace = image[min(r_s) - 1:max(r_s) + 2,
                               min(c_s) - 1:max(c_s) + 2]

            # resize before passing through the net
            to_replace = imresize(to_replace,
                                  (normalized_height, normalized_width))
            to_predict = to_replace

            if distance_type != 2:
                # get the encoded representation of the char
                to_predict = np.array(
                    to_replace.reshape(
                        (1, normalized_height * normalized_width)))
                to_predict = to_predict.astype('float32') / 255.
                to_predict = encoder.predict(to_predict)

            predictes.append([to_predict, r_s, c_s])

    pool = ThreadPoolExecutor(8)
    list_futures = list()
    for i in range(1, len(predictes)):
        list_futures.append(
            pool.submit(find_and_subst,
                        i,
                        base_image,
                        centroidSet,
                        predictes,
                        centroidImages,
                        metric=distances[distance_type]))

    for i in range(len(list_futures)):
        r = list_futures[i].result()
        sys.stdout.write(u'\u001b[1000D' + bcolors.RED + 'Creating: ' +
                         str(ceil(i * 100 / len(list_futures))) + '%')
        sys.stdout.flush()

    sys.stdout.write(bcolors.RESET)
    plt.imsave('temp.png', base_image, cmap=plt.cm.gray)