Example #1
def main():
    """
    Fit models and make predictions.
    We'll use one-hot encoding to transform our categorical features
    into binary features.
    y and X will be numpy array objects.
    """
    model = linear_model.LogisticRegression(C=3)  # the classifier we'll use

    # === load data in memory === #
    print "loading data"

    cwd = os.getcwd()
    trainDataLoc = cwd + '/../data/train.csv'
    testDataLoc = cwd + '/../data/test.csv'

    y, X = load_data(trainDataLoc)
    y_test, X_test = load_data(testDataLoc, use_labels=False)

    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # Returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)

    # if you want to create new features, you'll need to compute them
    # before the encoding, and append them to your dataset after

    # === training & metrics === #
    mean_auc = 0.0
    n = 10  # repeat the CV procedure 10 times to get more precise results
    for i in range(n):
        # for each iteration, randomly hold out 20% of the data as CV set
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i * SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc

    print "Mean AUC: %f" % (mean_auc / n)

    # === Predictions === #
    # When making predictions, retrain the model on the whole training set
    model.fit(X, y)
    preds = model.predict_proba(X_test)[:, 1]
    #filename = raw_input("Enter name for submission file: ")
    filename = 'LogisticRegressionResults'
    save_results(preds, filename + ".csv")
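# Examples #1 and #2 call load_data() and save_results() without showing them.
# A minimal sketch of what such helpers might look like, assuming a Kaggle-style
# CSV whose first column holds the 0/1 label and whose remaining columns hold
# categorical feature IDs (the exact column range and the SEED value below are
# assumptions, not the original code):
import numpy as np

SEED = 42  # the snippets reference a module-level SEED


def load_data(filename, use_labels=True):
    data = np.loadtxt(open(filename), delimiter=',',
                      usecols=range(1, 10), skiprows=1)
    if use_labels:
        labels = np.loadtxt(open(filename), delimiter=',',
                            usecols=[0], skiprows=1)
    else:
        labels = np.zeros(data.shape[0])
    return labels, data


def save_results(predictions, filename):
    # write an 'id,ACTION'-style submission file, one probability per row
    with open(filename, 'w') as f:
        f.write("id,ACTION\n")
        for i, pred in enumerate(predictions):
            f.write("%d,%f\n" % (i + 1, pred))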
Example #2
def main():

    cwd = os.getcwd()
    trainDataLoc = cwd + '/../data/train.csv'
    testDataLoc = cwd + '/../data/test.csv'

    y, X = load_data(trainDataLoc)
    y_test, X_test = load_data(testDataLoc, use_labels=False)

    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # Returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)

    #model = findBestModel(X, y) Best model is rbf, gamma = 1, c = 1

    X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
        X, y, test_size=.20, random_state=SEED)

    model = svm.SVC(C=1, probability=True, kernel='rbf', gamma=1)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_cv)[:, 1]

    # compute AUC metric for this CV fold
    fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
    roc_auc = metrics.auc(fpr, tpr)
    print "AUC : %f" % (roc_auc)

    preds = model.predict_proba(X_test)[:, 1]

    save_results(preds, "SVM_classifier.csv")
Example #3
def solve_challenge(packed_challenge, key, mac_key):
    """ Solve a challenge that was produced by generate_challenge with the
        given key and mac_key. 
        
        Raises InvalidSignature in the event of a message authentication 
        code mismatch. """
    mac, hash_function, package = load_data(packed_challenge)
    if verify_mac(mac_key, package, mac, hash_function):
        challenge, bytes_per_hash, unencrypted_data = load_data(package)
        
        return (decrypt(challenge, key, hmac_factory(hash_function),
                        getattr(hashlib, hash_function)().digest_size,
                        output_block_size=bytes_per_hash), 
                unencrypted_data)
    else:
        raise InvalidSignature("Message authentication code mismatch")
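# solve_challenge() relies on a verify_mac() helper that is not shown here.
# A hedged sketch of one possible implementation, assuming mac_key, data and
# mac are bytes and hash_function is a hashlib algorithm name such as "sha256":
import hashlib
import hmac


def verify_mac(mac_key, data, mac, hash_function="sha256"):
    expected = hmac.new(mac_key, data, getattr(hashlib, hash_function)).digest()
    # compare_digest avoids leaking information through timing differences
    return hmac.compare_digest(expected, mac)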
Example #4
def run_main():
    """
        主函数
    """

    # number of clusters
    n_cluster = 8
    # convergence threshold
    cutoff = 0.002

    samples = load_data()

    clusters = kmeans(samples, n_cluster, cutoff)

    # print the results
    # for i, c in enumerate(clusters):
    #     for sample in c.samples:
    #         print('cluster {} -- sample {}'.format(i, sample))

    # visualize the results
    plt.subplot()
    color_names = list(mcolors.cnames)
    for i, c in enumerate(clusters):
        x = []
        y = []
        # random.choice
        color = [color_names[i % 100 + 10]] * len(c.samples)
        for sample in c.samples:

            x.append(sample.coords[2])
            y.append(sample.coords[1])
        plt.scatter(x, y, c=color)
    plt.show()
Example #5
def compile_jets(data_path, n_events, p_granularity, q_granularity,
                 batch_size):
    # Load in jets from file
    [
        daughters, endings, mothers,
        (discrete_p_splittings, discrete_q_splittings), mother_momenta
    ] = load_data(data_path,
                  n_events=n_events,
                  batch_size=batch_size,
                  split_p_q=True,
                  p_granularity=p_granularity,
                  q_granularity=q_granularity)

    # this unpacking is necessary to remove it from the tuple and put it into a
    # list
    x = [[*a] for a in zip(daughters, mother_momenta, [m[1] for m in mothers],
                           [q[0] for q in discrete_q_splittings])]

    # temporary hack having to do with mask values; this will change later.
    for i in range(len(mothers)):
        mothers[i][0][mothers[i][0] == -1] = 0
        discrete_p_splittings[i][0][discrete_p_splittings[i][0] ==
                                    p_granularity**4] = 0
        discrete_q_splittings[i][0][discrete_q_splittings[i][0] ==
                                    q_granularity] = 0

    y = [[*a] for a in zip([e[0] for e in endings], [m[0] for m in mothers],
                           [d[0] for d in discrete_p_splittings],
                           [q[0] for q in discrete_q_splittings])]
    return x, y
Example #6
    def __init__(self, batch_size, seed=1234):
        """
        Inputs:
          hdf5_path: str
          batch_size: int
          dev_train_csv: str | None, if None then use all data for training
          dev_validate_csv: str | None, if None then use all data for training
          seed: int, random seed
        """

        self.batch_size = batch_size

        self.random_state = np.random.RandomState(seed)
        self.validate_random_state = np.random.RandomState(0)

        # Load data
        
        (self.train_x, self.train_y, self.validate_x, self.validate_y, _, _) = load_data()
        print(self.train_x.shape, self.validate_x.shape)
        
        self.train_audio_names = np.arange(len(self.train_x))
        self.validate_audio_names = np.arange(len(self.validate_x))
        
        # Calculate scalar
        (self.mean, self.std) = calculate_scalar(self.train_x)
Example #7
    def test_project_extent(self):
        """Test project_extent"""
        layer = load_data('point-nyc.shp')

        QgsMapLayerRegistry.instance().addMapLayer(layer)
        geojson = projestions_geoms.project_extent()
        self.assert_valid_extent(geojson)
Example #8
def get_key_ideas(pos, lang, patterns_path):
    # load the patterns
    patterns = load_data(patterns_path)
    patterns = patterns["default_patterns_" + lang]
    # exceptions = patterns["default_exceptions_"+lang]

    # get the key ideas
    key_ideas = []

    tokens = [w[0] for w in pos]
    words = []
    for w in pos:
        if w[1][0] in ['J', 'V', 'N']:
            words.append(w[1][0])
        else:
            words.append(w[0])
    # patterns = [x[0] for x in self.patterns]

    for p in patterns:
        start = search(words, p)

        if start:
            key_idea = tokens[start:start + len(p)]
            key_ideas.append(key_idea)

    return key_ideas
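# get_key_ideas() depends on a search() helper that is not shown. A simple
# sketch with the behaviour the loop above assumes (return the start index of
# pattern p inside words, or None when it does not occur):
def search(words, p):
    for start in range(len(words) - len(p) + 1):
        if list(words[start:start + len(p)]) == list(p):
            return start
    return None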
def process_documents(source_path,
                      target_path,
                      processes=None,
                      content_index=0):
    """
    Pre-processes all documents within a CSV file.

    :param Path source_path: Filename for the source CSV file
    :param Path target_path: Filename for destination file
    :param list processes: List of pre-processing functions, (document_content) -> (value, modified_content)
    :param int content_index: Index of the document content
    """
    data = load_data(source_path, index_col=None).values[:, content_index]

    if processes is None:
        processes = standard_processes
    processor = partial(apply_process, processes=processes)

    workers = Pool(n_threads)  # Define workers
    documents = workers.map(processor, data)  # Apply processing
    workers.close()  # Close document queue
    workers.join()  # Wait for processes to finish

    contexts, indexes = split_into_contexts(documents)
    save_contexts(contexts, indexes, target_path)
Example #10
    def load(self):

        # Load labels
        da_labels = utils.load_labels(self.data_path, self.da_labels_file)
        ap_labels = utils.load_labels(self.data_path, self.ap_labels_file)

        # If unable to load labels inform user and exit
        if not da_labels and not ap_labels:
            print("unable to load label lists...Exiting program.")

        # Load default data
        default_data = utils.load_data(self.data_path, 'default')

        # Load JSON file
        data = utils.load_data(self.data_path, self.dialogue_file)

        # If file is not valid or invalid JSON
        if not data:

            # Try default data
            if default_data:
                data = default_data
            # Else exit
            else:
                print("Unable to load default JSON data...Exiting program.")
                exit()

        # Create dialogue object
        dialogues = utils.load_dialogues(data)

        # If JSON is not valid or keys missing
        if not dialogues:

            # Try default dialogues
            default_dialogues = utils.load_dialogues(default_data)
            if default_dialogues:
                # TODO popup to tell user loading default data.
                dialogues = default_dialogues
            # Else exit
            else:
                print("Unable to load default JSON data...Exiting program.")
                exit()

        # Create the dialogue model
        model = DialogueModel(data['dataset'], ap_labels, da_labels, dialogues)

        return model
Example #11
    def test_get_layer_geom(self):
        """Test layer_geom"""
        layer = load_data('point-nyc.shp')
        geojson = projestions_geoms.layer_geom(layer)
        self.assertNotEqual(geojson, '')

        geojson = json.loads(geojson)
        self.assertEqual(geojson['type'], 'FeatureCollection')
Example #12
    def test_get_layer_geom_large(self):
        """Test layer_geom with many features"""
        layer = load_data('many-points-nyc.shp')
        geojson = projestions_geoms.layer_geom(layer)
        self.assertNotEqual(geojson, '')
        geojson = json.loads(geojson)
        self.assertEqual(geojson['type'], 'FeatureCollection')
        self.assertLessEqual(len(geojson['features']),
                             settings.PROJESTIONS_MAX_FEATURES)
Example #13
    def __init__(self, batch_size):
        """Data generator for test data.
        """

        super(TestDataGenerator, self).__init__(batch_size=batch_size)

        # Load test data
        (_, _, _, _, self.test_x, self.test_y) = load_data()

        self.test_audio_names = np.arange(len(self.test_x))
Example #14
    def test_map_canvas_extent(self):
        """Test map_canvas_extent"""
        layer = load_data('point-nyc.shp')
        QgsMapLayerRegistry.instance().addMapLayer(layer)

        iface = QGIS_APP[2]
        mapCanvas = iface.mapCanvas()
        mapCanvas.zoomToFullExtent()
        geojson = projestions_geoms.map_canvas_extent(mapCanvas)
        self.assert_valid_extent(geojson)
def main():

	cwd = os.getcwd()
	trainDataLoc = cwd + '/../data/train.csv'
	testDataLoc = cwd + '/../data/test.csv'

	y, X = load_data(trainDataLoc)
	y_test, X_test = load_data(testDataLoc, use_labels=False)

	clf = xgb.XGBClassifier(max_depth=15, 
	                        n_estimators=200, learning_rate=.4, colsample_bytree=.8, seed=SEED)

	# fitting
	clf.fit(X, y, early_stopping_rounds=100, eval_metric="logloss", eval_set=[(X_test, y_test)])

	#print y_pred
	preds = clf.predict_proba(X_test)[:,1]

	save_results(preds, "XGBoost_classifier.csv")
Example #16
def run_analysis(dset):
    ids,X,y = load_data(dset)
    X = DataClean([["[^a-z]"," "],
                   [" [ ]+", " "],],html_clean=True).fit(X).transform(X)
    labels = list(set(y))
    for label in labels:
        Xlabel = X[y==label]
        Xlabel_str = ' '.join(Xlabel.tolist())
        generate_wordcloud(Xlabel_str,label,dset,"white")
        generate_wordcloud(Xlabel_str,label,dset,"black")
        print "Label %d : %s" % (label,Xlabel[0])
Example #17
def Main():
    train_loader, validation_loader, test_loader = utilities.load_data(
        directory)
    model, optimizer, criterion = utilities.net_setup(structure, dropout,
                                                      hidden_layer1, lr,
                                                      device)
    utilities.train_network(model, optimizer, criterion, epochs, 20,
                            train_loader, device)
    utilities.save_checkpoint(path, structure, hidden_layer1, dropout, lr)
    print(
        "**************Training Complete !! Thanks for the patience******************"
    )
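# Examples #17 and #19 call utilities.load_data(directory) to build PyTorch
# dataloaders. A hypothetical sketch of such a helper, assuming an ImageFolder
# layout with train/valid/test subdirectories (directory names, image size and
# batch size are assumptions):
import torch
from torchvision import datasets, transforms


def load_data(root, batch_size=64):
    norm = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    train_tf = transforms.Compose([transforms.RandomResizedCrop(224),
                                   transforms.RandomHorizontalFlip(),
                                   transforms.ToTensor(), norm])
    eval_tf = transforms.Compose([transforms.Resize(255),
                                  transforms.CenterCrop(224),
                                  transforms.ToTensor(), norm])
    train_set = datasets.ImageFolder(root + '/train', transform=train_tf)
    valid_set = datasets.ImageFolder(root + '/valid', transform=eval_tf)
    test_set = datasets.ImageFolder(root + '/test', transform=eval_tf)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                               shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)
    return train_loader, valid_loader, test_loader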
Example #18
def main(diff_path_neg, diff_path_pos, adv_path_neg, adv_path_pos,
         ind_neg_path, ind_pos_path):
    diff_neg = np.load(diff_path_neg)
    neg_ind = np.load(ind_neg_path)
    l1_norm_neg = cal_l1(diff_neg[neg_ind])
    l2_norm_neg = cal_l2(diff_neg[neg_ind])
    l_inf_neg = cal_l_inf(diff_neg[neg_ind])
    diff_pos = np.load(diff_path_pos)
    pos_ind = np.load(ind_pos_path)
    l1_norm_pos = cal_l1(diff_pos[pos_ind])
    l2_norm_pos = cal_l2(diff_pos[pos_ind])
    l_inf_pos = cal_l_inf(diff_pos[pos_ind])
    print neg_ind.shape[0], "negative adversarial samples have been made"
    print pos_ind.shape[0], "positive adversarial samples have been made"
    print "l1 norm of negative sample is:", l1_norm_neg
    print "l_inf norm of negative sample is:", l_inf_neg
    print "l_2 norm of negative sample is:", l2_norm_neg

    print "l1 norm of positive sample is:", l1_norm_pos
    print "l_inf norm of positive sample is:", l_inf_pos
    print "l_2 norm of positive sample is:", l2_norm_pos

    X_pos, Y_pos, X_neg, Y_neg = load_data()
    neg_cor_index = np.load('./neg_cor_index.npy')
    pos_cor_index = np.load('./pos_cor_index.npy')
    X_pos = X_pos[pos_cor_index]
    X_neg = X_neg[neg_cor_index]

    xadv_pos = np.load(adv_path_pos)[pos_ind]
    pdf_pos = cal_normal_pdf(X_pos, xadv_pos)

    stand_pdf = cal_normal_pdf(X_pos, X_pos)
    print "Gaussian Observation: pdf mean of positive sample is ", pdf_pos.mean(
    )
    a = pdf_pos - stand_pdf.mean() > 0
    print "Gaussian Observation: prob that pdf of positive sample is higher than standard pdf mean is ", a.mean(
    )

    xadv_neg = np.load(adv_path_neg)[neg_ind]
    pdf_neg = cal_normal_pdf(X_neg, xadv_neg)

    stand_pdf = cal_normal_pdf(X_neg, X_neg)
    print "Gaussian Observation: pdf mean of negative sample is ", pdf_neg.mean(
    )
    a = pdf_neg - stand_pdf.mean() > 0
    print "Gaussian Observation: prob that pdf of negative sample is higher than standard pdf mean is ", a.mean(
    )
    sess = tf.Session()
    print "KL Divergense of positive sample is :", sess.run(KL(
        X_pos, xadv_pos))
    print "KL Divergense of negative sample is :", sess.run(KL(
        X_neg, xadv_neg))
Example #19
def main():

    #Run load_data function with command line file path
    trainloader, testloader, validloader = utilities.load_data(root)
    #Run network_setup with command line structure, dropout, hiddenlayer number, and learnrate
    model, criterion, optimizer = utilities.network_setup(
        structure, dropout, hiddenlayer1, learnrate)
    #run deep_learning training function with model, criterion, and optimizer from network_setup and command line arguments
    utilities.deep_learning(model, criterion, optimizer, trainloader, epochs,
                            40)
    #save the checkpoint of the trained model for later use.
    utilities.save_checkpoint(model, path, structure, hiddenlayer1, dropout,
                              learnrate)
    print("Training complete. Model saved at {}".format(path))
Example #20
def make_analyze():

    try:
        #Load the data
        data = request.get_json()

    except Exception as e:
        raise e

    if data == {}:
        return (bad_request())
    else:

        #Get the text and the language

        try:
            lang = data['lang']
        except:
            try:
                lang = detect_language(data['text'])
                print(lang)
            except:
                responses = jsonify(
                    "Error in vectorize: language field is missing")
                return responses
        try:
            text = data['text']  # we assume text is tokenized
        except:
            responses = jsonify("Error in analyze: text is missing")
            return responses

        if lang not in ['en', 'es', 'ar', 'ro', 'fr']:
            responses = jsonify(
                message=
                "Language not available. Language must be in ['en','es','ar','ro','fr']"
            )
            return responses

        filename = os.path.join(os.path.dirname(__file__),
                                'models-registry.json')
        registry = load_data(filename)

        analysis = analyze(text, lang, registry)
        #print(analysis[0])
        #Send the response codes
        responses = jsonify(concepts=analysis[0],
                            key_ideas=analysis[1],
                            topics=analysis[2])
        responses.status_code = 200
        return responses
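# make_analyze() above is written as a Flask view (request.get_json / jsonify)
# but its route registration is not shown. A minimal sketch, assuming Flask and
# a hypothetical '/analyze' endpoint:
from flask import Flask

app = Flask(__name__)
# register the view defined above
app.add_url_rule('/analyze', view_func=make_analyze, methods=['POST'])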
Example #21
def get_topics(text, lang, topics_path):
    #initialization
    embeddings = Embeddings(emb_dict[lang])

    # get the topics dictionary from the path
    topics_dicts = load_data(topics_path)
    topics_dict = topics_dicts[lang]

    topics = list(topics_dict.keys())

    if lang == 'en':
        #cl = 0.7 # when a topic is "close"
        cl = 0.5
    else:
        cl = 0.5
    # now vectorize the topics
    vect_dict_topics = [
        (w,
         np.mean(to_vector_single_nonzeros(topics_dict[w], embeddings,
                                           len(topics_dict[w])),
                 axis=0)) for w in topics
    ]
    #print(vect_dict_topics)

    # get topics
    assigned_topics = []
    dists = []

    if len(to_vector_single_nonzeros(text, embeddings, len(text))) > 0:
        vectorized_text = np.mean(to_vector_single_nonzeros(
            text, embeddings, len(text)),
                                  axis=0)
    else:
        vectorized_text = np.zeros((300, ) * 1)

    for v in vect_dict_topics:
        dists.append(spatial.distance.cosine(
            vectorized_text, v[1]))  # measure distance to all topics

    good_topics = [
        topics[i].upper() for i in range(len(topics)) if dists[i] < cl
    ]  # choose close topics
    if not good_topics:
        good_topics.append('OTHER')

        # assigned_topics.append(topic)
    assigned_topics.append(good_topics)

    return assigned_topics
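# The filter above keeps a topic when the cosine distance between the text
# vector and the topic vector is below cl (0.5). A tiny self-contained
# illustration of that distance:
import numpy as np
from scipy import spatial

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 0.5, 1.0])
print(spatial.distance.cosine(a, b))  # ~0.057, well under cl = 0.5 -> "close"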
Example #22
def main():

    # Get CLI arguments
    args = get_input_args()

    # Prep data
    train_transform = utilities.transform_data('train')
    test_transform = utilities.transform_data('test')

    # Dataloaders
    trainloader = utilities.load_data(args.data_directory + '/' + 'train',
                                      train_transform)
    validationloader = utilities.load_data(args.data_directory + '/' + 'valid',
                                           test_transform)

    # Setup and train model
    model, optimizer, criterion = functions.model_setup(
        args.arch, args.hidden_units, args.learning_rate)
    trained_model = functions.train_model(optimizer, criterion, model,
                                          trainloader, validationloader,
                                          args.gpu, args.epochs)

    # Save the model
    functions.save_checkpoint(trained_model, args.save_dir)
Example #23
def main():
    model_file_path = "output" + os.sep + "linear_regression_model_mv.sav"

    ignored_columns = ['ZN', 'CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'TAX', 'PIRATIO', 'B', 'LSTAT']
    X, Y = load_data('input' + os.sep + 'housing.csv', False, ignored_columns)

    X = preprocess(X, "normalize")

    X_train, y_train, X_test, y_test = split_dataset(X, Y)

    train(X_train, y_train, model_file_path)
    y_predicted = predict(X_test, model_file_path)

    rmse_ration = calculate_rmse_ration(y_test, y_predicted)

    print("rmse ratio:", rmse_ration)
def solve_with_options(algorithm_to_run, seed, run_time, inst):
    print(
        f'''Running algorithm {algorithm_to_run} on file {inst} with a time limit of {run_time} seconds and a random seed of {seed}'''
    )
    np.random.seed(seed)
    random.seed(np.random.randint(999999))
    instance_name, city_data = load_data(inst)
    tracer = Tracer(method=algorithm_to_run,
                    instance=instance_name,
                    seed=seed,
                    cutoff=run_time)
    score, solution = None, None
    if algorithm_to_run == 'LS1':
        score, solution = genetic_algorithm.solve(
            data=city_data,
            timer=early_stop_checker(seconds=run_time),
            tracer=tracer)
    elif algorithm_to_run == 'BnB':
        score, solution = BnB.solve(data=city_data,
                                    timer=early_stop_checker(seconds=run_time),
                                    tracer=tracer)
    elif algorithm_to_run == 'LS2':
        score, solution = two_opt.solve(
            data=city_data,
            timer=early_stop_checker(seconds=run_time),
            tracer=tracer)
    elif algorithm_to_run == 'LS3':
        score, solution = genetic_algorithm_opt_2_hybrid.solve(
            data=city_data,
            timer=early_stop_checker(seconds=run_time),
            tracer=tracer)
    elif algorithm_to_run == 'Approx':
        score, solution = nearest_neighbor.solve(
            data=city_data,
            timer=early_stop_checker(seconds=run_time),
            tracer=tracer)

    if not os.path.exists('output'):
        os.makedirs('output')

    save_solution_file(score,
                       solution,
                       method=algorithm_to_run,
                       instance=instance_name,
                       seed=seed,
                       cutoff=run_time)
    tracer.write_to('output/')
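# early_stop_checker(seconds=...) is passed to the solvers as a timer but is
# not shown. One plausible sketch (its exact semantics are an assumption):
# a closure that reports whether the time budget has been spent.
import time


def early_stop_checker(seconds):
    deadline = time.time() + seconds

    def out_of_time():
        return time.time() >= deadline

    return out_of_time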
def main(input_path, output_path, ignored_columns, preprocess_type,
         training_data_rate, step_length, threshold_rate, max_loop_num,
         dynamic_step):
    print("input:", input_path)
    print("output:", output_path)
    print("\n")
    if ignored_columns is not None:
        print("ignored_columns:", ignored_columns)
    print("\n")
    print("preprocess_type:", preprocess_type)
    print("training_data_rate:", training_data_rate)
    print("\n")
    print("threshold_rate:", threshold_rate)
    print("max_loop_num:", max_loop_num)
    print("step_length:", step_length)
    if dynamic_step:
        print("dynamic stepping ...")
    else:
        print("static stepping ...")
    print("\n")
    start_time = datetime.now()

    X, Y = load_data(input_path, True, ignored_columns)

    X = preprocess(X, preprocess_type)

    X_train, y_train, X_test, y_test = split_dataset(X, Y, training_data_rate)

    threshold = gen_threshold(Y, threshold_rate)

    train(X_train, y_train, output_path, step_length, threshold, max_loop_num,
          dynamic_step)

    Y_pred = predict(output_path, X_test)

    rmse_ration = calculate_rmse_ration(y_test, Y_pred)
    print("rmse ratio (rmse / y_mean) is:", rmse_ration, "\n")

    end_time = datetime.now()

    execution_duration = end_time - start_time

    print("execution duration:", execution_duration, "\n")

    return
def test(model, directory):
    """
    This command loads the images from the given directory and evaluates a model
    'model1' corresponds to the linear sklearn model
    'model2' corresponds to the linear tensorflow model
    'model3' corresponds to the lenet tensorflow model
    :param model: the model to be used. Either 'model1', 'model2', or 'model3'
    :param directory: the directory where images are saved
    :return: 
    """
    data, labels, _, one_hot_labels = load_data(directory, IMAGE_EXTENSION)
    data_reshaped = data.reshape((data.shape[0], 3072))
    if model == MODEL1:
        sk_linear.predict(data_reshaped, MODEL1_PATH, labels)
    elif model == MODEL2:
        tf_linear.predict(data_reshaped, MODEL2_PATH, Y_test=one_hot_labels)
    elif model == MODEL3:
        tf_lenet.predict(data, MODEL3_PATH, Y_test=one_hot_labels)
Example #27
def predict():
    # load the saved model
    classifier = pickle.load(open('best_model.pkl', 'rb'))
    # compile a predictor function
    predict_model = theano.function(inputs=[classifier.input],
                                    outputs=classifier.predicted_label)

    # We can test it on some examples from the test set
    dataset = '/home/tao/Projects/machine-learning/data/mnist.pkl.gz'
    datasets = load_data(dataset)
    test_set_x, test_set_y = datasets[2]
    test_set_x = test_set_x.get_value()

    predicted_values = predict_model(test_set_x[:10])
    print("Predicted values for the first 10 examples in test set:")
    print(predicted_values)
    print("Ground truth label values for the first 10 examples in test set:")
    print(test_set_y.eval()[:10])
Example #28
def load_port(plot_comps=True, return_raw=False):

    stem = "Data Sets\\Daily_portfolio\\"

    # names = {"GBPEUR=X.csv": ['Adj Close'],
    #          "GBPJPY=X.csv": ['Adj Close'],
    #          "GBPNZD=X.csv": ['Adj Close'],
    #          "GBPUSD=X.csv": ['Adj Close'],
    #          "AAPL.csv": ['Adj Close'],
    #          }

    names = {
        "Crude.csv": ['Adj Close'],
        "TOT.csv": ['Adj Close'],
        "CVX.csv": ['Adj Close'],
        # "Gold.csv": ['Adj Close'],
        "AAPL.csv": ['Adj Close'],
        "INTC.csv": ['Adj Close'],
        "AMD.csv": ['Adj Close'],
        #"W=F.csv": ['Adj Close'],
    }

    data_frame = load_data(stem, names)

    # Take only series values from the data frame
    data = data_frame.values[1:, :].astype('float')

    # data_pos = np.where(data <= 0, 0.05, data)
    data_pos = np.abs(data)

    # Take difference
    data_returns = np.log(data_pos[:, 1:]) - np.log(data_pos[:, :-1])

    # Take the dates from the data frame for plotting
    dates = data_frame.values[0, 1:]

    if plot_comps:
        plot_components(data_returns, dates=dates, global_lims=[-0.2, 0.2])

    if return_raw:
        return data_returns, dates, data_pos
    else:
        return data_returns, dates
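# The return computation above differences log prices along each series. A
# small self-contained illustration of the same operation on one price series:
import numpy as np

prices = np.array([100.0, 102.0, 101.0, 105.0])
log_returns = np.log(prices[1:]) - np.log(prices[:-1])  # == np.diff(np.log(prices))
print(log_returns)  # one log return per consecutive pair of prices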
Example #29
def main():

    ignored_columns = [
        'ZN', 'CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'TAX', 'PIRATIO', 'B', 'LSTAT'
    ]
    X, Y = load_data('input' + os.sep + 'housing.csv', True, ignored_columns)

    X = preprocess(X, "normalize")

    X_train, y_train, X_test, y_test = split_dataset(X, Y)

    path = 'output' + os.sep + 'lsm_multivariant.csv'

    lsm(X_train, y_train, path)
    y_predicted = predict(path, X_test)

    rmse_ration = calculate_rmse_ration(y_test, y_predicted)
    print("rmse ratio:", rmse_ration)
    return
Example #30
    def test_RegressionOnSubset(self):
        Xtrain, ytrain, Xtest, ytest = utilities.load_data()
        columns = ['Longitude', 'Latitude']
        est = ensemble.RandomForestRegressor()
        est.fit(Xtrain, ytrain)
        predict_est = est.predict(Xtest)
        mad_est = np.mean(np.abs(predict_est - ytest))
        msd_est = np.mean(np.square(predict_est - ytest))

        meta_est = utilities.RegressionOnSubset(est, columns)
        pipe = pipeline.Pipeline([('RegressionOnSubest', meta_est),
                                  ('Regression',
                                   ensemble.RandomForestRegressor())])
        pipe.fit(Xtrain, ytrain)
        predict_pipe = pipe.predict(Xtest)
        mad_pipe = np.mean(np.abs(predict_pipe - ytest))
        msd_pipe = np.mean(np.square(predict_pipe - ytest))
        self.assertTrue(mad_pipe < mad_est)
        self.assertTrue(msd_pipe < msd_est)
Example #31
def load_oil(plot_comps=True, return_raw=False):

    stem = "Data Sets\\Oil\\"

    names = {
        "BP.L.csv": ['Adj Close'],
        "CVX.csv": ['Adj Close'],
        "OGZPY.csv": ['Adj Close'],
        "PBR.csv": ['Adj Close'],
        # "PSX.csv": ['Adj Close'],
        "RDSA.L.csv": ['Adj Close'],
        "SLB.csv": ['Adj Close'],
        "TOT.csv": ['Adj Close'],
        "XOM.csv": ['Adj Close'],
        "Crude.csv": ['Adj Close'],
    }

    data_frame = load_data(stem, names)

    # Take only series values from the data frame
    data = data_frame.values[1:, :].astype('float')

    # data_pos = np.where(data <= 0, 0.05, data)
    data_pos = np.abs(data)

    # Take difference
    data_returns = np.log(data_pos[:, 1:]) - np.log(data_pos[:, :-1])

    # Calculate the number of time series
    num_series = len(data[:, 0])

    # Take the dates from the data frame for plotting
    dates = data_frame.values[0, 1:]

    if plot_comps:
        plot_components(data_returns, dates=dates, global_lims=[-0.2, 0.2])

    if return_raw:
        return data_returns, dates, data_pos
    else:
        return data_returns, dates
def train(model, directory):
    """
    This command loads the images from the given directory and trains a chosen model
     'model1' corresponds to the linear sklearn model
     'model2' corresponds to the linear tensorflow model
     'model3' corresponds to the lenet tensorflow model
    :param model: the model to be used. Either 'model1', 'model2', or 'model3'
    :param directory: the directory where the training data is saved
    :return: 
    """
    data, labels, class_weights, one_hot_labels = load_data(
        directory, IMAGE_EXTENSION)
    data_reshaped = data.reshape((data.shape[0], 3072))
    if model == MODEL1:
        sk_linear.train(data_reshaped, labels, MODEL1_PATH)
    elif model == MODEL2:
        tf_linear.model(data_reshaped, one_hot_labels, MODEL2_PATH)
    elif model == MODEL3:
        tf_lenet.model(data,
                       one_hot_labels,
                       epochs=400,
                       class_weights=class_weights,
                       model_path=MODEL3_PATH)
Example #33
def deserialize(stream):
    sub_structs, packed_structure = utilities.load_data(stream)
    sub_structs = ast.literal_eval(sub_structs)
    struct = unpack_structure(packed_structure)
    _type, count = struct.__class__.__name__.split('_', 1)
    if _type == "dict":
        output = {}
        for attribute, __type in struct._fields_:
            output[attribute] = getattr(struct, attribute)
    elif _type == "tuple":
        output = tuple(getattr(struct, attribute) for attribute, __type in struct._fields_)
    elif _type == "list":
        output = list(getattr(struct, attribute) for attribute, __type in struct._fields_)    
    else:
        raise ValueError()    
        
    for key in sub_structs:
        print "Deserealizing: "
        print
        print output[key]
        output[key] = deserialize(output[key])
        
    return output
Example #34
    def refresh(self, instance):
        print('The button <refresh> is being pressed')

        # Load JSON file
        data = utils.load_data(self.data_path, self.dialogue_file)

        # Get the current dialogues id
        target_id = self.model.current_dialogue.dialogue_id

        # Loop over the dialogues and utterances in the data
        for dialogue in data['dialogues']:

            # If the id's match get the utterances
            if dialogue['dialogue_id'] == target_id:

                utterances = []
                for utterance in dialogue['utterances']:

                    # Create a new utterance
                    tmp_utterance = Utterance(utterance['text'],
                                              utterance['speaker'])

                    # Set utterance labels if not blank
                    if utterance['ap_label'] != "":
                        tmp_utterance.set_ap_label(utterance['ap_label'])
                    if utterance['da_label'] != "":
                        tmp_utterance.set_da_label(utterance['da_label'])

                    # Add to utterance list
                    utterances.append(tmp_utterance)

                # Update current dialogue with the utterances
                self.model.current_dialogue.set_utterances(utterances)
                break

        # Update dialogue_box
        self.update_dialogue()
Example #35
def unpack_structure(packed_data):
    name, fields, packed_bytes = utilities.load_data(packed_data)
    print "\nUnpacking structure", packed_data
    print
    print "Name: ", name
    print "Fields: ", fields
    print "Packed data: ", packed_bytes
    fields = ast.literal_eval(fields)
    format_characters = ''.join(_type for name, _type in fields)
    print "Extracting c types from format characters: ", format_characters
    c_types = get_ctypes_from_format(format_characters)
    print "Got c types: ", c_types
    fields = [(field_info[0], c_types[index]) for index, field_info in enumerate(fields)]
    print "Unpacking fields: ", format_characters, len(packed_bytes), packed_bytes
    #fields = [(name, format_to_type[character]) for name, character in fields]
    values = struct.unpack(format_characters, packed_bytes)
    _values = []
    for value, _type in zip(values, (field[1] for field in fields)):
        if _type == ctypes.c_void_p:
            _values.append(None)
        else:
            _values.append(value)
    struct_type = new_struct_type_from_ctypes(name, *fields)
    return struct_type(*values)   
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

import utilities

# Load data from input file
X = utilities.load_data('data_multivar.txt')

# Estimating the bandwidth 
bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=len(X))

# Compute clustering with MeanShift
meanshift_estimator = MeanShift(bandwidth=bandwidth, bin_seeding=True)
meanshift_estimator.fit(X)
labels = meanshift_estimator.labels_
centroids = meanshift_estimator.cluster_centers_
num_clusters = len(np.unique(labels))

print "Number of clusters in input data =", num_clusters

###########################################################
# Plot the points and centroids 

import matplotlib.pyplot as plt
from itertools import cycle

plt.figure()

# specify marker shapes for different clusters
markers = '.*xv'
Example #37
        scores_normalized = []
        num_labels = len(self.labels)
        for score in scores:
            norm_score = float((score - scores_min))/(scores_max - scores_min)
            if norm_score == 1.0:
                norm_score -= 0.001
            elif norm_score == 0.0:
                norm_score += 0.001
            scores_normalized.append(norm_score)
        ypred = [self.labels[int(floor(score*num_labels))] for score in scores_normalized]
        return ypred



if __name__ == '__main__':
    ids,X,y = load_data("cornell")
    pipeline = Pipeline([
        ('cleaner',DataClean(clean_list=[
                            ["[^a-z]"," "],  # only letters
                            [" [ ]+", " "],  # remove extra spaces
                            ],html_clean=True)),
        ('classifier',DictSimple()),
    ])
    cross_validate((X,y),pipeline,accuracy_score)

# Cornell
# accuracy_score : 0.357580308161 +/- 0.156942834821
# Confusion Matrix
# [[  1.74000000e+02   4.14600000e+03   2.67400000e+03   7.30000000e+01
#     5.00000000e+00]
#  [  1.95000000e+02   1.44850000e+04   1.22810000e+04   2.97000000e+02
Example #38
                nwords += 1
            except:
                continue
        return feat_vect/nwords

    def transform(self,X):
        Xtf = np.vstack([self.sentence2vector(x) for x in X])
        return Xtf

    def fit_transform(self,X,y=None):
        self.fit(X,y)
        return self.transform(X)


if __name__ == '__main__':
    _,unlabelledData = load_data("unsupervised")
    ids,X,y = load_data("stanford")
    pipeline = Pipeline([
        ('cleaner',DataClean(clean_list=[
                            ["[^a-z]"," "],  # only letters
                            [" [ ]+", " "],  # remove extra spaces
                            ],html_clean=False)),
        ('w2v',Glove2AverageVector(data_src=unlabelledData)),
        ('classifier',RandomForestClassifier(n_estimators=100))
    ])
    cross_validate((X,y),pipeline,accuracy_score)

# num_features=100,window=10,learning_rate=0.05,epochs=10
# Stanford
# NB
# accuracy_score : 0.72772 +/- 0.00562665086886
Example #39
# ^_^ coding:utf-8 ^_^

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, grid_search, cross_validation
from sklearn.metrics import classification_report

import utilities

# Load the data
input_file = 'data_multivar.txt'
X, y = utilities.load_data(input_file)

# Split the dataset
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=5)

# Set the parameters by cross-validation
parameter_grid = [
	{'kernel': ['linear'], 'C': [1, 10, 50, 600]},
	{'kernel': ['poly'], 'degree': [2,3]},
	{'kernel': ['rbf'], 'gamma': [0.01, 0.001], 'C':[1, 10, 50, 600]}]

# Define the metrics we want to use
metrics = ['precision', 'recall_weighted']

# Search for the optimal parameters for each metric
for metric in metrics:
	print(u"为指标{}搜索最优参数:".format(metric))
	classifier = grid_search.GridSearchCV(svm.SVC(C=1), parameter_grid, cv=5, scoring=metric)
	classifier.fit(X_train, y_train)
Example #40
# loading each of the trained models
u_model = JUNIPR_class.JUNIPR_energy(p_granularity,
                                     q_granularity,
                                     model_path=up_model_path).model
d_model = JUNIPR_class.JUNIPR_energy(p_granularity,
                                     q_granularity,
                                     model_path=down_model_path).model
ud_path_probs = [[up_path, u_log_probs], [down_path, d_log_probs]]

for ud in range(len(ud_path_probs)):
    [
        daughters, endings, mothers,
        (discrete_p_splittings, discrete_q_splittings), mother_momenta
    ] = load_data(ud_path_probs[ud][0],
                  n_events=n_events,
                  batch_size=batch_size,
                  split_p_q=True,
                  p_granularity=p_granularity,
                  q_granularity=q_granularity)

    #zeros = [[0] * 100 for d in daughters]
    for i in range(len(ud_path_probs[ud][1])):
        ud_path_probs[ud][1][i] = np.zeros((2, len(daughters), 100))

    for i in range(len(mothers)):
        mothers[i][0][mothers[i][0] == -1] = 0

    for n in range(len(daughters)):
        for i_m, model in enumerate([u_model, d_model]):
            # for charge
            e, m, b, q = model.predict_on_batch(x=[
                daughters[n], mother_momenta[n], mothers[n][1],
Example #41
def train_convnet(train_size=200, valid_size=60, iterations=10000, momentum_decay=0.9, learning_rate=0.7, filter_size=10, n_hidden=500, n_filters=6, output_size=21, plot=False):
    theano.config.compute_test_value = 'off'


    # initialize some stuff
    # probably eventually want to un-hard-code this
    nbins_out = output_size
    batch_size=train_size
    rng = np.random.RandomState(4321)

    # load the data
    datasets = utilities.load_data("data/train_skies_grid.npz", train_size, valid_size, flip=False, rotate=False)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # get the shape of the input image from data
    nbins = train_set_x.get_value().shape[3]

    # prepare theano objects
    data = T.tensor4('x')
    data.tag.test_value = train_set_x.get_value()
    target = T.matrix('y')
    target.tag.test_value = train_set_y.get_value()

    # create the net
    conv_net_params = [rng, [batch_size, 3, nbins,nbins], (n_filters, 3, filter_size,filter_size), n_hidden, nbins_out]
    cls = ConvNet(data, *conv_net_params )

    val_params = copy.copy(conv_net_params)
    val_params[1][0] = valid_size
    val = ConvNet(data, *val_params)

    #Sanity check to make sure the net works
    cost = theano.function(inputs=[],
                           outputs=[cls.cost(target),cls.softmax_layer.output, cls.hidden_layer.output,
                            cls.conv_layer.output, cls.output_layer.predict(target[:,0], target[:,1])],
                           givens={data:train_set_x,
                                   target: train_set_y}
                         # ,mode=PrintEverythingMode()
    )

    print "Testing to make sure forward propagation works"
    print cost()

    # Setup learning rule
    # Currently using gradient descent with momentum
    grads = T.grad(cls.cost(target), cls.params)

    updates = {}
    momentum = {}
    for p, g in zip(cls.params, grads):
        momentum[p] = theano.shared(np.zeros_like(p.get_value()))
        updates[p] = p+learning_rate*(momentum_decay*momentum[p]-(1-momentum_decay)*g)
        updates[momentum[p]] = momentum_decay*momentum[p]-(1-momentum_decay)*g


    train_model = theano.function(inputs=[],
                                  outputs=[cls.cost(target), grads[0]],
                                  givens = {
                                    data: train_set_x,
                                    target: train_set_y
                                  },
                                  updates = updates
                                 )

    validation_cost = theano.function(inputs=[],
                                     outputs = val.cost(target),
                                     givens = {
                                        data: valid_set_x,
                                        target: valid_set_y
                                     })

    # do the actual training
    print "Training"
    val_score = []
    train_score = []
    for i in xrange(iterations):
        if i%100 == 0:
            # check the score on the validation set every 100 epochs
            # note that this returns the cost *without* the L1 penalty
            val.copy_params(cls)
            vc = validation_cost()
            print "Validation Cost:", vc
            val_score.append(vc)
            # print "Validation Prediction\n", validation_pred()[-1]
            tc = train_model()
            print tc[0], np.cast[np.ndarray](tc[1])
            train_score.append(tc)
            if i > 1500:
                # check stopping condition
                # linear least squares to last 10 points in train_score
                # see np.linalg.lstsq for explanation of how this works
                A = np.vstack([np.arange(10)*100, np.ones(10)]).T
                y = np.asarray(train_score[-10:])
                slope, intercept = np.linalg.lstsq(A, y)[0]
                if -slope < .1:
                    print "{} iterations".format(i)
                    print "Final slope: ", slope
                    print "Final intercept: ", intercept
                    break
        train_model()

        # import pdb
        # pdb.set_trace()

    print "Final Training Cost: {}".format(train_model())
    print "Final Validation Cost: {}".format(validation_cost())

    print "Validation preditions"
    print validation_pred()

    # save the model parameters
    cls.save_params("test_weights_regress.npy")

    if plot:
        plt.figure()
        plt.plot(val_score)
        plt.plot(train_score)
        plt.legend(["Validation Cost", "Training Cost"])
        plt.show()
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn import neighbors, datasets

from utilities import load_data

# Load input data
input_file = 'data_nn_classifier.txt'
data = load_data(input_file)
X, y = data[:,:-1], data[:,-1].astype(np.int)

# Plot input data
plt.figure()
plt.title('Input datapoints')
markers = '^sov<>hp'
mapper = np.array([markers[i] for i in y])
for i in range(X.shape[0]):
    plt.scatter(X[i, 0], X[i, 1], marker=mapper[i], 
            s=50, edgecolors='black', facecolors='none')

# Number of nearest neighbors to consider
num_neighbors = 10

# step size of the grid
h = 0.01  

# Create a K-Neighbours Classifier model and train it
classifier = neighbors.KNeighborsClassifier(num_neighbors, weights='distance')
classifier.fit(X, y)
Example #43
        for word in sentence_tokens:
            if word in word_vocab:
                feat_vect[self.word_centroid_dict[word]] += 1
        return feat_vect

    def transform(self,X):
        Xtf = np.vstack([self.sentence2vector(x) for x in X])
        return Xtf

    def fit_transform(self,X,y=None):
        self.fit(X,y)
        return self.transform(X)


if __name__ == '__main__':
    _,unlabelledData = load_data("unsupervised")
    ids,X,y = load_data("cornell")
    pipeline = Pipeline([
        ('cleaner',DataClean(clean_list=[
                            ["[^a-z]"," "],  # only letters
                            [" [ ]+", " "],  # remove extra spaces
                            ],html_clean=False)),
        ('w2v',Word2VecKMeans(data_src=unlabelledData)),
        ('classifier',BernoulliNB())
    ])
    cross_validate((X,y),pipeline,accuracy_score)

# Stanford
# NB
# accuracy_score : 0.81932 +/- 0.00511171204197
# Confusion Matrix
            for candidate in candidates:
                candidate_feature = self.extract_features(candidate,text,doc_word_counts)
                if candidate_feature != -1:
                    candidate_features[candidate] = candidate_feature
            candidate_features_lst.append(candidate_features)
        return candidate_features_lst

def stem_y(y_true):
    stemmer = PorterStemmer()
    for idx in xrange(len(y_true)):
        for idx_cand in xrange(len(y_true[idx])):
            y_true[idx][idx_cand] = ' '.join([stemmer.stem(word) for word in y_true[idx][idx_cand].split()])
    return y_true

if __name__ == '__main__':
    ids,X,y = load_data()
    to_stem = True
    # ids = ids[:50]
    # X = X[:50]
    # y = y[:50]
    pipeline = Pipeline([
        ('cleaner',DataClean(clean_list=[
                            # ["\."," . "],
                            ["[^a-z-]"," "],  # only letters,fullstops,hyphens(Note!)
                            [" [ ]+", " "],  # remove extra spaces
                            ])),
        ('candidate_features',CandidateFeatureExtractor()),
        ('keyword_selector',PairwiseRankingSVM(keyword_count=10,keyword_maxlen=5,stem=to_stem))
    ])
    # pipeline.fit(X,y)
    # pprint(pipeline.predict(X))
Example #45
from utilities import load_data,cross_validate
from utilities import DataClean
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score



if __name__ == '__main__':
    ids,X,y = load_data("stanford")
    pipeline = Pipeline([
        ('cleaner',DataClean(clean_list=[
                            ["[^a-z]"," "],  # only letters
                            [" [ ]+", " "],  # remove extra spaces
                            ],html_clean=True)),
        ('tf',TfidfVectorizer(use_idf=False,stop_words="english")),
        ('classifier',BernoulliNB())
    ])
    cross_validate((X,y),pipeline,accuracy_score)

# Cornell
# accuracy_score : 0.561444222777 +/- 0.00476207774317
# Confusion Matrix
# [[   744.   2936.   2872.    420.    100.]
#  [   967.   6398.  17320.   2216.    372.]
#  [   435.   4617.  68438.   5425.    667.]
#  [   271.   1767.  18586.  10745.   1558.]
#  [    71.    337.   2807.   4697.   1294.]]

# Stanford
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans

import utilities

# Load data
data = utilities.load_data('data_perf.txt')

scores = []
range_values = np.arange(2, 10)

for i in range_values:
    # Train the model
    kmeans = KMeans(init='k-means++', n_clusters=i, n_init=10)
    kmeans.fit(data)
    score = metrics.silhouette_score(data, kmeans.labels_, 
                metric='euclidean', sample_size=len(data))

    print "\nNumber of clusters =", i
    print "Silhouette score =", score
                    
    scores.append(score)

# Plot scores
plt.figure()
plt.bar(range_values, scores, width=0.6, color='k', align='center')
plt.title('Silhouette score vs number of clusters')

# Plot data
Example #47
    def test_get_layer_extent(self):
        """Test layer_extent"""
        layer = load_data('point-nyc.shp')
        geojson = projestions_geoms.layer_extent(layer)
        self.assert_valid_extent(geojson)
                        kp_words = list(takewhile(lambda x:x in keywords, words[i:i+10]))
                        if len(kp_words) != len(set(kp_words)):
                            continue # No repetitions
                        avg_pagerank = sum(word_ranks[w] for w in kp_words)/float(len(kp_words))
                        keyphrases[' '.join(kp_words)] = avg_pagerank
                        # to ensure merged keywords are not overlapping
                        j = i + len(kp_words)
                keywords_lst.append([x[0] for x in sorted(keyphrases.iteritems(),key=lambda x: x[1],reverse = True)[:self.keyword_count]])
            else:
                keywords_lst.append(keywords)
        return keywords_lst



if __name__ == '__main__':
    ids,docs,keywords_doc = load_data()
    ids = ids
    docs = docs
    keywords_doc = keywords_doc
    pipeline = Pipeline([
        ('cleaner',DataClean(clean_list=[
                            ["[^a-z\.-]"," "],  # only letters,fullstops
                            [" [ ]+", " "],  # remove extra spaces
                            ])),
        ('keyword_selector',TextRank_KeywordSelection(keyword_count=10,stem=True))
    ])
    cross_validate((docs,keywords_doc),pipeline,keyword_prf,stem_y=True)

# keyword_prf_onegram - top 10 keywords - NounAdj Heuristic Word Extracter
# precision_score : 0.460607928569 +/- 0.0223582417735
# recall_score : 0.101528291878 +/- 0.00369494108571
Example #49
import numpy as np
import matplotlib.pyplot as plt

import utilities

# Load input data
input_file = 'data_multivar.txt'
X, y = utilities.load_data(input_file)

###############################################
# Separate the data into classes based on 'y'
class_0 = np.array([X[i] for i in range(len(X)) if y[i] == 0])
class_1 = np.array([X[i] for i in range(len(X)) if y[i] == 1])

# Plot the input data
plt.figure()
plt.scatter(class_0[:, 0],
            class_0[:, 1],
            facecolors='black',
            edgecolors='black',
            marker='s')
plt.scatter(class_1[:, 0],
            class_1[:, 1],
            facecolors='None',
            edgecolors='black',
            marker='s')
plt.title('Input data')

###############################################
# Train test split and SVM training
from sklearn import model_selection
Example #50
####    This is the URL for the parameters for AdaBoost Classifier

def create_test_submission(filename, prediction):
    content = ['id,ACTION']
    for i, p in enumerate(prediction):
        content.append('%i,%f' %(i+1,p))
    f = open(filename, 'w')
    f.write('\n'.join(content))
    f.close()
    print 'Saved'


cwd = os.getcwd()
trainDataLoc = cwd + '/../data/train.csv'
testDataLoc = cwd + '/../data/test.csv'
y, X = load_data(trainDataLoc)
y_test, X_test = load_data(testDataLoc, use_labels=False)
print ("encoding")
encoder = preprocessing.OneHotEncoder()
print ("fitting")
encoder.fit(np.vstack((X, X_test)))
X = encoder.transform(X)  # Returns a sparse matrix (see scipy.sparse)
X_test = encoder.transform(X_test)

print("about to classify")
clf = AdaBoostClassifier(base_estimator=None, n_estimators=900, learning_rate=1.8)
scores = clf.fit(X, y)
    # """
    # X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y, test_size=.20, random_state=SEED)
    
    # model = svm.SVC(C=1, probability=True, kernel='rbf')
Example #51
    parser.add_argument('--train_labels_path', nargs='?', type=str, default='data/wine_train_labels.csv', help='Path to training labels')
    parser.add_argument('--test_set_path', nargs='?', type=str, default='data/wine_test.csv', help='Path to the test set csv')
    parser.add_argument('--test_labels_path', nargs='?', type=str, default='data/wine_test_labels.csv', help='Path to the test labels csv')

    args = parser.parse_args()
    mode = args.mode[0]

    return args, mode


if __name__ == '__main__':
    args, mode = parse_args() # get argument from the command line

    # load the data
    train_set, train_labels, test_set, test_labels = load_data(train_set_path=args.train_set_path,
                                                                       train_labels_path=args.train_labels_path,
                                                                       test_set_path=args.test_set_path,
                                                                       test_labels_path=args.test_labels_path)
    if mode == 'feature_sel':
        selected_features = feature_selection(train_set, train_labels)
        print_features(selected_features)
    elif mode == 'knn':
        predictions = knn(train_set, train_labels, test_set, args.k)
        print_predictions(predictions)
    elif mode == 'alt':
        predictions = alternative_classifier(train_set, train_labels, test_set)
        print_predictions(predictions)
    elif mode == 'knn_3d':
        predictions = knn_three_features(train_set, train_labels, test_set, args.k)
        print_predictions(predictions)
    elif mode == 'knn_pca':
        prediction = knn_pca(train_set, train_labels, test_set, args.k)
Example #52
def train_regress_net(train_size=200, valid_size=60, iterations=10000, momentum_decay=0.9, learning_rate=0.7, filter_size=10, n_hidden=500, n_filters=6, plot=False):
    theano.config.compute_test_value = 'off'
    theano.config.DebugMode.check_strides = 0


    # initialize some stuff
    nbins_out = 6
    batch_size=8*train_size
    rng = np.random.RandomState(4321)

    # load the data
    datasets = utilities.load_data("data/train_skies_grid.npz", train_size, valid_size, flip=True, rotate=True)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # get the shape of the input image from data
    nbins = train_set_x.get_value().shape[3]

    # prepare theano objects
    data = T.tensor4('x')
    # data.tag.test_value = train_set_x.get_value()
    target = T.matrix('y')
    # target.tag.test_value = train_set_y.get_value()

    # create the net
    net_params = [rng, [batch_size, 3, nbins,nbins], (n_filters, 3, filter_size,filter_size), n_hidden, nbins_out, .001]
    cls = RegressNetWithDropoutTrain(data, *net_params )

    # create a validation net
    val_params = copy.copy(net_params)
    val_params[1][0] = valid_size
    val = RegressNetWithDropoutPredict(data, *val_params)

    #Sanity check to make sure the net works
    cost = theano.function(inputs=[],
                           outputs=cls.cost(target),
                           givens={data:train_set_x,
                                   target: train_set_y})

    print "Testing to make sure forward propagation works"
    print cost()

    # Setup learning rule
    # Currently using gradient descent with momentum
    grads = T.grad(cls.cost(target), cls.params)

    lr = T.scalar('lr')

    updates = {}
    momentum = {}
    learning_rate_scales = [1., 1., 1., 1.]
    for p, g, ls in zip(cls.params, grads, learning_rate_scales):
        momentum[p] = theano.shared(np.zeros_like(p.get_value()))
        updates[p] = p+ls*lr*(momentum_decay*momentum[p]-(1-momentum_decay)*g)
        updates[momentum[p]] = momentum_decay*momentum[p]-(1-momentum_decay)*g

    # compile the training function in theano
    # train_model_debug = theano.function(inputs=[],
    #                               outputs=[cls.cost(target), cls.output, cls.conv_layer.output, cls.hidden_layer.output],
    #                               givens = {
    #                                 data: train_set_x,
    #                                 target: train_set_y
    #                               },
    #                               updates = updates
    #                               # ,mode="DebugMode"
    #                              )
    train_model = theano.function(inputs=[lr],
                                  outputs=cls.cost(target),
                                  givens = {
                                    data: train_set_x,
                                    target: train_set_y
                                  },
                                  updates = updates
                                  # ,mode="DebugMode"
                                 )

    validation_cost = theano.function(inputs=[],
                                     outputs = val.cost(target),
                                     givens = {
                                        data: valid_set_x,
                                        target: valid_set_y
                                     })

    validation_pred = theano.function(inputs=[],
                                      outputs = val.output,
                                      givens= {data: valid_set_x})

    # do the actual training
    print "Training"
    val_score = []
    train_score = []
    for i in xrange(iterations):
        if i%100 == 0:
            # check the score on the validation set every 100 epochs
            # note that this returns the cost *without* the L1 penalty
            val.copy_params(cls)
            vc = validation_cost()
            print "Validation Cost:", vc
            val_score.append(vc)
            print "Validation Prediction\n", validation_pred()[-1]
            print "Acutal value: ", valid_set_y.get_value()[-1]
            # print "Linear weights"
            # print cls.params[-2].get_value()
            tc = train_model(max([learning_rate*i/1000., learning_rate]))
            print tc
            train_score.append(tc)
            if i > 1500:
                # check stopping condition
                # linear least squares to last 10 points in train_score
                # see np.linalg.lstsq for explanation of how this works
                A = np.vstack([np.arange(10)*100, np.ones(10)]).T
                y = np.asarray(train_score[-10:])
                slope, intercept = np.linalg.lstsq(A, y)[0]
                if abs(slope) < 1:
                    print "{} iterations".format(i)
                    print "Final slope: ", slope
                    print "Final intercept: ", intercept
                    break
        train_model(max([learning_rate*i/1000., learning_rate]))

        # import pdb
        # pdb.set_trace()

    print "Final Training Cost: {}".format(train_model(0.))
    print "Final Validation Cost: {}".format(validation_cost())

    print "Validation preditions"
    print validation_pred()

    # save the model parameters
    cls.save_params("test_weights_regress.npy")

    if plot:
        plt.figure()
        plt.plot(val_score)
        plt.plot(train_score)
        plt.legend(["Validation Cost", "Training Cost"])
        plt.show()