Example #1
def test_dict():
    cwd = os.path.dirname(__file__)
    with open(os.path.join(cwd, 'test3.txt'), 'r') as myfile:
        data = json.load(myfile)
    vas_cog_block = data['test']['vasCogBlock']
    vas_block_size = data['test']['vasBlockSize']
    return preprocess_data(vas_cog_block, vas_block_size)
Example #2
def get_preprocessed_data():
  """Obtain the preprocessed data."""
  tickers = ['snp', 'nyse', 'djia', 'nikkei', 'hangseng', 'ftse', 'dax', 'aord']
  closing_data = preprocess.load_data(tickers)
  time_series = preprocess.preprocess_data(closing_data)
  training_test_data = preprocess.train_test_split(time_series, train_test_ratio=0.8)
  return training_test_data
Example #3
def get_test_data():
    tmp1, tmp2 = pp.read_data()
    S,A = pp.preprocess_data(tmp1, tmp2)
    _,_,S_test, A_test = split_data(S,A)
    print("test size:  ", len(S_test))
    #save_testsplit_data(S_test, A_test)
    return S_test, A_test
Example #4
def get_train_data():
    tmp1, tmp2 = pp.read_data()
    S,A = pp.preprocess_data(tmp1, tmp2)
    S_train, A_train, _, _ = split_data(S,A)
    print("train size:  ", len(S_train))
    #save_trainsplit_data(S_train, A_train)
    return S_train, A_train
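Both of the snippets above unpack a four-tuple from a split_data helper that is defined elsewhere in their project; a minimal sketch of such a helper, assuming an ordered split where the 80/20 ratio is an assumption, could be:

def split_data(S, A, train_ratio=0.8):
    # Hypothetical stand-in returning (S_train, A_train, S_test, A_test).
    n_train = int(len(S) * train_ratio)
    return S[:n_train], A[:n_train], S[n_train:], A[n_train:]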
Example #5
def main():

    verbose = 1
    X, Y = preprocess_data('creditcard.csv')
    X_train = X[0:32768]
    X_test = X[32768:65536]
    Y_train = Y[0:32768]
    Y_test = Y[32768:65536]

    # Train

    dbscan = DBSCAN_Predict(eps=0.23, min_samples=3, n_jobs=4)
    pred = dbscan.fit_predict(X_train)

    classes = dict()
    for i in range(len(X_train)):
        p = int(pred[i])
        if p not in classes: classes[p] = 0
        classes[p] = classes[p] + 1

    for x in classes:
        print('class', x, classes[x])

    confusion(pred, Y_train)

    # Predict

    y_new = dbscan.predict(X_test)

    confusion(y_new, Y_test)
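DBSCAN_Predict and confusion are not scikit-learn APIs; in particular, DBSCAN has no predict method, so the wrapper above presumably adds one. A minimal sketch, assuming new points are assigned to the cluster of their nearest core sample (or to noise, -1, when none lies within eps):

import numpy as np
from sklearn.cluster import DBSCAN

class DBSCAN_Predict(DBSCAN):
    """Hypothetical sketch of the wrapper used above; the original may differ."""
    def predict(self, X):
        X = np.asarray(X)
        labels = np.full(len(X), -1, dtype=int)
        for i, point in enumerate(X):
            # distance from the new point to every core sample found by fit()
            dists = np.linalg.norm(self.components_ - point, axis=1)
            nearest = int(np.argmin(dists))
            if dists[nearest] < self.eps:
                labels[i] = self.labels_[self.core_sample_indices_[nearest]]
        return labels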
Example #6
def correlation_matrix():
    df_final = preprocess_data(0)
    fig, ax = plt.subplots(figsize=(20, 20))
    data = df_final.copy()
    corr = data.corr()

    ax = sns.heatmap(corr,
                     ax=ax,
                     vmin=-1,
                     vmax=1,
                     center=0,
                     cmap=sns.diverging_palette(20, 220, n=200),
                     square=True,
                     annot=True)

    ax.set_xticklabels(ax.get_xticklabels(),
                       rotation=45,
                       horizontalalignment='right')

    desired_num_features = len(df_final.columns) - 1
    corr['is_in_billboard'] = corr['is_in_billboard'].apply(abs)
    corr = corr.sort_values('is_in_billboard', ascending=False)

    # add one because the first entry is the target column (is_in_billboard) itself
    extracted_features_list = corr['is_in_billboard'].head(
        desired_num_features + 1).index.values
    print("Number of features (excluding target variable column) extracted:",
          len(extracted_features_list) - 1)
    print("Features to extract:", extracted_features_list[1:])

    processed_data = data[extracted_features_list]
    plt.show()
Example #7
def main():
    print('Start')
    if args.pre:
        preprocess_data(args)
    else:
        print('Skip data preprocessing')
    try:
        word_embedding = torch.from_numpy(
            np.load(os.path.join(args.data_dir,
                                 'word_embedding.npy'))).float()
    except FileNotFoundError:
        word_embedding = None
    my_model = Model(args, word_embedding)
    my_loader = Loader(args)
    my_trainer = Trainer(args, my_model, my_loader)
    while not my_trainer.terminate():
        my_trainer.train()
        my_trainer.test()
    my_trainer.plot_loss()
    print('End')
Example #8
def loadData_binary_main(datapath, pickldata):
    with open('pickledata/main_word_list.pickle', 'rb') as g:
        main_word_list = pickle.load(g)
    # print main_word_list
    # raw_input()
    sorted_main_word_list = sorted(sorted(main_word_list),
                                   key=main_word_list.get,
                                   reverse=True)
    sorted_main_word_list_reduced = sorted_main_word_list[:5000]

    # print sorted_main_word_list_reduced

    file_list = []
    file_list = glob.glob(datapath + "/spam/*.*")
    for i in file_list:
        doc_name1 = tuple(i.split('/'))
        x = preprocess.preprocess_data(i)
        insert_dict(x, doc_name1[-1], 1, sorted_main_word_list_reduced)

    # print binary_dict

    #
    #
    file_list = []
    file_list = glob.glob(datapath + "/notspam/*.*")
    # print len(file_list)

    for j in file_list:
        doc_name2 = tuple(j.split('/'))
        y = preprocess.preprocess_data(j)
        insert_dict(y, doc_name2[-1], 0, sorted_main_word_list_reduced)

    # print binary_dict
    data_dict = pd.DataFrame(binary_dict)
    data_dict1 = data_dict.transpose()

    # print data_dict1

    with open('pickledata/binary_dict5k_sf.pickle', 'wb') as f:
        pickle.dump(data_dict1, f)
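insert_dict and binary_dict come from elsewhere in this project; a plausible sketch, assuming binary_dict maps each document name to 0/1 word-presence indicators plus its spam label:

binary_dict = {}

def insert_dict(words, doc_name, label, vocabulary):
    # Hypothetical stand-in: mark which vocabulary words occur in the document
    # and record the document's spam (1) / notspam (0) label.
    present = set(words)
    row = {word: int(word in present) for word in vocabulary}
    row['label'] = label
    binary_dict[doc_name] = row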
Example #9
def save_data(api):
    search = 'Delhi -filter:retweets'
    searched_tweet = tweepy.Cursor(api.search, q=search).items(3000)
    tweets_data = [[tweet.user.name, tweet.text]
                   for tweet in searched_tweet]
    df = pd.DataFrame(tweets_data, columns=['user', 'tweet'])
    df.to_csv('tweet.csv', index=False)
    # df = pd.read_csv('tweet.csv', encoding='latin')
    processed_df = preprocess_data(df)
    processed_df['sentiment'] = processed_df['tweet'].apply(get_sentiment)
    processed_df = processed_df.drop_duplicates('tweet')
    processed_df.to_csv('data.csv', index=False)
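get_sentiment is not shown in this excerpt; a common implementation (assumed here, not necessarily the author's) scores each tweet with TextBlob's polarity:

from textblob import TextBlob

def get_sentiment(text):
    # Hypothetical stand-in: label by the sign of the TextBlob polarity score.
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'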
Example #10
def main(_):

    global INPUT_TOKEN_INDEX, TARGET_TOKEN_INDEX, TARGET_INDEX_TOKEN, MODEL_PARAMETER

    input_tensor_train, target_tensor_train, input_tensor_val, \
    max_encoder_seq_length, max_decoder_seq_length, \
    INPUT_TOKEN_INDEX, TARGET_TOKEN_INDEX, \
    reverse_input_word_index, TARGET_INDEX_TOKEN = ps.preprocess_data(FLAGS.num_samples, FLAGS.data_path)

    buffer_size = len(input_tensor_train)
    n_batch = buffer_size // FLAGS.batch_size

    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor_train, target_tensor_train)).shuffle(buffer_size)
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)

    encoder = Encoder(len(INPUT_TOKEN_INDEX), FLAGS.embedding_dim, FLAGS.units,
                      FLAGS.batch_size)
    decoder = Decoder(len(TARGET_TOKEN_INDEX), FLAGS.embedding_dim,
                      FLAGS.units, FLAGS.batch_size)

    optimizer = tf.train.AdamOptimizer()

    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)

    train(n_batch, dataset, decoder, encoder, optimizer, checkpoint)

    MODEL_PARAMETER = {
        "encoder_seq_length": max_encoder_seq_length,
        "decoder_seq_length": max_decoder_seq_length,
        "embedding_dim": FLAGS.embedding_dim,
        "units": FLAGS.units,
    }

    save_word_analysis_data()

    # restoring the latest checkpoint in checkpoint_dir
    checkpoint.restore(tf.train.latest_checkpoint(FLAGS.checkpoint_path))

    for val in input_tensor_val[:10]:
        sentence = ''.join([reverse_input_word_index[id] for id in val])
        evaluate(sentence, encoder, decoder, max_encoder_seq_length,
                 max_decoder_seq_length)

    for val in input_tensor_train[:10]:
        sentence = ''.join([reverse_input_word_index[id] for id in val])
        evaluate(sentence, encoder, decoder, max_encoder_seq_length,
                 max_decoder_seq_length)
Example #11
def predict_sentiment(input_text, tokenizer, model):
    #print("RAW TEXT: ", input_text.encode('utf-8'))
    processed_text = preprocess_data(input_text)
    #print("PROCESSED: ", processed_text.encode('utf-8'))
    transformed_text = transform_to_sequence_of_integers([processed_text], tokenizer)
    #print("TRANSFORMED: ", transformed_text)
    padded_text = pad_sequences_of_integers(transformed_text)
    #print("PADDED: ", padded_text)

    prediction = model.predict(padded_text)
    # squash the raw prediction so it lies in 0.0-1.0 (sigmoid) or -1.0-1.0 (tanh)
    sigmoid = tf.math.sigmoid(prediction)
    tanh = tf.math.tanh(prediction)

    return tanh
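transform_to_sequence_of_integers and pad_sequences_of_integers are thin wrappers around standard Keras preprocessing utilities; a minimal sketch, assuming a fitted Keras Tokenizer and a fixed padding length (MAX_SEQUENCE_LENGTH is an assumption):

from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 100  # assumed padding length

def transform_to_sequence_of_integers(texts, tokenizer):
    # Map each text to a list of token ids with the fitted tokenizer.
    return tokenizer.texts_to_sequences(texts)

def pad_sequences_of_integers(sequences):
    # Pad/truncate every sequence to the same fixed length for the model.
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)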
Example #12
def main():
    print("The config used for this run are being saved @ {}".format(os.path.join(args.prefix, 'config_params.txt')))
    write(vars(args), os.path.join(args.prefix, 'config_params.txt'))
    mean, std = get_dataset_mean_std()
    train_cifar10, test_cifar10, train_loader, test_loader = preprocess_data((mean[0], mean[1], mean[2]), (std[0], std[1], std[2]))
    get_data_stats(train_cifar10, test_cifar10, train_loader)
    plot_train_samples(train_loader)
    L1 = args.L1   
    L2 = args.L2   
    device = torch.device("cuda" if args.cuda else "cpu")
    print(device)
    model = Net().to(device)
    summary(model, input_size=(3, 32, 32))
    if args.cmd == 'train':
        print("Model training starts on CIFAR10 dataset")
        # Enable L2 regularization with the supplied weight decay, otherwise keep the default of 0
        if L2:
            weight_decay = args.l2_weight_decay
        else:
            weight_decay = 0

        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=weight_decay)

        EPOCHS = args.epochs
        for epoch in range(EPOCHS):
            print("EPOCH:", epoch + 1)
            train(model, device, train_loader, optimizer, epoch)
            test(model, device, test_loader, optimizer, epoch)
        plot_acc_loss()
    elif args.cmd == 'test':
        print("Model inference starts on CIFAR10 dataset")
        model_name = args.best_model
        print("Loaded the best model: {} from last training session".format(model_name))
        model = load_model(Net(), device, model_name=model_name)
        y_test = np.array(test_cifar10.targets)
        print("The confusion-matrix and classification-report for this model are:")
        y_pred = model_pred(model, device, y_test, test_cifar10)
        x_test = test_cifar10.data
        display_mislabelled(model, device, x_test, y_test.reshape(-1, 1), y_pred, test_cifar10,
                            title_str='Predicted Vs Actual With L1')
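The L1 flag is presumably applied inside train(); as a hedged sketch, an L1 penalty in PyTorch is typically folded into the loss like this (add_l1_penalty and l1_lambda are assumptions, not part of the original code):

def add_l1_penalty(loss, model, l1_lambda=1e-5):
    # Sum of absolute parameter values, scaled and added to the task loss
    # before calling backward().
    l1_norm = sum(p.abs().sum() for p in model.parameters())
    return loss + l1_lambda * l1_norm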
Example #13
    def get_dataset(self, scale=True, stationary=False, indicators=False):
        '''
            Input: scale - whether to scale the input data
                   stationary - whether to difference the data to make it stationary
                   indicators - whether to add technical indicators (not used in this excerpt)
        '''
        x_df = self.df[["Close", "Open", "High", "Low",
                        "Volume"]].dropna()[:-1]
        y_df = self.df["Next_day_closing_price"].dropna().fillna(0)

        x_processed_df = preprocess.preprocess_data(x_df).fillna(0)
        if stationary:
            for col in x_processed_df.columns:
                #if not Analysis.ADFtest(x_processed_df[col]):
                print("\nMaking data stationary...\n")
                x_processed_df = Analysis.get_stationary_data(
                    x_processed_df, [col], 12)
                #Analysis.ADFtest(x_processed_df[col])

            y_df = Analysis.get_stationary_data(self.df,
                                                ["Next_day_closing_price"],
                                                12)['Next_day_closing_price']
            y_df.replace([np.inf, -np.inf, np.nan], 0, inplace=True)
        #print(x_processed_df)
        x_processed_df.replace([np.inf, -np.inf], 0, inplace=True)

        self.x_data_values = x_processed_df.fillna(0).values[:-1]
        self.y_data_values = y_df.values[:-1].reshape(-1, 1)

        self.x_scaler = MinMaxScaler(feature_range=(-1, 1))
        self.y_scaler = MinMaxScaler(feature_range=(-1, 1))

        if scale:
            self.x_data = self.x_scaler.fit_transform(self.x_data_values)
            self.y_data = self.y_scaler.fit_transform(self.y_data_values)
            #self.y_data = self.y_data_values
        else:
            self.x_data = self.x_data_values
            self.y_data = self.y_data_values
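Analysis.get_stationary_data is project-specific; a minimal sketch of what such a helper often does (differencing each column by the given lag, here 12) might be:

def get_stationary_data(df, columns, lag):
    # Hypothetical stand-in: difference each column by `lag` periods; the
    # project's real helper may use log returns or another transform.
    out = df.copy()
    for col in columns:
        out[col] = out[col].diff(lag)
    return out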
Example #14
    def record_data(self, task, preprocess=True):
        samples_to_collect = task.get_run_time() * self.sample_rate
        channels = 14
        samples_per_chunk = 80
        chunks = int(samples_to_collect / samples_per_chunk)
        data_array = np.zeros((channels, chunks, samples_per_chunk))

        data = self.signal_reader.read_signals(8960)
        # print(len(data))

        # (640, 14) => (14, 640)
        data = np.array(data).swapaxes(0, 1)

        if preprocess:
            for i, channel_data in enumerate(data):
                processed_data = preprocess_data(channel_data,
                                                 sample_rate=128,
                                                 notch=True,
                                                 bp_filter=True,
                                                 artifact_removal=True)
                data_array[i] = list(divide_chunks(processed_data, 80))
        else:
            data_array = data

        # (14, 8, 80) => (14, 80, 8) => (8, 80, 14)
        samples = data_array.swapaxes(1, 2).swapaxes(0, 2)
        labels = [task.get_task_type()] * 8  # all 8 labels have same target

        # save all data for transfer learning
        if self.transfer_learning:
            self.recorded_data['samples'].append(samples)
            self.recorded_data['labels'].extend(labels)

        task_data = {"samples": samples, "labels": labels}

        return task_data
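divide_chunks is not defined in this excerpt; the usual recipe for such a helper (assumed here) is a generator that yields fixed-size slices:

def divide_chunks(data, n):
    # Yield successive chunks of length n from data.
    for i in range(0, len(data), n):
        yield data[i:i + n]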
Example #15
def loadData_main(datapath, picklepath):
    word_list = []
    spam_word_list = []
    notspam_word_list = []

    smooth_filter = 10e-6


    spam_count1,notspam_count1 = 0,0

    file_list =  glob.glob(datapath + "/spam/*.*")
    for i in file_list:
        spam_count1 += 1
        x = preprocess.preprocess_data(i)
        spam_word_list.append(x)
        word_list.append(x)

    file_list =  glob.glob(datapath + "/notspam/*.*")
    for j in file_list:
        notspam_count1 += 1
        y = preprocess.preprocess_data(j)
        word_list.append(y)
        notspam_word_list.append(y)


    word_list = [item for sublist in word_list for item in sublist]
    main_word_list = dict(Counter(word_list))

    spam_word_list = [item for sublist in spam_word_list for item in sublist]
    main_spam_word_list = dict(Counter(spam_word_list))

    notspam_word_list = [item for sublist in notspam_word_list for item in sublist]
    main_notspam_word_list = dict(Counter(notspam_word_list))

    sorted_main_word_list = sorted(sorted(main_word_list), key=main_word_list.get, reverse=True)
    sorted_main_word_list_reduced = sorted_main_word_list[:30000]

    # main_dict = { word : [spam_count,non_spamcount]}
    word_dict = {}
    for i in sorted_main_word_list_reduced:
        temp = 0
        spam_count2 = 0
        notspam_count2 = 0
        temp1 = 0
        temp = main_spam_word_list.get(i)
        if temp is None:
            spam_count2 = smooth_filter
        else:
            spam_count2 = temp + smooth_filter

        temp1 = main_notspam_word_list.get(i)
        if temp1 is None:
            notspam_count2 = smooth_filter
        else:
            notspam_count2 = temp1 + smooth_filter
        word_dict[i] = [spam_count2, notspam_count2]
    # print word_dict

    # Dictionary of all words with their (smoothed) counts in spam and ham documents.
    with open('pickledata/word_spam_notspam_count_dict.pickle', 'wb') as a:
        pickle.dump(word_dict, a)

    #Stores the count of all the documents in train directory. Spam and ham.
    with open('pickledata/doc_count.pickle', 'wb') as f:
        pickle.dump([notspam_count1,spam_count1], f)

    #Stores a dictionary of all the unique words appearing in the whole train data set with their counts.
    with open('pickledata/main_word_list.pickle', 'wb') as g:
        pickle.dump(main_word_list, g)


    #Stores a dictionary of all the unique spam words appearing in the whole train data set with their counts.
    with open('pickledata/main_spam_word_list.pickle', 'wb') as h:
        pickle.dump(main_spam_word_list, h)

    #Stores a dictionary of all the unique notspam words appearing in the whole train data set with their counts.
    with open('pickledata/main_notspam_word_list.pickle', 'wb') as i:
        pickle.dump(main_notspam_word_list, i)
Example #16
'''
BATCH_SIZE = 10
FEATURE_NUM = 3
LABEL_NUM = 1
HIDDEN1_SIZE = 500
HIDDEN2_SIZE = 200
HIDDEN3_SIZE = 70
HIDDEN4_SIZE = 20
OUTPUT = 4
MAX_RANGE = 10000

'''
 * Get data from preprocess.py
 * The type of data is DataFrame
'''
pre = preprocess.preprocess_data()
dataframe = pre.get_data()

'''
----------------------------- Preprocessing -----------------------------
 * 'sales' is the label; 'vacation', 'temp', 'weekday' are the features
 * vacation : 1 , semester : 0
 * monday : 0 ~ sunday : 6
 * sales is binned into quartiles: 0%~25% : 0 , 25%~50% : 1 , 50%~75% : 2 , 75%~100% : 3
 * Convert the dataframe to a list so it can be passed to train_test_split
 * In plain TensorFlow this would require tf.one_hot; Keras's to_categorical is easier to use
'''
label_list = dataframe['sales'].values.tolist()
label = np.transpose([label_list])
categorical_labels = to_categorical(label, nb_classes=4)
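For reference, the tf.one_hot alternative mentioned in the comment block would look roughly like this (assuming integer class labels 0-3):

import tensorflow as tf

# One-hot encode the integer labels with plain TensorFlow instead of Keras.
categorical_labels_tf = tf.one_hot(label_list, depth=4)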
Example #17
File: popescu.py, Project: vene/misc-nlp
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, StratifiedShuffleSplit
import preprocess


if __name__ == "__main__":
    print "Loading training and test data..."
    X_sg, y_sg = preprocess.load_data("data/singular.txt")
    X_sg_n_clean = preprocess.load_data("data/singular_n.txt", labels=False)
    X_sg = np.r_[X_sg, X_sg_n_clean]
    y_sg = np.r_[y_sg, 2 * np.ones(len(X_sg_n_clean))]
    X_sg_p = preprocess.preprocess_data(X_sg, suffix="$", n=5, return_vect=False, binarize=False)

    train_split, test_split = iter(StratifiedShuffleSplit(y_sg, 1, test_size=0.1, random_state=0)).next()

    X_train, y_train = X_sg[train_split], y_sg[train_split]
    X_test, y_test = X_sg[test_split], y_sg[test_split]
    raise Exception
    scores = np.empty((5, 2, 2))
    best_C = np.empty((5, 2, 2))
    vectorizers = np.empty((5, 2, 2), dtype=np.object)
    for i, n in enumerate((2, 3, 4, 5, 6)):
        for j, suffix in enumerate(("", "$")):
            for k, binarize in enumerate((True, False)):
                X_p, vect = preprocess.preprocess_data(X_train, suffix=suffix, n=n, return_vect=True, binarize=binarize)

                grid = GridSearchCV(
Example #18
File: train_size.py, Project: vene/misc-nlp
    for sg, this_y_sg, pl, this_y_pl in zip(X_sg_all, y_sg_all, X_pl_all, y_pl_all):
        # get rid of balauri
        sg = sg.strip()
        pl = pl.strip()
        if not (pl.endswith("uri") and sg.endswith("ur")):
            X_sg.append(sg)
            y_sg.append(this_y_sg)
            X_pl.append(pl)
            y_pl.append(this_y_pl)
    X_sg = np.array(X_sg)
    y_sg = np.array(y_sg)
    X_pl = np.array(X_pl)
    y_pl = np.array(y_pl)

    print len(X_sg)
    X_sg_p, v_sg = preprocess.preprocess_data(X_sg, suffix="$", n=5, return_vect=True, binarize=False)
    X_pl_p, v_pl = preprocess.preprocess_data(X_pl, suffix="$", n=5, return_vect=True, binarize=False)

    X_sg_n_clean = preprocess.load_data("data/singular_n.txt", labels=False)
    X_sg_n = v_sg.transform(X_sg_n_clean)
    # X_sg_n = Binarizer(copy=False).transform(v_sg.transform(X_sg_n_clean))

    X_pl_n_clean = preprocess.load_data("data/plural_n.txt", labels=False)
    X_pl_n = v_pl.transform(X_pl_n_clean)
    # X_pl_n = Binarizer(copy=False).transform(v_pl.transform(X_pl_n_clean))

    scores = []
    n_steps = 100
    print "size  \tratio\tsg_score\tpl_score\tscore   \tsg_std  \tpl_std  \tstd"
    for train_proportion in np.linspace(0.1, 1, 10):
        train_size = len(X_sg) * train_proportion
Example #19
def run_training():
    df = pd.read_csv("deliveries.csv")
    features = preprocess.preprocess_data(df)
    train_score_predictor(features)
    train_chase_predictor(features)
Example #20
            X_pl_n.append(pl)
    X_sg_n = np.array(X_sg_n)
    X_pl_n = np.array(X_pl_n)
    scores_sg = np.empty((5, 2, 2))
    predict_sg = np.empty((5, 2, 2))
    best_C_sg = np.empty((5, 2, 2))
    scores_pl = np.empty((5, 2, 2))
    best_C_pl = np.empty((5, 2, 2))
    predict_pl = np.empty((5, 2, 2))

    for i, n in enumerate((2, 3, 4, 5, 6)):
        for j, suffix in enumerate(('', '$')):
            for k, binarize in enumerate((True, False)):
                print "%d-%d-%d out of 411" % (i, j, k)
                X_sg_p, v_sg = preprocess.preprocess_data(X_sg, suffix=suffix,
                                                          n=n, return_vect=True,
                                                          binarize=binarize)
                X_pl_p, v_pl = preprocess.preprocess_data(X_pl, suffix=suffix,
                                                          n=n, return_vect=True,
                                                          binarize=binarize)

                grid1 = GridSearchCV(estimator=LinearSVC(), n_jobs=-1,
                                     verbose=True,
                                     param_grid={'C': np.logspace(-2, 2, 5)},
                                     cv=KFold(len(X_sg), k=10, indices=True))
                grid1.fit(X_sg_p, y_sg)
                scores_sg[i, j, k] = grid1.best_score
                best_C_sg[i, j, k] = grid1.best_estimator.C
                clf = grid1.best_estimator

                X_sg_n_p = v_sg.transform(X_sg_n)
Example #21

import sys

import numpy as np

from sklearn.svm.sparse import LinearSVC
from preprocess import get_clf, load_data, preprocess_data
from sklearn.metrics import classification_report
from sklearn.cross_validation import KFold, LeaveOneOut
from sklearn.grid_search import GridSearchCV

if __name__ == '__main__':
	filename = 'inf-all-labeled.txt'

	X, y = load_data(filename)
	n = len(X)
	scores = np.empty((5, 2, 2), dtype=np.float)
	best_C = np.empty((5, 2, 2), dtype=np.float)
	for i, ngrams in enumerate((2, 3, 4, 5, 6)):
		for j, suffix in enumerate(('', '$')):
			for k, binarize in enumerate((True, False)):
				print "ngrams=%d, suffix=%s, binarize=%s" % (ngrams, suffix, binarize)
				X_new = preprocess_data(X, n=ngrams, suffix=suffix, binarize=binarize)
				grid = GridSearchCV(estimator=LinearSVC(), n_jobs=4, verbose=False,
							    	param_grid={'C': (0.01, 0.03, 0.1, 0.3, 1, 1.3)},
									cv=LeaveOneOut(n, indices=True))
				grid.fit(X_new, y)
				scores[i, j, k] = grid.best_score
				best_C[i, j, k] = grid.best_estimator.C
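The last few examples (#17, #18, #20, #21) target Python 2 and a pre-0.18 scikit-learn: GridSearchCV, KFold, StratifiedShuffleSplit, and LeaveOneOut now live in sklearn.model_selection, sklearn.svm.sparse is gone, and the fitted attributes are best_score_ and best_estimator_. A roughly equivalent modern setup for the inner grid search above (X_new and y as produced by load_data / preprocess_data in the snippet):

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, LeaveOneOut

# Modern equivalent of the leave-one-out grid search over C.
grid = GridSearchCV(estimator=LinearSVC(), n_jobs=4,
                    param_grid={'C': (0.01, 0.03, 0.1, 0.3, 1, 1.3)},
                    cv=LeaveOneOut())
grid.fit(X_new, y)
print(grid.best_score_, grid.best_estimator_.C)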