def test_dict():
    cwd = os.path.dirname(__file__)
    with open(cwd + '/test3.txt', 'r') as myfile:
        data = json.load(myfile)
    vas_cog_block = data['test']['vasCogBlock']
    vas_block_size = data['test']['vasBlockSize']
    return preprocess_data(vas_cog_block, vas_block_size)
def get_preprocessed_data():
    """Obtain the preprocessed data."""
    tickers = ['snp', 'nyse', 'djia', 'nikkei', 'hangseng', 'ftse', 'dax', 'aord']
    closing_data = preprocess.load_data(tickers)
    time_series = preprocess.preprocess_data(closing_data)
    training_test_data = preprocess.train_test_split(time_series, train_test_ratio=0.8)
    return training_test_data
def get_test_data():
    tmp1, tmp2 = pp.read_data()
    S, A = pp.preprocess_data(tmp1, tmp2)
    _, _, S_test, A_test = split_data(S, A)
    print("test size: ", len(S_test))
    # save_testsplit_data(S_test, A_test)
    return S_test, A_test
def get_train_data():
    tmp1, tmp2 = pp.read_data()
    S, A = pp.preprocess_data(tmp1, tmp2)
    S_train, A_train, _, _ = split_data(S, A)
    print("train size: ", len(S_train))
    # save_trainsplit_data(S_train, A_train)
    return S_train, A_train
def main():
    verbose = 1
    X, Y = preprocess_data('creditcard.csv')
    X_train = X[0:32768]
    X_test = X[32768:65536]
    Y_train = Y[0:32768]
    Y_test = Y[32768:65536]

    # Train
    dbscan = DBSCAN_Predict(eps=0.23, min_samples=3, n_jobs=4)
    pred = dbscan.fit_predict(X_train)
    classes = dict()
    for i in range(len(X_train)):
        p = int(pred[i])
        if p not in classes:
            classes[p] = 0
        classes[p] = classes[p] + 1
    for x in classes:
        print('class', x, classes[x])
    confusion(pred, Y_train)

    # Predict
    y_new = dbscan.predict(X_test)
    confusion(y_new, Y_test)
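# The snippet above relies on a custom DBSCAN_Predict class; scikit-learn's DBSCAN itself has no
# predict() method. A minimal sketch of one possible wrapper is shown below (the
# nearest-core-sample strategy is an assumption, not the original implementation).
import numpy as np
from sklearn.cluster import DBSCAN


class DBSCAN_Predict(DBSCAN):
    def predict(self, X):
        # assign each new point the cluster of its nearest core sample if it lies within eps,
        # otherwise mark it as noise (-1)
        X = np.asarray(X)
        core_points = self.components_
        core_labels = self.labels_[self.core_sample_indices_]
        labels = np.full(len(X), -1, dtype=int)
        for i, x in enumerate(X):
            dists = np.linalg.norm(core_points - x, axis=1)
            j = dists.argmin()
            if dists[j] <= self.eps:
                labels[i] = core_labels[j]
        return labels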
def correlation_matrix():
    df_final = preprocess_data(0)
    fig, ax = plt.subplots(figsize=(20, 20))
    data = df_final.copy()
    corr = data.corr()
    ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0,
                     cmap=sns.diverging_palette(20, 220, n=200),
                     square=True, annot=True)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    desired_num_features = len(df_final.columns) - 1
    corr['is_in_billboard'] = corr['is_in_billboard'].apply(abs)
    corr = corr.sort_values('is_in_billboard', ascending=False)
    # keep one extra entry because the first one is the target column (is_in_billboard) itself
    extracted_features_list = corr['is_in_billboard'].head(desired_num_features + 1).index.values
    print("Number of features (excluding target variable column) extracted:",
          len(extracted_features_list) - 1)
    print("Features to extract:", extracted_features_list[1:])
    processed_data = data[extracted_features_list]
    plt.show()
def main():
    print('Start')
    if args.pre:
        preprocess_data(args)
    else:
        print('Skip data preprocessing')

    try:
        word_embedding = torch.from_numpy(
            np.load(os.path.join(args.data_dir, 'word_embedding.npy'))).float()
    except FileNotFoundError:
        word_embedding = None

    my_model = Model(args, word_embedding)
    my_loader = Loader(args)
    my_trainer = Trainer(args, my_model, my_loader)

    while not my_trainer.terminate():
        my_trainer.train()
        my_trainer.test()

    my_trainer.plot_loss()
    print('End')
def loadData_binary_main(datapath, pickldata):
    with open('pickledata/main_word_list.pickle', 'rb') as g:
        main_word_list = pickle.load(g)
    # print(main_word_list)
    sorted_main_word_list = sorted(sorted(main_word_list), key=main_word_list.get, reverse=True)
    sorted_main_word_list_reduced = sorted_main_word_list[:5000]
    # print(sorted_main_word_list_reduced)

    file_list = glob.glob(datapath + "/spam/*.*")
    for i in file_list:
        doc_name1 = tuple(i.split('/'))
        x = preprocess.preprocess_data(i)
        insert_dict(x, doc_name1[-1], 1, sorted_main_word_list_reduced)
    # print(binary_dict)

    file_list = glob.glob(datapath + "/notspam/*.*")
    # print(len(file_list))
    for j in file_list:
        doc_name2 = tuple(j.split('/'))
        y = preprocess.preprocess_data(j)
        insert_dict(y, doc_name2[-1], 0, sorted_main_word_list_reduced)
    # print(binary_dict)

    data_dict = pd.DataFrame(binary_dict)
    data_dict1 = data_dict.transpose()
    # print(data_dict1)
    with open('pickledata/binary_dict5k_sf.pickle', 'wb') as f:
        pickle.dump(data_dict1, f)
def save_data(api):
    search = 'Delhi -filter:retweets'
    searched_tweet = tweepy.Cursor(api.search, q=search).items(3000)
    tweets_data = [[tweet.user.name, tweet.text] for tweet in searched_tweet]
    df = pd.DataFrame(tweets_data, columns=['user', 'tweet'])
    df.to_csv('tweet.csv', index=False)
    # df = pd.read_csv('tweet.csv', encoding='latin')
    processed_df = preprocess_data(df)
    processed_df['sentiment'] = processed_df['tweet'].apply(get_sentiment)
    processed_df = processed_df.drop_duplicates('tweet')
    processed_df.to_csv('data.csv', index=False)
def main(_):
    global INPUT_TOKEN_INDEX, TARGET_TOKEN_INDEX, TARGET_INDEX_TOKEN, MODEL_PARAMETER
    input_tensor_train, target_tensor_train, input_tensor_val, \
        max_encoder_seq_length, max_decoder_seq_length, \
        INPUT_TOKEN_INDEX, TARGET_TOKEN_INDEX, \
        reverse_input_word_index, TARGET_INDEX_TOKEN = ps.preprocess_data(FLAGS.num_samples,
                                                                          FLAGS.data_path)

    buffer_size = len(input_tensor_train)
    n_batch = buffer_size // FLAGS.batch_size
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor_train, target_tensor_train)).shuffle(buffer_size)
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)

    encoder = Encoder(len(INPUT_TOKEN_INDEX), FLAGS.embedding_dim, FLAGS.units, FLAGS.batch_size)
    decoder = Decoder(len(TARGET_TOKEN_INDEX), FLAGS.embedding_dim, FLAGS.units, FLAGS.batch_size)
    optimizer = tf.train.AdamOptimizer()
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

    train(n_batch, dataset, decoder, encoder, optimizer, checkpoint)

    MODEL_PARAMETER = {
        "encoder_seq_length": max_encoder_seq_length,
        "decoder_seq_length": max_decoder_seq_length,
        "embedding_dim": FLAGS.embedding_dim,
        "units": FLAGS.units,
    }
    save_word_analysis_data()

    # restore the latest checkpoint in checkpoint_dir
    checkpoint.restore(tf.train.latest_checkpoint(FLAGS.checkpoint_path))

    for val in input_tensor_val[:10]:
        sentence = ''.join([reverse_input_word_index[id] for id in val])
        evaluate(sentence, encoder, decoder, max_encoder_seq_length, max_decoder_seq_length)
    for val in input_tensor_train[:10]:
        sentence = ''.join([reverse_input_word_index[id] for id in val])
        evaluate(sentence, encoder, decoder, max_encoder_seq_length, max_decoder_seq_length)
def predict_sentiment(input_text, tokenizer, model):
    # print("RAW TEXT: ", input_text.encode('utf-8'))
    processed_text = preprocess_data(input_text)
    # print("PROCESSED: ", processed_text.encode('utf-8'))
    transformed_text = transform_to_sequence_of_integers([processed_text], tokenizer)
    # print("TRANSFORMED: ", transformed_text)
    padded_text = pad_sequences_of_integers(transformed_text)
    # print("PADDED: ", padded_text)
    prediction = model.predict(padded_text)
    # squash the raw prediction so it lies between 0.0 and 1.0 (sigmoid) or -1.0 and 1.0 (tanh)
    sigmoid = tf.math.sigmoid(prediction)
    tanh = tf.math.tanh(prediction)
    return tanh
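# A hedged sketch of how the two helpers used above are assumed to work; the project's real
# transform_to_sequence_of_integers / pad_sequences_of_integers may differ. It only illustrates
# the usual Keras tokenize-then-pad pipeline that predict_sentiment appears to rely on.
from tensorflow.keras.preprocessing.sequence import pad_sequences


def transform_to_sequence_of_integers(texts, tokenizer):
    # map each word to the integer index learned by the fitted tokenizer
    return tokenizer.texts_to_sequences(texts)


def pad_sequences_of_integers(sequences, maxlen=100):
    # pad/truncate every sequence to the length the model was trained on
    # (maxlen=100 is an assumed value, not taken from the original project)
    return pad_sequences(sequences, maxlen=maxlen)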
def main():
    print("The config used for this run is being saved @ {}".format(
        os.path.join(args.prefix, 'config_params.txt')))
    write(vars(args), os.path.join(args.prefix, 'config_params.txt'))
    mean, std = get_dataset_mean_std()
    train_cifar10, test_cifar10, train_loader, test_loader = preprocess_data(
        (mean[0], mean[1], mean[2]), (std[0], std[1], std[2]))
    get_data_stats(train_cifar10, test_cifar10, train_loader)
    plot_train_samples(train_loader)
    L1 = args.L1
    L2 = args.L2
    device = torch.device("cuda" if args.cuda else "cpu")
    print(device)
    model = Net().to(device)
    summary(model, input_size=(3, 32, 32))

    if args.cmd == 'train':
        print("Model training starts on CIFAR10 dataset")
        # Enable L2 regularization with the supplied weight decay, otherwise keep the default of 0
        if L2:
            weight_decay = args.l2_weight_decay
        else:
            weight_decay = 0
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9,
                              weight_decay=weight_decay)
        EPOCHS = args.epochs
        for epoch in range(EPOCHS):
            print("EPOCH:", epoch + 1)
            train(model, device, train_loader, optimizer, epoch)
            test(model, device, test_loader, optimizer, epoch)
        plot_acc_loss()
    elif args.cmd == 'test':
        print("Model inference starts on CIFAR10 dataset")
        model_name = args.best_model
        print("Loaded the best model: {} from last training session".format(model_name))
        model = load_model(Net(), device, model_name=model_name)
        y_test = np.array(test_cifar10.targets)
        print("The confusion-matrix and classification-report for this model are:")
        y_pred = model_pred(model, device, y_test, test_cifar10)
        x_test = test_cifar10.data
        display_mislabelled(model, device, x_test, y_test.reshape(-1, 1), y_pred,
                            test_cifar10, title_str='Predicted Vs Actual With L1')
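# Note that in main() above, L2 regularization comes from the optimizer's weight_decay, while
# args.L1 is read but not applied there. A minimal sketch (an assumption, not the project's
# train function) of how an L1 penalty is commonly added to the loss inside the training step:
import torch


def l1_penalty(model, l1_lambda=1e-5):
    # sum of absolute weight values, scaled by an assumed l1_lambda
    return l1_lambda * sum(p.abs().sum() for p in model.parameters())

# hypothetical use inside a training step:
#   loss = criterion(output, target) + (l1_penalty(model) if args.L1 else 0)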
def get_dataset(self, scale=True, stationary=False, indicators=False):
    '''
    Input:
        scale - whether to scale the input data
    '''
    x_df = self.df[["Close", "Open", "High", "Low", "Volume"]].dropna()[:-1]
    y_df = self.df["Next_day_closing_price"].dropna().fillna(0)
    x_processed_df = preprocess.preprocess_data(x_df).fillna(0)

    if stationary:
        for col in x_processed_df.columns:
            # if not Analysis.ADFtest(x_processed_df[col]):
            print("\nMaking data stationary...\n")
            x_processed_df = Analysis.get_stationary_data(x_processed_df, [col], 12)
            # Analysis.ADFtest(x_processed_df[col])
        y_df = Analysis.get_stationary_data(
            self.df, ["Next_day_closing_price"], 12)['Next_day_closing_price']
        y_df.replace([np.inf, -np.inf, np.nan], 0, inplace=True)

    # print(x_processed_df)
    x_processed_df.replace([np.inf, -np.inf], 0, inplace=True)

    self.x_data_values = x_processed_df.fillna(0).values[:-1]
    self.y_data_values = y_df.values[:-1].reshape(-1, 1)

    self.x_scaler = MinMaxScaler(feature_range=(-1, 1))
    self.y_scaler = MinMaxScaler(feature_range=(-1, 1))
    if scale:
        self.x_data = self.x_scaler.fit_transform(self.x_data_values)
        self.y_data = self.y_scaler.fit_transform(self.y_data_values)
        # self.y_data = self.y_data_values
    else:
        self.x_data = self.x_data_values
        self.y_data = self.y_data_values
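# Because y_data is scaled to (-1, 1) above, model predictions have to be mapped back to price
# units with y_scaler.inverse_transform before they are evaluated. A self-contained sketch of
# that round trip (the toy prices below are made up, not from the original data):
import numpy as np
from sklearn.preprocessing import MinMaxScaler

y_scaler = MinMaxScaler(feature_range=(-1, 1))
prices = np.array([[100.0], [105.0], [110.0], [120.0]])  # toy closing prices
scaled = y_scaler.fit_transform(prices)                  # values in [-1, 1]
restored = y_scaler.inverse_transform(scaled)            # back to the original price units
assert np.allclose(restored, prices)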
def record_data(self, task, preprocess=True):
    samples_to_collect = task.get_run_time() * self.sample_rate
    channels = 14
    samples_per_chunk = 80
    chunks = int(samples_to_collect / samples_per_chunk)
    data_array = np.zeros((channels, chunks, samples_per_chunk))

    data = self.signal_reader.read_signals(8960)
    # print(len(data))
    # (640, 14) => (14, 640)
    data = np.array(data).swapaxes(0, 1)

    if preprocess:
        for i, channel_data in enumerate(data):
            processed_data = preprocess_data(channel_data, sample_rate=128, notch=True,
                                             bp_filter=True, artifact_removal=True)
            data_array[i] = list(divide_chunks(processed_data, 80))
    else:
        data_array = data

    # (14, 8, 80) => (14, 80, 8) => (8, 80, 14)
    samples = data_array.swapaxes(1, 2).swapaxes(0, 2)
    labels = [task.get_task_type()] * 8  # all 8 labels share the same target

    # save all data for transfer learning
    if self.transfer_learning:
        self.recorded_data['samples'].append(samples)
        self.recorded_data['labels'].extend(labels)

    task_data = {"samples": samples, "labels": labels}
    return task_data
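# A small self-contained check (not part of the original module) of the axis handling used in
# record_data above: 14 channels x 8 chunks x 80 samples become 8 samples x 80 timesteps x 14
# channels.
import numpy as np

data_array = np.zeros((14, 8, 80))                  # (channels, chunks, samples_per_chunk)
samples = data_array.swapaxes(1, 2).swapaxes(0, 2)  # (14, 8, 80) -> (14, 80, 8) -> (8, 80, 14)
print(samples.shape)                                # (8, 80, 14)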
def loadData_main(datapath, picklepath):
    word_list = []
    spam_word_list = []
    notspam_word_list = []
    smooth_filter = 10e-6  # additive smoothing constant
    spam_count1, notspam_count1 = 0, 0

    file_list = glob.glob(datapath + "/spam/*.*")
    for i in file_list:
        spam_count1 += 1
        x = preprocess.preprocess_data(i)
        spam_word_list.append(x)
        word_list.append(x)

    file_list = glob.glob(datapath + "/notspam/*.*")
    for j in file_list:
        notspam_count1 += 1
        y = preprocess.preprocess_data(j)
        word_list.append(y)
        notspam_word_list.append(y)

    word_list = [item for sublist in word_list for item in sublist]
    main_word_list = dict(Counter(word_list))
    spam_word_list = [item for sublist in spam_word_list for item in sublist]
    main_spam_word_list = dict(Counter(spam_word_list))
    notspam_word_list = [item for sublist in notspam_word_list for item in sublist]
    main_notspam_word_list = dict(Counter(notspam_word_list))

    sorted_main_word_list = sorted(sorted(main_word_list), key=main_word_list.get, reverse=True)
    sorted_main_word_list_reduced = sorted_main_word_list[:30000]

    # word_dict = {word: [spam_count, notspam_count]}
    word_dict = {}
    for i in sorted_main_word_list_reduced:
        temp = main_spam_word_list.get(i)
        spam_count2 = smooth_filter if temp is None else temp + smooth_filter
        temp1 = main_notspam_word_list.get(i)
        notspam_count2 = smooth_filter if temp1 is None else temp1 + smooth_filter
        word_dict[i] = [spam_count2, notspam_count2]
    # print(word_dict)

    # Dictionary of all words and their smoothed counts in the documents, as spam or ham.
    with open('pickledata/word_spam_notspam_count_dict.pickle', 'wb') as a:
        pickle.dump(word_dict, a)
    # Stores the count of all the documents in the train directory, spam and ham.
    with open('pickledata/doc_count.pickle', 'wb') as f:
        pickle.dump([notspam_count1, spam_count1], f)
    # Stores a dictionary of all unique words in the whole train data set with their counts.
    with open('pickledata/main_word_list.pickle', 'wb') as g:
        pickle.dump(main_word_list, g)
    # Stores a dictionary of all unique spam words in the whole train data set with their counts.
    with open('pickledata/main_spam_word_list.pickle', 'wb') as h:
        pickle.dump(main_spam_word_list, h)
    # Stores a dictionary of all unique notspam words in the whole train data set with their counts.
    with open('pickledata/main_notspam_word_list.pickle', 'wb') as i:
        pickle.dump(main_notspam_word_list, i)
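# A hedged sketch (not the original classifier) of how the smoothed [spam_count, notspam_count]
# pairs pickled above would typically be consumed: as a Naive-Bayes-style log ratio where the
# smoothing constant keeps unseen words from producing log(0).
import math


def spam_log_ratio(words, word_dict, smooth_filter=10e-6):
    # sum of log(spam_count) - log(notspam_count); normalisation terms are omitted,
    # so only the sign and relative magnitude of the score are meaningful
    score = 0.0
    for w in words:
        spam_count, notspam_count = word_dict.get(w, (smooth_filter, smooth_filter))
        score += math.log(spam_count) - math.log(notspam_count)
    return score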
'''
BATCH_SIZE = 10
FEATURE_NUM = 3
LABEL_NUM = 1
HIDDEN1_SIZE = 500
HIDDEN2_SIZE = 200
HIDDEN3_SIZE = 70
HIDDEN4_SIZE = 20
OUTPUT = 4
MAX_RANGE = 10000
'''

'''
* Get the data from preprocess.py
* The data is returned as a DataFrame
'''
pre = preprocess.preprocess_data()
dataframe = pre.get_data()

'''
----------------------------- Preprocessing -----------------------------
* 'sales' is the label; 'vacation', 'temp' and 'weekday' are the features
* vacation: 1, semester: 0
* Monday: 0 ~ Sunday: 6
* sales is binned into quartiles: 0%~25% -> 0, 25%~50% -> 1, 50%~75% -> 2, 75%~100% -> 3
* The dataframe columns are converted to lists so they can be passed to train_test_split
* In plain TensorFlow this would need tf.one_hot; Keras' to_categorical is simpler to use
'''
label_list = dataframe['sales'].values.tolist()
label = np.transpose([label_list])
categorical_labels = to_categorical(label, num_classes=4)
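# A small self-contained example (not from the original script) of what the label encoding above
# produces: to_categorical turns the quartile class ids 0-3 into one-hot rows, the Keras
# equivalent of tf.one_hot.
import numpy as np
from tensorflow.keras.utils import to_categorical

example_labels = np.array([[0], [2], [3], [1]])  # toy sales-quartile classes
print(to_categorical(example_labels, num_classes=4))
# [[1. 0. 0. 0.]
#  [0. 0. 1. 0.]
#  [0. 0. 0. 1.]
#  [0. 1. 0. 0.]]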
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, StratifiedShuffleSplit

import preprocess

if __name__ == "__main__":
    print("Loading training and test data...")
    X_sg, y_sg = preprocess.load_data("data/singular.txt")
    X_sg_n_clean = preprocess.load_data("data/singular_n.txt", labels=False)
    X_sg = np.r_[X_sg, X_sg_n_clean]
    y_sg = np.r_[y_sg, 2 * np.ones(len(X_sg_n_clean))]
    X_sg_p = preprocess.preprocess_data(X_sg, suffix="$", n=5, return_vect=False, binarize=False)

    train_split, test_split = next(iter(StratifiedShuffleSplit(y_sg, 1, test_size=0.1,
                                                               random_state=0)))
    X_train, y_train = X_sg[train_split], y_sg[train_split]
    X_test, y_test = X_sg[test_split], y_sg[test_split]

    raise Exception

    scores = np.empty((5, 2, 2))
    best_C = np.empty((5, 2, 2))
    vectorizers = np.empty((5, 2, 2), dtype=object)
    for i, n in enumerate((2, 3, 4, 5, 6)):
        for j, suffix in enumerate(("", "$")):
            for k, binarize in enumerate((True, False)):
                X_p, vect = preprocess.preprocess_data(X_train, suffix=suffix, n=n,
                                                       return_vect=True, binarize=binarize)
                grid = GridSearchCV(
for sg, this_y_sg, pl, this_y_pl in zip(X_sg_all, y_sg_all, X_pl_all, y_pl_all):
    # get rid of balauri
    sg = sg.strip()
    pl = pl.strip()
    if not (pl.endswith("uri") and sg.endswith("ur")):
        X_sg.append(sg)
        y_sg.append(this_y_sg)
        X_pl.append(pl)
        y_pl.append(this_y_pl)

X_sg = np.array(X_sg)
y_sg = np.array(y_sg)
X_pl = np.array(X_pl)
y_pl = np.array(y_pl)
print(len(X_sg))

X_sg_p, v_sg = preprocess.preprocess_data(X_sg, suffix="$", n=5, return_vect=True, binarize=False)
X_pl_p, v_pl = preprocess.preprocess_data(X_pl, suffix="$", n=5, return_vect=True, binarize=False)

X_sg_n_clean = preprocess.load_data("data/singular_n.txt", labels=False)
X_sg_n = v_sg.transform(X_sg_n_clean)
# X_sg_n = Binarizer(copy=False).transform(v_sg.transform(X_sg_n_clean))
X_pl_n_clean = preprocess.load_data("data/plural_n.txt", labels=False)
X_pl_n = v_pl.transform(X_pl_n_clean)
# X_pl_n = Binarizer(copy=False).transform(v_pl.transform(X_pl_n_clean))

scores = []
n_steps = 100
print("size \tratio\tsg_score\tpl_score\tscore \tsg_std \tpl_std \tstd")
for train_proportion in np.linspace(0.1, 1, 10):
    train_size = len(X_sg) * train_proportion
def run_training():
    df = pd.read_csv("deliveries.csv")
    features = preprocess.preprocess_data(df)
    train_score_predictor(features)
    train_chase_predictor(features)
X_pl_n.append(pl)

X_sg_n = np.array(X_sg_n)
X_pl_n = np.array(X_pl_n)

scores_sg = np.empty((5, 2, 2))
predict_sg = np.empty((5, 2, 2))
best_C_sg = np.empty((5, 2, 2))
scores_pl = np.empty((5, 2, 2))
best_C_pl = np.empty((5, 2, 2))
predict_pl = np.empty((5, 2, 2))

for i, n in enumerate((2, 3, 4, 5, 6)):
    for j, suffix in enumerate(('', '$')):
        for k, binarize in enumerate((True, False)):
            print("%d-%d-%d out of 411" % (i, j, k))
            X_sg_p, v_sg = preprocess.preprocess_data(X_sg, suffix=suffix, n=n,
                                                      return_vect=True, binarize=binarize)
            X_pl_p, v_pl = preprocess.preprocess_data(X_pl, suffix=suffix, n=n,
                                                      return_vect=True, binarize=binarize)
            grid1 = GridSearchCV(estimator=LinearSVC(), n_jobs=-1, verbose=True,
                                 param_grid={'C': np.logspace(-2, 2, 5)},
                                 cv=KFold(len(X_sg), k=10, indices=True))
            grid1.fit(X_sg_p, y_sg)
            scores_sg[i, j, k] = grid1.best_score_
            best_C_sg[i, j, k] = grid1.best_estimator_.C
            clf = grid1.best_estimator_
            X_sg_n_p = v_sg.transform(X_sg_n)
import sys

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.cross_validation import KFold, LeaveOneOut
from sklearn.grid_search import GridSearchCV

from preprocess import get_clf, load_data, preprocess_data

if __name__ == '__main__':
    filename = 'inf-all-labeled.txt'
    X, y = load_data(filename)
    n = len(X)

    scores = np.empty((5, 2, 2), dtype=float)
    best_C = np.empty((5, 2, 2), dtype=float)
    for i, ngrams in enumerate((2, 3, 4, 5, 6)):
        for j, suffix in enumerate(('', '$')):
            for k, binarize in enumerate((True, False)):
                print("ngrams=%d, suffix=%s, binarize=%s" % (ngrams, suffix, binarize))
                X_new = preprocess_data(X, n=ngrams, suffix=suffix, binarize=binarize)
                grid = GridSearchCV(estimator=LinearSVC(), n_jobs=4, verbose=False,
                                    param_grid={'C': (0.01, 0.03, 0.1, 0.3, 1, 1.3)},
                                    cv=LeaveOneOut(n, indices=True))
                grid.fit(X_new, y)
                scores[i, j, k] = grid.best_score_
                best_C[i, j, k] = grid.best_estimator_.C
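    # A short usage sketch (an addition, not part of the original script) for reading off the
    # best (ngrams, suffix, binarize) combination once the grids above have been fitted:
    best_idx = np.unravel_index(np.argmax(scores), scores.shape)
    print("best ngrams=%d, suffix=%r, binarize=%s, accuracy=%.3f, C=%g" % (
        (2, 3, 4, 5, 6)[best_idx[0]],
        ('', '$')[best_idx[1]],
        (True, False)[best_idx[2]],
        scores[best_idx],
        best_C[best_idx]))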