def get_data(
    self,
    dsn_database,
    dsn_hostname,
    dsn_port,
    dsn_protocol,
    dsn_uid,
    dsn_pwd,
    level,
):
    # Instantiate the preprocessor under its own name so the `preprocess`
    # module is not shadowed (assigning to `preprocess` here would make it a
    # local and raise UnboundLocalError).
    preprocessor = preprocess.Preprocessor()
    raw_data = preprocessor.db2_connect(
        dsn_database, dsn_hostname, dsn_port, dsn_protocol, dsn_uid, dsn_pwd
    )
    data = preprocessor.data_preprocess(raw_data, level)
    return data
def test(self, dataset):
    """Accepts a dataset for testing.

    Calculates the probability of each test entry against all categories
    and predicts its label.
    """
    predictions = []
    for data in dataset:
        # Clean the test data entry
        cleaned_data = data_preprocess(data)
        # Compute the posterior probability of the entry
        post_prob = self.get_test_prob(cleaned_data)
        # Store the predicted label of the entry
        predictions.append(self.classes[np.argmax(post_prob)])
    return np.array(predictions)
def __init__(self, filename, cleaning, max_vocab_size, update_embeds):
    revs, word2idx = data_preprocess(filename, cleaning, max_vocab_size)
    data, label = feature_extraction_index(revs, word2idx)
    word_emb_mat = np.loadtxt('w_emb_mat.txt')
    # data = normalization(data)
    X_train, X_dev, Y_train, Y_dev = train_test_split(
        data, label, test_size=0.2, random_state=0)
    # print("X_train.shape: ", X_train.shape)
    self.data = X_train
    self.label = Y_train
    self.X_dev = X_dev
    self.Y_dev = Y_dev
    self.word2idx = word2idx
    self.embeddings = nn.Embedding.from_pretrained(
        torch.from_numpy(word_emb_mat), freeze=not update_embeds)
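# A small self-contained sketch of the `freeze=not update_embeds` behaviour
# used above: with freeze=True the pretrained weights receive no gradient
# updates. The 5x3 matrix and the indices below are made-up illustration data.
import numpy as np
import torch
import torch.nn as nn

word_emb_mat = np.random.rand(5, 3)        # stand-in pretrained matrix (vocab=5, dim=3)
update_embeds = False
embeddings = nn.Embedding.from_pretrained(
    torch.from_numpy(word_emb_mat), freeze=not update_embeds)

vectors = embeddings(torch.tensor([0, 2, 4]))  # look up rows 0, 2 and 4
print(vectors.shape)                           # torch.Size([3, 3])
print(embeddings.weight.requires_grad)         # False: the layer is frozen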
def transformation():
    """Do an inference on a single batch of data.

    In this sample server, we take data as CSV, convert it to a pandas
    data frame for internal use, and then convert the predictions back to
    CSV (which really just means one prediction per line, since there's a
    single column).
    """
    data = None

    # Convert from CSV to pandas
    if flask.request.content_type == 'text/csv':
        data = flask.request.data.decode('utf-8')
        s = io.StringIO(data)  # io.StringIO; the Python 2 StringIO module is unavailable here
        data = pd.read_csv(s)
    else:
        return flask.Response(response='This predictor only supports CSV data',
                              status=415, mimetype='text/plain')

    print('Invoked with {} records'.format(data.shape[0]))

    # Prep data
    prepped_data = data_preprocess(data)

    # Drop last_trip_date column
    prepped_data.drop(["last_trip_date"], axis=1, inplace=True)
    data_array = prepped_data.values
    print(data_array[0])

    # Do the prediction
    predictions = ScoringService.predict(data_array)

    # Convert from numpy back to CSV
    out = io.StringIO()
    pd.DataFrame({'results': predictions}).to_csv(out, header=False, index=False)
    result = out.getvalue()

    return flask.Response(response=result, status=200, mimetype='text/csv')
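# A hedged client-side sketch for exercising transformation(). The route and
# port are assumptions (the snippet does not show how the function is mounted
# in Flask), and the CSV columns other than last_trip_date are placeholders.
import requests

csv_payload = "col_a,col_b,last_trip_date\n1.0,2.0,2014-06-01\n"
response = requests.post(
    "http://localhost:8080/invocations",          # assumed route and port
    data=csv_payload,
    headers={"Content-Type": "text/csv"},
)
print(response.status_code)   # 200 on success, 415 for non-CSV payloads
print(response.text)          # one prediction per line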
    return parameters


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Building Interactive Intelligent Systems')
    parser.add_argument('-c', '--clean', help='True to do data cleaning, default is False',
                        action='store_true')
    parser.add_argument('-mv', '--max_vocab', help='max vocab size predefined, no limit if set -1',
                        required=False, default=-1)
    parser.add_argument('-lr', '--learning_rate', required=False, default=0.001)
    parser.add_argument('-i', '--num_iter', required=False, default=1)
    parser.add_argument('-fn', '--file_name', help='file name', required=False, default='myTest')
    args = vars(parser.parse_args())
    print(args)

    print('[Read the data from twitter-sentiment-testset.csv...]')
    revs, word2idx = data_preprocess('./twitter-sentiment-testset.csv',
                                     args['clean'], int(args['max_vocab']))

    print('[Extract features from the read data...]')
    data, label = feature_extraction_bow(revs, word2idx)
    data = normalization(data)

    # shuffle data
    shuffle_idx = np.arange(len(data))
    np.random.shuffle(shuffle_idx)
    data = data[shuffle_idx]
    label = label[shuffle_idx]

    print('[Start training...]')
    X_train, X_dev, Y_train, Y_dev = train_test_split(data, label, test_size=0.2, random_state=0)
    parameters = model(X_train.T, Y_train.T, X_dev.T, Y_dev.T, args['file_name'],
        print('Bi-direction Matching is starting ----------')
        result = []
        t = trange(len(sentences))
        for i in t:
            if len(fmm[i]) > len(bmm[i]):
                # Compare the number of segments first; keep the result with fewer words
                result.append(bmm[i])
            elif len(fmm[i]) < len(bmm[i]):
                result.append(fmm[i])
            elif fmm[i] == bmm[i]:
                result.append(fmm[i])
            else:
                # When the word counts are equal, keep the result with fewer single-character words
                count_fmm = [len(s) for s in fmm[i]].count(1)
                count_bmm = [len(s) for s in bmm[i]].count(1)
                if count_fmm > count_bmm:
                    result.append(bmm[i])
                else:
                    result.append(fmm[i])
        self.result_evalutate(result)
        self.write_to_txt(result, 'result_bm.txt')
        print('Bi-direction Matching completed ----------')
        return result


if __name__ == "__main__":
    stop_words, words_set = pre.load_word_data()
    sentences, labels = pre.data_preprocess()
    fenci = Segment_Words(stop_words, words_set, labels)
    result = fenci.bi_direction_matching(sentences)
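# For context on what fmm[i] and bmm[i] hold above: a minimal forward maximum
# matching sketch (backward matching is symmetric, scanning from the end of
# the sentence). words_set is assumed to be a plain set of dictionary words,
# and max_len caps the lookahead window.
def forward_maximum_matching(sentence, words_set, max_len=5):
    tokens = []
    i = 0
    while i < len(sentence):
        # Try the longest window first, then shrink until a dictionary hit
        # (falling back to a single character).
        for j in range(min(max_len, len(sentence) - i), 0, -1):
            piece = sentence[i:i + j]
            if piece in words_set or j == 1:
                tokens.append(piece)
                i += j
                break
    return tokens

print(forward_maximum_matching('研究生命起源', {'研究', '研究生', '生命', '起源'}))
# ['研究生', '命', '起源'] -- the greedy forward pass takes the longest match first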
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

from preprocess import data_preprocess
from NaiveBayes import NaiveBayes

raw_data = open('a1_d3.txt', 'r')
dataset = data_preprocess(raw_data)
split_dataset = np.array_split(dataset, 5)

accuracies = []
f_scores = []
for epoch in range(5):
    X_train = []
    X_test = []
    y_train = []
    y_test = []
    for i in range(5):
        if epoch == i:
            X_test = split_dataset[epoch]['Review'].values
            y_test = split_dataset[epoch]['Sentiment'].values
        else:
            X_train.append(pd.DataFrame(split_dataset[i]['Review'].values))
            y_train.append(pd.DataFrame(split_dataset[i]['Sentiment'].values))
    X_train = pd.concat(X_train)[0].values
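# The fold loop above is cut off after X_train is rebuilt. A hedged sketch of
# how each fold's evaluation might be completed, assuming the imported
# NaiveBayes takes the class labels in its constructor and exposes the
# train()/test() methods shown elsewhere in this collection; scikit-learn is
# used only for the metrics, and pos_label='positive' is an assumption about
# how the Sentiment column is encoded.
from sklearn.metrics import accuracy_score, f1_score

def evaluate_fold(X_train, y_train_parts, X_test, y_test):
    y_train = pd.concat(y_train_parts)[0].values
    model = NaiveBayes(np.unique(y_train))
    model.train(X_train, y_train)
    predictions = model.test(X_test)
    return (accuracy_score(y_test, predictions),
            f1_score(y_test, predictions, pos_label='positive'))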
def main():
    # Set seed for reproducibility
    np.random.seed(6969)

    # Preprocess
    do_smoothing = False
    do_subset = False
    do_snv = False
    do_normalize = False

    # Analysis
    inspection = 'processed'
    show_plots = True
    do_outlier_filtering = False
    do_linear_class = False
    do_nonlinear_class = False

    # Load data
    dir_path = '../datasets/indian_pines'
    data_file = 'indian_pines.mat'
    cali_file = 'calibration.mat'
    labels_file = 'indian_pines_gt.mat'
    dataloader = Dataloader(dir_path, data_file, cali_file, labels_file)

    # Create tables, resample and create test set
    X = dataloader.get_calibrated_samples()
    Y = dataloader.get_labels()
    W = dataloader.get_wave_lengths()

    X = create_table(X)
    Y = create_table(Y)
    X, Y = resample_dataset(X, Y, 4.0)
    X_train, Y_train, X_test, Y_test = create_test_set(X, Y, test_frac=0.30)

    # Smoothing, subset selection, SNV
    avg_window = 5
    # 0-38, 42-44, 48-53, 65-73, 84-86, 91-98, 120-144, 167-169, 172-220
    subset1 = np.arange(0, 38)
    subset2 = np.arange(42, 44)
    subset3 = np.arange(48, 53)
    subset4 = np.arange(65, 73)
    subset5 = np.arange(84, 86)
    subset6 = np.arange(91, 98)
    subset7 = np.arange(120, 144)
    subset8 = np.arange(167, 169)
    subset9 = np.arange(172, 220)
    subset_inds = np.concatenate((subset1, subset2, subset3, subset4, subset5,
                                  subset6, subset7, subset8, subset9))
    X_train, X_test = data_preprocess(X_train, X_test, avg_window, subset_inds,
                                      do_smoothing, do_subset, do_snv, False)
    # W = dataloader.get_wave_lengths(subset_inds)

    # Outlier detection
    if do_outlier_filtering:
        outliers = hotellings_t2(X_train, Y_train, 0.05, True, False,
                                 fig_num=3, fig_size=(12, 6))
        outliers = np.take(outliers, [2])  # 95% CI: Total 45; 2
        inspect_outliers(W, X_train, outliers)
        X_train, Y_train = remove_outliers(X_train, Y_train, outliers)

    # Normalization
    if do_normalize:
        X_train = normalize(X_train)
        X_test = normalize(X_test)

    # PCA inspection
    if inspection == 'processed':
        data_inspection(W, X_train, Y_train, 17, 1, (12, 6), 'Raw Data',
                        'Wave lengths [nm]', 'Radiance [Wm^(-2)sr^(-1)]')
    elif inspection == 'pls':
        pls_inspection(X_train, Y_train, n_comps=8)
    elif inspection == 'pca':
        pca_inspection(X_train, Y_train, n_comps=8)
    elif inspection == 'kpca':
        kernel_pca_inspection(X_train, Y_train, 8, 'linear')

    # Linear classification
    if do_linear_class:
        linear_classification(X_train, Y_train, X_test, Y_test, n_folds=5,
                              n_comps_max=10, threshold=0.90, show_plots=show_plots,
                              fignum=2, figsize=(8, 6), normalize=False)

    # Non-linear classification
    if do_nonlinear_class:
        gamma_min = 1e-5  # 1e-5
        gamma_max = 3e-1  # 3e-1
        n_gammas = 30     # 30
        gammas = np.linspace(gamma_min, gamma_max, n_gammas)
        best_gamma = svm_cross_validation(X_train, Y_train, n_folds=5, kernel='rbf',
                                          gammas=gammas, show_plots=show_plots)
        svm_classification(X_train, Y_train, X_test, Y_test, kernel='rbf', gamma=best_gamma)
def train(self, dataset, labels):
    """Accepts a dataset of shape (l x d), where l is the number of training
    examples and d their dimensionality, and labels of shape (l,).

    Training function for the Naive Bayes model.
    Computes the BoW for each class.
    """
    self.dataset = dataset
    self.labels = labels

    if not isinstance(self.dataset, np.ndarray):
        self.dataset = np.array(self.dataset)
    if not isinstance(self.labels, np.ndarray):
        self.labels = np.array(self.labels)

    for cat_index, category in enumerate(self.classes):
        # Get all data for that category
        all_cat_data = self.dataset[self.labels == category]
        # Clean the gathered data
        cleaned_data = [data_preprocess(cat_data) for cat_data in all_cat_data]
        cleaned_data = pd.DataFrame(data=cleaned_data)
        # Construct the BoW for that category
        np.apply_along_axis(self.add_to_BoW, 1, cleaned_data, cat_index)

    if self.reduce:
        self.reduce_words()

    prob_classes = np.empty(self.classes.shape[0])
    all_words = []
    cat_word_counts = np.empty(self.classes.shape[0])
    for cat_index, category in enumerate(self.classes):
        # Compute the prior probability of the category, p(C), as the
        # fraction of training examples carrying that label
        prob_classes[cat_index] = np.sum(self.labels == category) / float(
            self.labels.shape[0])

        # Compute the total count of all words in the category
        # count = list(self.bow_dicts[cat_index].values()) # removed +1
        cat_word_counts[cat_index] = np.sum(
            np.array(list(self.bow_dicts[cat_index].values()))) + self.alpha

        # Get all words for this category
        all_words += self.bow_dicts[cat_index].keys()

    # Construct the vocab for the training set
    self.vocab = np.unique(np.array(all_words))
    self.vocab_length = self.vocab.shape[0]

    # Get all denominators per category
    # removed + 1 from self.vocab_length
    denoms = np.array([
        cat_word_counts[cat_index] + self.vocab_length + self.alpha
        for cat_index, category in enumerate(self.classes)
    ])

    # Compile the data into tuples
    self.cat_infos = [(self.bow_dicts[cat_index], prob_classes[cat_index],
                       denoms[cat_index])
                      for cat_index, category in enumerate(self.classes)]
    self.cat_infos = np.array(self.cat_infos)
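# The cat_infos tuples built above pair each class's BoW dictionary with its
# prior and smoothing denominator. get_test_prob() is not shown here, so this
# is only a sketch of the Laplace-smoothed posterior those tuples imply:
#
#   p(w | c) = (count(w, c) + alpha) / denom_c
#   log p(c | doc) is proportional to log p(c) + sum over tokens of log p(w | c)
#
import numpy as np

def sketch_log_posterior(tokens, cat_infos, alpha):
    log_posteriors = []
    for bow_dict, prior, denom in cat_infos:
        log_likelihood = sum(
            np.log((bow_dict.get(token, 0) + alpha) / denom) for token in tokens)
        log_posteriors.append(np.log(prior) + log_likelihood)
    return np.array(log_posteriors)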