def get_result_from_data(data_dir, result_dir, dp_dir):
    """
    Get result from data
    :param data_dir: the pathname of the data directory
    :param result_dir: the pathname of the result directory
    :param dp_dir: the pathname of the DataPreprocessing module directory
    :return:
    """
    # Add the DataPreprocessing module directory to the import path
    sys.path.append(dp_dir)

    # Import the DataPreprocessing module
    import DataPreprocessing

    # Get the DataPreprocessing object
    dp = DataPreprocessing.DataPreprocessing(data_dir)

    # Match data files with names files
    data_names = dp.match_data_names()

    # The parallel pipelines for data preprocessing, training, testing, and evaluating the ALA classifier
    # n_jobs=-1 would use all CPUs; here it is capped at 10
    # The default backend="multiprocessing" prevents sharing memory between parent and child processes
    Parallel(n_jobs=10)(
        delayed(pipeline)(dp, data_file, names_file, result_dir)
        for data_file, names_file in data_names)
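# Usage sketch for get_result_from_data above. The directory paths here are
# hypothetical, not taken from the original project.
if __name__ == '__main__':
    get_result_from_data(data_dir='../data/',
                         result_dir='../result/',
                         dp_dir='../DataPreprocessing/')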
def getData(self, indexs, hasHeader, needHandleNegativeOneIndex, flag):
    data = []
    columns = defaultdict(list)  # each value in each column is appended to a list

    with open(self.fileName, encoding='Latin-1') as f:
        reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_NONE)
        # skip the header rows, if any
        if hasHeader == 1:
            next(reader)
            next(reader)
        # read rows into a dictionary format
        for row in reader:
            for (i, v) in enumerate(row):
                columns[i].append(v)

    for j in indexs:
        newColumns = columns[j]
        if j in needHandleNegativeOneIndex:
            # pass the caller's flag through instead of hard-coding flag=False
            newColumns = DataPreprocessing.DataPreprocessing().handleNegativeOneV2(
                [float(i) for i in newColumns], flag=flag)
        data.append(newColumns)

    # truncate every column to the length of the shortest one
    allLengths = [len(column) for column in data]
    minLength = np.array(allLengths).min()
    for i in range(0, len(data)):
        data[i] = data[i][0:minLength]
    return data
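# Example call of getData, mirroring the gaze-extraction usage that appears
# later in this collection; the file name is hypothetical. Columns 0, 2 and 3
# are read, the header rows are skipped, and columns 2 and 3 are cleaned
# through handleNegativeOneV2.
csv1 = CsvReader.CsvReader('gaze_recording.csv')
data_gaze = csv1.getData([0, 2, 3], hasHeader=1,
                         needHandleNegativeOneIndex=[2, 3], flag=True)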
def train(self, trainDF, validateDF):
    print("+++++++++++++++++++++Training model...")
    print("Remove non trainable features...")
    self.xTrain = trainDF
    self.yTrain = trainDF[self.yColDiscrete]
    if 'relevance_int' in self.xTrain:
        self.xTrain = self.xTrain.drop('relevance_int', axis=1)

    print("OneHot encoding")
    self.xTrain = pd.get_dummies(self.xTrain, sparse=True)
    self.xTrain = scipy.sparse.csc_matrix(self.xTrain)

    fm = SGDFMClassification(n_iter=1000, rank=16, l2_reg_w=0.0005,
                             l2_reg_V=0.0005, l2_reg=0.0005, step_size=0.01)
    self._model = OneVsRestClassifier(fm)
    self.fittedModel = self._model.fit(self.xTrain, self.yTrain)
    self.yPred = self.fittedModel.predict(self.xTrain)

    print("Converting to old labels")
    dp = DataPreprocessing.DataPreprocessing()
    # DataFrame.as_matrix() was removed in pandas 1.0; use .values instead
    self.yTrain = dp.transformNewLabelToOld(self.yTrain.values)
    self.yPred = dp.transformNewLabelToOld(self.yPred)
    print("self.yTrain:", self.yTrain.shape, self.yTrain[1:50, ])
    print("self.yPred:", self.yPred.shape, self.yPred[1:100, ])
    print("MSE:", mean_squared_error(self.yTrain, self.yPred))
    print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
    print("+++++++++++++++++++++Training completed")
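# Note: validateDF is accepted by train() above but never used. A validation
# pass would need to repeat the training preprocessing; a minimal sketch to
# add inside train() (assuming the same column names, and that the
# dummy-encoded validation columns are aligned with the training matrix):
xVal = validateDF.drop('relevance_int', axis=1, errors='ignore')
xVal = scipy.sparse.csc_matrix(pd.get_dummies(xVal, sparse=True))
yValPred = self.fittedModel.predict(xVal)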
def train(self, trainDF, validateDF):
    self._model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0,
                                     fit_intercept=True, intercept_scaling=1,
                                     class_weight=None, random_state=None,
                                     solver='sag', max_iter=10000,
                                     multi_class='multinomial', verbose=1,
                                     warm_start=False, n_jobs=-1)
    print("+++++++++++++++++++++Training model...")
    print("Remove non trainable features...")
    self.xTrain = trainDF
    self.yTrain = trainDF[self.yColDiscrete]
    # self.xValidate = validateDF
    # self.yValidate = validateDF[self.yColDiscrete]
    # self.xTrain.drop('search_term', axis=1, inplace=True)
    # self.xTrain.drop('relevance', axis=1, inplace=True)
    if 'relevance_int' in self.xTrain:
        self.xTrain = self.xTrain.drop('relevance_int', axis=1)
    # self.xTrain.drop('product_idx', axis=1, inplace=True)
    # self.xTrain.drop('Word2VecQueryExpansion', axis=1, inplace=True)
    # self.xValidate.drop('search_term', axis=1, inplace=True)
    # self.xValidate.drop('relevance', axis=1, inplace=True)
    # self.xValidate.drop('relevance_int', axis=1, inplace=True)
    # self.xValidate.drop('product_idx', axis=1, inplace=True)
    # self.xValidate.drop('Word2VecQueryExpansion', axis=1, inplace=True)

    print("+++++++++++++++++++++Training in progress")
    # print("self.xTrain:", list(self.xTrain))
    # print("self.yTrain:", list(self.yTrain))
    fittedModel = self._model.fit(self.xTrain, self.yTrain)
    self.yPred = fittedModel.predict(self.xTrain)
    # print("self.yPred:", list(self.yPred))

    print("Converting to old labels")
    dp = DataPreprocessing.DataPreprocessing()
    # DataFrame.as_matrix() was removed in pandas 1.0; use .values instead
    self.yTrain = dp.transformNewLabelToOld(self.yTrain.values)
    self.yPred = dp.transformNewLabelToOld(self.yPred)
    print("self.yTrain:", self.yTrain.shape, self.yTrain[1:50, ])
    print("self.yPred:", self.yPred.shape, self.yPred[1:50, ])
    print("MSE:", mean_squared_error(self.yTrain, self.yPred))
    print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
    # print("Accuracy:", accuracy_score(self.yTrain, self.yPred))
    # print("Precision:", precision_score(self.yTrain, self.yPred, average='micro'))
    # print("Recall:", recall_score(self.yTrain, self.yPred, average='micro'))
    # print("F1:", f1_score(self.yTrain, self.yPred, average='micro'))
    print("+++++++++++++++++++++Training completed")
def main():
    DP = DataPreprocessing()
    UnderExposedImages, OverExposedImages = DP.ConstructDataset()
    UnderExposedYCbCrImages = [[], [], []]
    OverExposedYCbCrImages = [[], [], []]

    with tf.Session() as sess:
        for img in UnderExposedImages:
            Image_Array = img.eval()
            # Image_Array = Image_Array / 255  # normalization
            YImage, CbImage, CrImage = rgb_to_ycbcr(Image_Array)
            UnderExposedYCbCrImages[0].append(YImage)
            UnderExposedYCbCrImages[1].append(CbImage)
            UnderExposedYCbCrImages[2].append(CrImage)
        print("test")
        # Image.fromarray(np.asarray(OverExposedImages[0].eval())).show()
    return
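# rgb_to_ycbcr is called in main() above but not defined in this snippet.
# A minimal sketch using the ITU-R BT.601 full-range (JPEG-style) conversion,
# which is an assumption about the intended colour space:
import numpy as np

def rgb_to_ycbcr(rgb):
    """Split an HxWx3 RGB array into separate Y, Cb and Cr planes."""
    rgb = rgb.astype(np.float32)
    r, g, b = rgb[..., 0], rgb[..., 1], rgb[..., 2]
    y = 0.299 * r + 0.587 * g + 0.114 * b
    cb = 128.0 - 0.168736 * r - 0.331264 * g + 0.5 * b
    cr = 128.0 + 0.5 * r - 0.418688 * g - 0.081312 * b
    return y, cb, cr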
def pipeline_all_datasets():
    """
    The pipeline for all data sets
    :return:
    """
    # Add the DataPreprocessing module directory to the import path
    sys.path.append(dp_dir)

    # Import the DataPreprocessing module
    import DataPreprocessing
    dp = DataPreprocessing.DataPreprocessing(data_dir)

    # Match data files with names files
    data_names = dp.match_data_names()

    # The pipeline for each data set (in parallel)
    # The default backend="multiprocessing" prevents sharing memory between parent and child processes
    Parallel(n_jobs=1)(
        delayed(pipeline_one_dataset)(dp, data_files, names_file)
        for data_files, names_file in data_names)
4. Taekwondo
5. MLB 1870-2016
"""
import numpy as np
import math
import time
import DataPreprocessing
import EnsembleClassifiers
import ModelValidation

DP = DataPreprocessing.DataPreprocessing()
EC = EnsembleClassifiers.EnsembleClassifiers()
MV = ModelValidation.ModelValidation()

# Special-case pre-processing
# DP.merge_taekwondo_datasets()
# DP.preprocess_sms_dataset()


def dataset_learning(dataset, output_file, dataset_name, preprocess_time, NT, F, parameters):
    # open in text mode ('w'); the original 'wb' would reject str writes in Python 3
    txtfile = open('./Learning_Results/' + output_file + '.txt', 'w')
    txtfile.write("\n::::::::::::::::::::::::::::")
    txtfile.write("\nRandom Forest Classification")
csv2 = CsvReader.CsvReader(glob(dir + 'window*.csv')[0])

# extract gaze data
data_gaze = csv1.getData([0, 2, 3], hasHeader=1,
                         needHandleNegativeOneIndex=[2, 3], flag=True)

# truncate the x and y streams (and timestamps) to a common length
if len(data_gaze[1]) != len(data_gaze[2]):
    minLen = min(len(data_gaze[1]), len(data_gaze[2]))
    data_gaze[0] = data_gaze[0][0:minLen]
    data_gaze[1] = data_gaze[1][0:minLen]
    data_gaze[2] = data_gaze[2][0:minLen]

# assign outside fixations to (-1, -1)
data_gaze[1], data_gaze[2] = DataPreprocessing.DataPreprocessing(
).correspondingXAndY(data_gaze[1], data_gaze[2])

# read video csv
data_video = csv2.getData([0, 1], hasHeader=0,
                          needHandleNegativeOneIndex=[], flag=False)

# get absolute time for both video and gaze
at_gaze = [Time.Time(i) for i in data_gaze[0]]
at_video = [Time.Time(i) for i in data_video[1]]

# interpolate the gaze stream onto the video time domain
# (time_vidoe is the keyword name expected by convertTimeDomain2Video)
start_v, end_v, data_gaze_videoTimeDomain = ConvertTimeDomainToVideo.ConvertTimeDomainToVideo(
).convertTimeDomain2Video(time_other=at_gaze, time_vidoe=at_video,
                          collections_data=data_gaze[1:len(data_gaze)])
if end_v == -1:
# -*- coding: cp1251 -*-
import codecs
import os
import re
# import pandas as pd
# import sys

import numpy as np
from requests.packages import chardet
# reload(sys)

import DataPreprocessing as dp
import CreateDictionary as cd
import DataProcessing as dproc

a1 = dp.DataPreprocessing()
a2 = cd.CreateDictionary()
a3 = dproc.DataProcessing()

# f = 'D:\scientific work\InformationExtracting\processeddataex\qword-20151019144240604.xml'
# print a.find_first_occurrences_of_header(f, u'ТАЛОН НА ОКАЗАНИЕ ВМП ИЗ ПАК')
# print a.find_last_occurrence_of_header(f, u'ТАЛОН НА ОКАЗАНИЕ ВМП ИЗ ПАК')
# text = a1.find_all_occurrences_of_header(f, u'ЭПИДАНАМНЕЗ')
# print text
# sent = a2.sentences_list(text)
# colloc = a2.collocations_list(sent)
# words = a2.words_list(colloc)
# print sent
# print colloc

# frequency lists for stationary diagnosis
p.add_argument('--hidden_size', type=int, default=2)      # RNN output size
p.add_argument('--input_size', type=int, default=4)       # RNN input size: number of UWB inputs
p.add_argument('--sequence_length', type=int, default=5)  # number of LSTM unrolling steps
p.add_argument('--output_size', type=int, default=2)      # final output size (RNN or softmax, etc.)

# FOR TEST
p.add_argument('--load_model_dir', type=str,
               default="model/RiTA_wo_fcn/stacked_bi_epoch_3000/model_0_00006-17700")
p.add_argument('--test_data', type=str, default='train_data_3D_zigzag.csv')
p.add_argument('--output_results', type=str,
               default='results/RiTA/stack_lstm_epoch3000_17700.csv')
###########

p.add_argument('--mode', type=str, default="train")  # train or test
args = p.parse_args()

data_parser = DataPreprocessing.DataPreprocessing(args.train_data, args.sequence_length)
data_parser.fitDataForMinMaxScaler()
X_data, Y_data = data_parser.set_data()  # data size: len(data) - sequence_length + 1

tf.reset_default_graph()
LSTM = LSTM(args)  # batch_size, dic_size, sequence_length, hidden_size, num_classes
print(X_data.shape)  # data size / sequence length / number of UWB inputs

# terms for learning rate decay
global_step = tf.Variable(0, trainable=False)
iter = int(len(X_data) / args.batch_size)
num_total_steps = args.epoches * iter
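# The snippet above prepares global_step and num_total_steps "for learning
# rate decay", but the decay itself is not shown. One plausible continuation
# with the TensorFlow 1.x API (the initial rate and decay factor are
# assumptions, not values from the original script):
learning_rate = tf.train.exponential_decay(0.01,              # assumed initial rate
                                           global_step,
                                           decay_steps=iter,  # decay once per epoch
                                           decay_rate=0.96,   # assumed decay factor
                                           staircase=True)
optimizer = tf.train.AdamOptimizer(learning_rate)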
def getQueryProductAttributeDataFrame(self, train_filename, test_filename,
                                      attribute_filename, description_filename,
                                      header=0):
    '''
    Takes in the HomeDepot CSVs and processes them into the following dataframes:

    1. train_query_df/test_query_df
        - id
        - product_uid
        - search_term
        - relevance
    2. product_df
        - product_uid
        - product_title
        - product_description
    3. attribute_df
        - product_uid
        - name
        - value

    :param train_filename:
    :param test_filename:
    :param attribute_filename:
    :param description_filename:
    :param header:
    :return: [train_query_df, product_df, attribute_df, test_query_df]
    '''
    train_query_df = pd.read_csv(train_filename, delimiter=',', low_memory=False,
                                 header=header, encoding="ISO-8859-1")
    test_query_df = pd.read_csv(test_filename, delimiter=',', low_memory=False,
                                header=header, encoding="ISO-8859-1")
    # encoding="ISO-8859-1" does not seem to be needed for attributes and descriptions
    attribute_df = pd.read_csv(attribute_filename, delimiter=',', low_memory=False,
                               header=header)
    description_df = pd.read_csv(description_filename, delimiter=',', low_memory=False,
                                 header=header)

    all_df = pd.concat((train_query_df, test_query_df), axis=0, ignore_index=True)
    train_query_df = train_query_df.drop('product_title', axis=1)
    test_query_df = test_query_df.drop('product_title', axis=1)

    # build product_df: one row per product, joined with its description
    product_df = all_df.drop_duplicates(['product_uid'])
    product_df = product_df.drop('relevance', axis=1)
    product_df = product_df.drop('id', axis=1)
    product_df = product_df.drop('search_term', axis=1)
    product_df = pd.merge(product_df, description_df, how='left', on='product_uid')

    dp = DataPreprocessing.DataPreprocessing()
    train_query_df = dp.transformLabels(trainDF=train_query_df, newColName='relevance_int')

    # Add an index column in train and test to reference the product rows
    train_query_df['product_idx'] = [
        product_df[product_df['product_uid'] == uid].index
        for uid in train_query_df.product_uid
    ]
    test_query_df['product_idx'] = [
        product_df[product_df['product_uid'] == uid].index
        for uid in test_query_df.product_uid
    ]

    # print("all: ", len(all_df.product_uid))
    # print("all unique: ", len(all_df.product_uid.unique()))
    # print("description_df: ", len(description_df))
    # print("description_df unique: ", len(description_df.product_uid.unique()))
    # print("product_df: ", len(product_df))
    # print("product_df info", product_df.info())
    # print("train_query_df.info: ", train_query_df.info())

    return [train_query_df, product_df, attribute_df, test_query_df]
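# Usage sketch: the method above belongs to a reader class in the original
# project, so both the `reader` instance and the file paths are assumptions.
# The four file names match the public Home Depot Kaggle data set.
train_query_df, product_df, attribute_df, test_query_df = \
    reader.getQueryProductAttributeDataFrame('train.csv', 'test.csv',
                                             'attributes.csv',
                                             'product_descriptions.csv')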
def __init__(self, id, host, port, gui=False):
    self.sensorId = id
    self.socket = Socket.MySocket(host=host, port=port)
    self.frameSize = 0.300
    self.dataPre = dataPre.DataPreprocessing()
    self.model = model.Model(gui)