def main(argv):
    modelSpecs = config.InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    ## load the dataset. Data is a list of proteins and each protein is represented as a dict()
    Data = DataProcessor.LoadDistanceLabelMatrices(modelSpecs['dataset'], modelSpecs=modelSpecs)
    print '#proteins loaded from the dataset: ', len(Data)
    allProteins = [d['name'] for d in Data]

    print 'Preparing batch data for training...'
    groupSize = modelSpecs['minibatchSize']
    batches = DataProcessor.SplitData2Batches(data=Data, numDataPoints=groupSize, modelSpecs=modelSpecs)
    print "#batches:", len(batches)

    ## calculate the empirical reference state.
    ## RefState is a dict: RefState[response] = (length-independent ref, length-dependent ref).
    ## The length-independent ref is a 1d array; the length-dependent ref is a list of tuples (length, 1d array).
    RefState = CalcRefState(batches=batches, modelSpecs=modelSpecs)
    RefState['dataset'] = modelSpecs['dataset']
    RefState['proteins'] = allProteins

    ## save RefState
    responseStr = '-'.join(modelSpecs['responses'])
    file4save = 'EmpRefState-' + responseStr + '-' + str(os.getpid()) + '.pkl'
    fh = open(file4save, 'wb')
    cPickle.dump(RefState, fh, protocol=cPickle.HIGHEST_PROTOCOL)
    fh.close()

    ## print the length-independent reference state
    for response in modelSpecs['responses']:
        print RefState[response][0]
def main():
    # learn embeddings
    word2vec.word2vec()
    # convert training, test and eval data into np arrays
    DataProcessor.build_data()
    # calculate sentiments for the data
    lstm.lstm_script()
def get_accuracy_windowed(classifier, parameters, length, slide, arms, wrists, detections):
    # Assemble training data
    armIMU = arms[0]
    wristIMU = wrists[0]
    detection = detections[0]
    for i in range(1, 5):
        armIMU = np.concatenate((armIMU, arms[i]), axis=0)
        wristIMU = np.concatenate((wristIMU, wrists[i]), axis=0)
        detection = np.concatenate((detection, detections[i]), axis=0)

    train_arm, train_wrist, train_detect = dp.apply_window(length, slide, armIMU, wristIMU, detection)
    print("Window applied to training data")

    # Assemble validation data
    arm, wrist, time, detect = dr.get_data(6)
    detect = detect.astype(int)
    detect = detect.ravel()
    val_arm, val_wrist, val_detect = dp.apply_window(length, slide, arm, wrist, detect)
    print("Window applied to validation data")

    train_arm_wrist = np.concatenate((train_arm, train_wrist), axis=1)
    val_arm_wrist = np.concatenate((val_arm, val_wrist), axis=1)

    accuracies = perform_classification(classifier, parameters, slide,
                                        train_arm_wrist, train_detect,
                                        val_arm_wrist, val_detect)
    return accuracies
def CalcFeatureExpectBySampling(metaData, modelSpecs):
    seqfeatures = []
    seqweights = []
    matrixfeatures = []
    matrixweights = []
    embedfeatures = []
    embedweights = []

    dataLocation = DataProcessor.SampleProteinInfo(metaData)
    for loc in dataLocation:
        d = DataProcessor.LoadRealData(loc, modelSpecs, loadLabel=False)
        res = CalcFeatureExpect4OneProtein(d)
        seqfeature, seqweight, matrixfeature, matrixweight = res[:4]
        seqfeatures.append(seqfeature)
        matrixfeatures.append(matrixfeature)
        seqweights.append(seqweight)
        matrixweights.append(matrixweight)
        if len(res) == 6:
            ## the last two elements are the embedding feature and its weight
            embedfeature, embedweight = res[4:]
            embedfeatures.append(embedfeature)
            embedweights.append(embedweight)

    modelSpecs['seqFeatures_expected'] = np.average(seqfeatures, axis=0, weights=seqweights)
    modelSpecs['matrixFeatures_expected'] = np.average(matrixfeatures, axis=0, weights=matrixweights)
    modelSpecs['embedFeatures_expected'] = np.average(embedfeatures, axis=0, weights=embedweights)
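# For reference, a minimal self-contained numpy sketch (synthetic vectors and scalar
# per-protein weights, not the real features) of how np.average combines per-protein
# feature vectors with weights, as done above:

import numpy as np

# three proteins, each contributing a 4-dimensional feature vector
features = [np.array([1.0, 2.0, 3.0, 4.0]),
            np.array([2.0, 2.0, 2.0, 2.0]),
            np.array([0.0, 4.0, 1.0, 3.0])]
# one weight per protein (e.g. derived from its length); an assumption for this sketch
weights = [100, 300, 200]

# weighted mean over the protein axis; the result has shape (4,)
expected = np.average(features, axis=0, weights=weights)
print(expected)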
def TrainDataLoader3(sharedQ, sharedLabelPool, sharedLabelWeightPool, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
    #print 'trainDataLoader has event: ', stopTrainDataLoader

    ## here we use labelPool to cache the labels of all the training proteins.
    ## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment,
    ## but it can only have one set of label matrices, so it is worthwhile to keep all label matrices in RAM.
    labelPool = dict()
    labelWeightPool = dict()

    ## load the labels of all training proteins
    trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
    for loc in trainDataLocation:
        d = DataProcessor.LoadRealData(loc, modelSpecs, loadFeature=False, returnMode='list')
        name = d['name']
        labelPool[name] = d['atomLabelMatrix']
        labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
        labelWeightPool[name] = labelWeightMatrix

    print 'TrainDataLoader with #PID ', os.getpid(), ' has loaded ', len(labelPool), ' label matrices and ', len(labelWeightPool), ' label weight matrices'

    ## copy labelPool and labelWeightPool into the shared dict()
    sharedLabelPool.update(labelPool)
    sharedLabelWeightPool.update(labelWeightPool)
    print 'TrainDataLoader with #PID ', os.getpid(), ' has updated the shared labelPool and labelWeightPool'

    while True:
        if stopTrainDataLoader.is_set() or os.getppid() == 1:
            print 'trainDataLoader receives the stop signal'
            break

        trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
        numOriginals = len(trainDataLocation)
        """
        maxLen = 900
        trainDataLocation, numExcluded = DataProcessor.FilterByLength(trainDataLocation, maxLen)
        print 'Exclude ', numExcluded, ' train proteins longer than ', maxLen, ' AAs'
        """

        trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
        random.shuffle(trainSeqData)

        for batch in trainSeqData:
            if stopTrainDataLoader.is_set() or os.getppid() == 1:
                print 'trainDataLoader receives the stop signal'
                break

            names = [p['name'] for p in batch]
            data = []
            for protein in batch:
                d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
                data.append(d)

            FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
            if assembleData:
                data = PrepareInput4Train(data, modelSpecs, floatType=np.float16, UseSharedMemory=UseSharedMemory)
            #print 'putting data to trainDataLoader queue...'
            sharedQ.put((data, names))

    print 'TrainDataLoader has finished loading data'
    sharedQ.close()
def TrainByOneBatch(batch, train, modelSpecs, forRefState=False):
    ## batch is a list of protein locations, so we need to load the real data here
    minibatch = DataProcessor.LoadRealData(batch, modelSpecs)

    ## make sure that the data has the same input dimension as the model specification
    FeatureUtils.CheckModelNDataConsistency(modelSpecs, minibatch)

    onebatch, names4onebatch = DataProcessor.AssembleOneBatch(minibatch, modelSpecs, forRefState=forRefState)
    x1d, x2d, x1dmask, x2dmask = onebatch[0:4]

    ## crop a large protein to deal with limited GPU memory. For sequential and embedding features,
    ## the theano model itself will crop based upon the bounding box.
    bounds = SampleBoundingBox((x2d.shape[1], x2d.shape[2]), modelSpecs['maxbatchSize'])

    #x1d_new = x1d[:, bounds[1]:bounds[3], :]
    x1d_new = x1d
    x2d_new = x2d[:, bounds[0]:bounds[2], bounds[1]:bounds[3], :]
    #x1dmask_new = x1dmask[:, bounds[1]:x1dmask.shape[1] ]
    x1dmask_new = x1dmask
    x2dmask_new = x2dmask[:, bounds[0]:x2dmask.shape[1], bounds[1]:bounds[3]]

    input = [x1d_new, x2d_new, x1dmask_new, x2dmask_new]

    ## if embedding is used
    ##if any( k in modelSpecs['seq2matrixMode'] for k in ('SeqOnly', 'Seq+SS') ):
    if config.EmbeddingUsed(modelSpecs):
        embed = onebatch[4]
        #embed_new = embed[:, bounds[1]:bounds[3], : ]
        embed_new = embed
        input.append(embed_new)
        remainings = onebatch[5:]
    else:
        remainings = onebatch[4:]

    ## crop the ground truth and weight matrices
    for x2d0 in remainings:
        if len(x2d0.shape) == 3:
            input.append(x2d0[:, bounds[0]:bounds[2], bounds[1]:bounds[3]])
        else:
            input.append(x2d0[:, bounds[0]:bounds[2], bounds[1]:bounds[3], :])

    ## add the bounding box to the input list
    input.append(bounds)

    if config.TrainByRefLoss(modelSpecs):
        if forRefState:
            input.append(np.int32(-1))
        else:
            input.append(np.int32(1))

    train_loss, train_errors, param_L2 = train(*input)
    return train_loss, train_errors, param_L2
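# A minimal numpy sketch (toy shapes and hypothetical bounds, not the real pipeline) of
# cropping a batched pairwise feature tensor with a bounding box (top, left, bottom, right),
# in the spirit of the cropping above:

import numpy as np

x2d = np.random.rand(1, 300, 300, 3).astype(np.float32)  # one 300x300 map with 3 channels

bounds = (10, 20, 266, 276)          # (top, left, bottom, right): a 256x256 window
top, left, bottom, right = bounds
x2d_cropped = x2d[:, top:bottom, left:right, :]
print(x2d_cropped.shape)             # (1, 256, 256, 3)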
def icir(factor, r, n=20, rank=False):
    if rank:
        x1 = DP.standardize(rankdata(factor))
    else:
        x1 = DP.standardize(factor)
    x2 = DP.standardize(r)
    ic = (x1 * x2).mean(1).fillna(0)
    # use the window size passed in as n (previously hard-coded to 20)
    ir = ic.rolling(n).mean() / ic.rolling(n).std()
    return ic, ir
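# For illustration only: a self-contained pandas sketch of the same IC/IR idea, with a
# plain cross-sectional z-score standing in for DP.standardize (an assumption about what
# that helper does) and synthetic factor/return data.

import numpy as np
import pandas as pd

def zscore(df):
    # cross-sectional standardization: de-mean and scale each row (one date)
    return df.sub(df.mean(axis=1), axis=0).div(df.std(axis=1), axis=0)

dates = pd.date_range("2020-01-01", periods=60)
assets = list("ABCDE")
factor = pd.DataFrame(np.random.randn(60, 5), index=dates, columns=assets)
returns = pd.DataFrame(np.random.randn(60, 5), index=dates, columns=assets)

ic = (zscore(factor) * zscore(returns)).mean(axis=1).fillna(0)   # per-date IC
ir = ic.rolling(20).mean() / ic.rolling(20).std()                # rolling IR
print(ic.tail())
print(ir.tail())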
def DetermineFeatureDimensionBySampling(metaData, modelSpecs):
    protein = DataProcessor.SampleProteinInfo(metaData, numSamples=1)[0]
    d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')

    ## obtain the dimension of each type of input feature
    modelSpecs['n_in_seq'] = DetermineNumSeqFeatures(d['seqFeatures'])
    modelSpecs['n_in_matrix'] = DetermineNumMatrixFeatures(d['matrixFeatures']) + DetermineNumMatrixFeatures(d['matrixFeatures_nomean'])
    if d.has_key('embedFeatures'):
        modelSpecs['n_in_embed'] = d['embedFeatures'].shape[1]
def CalcLabelDistributionNWeightBySampling(trainMetaData, modelSpecs):
    trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData, numSamples=10000)

    ## only load the ground truth but not the input features to save memory and speed up
    labelData = []
    for loc in trainDataLocation:
        p = DataProcessor.LoadRealData(loc, modelSpecs, loadFeature=False)
        labelData.append(p)

    CalcLabelDistributionAndWeight(labelData, modelSpecs)
def main():
    q = input("enter a query to be processed> ")
    while not q:
        q = input("no empty queries please> ")

    dp = DataProcessor()
    # list_doc = dp.process_texts(sys.argv[1:])

    reuters_texts = []
    # work with the first 200 files from the Reuters corpus
    reuters_data = reuters.fileids()[:200]
    for data in reuters_data:
        file_str = ""
        # concatenate the file contents into a single string
        file = reuters.open(data)
        for line in file:
            file_str = file_str + line
        file_str = file_str.replace('\n', '')
        file_str = file_str.replace("  ", " ")
        file_str = file_str.replace("  ", " ")
        reuters_texts.append(file_str)

    # for text in reuters_texts:
    #     print(str(text) + "\n")
    # print(reuters_texts)  # used for debugging purposes

    # returns the document frequency and per-document term frequency;
    # both are required when calculating the TF-IDF
    [document_frequency, term_frequency_document] = dp.inverted_index(reuters_texts)

    term_weights = dp.compute_weights(term_frequency_document, reuters_texts)
    # print the term weights
    # for term, weights in term_weights.items():
    #     print(term, " ", weights)

    print("document_frequency: ", document_frequency)

    [total_collection, total_distinct_terms] = dp.get_collection_lengths(reuters_texts)
    [similarity, sorted_doc_list] = dp.bm25(reuters_texts, document_frequency, term_frequency_document, q)
    document_lengths = dp.get_doc_length(reuters_texts)
    query_likelyhood_scores = dp.query_likelyhood(reuters_texts, document_lengths, total_collection, total_distinct_terms, .5)
    modded_query_vector = dp.rocchioAlgorithm(reuters_texts, term_weights, q, 1, 1, 1)
    precision_score = dp.precision(q, reuters_texts)

    # output statements
    # print("total_collection: ", total_collection)
    # print("document lengths: ", document_lengths)
    print("Query: ", q)
    print("using bm25 smoothing: ", similarity)
    # print("sorted_doc_list: ", sorted_doc_list)
    print("query_likelyhood_scores: ", query_likelyhood_scores)
    print("modded_query_vector taken from Rocchio's algorithm: ", modded_query_vector)
    print("precision score from precision function for the query " + q + ": ", precision_score)
def ValidDataLoader2(sharedQ, stopValidDataLoader, validSeqData, modelSpecs, assembleData=True, UseSharedMemory=False):
    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)
    if any([bUseCCMraw, bUseFullMI, bUseFullCov]):
        ## when full coevolution matrices are used, we shall use float16 to save memory
        floatType = np.float16
    else:
        floatType = theano.config.floatX

    #print 'validDataLoader has event: ', stopValidDataLoader
    for batch in validSeqData:
        if stopValidDataLoader.is_set() or os.getppid() == 1:
            #print 'validDataLoader receives the stop signal'
            break

        ## load the real data for one batch
        data = DataProcessor.LoadRealData(batch, modelSpecs, returnMode='list')

        ## make sure that the data has the same input dimension as the model specification
        FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)

        if assembleData:
            data = PrepareInput4Validate(data, modelSpecs, floatType=floatType, UseSharedMemory=UseSharedMemory)
        #print 'putting data to validDataLoader queue...'
        sharedQ.put(data)

    print 'validDataLoader has finished loading data'
    sharedQ.close()
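# The two loaders above follow a standard multiprocessing producer pattern: a worker
# process fills a shared queue until a stop Event is set (or the parent dies). A minimal,
# self-contained sketch of that pattern, unrelated to the protein data itself:

import multiprocessing as mp
import time

def producer(queue, stop_event):
    i = 0
    while not stop_event.is_set():
        queue.put(i)          # in the real loader this would be an assembled batch
        i += 1
        time.sleep(0.1)

if __name__ == '__main__':
    q = mp.Queue()
    stop = mp.Event()
    p = mp.Process(target=producer, args=(q, stop))
    p.start()

    for _ in range(5):        # consumer side: take a few items
        print(q.get())

    stop.set()                # signal the producer to stop, then wait for it
    p.join()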
def ReplaceMissing(self, df: pd.DataFrame) -> pd.DataFrame:
    # length = 3
    # Create a DataProcessor object to convert the data from the csv and replace all missing attributes
    Dp = DataProcessor.DataProcessor()
    # Start the process that repairs the integrity of the dataframe from within the data processor
    data = Dp.ReplaceMissingValue(df)
    return data
def get_batch(batch_size):
    batch = []
    while len(batch) < batch_size:
        for op in operators:  # for each operator...
            data = []
            # Gather a set of images and classes from this operator's loader
            img, inclasses = next(iter(dataloaders[op]))
            for i in range(len(img)):
                ## Image, Object, Operator, nObject, nOperator
                neg_pairs = DataProcessor.get_neg_pairs([op, objects[inclasses[i]]], operators, objects)
                data.append([
                    Variable(feat_extractor(img[i].unsqueeze_(0))),
                    int(inclasses[i]),
                    int(operators.index(op)),
                    neg_pairs
                ])
                if len(data) == batch_size:
                    break
            batch = batch + data
    return [batch[i] for i in random.sample(range(len(batch)), batch_size)]
def train_model_lstm(window_size=200, step_size=150):
    data = dp.get_dataset_windowed(window_size, step_size)
    model = create_baseline_lstm(data[0][0].shape)
    history = model.fit(data[0], data[1], batch_size=64, epochs=100,
                        validation_split=0.2, verbose=0, callbacks=callbacks)
    model.save("model_lstm.h5")

    file = open("results.txt", "a")
    file.writelines([
        'RNN results: \n',
        'Topology: [LSTM(200), Dense(10)] \n',
        'Accuracy: {} \n'.format(history.history['accuracy'][-1]),
        'Val accuracy: {} \n'.format(history.history['val_accuracy'][-1]),
        'Loss: {} \n'.format(history.history['loss'][-1]),
        'Val loss: {} \n'.format(history.history['val_loss'][-1]),
        '\n'
    ])
    file.close()
    return model
def train_model_dnn():
    data = dp.get_dataset_flat()
    model = create_baseline_dnn()
    history = model.fit(data[0], data[1], batch_size=64, epochs=100,
                        validation_split=0.2, verbose=0, callbacks=callbacks)
    model.save("model_dnn.h5")

    file = open("results.txt", 'a')
    file.writelines([
        'DNN results: \n',
        'Accuracy: {} \n'.format(history.history['accuracy'][-1]),
        'Val accuracy: {} \n'.format(history.history['val_accuracy'][-1]),
        'Loss: {} \n'.format(history.history['loss'][-1]),
        'Val loss: {} \n'.format(history.history['val_loss'][-1]),
        '\n'
    ])
    file.close()
    return model
def compute_confusion_matrix(model):
    data = dp.get_dataset_windowed(200, 150)
    y_pred = model.predict(data[0])
    cm = confusion_matrix(data[1].argmax(axis=1), y_pred.argmax(axis=1))
    print(cm)

    plt.matshow(cm, cmap='Blues')
    plt.title("Confusion Matrix")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    classes = ["Walk", "Tölt", "Trot", "Canter"]
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # if normalize:
        #     plt.text(j, i, "{:0.4f}".format(cm[i, j]),
        #              horizontalalignment="center",
        #              color="white" if cm[i, j] > thresh else "black")
        # else:
        plt.text(j, i, "{:,}".format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.show()

    res = model.evaluate(data[0], data[1])
    print(res)
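# The commented-out normalize branch above corresponds to row-normalizing the confusion
# matrix; a small self-contained sketch of that step with a hypothetical 4x4 matrix:

import numpy as np

cm = np.array([[50,  2,  1,  0],
               [ 3, 45,  4,  1],
               [ 0,  5, 48,  2],
               [ 1,  0,  3, 47]])

# divide each row by its total so every row sums to 1 (a per-class recall view)
cm_normalized = cm.astype(float) / cm.sum(axis=1, keepdims=True)
print(np.round(cm_normalized, 4))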
def init(inFile, outFile):
    global MPIFX3, MPIHandler, MPIProcessor
    global dev, handler, processor

    MPIFX3 = mp.Queue()
    MPIHandler = mp.Queue()
    MPIProcessor = mp.Queue()

    dev = FX3.Emulator(MPIFX3, inFile)
    pipe = dev.getPipe()
    buffSize = dev.getBufferSize()

    handler = DataHandler.DataHandler(MPIHandler, pipe, buffSize, filename=outFile)
    realtime = handler.getRealtimeQueue()
    handler.enableRealtime()

    processor = DataProcessor.DataProcessor(MPIProcessor, realtime, [[0, 2], [3, 3]],
                                            legacy=False, fs=2.5E6, bufferSize=buffSize,
                                            calcFlow=True, numProcessors=2)
def Predict(self, newdata):
    processed = []
    processed.append(DataProcessor.Process(newdata[1]))
    tfdata = self.tfidfconverter.Transform(processed)
    pred = self.clf.predict(tfdata)
    print(newdata, " ", pred)
    print(" ")
def calc(datasetIndex, multiplierInt):
    csv = pd.DataFrame(columns=['dataset', 'bins', 'f1', 'zero-one'])
    exp = ((multiplierInt + 1) / 2)
    bins = math.ceil(2**exp)
    results = []
    for k in range(trials):
        dp = DataProcessor.DataProcessor(bin_count=bins)
        binnedDataset = dp.StartProcess(datasets[datasetIndex])
        N, Q, F, testData = train(binnedDataset)
        model = Classifier.Classifier(N, Q, F)
        classifiedData = model.classify(testData)

        stats = Results.Results()
        zeroOne = stats.ZeroOneLoss(classifiedData)
        macroF1Average = stats.statsSummary(classifiedData)

        datapoint = {
            'dataset': dataset_names[datasetIndex],
            'bins': bins,
            'f1': macroF1Average,
            'zero-one': zeroOne / 100
        }
        print(datapoint)
        csv = csv.append(datapoint, ignore_index=True)
        # trial = {"zeroOne": zeroOne, "F1": macroF1Average}
        # results.append(trial)
        # print(trial)
    data.append(csv)
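# Note that DataFrame.append was removed in pandas 2.0, so csv = csv.append(...) will fail
# on a recent pandas. A minimal sketch (hypothetical datapoints) of the usual replacement:
# collect the row dicts and build the frame once.

import pandas as pd

rows = []
for bins in (2, 4, 8):
    # in calc() this dict would come from the classifier results
    rows.append({'dataset': 'demo', 'bins': bins, 'f1': 0.9, 'zero-one': 0.1})

csv = pd.DataFrame(rows, columns=['dataset', 'bins', 'f1', 'zero-one'])
print(csv)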
def generate_windowed_data(length, slide, fs):
    train_arm, train_wrist, train_detect = dr.get_train_data()
    train_arm = np.vstack(train_arm)
    train_wrist = np.vstack(train_wrist)
    train_detect = np.hstack(train_detect)

    _, _, train_detect_window = dp.apply_window(length, slide, train_arm, train_wrist, train_detect)
    train_arm_window = da.apply_window_features(train_arm, length, slide, fs)
    train_wrist_window = da.apply_window_features(train_wrist, length, slide, fs)

    print("Training: Full Arm Shape: " + str(np.shape(train_arm)))
    print("Training: Full Wrist Shape: " + str(np.shape(train_wrist)))
    print("Training: Full Detection Shape: " + str(np.shape(train_detect)))
    print("Training: Windowed Arm Shape: " + str(np.shape(train_arm_window)))
    print("Training: Windowed Wrist Shape: " + str(np.shape(train_wrist_window)))
    print("Training: Windowed Detect Shape: " + str(np.shape(train_detect_window)))

    val_arm, val_wrist, val_detect = dr.get_val_data()
    val_arm = np.vstack(val_arm)
    val_wrist = np.vstack(val_wrist)
    val_detect = np.hstack(val_detect)

    _, _, val_detect_window = dp.apply_window(length, slide, val_arm, val_wrist, val_detect)
    val_arm_window = da.apply_window_features(val_arm, length, slide, fs)
    val_wrist_window = da.apply_window_features(val_wrist, length, slide, fs)

    print("Validation: Windowed Arm Shape: " + str(np.shape(val_arm_window)))
    print("Validation: Windowed Wrist Shape: " + str(np.shape(val_wrist_window)))
    print("Validation: Windowed Detect Shape: " + str(np.shape(val_detect_window)))

    data = {
        "val_arm": val_arm_window,
        "val_wrist": val_wrist_window,
        "val_detect": val_detect_window,
        "train_arm": train_arm_window,
        "train_wrist": train_wrist_window,
        "train_detect": train_detect_window
    }
    return data
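# apply_window above segments the continuous IMU streams into fixed-length windows with a
# given slide; a minimal numpy sketch of that idea on a synthetic 1-D signal, assuming
# length and slide are expressed in samples:

import numpy as np

def window_starts(n_samples, length, slide):
    # start index of every full window
    return range(0, n_samples - length + 1, slide)

signal = np.arange(20)          # stand-in for one IMU channel
length, slide = 8, 4
windows = np.array([signal[s:s + length] for s in window_starts(len(signal), length, slide)])
print(windows.shape)            # (4, 8)
print(windows[0])               # [0 1 2 3 4 5 6 7]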
def del_inventory(inventory_id, update_controls=[None]):
    ip = dp.InventoryProcessor(database)
    sql = ip.build_del_code(inventory_id=inventory_id)
    ip.execute_sql_code(sql)
    ip.db_con.commit()
    ip.db_con.close()
    if update_controls is not None:
        IOProcessor.sel_inventory(update_controls[0])
def del_product(product_id, update_controls=[None]):
    pp = DP.ProductsProcessor('Python210FinalDB.db')
    sql = pp.build_del_code(product_id=product_id)
    pp.execute_sql_code(sql)
    pp.db_con.commit()
    pp.db_con.close()
    if update_controls is not None:
        IOProcessor.sel_product(update_controls[0])
def del_product(product_id, update_controls=[None]):
    pp = dp.ProductProcessor(database)
    sql = pp.build_del_code(product_id=product_id)
    pp.execute_sql_code(sql)
    pp.db_con.commit()
    pp.db_con.close()
    if update_controls is not None:
        IOProcessor.sel_product(update_controls[0])
def del_inventory_counts(inventory_id, update_controls=[None]):
    pp = DP.InventoryCountProcessor('Python210FinalDB.db')
    sql = pp.build_del_code(inventory_id=inventory_id)
    pp.execute_sql_code(sql)
    pp.db_con.commit()
    pp.db_con.close()
    if update_controls is not None:
        IOProcessor.sel_inventorycounts(update_controls[0])
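# The inventory/product helpers above all follow the same pattern: build a DELETE statement
# for one id, execute it, commit, and close. A self-contained sqlite3 sketch of that pattern
# (in-memory database and a hypothetical Inventories table, not the real Python210FinalDB schema):

import sqlite3

def delete_by_id(con, table, id_column, row_id):
    # the id value is bound as a parameter rather than formatted into the SQL string
    con.execute("DELETE FROM {} WHERE {} = ?".format(table, id_column), (row_id,))
    con.commit()

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE Inventories (InventoryID INTEGER PRIMARY KEY, InventoryDate TEXT)")
con.execute("INSERT INTO Inventories VALUES (1, '2024-01-01')")
delete_by_id(con, "Inventories", "InventoryID", 1)
print(con.execute("SELECT COUNT(*) FROM Inventories").fetchone()[0])  # 0
con.close()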
def RunIVSweep(sweepSpec):
    KE2010.Initialize()
    KE237.Initialize()
    KE237.TurnOutputOn()

    presentV = sweepSpec.startV
    presentI = 0
    VIList = []
    while presentV < sweepSpec.stopV:
        print('presentV: ' + str(presentV))
        KE237.SetVoltage(presentV)
        presentI = KE2010.MeasureCurrent()
        print('presentI: ' + str(presentI))
        VIList.append((presentV, presentI))
        presentV = presentV + sweepSpec.stepV

    KE237.TurnOutputOff()
    KE2010.GPIBReset()

    dp.SaveListToCsv(VIList, name=HCPName)
    dp.FitLineFromCsv(name=HCPName)
def SaveDataToFile():
    try:
        objF = DataProcessor.File()
        objF.FileName = "EmployeeData.txt"
        objF.TextData = Employees.EmployeeList.ToString()
        print("Reached here")
        objF.SaveData()
    except Exception as e:
        print(e)
def SaveDataToFile():
    try:
        objF = DataProcessor.File()
        objF.FileName = "CustomerData.txt"
        objF.TextData = Customers.CustomerList.ToString()
        print("Reached here")
        objF.SaveData()
    except Exception as e:
        print(e)
def get_attention_weights(data, embds):
    tf.reset_default_graph()
    it = 0
    now = "han_100d_163b_50cx_0.0001_0.5d"

    with tf.Session() as sess:
        model = HierarchicalAttention(
            num_classes=2,
            vocab_size=embds.shape[0],
            embedding_size=embds.shape[1])

        root_logdir = "logs"
        logdir = "{}/run-{}-{}/".format(root_logdir, now, it)
        checkpoint_dir = "{}checkpoints".format(logdir)

        saver = tf.train.Saver()
        # saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_dir))

        # Training model
        # training_op, global_step = model.optimize()

        sess.run(tf.global_variables_initializer())
        sess.run(model.embedding_init, feed_dict={model.embedding_placeholder: embds})
        saver.restore(sess, checkpoint_dir)

        predictions = model.predict()

        # print("Evaluation:")
        x_val, y_val, sent_lengths_val, seq_lengths_val = data.fetch_val()
        feed_dict = {model.x: x_val,
                     model.y: y_val,
                     model.sent_lengths: sent_lengths_val,
                     model.seq_lengths: seq_lengths_val,
                     model.dropout_keep_prob: 1,
                     model.max_seq_length: data.val_max_seq_length,
                     model.max_sent_length: data.val_max_sent_length}

        pred, a_word, a_sent = sess.run([predictions, model.alphas_word, model.alphas_sent], feed_dict=feed_dict)
        # pred, a1, A = sess.run([predictions, model.alphas1, model.alphas2, model.alphas3, model.alphas4],
        #                        feed_dict=feed_dict)

        a_word = np.reshape(a_word, [-1, data.val_max_seq_length, data.val_max_sent_length, 1])

        # filter on correct predictions
        zipped = list(zip(x_val, pred['labels'], pred['predictions'], pred['probabilities'], a_word, a_sent))
        # print(zipped[0:2])
        selection = [list(x) for x in zipped][133]
        zipped_correct = [list(x) for x in zipped if x[1] == x[2] and x[1] == 1]
        # print(zipped_correct[0:2])

        def get_predicted_prob(x):
            return (x[3])[(x[2])]

        sorted_correct = sorted(zipped_correct, key=get_predicted_prob, reverse=True)
        print(sorted_correct[0:2])
        # selection = sorted_correct[1]

        selection_zipped_tuple = list(zip(selection[0], selection[4], selection[5]))
        # selection_zipped_tuple = list(zip(selection[0], selection[4]))
        selection_zipped = [list(x) for x in selection_zipped_tuple]
        for s in selection_zipped:
            s[0] = dp.translate_to_voc(s[0])

        return selection_zipped
def upd_inventory(inventory_id, inventory_date, update_controls=[None]):
    pp = DP.InventoryProcessor('Python210FinalDB.db')
    sql = pp.build_upd_code(inventory_id=inventory_id, inventory_date=inventory_date)
    pp.execute_sql_code(sql)
    pp.db_con.commit()
    pp.db_con.close()
    if update_controls is not None:
        IOProcessor.sel_inventory(update_controls[0])
def SaveDataToFile():
    # Save data to file
    try:
        objF = DataProcessor.File()
        objF.FileName = "CustomerData.txt"
        objF.TextData = Customers.CustomerList.ToString()
        objF.SaveData()
    except Exception as e:
        print(e)
import tables as tab
import numpy as np
import DataProcessor as dp

datafile = tab.openFile('InstrumentedBicycleData.h5')
datatable = datafile.root.data.datatable

nanList = []
for x in datatable.iterrows():
    cell = x['AccelerationX']
    vnSampRate = x['NINumSamples']
    vnSig = dp.unsize_vector(cell, vnSampRate)
    numNan = np.sum(np.isnan(vnSig))
    if numNan > 2:
        nanList.append((x['RunID'], numNan))

nanList.sort(key=lambda x: x[1])
for thing in nanList:
    print thing
import numpy as np
import cv2
import pickle
import DataProcessor
import EmotionLearner
from sklearn import svm
import matplotlib.pyplot as plt

dataProcessor = DataProcessor.DataProcessor()
data, label = dataProcessor.loadCKData_With_Hog_32x32_3x3()
data, featureMeans, featureVariance = dataProcessor.normalizeData(data)

emotionLearner = EmotionLearner.EmotionLearner()
scores = emotionLearner.crossValidateSVM(data, label)
print scores
print np.mean(scores)
clf = emotionLearner.trainSVM(data, label)

# save the training data and the svm
with open('classifier/trainingDataCK.pkl', 'wb') as output:
    pickle.dump(data, output, pickle.HIGHEST_PROTOCOL)
    pickle.dump(label, output, pickle.HIGHEST_PROTOCOL)
    pickle.dump(featureMeans, output, pickle.HIGHEST_PROTOCOL)
    pickle.dump(featureVariance, output, pickle.HIGHEST_PROTOCOL)

with open('classifier/svmCK.pkl', 'wb') as output:
    pickle.dump(clf, output, pickle.HIGHEST_PROTOCOL)
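# Since several objects are pickled into the same file above, they must be unpickled in the
# same order. A small self-contained sketch of that round trip with toy objects:

import pickle

data, label, means, variance = [1, 2, 3], [0, 1, 0], 2.0, 0.5

with open('demo.pkl', 'wb') as output:
    pickle.dump(data, output, pickle.HIGHEST_PROTOCOL)
    pickle.dump(label, output, pickle.HIGHEST_PROTOCOL)
    pickle.dump(means, output, pickle.HIGHEST_PROTOCOL)
    pickle.dump(variance, output, pickle.HIGHEST_PROTOCOL)

with open('demo.pkl', 'rb') as inp:
    data2 = pickle.load(inp)      # objects come back in the order they were dumped
    label2 = pickle.load(inp)
    means2 = pickle.load(inp)
    variance2 = pickle.load(inp)

print(data2)
print(label2)
print(means2)
print(variance2)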
# for index, data_frame in zip(range(len(data_frames)), data_frames):
#     plot([d.daily_return(data_frame)], 'Daily Return', market_name[index].upper() + ' Index')

# print(data_frames[0].head(5))
# d.plot_data([data_frames[0], data_frames[7]], ['Daily Return'], market_names=market_name)
# plt.show()

# Plot data
# data_frames[0]['Daily Return'].plot()
# print(data_frames[0].index.name)
# print(data_frames[0].columns.values.tolist())

dp = DataProcessor()
data_points = [4, 8, 12]

# Compute moving average
for data_frame in data_frames:
    data_frame = dp.get_moving_average(data_frame, data_points)

# Compute exponential moving average
for data_frame in data_frames:
    data_frame = dp.get_ewma(data_frame, data_points)

# cols = ['Adj Close', 'MA_5', 'MA_10', 'MA_15', 'MA_20']
# plot different calculations
'''
import tables as tab
import matplotlib.pyplot as plt
from scipy import stats
from scipy.interpolate import UnivariateSpline
import DataProcessor as dp

# pick a run number
runid = 124
print "RunID:", runid

# open the data file
datafile = tab.openFile('InstrumentedBicycleData.h5')
datatable = datafile.root.data.datatable

# get the raw data
niAcc = dp.get_cell(datatable, 'FrameAccelY', runid)
vnAcc = dp.get_cell(datatable, 'AccelerationZ', runid)
sampleRate = dp.get_cell(datatable, 'NISampleRate', runid)
numSamples = dp.get_cell(datatable, 'NINumSamples', runid)
speed = dp.get_cell(datatable, 'Speed', runid)
threeVolts = dp.get_cell(datatable, 'ThreeVolts', runid)

# close the file
datafile.close()

# make a nice time vector
time = dp.time_vector(numSamples, sampleRate)

# scale the NI signal from volts to m/s**2, and switch the sign
niSig = -(niAcc - threeVolts / 2.) / (300. / 1000.) * 9.81
vnSig = vnAcc
import tables as tab
import numpy as np
import matplotlib.pyplot as plt
import DataProcessor as dp

datafile = tab.openFile('InstrumentedBicycleData.h5')
datatable = datafile.root.data.datatable

for x in datatable.iterrows():
    if x['RunID'] == 4:
        pass
    else:
        if x['Maneuver'] != 'System Test':
            numSamp = x['NINumSamples']
            sampleRate = x['NISampleRate']
            time = np.linspace(0., numSamp / sampleRate, num=numSamp)
            acceleration = dp.unsize_vector(x['FrameAccelY'], numSamp)
            print '--------------------'
            print 'Run ID:', x['RunID']
            print 'Speed:', x['Speed']
            print 'Notes:', x['Notes']
            print 'Environment:', x['Environment']
            print 'Maneuver:', x['Maneuver']
            print 'Total time:', time[-1]
            print 'Time of max value:', time[np.argmax(acceleration)]
            print 'Max value:', np.max(acceleration)
            print '--------------------'
            if time[np.argmax(acceleration)] > 5.:
                plt.figure(x['RunID'])
                plt.plot(time, acceleration)
                plt.title(x['Speed'])
    week = datetime.timedelta(days=7)
    days = Analysis.findHistoricalWeekdays(data, specifiedDate)

    # extend to 1 month behind and ahead of these days:
    # grabs [-3, 0, 3], then adds [-4, -1, 2] and [-2, 1, 4]
    days = map(lambda x: x - 3 * week, days) + days + map(lambda x: x + 3 * week, days)
    days = map(lambda x: x - 1 * week, days) + days + map(lambda x: x + 1 * week, days)

    # return only the ones in the dataset
    return [day for day in days if data.has_key(day)]


if __name__ == "__main__":
    formatter = "{0:.2f}"
    print("Processing file " + sys.argv[1] + " for " + sys.argv[2])
    dataMap = DataProcessor.processFile(sys.argv[1])

    # find the rational upper bound on graphs
    dataMax = 0
    for values in dataMap.values():
        p = np.percentile(values, 90)
        dataMax = max(dataMax, p)

    givenDate = sys.argv[2].split("-")
    givenDate = datetime.date(int(givenDate[0]), int(givenDate[1]), int(givenDate[2]))
    givenTime = None
    if len(sys.argv) >= 4:
        givenTime = sys.argv[3].split(":")
        givenTime = datetime.time(int(givenTime[0]), int(givenTime[1]), int(givenTime[2]))

    # grabs the filepath
# Setting up all the options!
opts_prepro = {'rescale': rescale, 'scaleFactor': scaleFactor, 'colorSpace': colorSpace}
opts_seeds = {'method': seedSelection}
opts_labeller = {'labelsType': labelsType, 'labelThrs': labelThrs, 'colorSpace': colorSpace}

#############################################
# 2. Now it's time to play with the data
#############################################
gt_index = 0
TLabels = []
for im in Images:

    #############################################
    # 2.1. Data preprocessing & transformation
    #############################################
    imprep = dp.processData(im, opts_prepro)

    #############################################
    # 2.2. Color space transformation (from Image to np.ndarray X)
    #############################################
    X = ce.transformColorSpace(imprep, opts_prepro)

    #############################################
    # 2.3. Selection of K (number of clusters)
    #############################################
    # Done already below for K == -1

    #############################################
    # 2.4. Selection of the K seeds --> Seeds
    #############################################
    if K == -1:
        resultList = []
        if normalizedQUarterlyReviewCount[i-1][1] == 0:
            quarterDeltas.append([normalizedQUarterlyReviewCount[i][0], 0])
        else:
            quarterDeltas.append([normalizedQUarterlyReviewCount[i][0],
                                  (normalizedQUarterlyReviewCount[i][1] - normalizedQUarterlyReviewCount[i-1][1]) / normalizedQUarterlyReviewCount[i-1][1]])
    return quarterDeltas


if __name__ == '__main__':
    ticker = "SBUX"
    bizFullName = "Starbucks"

    conn = mysql.connector.connect(user='******', password='******', host='107.170.18.102', database='FinanceNLP')
    cursor_select = conn.cursor()

    erDates = getERDates(ticker, cursor_select)  # read ER dates from my DB
    dailyReviewCounts = DataProcessor.getDailyReviewsCount(bizFullName)  # list of tuples (date, review count)
    quarterReviewCounts = getQuarterCumulated(erDates, dailyReviewCounts)  # review count over quarters: tuple list (ER date, previous quarter review count)
    quarterReviewCountsList = [[item[0], item[1]] for item in quarterReviewCounts.items()]
    quarterReviewCountsList.sort(key=operator.itemgetter(0), reverse=False)

    normalizedQuarterlyReviewCount = getNormalizedQuarterlyReviewCount(quarterReviewCountsList)  # TODO: other normalization methods? or no normalization
    quarterDeltas = getQuarterlyReviewCountDeltas(normalizedQuarterlyReviewCount)  # differences between quarters
    # now quarterDeltas holds the change in daily average review count for each quarter
    ER2Surprise = getStockPriceSurprise(ticker, erDates)  # hash (date, price change)
    # print ER2Surprise

    lineReviewCount = []
    linePrice = []
    lineER = []
    print "Date\t", "ReviewCount Change\t", "Price Change\n",
    for er in quarterDeltas:
        dt = er[0]
import tensorflow as tf
import numpy as np
import DataProcessor as dp
from matplotlib import pyplot

nodeId = 205
source = "House"
startDate = "2016-01-12"
endDate = "2016-01-12"

feature_columns = [tf.feature_column.numeric_column("x", shape=[9])]

[dataSetTime, dataSetTemp, trainingDataX, trainingDataY, testDataX, testDataY] = dp.getParams(nodeId, source, startDate, endDate)

estimator = tf.estimator.DNNRegressor(
    feature_columns=feature_columns,
    hidden_units=[30, 30, 30],
    optimizer=tf.train.ProximalAdagradOptimizer(
        learning_rate=10,
        l1_regularization_strength=0.001
    ))

cX = 0
inp = trainingDataX
batchSize = 500
numBatches = len(trainingDataX) // batchSize


def input_fn():
    global cX
    if (cX > len(trainingDataX)):