import sys


def main():
    filename = sys.argv[1]
    #columns = sys.argv[2].strip('[]').split(',')
    response_columns_classification = sys.argv[2].strip('[]').split(', ')
    response_columns_prediction = sys.argv[3].strip('[]').split(', ')

    #features
    columns = [
        'MO', 'N', 'INDICE20', 'PHEAU', 'PHSMP', 'KECH', 'CAECH', 'MGECH',
        'NAECH', 'HECH', 'CEC', 'PM3', 'MNM3', 'CUM3', 'FEM3', 'ALM3', 'BM3',
        'ZNM3', 'PBM3', 'MOM3', 'CDM3', 'COM3', 'CRM3', 'KM3', 'CAM3', 'MGM3',
        'NAM3'
    ]
    columns_M3 = columns[1:5] + [columns[9]] + columns[11:]
    columns = columns_M3

    #single target classification
    for response_target in response_columns_classification:
        df = read_data(filename, columns, response_target)
        data_view(df, columns, response_target)
        classification(df, columns, response_target)

    #regression
    df = read_data(filename, columns, response_columns_prediction)
    df.dropna(inplace=True)
    prediction(df, columns, response_columns_prediction)
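# The command-line parsing above expects the two target lists wrapped in
# brackets and separated by ", ". A minimal sketch of how the arguments are
# split (the script name and the TEXTURE column are hypothetical; PHEAU and MO
# come from the feature list above):
#   python soil_analysis.py data.csv "[TEXTURE]" "[PHEAU, MO]"
args = ["data.csv", "[TEXTURE]", "[PHEAU, MO]"]           # stand-ins for sys.argv[1:4]
classification_targets = args[1].strip('[]').split(', ')  # -> ['TEXTURE']
prediction_targets = args[2].strip('[]').split(', ')      # -> ['PHEAU', 'MO']
print(classification_targets, prediction_targets)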
def evaluate_accuracy(task_id, difficulty, errors=False, outputFile='result.txt', silent=False):
    if not silent:
        print('reading and processing testing data')
    X, Y = read_data(training=False, task_id=task_id, difficulty=difficulty)
    if not silent:
        print("Getting representation")
    # representation = _classifier.get_representation(task_id)
    # if representation is None:
    #     representation = _classifier.default_representation(Y)
    Y = _classifier.labels_remove_twos(Y)
    representation = _classifier.find_representation(Y)
    print('Representation for accuracy: ', representation)
    if not silent:
        print('read')
        print('predicting..')
    raw_predicted = nn.get_model(tasks_encoded[(task_id, difficulty)]).predict(X)
    # _classifier.print_labels(raw_predicted[:5])
    predicted = _classifier.get_normal_output(raw_predicted, representation)
    # print(predicted.shape)
    acc = accuracy(predicted, Y, raw_predicted, errors)
    print('Accuracy %.4f' % acc)
    return acc
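# `accuracy` is defined elsewhere in this module; the sketch below is an
# assumption of what a row-wise exact-match accuracy with the same call
# signature could look like, not the project's actual implementation.
import numpy as np


def accuracy(predicted, Y, raw_predicted=None, errors=False):
    """Fraction of samples whose whole label vector matches exactly;
    raw_predicted/errors are only used for optional error reporting."""
    predicted = np.asarray(predicted)
    Y = np.asarray(Y)
    hits = np.all(predicted == Y, axis=1)
    if errors and raw_predicted is not None:
        for i in np.where(~hits)[0][:5]:  # show a few mismatches
            print('expected', Y[i], 'got', predicted[i], 'raw', raw_predicted[i])
    return float(np.mean(hits))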
def check():
    #raw_labels = np.array([[0.0, 0.142, 0.542, 0.001, 0.0, 0.13, 0.124, 0.0, 0.0, 0.061000001]])
    #representation = [(8, 1)]
    #Y, _ = transform_labels_with_representation(labels_remove_twos(Y), 4)
    _, Y = read_data(task_id=3, difficulty=2)
    Y = labels_remove_twos(Y)
    rep = find_representation(Y)
    print(rep)
def read_data_sheet(self):
    datafile = frame_1.datafile.GetValue()
    if not os.path.exists(datafile):
        self.popup_box("Can't find " + datafile, "Can't find " + datafile)
        return
    #print all_genes_and_traits
    #data_sheet.update(readdata.read_data(datafile))
    data_list, column_labels = readdata.read_data(datafile)
    self.data_sheet.extend(data_list)
    #column_labels = data_sheet.keys()
    self.all_genes_and_traits.extend(column_labels)
    #print "all", all_genes_and_traits
    #all_genes_and_traits.sort()
    frame_1.gene_list.Set(self.all_genes_and_traits)
    frame_1.trait_list.Set(self.all_genes_and_traits)
    # assume that the selection variable is in first columns
    frame_1.selection_variable_list.SetItems(self.all_genes_and_traits[0:20])
def main():
    # uncomment this to create new model
    prepare_training_set()
    # return

    # load model back
    clf = joblib.load('model.pkl')
    # read sample
    image = read_data(sys.stdin)
    # extract the characters
    characters = extract_characters(image)
    for character in characters:
        prediction = clf.predict(character.ravel().reshape(1, -1))
        sys.stdout.write(prediction[0])
    sys.stdout.write('\n')
def get_original_output(task_id, difficulty):
    X, Y = read_data(training=False, task_id=task_id, difficulty=difficulty)
    Y = _classifier.labels_remove_twos(Y)
    representation = _classifier.find_representation(Y)
    raw_predicted = nn.get_model(tasks_encoded[(task_id, difficulty)]).predict(X)
    predicted = _classifier.get_normal_output(raw_predicted, representation)

    def transform_single_label(label, how):
        # pad the prediction back out: positions marked 2 in `how` were
        # dropped earlier, so re-insert zeros there
        i = 0
        new = []
        for id in range(len(how)):
            if how[id] == 2:
                new.append(0)
            else:
                new.append(label[i])
                i += 1
        return new

    return list(map(lambda label: transform_single_label(label, Y[0]), predicted))
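# With the `return new` fix, `transform_single_label` pads a prediction back
# out to the original label width, re-inserting zeros wherever the reference
# label (Y[0]) holds a 2. A small worked example with made-up values:
#   how   = [1, 2, 0, 2, 1]   # columns marked 2 were dropped
#   label = [0.9, 0.1, 0.8]   # prediction over the remaining columns
#   transform_single_label(label, how) -> [0.9, 0, 0.1, 0, 0.8]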
def model_statistics_on_task(task_id, difficulty, epochs):
    # train for 1
    X_, Y = read_data(task_id=task_id, difficulty=difficulty)
    X_, Y, _ = transform_data(X_, Y, task_id)
    nn.construct_model(X_[0].shape)
    nn.add_new_task(len(Y[0]))
    model = nn.get_model().model
    X = nn.get_model().make_input(X_)
    tasks_encoded[(task_id, difficulty)] = len(nn.tasks) - 1
    acc_history = []
    for epoch in range(epochs + 1):
        print('Epoch #', epoch)
        if epoch > 0:
            model.fit(X, Y, nb_epoch=1)
        acc_history.append(evaluate_accuracy(task_id, difficulty, silent=True))
    return acc_history
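# `acc_history` holds one accuracy value per epoch (index 0 is the untrained
# model), so plotting it gives a quick learning curve. A usage sketch, assuming
# the module above is importable; task 3 / difficulty 2 mirror check(), and
# epochs=10 is just a placeholder:
import matplotlib.pyplot as plt

history = model_statistics_on_task(task_id=3, difficulty=2, epochs=10)
plt.plot(range(len(history)), history, marker='o')
plt.xlabel('epoch')
plt.ylabel('test accuracy')
plt.show()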
def train_network_ui(task_id, difficulty, epochs=3):
    # tasks[(task_id, difficulty)] = len(tasks.keys())
    X, Y = read_data(task_id=task_id, difficulty=difficulty)
    train_network(X, Y, epochs=epochs, train=True, load_model=False,
                  filename=['model.txt', 'weights.hdf5'], task_id=task_id)
    tasks_encoded[(task_id, difficulty)] = len(nn.tasks) - 1
    old_accuracy = evaluate_accuracy(task_id, difficulty, silent=True)
    print('Old accuracy: ', old_accuracy)
    if len(nn.tasks) > 1:
        buffered = nn.tasks[-1]
        nn.tasks.pop()
        train_network(X, Y, epochs=epochs, train=True, load_model=False,
                      filename=['model.txt', 'weights.hdf5'], task_id=task_id,
                      independent=True)
        new_accuracy = evaluate_accuracy(task_id, difficulty, silent=True)
        print('New accuracy: ', new_accuracy)
        if new_accuracy < old_accuracy:
            nn.tasks[-1] = buffered
            old_accuracy = new_accuracy
    if old_accuracy < 0.9:
        nn.tasks[-1].kill()
def evaluate_accuracy(task_id, difficulty, errors=False, outputFile='result.txt'):
    print('reading and processing testing data')
    X, Y = read_data(training=False, task_id=task_id, difficulty=difficulty)
    print("Getting representation")
    print('read')
    Y = _classifier.labels_remove_twos(Y)
    print('predicting..')
    representation = _classifier.find_representation(Y)
    if (task_id, difficulty) in tasks.keys():
        model = nn.get_model(tasks[(task_id, difficulty)])
    else:
        raise Exception('Task unknown')
        return
    raw_predicted = model.predict(X, verbose=1)
    predicted = _classifier.get_normal_output(raw_predicted, representation)
    acc = accuracy(predicted, Y, raw_predicted, errors)
    print('Accuracy %.4f' % acc)
    return acc
from matplotlib.pyplot import *
from numpy import *
from Green_function import Green_func
from readdata import read_data

(rs,r0,R2,kx,kt,Rm2,d,h3,h2,h1,rmax,Q,M,v,muq) = read_data()

#kx = [0.0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.0]
#kx = [0.0,0.5,1.5,2.5,5.0,10.0]
kx = 0.0
kt = [1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0]
Scale = 1
Gr_f = zeros(len(kt),complex)
for i in range(0,len(kt)):
    Gr_f[i] = Green_func(rs,r0,R2,kx,kt[i],Rm2,d,h3,h2,h1,rmax,Q,M,v,muq,Scale)
    print Gr_f[i]
print Scale

figure(1)
plot(kt,Gr_f.imag,label='Scale = %f'%(kx))

######################################################################################
kx = 5.0
for i in range(0,len(kt)):
    Gr_f[i] = Green_func(rs,r0,R2,kx,kt[i],Rm2,d,h3,h2,h1,rmax,Q,M,v,muq,Scale)
    print Gr_f[i]
print Scale
plot(kt,Gr_f.imag,label='Scale = %f'%(kx))

######################################################################################
kx = 10.0
for i in range(0,len(kt)):
    Gr_f[i] = Green_func(rs,r0,R2,kx,kt[i],Rm2,d,h3,h2,h1,rmax,Q,M,v,muq,Scale)
def train_network_ui(task_id, difficulty, epochs=3):
    tasks[(task_id, difficulty)] = len(tasks.keys())
    X, Y = read_data(task_id=task_id, difficulty=difficulty)
    return train_network(X, Y, epochs=epochs, train=True, load_model=False,
                         filename=['model.txt', 'weights.hdf5'])
import numpy as np
import sklearn.linear_model as linear_model
from LogisticRegression import LogisticRegression
from readdata import read_data
from sklearn import cross_validation
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

X_train, y_train = read_data("./Data/train")
X_train = X_train.toarray()

# convert labels to 0,1
idx = np.where(y_train == -1)
y_train[idx] = 0.

kf = cross_validation.KFold(y_train.shape[0], n_folds=10, indices=False)

X_test, y_test = read_data("./Data/test")
X_test = X_test.toarray()
idx = np.where(y_test == -1)
y_test[idx] = 0.

parameters = dict({
    "clf__weight_decay": [.001, .01, .1, 1, 10, 100, 1000],
    "clf__init_beta": [0., 0.00001, .0001, 0.001, .01, .1, 1., 10],
})

lr = LogisticRegression(learning="LBFGS", X_test=X_test, y_test=y_test)
pipeline = Pipeline([
    ("clf", lr),
])

f = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=-1, cv=3)
f.fit(X_train, y_train)
best_parameters = f.best_estimator_.get_params()
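# `get_params()` on the fitted pipeline returns a flat dict keyed by the same
# "step__parameter" names used in the grid, so the winning settings can be
# printed directly. A short follow-up sketch (assumes the grid search above
# has finished):
for name in sorted(parameters.keys()):
    print("%s: %r" % (name, best_parameters[name]))
print("best CV score: %.4f" % f.best_score_)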
        # Don't need to remember the old paths
        path = newpath

    n = 0
    # if only one element is observed max is sought in the initialization values
    if len(obs) != 1:
        n = t
    (prob, state) = max((V[n][y], y) for y in states)
    return (prob, path[state])


if __name__ == '__main__':
    try:
        datafile = sys.argv[1]
    except:
        datafile = 'AllDataWithNonHarmonics.csv'
    headers = ['module', 'root', 'bar_of_phrase', 'letter', 'bars_per_phrase', 'song_name']
    data = read_data(datafile, headers)
    transition_probs, emission_probs, initial_probs, states = get_probabilities(data)

    # print states
    # for one, two in emission_probs:
    #     print '{}->{}: {:.4f}'.format(one, two, emission_probs[(one, two)])
    # for state in initial_probs:
    #     print state, initial_probs[state]

    total_correct = 0
    total = 0
    for song in data:
        obs = [entry['root'] for entry in song]
        correct = [entry['module'].split('_')[0] for entry in song]
        prob, predictions = viterbi(obs, states, initial_probs, transition_probs, emission_probs)
    return chord_counts, transition_counts


def get_transition_probs(chord_counts, transition_counts):
    """
    Returns a dictionary of transition probabilities based on counts
    for chords and transitions.
    """
    probs = dict(transition_counts)  # make a copy so we don't destroy the counts dictionary
    for (first, second), count in transition_counts.items():
        probability = transition_counts[(first, second)] / chord_counts[first]
        probs[(first, second)] = probability
    return probs


if __name__ == '__main__':
    try:
        datafile = sys.argv[1]
    except:
        datafile = 'AlldataWithNonHarmonics.csv'
    data = read_data(datafile)
    chord_counts, transition_counts = get_overall_counts(data)
    transition_probs = get_transition_probs(chord_counts, transition_counts)
    # map roman numerals to integers for sorting, and convert back to display
    transitions = [(RN.index(c1), RN.index(c2)) for c1, c2 in transition_probs]
    print '\n' + 'Phrase Length = ' + z[j]  # NOTE: z and j are not defined in this snippet
    for c1, c2 in sorted(transitions):
        print '({} -> {}): {:.4f}'.format(RN[c1], RN[c2], transition_probs[(RN[c1], RN[c2])])
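# `get_overall_counts` is defined elsewhere (only its return statement appears
# above). A minimal sketch of the counting it would have to do to feed
# get_transition_probs, assuming `data` is a list of songs whose entries carry
# the chord under a 'root' key, as in the Viterbi script:
from collections import defaultdict


def get_overall_counts(data):
    """Count individual chords and adjacent chord-to-chord transitions."""
    chord_counts = defaultdict(int)
    transition_counts = defaultdict(int)
    for song in data:
        chords = [entry['root'] for entry in song]
        for chord in chords:
            chord_counts[chord] += 1
        for first, second in zip(chords, chords[1:]):
            transition_counts[(first, second)] += 1
    return chord_counts, transition_counts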
#init_state = np.tile(1e-5 * np.random.uniform(low=0.0,high=1.0,size=(n,n)),[batch_size,a_num,1,1])
loss_p = 0
batch_num_idx = range(batch_num)
k_fold = KFold(n_splits=10)
final_acc_fold = np.zeros((10, 1))
#CL = Chol_de(current_X,n)
#CC = Chol_com(CL,n,eps)
data = []
label = []
for idx in range(batch_num):
    print(idx)
    data_batch_in, label_batch_in = read_data(
        idx, '../../TTRNN/UCF11_updated_mpg/processed_data/', matrix_length,
        sample_rate)
    data.append(data_batch_in)
    label.append(label_batch_in)

with tf.Session() as sess:
    final_acc = 0.
    co = 0
    for tr_indices, ts_indices in k_fold.split(batch_num_idx):
        sess.run(tf.global_variables_initializer())
        print(
            np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.trainable_variables()
            ]))
        #start_time = time.time()
import csv
import sys


def read_data(filename):
    with open(filename, 'r') as csvf:
        return [row for row in csv.reader(csvf)]


def write_csv(data, filename):
    with open(filename, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for line in data:
            writer.writerow(line)


if __name__ == '__main__':
    try:
        datafile = sys.argv[1]
    except:
        datafile = 'new_cluster_chord_by_chord.csv'
    inputdata = read_data(datafile)
    outputdata = []
    uniquesongs = []
    header = ['song', 'artist', 'year', 'meter', 'cluster15', 'cluster6']  # add kmeans6 when ready
    outputdata.append(header)
    for line in inputdata:
        if line[0] not in uniquesongs:
            uniquesongs.append(line[0])
            songmeta = [line[0], line[1], line[2], line[3], line[8], line[9]]  # add line[9] when kmeans6 is ready
            outputdata.append(songmeta)
    write_csv(outputdata, 'metadata_with_clusters.csv')
def evaluateDataset():
    """
    Returns
    -------
    clf : Classifier Model

    Evaluate and choose a classifier with the best suited options.
    Evaluation includes accuracy, F1 score, precision and recall.
    The output lists the importance of the selected features.
    """
    ### Add a couple features to this list.
    # from newDataPoint import createNewPoints
    # createNewPoints(data_dict)
    alldata = read_data('flightdelays-2010-2020.csv')
    key_features_list = [
        'arr_del15', 'carrier_ct', ' weather_ct', 'nas_ct', 'security_ct',
        'late_aircraft_ct'
    ]
    features_list = [
        'arr_del15', ' weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct'
    ]
    data = pd.DataFrame(alldata, columns=key_features_list)
    target = []
    data = data.dropna()

    ### Create a prediction target for late flights because of carrier delays.
    for e in data['carrier_ct']:
        if (e > 0):
            target.append(1)
        else:
            target.append(0)

    ### Remove carrier_ct from the list as we used that to create the
    ### target data for the predictions.
    ### carrier_ct = weather_ct + nas_ct + security_ct + late_aircraft_ct
    newdata = pd.DataFrame(data, columns=features_list)
    data = newdata

    ### Decision Tree
    from time import time
    from sklearn import tree
    clf = tree.DecisionTreeClassifier()
    t0 = time()
    clf.fit(data, target)
    print("training time for all data:", round(time() - t0, 3), "s")

    ### print accuracy
    print("Decision Tree Accuracy on All the data: ",
          round(clf.score(data, target), 3))

    from classifyDT import classifyDT
    from sklearn.model_selection import train_test_split
    features_train, features_test, labels_train, labels_test = \
        train_test_split(data, target, test_size=0.5, random_state=42)
    ### features_train and features_test are the features for the training
    ### and testing datasets, respectively
    ### labels_train and labels_test are the corresponding item labels
    # features_train, features_test, labels_train, labels_test = preprocess()
    clf = classifyDT(features_train, labels_train, features_test, labels_test)

    ### Determine the importance of the features that we chose.
    lat = [i for i in clf.feature_importances_]

    ### use the cut off 0.2 to determine the features of importance.
    # def condition(x): return x > 0.2
    def condition(x):
        # currently keeps any feature with nonzero importance; swap in the
        # commented version above to apply the 0.2 cutoff
        return x

    output = [idx for idx, element in enumerate(lat) if condition(element)]
    print("output:", output)
    for i in output:
        print("importance: of ", features_list[i], " is ", round(lat[i], 3))
    return clf
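# The function returns the classifier fitted by classifyDT, so it can be
# reused directly. A hedged usage sketch (assumes classifyDT returns a fitted
# sklearn estimator; the sample values are made up, in features_list order):
clf = evaluateDataset()
sample = [[12.0, 1.5, 3.0, 0.0, 4.2]]  # arr_del15, weather_ct, nas_ct, security_ct, late_aircraft_ct
print(clf.predict(sample))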
from matplotlib.pyplot import *
from numpy import *
from Green_function import Green_func
from readdata import read_data

(rs,r0,L,kx,kt,ms,d,h3,h2,h1,rmax,Q,M,muq) = read_data()

#kx = [0.0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.0]
#kx = [0.0,0.5,1.5,2.5,5.0,10.0]
kx = 3.0
kt = linspace(-2.0,2.0,81)
Scale = 1
Gr_f = zeros(len(kt),complex)
for i in range(0,len(kt)):
    Gr_f[i] = Green_func(rs,r0,L,kx,kt[i],ms,d,h3,h2,h1,rmax,Q,M,muq,Scale)
    print Gr_f[i]
print Scale

figure(1)
plot(kt,Gr_f.imag,label='Scale = %f'%Scale)

#######################################################################################
#Scale = 10
#Gr_f = zeros(len(kt),complex)
#for i in range(0,len(kt)):
#    Gr_f[i] = Green_func(rs,r0,L,kx,kt[i],ms,d,h3,h2,h1,rmax,Q,M,v,muq,Scale)
#    print Gr_f[i]
#print Scale
#figure(1)
#plot(kt,Gr_f.imag,label='Scale = %f'%Scale)
'''
######################################################################################
def train(use_data, semi_sv, output, data_aug, epoch=1000):
    def get_subset(dataset, idx):
        data = {}
        for key, value in dataset.items():
            data[key] = value[idx]
        return data

    def concat_data(data1, data2):
        result = {}
        for k in data1.keys():
            result[k] = np.concatenate([data1[k], data2[k]])
        return result

    from readdata import read_data
    tr, te, embedding_matrix, labels = read_data(use_data, data_aug=data_aug)
    print(use_data)
    print('Shape of label tensor:', labels.shape)
    y = labels

    from config import model_path
    from sklearn.cross_validation import StratifiedKFold, KFold
    from config import n_folds

    y_pred = pd.read_csv("./data/y_pred.csv")['y_pre'].values
    y_pos_ = y_pred == 1
    y_neg_ = y_pred == 0
    add_idx = np.any([y_pos_, y_neg_], axis=0)
    add_y = y_pred[add_idx]

    y_pos = y_pred > 0.75
    y_neg = y_pred < 0.25
    y_idx = np.any([y_pos, y_neg], axis=0)
    y_pred = y_pred[y_idx]
    print(y_idx.shape)

    folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
    result = np.zeros((len(te['q1']), 1))
    oof_y = np.zeros((len(y), 1))
    for n_fold, (tr_idx, val_idx) in enumerate(folds):
        tr_x = get_subset(tr, tr_idx)
        if semi_sv:
            te_x = get_subset(te, y_idx)
            tr_data = concat_data(tr_x, te_x)
            tr_y = np.concatenate([y[tr_idx], y_pred])
        else:
            add_data = get_subset(te, add_idx)
            tr_data = concat_data(tr_x, add_data)
            tr_y = np.concatenate([y[tr_idx], add_y])
        # tr_data = tr_x
        # tr_y = y[tr_idx]
        val_x = get_subset(tr, val_idx)
        val_y = y[val_idx]

        use_word = True
        if use_data != 'words':
            use_word = False
        model = get_model(word_embedding_matrix=embedding_matrix, use_word=use_word)
        if n_fold == 0:
            print(model.summary())
        hist = epochHistory()
        print(n_fold)
        model.fit(tr_data, tr_y,
                  epochs=epoch,
                  validation_data=[val_x, val_y],
                  verbose=1,
                  batch_size=256,
                  callbacks=[
                      EarlyStopping(patience=2, monitor='val_binary_crossentropy'),
                      # LearningRateScheduler(lr_de,verbose=1)
                      hist,
                      ModelCheckpoint('./weight/weights.{epoch:d}.hdf5',
                                      monitor='val_binary_crossentropy',
                                      save_weights_only=True)
                  ])
        result += iter_ense(hist.epochs, model, te)
        # result += model.predict(te, batch_size=1024)
        oof_y[val_idx] = model.predict(val_x, batch_size=2048)
        K.clear_session()
        tf.reset_default_graph()

    # write out the submission
    result /= n_folds
    submit = pd.DataFrame()
    submit['y_pre'] = list(result[:, 0])
    submit.to_csv(output, index=False)

    ## save the predicted (out-of-fold) training labels
    # oof_y = oof_y[:,0]
    # oof_y_ = oof_y.round().astype(int)
    #
    # error_idx = oof_y_!=y
    # print(np.sum(error_idx))
    # oof_y[error_idx] = 1-oof_y[error_idx]
    submit = pd.DataFrame()
    submit['y_pre'] = oof_y[:, 0]
    submit.to_csv('./data/oofy.csv', index=False)
#! /usr/bin/python
from numpy import *
from pylab import *
from matplotlib.pyplot import *
from matplotlib import rc
from cmath import exp, cos, sin
from curvfit import *
from readdata import read_data
from RK_solver import RK4_solver, Adaptive_RK4_solver

#Constants
(rs,r0,L,kx,kt,ms,d,h3,h2,h1,rmax,Q,M,v,muq) = read_data('input.xml')
#k = 1.0
Scale = 1
mm = int((rmax-r0)*Scale/5)
Delta = d/2 + sqrt(d**2/4 + ms)
pow_r = [-Delta,Delta-d]

[r,fpr,fnr] = RK4_solver(rs,r0,L,kx,kt,ms,d,h3,h2,h1,rmax,Q,M,v,muq)

fp_real = zeros(len(fpr))
for i in range(0, len(fpr)):
    fp_real[i] = fpr[i].real
fp_imag = zeros(len(fpr))
for i in range(0, len(fpr)):
    fp_imag[i] = fpr[i].imag
for i in range(0, len(r)):
    r[i] = r[i]-r0
def main():
    # Set up the database of objects
    X = readdata.read_data(infl)
    # Choose initial means with K-means
    means = ChooseInitialMeans(X)
    # Set up initial clusters
    distmat = SetDistMat(X, means)
    clusters = InitialAssignment(distmat)
    ## debug code
    #keys = sorted(clusters.keys())
    #for key in keys:
    #    print("cluster %i:"%key)
    #    print(clusters[key])
    ## end of debug

    # Iteration step
    for iter in range(max_iter):
        active = 0  # indicate the number of transfers in the current iteration
        tranlst = (-1) * np.ones(k, dtype='int')  # set up transfer list for each cluster
        # Compute the cluster means
        oldmeans = means.copy()
        means = CalcMeans(X, oldmeans, clusters)
        # Get statistics about the clustering
        #ClusterStat(X, means, clusters)
        ## debug code
        #print("old means:")
        #print(oldmeans)
        #print("new means:")
        #print(means)
        ## end of debug

        # For each object, compute the distances to the cluster means
        distmat = SetDistMat(X, means)
        # Sort objects based on the delta of the current assignment and the best
        # possible alternate assignment
        objlst = SortObj(X, clusters, means, distmat)
        ##debug code
        #print(objlst)
        ##return
        #end of debug

        # For each element by priority:
        while (len(objlst)):
            (i, key, temp) = objlst.pop()
            obj2key = GetDist(X[i], means[key])
            transferred = False  #record if any transfering has occured to i
            if (key == distmat[i, 0][0]):
                ##debug
                #print("%i is already the opt cluster for obj %i. no transfer"%(clu, i))
                ##end of debug
                continue
            # For each other clusters by element gain:
            else:
                for j in range(k):
                    clu = distmat[i, j][0]  # the key of another cluster
                    objgain = obj2key - distmat[i, j][1]  # gain by transfering i from cluster key to clu
                    if (clu == key):  # already in the cluster
                        continue
                    if (len(clusters[clu]) < cluster_size):
                        active += 1
                        transferred = True
                        clusters = Transfer(i, key, clu, clusters)
                        ##debug
                        #print("cluster %i not full. transfer obj %i from cluster %i to it."%(clu, i, key))
                        ##end of debug
                        break
                    elif (tranlst[clu] != -1):  # if the tranlst of another cluster is not empty
                        # distance between the obj in the tranlst and the current cluster
                        tran2key = GetDist(X[tranlst[clu]], means[key])
                        tran2clu = GetDist(X[tranlst[clu]], means[clu])
                        # gain by transfering the obj in tranlst from cluster clu to key
                        trangain = tran2clu - tran2key
                        if (objgain + trangain > 0):  # transfer if the sum of gains are positive, ie net gain
                            active += 2
                            transferred = True
                            clusters = Transfer(i, key, clu, clusters)
                            clusters = Transfer(tranlst[clu], clu, key, clusters)
                            ##debug
                            #print("obj %i is transfered from cluster %i to %i"%(i, key, clu))
                            #print("obj %i is transfered from cluster %i to %i"%(tranlst[clu], clu, key))
                            #print("objgain: %f, trangain: %f"%(objgain, trangain))
                            ##end of debug
                            tranlst[clu] = -1  # reset the tranlst to empty
                            break
            if (not transferred):
                tranlst[key] = i
                ##debug
                #print("add obj %i in cluster %i to the transfer list"%(i, key))
                ##end of debug
        # nothing is transferred during this iteration, return the clustering result
        if (not active):
            break
        #debug code
        print("number of transfers in iter %i: %i\n" % (iter + 1, active))
        #end of debug
    print("K-means clustering converged in %d iterations!\n" % (iter + 1))
    # Output the clustering results
    WriteResult(outfl, X, means, clusters)
    ClusterStat(X, means, clusters)
    return (0)
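# `GetDist` is used throughout main() but not shown here. A minimal sketch,
# assuming plain Euclidean distance between an object and a cluster mean:
import numpy as np


def GetDist(x, mean):
    """Euclidean distance between one object and one cluster mean (assumed)."""
    return np.linalg.norm(np.asarray(x) - np.asarray(mean))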
# -*- coding: utf-8 -*-
"""
Created by Maya
May 9, 2021
"""
import pandas as pd
from readdata import read_data

alldata = read_data('flightdelays-2010-2020.csv')


def evaluateDataset():
    """
    Returns
    -------
    clf : Classifier Model

    Evaluate and choose a classifier with the best suited options.
    Evaluation includes accuracy, F1 score, precision and recall.
    The output lists the importance of the selected features.
    """
    ### Add a couple features to this list.
    # from newDataPoint import createNewPoints
    # createNewPoints(data_dict)
    alldata = read_data('flightdelays-2010-2020.csv')
    key_features_list = [
        'arr_del15', 'carrier_ct', ' weather_ct', 'nas_ct', 'security_ct',
        'late_aircraft_ct'
    ]
    features_list = [
        'arr_del15', ' weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct'
    ]
#! /usr/bin/python
from numpy import *
from pylab import *
from matplotlib.pyplot import *
from matplotlib import rc
from cmath import exp, cos, sin
from curvfit import *
from readdata import read_data
from RK_solver import RK4_solver, Adaptive_RK4_solver

#Constants
(rs,r0,R2,kx,kt,Rm2,d,h3,h2,h1,rmax,Q,M,v,muq) = read_data('input.xml')
#k = 1.0
Scale = 1
mm = int((rmax-r0)*Scale/5)
Delta = d/2 + sqrt(d**2/4 + Rm2)
pow_r = [-Delta,Delta-d]
err = 1e-9

[r,fr,tot_err] = Adaptive_RK4_solver(rs,r0,R2,kx,kt,Rm2,d,rmax,Q,M,v,muq,err)
print "Total Error is ", tot_err

f_real = zeros(len(fr))
for i in range(0, len(fr)):
    f_real[i] = fr[i].real
f_imag = zeros(len(fr))
for i in range(0, len(fr)):
    f_imag[i] = fr[i].imag
pre_train_epoch = 5000
epoch_num = 500  #50
depth = len(out_channels)
assert len(in_channels) == len(middle_channels) and len(middle_channels) == len(out_channels)
k = 3
d0 = 1

############# pre-process data part
label_CSV = "data/ad_data/APOE.csv"
label_name = "APOE"
class_num = 2
########
group_test_times = 5000
################
_, _, Label = read_data(label_CSV, label_name, recalculate=False)
NagPos = np.where(Label)
PosPos = np.where(1 - Label)
Length, Nampyte = load_length("./data/ad_data/processed_data/" + label_name +
                              "/Track_info.txt")

Trackids = [int(sys.argv[1])]  ###### all fibers
Track_num = len(Trackids)
matrix_length_all = np.zeros(Track_num, dtype=np.int32)
for i in range(Track_num):
    matrix_length_all[i] = Length[Trackids[i]][1]
matrix_length = np.sum(matrix_length_all)
def train_wc(semi_sv, output, epoch=1000):
    from readdata import read_data
    tr_q1, tr_q2, te_q1, te_q2, word_embedding_matrix, labels = read_data('words')
    trc_q1, trc_q2, tec_q1, tec_q2, char_embedding_matrix, labels = read_data('chars')
    X = {
        'q1': tr_q1,
        'q2': tr_q2,
        'qc1': trc_q1,
        'qc2': trc_q2
    }
    y = labels

    from config import model_path
    from sklearn.cross_validation import StratifiedKFold, KFold
    from config import n_folds
    from nn import aggmodel

    y_pred = pd.read_csv("./data/y_pred.csv")['y_pre'].values
    y_pos = y_pred > 0.75
    y_neg = y_pred < 0.25
    y_idx = np.any([y_pos, y_neg], axis=0)
    y_pred = y_pred[y_idx]
    print(y_idx.shape)

    # oof_y = np.zeros((len(X['q1']),1))
    oof_y = pd.read_csv("./data/oofy.csv")['y_pre'].values
    alpha = 1
    oof_y = (1 - alpha) * y + alpha * oof_y

    folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True,)
    result = np.zeros((len(te_q1), 1))
    for n_fold, (tr_idx, val_idx) in enumerate(folds):
        if semi_sv:
            Q1_tr = np.concatenate([X['q1'][tr_idx], te_q1[y_idx]])
            Q2_tr = np.concatenate([X['q2'][tr_idx], te_q2[y_idx]])
            Qc1_tr = np.concatenate([X['qc1'][tr_idx], tec_q1[y_idx]])
            Qc2_tr = np.concatenate([X['qc2'][tr_idx], tec_q2[y_idx]])
            y_tr = np.concatenate([y[tr_idx], y_pred])
            # y_tr = np.concatenate([oof_y[tr_idx], y_pred])
            idx = list(range(len(y_tr)))
            np.random.shuffle(idx)
            Q1_tr = Q1_tr[idx]
            Q2_tr = Q2_tr[idx]
            Qc1_tr = Qc1_tr[idx]
            Qc2_tr = Qc2_tr[idx]
            y_tr = y_tr[idx]
        else:
            Q1_tr = X['q1'][tr_idx]
            Q2_tr = X['q2'][tr_idx]
            Qc1_tr = X['qc1'][tr_idx]
            Qc2_tr = X['qc2'][tr_idx]
            y_tr = y[tr_idx]
            # y_tr = oof_y[tr_idx]
        Q1_te = X['q1'][val_idx]
        Q2_te = X['q2'][val_idx]
        Qc1_te = X['qc1'][val_idx]
        Qc2_te = X['qc2'][val_idx]
        y_te = y[val_idx]

        model = aggmodel(word_embedding_matrix, char_embedding_matrix)
        if n_fold == 0:
            print(model.summary())
        print(n_fold)
        model.fit([Q1_tr, Q2_tr, Qc1_tr, Qc2_tr], y_tr,
                  epochs=epoch,
                  validation_data=[[Q1_te, Q2_te, Qc1_te, Qc2_te], y_te],
                  verbose=1,
                  batch_size=256,
                  callbacks=[
                      EarlyStopping(patience=3, monitor='val_binary_crossentropy'),
                      # LearningRateScheduler(lr_de,verbose=1)
                  ],
                  )
        # model.load_weights(model_path)
        result += model.predict([te_q1, te_q2, tec_q1, tec_q2], batch_size=1024)
        # free GPU memory
        K.clear_session()
        tf.reset_default_graph()

    # write out the submission
    result /= n_folds
    submit = pd.DataFrame()
    submit['y_pre'] = list(result[:, 0])
    submit.to_csv(output, index=False)
        K.clear_session()
        tf.reset_default_graph()

    submit = 0
    total_w = 0
    for y_pred, ense_w in results:
        submit += ense_w * y_pred
        total_w += ense_w
    return submit / total_w


if __name__ == '__main__':
    from readdata import read_data
    _, te_word, embedding_matrix_word, __ = read_data('words', data_aug=False)
    _, te_char, embedding_matrix_char, __ = read_data('chars', data_aug=False)
    submit_atten = ensemble('esim', te_word, te_char, embedding_matrix_word, embedding_matrix_char)
    submit = pd.DataFrame()
    submit['y_pre'] = list(submit_atten[:, 0])
    submit.to_csv('atten.csv', index=False)
def train(use_data, semi_sv, output, data_aug, use_model):
    def get_subset(dataset, idx):
        data = {}
        for key, value in dataset.items():
            data[key] = value[idx]
        return data

    def concat_data(data1, data2):
        result = {}
        for k in data1.keys():
            result[k] = np.concatenate([data1[k], data2[k]])
        return result

    def get_aug_data(tr_x, tr_y):
        tr_q1 = tr_x['q1']
        tr_q2 = tr_x['q2']
        tr_gf = tr_x['gf']
        tr_q1node = tr_x['q1node']
        tr_q2node = tr_x['q2node']
        res_q1 = []
        res_q2 = []
        res_gf = []
        res_q1node = []
        res_q2node = []
        res_y = []
        for q1, q2, gf, q1node, q2node, y in zip(tr_q1, tr_q2, tr_gf, tr_q1node, tr_q2node, tr_y):
            r1 = q1[np.in1d(q1, q2, invert=True)]
            len1 = len(r1)
            if len1 < 4 or len1 == len(q1[q1 != 0]):
                continue
            r2 = q2[np.in1d(q2, q1, invert=True)]
            len2 = len(r2)
            if len2 < 4 or len2 == len(q2[q2 != 0]):
                continue
            out1 = np.zeros(15, dtype=np.int32)
            out2 = np.zeros(15, dtype=np.int32)
            out1[-len1:] = r1
            out2[-len2:] = r2
            res_q1.append(out1)
            res_q2.append(out2)
            res_gf.append(gf)
            res_q1node.append(q1node)
            res_q2node.append(q2node)
            res_y.append(y)
        res_x = {
            'q1': np.asarray(res_q1),
            'q2': np.asarray(res_q2),
            'gf': np.asarray(res_gf),
            'q1node': np.asarray(res_q1node),
            'q2node': np.asarray(res_q2node)
        }
        res_y = np.asarray(res_y)
        return res_x, res_y

    from nn import rnnword, aggmodel, esim, attention, rnn_res
    if use_model == 'rnnword':
        get_model = rnnword
    elif use_model == 'aggmodel':
        pass
    elif use_model == 'esim':
        get_model = esim
    elif use_model == 'attention':
        get_model = attention
    elif use_model == 'res':
        get_model = rnn_res
    else:
        raise RuntimeError("don't have this model")

    from readdata import read_data
    model_name = datetime.datetime.now().strftime(
        '%Y-%m-%d_%H:%M:%S') + '_' + use_data + '_' + str(semi_sv) + '_' + str(
            data_aug) + '_'
    tr, te, embedding_matrix, labels = read_data(use_data, data_aug=data_aug)
    print(use_data)
    print('Shape of label tensor:', labels.shape)
    y = labels

    from config import model_path
    from sklearn.cross_validation import StratifiedKFold, KFold
    from config import n_folds

    y_pred = pd.read_csv("./data/y_pred.csv")['y_pre'].values
    y_pos_ = y_pred == 1
    y_neg_ = y_pred == 0
    add_idx = np.any([y_pos_, y_neg_], axis=0)
    add_y = y_pred[add_idx]

    y_pos = y_pred > 0.75
    y_neg = y_pred < 0.25
    y_idx = np.any([y_pos, y_neg], axis=0)
    y_pred = y_pred[y_idx]
    print(y_idx.shape)

    folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
    result = np.zeros((len(te['q1']), 1))
    oof_y = np.zeros((len(y), 1))
    for n_fold, (tr_idx, val_idx) in enumerate(folds):
        tr_x = get_subset(tr, tr_idx)
        tr_y = y[tr_idx]
        # if data_aug:
        #     res_x,res_y = get_aug_data(tr_x,tr_y)
        #     tr_x = concat_data(tr_x,res_x)
        #     tr_y = np.concatenate([tr_y,res_y])
        if semi_sv:
            te_x = get_subset(te, y_idx)
            tr_data = concat_data(tr_x, te_x)
            tr_y = np.concatenate([tr_y, y_pred])
            patience = 3
        else:
            add_data = get_subset(te, add_idx)
            tr_data = concat_data(tr_x, add_data)
            tr_y = np.concatenate([tr_y, add_y])
            patience = 2
        # tr_data = tr_x
        # tr_y = y[tr_idx]
        val_x = get_subset(tr, val_idx)
        val_y = y[val_idx]

        use_word = True
        if use_data != 'words':
            use_word = False
        model = get_model(word_embedding_matrix=embedding_matrix, use_word=use_word)
        if n_fold == 0:
            print(model.summary())
        # hist = epochHistory()
        print(n_fold)
        model.fit(tr_data, tr_y,
                  epochs=1000,
                  validation_data=[val_x, val_y],
                  verbose=1,
                  batch_size=256,
                  callbacks=[
                      EarlyStopping(patience=patience, monitor='val_binary_crossentropy'),
                      # LearningRateScheduler(lr_de,verbose=1)
                      # hist,
                      # ModelCheckpoint('./weight/weights.{epoch:d}.hdf5',monitor='val_binary_crossentropy',save_weights_only=True)
                  ])
        # result += iter_ense(hist.epochs, model, te)
        result += model.predict(te, batch_size=1024)
        model.save_weights('./weight/' + model_name + str(n_fold) + '.h5')
        # oof_y[val_idx] = model.predict(val_x, batch_size=2048)
        K.clear_session()
        tf.reset_default_graph()

    # write out the submission
    result /= n_folds
    submit = pd.DataFrame()
    submit['y_pre'] = list(result[:, 0])
    submit.to_csv(output, index=False)

    ## save the predicted (out-of-fold) training labels
    # oof_y = oof_y[:,0]
    # oof_y_ = oof_y.round().astype(int)
    #
    # error_idx = oof_y_!=y
    # print(np.sum(error_idx))
    # oof_y[error_idx] = 1-oof_y[error_idx]
    submit = pd.DataFrame()
    submit['y_pre'] = oof_y[:, 0]
    submit.to_csv('./data/oofy.csv', index=False)