def GetData():
    data = pandas.read_csv('data.train.csv')
    if not os.path.exists('%s_cluster.model' % __fname__):
        cluster = KMeans(n_clusters=5)
        scaler = StandardScaler()
        pca = PCA(n_components=5)
        pipe = Pipeline([('scaler', scaler), ('pca', pca), ('cluster', cluster)])
        pipe.fit(data[[i for i in data.keys()
                       if i not in ['user_id', 'item_id', 'buy']]])
        # Y = pipe.predict(data[[i for i in data.keys() if i not in ['user_id', 'item_id', 'buy']]])
        util.save_obj(pipe, '%s_cluster.model' % __fname__)
    Y = data['buy']
    X = GetFeature(data)
    # rand = np.random.rand(len(Y)) < 0.0001
    # idx = (Y == 1) | ((Y == 0) & rand)
    # X = X[idx]
    # Y = Y[idx]
    return X, Y
def GetGeoTree(all=False):
    # print os.path.exists('geotree')
    if os.path.exists('geotree'):
        tree = util.load_obj('geotree')
        return tree
    geo_hash = pandas.read_csv('tianchi_mobile_recommend_train_user.csv.subset.csv')
    geo_hash = geo_hash.dropna()
    geo_count = dict()
    rule = [(0, 0), (1, 1e5), (2, 1e5), (3, 1e5), (4, 1e4), (5, 1e3), (6, 1e3)]
    for r in rule:
        if r[0] == 0:
            split_list = ['9', 'm', 'f']
            for i in geo_hash['user_geohash']:
                util.IncDict(geo_count, i[:1])
        else:
            split_list = [i for i in geo_count.keys()
                          if geo_count[i] > r[1] and len(i) == r[0]]
            for i in geo_hash['user_geohash']:
                if i[:r[0]] in split_list:
                    util.IncDict(geo_count, i[:r[0] + 1])
    util.save_obj(geo_count, 'geotree')
    if all:
        return geo_count
    else:
        geo_tree = {i: geo_count[i] for i in geo_count.keys()
                    if geo_count[i] > 1e5 or len(i) == 1}
        return geo_tree
def save_obj(self, filename, vertices, textures):
    '''
    vertices: [nv, 3], tensor
    texture: [3, h, w], tensor
    '''
    util.save_obj(filename, vertices, self.faces[0], textures=textures,
                  uvcoords=self.raw_uvcoords[0], uvfaces=self.uvfaces[0])
def extract_xref(files_list):
    # total number of files to calculate completion percentage
    total_files = len(files_list)
    bad_files_idx = []
    bad_files_names = []
    # Extract all features related to DATA and CODE XREF
    xref_dict = xref_initialization()
    for idx, file_name in enumerate(files_list):
        asm_file = DATASET_DIR + 'train/' + file_name + '.asm.gz'
        try:
            get_xref_features(asm_file, xref_dict)
        except Exception as e:
            # log corrupted files for future correction
            log_exception(e, sys.argv[0], asm_file)
            bad_files_idx.append(idx)
            bad_files_names.append(file_name)
        progress_bar(idx + 1, total_files, 50)
    xref_pd = pd.DataFrame.from_dict(xref_dict)
    # store xref features to avoid recalculation
    save_obj(xref_pd, 'xref_features')
    '''
    save_obj(bad_files_names, 'bad_asm_files')
    # drop corrupted files (if any) from the training set
    if len(bad_files_names) > 0:
        # log the number of corrupted files
        logging.info('XREF Feature Extraction completed: '
                     + str(len(bad_files_names)) + ' file(s) are corrupted.')
        # store the corrupted files names in 'bad_asm_files.txt'
        with open('bad_asm_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.asm')
    '''
    # save xref features dataframe to csv file to keep results (optional)
    xref_pd.to_csv('features/xref_features.csv', index=False)
    return xref_pd
def get_trees(self):
    if self.trees == []:
        trees = util.load_obj(self.filename)
        if trees is None:
            trees = self._generate_trees()
            util.save_obj(trees, self.filename)
        self.trees = trees
    return self.trees
def save_data(self):
    for key, value in self.data.items():
        if key not in ['env_data', 'bodytemp', '_label']:
            # np.save(self.conf.npy_data + '/' + str(key) + '.npy', value)
            path = self.conf.npy_data
            if self.save_dir is not None:
                path = path + '/' + self.save_dir
            save_obj(value, path + '/' + str(key))
def get_trees(self):
    if self.trees == []:
        # attempt to load cache
        trees = util.load_obj(self.filename)
        if trees is None or trees == []:
            # not cached yet
            trees = self._generate_trees()
            util.save_obj(trees, self.filename)  # save cache
        self.trees = trees
    return self.trees
def main():
    """main function"""
    names = ["Divvy_Stations_2017_Q3Q4.csv", "Divvy_Trips_2017_Q3.csv",
             "Divvy_Trips_2017_Q4.csv", "Divvy_Trips.csv",
             "6_26_6_30.csv", "Divvy_Stations_2017_Q1Q2.csv"]
    directory = "Divvy_Data/"
    fn = names[int(sys.argv[1])] if len(sys.argv) > 1 else "first300.csv"
    fn = directory + fn
    print("doing operations on " + fn)
    data = readdict(fn)
    # data = data_cleanup_missing(data)
    save_obj(data, "full_array_of_entries")
def quantization_train(imu_measurements, k):
    """
    :param imu_measurements: [T, 6]
    :param k: number of possible measurements
    :return:
    """
    # run k-means
    model = KMeans(n_clusters=k)
    kmeans = model.fit(imu_measurements)
    labels = kmeans.labels_
    kmeans.labels_ = []  # save space
    save_obj(kmeans, 'kmeans_model.pkl')
    return labels
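# Note (editor): the test-time counterpart is not shown above. A minimal sketch,
# assuming a pickle-backed load_obj helper that mirrors save_obj, and relying on
# scikit-learn's KMeans.predict (which only needs cluster_centers_, so clearing
# labels_ before saving is harmless). Names below are illustrative, not from the source.
def quantization_test(imu_measurements):
    """Map new [T, 6] measurements to the cluster ids learned at train time."""
    kmeans = load_obj('kmeans_model.pkl')    # hypothetical counterpart of save_obj
    return kmeans.predict(imu_measurements)  # one cluster index per measurement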
def extract_opcode_ngram(files_list, n):
    dicts_list = []
    total_files = len(files_list)
    for idx, file_name in enumerate(files_list):
        asm_file = conf['dataset_dir'] + file_name + '.asm.gz'
        clean_asm_code = clean_asm_lines(asm_file)
        opcode_sequence = []
        # this loop constructs a sequence of opcodes delimited by space character
        for line in clean_asm_code:
            # the commands below assume that the preprocessing of the .asm
            # file has already occurred
            opcode_mnem = line.split(' ')[0].rstrip()
            # further condition to minimize the number of outliers (handle extreme cases)
            is_valid_opcode = bool(re.match('^[a-z]{2,7}$', opcode_mnem))
            if is_valid_opcode:
                opcode_sequence.append(opcode_mnem)
        ngram_dict = {}
        for index, opcode in enumerate(opcode_sequence):
            if (n + index) > len(opcode_sequence):
                break
            opcode_ngram = ""
            for j in range(index, index + n):
                opcode_ngram += opcode_sequence[j] + '-'
            # remove trailing '-' char from opcode_ngram
            opcode_ngram = opcode_ngram[:-1]
            if opcode_ngram in ngram_dict:
                ngram_dict[opcode_ngram] += 1
            else:
                ngram_dict[opcode_ngram] = 1
        dicts_list.append(ngram_dict)
        # progress bars always save my sanity
        progress_bar(idx + 1, total_files, 50)
    # convert list of dictionaries to an opcode ngram count numpy array
    vec = DictVectorizer()
    ngram_freq = vec.fit_transform(dicts_list).toarray()
    ngram_freq_df = pd.DataFrame(ngram_freq, columns=vec.get_feature_names())
    ngram_freq_df.to_csv('features/' + str(n) + 'gram_opcode_freq1.csv', index=False)
    save_obj(ngram_freq_df, str(n) + 'gram_opcode_freq')
    # transform ngram frequency array to ngram tfidf array
    transformer = TfidfTransformer(smooth_idf=False)
    ngram_tfidf = transformer.fit_transform(ngram_freq)
    # transform array to pandas dataframe
    freq_vec_df = pd.DataFrame(ngram_tfidf.todense(), columns=vec.get_feature_names())
    freq_vec_df.to_csv('features/' + str(n) + 'gram_opcode_tfidf1.csv', index=False)
    save_obj(freq_vec_df, str(n) + 'gram_opcode_tfidf')
    return freq_vec_df
def main():
    train_labels = pd.read_csv(DATASET_DIR + 'trainLabels.csv')
    files_list = train_labels['Id'].tolist()
    # total number of files to calculate completion percentage
    total_files = len(files_list)
    # do not count corrupted files
    bad_files_idx = []
    bad_files_names = []
    # Extract all features related to DATA and CODE XREF
    xref_dict = xref_initialization()
    for idx, file_name in enumerate(files_list):
        asm_file = DATASET_DIR + 'train/' + file_name + '.asm.gz'
        try:
            get_xref_features(asm_file, xref_dict)
        except Exception as e:
            # log corrupted files for future correction
            log_exception(e, sys.argv[0], asm_file)
            bad_files_idx.append(idx)
            bad_files_names.append(file_name)
        progress_bar(idx + 1, total_files, 50)
    xref_pd = pd.DataFrame.from_dict(xref_dict)
    # store xref features to avoid recalculation
    save_obj(xref_pd, 'xref_features')
    save_obj(bad_files_names, 'bad_files')
    # concat features with classes and IDs to create the dataset
    data = pd.concat([train_labels, xref_pd], axis=1, sort=False)
    # drop corrupted files (if any) from the training set
    if len(bad_files_idx) > 0:
        data.drop(data.index[bad_files_idx], inplace=True)
        data = data.reset_index(drop=True)
        # log the number of corrupted files
        logging.info('XREF Feature Extraction completed: '
                     + str(len(bad_files_idx)) + ' file(s) are corrupted.')
        # store the corrupted files names in 'bad_asm_files.txt'
        with open('bad_asm_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.asm.gz')
    # save xref features dataframe to csv file to keep results (optional)
    data.to_csv('results/xref_features.csv')
def unify_features():
    train_labels = pd.read_csv(
        '~/Documents/thesis/dataset/dataSample/trainLabels.csv')
    section_features = load_obj('section_features')
    xref_features = load_obj('xref_features')
    opcode_1gram_features = load_obj('1gram_opcode_tfidf')
    byte_1gram_features = load_obj('1gram_byte_tfidf')
    # concat features with classes and IDs to create the dataset
    data = pd.concat([train_labels, xref_features, section_features,
                      opcode_1gram_features, byte_1gram_features],
                     axis=1, sort=False)
    print(data.shape)
    save_obj(data, 'interim_data')
    return data
def build_index():
    corpus_path = util.get_corpus_dir_path_from_args()
    preprocessor = preprocessing.Preprocessor(corpus_path)
    doc_to_terms: list[preprocessing.DocToTerms] = preprocessor.parse()
    indexer_ob = indexer.Indexer(doc_to_terms)
    inverted_index: dict[str, indexer.Posting] = indexer_ob.inverter_index()
    doc_id_name_index: dict[int, str] = indexer_ob.doc_id_to_doc_name_index()
    tf_idf_ranker = ranker.Ranker(inverted_index, doc_id_name_index)
    _tfidf = tf_idf_ranker.tfidf()
    print('Indexing completed..saving...')
    util.save_obj(doc_id_name_index, DOC_ID_NAME_INDEX_NAME)
    util.save_obj(inverted_index, INVERTED_INDEX_FILE_NAME)
    util.save_pandas_df_as_pickle(_tfidf, TFIDF_NAME_INDEX_NAME)
    print('Saved index for quick results for future queries')
def init_cache():
    """initial variable caching, done only once"""
    save_obj(INITIAL_EPSILON, "epsilon")
    t = 0
    save_obj(t, "time")
    D = deque()
    save_obj(D, "D")
L_PAD_HEIGHT = 1.0
L_HALF_PAD_HEIGHT = L_PAD_HEIGHT * HEIGHT / 2

if args.mode == 'play':
    play(args.ctrl == 'auto', args.play_show_stat)
elif args.mode == 'train':
    train(args.alpha, args.gamma, args.decay, args.ne, args.iter,
          args.train_show_stat)
    if args.save != 'none':
        q_learning.QSaveToFile(args.save)
elif args.mode == 'test':
    if args.load == 'init':
        q_learning.QInit(args.ne, args.xd, args.yd, args.pd)
    else:
        q_learning.QInitFromFile(args.load, args.ne)
    test_result = test(args.test_show_stat)
    print("   Total : %d" % test_result[0])
    print(" Average : %.2f" % test_result[1])
    print("     Min : %d" % test_result[2])
    print("     Max : %d" % test_result[3])
    print("Win Rate : %.2f%%" % test_result[4])
elif args.mode == 'tune':
    results = []
    for alpha in np.arange(0.1, 1.1, 0.1):
        for gamma in np.arange(0.1, 1.1, 0.1):
            for decay in np.arange(1000, 30000, 4000):
                for ne in np.arange(5, 100, 10):
                    result = train(alpha, gamma, decay, ne, 15000, False)
                    results.append(result)
    save_obj(results, args.save_tune)
# coding:utf-8
# find best
import util
import summary
import numpy as np
from sklearn.metrics import f1_score

if __name__ == '__main__':
    info = summary.ParTestModelOnData('model16', 'data.test.csv', 'label_test.csv')
    util.save_obj(info, 'info.info')
    pred, Y = info[-2:]
    f1scores = {}
    for th in np.linspace(0.3, 0.7, 20):
        f1score = f1_score(Y, pred > th)
        f1scores[str(th)] = f1score
        print(th, f1score)
count = 0
for tup in class_file_dirs:
    if count % 100000 == 0:
        print('hashed %d class images' % count)
    (cclass, cfile) = tup
    file_path = os.path.join(parent_path, cfile)
    chash = hashlib.md5(open(file_path, 'rb').read()).hexdigest()
    class_file_hashes.append((cclass, cfile, chash))
    count += 1

cfhd = os.path.join(parent_path, 'data', 'intermediate', 'class_file_hashes')
util.save_obj(class_file_hashes, cfhd)

count = 0
for tup in write_file_dirs:
    if count % 100000 == 0:
        print('hashed %d write images' % count)
    (cclass, cfile) = tup
    file_path = os.path.join(parent_path, cfile)
    chash = hashlib.md5(open(file_path, 'rb').read()).hexdigest()
    write_file_hashes.append((cclass, cfile, chash))
    count += 1
def run(args, num_workers=1, log_interval=100, verbose=True, save_path=None):
    code_root = os.path.dirname(os.path.realpath(__file__))
    if not os.path.isdir('{}/{}_result_files/'.format(code_root, args.task)):
        os.mkdir('{}/{}_result_files/'.format(code_root, args.task))
    path = '{}/{}_result_files/'.format(code_root, args.task) + utils.get_path_from_args(args)
    print('File saved in {}'.format(path))

    if os.path.exists(path + '.pkl') and not args.rerun:
        print('File already exists. Try --rerun')
        return utils.load_obj(path)

    start_time = time.time()
    utils.set_seed(args.seed)

    # ---------------------------------------------------------
    # -------------------- training ---------------------------

    # initialise model
    model = user_preference_estimator(args).cuda()
    model.train()
    print(sum([param.nelement() for param in model.parameters()]))

    # set up meta-optimiser for model parameters
    meta_optimiser = torch.optim.Adam(model.parameters(), args.lr_meta)
    # scheduler = torch.optim.lr_scheduler.StepLR(meta_optimiser, 5000, args.lr_meta_decay)

    # initialise logger
    logger = Logger()
    logger.args = args

    # initialise the starting point for the meta gradient
    # (it's faster to copy this than to create a new object)
    meta_grad_init = [0 for _ in range(len(model.state_dict()))]

    dataloader_train = DataLoader(Metamovie(args), batch_size=1,
                                  num_workers=args.num_workers)
    for epoch in range(args.num_epoch):
        x_spt, y_spt, x_qry, y_qry = [], [], [], []
        iter_counter = 0
        for step, batch in enumerate(dataloader_train):
            if len(x_spt) < args.tasks_per_metaupdate:
                x_spt.append(batch[0][0].cuda())
                y_spt.append(batch[1][0].cuda())
                x_qry.append(batch[2][0].cuda())
                y_qry.append(batch[3][0].cuda())
            if len(x_spt) != args.tasks_per_metaupdate:
                continue

            # initialise meta-gradient
            meta_grad = copy.deepcopy(meta_grad_init)
            loss_pre = []
            loss_after = []
            for i in range(args.tasks_per_metaupdate):
                loss_pre.append(F.mse_loss(model(x_qry[i]), y_qry[i]).item())
                fast_parameters = model.final_part.parameters()
                for weight in model.final_part.parameters():
                    weight.fast = None
                for k in range(args.num_grad_steps_inner):
                    logits = model(x_spt[i])
                    loss = F.mse_loss(logits, y_spt[i])
                    grad = torch.autograd.grad(loss, fast_parameters, create_graph=True)
                    fast_parameters = []
                    for k, weight in enumerate(model.final_part.parameters()):
                        if weight.fast is None:
                            weight.fast = weight - args.lr_inner * grad[k]  # create weight.fast
                        else:
                            weight.fast = weight.fast - args.lr_inner * grad[k]
                        fast_parameters.append(weight.fast)
                logits_q = model(x_qry[i])
                # loss_q is overwritten on each step and keeps the loss of the last update step
                loss_q = F.mse_loss(logits_q, y_qry[i])
                loss_after.append(loss_q.item())
                task_grad_test = torch.autograd.grad(loss_q, model.parameters())
                for g in range(len(task_grad_test)):
                    meta_grad[g] += task_grad_test[g].detach()

            # -------------- meta update --------------
            meta_optimiser.zero_grad()

            # set gradients of parameters manually
            for c, param in enumerate(model.parameters()):
                param.grad = meta_grad[c] / float(args.tasks_per_metaupdate)
                param.grad.data.clamp_(-10, 10)

            # the meta-optimiser only operates on the shared parameters,
            # not the context parameters
            meta_optimiser.step()
            # scheduler.step()
            x_spt, y_spt, x_qry, y_qry = [], [], [], []

            loss_pre = np.array(loss_pre)
            loss_after = np.array(loss_after)
            logger.train_loss.append(np.mean(loss_pre))
            logger.valid_loss.append(np.mean(loss_after))
            logger.train_conf.append(1.96 * np.std(loss_pre, ddof=0) / np.sqrt(len(loss_pre)))
            logger.valid_conf.append(1.96 * np.std(loss_after, ddof=0) / np.sqrt(len(loss_after)))
            logger.test_loss.append(0)
            logger.test_conf.append(0)

            utils.save_obj(logger, path)
            # print current results
            logger.print_info(epoch, iter_counter, start_time)
            start_time = time.time()
            iter_counter += 1
        if epoch % 2 == 0:
            print('saving model at iter', epoch)
            logger.valid_model.append(copy.deepcopy(model))

    return logger, model
write_dir = os.path.join(parent_path, 'data', 'raw_data', 'by_write')
rel_write_dir = os.path.join('data', 'raw_data', 'by_write')
write_parts = os.listdir(write_dir)

for write_part in write_parts:
    writers_dir = os.path.join(write_dir, write_part)
    rel_writers_dir = os.path.join(rel_write_dir, write_part)
    writers = os.listdir(writers_dir)

    for writer in writers:
        writer_dir = os.path.join(writers_dir, writer)
        rel_writer_dir = os.path.join(rel_writers_dir, writer)
        wtypes = os.listdir(writer_dir)

        for wtype in wtypes:
            type_dir = os.path.join(writer_dir, wtype)
            rel_type_dir = os.path.join(rel_writer_dir, wtype)
            images = os.listdir(type_dir)
            image_dirs = [os.path.join(rel_type_dir, i) for i in images]

            for image_dir in image_dirs:
                write_files.append((writer, image_dir))

util.save_obj(
    class_files,
    os.path.join(parent_path, 'data', 'intermediate', 'class_file_dirs'))
util.save_obj(
    write_files,
    os.path.join(parent_path, 'data', 'intermediate', 'write_file_dirs'))
        'loss': 'CategoricalCrossentropy'
    }

    # Setting MLFlow
    mlflow.set_experiment(experiment_name=experiment_name)
    exp = mlflow.get_experiment_by_name(experiment_name)

    # Preparing full data
    print("Preparing data")
    X_train, y_train, X_val, y_val, X_test, y_test = prepareBBdata(
        dataset.replace('_best', ''), label, model_type, final=True)

    # Training with full data
    print("Training model")
    model = train(X_train, y_train, X_val, y_val, X_test, y_test,
                  model_type, params, exp.experiment_id, n_classes)

    folder = 'data/' + dataset.replace('_best', '') + '/target/' + model_type + '/'
    if model_type == 'RF':
        save_obj(model, folder + '/RF_model')
    if model_type == 'NN':
        model.save(folder + '/NN_model.h5')
    print("Best model saved in " + folder)

# else
else:
    gridSearch(dataset, model_type)
def save(self, filename=None):
    if filename is None:
        filename = self.filename
    util.save_obj(self.trees, filename)
def train_network(model, game_state, observe=False):
    last_time = time.time()
    # store the previous observations in replay memory
    D = load_obj("D")  # load from file system

    # get the first state by doing nothing
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1  # 0 => do nothing, 1 => jump
    x_t, r_0, terminal = game_state.get_state(do_nothing)  # get next step after performing the action

    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)  # stack 4 images to create placeholder input
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  # 1*20*40*4
    initial_state = s_t

    if observe:
        OBSERVE = 999999999  # we keep observing, never train
        epsilon = FINAL_EPSILON
        print("Now we load weight")
        model.load_weights("model.h5")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse', optimizer=adam)
        print("Weight load successfully")
    else:
        # we go to training mode
        OBSERVE = OBSERVATION
        epsilon = load_obj("epsilon")
        model.load_weights("model.h5")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse', optimizer=adam)

    t = load_obj("time")  # resume from the previous time step stored in file system
    while True:  # endless running
        loss = 0
        Q_sa = 0
        action_index = 0
        a_t = np.zeros([ACTIONS])  # action at t

        # choose an action epsilon greedy
        if t % FRAME_PER_ACTION == 0:  # parameter to skip frames for actions
            if random.random() <= epsilon:  # randomly explore an action
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[0] = 1
            else:  # predict the output
                q = model.predict(s_t)  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)  # choose index with maximum q value
                action_index = max_Q
                a_t[action_index] = 1  # 0 => do nothing, 1 => jump

        # we reduce epsilon (the exploration parameter) gradually
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe the next state and reward
        x_t1, r_t, terminal = game_state.get_state(a_t)
        print('fps: {0}'.format(1 / (time.time() - last_time)))  # helpful for measuring frame rate
        last_time = time.time()
        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  # 1x20x40x1
        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)  # append the new image to input stack and remove the first one

        # store the transition in D
        D.append((s_t, action_index, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a mini_batch to train on
            mini_batch = random.sample(D, BATCH)
            inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))  # 32, 20, 40, 4
            targets = np.zeros((inputs.shape[0], ACTIONS))  # 32, 2

            # now we do the experience replay
            for i in range(0, len(mini_batch)):
                state_t = mini_batch[i][0]  # 4D stack of images
                action_t = mini_batch[i][1]  # this is the action index
                reward_t = mini_batch[i][2]  # reward at state_t due to action_t
                state_t1 = mini_batch[i][3]  # next state
                terminal = mini_batch[i][4]  # whether the agent died or survived due to the action

                inputs[i:i + 1] = state_t
                targets[i] = model.predict(state_t)  # predicted q values
                Q_sa = model.predict(state_t1)  # predict q values for next step

                if terminal:
                    targets[i, action_t] = reward_t  # if terminated, only equals reward
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

            loss += model.train_on_batch(inputs, targets)
            loss_df.loc[len(loss_df)] = loss
            q_values_df.loc[len(q_values_df)] = np.max(Q_sa)

        s_t = initial_state if terminal else s_t1  # reset game to initial frame if terminated
        t = t + 1

        # save progress every 1000 iterations
        if t % 1000 == 0:
            print("Now we save model")
            game_state._game.pause()  # pause game while saving to filesystem
            model.save_weights("model.h5", overwrite=True)
            save_obj(D, "D")  # saving episodes
            save_obj(t, "time")  # caching time steps
            save_obj(epsilon, "epsilon")  # cache epsilon to avoid repeated randomness in actions
            loss_df.to_csv("./objects/loss_df.csv", index=False)
            scores_df.to_csv("./objects/scores_df.csv", index=False)
            actions_df.to_csv("./objects/actions_df.csv", index=False)
            q_values_df.to_csv(q_value_file_path, index=False)
            with open("model.json", "w") as outfile:
                json.dump(model.to_json(), outfile)
            clear_output()
            game_state._game.resume()

        # print info
        if t <= OBSERVE:
            state = "observe"
        elif t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print("TIMESTAMP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX ", np.max(Q_sa), "/ Loss ", loss)
def byte_ngram(files_list, addrlength=32, n=1):
    dicts_list = []
    total_files = len(files_list)
    bad_files_names = []
    for idx, file_name in enumerate(files_list):
        bytes_file = DATASET_DIR + file_name + '.bytes.gz'
        try:
            with gzip.open(bytes_file, 'rt') as fp:
                bytedict = {}
                hex_seq = ""
                for line in fp.readlines():
                    if not line.strip():
                        continue
                    else:
                        address = int(addrlength / 4)  # hex to bytes
                        # ensure that address values will not be counted
                        # in the ngram calculation
                        hex_seq = hex_seq + line[address:].strip()
                hex_seq = hex_seq.replace(" ", "")
                for i in range(0, len(hex_seq) - 1, 2):
                    # ignore bytes that contain the "?" character
                    if hex_seq[i] == "?" or hex_seq[i + 1] == "?":
                        continue
                    if 2 * n + i > len(hex_seq):
                        break
                    gram = hex_seq[i:(2 * n + i)]
                    if gram not in bytedict.keys():
                        bytedict[gram] = 1
                    else:
                        bytedict[gram] += 1
                dicts_list.append(bytedict)
        except Exception as e:
            bad_files_names.append(file_name)
            log_exception(e, sys.argv[0], bytes_file)
        # progress bars always save my sanity
        progress_bar(idx + 1, total_files, 50)

    # log the corrupted files for future reference
    if len(bad_files_names) > 0:
        with open('bad_bytes_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.bytes\n')

    # convert list of dictionaries to a byte ngram count numpy array
    vec = DictVectorizer()
    ngram_freq = vec.fit_transform(dicts_list).toarray()
    ngram_freq_df = pd.DataFrame(ngram_freq, columns=vec.get_feature_names())
    # store frequency of each byte ngram
    ngram_freq_df.to_csv('features/' + str(n) + 'gram_byte_freq.csv')
    save_obj(ngram_freq_df, str(n) + 'gram_byte_freq')

    # transform ngram frequency array to ngram tfidf array
    transformer = TfidfTransformer(smooth_idf=False)
    ngram_tfidf = transformer.fit_transform(ngram_freq)
    # store tfidf of each byte ngram
    ngram_tfidf_df = pd.DataFrame(ngram_tfidf.todense(), columns=vec.get_feature_names())
    ngram_tfidf_df.to_csv('features/' + str(n) + 'gram_byte_tfidf.csv')
    save_obj(ngram_tfidf_df, str(n) + 'gram_byte_tfidf')
    return ngram_tfidf_df
# Import pickle package
import pickle

from util import save_obj

dict1 = {'Mar': '84.4', 'June': '69.4', 'Aug': '85', 'Airline': '8'}
save_obj(dict1, './OtherFiles/data')

# Open pickle file and load data: d
with open('./OtherFiles/data.pkl', 'rb') as file:
    d = pickle.load(file)

# Print d
print(d)

# Print datatype of d
print(type(d))
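# Note (editor): the save_obj/load_obj helpers that most snippets in this listing rely on
# are not shown. Below is a minimal pickle-based sketch of what they appear to assume,
# inferred from the save-then-load-with-'.pkl' pattern above and the None checks in
# get_trees. Exact suffix handling and signatures may differ per project, and the
# mesh-writing util.save_obj in the renderer snippet is an unrelated function.
import pickle


def save_obj(obj, name):
    # serialize obj to <name>.pkl (assumed behaviour)
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    # return the previously saved object, or None if no cache file exists yet
    try:
        with open(name + '.pkl', 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return None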
import os
import sys

utils_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
utils_dir = os.path.join(utils_dir, 'utils')
sys.path.append(utils_dir)

import util

parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

cfhd = os.path.join(parent_path, 'data', 'intermediate', 'class_file_hashes')
wfhd = os.path.join(parent_path, 'data', 'intermediate', 'write_file_hashes')
class_file_hashes = util.load_obj(cfhd)  # each elem is (class, file dir, hash)
write_file_hashes = util.load_obj(wfhd)  # each elem is (writer, file dir, hash)

class_hash_dict = {}
for i in range(len(class_file_hashes)):
    (c, f, h) = class_file_hashes[len(class_file_hashes) - i - 1]
    class_hash_dict[h] = (c, f)

write_classes = []
for tup in write_file_hashes:
    (w, f, h) = tup
    write_classes.append((w, f, class_hash_dict[h][0]))

wwcd = os.path.join(parent_path, 'data', 'intermediate', 'write_with_class')
util.save_obj(write_classes, wwcd)
            if stat[key]['StartTime'] >= 0 and stat[key]['LastTime'] >= 0:
                stat[key]['Sequences'].append([
                    stat[key]['StartTime'], stat[key]['LastTime'],
                    stat[key]['StartPos'][0], stat[key]['StartPos'][1],
                    stat[key]['EndPos'][0], stat[key]['EndPos'][1]])
                stat[key]['StartTime'] = -1
                stat[key]['LastTime'] = -1
    for key in stat:
        stat[key]['Updated'] = False


# get plane statistics
print('get stat')
print('Total Files: ', len(data_files))
stat = util.load_obj(os.path.join(conf["output_folder"], 'stat_icao'))
for i, path in enumerate(data_files):
    acList = util.getAcList(path)
    for ac in acList:
        key = ac['Icao']
        if 'Lat' not in ac or 'Long' not in ac:
            continue
        if conf["source"] == "TCP":
            ac["PosTime"] = int(path.split('\\')[-1].split('.')[0])
        # print(ac)
        getPlaneStat(ac, stat)
    updateStat(stat)
    if i and i % 100 == 0:
        print(i, " files processed.")

updateStat(stat)
util.save_obj(stat, os.path.join(conf["output_folder"], 'stat'))
print("All files processed.")
# rep = s.get(req_url, cookies=cookies)
# # set the response encoding
# rep.encoding = 'utf-8'
# # build the BeautifulSoup object
# bsobj = BeautifulSoup(rep.text, 'html.parser')
# # get the container that holds the items
# div = bsobj.find('div', id="Profile-following")
# ques_div = div.find_all('div', class_="List-item")
# print(bsobj.prettify())
# print(len(ques_div))
# # for que in ques_div:
# #     print(que.string)

# driver = webdriver.Chrome()
# driver.get(req_url)
# # collect the results
# result = []
# get_questions(result)
# # save the collected results
# util.save_obj(result, 'questions.pkl')
# driver.close()

driver = webdriver.Chrome()
driver.get(req_url)
result = []
wait = WebDriverWait(driver, 10)
get_ques(result, wait)
print(len(result))
util.save_obj(result, 'questions.pkl')
time.sleep(1)
driver.close()
import pickle

from util import save_obj

dict_fruit = {'peaches': 13, 'apples': 4, 'oranges': 11}
save_obj(dict_fruit, './OtherFiles/pickle_fuit')

with open('./OtherFiles/pickle_fuit.pkl', 'rb') as file:
    data = pickle.load(file)

print(data)
import os
import sys

utils_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
utils_dir = os.path.join(utils_dir, 'utils')
sys.path.append(utils_dir)

import util

parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

wwcd = os.path.join(parent_path, 'data', 'intermediate', 'write_with_class')
write_class = util.load_obj(wwcd)

writers = []  # each entry is a (writer, [list of (file, class)]) tuple
cimages = []
(cw, _, _) = write_class[0]
for (w, f, c) in write_class:
    if w != cw:
        writers.append((cw, cimages))
        cw = w
        cimages = [(f, c)]
    cimages.append((f, c))
writers.append((cw, cimages))

ibwd = os.path.join(parent_path, 'data', 'intermediate', 'images_by_writer')
util.save_obj(writers, ibwd)
def save_model(self):
    model_file = "_".join(["XGBooster", self.model_id, ".model"])
    pickle_file = "_".join(["XGBooster", self.model_id])
    self.bst.save_model(model_file)
    save_obj(self.bst, pickle_file)
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

import util

# login page
login_url = "https://accounts.douban.com/"

driver = webdriver.Chrome()
driver.get(login_url)

email = driver.find_element_by_id('email')
# type in the account
email.clear()
email.send_keys('3188****.com')

# get the password input box
password = driver.find_element_by_id('password')
password.clear()
# type in the password
password.send_keys('******')

# get the login button
submit = driver.find_element_by_class_name('btn-submit')
# click the login button
submit.send_keys(Keys.RETURN)

# wait two seconds
time.sleep(2)

# get the cookies
cookies = driver.get_cookies()
# save the retrieved cookies
util.save_obj(cookies, 'cookies.pkl')

# close the driver
driver.close()
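# Note (editor): a possible follow-up for reusing the saved cookies in a later session.
# This is a sketch assuming the same pickle-backed util.load_obj counterpart as above;
# Selenium's add_cookie only accepts cookies for a domain that has already been loaded.
import util
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.douban.com/")        # visit the domain before restoring its cookies
for cookie in util.load_obj('cookies.pkl'):  # assumed load_obj counterpart of save_obj
    cookie.pop('expiry', None)               # some drivers reject the 'expiry' field
    driver.add_cookie(cookie)
driver.refresh()                             # reload the page with the restored session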