Example #1
def GetData():

    data = pandas.read_csv('data.train.csv')

    if not os.path.exists('%s_cluster.model' % __fname__):
        cluster = KMeans(n_clusters=5)
        scaler = StandardScaler()

        pca = PCA(n_components=5)

        pipe = Pipeline([('scaler', scaler), ('pca', pca),
                         ('cluster', cluster)])
        pipe.fit(data[[
            i for i in data.keys() if i not in ['user_id', 'item_id', 'buy']
        ]])
        #Y = pipe.predict(data[[i for i in data.keys() if i not in ['user_id','item_id','buy']]])
        util.save_obj(pipe, '%s_cluster.model' % __fname__)

    Y = data['buy']

    X = GetFeature(data)

    #rand = np.random.rand(len(Y))<0.0001
    #idx = (Y==1) | ((Y==0) & rand)

    #X = X[idx]
    #Y = Y[idx]

    return X, Y
 def save_obj(self, filename, vertices, textures):
     '''
     vertices: [nv, 3], tensor
     texture: [3, h, w], tensor
     '''
     util.save_obj(filename, vertices, self.faces[0], textures=textures, uvcoords=self.raw_uvcoords[0],
                       uvfaces=self.uvfaces[0])
Example #5
def GetGeoTree(all=False):
    # print os.path.exists('geotree')
    if os.path.exists('geotree'):
        tree = util.load_obj('geotree')
        return tree

    geo_hash = pandas.read_csv(
        'tianchi_mobile_recommend_train_user.csv.subset.csv')
    geo_hash = geo_hash.dropna()
    geo_count = dict()

    rule = [(0, 0), (1, 1e5), (2, 1e5), (3, 1e5), (4, 1e4), (5, 1e3), (6, 1e3)]
    for r in rule:
        if r[0] == 0:
            split_list = ['9', 'm', 'f']
            for i in geo_hash['user_geohash']:
                util.IncDict(geo_count, i[:1])
        else:
            split_list = [
                i for i in geo_count.keys()
                if geo_count[i] > r[1] and len(i) == r[0]
            ]
            for i in geo_hash['user_geohash']:
                if i[:r[0]] in split_list:
                    util.IncDict(geo_count, i[:r[0] + 1])

    util.save_obj(geo_count, 'geotree')
    if all:
        return geo_count
    else:
        geo_tree = {
            i: geo_count[i]
            for i in geo_count.keys() if geo_count[i] > 1e5 or len(i) == 1
        }
    return geo_tree
Example #6
def extract_xref(files_list):
    # total number of files to calculate completion percentage
    total_files = len(files_list)
    bad_files_idx = []
    bad_files_names = []
    # Extract all features related to DATA and CODE XREF
    xref_dict = xref_initialization()
    for idx, file_name in enumerate(files_list):
        asm_file = DATASET_DIR + 'train/' + file_name + '.asm.gz'
        try:
            get_xref_features(asm_file, xref_dict)
        except Exception as e:
            # log corrupted files for future correction
            log_exception(e, sys.argv[0], asm_file)
            bad_files_idx.append(idx)
            bad_files_names.append(file_name)
        progress_bar(idx+1, total_files, 50)

    xref_pd = pd.DataFrame.from_dict(xref_dict)
    # store xref features to avoid recalculation
    save_obj(xref_pd, 'xref_features')
    '''
    save_obj(bad_files_names, 'bad_asm_files')
    # drop corrupted files (if any) from the training set
    if len(bad_files_names) > 0:
        # log the number of corrupted files
        logging.info('XREF Feature Extraction completed: ' + 
                str(len(bad_files_names)) + ' file(s) are corrupted.')
        # store the corrupted files names in 'bad_asm_files.txt'
        with open('bad_asm_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.asm')
    '''
    # save xref features dataframe to csv file to keep results (optional)
    xref_pd.to_csv('features/xref_features.csv', index=False)
    return xref_pd
Example #7
 def get_trees(self):
     if self.trees == []:
         trees = util.load_obj(self.filename)
         if trees is None:
             trees = self._generate_trees()
             util.save_obj(trees, self.filename)
         self.trees = trees
     return self.trees
Example #8
 def save_data(self):
     for key, value in self.data.items():
         if key not in ['env_data', 'bodytemp', '_label']:
             # np.save(self.conf.npy_data + '/' + str(key) + '.npy', value)
             path = self.conf.npy_data
             if self.save_dir is not None:
                 path = path + '/' + self.save_dir
             save_obj(value, path + '/' + str(key))
Example #9
 def get_trees(self):
     if self.trees == []:
         # attempt to load cache
         trees = util.load_obj(self.filename)
         if trees is None or trees == []:  # not cached yet
             trees = self._generate_trees()
             util.save_obj(trees, self.filename) # save cache
         self.trees = trees
     return self.trees
Example #11
def main():
    """main function"""
    names = ["Divvy_Stations_2017_Q3Q4.csv", "Divvy_Trips_2017_Q3.csv", "Divvy_Trips_2017_Q4.csv", "Divvy_Trips.csv", "6_26_6_30.csv", "Divvy_Stations_2017_Q1Q2.csv"]
    directory = "Divvy_Data/"
    fn = names[int(sys.argv[1])] if len(sys.argv) > 1 else "first300.csv"
    fn = directory + fn
    print("doing operations on " + fn)
    data = readdict(fn)
    # data = data_cleanup_missing(data)
    save_obj(data, "full_array_of_entries")
def quantization_train(imu_measurements, k):
    """
    :param imu_measurements: [T, 6]
    :param k: number of possible measurements
    :return:
    """
    # run k-means
    model = KMeans(n_clusters=k)
    kmeans = model.fit(imu_measurements)
    labels = kmeans.labels_
    kmeans.labels_ = []  # save space
    save_obj(kmeans, 'kmeans_model.pkl')

    return labels
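Since kmeans.labels_ is cleared only to shrink the pickle, the saved model can still be reloaded and used for prediction later. A minimal usage sketch, assuming the matching load_obj helper seen in the other examples; new_imu_measurements is a hypothetical [T', 6] array:

# Hypothetical follow-up: restore the cached k-means model and quantize new IMU data.
kmeans = load_obj('kmeans_model.pkl')
new_labels = kmeans.predict(new_imu_measurements)  # one cluster index per row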
Example #13
def extract_opcode_ngram(files_list, n):
    dicts_list = []
    total_files = len(files_list)
    for idx, file_name in enumerate(files_list):
        asm_file = conf['dataset_dir'] + file_name + '.asm.gz'
        clean_asm_code = clean_asm_lines(asm_file)
        opcode_sequence = [] 
        # this loop constructs a sequence of opcodes delimited by space character
        for line in clean_asm_code:
            # the command below works assuming that preprocessing of the .asm
            # file has already occurred
            opcode_mnem = line.split(' ')[0].rstrip()
            # further condition to minimize the number of outliers (handle extreme cases)
            is_valid_opcode = bool(re.match('^[a-z]{2,7}$', opcode_mnem))
            if is_valid_opcode:
                opcode_sequence.append(opcode_mnem)

        ngram_dict = {} 
        for index, opcode in enumerate(opcode_sequence):
            if (n + index) > len(opcode_sequence):
                break
            opcode_ngram = ""
            for j in range(index, index + n):
                opcode_ngram += opcode_sequence[j] + '-'
            # remove trailing '-' char from opcode_ngram
            opcode_ngram = opcode_ngram[:-1]
            if opcode_ngram in ngram_dict:
                ngram_dict[opcode_ngram] += 1
            else:
                ngram_dict[opcode_ngram] = 1

        dicts_list.append(ngram_dict)
        # progress bars always save my sanity
        progress_bar(idx+1, total_files, 50)
    
    # convert list of dictionaries to an opcode ngram count numpy array
    vec = DictVectorizer()
    ngram_freq = vec.fit_transform(dicts_list).toarray()
    ngram_freq_df = pd.DataFrame(ngram_freq, columns=vec.get_feature_names())
    ngram_freq_df.to_csv('features/' + str(n) + 'gram_opcode_freq1.csv', index=False)
    save_obj(ngram_freq_df, str(n) + 'gram_opcode_freq')
    # transform ngram frequency array to ngram tfidf array
    transformer = TfidfTransformer(smooth_idf=False)
    ngram_tfidf = transformer.fit_transform(ngram_freq)
    # transform array to pandas dataframe
    freq_vec_df = pd.DataFrame(ngram_tfidf.todense(), columns=vec.get_feature_names())
    freq_vec_df.to_csv('features/' + str(n) + 'gram_opcode_tfidf1.csv', index=False)
    save_obj(freq_vec_df, str(n) + 'gram_opcode_tfidf')
    return freq_vec_df
Example #14
def main():
    train_labels = pd.read_csv(DATASET_DIR + 'trainLabels.csv')
    files_list = train_labels['Id'].tolist()
    
    # total number of files to calculate completion percentage
    total_files = len(files_list)

    # do not count corrupted files
    bad_files_idx = []
    bad_files_names = []
    # Extract all features related to DATA and CODE XREF
    xref_dict = xref_initialization()
    for idx, file_name in enumerate(files_list):
        asm_file = DATASET_DIR + 'train/' + file_name + '.asm.gz'
        try:
            get_xref_features(asm_file, xref_dict)
        except Exception as e:
            # log corrupted files for future correction
            log_exception(e, sys.argv[0], asm_file)
            bad_files_idx.append(idx)
            bad_files_names.append(file_name)

        progress_bar(idx+1, total_files, 50)

    xref_pd = pd.DataFrame.from_dict(xref_dict)
    
    # store xref features to avoid recalculation
    save_obj(xref_pd, 'xref_features')
    save_obj(bad_files_names, 'bad_files')

    # concat features with classes and IDs to create the dataset
    data = pd.concat([train_labels, xref_pd], axis=1, sort=False)

    # drop corrupted files (if any) from the training set
    if len(bad_files_idx) > 0:
        data.drop(data.index[bad_files_idx], inplace=True)
        data = data.reset_index(drop=True)
        # log the number of corrupted files
        logging.info('XREF Feature Extraction completed: ' + 
                str(len(bad_files_idx)) + ' file(s) are corrupted.')
        # store the corrupted files names in 'bad_asm_files.txt'
        with open('bad_asm_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.asm.gz')

    # save xref features dataframe to csv file to keep results (optional)
    data.to_csv('results/xref_features.csv')
    
Example #15
def unify_features():
    train_labels = pd.read_csv(
        '~/Documents/thesis/dataset/dataSample/trainLabels.csv')

    section_features = load_obj('section_features')
    xref_features = load_obj('xref_features')
    opcode_1gram_features = load_obj('1gram_opcode_tfidf')
    byte_1gram_features = load_obj('1gram_byte_tfidf')

    # concat features with classes and IDs to create the dataset
    data = pd.concat([train_labels, xref_features, section_features, opcode_1gram_features, \
            byte_1gram_features], axis=1, sort=False)
    print(data.shape)
    save_obj(data, 'interim_data')

    return data
Example #16
def build_index():
    corpus_path = util.get_corpus_dir_path_from_args()
    preprocessor = preprocessing.Preprocessor(corpus_path)
    doc_to_terms: list[preprocessing.DocToTerms] = preprocessor.parse()

    indexer_ob = indexer.Indexer(doc_to_terms)
    inverted_index: dict[str, indexer.Posting] = indexer_ob.inverter_index()
    doc_id_name_index: dict[int, str] = indexer_ob.doc_id_to_doc_name_index()

    tf_idf_ranker = ranker.Ranker(inverted_index, doc_id_name_index)
    _tfidf = tf_idf_ranker.tfidf()

    print('Indexing completed..saving...')
    util.save_obj(doc_id_name_index, DOC_ID_NAME_INDEX_NAME)
    util.save_obj(inverted_index, INVERTED_INDEX_FILE_NAME)
    util.save_pandas_df_as_pickle(_tfidf, TFIDF_NAME_INDEX_NAME)
    print('Saved index for quick results for future queries')
def init_cache():
    """initial variable caching, done only once"""
    save_obj(INITIAL_EPSILON, "epsilon")
    t = 0
    save_obj(t, "time")
    D = deque()
    save_obj(D, "D")
Example #18
    L_PAD_HEIGHT = 1.0
    L_HALF_PAD_HEIGHT = L_PAD_HEIGHT * HEIGHT / 2

if args.mode == 'play':
    play(args.ctrl == 'auto', args.play_show_stat)
elif args.mode == 'train':
    train(args.alpha, args.gamma, args.decay, args.ne, args.iter,
          args.train_show_stat)
    if args.save != 'none':
        q_learning.QSaveToFile(args.save)
elif args.mode == 'test':
    if args.load == 'init':
        q_learning.QInit(args.ne, args.xd, args.yd, args.pd)
    else:
        q_learning.QInitFromFile(args.load, args.ne)
    test_result = test(args.test_show_stat)
    print("   Total : %d" % test_result[0])
    print(" Average : %.2f" % test_result[1])
    print("     Min : %d" % test_result[2])
    print("     Max : %d" % test_result[3])
    print("Win Rate : %.2f%%" % test_result[4])
elif args.mode == 'tune':
    results = []
    for alpha in np.arange(0.1, 1.1, 0.1):
        for gamma in np.arange(0.1, 1.1, 0.1):
            for decay in np.arange(1000, 30000, 4000):
                for ne in np.arange(5, 100, 10):
                    result = train(alpha, gamma, decay, ne, 15000, False)
                    results.append(result)
    save_obj(results, args.save_tune)
Example #20
count = 0
for tup in class_file_dirs:
    if count % 100000 == 0:
        print('hashed %d class images' % count)

    (cclass, cfile) = tup
    file_path = os.path.join(parent_path, cfile)

    chash = hashlib.md5(open(file_path, 'rb').read()).hexdigest()

    class_file_hashes.append((cclass, cfile, chash))

    count += 1

cfhd = os.path.join(parent_path, 'data', 'intermediate', 'class_file_hashes')
util.save_obj(class_file_hashes, cfhd)

count = 0
for tup in write_file_dirs:
    if count % 100000 == 0:
        print('hashed %d write images' % count)

    (cclass, cfile) = tup
    file_path = os.path.join(parent_path, cfile)

    chash = hashlib.md5(open(file_path, 'rb').read()).hexdigest()

    write_file_hashes.append((cclass, cfile, chash))

    count += 1
Example #21
def run(args, num_workers=1, log_interval=100, verbose=True, save_path=None):
    code_root = os.path.dirname(os.path.realpath(__file__))
    if not os.path.isdir('{}/{}_result_files/'.format(code_root, args.task)):
        os.mkdir('{}/{}_result_files/'.format(code_root, args.task))
    path = '{}/{}_result_files/'.format(
        code_root, args.task) + utils.get_path_from_args(args)
    print('File saved in {}'.format(path))

    if os.path.exists(path + '.pkl') and not args.rerun:
        print('File already exists. Try --rerun.')
        return utils.load_obj(path)

    start_time = time.time()
    utils.set_seed(args.seed)

    # ---------------------------------------------------------
    # -------------------- training ---------------------------

    # initialise model
    model = user_preference_estimator(args).cuda()

    model.train()
    print(sum([param.nelement() for param in model.parameters()]))
    # set up meta-optimiser for model parameters
    meta_optimiser = torch.optim.Adam(model.parameters(), args.lr_meta)
    # scheduler = torch.optim.lr_scheduler.StepLR(meta_optimiser, 5000, args.lr_meta_decay)

    # initialise logger
    logger = Logger()
    logger.args = args
    # initialise the starting point for the meta gradient (it's faster to copy this than to create new object)
    meta_grad_init = [0 for _ in range(len(model.state_dict()))]
    dataloader_train = DataLoader(Metamovie(args),
                                  batch_size=1,
                                  num_workers=args.num_workers)
    for epoch in range(args.num_epoch):

        x_spt, y_spt, x_qry, y_qry = [], [], [], []
        iter_counter = 0
        for step, batch in enumerate(dataloader_train):
            if len(x_spt) < args.tasks_per_metaupdate:
                x_spt.append(batch[0][0].cuda())
                y_spt.append(batch[1][0].cuda())
                x_qry.append(batch[2][0].cuda())
                y_qry.append(batch[3][0].cuda())
                if not len(x_spt) == args.tasks_per_metaupdate:
                    continue

            if len(x_spt) != args.tasks_per_metaupdate:
                continue

            # initialise meta-gradient
            meta_grad = copy.deepcopy(meta_grad_init)
            loss_pre = []
            loss_after = []
            for i in range(args.tasks_per_metaupdate):
                loss_pre.append(F.mse_loss(model(x_qry[i]), y_qry[i]).item())
                fast_parameters = model.final_part.parameters()
                for weight in model.final_part.parameters():
                    weight.fast = None
                for k in range(args.num_grad_steps_inner):
                    logits = model(x_spt[i])
                    loss = F.mse_loss(logits, y_spt[i])
                    grad = torch.autograd.grad(loss,
                                               fast_parameters,
                                               create_graph=True)
                    fast_parameters = []
                    for k, weight in enumerate(model.final_part.parameters()):
                        if weight.fast is None:
                            weight.fast = weight - args.lr_inner * grad[
                                k]  #create weight.fast
                        else:
                            weight.fast = weight.fast - args.lr_inner * grad[k]
                        fast_parameters.append(weight.fast)

                logits_q = model(x_qry[i])
                # loss_q will be overwritten and just keep the loss_q on last update step.
                loss_q = F.mse_loss(logits_q, y_qry[i])
                loss_after.append(loss_q.item())
                task_grad_test = torch.autograd.grad(loss_q,
                                                     model.parameters())

                for g in range(len(task_grad_test)):
                    meta_grad[g] += task_grad_test[g].detach()

            # -------------- meta update --------------

            meta_optimiser.zero_grad()

            # set gradients of parameters manually
            for c, param in enumerate(model.parameters()):
                param.grad = meta_grad[c] / float(args.tasks_per_metaupdate)
                param.grad.data.clamp_(-10, 10)

            # the meta-optimiser only operates on the shared parameters, not the context parameters
            meta_optimiser.step()
            #scheduler.step()
            x_spt, y_spt, x_qry, y_qry = [], [], [], []

            loss_pre = np.array(loss_pre)
            loss_after = np.array(loss_after)
            logger.train_loss.append(np.mean(loss_pre))
            logger.valid_loss.append(np.mean(loss_after))
            logger.train_conf.append(1.96 * np.std(loss_pre, ddof=0) /
                                     np.sqrt(len(loss_pre)))
            logger.valid_conf.append(1.96 * np.std(loss_after, ddof=0) /
                                     np.sqrt(len(loss_after)))
            logger.test_loss.append(0)
            logger.test_conf.append(0)

            utils.save_obj(logger, path)
            # print current results
            logger.print_info(epoch, iter_counter, start_time)
            start_time = time.time()

            iter_counter += 1
        if epoch % (2) == 0:
            print('saving model at iter', epoch)
            logger.valid_model.append(copy.deepcopy(model))

    return logger, model
Example #22
write_dir = os.path.join(parent_path, 'data', 'raw_data', 'by_write')
rel_write_dir = os.path.join('data', 'raw_data', 'by_write')
write_parts = os.listdir(write_dir)

for write_part in write_parts:
    writers_dir = os.path.join(write_dir, write_part)
    rel_writers_dir = os.path.join(rel_write_dir, write_part)
    writers = os.listdir(writers_dir)

    for writer in writers:
        writer_dir = os.path.join(writers_dir, writer)
        rel_writer_dir = os.path.join(rel_writers_dir, writer)
        wtypes = os.listdir(writer_dir)

        for wtype in wtypes:
            type_dir = os.path.join(writer_dir, wtype)
            rel_type_dir = os.path.join(rel_writer_dir, wtype)
            images = os.listdir(type_dir)
            image_dirs = [os.path.join(rel_type_dir, i) for i in images]

            for image_dir in image_dirs:
                write_files.append((writer, image_dir))

util.save_obj(
    class_files,
    os.path.join(parent_path, 'data', 'intermediate', 'class_file_dirs'))
util.save_obj(
    write_files,
    os.path.join(parent_path, 'data', 'intermediate', 'write_file_dirs'))
Example #23
                    'loss': 'CategoricalCrossentropy'
                }

        # Setting MLFlow
        mlflow.set_experiment(experiment_name=experiment_name)
        exp = mlflow.get_experiment_by_name(experiment_name)

        # Preparing full data
        print("Preparing data")
        X_train, y_train, X_val, y_val, X_test, y_test = prepareBBdata(
            dataset.replace('_best', ''), label, model_type, final=True)

        # Training with full data
        print("Training model")
        model = train(X_train, y_train, X_val, y_val, X_test, y_test,
                      model_type, params, exp.experiment_id, n_classes)

        folder = 'data/' + dataset.replace('_best',
                                           '') + '/target/' + model_type + '/'
        if (model_type == 'RF'):
            save_obj(model, folder + '/RF_model')

        if (model_type == 'NN'):
            model.save(folder + '/NN_model.h5')

        print("Best model saved in " + folder)

    # else
    else:
        gridSearch(dataset, model_type)
Example #24
 def save(self, filename=None):
     if filename is None:
         filename = self.filename
     util.save_obj(self.trees, filename)
Example #25
# coding:utf-8
# find best

import util
import summary
import numpy as np
from sklearn.metrics import f1_score

if __name__ == '__main__':
    info = summary.ParTestModelOnData('model16', 'data.test.csv',
                                      'label_test.csv')
    util.save_obj(info, 'info.info')

    pred, Y = info[-2:]

    f1scores = {}
    for th in np.linspace(0.3, 0.7, 20):
        f1score = f1_score(Y, pred > th)
        f1scores[str(th)] = f1score
        print(th, f1score)
def train_network(model, game_state, observe=False):
    last_time = time.time()
    # store the previous observations in replay memory
    D = load_obj("D")  # load from file system
    # get the first state by doing nothing
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1  # 0 => do nothing,
    # 1=> jump

    x_t, r_0, terminal = game_state.get_state(
        do_nothing)  # get next step after performing the action

    s_t = np.stack((x_t, x_t, x_t, x_t),
                   axis=2)  # stack 4 images to create placeholder input

    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  # 1*20*40*4

    initial_state = s_t

    if observe:
        OBSERVE = 999999999  # We keep observing, never train
        epsilon = FINAL_EPSILON
        print("Now we load weight")
        model.load_weights("model.h5")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse', optimizer=adam)
        print("Weight load successfully")
    else:  # We go to training mode
        OBSERVE = OBSERVATION
        epsilon = load_obj("epsilon")
        model.load_weights("model.h5")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse', optimizer=adam)

    t = load_obj(
        "time")  # resume from the previous time step stored in file system
    while True:  # endless running

        loss = 0
        Q_sa = 0
        action_index = 0
        a_t = np.zeros([ACTIONS])  # action at t

        # choose an action epsilon greedy
        if t % FRAME_PER_ACTION == 0:  # parameter to skip frames for actions
            if random.random() <= epsilon:  # randomly explore an action
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[0] = 1
            else:  # predict the output
                q = model.predict(
                    s_t)  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)  # choose index with maximum q value
                action_index = max_Q
                a_t[action_index] = 1  # 0=> do nothing, 1=> jump

        # We reduce epsilon (the exploration parameter) gradually
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe the next state and reward
        x_t1, r_t, terminal = game_state.get_state(a_t)
        print('fps: {0}'.format(1 / (time.time() - last_time))
              )  # helpful for measuring frame rate
        last_time = time.time()
        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  # 1x20x40x1
        s_t1 = np.append(
            x_t1, s_t[:, :, :, :3], axis=3
        )  # append the new image to input stack and remove the first one

        # store the transition in D
        D.append((s_t, action_index, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # only train if done observing
        if t > OBSERVE:

            # sample a mini_batch to train on
            mini_batch = random.sample(D, BATCH)
            inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2],
                               s_t.shape[3]))  # 32, 20, 40, 4
            targets = np.zeros((inputs.shape[0], ACTIONS))  # 32, 2

            # Now we do the experience replay
            for i in range(0, len(mini_batch)):
                state_t = mini_batch[i][0]  # 4D stack of images
                action_t = mini_batch[i][1]  # This is action index
                reward_t = mini_batch[i][
                    2]  # reward at state_t due to action_t
                state_t1 = mini_batch[i][3]  # next state
                terminal = mini_batch[i][
                    4]  # whether the agent died or survived due to the action

                inputs[i:i + 1] = state_t

                targets[i] = model.predict(state_t)  # predicted q values
                Q_sa = model.predict(
                    state_t1)  # predict q values for next step

                if terminal:
                    targets[
                        i,
                        action_t] = reward_t  # if terminated, only equals reward
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

            loss += model.train_on_batch(inputs, targets)
            loss_df.loc[len(loss_df)] = loss
            q_values_df.loc[len(q_values_df)] = np.max(Q_sa)

        s_t = initial_state if terminal else s_t1  # reset game to initial frame if terminate
        t = t + 1

        # save progress every 1000 iterations
        if t % 1000 == 0:
            print("Now we save model")
            game_state._game.pause()  # pause game while saving to filesystem
            model.save_weights("model.h5", overwrite=True)
            save_obj(D, "D")  # saving episodes
            save_obj(t, "time")  # caching time steps
            save_obj(epsilon, "epsilon"
                     )  # cache epsilon to avoid repeated randomness in actions
            loss_df.to_csv("./objects/loss_df.csv", index=False)
            scores_df.to_csv("./objects/scores_df.csv", index=False)
            actions_df.to_csv("./objects/actions_df.csv", index=False)
            q_values_df.to_csv(q_value_file_path, index=False)
            with open("model.json", "w") as outfile:
                json.dump(model.to_json(), outfile)
            clear_output()
            game_state._game.resume()
        # print info
        if t <= OBSERVE:
            state = "observe"
        elif t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print "TIMESTAMP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION"\
            , action_index, "/ REWARD", r_t, "/ Q_MAX ", np.max(Q_sa), "/ Loss ", loss
def byte_ngram(files_list, addrlength=32, n=1):
    dicts_list = []
    total_files = len(files_list)
    bad_files_names = []
    for idx, file_name in enumerate(files_list):
        bytes_file = DATASET_DIR + file_name + '.bytes.gz'
        try:
            with gzip.open(bytes_file, 'rt') as fp:
                bytedict = {}
                hex_seq = ""
                for line in fp.readlines():
                    if not line.strip():
                        continue
                    else:
                        address = int(addrlength / 4)  # address width in hex characters
                        # ensure that address values are not counted
                        # in the ngram calculation
                        hex_seq = hex_seq + line[address:].strip()

                hex_seq = hex_seq.replace(" ", "")
                for i in range(0, len(hex_seq) - 1, 2):
                    # ignore bytes that contain the "?" character
                    if hex_seq[i] == "?" or hex_seq[i + 1] == "?":
                        continue
                    if 2 * n + i > len(hex_seq):
                        break

                    gram = hex_seq[i:(2 * n + i)]
                    if gram not in bytedict.keys():
                        bytedict[gram] = 1
                    else:
                        bytedict[gram] += 1

                dicts_list.append(bytedict)
        except Exception as e:
            bad_files_names.append(file_name)
            log_exception(e, sys.argv[0], bytes_file)

        # progress bars always save my sanity
        progress_bar(idx + 1, total_files, 50)

    # log the corrupted files for future reference
    if len(bad_files_names) > 0:
        with open('bad_bytes_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.bytes\n')

    # convert list of dictionaries to a byte ngram count numpy array
    vec = DictVectorizer()
    ngram_freq = vec.fit_transform(dicts_list).toarray()
    ngram_freq_df = pd.DataFrame(ngram_freq, columns=vec.get_feature_names())
    # store frequency of each byte ngram
    ngram_freq_df.to_csv('features/' + str(n) + 'gram_byte_freq.csv')
    save_obj(ngram_freq_df, str(n) + 'gram_byte_freq')

    # transform ngram frequency array to ngram tfidf array
    transformer = TfidfTransformer(smooth_idf=False)
    ngram_tfidf = transformer.fit_transform(ngram_freq)
    # store tfidf of each byte ngram
    ngram_tfidf_df = pd.DataFrame(ngram_tfidf.todense(),
                                  columns=vec.get_feature_names())
    ngram_tfidf_df.to_csv('features/' + str(n) + 'gram_byte_tfidf.csv')
    save_obj(ngram_tfidf_df, str(n) + 'gram_byte_tfidf')
    return ngram_tfidf_df
# Import pickle package
import pickle
from util import save_obj

dict1 = {'Mar': '84.4', 'June': '69.4', 'Aug': '85', 'Airline': '8'}
save_obj(dict1, './OtherFiles/data')

# Open pickle file and load data: d
with open('./OtherFiles/data.pkl', 'rb') as file:
    d = pickle.load(file)

# Print d
print(d)

# Print datatype of d
print(type(d))
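None of these examples show save_obj / load_obj themselves, but the round trip above (an object written with save_obj(obj, name) is read back from name + '.pkl' with pickle.load) suggests they are thin pickle wrappers. A minimal sketch under that assumption; each repo's util module may differ in detail:

import pickle

def save_obj(obj, name):
    # Assumed behavior: serialize obj to '<name>.pkl'; the target directory must exist.
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    # Assumed behavior: return the object stored in '<name>.pkl', or None if the file
    # is missing, matching the cache checks in the get_trees examples above.
    try:
        with open(name + '.pkl', 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return None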
Example #29
import os
import sys

utils_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
utils_dir = os.path.join(utils_dir, 'utils')
sys.path.append(utils_dir)

import util

parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

cfhd = os.path.join(parent_path, 'data', 'intermediate', 'class_file_hashes')
wfhd = os.path.join(parent_path, 'data', 'intermediate', 'write_file_hashes')
class_file_hashes = util.load_obj(cfhd)  # each elem is (class, file dir, hash)
write_file_hashes = util.load_obj(
    wfhd)  # each elem is (writer, file dir, hash)

class_hash_dict = {}
for i in range(len(class_file_hashes)):
    (c, f, h) = class_file_hashes[len(class_file_hashes) - i - 1]
    class_hash_dict[h] = (c, f)

write_classes = []
for tup in write_file_hashes:
    (w, f, h) = tup
    write_classes.append((w, f, class_hash_dict[h][0]))

wwcd = os.path.join(parent_path, 'data', 'intermediate', 'write_with_class')
util.save_obj(write_classes, wwcd)
Example #30
			if (stat[key]['StartTime']>=0 and stat[key]['LastTime']>=0):
				stat[key]['Sequences'].append([stat[key]['StartTime'], stat[key]['LastTime'], 
				stat[key]['StartPos'][0],stat[key]['StartPos'][1],
				stat[key]['EndPos'][0],stat[key]['EndPos'][1]])

			stat[key]['StartTime'] = -1
			stat[key]['LastTime'] = -1
	
	for key in stat:
		stat[key]['Updated'] = False


# get plane statistics
print('get stat')
print('Total Files: ', len(data_files))
stat = util.load_obj(os.path.join(conf["output_folder"], 'stat_icao'))

for i, path in enumerate(data_files):
	acList = util.getAcList(path)
	for ac in acList:
		key = ac['Icao']
		if 'Lat' not in ac or 'Long' not in ac: continue
		if conf["source"] == "TCP": ac["PosTime"] = int(path.split('\\')[-1].split('.')[0])
		# print(ac)
		getPlaneStat(ac, stat)
	updateStat(stat)
	if i and i % 100 == 0: print(i, " files processed.")  
updateStat(stat)

util.save_obj(stat, os.path.join(conf["output_folder"], 'stat'))
print("All files processed.")
Example #31
#     rep = s.get(req_url, cookies=cookies)
#     #设置响应编码
#     rep.encoding = 'utf-8'
#     #构造bsobj
#     bsobj = BeautifulSoup(rep.text, 'html.parser')
#     #获取包含item的容器
#     div = bsobj.find('div', id="Profile-following")
#     ques_div = div.find_all('div', class_="List-item")
#     print(bsobj.prettify())
#     print(len(ques_div))
#     # for que in ques_div:
#     #     print(que.string)

# driver = webdriver.Chrome()
# driver.get(req_url)
# #获取结果集
# result = []
# get_questions(result)
# #保存获取的结果
# util.save_obj(result, 'questions.pkl')
# driver.close()

driver = webdriver.Chrome()
driver.get(req_url)
result = []
wait = WebDriverWait(driver, 10)
get_ques(result, wait)
print(len(result))
util.save_obj(result, 'questions.pkl')
time.sleep(1)
driver.close()
import pickle
from util import save_obj

dict_fruit = { 'peaches': 13, 'apples': 4, 'oranges': 11}

save_obj(dict_fruit, './OtherFiles/pickle_fuit')

with open('./OtherFiles/pickle_fuit.pkl', 'rb') as file:
    data = pickle.load(file)

print(data)

import os
import sys

utils_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
utils_dir = os.path.join(utils_dir, 'utils')
sys.path.append(utils_dir)

import util

parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

wwcd = os.path.join(parent_path, 'data', 'intermediate', 'write_with_class')
write_class = util.load_obj(wwcd)

writers = []  # each entry is a (writer, [list of (file, class)]) tuple
cimages = []
(cw, _, _) = write_class[0]
for (w, f, c) in write_class:
    if w != cw:
        writers.append((cw, cimages))
        cw = w
        cimages = []
    cimages.append((f, c))
writers.append((cw, cimages))

ibwd = os.path.join(parent_path, 'data', 'intermediate', 'images_by_writer')
util.save_obj(writers, ibwd)
Example #34
 def save_model(self):
     model_file = "_".join(["XGBooster", self.model_id, ".model"])
     pickle_file = "_".join(["XGBooster", self.model_id])
     self.bst.save_model(model_file)
     save_obj(self.bst, pickle_file)
Example #35
import time
import util
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

#登录页面
login_url = "https://accounts.douban.com/"

driver = webdriver.Chrome()
driver.get(login_url)
email = driver.find_element_by_id('email')
#输入的帐号
email.clear()
email.send_keys('3188****.com')
#获取密码输入框
password = driver.find_element_by_id('password')
password.clear()
#输入密码
password.send_keys('******')
#获得登录按钮
submit = driver.find_element_by_class_name('btn-submit')
#点击登录按钮
submit.send_keys(Keys.RETURN)
#等待两秒
time.sleep(2)
#获取cookies对象
cookies = driver.get_cookies()
#保存获取的cookies对象
util.save_obj(cookies, 'cookies.pkl')
#关闭打开的driver
driver.close()