def ingest(record_type, **kwargs):
    """ Run the ingestion flow for the given :record_type:. """
    datasource = acquire_data(record_type, **kwargs)

    validating = not kwargs.get("no_validate")
    if validating:
        ref = kwargs.get("ref")
        datasource = validate_data(datasource, record_type, ref)
        # clean up missing data
        for k in [k for k in datasource.keys()]:
            if not datasource[k] or len(datasource[k]) < 1:
                del datasource[k]
    else:
        print("Skipping data validation")

    # output to files if needed
    output = kwargs.get("output")
    if output and os.path.exists(output):
        print(f"Writing data files to {output}")
        start_time, end_time = kwargs.get("start_time"), kwargs.get("end_time")
        output_data(output, datasource, record_type, start_time, end_time)

    loading = not kwargs.get("no_load")
    if loading and len(datasource) > 0:
        load_data(datasource, record_type, **kwargs)

    print(f"{record_type} complete")
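# Usage sketch for ingest() above. The record type and keyword arguments shown here
# are illustrative assumptions, not values taken from the original project:
#
#     ingest("status_changes", ref="schema-ref", output="data/out", no_load=True)
#
# Passing no_validate=True skips validation; no_load=True skips the load_data() step.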
def main(begin_date, end_date):
    extract_dict = extract.get_extracts(begin_date, end_date)
    output_dict = transform.get_transforms(extract_dict)
    load.load_data(output_dict)
def main():
    '''
    main function

    This puts the pieces of the ETL all together:
        extract   -- reads raw data from api endpoints, and passes through raw schemas for validation
        transform -- uses parsed schemas to deserialize data into desired format for DB
        load      -- uses sqlalchemy models + session to safely merge the data into the database
    '''
    args = get_args()
    if not args.date:
        logger.error('Must pass date for parsing')
        return

    # extract -- start by extracting all game_ids; then extract each game one by one
    results = {"shot_attempt": [], "goal": []}
    game_ids = extract_game_ids_for_date(args.date)
    logger.info('Received %s game_ids to load', len(game_ids))
    for game_id in game_ids:
        logger.info('Extracting: %s', game_id)
        raw_game = extract_game(game_id)
        # transform -- for each game we get, turn it into parsed shot and goal rows
        parsed_data = transform_game(raw_game)
        results["shot_attempt"].extend(parsed_data["shot_attempt"])
        results["goal"].extend(parsed_data["goal"])

    # load -- once all games have been iterated through, create sqlalchemy models out of the rows and insert
    create_tables()
    load_data(results)
def main(): """ Parse command line arguments and options and run OceanDiv. """ # Read options args = parse_args.get_args() config = namelist.get_namelist(args) tools.print_message(config, 'Running OceanDiv...') # Load data tools.print_message(config, 'Loading data...') ohcs = load.load_data(config, dtype='ohc') flxs = load.load_data(config, dtype='flx') basins = load.load_geodata(config, geotype='basins') areas = load.load_geodata(config, geotype='areas') # Process data ohcs, flxs, basins, areas = process.unify_masks(ohcs, flxs, basins, areas) out_cubes = process.process_by_basin(config, ohcs, flxs, basins, areas) # Save output save.save_as_netcdf(config, out_cubes) # Finished tools.print_message(config, 'Finished!')
def __init__(self, paths, patch_size, batch_size, transformations=[], augment=False, mean=True):
    self.orig_patch_size = patch_size
    if augment:
        patch_size = int(math.sqrt(2 * self.orig_patch_size**2)) + 1
    self.patch_size = patch_size
    self.batch_size = batch_size
    self.augment = augment
    self.data_details = []
    self.data = []
    for path in paths:
        self.data_details.append(load.load_data(path, details_only=True))
        self.data.append(load.load_data(path))
    # ASSERT ALL SAME SIZE???
    for i, trans in enumerate(transformations):
        if trans:
            for j in range(len(self.data[i])):
                self.data[i][j] = trans(self.data[i][j])
def main(args):
    usaCovidDataUrl = args['usaCovidDataUrl']
    johnHopkinsDataUrl = args['johnHopkinsDataUrl']
    loggerLevel = logging.__dict__[args['loggerLevel']]
    dynamodb_resource = boto3.resource('dynamodb', region_name='ap-southeast-2')
    try:
        setupLogger(loggerLevel)
        logger.info('Starting...')
        usaCovidDataFilename = getRemoteFile(usaCovidDataUrl, 'usaCovidData')
        johnHopkinsRecoveryDataFilename = getRemoteFile(
            johnHopkinsDataUrl, 'johnHopkinsData')
        mergedData = transform.mergeCsvFiles(usaCovidDataFilename,
                                             johnHopkinsRecoveryDataFilename)
        scanResponse = getTableScanResponse('covidData', dynamodb_resource)
        latestDate = getLatestRecordDate(scanResponse)
        initialLoad = tableIsEmpty(scanResponse)
        load.load_data(mergedData, latestDate, dynamodb_resource, initialLoad)
        logger.info('Done!')
    except Exception:
        logger.exception('Error in processing!')
    finally:
        if os.path.isdir('/tmp/download/'):
            cleanupFiles('/tmp/download/')
def pre_processing():
    test_data = load_data(True)
    train_data = load_data(False)
    a = test_data['a']
    a1 = train_data['a']
    x = train_data['x']
    columns_rm = [
        get_names().index('sex_Female'),
        get_names().index('sex_Male')
    ]
    indx = [j for j in range(len(x.T)) if j not in columns_rm]
    train_data['x'] = train_data['x'][:, indx]
    test_data['x'] = test_data['x'][:, indx]
    xtrain_fe = train_data['x'][a1 == 0]
    xtrain_ma = train_data['x'][a1 == 1]
    xtest_fe = test_data['x'][a == 0]
    xtest_ma = test_data['x'][a == 1]
    ytrain_fe = train_data['y'][a1 == 0]
    ytrain_ma = train_data['y'][a1 == 1]
    ytest_fe = test_data['y'][a == 0]
    ytest_ma = test_data['y'][a == 1]
    return ytrain_fe, ytrain_ma, ytest_fe, ytest_ma, xtrain_fe, xtrain_ma, xtest_fe, xtest_ma
def q3():
    data_train = load_data(False)
    xtrain = data_train['x']
    ytrain = data_train['y']
    atrain = data_train['a']
    data_test = load_data(True)
    xtest = data_test['x']
    ytest = data_test['y']
    atest = data_test['a']
    xtrain = torch.tensor(xtrain).float()
    ytrain = torch.tensor(ytrain[:, None]).float()
    atrain = torch.tensor(atrain[:, None]).float()
    xtest = torch.tensor(xtest).float()
    ytest = torch.tensor(ytest[:, None]).float()
    atest = torch.tensor(atest[:, None]).float()
    accuracy = []
    d_p = []
    alphas = [.01, .1, 1, 10, 100]
    for alpha in alphas:
        print('Alpha :', alpha)
        features_extractor, classifers = NN_mmd(xtrain, ytrain, atrain, alpha)
        acc = accuracy_(classifers, xtest, ytest)
        accuracy.append(acc)
        print('acc for MMD NN: ', acc)
        delta_dp = dp(atrain.int().numpy().ravel(),
                      (classifers(xtrain) > 0).numpy().ravel())
        d_p.append(delta_dp)
        print('dp for features:', delta_dp)
    print('accuracy ', accuracy)
    print('alpha', alphas)
    print('d_p', d_p)
def execute_compemploy():
    search_term = 'warehouse&20philadelphia'
    search_type = 'lab'
    scrape.read_rss_and_load(search_type, search_term, cf['data_dir'])
    df = parse.create_df(cf['data_dir'], search_term)
    df_valid = parse.get_valid_texts(df)
    Session = load.bind_to_database(cf['postgres_username'],
                                    cf['postgres_password'],
                                    cf['postgres_db'])
    load.load_data(Session, df_valid)
    send.send_from_database(Session)
def main(main_config_fpath='../data/example/main_config.cfg'):
    '''Get user-specified information from main_config.cfg'''
    cfg_parser = ConfigParser.SafeConfigParser()
    cfg_parser.readfp(open(main_config_fpath, 'r'))

    # get directory paths
    data_dir = add_pathsep(cfg_parser.get('general', 'data_dir'))
    downsample_dir = data_dir[0:-1] + "_downsampled" + os.sep
    preprocess_dir = data_dir[0:-1] + "_preprocessed" + os.sep
    ttv_list = ['training' + os.sep, 'validation' + os.sep, 'test' + os.sep]

    # ensure directories exist
    if not os.path.isdir(data_dir):
        sys.exit("Specified data directory " + data_dir + " does not exist.")
    for ttv in ttv_list if is_labeled(data_dir) else ['']:
        if not os.path.isdir(downsample_dir + ttv):
            os.makedirs(downsample_dir + ttv)
        if not os.path.isdir(preprocess_dir + ttv):
            os.makedirs(preprocess_dir + ttv)

    # get remaining preprocessing parameters
    img_width = cfg_parser.getint('general', 'img_width')
    img_height = cfg_parser.getint('general', 'img_height')
    mean_proj_bins = cfg_parser.getint('preprocessing', 'mean_proj_bin')
    max_proj_bins = cfg_parser.getint('preprocessing', 'max_proj_bin')
    new_time_depth = cfg_parser.getint('preprocessing', 'time_equalize')
    upper_contrast = cfg_parser.getfloat('preprocessing', 'upper_contrast')
    lower_contrast = cfg_parser.getfloat('preprocessing', 'lower_contrast')
    centroid_radius = cfg_parser.getint('preprocessing', 'centroid_radius')

    # run preprocessing
    for ttv in ttv_list if is_labeled(data_dir) else ['']:
        if cfg_parser.getboolean('general', 'do_downsample'):
            downsample(data_dir + ttv, downsample_dir + ttv, img_width,
                       img_height, mean_proj_bins, max_proj_bins)
            time_equalize(downsample_dir + ttv, downsample_dir + ttv,
                          img_width, img_height, new_time_depth)
        else:
            time_equalize(data_dir + ttv, downsample_dir + ttv,
                          img_width, img_height, new_time_depth)
        if is_labeled(data_dir):
            stks, rois, file_names = load_data(downsample_dir + ttv,
                                               img_width, img_height)
            stks = improve_contrast(stks, upper_contrast, lower_contrast)
            rois = get_centroids(rois, centroid_radius, img_width, img_height)
            save_image_tifs(stks, file_names, preprocess_dir + ttv)
            save_roi_tifs(rois, file_names, preprocess_dir + ttv)
        else:
            stks, file_names = load_data(downsample_dir + ttv, img_width,
                                         img_height, no_rois=True)
            stks = improve_contrast(stks, upper_contrast, lower_contrast)
            save_image_tifs(stks, file_names, preprocess_dir + ttv)
def execute_compemploy():
    search = cf['searches']
    for search_type in search.keys():
        for search_term in search[search_type]:
            scrape.read_rss_and_load(search_type, search_term, cf['data_dir'])
            df = parse.create_df(cf['data_dir'], search_term)
            df_valid = parse.get_valid_texts(df)
            Session = load.bind_to_database(cf['postgres_username'],
                                            cf['postgres_password'],
                                            cf['postgres_db'])
            load.load_data(Session, df_valid)
            send.send_from_database(Session)
def get_data():
    test_data = load_data(True)
    train_data = load_data(False)
    x = train_data['x']
    columns_rm = [
        get_names().index('sex_Female'),
        get_names().index('sex_Male')
    ]
    indx = [j for j in range(len(x.T)) if j not in columns_rm]
    train_data['x'] = train_data['x'][:, indx]
    test_data['x'] = test_data['x'][:, indx]
    return train_data['x'], test_data['x'], train_data['y'], test_data['y'], \
        train_data['a'], test_data['a']
def test_load(self):
    # create test log files instead so we can delete
    self.datelog = open('testimport.log', "w")
    self.datelog_fn = 'testimport.log'
    # run load_data
    load_data(self.db_cursor, self.db_connection, self.datelog_fn)
    # run some test queries to make sure we have what we expect
    self.db_cursor.execute("SELECT * FROM fileformat1")
    assert len(self.db_cursor.fetchall()) == 3
    self.db_cursor.execute("SELECT * FROM fileformat1 WHERE valid=True")
    assert len(self.db_cursor.fetchall()) == 2
    self.db_cursor.execute("SELECT * FROM fileformat1 WHERE name='Corey'")
    assert len(self.db_cursor.fetchall()) == 0
def merge(files, fill_na):
    df = load.load_data(PREFIX + files[0])
    for file in files[1:]:
        file_df = load.load_data(PREFIX + file)
        # Dropping stars since this is how we will judge our accuracy
        if file_df.get('stars') is not None:
            file_df.drop('stars', axis=1, inplace=True)
        df = df.merge(file_df, on='business_id', how='outer')
    if fill_na is not None:
        df.fillna(fill_na, inplace=True)
    stars = df.stars.copy()
    df.drop('stars', axis=1, inplace=True)
    return df, stars
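# Hypothetical usage of merge() above; the filenames are placeholders and PREFIX is
# assumed to point at the directory containing the per-feature tables keyed by
# business_id:
#
#     features, stars = merge(["attributes.csv", "checkins.csv"], fill_na=0)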
def __init__(self, bot, config):
    super(Extension, self).__init__(bot, config)
    self.data = load.load_data('profile')
    self.register_commands('profile', 'verify')
    self.mention_regex = re.compile(r'<@!?(\d+)>')
    self.domain = config.get('domain', 'undertale.fandom.com')
    self.initialized = False
def run_simulation(args, sim_num=0, header=None):
    """Run ANN simulation"""
    # Always run verbosely (for now)
    args['verbose'] = True

    # Load training and test data
    training_ds, testing_ds = load_data(args)

    # Build and train feed-forward neural network
    trainer, ff_network = train(args, training_ds)

    # Initialize results output file with given or default header
    if header is None:
        header = ['hidden_neurons', 'learning_rate', 'max_epochs',
                  'activation', 'hits', 'mse']

    # Create results directory to hold simulation files if not existing
    if not os.path.exists('results'):
        os.makedirs('results')

    # Write table header to simulation file
    with open('results/simulation{}.txt'.format(sim_num), 'a') as sim_file:
        sim_file.write('{}\n'.format('|'.join(header)))

    # Use the trainer to evaluate the network on the training and test data
    evaluate(args, trainer, ff_network, training_ds, testing_ds, sim_num, header)
def load_datasets_from_file(filename, debug=False, read_size=100):
    """
    Function for loading the dataset before initializing the neural network and evaluating the model.
    If you get/build the dataset in fasta format beforehand, provide the filename as an argument when
    calling build.py. The provided filename is expected to be located in the media directory. If the
    filename is empty/not provided, all the needed params for the expected data loading are specified
    below. The filename is built from the md5 of the sorted genome IDs, the depth param, sample param,
    read_size param, onehot param and seed param. The file is saved in fasta format and zipped with gzip.

    :param filename: dataset filename, passed as an argument when calling build.py
    :param debug: if the flag for debug is present, run in debug mode (controlled seed, smaller taxonomy)
    :param read_size: input length
    :return: train and test datasets as well as number of classes
    """
    transmission_dict = {'A': [1, 0, 0, 0], 'T': [0, 1, 0, 0],
                         'C': [0, 0, 1, 0], 'G': [0, 0, 0, 1]}
    test = 0.2
    depth = 4
    sample = 0.2
    # read_size = 100
    onehot = True
    # taxonomy_el_count = 20 and seed = 0 for debug only
    if debug:
        seed = 0
        taxonomy_el_count = 20
    else:
        seed = random.randint(0, 4294967295)
        taxonomy_el_count = -1
    if not filename:
        filename = "%s_%d_%.3f_%d_%d_%d_%d%s" % (
            hashlib.md5(str(sorted(get_gids()))).hexdigest(), depth, sample,
            read_size, onehot, seed, taxonomy_el_count, ".fasta.gz")
    trX, teX, trY, teY, trteX, trteY, \
        num_of_classes, train_class_sizes = load_data(filename=filename,
                                                      test=test,
                                                      depth=depth,
                                                      read_size=read_size,
                                                      transmission_dict=transmission_dict,
                                                      sample=sample,
                                                      seed=seed,
                                                      taxonomy_el_count=taxonomy_el_count)
    return trX, teX, trY, teY, trteX, trteY, num_of_classes, train_class_sizes
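# Usage sketch for load_datasets_from_file() above (the empty filename and debug=True are
# illustrative: with an empty filename the function derives the dataset name itself from
# the genome IDs and the sampling parameters):
#
#     trX, teX, trY, teY, trteX, trteY, n_classes, class_sizes = \
#         load_datasets_from_file("", debug=True, read_size=100)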
def scrape_world_select():
    dt = datetime.now()
    response = get_osrs_world_select()
    status_code = response.status_code
    if response.ok:
        world_data, total_player_data = extract_data(response)
        world_data, total_player_count = transform_data(
            world_data, total_player_data, dt)
        load_data(world_data, total_player_count)
    else:
        print('Bad Response - HTTP', status_code)
    update_logs(dt, status_code)
def main():
    six.print_('loading data')
    train_x, train_y, val_x, val_y = load_data()
    train_x = train_x.reshape(-1, 64 * 64)
    val_x = val_x.reshape(-1, 64 * 64)
    six.print_('load data complete')

    six.print_('start PCA')
    try:
        pca = pickle.load(open('pca.pickle', 'rb'))
    except Exception:
        pca = decomposition.PCA(n_components=8 * 8)
        pca.fit(train_x[:])
    train_x = pca.transform(train_x)
    six.print_('PCA complete')

    clf = SVC(C=0.0001, kernel='linear', verbose=True, max_iter=100)
    six.print_('start training')
    clf.fit(train_x, train_y)
    six.print_('training complete')

    val_x = pca.transform(val_x)
    acc = sum(val_y == clf.predict(val_x)) / float(len(val_y))
    print(acc)

    pickle.dump(pca, open('pca.pickle', 'wb'))
    pickle.dump(clf, open('svm.pickle', 'wb'))
def __init__(self, args):
    self.train_data, self.test_data, self.valid_data, self.mapping = load_data(
        args.data_dir)
    self.learning_rate = args.learning_rate
    self.epochs = args.epochs
    use_gpu = args.gpu
    hidden_units = args.hidden_units
    self.architecture = args.arch
    use_dropout = args.dropout
    dropout_ratio = args.dropout_ratio
    base_model = load_base_model(self.architecture)
    features_in = get_classifier_inputs_number(base_model)
    classifier = Classifier(features_in=features_in,
                            hidden_units=hidden_units,
                            use_dropout=use_dropout,
                            dropout_ratio=dropout_ratio)
    self.save_dir = args.save_dir
    self.model = get_model(base_model, classifier)
    cuda_is_available = torch.cuda.is_available()
    if not cuda_is_available:
        print("Cuda is not available. Only the CPU will be used")
    self.device = torch.device(
        "cuda:0" if cuda_is_available and use_gpu else "cpu")
def main():
    parser = argparse.ArgumentParser(description='Train a neural network')
    parser.add_argument('--model', type=str)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--decay', type=float, default=1e-4)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--batch', type=int, default=128)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--output', type=str, default='weight')
    args = parser.parse_args()

    model = importlib.import_module(args.model).build()

    six.print_('loading data')
    (train_x, train_y, val_x, val_y) = load_data()
    six.print_('load data complete')

    sgd = SGD(lr=args.lr, decay=args.decay, momentum=args.momentum, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=sgd)
    six.print_('build model complete')

    six.print_('start training')
    model.fit(train_x, train_y, batch_size=args.batch, nb_epoch=args.epoch,
              verbose=2, show_accuracy=True, shuffle=True,
              validation_data=(val_x, val_y))
    model.save_weights(args.output + '.hdf5')
def score_labeled_data(postprocess_dir, data_dir, img_width, img_height):
    categories = ["training/", "validation/", "test/"]
    for c in categories:
        ground_truth_rois, filenames = load.load_data(data_dir + c, img_width,
                                                      img_height, rois_only=True)
        rois = defaultdict(lambda: [None, None])
        for i, r in enumerate(ground_truth_rois):
            rois[filenames[i]][0] = r
        for f in os.listdir(postprocess_dir + c):
            filename = os.path.splitext(os.path.basename(f))[0]
            if f.endswith('.npz'):
                rois[filename][1] = np.load(postprocess_dir + c + f)['rois']
        files_to_remove = []
        for f in rois:
            if rois[f][0] is None:
                print "Unable to score " + f + " : missing ground truth data"
                files_to_remove.append(f)
            elif rois[f][1] is None:
                print "Unable to score " + f + " : missing convnet data"
                files_to_remove.append(f)
        for f in files_to_remove:
            rois.pop(f)
        ground_truth_rois, convnet_rois = zip(*rois.values())
        score = Score(ground_truth_rois, convnet_rois)
        with open(postprocess_dir + c + "score.txt", 'w') as score_file:
            score_file.write(str(score))
def __init__(self, bot, config):
    super(Extension, self).__init__(bot, config)
    self.data = load.load_data('cvn')
    self.register_commands('links', 'cancelprocess')
    self.confirm = False
    self.loop = asyncio.new_event_loop()
    self.namespaces = list(range(0, 16)) + [110, 111, 502, 503, 828, 1201, 2001]
def compute_histogram_database(vocabulary, max_im=None):
    res = []
    gen = load_data()
    for i, (im, mask) in enumerate(gen):
        if max_im and max_im < i:
            break
        res.append(compute_histogram(im, mask, vocabulary))
    return np.array(res)
def co_with_y():
    data = load_data(False)
    x, y = data['x'], data['y']
    corr = []
    for i in x.T:
        corr.append(abs(stats.pearsonr(i, y.reshape(-1))[0]))
    sorting = np.argsort(corr)[-10:]
    for j in sorting:
        print(get_names()[j])
def main():
    business_reviews = load_data()
    sample_business = business_reviews[0]

    # Let's look at the first business just to get an idea of what we're
    # working with...
    print sample_business['reviews'][0]
    print sample_business['categories']
    print sample_business['name']
def removed_columns():
    test_data = load_data(True)
    train_data = load_data(False)
    x, a = test_data['x'], test_data['a']
    columns_rm = [
        get_names().index('sex_Female'),
        get_names().index('sex_Male')
    ]
    indx = [j for j in range(len(x.T)) if j not in columns_rm]
    train_data['x'] = train_data['x'][:, indx]
    test_data['x'] = test_data['x'][:, indx]
    clf = LogisticRegression(C=1000).fit(train_data['x'],
                                         train_data['a'].reshape(-1))
    print(clf.score(test_data['x'], test_data['a'].reshape(-1)))
    print(
        re_accuracy(test_data['a'].reshape(-1), clf.predict(test_data['x']),
                    test_data['a'].reshape(-1)))
def test_load_test(self):
    f = open("test.txt")
    wd, pd, ctd, etd = load.load_data(f)
    f.close()
    self.assertEqual(wd, [["Peter", "Blackburn"], ["1966", "World", "Cup"]])
    self.assertEqual(pd, [["NNP", "NNP"], ["CD", "NNP", "NNP"]])
    self.assertEqual(ctd, [["I-NP", "I-NP"], ["I-NP", "I-NP", "I-NP"]])
def load_test_dataset():
    print("Loading unlabeled dataset")
    test_df = read_labels(DATA_DIR / 'test.txt', column_names=['name'])
    # Index by name to make sure pandas doesn't
    # add an extra column in the submission
    test_df = test_df.set_index('name')
    return test_df, load_data(DATA_DIR / 'test' / 'test', test_df.index)
def main():
    device = torch.device(args.device)
    # Load the train and dev data into loaders, together with the label encoder of the classes
    train_loader, dev_loader, label_encoder = load_data(
        args.train_path, args.dev_path, args.batch_size, args.tokens_column,
        args.predict_column, args.lang_model_name, args.max_len,
        args.separator, args.pad_label, args.null_label, device)
    train(train_loader, dev_loader, label_encoder, device)
def main():
    business_reviews = load_data()
    sample_business = business_reviews[0]

    # Let's look at the first business just to get an idea of what we're
    # working with...
    print '\n'
    print 'First review: {0}'.format(sample_business['reviews'][0])
    print 'Categories: {0}'.format(sample_business['categories'])

    summary = interesting_words(sample_business['reviews'])
    print '\nReviews summary: {0}'.format(summary)
def predict_corr():
    test_data = load_data(True)
    train_data = load_data(False)
    x, a = test_data['x'], test_data['a']
    clf = LogisticRegression(C=1000).fit(train_data['x'],
                                         train_data['y'].reshape(-1))
    y_pred = clf.predict(test_data['x'])
    corr = []
    for i in x.T:
        replace_nan = abs(stats.pearsonr(i, y_pred)[0])
        if np.isnan(replace_nan):
            corr.append(0)
        else:
            corr.append(replace_nan)
    sorted_corr = np.argsort(corr)[-3:]
    print(sorted_corr)
    for j in sorted_corr:
        print(get_names()[j])
def testAll(args):
    word2idx, vectors = create_model(args)
    global idx2word
    idx2word = {b: a for a, b in word2idx.items()}
    print("> Loading trained model and Test")
    max_recall = load_model(args.model_dump)
    print(f"max_recall: {max_recall}")
    test_data = load_test_data(args, word2idx)
    with torch.no_grad():
        model.eval()
        dataset = load_data(args, word2idx, vectors)
        calculateRecall(dataset)
def trainInit(args):
    max_recall = 0
    word2idx, vectors = create_model(args)
    idx2word = {b: a for a, b in word2idx.items()}
    if args.model_load is not None:
        print("> Loading trained model and Train")
        max_recall = load_model(args.model_load)
    dataset = load_data(args, word2idx, vectors)
    objective = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([RATE])).to(device)
    return dataset, objective, word2idx, max_recall
def __init__(self, bot, config):
    super(Extension, self).__init__(bot, config)
    self.temp = load.load_data('report')
    self.register_commands('report', 'unreport', 'resolve', 'kocka')
    self.mw = mwclient.Site('vstf.wikia.com', path='/')
    self.mw.login(config['username'], config['password'])
    for t in ['w', 's', 'p', 'b']:
        if t not in self.temp:
            if t == 'w' or t == 'b':
                self.temp[t] = []
            else:
                self.temp[t] = {}
    self.message = False
def main():
    business_reviews = load_data()

    # Find the first Mexican business
    for biz in business_reviews:
        if 'Mexican' in biz['categories']:
            sample_business = biz
            break

    # Let's see how we classify the first business...
    result = naive_mexican_classifier(sample_business['reviews'])
    print '\n'
    print 'First review: {0}'.format(sample_business['reviews'][0])
    print 'Categories: {0}'.format(sample_business['categories'])
    print '\nClassified as Mexican?: {0}'.format(result)
def compute_cnn_features(ident_list):
    data = load.load_data(ident_list, 96, 96, color=True)
    N = data.shape[0]
    X = np.zeros((N, 3, 96, 96))
    for i in range(N):
        im = data[i, :].copy().reshape((96, 96, 3)).copy()
        # plt.imshow(im.astype(np.uint8))
        im[:, :, 0] -= 103.939
        im[:, :, 1] -= 116.779
        im[:, :, 2] -= 123.68
        im = im.transpose((2, 0, 1))
        im = np.expand_dims(im, axis=0)
        X[i, :, :, :] = im
    cnn_features = model.predict(X)
    return cnn_features
def prepare_data():
    X = load_data()
    features = X[:, 5:43]
    city_encoder = LabelEncoder()
    city_group_encoder = LabelEncoder()
    type_encoder = LabelEncoder()
    raw_city = city_encoder.fit_transform(X[:, 2:3].flatten())
    raw_city_group = city_group_encoder.fit_transform(X[:, 3:4].flatten())
    raw_type = type_encoder.fit_transform(X[:, 4:5].flatten())
    features = np.concatenate((np.array([raw_type]).T, features), axis=1)
    features = np.concatenate((np.array([raw_city_group]).T, features), axis=1)
    features = np.concatenate((np.array([raw_city]).T, features), axis=1)
    return train_test_split(features, X[:, 42:43].flatten(),
                            test_size=0.33, random_state=42)
def main(filename, station_filename, output_filename):
    times, station_ids, empty_slots, available_bikes = load_data(filename)
    sid2data = load_stations(station_filename)
    total_slots = np.add(empty_slots, available_bikes)
    # gen empty ratio
    empty_ratios = np.divide(empty_slots, total_slots)
    # gen available ratio
    available_bike_ratios = np.divide(available_bikes, total_slots)
    # gen hours
    hours = get_hour(times)
    # gen minutes
    minutes = get_minute(times, delta=15)

    # for each 15 minutes generate a snapshot
    data = []
    for h in range(24):
        for m in range(0, 60, 15):
            slot_station_ids = station_ids[(hours == h) & (minutes == m)]
            slot_available_bike_ratios = available_bike_ratios[(hours == h) & (minutes == m)]
            slot_data = {}
            slot_data['hour'] = h
            slot_data['minute'] = m
            slot_data['station'] = []
            for station_id in range(1, 119 + 1):
                mean_available_bike_ratio = np.mean(
                    slot_available_bike_ratios[slot_station_ids == station_id])
                slot_data['station'].append({
                    'bike_ratio': mean_available_bike_ratio,
                    'name': sid2data[station_id]['name'],
                    'e_name': sid2data[station_id]['e_name'],
                    'lat': sid2data[station_id]['lat'],
                    'lng': sid2data[station_id]['lng'],
                    'sid': station_id
                })
            data.append(slot_data)

    # print out
    outf = open(output_filename, 'w')
    print >> outf, json.dumps(data)
    outf.close()
def main(): filename = "../ubike_record.09_13_09_26.csv" # filename = "../../ubike_record.csv" times, station_ids, empty_slots, available_bike = load_data(filename) # fetch Taipei city goverment data start_t = mdates.strpdate2num("%Y-%m-%d %H:%M:%S")("2013-09-15 00:00:00") end_t = mdates.strpdate2num("%Y-%m-%d %H:%M:%S")("2013-09-16 00:00:00") station_ids = station_ids[times < end_t] empty_slots = empty_slots[times < end_t] times = times[times < end_t] station_ids = station_ids[start_t < times] empty_slots = empty_slots[start_t < times] times = times[start_t < times] station_id = 4 times = times[station_ids == station_id] empty_slots = empty_slots[station_ids == station_id] available_bike = available_bike[station_ids == station_id] # plot plot(times, available_bike)
import tensorflow as tf
from load import load_data
import pickle

# Call load.py to load the data and run the preprocessing
# Load the data
title_count, title_set, genres2int, features, targets_values, ratings, users, \
    movies, data, movies_orig, users_orig = load_data()
# print(title_count)
# print(title_set)
# print(genres2int)
# print(features)
# print(targets_values)
# print(ratings)
# print(movies)
# print(movies_orig)

sentences_size = title_count  # length of the movie titles
embed_dim = 32                # dimension of the embedding matrix
num_epochs = 10               # number of training epochs
batch_size = 256              # batch size
dropout_keep = 0.5            # dropout keep probability
learning_rate = 0.0001        # learning rate
save_dir = './model/save'     # save path for the trained model

# user information
user_gender = {'M': 'Male', 'F': 'Female'}
user_age = {1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44",
            45: "45-49", 50: "50-55", 56: "56+"}
user_occupation = {0: "other", 1: "academic/educator", 2: "artist",
                   3: "clerical/admin", 4: "college/grad student",
                   5: "customer service", 6: "doctor/health care",
                   7: "executive/managerial", 8: "farmer",
                   9: "homemaker", 10: "K-12 student", 11: "lawyer",
import model
from imp import reload
reload(model)
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import os
import load

data = load.load_data("cifar10")
X = data[0]
img_depth = data[4]
img_x = data[5]

import Plots
print(X.shape)
Plots.plot_testimg(X[:10, :])

batch_size = 32
num_f1 = 50
num_f2 = 50
num_f3 = 50
f1 = (num_f1, img_depth, 11, 11)
f2 = (num_f2, num_f1, 2, 2)
f3 = (num_f3, num_f2, 3, 3)
filters = [f1]
def parameter_tuning(methodology, nfold, is_pca, is_testing, n_jobs, conf, n_estimator):
    drop_fields = []

    parser = ModelConfParser(conf)
    objective = parser.get_objective()
    cost = parser.get_cost()

    filepath_training, filepath_testing, filepath_submission, filepath_tuning = parser.get_filepaths(methodology)
    filepath_feature_importance, top = parser.get_feature_importance()
    filepath_feature_interaction, binsize, top_feature = parser.get_feature_interaction()

    for filepath in [filepath_tuning, filepath_submission]:
        create_folder(filepath)

    filepath_cache_1 = "{}/input/train.pkl".format(BASEPATH)
    train_x, test_x, train_y, test_id, _ = load_data(filepath_cache_1, filepath_training, filepath_testing, drop_fields)

    # NOTE: `values`, `counts` and `df_testing` are not defined in this snippet;
    # they are assumed to come from the surrounding module.
    pool = []
    for value, count in zip(values, counts):
        if count > 2:
            pool.append(value)

    idxs = train_y.isin(pool)
    train_x = train_x[idxs].values
    train_y = train_y[idxs].astype(str).values
    test_x = test_x.values
    test_id = df_testing["row_id"].values

    if filepath_feature_interaction:
        for layers, value in load_interaction_information(filepath_feature_interaction, str(top_feature)):
            for df in [train_x, test_x]:
                t = value
                breaking_layer = None
                for layer in layers:
                    if layer in train_x.columns:
                        t *= df[layer]
                    else:
                        breaking_layer = layer
                        break

                if breaking_layer is None:
                    df[";".join(layers)] = t
                else:
                    log("Skip {}".format(layers), WARN)
                    break

    if is_pca:
        train_x, test_x = pca(train_x, train_y.values, test_x)

    if is_testing:
        train_x = train_x.head(1000)
        train_y = train_y.head(1000)

    params = tuning(train_x, train_y, test_id, test_x, cost, objective,
                    filepath_feature_importance, filepath_tuning, filepath_submission,
                    methodology, nfold, top_feature,
                    n_estimator=n_estimator, thread=n_jobs)

    log("The final parameters are {}".format(params))
#!/usr/bin/python
import argparse
import load as Loader
import analyze as Analyzer

parser = argparse.ArgumentParser(description='Analyze scraped data.')
parser.add_argument('directory', metavar='dir', help='directory to process')
args = parser.parse_args()

data = Loader.load_data(args.directory)
print("==== Loaded Data =====")

Analyzer.histogram("hits.png", data["base"]["hits"], 25,
                   "Histogram of Number of Hits", "Number of Hits")
Analyzer.histogram("words.png", data["base"]["words"], 25,
                   "Histogram of Number of Words", "Number of Words")
Analyzer.scatter("hits-vs-words.png", data["base"]["words"], data["base"]["hits"],
                 "Histogram of Hits vs Words", "Number of Words", "Number of Hits")
Analyzer.scatter("bookmarks-vs-words.png", data["base"]["words"], data["base"]["bookmarks"],
                 "Histogram of Bookmarks vs Words", "Number of Words", "Number of Bookmarks")
# (fragment: the tail of get_params(img_x, filters, fc); the first two statements
#  presumably sit inside a loop over `filters` that is cut off above)
        filter_params.append(w)
        outshp = (outshp - f[2] + 1) / 2
    outshp = filters[-1][0] * outshp * outshp
    w = init_weights((outshp, fc[0]))
    fc_params.append(w)
    for i in range(len(fc) - 1):
        w = init_weights((fc[i], fc[i + 1]))
        fc_params.append(w)
    return filter_params, fc_params


trX, trY, teX, teY, channels, img_x = load.load_data("mnist")
# trX, trY, teX, teY, channels, img_x = load.load_data("cifar10")
img_y = img_x

X = T.ftensor4()
Y = T.fmatrix()

f1 = (10, channels, 7, 7)
f2 = (25, f1[0], 4, 4)
filters = [f1, f2]
fc = [500, trY.shape[1]]

filter_params, fc_params = get_params(img_x, filters, fc)
params = filter_params + fc_params
print(params)
def feature_engineer(conf, thread, feature_importance, interaction_information, merge_ii,
                     split_idx, split_num, testing, combinations_size):
    drop_fields = []
    transform2 = True

    parser = ModelConfParser(conf)
    BASEPATH = parser.get_workspace()
    binsize, top = parser.get_interaction_information()
    top_feature = parser.get_top_feature()

    if feature_importance:
        filepath_training = "{}/input/train.csv".format(BASEPATH)
        filepath_testing = "{}/input/test.csv".format(BASEPATH)
        filepath_cache_1 = "{}/input/train.pkl".format(BASEPATH)
        folder_ii = "{}/input/interaction_information/transform2=True_testing=-1_binsize={}".format(BASEPATH, binsize)
        folder_feature = "{}/etc/feature_profile/transform2=True_binsize={}_top={}".format(BASEPATH, binsize, top)

        train_x, test_x, train_y, test_id, train_id = load_data(filepath_cache_1, filepath_training, filepath_testing, drop_fields)
        columns = train_x.columns

        for layers, value in load_interaction_information(folder_ii, threshold=top):
            for df in [train_x, test_x]:
                t = value
                breaking_layer = None
                for layer in layers:
                    if layer in columns:
                        t *= df[layer].values
                    else:
                        breaking_layer = layer
                        break

                if breaking_layer is None:
                    df[";".join(layers)] = t
                else:
                    log("Skip {} due to {} not in columns".format(layers, breaking_layer), WARN)
                    break

        names = train_x.columns
        print "Data Distribution is ({}, {}), and then the number of feature is {}".format(np.sum(train_y == 0), np.sum(train_y == 1), len(names))

        fp = FeatureProfile()
        fp.profile(train_x.values, train_y, names, folder_feature, int(min(512, len(names))))

    if interaction_information:
        log("Try to calculate the interaction information", INFO)

        filepath_training = "{}/input/train.csv".format(BASEPATH)
        filepath_testing = "{}/input/test.csv".format(BASEPATH)

        train_x, test_x, train_y, id_train, id_test = None, None, None, None, None
        if transform2:
            train_x, test_x, train_y, id_train, id_test = data_transform_2(filepath_training, filepath_testing, keep_nan=True)
        else:
            train_x, train_y, test_x, test_id = data_load(drop_fields=drop_fields)

        filepath_cache = "{}/input/transform2={}_binsize={}_cache.pkl".format(BASEPATH, transform2, binsize)
        folder_couple = "{}/input/interaction_information/transform2={}_testing={}_binsize={}".format(BASEPATH, transform2, testing, binsize)

        results_couple = feature_engineering.calculate_interaction_information(
            filepath_cache, train_x, train_y, folder_couple,
            binsize=binsize, nthread=thread, combinations_size=combinations_size,
            n_split_idx=split_idx, n_split_num=split_num,
            is_testing=int(testing) if testing > 0 else None)

    if merge_ii:
        folder_couple = "{}/input/interaction_information/transform2={}_testing={}_binsize={}".format(BASEPATH, transform2, testing, binsize)
        count_filepath, count_couple, final_count_filepath, final_count_couple = feature_engineering.merge_interaction_information(folder_couple)
        log("Originally, we have {} records in {} files. After merging, we have {} records in {} files".format(count_couple, count_filepath, final_count_couple, final_count_filepath), INFO)
import numpy as np
from sklearn.externals.joblib import Memory

from load import load_data
from descriptors import compute_boundary_desc, get_interest_points

NUM_IMAGES = 300

mem = Memory(cachedir='.')
gen = load_data(test=True)
descriptors = []

print "Compute descriptors"
d = 0
for i, (im, mask) in enumerate(gen):
    if i % 10 == 0:
        print "Computed %d images" % i
    if NUM_IMAGES is not None and i == NUM_IMAGES:
        break

    interest_points = mem.cache(get_interest_points)(mask)
    descriptor, coords = mem.cache(compute_boundary_desc)(im, mask, interest_points)
    for element in descriptor:
        descriptors.append(element)

# Let's dump descriptors to not recompute them later
descriptors = np.array(descriptors)
descriptors.dump('./data/descriptors.npy')
import sys
import subprocess
import os
import random

from load import load_data, load_emb, load_veclist
from tree_blstm import model
from accuracy import conlleval
from tools import shuffle

if __name__ == '__main__':
    s = {'seed': 345, 'epoch': 20, 'lr': 0.01, 'decay': 0.95, 'hnum': 100,
         'dnum': 340, 'ynum': 2, 'wnum': 6206, 'L2': 0.000001, 'me': 50,
         'md': 50, 'mx': 50, 'Wlnum': 13, 'Wrnum': 297, 'kalpha': 0.2}

    print 'load train data'
    train_e = load_data("data/train_e.txt")
    train_d = load_data("data/train_d.txt")
    train_l = load_data("data/train_l.txt")
    train_s = load_data("data/train_s.txt")
    train_tl = load_veclist("data/train_tl.txt")
    train_tr = load_veclist("data/train_tr.txt")
    train_ta = load_veclist("data/train_ta.txt")
    train_y = load_data("data/train_y.txt")

    print 'load test data'
    test_e = load_data("data/test_e.txt")
    test_d = load_data("data/test_d.txt")
    test_l = load_data("data/test_l.txt")
    test_s = load_data("data/test_s.txt")
    test_tl = load_veclist("data/test_tl.txt")
    test_tr = load_veclist("data/test_tr.txt")
import numpy as np
from sklearn.externals.joblib import Memory

from load import load_data
from descriptors import compute_boundary_desc, get_interest_points
from histograms import compute_visual_words

NUM_IMAGES = None

mem = Memory(cachedir='.')
vocabulary = np.load('./data/vocabulary.npy')
gen = load_data()
res = []

# FIXME needs to lookup the number of images
postings = np.zeros((len(vocabulary), 3170))

for i, (im, mask) in enumerate(gen):
    if i % 10 == 0:
        print "computed %d images" % i
    if NUM_IMAGES is not None and i == NUM_IMAGES:
        break

    interest_points = mem.cache(get_interest_points)(mask)
    descriptor, coords = mem.cache(compute_boundary_desc)(im, mask, interest_points)
    vw = compute_visual_words(descriptor, vocabulary)
    if vw is not None:
def train(args):
    """
    This function trains the dietnetwork using the histogram embedding.
    The idea is to use batch learning from numpy files using basic dict_feed
    (not much improvement in time).

    Args:
        - args.path: path to the data dir which contains train/val/test
        - args.learning_rate: learning rate for the optimizer
        - args.sum_dir: summary directory
        - args.num_epoch: ...
        - args.batchsize: ...
    """
    # load the data (note: already preshuffled)
    trainX, trainY, validX, validY, testX, testY = load_data(args.path)
    trainX = np.array(trainX).astype(np.float32)
    trainY = np.array(trainY).astype(np.float32)
    validX = np.array(validX).astype(np.float32)
    validY = np.array(validY).astype(np.float32)
    testX = np.array(testX).astype(np.float32)
    testY = np.array(testY).astype(np.float32)
    val_len = np.shape(validX)[0]
    test_len = np.shape(testX)[0]

    # get dietnet input values:
    input_dim = np.shape(trainX)[1]
    output_dim = np.shape(trainY)[1]
    embed_size = input_dim

    ############### Get hyperparameter grid and iterate ###############
    grid_list, grid_string = grid()
    for hparam, hparam_list in zip(grid_list, grid_string):
        tf.reset_default_graph()
        print("new combination")

        # Begin loop for each parameter combination
        # build the graph:
        loss, accuracy = dietnet(path=args.path,
                                 input_size=input_dim,
                                 output_size=output_dim,
                                 dropout_rate=args.dropout_rate,
                                 embed_size=embed_size,
                                 hidden_size=100,
                                 gamma=args.gamma,
                                 w_init=hparam['w_init_dist'],
                                 activ_fun=hparam['act_funs'])

        # final ops: accuracy, loss, optimizer:
        optimizer = hparam['optims']
        training_op = slim.learning.create_train_op(loss, optimizer,
                                                    # summarize_gradients=True,
                                                    clip_gradient_norm=10)

        # Summary stuff: get the train/valid/test loss and accuracy
        test_acc_summary = tf.summary.scalar('test_accuracy', accuracy, collections=['test'])
        valid_acc_summary = tf.summary.scalar('valid_accuracy', accuracy, collections=['valid'])
        train_acc_summary = tf.summary.scalar('train_accuracy', accuracy, collections=['train'])
        test_loss_summary = tf.summary.scalar('test_loss', loss, collections=['test'])
        valid_loss_summary = tf.summary.scalar('valid_loss', loss, collections=['valid'])
        train_loss_summary = tf.summary.scalar('train_loss', loss, collections=['train'])

        # separates the summaries according to the collection
        train_ops = tf.summary.merge_all('train')
        valid_ops = tf.summary.merge_all('valid')
        test_ops = tf.summary.merge_all('test')

        with tf.Session() as sess:
            # init variables
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            # print out all trainable variables
            # print([i for i in tf.trainable_variables()])

            # saver for summary
            swriter = tf.summary.FileWriter(args.sum_dir + hparam_list, sess.graph)

            step = 0
            try:
                for i in range(args.num_epoch):
                    for idx in range(int(np.shape(trainX)[0] / args.batchsize)):
                        # prep data for train:
                        a, b = idx * args.batchsize, (idx + 1) * args.batchsize
                        batch_x = trainX[a:b, :]
                        batch_y = trainY[a:b, :]

                        # get time
                        start_time = time.time()

                        # run train op and get train loss
                        trainloss, accur, summaries = sess.run(
                            [training_op, accuracy, train_ops],
                            feed_dict={'inputs:0': batch_x,
                                       'outputs:0': batch_y,
                                       'is_training:0': True})

                        # add summaries only every 25 steps to save memory
                        if not idx % 25:
                            swriter.add_summary(summaries, step)

                        duration = time.time() - start_time

                        # every 25 steps get validation and test loss/accuracy
                        if not idx % 25:
                            # sample a random batch from valid/test for evaluation
                            val_ind = [i for i in random.sample(xrange(val_len), args.batchsize)]
                            test_ind = [i for i in random.sample(xrange(test_len), args.batchsize)]
                            val_x = validX[val_ind, :]
                            val_y = validY[val_ind, :]
                            test_x = testX[test_ind, :]
                            test_y = testY[test_ind, :]

                            # get val loss/accur:
                            val_loss, accur_valid, summaries = sess.run(
                                [loss, accuracy, valid_ops],
                                feed_dict={'inputs:0': val_x,
                                           'outputs:0': val_y,
                                           'is_training:0': False})
                            swriter.add_summary(summaries, step)

                            # get test loss/accur
                            test_loss, accur_test, summaries = sess.run(
                                [loss, accuracy, test_ops],
                                feed_dict={'inputs:0': test_x,
                                           'outputs:0': test_y,
                                           'is_training:0': False})
                            swriter.add_summary(summaries, step)

                            # print to console in order to watch:
                            print('step {:d}-train/v/test acc:={:.3f},{:.3f},{:.3f}'.format(
                                step, accur, accur_valid, accur_test))

                        step += 1

                # add checkpoint here:...
                # if num_epochs is complete close swriter
                swriter.close()
            finally:
                swriter.close()
def learning(conf, thread, is_testing):
    drop_fields = []

    parser = ModelConfParser(conf)
    BASEPATH = parser.get_workspace()
    objective = parser.get_objective()
    binsize, top = parser.get_interaction_information()
    cost = parser.get_cost()
    nfold = parser.get_nfold()
    top_feature = parser.get_top_feature()

    filepath_training = "{}/input/train.csv".format(BASEPATH)
    filepath_testing = "{}/input/test.csv".format(BASEPATH)
    filepath_cache_1 = "{}/input/train.pkl".format(BASEPATH)
    folder_ii = "{}/input/interaction_information/transform2=True_testing=-1_binsize={}".format(BASEPATH, binsize)
    filepath_feature_importance = "{}/etc/feature_profile/transform2=True_binsize={}_top={}.pkl".format(BASEPATH, binsize, top)

    train_x, test_x, train_y, test_id, train_id = load_data(filepath_cache_1, filepath_training, filepath_testing, drop_fields)

    if is_testing:
        train_x = train_x.head(1000)
        train_y = train_y.head(1000)

    basic_columns = train_x.columns

    for layers, value in load_interaction_information(folder_ii, threshold=str(top_feature)):
        for df in [train_x, test_x]:
            t = value
            breaking_layer = None
            for layer in layers:
                if layer in basic_columns:
                    t *= df[layer].values
                else:
                    breaking_layer = layer
                    break

            if breaking_layer is None:
                df[";".join(layers)] = t
            else:
                log("Skip {} due to {} not in columns".format(layers, breaking_layer), WARN)
                break

    ii_columns = train_x.columns
    importance_columns = load_feature_importance(filepath_feature_importance, top_feature)

    predictors = {"basic": basic_columns,
                  "interaction-information-3": [column for column in ii_columns if column.count(";") == 1],
                  "interaction-information-4": [column for column in ii_columns if column.count(";") == 2],
                  "feature-importance": importance_columns}

    train_y = train_y.values
    test_id = test_id.values
    train_Y = train_y.astype(float)

    layer1_models, layer2_models, last_model = [], [], []
    data_dimension = []

    # Init the parameters of deep learning
    checkpointer = KaggleCheckpoint(filepath="{epoch}.weights.hdf5",
                                    training_set=([train_x], train_Y),
                                    testing_set=([test_x], test_id),
                                    folder=None,
                                    cost_string=cost,
                                    verbose=0,
                                    save_best_only=True,
                                    save_training_dataset=False)

    # Init the parameters of cluster
    for idx, layer_models in enumerate([layer1_models, layer2_models, last_model]):
        data_dimension.append([])

        for model_section in parser.get_layer_models(idx + 1):
            for method, setting in parser.get_model_setting(model_section):
                if method.find("deep") > -1:
                    setting["folder"] = None

                    if "data_dimension" in setting:
                        if setting["data_dimension"] == "basic":
                            setting["input_dims"] = len(basic_columns)
                        elif setting["data_dimension"] == "importance":
                            setting["input_dims"] = len(importance_columns)
                        elif setting["data_dimension"].find("interaction-information") != -1:
                            setting["input_dims"] = top_feature
                        else:
                            log("Wrong Setting for input_dims because the data_dimension is {}".format(setting["data_dimension"]), ERROR)
                            sys.exit(100)

                        data_dimension[idx].append(setting["data_dimension"])
                    else:
                        log("Not found data_dimension in LAYER{}".format(idx + 1), INFO)
                        data_dimension[idx].append("all")

                    setting["callbacks"] = [checkpointer]
                    setting["number_of_layer"] = setting.pop("layer_number")
                else:
                    if "data_dimension" in setting:
                        data_dimension[idx].append(setting["data_dimension"])
                    else:
                        data_dimension[idx].append("all")

                layer_models.append((method, setting))

                log("Get the configuration of {} from {}".format(method, conf), INFO)
                log("The setting is {}".format(setting), INFO)

    folder_model = "{}/prediction_model/ensemble_learning/conf={}_is_testing={}_nfold={}_layer1={}_layer2={}_binsize={}_top={}".format(
        BASEPATH, os.path.basename(conf), is_testing, nfold, len(layer1_models), len(layer2_models), binsize, top_feature)
    folder_middle = "{}/etc/middle_layer/is_testing={}_nfold={}_binsize={}_top={}".format(
        BASEPATH, is_testing, nfold, binsize, top_feature)

    if is_testing and os.path.isdir(folder_model):
        log("Due to the testing mode, remove the {} firstly".format(folder_model), INFO)
        shutil.rmtree(folder_model)

    folder_submission = "{}/submission".format(folder_model)
    create_folder(folder_submission + "/dummy.txt")

    filepath_training = "{}/training_proba_tracking.csv".format(folder_model)
    filepath_testing = "{}/testing_proba_tracking.csv".format(folder_model)

    previous_training_dataset, previous_testing_dataset = train_x, test_x
    prediction_testing_history, prediction_training_history, learning_loss_history = {"ID": test_id}, {"target": train_Y}, []

    # Model Training
    m = [layer1_models, layer2_models, last_model]
    for idx, models in enumerate(m):
        filepath_queue = "{}/layer{}_queue.pkl".format(folder_model, idx + 1)
        filepath_nfold = "{}/layer{}_nfold.pkl".format(folder_model, idx + 1)

        for idx_col, (method, setting) in enumerate(models):
            if method.find("deep") > -1:
                input_dims = -1
                models[idx_col][1]["input_dims"] = len(previous_training_dataset[0])

            if "auto_tuning" in setting and setting["auto_tuning"] == 1:
                filepath_tuning = "{}/etc/parameter_tuning/layer{}/method={}_testing={}_nfold={}_top={}_binsize={}_feature={}.pkl".format(
                    folder_model, idx + 1, method, is_testing, nfold, top, binsize, len(previous_training_dataset[0]))
                filepath_submission = "{}/etc/parameter_tuning/layer{}/method={}_binsize={}_top={}_feature={}.submission.csv".format(
                    folder_model, idx + 1, method, binsize, top, len(previous_training_dataset[0]))
                create_folder(filepath_tuning)

                log("Start the process of auto-tuning parameters for {}".format(method), INFO)
                params = tuning(previous_training_dataset, train_Y, test_id, previous_testing_dataset,
                                cost, None, filepath_tuning, filepath_submission, method, nfold,
                                top_feature, binsize, thread=parser.get_n_jobs())
                log("The final parameters of layer{}-{} are {}".format(idx + 1, method, params), INFO)

                for k, v in zip(["max_iter", "n_estimators", "learning_rate"],
                                [parser.get_n_estimators(), parser.get_n_estimators(), parser.get_learning_rate()]):
                    if k in params:
                        params[k] = v

                models[idx_col][1].update(params)

            if "auto_tuning" in setting:
                models[idx_col][1].pop("auto_tuning")

        layer_train_x, layer_test_x, learning_loss = layer_model(
            objective, folder_model, folder_middle, predictors,
            previous_training_dataset, train_Y, previous_testing_dataset, models,
            filepath_queue, filepath_nfold,
            n_folds=nfold, cost_string=cost, number_of_thread=thread,
            saving_results=(True if (idx == 0 or method.find("deep") == -1) else False))

        learning_loss_history.append(learning_loss)

        col = layer_test_x.shape[1]
        for idx_col in range(0, col):
            submission = layer_test_x[:, idx_col]
            filepath_submission = "{}/layer={}_dimension={}_model={}_params={}.csv".format(
                folder_submission, idx + 1, data_dimension[idx][idx_col],
                models[idx_col][0], make_a_stamp(models[idx_col][1]))
            save_kaggle_submission({"ID": test_id, "Target": submission}, filepath_submission)

            prediction_training_history["layer={}_method={}_feature={}_params={}".format(
                idx + 1, models[idx_col][0], data_dimension[idx][idx_col],
                make_a_stamp(models[idx_col][1]))] = layer_train_x[:, idx_col]
            prediction_testing_history["layer={}_method={}_feature={}_params={}".format(
                idx + 1, models[idx_col][0], data_dimension[idx][idx_col],
                make_a_stamp(models[idx_col][1]))] = layer_test_x[:, idx_col]

        previous_training_dataset = layer_train_x
        previous_testing_dataset = layer_test_x

        log("Layer{} is done...".format(idx + 1), INFO)

    filepath_history_training_prediction = "{}/history_training.csv".format(folder_model)
    save_kaggle_submission(prediction_training_history, filepath_history_training_prediction)

    filepath_history_testing_prediction = "{}/history_testing.csv".format(folder_model)
    save_kaggle_submission(prediction_testing_history, filepath_history_testing_prediction)

    filepath_history_learning_loss = "{}/learning_loss.pkl".format(folder_model)
def main(argv):
    parser = argparse.ArgumentParser(description='Run test')
    parser.add_argument('-o', '--output', type=str, help='output directory')
    parser.add_argument('--db', nargs='+',
                        choices=('vista', 'vista1500', 'fantom', 'fantom_long', 'promoters'),
                        required=True)
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--saveclass', action='store_true',
                       help='train classifier on whole dataset and save to pickle')
    group.add_argument('--useclass', type=str, help='use classifier saved in file')
    parser.add_argument('-p', '--positives', choices=tissue_choices, required=True)
    parser.add_argument('-n', '--negatives', choices=tissue_choices, required=True)
    parser.add_argument('--boruta', action='store_true')
    parser.add_argument('--distinct', action='store_true')
    parser.add_argument('--histmods', type=str, default='',
                        help='histone modifications list file')
    parser.add_argument('--kmers', type=str, nargs='+', default='', help='kmers extension')
    #parser.add_argument('--usegc', action='store_true')
    args = parser.parse_args()

    if args.kmers is None and args.histmods is None:
        parser.error('Specify kmers or histmods')
    if len(args.db) == 1:
        args.db.append(args.db[0])
    if 'fantom' in args.db and ('positives' in args.positives or 'positives' in args.negatives):
        parser.error('Positives for FANTOM not defined')

    # prepare output directory
    outdir = RESULTSPATH + args.output
    try:
        maks = 0
        for i in xrange(100, 0, -1):
            if os.path.exists(outdir + '_' + str(i)):
                maks = i
                break
        #print 'moving to', outdir+str(maks+1)
        outdir += '_' + str(maks + 1)
        os.mkdir(outdir)
    except Exception:
        parser.error('Cannot create directory %s' % outdir)
    outdir += '/'

    # write report, redirect stdout to log file
    orig_stdout = sys.stdout
    outfile = open(outdir + 'log.txt', 'w')
    sys.stdout = outfile
    print args
    #print "cv_folds=%d, N_trees=%d, N_repeats=%d" % (shared.cv_folds, shared.N_trees, shared.N_repeats)
    print "cv_folds=%d, N_trees=%d, N_repeats=%d, usegc=%s" % (cv_folds, N_trees, N_repeats, USEGC)

    # load pos and neg data without balance
    datan = load_data(args.db[1], args.histmods, args.kmers, args.negatives, args.distinct)
    datap = load_data(args.db[0], args.histmods, args.kmers, args.positives, args.distinct)
    print "data sizes: %s %d %d, %s %d %d" % (args.positives, datap.shape[0], len(datap.dtype.names),
                                              args.negatives, datan.shape[0], len(datan.dtype.names))
    sys.stdout.flush()

    if args.boruta:
        boruta.start_boruta()

    # load classifier from pickle
    if args.useclass:
        auc = predict(datap, datan, args.useclass, outdir)
    # train new classifier
    else:
        if args.saveclass:
            name = args.db[0] + '-' + args.db[1] + "_" + args.positives + "_vs_" + args.negatives + "_" + args.histmods + "_" + str(args.kmers)
            auc = train_save(datap, datan, name, outdir)
        #else:
        name = args.positives + " vs " + args.negatives
        auc = train_cv(datap, datan, name, outdir, args.boruta)

    # finish
    # all options: DB, POS, NEG, kmers, hmods, ntrees, cv_folds (0 if no cv), used_class, auc
    if args.kmers == '':
        args.kmers = '-'
    if args.histmods == '':
        args.histmods = '-'
    summary = "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%s" % (args.db, args.positives, args.negatives,
                                                  args.kmers, args.histmods, args.distinct,
                                                  N_trees, USEGC)
    if not args.useclass and not args.saveclass:
        summary += "\t%d\t" % (cv_folds)
    elif args.useclass:
        summary += "\t0\t%s" % (args.useclass)
    else:
        summary += "\t0\t%s" % (args.saveclass)
    summary += "\t%f" % auc[0]
    summary += "\t%s" % time.ctime()
    summary += "\t%f" % auc[1]
    summary += "\t%s" % outdir
    summary += "\t%d\t%d" % (datap.shape[0], datan.shape[0])
    summary += "\n"
    print summary

    sys.stdout = orig_stdout
    outfile.close()

    summaryf = open(SUMMARYFILE, 'a+')
    summaryf.write(summary)
    summaryf.close()
# NOTE: `import load` and the standard TensorFlow alias are added here; the training
# helper imported as `th` is a project module whose import was missing from this fragment.
import sys
import tensorflow as tf

import load
import test

print("Init..")

model_name = "model.ckpt"
epochs = 2
if "e" in sys.argv:
    epochs = int(sys.argv[sys.argv.index("e") + 1])

files = []
files.append("mute")
files.append("volume")
files.append("channel")
print("Files: " + ", ".join(files))

print("Loading data..")
inputs, outputs, words = load.load_data(files)

if "t" in sys.argv:
    print("Setup train..")
    sess = tf.InteractiveSession()
    x, y, y_ = th.setup(len(words), len(files))
    train_step, writer, merged, accuracy = th.trainSetup(y, y_, sess)
    print("Train..")
    th.train(inputs, outputs, x, y_, train_step, sess, epochs, writer, merged, accuracy)
    print("Save..")
    th.save(sess, model_name)
else:
    print("Test..")
    test.test(model_name, words, files)