def download_data(params, station_list, with_waveforms, recording_time, padding_time):
    """Downloads data from IRIS."""
    data.download_data(params, station_list, with_waveforms, recording_time, padding_time)

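# Hypothetical usage sketch of the IRIS download wrapper above; the parameter
# dictionary, station codes, and time windows are placeholders for illustration,
# not values from the original project.
params = {"network": "IU", "channel": "BHZ"}
download_data(params,
              station_list=["ANMO", "COLA"],
              with_waveforms=True,
              recording_time=600,
              padding_time=60)
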
def build_validation_data_loader(self) -> DataLoader:
    if not self.data_downloaded:
        data.download_data(self.download_directory)
        self.data_downloaded = True

    corpus = data_util.Corpus(self.download_directory)
    test_dataset = data.PTBData(
        corpus.valid,
        self.context.get_hparam("seq_len"),
        self.context.get_hparam("eval_batch_size"),
        self.context.get_hparam("bptt"),
        self.context.get_hparam("max_seq_length_delta"),
    )
    return DataLoader(
        test_dataset,
        batch_sampler=data.BatchSamp(
            test_dataset,
            self.context.get_hparam("bptt"),
            self.context.get_hparam("max_seq_length_delta"),
            valid=True,
        ),
        collate_fn=data.PadSequence(),
    )

def download_dataset(self) -> None:
    task = self.context.get_data_config().get("task")
    path_to_mrpc = self.context.get_data_config().get("path_to_mrpc")
    if not self.context.get_data_config().get("download_data"):
        # Exit if you do not want to download data at all.
        return
    data.download_data(task, self.download_directory, path_to_mrpc)
    self.data_downloaded = True

def __init__(self, context: PyTorchTrialContext) -> None:
    self.context = context
    self.data_config = context.get_data_config()
    self.hparams = AttrDict(context.get_hparams())

    # Create a unique download directory for each rank so they don't overwrite each
    # other when doing distributed training.
    self.download_directory = self.data_config["data_download_dir"]
    data.download_data(self.download_directory)
    corpus = data_util.Corpus(self.download_directory)
    self.corpus = corpus
    self.ntokens = len(corpus.dictionary)
    self.hidden = None

    # This is used to store eval history and will switch to ASGD
    # once validation perplexity stops improving.
    self._last_loss = None
    self._eval_history = []
    self._last_epoch = -1

    # Define the model.
    genotype = self.get_genotype_from_hps()
    self.model = self.context.wrap_model(
        RNNModel(
            self.ntokens,
            self.hparams.emsize,
            self.hparams.nhid,
            self.hparams.nhidlast,
            self.hparams.dropout,
            self.hparams.dropouth,
            self.hparams.dropoutx,
            self.hparams.dropouti,
            self.hparams.dropoute,
            genotype=genotype,
        )
    )
    total_params = sum(x.data.nelement() for x in self.model.parameters())
    logging.info("Model total parameters: {}".format(total_params))

    # Define the optimizer.
    self._optimizer = self.context.wrap_optimizer(
        HybridSGD(
            self.model.parameters(),
            self.hparams.learning_rate,
            self.hparams.weight_decay,
            lambd=0,
            t0=0,
        )
    )

    # Define the LR scheduler.
    self.myLR = MyLR(self._optimizer, self.hparams)
    step_mode = LRScheduler.StepMode.MANUAL_STEP
    self.wrapped_LR = self.context.wrap_lr_scheduler(self.myLR, step_mode=step_mode)

def load_data(self, data_url='', data_dir='', data_tf='', split=0.1,
              test_samples=100, batch_size=1, shuffle=True):
    """Load and preprocess data.

    :param data_url: URL to download the data from
    :param data_dir: path to the directory containing the data.
        This main directory should have subdirectories with the names of the classes.
    :param data_tf: name of the TensorFlow dataset. See tfds.list_builders()
    :param split: fraction of samples used for testing (default: 0.1)
    :param test_samples: number of samples to test the model (default: 100)
    :param batch_size: size of the batches of data (default: 1)
    :param shuffle: whether to shuffle the data (default: True)
    """
    seed = 123  # for reproducibility
    AUTOTUNE = tf.data.experimental.AUTOTUNE  # for better performance
    size = (self.input_shape[1], self.input_shape[2])  # size to resize images to

    # Download data from a URL.
    if data_url:
        data_dir = download_data(data_url, cache_dir='./')
        print('Data downloaded!')

    # Load data from a directory.
    if data_dir:
        data_dir = pathlib.Path(data_dir)
        total = len(list(data_dir.glob('*/*.jpg')))
        if test_samples:
            split = test_samples / total
        test_ds = tf.keras.preprocessing.image_dataset_from_directory(
            data_dir,
            validation_split=split,
            subset='validation',
            seed=seed,
            image_size=size,
            batch_size=batch_size,
            shuffle=shuffle)
        data = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

    # Load a TensorFlow dataset.
    if data_tf:
        split = "train[:" + str(test_samples) + "]"
        test_ds = tfds.load(data_tf,
                            split=split,
                            as_supervised=True,
                            shuffle_files=shuffle)
        test_ds = test_ds.map(lambda x, y: (tf.image.resize(x, size), y))
        data = test_ds.cache().batch(batch_size).prefetch(buffer_size=AUTOTUNE)

    self.data = data
    # print('Data loaded! ', data)
    return self

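# Standalone sketch of the directory-loading path used above: carve a small test
# split out of an image folder via validation_split/subset. The './images' path,
# the 100-sample budget, and the 180x180 resize are assumptions for illustration.
import pathlib
import tensorflow as tf

data_dir = pathlib.Path('./images')
total = len(list(data_dir.glob('*/*.jpg')))
split = 100 / total  # reserve roughly 100 images for testing
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    validation_split=split,
    subset='validation',
    seed=123,
    image_size=(180, 180),
    batch_size=1,
)
test_ds = test_ds.cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
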
def __init__(self, context: det.TrialContext) -> None:
    self.context = context
    self.data_config = context.get_data_config()
    self.hparams = AttrDict(context.get_hparams())

    # Create a unique download directory for each rank so they don't overwrite each other.
    self.download_directory = self.data_config["data_download_dir"]
    data.download_data(self.download_directory)
    corpus = data_util.Corpus(self.download_directory)
    self.corpus = corpus
    self.ntokens = len(corpus.dictionary)
    self.hidden = None

    # This is used to store eval history and will switch to ASGD
    # once validation perplexity stops improving.
    self._last_loss = None
    self._eval_history = []
    self._last_epoch = -1

def __init__(self, context: det.TrialContext) -> None:
    self.context = context

    # Create a unique download directory for each rank so they don't
    # overwrite each other.
    self.download_directory = f"/tmp/data-rank{self.context.distributed.get_rank()}"
    download_data(
        download_directory=self.download_directory,
        data_config=self.context.get_data_config(),
    )

    dataset = PennFudanDataset(self.download_directory + "/PennFudanPed", get_transform())

    # Split 80/20 into training and validation datasets.
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    self.dataset_train, self.dataset_val = torch.utils.data.random_split(
        dataset, [train_size, test_size]
    )

def __init__(self, context: keras.TFKerasTrialContext) -> None:
    self.context = context

    # Create a unique download directory for each rank so they don't overwrite each other.
    self.download_directory = f"/tmp/data-rank{self.context.distributed.get_rank()}"
    self.download_directory = download_data(
        download_directory=self.download_directory,
        url=self.context.get_data_config()["url"],
    )

def __init__(self, context: PyTorchTrialContext) -> None:
    self.context = context

    # Create a unique download directory for each rank so they don't
    # overwrite each other.
    self.download_directory = f"/tmp/data-rank{self.context.distributed.get_rank()}"
    download_data(
        download_directory=self.download_directory,
        data_config=self.context.get_data_config(),
    )

    dataset = PennFudanDataset(self.download_directory + "/PennFudanPed", get_transform())

    # Split 80/20 into training and validation datasets.
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    self.dataset_train, self.dataset_val = torch.utils.data.random_split(
        dataset, [train_size, test_size]
    )

    model = fasterrcnn_resnet50_fpn(pretrained=True)

    # Replace the classifier with a new two-class classifier. There are
    # only two "classes": pedestrian and background.
    num_classes = 2
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # Wrap the model.
    self.model = self.context.wrap_model(model)

    # Wrap the optimizer.
    self.optimizer = self.context.wrap_optimizer(torch.optim.SGD(
        self.model.parameters(),
        lr=self.context.get_hparam("learning_rate"),
        momentum=self.context.get_hparam("momentum"),
        weight_decay=self.context.get_hparam("weight_decay"),
    ))

    # Wrap the LR scheduler.
    self.lr_scheduler = self.context.wrap_lr_scheduler(
        torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3, gamma=0.1),
        step_mode=LRScheduler.StepMode.STEP_EVERY_EPOCH,
    )

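# A hedged sketch of how the split datasets created above could be exposed to the
# trainer; the tuple-unzipping collate function is an assumption standing in for
# whatever collate helper the original detection example uses.
def build_training_data_loader(self) -> DataLoader:
    return DataLoader(
        self.dataset_train,
        batch_size=self.context.get_per_slot_batch_size(),
        # Detection targets are dicts of variable size, so keep images and targets
        # as tuples instead of stacking them into tensors.
        collate_fn=lambda batch: tuple(zip(*batch)),
    )
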
def build_training_data_loader(self) -> DataLoader:
    if not self.data_downloaded:
        data.download_data(self.download_directory)
        self.data_downloaded = True

    corpus = data_util.Corpus(self.download_directory)
    train_dataset = data.PTBData(
        corpus.train,
        self.context.get_hparam("seq_len"),
        self.context.get_per_slot_batch_size(),
        self.context.get_hparam("bptt"),
        self.context.get_hparam("max_seq_length_delta"),
    )
    return DataLoader(
        train_dataset,
        batch_sampler=data.BatchSamp(
            train_dataset,
            self.context.get_hparam("bptt"),
            self.context.get_hparam("max_seq_length_delta"),
        ),
        collate_fn=data.PadSequence(),
    )

import torch
import torch.nn as nn
import torch.optim as optim

from data import download_data, url, vocab, index_lists, word_to_tensor, shuffle_words
from model import TextGeneration
from lm import generate_word

n_epochs = 1000
lr = 0.01
print_every = 100
embedding_dim = 30
hidden_size = 50
batch_size = 32
seq_length = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set up the data.
dino_names = download_data(url)
max_len, (char_to_idx, idx_to_char) = vocab(dino_names)
words = dino_names.split()
start_indices, end_indices = index_lists(words, char_to_idx)
n_chars = len(start_indices)
n_batches = n_chars // (batch_size * seq_length)

# Set up the model.
model = TextGeneration(max_len, hidden_size, max_len)
loss_function = nn.CrossEntropyLoss(reduction="mean")
optimizer = optim.Adam(model.parameters(), lr=lr)
model.to(device)

# Model training.

def load_data(self, training=True, split=0.2, test_samples=100, size=180,
              batch_size=32, shuffle=True, data_url=None, data_dir=None,
              data_tf=None):
    """Load and preprocess data.

    :param training: whether to train the model (default: True)
    :param split: fraction of samples used for validation, if training is True (default: 0.2)
    :param test_samples: number of samples to test the model (default: 100)
    :param size: size to resize the images to (default: 180)
    :param batch_size: size of the batches of data (default: 32)
    :param shuffle: whether to shuffle the data (default: True)
    :param data_url: URL to the zip or tar file to download the data from
    :param data_dir: path to the directory containing the data.
        This main directory should have subdirectories with the names of the classes.
    :param data_tf: name of the TensorFlow dataset, check the list at tfds.list_builders()
    """
    # Reproducibility
    seed = 123

    # Download data from a URL.
    if data_url:
        data_dir = download_data(data_url, cache_dir='./')

    # Load data from a directory.
    size = (size, size)
    if data_dir:
        if training:
            train_ds = tf.keras.preprocessing.image_dataset_from_directory(
                data_dir,
                validation_split=split,
                subset='training',
                seed=seed,
                image_size=size,
                batch_size=batch_size,
                shuffle=shuffle)
            val_ds = tf.keras.preprocessing.image_dataset_from_directory(
                data_dir,
                validation_split=split,
                subset='validation',
                seed=seed,
                image_size=size,
                batch_size=batch_size,
                shuffle=shuffle)
        else:
            # if test_samples: split = test_samples / total_samples
            test_ds = tf.keras.preprocessing.image_dataset_from_directory(
                data_dir,
                validation_split=split,
                subset='validation',
                seed=seed,
                image_size=size,
                batch_size=1,
                shuffle=shuffle)

    # Load a TensorFlow dataset.
    if data_tf:
        split = "train[:" + str(test_samples) + "]"
        test_ds = tfds.load(data_tf,
                            split=split,
                            as_supervised=True,
                            shuffle_files=shuffle)
        test_ds = test_ds.map(lambda x, y: (tf.image.resize(x, size), y))

    print('\nImages and labels shapes:')
    ds = train_ds if training else test_ds
    for image_batch, labels_batch in ds.take(1):
        print(image_batch.shape)
        print(labels_batch.shape)

    # Preprocess data.
    AUTOTUNE = tf.data.experimental.AUTOTUNE
    if training:
        train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
        val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
        processed_data = train_ds, val_ds
    else:
        processed_data = test_ds.cache().batch(1).prefetch(buffer_size=AUTOTUNE)

    self.data = processed_data
    return processed_data

parser.add_argument(
    '-f', '--fill',
    type=float,
    metavar='F',
    help='Transparency of the filled portion of the graph. If 0 (default), only plots the lines',
    default=0)
cmd = parser.parse_args()

import visualization
import data

if cmd.U:
    data.download_data()

if len(cmd.countries) > 0:
    p, c = data.process_data(*data.load_data())
    row_mask = [cmd.no_daily, cmd.no_cumulative]
    col_mask = [cmd.no_cases, cmd.no_deaths, cmd.no_recoveries, cmd.no_active]
    smooth = {
        'days': cmd.smooth_days,
        'smoothness': cmd.smoothness,
        'type': 'window'
    }
    if cmd.exponential:
        smooth['type'] = 'exponential'

def create_se_data_filtered(se_archives, se_archives_path, result_dataset_path,
                            test_ids, create_traindev=True, create_test=True,
                            test_top_1=False, filter_tags=None):
    """Creates the stackexchange data with a filter list that specifies the exact ids
    of the post items that should be in the test set. This ensures that these items
    are not in the train/dev set and are thus not used for optimization. Typically,
    this means that the test set is later used for inference, to generate questions
    for our downstream cQA tasks.

    :param se_archives: A list of stackexchange archives that should be used to
        generate the data from (e.g., "travel.stackexchange.com").
    :param se_archives_path: The folder that contains (or will be used to download
        and store) the SE datasets
    :param result_dataset_path: Path where the resulting dataset should be written to.
        The files will be named result_dataset_path.{train/dev/test}
    :param test_ids: A list of ids of SE posts which should be used to construct the
        test split (they are removed from train and dev)
    :param create_traindev: Create train and dev splits
    :param create_test: Create test split
    :param test_top_1: Will retrieve the n most similar paragraphs from a post for the
        test set. Default=1 (train/dev=1)
    :param filter_tags: list of tags (to filter questions) to include, or None (all tags)
    """
    logging.debug('Creating filtered data')
    logging.debug('Downloading archives')
    download_data(se_archives, se_archives_path)
    logging.debug('Done')

    test_ids = set(test_ids)
    questions_test = []

    if create_traindev:
        logger.info('creating train/dev')
        dgen_traindev = yield_clean_data(se_archives, se_archives_path, top_n=1,
                                         filter_tags=filter_tags)
        questions_train_dev = []
        for q in dgen_traindev:
            if q['post_id'] not in test_ids:
                questions_train_dev.append(q)
            else:
                questions_test.append(q)
        random.seed(1234)
        random.shuffle(questions_train_dev)
        logger.info('Train+Dev={}'.format(len(questions_train_dev)))
        logger.info('Saving data to: {}.[train,dev]'.format(result_dataset_path))
        # n_dev = 1000
        n_dev = min(5000, round(len(questions_train_dev) * 0.1))
        save_json(questions_train_dev[:-n_dev], '{}.train'.format(result_dataset_path))
        save_json(questions_train_dev[-n_dev:], '{}.dev'.format(result_dataset_path))

    if create_test:
        logger.info('creating test')
        if test_top_1:
            logger.info('...test with top 1 paragraphs only')
            if not create_traindev:
                dgen_test = yield_clean_data(se_archives, se_archives_path, top_n=1,
                                             filter_ids=test_ids, filter_tags=filter_tags)
                questions_test = [q for q in dgen_test]
        else:
            logger.info('...test with all paragraphs')
            dgen_test = yield_clean_data(se_archives, se_archives_path, top_n=None,
                                         filter_ids=test_ids, filter_tags=filter_tags)
            questions_test = [q for q in dgen_test]
        logger.info('Test={}'.format(len(questions_test)))
        logger.info('Saving data to: {}.test'.format(result_dataset_path))
        save_json(questions_test, '{}.test'.format(result_dataset_path))

    logging.info('Done')

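# Hypothetical invocation of create_se_data_filtered; the archive name, paths, ids,
# and tag are placeholders for illustration, not values from the original experiments.
held_out_ids = ['101', '2045', '30017']
create_se_data_filtered(
    se_archives=['travel.stackexchange.com'],
    se_archives_path='./se_archives',
    result_dataset_path='./datasets/travel_filtered',
    test_ids=held_out_ids,
    create_traindev=True,
    create_test=True,
    test_top_1=True,
    filter_tags=['visas'],
)
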
    pred_pnl.ix[:, :, "rank"][pred_pnl.ix[:, :, "rank"] == 0] = 1
    # Ranks of 0 are assigned directly to the first group, because elements whose
    # original rank was 1 became 0 after being decremented in the previous step.
    for date in pred_pnl.major_axis:
        # Group by rank and compute the average return within each group.
        current_return = pred_pnl.major_xs(date).transpose().groupby("rank")["return"].mean()
        current_return.name = date
        group_df = group_df.append(current_return)
    group_df.sort_index(inplace=True)
    return group_df


if __name__ == "__main__":
    df = read_asset_set(const.STOCK_FILE)
    codes = df.index
    for code in codes:
        data.download_data(code)

    with open(const.COEF_FILE, 'r') as f:
        coef = json.load(f)
    factors = coef.keys()
    print(factors)
    # factors = ["30-day volatility"]
    # weights = [1]

    codes = df.index.tolist()
    generate_factors(codes, factors)
    update_frequency_factor_data(df, frequency='m')
    # pnl = get_asset_factor_data(df, factors, frequency='m')
    # pnl = get_predict_return(pnl, factors)
    # pnl = get_score_return(pnl, factors, weights)
    # group_df = get_group_return(pnl, factors)
    # group_df.to_excel("groups.xlsx")

def main(se_archive, se_archives_path, split_ids_folder, new_split_ids_folder,
         n_extends_ids, filter_tag, max_year):
    """This script extends the train ids from an existing set of train/dev/test ids
    with a number of randomly chosen ids from an SE data dump.

    :return:
    """
    logger = logging.getLogger('root')

    filter_tags = None if len(filter_tag) == 0 else filter_tag
    logger.info('Filtering with tags={}'.format(filter_tags))

    with open('{}/train-ids.txt'.format(split_ids_folder), 'r') as f:
        train_ids = [l.strip() for l in f if l]
    with open('{}/dev-ids.txt'.format(split_ids_folder), 'r') as f:
        dev_ids = [l.strip() for l in f if l]
    with open('{}/test-ids.txt'.format(split_ids_folder), 'r') as f:
        test_ids = [l.strip() for l in f if l]

    if n_extends_ids.endswith('x'):
        n_extends_ids = len(train_ids) * int(n_extends_ids[:-1])
    else:
        n_extends_ids = int(n_extends_ids)

    logger.info('Downloading and/or extracting SE archive')
    download_data([se_archive], se_archives_path)

    logger.info('Reading extracted SE archive')
    se_dir = '{}/{}'.format(se_archives_path, se_archive)
    se_reader = SEDataReader('{}/Posts.xml'.format(se_dir))

    ids = []
    for item in se_reader.read_items(max_year=None if max_year == -1 else max_year):
        # We need to check if it contains usable paragraphs so that we actually end up
        # with the exact same number of items in the decanlp train data.
        p, _, tags, _ = get_paragraphs(item)
        is_in_filter_tags = True
        if filter_tags is not None:
            is_in_filter_tags = len(set(tags) & set(filter_tags)) > 0
        if len(p) > 0 and is_in_filter_tags:
            ids.append(item['Id'])
    logger.info('Did read {} questions (filtered)'.format(len(ids)))

    random_ids_perm = np.random.permutation(len(ids))
    train_dev_test = set(train_ids + dev_ids + test_ids)
    extended_train_ids = []
    i = 0
    while len(extended_train_ids) < n_extends_ids and i < len(ids):
        candidate_id = ids[random_ids_perm[i]]
        if candidate_id not in train_dev_test:
            extended_train_ids.append(candidate_id)
        i += 1
    logger.info('Sampled {} test ids'.format(len(extended_train_ids)))

    logger.info('Writing new files')
    if not os.path.exists(new_split_ids_folder):
        os.mkdir(new_split_ids_folder)
    with open('{}/train-ids.txt'.format(new_split_ids_folder), 'w') as f:
        for i in train_ids + extended_train_ids:
            f.write('{}\n'.format(i))
    copyfile('{}/dev-ids.txt'.format(split_ids_folder),
             '{}/dev-ids.txt'.format(new_split_ids_folder))
    copyfile('{}/test-ids.txt'.format(split_ids_folder),
             '{}/test-ids.txt'.format(new_split_ids_folder))
    logger.info('DONE')

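# Hypothetical direct call to main() above; the argument values are placeholders,
# and the '2x' multiplier simply exercises the n_extends_ids parsing shown in the
# function body (an empty filter_tag list means "no tag filtering").
main(se_archive='travel.stackexchange.com',
     se_archives_path='./se_archives',
     split_ids_folder='./splits/base',
     new_split_ids_folder='./splits/extended_2x',
     n_extends_ids='2x',
     filter_tag=[],
     max_year=-1)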