Example #1
def download_data(params, station_list, with_waveforms, recording_time,
                  padding_time):
    """
    Downloads data from IRIS.
    """
    data.download_data(params, station_list, with_waveforms, recording_time,
                       padding_time)
Example #2
    def build_validation_data_loader(self) -> DataLoader:
        if not self.data_downloaded:
            data.download_data(self.download_directory)
            self.data_downloaded = True

        corpus = data_util.Corpus(self.download_directory)

        test_dataset = data.PTBData(
            corpus.valid,
            self.context.get_hparam("seq_len"),
            self.context.get_hparam("eval_batch_size"),
            self.context.get_hparam("bptt"),
            self.context.get_hparam("max_seq_length_delta"),
        )

        return DataLoader(
            test_dataset,
            batch_sampler=data.BatchSamp(
                test_dataset,
                self.context.get_hparam("bptt"),
                self.context.get_hparam("max_seq_length_delta"),
                valid=True,
            ),
            collate_fn=data.PadSequence(),
        )
Example #3
    def download_dataset(self) -> None:
        task = self.context.get_data_config().get("task")
        path_to_mrpc = self.context.get_data_config().get("path_to_mrpc")

        if not self.context.get_data_config().get("download_data"):
            # Return early if data download is disabled in the data config
            return

        data.download_data(task, self.download_directory, path_to_mrpc)
        self.data_downloaded = True
Example #4
    def __init__(self, context: PyTorchTrialContext) -> None:
        self.context = context
        self.data_config = context.get_data_config()
        self.hparams = AttrDict(context.get_hparams())

        # Create a unique download directory for each rank so they don't overwrite each
        # other when doing distributed training.
        self.download_directory = self.data_config["data_download_dir"]
        data.download_data(self.download_directory)
        corpus = data_util.Corpus(self.download_directory)
        self.corpus = corpus
        self.ntokens = len(corpus.dictionary)
        self.hidden = None

        # This is used to store eval history and will switch to ASGD
        # once validation perplexity stops improving.
        self._last_loss = None
        self._eval_history = []
        self._last_epoch = -1

        # Define the model
        genotype = self.get_genotype_from_hps()
        self.model = self.context.wrap_model(
            RNNModel(
                self.ntokens,
                self.hparams.emsize,
                self.hparams.nhid,
                self.hparams.nhidlast,
                self.hparams.dropout,
                self.hparams.dropouth,
                self.hparams.dropoutx,
                self.hparams.dropouti,
                self.hparams.dropoute,
                genotype=genotype,
            ))
        total_params = sum(x.data.nelement() for x in self.model.parameters())
        logging.info("Model total parameters: {}".format(total_params))

        # Define the optimizer
        self._optimizer = self.context.wrap_optimizer(
            HybridSGD(
                self.model.parameters(),
                self.hparams.learning_rate,
                self.hparams.weight_decay,
                lambd=0,
                t0=0,
            ))

        # Define the LR scheduler
        self.myLR = MyLR(self._optimizer, self.hparams)
        step_mode = LRScheduler.StepMode.MANUAL_STEP
        self.wrapped_LR = self.context.wrap_lr_scheduler(self.myLR,
                                                         step_mode=step_mode)
Example #5
    def load_data(self,
                  data_url='',
                  data_dir='',
                  data_tf='',
                  split=0.1,
                  test_samples=100,
                  batch_size=1,
                  shuffle=True):
        """Load and preprocess data.

        :param data_url: url to download the data from
        :param data_dir: path to the directory containing the data
            This main directory should have subdirectories with the names of the classes
        :param data_tf: name of the TensorFlow dataset. See tfds.list_builders()
        :param split: fraction of samples for testing (default: 0.1)
        :param test_samples: number of samples to test the model (default: 100)
        :param batch_size: size of the batches of data (default: 1)
        :param shuffle: whether to shuffle the data (default: True)
        """
        seed = 123  # for reproducibility
        AUTOTUNE = tf.data.experimental.AUTOTUNE  # for better performance
        size = (self.input_shape[1], self.input_shape[2])  # size to resize images

        # Download data from url
        if data_url:
            data_dir = download_data(data_url, cache_dir='./')
            print('Data downloaded!')

        # Load data from directory
        if data_dir:
            data_dir = pathlib.Path(data_dir)
            total = len(list(data_dir.glob('*/*.jpg')))
            if test_samples: split = test_samples / total

            test_ds = tf.keras.preprocessing.image_dataset_from_directory(
                data_dir,
                validation_split=split,
                subset='validation',
                seed=seed,
                image_size=size,
                batch_size=batch_size,
                shuffle=shuffle)
            data = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

        # Load tensorflow dataset
        if data_tf:
            split = "train[:" + str(test_samples) + "]"
            test_ds = tfds.load(data_tf,
                                split=split,
                                as_supervised=True,
                                shuffle_files=shuffle)
            test_ds = test_ds.map(lambda x, y: (tf.image.resize(x, size), y))
            data = test_ds.cache().batch(batch_size).prefetch(
                buffer_size=AUTOTUNE)

        self.data = data
        #print('Data loaded! ', data)

        return self
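
A hypothetical call to this method might look like the sketch below; the instance name, dataset, and argument values are illustrative assumptions, not taken from the original project.

# Usage sketch (assumes `clf` is an instance of the surrounding class with
# `input_shape` already set; the tfds dataset name is just an example).
clf.load_data(data_tf='tf_flowers',   # any name listed by tfds.list_builders()
              test_samples=100,       # take the first 100 training samples
              batch_size=32,
              shuffle=True)
# The preprocessed tf.data pipeline is then available as `clf.data`.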
Example #6
    def __init__(self, context: det.TrialContext) -> None:
        self.context = context
        self.data_config = context.get_data_config()
        self.hparams = AttrDict(context.get_hparams())

        # Create a unique download directory for each rank so they don't overwrite each other.
        self.download_directory = self.data_config["data_download_dir"]
        data.download_data(self.download_directory)
        corpus = data_util.Corpus(self.download_directory)
        self.corpus = corpus
        self.ntokens = len(corpus.dictionary)
        self.hidden = None

        # This is used to store eval history and will switch to ASGD
        # once validation perplexity stops improving.
        self._last_loss = None
        self._eval_history = []
        self._last_epoch = -1
Example #7
    def __init__(self, context: det.TrialContext) -> None:
        self.context = context

        # Create a unique download directory for each rank so they don't
        # overwrite each other.
        self.download_directory = f"/tmp/data-rank{self.context.distributed.get_rank()}"
        download_data(
            download_directory=self.download_directory, data_config=self.context.get_data_config(),
        )

        dataset = PennFudanDataset(self.download_directory + "/PennFudanPed", get_transform())

        # Split 80/20 into training and validation datasets.
        train_size = int(0.8 * len(dataset))
        test_size = len(dataset) - train_size
        self.dataset_train, self.dataset_val = torch.utils.data.random_split(
            dataset, [train_size, test_size]
        )
Example #8
    def __init__(self, context: keras.TFKerasTrialContext) -> None:
        self.context = context

        # Create a unique download directory for each rank so they don't overwrite each other.
        self.download_directory = f"/tmp/data-rank{self.context.distributed.get_rank()}"
        self.download_directory = download_data(
            download_directory=self.download_directory,
            url=self.context.get_data_config()["url"],
        )
Example #9
    def __init__(self, context: PyTorchTrialContext) -> None:
        self.context = context

        # Create a unique download directory for each rank so they don't
        # overwrite each other.
        self.download_directory = f"/tmp/data-rank{self.context.distributed.get_rank()}"
        download_data(
            download_directory=self.download_directory, data_config=self.context.get_data_config(),
        )

        dataset = PennFudanDataset(self.download_directory + "/PennFudanPed", get_transform())

        # Split 80/20 into training and validation datasets.
        train_size = int(0.8 * len(dataset))
        test_size = len(dataset) - train_size
        self.dataset_train, self.dataset_val = torch.utils.data.random_split(
            dataset, [train_size, test_size]
        )

        model = fasterrcnn_resnet50_fpn(pretrained=True)
        # Replace the classifier with a new two-class classifier.  There are
        # only two "classes": pedestrian and background.
        num_classes = 2
        in_features = model.roi_heads.box_predictor.cls_score.in_features
        model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

        # Wrap the model.
        self.model = self.context.wrap_model(model)

        # Wrap the optimizer.
        self.optimizer = self.context.wrap_optimizer(torch.optim.SGD(
            self.model.parameters(),
            lr=self.context.get_hparam("learning_rate"),
            momentum=self.context.get_hparam("momentum"),
            weight_decay=self.context.get_hparam("weight_decay"),
        ))

        # Wrap the LR scheduler.
        self.lr_scheduler = self.context.wrap_lr_scheduler(
            torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3, gamma=0.1),
            step_mode=LRScheduler.StepMode.STEP_EVERY_EPOCH
        )
Example #10
    def build_training_data_loader(self) -> DataLoader:
        if not self.data_downloaded:
            data.download_data(self.download_directory)
            self.data_downloaded = True

        corpus = data_util.Corpus(self.download_directory)

        train_dataset = data.PTBData(
            corpus.train,
            self.context.get_hparam("seq_len"),
            self.context.get_per_slot_batch_size(),
            self.context.get_hparam("bptt"),
            self.context.get_hparam("max_seq_length_delta"),
        )
        return DataLoader(
            train_dataset,
            batch_sampler=data.BatchSamp(
                train_dataset,
                self.context.get_hparam("bptt"),
                self.context.get_hparam("max_seq_length_delta"),
            ),
            collate_fn=data.PadSequence(),
        )
Example #11
import torch
import torch.nn as nn
import torch.optim as optim

from data import download_data, url, vocab, index_lists, word_to_tensor, shuffle_words
from model import TextGeneration
from lm import generate_word

n_epochs = 1000
lr = 0.01
print_every = 100
embedding_dim = 30
hidden_size = 50
batch_size = 32
seq_length = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# set up data
dino_names = download_data(url)
max_len, (char_to_idx, idx_to_char) = vocab(dino_names)

words = dino_names.split()
start_indices, end_indices = index_lists(words, char_to_idx)
n_chars = len(start_indices)
n_batches = n_chars // (batch_size * seq_length)

# set up model
model = TextGeneration(max_len, hidden_size, max_len)
loss_function = nn.CrossEntropyLoss(reduction="mean")
optimizer = optim.Adam(model.parameters(), lr=lr)
model.to(device)

# model training
Example #12
    def load_data(self,
                  training=True,
                  split=0.2,
                  test_samples=100,
                  size=180,
                  batch_size=32,
                  shuffle=True,
                  data_url=None,
                  data_dir=None,
                  data_tf=None):
        """Load and preprocess data.


        :param training: whether to train the model (default: True)
        :param split: fraction of samples for validation, if training is True (default: 0.2)
        :param test_samples: number of samples to test the model (default: 100)
        :param size: size to resize the images (default: 180)
        :param batch_size: size of the batches of data (default: 32)
        :param shuffle: whether to shuffle the data (default: True)
        :param data_url: url to the zip or tar file to download the data
        :param data_dir: path to the directory containing the data
            This main directory should have subdirectories with the names of the classes
        :param data_tf: name of the TensorFlow dataset, check list at tfds.list_builders()
        """
        # Reproducibility
        seed = 123

        # Download data from url
        if data_url:
            data_dir = download_data(data_url, cache_dir='./')

        # Load data from directory
        size = (size, size)
        if data_dir:
            if training:
                train_ds = tf.keras.preprocessing.image_dataset_from_directory(
                    data_dir,
                    validation_split=split,
                    subset='training',
                    seed=seed,
                    image_size=size,
                    batch_size=batch_size,
                    shuffle=shuffle)
                val_ds = tf.keras.preprocessing.image_dataset_from_directory(
                    data_dir,
                    validation_split=split,
                    subset='validation',
                    seed=seed,
                    image_size=size,
                    batch_size=batch_size,
                    shuffle=shuffle)

            else:
                # if test_samples: split = test_samples / total_samples
                test_ds = tf.keras.preprocessing.image_dataset_from_directory(
                    data_dir,
                    validation_split=split,
                    subset='validation',
                    seed=seed,
                    image_size=size,
                    batch_size=1,
                    shuffle=shuffle)

        # Load tensorflow dataset
        if data_tf:
            split = "train[:" + str(test_samples) + "]"
            test_ds = tfds.load(data_tf,
                                split=split,
                                as_supervised=True,
                                shuffle_files=shuffle)
            test_ds = test_ds.map(lambda x, y: (tf.image.resize(x, size), y))

        print('\nImages and labels shapes:')
        ds = train_ds if training else test_ds
        for image_batch, labels_batch in ds.take(1):
            print(image_batch.shape)
            print(labels_batch.shape)

        # Preprocess data
        AUTOTUNE = tf.data.experimental.AUTOTUNE
        if training:
            train_ds = train_ds.cache().shuffle(1000).prefetch(
                buffer_size=AUTOTUNE)
            val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
            processed_data = train_ds, val_ds
        else:
            processed_data = test_ds.cache().batch(1).prefetch(
                buffer_size=AUTOTUNE)

        self.data = processed_data
        return processed_data
Example #13
parser.add_argument(
    '-f',
    '--fill',
    type=float,
    metavar='F',
    help='Transparency of the filled portion of the graph. '
         'If 0 (default), only plots the lines',
    default=0)
cmd = parser.parse_args()

import visualization
import data

if cmd.U:
    data.download_data()

if len(cmd.countries) > 0:
    p, c = data.process_data(*data.load_data())

    row_mask = [cmd.no_daily, cmd.no_cumulative]
    col_mask = [cmd.no_cases, cmd.no_deaths, cmd.no_recoveries, cmd.no_active]

    smooth = {
        'days': cmd.smooth_days,
        'smoothness': cmd.smoothness,
        'type': 'window'
    }
    if cmd.exponential:
        smooth['type'] = 'exponential'
Example #14
def create_se_data_filtered(se_archives,
                            se_archives_path,
                            result_dataset_path,
                            test_ids,
                            create_traindev=True,
                            create_test=True,
                            test_top_1=False,
                            filter_tags=None):
    """Creates the stackexchange data with a filter list that specifies the exact ids of the posts items that should be
    in the test set. This is to ensure that these items are not in the train/dev set and are thus not used for
    optimization. Typically, this means that the testset is later used for inference, to generate questions for our
    downstream cQA tasks.

    :param se_archives: A list of stackexchange archives that should be used to generate the data from
                        (e.g., "travel.strackexchange.com").
    :param se_archives_path: The folder that contains (or will be used to download and store) the SE datasets
    :param result_dataset_path: Path where the resulting dataset should be written to. The files will be named
                                result_dataset_path.{train/dev/test}
    :param test_ids: A list of ids of SE posts which should be used to construct the test split (they are removed from
                     train and dev)
    :param create_traindev: Create train and dev splits
    :param create_test: Create test split
    :param test_top_1: If True, retrieve only the most similar paragraph from each post for the test set; otherwise
                       all paragraphs are used (train/dev always use the top 1)
    :param filter_tags: list of tags (to filter questions) to include or None (all tags)
    """
    logging.debug('Creating filtered data')
    logging.debug('Downloading archives')
    download_data(se_archives, se_archives_path)
    logging.debug('Done')
    test_ids = set(test_ids)

    questions_test = []
    if create_traindev:
        logger.info('creating train/dev')
        dgen_traindev = yield_clean_data(se_archives,
                                         se_archives_path,
                                         top_n=1,
                                         filter_tags=filter_tags)
        questions_train_dev = []
        for q in dgen_traindev:
            if q['post_id'] not in test_ids:
                questions_train_dev.append(q)
            else:
                questions_test.append(q)

        random.seed(1234)
        random.shuffle(questions_train_dev)
        logger.info('Train+Dev={}'.format(len(questions_train_dev)))
        logger.info(
            'Saving data to: {}.[train,dev]'.format(result_dataset_path))

        # n_dev = 1000
        n_dev = min(5000, round(len(questions_train_dev) * 0.1))
        save_json(questions_train_dev[:-n_dev],
                  '{}.train'.format(result_dataset_path))
        save_json(questions_train_dev[-n_dev:],
                  '{}.dev'.format(result_dataset_path))

    if create_test:
        logger.info('creating test')
        if test_top_1:
            logger.info('...test with top 1 paragraphs only')
            if not create_traindev:
                dgen_test = yield_clean_data(se_archives,
                                             se_archives_path,
                                             top_n=1,
                                             filter_ids=test_ids,
                                             filter_tags=filter_tags)
                questions_test = [q for q in dgen_test]
        else:
            logger.info('...test with all paragraphs')
            dgen_test = yield_clean_data(se_archives,
                                         se_archives_path,
                                         top_n=None,
                                         filter_ids=test_ids,
                                         filter_tags=filter_tags)
            questions_test = [q for q in dgen_test]

        logger.info('Test={}'.format(len(questions_test)))
        logger.info('Saving data to: {}.test'.format(result_dataset_path))
        save_json(questions_test, '{}.test'.format(result_dataset_path))
        logging.info('Done')
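
For reference, a hypothetical invocation of this helper could look like the following sketch; the archive name, paths, and ids are illustrative placeholders rather than values from the original project.

# Usage sketch with placeholder arguments.
create_se_data_filtered(
    se_archives=['travel.stackexchange.com'],
    se_archives_path='./se_archives',
    result_dataset_path='./data/travel_filtered',
    test_ids=['101', '202', '303'],   # post ids reserved for the test split
    create_traindev=True,
    create_test=True,
    test_top_1=False,                 # keep all paragraphs for test posts
    filter_tags=None,                 # no tag filtering
)
# Writes ./data/travel_filtered.train, .dev and .test via save_json().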
Example #15
    pred_pnl.ix[:, :, "rank"][pred_pnl.ix[:, :, "rank"] ==
                              0] = 1  # rank 0 goes straight into group 1: elements whose original rank was 1 became 0 after the subtract-1-and-lag in the previous step
    for date in pred_pnl.major_axis:
        current_return = pred_pnl.major_xs(date).transpose().groupby(
            "rank")["return"].mean()  # group by rank and take the mean return within each group
        current_return.name = date
        group_df = group_df.append(current_return)
    group_df.sort_index(inplace=True)
    return group_df


if __name__ == "__main__":
    df = read_asset_set(const.STOCK_FILE)
    codes = df.index
    for code in codes:
        data.download_data(code)
    with open(const.COEF_FILE, 'r') as f:
        coef = json.load(f)
    factors = coef.keys()
    print(factors)
    # factors = ["30-day volatility"]
    # weights = [1]
    codes = df.index.tolist()
    generate_factors(codes, factors)
    update_frequency_factor_data(df, frequency='m')
    # pnl = get_asset_factor_data(df, factors, frequency='m')
    # pnl = get_predict_return(pnl, factors)
    # pnl = get_score_return(pnl, factors, weights)
    # group_df = get_group_return(pnl, factors)
    # group_df.to_excel("groups.xlsx")
Example #16
def main(se_archive, se_archives_path, split_ids_folder, new_split_ids_folder, n_extends_ids, filter_tag, max_year):
    """This script extends the train ids from an existing set of train/dev/test ids with a number of randomly chosen ids
    from an SE data dump.

    :return:
    """
    logger = logging.getLogger('root')

    filter_tags = None if len(filter_tag) == 0 else filter_tag
    logger.info('Filtering with tags={}'.format(filter_tags))

    with open('{}/train-ids.txt'.format(split_ids_folder), 'r') as f:
        train_ids = [l.strip() for l in f if l]
    with open('{}/dev-ids.txt'.format(split_ids_folder), 'r') as f:
        dev_ids = [l.strip() for l in f if l]
    with open('{}/test-ids.txt'.format(split_ids_folder), 'r') as f:
        test_ids = [l.strip() for l in f if l]

    if n_extends_ids.endswith('x'):
        n_extends_ids = len(train_ids) * int(n_extends_ids[:-1])
    else:
        n_extends_ids = int(n_extends_ids)

    logger.info('Downloading and/or extracting SE archive')
    download_data([se_archive], se_archives_path)

    logger.info('Reading extracted SE archive')
    se_dir = '{}/{}'.format(se_archives_path, se_archive)
    se_reader = SEDataReader('{}/Posts.xml'.format(se_dir))

    ids = []
    for item in se_reader.read_items(max_year=None if max_year == -1 else max_year):
        # we need to check if it contains usable paragraphs so that we actually end up with the exact same number of
        # items in the decanlp train data
        p, _, tags, _ = get_paragraphs(item)

        is_in_filter_tags = True
        if filter_tags is not None:
            is_in_filter_tags = len(set(tags) & set(filter_tags)) > 0

        if len(p) > 0 and is_in_filter_tags:
            ids.append(item['Id'])

    logger.info('Did read {} questions (filtered)'.format(len(ids)))

    random_ids_perm = np.random.permutation(len(ids))
    train_dev_test = set(train_ids + dev_ids + test_ids)
    extended_train_ids = []
    i = 0
    while len(extended_train_ids) < n_extends_ids and i < len(ids):
        candidate_id = ids[random_ids_perm[i]]
        if candidate_id not in train_dev_test:
            extended_train_ids.append(candidate_id)
        i += 1

    logger.info('Sampled {} additional train ids'.format(len(extended_train_ids)))

    logger.info('Writing new files')
    if not os.path.exists(new_split_ids_folder):
        os.mkdir(new_split_ids_folder)
    with open('{}/train-ids.txt'.format(new_split_ids_folder), 'w') as f:
        for i in train_ids + extended_train_ids:
            f.write('{}\n'.format(i))
    copyfile('{}/dev-ids.txt'.format(split_ids_folder), '{}/dev-ids.txt'.format(new_split_ids_folder))
    copyfile('{}/test-ids.txt'.format(split_ids_folder), '{}/test-ids.txt'.format(new_split_ids_folder))
    logger.info('DONE')