Example #1
    def __init__(self):
        config = load_config()

        self.utils_config = config['Utils']
        self.task_type = self.utils_config['task_type']
        self.data_name = self.utils_config[self.task_type]['data_name']
        self.min_occurence = self.utils_config[self.task_type]['min_occurence']
        self.special_tokens = self.utils_config['special_token2idx']
        self.date = date.today().strftime('%d-%m-%Y')
        self.max_seq_len = self.utils_config[
            self.task_type]['max_sequence_length']
        self.x_y_pair_name = 'seq_label_pairs' if self.task_type == 'CLF' else 'seq_tags_pairs'  # Key in dataset - semantically correct for the task at hand.
        self.pad_token = '<PAD>'
        self.sos_token = '<START>'
        self.eos_token = '<STOP>'

        print(
            f'{datetime.now()}: Building {self.data_name.upper()} data for {self.task_type.upper()} task'
        )
        if self.task_type == 'SEQ':
            self._load_data()
            self._process_data_ner()
            self._process_pretrain_data_ner()

        elif self.task_type == 'CLF':
            self._load_data()
            self._process_data_clf()

        else:
            raise ValueError(f'Unknown task type: {self.task_type}')
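
Every constructor in these examples reads from the nested dictionary returned by load_config(). The snippets never show that structure, so the sketch below is inferred purely from the keys they access; every concrete value is an illustrative placeholder, not a value from the repository:

    # Hypothetical shape of the loaded config, inferred from the keys
    # accessed across these snippets. All values are illustrative only.
    config = {
        'Utils': {
            'task_type': 'SEQ',  # or 'CLF'
            'seed': 42,
            'special_token2idx': {'<PAD>': 0, '<START>': 1, '<STOP>': 2, '<UNK>': 3},
            'SEQ': {
                'data_name': 'conll2003',
                'data_split': [0.7, 0.15, 0.15],
                'min_occurence': 1,
                'max_sequence_length': 50,
            },
        },
        'Train': {
            'epochs': 10,
            'batch_size': 32,
            'max_runs': 5,
            'pretrain': True,
            'init_budget_frac': 0.1,
            'budget_frac': 0.1,
            'al_mode': 'svaal',
            'svae_iterations': 1,
            'discriminator_iterations': 1,
        },
        'Models': {
            'SVAE': {'adversarial_hyperparameter': 1.0},
        },
    }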
Example #2
    def __init__(self):
        self.config = load_config()
        self.model_config = self.config['Models']
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.pretrain = True

        # Model
        self.task_type = self.config['Utils']['task_type']
        self.max_sequence_length = self.config['Utils'][self.task_type]['max_sequence_length']
        
        self.budget_frac = self.config['Train']['budget_frac']
        self.batch_size = self.config['Train']['batch_size']
        self.data_splits_frac = np.round(np.linspace(self.budget_frac, self.budget_frac*10, num=10, endpoint=True), 2)
        
        # Real data
        self.data_name = self.config['Utils'][self.task_type]['data_name']
        self.data_splits = self.config['Utils'][self.task_type]['data_split']
        self.pad_idx = self.config['Utils']['special_token2idx']['<PAD>']
        
        # Test run properties
        self.epochs = self.config['Train']['epochs']
        self.svae_iterations = self.config['Train']['svae_iterations']
        self.dsc_iterations = self.config['Train']['discriminator_iterations']
        self.adv_hyperparam = self.config['Models']['SVAE']['adversarial_hyperparameter']
Example #3
    def __init__(self):
        self.config = load_config()
        self.model_config = self.config['Models']

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Model
        self.task_type = self.config['Utils']['task_type']
        self.max_sequence_length = self.config['Utils'][
            self.task_type]['max_sequence_length']

        # Real data
        self.data_name = self.config['Utils'][self.task_type]['data_name']
        self.pad_idx = self.config['Utils']['special_token2idx']['<PAD>']

        # Test run properties
        self.epochs = self.config['Train']['epochs']

        # NOTE: hardcoded absolute paths from the author's machine
        self.labelled_data_path = os.path.join(
            r'/home/tyler/Desktop/Repos/s-vaal/src/results/10',
            'labelled_data.txt')
        self.vocab_path = os.path.join(
            r'/home/tyler/Desktop/Repos/s-vaal/data/SEQ/conll2003',
            'vocabs.json')
Example #4
    def __init__(self, trials=3):
        self.exp = Experimenter()
        self.mongo_coll_conn = Mongo(collection_name='optimisation')
        self.trials = trials
        self.config = load_config()
        self.task_type = self.config['Utils']['task_type']
        self.data_name = self.config['Utils'][self.task_type]['data_name']
Example #5
    def __init__(self, runs=5, model_name=None, mongo_conn=None, exp_id=None):
        config = load_config()
        if config:
            # The config value takes precedence over the runs argument
            self.runs = config['Train']['max_runs']
        else:
            self.runs = runs

        self.model_name = model_name
        self.mongo_conn = mongo_conn
        self.exp_id = exp_id
Example #6
    def __init__(self):
        config = load_config()

        self.pad_idx = config['Utils']['special_token2idx']['<PAD>']
        self.special_chars_list = [self.pad_idx]
        output_classes = ['ORG', 'PER', 'LOC', 'MISC']
        self.no_output_classes = len(output_classes)
        self.tag_space_size = self.no_output_classes + len(
            self.special_chars_list)
        self.no_classes_clf = 4  # TODO: make more suitable...
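
With the four entity classes above plus the single special token (the PAD index), tag_space_size resolves to 4 + 1 = 5.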
Example #7
    def __init__(self):
        Trainer.__init__(self)
        config = load_config()
        self.config = config

        self.initial_budget_frac = config['Train']['init_budget_frac']
        self.budget_frac = config['Train']['budget_frac']
        self.data_splits_frac = np.round(
            np.linspace(self.budget_frac,
                        self.budget_frac * 10,
                        num=10,
                        endpoint=True), 2)
        self.batch_size = config['Train']['batch_size']
        self.max_runs = config['Train']['max_runs']
        self.al_mode = config['Train']['al_mode']
        self.run_no = 1  # tracker for running models over n trials (TODO: ensure that this is robust and doesn't index wildly)
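
As a worked example of the split schedule above (budget_frac = 0.1 is an illustrative value; the real one comes from the config), np.linspace produces ten evenly spaced budget fractions:

    import numpy as np

    budget_frac = 0.1  # illustrative; the actual value is read from the config
    print(np.round(np.linspace(budget_frac, budget_frac * 10, num=10, endpoint=True), 2))
    # -> [0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]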
Example #8
    def __init__(self):
        self.config = load_config()
        self.model_config = self.config['Models']

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Model
        self.task_type = self.config['Utils']['task_type']
        self.max_sequence_length = self.config['Utils'][self.task_type]['max_sequence_length']
        
        # Real data
        self.data_name = self.config['Utils'][self.task_type]['data_name']
        self.data_splits = self.config['Utils'][self.task_type]['data_split']
        self.pad_idx = self.config['Utils']['special_token2idx']['<PAD>']
        
        # Test run properties
        self.epochs = self.config['Train']['epochs']
        self.svae_iterations = self.config['Train']['svae_iterations']
        self.dsc_iterations = self.config['Train']['discriminator_iterations']
        self.adv_hyperparam = self.config['Models']['SVAE']['adversarial_hyperparameter']
Example #9
                sampled_indices = self.sample_adversarial(
                    svae,
                    discriminator,
                    unlabelled_dataloader,
                    indices=unlabelled_indices,
                    cuda=True)  # TODO: review usage of indices arg
                current_indices = list(current_indices) + list(sampled_indices)
                sampler = data.sampler.SubsetRandomSampler(current_indices)
                self.labelled_dataloader = data.DataLoader(
                    self.datasets['train'],
                    sampler=sampler,
                    batch_size=self.batch_size,
                    drop_last=True)

        # write results to disk
        with open('results.json', 'w') as fj:
            json.dump(metrics_hist, fj, indent=4)


def main(config):
    al = ActiveLearner()
    al.learn()


if __name__ == '__main__':
    # Seeds
    config = load_config()
    np.random.seed(config['Utils']['seed'])
    torch.manual_seed(config['Utils']['seed'])
    main(config)
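
sample_adversarial itself is not shown in these snippets. Below is a minimal sketch of a VAAL-style acquisition step, under the assumptions that the discriminator maps a latent code to the probability its example came from the labelled pool and that the SVAE forward pass returns the latent mean; the method signature, the budget argument, and the SVAE return order are all assumptions rather than details from the source:

    def sample_adversarial(self, svae, discriminator, dataloader,
                           indices, budget=64, cuda=False):
        # Score each unlabelled example by how confidently the discriminator
        # believes it is labelled; the least 'labelled'-looking points are
        # the most informative under the VAAL heuristic.
        svae.eval()
        discriminator.eval()
        all_preds = []
        with torch.no_grad():
            for batch in dataloader:
                x = batch[0].cuda() if cuda else batch[0]
                _, _, mean, _ = svae(x)      # assumed return order: logp, z, mean, logv
                preds = discriminator(mean)  # assumed output: P(labelled) per example
                all_preds.extend(preds.squeeze(-1).cpu().tolist())
        ranked = np.argsort(all_preds)[:budget]  # ascending: lowest P(labelled) first
        return np.asarray(indices)[ranked]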
Example #10
    def __init__(self, embedding_dim, hidden_dim, rnn_type, num_layers,
                 bidirectional, latent_size, word_dropout, embedding_dropout,
                 vocab_size: int):
        super(SVAE, self).__init__()
        config = load_config()
        utils_config = config['Utils']

        # Misc
        task_type = config['Utils']['task_type']
        self.max_sequence_length = utils_config[task_type][
            'max_sequence_length']
        self.tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
        self.pretrain = config['Train']['pretrain']

        # Special tokens and vocab
        self.pad_idx = utils_config['special_token2idx']['<PAD>']
        self.eos_idx = utils_config['special_token2idx']['<STOP>']
        self.sos_idx = utils_config['special_token2idx']['<START>']
        self.unk_idx = utils_config['special_token2idx']['<UNK>']
        self.vocab_size = vocab_size  #+ len(utils_config['special_token2idx'])

        # RNN settings
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.rnn_type = rnn_type
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.z_dim = latent_size

        # Embedding initialisation
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.word_dropout_rate = word_dropout
        self.embedding_dropout = nn.Dropout(p=embedding_dropout)

        # RNN type specification
        # TODO: Future implementation will include transformer/reformer models rather than these.
        if self.rnn_type == 'gru':
            rnn = nn.GRU
        elif self.rnn_type == 'lstm':
            rnn = nn.LSTM
        elif self.rnn_type == 'rnn':
            rnn = nn.RNN
        else:
            raise ValueError(f'Unsupported rnn_type: {self.rnn_type}')

        # Initialise encoder-decoder RNNs (these are identical)
        self.encoder_rnn = rnn(input_size=self.embedding_dim,
                               hidden_size=self.hidden_dim,
                               num_layers=self.num_layers,
                               bidirectional=self.bidirectional,
                               batch_first=True)

        self.decoder_rnn = rnn(input_size=self.embedding_dim,
                               hidden_size=self.hidden_dim,
                               num_layers=self.num_layers,
                               bidirectional=self.bidirectional,
                               batch_first=True)

        # Hidden factor expands the hidden state dimensionality when bidirectional and/or multi-layer RNNs are used
        self.hidden_factor = (2 if self.bidirectional else 1) * self.num_layers

        # Initialisation of FC layers
        # These map from the encoder to the latent space
        self.hidden2mean = nn.Linear(self.hidden_dim * self.hidden_factor,
                                     self.z_dim)
        self.hidden2logv = nn.Linear(self.hidden_dim * self.hidden_factor,
                                     self.z_dim)
        self.z2hidden = nn.Linear(self.z_dim,
                                  self.hidden_dim * self.hidden_factor)
        self.outputs2vocab = nn.Linear(
            self.hidden_dim * (2 if self.bidirectional else 1),
            self.vocab_size)

        # Initialise partial loss function
        self.NLL = nn.NLLLoss(
            ignore_index=self.pad_idx,
            reduction='sum')  # TODO: Review arguments for understanding
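
The fully connected layers above imply a standard VAE encoding step. Below is a minimal sketch of how hidden2mean, hidden2logv, and the reparameterisation trick fit together, assuming batch-first inputs as configured above; the method name and return signature are assumptions, not details from the source:

    def encode(self, input_sequence):
        # Embed tokens and run the encoder RNN
        embedded = self.embedding(input_sequence)
        _, hidden = self.encoder_rnn(embedded)
        if self.rnn_type == 'lstm':
            hidden = hidden[0]  # nn.LSTM returns (h_n, c_n); keep h_n
        # (num_layers * num_directions, B, H) -> (B, H * hidden_factor)
        batch_size = input_sequence.size(0)
        hidden = hidden.transpose(0, 1).reshape(
            batch_size, self.hidden_dim * self.hidden_factor)
        # Project to the latent Gaussian and sample z = mean + std * eps
        mean = self.hidden2mean(hidden)
        logv = self.hidden2logv(hidden)
        std = torch.exp(0.5 * logv)
        z = mean + std * torch.randn_like(std)
        return z, mean, logv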