def __generate_word_matrix(self, index_lookup):
        """
        Generate a BOW matrix whose rows correspond to documents and whose columns correspond to words.

        @param index_lookup: A dictionary mapping attribute words to column indices, used to decide which column of the word matrix to increment.
        """
        batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
        length = len(batches)
        processed = 1
        for batch in batches:
            docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
            bag_of_words_matrix = zeros([len(docs_list), len(index_lookup)])
            row = 0
            for doc in docs_list:
                for token in doc:
                    try:
                        col = index_lookup[token]
                        bag_of_words_matrix[row, col] += 1
                    except KeyError:  # Word is not in the dictionary; skip it.
                        continue
                row += 1
            # Serialize bag of words
            s.dump(bag_of_words_matrix.tolist(), open(env_paths.get_bow_matrix_path(self.training, batch), "wb"))
            print 'Processed ' + str(processed) + ' of ' + str(length) + ' batches'
            processed += 1
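A minimal, self-contained sketch of the same counting step (plain numpy and an in-memory list of tokenised documents; the project-specific `s` serialiser and `env_paths` helpers are left out):

# Hedged sketch of the bag-of-words counting above, with inline toy data.
from numpy import zeros

docs = [["the", "cat", "sat"], ["the", "dog"]]            # two tokenised documents
index_lookup = {"the": 0, "cat": 1, "sat": 2, "dog": 3}   # attribute word -> column

bow = zeros((len(docs), len(index_lookup)))
for row, doc in enumerate(docs):
    for token in doc:
        col = index_lookup.get(token)
        if col is not None:        # words outside the dictionary are ignored
            bow[row, col] += 1
# bow is now [[1., 1., 1., 0.], [1., 0., 0., 1.]]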
Example #3
    def __generate_input_data(self):
        """
        Generate the input data for the DBN so that it can be visualized.
        """
        if not len(self.input_data) == 0:
            return

        try:
            self.input_data = s.load(open('output/input_data.p', 'rb'))
            self.class_indices = s.load(open('output/class_indices.p', 'rb'))
            if not self.classes_to_visualise == None:
                self.__filter_input_data(self.classes_to_visualise)
        except:
            self.input_data = generate_input_data_list(
                training=False) if self.testing else generate_input_data_list(
                )
            self.class_indices = get_all_class_indices(
                training=False) if self.testing else get_all_class_indices()
            if not self.classes_to_visualise == None:
                self.__filter_input_data(self.classes_to_visualise)
            s.dump([input.tolist() for input in self.input_data],
                   open('output/input_data.p', 'wb'))
            s.dump(self.class_indices, open('output/class_indices.p', 'wb'))

        self.legend = get_class_names_for_class_indices(
            list(set(sorted(self.class_indices))))
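The try/except above is a load-or-regenerate cache: read the pickled result if it exists, otherwise compute it and pickle it. A minimal sketch of that pattern in isolation, assuming the cache files are plain pickles:

# Hedged sketch of the cache-or-compute pattern used by __generate_input_data.
import pickle

def cached(path, compute):
    """Return the value cached at path, computing and caching it on a miss."""
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except (IOError, OSError, EOFError):   # no cache yet, or unreadable cache
        value = compute()
        with open(path, 'wb') as f:
            pickle.dump(value, f)
        return value

# e.g. input_data = cached('output/input_data.p', generate_input_data_list)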
Example #4
    def __generate_output_data(self):
        """
        Generate the output data of the DBN so that it can be visualised.
        """
        if not len(self.output_data) == 0:
            return
        try:
            self.output_data = s.load(open('output/output_data.p', 'rb'))
            self.class_indices = s.load(open('output/class_indices.p', 'rb'))
            if not self.classes_to_visualise == None:
                self.__filter_output_data(self.classes_to_visualise)
        except:
            self.output_data = generate_output_for_test_data(
                image_data=self.image_data, binary_output=self.binary_output
            ) if self.testing else generate_output_for_train_data(
                image_data=self.image_data, binary_output=self.binary_output)
            self.class_indices = get_all_class_indices(
                training=False) if self.testing else get_all_class_indices()
            if not self.classes_to_visualise == None:
                self.__filter_output_data(self.classes_to_visualise)
            s.dump([out.tolist() for out in self.output_data],
                   open('output/output_data.p', 'wb'))
            s.dump(self.class_indices, open('output/class_indices.p', 'wb'))

        self.legend = get_class_names_for_class_indices(
            list(set(sorted(self.class_indices))))
Example #5
    def __init__(self, testing=True, binary_output=False):
        """
        @param testing: True if the test data is to be plotted, False otherwise.
        @param binary_output: True if the output of the DBN must be binary.
        """
        if not check_for_data():
            print 'No DBN data or testing data.'
            return

        self.status = -1
        self.output = []
        self.testing = testing
        self.binary_output = binary_output

        try:
            self.output_data = s.load(open('output/output_data.p', 'rb'))
            self.class_indices = s.load(open('output/class_indices.p', 'rb'))
        except:
            self.output_data = generate_output_for_test_data(
                binary_output=self.binary_output) if testing else generate_output_for_train_data(
                binary_output=self.binary_output)
            self.class_indices = get_all_class_indices(training=False) if testing else get_all_class_indices()
            s.dump([out.tolist() for out in self.output_data], open('output/output_data.p', 'wb'))
            s.dump(self.class_indices, open('output/class_indices.p', 'wb'))

        self.output_data = np.array(self.output_data)
def compare_real_data_to_reconstructed_data():
    weights = s.load(open(env_paths.get_dbn_weight_path(),"rb"))
    batches = s.load(open(env_paths.get_batches_path(train=False),"rb"))
    class_indices = s.load(open(env_paths.get_class_indices_path(False,batches[0]).replace(".0",""),"rb"))
    batch = batches[0]
    data = data_processing.get_bag_of_words_matrix(batch,training = False)


    class_to_doc = {}  # At most one representative document per class index.
    for i in range(len(class_indices)):
        idx = class_indices[i]
        if idx in class_to_doc: continue
        class_to_doc[idx] = data[i]
        if len(class_to_doc) >= 10:
            break

    print class_to_doc.keys()

    data_points = class_to_doc.values()

    output_data_points = []
    for d in data_points:
        d = append(d,1.)
        out = generate_output_data(d,weights)
        output_data_points.append(out)

    visualise_data_points(data_points,output_data_points)
Example #7
	def onTurnSelected(self, evt):
		turn = evt.attr1
		#only load if db does not know about this turn
		if not turn in db.db.turns or not db.db.turns[turn]:
			serialization.load(turn, self)
		self.map.turn = turn
		self.map.update()
		log.info('update info panel with turn %s'%(self.map.turn,))
		self.info_panel.update(self.map.turn)
def load_rbm_weights():
    """
    Load the weight matrices from the RBM pretraining.

    @return: the weight matrices, hidden biases and visible biases of the RBM pretraining.
    """
    weights = [array(w) for w in s.load( open( env_paths.get_rbm_weights_path(), "rb" ) )]
    hid_bias = [array(b) for b in s.load( open( env_paths.get_rbm_hidden_biases_path(), "rb" ) )]
    vis_bias = [array(b) for b in s.load( open( env_paths.get_rbm_visible_biases_path(), "rb" ) )]
    return weights,hid_bias,vis_bias
def get_document_class(row, batch, training=True):
    """
    The class of a document corresponding to a row
    in a batch.
    
    @param row: row in the bag of words matrix in batch.
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    class_indices_for_batch = s.load(open(env_paths.get_class_indices_path(training, batch), "rb"))
    class_names_for_batch = s.load(open(env_paths.get_class_names_path(training), "rb"))
    return class_names_for_batch[class_indices_for_batch[row]]
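A small illustration of the two-level lookup above, with inline stand-ins for the two pickled structures (the values are purely illustrative):

# Hedged sketch: class lookup for one row of a batch.
class_indices_for_batch = [2, 0, 1]                       # one class index per document row
class_names_for_batch = ["politics", "sport", "science"]  # index -> class name

row = 0
print(class_names_for_batch[class_indices_for_batch[row]])  # -> "science"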
    def __set_attributes(self):
        """
        Build the attribute list, i.e. the words that act as columns (attributes)
        of the bag of words matrix.

        @return: The generated list of words acting as attributes for the BOWs.
        """
        batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
        length = len(batches)
        attributes = []
        processed = 1
        for batch in batches:
            docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
            tmp_attributes = list(
                set(chain(*docs_list))
            )  # Collect the unique words of this batch's docs list.
            attributes += tmp_attributes
            attributes = list(
                set(attributes)
            )  # Deduplicate so that no word occurs twice in the attribute list.
            if self.acceptance_lst is not None:
                attributes = list(
                    set(attributes).intersection(self.acceptance_lst)
                )  # Only consider words in the acceptance list.
            print "Processed attribute " + str(processed) + " of " + str(length) + " batches"
            processed += 1

        # Find attributes of the most common words.
        d = dict.fromkeys(attributes)
        processed = 1
        for batch in batches:
            docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
            words = list(list(chain(*docs_list)))
            for w in words:
                try:
                    if d[w] is None:
                        d[w] = 1
                    else:
                        d[w] += 1
                except KeyError:
                    continue
            print "Processed summing " + str(processed) + " of " + str(length) + " batches"
            processed += 1
        sorted_att = sorted(d.items(), key=lambda x: x[1])
        sorted_att = sorted_att[len(sorted_att) - self.max_words_matrix :]
        attributes = [elem[0] for elem in sorted_att]

        # Serialize attributes
        s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
        return attributes
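The second half of __set_attributes counts word frequencies by hand and keeps the max_words_matrix most common words. The same selection can be sketched with collections.Counter (the resulting word set matches; ordering differs from the original):

# Hedged sketch: keep the N most frequent words across all documents.
from collections import Counter
from itertools import chain

docs_list = [["a", "b", "a"], ["b", "c"], ["a"]]
max_words_matrix = 2

counts = Counter(chain(*docs_list))
attributes = [word for word, _ in counts.most_common(max_words_matrix)]
# attributes == ['a', 'b']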
def get_attributes(training=True):
    """
    Get the attributes.
    
    @param training: is this the training set or the test set.
    """
    return s.load(open(env_paths.get_attributes_path(training), "rb"))
def get_bag_of_words_matrix(batch, training=True):
    """
    Retrieve the bag of words matrix for a batch.
    
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    return array(s.load(open(env_paths.get_bow_matrix_path(training, int(batch)), "rb")))
def get_batch_list(training=True):
    """
    Retrieve the list containing the batch numbers.

    @param training: is this the training set or the test set.
    """
    return s.load(open(env_paths.get_batches_path(training), "rb"))
def get_weights():
    """
    Retrieve the weights from the generated DBN.

    @return: Weights of the DBN.
    """
    return [array(w) for w in s.load(open(env_paths.get_dbn_weight_path(), "rb" ) )]
Example #17
def load_model(model_path, _log, _run):
    _log.info('Loading model from %s', model_path)
    with open(model_path) as f:
        model = load(f.read())
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    return model
def load_dbn_weights():
    """
    Load the weight matrices from the finetuning.

    @return: the weight matrices of the finetuned DBN.
    """
    return [array(w) for w in s.load(open(env_paths.get_dbn_weight_path(), "rb" ) )]
def get_all_document_names(training=True):
    batches = get_batch_list(training)
    doc_names_collected = []

    for batch in batches:
        doc_names_collected += list(s.load(open(env_paths.get_doc_names_path(training, int(batch)), "rb")))

    return doc_names_collected
Example #23
def load_metadata(save_dir, _log, _run):
    filename = os.path.join(save_dir, METADATA_FILENAME)
    _log.info('Loading metadata from %s', filename)
    with open(filename) as f:
        metadata = load(f.read())
    if SACRED_OBSERVE_FILES:
        _run.add_resource(filename)
    return metadata
Example #25
def load_model(model_path, _log, _run):
    _log.info('Loading model from %s', model_path)
    with open(model_path) as f:
        model = load(f.read())
    assert isinstance(model, HMMSummarizer), 'model is not an HMM summarizer'
    if SAVE_FILES:
        _run.add_resource(model_path)
    return model
def compare_real_data_to_reconstructed_data_random():
    weights = s.load(open(env_paths.get_dbn_weight_path(),"rb"))
    batches = s.load(open(env_paths.get_batches_path(train=False),"rb"))
    batch = choice(batches) # make sure to pick batch at random
    data = data_processing.get_bag_of_words_matrix(batch,training = False)
    # choose 10 data points at random
    data_points = []
    indices = random.randint(0,len(data),10)
    for idx in indices:
        data_points.append(data[idx])

    output_data_points = []
    for d in data_points:
        d = append(d,1.)
        out = generate_output_data(d,weights)
        output_data_points.append(out)

    visualise_data_points(data_points,output_data_points)
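generate_output_data is defined elsewhere in this project; one plausible reading, given that a bias unit of 1.0 is appended to each data point above, is a feed-forward pass through the weight matrices with logistic units. A sketch under that assumption only:

# Hedged sketch only: a feed-forward pass through a list of weight matrices
# with an appended bias unit and logistic activations. The project's real
# generate_output_data may differ (e.g. a linear code layer at the top).
from numpy import append, exp

def forward_sketch(datapoint, weights):
    out = datapoint                               # bias 1.0 already appended
    for w in weights:
        out = 1.0 / (1.0 + exp(-out.dot(w)))      # logistic activation
        out = append(out, 1.0)                    # re-append bias for the next layer
    return out[:-1]                               # drop the trailing bias unit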
def get_document_names(batch, training=True):
    """
    Get document names.
    
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    names = s.load(open(env_paths.get_doc_names_path(training, batch), "rb"))
    return names
def get_document_name(row, batch, training=True):
    """
    The name of the document corresponding to a row
    in a batch.

    @param row: row in the bag of words matrix in batch.
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    return s.load(open(env_paths.get_doc_names_path(training, batch), "rb"))[row]
def get_class_indices(batch, training=True):
    """
    Get all class indices of the documents in a batch.
    
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """

    indices = s.load(open(env_paths.get_class_indices_path(training, batch), "rb"))
    return indices
    def __get_input_data__(self, batch_index, first_layer):
        """
        Retrieve the word-count matrix from HDD.

        @param batch_index: Index of the batch.

        @return: The word-count matrix corresponding to the batch_index.
        """
        if first_layer:
            return DataPreparation.data_processing.get_bag_of_words_matrix(self.batches[batch_index])
        return array(s.load(open(env_paths.get_rbm_output_path(self.num_vis, batch_index, self.layer_index - 1), "rb")))
def get_all_class_indices(training=True):
    """
    Get all class indices for all batches in one list.

    @param training: is this the training set or the test set.
    """
    batches = get_batch_list(training)
    indices_collected = []

    for batch in batches:
        indices_collected += list(s.load(open(env_paths.get_class_indices_path(training, int(batch)), "rb")))

    return indices_collected
Example #42
def handle_load():
    '''
        tries to load, telling user if the savefile does not exist
        :returns: tuple (dmap, player) - deserialized DungeonMap and Player
            or None if the load was unsuccessful
    '''
    if not save_exists():
        dlog.debug('Tried to load when savefile does not exist')
        olog.info('You haven\'t saved it yet!')
        return None

    try:
        deserialized = load()
        dlog.debug('changed state to loaded:')
        olog.info('Loaded your game!\n')

        return deserialized

    except UnpicklingError:
        olog.info('Could not load the save, savefile corrupted')
        return None
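For reference, a self-contained version of the defensive pickle load that handle_load depends on (the game's own save_exists/load helpers are project-specific; the path below is hypothetical):

# Hedged sketch: load a pickled savefile, treating a missing or corrupt
# file as "no save" rather than crashing.
import os
import pickle
from pickle import UnpicklingError

SAVE_PATH = 'savegame.p'   # illustrative path only

def load_save(path=SAVE_PATH):
    if not os.path.exists(path):
        return None                    # nothing saved yet
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)      # e.g. the (dungeon_map, player) tuple
    except UnpicklingError:
        return None                    # corrupted savefile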
def load_large_batch(batch):
    return array(s.load(open(env_paths.get_dbn_large_batch_data_path(batch), 'rb')))
def load_large_batches_lst():
    return s.load(open(env_paths.get_dbn_batches_lst_path(), 'rb'))
Example #45
def evaluate(
    _log,
    _run,
    max_length=None,
    artifacts_dir="artifacts",
    load_params="model.pth",
    word_emb_path="wiki.id.vec",
    device="cpu",
):
    """Evaluate a trained self-attention graph-based parser."""
    if max_length is None:
        max_length = {}

    artifacts_dir = Path(artifacts_dir)

    samples = {}
    try:
        samples["dev"] = list(
            read_samples(which="dev", max_length=max_length.get("dev")))
    except FileNotFoundError:
        _log.info("Dev set is not found, skipping")
    samples["test"] = list(
        read_samples(which="test", max_length=max_length.get("test")))

    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh,
                  n_toks)

    path = artifacts_dir / "vocab.yml"
    _log.info("Loading source vocabulary from %s", path)
    vocab = load(path.read_text(encoding="utf8"))
    for name in vocab.keys():
        _log.info("Found %d %s", len(vocab[name]), name)

    _log.info("Extending vocab with target words")
    old_n_words = len(vocab["words"])
    vocab.extend(chain(*samples.values()), ["words"])
    _log.info("Found %d words now", len(vocab["words"]))

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    path = artifacts_dir / "model.yml"
    _log.info("Loading model from metadata %s", path)
    model = load(path.read_text(encoding="utf8"))

    path = artifacts_dir / load_params
    _log.info("Loading model parameters from %s", path)
    model.load_state_dict(torch.load(path, "cpu"))

    if len(vocab["words"]) > old_n_words:
        _log.info("Creating extended word embedding layer")
        if word_emb_path:
            kv = KeyedVectors.load_word2vec_format(word_emb_path)
            assert model.word_emb.embedding_dim == kv.vector_size
        else:
            _log.warning(
                "Word embedding file not specified; any extra target words will be treated as unks"
            )
            kv = None
        with torch.no_grad():
            model.word_emb = torch.nn.Embedding.from_pretrained(
                extend_word_embedding(
                    model.word_emb.weight,
                    vocab["words"],
                    kv,
                    vocab["words"].index(vocab.UNK_TOKEN),
                ))

    model.to(device)
    dev_accs = {}
    for wh in samples:
        _log.info("Evaluating on %s", wh)
        state = run_eval(model, vocab, samples[wh])
        accs = state["counts"].accs
        if wh == "dev":
            dev_accs = accs
        print_accs(accs, on=wh, run=_run)

        if "type2counts" in state:
            _log.info("Type-wise accuracies:")
            for type_, c in state["type2counts"].items():
                for key, acc in c.accs.items():
                    metric_name = f"{wh}_{type_}_{key}"
                    _log.info(f"{metric_name}: {acc:.2%}")
                    _run.log_scalar(metric_name, acc)

                for suffix in ("", "_nopunct"):
                    metric_name = f"{wh}_{type_}_n_arcs{suffix}"
                    _log.info("%s: %d", metric_name,
                              getattr(c, f"n_arcs{suffix}"))
                    _run.log_scalar(metric_name, getattr(c, f"n_arcs{suffix}"))

    return dev_accs.get("las_nopunct")
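extend_word_embedding is defined elsewhere in this project; a plausible sketch of what such a helper does (keep the trained rows, take pretrained vectors for new words when available, otherwise reuse the UNK row) follows. The assumption that new words are appended after the old vocabulary mirrors how vocab.extend is used above:

# Hedged sketch of extending a word-embedding weight matrix to a larger vocab.
# The project's real extend_word_embedding may differ in its details.
import torch

def extend_word_embedding_sketch(weight, words, kv=None, unk_id=0):
    # weight: (old_vocab, dim) tensor; words: full, extended word list.
    new_weight = torch.randn(len(words), weight.size(1)) * 0.01
    new_weight[: weight.size(0)] = weight                 # keep the trained rows
    for i in range(weight.size(0), len(words)):
        if kv is not None and words[i] in kv:
            new_weight[i] = torch.tensor(kv[words[i]])    # pretrained vector
        else:
            new_weight[i] = weight[unk_id]                # fall back to the UNK row
    return new_weight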
def get_all_class_names():
    """
    Get all class names for the training set.
    """

    return s.load(open(env_paths.get_class_names_path(train=True), "rb"))
    def __read_docs_from_filesystem(self):
        """
        Read all docs and assign them to batches, so that each doc category is represented equally across batches.

        """
        docs_names = []
        docs_names_split = []
        class_indices = []
        class_indices_split = []
        class_names = []
        batches = []

        print "Generating class indices and docs names list."
        doc_count = 0
        for folder in self.paths:
            docs_names_split.append([])
            class_indices_split.append([])
            class_names.append(folder.split("/")[len(folder.split("/")) - 1])
            if self.trainingset_size == None:  # If data processing should be done on all data in the specified folders.
                docs = os.listdir(folder)
            elif (
                not self.trainingset_size == None and self.trainingset_attributes == None
            ):  # If data processing should be done on parts of the docs in the specified folders - for training and testing purposes.
                docs = os.listdir(folder)[: int(len(os.listdir(folder)) * self.trainingset_size)]
            else:  # If data processing should be done on a test set.
                docs = os.listdir(folder)[int(len(os.listdir(folder)) * self.trainingset_size) :]
            for doc in docs:
                if doc.endswith(".p"):
                    # Append the name of the document to the list containing document names.
                    docs_names_split[-1].append(folder + "/" + doc)
                    class_indices_split[-1].append(len(class_names) - 1)
                    doc_count += 1

        if doc_count == 0:  # Check if any docs have been stemmed.
            print "Documents have not been stemmed. Please stem documents in order to create bag of words matrices."
            return 0

        # Ensure that batches contain an even amount of docs from each category.
        print "Arranging the documents."
        if doc_count < self.batchsize:
            print "Number of documents must be greater than batchsize. Please revise the batchsize."
            return 0
        number_of_batches = doc_count / self.batchsize
        number_of_classes = len(self.paths)
        batches_collected_class_indices = []
        batches_collected_docs_names = []

        # Calculate fraction of category in each batch.
        d = {}
        for i in range(len(class_indices_split)):
            d[i] = float(len(class_indices_split[i])) / number_of_batches

        count = 0
        for i in range(number_of_batches):
            batch_class_indices = []
            batch_docs_names = []
            d_tmp = array([int(v) for v in d.values()])
            while True:
                if (
                    (len(batch_class_indices) == self.batchsize)
                    and (not doc_count - count < self.batchsize)
                    or (count == doc_count)
                ):
                    break
                if len(d_tmp[d_tmp > 0]) == 0:
                    break
                for j in range(number_of_classes):
                    if (
                        (len(batch_class_indices) == self.batchsize)
                        and (not doc_count - count < self.batchsize)
                        or (count == doc_count)
                    ):
                        break
                    if len(class_indices_split[j]) > 0 and d_tmp[j] != 0:
                        batch_class_indices.append(class_indices_split[j].pop(0))
                        batch_docs_names.append(docs_names_split[j].pop(0))
                        d_tmp[j] -= 1
                        count += 1
            batches_collected_class_indices.append(batch_class_indices)
            batches_collected_docs_names.append(batch_docs_names)

        for i in range(number_of_batches):
            bsize = self.batchsize if i < number_of_batches - 1 else self.batchsize + (doc_count % self.batchsize)
            batch_class_indices = batches_collected_class_indices[i]
            batch_docs_names = batches_collected_docs_names[i]
            if len(batch_class_indices) < bsize:
                while True:
                    if len(batch_class_indices) == bsize:
                        break
                    for j in range(number_of_classes):
                        if len(batch_class_indices) == bsize:
                            break
                        if len(class_indices_split[j]) > 0:
                            batch_class_indices.append(class_indices_split[j].pop(0))
                            batch_docs_names.append(docs_names_split[j].pop(0))

            # Shuffle the batch
            batch_class_indices_shuf = []
            batch_docs_names_shuf = []
            index_shuf = range(len(batch_class_indices))
            shuffle(index_shuf)
            for k in index_shuf:
                batch_class_indices_shuf.append(batch_class_indices[k])
                batch_docs_names_shuf.append(batch_docs_names[k])

            # Append batch to full lists
            class_indices += batch_class_indices_shuf
            docs_names += batch_docs_names_shuf

        print "Reading and saving docs from file system"
        count = 0
        class_indices_batch = []
        docs_names_batch = []
        docs_list = []
        for i in xrange(len(class_indices)):
            if (
                not count == 0 and (count % self.batchsize) == 0
            ):  # Save the batch if batchsize is reached or if the last document has been read.
                if not (len(class_indices) - count) < self.batchsize:
                    print "Read ", str(count), " of ", len(class_indices)
                    self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
                    batches.append(count)
                    # Reset the lists
                    docs_list = []
                    docs_names_batch = []
                    class_indices_batch = []

            d = s.load(open(docs_names[i], "rb"))
            docs_list.append(d)
            docs_names_batch.append(docs_names[i])
            class_indices_batch.append(class_indices[i])
            count += 1

        # Save the remaining docs
        if len(docs_list) > 0:
            print "Read ", str(count), " of ", len(class_indices)
            self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
            batches.append(count)

        s.dump(class_names, open(env_paths.get_class_names_path(self.training), "wb"))
        s.dump(batches, open(env_paths.get_batches_path(self.training), "wb"))
        return 1
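The arrangement step above aims to give every batch a roughly even mix of document classes. A compact round-robin sketch of the same idea, without the project's special handling of the final, larger batch:

# Hedged sketch: interleave documents class by class, then cut into batches,
# so each batch contains a roughly even mix of classes.
from random import shuffle

def stratified_batches(docs_per_class, batchsize):
    pools = [list(docs) for docs in docs_per_class]   # one pool per class
    interleaved = []
    while any(pools):
        for pool in pools:                            # round-robin over classes
            if pool:
                interleaved.append(pool.pop(0))
    batches = [interleaved[i:i + batchsize]
               for i in range(0, len(interleaved), batchsize)]
    for batch in batches:
        shuffle(batch)                                # shuffle within each batch
    return batches

# e.g. stratified_batches([["a1", "a2"], ["b1", "b2"], ["c1"]], 2)
# -> [['a1', 'b1'], ['c1', 'a2'], ['b2']] (order within each batch is random)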
Example #50
	def __init__(self, parent):
		sz = int(config.options['window']['width']), int(config.options['window']['height'])
		wx.Frame.__init__(self, parent, -1, "dcLord (%s): Divide & Conquer client (www.the-game.ru)"%(version.getVersion(),), style=wx.DEFAULT_FRAME_STYLE | wx.NO_FULL_REPAINT_ON_RESIZE, size=sz)
		
		if int(config.options['window']['is_maximized'])==1:
			self.Maximize()
					
		#import_raw.processAllUnpacked()
		#self.map.turn = db.db.max_turn

		self.log_dlg = wx.TextCtrl(self, 1, style=wx.TE_MULTILINE)
		self.log_dlg.Disable()
		self.log_dlg.SetBackgroundColour('WHITE')
		serialization.load(ev_cb = self)
		
		self.info_panel = planet_window.InfoPanel(self)
		self.object_filter = object_filter.FilterPanel(self)
		self.planet_filter = object_filter.FilterFrame(self)
		#self.unit_list = unit_list.UnitPrototypeListWindow(self, 0)
		self.history = history.HistoryPanel(self)
		#self.area_list = area_panel.AreaListWindow(self)

		self.sync_path = config.options['data']['sync_path']
		self.info_panel.turn = db.getTurn()
		print 'db max turn is %s'%(db.getTurn(),)
		
		self.map = map.Map(self)
		self.map.turn = db.getTurn()
		self.map.set_planet_filter(self.planet_filter)
		print 'map turn is set to %s'%(self.map.turn,)
		self.map.update()

		self.started = False
		self.actions_queue = []
		
		self.pf = None
		
		if self.map.turn != 0:
			self.log('loaded data for turn %d'%(self.map.turn,))
		
		self.pending_actions = request.RequestMaker()
		
		self._mgr = wx.aui.AuiManager(self)
		
		self.command_selected_user = False
		
		info = wx.aui.AuiPaneInfo()
		info.CenterPane()
		info.Fixed()
		info.DefaultPane()
		info.Resizable(True)
		info.CaptionVisible(False)
		
		self._mgr.AddPane(self.map, info)
		self._mgr.AddPane(self.history, wx.RIGHT, "Turn")
		self._mgr.AddPane(self.info_panel, wx.RIGHT, "Info")
		self._mgr.AddPane(self.planet_filter, wx.LEFT, "Planets")
		self._mgr.AddPane(self.object_filter, wx.LEFT, "Filter")
		#self._mgr.AddPane(self.unit_list, wx.RIGHT, "Units")
		self._mgr.AddPane(self.log_dlg, wx.BOTTOM, "Log")
		#self._mgr.AddPane(self.area_list, wx.RIGHT, "Areas")
		
		#self.map.set_planet_fileter(self.planet_filter)
		self._mgr.Update()
		
		
		#TODO: load from data
		self.manual_control_units = set()
		
		#unit id
		self.manual_control_units.add( 7906 )
		self.manual_control_units.add( 7291 ) # probes over Othes planets
		
		#TODO: load from file
		self.exclude_fleet_names = [] #busy, taken, etc...

		#p = config.options['window']['pane-info']
		#if p:
		#	print 'load p %s'%(p,)
		#	self._mgr.LoadPerspective( p )
		
		self.recv_data_callback = {}
		
		self.makeMenu()
		
		self.Bind(event.EVT_DATA_DOWNLOAD, self.onDownloadRawData)
		self.Bind(event.EVT_MAP_UPDATE, self.onMapUpdate)
		self.Bind(event.EVT_USER_SELECT, self.onSelectUser)
		self.Bind(event.EVT_ACTIONS_REPLY, self.onActionsReply)
		self.Bind(event.EVT_SELECT_OBJECT, self.info_panel.selectObject)
		self.Bind(event.EVT_TURN_SELECTED, self.onTurnSelected)
		self.Bind(event.EVT_LOG_APPEND, self.onLog)
	
		#import_raw.processAllUnpacked()
		#serialization.save()
		
		#todo - restore previous state
		#self.Maximize()
		
		self.history.updateTurns(self.map.turn)
Example #51
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", help="environment ID", type=str, default="Walker2d-v2")
    parser.add_argument("-f", "--folder", help="Log folder", type=str, default="logs")
    parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int)
    parser.add_argument("--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int)
    parser.add_argument("--n-envs", help="number of environments", default=1, type=int)
    parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int)
    parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int)
    parser.add_argument(
        "--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)"
    )
    parser.add_argument("--deterministic", action="store_true", default=False, help="Use deterministic actions")
    parser.add_argument(
        "--load-best", action="store_true", default=False, help="Load best model instead of last model if available"
    )
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions")
    parser.add_argument(
        "--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)"
    )
    parser.add_argument("--seed", help="Random generator seed", type=int, default=0)
    parser.add_argument("--reward-log", help="Where to log reward", default="", type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help="Additional external Gym environment package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs", type=str, nargs="+", action=StoreDict, help="Optional keyword argument to pass to the env constructor"
    )
    args = parser.parse_args()

    # Going through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=args.norm_reward, test_mode=True)

    # load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved model under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

    custom_objects = {}
    if newer_python_version:
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    model = load(ALGOS[algo], model_path, env=env, custom_objects=custom_objects, **kwargs)

    obs = env.reset()

    # Deterministic by default except for atari games
    stochastic = args.stochastic or is_atari and not args.deterministic
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    try:
        for _ in range(args.n_timesteps):
            action, state = model.predict(obs, state=state, deterministic=deterministic)
            obs, reward, done, infos = env.step(action)
            if not args.no_render:
                env.render("human")

            episode_reward += reward[0]
            ep_len += 1

            if args.n_envs == 1:
                # For atari the return reward is not the atari score
                # so we have to get it from the infos dict
                if is_atari and infos is not None and args.verbose >= 1:
                    episode_infos = infos[0].get("episode")
                    if episode_infos is not None:
                        print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                        print("Atari Episode Length", episode_infos["l"])

                if done and not is_atari and args.verbose > 0:
                    # NOTE: for env using VecNormalize, the mean reward
                    # is a normalized reward when `--norm_reward` flag is passed
                    print(f"Episode Reward: {episode_reward:.2f}")
                    print("Episode Length", ep_len)
                    episode_rewards.append(episode_reward)
                    episode_lengths.append(ep_len)
                    episode_reward = 0.0
                    ep_len = 0
                    state = None

                # Reset also when the goal is achieved when using HER
                if done and infos[0].get("is_success") is not None:
                    if args.verbose > 1:
                        print("Success?", infos[0].get("is_success", False))

                    if infos[0].get("is_success") is not None:
                        successes.append(infos[0].get("is_success", False))
                        episode_reward, ep_len = 0.0, 0

    except KeyboardInterrupt:
        pass

    episode_rewards.append(episode_reward)
    episode_lengths.append(ep_len)

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"{len(episode_rewards)} Episodes")
        print(f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}")

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}")

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and "Bullet" not in env_id and not is_atari and isinstance(env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
Example #52
def finetune(
    _log,
    _run,
    _rnd,
    max_length=None,
    artifacts_dir="ft_artifacts",
    overwrite=False,
    load_from="artifacts",
    load_params="model.pth",
    device="cpu",
    word_emb_path="wiki.id.vec",
    freeze=False,
    projective=False,
    multiroot=True,
    batch_size=32,
    lr=1e-5,
    l2_coef=1.0,
    max_epoch=5,
):
    """Finetune a trained model with self-training."""
    if max_length is None:
        max_length = {}

    artifacts_dir = Path(artifacts_dir)
    _log.info("Creating artifacts directory %s", artifacts_dir)
    artifacts_dir.mkdir(exist_ok=overwrite)

    samples = {
        wh: list(read_samples(which=wh, max_length=max_length.get(wh)))
        for wh in ["train", "dev", "test"]
    }
    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh,
                  n_toks)

    path = Path(load_from) / "vocab.yml"
    _log.info("Loading vocabulary from %s", path)
    vocab = load(path.read_text(encoding="utf8"))
    for name in vocab:
        _log.info("Found %d %s", len(vocab[name]), name)

    _log.info("Extending vocabulary with target words")
    vocab.extend(chain(*samples.values()), ["words"])
    _log.info("Found %d words now", len(vocab["words"]))

    path = artifacts_dir / "vocab.yml"
    _log.info("Saving vocabulary to %s", path)
    path.write_text(dump(vocab), encoding="utf8")

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    path = Path(load_from) / "model.yml"
    _log.info("Loading model from metadata %s", path)
    model = load(path.read_text(encoding="utf8"))

    path = Path(load_from) / load_params
    _log.info("Loading model parameters from %s", path)
    model.load_state_dict(torch.load(path, "cpu"))

    _log.info("Creating extended word embedding layer")
    kv = KeyedVectors.load_word2vec_format(word_emb_path)
    assert model.word_emb.embedding_dim == kv.vector_size
    with torch.no_grad():
        model.word_emb = torch.nn.Embedding.from_pretrained(
            extend_word_embedding(model.word_emb.weight, vocab["words"], kv))

    path = artifacts_dir / "model.yml"
    _log.info("Saving model metadata to %s", path)
    path.write_text(dump(model), encoding="utf8")

    model.word_emb.requires_grad_(not freeze)
    model.tag_emb.requires_grad_(not freeze)
    model.to(device)

    for wh in ["train"]:
        for i, s in enumerate(samples[wh]):
            s["_id"] = i

        runner = Runner()
        runner.state.update({"st_heads": [], "st_types": [], "_ids": []})
        runner.on(
            Event.BATCH,
            [
                batch2tensors(device, vocab),
                set_train_mode(model, training=False),
                compute_total_arc_type_scores(model, vocab),
                predict_batch(projective, multiroot),
            ],
        )

        @runner.on(Event.BATCH)
        def save_st_trees(state):
            state["st_heads"].extend(state["pred_heads"].tolist())
            state["st_types"].extend(state["pred_types"].tolist())
            state["_ids"].extend(state["batch"]["_id"].tolist())
            state["n_items"] = state["batch"]["words"].numel()

        n_toks = sum(len(s["words"]) for s in samples[wh])
        ProgressBar(total=n_toks, unit="tok").attach_on(runner)

        _log.info("Computing ST trees for %s set", wh)
        with torch.no_grad():
            runner.run(
                BucketIterator(samples[wh], lambda s: len(s["words"]),
                               batch_size))

        assert len(runner.state["st_heads"]) == len(samples[wh])
        assert len(runner.state["st_types"]) == len(samples[wh])
        assert len(runner.state["_ids"]) == len(samples[wh])
        for i, st_heads, st_types in zip(runner.state["_ids"],
                                         runner.state["st_heads"],
                                         runner.state["st_types"]):
            assert len(samples[wh][i]["words"]) == len(st_heads)
            assert len(samples[wh][i]["words"]) == len(st_types)
            samples[wh][i]["st_heads"] = st_heads
            samples[wh][i]["st_types"] = st_types

    _log.info("Creating optimizer")
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    finetuner = Runner()
    origin_params = {
        name: p.clone().detach()
        for name, p in model.named_parameters()
    }
    finetuner.on(
        Event.BATCH,
        [
            batch2tensors(device, vocab),
            set_train_mode(model),
            compute_l2_loss(model, origin_params),
        ],
    )

    @finetuner.on(Event.BATCH)
    def compute_loss(state):
        bat = state["batch"]
        words, tags, heads, types = bat["words"], bat["tags"], bat[
            "st_heads"], bat["st_types"]
        mask = bat["mask"]

        arc_scores, type_scores = model(words, tags, mask, heads)
        arc_scores = arc_scores.masked_fill(~mask.unsqueeze(2),
                                            -1e9)  # mask padding heads
        type_scores[..., vocab["types"].index(vocab.PAD_TOKEN)] = -1e9

        # remove root
        arc_scores, type_scores = arc_scores[:, :, 1:], type_scores[:, 1:]
        heads, types, mask = heads[:, 1:], types[:, 1:], mask[:, 1:]

        arc_scores = rearrange(arc_scores,
                               "bsz slen1 slen2 -> (bsz slen2) slen1")
        heads = heads.reshape(-1)
        arc_loss = torch.nn.functional.cross_entropy(arc_scores,
                                                     heads,
                                                     reduction="none")

        type_scores = rearrange(type_scores,
                                "bsz slen ntypes -> (bsz slen) ntypes")
        types = types.reshape(-1)
        type_loss = torch.nn.functional.cross_entropy(type_scores,
                                                      types,
                                                      reduction="none")

        arc_loss = arc_loss.masked_select(mask.reshape(-1)).mean()
        type_loss = type_loss.masked_select(mask.reshape(-1)).mean()
        loss = arc_loss + type_loss + l2_coef * state["l2_loss"]

        state["loss"] = loss
        state["stats"] = {
            "arc_ppl": arc_loss.exp().item(),
            "type_ppl": type_loss.exp().item(),
            "l2_loss": state["l2_loss"].item(),
        }
        state["extra_stats"] = {
            "arc_loss": arc_loss.item(),
            "type_loss": type_loss.item()
        }

    finetuner.on(
        Event.BATCH,
        [
            get_n_items(),
            update_params(opt),
            log_grads(_run, model),
            log_stats(_run)
        ],
    )

    @finetuner.on(Event.EPOCH_FINISHED)
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])
        state["dev_accs"] = accs

    @finetuner.on(Event.EPOCH_FINISHED)
    def maybe_eval_on_test(state):
        if state["epoch"] != max_epoch:
            return

        _log.info("Evaluating on test")
        eval_state = run_eval(model, vocab, samples["test"])
        print_accs(eval_state["counts"].accs,
                   on="test",
                   run=_run,
                   step=state["n_iters"])

    finetuner.on(Event.EPOCH_FINISHED,
                 save_state_dict("model", model, under=artifacts_dir))

    EpochTimer().attach_on(finetuner)
    n_tokens = sum(len(s["words"]) for s in samples["train"])
    ProgressBar(stats="stats", total=n_tokens, unit="tok").attach_on(finetuner)

    bucket_key = lambda s: (len(s["words"]) - 1) // 10
    trn_iter = ShuffleIterator(
        BucketIterator(samples["train"],
                       bucket_key,
                       batch_size,
                       shuffle_bucket=True,
                       rng=_rnd),
        rng=_rnd,
    )
    _log.info("Starting finetuning")
    try:
        finetuner.run(trn_iter, max_epoch)
    except KeyboardInterrupt:
        _log.info("Interrupt detected, training will abort")
    else:
        return finetuner.state["dev_accs"]["las_nopunct"]
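compute_l2_loss above penalises drift away from the pre-finetuning parameters (origin_params). A minimal sketch of that regulariser, assuming a plain squared-L2 penalty between the current and original parameters:

# Hedged sketch of the L2-to-origin penalty used during self-training finetuning.
import torch

def l2_to_origin(model, origin_params):
    loss = 0.0
    for name, p in model.named_parameters():
        if p.requires_grad:
            loss = loss + ((p - origin_params[name]) ** 2).sum()
    return loss

# Combined as in compute_loss above: loss = arc_loss + type_loss + l2_coef * l2_to_origin(...)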
Example #53
def finetune(
    corpus,
    _log,
    _run,
    _rnd,
    max_length=None,
    artifacts_dir="ft_artifacts",
    load_samples_from=None,
    overwrite=False,
    load_src=None,
    src_key_as_lang=False,
    main_src=None,
    device="cpu",
    word_emb_path="wiki.id.vec",
    freeze=False,
    thresh=0.95,
    projective=False,
    multiroot=True,
    batch_size=32,
    save_samples=False,
    lr=1e-5,
    l2_coef=1.0,
    max_epoch=5,
):
    """Finetune a trained model with PPTX."""
    if max_length is None:
        max_length = {}
    if load_src is None:
        load_src = {"src": ("artifacts", "model.pth")}
        main_src = "src"
    elif main_src not in load_src:
        raise ValueError(f"{main_src} not found in load_src")

    artifacts_dir = Path(artifacts_dir)
    _log.info("Creating artifacts directory %s", artifacts_dir)
    artifacts_dir.mkdir(exist_ok=overwrite)

    if load_samples_from:
        _log.info("Loading samples from %s", load_samples_from)
        with open(load_samples_from, "rb") as f:
            samples = pickle.load(f)
    else:
        samples = {
            wh: list(read_samples(which=wh, max_length=max_length.get(wh)))
            for wh in ["train", "dev", "test"]
        }
    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh,
                  n_toks)

    kv = KeyedVectors.load_word2vec_format(word_emb_path)

    if load_samples_from:
        _log.info(
            "Skipping non-main srcs; loaded samples are already processed")
        srcs = []
    else:
        srcs = [src for src in load_src if src != main_src]
        if src_key_as_lang and corpus["lang"] in srcs:
            _log.info("Removing %s from src parsers because it's the tgt",
                      corpus["lang"])
            srcs.remove(corpus["lang"])
    srcs.append(main_src)

    for src_i, src in enumerate(srcs):
        _log.info("Processing src %s [%d/%d]", src, src_i + 1, len(srcs))
        load_from, load_params = load_src[src]
        path = Path(load_from) / "vocab.yml"
        _log.info("Loading %s vocabulary from %s", src, path)
        vocab = load(path.read_text(encoding="utf8"))
        for name in vocab:
            _log.info("Found %d %s", len(vocab[name]), name)

        _log.info("Extending %s vocabulary with target words", src)
        vocab.extend(chain(*samples.values()), ["words"])
        _log.info("Found %d words now", len(vocab["words"]))

        samples_ = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

        path = Path(load_from) / "model.yml"
        _log.info("Loading %s model from metadata %s", src, path)
        model = load(path.read_text(encoding="utf8"))

        path = Path(load_from) / load_params
        _log.info("Loading %s model parameters from %s", src, path)
        model.load_state_dict(torch.load(path, "cpu"))

        _log.info("Creating %s extended word embedding layer", src)
        assert model.word_emb.embedding_dim == kv.vector_size
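        # Swap in an embedding matrix covering the extended vocabulary;
        # extend_word_embedding presumably keeps the pretrained rows and
        # fills new words from the loaded word vectors.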
        with torch.no_grad():
            model.word_emb = torch.nn.Embedding.from_pretrained(
                extend_word_embedding(model.word_emb.weight, vocab["words"],
                                      kv))
        model.to(device)

        for wh in ["train", "dev"]:
            if load_samples_from:
                assert all("pptx_mask" in s for s in samples[wh])
                continue

            for i, s in enumerate(samples_[wh]):
                s["_id"] = i

            runner = Runner()
            runner.state.update({"pptx_masks": [], "_ids": []})
            runner.on(
                Event.BATCH,
                [
                    batch2tensors(device, vocab),
                    set_train_mode(model, training=False),
                    compute_total_arc_type_scores(model, vocab),
                ],
            )

            @runner.on(Event.BATCH)
            def compute_pptx_ambiguous_arcs_mask(state):
                assert state["batch"]["mask"].all()
                scores = state["total_arc_type_scores"]
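                # arcs this source parser still considers plausible,
                # controlled by `thresh`; stored per sample via its _id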
                pptx_mask = compute_ambiguous_arcs_mask(
                    scores, thresh, projective, multiroot)
                state["pptx_masks"].extend(pptx_mask)
                state["_ids"].extend(state["batch"]["_id"].tolist())
                state["n_items"] = state["batch"]["words"].numel()

            n_toks = sum(len(s["words"]) for s in samples_[wh])
            ProgressBar(total=n_toks, unit="tok").attach_on(runner)

            _log.info(
                "Computing PPTX ambiguous arcs mask for %s set with source %s",
                wh, src)
            with torch.no_grad():
                runner.run(
                    BucketIterator(samples_[wh], lambda s: len(s["words"]),
                                   batch_size))

            assert len(runner.state["pptx_masks"]) == len(samples_[wh])
            assert len(runner.state["_ids"]) == len(samples_[wh])
            for i, pptx_mask in zip(runner.state["_ids"],
                                    runner.state["pptx_masks"]):
                samples_[wh][i]["pptx_mask"] = pptx_mask.tolist()

            _log.info("Computing (log) number of trees stats on %s set", wh)
            report_log_ntrees_stats(samples_[wh], "pptx_mask", batch_size,
                                    projective, multiroot)

            _log.info("Combining the ambiguous arcs mask")
            assert len(samples_[wh]) == len(samples[wh])
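            # union the masks across source parsers: an arc allowed by any
            # source stays allowed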
            for i in range(len(samples_[wh])):
                pptx_mask = torch.tensor(samples_[wh][i]["pptx_mask"])
                assert pptx_mask.dim() == 3
                if "pptx_mask" in samples[wh][i]:
                    old_mask = torch.tensor(samples[wh][i]["pptx_mask"])
                else:
                    old_mask = torch.zeros(1, 1, 1).bool()
                samples[wh][i]["pptx_mask"] = (old_mask | pptx_mask).tolist()

    assert src == main_src
    _log.info("Main source is %s", src)

    path = artifacts_dir / "vocab.yml"
    _log.info("Saving vocabulary to %s", path)
    path.write_text(dump(vocab), encoding="utf8")

    path = artifacts_dir / "model.yml"
    _log.info("Saving model metadata to %s", path)
    path.write_text(dump(model), encoding="utf8")

    if save_samples:
        path = artifacts_dir / "samples.pkl"
        _log.info("Saving samples to %s", path)
        with open(path, "wb") as f:
            pickle.dump(samples, f)

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    for wh in ["train", "dev"]:
        _log.info("Computing (log) number of trees stats on %s set", wh)
        report_log_ntrees_stats(samples[wh], "pptx_mask", batch_size,
                                projective, multiroot)

    model.word_emb.requires_grad_(not freeze)
    model.tag_emb.requires_grad_(not freeze)

    _log.info("Creating optimizer")
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    finetuner = Runner()
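    # snapshot the pretrained parameters so compute_l2_loss can penalize
    # drift away from them (scaled by l2_coef)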
    origin_params = {
        name: p.clone().detach()
        for name, p in model.named_parameters()
    }
    finetuner.on(
        Event.BATCH,
        [
            batch2tensors(device, vocab),
            set_train_mode(model),
            compute_l2_loss(model, origin_params),
            compute_total_arc_type_scores(model, vocab),
        ],
    )

    @finetuner.on(Event.BATCH)
    def compute_loss(state):
        mask = state["batch"]["mask"]
        pptx_mask = state["batch"]["pptx_mask"].bool()
        scores = state["total_arc_type_scores"]

        pptx_loss = compute_aatrn_loss(scores, pptx_mask, mask, projective,
                                       multiroot)
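        # normalize by the number of sentences in the batch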
        pptx_loss /= mask.size(0)
        loss = pptx_loss + l2_coef * state["l2_loss"]

        state["loss"] = loss
        state["stats"] = {
            "pptx_loss": pptx_loss.item(),
            "l2_loss": state["l2_loss"].item(),
        }
        state["extra_stats"] = {"loss": loss.item()}
        state["n_items"] = mask.long().sum().item()

    finetuner.on(Event.BATCH,
                 [update_params(opt),
                  log_grads(_run, model),
                  log_stats(_run)])

    @finetuner.on(Event.EPOCH_FINISHED)
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])

        pptx_loss = eval_state["mean_pptx_loss"]
        _log.info("dev_pptx_loss: %.4f", pptx_loss)
        _run.log_scalar("dev_pptx_loss", pptx_loss, step=state["n_iters"])

        state["dev_accs"] = accs

    @finetuner.on(Event.EPOCH_FINISHED)
    def maybe_eval_on_test(state):
        if state["epoch"] != max_epoch:
            return

        _log.info("Evaluating on test")
        eval_state = run_eval(model,
                              vocab,
                              samples["test"],
                              compute_loss=False)
        print_accs(eval_state["counts"].accs,
                   on="test",
                   run=_run,
                   step=state["n_iters"])

    finetuner.on(Event.EPOCH_FINISHED,
                 save_state_dict("model", model, under=artifacts_dir))

    EpochTimer().attach_on(finetuner)
    n_tokens = sum(len(s["words"]) for s in samples["train"])
    ProgressBar(stats="stats", total=n_tokens, unit="tok").attach_on(finetuner)

    bucket_key = lambda s: (len(s["words"]) - 1) // 10
    trn_iter = ShuffleIterator(
        BucketIterator(samples["train"],
                       bucket_key,
                       batch_size,
                       shuffle_bucket=True,
                       rng=_rnd),
        rng=_rnd,
    )
    _log.info("Starting finetuning")
    try:
        finetuner.run(trn_iter, max_epoch)
    except KeyboardInterrupt:
        _log.info("Interrupt detected, training will abort")
    else:
        return finetuner.state["dev_accs"]["las_nopunct"]
Example #54
def finetune(
    _log,
    _run,
    _rnd,
    max_length=None,
    artifacts_dir="ft_artifacts",
    overwrite=False,
    load_from="artifacts",
    load_params="model.pth",
    device="cpu",
    word_emb_path="wiki.id.vec",
    freeze=False,
    thresh=0.95,
    projective=False,
    multiroot=True,
    batch_size=32,
    lr=1e-5,
    l2_coef=1.0,
    max_epoch=5,
):
    """Finetune a trained model with PPT."""
    if max_length is None:
        max_length = {}

    artifacts_dir = Path(artifacts_dir)
    _log.info("Creating artifacts directory %s", artifacts_dir)
    artifacts_dir.mkdir(exist_ok=overwrite)

    samples = {
        wh: list(read_samples(which=wh, max_length=max_length.get(wh)))
        for wh in ["train", "dev", "test"]
    }
    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh,
                  n_toks)

    path = Path(load_from) / "vocab.yml"
    _log.info("Loading vocabulary from %s", path)
    vocab = load(path.read_text(encoding="utf8"))
    for name in vocab:
        _log.info("Found %d %s", len(vocab[name]), name)

    _log.info("Extending vocabulary with target words")
    vocab.extend(chain(*samples.values()), ["words"])
    _log.info("Found %d words now", len(vocab["words"]))

    path = artifacts_dir / "vocab.yml"
    _log.info("Saving vocabulary to %s", path)
    path.write_text(dump(vocab), encoding="utf8")

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    path = Path(load_from) / "model.yml"
    _log.info("Loading model from metadata %s", path)
    model = load(path.read_text(encoding="utf8"))

    path = Path(load_from) / load_params
    _log.info("Loading model parameters from %s", path)
    model.load_state_dict(torch.load(path, "cpu"))

    _log.info("Creating extended word embedding layer")
    kv = KeyedVectors.load_word2vec_format(word_emb_path)
    assert model.word_emb.embedding_dim == kv.vector_size
    with torch.no_grad():
        model.word_emb = torch.nn.Embedding.from_pretrained(
            extend_word_embedding(model.word_emb.weight, vocab["words"], kv))

    path = artifacts_dir / "model.yml"
    _log.info("Saving model metadata to %s", path)
    path.write_text(dump(model), encoding="utf8")

    model.word_emb.requires_grad_(not freeze)
    model.tag_emb.requires_grad_(not freeze)
    model.to(device)

    for wh in ["train", "dev"]:
        for i, s in enumerate(samples[wh]):
            s["_id"] = i

        runner = Runner()
        runner.state.update({"ppt_masks": [], "_ids": []})
        runner.on(
            Event.BATCH,
            [
                batch2tensors(device, vocab),
                set_train_mode(model, training=False),
                compute_total_arc_type_scores(model, vocab),
            ],
        )

        @runner.on(Event.BATCH)
        def compute_ppt_ambiguous_arcs_mask(state):
            assert state["batch"]["mask"].all()
            scores = state["total_arc_type_scores"]
            ppt_mask = compute_ambiguous_arcs_mask(scores, thresh, projective,
                                                   multiroot)
            state["ppt_masks"].extend(ppt_mask.tolist())
            state["_ids"].extend(state["batch"]["_id"].tolist())
            state["n_items"] = state["batch"]["words"].numel()

        n_toks = sum(len(s["words"]) for s in samples[wh])
        ProgressBar(total=n_toks, unit="tok").attach_on(runner)

        _log.info("Computing PPT ambiguous arcs mask for %s set", wh)
        with torch.no_grad():
            runner.run(
                BucketIterator(samples[wh], lambda s: len(s["words"]),
                               batch_size))

        assert len(runner.state["ppt_masks"]) == len(samples[wh])
        assert len(runner.state["_ids"]) == len(samples[wh])
        for i, ppt_mask in zip(runner.state["_ids"],
                               runner.state["ppt_masks"]):
            samples[wh][i]["ppt_mask"] = ppt_mask

        _log.info("Computing (log) number of trees stats on %s set", wh)
        report_log_ntrees_stats(samples[wh], "ppt_mask", batch_size,
                                projective, multiroot)

    _log.info("Creating optimizer")
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    finetuner = Runner()
    origin_params = {
        name: p.clone().detach()
        for name, p in model.named_parameters()
    }
    finetuner.on(
        Event.BATCH,
        [
            batch2tensors(device, vocab),
            set_train_mode(model),
            compute_l2_loss(model, origin_params),
            compute_total_arc_type_scores(model, vocab),
        ],
    )

    @finetuner.on(Event.BATCH)
    def compute_loss(state):
        mask = state["batch"]["mask"]
        ppt_mask = state["batch"]["ppt_mask"].bool()
        scores = state["total_arc_type_scores"]

        ppt_loss = compute_aatrn_loss(scores, ppt_mask, mask, projective,
                                      multiroot)
        ppt_loss /= mask.size(0)
        loss = ppt_loss + l2_coef * state["l2_loss"]

        state["loss"] = loss
        state["stats"] = {
            "ppt_loss": ppt_loss.item(),
            "l2_loss": state["l2_loss"].item(),
        }
        state["extra_stats"] = {"loss": loss.item()}
        state["n_items"] = mask.long().sum().item()

    finetuner.on(Event.BATCH,
                 [update_params(opt),
                  log_grads(_run, model),
                  log_stats(_run)])

    @finetuner.on(Event.EPOCH_FINISHED)
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])

        ppt_loss = eval_state["mean_ppt_loss"]
        _log.info("dev_ppt_loss: %.4f", ppt_loss)
        _run.log_scalar("dev_ppt_loss", ppt_loss, step=state["n_iters"])

        state["dev_accs"] = accs

    @finetuner.on(Event.EPOCH_FINISHED)
    def maybe_eval_on_test(state):
        if state["epoch"] != max_epoch:
            return

        _log.info("Evaluating on test")
        eval_state = run_eval(model,
                              vocab,
                              samples["test"],
                              compute_loss=False)
        print_accs(eval_state["counts"].accs,
                   on="test",
                   run=_run,
                   step=state["n_iters"])

    finetuner.on(Event.EPOCH_FINISHED,
                 save_state_dict("model", model, under=artifacts_dir))

    EpochTimer().attach_on(finetuner)
    n_tokens = sum(len(s["words"]) for s in samples["train"])
    ProgressBar(stats="stats", total=n_tokens, unit="tok").attach_on(finetuner)

    bucket_key = lambda s: (len(s["words"]) - 1) // 10
    trn_iter = ShuffleIterator(
        BucketIterator(samples["train"],
                       bucket_key,
                       batch_size,
                       shuffle_bucket=True,
                       rng=_rnd),
        rng=_rnd,
    )
    _log.info("Starting finetuning")
    try:
        finetuner.run(trn_iter, max_epoch)
    except KeyboardInterrupt:
        _log.info("Interrupt detected, training will abort")
    else:
        return finetuner.state["dev_accs"]["las_nopunct"]
Example #55
def train(
    _log,
    _run,
    _rnd,
    artifacts_dir="artifacts",
    overwrite=False,
    max_length=None,
    load_types_vocab_from=None,
    batch_size=16,
    device="cpu",
    lr=0.001,
    patience=5,
    max_epoch=1000,
):
    """Train a self-attention graph-based parser."""
    if max_length is None:
        max_length = {}

    artifacts_dir = Path(artifacts_dir)
    _log.info("Creating artifacts directory %s", artifacts_dir)
    artifacts_dir.mkdir(exist_ok=overwrite)

    samples = {
        wh: list(read_samples(which=wh, max_length=max_length.get(wh)))
        for wh in ["train", "dev", "test"]
    }
    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh,
                  n_toks)

    _log.info("Creating vocabulary")
    vocab = Vocab.from_samples(chain(*samples.values()))
    if load_types_vocab_from:
        path = Path(load_types_vocab_from)
        _log.info("Loading types vocab from %s", path)
        vocab["types"] = load(path.read_text(encoding="utf8"))["types"]

    _log.info("Vocabulary created")
    for name in vocab:
        _log.info("Found %d %s", len(vocab[name]), name)

    path = artifacts_dir / "vocab.yml"
    _log.info("Saving vocabulary to %s", path)
    path.write_text(dump(vocab), encoding="utf8")

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    model = make_model(vocab)
    model.to(device)

    _log.info("Creating optimizer")
    opt = torch.optim.Adam(model.parameters(), lr=lr)
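    # halve the LR when dev LAS (las_nopunct) stops improving; stepped in
    # eval_on_dev below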
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt,
                                                           mode="max",
                                                           factor=0.5)

    trainer = Runner()
    trainer.state.update({"dev_larcs_nopunct": -1, "dev_uarcs_nopunct": -1})
    trainer.on(Event.BATCH,
               [batch2tensors(device, vocab),
                set_train_mode(model)])

    @trainer.on(Event.BATCH)
    def compute_loss(state):
        bat = state["batch"]
        words, tags, heads, types = bat["words"], bat["tags"], bat[
            "heads"], bat["types"]
        mask = bat["mask"]

        arc_scores, type_scores = model(words, tags, mask, heads)
        arc_scores = arc_scores.masked_fill(~mask.unsqueeze(2),
                                            -1e9)  # mask padding heads
        type_scores[..., vocab["types"].index(Vocab.PAD_TOKEN)] = -1e9

        # remove root
        arc_scores, type_scores = arc_scores[:, :, 1:], type_scores[:, 1:]
        heads, types, mask = heads[:, 1:], types[:, 1:], mask[:, 1:]

        arc_scores = rearrange(arc_scores,
                               "bsz slen1 slen2 -> (bsz slen2) slen1")
        heads = heads.reshape(-1)
        arc_loss = torch.nn.functional.cross_entropy(arc_scores,
                                                     heads,
                                                     reduction="none")

        type_scores = rearrange(type_scores,
                                "bsz slen ntypes -> (bsz slen) ntypes")
        types = types.reshape(-1)
        type_loss = torch.nn.functional.cross_entropy(type_scores,
                                                      types,
                                                      reduction="none")

        arc_loss = arc_loss.masked_select(mask.reshape(-1)).mean()
        type_loss = type_loss.masked_select(mask.reshape(-1)).mean()
        loss = arc_loss + type_loss

        state["loss"] = loss
        arc_loss, type_loss = arc_loss.item(), type_loss.item()
        state["stats"] = {
            "arc_ppl": math.exp(arc_loss),
            "type_ppl": math.exp(type_loss),
        }
        state["extra_stats"] = {"arc_loss": arc_loss, "type_loss": type_loss}
        state["n_items"] = bat["mask"].long().sum().item()

    trainer.on(Event.BATCH,
               [update_params(opt),
                log_grads(_run, model),
                log_stats(_run)])

    @trainer.on(Event.EPOCH_FINISHED)
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])

        scheduler.step(accs["las_nopunct"])

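        # a new best needs more correct labeled arcs (excl. punctuation);
        # ties are broken by unlabeled arcs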
        if eval_state["counts"].larcs_nopunct > state["dev_larcs_nopunct"]:
            state["better"] = True
        elif eval_state["counts"].larcs_nopunct < state["dev_larcs_nopunct"]:
            state["better"] = False
        elif eval_state["counts"].uarcs_nopunct > state["dev_uarcs_nopunct"]:
            state["better"] = True
        else:
            state["better"] = False

        if state["better"]:
            _log.info("Found new best result on dev!")
            state["dev_larcs_nopunct"] = eval_state["counts"].larcs_nopunct
            state["dev_uarcs_nopunct"] = eval_state["counts"].uarcs_nopunct
            state["dev_accs"] = accs
            state["dev_epoch"] = state["epoch"]
        else:
            _log.info("Not better, the best so far is epoch %d:",
                      state["dev_epoch"])
            print_accs(state["dev_accs"])
            print_accs(state["test_accs"], on="test")

    @trainer.on(Event.EPOCH_FINISHED)
    def maybe_eval_on_test(state):
        if not state["better"]:
            return

        _log.info("Evaluating on test")
        eval_state = run_eval(model, vocab, samples["test"])
        state["test_accs"] = eval_state["counts"].accs
        print_accs(state["test_accs"],
                   on="test",
                   run=_run,
                   step=state["n_iters"])

    trainer.on(
        Event.EPOCH_FINISHED,
        [
            maybe_stop_early(patience=patience),
            save_state_dict("model", model, under=artifacts_dir,
                            when="better"),
        ],
    )

    EpochTimer().attach_on(trainer)
    n_tokens = sum(len(s["words"]) for s in samples["train"])
    ProgressBar(stats="stats", total=n_tokens, unit="tok").attach_on(trainer)

    bucket_key = lambda s: (len(s["words"]) - 1) // 10
    trn_iter = ShuffleIterator(
        BucketIterator(samples["train"],
                       bucket_key,
                       batch_size,
                       shuffle_bucket=True,
                       rng=_rnd),
        rng=_rnd,
    )
    _log.info("Starting training")
    try:
        trainer.run(trn_iter, max_epoch)
    except KeyboardInterrupt:
        _log.info("Interrupt detected, training will abort")
    else:
        return trainer.state["dev_accs"]["las_nopunct"]