Example No. 1
    def __init__(self, embedding, units, softmax, timestep, x_word, y_word):
        self.embedding = check.check_instance(embedding, HiddenState)
        self.units = check.check_dict(units)
        self.softmax = check.check_instance(softmax, LabelDistribution)
        self.timestep = timestep
        self.x_word = x_word
        self.y_word = y_word
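
The `check` module that all of these examples rely on is not included in this listing. As a point of reference, here is a minimal sketch of what the two helpers used above might look like, assuming each check raises on failure and otherwise returns its argument unchanged (an assumption, not the actual implementation):

def check_instance(value, expected_type):
    # Assumed behavior: raise if the value is not an instance of the
    # expected type, otherwise return it so the call can be inlined.
    if not isinstance(value, expected_type):
        raise TypeError("expected %s, found %s" % (expected_type, type(value)))

    return value


def check_dict(value):
    # Assumed behavior: same pass-through pattern, specialized to dict.
    return check_instance(value, dict)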
Example No. 2
    def __init__(self, model_dir, versions={}, latest=None, step=-1):
        self.model_dir = check.check_instance(model_dir, str)
        self.save_path = self.get_save_path(self.model_dir)
        self.versions = check.check_instance(versions, dict)
        self.latest = latest
        self.step = check.check_instance(step, int)
        self.next_step = self.step + 1
Example No. 3
    def train(self, xys_stream, training_parameters):
        check.check_instance(training_parameters, mlbase.TrainingParameters)
        slot_length = len(str(training_parameters.epochs())) - 1
        epoch_template = "[%s] Epoch training {:%dd}: (loss, perplexity): {:.6f}, {:.6f}" % (
            self.scope, slot_length)
        final_loss = None
        epochs_tenth = max(1, int(training_parameters.epochs() / 10))
        losses = training_parameters.losses()
        finished = False
        epoch = -1

        while not finished:
            epoch += 1
            epoch_loss = 0
            # Start at a different offset for every epoch to help avoid overfitting.
            offset = random.randint(0, training_parameters.batch() - 1)
            batch = []
            first = True
            batch_set = False
            count = 0

            for xy in xys_stream():
                batch += [xy]

                if first and len(batch) == offset:
                    first = False
                    batch_set = True
                elif len(batch) == training_parameters.batch():
                    batch_set = True

                if batch_set:
                    count += len(batch)
                    feed = self.get_training_feed(batch, training_parameters)
                    _, training_loss = self.session.run(
                        [self.updates, self.cost], feed_dict=feed)
                    epoch_loss += training_loss
                    batch_set = False
                    batch = []

            if len(batch) > 0:
                count += len(batch)
                feed = self.get_training_feed(batch, training_parameters)
                _, training_loss = self.session.run([self.updates, self.cost],
                                                    feed_dict=feed)
                epoch_loss += training_loss

            epoch_loss /= count
            epoch_perplexity = math.exp(epoch_loss)
            losses.append(epoch_loss)
            finished, reason = training_parameters.finished(epoch, losses)

            if not finished and epoch % epochs_tenth == 0:
                logging.debug(
                    epoch_template.format(epoch, epoch_loss, epoch_perplexity))

        logging.debug(
            epoch_template.format(epoch, epoch_loss, epoch_perplexity))
        logging.debug("Training on %d instances finished due to %s (%s)." %
                      (count, reason, losses))
        return epoch_loss, -epoch_perplexity
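
The batching above emits one short batch of length `offset` at the start of each epoch and full batches afterwards, so batch boundaries shift from epoch to epoch. A self-contained sketch of that pattern, with a plain `batch_size` argument standing in for `training_parameters.batch()`:

import random


def shifted_batches(stream, batch_size):
    # Yield a short first batch of random length (skipped entirely when the
    # random offset is 0), then full batches, then whatever remains.
    offset = random.randint(0, batch_size - 1)
    batch = []
    first = True

    for item in stream:
        batch.append(item)

        if (first and len(batch) == offset) or len(batch) == batch_size:
            first = False
            yield batch
            batch = []

    if len(batch) > 0:
        yield batch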
Example No. 4
    def __init__(self, all_nodes, kind):
        if len(all_nodes) > 0:
            identifier_class = all_nodes[0].identifier.__class__

            for node in all_nodes:
                check.check_instance(node.identifier, identifier_class)
                check.check_equal(node.finalized, True)

        self.all_nodes = all_nodes
        self.kind = check.check_one_of(kind,
                                       [Graph.DIRECTED, Graph.UNDIRECTED])
        self.log_len = math.log10(len(self.all_nodes) + 1)
        self.indexes = {}

        for node in self.all_nodes:
            self.indexes[node.identifier] = node

        self.clustering_coefficients = self._calculate_clustering_coefficients(
        )
        self._distances = {}
        self._max_distances = {}
        self._global_max_distance = None
        self._background_calculations = threading.Thread(
            target=self._submit_calculations)
        self._background_calculations.daemon = True
        self._background_calculations.start()
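
The graph above kicks off its distance calculations on a daemon thread so that construction returns immediately. A minimal sketch of that pattern with a stand-in computation (the real `_submit_calculations` is not shown in this listing):

import threading


class BackgroundComputation:
    def __init__(self, items):
        self.items = items
        self.results = {}
        # A daemon thread never blocks interpreter shutdown, so long-running
        # precomputation can proceed while the object is already usable.
        self._worker = threading.Thread(target=self._compute)
        self._worker.daemon = True
        self._worker.start()

    def _compute(self):
        # Stand-in for the expensive work done in the background.
        for item in self.items:
            self.results[item] = item * item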
Example No. 5
    def __init__(self, sequence, predicted, expected):
        self.sequence = check.check_not_empty(sequence)
        self.predicted = check.check_instance(predicted, SequenceStats)
        self.expected = check.check_instance(expected, SequenceStats)
        check.check_length(self.predicted.values, len(self.sequence))
        check.check_length(self.expected.values, len(self.sequence))
        self.perplexity = perplexity(self.expected.probabilities)
Example No. 6
    def to_lemma(self, inflection_term):
        self._finalize()
        check.check_instance(inflection_term, Term)

        try:
            return self.inflections[inflection_term]
        except KeyError:
            return self.inflections[inflection_term.lower()]
Example No. 7
    def __init__(self, save_dir, versions={}, latest=None, next_step=0):
        self.save_dir = check.check_instance(save_dir, str)
        self.savepoints_file = os.path.join(self.save_dir,
                                            Savepoints.SAVEPOINTS_FILE)
        self.model_dir = os.path.join(self.save_dir, Savepoints.MODEL_DIR)
        self.versions = check.check_instance(versions, dict)
        self.latest = latest
        self.next_step = next_step
Example No. 8
def sentences(word_token_stream):
    sentence_builder = SentenceBuilder()

    for token in word_token_stream:
        check.check_instance(token, Token)

        # Since we are streaming words in, we can complete the sentence at any time.
        #                                  v
        if sentence_builder.process(token, can_complete=True):
            yield sentence_builder.build()
            sentence_builder = SentenceBuilder()

    if not sentence_builder.is_empty():
        yield sentence_builder.build()
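
`SentenceBuilder` and `Token` are not defined anywhere in this listing. The generator pattern itself is easy to isolate; below is a self-contained sketch with a stand-in builder that completes a sentence at terminal punctuation (an illustration only, not the real classes):

class SimpleSentenceBuilder:
    TERMINALS = {".", "!", "?"}

    def __init__(self):
        self.words = []

    def process(self, word, can_complete=True):
        # Return True once the sentence may be emitted.
        self.words.append(word)
        return can_complete and word in self.TERMINALS

    def is_empty(self):
        return len(self.words) == 0

    def build(self):
        return " ".join(self.words)


def simple_sentences(word_stream):
    builder = SimpleSentenceBuilder()

    for word in word_stream:
        if builder.process(word, can_complete=True):
            yield builder.build()
            builder = SimpleSentenceBuilder()

    if not builder.is_empty():
        yield builder.build()


# list(simple_sentences(["the", "cat", "sat", ".", "then", "left"]))
# -> ["the cat sat .", "then left"]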
Example No. 9
    def __init__(self, model, save_dir):
        self.model = check.check_instance(model, Model)
        self.savepoints = Savepoints.load(save_dir)

        if self.savepoints is None:
            self.savepoints = Savepoints(save_dir)
            os.makedirs(save_dir, exist_ok=True)
Example No. 10
    def __init__(self, scope, output_labels, output_distribution):
        super(CustomOutput, self).__init__(scope)
        self.output_labels = check.check_instance(output_labels, mlbase.Labels)
        self.output_distribution = check.check_pdist(output_distribution)
        assert len(self.output_labels) == len(self.output_distribution), \
            "%d != %d" % (len(self.output_labels), len(self.output_distribution))
Example No. 11
    def __init__(self, labels, array):
        self.labels = check.check_instance(labels, Labels)
        self.array = check.check_length(array, len(self.labels))
        self._prediction = None
        self._prediction_probability = None
        self._distribution = None
        self._ranked_items = None
Example No. 12
    def __init__(self, values, unknown=None):
        check.check_instance(values, set)
        self.unknown = unknown
        self._empty = None
        self._encoding = {}
        self._decoding = {}

        if unknown is not None:
            self._encoding[unknown] = 0
            self._decoding[0] = self.unknown

        i = len(self._encoding)

        for value in values:
            self._encoding[check.check_not_none(value)] = i
            self._decoding[i] = value
            i += 1
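
The constructor above builds a pair of dictionaries mapping values to dense indices and back, with an optional unknown value pinned to index 0. The same construction, reproduced standalone with the values sorted for a stable ordering (a sketch of the technique, not the original class):

def build_codec(values, unknown=None):
    encoding = {}
    decoding = {}

    if unknown is not None:
        encoding[unknown] = 0
        decoding[0] = unknown

    # Continue numbering after the optional unknown slot.
    for i, value in enumerate(sorted(values), start=len(encoding)):
        encoding[value] = i
        decoding[i] = value

    return encoding, decoding


encoding, decoding = build_codec({"cat", "dog", "bird"}, unknown="<unk>")
assert encoding["<unk>"] == 0
assert decoding[encoding["dog"]] == "dog"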
Example No. 13
def _dump_stream(data, dir_path, converter):
    check.check_instance(data, queue.Queue)
    batch = []
    batch_size = None
    i = 0
    try_size = 10

    while True:
        item = data.get()

        if item is not None:
            batch += [item]

            # If we're still building out the sample.
            if batch_size is None:
                # Only try to discover the batch_size every so often.
                if len(batch) % try_size == 0:
                    average = _average_size(batch, converter)
                    sample_size = average * len(batch)

                    if sample_size > STREAM_TARGET_FILE_SIZE:
                        batch_size = max(1, int(STREAM_TARGET_FILE_SIZE / average))

                if len(batch) > 2 * try_size:
                    # Notice we don't need to worry about this growing too large, because the next check upper bounds the batch size.
                    try_size = try_size * 2

                if batch_size is None and len(batch) == STREAM_MAX_BATCH:
                    # The batch is plenty large enough - just set it here.
                    batch_size = STREAM_MAX_BATCH
            else:
                # The batch_size has been determined.
                while len(batch) > batch_size:
                    bytes_out = pickle.dumps(_convert(converter, batch[:batch_size]))
                    _write_bytes(bytes_out, dir_path, i)
                    i += 1
                    batch = batch[batch_size:]
        else:
            # The data stream is complete - flush the remaining data.
            if len(batch) > 0:
                bytes_out = pickle.dumps(_convert(converter, batch))
                _write_bytes(bytes_out, dir_path, i)

            logging.debug("Completed pickling stream for '%s'." % dir_path)
            break
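
The batch sizing above waits until the running average item size suggests the sample would exceed a target file size, then fixes the batch size so each pickled file lands near that target. The arithmetic in isolation, with hypothetical constants standing in for STREAM_TARGET_FILE_SIZE and STREAM_MAX_BATCH:

TARGET_FILE_SIZE = 1024 * 1024   # stand-in for STREAM_TARGET_FILE_SIZE
MAX_BATCH = 4096                 # stand-in for STREAM_MAX_BATCH


def discover_batch_size(average_item_size, sampled_count):
    # Once the sample would outgrow the target file, pick the batch size so
    # that batch_size * average_item_size is roughly the target.
    if average_item_size * sampled_count > TARGET_FILE_SIZE:
        return max(1, int(TARGET_FILE_SIZE / average_item_size))

    # Otherwise cap the sample so it cannot grow without bound.
    if sampled_count >= MAX_BATCH:
        return MAX_BATCH

    return None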
Example No. 14
def extract_terms(corpus,
                  terms_trie,
                  lemmatizer=lambda x: x,
                  inflection_recorder=lambda x, y: 0):
    check.check_instance(terms_trie, Node)
    tags = [tag for word, tag in nltk.pos_tag(corpus)]
    assert len(tags) == len(corpus)
    extracted_terms = set()
    i = 0

    while i < len(corpus):
        span = 1
        node = terms_trie
        lemma = lemmatizer(corpus[i])
        tag = tags[i]
        sequence = None
        matched_term = None

        while lemma in node.children:
            if tag in TAGS and TAGS[tag]:
                node = node.children[lemma]

                if node.final:
                    sequence = corpus[i:i + span]
                    matched_term = node.term

                if i + span >= len(corpus):
                    break

                lemma = lemmatizer(corpus[i + span])
                tag = tags[i + span]
                span += 1
            else:
                break

        if sequence is not None:
            inflection_term = Term(sequence)
            extracted_terms.add(matched_term)
            inflection_recorder(matched_term, inflection_term)
            i += len(sequence)
        else:
            i += 1

    return extracted_terms
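
The extraction walks a trie keyed by lemmas, where `final` nodes carry the matched term. The `Node` class is not shown; a minimal sketch of the structure the loop appears to assume, together with a helper to populate it (both are assumptions for illustration):

class TrieNode:
    def __init__(self):
        self.children = {}   # lemma -> TrieNode
        self.final = False   # a term ends at this node
        self.term = None     # the term recorded when final


def add_term(root, lemmas, term):
    # Insert a multi-word term, one lemma per trie level.
    node = root

    for lemma in lemmas:
        node = node.children.setdefault(lemma, TrieNode())

    node.final = True
    node.term = term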
Example No. 15
    def record(self, lemma_term, inflection_term, number=1):
        check.check_none(self.lemma_to_inflection)
        logging.debug("record: %s->%s" % (lemma_term, inflection_term))
        check.check_instance(lemma_term, Term)
        check.check_instance(inflection_term, Term)

        if lemma_term not in self.counts:
            self.counts[lemma_term] = {}

        count = self.counts[lemma_term].get(inflection_term, 0)
        self.counts[lemma_term][inflection_term] = count + number

        if inflection_term not in self.inflections:
            self.inflections[inflection_term] = lemma_term
        else:
            if self.inflections[inflection_term] != lemma_term:
                raise ValueError(
                    "Inflection '%s' maps to multiple lemmas: [%s, %s]." %
                    (inflection_term, self.inflections[inflection_term],
                     lemma_term))
Example No. 16
    def __init__(self,
                 name,
                 name_no_t,
                 vector,
                 min_max=(None, None),
                 colour=None,
                 predictions=None,
                 positioning=None):
        self.name = name
        self.name_no_t = name_no_t
        self.vector = [float(value) for value in vector]
        self.minimum, self.maximum = canonicalize_bounds(min_max, self.vector)
        self.colour = colour
        self.predictions = None if predictions is None else check.check_instance(
            predictions, LabelDistribution)
        self.positioning = positioning
Example No. 17
    def train(self, model_persistence, dataset, debug=False):
        check.check_instance(model_persistence, api.model.ModelPersistence)
        check.check_instance(model_persistence.model,
                             api.model.IterativelyOptimized)
        check.check_instance(dataset, api.data.Dataset)
        train_account = TrainAccount(self.schedule.window_size)
        score = model_persistence.model.score(dataset.validate)
        train_account.baseline(score)
        logging.debug("Baseline validate score: %.4f" % (score))
        model_persistence.save(train_account.version,
                               {"score_validate": score})
        training_parameters = self.parameters

        if debug:
            logging.debug("Training under: %s." % training_parameters)

        while True:
            finished, reason = self.schedule.is_finished(train_account)

            if finished:
                assert reason is not None, "when the schedule is finished it must provide a reason"
                logging.debug("Finished training: %s" % reason)
                break

            round_losses = self._optimization_round(model_persistence.model,
                                                    dataset.train,
                                                    training_parameters, debug)
            score = model_persistence.model.score(dataset.validate)
            progress_marker = self.schedule.evaluate_progress(
                round_losses, train_account.best_score, score)

            if progress_marker.improved:
                logging.debug("Progress improved        - proceeding.  Validate scores: previous=%.4f, current=%.4f." % \
                    (train_account.best_score, score))
                train_account.record_round(round_losses, score,
                                           progress_marker)
                model_persistence.save(train_account.version,
                                       {"score_validate": score})
            else:
                logging.debug("Progress did not improve -   decaying.  Validate scores: previous=%.4f, current=%.4f." % \
                    (train_account.best_score, score))
                train_account.record_decay(training_parameters.learning_rate)
                model_persistence.load(train_account.version)
                training_parameters = self.schedule.decay(
                    train_account, training_parameters)

                if debug:
                    logging.debug("Training under: %s." % training_parameters)

        score_train = model_persistence.model.score(dataset.train)
        score_test = model_persistence.model.score(dataset.test)
        logging.debug("Final train / test scores: %.4f / %.4f" %
                      (score_train, score_test))
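
The loop above saves on improvement and, on regression, reloads the best version and decays the learning rate through the schedule. A stripped-down sketch of that save/restore-with-decay pattern, assuming higher validation scores are better and using plain callables instead of the model, persistence, and schedule objects:

def train_with_decay(step_fn, score_fn, save_fn, load_fn,
                     learning_rate, decay=0.5, max_decays=5):
    # step_fn(lr) runs one optimization round; score_fn() evaluates validation;
    # save_fn()/load_fn() persist and restore the best known parameters.
    best_score = score_fn()
    save_fn()
    decays = 0

    while decays < max_decays:
        step_fn(learning_rate)
        score = score_fn()

        if score > best_score:
            best_score = score
            save_fn()
        else:
            # Roll back to the best version and try again with a smaller rate.
            load_fn()
            learning_rate *= decay
            decays += 1

    return best_score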
Example No. 18
    def _optimization_round(self, model, trainstream, training_parameters,
                            debug):
        check.check_instance(model, api.model.IterativelyOptimized)
        check.check_instance(trainstream, api.data.Datastream)
        check.check_instance(training_parameters, api.train.TrainingParameters)
        model_parameters = model.extract_parameters(training_parameters)
        randomized_trainstream = trainstream.as_randomized(
            training_parameters.batch_size * 4)
        slot_length = util.order_of_magnitude(training_parameters.epoch_size)
        epoch_template = "Epoch {:%dd} loss: {:.6f}" % slot_length
        epoch = -1
        losses = []

        while epoch + 1 < training_parameters.epoch_size:
            epoch += 1
            epoch_loss = model.step_optimize(model_parameters,
                                             randomized_trainstream,
                                             training_parameters.batch_size)
            losses += [epoch_loss]

            if debug:
                logging.debug(epoch_template.format(epoch, epoch_loss))

        return losses
Example No. 19
    def train(self, xy_sequences, training_parameters):
        check.check_instance(training_parameters, mlbase.TrainingParameters)

        if id(xy_sequences) != self._training_id:
            self._training_id = id(xy_sequences)
            # Sort the training sequences by their length to minimize padding (each batch will consist of roughly equal lengthed sequences).
            self.training_xys = sorted(xy_sequences, key=lambda xy: len(xy.x))

        slot_length = len(str(training_parameters.epochs())) - 1
        case_slot_length = len(str(len(xy_sequences)))
        epoch_template = "Epoch training {:%dd} (loss, perplexity): {:.6f}, {:.6f}" % slot_length + (" (score {:.6f})" if training_parameters.score() else "")
        epochs_tenth = max(1, int(training_parameters.epochs() / 10))
        losses = training_parameters.losses()
        finished = False
        epoch = -1

        while not finished:
            epoch += 1
            epoch_loss = 0
            # Start at a different offset for every epoch to help avoid overfitting.
            offset = random.randint(0, min(training_parameters.batch(), len(self.training_xys)) - 1)
            count = 0
            first = True

            while offset < len(self.training_xys):
                if first:
                    first = False
                    batch = self.training_xys[0:offset]
                else:
                    batch = self.training_xys[offset:offset + training_parameters.batch()]
                    offset += training_parameters.batch()

                # To account for when offset is randomly assigned 0
                if len(batch) > 0:
                    count += len(batch)
                    feed = self.get_training_feed(batch, training_parameters)
                    _, loss = self.session.run([self.updates, self.cost], feed_dict=feed)
                    epoch_loss += loss

            assert count == len(xy_sequences), "%d != %d" % (count, len(xy_sequences))
            epoch_loss /= count
            epoch_perplexity = math.exp(epoch_loss)
            losses.append(epoch_loss)
            finished, reason = training_parameters.finished(epoch, losses)

            if not finished and epoch % epochs_tenth == 0 and training_parameters.debug():
                if training_parameters.score():
                    score = 0.0
                    offset = 0

                    while offset < len(xy_sequences):
                        batch = xy_sequences[offset:offset + 32]
                        offset += 32
                        feed = self.get_testing_feed(batch)
                        time_distributions = self.session.run(self.output_distributions, feed_dict=feed)
                        score += self.score(batch, feed, time_distributions, False, case_slot_length)

                    logging.debug(epoch_template.format(epoch, epoch_loss, epoch_perplexity, score / len(xy_sequences)))
                else:
                    logging.debug(epoch_template.format(epoch, epoch_loss, epoch_perplexity))

        if training_parameters.score():
            score = 0.0
            offset = 0

            while offset < len(xy_sequences):
                batch = xy_sequences[offset:offset + 32]
                offset += 32
                feed = self.get_testing_feed(batch)
                time_distributions = self.session.run(self.output_distributions, feed_dict=feed)
                score += self.score(batch, feed, time_distributions, False, case_slot_length)

            logging.debug(epoch_template.format(epoch, epoch_loss, epoch_perplexity, score / len(xy_sequences)))
        else:
            logging.debug(epoch_template.format(epoch, epoch_loss, epoch_perplexity))

        #logging.debug("Training finished due to %s (%s)." % (reason, losses))
        return epoch_loss, -epoch_perplexity
Example No. 20
    def __init__(self, scope, hyper_parameters, extra, input_field,
                 output_labels, case_field):
        super(SeparateFfnn, self).__init__(scope)
        self.hyper_parameters = check.check_instance(hyper_parameters,
                                                     HyperParameters)
        self.extra = extra
        self.input_field = check.check_instance(input_field, mlbase.Field)
        self.output_labels = check.check_instance(output_labels, mlbase.Labels)
        self.case_field = check.check_instance(case_field, mlbase.Labels)

        batch_size_dimension = None

        # Notation:
        #   _p      placeholder
        #   _c      constant

        # Base variable setup
        self.input_p = self.placeholder(
            "input_p",
            [batch_size_dimension, len(self.input_field)])
        self.input_cases_p = self.placeholder("input_cases_p",
                                              [batch_size_dimension], tf.int32)
        self.output_p = self.placeholder("output_p", [batch_size_dimension],
                                         tf.int32)
        self.learning_rate_p = self.placeholder("learning_rate_p", [1],
                                                tf.float32)
        self.clip_norm_p = self.placeholder("clip_norm_p", [1], tf.float32)
        self.dropout_keep_p = self.placeholder("dropout_keep_p", [1],
                                               tf.float32)

        self.batch_size, _ = tf.unstack(tf.shape(self.input_p))

        if self.hyper_parameters.layers > 0:
            self.E = self.variable("E", [
                len(self.case_field),
                len(self.input_field), self.hyper_parameters.width
            ])
            self.E_bias = self.variable(
                "E_bias",
                [len(self.case_field), 1, self.hyper_parameters.width], 0.)

            self.Y = self.variable("Y", [
                len(self.case_field), self.hyper_parameters.width,
                len(self.output_labels)
            ])
            self.Y_bias = self.variable(
                "Y_bias", [len(self.case_field), 1,
                           len(self.output_labels)], 0.)

            # The E layer is the first layer.
            if self.hyper_parameters.layers > 1:
                self.H = self.variable("H", [
                    len(self.case_field), self.hyper_parameters.layers - 1,
                    self.hyper_parameters.width, self.hyper_parameters.width
                ])
                self.H_bias = self.variable("H_bias", [
                    len(self.case_field), self.hyper_parameters.layers - 1, 1,
                    self.hyper_parameters.width
                ], 0.)

            # Computational graph encoding
            cased_E = tf.nn.embedding_lookup(self.E, self.input_cases_p)
            mlbase.assert_shape(cased_E, [
                batch_size_dimension,
                len(self.input_field), self.hyper_parameters.width
            ])
            cased_E_bias = tf.nn.embedding_lookup(self.E_bias,
                                                  self.input_cases_p)
            mlbase.assert_shape(
                cased_E_bias,
                [batch_size_dimension, 1, self.hyper_parameters.width])

            self.embedded_input = tf.tanh(
                tf.matmul(tf.expand_dims(self.input_p, axis=1), cased_E) +
                cased_E_bias)
            mlbase.assert_shape(
                self.embedded_input,
                [batch_size_dimension, 1, self.hyper_parameters.width])
            hidden = self.embedded_input
            mlbase.assert_shape(
                hidden, [batch_size_dimension, 1, self.hyper_parameters.width])

            for l in range(self.hyper_parameters.layers - 1):
                cased_H = tf.nn.embedding_lookup(self.H, self.input_cases_p)
                mlbase.assert_shape(cased_H, [
                    batch_size_dimension, self.hyper_parameters.layers - 1,
                    self.hyper_parameters.width, self.hyper_parameters.width
                ])
                cased_H_bias = tf.nn.embedding_lookup(self.H_bias,
                                                      self.input_cases_p)
                mlbase.assert_shape(cased_H_bias, [
                    batch_size_dimension, self.hyper_parameters.layers - 1, 1,
                    self.hyper_parameters.width
                ])

                hidden = tf.tanh(
                    tf.matmul(self.dropout(hidden), cased_H[:, l]) +
                    cased_H_bias[:, l])
                mlbase.assert_shape(
                    hidden,
                    [batch_size_dimension, 1, self.hyper_parameters.width])

            mlbase.assert_shape(
                hidden, [batch_size_dimension, 1, self.hyper_parameters.width])

            cased_Y = tf.nn.embedding_lookup(self.Y, self.input_cases_p)
            mlbase.assert_shape(cased_Y, [
                batch_size_dimension, self.hyper_parameters.width,
                len(self.output_labels)
            ])
            cased_Y_bias = tf.nn.embedding_lookup(self.Y_bias,
                                                  self.input_cases_p)
            mlbase.assert_shape(
                cased_Y_bias,
                [batch_size_dimension, 1,
                 len(self.output_labels)])
        else:
            self.Y = self.variable("Y", [
                len(self.case_field),
                len(self.input_field),
                len(self.output_labels)
            ])
            self.Y_bias = self.variable(
                "Y_bias", [len(self.case_field), 1,
                           len(self.output_labels)], 0.)

            # Computational graph encoding
            hidden = tf.expand_dims(self.input_p, axis=1)
            mlbase.assert_shape(
                hidden, [batch_size_dimension, 1,
                         len(self.input_field)])

            cased_Y = tf.nn.embedding_lookup(self.Y, self.input_cases_p)
            mlbase.assert_shape(cased_Y, [
                batch_size_dimension,
                len(self.input_field),
                len(self.output_labels)
            ])
            cased_Y_bias = tf.nn.embedding_lookup(self.Y_bias,
                                                  self.input_cases_p)
            mlbase.assert_shape(
                cased_Y_bias,
                [batch_size_dimension, 1,
                 len(self.output_labels)])

        cased_logit = tf.matmul(self.dropout(hidden), cased_Y) + cased_Y_bias
        mlbase.assert_shape(cased_logit,
                            [batch_size_dimension, 1,
                             len(self.output_labels)])

        self.output_logit = tf.reshape(cased_logit,
                                       [-1, len(self.output_labels)])
        mlbase.assert_shape(self.output_logit,
                            [batch_size_dimension,
                             len(self.output_labels)])

        self.output_distributions = tf.nn.softmax(self.output_logit)
        mlbase.assert_shape(self.output_distributions,
                            [batch_size_dimension,
                             len(self.output_labels)])
        #self.cost = tf.reduce_sum(tf.nn.nce_loss(
        #    weights=tf.transpose(self.Y),
        #    biases=self.Y_bias,
        #    labels=self.output_p,
        #    inputs=hidden,
        #    num_sampled=1,
        #    num_classes=len(self.output_labels)))
        loss_fn = tf.nn.sparse_softmax_cross_entropy_with_logits
        self.cost = tf.reduce_sum(
            loss_fn(labels=tf.stop_gradient(self.output_p),
                    logits=self.output_logit))
        #self.updates = tf.train.AdamOptimizer().minimize(self.cost)

        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate_p[0])
        gradients = optimizer.compute_gradients(self.cost)
        gradients_clipped = [(tf.clip_by_norm(g, self.clip_norm_p[0]), var)
                             for g, var in gradients if g is not None]
        self.updates = optimizer.apply_gradients(gradients_clipped)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())
Example No. 21
    def __init__(self, labels):
        super(MergeLabels, self).__init__(labels)
        self.labels = check.check_instance(labels, Labels)
Example No. 22
    def __init__(self, train, validate, test):
        self.train = check.check_instance(train, Datastream)
        self.validate = check.check_instance(validate, Datastream)
        self.test = check.check_instance(test, Datastream)
Example No. 23
    def find_matches(self, tolerance, first_only, predicates):
        check.check_instance(predicates, Predicates)
        # predicates: list of dicts, keyed by rnn part keys to lists of (axis, value) features
        #             [ {(cell, 0): [ (0, 0.5), (22, -0.02), ... ] }, ... ]
        matched_activations = None
        matched_sequences = None

        for level, predicate in predicates.levels():
            matches = None

            # Hit the _candidates query in order, to leverage the cached hit as much as possible.
            for key, features in sorted(predicate.items()):
                found_sequences = set()
                found_indices = {}
                first_feature = next(iter(features.items()))

                for sequence, index, *point in self._candidates(
                        key, first_feature, tolerance, matched_sequences):
                    point = tuple(point)
                    candidate_point = []
                    target_point = []
                    operator_point = []

                    for axis, target_operator in features.items():
                        target, operator = target_operator
                        candidate_point += [point[axis]]
                        target_point += [target]
                        operator_point += [operator]

                    within = self._within(candidate_point, target_point,
                                          operator_point, tolerance)

                    if within:
                        found_sequences.add(sequence)

                        if sequence not in found_indices:
                            found_indices[sequence] = set()

                        found_indices[sequence].add(index)

                if matched_sequences is None:
                    matched_sequences = found_sequences
                    logging.debug("initially matched sequences: %d" %
                                  len(matched_sequences))
                else:
                    matched_sequences.intersection_update(found_sequences)
                    logging.debug("subsequently matched sequences: %d" %
                                  len(matched_sequences))

                if matches is None:
                    matches = found_indices
                else:
                    next_matches = {}

                    for sequence in matches.keys():
                        if sequence in found_indices:
                            next_matches[sequence] = matches[
                                sequence].intersection(found_indices[sequence])

                    matches = next_matches

            if matched_activations is None:
                matched_activations = {}

                for sequence, indices in matches.items():
                    matched_activations[sequence] = [indices]
            else:
                removes = set()

                for sequence in matched_activations.keys():
                    if sequence in matches:
                        matched_activations[sequence] += [matches[sequence]]
                    else:
                        removes.add(sequence)

                for remove in removes:
                    del matched_activations[remove]

                    if remove in matched_sequences:
                        matched_sequences.remove(remove)

        matches = []

        for sequence, requirements in matched_activations.items():
            #logging.debug("searching for paths through: %s\n  %s" % (" ".join(sequence), requirements))
            paths = monotonic_paths(requirements, len(sequence), first_only)
            #logging.debug("found %d paths" % (len(paths)))

            for path in paths:
                matches += [(sequence, path)]

        return matches
Example No. 24
    def extract_parameters(self, training_parameters):
        check.check_instance(training_parameters, api.train.TrainingParameters)
        raise NotImplementedError()
Example No. 25
    def __init__(self, parameters, schedule):
        self.parameters = check.check_instance(parameters, TrainingParameters)
        self.schedule = check.check_instance(schedule, TrainingSchedule)
Example No. 26
    def __init__(self, model_dir, step, version_key):
        self.model_dir = check.check_instance(model_dir, str)
        self.step = check.check_gte(check.check_instance(step, int), 0)
        self.version_key = check.check_instance(version_key, str)
Example No. 27
    def to_dominant_inflection(self, lemma_term):
        self._finalize()
        check.check_instance(lemma_term, Term)
        return self.lemma_to_inflection[lemma_term]
Example No. 28
    def add_descendant(self, descendant):
        check.check_equal(self.finalized, False)
        check.check_instance(descendant, Node)
        check.check_not_equal(self.identifier, descendant.identifier)
        self.descendants.add(descendant)