Example #1
def required_model(v: int, n: int, delta: float, train_file: str, test_file: str):
    """
    Run the required model with the given parameters
    :param v: Vocabulary choice
    :param n: ngram choice
    :param delta: Smoothing choice
    :param train_file: Path to training data
    :param test_file: Path to testing data
    :return: the results of running the model on the test data
    """
    validate_params(v, n, delta, train_file, test_file)
    vocab_size = get_vocab_size(v)
    ngrams = process_train_data(v, n, delta, vocab_size, train_file)

    test_data = pd.read_csv(test_file, delimiter='\t',
                            names=[DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET])
    transform_to_vocab(test_data, v)

    print("Running model against provided testing data.")
    results = get_test_results(test_data, ngrams, vocab_size, n)
    generate_trace_file(v, n, delta, results)

    print("Final results generated")
    print(results)

    print("Evaluating classifier with parameters: [vocabulary = {}, ngram size = {}, delta = {}]".format(v, n, delta))
    evaluate_results(results, v, n, delta)
    return results
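
A minimal sketch of how this entry point might be invoked; the argument values and file paths are placeholders, and the helpers it relies on (validate_params, process_train_data, get_test_results, evaluate_results) are assumed to be defined in the same module:

# Hypothetical call; vocabulary/ngram/delta values and file paths are illustrative only.
results = required_model(v=0, n=1, delta=0.5,
                         train_file='train.tsv',
                         test_file='test.tsv')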
Example #2
    def __init__(self, is_training=True):
        super(Bow, self).__init__()

        if not FLAGS.dynamic_batch_length and not FLAGS.exclude_zero_index and FLAGS.combiner != 'sum':
            raise ValueError(
                """dynamic_batch_length=False, exclude_zero_index=False,
                          must use sum combiner(predictor will assume this also), 
                          but input combiner is:""", FLAGS.combiner)

        emb_dim = FLAGS.emb_dim
        init_width = 0.5 / emb_dim
        vocab_size = vocabulary.get_vocab_size()
        print('bow vocab_size:', vocab_size)
        self.vocab_size = vocab_size
        # If the embedding is not pinned to the CPU, running on a GPU with Adagrad will fail.
        # Pinning is also safer: the embedding is large and might exceed GPU memory.
        with tf.device('/cpu:0'):
            self.emb = melt.variable.get_weights_uniform(
                'emb', [vocab_size, emb_dim], -init_width, init_width)

        if is_training:
            tf.histogram_summary('debug-emb_0', tf.gather(self.emb, 0))
            tf.histogram_summary('debug-emb_nv', tf.gather(self.emb, 7))
            ## It seems the GPU will not fail even if the index exceeds the bound.
            # tf.histogram_summary('debug-emb_1k', tf.gather(self.emb, 1000))
            # tf.histogram_summary('debug-emb_1w', tf.gather(self.emb, 10000))
            # tf.histogram_summary('debug-emb_10w', tf.gather(self.emb, 100000))
            tf.histogram_summary('debug-emb_middle',
                                 tf.gather(self.emb, vocab_size // 2))
            tf.histogram_summary('debug-emb_end',
                                 tf.gather(self.emb, vocab_size - 1))

        self.activation = melt.activations[FLAGS.activation]
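
For context, a bag-of-words encoder like the one above typically looks up the word embeddings and reduces them with the configured combiner. A minimal sketch of that forward step, assuming an int tensor text of word ids with shape [batch_size, max_words] and the sum combiner (not part of the original class):

# Hypothetical forward pass for the Bow encoder above.
word_vectors = tf.nn.embedding_lookup(self.emb, text)  # [batch_size, max_words, emb_dim]
text_vector = tf.reduce_sum(word_vectors, 1)           # the 'sum' combiner over the word axis
text_vector = self.activation(text_vector)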
Example #3
    def __init__(self):
        super(ShowAndTell, self).__init__()
        self.sess = tf.InteractiveSession()

        vocab_size = vocabulary.get_vocab_size()

        hidden_size = 256
        emb_dim = 256

        init_width = 0.5 / emb_dim
        with tf.device('/cpu:0'):
            self.emb = tf.Variable(tf.random_uniform([vocab_size, emb_dim],
                                                     -init_width, init_width),
                                   name="emb")
            self.bemb = melt.init_bias([emb_dim], name='bemb')

        self.encode_img_W = tf.Variable(tf.random_uniform(
            [IMAGE_FEATURE_LEN, hidden_size], -0.1, 0.1),
                                        name='encode_img_W')
        self.encode_img_b = melt.init_bias([hidden_size], name='encode_img_b')

        with tf.device('/cpu:0'):
            self.embed_word_W = tf.Variable(tf.random_uniform(
                [emb_dim, vocab_size], -0.1, 0.1),
                                            name='embed_word_W')
            self.embed_word_b = melt.init_bias([vocab_size],
                                               name='embed_word_b')

        self.lstm = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)

        self.n_lstm_steps = TEXT_MAX_WORDS + 2

        self.activation = tf.nn.relu
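
These variables follow the Show and Tell pattern: the image feature is projected into the LSTM input space for the first step, and word ids are embedded for the later steps. A rough sketch under those assumptions, where image_feature and word_ids are hypothetical tensors not part of the original class:

image_emb = tf.matmul(image_feature, self.encode_img_W) + self.encode_img_b  # LSTM input at step 0
word_emb = tf.nn.embedding_lookup(self.emb, word_ids) + self.bemb            # LSTM input at word steps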
Example #4
    def __init__(self):
        super(ShowAndTell, self).__init__()
        self.sess = tf.InteractiveSession()

        vocab_size = vocabulary.get_vocab_size()

        self.emb_dim = emb_dim = hidden_size = 256

        #init_width = 0.5 / emb_dim
        init_width = 0.1
        with tf.device('/cpu:0'):
            self.emb = tf.Variable(tf.random_uniform([vocab_size, emb_dim],
                                                     -init_width, init_width),
                                   name="emb")
            self.bemb = melt.init_bias([emb_dim], name='bemb')

        self.encode_img_W = tf.Variable(tf.random_uniform(
            [IMAGE_FEATURE_LEN, hidden_size], -0.1, 0.1),
                                        name='encode_img_W')
        self.encode_img_b = melt.init_bias([hidden_size], name='encode_img_b')

        with tf.device('/cpu:0'):
            self.embed_word_W = tf.Variable(tf.random_uniform(
                [emb_dim, vocab_size], -0.1, 0.1),
                                            name='embed_word_W')
            self.embed_word_b = melt.init_bias([vocab_size],
                                               name='embed_word_b')

        self.cell = cell(hidden_size, state_is_tuple=True)
        #------GRUCell has no state_is_tuple argument
        #self.cell = cell(hidden_size)

        self.activation = tf.nn.relu

        num_samples = FLAGS.num_samples

        #@TODO move to melt  def prepare_sampled_softmax_loss(num_samples, vocab_size, hidden_size)
        #return output_projection, softmax_loss_function
        #also consider candidate sampler
        self.softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < vocab_size:

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        tf.transpose(self.embed_word_W), self.embed_word_b,
                        inputs, labels, num_samples, vocab_size)

            self.softmax_loss_function = sampled_loss
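
The resulting softmax_loss_function is typically handed to a sequence-loss helper, which calls it with the pre-projection LSTM outputs as inputs. A hedged sketch in the style of the TF 0.x seq2seq helpers, where outputs, targets, and weights are hypothetical per-step lists of tensors:

losses = tf.nn.seq2seq.sequence_loss_by_example(
    outputs, targets, weights,
    softmax_loss_function=self.softmax_loss_function)
loss = tf.reduce_mean(losses)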
Example #5
    def __init__(self):
        super(Model, self).__init__()
        emb_dim = FLAGS.emb_dim
        init_width = 0.5 / emb_dim
        vocab_size = vocabulary.get_vocab_size()
        self.vocab_size = vocab_size
        # If the embedding is not pinned to the CPU, running on a GPU with Adagrad will fail.
        # Pinning is also safer: the embedding is large and might exceed GPU memory.
        with tf.device('/cpu:0'):
            self.emb = init_weights_uniform([vocab_size, emb_dim],
                                            -init_width,
                                            init_width,
                                            name='emb')
        self.activation = melt.activations[FLAGS.activation]
Example #6
    def __init__(self, is_training=True):
        super(ShowAndTell, self).__init__()

        self.is_training = is_training

        if is_training:
            print('num_sampled:', FLAGS.num_sampled)
            print('use_neg:', FLAGS.use_neg)
            print('per_example_loss:', FLAGS.per_example_loss)

        vocab_size = vocabulary.get_vocab_size()
        self.vocab_counts_list = [
            vocabulary.vocab.freq(i) for i in xrange(vocab_size)
        ]
        self.vocab_counts_list.append(1)

        #vocab_size + 1: add one slot to store the end id
        vocab_size += 1
        self.vocab_size = vocab_size
        self.end_id = vocab_size - 1

        #self.emb_dim = emb_dim = hidden_size = 256
        #@TODO the default hidden_size flag in bow.py (1024) seems much better than 256 (but dropout was also added)
        self.emb_dim = emb_dim = hidden_size = FLAGS.hidden_size

        init_width = 0.5 / emb_dim
        #init_width = 0.1
        with tf.device('/cpu:0'):
            self.emb = melt.variable.get_weights_uniform(
                'emb', [vocab_size, emb_dim], -init_width, init_width)
            self.bemb = melt.variable.get_bias('bemb', [emb_dim])

            self.embed_word_W = melt.variable.get_weights_uniform(
                'embed_word_W', [emb_dim, vocab_size], -0.1, 0.1)
            self.embed_word_b = melt.variable.get_bias('embed_word_b',
                                                       [vocab_size])

        self.encode_img_W = melt.variable.get_weights_uniform(
            'encode_img_W', [IMAGE_FEATURE_LEN, hidden_size], -0.1, 0.1)
        self.encode_img_b = melt.variable.get_bias('encode_img_b',
                                                   [hidden_size])

        self.cell = cell(hidden_size, state_is_tuple=True)
        if is_training and FLAGS.keep_prob < 1:
            self.cell = tf.nn.rnn_cell.DropoutWrapper(
                self.cell, output_keep_prob=FLAGS.keep_prob)
        self.cell = tf.nn.rnn_cell.MultiRNNCell([self.cell] * FLAGS.num_layers,
                                                state_is_tuple=True)
        #------GRUCell has no state_is_tuple argument
        #self.cell = cell(hidden_size)

        num_sampled = FLAGS.num_sampled

        #@TODO move to melt  def prepare_sampled_softmax_loss(num_sampled, vocab_size, hidden_size)
        #return output_projection, softmax_loss_function
        #also consider candidate sampler
        self.softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_sampled > 0 and num_sampled < vocab_size:

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])

                    sampled_values = tf.nn.fixed_unigram_candidate_sampler(
                        true_classes=labels,
                        num_true=1,
                        num_sampled=num_sampled,
                        unique=True,
                        range_max=vocab_size,
                        distortion=0.75,
                        unigrams=self.vocab_counts_list)

                    return tf.nn.sampled_softmax_loss(
                        tf.transpose(self.embed_word_W),
                        self.embed_word_b,
                        inputs,
                        labels,
                        num_sampled,
                        vocab_size,
                        sampled_values=sampled_values)

            self.softmax_loss_function = sampled_loss
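
Since sampled softmax is only a training-time approximation, inference would typically fall back to the full projection defined above. A minimal sketch, assuming output is an LSTM output of shape [batch_size, hidden_size]:

logits = tf.matmul(output, self.embed_word_W) + self.embed_word_b
predicted_ids = tf.argmax(logits, 1)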