def required_model(v: int, n: int, delta: float, train_file: str, test_file: str):
    """
    Run the required model with the given parameters
    :param v: Vocabulary choice
    :param n: ngram choice
    :param delta: Smoothing choice
    :param train_file: Path to training data
    :param test_file: Path to testing data
    :return: DataFrame of per-tweet classification results
    """
    validate_params(v, n, delta, train_file, test_file)
    vocab_size = get_vocab_size(v)
    ngrams = process_train_data(v, n, delta, vocab_size, train_file)
    test_data = pd.read_csv(test_file, delimiter='\t',
                            names=[DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET])
    transform_to_vocab(test_data, v)
    print("Running model against provided testing data.")
    results = get_test_results(test_data, ngrams, vocab_size, n)
    generate_trace_file(v, n, delta, results)
    print("Final results generated")
    print(results)
    print("Evaluating classifier with parameters: [vocabulary = {}, ngram size = {}, delta = {}]".format(v, n, delta))
    evaluate_results(results, v, n, delta)
    return results
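# A minimal driver sketch for required_model. The vocabulary/ngram/delta values
# and the TSV paths below are hypothetical examples, not from the original code.
if __name__ == '__main__':
    results = required_model(v=1, n=2, delta=0.5,
                             train_file='data/training-tweets.txt',
                             test_file='data/test-tweets.txt')
    print(results.head())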
def __init__(self, is_training=True):
    super(Bow, self).__init__()
    if not FLAGS.dynamic_batch_length and not FLAGS.exclude_zero_index and FLAGS.combiner != 'sum':
        raise ValueError(
            """With dynamic_batch_length=False and exclude_zero_index=False the sum
            combiner must be used (the predictor will assume this as well), but the
            input combiner is:""", FLAGS.combiner)
    emb_dim = FLAGS.emb_dim
    init_width = 0.5 / emb_dim
    vocab_size = vocabulary.get_vocab_size()
    print('bow vocab_size:', vocab_size)
    self.vocab_size = vocab_size
    # Adagrad on GPU will fail if the embedding is not pinned to CPU.
    # This is also safer, since a large embedding might exceed GPU memory.
    with tf.device('/cpu:0'):
        self.emb = melt.variable.get_weights_uniform(
            'emb', [vocab_size, emb_dim], -init_width, init_width)
    if is_training:
        tf.histogram_summary('debug-emb_0', tf.gather(self.emb, 0))
        tf.histogram_summary('debug-emb_nv', tf.gather(self.emb, 7))
        # It seems the GPU will not fail even if the index exceeds the bound.
        #tf.histogram_summary('debug-emb_1k', tf.gather(self.emb, 1000))
        #tf.histogram_summary('debug-emb_1w', tf.gather(self.emb, 10000))
        #tf.histogram_summary('debug-emb_10w', tf.gather(self.emb, 100000))
        tf.histogram_summary('debug-emb_middle', tf.gather(self.emb, vocab_size // 2))
        tf.histogram_summary('debug-emb_end', tf.gather(self.emb, vocab_size - 1))
    self.activation = melt.activations[FLAGS.activation]
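# A sketch of how this embedding is typically consumed with the 'sum' combiner.
# The Bow class's actual forward method is not shown here; the zero-padded
# word_ids layout and this helper's name are illustrative assumptions.
def bow_forward_sketch(emb, word_ids, activation=tf.nn.relu):
    # word_ids: [batch_size, max_len], zero-padded. Index 0 still looks up a
    # learned row, which is why the sum combiner (or exclude_zero_index) matters.
    word_vecs = tf.nn.embedding_lookup(emb, word_ids)  # [batch, len, emb_dim]
    return activation(tf.reduce_sum(word_vecs, 1))     # the 'sum' combiner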
def __init__(self):
    super(ShowAndTell, self).__init__()
    self.sess = tf.InteractiveSession()

    vocab_size = vocabulary.get_vocab_size()
    hidden_size = 256
    emb_dim = 256
    init_width = 0.5 / emb_dim

    with tf.device('/cpu:0'):
        self.emb = tf.Variable(
            tf.random_uniform([vocab_size, emb_dim], -init_width, init_width),
            name='emb')
    self.bemb = melt.init_bias([emb_dim], name='bemb')

    self.encode_img_W = tf.Variable(
        tf.random_uniform([IMAGE_FEATURE_LEN, hidden_size], -0.1, 0.1),
        name='encode_img_W')
    self.encode_img_b = melt.init_bias([hidden_size], name='encode_img_b')

    with tf.device('/cpu:0'):
        self.embed_word_W = tf.Variable(
            tf.random_uniform([emb_dim, vocab_size], -0.1, 0.1),
            name='embed_word_W')
        self.embed_word_b = melt.init_bias([vocab_size], name='embed_word_b')

    self.lstm = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
    self.n_lstm_steps = TEXT_MAX_WORDS + 2

    self.activation = tf.nn.relu
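# A hedged sketch of the unroll this constructor prepares. The class's actual
# graph-building method is not shown; the step layout (image fed at step 0,
# caption words after) and this helper's name are assumptions based on the
# standard Show and Tell recipe.
def unroll_sketch(model, image_feature, caption_ids, batch_size):
    # Project the image feature into the LSTM input space.
    image_emb = tf.nn.xw_plus_b(image_feature, model.encode_img_W, model.encode_img_b)
    state = model.lstm.zero_state(batch_size, tf.float32)
    outputs = []
    with tf.variable_scope('RNN'):
        for i in range(model.n_lstm_steps):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            inputs = image_emb if i == 0 else \
                tf.nn.embedding_lookup(model.emb, caption_ids[:, i - 1]) + model.bemb
            output, state = model.lstm(inputs, state)
            # Score words by projecting the output back onto the vocabulary.
            outputs.append(tf.nn.xw_plus_b(output, model.embed_word_W, model.embed_word_b))
    return outputs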
def __init__(self):
    super(ShowAndTell, self).__init__()
    self.sess = tf.InteractiveSession()

    vocab_size = vocabulary.get_vocab_size()
    self.emb_dim = emb_dim = hidden_size = 256
    #init_width = 0.5 / emb_dim
    init_width = 0.1

    with tf.device('/cpu:0'):
        self.emb = tf.Variable(
            tf.random_uniform([vocab_size, emb_dim], -init_width, init_width),
            name='emb')
    self.bemb = melt.init_bias([emb_dim], name='bemb')

    self.encode_img_W = tf.Variable(
        tf.random_uniform([IMAGE_FEATURE_LEN, hidden_size], -0.1, 0.1),
        name='encode_img_W')
    self.encode_img_b = melt.init_bias([hidden_size], name='encode_img_b')

    with tf.device('/cpu:0'):
        self.embed_word_W = tf.Variable(
            tf.random_uniform([emb_dim, vocab_size], -0.1, 0.1),
            name='embed_word_W')
        self.embed_word_b = melt.init_bias([vocab_size], name='embed_word_b')

    self.cell = cell(hidden_size, state_is_tuple=True)
    #GRUCell has no state_is_tuple arg
    #self.cell = cell(hidden_size)

    self.activation = tf.nn.relu

    num_samples = FLAGS.num_samples
    #TODO: move to melt as prepare_sampled_softmax_loss(num_samples, vocab_size, hidden_size),
    #returning (output_projection, softmax_loss_function); also consider a candidate sampler.
    self.softmax_loss_function = None
    # Sampled softmax only makes sense if we sample fewer classes than the vocabulary size.
    if 0 < num_samples < vocab_size:
        def sampled_loss(inputs, labels):
            with tf.device('/cpu:0'):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(
                    tf.transpose(self.embed_word_W), self.embed_word_b,
                    inputs, labels, num_samples, vocab_size)
        self.softmax_loss_function = sampled_loss
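# The TODO above suggests factoring this into melt. A minimal sketch of such a
# helper, under the assumption that it only needs the projection variables and
# the sizes; the name comes from the TODO, the body is illustrative.
def prepare_sampled_softmax_loss(num_samples, vocab_size, embed_word_W, embed_word_b):
    if not 0 < num_samples < vocab_size:
        return None  # fall back to full softmax
    w_t = tf.transpose(embed_word_W)  # sampled_softmax_loss wants [vocab_size, dim]
    def sampled_loss(inputs, labels):
        with tf.device('/cpu:0'):
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(w_t, embed_word_b, inputs, labels,
                                              num_samples, vocab_size)
    return sampled_loss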
def __init__(self):
    super(Model, self).__init__()
    emb_dim = FLAGS.emb_dim
    init_width = 0.5 / emb_dim
    vocab_size = vocabulary.get_vocab_size()
    self.vocab_size = vocab_size
    # Adagrad on GPU will fail if the embedding is not pinned to CPU.
    # This is also safer, since a large embedding might exceed GPU memory.
    with tf.device('/cpu:0'):
        self.emb = init_weights_uniform([vocab_size, emb_dim],
                                        -init_width, init_width,
                                        name='emb')
    self.activation = melt.activations[FLAGS.activation]
def __init__(self, is_training=True):
    super(ShowAndTell, self).__init__()
    self.is_training = is_training
    if is_training:
        print('num_sampled:', FLAGS.num_sampled)
        print('use_neg:', FLAGS.use_neg)
        print('per_example_loss:', FLAGS.per_example_loss)

    vocab_size = vocabulary.get_vocab_size()
    self.vocab_counts_list = [vocabulary.vocab.freq(i) for i in xrange(vocab_size)]
    # Add one slot (vocab_size + 1) at the end of the vocabulary to store the end id.
    self.vocab_counts_list.append(1)
    vocab_size += 1
    self.vocab_size = vocab_size
    self.end_id = vocab_size - 1

    #self.emb_dim = emb_dim = hidden_size = 256
    #TODO: hidden_size now defaults from flags in bow.py; 1024 seems much better
    #than 256 (but that run also added dropout)
    self.emb_dim = emb_dim = hidden_size = FLAGS.hidden_size
    init_width = 0.5 / emb_dim
    #init_width = 0.1

    with tf.device('/cpu:0'):
        self.emb = melt.variable.get_weights_uniform(
            'emb', [vocab_size, emb_dim], -init_width, init_width)
    self.bemb = melt.variable.get_bias('bemb', [emb_dim])

    self.embed_word_W = melt.variable.get_weights_uniform(
        'embed_word_W', [emb_dim, vocab_size], -0.1, 0.1)
    self.embed_word_b = melt.variable.get_bias('embed_word_b', [vocab_size])

    self.encode_img_W = melt.variable.get_weights_uniform(
        'encode_img_W', [IMAGE_FEATURE_LEN, hidden_size], -0.1, 0.1)
    self.encode_img_b = melt.variable.get_bias('encode_img_b', [hidden_size])

    self.cell = cell(hidden_size, state_is_tuple=True)
    if is_training and FLAGS.keep_prob < 1:
        self.cell = tf.nn.rnn_cell.DropoutWrapper(
            self.cell, output_keep_prob=FLAGS.keep_prob)
    self.cell = tf.nn.rnn_cell.MultiRNNCell(
        [self.cell] * FLAGS.num_layers, state_is_tuple=True)
    #GRUCell has no state_is_tuple arg
    #self.cell = cell(hidden_size)

    num_sampled = FLAGS.num_sampled
    #TODO: move to melt as prepare_sampled_softmax_loss(num_sampled, vocab_size, hidden_size),
    #returning (output_projection, softmax_loss_function); also consider a candidate sampler.
    self.softmax_loss_function = None
    # Sampled softmax only makes sense if we sample fewer classes than the vocabulary size.
    if 0 < num_sampled < vocab_size:
        def sampled_loss(inputs, labels):
            with tf.device('/cpu:0'):
                labels = tf.reshape(labels, [-1, 1])
                # Sample negatives from the empirical unigram distribution,
                # flattened by distortion 0.75 (as in word2vec).
                sampled_values = tf.nn.fixed_unigram_candidate_sampler(
                    true_classes=labels,
                    num_true=1,
                    num_sampled=num_sampled,
                    unique=True,
                    range_max=vocab_size,
                    distortion=0.75,
                    unigrams=self.vocab_counts_list)
                return tf.nn.sampled_softmax_loss(
                    tf.transpose(self.embed_word_W), self.embed_word_b,
                    inputs, labels, num_sampled, vocab_size,
                    sampled_values=sampled_values)
        self.softmax_loss_function = sampled_loss
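# A hedged sketch of how softmax_loss_function is typically consumed downstream.
# The model's loss-building code is not shown; this follows the old tf.nn.seq2seq
# convention where the loss function maps (inputs, labels) to per-example losses,
# and the helper's name and signature are illustrative.
def sequence_loss_sketch(model, outputs, targets, weights):
    # outputs: list of [batch, emb_dim] LSTM outputs; targets/weights: lists of
    # [batch] int ids and float masks, one entry per time step.
    losses = []
    for output, target, weight in zip(outputs, targets, weights):
        if model.softmax_loss_function is not None:
            crossent = model.softmax_loss_function(output, target)
        else:
            logits = tf.nn.xw_plus_b(output, model.embed_word_W, model.embed_word_b)
            crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, target)
        losses.append(crossent * weight)
    # Per-example loss, averaged over real (unmasked) tokens.
    return tf.add_n(losses) / tf.maximum(tf.add_n(weights), 1e-12)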