def link(obj_code):
    # Build symbol/relocation tables: one shared symbol table, one
    # relocation table per object file
    symtbl = SymbolTable(False)
    reltbls = [SymbolTable(True) for _ in obj_code]
    build_tables(obj_code, symtbl, reltbls)
    # print(symtbl.to_string())

    # Find .text section of each input and emit relocated instructions
    byte_off = 0
    line_num = 0
    output = []
    errors = []
    index = 0
    for obj_file in obj_code:
        start, end = find_text_block(obj_file)
        for line in obj_file[start:end]:
            try:
                line_num += 1
                # Write instruction out, relocating symbol operands if needed
                instruction = int(line, 16)
                if inst_needs_relocation(instruction):
                    instruction = relocate_inst(instruction, byte_off, symtbl,
                                                reltbls[index])
                write_inst_hex(output, instruction)
            except AssemblerException as e:
                errors += [(line_num, e)]
            byte_off += 4
        index += 1
        byte_off = 0
    if len(errors) > 0:
        print("Errors during linking:")
        for line_num, e in errors:
            print("Error: line {0}: {1}".format(line_num, e))
    return output

def addScope():
    if not scope_stack:
        print("scope stack empty: global symbol table not initialised")
    else:
        curr_scope = scope_stack[-1]
        new_scope = SymbolTable(curr_scope)
        global scope_label
        scope_label += 1
        new_scope.label = scope_label
        scope_stack.append(new_scope)
        scope_list.append(new_scope)

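# ----------------------------------------------------------------------
# Illustration (an assumption, not code from the sources above): addScope()
# treats SymbolTable(parent) as a scoped table whose lookups fall back to
# the enclosing scope. A minimal chained-lookup sketch of that idea:
class ScopedTableSketch(object):
    def __init__(self, parent=None):
        self.parent = parent   # enclosing scope, or None for the global one
        self.symbols = {}
        self.label = 0

    def insert(self, name, info):
        self.symbols[name] = info

    def lookup(self, name):
        # Walk outward through enclosing scopes until the name is found
        if name in self.symbols:
            return self.symbols[name]
        return self.parent.lookup(name) if self.parent else None

glob_sketch = ScopedTableSketch()
glob_sketch.insert('x', 'int')
inner_sketch = ScopedTableSketch(glob_sketch)
inner_sketch.insert('y', 'bool')
print(inner_sketch.lookup('x'))  # 'int', found via the scope chain
print(inner_sketch.lookup('y'))  # 'bool', found locally
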
def _preprocess_data(self, sentence_data, init=True):
    # Initialize word table and populate with embedding words
    if init:
        self.word_dict = SymbolTable()
        for word in self.embedding_words:
            self.word_dict.get(word)
    # Process data
    return [
        map_words_to_symbols(s, self.word_dict.lookup, self.ngrams)
        for s in sentence_data
    ]

def _preprocess_data(self, sentence_data, init=True):
    # Initialize word table
    if init:
        self.word_dict = SymbolTable()
    # Process data: extend the table at train time, look up at test time
    mapper = self.word_dict.get if init else self.word_dict.lookup
    tokens = [
        map_words_to_symbols(s, mapper, self.ngrams)
        for s in sentence_data
    ]
    if init:
        self.td = self.word_dict.num_symbols()
    return tokens

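# ----------------------------------------------------------------------
# Illustrative sketch (an assumed API, not the real class): the NLP
# snippets in this file treat SymbolTable as an insert-or-lookup
# string-to-int table where get() extends the table, lookup() is read-only
# with a reserved UNKNOWN id, and ids start at 2 so 0 can stay a padding
# symbol and 1 the UNKNOWN symbol. A minimal version consistent with that
# usage:
class SymbolTableSketch(object):
    """Hypothetical stand-in for the SymbolTable used above."""

    def __init__(self, starting_symbol=2, unknown_symbol=1):
        self.s = starting_symbol
        self.unknown = unknown_symbol
        self.d = dict()

    def get(self, w):
        # Insert w if new, then return its id (train-time mapper)
        if w not in self.d:
            self.d[w] = self.s
            self.s += 1
        return self.d[w]

    def lookup(self, w):
        # Read-only lookup; unseen tokens map to UNKNOWN (test-time mapper)
        return self.d.get(w, self.unknown)

    def num_symbols(self):
        return self.s

    def num_words(self):
        return len(self.d)

    def len(self):
        return self.s

    def reverse(self):
        return {i: w for w, i in self.d.items()}
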
def main():
    """
    The driver for the Jack syntax analyser. Responsible for setting up and
    invoking Initialiser, JackTokeniser, SymbolTable, and CompilationEngine.
    """
    # Pass Initialiser the CLI argument to build the list of .jack files
    initialiser = Initialiser(sys.argv[1])

    # Map each input file to its .vm output file (replace only the
    # extension, so 'jack' elsewhere in the path is left alone)
    file_names = {}
    for input_file in initialiser.files:
        vm_file = input_file.replace('.jack', '.vm')
        file_names[input_file] = vm_file

    # Compile every .jack file and write to output
    for input_file, output_file in file_names.items():
        # Tokenise the input
        tokeniser = JackTokeniser(input_file)
        # Create a symbol table for the Jack class
        symbol_table = SymbolTable(tokeniser)
        # Create a vm_writer
        vm_writer = VMWriter(output_file)
        # Prepare the compilation engine, compile the class, and close
        engine = CompilationEngine(tokeniser, symbol_table, vm_writer)
        engine.compile_class()
        vm_writer.close()

class LinearModel(SHALOModelVectorMean, SHALOModelFixed):
    """Linear model over pretrained embeddings"""

    name = 'LinearModel'

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize word table and populate with embedding words
        if init:
            self.word_dict = SymbolTable()
            for word in self.embedding_words:
                self.word_dict.get(word)
        # Process data
        return [
            map_words_to_symbols(s, self.word_dict.lookup, self.ngrams)
            for s in sentence_data
        ]

class SHALOModelPreTrain(SHALOModel):

    name = 'SHALOModelPreTrain'

    def __init__(self, embedding_file, save_file=None, n_threads=None):
        SHALOModel.__init__(self, save_file, n_threads)
        with open(embedding_file, 'rb') as f:
            self.embedding_words, self.embeddings = cPickle.load(f)

    def _word_table_init(self, training_sentences):
        """Get training words and init word table with pre-embedded words"""
        self._get_training_words(training_sentences)
        self.word_dict = SymbolTable()
        for word in self.embedding_words_train:
            self.word_dict.get(word)

    def _get_training_words(self, training_sentences):
        """Get training words and subset of pre-embedded words in train set"""
        unique_words = set(w for s in training_sentences for w in s)
        embedding_idxs_train, self.embedding_words_train = [], []
        for i, word in enumerate(self.embedding_words):
            if word in unique_words:
                self.embedding_words_train.append(word)
                embedding_idxs_train.append(i)
        idxs = np.ravel(embedding_idxs_train)
        self.embeddings_train = self.embeddings[idxs, :]

    def _get_embedding(self):
        """
        Return embedding tensor (either constant or variable)
        Row 0 is 0 vector for no token
        Row 1 is random initialization for UNKNOWN
        Rows 2 : 2 + len(self.embedding_words) are pretrained initialization
        Remaining rows are random initialization
        """
        zero = tf.constant(0.0, dtype=tf.float32, shape=(1, self.d))
        s = self.seed - 1
        unk = tf.Variable(tf.random_normal((1, self.d), stddev=SD, seed=s))
        pretrain = tf.Variable(self.embeddings_train, dtype=tf.float32)
        vecs = [zero, unk, pretrain]
        n_r = self.word_dict.num_words() - len(self.embedding_words_train)
        if n_r > 0:
            r = tf.Variable(tf.random_normal((n_r, self.d), stddev=SD,
                                             seed=s))
            vecs.append(r)
        self.U = tf.concat(vecs, axis=0, name='embedding_matrix')
        return self.U

def _preprocess_data(self, sentence_data, init=True):
    # Initialize word table
    if init:
        self.word_dict = SymbolTable()
    # Process data: extend the table at train time, look up at test time
    mapper = self.word_dict.get if init else self.word_dict.lookup
    return [
        map_words_to_symbols(s, mapper, self.ngrams)
        for s in sentence_data
    ]

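# ----------------------------------------------------------------------
# Standalone demo of the init/extend pattern above, using a plain dict in
# place of SymbolTable (Python 2, where map returns a list as in the
# snippets here): at train time the get mapper extends the vocabulary; at
# test time the lookup mapper sends unseen tokens to UNKNOWN instead.
table = {}
UNK = 1
next_id = [2]

def get(w):
    # Train-time mapper: insert-or-return
    if w not in table:
        table[w] = next_id[0]
        next_id[0] += 1
    return table[w]

def lookup(w):
    # Test-time mapper: never extends the table
    return table.get(w, UNK)

train_sents = ["the cat sat".split()]
test_sents = ["the dog sat".split()]
print([map(get, s) for s in train_sents])    # [[2, 3, 4]]
print([map(lookup, s) for s in test_sents])  # [[2, 1, 4]] -- 'dog' -> UNK
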
def __init__(self, save_file=None, name='RNNBase', seed=None, n_threads=4):
    """Base class for bidirectional RNN"""
    # Define metadata
    self.mx_len = None              # Max sentence length
    self.dim = None                 # Embedding dimension
    self.n_v = None                 # Vocabulary size
    self.lr = None                  # Learning rate
    self.attn = None                # Attention window
    self.cell = None                # RNN cell type
    self.word_dict = SymbolTable()  # Symbol table for dictionary
    # Define input layers
    self.sentences = None
    self.sentence_lengths = None
    self.train_marginals = None
    self.keep_prob = None
    self.seed = seed
    # Super constructor
    super(RNNBase, self).__init__(n_threads=n_threads, save_file=save_file,
                                  name=name)

def assemble(input_file):
    # Strip comments and blank lines from the source
    cleaned = [
        strip_comments(line).strip()
        for line in utils.read_file_to_list(input_file)
    ]
    asm = [line for line in cleaned if line != ""]
    symtbl = SymbolTable(False)
    reltbl = SymbolTable(True)

    # Pass One
    intermediate, errors_one = pass_one(asm, symtbl)
    # Pass Two
    output, errors_two = pass_two(intermediate, symtbl, reltbl)

    if len(errors_one) > 0:
        print("Errors during pass one:")
        for line_num, e in errors_one:
            print("Error: line {0}: {1}".format(line_num, e))
    if len(errors_two) > 0:
        print("Errors during pass two:")
        for line_num, e in errors_two:
            print("Error: line {0}: {1}".format(line_num, e))
    if len(errors_one) > 0 or len(errors_two) > 0:
        print("One or more errors encountered during assembly operation")
    return intermediate, output

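# ----------------------------------------------------------------------
# Illustration of the two-pass contract assumed by assemble() above
# (pass_one/pass_two are project code; this sketch uses a plain dict in
# place of SymbolTable and is not their implementation): pass one records
# each label's byte offset while collecting instructions, so pass two can
# resolve symbol operands against the table.
def pass_one_sketch(asm_lines, symtbl):
    intermediate, errors, offset = [], [], 0
    for line in asm_lines:
        if line.endswith(':'):
            symtbl[line[:-1]] = offset   # record label at current byte offset
        else:
            intermediate.append(line)
            offset += 4                  # fixed-width 4-byte instructions
    return intermediate, errors

demo_table = {}
print(pass_one_sketch(["main:", "addi $t0 $0 1", "loop:", "j loop"],
                      demo_table))      # (['addi $t0 $0 1', 'j loop'], [])
print(demo_table)                       # {'main': 0, 'loop': 4}
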
def _preprocess_data(self, candidates, extend=False):
    """Convert candidate sentences to lookup sequences

    :param candidates: candidates to process
    :param extend: extend symbol table for tokens (train), or lookup (test)?
    """
    if not hasattr(self, 'word_dict'):
        self.word_dict = SymbolTable()
    data, ends = [], []
    for candidate in candidates:
        toks = candidate.get_contexts()[0].text.split()
        # Either extend word table or retrieve from it
        f = self.word_dict.get if extend else self.word_dict.lookup
        data.append(np.array(map(f, toks)))
        ends.append(len(toks))
    return data, ends

def _preprocess_data(self, candidates, extend=False):
    """Convert candidate sentences to tagged symbol sequences

    @candidates: candidates to process
    @extend: extend symbol table for tokens (train), or lookup (test)?
    """
    if not hasattr(self, 'word_dict'):
        self.word_dict = SymbolTable()
    data, ends = [], []
    for candidate in candidates:
        # Read sentence data
        tokens = candidate_to_tokens(candidate)
        # Get label sequence marking the candidate's argument span
        labels = np.zeros(len(tokens), dtype=int)
        labels[candidate[0].get_word_start():
               candidate[0].get_word_end() + 1] = 1
        # Tag sequence
        s = tag(tokens, labels)
        # Either extend word table or retrieve from it
        f = self.word_dict.get if extend else self.word_dict.lookup
        data.append(np.array(map(f, s)))
        ends.append(candidate[0].get_word_end())
    return data, ends

def _preprocess_data(self, candidates, extend=False):
    """Convert candidate sentences to lookup sequences

    :param candidates: candidates to process
    :param extend: extend symbol table for tokens (train), or lookup (test)?
    """
    if not hasattr(self, 'word_dict'):
        self.word_dict = SymbolTable()
    data, ends = [], []
    for candidate in candidates:
        # Mark sentence with the spans of the two candidate arguments
        args = [
            (candidate[0].get_word_start(), candidate[0].get_word_end(), 1),
            (candidate[1].get_word_start(), candidate[1].get_word_end(), 2)
        ]
        s = mark_sentence(candidate_to_tokens(candidate), args)
        # Either extend word table or retrieve from it
        f = self.word_dict.get if extend else self.word_dict.lookup
        data.append(np.array(map(f, s)))
        ends.append(max(candidate[i].get_word_end() for i in [0, 1]))
    return data, ends

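# ----------------------------------------------------------------------
# Illustration of the argument-marking step (mark_sentence is external and
# its exact marker tokens are an assumption here): each candidate argument
# span is wrapped in markers so the RNN can tell arguments from context.
def mark_sentence_sketch(tokens, args):
    out = list(tokens)
    # Process spans right-to-left so earlier indices stay valid
    for start, end, idx in sorted(args, reverse=True):
        out[start:end + 1] = (['ARG%d_OPEN' % idx] + out[start:end + 1] +
                              ['ARG%d_CLOSE' % idx])
    return out

print(mark_sentence_sketch("the cat sat on the mat".split(),
                           [(1, 1, 1), (5, 5, 2)]))
# ['the', 'ARG1_OPEN', 'cat', 'ARG1_CLOSE', 'sat', 'on', 'the',
#  'ARG2_OPEN', 'mat', 'ARG2_CLOSE']
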
class SparseLinearModel(SHALOModelRandInit):
    """Sparse linear model over BOW indicator vector"""

    name = 'SparseLinearModel'

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize word table
        if init:
            self.word_dict = SymbolTable()
        # Process data: extend the table at train time, look up at test time
        mapper = self.word_dict.get if init else self.word_dict.lookup
        tokens = [
            map_words_to_symbols(s, mapper, self.ngrams)
            for s in sentence_data
        ]
        if init:
            self.td = self.word_dict.num_symbols()
        return tokens

    def _get_data_batch(self, x_batch):
        # Construct LIL matrix of token counts
        X_lil = sparse.lil_matrix((len(x_batch), self.td))
        for j, x in enumerate(x_batch):
            for t in x:
                X_lil[j, t] += 1
        # Get batch data
        indices, ids, weights = [], [], []
        max_len = 0
        for i, (row, data) in enumerate(zip(X_lil.rows, X_lil.data)):
            # Dummy weight for all-zero row
            if len(row) == 0:
                indices.append((i, 0))
                ids.append(0)
                weights.append(0.0)
                continue
            # Update indices by position
            max_len = max(max_len, len(row))
            indices.extend((i, t) for t in xrange(len(row)))
            ids.extend(row)
            weights.extend(data)
        shape = (len(X_lil.rows), max_len)
        return [indices, shape, ids, weights], None

    def _get_feed(self, x_batch, len_batch, y_batch=None):
        indices, shape, ids, weights = x_batch
        feed = {
            self.indices: indices,
            self.shape: shape,
            self.ids: ids,
            self.weights: weights,
        }
        if y_batch is not None:
            feed[self.y] = y_batch
        return feed

    def _build(self):
        assert self.lr is not None
        assert self.l2_penalty is not None
        assert self.loss_function is not None
        # Define input placeholders
        self.indices = tf.placeholder(tf.int64)
        self.shape = tf.placeholder(tf.int64, (2, ))
        self.ids = tf.placeholder(tf.int64)
        self.weights = tf.placeholder(tf.float32)
        self.y = tf.placeholder(tf.float32, (None, ))
        # Define training variables
        sparse_ids = tf.SparseTensor(self.indices, self.ids, self.shape)
        sparse_vals = tf.SparseTensor(self.indices, self.weights, self.shape)
        s1, s2 = self.seed, (self.seed + 1 if self.seed is not None else None)
        w = tf.Variable(tf.random_normal((self.td, 1), stddev=0.01, seed=s1))
        b = tf.Variable(tf.random_normal((1, 1), stddev=0.01, seed=s2))
        z = tf.nn.embedding_lookup_sparse(params=w, sp_ids=sparse_ids,
                                          sp_weights=sparse_vals,
                                          combiner='sum')
        h = tf.squeeze(tf.add(z, b))
        # Define training procedure
        self.loss = self._get_loss(h, self.y)
        self.loss += self.l2_penalty * tf.nn.l2_loss(w)
        self.prediction = tf.sigmoid(h)
        self.train_fn = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        self.save_dict = self._get_save_dict(w=w, b=b)

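# ----------------------------------------------------------------------
# Standalone demo of the LIL-to-sparse-feed conversion in _get_data_batch
# above: each row's nonzero column ids and counts become flat ids/weights
# arrays addressed by (row, position) indices, the layout that
# tf.nn.embedding_lookup_sparse consumes.
from scipy import sparse

X = sparse.lil_matrix((2, 5))
X[0, 1] = 2.0   # token 1 appears twice in sentence 0
X[0, 3] = 1.0
# sentence 1 left empty to show the dummy-weight case
indices, ids, weights = [], [], []
for i, (row, data) in enumerate(zip(X.rows, X.data)):
    if len(row) == 0:
        indices.append((i, 0))
        ids.append(0)
        weights.append(0.0)
        continue
    indices.extend((i, t) for t in range(len(row)))
    ids.extend(row)
    weights.extend(data)
print(indices)   # [(0, 0), (0, 1), (1, 0)]
print(ids)       # [1, 3, 0]
print(weights)   # [2.0, 1.0, 0.0]
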
class RNNBase(TFNoiseAwareModel):

    representation = True

    def __init__(self, save_file=None, name='RNNBase', seed=None, n_threads=4):
        """Base class for bidirectional RNN"""
        # Define metadata
        self.mx_len = None              # Max sentence length
        self.dim = None                 # Embedding dimension
        self.n_v = None                 # Vocabulary size
        self.lr = None                  # Learning rate
        self.attn = None                # Attention window
        self.cell = None                # RNN cell type
        self.word_dict = SymbolTable()  # Symbol table for dictionary
        # Define input layers
        self.sentences = None
        self.sentence_lengths = None
        self.train_marginals = None
        self.keep_prob = None
        self.seed = seed
        # Super constructor
        super(RNNBase, self).__init__(n_threads=n_threads,
                                      save_file=save_file, name=name)

    def _preprocess_data(self, candidates, extend):
        """Build @self.word_dict to encode and process data for extraction
        Return list of encoded sentences and list of last index of arguments
        """
        raise NotImplementedError()

    def _check_max_sentence_length(self, ends):
        """Check that extraction arguments are within @self.mx_len"""
        mx = self.mx_len
        for i, end in enumerate(ends):
            if end >= mx:
                w = "Candidate {0} has argument past max length for model:"
                info = "[arg ends at index {0}; max len {1}]".format(end, mx)
                warnings.warn('\t'.join([w.format(i), info]))

    def _make_tensor(self, x):
        """Construct input tensor with padding
        Builds a matrix of symbols corresponding to @self.word_dict for the
        current batch and an array of true sentence lengths
        """
        batch_size = len(x)
        x_batch = np.zeros((batch_size, self.mx_len), dtype=np.int32)
        len_batch = np.zeros(batch_size, dtype=np.int32)
        for j, token_ids in enumerate(x):
            t = min(len(token_ids), self.mx_len)
            x_batch[j, 0:t] = token_ids[0:t]
            len_batch[j] = t
        return x_batch, len_batch

    def _embedding_init(self, s):
        """Random initialization for embedding table"""
        return tf.random_normal((self.n_v - 1, self.dim), stddev=SD, seed=s)

    def _build(self):
        """Get feed forward step, loss function, and optimizer for RNN"""
        # Define input layers
        self.sentences = tf.placeholder(tf.int32, [None, None])
        self.sentence_lengths = tf.placeholder(tf.int32, [None])
        self.train_marginals = tf.placeholder(tf.float32, [None])
        self.keep_prob = tf.placeholder(tf.float32)
        # Seeds
        s = self.seed
        s1, s2, s3, s4 = [None] * 4 if s is None else [s + i for i in range(4)]
        # Embedding layer; row 0 stays zero for the padding symbol
        emb_var = tf.Variable(self._embedding_init(s1))
        embedding = tf.concat([tf.zeros([1, self.dim]), emb_var], axis=0)
        inputs = tf.nn.embedding_lookup(embedding, self.sentences)
        # Build RNN graph
        batch_size = tf.shape(self.sentences)[0]
        rand_name = "RNN_{0}".format(random.randint(0, 10 ** 12))  # Obscene hack
        init = tf.contrib.layers.xavier_initializer(seed=s2)
        with tf.variable_scope(rand_name, reuse=False, initializer=init):
            # Build RNN cells
            fw_cell = self.cell(self.dim)
            bw_cell = self.cell(self.dim)
            # Add attention if needed
            if self.attn:
                fw_cell = rnn.AttentionCellWrapper(fw_cell, self.attn,
                                                   state_is_tuple=True)
                bw_cell = rnn.AttentionCellWrapper(bw_cell, self.attn,
                                                   state_is_tuple=True)
            # Construct RNN
            initial_state_fw = fw_cell.zero_state(batch_size, tf.float32)
            initial_state_bw = bw_cell.zero_state(batch_size, tf.float32)
            rnn_out, _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, inputs,
                sequence_length=self.sentence_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                time_major=False)
        # Get potentials
        potentials = get_bi_rnn_output(rnn_out, self.dim,
                                       self.sentence_lengths)
        # Compute activation
        potentials_dropout = tf.nn.dropout(potentials, self.keep_prob,
                                           seed=s3)
        W = tf.Variable(tf.random_normal((2 * self.dim, 1), stddev=SD,
                                         seed=s4))
        b = tf.Variable(0., dtype=tf.float32)
        h_dropout = tf.squeeze(tf.matmul(potentials_dropout, W)) + b
        # Noise-aware loss
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=self.train_marginals, logits=h_dropout))
        # Backprop trainer
        self.train_fn = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        # Get prediction
        self.prediction = tf.nn.sigmoid(h_dropout)

    def train(self, candidates, marginals, n_epochs=25, lr=0.01, dropout=0.5,
              dim=50, attn_window=None, cell_type=rnn.BasicLSTMCell,
              batch_size=256, max_sentence_length=None, rebalance=False,
              dev_candidates=None, dev_labels=None, print_freq=5):
        """Train bidirectional RNN model for binary classification

        @candidates: list of Candidate objects for training
        @marginals: array of marginal probabilities for each Candidate
        @n_epochs: number of training epochs
        @lr: learning rate
        @dropout: keep probability for dropout layer (no dropout if None)
        @dim: embedding dimension
        @attn_window: attention window length (no attention if 0 or None)
        @cell_type: subclass of tensorflow.python.ops.rnn_cell_impl._RNNCell
        @batch_size: batch size for mini-batch SGD
        @max_sentence_length: maximum sentence length for candidates
        @rebalance: bool or fraction of positive examples for training
            - if True, defaults to standard 0.5 class balance
            - if False, no class balancing
        @dev_candidates: list of Candidate objects for evaluation
        @dev_labels: array of labels for each dev Candidate
        @print_freq: number of epochs after which to print status
        """
        verbose = print_freq > 0
        if verbose:
            print("[{0}] Dimension={1} LR={2}".format(self.name, dim, lr))
            print("[{0}] Begin preprocessing".format(self.name))
            st = time()
        # Text preprocessing
        train_data, ends = self._preprocess_data(candidates, extend=True)
        # Get training indices
        np.random.seed(self.seed)
        train_idxs = LabelBalancer(marginals).get_train_idxs(rebalance)
        x_train = [train_data[j] for j in train_idxs]
        y_train = np.ravel(marginals)[train_idxs]
        # Get max sentence size
        self.mx_len = max_sentence_length or max(len(x) for x in x_train)
        self._check_max_sentence_length(ends)
        # Build model
        self.dim = dim
        self.lr = lr
        self.n_v = self.word_dict.len()
        self.attn = attn_window
        self.cell = cell_type
        self._build()
        # Get dev data
        dev_data, dev_gold = None, None
        if dev_candidates is not None and dev_labels is not None:
            dev_data, _ = self._preprocess_data(dev_candidates, extend=False)
            dev_gold = np.ravel(dev_labels)
            if not ((dev_gold >= 0).all() and (dev_gold <= 1).all()):
                raise Exception("Dev labels should be in [0, 1]")
            print("[{0}] Loaded {1} candidates for evaluation".format(
                self.name, len(dev_data)))
        # Run mini-batch SGD
        n = len(x_train)
        batch_size = min(batch_size, n)
        if verbose:
            print("[{0}] Preprocessing done ({1:.2f}s)".format(
                self.name, time() - st))
            st = time()
            print("[{0}] Training model".format(self.name))
            print("[{0}] #examples={1} #epochs={2} batch size={3}".format(
                self.name, n, n_epochs, batch_size))
        self.session.run(tf.global_variables_initializer())
        for t in range(n_epochs):
            epoch_loss = []
            for i in range(0, n, batch_size):
                # Get batch tensors
                x_b, len_b = self._make_tensor(x_train[i:i + batch_size])
                y_b = y_train[i:i + batch_size]
                # Run training step and evaluate loss function
                epoch_loss.append(
                    self.session.run(
                        [self.loss, self.train_fn], {
                            self.sentences: x_b,
                            self.sentence_lengths: len_b,
                            self.train_marginals: y_b,
                            self.keep_prob: dropout or 1.0,
                        })[0])
            # Print training stats
            if verbose and (t % print_freq == 0 or t in [0, (n_epochs - 1)]):
                msg = "[{0}] Epoch {1} ({2:.2f}s)\tAverage loss={3:.6f}".format(
                    self.name, t, time() - st, np.mean(epoch_loss))
                if dev_data is not None:
                    dev_p = self._marginals_preprocessed(dev_data)
                    f1, _, _ = f1_score(dev_p, dev_gold)
                    msg += '\tDev F1={0:.2f}'.format(100. * f1)
                print(msg)
        if verbose:
            print("[{0}] Training done ({1:.2f}s)".format(
                self.name, time() - st))

    def _marginals_preprocessed(self, test_data):
        """Get marginals from preprocessed data"""
        x, x_len = self._make_tensor(test_data)
        return np.ravel(
            self.session.run(
                self.prediction, {
                    self.sentences: x,
                    self.sentence_lengths: x_len,
                    self.keep_prob: 1.0,
                }))

    def marginals(self, test_candidates):
        """Get likelihood of tagged sequences represented by test_candidates

        @test_candidates: list of lists representing test sentence
        """
        test_data, ends = self._preprocess_data(test_candidates, extend=False)
        self._check_max_sentence_length(ends)
        return self._marginals_preprocessed(test_data)

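# ----------------------------------------------------------------------
# Standalone illustration of the padding scheme in _make_tensor above:
# token ids are right-padded with 0 (the reserved no-token row of the
# embedding matrix) or truncated, and true lengths are kept so the RNN
# stops at each sentence's real end.
import numpy as np

mx_len = 5
x = [[2, 3, 4], [5, 6, 7, 8, 9, 10]]   # second sentence gets truncated
x_batch = np.zeros((len(x), mx_len), dtype=np.int32)
len_batch = np.zeros(len(x), dtype=np.int32)
for j, token_ids in enumerate(x):
    t = min(len(token_ids), mx_len)
    x_batch[j, :t] = token_ids[:t]
    len_batch[j] = t
print(x_batch)    # [[2 3 4 0 0], [5 6 7 8 9]]
print(len_batch)  # [3 5]
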
class TTBB(SHALOModelFixed):
    """Implementation of A Simple but Tough-to-Beat Baseline for Sent. Embedding

    In the basic model, the common component vector is computed before all
    computations. The embeddings are static, so no updates are made.
    """

    name = 'TTBB'

    def __init__(self, embedding_file, word_freq_file, save_file=None,
                 n_threads=None):
        SHALOModelFixed.__init__(self, embedding_file, save_file, n_threads)
        # Load word frequency (marginals) file
        with open(word_freq_file, 'rb') as f:
            self.word_freq = cPickle.load(f)

    def _word_table_init(self, training_sentences):
        self.word_dict = SymbolTable()
        for word in self.embedding_words:
            self.word_dict.get(word)

    def _get_mapper(self, init):
        return self.word_dict.lookup

    def _preprocess_data(self, sentence_data, init=True):
        # Initialize word table and populate with embedding words
        if init:
            self._word_table_init(sentence_data)
        # Map tokens to symbols
        mapper = self._get_mapper(init)
        tokens = [
            np.ravel(map_words_to_symbols(s, mapper, self.ngrams))
            for s in sentence_data
        ]
        self.train_tokens = tokens
        # Return immediately if not initializing
        if not init:
            return tokens
        # If initializing, get marginal estimates
        self.marginals = np.zeros(self.word_dict.num_symbols())
        for word, idx in self.word_dict.d.iteritems():
            # Try getting word frequency directly
            if word in self.word_freq:
                self.marginals[idx] = self.word_freq[word]
            # Otherwise, try getting minimum frequency among sub-grams
            else:
                split_grams = word.split(GRAMSEP)
                if len(split_grams) > 1:
                    min_freq = min(self.word_freq.get(w, 0.0)
                                   for w in split_grams)
                    self.marginals[idx] = min_freq
        # Get initial smoother value
        self.a = self.train_kwargs.get('a', -3.0)
        return tokens

    def _compute_train_common_component(self, init=False):
        if init:
            self.session.run(tf.global_variables_initializer())
        x_array, x_len = self._get_data_batch(self.train_tokens)
        self.ccx = self.session.run(self.tf_ccx, {
            self.input: x_array,
            self.input_lengths: x_len
        })
        return self.ccx

    def _get_a_exp(self):
        return tf.constant(self.a, dtype=tf.float32)

    def _get_common_component(self):
        self.ccx = self._compute_train_common_component(init=True)
        return tf.constant(self.ccx, dtype=tf.float32)

    def _embed_sentences(self):
        """Tensorflow implementation of Simple but Tough-to-Beat Baseline"""
        # Get word features
        word_embeddings = self._get_embedding()
        word_feats = tf.nn.embedding_lookup(word_embeddings, self.input)
        # Get marginal estimates and scaling term
        batch_size = tf.shape(word_feats)[0]
        a = tf.pow(10.0, self._get_a_exp())
        p = tf.constant(self.marginals, dtype=tf.float32, name='marginals')
        q = tf.reshape(a / (a + tf.nn.embedding_lookup(p, self.input)),
                       (batch_size, self.mx_len, 1))
        # Compute initial sentence embedding
        z = tf.reshape(1.0 / tf.to_float(self.input_lengths), (batch_size, 1))
        S = z * tf.reduce_sum(q * word_feats, axis=1)
        # Compute common component
        S_centered = S - tf.reduce_mean(S, axis=0)
        _, _, V = tf.svd(S_centered, full_matrices=False, compute_uv=True)
        self.tf_ccx = tf.stop_gradient(tf.gather(tf.transpose(V), 0))
        # Common component removal
        ccx = tf.reshape(self._get_common_component(), (1, self.d))
        sv = {'embeddings': word_embeddings, 'a': a, 'p': p, 'ccx': ccx}
        return S - tf.matmul(S, ccx * tf.transpose(ccx)), sv

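# ----------------------------------------------------------------------
# Worked example of the TTBB reweighting above: with the default smoother
# exponent self.a = -3.0, i.e. a = 10**-3, a word's weight a / (a + p(w))
# downweights frequent words and leaves rare ones near 1.
import numpy as np

a = 10.0 ** -3.0                   # self.a is an exponent, not the weight
p = np.array([1e-2, 1e-4])         # marginals for a common / a rare word
print(a / (a + p))                 # ~[0.091, 0.909]
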
def _word_table_init(self, training_sentences):
    self.word_dict = SymbolTable()
    for word in self.embedding_words:
        self.word_dict.get(word)

def _word_table_init(self, training_sentences):
    """Get training words and init word table with pre-embedded words"""
    self._get_training_words(training_sentences)
    self.word_dict = SymbolTable()
    for word in self.embedding_words_train:
        self.word_dict.get(word)

class CRFTextRNN(RNNBase):
    """RNN for sequence labeling of strings of text."""

    def _preprocess_data(self, candidates, marginals=None, dev_labels=None,
                         extend=False, shuffle_data=False):
        """Convert candidate sentences to lookup sequences

        :param candidates: candidates to process
        :param extend: extend symbol table for tokens (train), or lookup
            (test)?
        """
        if not hasattr(self, 'word_dict'):
            self.word_dict = SymbolTable()
        if not hasattr(self, 'char_dict'):
            self.char_dict = SymbolTable()
        max_word_len = 0
        data, ends, sent_buf, words, word_buf, sents = [], [], [], [], [], []

        def flush():
            # Encode and store the buffered sentence and its per-word chars
            f = self.word_dict.get if extend else self.word_dict.lookup
            data.append(np.array(map(f, sent_buf)))
            sents.append(sent_buf)
            ends.append(len(sent_buf))
            c = self.char_dict.get if extend else self.char_dict.lookup
            sent_words = [np.array(map(c, chars)) for chars in word_buf]
            words.append(np.array(sent_words))

        for candidate in candidates:
            tok = candidate.get_contexts()[1].text
            index = candidate.get_contexts()[2].text
            # A token with index '0' starts a new sentence; flush the buffers
            if sent_buf and index == '0':
                flush()
                sent_buf, word_buf = [], []
            sent_buf.append(tok)
            word_buf.append(list(tok))
            max_word_len = max(max_word_len, len(tok))
        # Flush the final buffered sentence
        if sent_buf:
            flush()

        # Align flat marginals with per-sentence boundaries
        marg = []
        if marginals is not None:
            cand_idx = 0
            for sent_len in ends:
                end_idx = cand_idx + sent_len
                marg.append(marginals[cand_idx:end_idx, :])
                cand_idx = end_idx
            marg = np.array(marg)
        aligned_dev_labels = []
        if dev_labels is not None:
            cand_idx = 0
            for sent_len in ends:
                end_idx = cand_idx + sent_len
                aligned_dev_labels.append(dev_labels[cand_idx:end_idx])
                cand_idx = end_idx
            aligned_dev_labels = np.array(aligned_dev_labels)

        if shuffle_data:
            indexes = np.arange(len(data))
            np.random.shuffle(indexes)
            data = np.array(data)[indexes]
            sents = np.array(sents)[indexes]
            ends = np.array(ends)[indexes]
            if marginals is not None:
                marg = marg[indexes]
            if dev_labels is not None:
                aligned_dev_labels = aligned_dev_labels[indexes]
            if words:
                words = np.array(words)[indexes]
            print('Shuffled data for LSTM')
        words = words if len(words) > 0 else None
        return data, ends, marg, aligned_dev_labels, words, max_word_len, sents

    def _build_model(self, dim=50, dim_char=50, attn_window=None, max_len=20,
                     cell_type=tf.contrib.rnn.BasicLSTMCell, max_word_len=10,
                     word_dict=SymbolTable(), char_dict=SymbolTable(),
                     **kwargs):
        # Set the word dictionary passed in as the word_dict for the instance
        self.max_len = max_len
        self.word_dict = word_dict
        vocab_size = word_dict.len()
        self.max_word_len = max_word_len
        self.char_dict = char_dict
        n_chars = char_dict.len()

        # Define input layers
        self.sentences = tf.placeholder(tf.int32, [None, None])
        self.sentence_lengths = tf.placeholder(tf.int32, [None])

        # Seeds
        s = self.seed
        s1, s2, s3, s4 = [None] * 4 if s is None else [s + i for i in range(4)]

        # Embedding layer; row 0 stays zero for the padding symbol
        emb_var = tf.Variable(
            tf.random_normal((vocab_size - 1, dim), stddev=SD, seed=s1))
        embedding = tf.concat([tf.zeros([1, dim]), emb_var], axis=0)
        inputs = tf.nn.embedding_lookup(embedding, self.sentences)

        # Character embedding
        # shape = (batch_size, max_sent_len, max_word_len)
        self.words = tf.placeholder(tf.int32, [None, None, None])
        self.word_lengths = tf.placeholder(tf.int32, shape=[None, None])
        char_var = tf.get_variable(name='char_embeddings', dtype=tf.float32,
                                   shape=[n_chars, dim_char])
        char_embedding = tf.nn.embedding_lookup(char_var, self.words)
        char_s = tf.shape(char_embedding)
        # shape = (batch x sentence, word, dim of char embeddings)
        char_embedding = tf.reshape(
            char_embedding,
            shape=[char_s[0] * char_s[1], char_s[-2], dim_char])
        word_lengths = tf.reshape(self.word_lengths,
                                  shape=[char_s[0] * char_s[1]])
        init = tf.contrib.layers.xavier_initializer(seed=s2)
        with tf.variable_scope(self.name + '_char', reuse=False,
                               initializer=init):
            char_fw_cell = cell_type(dim_char, state_is_tuple=True)
            char_bw_cell = cell_type(dim_char, state_is_tuple=True)
            _, ((_, char_fw_out),
                (_, char_bw_out)) = tf.nn.bidirectional_dynamic_rnn(
                    char_fw_cell, char_bw_cell, char_embedding,
                    sequence_length=word_lengths, dtype=tf.float32)
            char_out = tf.concat([char_fw_out, char_bw_out], axis=-1)
            char_rep = tf.reshape(char_out,
                                  shape=[-1, char_s[1], 2 * dim_char])
            # inputs = tf.concat([inputs, char_rep], axis=-1)

        # Add dropout layer on the inputs
        self.in_keep_prob = tf.placeholder(tf.float32)
        inputs_dropout = tf.nn.dropout(inputs, self.in_keep_prob, seed=s3)

        # Build RNN graph
        batch_size = tf.shape(self.sentences)[0]
        init = tf.contrib.layers.xavier_initializer(seed=s2)
        with tf.variable_scope(self.name, reuse=False, initializer=init):
            # Build RNN cells
            fw_cell = cell_type(dim)
            bw_cell = cell_type(dim)
            # Add attention if needed
            if attn_window:
                fw_cell = tf.contrib.rnn.AttentionCellWrapper(
                    fw_cell, attn_window, state_is_tuple=True)
                bw_cell = tf.contrib.rnn.AttentionCellWrapper(
                    bw_cell, attn_window, state_is_tuple=True)
            # Construct RNN
            initial_state_fw = fw_cell.zero_state(batch_size, tf.float32)
            initial_state_bw = bw_cell.zero_state(batch_size, tf.float32)
            rnn_out, _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, inputs_dropout,
                sequence_length=self.sentence_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                time_major=False)
            potentials, ntime_steps = get_bi_rnn_seq_output(
                rnn_out, dim, self.sentence_lengths)

        # Add dropout layer on the output potentials
        self.out_keep_prob = tf.placeholder(tf.float32)
        potentials_dropout = tf.nn.dropout(potentials, self.out_keep_prob,
                                           seed=s3)

        # Build activation layer
        self.Y = tf.placeholder(tf.float32, [None, None, self.cardinality])
        self.train_labels = tf.placeholder(tf.int32, [None, self.max_len])
        W = tf.Variable(
            tf.random_normal((2 * dim, self.cardinality), stddev=SD, seed=s4))
        b = tf.Variable(np.zeros(self.cardinality), dtype=tf.float32)
        self.logits = tf.matmul(potentials_dropout, W) + b
        self.logits = tf.reshape(self.logits,
                                 [-1, ntime_steps, self.cardinality])
        # self.marginals_op = tf.nn.softmax(self.logits)
        self.pred = tf.cast(tf.argmax(self.logits, axis=-1), tf.int32)

    def _build_training_ops(self, **training_kwargs):
        # CRF alternative (kept for reference): maximize CRF log-likelihood
        # and decode with Viterbi instead of per-token softmax
        # batch_size = tf.shape(self.logits)[0]
        # seq_len = tf.shape(self.logits)[1]
        # self.Y = tf.cast(tf.argmax(self.Y, axis=2), tf.int32)
        # self.Y = tf.reshape(self.Y, [batch_size, seq_len])
        # log_likelihood, self.transition_params = \
        #     tf.contrib.crf.crf_log_likelihood(
        #         self.logits, self.Y, self.sentence_lengths)
        # self.loss = tf.reduce_mean(-log_likelihood)
        # self.pred, viterbi_score = tf.contrib.crf.viterbi_decode(
        #     self.logits, self.transition_params)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                         labels=self.Y)
        # losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        #     logits=self.logits, labels=self.train_labels)
        # Mask out loss terms at padding positions past each true length
        mask = tf.sequence_mask(self.sentence_lengths)
        losses = tf.boolean_mask(losses, mask)
        self.loss = tf.reduce_mean(losses)
        # Build training op
        self.lr = tf.placeholder(tf.float32)
        self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    def _construct_feed_dict(self, X_b, Y_b, lr=0.01, dropout=None,
                             train_labels=None, chars=None, dropout_in=None,
                             dropout_out=None, **kwargs):
        X_b, len_b, Y_b, L_b, C_b, len_c = self._make_tensor(
            X_b, Y_b, train_labels, chars)
        return {
            self.sentences: X_b,
            self.sentence_lengths: len_b,
            self.Y: Y_b,
            self.in_keep_prob: dropout_in or 1.0,
            self.out_keep_prob: dropout_out or 1.0,
            self.lr: lr,
            self.train_labels: L_b,
            self.words: C_b,
            self.word_lengths: len_c
        }

    def _make_tensor(self, x, y=None, z=None, c=None):
        """Construct input tensor with padding

        Builds a matrix of symbols corresponding to @self.word_dict for the
        current batch and an array of true sentence lengths
        """
        batch_size = len(x)
        x_batch = np.zeros((batch_size, self.max_len), dtype=np.int32)
        y_batch = np.zeros((batch_size, self.max_len, self.cardinality))
        z_batch = np.zeros((batch_size, self.max_len), dtype=np.int32)
        c_batch = np.zeros((batch_size, self.max_len, self.max_word_len),
                           dtype=np.int32)
        len_batch = np.zeros(batch_size, dtype=np.int32)
        len_words = np.zeros((batch_size, self.max_len), dtype=np.int32)

        def fill_chars(j, t, words):
            # Copy per-word character ids (truncated to max_word_len) and
            # record true word lengths for the first t words
            for w_idx, char_ids in enumerate(words[0:t]):
                w = min(len(char_ids), self.max_word_len)
                c_batch[j][w_idx][0:w] = char_ids[0:w]
            char_t = np.array([
                min(len(word_ids), self.max_word_len)
                for word_ids in words[0:t]
            ])
            len_words[j][0:t] = char_t

        if c is not None and y is None and z is None:
            for j, (token_ids, words) in enumerate(zip(x, c)):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                len_batch[j] = t
                fill_chars(j, t, words)
        elif c is not None and y is not None and z is None:
            for j, (token_ids, marginals, words) in enumerate(zip(x, y, c)):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                y_batch[j, 0:t] = marginals[0:t]
                len_batch[j] = t
                fill_chars(j, t, words)
        elif c is not None:
            for j, (token_ids, marginals, labels,
                    words) in enumerate(zip(x, y, z, c)):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                y_batch[j, 0:t] = marginals[0:t]
                z_batch[j, 0:t] = labels[0:t]
                len_batch[j] = t
                fill_chars(j, t, words)
        elif z is not None:
            for j, (token_ids, marginals, labels) in enumerate(zip(x, y, z)):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                y_batch[j, 0:t] = marginals[0:t]
                z_batch[j, 0:t] = labels[0:t]
                len_batch[j] = t
        elif y is not None:
            for j, (token_ids, marginals) in enumerate(zip(x, y)):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                y_batch[j, 0:t] = marginals[0:t]
                len_batch[j] = t
        else:
            for j, token_ids in enumerate(x):
                t = min(len(token_ids), self.max_len)
                x_batch[j, 0:t] = token_ids[0:t]
                len_batch[j] = t
        return x_batch, len_batch, y_batch, z_batch, c_batch, len_words

    def predictions(self, X, b=0.5, batch_size=None, words=None):
        if isinstance(X[0], Candidate):
            X_test, ends, _, _, pwords, _, _ = self._preprocess_data(
                X, extend=False)
            words = pwords if words is not None else None
            self._check_max_sentence_length(ends)
        else:
            X_test = X
        # Make tensor and run prediction op
        x, x_len, _, _, _words, _words_len = self._make_tensor(X_test,
                                                               c=words)
        pred = self.session.run(
            self.pred, {
                self.sentences: x,
                self.sentence_lengths: x_len,
                self.in_keep_prob: 1.0,
                self.out_keep_prob: 1.0,
                self.words: _words,
                self.word_lengths: _words_len
            })
        # CRF alternative (kept for reference): fetch self.logits here and
        # decode each sentence with tf.contrib.crf.viterbi_decode using
        # self.transition_params
        return pred

    def score(self, X_test, Y_test, b=0.5, set_unlabeled_as_neg=True, beta=1,
              batch_size=None, other_id=-1, out_path='predictions.txt',
              ids_to_classes=None, use_chars=False):
        X_test, ends, _, _, words, _, sents = self._preprocess_data(
            X_test, extend=False)
        self._check_max_sentence_length(ends)
        words = words if use_chars else None
        predictions = self.predictions(X_test, b=b, batch_size=batch_size,
                                       words=words)

        # Realign flat gold labels with per-sentence boundaries
        labels = []
        cand_idx = 0
        for sent_len in ends:
            end_idx = cand_idx + sent_len
            labels.append(Y_test[cand_idx:end_idx])
            cand_idx = end_idx
        Y_test = np.array(labels)

        correct = 0
        token_err, sent_err = 0, 0
        token_num, sent_num = 0, len(Y_test)
        other_total, other_as_class = 0, 0
        class_total, class_as_other = 0, 0
        ids_to_words = self.word_dict.reverse()
        preds_final = []
        for sent_pred, sent_gold, sent in zip(predictions, Y_test, sents):
            pred_err = 0
            preds_final_sent = []
            for tag_pred, tag_gold, token in zip(sent_pred, sent_gold, sent):
                token_num += 1
                if tag_gold == other_id:
                    other_total += 1
                else:
                    class_total += 1
                if tag_pred == tag_gold:
                    correct += 1
                if tag_pred != tag_gold:
                    pred_err += 1
                    if tag_pred == other_id:
                        class_as_other += 1
                    if tag_gold == other_id:
                        other_as_class += 1
                if tag_pred > self.cardinality:
                    print('PREDICTION ({}) / CARDINALITY MISMATCH ({})'.format(
                        tag_pred, self.cardinality))
                else:
                    # word = ids_to_words.get(token, None)
                    word = token
                    if ids_to_classes is not None:
                        # In Snorkel, class IDs have to start at 1 because 0
                        # is the reserved value for abstaining labeling
                        # functions. There is no abstention in TensorFlow,
                        # i.e. classes have to be zero-indexed.
                        class_pred = ids_to_classes.get(tag_pred + 1, None)
                    else:
                        class_pred = tag_pred
                    preds_final_sent.append((word, class_pred))
            token_err += pred_err
            if pred_err != 0:
                sent_err += 1
            preds_final.append(preds_final_sent)
        # Avoid division by zero in the error-rate denominators
        if other_total == 0:
            other_total = 1
        if class_total == 0:
            class_total = 1
        return float(correct) / token_num, \
            float(token_err) / token_num, float(sent_err) / sent_num, \
            float(other_as_class) / other_total, \
            float(class_as_other) / class_total, \
            preds_final

    def train(self, X_train, Y_train, dev_labels=None, X_dev=None,
              max_sentence_length=None, shuffle=False, max_word_length=None,
              **kwargs):
        """
        Perform preprocessing of data, construct dataset-specific model,
        then train.
        """
        # Text preprocessing
        X_train, ends, Y_train, train_labels, train_words, max_word_len, _ = \
            self._preprocess_data(X_train, Y_train, dev_labels=dev_labels,
                                  extend=True, shuffle_data=shuffle)
        if X_dev is not None:
            X_dev, _, _, _, _, _, _ = self._preprocess_data(X_dev, [],
                                                            extend=False)
        # Get max sentence size
        max_len = max_sentence_length or max(len(x) for x in X_train)
        self._check_max_sentence_length(ends, max_len=max_len)
        max_word_len = max_word_length or max_word_len
        # Train model -- note we pass word_dict through here so it gets saved
        self._train(X_train, Y_train, X_dev=X_dev, words=train_words,
                    char_dict=self.char_dict, word_dict=self.word_dict,
                    max_len=max_len, dev_labels=train_labels,
                    max_word_len=max_word_len, **kwargs)

    def _train(self, X_train, Y_train, dev_labels=None, words=None,
               n_epochs=25, lr=0.01, batch_size=256, rebalance=False,
               X_dev=None, Y_dev=None, print_freq=5, dev_ckpt=True,
               dev_ckpt_delay=0.75, save_dir='checkpoints', **kwargs):
        """
        Generic training procedure for TF model

        :param X_train: The training Candidates. If self.representation is
            True, then this is a list of Candidate objects; else is a
            csr_AnnotationMatrix with rows corresponding to training
            candidates and columns corresponding to features.
        :param Y_train: Array of marginal probabilities for each Candidate
        :param n_epochs: Number of training epochs
        :param lr: Learning rate
        :param batch_size: Batch size for SGD
        :param rebalance: Bool or fraction of positive examples for training
            - if True, defaults to standard 0.5 class balance
            - if False, no class balancing
        :param X_dev: Candidates for evaluation, same format as X_train
        :param Y_dev: Labels for evaluation, same format as Y_train
        :param print_freq: number of epochs at which to print status, and if
            present, evaluate the dev set (X_dev, Y_dev).
        :param dev_ckpt: If True, save a checkpoint whenever highest score
            on (X_dev, Y_dev) reached. Note: currently only evaluates at
            every @print_freq epochs.
        :param dev_ckpt_delay: Start dev checkpointing after this portion
            of n_epochs.
        :param save_dir: Save dir path for checkpointing.
        :param kwargs: All hyperparameters that change how the graph is
            built must be passed through here to be saved and reloaded to
            save / reload model. *NOTE: If a parameter needed to build the
            network and/or is needed at test time is not included here, the
            model will not be able to be reloaded!*
        """
        self._check_input(X_train)
        verbose = print_freq > 0

        # Set random seed for all numpy operations
        self.rand_state.seed(self.seed)

        # If the data passed in is a feature matrix (representation=False),
        # set the dimensionality here; else assume this is done by sub-class
        if not self.representation:
            kwargs['d'] = X_train.shape[1]

        if dev_labels is not None and len(dev_labels) > 0:
            train_labels = copy.deepcopy(dev_labels)
        else:
            train_labels = None

        # Create new graph, build network, and start session
        self._build_new_graph_session(**kwargs)

        # Build training ops
        # Note that training_kwargs and model_kwargs are mixed together;
        # ideally would be separated but no negative effect
        with self.graph.as_default():
            self._build_training_ops(**kwargs)

        # Initialize variables
        with self.graph.as_default():
            self.session.run(tf.global_variables_initializer())

        # Run mini-batch SGD
        n = len(X_train) if self.representation else X_train.shape[0]
        batch_size = min(batch_size, n)
        if verbose:
            st = time()
            print("[{0}] Training model".format(self.name))
            print("[{0}] n_train={1} #epochs={2} batch size={3}".format(
                self.name, n, n_epochs, batch_size))
        dev_score_opt = 0.0
        for t in range(n_epochs):
            epoch_losses = []
            for i in range(0, n, batch_size):
                if train_labels is not None:
                    batch_labels = train_labels[i:min(n, i + batch_size)]
                else:
                    batch_labels = None
                if words is not None:
                    batch_words = words[i:min(n, i + batch_size)]
                else:
                    batch_words = None
                feed_dict = self._construct_feed_dict(
                    X_train[i:min(n, i + batch_size)],
                    Y_train[i:min(n, i + batch_size)],
                    train_labels=batch_labels, chars=batch_words, lr=lr,
                    **kwargs)
                # Run training step and evaluate loss function
                epoch_loss, _ = self.session.run([self.loss, self.optimizer],
                                                 feed_dict=feed_dict)
                epoch_losses.append(epoch_loss)

            # Reshuffle training data
            train_idxs = range(n)
            self.rand_state.shuffle(train_idxs)
            X_train = [X_train[j] for j in train_idxs] \
                if self.representation else X_train[train_idxs, :]
            Y_train = Y_train[train_idxs]
            if train_labels is not None:
                train_labels = [train_labels[j] for j in train_idxs]

            # Print training stats and optionally checkpoint model
            if verbose and (t % print_freq == 0 or t in [0, (n_epochs - 1)]):
                msg = "[{0}] Epoch {1} ({2:.2f}s)\tAverage loss={3:.6f}".format(
                    self.name, t, time() - st, np.mean(epoch_losses))
                if X_dev is not None:
                    scores = self.score(X_dev, Y_dev, batch_size=batch_size)
                    score = scores if self.cardinality > 2 else scores[-1]
                    score_label = "Acc." if self.cardinality > 2 else "F1"
                    msg += '\tDev {0}={1:.2f}'.format(score_label,
                                                      100. * score)
                print(msg)

                # If best score on dev set so far and dev checkpointing is
                # active, save checkpoint
                if X_dev is not None and dev_ckpt and \
                        t > dev_ckpt_delay * n_epochs and \
                        score > dev_score_opt:
                    dev_score_opt = score
                    self.save(save_dir=save_dir, global_step=t)

        # Conclude training
        if verbose:
            print("[{0}] Training done ({1:.2f}s)".format(
                self.name, time() - st))

        # If checkpointing on, load last checkpoint (i.e. best on dev set)
        if dev_ckpt and X_dev is not None and verbose and dev_score_opt > 0:
            self.load(save_dir=save_dir)

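# ----------------------------------------------------------------------
# Numpy analogue of the loss masking in _build_training_ops above: per-token
# losses past each true sentence length are dropped before averaging, so
# padding positions never contribute gradient.
import numpy as np

losses = np.array([[0.5, 0.2, 0.9],
                   [0.1, 0.4, 0.7]])
lengths = np.array([2, 3])
mask = np.arange(losses.shape[1]) < lengths[:, None]  # like tf.sequence_mask
print(losses[mask].mean())   # 0.38, a mean over the 5 real tokens only
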
    temp_var_count += 1
    return new_temp  # need to remove this from variable lists??

label_count = 0

def newLabel():
    global label_count
    new_label = 'label' + str(label_count)
    label_count += 1
    return new_label

global_symbol_table = SymbolTable(None)
scope_stack.append(global_symbol_table)
scope_list.append(global_symbol_table)

precedence = (
    ('right', 'EQUAL', 'NOT'),
    ('left', 'OROR'),
    ('left', 'AMPAMP'),
    ('left', 'EQEQ', 'NOTEQ', 'LESS', 'GREAT', 'LEQ', 'GEQ'),
    ('left', 'PLUS', 'MINUS', 'OR', 'CARET'),
    ('left', 'TIMES', 'DIVIDE', 'MOD', 'LL', 'GG', 'AMPERS', 'AMPCAR')
)

#-------------------------------Start------------------------------#

def p_start(p):
    '''start : Source'''
    p[0] = p[1]

def _build_model(self, dim=50, attn_window=None, max_len=20,
                 cell_type=tf.contrib.rnn.BasicLSTMCell,
                 word_dict=SymbolTable(), **kwargs):
    """
    Build RNN model

    :param dim: embedding dimension
    :param attn_window: attention window length (no attention if 0 or None)
    :param cell_type: subclass of tensorflow.python.ops.rnn_cell_impl._RNNCell
    :param batch_size: batch size for mini-batch SGD
    :param vocab_size: Vocab size for determining size of word embeddings
        tensor
    """
    # Set the word dictionary passed in as the word_dict for the instance
    self.max_len = max_len
    self.word_dict = word_dict
    vocab_size = word_dict.len()

    # Define input layers
    self.sentences = tf.placeholder(tf.int32, [None, None])
    self.sentence_lengths = tf.placeholder(tf.int32, [None])

    # Seeds
    s = self.seed
    s1, s2, s3, s4 = [None] * 4 if s is None else [s + i for i in range(4)]

    # Embedding layer; row 0 stays zero for the padding symbol
    emb_var = tf.Variable(
        tf.random_normal((vocab_size - 1, dim), stddev=SD, seed=s1))
    embedding = tf.concat([tf.zeros([1, dim]), emb_var], axis=0)
    inputs = tf.nn.embedding_lookup(embedding, self.sentences)

    # Build RNN graph
    batch_size = tf.shape(self.sentences)[0]
    init = tf.contrib.layers.xavier_initializer(seed=s2)
    with tf.variable_scope(self.name, reuse=False, initializer=init):
        # Build RNN cells
        fw_cell = cell_type(dim)
        bw_cell = cell_type(dim)
        # Add attention if needed
        if attn_window:
            fw_cell = tf.contrib.rnn.AttentionCellWrapper(
                fw_cell, attn_window, state_is_tuple=True)
            bw_cell = tf.contrib.rnn.AttentionCellWrapper(
                bw_cell, attn_window, state_is_tuple=True)
        # Construct RNN
        initial_state_fw = fw_cell.zero_state(batch_size, tf.float32)
        initial_state_bw = bw_cell.zero_state(batch_size, tf.float32)
        rnn_out, _ = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, inputs,
            sequence_length=self.sentence_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            time_major=False)
        potentials = get_bi_rnn_output(rnn_out, dim, self.sentence_lengths)

    # Add dropout layer
    self.keep_prob = tf.placeholder(tf.float32)
    potentials_dropout = tf.nn.dropout(potentials, self.keep_prob, seed=s3)

    # Build activation layer
    if self.cardinality > 2:
        self.Y = tf.placeholder(tf.float32, [None, self.cardinality])
        W = tf.Variable(
            tf.random_normal((2 * dim, self.cardinality), stddev=SD,
                             seed=s4))
        b = tf.Variable(np.zeros(self.cardinality), dtype=tf.float32)
        self.logits = tf.matmul(potentials_dropout, W) + b
        self.marginals_op = tf.nn.softmax(self.logits)
    else:
        self.Y = tf.placeholder(tf.float32, [None])
        W = tf.Variable(tf.random_normal((2 * dim, 1), stddev=SD, seed=s4))
        if self.deterministic:
            # TODO: Implement for categorical as well...
            if self.cardinality > 2:
                raise NotImplementedError(
                    "Deterministic mode not implemented for categoricals.")
            # Make deterministic (dropout is skipped here, since it would
            # reintroduce randomness)
            # See: https://github.com/tensorflow/tensorflow/pull/10636/files
            b = tf.Variable(np.zeros([1]), dtype=tf.float32)
            f_w = tf.matmul(potentials, W)
            f_w_temp = tf.concat([f_w, tf.ones_like(f_w)], axis=1)
            b_temp = tf.stack([tf.ones_like(b), b], axis=0)
            self.logits = tf.squeeze(tf.matmul(f_w_temp, b_temp))
        else:
            b = tf.Variable(0., dtype=tf.float32)
            self.logits = tf.squeeze(tf.matmul(potentials_dropout, W)) + b
        self.marginals_op = tf.nn.sigmoid(self.logits)

def _preprocess_data(self, candidates, marginals=None, dev_labels=None,
                     extend=False, shuffle_data=False):
    """Convert candidate sentences to lookup sequences

    :param candidates: candidates to process
    :param extend: extend symbol table for tokens (train), or lookup (test)?
    """
    if not hasattr(self, 'word_dict'):
        self.word_dict = SymbolTable()
    if not hasattr(self, 'char_dict'):
        self.char_dict = SymbolTable()
    max_word_len = 0
    data, ends, sent_buf, words, word_buf, sents = [], [], [], [], [], []

    def flush():
        # Encode and store the buffered sentence and its per-word chars
        f = self.word_dict.get if extend else self.word_dict.lookup
        data.append(np.array(map(f, sent_buf)))
        sents.append(sent_buf)
        ends.append(len(sent_buf))
        c = self.char_dict.get if extend else self.char_dict.lookup
        sent_words = [np.array(map(c, chars)) for chars in word_buf]
        words.append(np.array(sent_words))

    for candidate in candidates:
        tok = candidate.get_contexts()[1].text
        index = candidate.get_contexts()[2].text
        # A token with index '0' starts a new sentence; flush the buffers
        if sent_buf and index == '0':
            flush()
            sent_buf, word_buf = [], []
        sent_buf.append(tok)
        word_buf.append(list(tok))
        max_word_len = max(max_word_len, len(tok))
    # Flush the final buffered sentence
    if sent_buf:
        flush()

    # Align flat marginals with per-sentence boundaries
    marg = []
    if marginals is not None:
        cand_idx = 0
        for sent_len in ends:
            end_idx = cand_idx + sent_len
            marg.append(marginals[cand_idx:end_idx, :])
            cand_idx = end_idx
        marg = np.array(marg)
    aligned_dev_labels = []
    if dev_labels is not None:
        cand_idx = 0
        for sent_len in ends:
            end_idx = cand_idx + sent_len
            aligned_dev_labels.append(dev_labels[cand_idx:end_idx])
            cand_idx = end_idx
        aligned_dev_labels = np.array(aligned_dev_labels)

    if shuffle_data:
        indexes = np.arange(len(data))
        np.random.shuffle(indexes)
        data = np.array(data)[indexes]
        sents = np.array(sents)[indexes]
        ends = np.array(ends)[indexes]
        if marginals is not None:
            marg = marg[indexes]
        if dev_labels is not None:
            aligned_dev_labels = aligned_dev_labels[indexes]
        if words:
            words = np.array(words)[indexes]
        print('Shuffled data for LSTM')
    words = words if len(words) > 0 else None
    return data, ends, marg, aligned_dev_labels, words, max_word_len, sents

def _build_model(self, dim=50, dim_char=50, attn_window=None, max_len=20,
                 cell_type=tf.contrib.rnn.BasicLSTMCell, max_word_len=10,
                 word_dict=SymbolTable(), char_dict=SymbolTable(), **kwargs):
    # Set the word dictionary passed in as the word_dict for the instance
    self.max_len = max_len
    self.word_dict = word_dict
    vocab_size = word_dict.len()
    self.max_word_len = max_word_len
    self.char_dict = char_dict
    n_chars = char_dict.len()

    # Define input layers
    self.sentences = tf.placeholder(tf.int32, [None, None])
    self.sentence_lengths = tf.placeholder(tf.int32, [None])

    # Seeds
    s = self.seed
    s1, s2, s3, s4 = [None] * 4 if s is None else [s + i for i in range(4)]

    # Embedding layer; row 0 stays zero for the padding symbol
    emb_var = tf.Variable(
        tf.random_normal((vocab_size - 1, dim), stddev=SD, seed=s1))
    embedding = tf.concat([tf.zeros([1, dim]), emb_var], axis=0)
    inputs = tf.nn.embedding_lookup(embedding, self.sentences)

    # Character embedding
    # shape = (batch_size, max_sent_len, max_word_len)
    self.words = tf.placeholder(tf.int32, [None, None, None])
    self.word_lengths = tf.placeholder(tf.int32, shape=[None, None])
    char_var = tf.get_variable(name='char_embeddings', dtype=tf.float32,
                               shape=[n_chars, dim_char])
    char_embedding = tf.nn.embedding_lookup(char_var, self.words)
    char_s = tf.shape(char_embedding)
    # shape = (batch x sentence, word, dim of char embeddings)
    char_embedding = tf.reshape(
        char_embedding, shape=[char_s[0] * char_s[1], char_s[-2], dim_char])
    word_lengths = tf.reshape(self.word_lengths,
                              shape=[char_s[0] * char_s[1]])
    init = tf.contrib.layers.xavier_initializer(seed=s2)
    with tf.variable_scope(self.name + '_char', reuse=False,
                           initializer=init):
        char_fw_cell = cell_type(dim_char, state_is_tuple=True)
        char_bw_cell = cell_type(dim_char, state_is_tuple=True)
        _, ((_, char_fw_out),
            (_, char_bw_out)) = tf.nn.bidirectional_dynamic_rnn(
                char_fw_cell, char_bw_cell, char_embedding,
                sequence_length=word_lengths, dtype=tf.float32)
        char_out = tf.concat([char_fw_out, char_bw_out], axis=-1)
        char_rep = tf.reshape(char_out, shape=[-1, char_s[1], 2 * dim_char])
        # inputs = tf.concat([inputs, char_rep], axis=-1)

    # Add dropout layer on the inputs
    self.in_keep_prob = tf.placeholder(tf.float32)
    inputs_dropout = tf.nn.dropout(inputs, self.in_keep_prob, seed=s3)

    # Build RNN graph
    batch_size = tf.shape(self.sentences)[0]
    init = tf.contrib.layers.xavier_initializer(seed=s2)
    with tf.variable_scope(self.name, reuse=False, initializer=init):
        # Build RNN cells
        fw_cell = cell_type(dim)
        bw_cell = cell_type(dim)
        # Add attention if needed
        if attn_window:
            fw_cell = tf.contrib.rnn.AttentionCellWrapper(
                fw_cell, attn_window, state_is_tuple=True)
            bw_cell = tf.contrib.rnn.AttentionCellWrapper(
                bw_cell, attn_window, state_is_tuple=True)
        # Construct RNN
        initial_state_fw = fw_cell.zero_state(batch_size, tf.float32)
        initial_state_bw = bw_cell.zero_state(batch_size, tf.float32)
        rnn_out, _ = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, inputs_dropout,
            sequence_length=self.sentence_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            time_major=False)
        potentials, ntime_steps = get_bi_rnn_seq_output(
            rnn_out, dim, self.sentence_lengths)

    # Add dropout layer on the output potentials
    self.out_keep_prob = tf.placeholder(tf.float32)
    potentials_dropout = tf.nn.dropout(potentials, self.out_keep_prob,
                                       seed=s3)

    # Build activation layer
    self.Y = tf.placeholder(tf.float32, [None, None, self.cardinality])
    self.train_labels = tf.placeholder(tf.int32, [None, self.max_len])
    W = tf.Variable(
        tf.random_normal((2 * dim, self.cardinality), stddev=SD, seed=s4))
    b = tf.Variable(np.zeros(self.cardinality), dtype=tf.float32)
    self.logits = tf.matmul(potentials_dropout, W) + b
    self.logits = tf.reshape(self.logits,
                             [-1, ntime_steps, self.cardinality])
    # self.marginals_op = tf.nn.softmax(self.logits)
    self.pred = tf.cast(tf.argmax(self.logits, axis=-1), tf.int32)

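# ----------------------------------------------------------------------
# Shape walkthrough for the character branch above (a numpy illustration of
# the tf.reshape calls, not the graph itself): a batch of B sentences of S
# words of up to W chars embeds to (B, S, W, d_char), flattens to
# (B*S, W, d_char) so each word is one RNN sequence, and the concatenated
# fw/bw final states reshape back to (B, S, 2*d_char) word features.
import numpy as np

B, S, W, d_char = 2, 3, 4, 5
char_emb = np.zeros((B, S, W, d_char))
flat = char_emb.reshape(B * S, W, d_char)       # one sequence per word
char_out = np.zeros((B * S, 2 * d_char))        # concat of fw/bw final states
char_rep = char_out.reshape(-1, S, 2 * d_char)  # back to per-sentence layout
print(flat.shape)       # (6, 4, 5)
print(char_rep.shape)   # (2, 3, 10)
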