def __init__(self, dataset_path, batch_size=500, instance_weights_path=None): L.info("Initializing dataset from: " + os.path.abspath(dataset_path)) # Reading parameters from the mmap file fp = np.memmap(dataset_path, dtype='int32', mode='r') self.num_samples = fp[0] self.ngram = fp[1] fp = fp.reshape((self.num_samples + 3, self.ngram)) self.vocab_size = fp[1,0] self.num_classes = fp[2,0] # Setting minibatch size and number of mini batches self.batch_size = batch_size self.num_batches = int(M.ceil(self.num_samples / self.batch_size)) # Reading the matrix of samples x = fp[3:,0:self.ngram - 1] # Reading the context indices y = fp[3:,self.ngram - 1] # Reading the output word index self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') self.is_weighted = False if instance_weights_path: instance_weights = np.loadtxt(instance_weights_path) U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.") self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX) self.is_weighted = True L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % ( U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches) ) )
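# A minimal sketch (not part of the toolkit) of the memmap layout that the reader
# above expects: row 0 starts with (num_samples, ngram), row 1 starts with vocab_size,
# row 2 starts with num_classes, and every later row holds one sample whose last
# column is the target word. The file name and the toy values are hypothetical.
import numpy as np

ngram, vocab_size, num_classes = 4, 10, 10
samples = np.array([[1, 2, 3, 4],
                    [2, 3, 4, 5]], dtype='int32')         # context ids + target word id
header = np.zeros((3, ngram), dtype='int32')
header[0, 0], header[0, 1] = len(samples), ngram          # num_samples, ngram
header[1, 0] = vocab_size
header[2, 0] = num_classes
np.concatenate([header, samples]).tofile('toy.idx.mmap')  # raw int32, as np.memmap reads it

fp = np.memmap('toy.idx.mmap', dtype='int32', mode='r')
fp = fp.reshape((fp[0] + 3, fp[1]))
x, y = fp[3:, 0:ngram - 1], fp[3:, ngram - 1]             # same slicing as in the reader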
def next(self):  # Returns a group of NBestItems with the same index
    if self.eof_flag == True:
        raise StopIteration
    U.xassert(self.mode == 'r', "next_group() method can only be used in 'r' mode")
    group = NBestGroup(self.ref_manager)
    group.add(self.curr_item)  # add the item that was read in the last next() call
    try:
        self.curr_item = self.next_item()
    except StopIteration:
        self.eof_flag = True
        return group
    if self.curr_index != self.curr_item.index:
        self.curr_index = self.curr_item.index
        return group
    while self.curr_index == self.curr_item.index:
        group.add(self.curr_item)
        try:
            self.curr_item = self.next_item()
        except StopIteration:
            self.eof_flag = True
            return group
    self.curr_index = self.curr_item.index
    return group
def append_features(self, features_list):
    U.xassert(len(features_list) == len(self.group), 'Number of features and number of items in this group do not match')
    for i in range(len(self.group)):
        self.group[i].append_feature(features_list[i])
def initialize(self, emb_path, vocab_path):
    L.info('Initializing lookup table')
    vm = VocabManager(vocab_path)
    w2v = W2VEmbReader(emb_path)
    U.xassert(w2v.get_emb_dim() == self.emb_matrix.shape[1], 'The embeddings dimension does not match the given word embeddings')
    for i in range(self.emb_matrix.shape[0]):
        vec = w2v.get_emb_given_word(vm.get_word_given_id(i))
        if vec:
            self.emb_matrix[i] = vec
def add(self, item):
    if item is None:
        return
    if self.group_index == -1:
        self.group_index = item.index
        if self.ref_manager:
            self.refs = self.ref_manager.get_all_refs(self.group_index)
    else:
        U.xassert(item.index == self.group_index, "Cannot add an nbest item with an incompatible index")
    self.group.append(item)
def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False): L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim))) self.input = input self.emb_matrix = emb_matrix if self.emb_matrix is None: self.emb_matrix = numpy.asarray( rng.uniform( low=-0.01, #low=-1, high=0.01, #high=1, size=(vocab_size, emb_dim)), dtype=theano.config.floatX) if emb_path: U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.') self.initialize(emb_path, vocab_path) self.embeddings = theano.shared(value=self.emb_matrix, name='embeddings', borrow=True) if add_weights: weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX) self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True) # Check if the speed can be improved self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input] #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input] #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input] self.params = [self.embeddings, self.weights] else: self.output = self.embeddings[input] self.params = [self.embeddings] if concat: self.output = self.output.reshape( (input.shape[0], emb_dim * input.shape[1]))
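# A plain-numpy sketch of what the LookupTable layer above computes when concat=True
# and add_weights=False (toy shapes, values hypothetical): index into the embedding
# matrix and flatten the context embeddings of each sample into a single vector.
import numpy as np

vocab_size, emb_dim = 10, 3
emb_matrix = np.random.uniform(-0.01, 0.01, size=(vocab_size, emb_dim))
context = np.array([[1, 4, 7],
                    [2, 5, 8]])                                           # (batch, ngram - 1) word ids
output = emb_matrix[context]                                              # (batch, ngram - 1, emb_dim)
output = output.reshape((context.shape[0], emb_dim * context.shape[1]))   # (batch, (ngram - 1) * emb_dim)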
def read_vocab(vocab_path):
    word_to_id_dict = dict()
    found_sent_marker = False
    with open(vocab_path, 'r') as f_vocab:
        curr_index = 0
        for line in f_vocab:
            token = line.strip().split()[0]
            U.xassert((not word_to_id_dict.has_key(token)), "Given vocab file has duplicate entry for '" + token + "'.")
            word_to_id_dict[token] = curr_index
            curr_index = curr_index + 1
    return word_to_id_dict
def read_vocab(vocab_path): word_to_id_dict = dict() found_sent_marker = False with open(vocab_path, "r") as f_vocab: curr_index = 0 for line in f_vocab: token = line.strip().split()[0] U.xassert((not word_to_id_dict.has_key(token)), "Given vocab file has duplicate entry for '" + token + "'.") word_to_id_dict[token] = curr_index curr_index = curr_index + 1 return word_to_id_dict
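# A hypothetical vocabulary file for read_vocab() above: one entry per line, where only
# the first whitespace-separated field is kept as the token (any extra column, e.g. a
# frequency count, is ignored) and ids are assigned in file order starting at 0.
#
#   <s>     1000
#   <unk>    500
#   the      400
#
# read_vocab('vocab.txt') would return {'<s>': 0, '<unk>': 1, 'the': 2}.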
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None): L.info("Initializing dataset from: " + os.path.abspath(dataset_path)) # Reading parameters from the mmap file print K.get_platform() fp = np.memmap(dataset_path, dtype='int32', mode='r') self.num_samples = fp[0] self.ngram = fp[1] fp = fp.reshape((self.num_samples + 3, self.ngram)) self.vocab_size = fp[1, 0] self.num_classes = fp[2, 0] # Setting minibatch size and number of mini batches self.batch_size = batch_size self.num_batches = int(M.ceil(self.num_samples / self.batch_size)) # Reading the matrix of samples x = fp[3:, 0:self.ngram - 1] # Reading the context indices y = fp[3:, self.ngram - 1] # Reading the output word index #self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') #self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') # What is T.cast :)) L.info("Initialize a simple variable") val = np.random.random((4, 2)) tmp = K.variable(val) L.info("Initialize a real variable") tmp = K.variable(x) L.info("Initialize two casted variables") self.shared_x = K.cast(K.variable(x), 'int32') self.shared_y = K.cast(K.variable(y), 'int32') L.info("Create two variable without borrow=True") self.is_weighted = False if instance_weights_path: instance_weights = np.loadtxt(instance_weights_path) U.xassert( instance_weights.shape == (self.num_samples, ), "The number of lines in weights file must be the same as the number of samples." ) # what is borrow=True # self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX) self.shared_w = K.cast(K.variable(instance_weights), K._FLOATX) self.is_weighted = True L.info( ' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (U.red(self.num_samples), U.red( self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)))
def initialize(self, emb_path, vocab_path): L.info('Initializing lookup table') vm = VocabManager(vocab_path) w2v = W2VEmbReader(emb_path) U.xassert( w2v.get_emb_dim() == self.emb_matrix.shape[1], 'The embeddings dimension does not match with the given word embeddings' ) for i in range(self.emb_matrix.shape[0]): vec = w2v.get_emb_given_word(vm.get_word_given_id(i)) if vec: self.emb_matrix[i] = vec
def __init__(self, nbest_path, mode='r', reference_list=None):
    U.xassert(mode == 'r' or mode == 'w', "Invalid mode: " + mode)
    self.mode = mode
    self.nbest_file = codecs.open(nbest_path, mode=mode, encoding='UTF-8')
    self.prev_index = -1
    self.curr_item = None
    self.curr_index = 0
    self.eof_flag = False
    self.ref_manager = None
    if reference_list:
        U.xassert(mode == 'r', "Cannot accept a reference_list in 'w' mode")
        self.ref_manager = RefernceManager(reference_list)
def __init__(self, paths_list):
    U.xassert(type(paths_list) is list, "The input to a RefernceManager class must be a list")
    self.ref_list = []
    self.num_lines = -1
    self.num_refs = 0
    for path in paths_list:
        with codecs.open(path, mode='r', encoding='UTF-8') as f:
            self.num_refs += 1
            sentences = f.readlines()
            if self.num_lines == -1:
                self.num_lines = len(sentences)
            else:
                U.xassert(self.num_lines == len(sentences), "Reference files must have the same number of lines")
            self.ref_list.append(sentences)
def read_vocab(vocab_path, endp, has_null):
    word_to_id_dict = dict()
    with open(vocab_path, 'r') as f_vocab:
        curr_index = 0
        for line in f_vocab:
            token = line.strip()
            if not word_to_id_dict.has_key(token):
                word_to_id_dict[token] = curr_index
                curr_index = curr_index + 1
    U.xassert(word_to_id_dict.has_key('<s>') and word_to_id_dict.has_key('<unk>'), "Missing <s> or <unk> in given vocab file")
    if has_null:
        U.xassert(word_to_id_dict.has_key('<null>'), "Missing <null> in given target vocab file")
    if endp:
        U.xassert(word_to_id_dict.has_key('</s>'), "Missing </s> in given vocab file while --endp flag is used")
    if word_to_id_dict.has_key('</s>'):
        U.xassert(endp, "Given vocab file has </s> but --endp flag is not activated")
    return word_to_id_dict
def __init__(self, emb_path): L.info('Loading embeddings from: ' + emb_path) has_header = False with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: tokens = emb_file.next().split() if len(tokens) == 2: try: int(tokens[0]) int(tokens[1]) has_header = True except ValueError: pass if has_header: with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: tokens = emb_file.next().split() U.xassert( len(tokens) == 2, 'The first line in W2V embeddings must be the pair (vocab_size, emb_dim)' ) self.vocab_size = int(tokens[0]) self.emb_dim = int(tokens[1]) self.embeddings = {} counter = 0 for line in emb_file: tokens = line.split() U.xassert( len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info' ) word = tokens[0] vec = tokens[1:] self.embeddings[word] = vec counter += 1 U.xassert(counter == self.vocab_size, 'Vocab size does not match the header info') else: with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: self.vocab_size = 0 self.emb_dim = -1 self.embeddings = {} for line in emb_file: tokens = line.split() if self.emb_dim == -1: self.emb_dim = len(tokens) - 1 else: U.xassert( len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info' ) word = tokens[0] vec = tokens[1:] self.embeddings[word] = vec self.vocab_size += 1 L.info(' #vectors: %i, #dimensions: %i' % (self.vocab_size, self.emb_dim))
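# The reader above accepts two text layouts; the rows below are made-up examples.
# With a word2vec-style header, the first line must be "vocab_size emb_dim" and the
# counts must match the body; without a header, the dimensionality is taken from the
# first row. In both cases a row is "word dim_1 ... dim_n".
#
#   with header:          without header:
#     3 2                   the 0.1 0.2
#     the 0.1 0.2           cat 0.3 0.4
#     cat 0.3 0.4           sat 0.5 0.6
#     sat 0.5 0.6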
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None): L.info("Initializing dataset from: " + os.path.abspath(dataset_path)) # Reading parameters from the mmap file print K.get_platform() fp = np.memmap(dataset_path, dtype='int32', mode='r') self.num_samples = fp[0] self.ngram = fp[1] fp = fp.reshape((self.num_samples + 3, self.ngram)) self.vocab_size = fp[1,0] self.num_classes = fp[2,0] # Setting minibatch size and number of mini batches self.batch_size = batch_size self.num_batches = int(M.ceil(self.num_samples / self.batch_size)) # Reading the matrix of samples x = fp[3:,0:self.ngram - 1] # Reading the context indices y = fp[3:,self.ngram - 1] # Reading the output word index #self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') #self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') # What is T.cast :)) L.info("Initialize a simple variable") val = np.random.random((4, 2)) tmp = K.variable(val) L.info("Initialize a real variable") tmp = K.variable(x) L.info("Initialize two casted variables") self.shared_x = K.cast(K.variable(x), 'int32') self.shared_y = K.cast(K.variable(y), 'int32') L.info("Create two variable without borrow=True") self.is_weighted = False if instance_weights_path: instance_weights = np.loadtxt(instance_weights_path) U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.") # what is borrow=True # self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX) self.shared_w = K.cast(K.variable(instance_weights), K._FLOATX) self.is_weighted = True L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % ( U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches) ) )
def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False, suffix=None, high=0.01): L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim))) self.input = input self.emb_matrix = emb_matrix if self.emb_matrix is None: self.emb_matrix = numpy.asarray( rng.uniform( low=-high, #low=-1, high=high, #high=1, size=(vocab_size, emb_dim) ), dtype=theano.config.floatX ) if emb_path: U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.') self.initialize(emb_path, vocab_path) embeddings_name = 'embeddings' if suffix is not None: embeddings_name += '.' + str(suffix) self.embeddings = theano.shared(value=self.emb_matrix, name=embeddings_name, borrow=True) if add_weights: weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX) self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True) # Check if the speed can be improved self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input] #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input] #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input] self.params = [self.embeddings, self.weights] else: self.output = self.embeddings[input] self.params = [self.embeddings] if concat: self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))
def __init__(self, paths_list): U.xassert( type(paths_list) is list, "The input to a RefernceManager class must be a list") self.ref_list = [] self.num_lines = -1 self.num_refs = 0 for path in paths_list: with codecs.open(path, mode='r', encoding='UTF-8') as f: self.num_refs += 1 sentences = f.readlines() if self.num_lines == -1: self.num_lines = len(sentences) else: U.xassert( self.num_lines == len(sentences), "Reference files must have the same number of lines") self.ref_list.append(sentences)
def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False): L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim))) self.input = input L.info("Input " + str(input)) L.info("Add weightes " + str(add_weights)) self.emb_matrix = emb_matrix if self.emb_matrix is None: self.emb_matrix = numpy.asarray( rng.uniform( low=-0.01, #low=-1, high=0.01, #high=1, size=(vocab_size, emb_dim) ), dtype=K._FLOATX ) if emb_path: U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.') self.initialize(emb_path, vocab_path) #self.embeddings = theano.shared(value=self.emb_matrix, name='embeddings', borrow=True) self.embeddings = K.variable(self.emb_matrix, name='embeddings') if add_weights: weights_vec = numpy.ones(vocab_size, dtype=K._FLOATX) #self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True) self.weights = K.variable(weights_vec, name='word_weights') # Check if the speed can be improved self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input] #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input] #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input] self.params = [self.embeddings, self.weights] else: self.output = self.embeddings[input] self.params = [self.embeddings] if concat: self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None): L.info("Initializing dataset from: " + os.path.abspath(dataset_path)) # Reading parameters from the mmap file fp = np.memmap(dataset_path, dtype='int32', mode='r') self.num_samples = fp[0] self.ngram = fp[1] fp = fp.reshape((self.num_samples + 3, self.ngram)) self.vocab_size = fp[1, 0] self.num_classes = fp[2, 0] # Setting minibatch size and number of mini batches self.batch_size = batch_size self.num_batches = int(M.ceil(self.num_samples / self.batch_size)) # Reading the matrix of samples x = fp[3:, 0:self.ngram - 1] # Reading the context indices y = fp[3:, self.ngram - 1] # Reading the output word index self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') self.is_weighted = False if instance_weights_path: instance_weights = np.loadtxt(instance_weights_path) U.xassert( instance_weights.shape == (self.num_samples, ), "The number of lines in weights file must be the same as the number of samples." ) self.shared_w = T.cast( theano.shared(instance_weights, borrow=True), theano.config.floatX) self.is_weighted = True L.info( ' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (U.red(self.num_samples), U.red( self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)))
def next_item(self):
    U.xassert(self.mode == 'r', "next() method can only be used in 'r' mode")
    try:
        segments = self.nbest_file.next().split("|||")
    except StopIteration:
        self.close()
        raise StopIteration
    try:
        index = int(segments[0])
    except ValueError:
        L.error("The first segment in an n-best list must be an integer")
    hyp = segments[1].strip()
    features = segments[2].strip()
    score = segments[3].strip()
    phrase_alignments = None
    word_alignments = None
    if len(segments) > 4:
        phrase_alignments = segments[4].strip()
    if len(segments) > 5:
        word_alignments = segments[5].strip()
    return NBestItem(index, hyp, features, score, phrase_alignments, word_alignments)
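# A made-up entry in the Moses-style n-best format parsed by next_item() above; the
# "|||"-separated fields are: sentence index, hypothesis, feature scores, total score,
# and optionally phrase and word alignments.
#
#   0 ||| das ist ein haus ||| LM0= -12.3 TM0= -4.1 ||| -8.7 ||| 0-3=0-3
#
# For this line, next_item() returns NBestItem(0, 'das ist ein haus',
# 'LM0= -12.3 TM0= -4.1', '-8.7', '0-3=0-3', None).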
def next(self): # Returns a group of NBestItems with the same index if self.eof_flag == True: raise StopIteration U.xassert(self.mode == 'r', "next_group() method can only be used in 'r' mode") group = NBestGroup(self.ref_manager) group.add(self.curr_item) # add the item that was read in the last next() call try: self.curr_item = self.next_item() except StopIteration: self.eof_flag = True return group if self.curr_index != self.curr_item.index: self.curr_index = self.curr_item.index return group while self.curr_index == self.curr_item.index: group.add(self.curr_item) try: self.curr_item = self.next_item() except StopIteration: self.eof_flag = True return group self.curr_index = self.curr_item.index return group
def read_vocab(vocab_path, endp, has_null):
    word_to_id_dict = dict()
    with open(vocab_path, 'r') as f_vocab:
        curr_index = 0
        for line in f_vocab:
            token = line.strip()
            if not word_to_id_dict.has_key(token):
                word_to_id_dict[token] = curr_index
                curr_index = curr_index + 1
    U.xassert(word_to_id_dict.has_key('<s>') and word_to_id_dict.has_key('<unk>'), "Missing <s> or <unk> in given vocab file")
    if has_null:
        U.xassert(word_to_id_dict.has_key('<null>'), "Missing <null> in given target vocab file")
    if endp:
        U.xassert(word_to_id_dict.has_key('</s>'), "Missing </s> in given vocab file while --endp flag is used")
    if word_to_id_dict.has_key('</s>'):
        U.xassert(endp, "Given vocab file has </s> but --endp flag is not activated")
    return word_to_id_dict
def __init__(self, emb_path): L.info('Loading embeddings from: ' + emb_path) has_header=False with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: tokens = emb_file.next().split() if len(tokens) == 2: try: int(tokens[0]) int(tokens[1]) has_header = True except ValueError: pass if has_header: with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: tokens = emb_file.next().split() U.xassert(len(tokens) == 2, 'The first line in W2V embeddings must be the pair (vocab_size, emb_dim)') self.vocab_size = int(tokens[0]) self.emb_dim = int(tokens[1]) self.embeddings = {} counter = 0 for line in emb_file: tokens = line.split() U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info') word = tokens[0] vec = tokens[1:] self.embeddings[word] = vec counter += 1 U.xassert(counter == self.vocab_size, 'Vocab size does not match the header info') else: with codecs.open(emb_path, 'r', encoding='utf8') as emb_file: self.vocab_size = 0 self.emb_dim = -1 self.embeddings = {} for line in emb_file: tokens = line.split() if self.emb_dim == -1: self.emb_dim = len(tokens) - 1 else: U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info') word = tokens[0] vec = tokens[1:] self.embeddings[word] = vec self.vocab_size += 1 L.info(' #vectors: %i, #dimensions: %i' % (self.vocab_size, self.emb_dim))
def get_all_refs(self, index):
    U.xassert(index < self.num_lines, "Index out of bounds")
    return [self.ref_list[k][index] for k in range(self.num_refs)]
'epsilon' : B.add_epsilon_smoothing, 'lin' : B.lin_smoothing, 'nist' : B.nist_smoothing, 'chen' : B.chen_smoothing } ref_path_list = args.ref_paths.split(',') input_nbest = NBestList(args.input_path, mode='r', reference_list=ref_path_list) if args.out_nbest_path: output_nbest = NBestList(args.out_nbest_path, mode='w') if args.out_scores_path: output_scores = open(args.out_scores_path, mode='w') output_1best = codecs.open(args.out_1best_path, mode='w', encoding='UTF-8') U.xassert(methods.has_key(args.method), "Invalid smoothing method: " + args.method) scorer = methods[args.method] L.info('Processing the n-best list') def process_group(group): index = 0 scores = dict() for item in group: scores[index] = scorer(item.hyp, group.refs) index += 1 return scores pool = Pool(args.threads) counter = 0
from dlm.models.mlp import MLP from dlm import eval import theano import theano.tensor as T ######################### ## Loading model # classifier = MLP(model_path=args.model_path) ######################### ## Loading dataset # U.xassert(args.format == "mmap" or args.format == "nbest" or args.format == "text", "Invalid file format given: " + args.format) U.xassert(args.perplexity or args.nlp_path or args.ulp_path, "You should use one of (or more) -ppl, -nlp or -ulp") if args.format == "mmap": U.xassert((args.nlp_path is None) and (args.ulp_path is None), "Cannot compute log-probabilities for an mmap file") from dlm.io.mmapReader import MemMapReader testset = MemMapReader(dataset_path=args.test_path, batch_size=500) else: U.xassert(args.vocab_path, "Vocab file is required for non-mmap file formats") from dlm.io.textReader import TextReader is_nbest = False if args.format == "nbest": is_nbest = True testset = TextReader(dataset_path=args.test_path, is_nbest=is_nbest, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path) #########################
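# For reference (standard definition, not specific to this toolkit): the -ppl option
# reports perplexity, the exponentiated average negative log-probability of the test
# words. A toy numpy check, assuming natural-log probabilities:
import numpy as np
log_probs = np.log(np.array([0.25, 0.5, 0.125]))    # hypothetical per-word probabilities
ppl = np.exp(-log_probs.mean())                     # == 4.0 for these toy values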
def __init__(self, args=None, model_path=None): ###################################################################### ## Parameters # U.xassert((args or model_path) and not (args and model_path), "args or model_path are mutually exclusive") if model_path: args, loaded_params = self.load_model(model_path) emb_dim = args.emb_dim num_hidden_list = map(int, args.num_hidden.split(',')) if num_hidden_list[0] <= 0: num_hidden_list = [] self.ngram_size = args.ngram_size if args.feature_emb_dim is None: features_info = [(args.vocab_size, args.ngram_size-1, args.emb_dim)] else: features_dim = map(int, args.feature_emb_dim.split(',')) features_dim.insert(0,emb_dim) U.xassert(len(features_dim) == len(args.features_info), "The number of specified feature dimensions does not match the number of features!") features_info = [] for feature_info,feature_dim in zip(args.features_info, features_dim): feature_info = feature_info + (feature_dim,) features_info.append(feature_info) print "Classifier Creation" print features_info num_classes = args.num_classes activation_name = args.activation_name self.args = args self.L1 = 0 self.L2_sqr = 0 self.params = [] # Not implemented with Sequence Labelling emb_path, vocab = None, None try: emb_path = args.emb_path vocab = args.vocab except AttributeError: pass rng = numpy.random.RandomState(1234) self.input = T.imatrix('input') ###################################################################### ## Lookup Table Layer # last_start_pos = 0 last_layer_output = None last_layer_output_size = 0 for i in range(0, len(features_info)): vocab_size, num_elems,emb_dim = features_info[i] if i != 0: emb_path, vocab = None, None lookupTableLayer = LookupTable( rng=rng, input=self.input[:,last_start_pos:last_start_pos+num_elems], vocab_size=vocab_size, emb_dim=emb_dim, emb_path=emb_path, vocab_path=vocab, add_weights=args.weighted_emb, suffix=i ) if last_layer_output is None: last_layer_output = lookupTableLayer.output else: last_layer_output = T.concatenate([last_layer_output, lookupTableLayer.output], axis=1) last_layer_output_size += (num_elems) * emb_dim self.params += lookupTableLayer.params last_start_pos = last_start_pos + num_elems ###################################################################### ## Hidden Layer(s) # for i in range(0, len(num_hidden_list)): linearLayer = Linear( rng=rng, input=last_layer_output, n_in=last_layer_output_size, n_out=num_hidden_list[i], suffix=i ) last_layer_output = linearLayer.output last_layer_output_size = num_hidden_list[i] self.params += linearLayer.params activation = Activation( input=last_layer_output, func_name=activation_name ) last_layer_output = activation.output self.L1 = self.L1 + abs(linearLayer.W).sum() self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum() ###################################################################### ## Output Linear Layer # linearLayer = Linear( rng=rng, input=last_layer_output, n_in=last_layer_output_size, n_out=num_classes, #b_values = numpy.zeros(num_classes) - math.log(num_classes) b_values = numpy.full(shape=(num_classes),fill_value=(-math.log(num_classes)),dtype=theano.config.floatX), suffix='out' ) last_layer_output = linearLayer.output self.params += linearLayer.params self.L1 = self.L1 + abs(linearLayer.W).sum() self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum() ###################################################################### ## Model Output # self.output = last_layer_output self.p_y_given_x_matrix = T.nnet.softmax(last_layer_output) # Log Softmax last_layer_output_shifted = 
last_layer_output - last_layer_output.max(axis=1, keepdims=True) self.log_p_y_given_x_matrix = last_layer_output_shifted - T.log(T.sum(T.exp(last_layer_output_shifted),axis=1,keepdims=True)) #self.log_Z_sqr = T.log(T.mean(T.sum(T.exp(last_layer_output), axis=1))) ** 2 #self.log_Z_sqr = T.sum(T.log(T.sum(T.exp(last_layer_output), axis=1))) ** 2 self.log_Z_sqr = T.mean(T.log(T.sum(T.exp(last_layer_output), axis=1)) ** 2) ###################################################################### ## Model Predictions self.y_pred = T.argmax(self.p_y_given_x_matrix, axis=1) ###################################################################### ## Loading parameters from file (if given) # if model_path: self.set_params(loaded_params)
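# A small numpy check (toy activations) of the max-shift identity used above:
# log softmax(x) = (x - max(x)) - log(sum(exp(x - max(x)))), which avoids overflow
# compared to exponentiating the raw activations directly.
import numpy as np
x = np.array([[1.0, 2.0, 3.0]])
shifted = x - x.max(axis=1, keepdims=True)
log_p = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
assert np.allclose(np.exp(log_p).sum(axis=1), 1.0)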
args = parser.parse_args() U.set_theano_device('cpu',1) from dlm.models.mlp import MLP if args.out_dir is None: args.out_dir = 'corelm_convert-' + U.curr_time() U.mkdir_p(args.out_dir) # Loading CoreLM model and creating classifier class L.info("Loading CoreLM model") classifier = MLP(model_path=args.corelm_model) args_nn = classifier.args params_nn = classifier.params U.xassert(len(params_nn)==7, "CoreLM model is not compatible with NPLM architecture. 2 hidden layers and an output linear layer is required.") embeddings = params_nn[0].get_value() W1 = params_nn[1].get_value() W1 = np.transpose(W1) b1 = params_nn[2].get_value() W2 = params_nn[3].get_value() W2 = np.transpose(W2) b2 = params_nn[4].get_value() W3 = params_nn[5].get_value() W3 = np.transpose(W3) b3 = params_nn[6].get_value() # Storing vocabulary into an array has_null = False
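# The unpacking above assumes the CoreLM parameter list is ordered as
# [embeddings, W1, b1, W2, b2, W3, b3], i.e. two hidden layers plus the output layer,
# matching the len(params_nn) == 7 check. The weight matrices are transposed because
# this code stores W with shape (n_in, n_out), while the NPLM model file presumably
# stores the transpose.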
tokens_freq_sorted = sorted(word_to_freq_dict, key=word_to_freq_dict.get, reverse=True) if args.prune_vocab_size is not None and args.prune_vocab_size < len(tokens_freq_sorted): tokens_freq_sorted = tokens_freq_sorted[0:args.prune_vocab_size] for token in tokens_freq_sorted: f_vocab.write(token+"\n") word_to_id_dict[token] = curr_index curr_index = curr_index + 1 else: with open(args.input_vocab_path, 'r') as f_vocab: curr_index = 0 for line in f_vocab: token = line.strip() if not word_to_id_dict.has_key(token): word_to_id_dict[token] = curr_index curr_index = curr_index + 1 U.xassert(word_to_id_dict.has_key('<s>') and word_to_id_dict.has_key('<unk>') and word_to_id_dict.has_key('<null>'), "Missing <s> or <unk> or <null> in given vocab file") if args.endp: U.xassert(word_to_id_dict.has_key('</s>'), "Missing </s> in given vocab file while --endp flag is used") if word_to_id_dict.has_key('</s>'): U.xassert(args.endp, "Given vocab file has </s> but --endp flag is not activated") _, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.') # For shuffling only samples = [] # List of samples nsamples = 0 # Reading input text file to create IDX file with open(args.input_path, 'r') as input_file, open(tmp_path, 'w') as tmp_file: next_id = 0 for line in input_file:
output_mmap_path = prefix + ".idx.mmap" output_text_path = prefix + ".idx.txt" output_words_path = prefix + ".txt" if args.word_out: f_words = open(output_words_path, 'w') input_word_to_id = read_vocab(args.input_vocab_path) feature_to_id = read_vocab(args.features_vocab_path) label_to_id = read_vocab(args.labels_vocab_path) input_vocab_size = len(input_word_to_id) feature_vocab_size = len(feature_to_id) label_vocab_size = len(label_to_id) half_context = args.context_size / 2 U.xassert(input_word_to_id.has_key("<s>"), "Sentence marker <s> not found in input vocabulary!") U.xassert(feature_to_id.has_key("<s>"), "Sentence marker <s> not found in feature vocabulary!") _, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.') # For shuffling only samples = [] # List of samples samples_idx = [] nsamples = 0 # Read lines and write to the mmap file line_num = 0 nsamples = 0 with open(args.input_path, 'r') as input_file, open(args.labels_path,
def write(self, item):
    U.xassert(self.mode == 'w', "write() method can only be used in 'w' mode")
    self.nbest_file.write(unicode(item) + "\n")
def __iter__(self):
    U.xassert(self.mode == 'r', "Iteration can only be done in 'r' mode")
    return self
tokens_freq_sorted): tokens_freq_sorted = tokens_freq_sorted[0:args.prune_vocab_size] for token in tokens_freq_sorted: f_vocab.write(token + "\n") word_to_id_dict[token] = curr_index curr_index = curr_index + 1 else: with open(args.input_vocab_path, 'r') as f_vocab: curr_index = 0 for line in f_vocab: token = line.strip() if not word_to_id_dict.has_key(token): word_to_id_dict[token] = curr_index curr_index = curr_index + 1 U.xassert( word_to_id_dict.has_key('<s>') and word_to_id_dict.has_key('<unk>') and word_to_id_dict.has_key('<null>'), "Missing <s> or <unk> or <null> in given vocab file") if args.endp: U.xassert( word_to_id_dict.has_key('</s>'), "Missing </s> in given vocab file while --endp flag is used") if word_to_id_dict.has_key('</s>'): U.xassert( args.endp, "Given vocab file has </s> but --endp flag is not activated") _, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.') # For shuffling only samples = [] # List of samples nsamples = 0
src_prune_args = parser.add_mutually_exclusive_group(required=True) src_prune_args.add_argument("-vs","--prune-source-vocab", dest="src_vocab_size", type=int, help="Source vocabulary size") src_prune_args.add_argument("--source-vocab-file", dest="src_vocab_path", help="Source vocabulary file path") trg_prune_args = parser.add_mutually_exclusive_group(required=True) trg_prune_args.add_argument("-vt","--prune-target-vocab", dest="trg_vocab_size", type=int, help="Target vocabulary size") trg_prune_args.add_argument("--target-vocab-file", dest="trg_vocab_path", help="Target vocabulary file path") output_prune_args = parser.add_mutually_exclusive_group(required=True) output_prune_args.add_argument("-vo","--prune-output-vocab", dest="output_vocab_size", type=int, help="Output vocabulary size. Defaults to target vocabulary size.") output_prune_args.add_argument("--output-vocab-file", dest="output_vocab_path", help="Output vocabulary file") args = parser.parse_args() # Format of the memmap file does not support less than 5 because the first row consists of parameters for the neural network U.xassert(args.trg_context + args.src_context*2 + 1 > 3, "Total ngram size must be greater than 3. ngrams < 3 are not supported by the current memmap format.") L.info("Source Window Size: " + str(args.src_context * 2 + 1)) L.info("Target Window Size: " + str(args.trg_context - 1)) L.info("Total Sample Size: " + str(args.trg_context + args.src_context * 2 + 1)) if (args.output_vocab_size is None): args.output_vocab_size = args.trg_vocab_size # The output directory is if (not os.path.exists(args.output_dir_path)): os.makedirs(args.output_dir_path) L.info("Output directory: " + os.path.abspath(args.output_dir_path)) # Prefix of files src_prefix = args.output_dir_path + "/" + os.path.basename(args.src_input_path)
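# A worked example of the window sizes logged above (values hypothetical): with
# src_context = 4 and trg_context = 5, the source window is 4 * 2 + 1 = 9, the target
# window is 5 - 1 = 4, and each sample spans 5 + 4 * 2 + 1 = 14 columns, which
# satisfies the "> 3" check on the total n-gram size.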
args = parser.parse_args() U.set_theano_device('cpu', 1) from dlm.models.mlp import MLP if args.out_dir is None: args.out_dir = 'primelm_convert-' + U.curr_time() U.mkdir_p(args.out_dir) # Loading PrimeLM model and creating classifier class L.info("Loading PrimeLM model") classifier = MLP(model_path=args.primelm_model) args_nn = classifier.args params_nn = classifier.params U.xassert( len(params_nn) == 7, "PrimeLM model is not compatible with NPLM architecture. 2 hidden layers and an output linear layer is required." ) embeddings = params_nn[0].get_value() W1 = params_nn[1].get_value() W1 = np.transpose(W1) b1 = params_nn[2].get_value() W2 = params_nn[3].get_value() W2 = np.transpose(W2) b2 = params_nn[4].get_value() W3 = params_nn[5].get_value() W3 = np.transpose(W3) b3 = params_nn[6].get_value() # Storing vocabulary into an array has_null = False
if args.command.startswith('top'): mode = 0 N = int(args.command[3:]) # N in N-best output_nbest = NBestList(args.output_path, mode='w') elif args.command == '1best': mode = 1 output_1best = codecs.open(args.output_path, mode='w', encoding='UTF-8') elif args.command.startswith('feature'): mode = 2 N = int(args.command[7:]) # Nth feature output = open(args.output_path, mode='w') elif args.command.startswith('correl'): mode = 3 N = int(args.command[6:]) # Nth feature U.xassert(args.oracle, "correlN command needs a file (-s) containing oracle scores") with open(args.oracle, mode='r') as oracles_file: oracles = map(float, oracles_file.read().splitlines()) #output = open(args.output_path, mode='w') elif args.command.startswith('augment'): U.set_theano_device(args.device) from dlm.reranker import augmenter augmenter.augment(args.model_path, args.input_path, args.vocab_path, args.output_path) else: L.error('Invalid command: ' + args.command) counter = 0 features = [] for group in input_nbest: if mode == 0:
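# The commands parsed above encode their numeric argument in the command name
# (examples are illustrative): "top100" keeps the 100 best hypotheses per group
# (int("top100"[3:]) == 100), "feature3" dumps the 3rd feature, and "correl6"
# presumably correlates the 6th feature with the oracle scores read from -s.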
if args.out_dir is None: args.out_dir = 'corelm-' + U.curr_time() U.mkdir_p(args.out_dir) L.quiet = args.quiet L.set_file_path(os.path.abspath(args.out_dir) + "/log.txt") L.info('Command: ' + ' '.join(sys.argv)) curr_version = U.curr_version() if curr_version: L.info("Version: " + curr_version) if args.emb_path: U.xassert(args.vocab, 'When --emb-path is used, vocab file must be given too (using --vocab).') if args.loss_function == "nll": args.num_noise_samples = 0 U.print_args(args) U.set_theano_device(args.device, args.threads) import dlm.trainer from dlm.io.mmapReader import MemMapReader from dlm.models.mlp import MLP ######################### ## Loading datasets #
def set_params(self, params):
    U.xassert(len(self.params) == len(params), 'The given model file is not consistent with the architecture')
    for param, loaded_param in zip(self.params, params):
        param.set_value(loaded_param)
def __init__(self, args=None, model_path=None): ###################################################################### ## Parameters # U.xassert((args or model_path) and not (args and model_path), "args or model_path are mutually exclusive") if model_path: args, loaded_params = self.load_model(model_path) emb_dim = args.emb_dim num_hidden_list = map(int, args.num_hidden.split(',')) if num_hidden_list[0] <= 0: num_hidden_list = [] vocab_size = args.vocab_size self.ngram_size = args.ngram_size num_classes = args.num_classes activation_name = args.activation_name self.args = args self.L1 = 0 self.L2_sqr = 0 self.params = [] emb_path, vocab = None, None try: emb_path = args.emb_path vocab = args.vocab except AttributeError: pass rng = numpy.random.RandomState(1234) self.input = T.imatrix('input') ###################################################################### ## Lookup Table Layer # lookupTableLayer = LookupTable(rng=rng, input=self.input, vocab_size=vocab_size, emb_dim=emb_dim, emb_path=emb_path, vocab_path=vocab, add_weights=args.weighted_emb) last_layer_output = lookupTableLayer.output last_layer_output_size = (self.ngram_size - 1) * emb_dim self.params += lookupTableLayer.params ###################################################################### ## Hidden Layer(s) # for i in range(0, len(num_hidden_list)): linearLayer = Linear(rng=rng, input=last_layer_output, n_in=last_layer_output_size, n_out=num_hidden_list[i], suffix=i) last_layer_output = linearLayer.output last_layer_output_size = num_hidden_list[i] self.params += linearLayer.params activation = Activation(input=last_layer_output, func_name=activation_name) last_layer_output = activation.output self.L1 = self.L1 + abs(linearLayer.W).sum() self.L2_sqr = self.L2_sqr + (linearLayer.W**2).sum() ###################################################################### ## Output Linear Layer # linearLayer = Linear( rng=rng, input=last_layer_output, n_in=last_layer_output_size, n_out=num_classes, #b_values = numpy.zeros(num_classes) - math.log(num_classes) b_values=numpy.full(shape=(num_classes), fill_value=(-math.log(num_classes)), dtype=theano.config.floatX), suffix='out') last_layer_output = linearLayer.output self.params += linearLayer.params self.L1 = self.L1 + abs(linearLayer.W).sum() self.L2_sqr = self.L2_sqr + (linearLayer.W**2).sum() ###################################################################### ## Model Output # self.output = last_layer_output self.p_y_given_x_matrix = T.nnet.softmax(last_layer_output) # Log Softmax last_layer_output_shifted = last_layer_output - last_layer_output.max( axis=1, keepdims=True) self.log_p_y_given_x_matrix = last_layer_output_shifted - T.log( T.sum(T.exp(last_layer_output_shifted), axis=1, keepdims=True)) #self.log_Z_sqr = T.log(T.mean(T.sum(T.exp(last_layer_output), axis=1))) ** 2 #self.log_Z_sqr = T.sum(T.log(T.sum(T.exp(last_layer_output), axis=1))) ** 2 self.log_Z_sqr = T.mean( T.log(T.sum(T.exp(last_layer_output), axis=1))**2) ###################################################################### ## Model Predictions self.y_pred = T.argmax(self.p_y_given_x_matrix, axis=1) ###################################################################### ## Loading parameters from file (if given) # if model_path: self.set_params(loaded_params)
init_opt.write(' '.join(init_list) + '\n') init_opt.write(' '.join(['0' for i in range(dim)]) + '\n') init_opt.write(' '.join(['1' for i in range(dim)]) + '\n') seed_arg = '' if args.pred_seed: seed_arg = ' -r 1234 ' if (args.alg == 'pro' or args.alg == 'wpro'): # PRO if args.alg == 'pro': L.info("Running PRO") cmd = moses_root + '/bin/pro' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir +'/pro.data' + seed_arg else: L.info("Running WEIGHTED PRO") U.xassert(args.instance_weights_path, 'Instance weights are not given to wpro') cmd = moses_root + '/bin/proWeighted' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir +'/pro.data' + seed_arg + ' -w ' + args.instance_weights_path U.capture(cmd) cmd = moses_root + '/bin/megam_i686.opt -fvals -maxi 30 -nobias binary ' + args.out_dir + '/pro.data' pro_weights = U.capture(cmd) pro_weights_arr = pro_weights.strip().split('\n') weights_dict = dict() sum = 0.0 highest_feature_index = 0 for elem in pro_weights_arr: feature_index,weight = elem[1:].split() feature_index = int(feature_index) weight = float(weight) weights_dict[feature_index] = weight
if args.out_dir is None: args.out_dir = 'primelm-' + U.curr_time() U.mkdir_p(args.out_dir) L.quiet = args.quiet L.set_file_path(os.path.abspath(args.out_dir) + "/log.txt") L.info('Command: ' + ' '.join(sys.argv)) curr_version = U.curr_version() if curr_version: L.info("Version: " + curr_version) if args.emb_path: U.xassert( args.vocab, 'When --emb-path is used, vocab file must be given too (using --vocab).' ) if args.loss_function == "nll": args.num_noise_samples = 0 U.print_args(args) U.set_theano_device(args.device, args.threads) import dlm.trainer from dlm.io.mmapReader import MemMapReader from dlm.io.featuresmmapReader import FeaturesMemMapReader from dlm.models.mlp import MLP #########################
output_words_path = prefix + ".txt" if args.word_out: f_words = open(output_words_path, "w") input_word_to_id = read_vocab(args.input_vocab_path) feature_to_id = read_vocab(args.features_vocab_path) label_to_id = read_vocab(args.labels_vocab_path) input_vocab_size = len(input_word_to_id) feature_vocab_size = len(feature_to_id) label_vocab_size = len(label_to_id) half_context = args.context_size / 2 U.xassert(input_word_to_id.has_key("<s>"), "Sentence marker <s> not found in input vocabulary!") U.xassert(feature_to_id.has_key("<s>"), "Sentence marker <s> not found in feature vocabulary!") _, tmp_path = tempfile.mkstemp(prefix="dlm.tmp.") # For shuffling only samples = [] # List of samples samples_idx = [] nsamples = 0 # Read lines and write to the mmap file line_num = 0 nsamples = 0 with open(args.input_path, "r") as input_file, open(args.labels_path, "r") as labels_file, open(
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None): L.info("Initializing dataset (with features) from: " + os.path.abspath(dataset_path)) # Reading parameters from the mmap file fp = np.memmap(dataset_path, dtype='int32', mode='r') #print type(fp1) #fp = np.empty(fp1.shape, dtype='int32') #fp[:] = fp1 #print type(fp) self.num_samples = fp[0] self.ngram = fp[1] fp = fp.reshape((len(fp)/self.ngram, self.ngram)) num_header_lines = fp[1,0] self.features_info = [] # Format (vocab_size, num_of_elements) for i in xrange(num_header_lines-1): self.features_info.append( (fp[i+2,0], fp[i+2,1]) ) self.num_classes = fp[(num_header_lines+2)-1,0] # Setting minibatch size and number of mini batches self.batch_size = batch_size self.num_batches = int(M.ceil(self.num_samples / self.batch_size)) # Reading the matrix of samples # x is list ''' self.shared_x_list = [] last_start_pos = 0 for i in xrange(len(self.features_info)): vocab_size, num_elems = self.features_info[i] x = fp[num_header_lines+2:,last_start_pos:last_start_pos + num_elems] # Reading the context indices last_start_pos += num_elems shared_x = T.cast(theano.shared(x, borrow=True), 'int32') self.shared_x_list.append(shared_x) ''' x = fp[num_header_lines+2:,0:self.ngram - 1] # Reading the context indices self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') y = fp[num_header_lines+2:,self.ngram - 1] # Reading the output word index self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') ## Untested instance weighting self.is_weighted = False if instance_weights_path: instance_weights = np.loadtxt(instance_weights_path) U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.") self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX) self.is_weighted = True L.info(' #samples: %s, #classes: %s, batch size: %s, #batches: %s' % ( U.red(self.num_samples), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches) )) for feature in enumerate(self.features_info): L.info("Feature %s: #ngrams= %s vocab_size= %s" %( U.red(feature[0]), U.red(feature[1][1]), U.red(feature[1][0])))
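# A sketch of the header layout implied by the reader above (reconstructed from the
# slicing, not from a spec):
#
#   row 0:                        [num_samples, ngram, ...]
#   row 1:                        [num_header_lines, ...]
#   rows 2 .. num_header_lines:   one [vocab_size, num_elements, ...] row per feature
#   row num_header_lines + 1:     [num_classes, ...]
#   rows num_header_lines + 2 ..: one sample per row, label id in the last column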
def __init__(self, args=None, model_path=None): ###################################################################### ## Parameters # U.xassert((args or model_path) and not (args and model_path), "args or model_path are mutually exclusive") if model_path: args, loaded_params = self.load_model(model_path) emb_dim = args.emb_dim num_hidden_list = map(int, args.num_hidden.split(',')) if num_hidden_list[0] <= 0: num_hidden_list = [] self.ngram_size = args.ngram_size if args.feature_emb_dim is None: features_info = [(args.vocab_size, args.ngram_size - 1, args.emb_dim)] else: features_dim = map(int, args.feature_emb_dim.split(',')) features_dim.insert(0, emb_dim) U.xassert( len(features_dim) == len(args.features_info), "The number of specified feature dimensions does not match the number of features!" ) features_info = [] for feature_info, feature_dim in zip(args.features_info, features_dim): feature_info = feature_info + (feature_dim, ) features_info.append(feature_info) print "Classifier Creation" print features_info num_classes = args.num_classes activation_name = args.activation_name self.args = args self.L1 = 0 self.L2_sqr = 0 self.params = [] # Not implemented with Sequence Labelling emb_path, vocab = None, None try: emb_path = args.emb_path vocab = args.vocab except AttributeError: pass rng = numpy.random.RandomState(1234) self.input = T.imatrix('input') ###################################################################### ## Lookup Table Layer # last_start_pos = 0 last_layer_output = None last_layer_output_size = 0 for i in range(0, len(features_info)): vocab_size, num_elems, emb_dim = features_info[i] if i != 0: emb_path, vocab = None, None lookupTableLayer = LookupTable( rng=rng, input=self.input[:, last_start_pos:last_start_pos + num_elems], vocab_size=vocab_size, emb_dim=emb_dim, emb_path=emb_path, vocab_path=vocab, add_weights=args.weighted_emb, suffix=i) if last_layer_output is None: last_layer_output = lookupTableLayer.output else: last_layer_output = T.concatenate( [last_layer_output, lookupTableLayer.output], axis=1) last_layer_output_size += (num_elems) * emb_dim self.params += lookupTableLayer.params last_start_pos = last_start_pos + num_elems ###################################################################### ## Hidden Layer(s) # for i in range(0, len(num_hidden_list)): linearLayer = Linear(rng=rng, input=last_layer_output, n_in=last_layer_output_size, n_out=num_hidden_list[i], suffix=i) last_layer_output = linearLayer.output last_layer_output_size = num_hidden_list[i] self.params += linearLayer.params activation = Activation(input=last_layer_output, func_name=activation_name) last_layer_output = activation.output self.L1 = self.L1 + abs(linearLayer.W).sum() self.L2_sqr = self.L2_sqr + (linearLayer.W**2).sum() ###################################################################### ## Output Linear Layer # linearLayer = Linear( rng=rng, input=last_layer_output, n_in=last_layer_output_size, n_out=num_classes, #b_values = numpy.zeros(num_classes) - math.log(num_classes) b_values=numpy.full(shape=(num_classes), fill_value=(-math.log(num_classes)), dtype=theano.config.floatX), suffix='out') last_layer_output = linearLayer.output self.params += linearLayer.params self.L1 = self.L1 + abs(linearLayer.W).sum() self.L2_sqr = self.L2_sqr + (linearLayer.W**2).sum() ###################################################################### ## Model Output # self.output = last_layer_output self.p_y_given_x_matrix = T.nnet.softmax(last_layer_output) # Log Softmax last_layer_output_shifted = 
last_layer_output - last_layer_output.max( axis=1, keepdims=True) self.log_p_y_given_x_matrix = last_layer_output_shifted - T.log( T.sum(T.exp(last_layer_output_shifted), axis=1, keepdims=True)) #self.log_Z_sqr = T.log(T.mean(T.sum(T.exp(last_layer_output), axis=1))) ** 2 #self.log_Z_sqr = T.sum(T.log(T.sum(T.exp(last_layer_output), axis=1))) ** 2 self.log_Z_sqr = T.mean( T.log(T.sum(T.exp(last_layer_output), axis=1))**2) ###################################################################### ## Model Predictions self.y_pred = T.argmax(self.p_y_given_x_matrix, axis=1) ###################################################################### ## Loading parameters from file (if given) # if model_path: self.set_params(loaded_params)
output_prune_args = parser.add_mutually_exclusive_group(required=True) output_prune_args.add_argument( "-vo", "--prune-output-vocab", dest="output_vocab_size", type=int, help="Output vocabulary size. Defaults to target vocabulary size.") output_prune_args.add_argument("--output-vocab-file", dest="output_vocab_path", help="Output vocabulary file") args = parser.parse_args() # Format of the memmap file does not support less than 5 because the first row consists of parameters for the neural network U.xassert( args.trg_context + args.src_context * 2 + 1 > 3, "Total ngram size must be greater than 3. ngrams < 3 are not supported by the current memmap format." ) L.info("Source Window Size: " + str(args.src_context * 2 + 1)) L.info("Target Window Size: " + str(args.trg_context - 1)) L.info("Total Sample Size: " + str(args.trg_context + args.src_context * 2 + 1)) if (args.output_vocab_size is None): args.output_vocab_size = args.trg_vocab_size # The output directory is if (not os.path.exists(args.output_dir_path)): os.makedirs(args.output_dir_path) L.info("Output directory: " + os.path.abspath(args.output_dir_path))
def append_features(self, features_list): U.xassert(len(features_list) == len(self.group), 'Number of features and number of items in this group do not match') for i in range(len(self.group)): self.group[i].append_feature(features_list[i])
if args.command.startswith('top'): mode = 0 N = int(args.command[3:]) # N in N-best output_nbest = NBestList(args.output_path, mode='w') elif args.command == '1best': mode = 1 output_1best = codecs.open(args.output_path, mode='w', encoding='UTF-8') elif args.command.startswith('feature'): mode = 2 N = int(args.command[7:]) # Nth feature output = open(args.output_path, mode='w') elif args.command.startswith('correl'): mode = 3 N = int(args.command[6:]) # Nth feature U.xassert(args.oracle, "correlN command needs a file (-s) containing oracle scores") with open(args.oracle, mode='r') as oracles_file: oracles = map(float, oracles_file.read().splitlines()) #output = open(args.output_path, mode='w') elif args.command.startswith('augment'): U.set_theano_device(args.device) from dlm.reranker import augmenter augmenter.augment(args.model_path, args.input_path, args.vocab_path, args.output_path) else: L.error('Invalid command: ' + args.command) counter = 0 features = [] for group in input_nbest: if mode == 0: for i in range(min(N, group.size())):
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None): L.info("Initializing dataset (with features) from: " + os.path.abspath(dataset_path)) # Reading parameters from the mmap file fp = np.memmap(dataset_path, dtype='int32', mode='r') #print type(fp1) #fp = np.empty(fp1.shape, dtype='int32') #fp[:] = fp1 #print type(fp) self.num_samples = fp[0] self.ngram = fp[1] fp = fp.reshape((len(fp) / self.ngram, self.ngram)) num_header_lines = fp[1, 0] self.features_info = [] # Format (vocab_size, num_of_elements) for i in xrange(num_header_lines - 1): self.features_info.append((fp[i + 2, 0], fp[i + 2, 1])) self.num_classes = fp[(num_header_lines + 2) - 1, 0] # Setting minibatch size and number of mini batches self.batch_size = batch_size self.num_batches = int(M.ceil(self.num_samples / self.batch_size)) # Reading the matrix of samples # x is list ''' self.shared_x_list = [] last_start_pos = 0 for i in xrange(len(self.features_info)): vocab_size, num_elems = self.features_info[i] x = fp[num_header_lines+2:,last_start_pos:last_start_pos + num_elems] # Reading the context indices last_start_pos += num_elems shared_x = T.cast(theano.shared(x, borrow=True), 'int32') self.shared_x_list.append(shared_x) ''' x = fp[num_header_lines + 2:, 0:self.ngram - 1] # Reading the context indices self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32') y = fp[num_header_lines + 2:, self.ngram - 1] # Reading the output word index self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32') ## Untested instance weighting self.is_weighted = False if instance_weights_path: instance_weights = np.loadtxt(instance_weights_path) U.xassert( instance_weights.shape == (self.num_samples, ), "The number of lines in weights file must be the same as the number of samples." ) self.shared_w = T.cast( theano.shared(instance_weights, borrow=True), theano.config.floatX) self.is_weighted = True L.info(' #samples: %s, #classes: %s, batch size: %s, #batches: %s' % (U.red(self.num_samples), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches))) for feature in enumerate(self.features_info): L.info("Feature %s: #ngrams= %s vocab_size= %s" % (U.red( feature[0]), U.red(feature[1][1]), U.red(feature[1][0])))
'nist': B.nist_smoothing, 'chen': B.chen_smoothing } ref_path_list = args.ref_paths.split(',') input_nbest = NBestList(args.input_path, mode='r', reference_list=ref_path_list) if args.out_nbest_path: output_nbest = NBestList(args.out_nbest_path, mode='w') if args.out_scores_path: output_scores = open(args.out_scores_path, mode='w') output_1best = codecs.open(args.out_1best_path, mode='w', encoding='UTF-8') U.xassert(methods.has_key(args.method), "Invalid smoothing method: " + args.method) scorer = methods[args.method] L.info('Processing the n-best list') def process_group(group): index = 0 scores = dict() for item in group: scores[index] = scorer(item.hyp, group.refs) index += 1 return scores pool = Pool(args.threads)
def __init__(self, args=None, model_path=None): ###################################################################### ## Parameters # U.xassert((args or model_path) and not (args and model_path), "args or model_path are mutually exclusive") if model_path: args, loaded_params = self.load_model(model_path) emb_dim = args.emb_dim num_hidden_list = map(int, args.num_hidden.split(',')) if num_hidden_list[0] <= 0: num_hidden_list = [] vocab_size = args.vocab_size self.ngram_size = args.ngram_size num_classes = args.num_classes activation_name = args.activation_name self.args = args self.L1 = 0 self.L2_sqr = 0 self.params = [] emb_path, vocab = None, None try: emb_path = args.emb_path vocab = args.vocab except AttributeError: pass rng = numpy.random.RandomState(1234) self.input = K.placeholder(ndim=2, dtype='int32', name='input') ###################################################################### ## Lookup Table Layer # lookupTableLayer = LookupTable( rng=rng, input=self.input, vocab_size=vocab_size, emb_dim=emb_dim, emb_path=emb_path, vocab_path=vocab, add_weights=args.weighted_emb ) last_layer_output = lookupTableLayer.output last_layer_output_size = (self.ngram_size - 1) * emb_dim self.params += lookupTableLayer.params ###################################################################### ## Hidden Layer(s) # for i in range(0, len(num_hidden_list)): linearLayer = Linear( rng=rng, input=last_layer_output, n_in=last_layer_output_size, n_out=num_hidden_list[i], suffix=i ) last_layer_output = linearLayer.output last_layer_output_size = num_hidden_list[i] self.params += linearLayer.params activation = Activation( input=last_layer_output, func_name=activation_name ) last_layer_output = activation.output self.L1 = self.L1 + abs(linearLayer.W).sum() self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum() ###################################################################### ## Output Linear Layer # linearLayer = Linear( rng=rng, input=last_layer_output, n_in=last_layer_output_size, n_out=num_classes, #b_values = numpy.zeros(num_classes) - math.log(num_classes) b_values = numpy.full(shape=(num_classes),fill_value=(-math.log(num_classes)),dtype=K._FLOATX), suffix='out' ) last_layer_output = linearLayer.output self.params += linearLayer.params self.L1 = self.L1 + abs(linearLayer.W).sum() self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum() ###################################################################### ## Model Output # self.output = last_layer_output self.p_y_given_x_matrix = K.softmax(last_layer_output) # Log Softmax last_layer_output_shifted = last_layer_output - last_layer_output.max(axis=1, keepdims=True) self.log_p_y_given_x_matrix = last_layer_output_shifted - K.log(K.sum(K.exp(last_layer_output_shifted),axis=1,keepdims=True)) self.log_Z_sqr = K.mean(K.log(K.sum(K.exp(last_layer_output), axis=1)) ** 2) ###################################################################### ## Model Predictions self.y_pred = K.argmax(self.p_y_given_x_matrix, axis=1) ###################################################################### ## Loading parameters from file (if given) # if model_path: self.set_params(loaded_params)
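# A plain-numpy sketch (toy activations) of the log_Z_sqr term defined above: the mean
# squared log-partition of each output row, presumably used as a self-normalization
# penalty so that un-normalized scores behave like log-probabilities.
import numpy as np
a = np.array([[0.2, -0.1, 0.4],
              [1.0, 0.3, -0.2]])                 # hypothetical output-layer activations
log_Z = np.log(np.exp(a).sum(axis=1))
log_Z_sqr = np.mean(log_Z ** 2)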
init_opt.write(' '.join(init_list) + '\n') init_opt.write(' '.join(['0' for i in range(dim)]) + '\n') init_opt.write(' '.join(['1' for i in range(dim)]) + '\n') seed_arg = '' if args.pred_seed: seed_arg = ' -r 1234 ' if (args.alg == 'pro' or args.alg == 'wpro'): # PRO if args.alg == 'pro': L.info("Running PRO") cmd = moses_root + '/bin/pro' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir + '/pro.data' + seed_arg else: L.info("Running WEIGHTED PRO") U.xassert(args.instance_weights_path, 'Instance weights are not given to wpro') cmd = moses_root + '/bin/proWeighted' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir + '/pro.data' + seed_arg + ' -w ' + args.instance_weights_path U.capture(cmd) cmd = moses_root + '/bin/megam_i686.opt -fvals -maxi 30 -nobias binary ' + args.out_dir + '/pro.data' pro_weights = U.capture(cmd) pro_weights_arr = pro_weights.strip().split('\n') weights_dict = dict() sum = 0.0 highest_feature_index = 0 for elem in pro_weights_arr: feature_index, weight = elem[1:].split() feature_index = int(feature_index) weight = float(weight) weights_dict[feature_index] = weight