Example #1
File: mmapReader.py, Project: nusnlp/corelm
	def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
		
		L.info("Initializing dataset from: " + os.path.abspath(dataset_path))
		
		# Reading parameters from the mmap file
		fp = np.memmap(dataset_path, dtype='int32', mode='r')
		self.num_samples = fp[0]
		self.ngram = fp[1]
		fp = fp.reshape((self.num_samples + 3, self.ngram))
		self.vocab_size = fp[1,0]
		self.num_classes = fp[2,0]

		# Setting minibatch size and number of mini batches
		self.batch_size = batch_size
		self.num_batches = int(M.ceil(self.num_samples / self.batch_size))
		
		# Reading the matrix of samples
		x = fp[3:,0:self.ngram - 1]			# Reading the context indices
		y = fp[3:,self.ngram - 1]			# Reading the output word index
		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
		
		self.is_weighted = False
		if instance_weights_path:
			instance_weights = np.loadtxt(instance_weights_path)
			U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.")
			self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
			self.is_weighted = True
		
		L.info('  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
				U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
			)
		)
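
Note: the memmap layout implied by the reader above is three int32 header rows followed by one row per sample. A minimal writer sketch under that assumption (write_mmap_dataset is a hypothetical helper, not part of corelm):

import numpy as np

def write_mmap_dataset(path, samples, vocab_size, num_classes):
	# samples: int32 array of shape (num_samples, ngram); the last column holds the output word id
	samples = np.asarray(samples, dtype='int32')
	num_samples, ngram = samples.shape
	data = np.zeros((num_samples + 3, ngram), dtype='int32')
	data[0, 0], data[0, 1] = num_samples, ngram   # read back as fp[0] and fp[1]
	data[1, 0] = vocab_size                       # read back as fp[1, 0]
	data[2, 0] = num_classes                      # read back as fp[2, 0]
	data[3:] = samples                            # rows 3.. are the samples
	fp = np.memmap(path, dtype='int32', mode='w+', shape=data.shape)
	fp[:] = data
	fp.flush()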
Example #2
 def next(self):  # Returns a group of NBestItems with the same index
     if self.eof_flag == True:
         raise StopIteration
     U.xassert(self.mode == 'r',
               "next_group() method can only be used in 'r' mode")
     group = NBestGroup(self.ref_manager)
     group.add(self.curr_item)  # add the item that was read in the last next() call
     try:
         self.curr_item = self.next_item()
     except StopIteration:
         self.eof_flag = True
         return group
     if self.curr_index != self.curr_item.index:
         self.curr_index = self.curr_item.index
         return group
     while self.curr_index == self.curr_item.index:
         group.add(self.curr_item)
         try:
             self.curr_item = self.next_item()
         except StopIteration:
             self.eof_flag = True
             return group
     self.curr_index = self.curr_item.index
     return group
Example #3
 def append_features(self, features_list):
     U.xassert(
         len(features_list) == len(self.group),
         'Number of features and number of items in this group do not match'
     )
     for i in range(len(self.group)):
         self.group[i].append_feature(features_list[i])
Example #4
File: lookuptable.py, Project: tamhd/corelm
	def initialize(self, emb_path, vocab_path):
		L.info('Initializing lookup table')
		vm = VocabManager(vocab_path)
		w2v = W2VEmbReader(emb_path)
		U.xassert(w2v.get_emb_dim() == self.emb_matrix.shape[1], 'The embeddings dimension does not match with the given word embeddings')
		for i in range(self.emb_matrix.shape[0]):
			vec = w2v.get_emb_given_word(vm.get_word_given_id(i))
			if vec:
				self.emb_matrix[i] = vec
Example #5
	def add(self, item):
		if item is None:
			return
		if self.group_index == -1:
			self.group_index = item.index
			if self.ref_manager:
				self.refs = self.ref_manager.get_all_refs(self.group_index)
		else:
			U.xassert(item.index == self.group_index, "Cannot add an nbest item with an incompatible index")
		self.group.append(item)
Example #6
    def __init__(self,
                 rng,
                 input,
                 vocab_size,
                 emb_dim,
                 emb_matrix=None,
                 concat=True,
                 emb_path=None,
                 vocab_path=None,
                 add_weights=False):

        L.info("Lookup Table layer, #words: %s, #dims: %s" %
               (U.red(vocab_size), U.red(emb_dim)))

        self.input = input

        self.emb_matrix = emb_matrix

        if self.emb_matrix is None:
            self.emb_matrix = numpy.asarray(
                rng.uniform(
                    low=-0.01,  #low=-1,
                    high=0.01,  #high=1,
                    size=(vocab_size, emb_dim)),
                dtype=theano.config.floatX)

        if emb_path:
            U.xassert(vocab_path,
                      'When emb_path is given, vocab must be given too.')
            self.initialize(emb_path, vocab_path)

        self.embeddings = theano.shared(value=self.emb_matrix,
                                        name='embeddings',
                                        borrow=True)

        if add_weights:
            weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX)
            self.weights = theano.shared(value=weights_vec,
                                         name='word_weights',
                                         borrow=True)

            # Check if the speed can be improved
            self.output = (self.weights.dimshuffle(0, 'x') *
                           self.embeddings)[input]
            #self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
            #self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]

            self.params = [self.embeddings, self.weights]
        else:
            self.output = self.embeddings[input]
            self.params = [self.embeddings]

        if concat:
            self.output = self.output.reshape(
                (input.shape[0], emb_dim * input.shape[1]))
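
Note: a NumPy-only sketch of what the embeddings[input] lookup plus the concat reshape above produce, on toy sizes (illustrative only, not project code):

import numpy as np

emb_dim, vocab_size = 3, 5
emb = np.arange(vocab_size * emb_dim, dtype='float32').reshape(vocab_size, emb_dim)
ctx = np.array([[0, 2], [4, 1]], dtype='int32')                   # batch of 2 contexts, 2 word ids each
looked_up = emb[ctx]                                              # shape (2, 2, 3): one embedding per word id
flat = looked_up.reshape(ctx.shape[0], emb_dim * ctx.shape[1])    # shape (2, 6): vectors concatenated per row
print(flat.shape)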
Example #7
def read_vocab(vocab_path):
	word_to_id_dict = dict()
	found_sent_marker = False
	with open(vocab_path,'r') as f_vocab:
		curr_index = 0
		for line in f_vocab:
			token = line.strip().split()[0]
			U.xassert((not word_to_id_dict.has_key(token)), "Given vocab file has duplicate entry for '" + token + "'.")
			word_to_id_dict[token] = curr_index
			curr_index = curr_index + 1
	return word_to_id_dict		
Example #8
 def add(self, item):
     if item is None:
         return
     if self.group_index == -1:
         self.group_index = item.index
         if self.ref_manager:
             self.refs = self.ref_manager.get_all_refs(self.group_index)
     else:
         U.xassert(item.index == self.group_index,
                   "Cannot add an nbest item with an incompatible index")
     self.group.append(item)
Example #9
def read_vocab(vocab_path):
    word_to_id_dict = dict()
    found_sent_marker = False
    with open(vocab_path, "r") as f_vocab:
        curr_index = 0
        for line in f_vocab:
            token = line.strip().split()[0]
            U.xassert((not word_to_id_dict.has_key(token)), "Given vocab file has duplicate entry for '" + token + "'.")
            word_to_id_dict[token] = curr_index
            curr_index = curr_index + 1
    return word_to_id_dict
Example #10
    def __init__(self,
                 dataset_path,
                 batch_size=500,
                 instance_weights_path=None):

        L.info("Initializing dataset from: " + os.path.abspath(dataset_path))
        # Reading parameters from the mmap file
        print K.get_platform()
        fp = np.memmap(dataset_path, dtype='int32', mode='r')
        self.num_samples = fp[0]
        self.ngram = fp[1]
        fp = fp.reshape((self.num_samples + 3, self.ngram))
        self.vocab_size = fp[1, 0]
        self.num_classes = fp[2, 0]

        # Setting minibatch size and number of mini batches
        self.batch_size = batch_size
        self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

        # Reading the matrix of samples
        x = fp[3:, 0:self.ngram - 1]  # Reading the context indices
        y = fp[3:, self.ngram - 1]  # Reading the output word index
        #self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
        #self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
        # What is T.cast :))
        L.info("Initialize a simple variable")
        val = np.random.random((4, 2))
        tmp = K.variable(val)

        L.info("Initialize a real variable")
        tmp = K.variable(x)
        L.info("Initialize two casted variables")
        self.shared_x = K.cast(K.variable(x), 'int32')
        self.shared_y = K.cast(K.variable(y), 'int32')
        L.info("Create two variable without borrow=True")
        self.is_weighted = False
        if instance_weights_path:
            instance_weights = np.loadtxt(instance_weights_path)
            U.xassert(
                instance_weights.shape == (self.num_samples, ),
                "The number of lines in weights file must be the same as the number of samples."
            )
            # what is borrow=True
            # self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
            self.shared_w = K.cast(K.variable(instance_weights), K._FLOATX)

            self.is_weighted = True

        L.info(
            '  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s'
            % (U.red(self.num_samples), U.red(
                self.ngram), U.red(self.vocab_size), U.red(self.num_classes),
               U.red(self.batch_size), U.red(self.num_batches)))
Example #11
 def initialize(self, emb_path, vocab_path):
     L.info('Initializing lookup table')
     vm = VocabManager(vocab_path)
     w2v = W2VEmbReader(emb_path)
     U.xassert(
         w2v.get_emb_dim() == self.emb_matrix.shape[1],
         'The embeddings dimension does not match with the given word embeddings'
     )
     for i in range(self.emb_matrix.shape[0]):
         vec = w2v.get_emb_given_word(vm.get_word_given_id(i))
         if vec:
             self.emb_matrix[i] = vec
Example #12
	def __init__(self, nbest_path, mode='r', reference_list=None):
		U.xassert(mode == 'r' or mode == 'w', "Invalid mode: " + mode)
		self.mode = mode
		self.nbest_file = codecs.open(nbest_path, mode=mode, encoding='UTF-8')
		self.prev_index = -1
		self.curr_item = None
		self.curr_index = 0
		self.eof_flag = False
		self.ref_manager = None
		if reference_list:
			U.xassert(mode == 'r', "Cannot accept a reference_list in 'w' mode")
			self.ref_manager = RefernceManager(reference_list)
Example #13
 def __init__(self, nbest_path, mode='r', reference_list=None):
     U.xassert(mode == 'r' or mode == 'w', "Invalid mode: " + mode)
     self.mode = mode
     self.nbest_file = codecs.open(nbest_path, mode=mode, encoding='UTF-8')
     self.prev_index = -1
     self.curr_item = None
     self.curr_index = 0
     self.eof_flag = False
     self.ref_manager = None
     if reference_list:
         U.xassert(mode == 'r',
                   "Cannot accept a reference_list in 'w' mode")
         self.ref_manager = RefernceManager(reference_list)
Example #14
	def __init__(self, paths_list):
		U.xassert(type(paths_list) is list, "The input to a RefernceManager class must be a list")
		self.ref_list = []
		self.num_lines = -1
		self.num_refs = 0
		for path in paths_list:
			with codecs.open(path, mode='r', encoding='UTF-8') as f:
				self.num_refs += 1
				sentences = f.readlines()
				if self.num_lines == -1:
					self.num_lines = len(sentences)
				else:
					U.xassert(self.num_lines == len(sentences), "Reference files must have the same number of lines")
				self.ref_list.append(sentences)
Example #15
File: bilingual.py, Project: tamhd/corelm
def read_vocab(vocab_path, endp, has_null):
    word_to_id_dict = dict()
    with open(vocab_path, 'r') as f_vocab:
        curr_index = 0
        for line in f_vocab:
            token = line.strip()
            if not word_to_id_dict.has_key(token):
                word_to_id_dict[token] = curr_index
            curr_index = curr_index + 1
        U.xassert(
            word_to_id_dict.has_key('<s>')
            and word_to_id_dict.has_key('<unk>'),
            "Missing <s> or <unk> in given vocab file")
        if has_null:
            U.xassert(word_to_id_dict.has_key('<null>'),
                      "Missing <null> in given target vocab file")
        if endp:
            U.xassert(
                word_to_id_dict.has_key('</s>'),
                "Missing </s> in given vocab file while --endp flag is used")
        if word_to_id_dict.has_key('</s>'):
            U.xassert(
                args.endp,
                "Given vocab file has </s> but --endp flag is not activated")
    return word_to_id_dict
Example #16
    def __init__(self, emb_path):
        L.info('Loading embeddings from: ' + emb_path)
        has_header = False
        with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
            tokens = emb_file.next().split()
            if len(tokens) == 2:
                try:
                    int(tokens[0])
                    int(tokens[1])
                    has_header = True
                except ValueError:
                    pass
        if has_header:
            with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
                tokens = emb_file.next().split()
                U.xassert(
                    len(tokens) == 2,
                    'The first line in W2V embeddings must be the pair (vocab_size, emb_dim)'
                )
                self.vocab_size = int(tokens[0])
                self.emb_dim = int(tokens[1])
                self.embeddings = {}
                counter = 0
                for line in emb_file:
                    tokens = line.split()
                    U.xassert(
                        len(tokens) == self.emb_dim + 1,
                        'The number of dimensions does not match the header info'
                    )
                    word = tokens[0]
                    vec = tokens[1:]
                    self.embeddings[word] = vec
                    counter += 1
                U.xassert(counter == self.vocab_size,
                          'Vocab size does not match the header info')
        else:
            with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
                self.vocab_size = 0
                self.emb_dim = -1
                self.embeddings = {}
                for line in emb_file:
                    tokens = line.split()
                    if self.emb_dim == -1:
                        self.emb_dim = len(tokens) - 1
                    else:
                        U.xassert(
                            len(tokens) == self.emb_dim + 1,
                            'The number of dimensions does not match the header info'
                        )
                    word = tokens[0]
                    vec = tokens[1:]
                    self.embeddings[word] = vec
                    self.vocab_size += 1

        L.info('  #vectors: %i, #dimensions: %i' %
               (self.vocab_size, self.emb_dim))
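
Note: the reader above assumes the plain-text word2vec format: an optional "vocab_size emb_dim" header line, then one "word v1 v2 ... v_emb_dim" line per entry. A tiny made-up file for trying it out could be written like this:

import codecs

sample = (
    "2 3\n"
    "the 0.1 0.2 0.3\n"
    "cat 0.4 0.5 0.6\n"
)
with codecs.open('toy_embs.txt', 'w', encoding='utf8') as f:
    f.write(sample)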
Example #17
File: mmapReader.py, Project: tamhd/corelm
	def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):

		L.info("Initializing dataset from: " + os.path.abspath(dataset_path))
		# Reading parameters from the mmap file
		print K.get_platform()
		fp = np.memmap(dataset_path, dtype='int32', mode='r')
		self.num_samples = fp[0]
		self.ngram = fp[1]
		fp = fp.reshape((self.num_samples + 3, self.ngram))
		self.vocab_size = fp[1,0]
		self.num_classes = fp[2,0]

		# Setting minibatch size and number of mini batches
		self.batch_size = batch_size
		self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

		# Reading the matrix of samples
		x = fp[3:,0:self.ngram - 1]			# Reading the context indices
		y = fp[3:,self.ngram - 1]			# Reading the output word index
		#self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		#self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
		# What is T.cast :))
		L.info("Initialize a simple variable")
		val = np.random.random((4, 2))
		tmp = K.variable(val)

		L.info("Initialize a real variable")
		tmp = K.variable(x)
		L.info("Initialize two casted variables")
		self.shared_x = K.cast(K.variable(x), 'int32')
		self.shared_y = K.cast(K.variable(y), 'int32')
		L.info("Create two variable without borrow=True")
		self.is_weighted = False
		if instance_weights_path:
			instance_weights = np.loadtxt(instance_weights_path)
			U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.")
			# what is borrow=True
			# self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
			self.shared_w = K.cast(K.variable(instance_weights), K._FLOATX)

			self.is_weighted = True

		L.info('  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
				U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
			)
		)
Example #18
	def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False, suffix=None, high=0.01):
		
		L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

		self.input = input
		
		self.emb_matrix = emb_matrix

		if self.emb_matrix is None:
			self.emb_matrix = numpy.asarray(
				rng.uniform(
					low=-high, #low=-1,
					high=high, #high=1,
					size=(vocab_size, emb_dim)
				),
				dtype=theano.config.floatX
			)
		
		if emb_path:
			U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
			self.initialize(emb_path, vocab_path)
		

		embeddings_name = 'embeddings'
		if suffix is not None:
			embeddings_name += '.' + str(suffix)
		
		self.embeddings = theano.shared(value=self.emb_matrix, name=embeddings_name, borrow=True)
		
		if add_weights:
			weights_vec = numpy.ones(vocab_size, dtype=theano.config.floatX)
			self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True)
			
			# Check if the speed can be improved
			self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
			#self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
			#self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]
			
			self.params = [self.embeddings, self.weights]
		else:
			self.output = self.embeddings[input]
			self.params = [self.embeddings]
		
		if concat:
			self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))
Example #19
 def __init__(self, paths_list):
     U.xassert(
         type(paths_list) is list,
         "The input to a RefernceManager class must be a list")
     self.ref_list = []
     self.num_lines = -1
     self.num_refs = 0
     for path in paths_list:
         with codecs.open(path, mode='r', encoding='UTF-8') as f:
             self.num_refs += 1
             sentences = f.readlines()
             if self.num_lines == -1:
                 self.num_lines = len(sentences)
             else:
                 U.xassert(
                     self.num_lines == len(sentences),
                     "Reference files must have the same number of lines")
             self.ref_list.append(sentences)
Example #20
File: lookuptable.py, Project: tamhd/corelm
	def __init__(self, rng, input, vocab_size, emb_dim, emb_matrix=None, concat=True, emb_path=None, vocab_path=None, add_weights=False):

		L.info("Lookup Table layer, #words: %s, #dims: %s" % (U.red(vocab_size), U.red(emb_dim)))

		self.input = input
		L.info("Input " + str(input))
                L.info("Add weightes " + str(add_weights))
                self.emb_matrix = emb_matrix

		if self.emb_matrix is None:
			self.emb_matrix = numpy.asarray(
				rng.uniform(
					low=-0.01, #low=-1,
					high=0.01, #high=1,
					size=(vocab_size, emb_dim)
				),
				dtype=K._FLOATX
			)

		if emb_path:
			U.xassert(vocab_path, 'When emb_path is given, vocab must be given too.')
			self.initialize(emb_path, vocab_path)

		#self.embeddings = theano.shared(value=self.emb_matrix, name='embeddings', borrow=True)
		self.embeddings = K.variable(self.emb_matrix, name='embeddings')


		if add_weights:
			weights_vec = numpy.ones(vocab_size, dtype=K._FLOATX)
			#self.weights = theano.shared(value=weights_vec, name='word_weights', borrow=True)
			self.weights = K.variable(weights_vec, name='word_weights')

			# Check if the speed can be improved
			self.output = (self.weights.dimshuffle(0, 'x') * self.embeddings)[input]
			#self.output = self.weights.dimshuffle(0, 'x')[input] * self.embeddings[input]
			#self.output = self.weights[input].dimshuffle(0, 'x') * self.embeddings[input]

			self.params = [self.embeddings, self.weights]
		else:
			self.output = self.embeddings[input]
			self.params = [self.embeddings]

		if concat:
			self.output = self.output.reshape((input.shape[0], emb_dim * input.shape[1]))
Example #21
    def __init__(self,
                 dataset_path,
                 batch_size=500,
                 instance_weights_path=None):

        L.info("Initializing dataset from: " + os.path.abspath(dataset_path))

        # Reading parameters from the mmap file
        fp = np.memmap(dataset_path, dtype='int32', mode='r')
        self.num_samples = fp[0]
        self.ngram = fp[1]
        fp = fp.reshape((self.num_samples + 3, self.ngram))
        self.vocab_size = fp[1, 0]
        self.num_classes = fp[2, 0]

        # Setting minibatch size and number of mini batches
        self.batch_size = batch_size
        self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

        # Reading the matrix of samples
        x = fp[3:, 0:self.ngram - 1]  # Reading the context indices
        y = fp[3:, self.ngram - 1]  # Reading the output word index
        self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
        self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

        self.is_weighted = False
        if instance_weights_path:
            instance_weights = np.loadtxt(instance_weights_path)
            U.xassert(
                instance_weights.shape == (self.num_samples, ),
                "The number of lines in weights file must be the same as the number of samples."
            )
            self.shared_w = T.cast(
                theano.shared(instance_weights, borrow=True),
                theano.config.floatX)
            self.is_weighted = True

        L.info(
            '  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s'
            % (U.red(self.num_samples), U.red(
                self.ngram), U.red(self.vocab_size), U.red(self.num_classes),
               U.red(self.batch_size), U.red(self.num_batches)))
Example #22
	def next_item(self):
		U.xassert(self.mode == 'r', "next() method can only be used in 'r' mode")
		try:
			segments = self.nbest_file.next().split("|||")
		except StopIteration:
			self.close()
			raise StopIteration
		try:
			index = int(segments[0])
		except ValueError:
			L.error("The first segment in an n-best list must be an integer")
		hyp = segments[1].strip()
		features = segments[2].strip()
		score = segments[3].strip()
		phrase_alignments = None
		word_alignments = None
		if len(segments) > 4:
			phrase_alignments = segments[4].strip()
		if len(segments) > 5:
			word_alignments = segments[5].strip()
		return NBestItem(index, hyp, features, score, phrase_alignments, word_alignments)
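
Note: next_item above parses Moses-style n-best lines, i.e. "|||"-separated fields (sentence index, hypothesis, feature scores, total score, optional alignments). A stand-alone illustration of that parsing, with a made-up line:

line = "0 ||| this is a translation ||| LM0= -12.3 TM0= -4.5 ||| -8.91"
segments = line.split("|||")
index = int(segments[0])           # sentence index
hyp = segments[1].strip()          # hypothesis text
features = segments[2].strip()     # feature name/value pairs
score = segments[3].strip()        # total decoder score
print(index, hyp, features, score)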
Example #23
 def next_item(self):
     U.xassert(self.mode == 'r',
               "next() method can only be used in 'r' mode")
     try:
         segments = self.nbest_file.next().split("|||")
     except StopIteration:
         self.close()
         raise StopIteration
     try:
         index = int(segments[0])
     except ValueError:
         L.error("The first segment in an n-best list must be an integer")
     hyp = segments[1].strip()
     features = segments[2].strip()
     score = segments[3].strip()
     phrase_alignments = None
     word_alignments = None
     if len(segments) > 4:
         phrase_alignments = segments[4].strip()
     if len(segments) > 5:
         word_alignments = segments[5].strip()
     return NBestItem(index, hyp, features, score, phrase_alignments,
                      word_alignments)
Example #24
	def next(self): # Returns a group of NBestItems with the same index
		if self.eof_flag == True:
			raise StopIteration
		U.xassert(self.mode == 'r', "next_group() method can only be used in 'r' mode")
		group = NBestGroup(self.ref_manager)
		group.add(self.curr_item) # add the item that was read in the last next() call
		try:
			self.curr_item = self.next_item()
		except StopIteration:
			self.eof_flag = True
			return group
		if self.curr_index != self.curr_item.index:
			self.curr_index = self.curr_item.index
			return group
		while self.curr_index == self.curr_item.index:
			group.add(self.curr_item)
			try:
				self.curr_item = self.next_item()
			except StopIteration:
				self.eof_flag = True
				return group
		self.curr_index = self.curr_item.index
		return group
Example #25
File: bilingual.py, Project: nusnlp/corelm
def read_vocab(vocab_path, endp, has_null):
	word_to_id_dict = dict()
	with open(vocab_path,'r') as f_vocab:
		curr_index = 0
		for line in f_vocab:
			token = line.strip()
			if not word_to_id_dict.has_key(token):
				word_to_id_dict[token] = curr_index
			curr_index = curr_index + 1
		U.xassert(word_to_id_dict.has_key('<s>') and word_to_id_dict.has_key('<unk>'), "Missing <s> or <unk> in given vocab file")
		if has_null:
			U.xassert(word_to_id_dict.has_key('<null>'), "Missing <null> in given target vocab file")
		if endp:
			U.xassert(word_to_id_dict.has_key('</s>'), "Missing </s> in given vocab file while --endp flag is used")
		if word_to_id_dict.has_key('</s>'):
			U.xassert(args.endp, "Given vocab file has </s> but --endp flag is not activated")
	return word_to_id_dict
Example #26
	def __init__(self, emb_path):
		L.info('Loading embeddings from: ' + emb_path)
		has_header=False
		with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
			tokens = emb_file.next().split()
			if len(tokens) == 2:
				try:
					int(tokens[0])
					int(tokens[1])
					has_header = True
				except ValueError:
					pass
		if has_header:
			with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
				tokens = emb_file.next().split()
				U.xassert(len(tokens) == 2, 'The first line in W2V embeddings must be the pair (vocab_size, emb_dim)')
				self.vocab_size = int(tokens[0])
				self.emb_dim = int(tokens[1])
				self.embeddings = {}
				counter = 0
				for line in emb_file:
					tokens = line.split()
					U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info')
					word = tokens[0]
					vec = tokens[1:]
					self.embeddings[word] = vec
					counter += 1
				U.xassert(counter == self.vocab_size, 'Vocab size does not match the header info')
		else:
			with codecs.open(emb_path, 'r', encoding='utf8') as emb_file:
				self.vocab_size = 0
				self.emb_dim = -1
				self.embeddings = {}
				for line in emb_file:
					tokens = line.split()
					if self.emb_dim == -1:
						self.emb_dim = len(tokens) - 1
					else:
						U.xassert(len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info')
					word = tokens[0]
					vec = tokens[1:]
					self.embeddings[word] = vec
					self.vocab_size += 1
		
		L.info('  #vectors: %i, #dimensions: %i' % (self.vocab_size, self.emb_dim))
Example #27
	def get_all_refs(self, index):
		U.xassert(index < self.num_lines, "Index out of bound")
		return [self.ref_list[k][index] for k in range(self.num_refs)]
Example #28
File: oracle.py, Project: tamhd/corelm
	'epsilon' : B.add_epsilon_smoothing,
	'lin'     : B.lin_smoothing,
	'nist'    : B.nist_smoothing,
	'chen'    : B.chen_smoothing
}

ref_path_list = args.ref_paths.split(',')

input_nbest = NBestList(args.input_path, mode='r', reference_list=ref_path_list)
if args.out_nbest_path:
	output_nbest = NBestList(args.out_nbest_path, mode='w')
if args.out_scores_path:
	output_scores = open(args.out_scores_path, mode='w')
output_1best = codecs.open(args.out_1best_path, mode='w', encoding='UTF-8')

U.xassert(methods.has_key(args.method), "Invalid smoothing method: " + args.method)
scorer = methods[args.method]

L.info('Processing the n-best list')

def process_group(group):
	index = 0
	scores = dict()
	for item in group:
		scores[index] = scorer(item.hyp, group.refs)
		index += 1
	return scores

pool = Pool(args.threads)

counter = 0
Example #29
File: test.py, Project: nusnlp/corelm
from dlm.models.mlp import MLP
from dlm import eval
import theano
import theano.tensor as T

#########################
## Loading model
#

classifier = MLP(model_path=args.model_path)

#########################
## Loading dataset
#

U.xassert(args.format == "mmap" or args.format == "nbest" or args.format == "text", "Invalid file format given: " + args.format)
U.xassert(args.perplexity or args.nlp_path or args.ulp_path, "You should use one of (or more) -ppl, -nlp or -ulp")

if args.format == "mmap":
	U.xassert((args.nlp_path is None) and (args.ulp_path is None), "Cannot compute log-probabilities for an mmap file")
	from dlm.io.mmapReader import MemMapReader
	testset = MemMapReader(dataset_path=args.test_path, batch_size=500)
else:
	U.xassert(args.vocab_path, "Vocab file is required for non-mmap file formats")
	from dlm.io.textReader import TextReader
	is_nbest = False
	if args.format == "nbest":
		is_nbest = True
	testset = TextReader(dataset_path=args.test_path, is_nbest=is_nbest, ngram_size=classifier.ngram_size, vocab_path=args.vocab_path)

#########################
Example #30
File: mlp.py, Project: wanghm92/corelm_sll
	def __init__(self, args=None, model_path=None):

		######################################################################
		## Parameters
		#
		
		U.xassert((args or model_path) and not (args and model_path), "args or model_path are mutually exclusive")
		
		if model_path:
			args, loaded_params = self.load_model(model_path)
		
		emb_dim = args.emb_dim
		num_hidden_list = map(int, args.num_hidden.split(','))
		if num_hidden_list[0] <= 0:
			num_hidden_list = []


		self.ngram_size = args.ngram_size

		if args.feature_emb_dim is None:
			features_info = [(args.vocab_size, args.ngram_size-1, args.emb_dim)]
		else:
			features_dim = map(int, args.feature_emb_dim.split(','))
			features_dim.insert(0,emb_dim)
			U.xassert(len(features_dim) == len(args.features_info), "The number of specified feature dimensions does not match the number of features!")
			features_info = []
			for feature_info,feature_dim in zip(args.features_info, features_dim):
				feature_info = feature_info + (feature_dim,)
				features_info.append(feature_info)

		print "Classifier Creation"
		print features_info
		num_classes = args.num_classes
		activation_name = args.activation_name
		self.args = args
		self.L1 = 0
		self.L2_sqr = 0
		self.params = []
		
		# Not implemented with Sequence Labelling
		emb_path, vocab = None, None
		try:
			emb_path = args.emb_path
			vocab = args.vocab
		except AttributeError:
			pass
		
		rng = numpy.random.RandomState(1234)
		self.input = T.imatrix('input')

		######################################################################
		## Lookup Table Layer
		#
		last_start_pos = 0
		last_layer_output = None
		last_layer_output_size = 0
		for i in range(0, len(features_info)):
			vocab_size, num_elems,emb_dim = features_info[i]
			if i != 0:
				emb_path, vocab = None, None
			lookupTableLayer = LookupTable(
				rng=rng,
				input=self.input[:,last_start_pos:last_start_pos+num_elems],
				vocab_size=vocab_size,
				emb_dim=emb_dim,
				emb_path=emb_path,
				vocab_path=vocab,
				add_weights=args.weighted_emb,
				suffix=i
			)
			if last_layer_output is None:
				last_layer_output = lookupTableLayer.output
			else:
				last_layer_output = T.concatenate([last_layer_output, lookupTableLayer.output], axis=1)
			
			last_layer_output_size +=  (num_elems) * emb_dim
			self.params += lookupTableLayer.params
			last_start_pos = last_start_pos + num_elems
		
		######################################################################
		## Hidden Layer(s)
		#
		for i in range(0, len(num_hidden_list)):
			linearLayer = Linear(
				rng=rng,
				input=last_layer_output,
				n_in=last_layer_output_size,
				n_out=num_hidden_list[i],
				suffix=i
			)
			last_layer_output = linearLayer.output
			last_layer_output_size = num_hidden_list[i]
			self.params += linearLayer.params
			
			activation = Activation(
				input=last_layer_output,
				func_name=activation_name
			)
			last_layer_output = activation.output
			
			self.L1 = self.L1 + abs(linearLayer.W).sum()
			self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum()
		
		######################################################################
		## Output Linear Layer
		#
		linearLayer = Linear(
			rng=rng,
			input=last_layer_output,
			n_in=last_layer_output_size,
			n_out=num_classes,
			#b_values = numpy.zeros(num_classes) - math.log(num_classes)
			b_values = numpy.full(shape=(num_classes),fill_value=(-math.log(num_classes)),dtype=theano.config.floatX),
			suffix='out'
		)
		last_layer_output = linearLayer.output
		self.params += linearLayer.params
		
		self.L1 = self.L1 + abs(linearLayer.W).sum()
		self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum()
		
		######################################################################
		## Model Output
		#
		
		self.output = last_layer_output
		self.p_y_given_x_matrix = T.nnet.softmax(last_layer_output)
		
		# Log Softmax
		last_layer_output_shifted = last_layer_output - last_layer_output.max(axis=1, keepdims=True)
		self.log_p_y_given_x_matrix = last_layer_output_shifted - T.log(T.sum(T.exp(last_layer_output_shifted),axis=1,keepdims=True))


		#self.log_Z_sqr = T.log(T.mean(T.sum(T.exp(last_layer_output), axis=1))) ** 2
		#self.log_Z_sqr = T.sum(T.log(T.sum(T.exp(last_layer_output), axis=1))) ** 2
		self.log_Z_sqr = T.mean(T.log(T.sum(T.exp(last_layer_output), axis=1)) ** 2)

		######################################################################
		## Model Predictions

		self.y_pred = T.argmax(self.p_y_given_x_matrix, axis=1)
		
		######################################################################
		## Loading parameters from file (if given)
		#
		
		if model_path:
			self.set_params(loaded_params)
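
Note: the log-softmax above subtracts the per-row maximum before exponentiating so exp() cannot overflow; a NumPy sketch of the same computation (illustrative only):

import numpy as np

def log_softmax(scores):
	shifted = scores - scores.max(axis=1, keepdims=True)
	return shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

scores = np.array([[1000.0, 1001.0], [1.0, 2.0]])
print(np.exp(log_softmax(scores)).sum(axis=1))   # each row sums to 1.0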
Example #31
args = parser.parse_args()

U.set_theano_device('cpu',1)
from dlm.models.mlp import MLP

if args.out_dir is None:
	args.out_dir = 'corelm_convert-' + U.curr_time()
U.mkdir_p(args.out_dir)

# Loading CoreLM model and creating classifier class
L.info("Loading CoreLM model")
classifier = MLP(model_path=args.corelm_model)
args_nn = classifier.args
params_nn = classifier.params
U.xassert(len(params_nn)==7, "CoreLM model is not compatible with NPLM architecture. 2 hidden layers and an output linear layer is required.")

embeddings = params_nn[0].get_value()
W1 = params_nn[1].get_value()
W1 = np.transpose(W1)
b1 = params_nn[2].get_value()
W2 = params_nn[3].get_value()
W2 = np.transpose(W2)
b2 = params_nn[4].get_value()
W3 = params_nn[5].get_value()
W3 = np.transpose(W3)
b3 = params_nn[6].get_value()


# Storing vocabulary into an array
has_null = False
Example #32
		tokens_freq_sorted = sorted(word_to_freq_dict, key=word_to_freq_dict.get, reverse=True)
		if args.prune_vocab_size is not None and args.prune_vocab_size < len(tokens_freq_sorted):
			tokens_freq_sorted = tokens_freq_sorted[0:args.prune_vocab_size]
		for token in tokens_freq_sorted:
			f_vocab.write(token+"\n")
			word_to_id_dict[token] = curr_index
			curr_index = curr_index + 1
else:
	with open(args.input_vocab_path, 'r') as f_vocab:
		curr_index = 0
		for line in f_vocab:
			token = line.strip()
			if not word_to_id_dict.has_key(token):
				word_to_id_dict[token] = curr_index
			curr_index = curr_index + 1
		U.xassert(word_to_id_dict.has_key('<s>') and word_to_id_dict.has_key('<unk>') and word_to_id_dict.has_key('<null>'), "Missing <s> or <unk> or <null> in given vocab file")
		if args.endp:
			U.xassert(word_to_id_dict.has_key('</s>'), "Missing </s> in given vocab file while --endp flag is used")
		if word_to_id_dict.has_key('</s>'):
			U.xassert(args.endp, "Given vocab file has </s> but --endp flag is not activated")

_, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.')

# For shuffling only
samples = []			# List of samples
nsamples = 0

# Reading input text file to create IDX file
with open(args.input_path, 'r') as input_file, open(tmp_path, 'w') as tmp_file:
	next_id = 0
	for line in input_file:
Example #33
File: features.py, Project: tamhd/corelm
    output_mmap_path = prefix + ".idx.mmap"
    output_text_path = prefix + ".idx.txt"
    output_words_path = prefix + ".txt"

if args.word_out:
    f_words = open(output_words_path, 'w')

input_word_to_id = read_vocab(args.input_vocab_path)
feature_to_id = read_vocab(args.features_vocab_path)
label_to_id = read_vocab(args.labels_vocab_path)
input_vocab_size = len(input_word_to_id)
feature_vocab_size = len(feature_to_id)
label_vocab_size = len(label_to_id)

half_context = args.context_size / 2
U.xassert(input_word_to_id.has_key("<s>"),
          "Sentence marker <s> not found in input vocabulary!")
U.xassert(feature_to_id.has_key("<s>"),
          "Sentence marker <s> not found in feature vocabulary!")

_, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.')
# For shuffling only
samples = []  # List of samples
samples_idx = []
nsamples = 0

# Read lines and write to the mmap file
line_num = 0
nsamples = 0

with open(args.input_path,
          'r') as input_file, open(args.labels_path,
Example #34
	def write(self, item):
		U.xassert(self.mode == 'w', "write() method can only be used in 'w' mode")
		self.nbest_file.write(unicode(item) + "\n")
Example #35
 def __iter__(self):
     U.xassert(self.mode == 'r', "Iteration can only be done in 'r' mode")
     return self
Example #36
                tokens_freq_sorted):
            tokens_freq_sorted = tokens_freq_sorted[0:args.prune_vocab_size]
        for token in tokens_freq_sorted:
            f_vocab.write(token + "\n")
            word_to_id_dict[token] = curr_index
            curr_index = curr_index + 1
else:
    with open(args.input_vocab_path, 'r') as f_vocab:
        curr_index = 0
        for line in f_vocab:
            token = line.strip()
            if not word_to_id_dict.has_key(token):
                word_to_id_dict[token] = curr_index
            curr_index = curr_index + 1
        U.xassert(
            word_to_id_dict.has_key('<s>') and word_to_id_dict.has_key('<unk>')
            and word_to_id_dict.has_key('<null>'),
            "Missing <s> or <unk> or <null> in given vocab file")
        if args.endp:
            U.xassert(
                word_to_id_dict.has_key('</s>'),
                "Missing </s> in given vocab file while --endp flag is used")
        if word_to_id_dict.has_key('</s>'):
            U.xassert(
                args.endp,
                "Given vocab file has </s> but --endp flag is not activated")

_, tmp_path = tempfile.mkstemp(prefix='dlm.tmp.')

# For shuffling only
samples = []  # List of samples
nsamples = 0
Example #37
File: bilingual.py, Project: nusnlp/corelm
src_prune_args = parser.add_mutually_exclusive_group(required=True)
src_prune_args.add_argument("-vs","--prune-source-vocab", dest="src_vocab_size",  type=int, help="Source vocabulary size")
src_prune_args.add_argument("--source-vocab-file", dest="src_vocab_path",  help="Source vocabulary file path")

trg_prune_args = parser.add_mutually_exclusive_group(required=True)
trg_prune_args.add_argument("-vt","--prune-target-vocab", dest="trg_vocab_size", type=int, help="Target vocabulary size")
trg_prune_args.add_argument("--target-vocab-file", dest="trg_vocab_path", help="Target vocabulary file path")

output_prune_args = parser.add_mutually_exclusive_group(required=True)
output_prune_args.add_argument("-vo","--prune-output-vocab", dest="output_vocab_size", type=int, help="Output vocabulary size. Defaults to target vocabulary size.")
output_prune_args.add_argument("--output-vocab-file", dest="output_vocab_path", help="Output vocabulary file")

args = parser.parse_args()

# Format of the memmap file does not support less than 5 because the first row consists of parameters for the neural network
U.xassert(args.trg_context + args.src_context*2 + 1 > 3, "Total ngram size must be greater than 3. ngrams < 3 are not supported by the current memmap format.")

L.info("Source Window Size: " + str(args.src_context * 2 + 1))
L.info("Target Window Size: " + str(args.trg_context - 1))
L.info("Total Sample Size: " + str(args.trg_context + args.src_context * 2 + 1))

if (args.output_vocab_size is None):
	args.output_vocab_size = args.trg_vocab_size

# The output directory is 
if (not os.path.exists(args.output_dir_path)):
	os.makedirs(args.output_dir_path)
L.info("Output directory: " + os.path.abspath(args.output_dir_path))

# Prefix of files
src_prefix = args.output_dir_path + "/" + os.path.basename(args.src_input_path)
Example #38
	def __iter__(self):
		U.xassert(self.mode == 'r', "Iteration can only be done in 'r' mode")
		return self
Example #39
args = parser.parse_args()

U.set_theano_device('cpu', 1)
from dlm.models.mlp import MLP

if args.out_dir is None:
    args.out_dir = 'primelm_convert-' + U.curr_time()
U.mkdir_p(args.out_dir)

# Loading PrimeLM model and creating classifier class
L.info("Loading PrimeLM model")
classifier = MLP(model_path=args.primelm_model)
args_nn = classifier.args
params_nn = classifier.params
U.xassert(
    len(params_nn) == 7,
    "PrimeLM model is not compatible with NPLM architecture. 2 hidden layers and an output linear layer is required."
)

embeddings = params_nn[0].get_value()
W1 = params_nn[1].get_value()
W1 = np.transpose(W1)
b1 = params_nn[2].get_value()
W2 = params_nn[3].get_value()
W2 = np.transpose(W2)
b2 = params_nn[4].get_value()
W3 = params_nn[5].get_value()
W3 = np.transpose(W3)
b3 = params_nn[6].get_value()

# Storing vocabulary into an array
has_null = False
Example #40
if args.command.startswith('top'):
    mode = 0
    N = int(args.command[3:])  # N in N-best
    output_nbest = NBestList(args.output_path, mode='w')
elif args.command == '1best':
    mode = 1
    output_1best = codecs.open(args.output_path, mode='w', encoding='UTF-8')
elif args.command.startswith('feature'):
    mode = 2
    N = int(args.command[7:])  # Nth feature
    output = open(args.output_path, mode='w')
elif args.command.startswith('correl'):
    mode = 3
    N = int(args.command[6:])  # Nth feature
    U.xassert(args.oracle,
              "correlN command needs a file (-s) containing oracle scores")
    with open(args.oracle, mode='r') as oracles_file:
        oracles = map(float, oracles_file.read().splitlines())
    #output = open(args.output_path, mode='w')
elif args.command.startswith('augment'):
    U.set_theano_device(args.device)
    from dlm.reranker import augmenter
    augmenter.augment(args.model_path, args.input_path, args.vocab_path,
                      args.output_path)
else:
    L.error('Invalid command: ' + args.command)

counter = 0
features = []
for group in input_nbest:
    if mode == 0:
Example #41
File: train.py, Project: nusnlp/corelm
if args.out_dir is None:
	args.out_dir = 'corelm-' + U.curr_time()
U.mkdir_p(args.out_dir)

L.quiet = args.quiet
L.set_file_path(os.path.abspath(args.out_dir) + "/log.txt")

L.info('Command: ' + ' '.join(sys.argv))

curr_version = U.curr_version()
if curr_version:
	L.info("Version: " + curr_version)

if args.emb_path:
	U.xassert(args.vocab, 'When --emb-path is used, vocab file must be given too (using --vocab).')

if args.loss_function == "nll":
	args.num_noise_samples = 0

U.print_args(args)
U.set_theano_device(args.device, args.threads)

import dlm.trainer
from dlm.io.mmapReader import MemMapReader
from dlm.models.mlp import MLP

#########################
## Loading datasets
#
Example #42
	def set_params(self, params):
		U.xassert(len(self.params) == len(params), 'The given model file is not consistent with the architecture')
		for param, loaded_param in zip(self.params, params):
			param.set_value(loaded_param)
Example #43
File: mlp.py, Project: sxdkxgwan/corelm
    def __init__(self, args=None, model_path=None):

        ######################################################################
        ## Parameters
        #

        U.xassert((args or model_path) and not (args and model_path),
                  "args or model_path are mutually exclusive")

        if model_path:
            args, loaded_params = self.load_model(model_path)

        emb_dim = args.emb_dim
        num_hidden_list = map(int, args.num_hidden.split(','))
        if num_hidden_list[0] <= 0:
            num_hidden_list = []

        vocab_size = args.vocab_size
        self.ngram_size = args.ngram_size
        num_classes = args.num_classes
        activation_name = args.activation_name
        self.args = args
        self.L1 = 0
        self.L2_sqr = 0
        self.params = []

        emb_path, vocab = None, None
        try:
            emb_path = args.emb_path
            vocab = args.vocab
        except AttributeError:
            pass

        rng = numpy.random.RandomState(1234)
        self.input = T.imatrix('input')

        ######################################################################
        ## Lookup Table Layer
        #

        lookupTableLayer = LookupTable(rng=rng,
                                       input=self.input,
                                       vocab_size=vocab_size,
                                       emb_dim=emb_dim,
                                       emb_path=emb_path,
                                       vocab_path=vocab,
                                       add_weights=args.weighted_emb)
        last_layer_output = lookupTableLayer.output
        last_layer_output_size = (self.ngram_size - 1) * emb_dim
        self.params += lookupTableLayer.params

        ######################################################################
        ## Hidden Layer(s)
        #

        for i in range(0, len(num_hidden_list)):
            linearLayer = Linear(rng=rng,
                                 input=last_layer_output,
                                 n_in=last_layer_output_size,
                                 n_out=num_hidden_list[i],
                                 suffix=i)
            last_layer_output = linearLayer.output
            last_layer_output_size = num_hidden_list[i]
            self.params += linearLayer.params

            activation = Activation(input=last_layer_output,
                                    func_name=activation_name)
            last_layer_output = activation.output

            self.L1 = self.L1 + abs(linearLayer.W).sum()
            self.L2_sqr = self.L2_sqr + (linearLayer.W**2).sum()

        ######################################################################
        ## Output Linear Layer
        #

        linearLayer = Linear(
            rng=rng,
            input=last_layer_output,
            n_in=last_layer_output_size,
            n_out=num_classes,
            #b_values = numpy.zeros(num_classes) - math.log(num_classes)
            b_values=numpy.full(shape=(num_classes),
                                fill_value=(-math.log(num_classes)),
                                dtype=theano.config.floatX),
            suffix='out')
        last_layer_output = linearLayer.output
        self.params += linearLayer.params

        self.L1 = self.L1 + abs(linearLayer.W).sum()
        self.L2_sqr = self.L2_sqr + (linearLayer.W**2).sum()

        ######################################################################
        ## Model Output
        #

        self.output = last_layer_output
        self.p_y_given_x_matrix = T.nnet.softmax(last_layer_output)

        # Log Softmax
        last_layer_output_shifted = last_layer_output - last_layer_output.max(
            axis=1, keepdims=True)
        self.log_p_y_given_x_matrix = last_layer_output_shifted - T.log(
            T.sum(T.exp(last_layer_output_shifted), axis=1, keepdims=True))

        #self.log_Z_sqr = T.log(T.mean(T.sum(T.exp(last_layer_output), axis=1))) ** 2
        #self.log_Z_sqr = T.sum(T.log(T.sum(T.exp(last_layer_output), axis=1))) ** 2
        self.log_Z_sqr = T.mean(
            T.log(T.sum(T.exp(last_layer_output), axis=1))**2)

        ######################################################################
        ## Model Predictions

        self.y_pred = T.argmax(self.p_y_given_x_matrix, axis=1)

        ######################################################################
        ## Loading parameters from file (if given)
        #

        if model_path:
            self.set_params(loaded_params)
Example #44
File: train.py, Project: nusnlp/corelm
	init_opt.write(' '.join(init_list) + '\n')
	init_opt.write(' '.join(['0' for i in range(dim)]) + '\n')
	init_opt.write(' '.join(['1' for i in range(dim)]) + '\n')

seed_arg = ''
if args.pred_seed:
	seed_arg = ' -r 1234 '

if (args.alg == 'pro' or args.alg == 'wpro'):
	# PRO
	if args.alg == 'pro':
		L.info("Running PRO")
		cmd = moses_root + '/bin/pro' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir +'/pro.data' + seed_arg
	else:
		L.info("Running WEIGHTED PRO")
		U.xassert(args.instance_weights_path, 'Instance weights are not given to wpro')
		cmd = moses_root + '/bin/proWeighted' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir +'/pro.data' + seed_arg + ' -w ' + args.instance_weights_path
	U.capture(cmd)
	cmd = moses_root + '/bin/megam_i686.opt -fvals -maxi 30 -nobias binary ' + args.out_dir + '/pro.data'
	pro_weights = U.capture(cmd)

	pro_weights_arr = pro_weights.strip().split('\n')
	weights_dict = dict()
	sum = 0.0
	highest_feature_index = 0

	for elem in pro_weights_arr:
		feature_index,weight = elem[1:].split()
		feature_index = int(feature_index)
		weight = float(weight)
		weights_dict[feature_index] = weight
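
Note: the loop above assumes megam prints one "F<index> <weight>" line per feature; a small self-contained illustration of that parsing, with made-up weights:

pro_weights_arr = ['F0 0.41', 'F1 -0.07', 'F2 1.30']    # made-up megam-style output lines
weights_dict = dict()
for elem in pro_weights_arr:
	feature_index, weight = elem[1:].split()             # drop the leading 'F', split index from weight
	weights_dict[int(feature_index)] = float(weight)
print(weights_dict)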
Example #45
 def write(self, item):
     U.xassert(self.mode == 'w',
               "write() method can only be used in 'w' mode")
     self.nbest_file.write(unicode(item) + "\n")
Example #46
if args.out_dir is None:
    args.out_dir = 'primelm-' + U.curr_time()
U.mkdir_p(args.out_dir)

L.quiet = args.quiet
L.set_file_path(os.path.abspath(args.out_dir) + "/log.txt")

L.info('Command: ' + ' '.join(sys.argv))

curr_version = U.curr_version()
if curr_version:
    L.info("Version: " + curr_version)

if args.emb_path:
    U.xassert(
        args.vocab,
        'When --emb-path is used, vocab file must be given too (using --vocab).'
    )

if args.loss_function == "nll":
    args.num_noise_samples = 0

U.print_args(args)
U.set_theano_device(args.device, args.threads)

import dlm.trainer
from dlm.io.mmapReader import MemMapReader
from dlm.io.featuresmmapReader import FeaturesMemMapReader

from dlm.models.mlp import MLP

#########################
Example #47
    output_words_path = prefix + ".txt"

if args.word_out:
    f_words = open(output_words_path, "w")


input_word_to_id = read_vocab(args.input_vocab_path)
feature_to_id = read_vocab(args.features_vocab_path)
label_to_id = read_vocab(args.labels_vocab_path)
input_vocab_size = len(input_word_to_id)
feature_vocab_size = len(feature_to_id)
label_vocab_size = len(label_to_id)


half_context = args.context_size / 2
U.xassert(input_word_to_id.has_key("<s>"), "Sentence marker <s> not found in input vocabulary!")
U.xassert(feature_to_id.has_key("<s>"), "Sentence marker <s> not found in feature vocabulary!")


_, tmp_path = tempfile.mkstemp(prefix="dlm.tmp.")
# For shuffling only
samples = []  # List of samples
samples_idx = []
nsamples = 0


# Read lines and write to the mmap file
line_num = 0
nsamples = 0

with open(args.input_path, "r") as input_file, open(args.labels_path, "r") as labels_file, open(
Example #48
	def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
		
		L.info("Initializing dataset (with features) from: " + os.path.abspath(dataset_path))
		
		# Reading parameters from the mmap file
		fp = np.memmap(dataset_path, dtype='int32', mode='r')
		#print type(fp1)
		#fp = np.empty(fp1.shape, dtype='int32')
		#fp[:] = fp1
		#print type(fp)
		self.num_samples = fp[0]
		self.ngram = fp[1]

		fp = fp.reshape((len(fp)/self.ngram, self.ngram))

		num_header_lines = fp[1,0]

	
		self.features_info = []    # Format (vocab_size, num_of_elements)
		for i in xrange(num_header_lines-1):
			self.features_info.append( (fp[i+2,0], fp[i+2,1]) )


		self.num_classes = fp[(num_header_lines+2)-1,0]


		# Setting minibatch size and number of mini batches
		self.batch_size = batch_size
		self.num_batches = int(M.ceil(self.num_samples / self.batch_size))

		# Reading the matrix of samples
		# x is list
		'''
		self.shared_x_list = []
		last_start_pos = 0
		for i in xrange(len(self.features_info)):
			vocab_size, num_elems = self.features_info[i]
			x = fp[num_header_lines+2:,last_start_pos:last_start_pos + num_elems]			# Reading the context indices
			last_start_pos += num_elems
			shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
			self.shared_x_list.append(shared_x)
		'''
		x = fp[num_header_lines+2:,0:self.ngram - 1]			# Reading the context indices
		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		y = fp[num_header_lines+2:,self.ngram - 1]			# Reading the output word index
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
		

		## Untested instance weighting
		self.is_weighted = False
		if instance_weights_path:
			instance_weights = np.loadtxt(instance_weights_path)
			U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.")
			self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
			self.is_weighted = True
		
		L.info('  #samples: %s,  #classes: %s, batch size: %s, #batches: %s' % (
				U.red(self.num_samples),   U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
			))
		for feature in enumerate(self.features_info):
			L.info("Feature %s: #ngrams= %s vocab_size= %s" %( U.red(feature[0]), U.red(feature[1][1]), U.red(feature[1][0])))
Example #49
File: mlp.py, Project: wanghm92/corelm_sll
    def __init__(self, args=None, model_path=None):

        ######################################################################
        ## Parameters
        #

        U.xassert((args or model_path) and not (args and model_path),
                  "args or model_path are mutually exclusive")

        if model_path:
            args, loaded_params = self.load_model(model_path)

        emb_dim = args.emb_dim
        num_hidden_list = map(int, args.num_hidden.split(','))
        if num_hidden_list[0] <= 0:
            num_hidden_list = []

        self.ngram_size = args.ngram_size

        if args.feature_emb_dim is None:
            features_info = [(args.vocab_size, args.ngram_size - 1,
                              args.emb_dim)]
        else:
            features_dim = map(int, args.feature_emb_dim.split(','))
            features_dim.insert(0, emb_dim)
            U.xassert(
                len(features_dim) == len(args.features_info),
                "The number of specified feature dimensions does not match the number of features!"
            )
            features_info = []
            for feature_info, feature_dim in zip(args.features_info,
                                                 features_dim):
                feature_info = feature_info + (feature_dim, )
                features_info.append(feature_info)

        print "Classifier Creation"
        print features_info
        num_classes = args.num_classes
        activation_name = args.activation_name
        self.args = args
        self.L1 = 0
        self.L2_sqr = 0
        self.params = []

        # Not implemented with Sequence Labelling
        emb_path, vocab = None, None
        try:
            emb_path = args.emb_path
            vocab = args.vocab
        except AttributeError:
            pass

        rng = numpy.random.RandomState(1234)
        self.input = T.imatrix('input')

        ######################################################################
        ## Lookup Table Layer
        #
        last_start_pos = 0
        last_layer_output = None
        last_layer_output_size = 0
        for i in range(0, len(features_info)):
            vocab_size, num_elems, emb_dim = features_info[i]
            if i != 0:
                emb_path, vocab = None, None
            lookupTableLayer = LookupTable(
                rng=rng,
                input=self.input[:, last_start_pos:last_start_pos + num_elems],
                vocab_size=vocab_size,
                emb_dim=emb_dim,
                emb_path=emb_path,
                vocab_path=vocab,
                add_weights=args.weighted_emb,
                suffix=i)
            if last_layer_output is None:
                last_layer_output = lookupTableLayer.output
            else:
                last_layer_output = T.concatenate(
                    [last_layer_output, lookupTableLayer.output], axis=1)

            last_layer_output_size += (num_elems) * emb_dim
            self.params += lookupTableLayer.params
            last_start_pos = last_start_pos + num_elems

        ######################################################################
        ## Hidden Layer(s)
        #
        for i in range(0, len(num_hidden_list)):
            linearLayer = Linear(rng=rng,
                                 input=last_layer_output,
                                 n_in=last_layer_output_size,
                                 n_out=num_hidden_list[i],
                                 suffix=i)
            last_layer_output = linearLayer.output
            last_layer_output_size = num_hidden_list[i]
            self.params += linearLayer.params

            activation = Activation(input=last_layer_output,
                                    func_name=activation_name)
            last_layer_output = activation.output

            self.L1 = self.L1 + abs(linearLayer.W).sum()
            self.L2_sqr = self.L2_sqr + (linearLayer.W**2).sum()

        ######################################################################
        ## Output Linear Layer
        #
        linearLayer = Linear(
            rng=rng,
            input=last_layer_output,
            n_in=last_layer_output_size,
            n_out=num_classes,
            #b_values = numpy.zeros(num_classes) - math.log(num_classes)
            b_values=numpy.full(shape=(num_classes),
                                fill_value=(-math.log(num_classes)),
                                dtype=theano.config.floatX),
            suffix='out')
        last_layer_output = linearLayer.output
        self.params += linearLayer.params

        self.L1 = self.L1 + abs(linearLayer.W).sum()
        self.L2_sqr = self.L2_sqr + (linearLayer.W**2).sum()

        ######################################################################
        ## Model Output
        #

        self.output = last_layer_output
        self.p_y_given_x_matrix = T.nnet.softmax(last_layer_output)

        # Log Softmax
        last_layer_output_shifted = last_layer_output - last_layer_output.max(
            axis=1, keepdims=True)
        self.log_p_y_given_x_matrix = last_layer_output_shifted - T.log(
            T.sum(T.exp(last_layer_output_shifted), axis=1, keepdims=True))

        #self.log_Z_sqr = T.log(T.mean(T.sum(T.exp(last_layer_output), axis=1))) ** 2
        #self.log_Z_sqr = T.sum(T.log(T.sum(T.exp(last_layer_output), axis=1))) ** 2
        self.log_Z_sqr = T.mean(
            T.log(T.sum(T.exp(last_layer_output), axis=1))**2)

        ######################################################################
        ## Model Predictions

        self.y_pred = T.argmax(self.p_y_given_x_matrix, axis=1)

        ######################################################################
        ## Loading parameters from file (if given)
        #

        if model_path:
            self.set_params(loaded_params)
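
The Model Output block above computes a numerically stable log-softmax by shifting the scores by their row maximum before exponentiating, so that exp() cannot overflow. A minimal numpy sketch of the same computation (illustration only, not part of corelm; numpy is used instead of the symbolic Theano expressions):

import numpy as np

def log_softmax(scores):
    # Shift each row by its maximum so exp() cannot overflow.
    shifted = scores - scores.max(axis=1, keepdims=True)
    # log p(y|x) = shifted - log(sum(exp(shifted)))
    return shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

scores = np.array([[2.0, 1.0, 0.1]])
print(np.exp(log_softmax(scores)).sum(axis=1))  # rows of exp(log-probs) sum to 1
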
Example #50
0
File: bilingual.py Project: tamhd/corelm
output_prune_args = parser.add_mutually_exclusive_group(required=True)
output_prune_args.add_argument(
    "-vo",
    "--prune-output-vocab",
    dest="output_vocab_size",
    type=int,
    help="Output vocabulary size. Defaults to target vocabulary size.")
output_prune_args.add_argument("--output-vocab-file",
                               dest="output_vocab_path",
                               help="Output vocabulary file")

args = parser.parse_args()

# The memmap header rows store the network parameters, so very small total n-gram sizes are not supported
U.xassert(
    args.trg_context + args.src_context * 2 + 1 > 3,
    "Total ngram size must be greater than 3; smaller ngrams are not supported by the current memmap format."
)

L.info("Source Window Size: " + str(args.src_context * 2 + 1))
L.info("Target Window Size: " + str(args.trg_context - 1))
L.info("Total Sample Size: " +
       str(args.trg_context + args.src_context * 2 + 1))

if (args.output_vocab_size is None):
    args.output_vocab_size = args.trg_vocab_size

# Create the output directory if it does not exist
if (not os.path.exists(args.output_dir_path)):
    os.makedirs(args.output_dir_path)
L.info("Output directory: " + os.path.abspath(args.output_dir_path))
Example #51
0
	def append_features(self, features_list):
		U.xassert(len(features_list) == len(self.group), 'Number of features and number of items in this group do not match')
		for i in range(len(self.group)):
			self.group[i].append_feature(features_list[i])
Example #52
0
File: tools.py Project: tamhd/corelm
if args.command.startswith('top'):
	mode = 0
	N = int(args.command[3:]) # N in N-best
	output_nbest = NBestList(args.output_path, mode='w')
elif args.command == '1best':
	mode = 1
	output_1best = codecs.open(args.output_path, mode='w', encoding='UTF-8')
elif args.command.startswith('feature'):
	mode = 2
	N = int(args.command[7:]) # Nth feature
	output = open(args.output_path, mode='w')
elif args.command.startswith('correl'):
	mode = 3
	N = int(args.command[6:]) # Nth feature
	U.xassert(args.oracle, "correlN command needs a file (-s) containing oracle scores")
	with open(args.oracle, mode='r') as oracles_file:
		oracles = map(float, oracles_file.read().splitlines())
	#output = open(args.output_path, mode='w')
elif args.command.startswith('augment'):
	U.set_theano_device(args.device)
	from dlm.reranker import augmenter
	augmenter.augment(args.model_path, args.input_path, args.vocab_path, args.output_path)
else:
	L.error('Invalid command: ' + args.command)

counter = 0
features = []
for group in input_nbest:
	if mode == 0:
		for i in range(min(N, group.size())):
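
The dispatch above encodes a numeric argument in the command name itself (e.g. top100, feature3, correl2) and recovers it by slicing off the fixed prefix. A standalone sketch of that parsing; the helper name parse_command and the sample command strings are made up for illustration:

def parse_command(command):
    # Returns (mode_name, N); N is None for commands without a numeric suffix.
    if command.startswith('top'):
        return 'top', int(command[3:])       # 'top100'   -> ('top', 100)
    if command.startswith('feature'):
        return 'feature', int(command[7:])   # 'feature3' -> ('feature', 3)
    if command.startswith('correl'):
        return 'correl', int(command[6:])    # 'correl2'  -> ('correl', 2)
    return command, None                     # '1best', 'augment', ...

print(parse_command('top100'))
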
Example #53
0
    def __init__(self,
                 dataset_path,
                 batch_size=500,
                 instance_weights_path=None):

        L.info("Initializing dataset (with features) from: " +
               os.path.abspath(dataset_path))

        # Reading parameters from the mmap file
        fp = np.memmap(dataset_path, dtype='int32', mode='r')
        #print type(fp1)
        #fp = np.empty(fp1.shape, dtype='int32')
        #fp[:] = fp1
        #print type(fp)
        self.num_samples = fp[0]
        self.ngram = fp[1]

        fp = fp.reshape((len(fp) / self.ngram, self.ngram))

        num_header_lines = fp[1, 0]

        self.features_info = []  # Format (vocab_size, num_of_elements)
        for i in xrange(num_header_lines - 1):
            self.features_info.append((fp[i + 2, 0], fp[i + 2, 1]))

        self.num_classes = fp[(num_header_lines + 2) - 1, 0]

        # Setting minibatch size and number of mini batches
        self.batch_size = batch_size
        # Use float division so the last partial batch is not dropped by integer division
        self.num_batches = int(M.ceil(float(self.num_samples) / self.batch_size))

        # Reading the matrix of samples
        # x is list
        '''
		self.shared_x_list = []
		last_start_pos = 0
		for i in xrange(len(self.features_info)):
			vocab_size, num_elems = self.features_info[i]
			x = fp[num_header_lines+2:,last_start_pos:last_start_pos + num_elems]			# Reading the context indices
			last_start_pos += num_elems
			shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
			self.shared_x_list.append(shared_x)
		'''
        x = fp[num_header_lines + 2:,
               0:self.ngram - 1]  # Reading the context indices
        self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
        y = fp[num_header_lines + 2:,
               self.ngram - 1]  # Reading the output word index
        self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

        ## Untested instance weighting
        self.is_weighted = False
        if instance_weights_path:
            instance_weights = np.loadtxt(instance_weights_path)
            U.xassert(
                instance_weights.shape == (self.num_samples, ),
                "The number of lines in weights file must be the same as the number of samples."
            )
            self.shared_w = T.cast(
                theano.shared(instance_weights, borrow=True),
                theano.config.floatX)
            self.is_weighted = True

        L.info('  #samples: %s,  #classes: %s, batch size: %s, #batches: %s' %
               (U.red(self.num_samples), U.red(self.num_classes),
                U.red(self.batch_size), U.red(self.num_batches)))
        for i, (vocab_size, num_elems) in enumerate(self.features_info):
            L.info("Feature %s: #ngrams= %s vocab_size= %s" %
                   (U.red(i), U.red(num_elems), U.red(vocab_size)))
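
Putting the header reads above together, the reader appears to expect a flat int32 file laid out as: row 0 holds (num_samples, ngram), row 1 holds the number of header lines, the following rows hold one (vocab_size, num_elems) pair per feature, the next row holds num_classes, and the sample rows come after that. A hedged sketch of writing such a file with a single word feature (illustration only; the file name, sample values, and helper code are made up, and the actual corelm writer may differ):

import numpy as np

ngram = 5                                   # columns per row (context + output word)
samples = np.array([[4, 8, 15, 16, 2],
                    [23, 42, 7, 1, 3]], dtype='int32')
features = [(10000, 4)]                     # (vocab_size, num_elems) for the word feature
num_classes = 10000

header = np.zeros((len(features) + 3, ngram), dtype='int32')
header[0, 0] = samples.shape[0]             # num_samples
header[0, 1] = ngram                        # ngram size
header[1, 0] = len(features) + 1            # num_header_lines
for i, (vocab_size, num_elems) in enumerate(features):
    header[2 + i, 0] = vocab_size
    header[2 + i, 1] = num_elems
header[len(features) + 2, 0] = num_classes
np.concatenate([header, samples]).tofile('toy.mmap')   # readable with np.memmap(..., dtype='int32')
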
Example #54
0
    'nist': B.nist_smoothing,
    'chen': B.chen_smoothing
}

ref_path_list = args.ref_paths.split(',')

input_nbest = NBestList(args.input_path,
                        mode='r',
                        reference_list=ref_path_list)
if args.out_nbest_path:
    output_nbest = NBestList(args.out_nbest_path, mode='w')
if args.out_scores_path:
    output_scores = open(args.out_scores_path, mode='w')
output_1best = codecs.open(args.out_1best_path, mode='w', encoding='UTF-8')

U.xassert(args.method in methods,
          "Invalid smoothing method: " + args.method)
scorer = methods[args.method]

L.info('Processing the n-best list')


def process_group(group):
    index = 0
    scores = dict()
    for item in group:
        scores[index] = scorer(item.hyp, group.refs)
        index += 1
    return scores


pool = Pool(args.threads)
Example #55
0
File: mlp.py Project: tamhd/corelm
	def __init__(self, args=None, model_path=None):

		######################################################################
		## Parameters
		#

		U.xassert((args or model_path) and not (args and model_path), "Exactly one of args or model_path must be given")

		if model_path:
			args, loaded_params = self.load_model(model_path)

		emb_dim = args.emb_dim
		num_hidden_list = map(int, args.num_hidden.split(','))
		if num_hidden_list[0] <= 0:
			num_hidden_list = []

		vocab_size = args.vocab_size
		self.ngram_size = args.ngram_size
		num_classes = args.num_classes
		activation_name = args.activation_name
		self.args = args
		self.L1 = 0
		self.L2_sqr = 0
		self.params = []

		emb_path, vocab = None, None
		try:
			emb_path = args.emb_path
			vocab = args.vocab
		except AttributeError:
			pass

		rng = numpy.random.RandomState(1234)
		self.input = K.placeholder(ndim=2, dtype='int32', name='input')

		######################################################################
		## Lookup Table Layer
		#

		lookupTableLayer = LookupTable(
			rng=rng,
			input=self.input,
			vocab_size=vocab_size,
			emb_dim=emb_dim,
			emb_path=emb_path,
			vocab_path=vocab,
			add_weights=args.weighted_emb
		)
		last_layer_output = lookupTableLayer.output
		last_layer_output_size = (self.ngram_size - 1) * emb_dim
		self.params += lookupTableLayer.params

		######################################################################
		## Hidden Layer(s)
		#

		for i in range(0, len(num_hidden_list)):
			linearLayer = Linear(
				rng=rng,
				input=last_layer_output,
				n_in=last_layer_output_size,
				n_out=num_hidden_list[i],
				suffix=i
			)
			last_layer_output = linearLayer.output
			last_layer_output_size = num_hidden_list[i]
			self.params += linearLayer.params

			activation = Activation(
				input=last_layer_output,
				func_name=activation_name
			)
			last_layer_output = activation.output

			self.L1 = self.L1 + abs(linearLayer.W).sum()
			self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum()

		######################################################################
		## Output Linear Layer
		#

		linearLayer = Linear(
			rng=rng,
			input=last_layer_output,
			n_in=last_layer_output_size,
			n_out=num_classes,
			#b_values = numpy.zeros(num_classes) - math.log(num_classes)
			b_values = numpy.full(shape=(num_classes),fill_value=(-math.log(num_classes)),dtype=K._FLOATX),
			suffix='out'
		)
		last_layer_output = linearLayer.output
		self.params += linearLayer.params

		self.L1 = self.L1 + abs(linearLayer.W).sum()
		self.L2_sqr = self.L2_sqr + (linearLayer.W ** 2).sum()

		######################################################################
		## Model Output
		#

		self.output = last_layer_output
		self.p_y_given_x_matrix = K.softmax(last_layer_output)


		# Log Softmax
		last_layer_output_shifted = last_layer_output - last_layer_output.max(axis=1, keepdims=True)
		self.log_p_y_given_x_matrix = last_layer_output_shifted - K.log(K.sum(K.exp(last_layer_output_shifted),axis=1,keepdims=True))


		self.log_Z_sqr = K.mean(K.log(K.sum(K.exp(last_layer_output), axis=1)) ** 2)

		######################################################################
		## Model Predictions

		self.y_pred = K.argmax(self.p_y_given_x_matrix, axis=1)

		######################################################################
		## Loading parameters from file (if given)
		#

		if model_path:
			self.set_params(loaded_params)
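
The log_Z_sqr term above appears to be a self-normalization penalty: penalizing the squared log of the softmax normalizer pushes Z toward 1, so the unnormalized output scores can be treated as approximate log-probabilities at query time without computing the full softmax. A minimal numpy sketch of the quantity (illustration only; the sample scores are made up):

import numpy as np

scores = np.array([[2.0, -1.0, 0.5],
                   [0.1, 0.2, -0.3]])
log_Z = np.log(np.exp(scores).sum(axis=1))   # per-row log of the softmax normalizer
log_Z_sqr = np.mean(log_Z ** 2)              # the regularization term computed above
print(log_Z_sqr)
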
Example #56
0
File: train.py Project: sxdkxgwan/corelm
    init_opt.write(' '.join(init_list) + '\n')
    init_opt.write(' '.join(['0' for i in range(dim)]) + '\n')
    init_opt.write(' '.join(['1' for i in range(dim)]) + '\n')

seed_arg = ''
if args.pred_seed:
    seed_arg = ' -r 1234 '

if (args.alg == 'pro' or args.alg == 'wpro'):
    # PRO
    if args.alg == 'pro':
        L.info("Running PRO")
        cmd = moses_root + '/bin/pro' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir + '/pro.data' + seed_arg
    else:
        L.info("Running WEIGHTED PRO")
        U.xassert(args.instance_weights_path,
                  'Instance weights are not given to wpro')
        cmd = moses_root + '/bin/proWeighted' + ' -S ' + args.out_dir + '/statscore.data -F ' + args.out_dir + '/features.data -o ' + args.out_dir + '/pro.data' + seed_arg + ' -w ' + args.instance_weights_path
    U.capture(cmd)
    cmd = moses_root + '/bin/megam_i686.opt -fvals -maxi 30 -nobias binary ' + args.out_dir + '/pro.data'
    pro_weights = U.capture(cmd)

    pro_weights_arr = pro_weights.strip().split('\n')
    weights_dict = dict()
    sum = 0.0
    highest_feature_index = 0

    for elem in pro_weights_arr:
        feature_index, weight = elem[1:].split()
        feature_index = int(feature_index)
        weight = float(weight)
        weights_dict[feature_index] = weight
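
The parsing loop above implies that each weight line begins with a one-character feature tag followed by the feature index and its weight (something like 'F0 0.37'); that format is inferred from elem[1:].split() and is not verified against megam's actual output. A standalone sketch with made-up values:

pro_weights = "F0 0.37\nF1 -0.12\nF2 0.05"   # hypothetical weight lines for illustration
weights_dict = {}
for elem in pro_weights.strip().split('\n'):
    feature_index, weight = elem[1:].split() # drop the leading tag character
    weights_dict[int(feature_index)] = float(weight)
print(weights_dict)                          # {0: 0.37, 1: -0.12, 2: 0.05}
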
Example #57
0
 def get_all_refs(self, index):
     U.xassert(index < self.num_lines, "Index out of bounds")
     return [self.ref_list[k][index] for k in range(self.num_refs)]