Пример #1
0
    def __init__(self, source, target,
                 source_dict, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True):
        """Open the two corpus files, load both dictionaries, store settings.

        When ``shuffle_each_epoch`` is true, ``shuffle.main`` is run on the
        two paths and the files ``source + '.shuf'`` / ``target + '.shuf'``
        are opened instead of ``source`` / ``target``.
        """
        if shuffle_each_epoch:
            shuffle.main([source, target])
            source_path = source + '.shuf'
            target_path = target + '.shuf'
        else:
            source_path = source
            target_path = target
        self.source = fopen(source_path, 'r')
        self.target = fopen(target_path, 'r')

        self.source_dict = load_dict(source_dict)
        self.target_dict = load_dict(target_dict)

        # Settings are stored exactly as passed in.
        self.batch_size, self.maxlen = batch_size, maxlen
        self.n_words_source, self.n_words_target = n_words_source, n_words_target
        self.shuffle, self.sort_by_length = shuffle_each_epoch, sort_by_length

        # Buffers start empty; k is fixed at 20 times the batch size.
        self.k = batch_size * 20
        self.source_buffer, self.target_buffer = [], []

        self.end_of_data = False
Пример #2
0
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 indomain_source='', indomain_target='',
                 interpolation_rate=0.1,
                 maxibatch_size=20):
        """Set up an iterator over a main corpus and an in-domain corpus.

        With ``shuffle_each_epoch`` the original paths are remembered in
        ``*_orig`` attributes and the open handles come from
        ``shuffle.main(..., temporary=True)``; otherwise the four paths are
        opened directly with ``fopen``.

        ``source_dicts`` is iterated and each element passed to
        ``load_dict``; ``target_dict`` is passed to ``load_dict`` once.
        When ``n_words_source`` / ``n_words_target`` is positive, entries
        whose index is at or above that limit are removed from the loaded
        dictionaries.

        ``k`` is ``batch_size * maxibatch_size``; ``indomain_k`` is
        ``ceil(interpolation_rate * k)`` and ``outdomain_k`` the remainder.
        """
        if shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True)
            self.indomain_source_orig = indomain_source
            self.indomain_target_orig = indomain_target
            self.indomain_source, self.indomain_target = shuffle.main([self.indomain_source_orig, self.indomain_target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
            self.indomain_source = fopen(indomain_source, 'r')
            self.indomain_target = fopen(indomain_target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in list(d.items()):
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False

        self.interpolation_rate = interpolation_rate
        self.cur_interpolation_rate = self.interpolation_rate
        self.indomain_k = int(math.ceil(self.cur_interpolation_rate * self.k))
        self.outdomain_k = self.k - self.indomain_k
Пример #3
0
    def __init__(self,
                 source,
                 target,
                 source_dict,
                 target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True):
        """Open the corpus pair, load the two dictionaries, keep the settings.

        If ``shuffle_each_epoch`` is set, ``shuffle.main`` is called on both
        paths and the ``'.shuf'``-suffixed paths are the ones opened.
        """
        if not shuffle_each_epoch:
            source_path = source
            target_path = target
        else:
            shuffle.main([source, target])
            source_path = source + '.shuf'
            target_path = target + '.shuf'
        self.source = fopen(source_path, 'r')
        self.target = fopen(target_path, 'r')

        self.source_dict = load_dict(source_dict)
        self.target_dict = load_dict(target_dict)

        # Batching parameters, stored unchanged.
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.k = batch_size * 20

        # Size limits are only recorded here; nothing is pruned.
        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        self.sort_by_length = sort_by_length
        self.shuffle = shuffle_each_epoch

        self.target_buffer = []
        self.source_buffer = []

        self.end_of_data = False
    def __init__(self,
                 source,
                 target,
                 source_dict,
                 target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False):
        """Open the corpus pair and unpickle the two dictionaries.

        If ``shuffle_each_epoch`` is set, ``shuffle.main`` is called on both
        paths and the ``'.shuf'``-suffixed paths are opened instead.

        NOTE(review): unlike the flag's name suggests, ``shuffle_each_epoch``
        is not stored on the instance by this constructor.
        """
        if not shuffle_each_epoch:
            source_path = source
            target_path = target
        else:
            shuffle.main([source, target])
            source_path = source + '.shuf'
            target_path = target + '.shuf'
        self.source = fopen(source_path, 'r')
        self.target = fopen(target_path, 'r')

        # NOTE(review): pkl.load executes whatever the pickle contains; only
        # point these paths at dictionary files from a trusted origin.
        with open(source_dict, 'rb') as source_dict_file:
            self.source_dict = pkl.load(source_dict_file)
        with open(target_dict, 'rb') as target_dict_file:
            self.target_dict = pkl.load(target_dict_file)

        self.batch_size, self.maxlen = batch_size, maxlen
        self.n_words_source, self.n_words_target = n_words_source, n_words_target

        # Buffers start empty; k is fixed at 20 times the batch size.
        self.k = batch_size * 20
        self.source_buffer, self.target_buffer = [], []

        self.end_of_data = False
Пример #5
0
 def reset(self):
     """Rewind the iterator's files for another pass.

     Without shuffling, both open files are seeked back to offset 0.
     With shuffling, ``shuffle.main`` is re-run on the current file names
     minus their trailing ``'.shuf'`` and the files are reopened under
     their current names.
     """
     if not self.shuffle:
         self.source.seek(0)
         self.target.seek(0)
         return

     def unshuffled(path):
         # Drop only a trailing '.shuf'. str.replace would also delete the
         # substring from inside a name such as 'corpus.shuffled.en.shuf'.
         if path.endswith('.shuf'):
             return path[:-len('.shuf')]
         return path

     source_name = self.source.name
     target_name = self.target.name
     # Close the previous handles explicitly; rebinding the attributes
     # alone would leave them open until garbage collection.
     self.source.close()
     self.target.close()
     shuffle.main([unshuffled(source_name), unshuffled(target_name)])
     self.source = fopen(source_name, 'r')
     self.target = fopen(target_name, 'r')
Пример #6
0
 def reset(self):
     """Prepare the source and target files for a new epoch.

     If shuffling is disabled the two files are simply rewound. Otherwise
     the un-shuffled paths (current names without a trailing ``'.shuf'``)
     are passed to ``shuffle.main`` again and the current names reopened.
     """
     if not self.shuffle:
         self.source.seek(0)
         self.target.seek(0)
         return

     suffix = '.shuf'

     def strip_suffix(name):
         # Only a trailing suffix is removed; replacing every occurrence
         # would corrupt paths that contain '.shuf' elsewhere.
         return name[:-len(suffix)] if name.endswith(suffix) else name

     shuffled_source = self.source.name
     shuffled_target = self.target.name
     # Release the old handles before the attributes are rebound.
     self.source.close()
     self.target.close()
     shuffle.main([strip_suffix(shuffled_source), strip_suffix(shuffled_target)])
     self.source = fopen(shuffled_source, 'r')
     self.target = fopen(shuffled_target, 'r')
Пример #7
0
 def reset(self):
     """Rewind every source file and the target file for a new epoch.

     Without shuffling, each file in ``self.sources`` and ``self.target``
     is seeked to offset 0. With shuffling, ``shuffle.main`` is re-run on
     all current names minus their trailing ``'.shuf'`` (sources first,
     target last) and every file is reopened under its current name.
     """
     if not self.shuffle:
         for source in self.sources:
             source.seek(0)
         self.target.seek(0)
         return

     def unshuffled(path):
         # Drop only a trailing '.shuf'; str.replace would also remove the
         # substring from the middle of a path.
         if path.endswith('.shuf'):
             return path[:-len('.shuf')]
         return path

     source_names = [source.name for source in self.sources]
     target_name = self.target.name
     # Close the old handles explicitly before they are replaced.
     for source in self.sources:
         source.close()
     self.target.close()
     shuffle.main([unshuffled(name) for name in source_names] + [unshuffled(target_name)])
     self.sources = [fopen(name, 'r') for name in source_names]
     self.target = fopen(target_name, 'r')
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 indomain_source='', indomain_target='',
                 interpolation_rate=0.1,
                 maxibatch_size=20):
        """Set up an iterator over a main corpus and an in-domain corpus.

        With ``shuffle_each_epoch``, ``shuffle.main`` is run on each corpus
        pair and the ``'.shuf'``-suffixed paths are opened; otherwise the
        four given paths are opened directly.

        Each element of ``source_dicts`` and ``target_dict`` is loaded with
        ``load_dict``. When ``n_words_source`` / ``n_words_target`` is
        positive, entries whose index is at or above the limit are removed.

        ``k`` is ``batch_size * maxibatch_size``; ``indomain_k`` is
        ``ceil(interpolation_rate * k)`` and ``outdomain_k`` the remainder.
        """
        if shuffle_each_epoch:
            shuffle.main([source, target])
            shuffle.main([indomain_source, indomain_target])
            self.source = fopen(source+'.shuf', 'r')
            self.target = fopen(target+'.shuf', 'r')
            self.indomain_source = fopen(indomain_source+'.shuf', 'r')
            self.indomain_target = fopen(indomain_target+'.shuf', 'r')
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
            self.indomain_source = fopen(indomain_source, 'r')
            self.indomain_target = fopen(indomain_target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in list(d.items()):
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False

        self.interpolation_rate = interpolation_rate
        self.indomain_k = int(math.ceil(self.interpolation_rate * self.k))
        self.outdomain_k = self.k - self.indomain_k
Пример #9
0
 def do_test(self, testNumber):
     """Run ``main`` on one numbered test case and compare its output.

     The input is ``<testDataFolder>/<testNumber>.in``; ``main`` is asked
     to write ``<testDataFolder>/<testNumber>_actual.out``, whose lines
     must equal those of ``<testDataFolder>/<testNumber>.out``.
     """
     testFile = self.testDataFolder + "/" + str(testNumber)
     main(testFile + ".in", testFile + "_actual.out")
     # compare the result; "with" closes each file even if reading fails
     with open(testFile + ".out", 'r') as expectedOut:
         expectedLines = expectedOut.readlines()
     with open(testFile + "_actual.out", 'r') as actualOut:
         actualLines = actualOut.readlines()
     self.assertEqual(actualLines, expectedLines)
    def __init__(
        self,
        source,
        target,
        source_dicts,
        target_dict,
        batch_size=128,
        maxlen=100,
        n_words_source=-1,
        n_words_target=-1,
        shuffle_each_epoch=False,
        sort_by_length=True,
        maxibatch_size=20,
    ):
        """Open the corpus pair, load and prune dictionaries, store settings.

        With ``shuffle_each_epoch``, ``shuffle.main`` is called with the two
        paths and the module-level ``epoch_num``, and the ``'.shuf'``-suffixed
        paths are opened; otherwise the given paths are opened directly.

        When ``n_words_source`` / ``n_words_target`` is positive, dictionary
        entries whose index is at or above the limit are removed.
        """
        global epoch_num
        if shuffle_each_epoch:
            shuffle.main([source, target], epoch_num)
            self.source = fopen(source + '.shuf', 'r')
            self.target = fopen(target + '.shuf', 'r')
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in list(d.items()):
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False

        # NOTE(review): `embeddings` is neither a parameter nor a local of
        # this method, so it must resolve to a module-level name or this
        # line raises NameError -- confirm where it is defined.
        self.embeddings = embeddings
 def reset(self):
     """Advance the global epoch counter and rewind the files.

     ``epoch_num`` is incremented on every call. Without shuffling, both
     files are seeked to offset 0. With shuffling, ``shuffle.main`` is
     re-run on the current names minus their trailing ``'.shuf'`` together
     with the new ``epoch_num``, and the current names are reopened.
     """
     global epoch_num
     epoch_num += 1
     if not self.shuffle:
         self.source.seek(0)
         self.target.seek(0)
         return

     def unshuffled(path):
         # Drop only a trailing '.shuf'; str.replace would also remove the
         # substring from the middle of a path.
         if path.endswith('.shuf'):
             return path[:-len('.shuf')]
         return path

     source_name = self.source.name
     target_name = self.target.name
     # Close the previous handles explicitly before they are replaced.
     self.source.close()
     self.target.close()
     shuffle.main([
         unshuffled(source_name),
         unshuffled(target_name)
     ], epoch_num)
     self.source = fopen(source_name, 'r')
     self.target = fopen(target_name, 'r')
 def reset(self):
     """Empty all four buffers and rewind or reshuffle both corpora.

     Without shuffling, the four open files are seeked to offset 0. With
     shuffling, each corpus pair is replaced by the result of
     ``shuffle.main(..., temporary=True)`` on its original paths.
     """
     # clear buffers for new epoch
     self.source_buffer, self.target_buffer = [], []
     self.pseudo_source_buffer, self.pseudo_target_buffer = [], []

     if not self.shuffle:
         self.source.seek(0)
         self.target.seek(0)
         self.pseudo_source.seek(0)
         self.pseudo_target.seek(0)
         return

     real_pair = [self.source_orig, self.target_orig]
     self.source, self.target = shuffle.main(real_pair, temporary=True)
     pseudo_pair = [self.pseudo_source_orig, self.pseudo_target_orig]
     self.pseudo_source, self.pseudo_target = shuffle.main(pseudo_pair, temporary=True)
Пример #13
0
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        """Open the corpus pair, load and prune dictionaries, store settings.

        With ``shuffle_each_epoch``, ``shuffle.main`` is run on the two paths
        and the ``'.shuf'``-suffixed paths are opened; otherwise the given
        paths are opened directly.

        When ``n_words_source`` / ``n_words_target`` is positive, dictionary
        entries whose index is at or above the limit are removed. The value
        of ``k`` (``batch_size * maxibatch_size``) is printed once.
        """
        if shuffle_each_epoch:
            shuffle.main([source, target])
            self.source = fopen(source+'.shuf', 'r')
            self.target = fopen(target+'.shuf', 'r')
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in list(d.items()):
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        # Single-argument print with %-formatting emits the same "K= <k>"
        # line on Python 2 and Python 3; the bare print statement used
        # before is a syntax error on Python 3.
        print("K= %s" % (self.k,))

        self.end_of_data = False
Пример #14
0
 def reset(self):
     """Rewind all source files and the target, or reshuffle them.

     With shuffling, ``shuffle.main`` is called on the original source
     paths followed by the original target path; the last returned item
     becomes ``self.target`` and the rest ``self.all_sources``.
     """
     if not self.shuffle:
         for stream in self.all_sources:
             stream.seek(0)
         self.target.seek(0)
         return

     reshuffled = shuffle.main(self.source_orig+[self.target_orig], temporary=True)
     self.all_sources = reshuffled[:-1]
     self.target = reshuffled[-1]
Пример #15
0
 def reset(self):
     """Rewind both files, or replace them with a fresh shuffle.

     With shuffling, ``self.source`` and ``self.target`` are rebound to the
     pair returned by ``shuffle.main(..., temporary=True)`` on the
     original paths.
     """
     if not self.shuffle:
         for stream in (self.source, self.target):
             stream.seek(0)
         return

     originals = [self.source_orig, self.target_orig]
     self.source, self.target = shuffle.main(originals, temporary=True)
Пример #16
0
    def __init__(self,
                 source,
                 batch_size=128,
                 max_len=100,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=False,
                 max_batch_size=20,
                 min_len=None):
        """Open a single data file and store the iteration settings.

        With ``shuffle_each_epoch`` the path is kept in ``source_orig`` and
        ``self.source`` is whatever ``shuffle.main(path, temporary=True)``
        returns; otherwise the path is opened with ``fopen``.
        ``source_dicts`` starts as an empty list.
        """
        if not shuffle_each_epoch:
            self.source = fopen(source, 'r')
        else:
            self.source_orig = source
            self.source = shuffle.main(self.source_orig, temporary=True)

        self.source_dicts = []

        # Length and batching limits, stored as given.
        self.batch_size = batch_size
        self.max_len, self.min_len = max_len, min_len
        self.skip_empty = skip_empty

        self.shuffle, self.sort_by_length = shuffle_each_epoch, sort_by_length

        self.k = batch_size * max_batch_size
        self.source_buffer = []

        self.end_of_data = False
Пример #17
0
    def __init__(self, sources, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=[-1],
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        """Open several source files plus one target and load dictionaries.

        With ``shuffle_each_epoch``, ``shuffle.main`` is run on all source
        paths followed by the target path and the ``'.shuf'``-suffixed paths
        are opened; otherwise the given paths are opened directly.

        ``source_dicts`` is a sequence of sequences; every inner element is
        loaded with ``load_dict``. ``n_words_source[i]``, when positive, is
        the index limit applied to every dictionary in ``source_dicts[i]``;
        ``n_words_target`` does the same for the target dictionary.

        NOTE: the default ``n_words_source`` list is a single object shared
        by every call that omits the argument and is stored on the instance;
        it is only read here and must not be mutated by callers.
        """
        if shuffle_each_epoch:
            shuffle.main(sources + [target])
            self.sources = [fopen(source+'.shuf', 'r') for source in sources]
            self.target = fopen(target+'.shuf', 'r')
        else:
            self.sources = [fopen(source, 'r') for source in sources]
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for factor_dicts in source_dicts:
            self.source_dicts.append([load_dict(source_dict) for source_dict in factor_dicts])
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        for i, n_words in enumerate(self.n_words_source):
            if n_words > 0:
                for d in self.source_dicts[i]:
                    for key, idx in list(d.items()):
                        if idx >= n_words:
                            del d[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        # One buffer per opened source file.
        self.source_buffers = [list() for _ in range(len(self.sources))]
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Пример #18
0
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 use_factor=False,
                 maxibatch_size=20,
                 keep_data_in_memory=False):
        """Open the corpus pair, load and prune dictionaries, store settings.

        Three ways of obtaining ``self.source`` / ``self.target``:

        * ``keep_data_in_memory``: both paths are wrapped in ``FileWrapper``;
          with ``shuffle_each_epoch`` one random permutation is applied to
          both wrappers.
        * ``shuffle_each_epoch`` only: the paths are kept in ``*_orig`` and
          the handles come from ``shuffle.main(..., temporary=True)``.
        * neither: the paths are opened with ``fopen``.

        When ``n_words_source`` / ``n_words_target`` is positive, dictionary
        entries whose index is at or above the limit are removed.
        """
        if keep_data_in_memory:
            self.source, self.target = FileWrapper(source), FileWrapper(target)
            if shuffle_each_epoch:
                # The same permutation is used for both sides.
                r = numpy.random.permutation(len(self.source))
                self.source.shuffle_lines(r)
                self.target.shuffle_lines(r)
        elif shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.keep_data_in_memory = keep_data_in_memory
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.use_factor = use_factor

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in list(d.items()):
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Пример #19
0
    def __init__(self,
                 source,
                 uid_voc,
                 mid_voc,
                 cat_voc,
                 batch_size=128,
                 max_len=100,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 max_batch_size=20,
                 min_len=None):
        """Open the data file, load three vocabularies and the item metadata.

        With ``shuffle_each_epoch`` the path is kept in ``source_orig`` and
        ``self.source`` is whatever ``shuffle.main(path, temporary=True)``
        returns; otherwise the path is opened with ``fopen``.

        ``uid_voc``, ``mid_voc`` and ``cat_voc`` are loaded with
        ``load_dict`` into ``source_dicts[0..2]``. A tab-separated metadata
        file at a fixed relative path is read to build ``meta_id_map``,
        which maps vocabulary item indices to vocabulary category indices.
        """
        if shuffle_each_epoch:
            self.source_orig = source
            self.source = shuffle.main(self.source_orig, temporary=True)
        else:
            self.source = fopen(source, 'r')
        self.source_dicts = []
        for source_dict in [uid_voc, mid_voc, cat_voc]:
            self.source_dicts.append(load_dict(source_dict))

        # Mapping from raw item id to raw category id; the first line seen
        # for an item id wins. "with" closes the file once it is read
        # (previously the handle was never closed).
        meta_map = {}
        with open(
                "../taobao_data_process/central_taobao_data/taobao-item-info",
                "r") as f_meta:
            for line in f_meta:
                arr = line.strip().split("\t")
                if arr[0] not in meta_map:
                    meta_map[arr[0]] = arr[1]

        # Mapping from vocabulary-based item index to vocabulary-based
        # category index. Ids missing from a vocabulary fall back to 0, so
        # all unknown items share key 0 and the last one processed wins.
        self.meta_id_map = {}
        for key in meta_map:
            val = meta_map[key]
            if key in self.source_dicts[1]:
                mid_idx = self.source_dicts[1][key]
            else:
                mid_idx = 0
            if val in self.source_dicts[2]:
                cat_idx = self.source_dicts[2][val]
            else:
                cat_idx = 0
            self.meta_id_map[mid_idx] = cat_idx

        self.batch_size = batch_size
        self.max_len = max_len
        self.min_len = min_len
        self.skip_empty = skip_empty

        # Vocabulary sizes, in the order uid / mid / cat.
        self.n_uid = len(self.source_dicts[0])
        self.n_mid = len(self.source_dicts[1])
        self.n_cat = len(self.source_dicts[2])

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.k = batch_size * max_batch_size

        self.end_of_data = False
Пример #20
0
 def reset(self, x):
     """Rewind or reshuffle the ``x``-th corpus pair and mark it finished.

     After the files are reset, ``finished_files[x]`` is set to True and
     ``self.stop()`` is called.
     """
     if not self.shuffle:
         self.sources[x].seek(0)
         self.targets[x].seek(0)
     else:
         pair = [self.sources_orig[x], self.targets_orig[x]]
         self.sources[x], self.targets[x] = shuffle.main(pair, temporary=True)

     self.finished_files[x] = True
     self.stop()
Пример #21
0
    def __init__(self,
                 source,
                 target,
                 source_dicts,
                 target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 use_factor=False,
                 maxibatch_size=20,
                 token_batch_size=0):
        """Open the corpus pair, load and prune dictionaries, store settings.

        With ``shuffle_each_epoch`` the paths are kept in ``*_orig`` and the
        handles come from ``shuffle.main(..., temporary=True)``; otherwise
        the paths are opened with ``fopen``.

        When ``n_words_source`` / ``n_words_target`` is positive, dictionary
        entries whose index is at or above the limit are removed.
        """
        if shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main(
                [self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.use_factor = use_factor

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in list(d.items()):
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size
        self.token_batch_size = token_batch_size

        self.end_of_data = False
Пример #22
0
 def reset(self):
     """Rewind or reshuffle source, target, log and the optional reward file.

     ``self.external_reward`` is only touched when it is truthy.
     """
     if not self.shuffle:
         for stream in (self.source, self.target, self.log):
             stream.seek(0)
         if self.external_reward:
             self.external_reward.seek(0)
         return

     if not self.external_reward:
         self.source, self.target, self.log = shuffle.main(
             [self.source_orig, self.target_orig, self.log_orig],
             temporary=True)
         return

     # NOTE(review): this attribute is spelled `external_reward_ori`, while
     # its siblings end in `_orig` -- confirm that is the name set elsewhere.
     originals = [
         self.source_orig, self.target_orig, self.log_orig,
         self.external_reward_ori
     ]
     reshuffled = shuffle.main(originals, temporary=True)
     self.source, self.target, self.log, self.external_reward = reshuffled
Пример #23
0
    def __init__(self,
                 source,
                 target,
                 source_dict,
                 target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        """Open the corpus pair, load and prune dictionaries, store settings.

        With ``shuffle_each_epoch`` the paths are kept in ``*_orig`` and the
        handles come from ``shuffle.main(..., temporary=True)``; otherwise
        the paths are opened with ``fopen``.

        When ``n_words_source`` / ``n_words_target`` is positive, dictionary
        entries whose index is at or above the limit are removed.
        """
        if shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main(
                [self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')

        self.source_dict = load_dict(source_dict)
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        if self.n_words_source > 0:  # if source number is specified
            for key, idx in list(self.source_dict.items()):
                if idx >= self.n_words_source:
                    del self.source_dict[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length
        self.source_buffer = []  # source instance in memory
        self.target_buffer = []  # target instance in memory
        self.k = batch_size * maxibatch_size  # number of instance in memory in total

        self.end_of_data = False
Пример #24
0
 def reset(self):
     """Rewind the corpus pair, or reshuffle it for a new epoch.

     With shuffling and ``keep_data_in_memory``, one random permutation is
     applied to both in-memory objects; with shuffling alone, both handles
     are replaced by the result of ``shuffle.main(..., temporary=True)``.
     """
     if not self.shuffle:
         self.source.seek(0)
         self.target.seek(0)
         return

     if not self.keep_data_in_memory:
         originals = [self.source_orig, self.target_orig]
         self.source, self.target = shuffle.main(originals, temporary=True)
         return

     # The same permutation is applied to both sides.
     order = numpy.random.permutation(len(self.source))
     self.source.shuffle_lines(order)
     self.target.shuffle_lines(order)
Пример #25
0
 def reset(self):
     """Return the iterator's data to its starting point.

     Shuffling off: both handles are seeked to 0. Shuffling on: either a
     shared random permutation is applied to the in-memory data, or both
     handles are replaced via ``shuffle.main(..., temporary=True)``.
     """
     if self.shuffle and self.keep_data_in_memory:
         permutation = numpy.random.permutation(len(self.source))
         for data in (self.source, self.target):
             data.shuffle_lines(permutation)
     elif self.shuffle:
         self.source, self.target = shuffle.main(
             [self.source_orig, self.target_orig], temporary=True)
     else:
         for data in (self.source, self.target):
             data.seek(0)
Пример #26
0
 def reset(self):
     """Rewind or reshuffle two sources, the target and optional alignments.

     The two alignment files take part only when ``self.align1`` is truthy.
     """
     if not self.shuffle:
         for stream in (self.source1, self.source2, self.target):
             stream.seek(0)
         if self.align1:
             self.align1.seek(0)
             self.align2.seek(0)
         return

     if not self.align1:
         self.source1, self.source2, self.target = shuffle.main(
             [self.source_orig1, self.source_orig2, self.target_orig],
             temporary=True)
         return

     originals = [
         self.source_orig1, self.source_orig2, self.target_orig,
         self.align_orig1, self.align_orig2
     ]
     reshuffled = shuffle.main(originals, temporary=True)
     self.source1, self.source2, self.target, self.align1, self.align2 = reshuffled
Пример #27
0
    def __init__(self,
                 source,
                 target,
                 source_dict,
                 target_dict,
                 batch_size=128,
                 maxlen=None,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        """Open the corpus pair, load and prune dictionaries, store settings.

        With ``shuffle_each_epoch`` the paths are kept in ``*_orig`` and the
        handles come from ``shuffle.main(..., temporary=True)``; otherwise
        the paths are opened with ``data_utils.fopen``.

        When ``n_words_source`` / ``n_words_target`` is positive, dictionary
        entries whose index is at or above the limit are removed.
        """
        if shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main(
                [self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = data_utils.fopen(source, 'r')
            self.target = data_utils.fopen(target, 'r')

        self.source_dict = load_dict(source_dict)
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        if self.n_words_source > 0:
            for key, idx in list(self.source_dict.items()):
                if idx >= self.n_words_source:
                    del self.source_dict[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Пример #28
0
    def __init__(self,
                 datasets,
                 dicts,
                 n_words_dicts=None,
                 batch_size=128,
                 maxlen=100,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 factors=1,
                 outputs=1,
                 maxibatch_size=20):
        """Open a list of parallel datasets and load their dictionaries.

        With ``shuffle_each_epoch`` the path list is kept in
        ``datasets_orig`` and ``self.datasets`` is the result of
        ``shuffle.main(datasets, temporary=True)``; otherwise every path is
        opened with ``fopen``.

        ``len(datasets)`` must equal ``1 + outputs`` (checked by assertion).
        When ``n_words_dicts`` is given, it is paired position-wise with the
        loaded dictionaries and entries whose index is at or above the
        paired limit are removed.
        """

        if shuffle_each_epoch:
            self.datasets_orig = datasets
            self.datasets = shuffle.main(datasets, temporary=True)
        else:
            self.datasets = [fopen(fp, 'r') for fp in datasets]

        self.dicts = []
        for dict_ in dicts:
            self.dicts.append(load_dict(dict_))

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.factors = factors
        self.outputs = outputs

        assert len(
            datasets) == 1 + outputs, 'Datasets and dictionaries mismatch'

        self.n_words_dicts = n_words_dicts

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        if self.n_words_dicts:
            for d, max_ in zip(self.dicts, self.n_words_dicts):
                for key, idx in list(d.items()):
                    if idx >= max_:
                        del d[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        # One buffer per dataset.
        self.buffers = [[] for _ in range(len(datasets))]
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Пример #29
0
    def __init__(self,
                 source,
                 source_dicts,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        """Open a single source file, load and prune its dictionaries.

        With ``shuffle_each_epoch`` the path is kept in ``source_orig`` and
        ``self.source`` is the first item returned by
        ``shuffle.main([path], temporary=True)``; its type is printed.
        Otherwise the path is opened with ``fopen``.

        When ``n_words_source`` is positive, dictionary entries whose index
        is at or above the limit are removed.
        """
        if shuffle_each_epoch:
            self.source_orig = source
            self.source = shuffle.main([self.source_orig], temporary=True)
            # shuffle.main was given a one-element list; keep its first result.
            self.source = self.source[0]
            print('this had better be a file:', type(self.source))
        else:
            self.source = fopen(source, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty

        self.n_words_source = n_words_source

        # Iterate over a snapshot of the items: deleting from a dict while
        # iterating its live view raises RuntimeError on Python 3.
        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in list(d.items()):
                    if idx >= self.n_words_source:
                        del d[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Пример #30
0
    def __init__(self, source,
                 uid_voc,
                 iid_voc,
                 cat_voc,
                 brand_voc,
                 batch_size=128,
                 maxlen=100,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 max_batch_size=20,
                 minlen=1):
        """Open the input, load the four vocabularies and record settings.

        ``source`` is opened with ``fopen`` unless ``shuffle_each_epoch`` is
        set, in which case ``shuffle.main(source, temporary=True)`` supplies
        ``self.source`` and the original path is kept in ``self.source_orig``.
        ``uid_voc``, ``iid_voc``, ``cat_voc`` and ``brand_voc`` are loaded
        with ``load_dict`` (in that order) into ``self.source_dicts``; their
        lengths are exposed as ``n_uid``, ``n_iid``, ``n_cat`` and
        ``n_brand``. ``self.k`` is ``batch_size * max_batch_size``.
        """
        if not shuffle_each_epoch:
            self.source = fopen(source, 'r')
        else:
            self.source_orig = source
            self.source = shuffle.main(self.source_orig, temporary=True)

        vocab_paths = (uid_voc, iid_voc, cat_voc, brand_voc)
        self.source_dicts = [load_dict(path) for path in vocab_paths]

        # Batching / filtering parameters, stored verbatim.
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.minlen = minlen
        self.skip_empty = skip_empty

        # Vocabulary sizes, in the same order the vocabularies were loaded.
        (self.n_uid, self.n_iid,
         self.n_cat, self.n_brand) = [len(vocab) for vocab in self.source_dicts]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        # Read-ahead buffer and the number of items it is sized for.
        self.source_buffer = []
        self.k = batch_size * max_batch_size

        self.end_of_data = False
Пример #31
0
 def reset(self):
     """Prepare ``self.source`` for another pass.

     Without shuffling the existing source is rewound to offset 0;
     otherwise ``self.source`` is replaced by the result of
     ``shuffle.main(self.source_orig, temporary=True)``.
     """
     if not self.shuffle:
         self.source.seek(0)
         return
     self.source = shuffle.main(self.source_orig, temporary=True)
Пример #32
0
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 model_type,
                 batch_size=128,
                 maxlen=100,
                 source_vocab_sizes=None,
                 target_vocab_size=None,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 use_factor=False,
                 maxibatch_size=20,
                 token_batch_size=0,
                 keep_data_in_memory=False):
        """Open a parallel corpus, load its dictionaries and store settings.

        Args:
            source: Path of the source-side text file.
            target: Path of the target-side text file.
            source_dicts: Iterable of source dictionary paths.
            target_dict: Path of the target dictionary.
            model_type: Forwarded as second argument to ``load_dict``.
            batch_size: Stored; also scales ``self.k``.
            maxlen: Stored as ``self.maxlen``.
            source_vocab_sizes: Optional sequence with one size per source
                dictionary. For each positive size, entries with index
                ``>= size`` are removed from the matching dictionary.
            target_vocab_size: Optional size; when positive, entries with
                index ``>= target_vocab_size`` are removed from the target
                dictionary.
            skip_empty: Stored as ``self.skip_empty``.
            shuffle_each_epoch: Shuffle the data (in memory via
                ``shuffle_lines`` when ``keep_data_in_memory`` is set,
                otherwise through ``shuffle.main``).
            sort_by_length: Stored as ``self.sort_by_length``.
            use_factor: Stored as ``self.use_factor``.
            maxibatch_size: ``self.k`` is ``batch_size * maxibatch_size``.
            token_batch_size: Stored as ``self.token_batch_size``.
            keep_data_in_memory: Wrap both files in ``FileWrapper`` instead
                of opening them with ``fopen``.

        Raises:
            AssertionError: If ``source_vocab_sizes`` is given and its length
                differs from the number of source dictionaries.
        """
        if keep_data_in_memory:
            self.source, self.target = FileWrapper(source), FileWrapper(target)
            if shuffle_each_epoch:
                # One permutation applied to both sides keeps the sentence
                # pairs aligned.
                r = numpy.random.permutation(len(self.source))
                self.source.shuffle_lines(r)
                self.target.shuffle_lines(r)
        elif shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main(
                [self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')

        self.source_dicts = [load_dict(path, model_type)
                             for path in source_dicts]
        self.target_dict = load_dict(target_dict, model_type)

        # Determine the UNK value for each dictionary (the value depends on
        # which version of build_dictionary.py was used).
        def determine_unk_val(d):
            return 2 if '<UNK>' in d and d['<UNK>'] == 2 else 1

        self.source_unk_vals = [determine_unk_val(d)
                                for d in self.source_dicts]
        self.target_unk_val = determine_unk_val(self.target_dict)

        self.keep_data_in_memory = keep_data_in_memory
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.use_factor = use_factor

        self.source_vocab_sizes = source_vocab_sizes
        self.target_vocab_size = target_vocab_size

        self.token_batch_size = token_batch_size

        # Identity checks against None: "!= None" dispatches to the operand's
        # __ne__, which is not a plain boolean for array-like arguments.
        if self.source_vocab_sizes is not None:
            assert len(self.source_vocab_sizes) == len(self.source_dicts)
            for d, vocab_size in zip(self.source_dicts,
                                     self.source_vocab_sizes):
                if vocab_size is not None and vocab_size > 0:
                    # Snapshot the items so entries can be deleted safely.
                    for key, idx in list(d.items()):
                        if idx >= vocab_size:
                            del d[key]

        if self.target_vocab_size is not None and self.target_vocab_size > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.target_vocab_size:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Пример #33
0
    def __init__(self, source,
                 uid_voc,
                 mid_voc,
                 cat_voc,
                 item_info,
                 reviews_info,
                 batch_size=128,
                 maxlen=100,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 max_batch_size=20):
        """Open the input, load vocabularies and build the lookup tables.

        Args:
            source: Path of the input file.
            uid_voc, mid_voc, cat_voc: Vocabulary paths, loaded with
                ``load_dict`` into ``self.source_dicts`` in that order.
            item_info: Path of a tab-separated file. Column 0 is looked up
                in the ``mid_voc`` vocabulary and column 1 in the
                ``cat_voc`` vocabulary to build ``self.meta_id_map``;
                for a repeated column-0 value only the first line counts.
            reviews_info: Path of a tab-separated file whose column 1 is
                looked up in the ``mid_voc`` vocabulary to fill
                ``self.mid_list_for_random``.
            batch_size: Stored; also scales ``self.k``.
            maxlen: Stored as ``self.maxlen``.
            skip_empty: Stored as ``self.skip_empty``.
            shuffle_each_epoch: When true, ``self.source`` comes from
                ``shuffle.main(source, temporary=True)``.
            sort_by_length: Stored as ``self.sort_by_length``.
            max_batch_size: ``self.k`` is ``batch_size * max_batch_size``.

        Values missing from a vocabulary map to index 0.
        """
        if shuffle_each_epoch:
            self.source_orig = source
            self.source = shuffle.main(self.source_orig, temporary=True)
        else:
            self.source = fopen(source, 'r')

        self.source_dicts = [load_dict(path)
                             for path in [uid_voc, mid_voc, cat_voc]]
        mid_vocab = self.source_dicts[1]
        cat_vocab = self.source_dicts[2]

        # Read the item metadata, keeping the first line seen per key.
        # The handle is closed explicitly; it used to be left open.
        meta_map = {}
        f_meta = fopen(item_info, "r")
        try:
            for line in f_meta:
                arr = line.strip().split("\t")
                if arr[0] not in meta_map:
                    meta_map[arr[0]] = arr[1]
        finally:
            f_meta.close()

        self.meta_id_map = {}
        for key, val in meta_map.items():
            mid_idx = mid_vocab[key] if key in mid_vocab else 0
            cat_idx = cat_vocab[val] if val in cat_vocab else 0
            self.meta_id_map[mid_idx] = cat_idx

        # One vocabulary index per line of the reviews file.
        self.mid_list_for_random = []
        f_review = fopen(reviews_info, "r")
        try:
            for line in f_review:
                arr = line.strip().split("\t")
                tmp_idx = mid_vocab[arr[1]] if arr[1] in mid_vocab else 0
                self.mid_list_for_random.append(tmp_idx)
        finally:
            f_review.close()

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty

        self.n_uid = len(self.source_dicts[0])
        self.n_mid = len(self.source_dicts[1])
        self.n_cat = len(self.source_dicts[2])

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.k = batch_size * max_batch_size

        self.end_of_data = False
Пример #34
0
    def __init__(self,
                 source,
                 NUM_FEATURE,
                 NUM_QUERY,
                 voc_list,
                 batch_size=128,
                 maxlen=100,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 max_batch_size=20,
                 minlen=None):
        """Open the input, load vocabularies and build the lookup tables.

        Args:
            source: Path of the input file.
            NUM_FEATURE: Stored as ``self.num_feature``. Columns
                ``1:NUM_FEATURE`` of each ``item-info`` line are kept, and
                ``self.n`` holds the sizes of ``NUM_FEATURE`` vocabularies
                starting at position ``NUM_QUERY``.
            NUM_QUERY: Stored as ``self.num_query``. ``self.n_query`` holds
                the sizes of the first ``NUM_QUERY`` vocabularies, and the
                vocabulary at position ``NUM_QUERY`` is used to index items.
            voc_list: Iterable of vocabulary paths, loaded with
                ``load_dict`` into ``self.source_dicts``.
            batch_size: Stored; also scales ``self.k``.
            maxlen: Stored as ``self.maxlen``.
            skip_empty: Stored as ``self.skip_empty``.
            shuffle_each_epoch: When true, ``self.source`` comes from
                ``shuffle.main(source, temporary=True)``.
            sort_by_length: Stored as ``self.sort_by_length``.
            max_batch_size: ``self.k`` is ``batch_size * max_batch_size``.
            minlen: Stored as ``self.minlen``.

        The files ``item-info`` and ``reviews-info`` are read from the
        current working directory. Values missing from a vocabulary map to
        index 0.
        """
        if shuffle_each_epoch:
            self.source_orig = source
            self.source = shuffle.main(self.source_orig, temporary=True)
        else:
            self.source = fopen(source, 'r')

        self.source_dicts = [load_dict(path) for path in voc_list]
        self.num_feature = NUM_FEATURE
        self.num_query = NUM_QUERY
        item_vocab = self.source_dicts[self.num_query]

        # Keep the first line seen per item key. The context manager closes
        # the handle, which used to be left open.
        meta_map = {}
        with open("item-info", "r") as f_meta:
            for line in f_meta:
                arr = line.strip().split("\t")
                if arr[0] not in meta_map:
                    meta_map[arr[0]] = arr[1:self.num_feature]

        self.meta_id_map = {}
        for key, features in meta_map.items():
            mid_idx = item_vocab[key] if key in item_vocab else 0
            val = []
            for i, cur_val in enumerate(features):
                # Feature column i uses the vocabulary right after the
                # item vocabulary, offset by i.
                feature_vocab = self.source_dicts[i + self.num_query + 1]
                val.append(
                    feature_vocab[cur_val] if cur_val in feature_vocab else 0)
            self.meta_id_map[mid_idx] = val

        # One item-vocabulary index per line of the reviews file.
        self.mid_list_for_random = []
        with open("reviews-info", "r") as f_review:
            for line in f_review:
                arr = line.strip().split("\t")
                tmp_idx = item_vocab[arr[1]] if arr[1] in item_vocab else 0
                self.mid_list_for_random.append(tmp_idx)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.minlen = minlen
        self.skip_empty = skip_empty

        self.n_query = [len(self.source_dicts[i])
                        for i in range(self.num_query)]

        self.n = [len(self.source_dicts[i + self.num_query])
                  for i in range(self.num_feature)]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.k = batch_size * max_batch_size

        self.end_of_data = False
    def __init__(self,
                 source,
                 uid_voc,
                 mid_voc,
                 cat_voc,
                 batch_size=128,
                 maxlen=100,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 max_batch_size=20,
                 minlen=None,
                 label_type=1):
        """Open the input, load vocabularies and build the lookup tables.

        Args:
            source: Path of the input file.
            uid_voc, mid_voc, cat_voc: Vocabulary paths. They are loaded
                with ``load_dict`` into ``self.source_dicts`` positions
                0-2, followed by the fixed files ``item_carte_voc.pkl`` and
                ``cate_carte_voc.pkl`` at positions 3 and 4.
            batch_size: Stored; also scales ``self.k``.
            maxlen: Stored as ``self.maxlen``.
            skip_empty: Stored as ``self.skip_empty``.
            shuffle_each_epoch: When true, ``self.source`` comes from
                ``shuffle.main(source, temporary=True)``.
            sort_by_length: Stored as ``self.sort_by_length``.
            max_batch_size: ``self.k`` is ``batch_size * max_batch_size``.
            minlen: Stored as ``self.minlen``.
            label_type: Stored as ``self.label_type``.

        The files ``item-info`` and ``reviews-info`` are read from the
        current working directory. Values missing from a vocabulary map to
        index 0. The three main vocabulary sizes are printed to stdout.
        """
        if shuffle_each_epoch:
            self.source_orig = source
            self.source = shuffle.main(self.source_orig, temporary=True)
        else:
            self.source = fopen(source, 'r')

        vocab_paths = [
            uid_voc, mid_voc, cat_voc, 'item_carte_voc.pkl',
            'cate_carte_voc.pkl'
        ]
        self.source_dicts = [load_dict(path) for path in vocab_paths]
        mid_vocab = self.source_dicts[1]
        cat_vocab = self.source_dicts[2]

        # Keep the first line seen per item key. The context manager closes
        # the handle, which used to be left open.
        meta_map = {}
        with open("item-info", "r") as f_meta:
            for line in f_meta:
                arr = line.strip().split("\t")
                if arr[0] not in meta_map:
                    meta_map[arr[0]] = arr[1]

        self.meta_id_map = {}
        for key, val in meta_map.items():
            mid_idx = mid_vocab[key] if key in mid_vocab else 0
            cat_idx = cat_vocab[val] if val in cat_vocab else 0
            self.meta_id_map[mid_idx] = cat_idx

        # One vocabulary index per line of the reviews file.
        self.mid_list_for_random = []
        with open("reviews-info", "r") as f_review:
            for line in f_review:
                arr = line.strip().split("\t")
                tmp_idx = mid_vocab[arr[1]] if arr[1] in mid_vocab else 0
                self.mid_list_for_random.append(tmp_idx)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.minlen = minlen
        self.skip_empty = skip_empty

        self.n_uid = len(self.source_dicts[0])
        self.n_mid = len(self.source_dicts[1])
        self.n_cat = len(self.source_dicts[2])
        self.n_carte = [len(self.source_dicts[3]), len(self.source_dicts[4])]
        print("n_uid=%d, n_mid=%d, n_cat=%d" %
              (self.n_uid, self.n_mid, self.n_cat))

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.k = batch_size * max_batch_size

        self.end_of_data = False
        self.label_type = label_type
Пример #36
0
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 model_type,
                 batch_size=128,
                 maxlen=100,
                 source_vocab_sizes=None,
                 target_vocab_size=None,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 use_factor=False,
                 maxibatch_size=20,
                 token_batch_size=0,
                 keep_data_in_memory=False):
        """Open a parallel corpus, load its dictionaries and store settings.

        Args:
            source: Path of the source-side text file.
            target: Path of the target-side text file.
            source_dicts: Iterable of source dictionary paths.
            target_dict: Path of the target dictionary.
            model_type: Forwarded as second argument to ``load_dict``.
            batch_size: Stored; also scales ``self.k``.
            maxlen: Stored as ``self.maxlen``.
            source_vocab_sizes: Optional sequence with one size per source
                dictionary. For each positive size, entries with index
                ``>= size`` are removed from the matching dictionary.
            target_vocab_size: Optional size; when positive, entries with
                index ``>= target_vocab_size`` are removed from the target
                dictionary.
            skip_empty: Stored as ``self.skip_empty``.
            shuffle_each_epoch: Shuffle the data (in memory via
                ``shuffle_lines`` when ``keep_data_in_memory`` is set,
                otherwise through ``shuffle.main``).
            sort_by_length: Stored as ``self.sort_by_length``.
            use_factor: Stored as ``self.use_factor``.
            maxibatch_size: ``self.k`` is ``batch_size * maxibatch_size``.
            token_batch_size: Stored as ``self.token_batch_size``.
            keep_data_in_memory: Wrap both files in ``FileWrapper`` instead
                of opening them with ``fopen``.

        Raises:
            AssertionError: If ``source_vocab_sizes`` is given and its length
                differs from the number of source dictionaries.
        """
        if keep_data_in_memory:
            self.source, self.target = FileWrapper(source), FileWrapper(target)
            if shuffle_each_epoch:
                # The same permutation is applied to both files so that
                # corresponding lines stay paired.
                r = numpy.random.permutation(len(self.source))
                self.source.shuffle_lines(r)
                self.target.shuffle_lines(r)
        elif shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main(
                [self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')

        self.source_dicts = [load_dict(path, model_type)
                             for path in source_dicts]
        self.target_dict = load_dict(target_dict, model_type)

        # Determine the UNK value for each dictionary (the value depends on
        # which version of build_dictionary.py was used).
        def determine_unk_val(d):
            return 2 if '<UNK>' in d and d['<UNK>'] == 2 else 1

        self.source_unk_vals = [determine_unk_val(d)
                                for d in self.source_dicts]
        self.target_unk_val = determine_unk_val(self.target_dict)

        self.keep_data_in_memory = keep_data_in_memory
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.use_factor = use_factor

        self.source_vocab_sizes = source_vocab_sizes
        self.target_vocab_size = target_vocab_size

        self.token_batch_size = token_batch_size

        # Compare with None by identity; "!= None" calls the operand's
        # __ne__, which need not return a plain boolean (e.g. arrays).
        if self.source_vocab_sizes is not None:
            assert len(self.source_vocab_sizes) == len(self.source_dicts)
            for d, vocab_size in zip(self.source_dicts,
                                     self.source_vocab_sizes):
                if vocab_size is not None and vocab_size > 0:
                    # Snapshot the items so entries can be deleted safely.
                    for key, idx in list(d.items()):
                        if idx >= vocab_size:
                            del d[key]

        if self.target_vocab_size is not None and self.target_vocab_size > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.target_vocab_size:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Пример #37
0
 def reset(self):
     """Make ``self.source`` ready to be read from the start again.

     With shuffling enabled a new source is obtained from
     ``shuffle.main(self.source_orig, temporary=True)``; otherwise the
     current source is simply rewound to offset 0.
     """
     reshuffle = self.shuffle
     if not reshuffle:
         self.source.seek(0)
     else:
         self.source = shuffle.main(self.source_orig, temporary=True)
Пример #38
0
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 use_factor=False,
                 maxibatch_size=20):
        """Open a parallel corpus, verify its size and load dictionaries.

        Both files are scanned once to count their lines and then rewound
        with ``seek(0)``.

        Args:
            source: Path of the source-side text file.
            target: Path of the target-side text file.
            source_dicts: Iterable of source dictionary paths.
            target_dict: Path of the target dictionary.
            batch_size: Stored; also scales ``self.k``.
            maxlen: Stored as ``self.maxlen``.
            n_words_source: When positive, entries with index
                ``>= n_words_source`` are removed from every source
                dictionary.
            n_words_target: When positive, entries with index
                ``>= n_words_target`` are removed from the target
                dictionary.
            skip_empty: Stored as ``self.skip_empty``.
            shuffle_each_epoch: When true, the files come from
                ``shuffle.main([source, target], temporary=True)``.
            sort_by_length: Stored as ``self.sort_by_length``.
            use_factor: Stored as ``self.use_factor``.
            maxibatch_size: ``self.k`` is ``batch_size * maxibatch_size``.

        Raises:
            AssertionError: If source and target have different line counts.
        """
        if shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main(
                [self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('scan the dataset.')
        # Count lines directly. The previous enumerate() loops left the
        # counters unbound for empty files and reported the last index
        # (count - 1) instead of the number of lines.
        si = sum(1 for _ in self.source)
        ti = sum(1 for _ in self.target)

        self.source.seek(0)
        self.target.seek(0)
        assert si == ti, 'the number of the source and target document must the same'
        print('scanned {} lines'.format(si))

        self.source_dicts = [load_dict(path) for path in source_dicts]
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.use_factor = use_factor

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for d in self.source_dicts:
                # Snapshot the items: deleting while iterating the live
                # view raises RuntimeError on Python 3.
                for key, idx in list(d.items()):
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Пример #39
0
    def __init__(self,
                 source,
                 target,
                 source_dicts,
                 target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 use_factor=False,
                 maxibatch_size=20,
                 align1_file=None,
                 align2_file=None):
        """Open two source files plus a target file and load dictionaries.

        Args:
            source: Comma-separated string of source paths; the first two
                entries are used as ``source1`` and ``source2``.
            target: Path of the target-side text file.
            source_dicts: Iterable of source dictionary paths.
            target_dict: Path of the target dictionary.
            batch_size: Stored; also scales ``self.k``.
            maxlen: Stored as ``self.maxlen``.
            n_words_source: When positive, entries with index
                ``>= n_words_source`` are removed from every source
                dictionary.
            n_words_target: When positive, entries with index
                ``>= n_words_target`` are removed from the target
                dictionary.
            skip_empty: Stored as ``self.skip_empty``.
            shuffle_each_epoch: When true, all files are obtained from one
                ``shuffle.main(..., temporary=True)`` call.
            sort_by_length: Stored as ``self.sort_by_length``.
            use_factor: Stored as ``self.use_factor``.
            maxibatch_size: ``self.k`` is ``batch_size * maxibatch_size``.
            align1_file: Optional path of the first alignment file. When it
                is falsy, no alignment files are opened and both
                ``self.align1`` and ``self.align2`` are ``None``.
            align2_file: Path of the second alignment file; only used when
                ``align1_file`` is given.
        """
        self.source_files = source.split(",")
        if shuffle_each_epoch:
            self.source_orig1 = self.source_files[0]
            self.source_orig2 = self.source_files[1]
            self.target_orig = target
            if align1_file:
                self.align_orig1 = align1_file
                self.align_orig2 = align2_file
                (self.source1, self.source2, self.target,
                 self.align1, self.align2) = shuffle.main(
                    [
                        self.source_orig1, self.source_orig2, self.target_orig,
                        self.align_orig1, self.align_orig2
                    ],
                    temporary=True)
            else:
                self.source1, self.source2, self.target = shuffle.main(
                    [self.source_orig1, self.source_orig2, self.target_orig],
                    temporary=True)
                self.align1 = None
                # Previously left undefined, so reading self.align2 raised
                # AttributeError when no alignment files were given.
                self.align2 = None
        else:
            self.source1 = fopen(self.source_files[0], 'r')
            self.source2 = fopen(self.source_files[1], 'r')
            self.target = fopen(target, 'r')
            if align1_file:
                self.align1 = fopen(align1_file, 'r')
                self.align2 = fopen(align2_file, 'r')
            else:
                self.align1 = None
                self.align2 = None

        self.source_dicts = [load_dict(path) for path in source_dicts]
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.use_factor = use_factor

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for d in self.source_dicts:
                # Snapshot the items: deleting while iterating the live
                # view raises RuntimeError on Python 3.
                for key, idx in list(d.items()):
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source1_buffer = []
        self.source2_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size
        # Alignment buffers exist only when alignment files are in use.
        if self.align1:
            self.align1_buffer = []
            self.align2_buffer = []

        self.end_of_data = False