def _dump_example_ids(self, dumper, start_index, batch_num, batch_size):
    self.assertEqual(start_index, dumper.get_next_index())
    index = start_index
    for i in range(batch_num):
        # Each batch carries batch_size consecutive example ids starting
        # at the current index.
        example_id_batch = dj_pb.LiteExampleIds(
            partition_id=0,
            begin_index=index
        )
        for j in range(batch_size):
            example_id_batch.example_id.append('{}'.format(index).encode())
            example_id_batch.event_time.append(150000000 + index)
            self.end_index = index
            index += 1
        packed_example_id_batch = dj_pb.PackedLiteExampleIds(
            partition_id=0,
            begin_index=index - batch_size,
            example_id_num=batch_size,
            sered_lite_example_ids=example_id_batch.SerializeToString()
        )
        dumper.add_example_id_batch(packed_example_id_batch)
        self.assertEqual(dumper.get_next_index(), index)
    dumper.finish_sync_example_id()
    self.assertTrue(dumper.need_dump())
    with dumper.make_example_id_dumper() as eid:
        eid()
def make_packed_lite_example_ids(self):
    return dj_pb.PackedLiteExampleIds(
        partition_id=self._partition_id,
        begin_index=self._begin_index,
        example_id_num=len(self._example_ids),
        sered_lite_example_ids=dj_pb.LiteExampleIds(
            partition_id=self._partition_id,
            begin_index=self._begin_index,
            example_id=self._example_ids,
            event_time=self._event_times).SerializeToString())
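# Hedged round-trip sketch (not part of the original code): it assumes the
# dj_pb messages expose the standard protobuf ParseFromString, the inverse of
# the SerializeToString call above. 'builder' is a hypothetical stand-in for
# whatever object owns make_packed_lite_example_ids.
packed = builder.make_packed_lite_example_ids()
lite = dj_pb.LiteExampleIds()
lite.ParseFromString(packed.sered_lite_example_ids)
assert lite.begin_index == packed.begin_index
assert len(lite.example_id) == packed.example_id_num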
def generate_example_id(self, dumper, start_index, item_count):
    self.total_example_id_count += item_count
    for req_index in range(start_index // 512,
                           self.total_example_id_count // 512):
        cands = list(range(req_index * 512, (req_index + 1) * 512))
        start_index = cands[0]
        # Randomly swap pairs of ids while keeping every id within 64
        # positions of its sorted slot, to simulate a mildly out-of-order
        # id stream.
        for i in range(len(cands)):
            if random.randint(1, 4) > 1:
                continue
            a = random.randint(i - 64, i + 64)
            b = random.randint(i - 64, i + 64)
            if a < 0:
                a = 0
            if a >= len(cands):
                a = len(cands) - 1
            if b < 0:
                b = 0
            if b >= len(cands):
                b = len(cands) - 1
            if (abs(cands[a] - i - start_index) <= 64 and
                    abs(cands[b] - i - start_index) <= 64):
                cands[a], cands[b] = cands[b], cands[a]
        example_id_list = []
        event_time_list = []
        for example_idx in cands:
            example_id_list.append('{}'.format(example_idx).encode())
            event_time_list.append(150000000 + example_idx)
        # Ids and event times travel as tf.train features in this variant.
        tf_example_id = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=example_id_list))
        tf_event_time = tf.train.Feature(
            int64_list=tf.train.Int64List(value=event_time_list))
        example_id_batch = dj_pb.LiteExampleIds(
            partition_id=0,
            begin_index=req_index * 512,
            features=tf.train.Features(feature={
                'example_id': tf_example_id,
                'event_time': tf_event_time
            }))
        packed_example_id_batch = dj_pb.PackedLiteExampleIds(
            partition_id=0,
            begin_index=req_index * 512,
            example_id_num=len(cands),
            sered_lite_example_ids=example_id_batch.SerializeToString())
        dumper.add_example_id_batch(packed_example_id_batch)
        self.assertEqual(dumper.get_next_index(), (req_index + 1) * 512)
    self.assertTrue(dumper.need_dump())
    with dumper.make_example_id_dumper() as eid:
        eid()
def generate_example_id(self, dumper, start_index, item_count):
    self.total_example_id_count += item_count
    for req_index in range(start_index // 512,
                           self.total_example_id_count // 512):
        example_id_batch = dj_pb.LiteExampleIds(
            partition_id=0,
            begin_index=req_index * 512)
        cands = list(range(req_index * 512, (req_index + 1) * 512))
        start_index = cands[0]
        # Same bounded shuffle as the variant above: each id stays within
        # 64 positions of its sorted slot.
        for i in range(len(cands)):
            if random.randint(1, 4) > 1:
                continue
            a = random.randint(i - 64, i + 64)
            b = random.randint(i - 64, i + 64)
            if a < 0:
                a = 0
            if a >= len(cands):
                a = len(cands) - 1
            if b < 0:
                b = 0
            if b >= len(cands):
                b = len(cands) - 1
            if (abs(cands[a] - i - start_index) <= 64 and
                    abs(cands[b] - i - start_index) <= 64):
                cands[a], cands[b] = cands[b], cands[a]
        for example_idx in cands:
            example_id = '{}'.format(example_idx).encode()
            example_id_batch.example_id.append(example_id)
            example_id_batch.event_time.append(150000000 + example_idx)
            if self.version == Version.V2:
                # V2 carries extra join columns alongside each example id.
                click_id = '%s_%s' % (example_id.decode(),
                                      example_id.decode())
                example_id_batch.click_id.append(click_id.encode())
                example_id_batch.id_type.append('IMEI'.encode())
                example_id_batch.event_time_deep.append(
                    150000000 + example_idx + 1)
                example_id_batch.type.append(b'1')
        packed_example_id_batch = dj_pb.PackedLiteExampleIds(
            partition_id=0,
            begin_index=req_index * 512,
            example_id_num=len(cands),
            sered_lite_example_ids=example_id_batch.SerializeToString())
        dumper.add_example_id_batch(packed_example_id_batch)
        self.assertEqual(dumper.get_next_index(), (req_index + 1) * 512)
    self.assertTrue(dumper.need_dump())
    with dumper.make_example_id_dumper() as eid:
        eid()
def make_packed_lite_example_ids(self):
    serde_lite_examples = dj_pb.LiteExampleIds(
        partition_id=self._partition_id,
        begin_index=self._begin_index,
        example_id=self._example_ids,
        event_time=self._event_times,
    )
    # V2 columns are only attached when present, so V1 payloads stay
    # byte-compatible.
    if len(self._id_types) > 0:
        serde_lite_examples.id_type.extend(self._id_types)
        serde_lite_examples.event_time_deep.extend(self._event_time_deeps)
        serde_lite_examples.type.extend(self._types)
        serde_lite_examples.click_id.extend(self._click_ids)
    return dj_pb.PackedLiteExampleIds(
        partition_id=self._partition_id,
        begin_index=self._begin_index,
        example_id_num=len(self._example_ids),
        sered_lite_example_ids=serde_lite_examples.SerializeToString())
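# Hedged companion sketch mirroring the len(self._id_types) > 0 guard above:
# after parsing, the V2 columns are either fully populated (one entry per
# example id) or absent. has_v2_columns is a hypothetical helper, not part of
# the original code.
def has_v2_columns(packed):
    lite = dj_pb.LiteExampleIds()
    lite.ParseFromString(packed.sered_lite_example_ids)
    n = len(lite.example_id)
    return n > 0 and len(lite.click_id) == n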
def make_packed_lite_example_ids(self):
    # Pick the tf.train value type from the first buffered element:
    # ints become Int64List, everything else is treated as bytes.
    features = {}
    for name, value_list in self._feature_buf.items():
        if len(value_list) > 0:
            if isinstance(value_list[0], int):
                features[name] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=value_list))
            else:
                features[name] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=value_list))
    tf_features = tf.train.Features(feature=features)
    serde_lite_examples = dj_pb.LiteExampleIds(
        partition_id=self._partition_id,
        begin_index=self._begin_index,
        features=tf_features)
    return dj_pb.PackedLiteExampleIds(
        partition_id=self._partition_id,
        begin_index=self._begin_index,
        example_id_num=len(self),
        sered_lite_example_ids=serde_lite_examples.SerializeToString())
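# Hedged inverse of the tf.train-based packing above (an illustrative
# assumption, not the library's documented API): recover the typed columns
# from the serialized features payload. The field names match the tests in
# this section; unpack_lite_example_ids is a hypothetical helper.
def unpack_lite_example_ids(packed):
    lite = dj_pb.LiteExampleIds()
    lite.ParseFromString(packed.sered_lite_example_ids)
    feature = lite.features.feature
    example_ids = list(feature['example_id'].bytes_list.value)
    event_times = list(feature['event_time'].int64_list.value)
    return example_ids, event_times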
def _dump_example_ids(self, dumper, start_index, batch_num, batch_size):
    self.assertEqual(start_index, dumper.get_next_index())
    index = start_index
    for i in range(batch_num):
        example_id_list = []
        event_time_list = []
        prev_index = index
        for j in range(batch_size):
            example_id_list.append('{}'.format(index).encode())
            event_time_list.append(150000000 + index)
            self.end_index = index
            index += 1
        # Ids and event times travel as tf.train features in this variant.
        tf_example_id = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=example_id_list))
        tf_event_time = tf.train.Feature(
            int64_list=tf.train.Int64List(value=event_time_list))
        example_id_batch = dj_pb.LiteExampleIds(
            partition_id=0,
            begin_index=prev_index,
            features=tf.train.Features(feature={
                'example_id': tf_example_id,
                'event_time': tf_event_time
            }))
        packed_example_id_batch = dj_pb.PackedLiteExampleIds(
            partition_id=0,
            begin_index=index - batch_size,
            example_id_num=batch_size,
            sered_lite_example_ids=example_id_batch.SerializeToString())
        dumper.add_example_id_batch(packed_example_id_batch)
        self.assertEqual(dumper.get_next_index(), index)
    dumper.finish_sync_example_id()
    self.assertTrue(dumper.need_dump())
    with dumper.make_example_id_dumper() as eid:
        eid()
def generate_example_id(self, dumper, start_index, item_count):
    self.total_example_id_count += item_count
    for req_index in range(start_index // 512,
                           self.total_example_id_count // 512):
        cands = list(range(req_index * 512, (req_index + 1) * 512))
        start_index = cands[0]
        # Bounded shuffle: each id stays within 64 positions of its
        # sorted slot.
        for i in range(len(cands)):
            if random.randint(1, 4) > 1:
                continue
            a = random.randint(i - 64, i + 64)
            b = random.randint(i - 64, i + 64)
            if a < 0:
                a = 0
            if a >= len(cands):
                a = len(cands) - 1
            if b < 0:
                b = 0
            if b >= len(cands):
                b = len(cands) - 1
            if (abs(cands[a] - i - start_index) <= 64 and
                    abs(cands[b] - i - start_index) <= 64):
                cands[a], cands[b] = cands[b], cands[a]
        features = {
            'example_id': [],
            'event_time': [],
            'click_id': [],
            'id_type': [],
            'event_time_deep': [],
            'type': [],
        }
        for example_idx in cands:
            example_id = '{}'.format(example_idx).encode()
            features['example_id'].append(example_id)
            features['event_time'].append(150000000 + example_idx)
            if self.version == Version.V2:
                # V2 carries extra join columns alongside each example id.
                click_id = '%s_%s' % (example_id.decode(),
                                      example_id.decode())
                features['click_id'].append(click_id.encode())
                features['id_type'].append('IMEI'.encode())
                features['event_time_deep'].append(
                    150000000 + example_idx + 1)
                features['type'].append(b'1')
        feature_list = tf.train.Features(feature={
            'example_id': tf.train.Feature(
                bytes_list=tf.train.BytesList(
                    value=features['example_id'])),
            'event_time': tf.train.Feature(
                int64_list=tf.train.Int64List(
                    value=features['event_time'])),
            'click_id': tf.train.Feature(
                bytes_list=tf.train.BytesList(
                    value=features['click_id'])),
            'id_type': tf.train.Feature(
                bytes_list=tf.train.BytesList(
                    value=features['id_type'])),
            'type': tf.train.Feature(
                bytes_list=tf.train.BytesList(
                    value=features['type'])),
            'event_time_deep': tf.train.Feature(
                int64_list=tf.train.Int64List(
                    value=features['event_time_deep']))
        })
        example_id_batch = dj_pb.LiteExampleIds(
            partition_id=0,
            begin_index=req_index * 512,
            features=feature_list)
        packed_example_id_batch = dj_pb.PackedLiteExampleIds(
            partition_id=0,
            begin_index=req_index * 512,
            example_id_num=len(cands),
            sered_lite_example_ids=example_id_batch.SerializeToString())
        dumper.add_example_id_batch(packed_example_id_batch)
        self.assertEqual(dumper.get_next_index(), (req_index + 1) * 512)
    self.assertTrue(dumper.need_dump())
    with dumper.make_example_id_dumper() as eid:
        eid()