예제 #1
0
 def generate_example_id(self, dumper, start_index, item_count):
     """Push locally shuffled example-id batches into ``dumper`` and dump.

     ``item_count`` is added to ``self.total_example_id_count``; one
     512-entry ``LiteExampleIds`` batch is then produced for every block
     number in ``[start_index // 512, total // 512)``.

     Args:
         dumper: object providing ``add_example_id_batch``,
             ``get_next_index``, ``need_dump`` and
             ``make_example_id_dumper``.
         start_index: index from which generation starts.
         item_count: number of ids added to the running total.
     """
     block_size = 512
     window = 64
     self.total_example_id_count += item_count
     first_block = start_index // block_size
     end_block = self.total_example_id_count // block_size
     for block_no in range(first_block, end_block):
         base = block_no * block_size
         shuffled = list(range(base, base + block_size))
         last_pos = len(shuffled) - 1
         for pos in range(len(shuffled)):
             # Roughly one position in four attempts a swap.
             if random.randint(1, 4) != 1:
                 continue
             draw_a = random.randint(pos - window, pos + window)
             draw_b = random.randint(pos - window, pos + window)
             # Clamp both drawn positions into the valid index range.
             a = min(max(draw_a, 0), last_pos)
             b = min(max(draw_b, 0), last_pos)
             # Swap only when both stored values lie within ``window`` of
             # the value that originally sat at ``pos``.
             anchor = base + pos
             if (abs(shuffled[a] - anchor) <= window
                     and abs(shuffled[b] - anchor) <= window):
                 shuffled[a], shuffled[b] = shuffled[b], shuffled[a]
         batch = dj_pb.LiteExampleIds(partition_id=0, begin_index=base)
         batch.example_id.extend(
             [str(value).encode() for value in shuffled])
         batch.event_time.extend(
             [150000000 + value for value in shuffled])
         dumper.add_example_id_batch(batch)
         self.assertEqual(dumper.get_next_index(), base + block_size)
     self.assertTrue(dumper.need_dump())
     with dumper.make_example_id_dumper() as run_dump:
         run_dump()
 def _dump_example_ids(self, dumper, start_index, batch_num, batch_size):
     """Add ``batch_num`` sequential packed batches to ``dumper`` and dump.

     Example ids are the decimal text of consecutive indices starting at
     ``start_index``; event times are ``150000000 + index``. The last
     generated index is stored in ``self.end_index``.

     Args:
         dumper: object providing ``get_next_index``,
             ``add_example_id_batch``, ``finish_sync_example_id``,
             ``need_dump`` and ``make_example_id_dumper``.
         start_index: index the dumper is expected to be waiting for.
         batch_num: number of batches to add.
         batch_size: number of example ids per batch.
     """
     # The precondition is checked twice, with the arguments in both
     # orders.
     self.assertEqual(start_index, dumper.get_next_index())
     self.assertEqual(dumper.get_next_index(), start_index)
     cursor = start_index
     for _ in range(batch_num):
         lite_batch = dj_pb.LiteExampleIds(partition_id=0,
                                           begin_index=cursor)
         for _ in range(batch_size):
             lite_batch.example_id.append(str(cursor).encode())
             lite_batch.event_time.append(150000000 + cursor)
             self.end_index = cursor
             cursor += 1
         packed_batch = dj_pb.PackedLiteExampleIds(
             partition_id=0,
             begin_index=cursor - batch_size,
             example_id_num=batch_size,
             sered_lite_example_ids=lite_batch.SerializeToString())
         dumper.add_example_id_batch(packed_batch)
         self.assertEqual(dumper.get_next_index(), cursor)
     dumper.finish_sync_example_id()
     self.assertTrue(dumper.need_dump())
     with dumper.make_example_id_dumper() as run_dump:
         run_dump()
예제 #3
0
        def _inner_iter(self, fpath):
            """Yield one ``ExampleIdItem`` per example id in ``fpath``.

            Every record is parsed as a ``LiteExampleIds`` message and its
            ``features`` are converted to a mapping of field name to value
            list. For each position in the ``example_id`` list, the fields
            from ``SYNC_ALLOWED_OPTIONAL_FIELDS`` that are present and
            non-empty are copied into that item's row.
            """
            with make_tf_record_iter(fpath) as record_iter:
                for record in record_iter:
                    lite_example_ids = dj_pb.LiteExampleIds()
                    lite_example_ids.ParseFromString(record)
                    columns = convert_tf_example_to_dict(
                        tf.train.Example(
                            features=lite_example_ids.features))

                    for offset in range(len(columns['example_id'])):
                        row = {}
                        for field in SYNC_ALLOWED_OPTIONAL_FIELDS:
                            if field in columns and len(columns[field]) > 0:
                                row[field] = columns[field][offset]
                        # The item index is the position within the
                        # record shifted by the record's begin_index.
                        yield ExampleIdVisitor.ExampleIdItem(
                            offset + lite_example_ids.begin_index, row)
 def make_packed_lite_example_ids(self):
     """Return the buffered ids as a ``PackedLiteExampleIds`` message.

     The buffered example ids and event times are placed in a
     ``LiteExampleIds`` message, which is serialized into the packed
     message's ``sered_lite_example_ids`` field.
     """
     lite_example_ids = dj_pb.LiteExampleIds(
         partition_id=self._partition_id,
         begin_index=self._begin_index,
         example_id=self._example_ids,
         event_time=self._event_times)
     serialized = lite_example_ids.SerializeToString()
     return dj_pb.PackedLiteExampleIds(
         partition_id=self._partition_id,
         begin_index=self._begin_index,
         example_id_num=len(self._example_ids),
         sered_lite_example_ids=serialized)
예제 #5
0
    def generate_example_id(self, dumper, start_index, item_count):
        """Push locally shuffled, feature-encoded id batches into ``dumper``.

        ``item_count`` is added to ``self.total_example_id_count``; one
        512-entry batch is then produced for every block number in
        ``[start_index // 512, total // 512)``. Each batch stores its
        example ids and event times as ``tf.train.Features`` inside a
        ``LiteExampleIds`` message, which is serialized into a
        ``PackedLiteExampleIds`` message.

        Args:
            dumper: object providing ``add_example_id_batch``,
                ``get_next_index``, ``need_dump`` and
                ``make_example_id_dumper``.
            start_index: index from which generation starts.
            item_count: number of ids added to the running total.
        """
        block_size = 512
        window = 64
        self.total_example_id_count += item_count
        first_block = start_index // block_size
        end_block = self.total_example_id_count // block_size
        for block_no in range(first_block, end_block):
            base = block_no * block_size
            shuffled = list(range(base, base + block_size))
            last_pos = len(shuffled) - 1
            for pos in range(len(shuffled)):
                # Roughly one position in four attempts a swap.
                if random.randint(1, 4) != 1:
                    continue
                draw_a = random.randint(pos - window, pos + window)
                draw_b = random.randint(pos - window, pos + window)
                # Clamp both drawn positions into the valid index range.
                a = min(max(draw_a, 0), last_pos)
                b = min(max(draw_b, 0), last_pos)
                # Swap only when both stored values lie within ``window``
                # of the value that originally sat at ``pos``.
                anchor = base + pos
                if (abs(shuffled[a] - anchor) <= window
                        and abs(shuffled[b] - anchor) <= window):
                    shuffled[a], shuffled[b] = shuffled[b], shuffled[a]

            id_values = [str(value).encode() for value in shuffled]
            time_values = [150000000 + value for value in shuffled]
            feature_map = {
                'example_id': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=id_values)),
                'event_time': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=time_values)),
            }
            lite_batch = dj_pb.LiteExampleIds(
                partition_id=0,
                begin_index=base,
                features=tf.train.Features(feature=feature_map))
            packed_batch = dj_pb.PackedLiteExampleIds(
                partition_id=0,
                begin_index=base,
                example_id_num=len(shuffled),
                sered_lite_example_ids=lite_batch.SerializeToString())
            dumper.add_example_id_batch(packed_batch)
            self.assertEqual(dumper.get_next_index(), base + block_size)
        self.assertTrue(dumper.need_dump())
        with dumper.make_example_id_dumper() as run_dump:
            run_dump()
예제 #6
0
 def generate_example_id(self, dumper, start_index, item_count):
     """Push locally shuffled packed id batches into ``dumper`` and dump.

     ``item_count`` is added to ``self.total_example_id_count``; one
     512-entry batch is then produced for every block number in
     ``[start_index // 512, total // 512)``. When ``self.version`` equals
     ``Version.V2`` the click_id, id_type, event_time_deep and type
     fields are filled in as well.

     Args:
         dumper: object providing ``add_example_id_batch``,
             ``get_next_index``, ``need_dump`` and
             ``make_example_id_dumper``.
         start_index: index from which generation starts.
         item_count: number of ids added to the running total.
     """
     block_size = 512
     window = 64
     self.total_example_id_count += item_count
     first_block = start_index // block_size
     end_block = self.total_example_id_count // block_size
     for block_no in range(first_block, end_block):
         base = block_no * block_size
         shuffled = list(range(base, base + block_size))
         last_pos = len(shuffled) - 1
         for pos in range(len(shuffled)):
             # Roughly one position in four attempts a swap.
             if random.randint(1, 4) != 1:
                 continue
             draw_a = random.randint(pos - window, pos + window)
             draw_b = random.randint(pos - window, pos + window)
             # Clamp both drawn positions into the valid index range.
             a = min(max(draw_a, 0), last_pos)
             b = min(max(draw_b, 0), last_pos)
             # Swap only when both stored values lie within ``window`` of
             # the value that originally sat at ``pos``.
             anchor = base + pos
             if (abs(shuffled[a] - anchor) <= window
                     and abs(shuffled[b] - anchor) <= window):
                 shuffled[a], shuffled[b] = shuffled[b], shuffled[a]

         lite_batch = dj_pb.LiteExampleIds(partition_id=0,
                                           begin_index=base)
         lite_batch.example_id.extend(
             [str(value).encode() for value in shuffled])
         lite_batch.event_time.extend(
             [150000000 + value for value in shuffled])
         if self.version == Version.V2:
             lite_batch.click_id.extend(
                 [('%s_%s' % (value, value)).encode()
                  for value in shuffled])
             lite_batch.id_type.extend([b'IMEI'] * len(shuffled))
             lite_batch.event_time_deep.extend(
                 [150000000 + value + 1 for value in shuffled])
             lite_batch.type.extend([b'1'] * len(shuffled))

         packed_batch = dj_pb.PackedLiteExampleIds(
             partition_id=0,
             begin_index=base,
             example_id_num=len(shuffled),
             sered_lite_example_ids=lite_batch.SerializeToString())
         dumper.add_example_id_batch(packed_batch)
         self.assertEqual(dumper.get_next_index(), base + block_size)
     self.assertTrue(dumper.need_dump())
     with dumper.make_example_id_dumper() as run_dump:
         run_dump()
예제 #7
0
 def _inner_iter(self, fpath):
     """Yield one ``ExampleIdItem`` per example id stored in ``fpath``.

     Every record is parsed as a ``LiteExampleIds`` message. Each item is
     built from the example id, its event time and the item index, which
     is the position inside the record plus the record's ``begin_index``.

     Args:
         fpath: path handed to ``make_tf_record_iter``.

     Raises:
         AssertionError: if a record holds a different number of example
             ids and event times.
     """
     with make_tf_record_iter(fpath) as record_iter:
         for record in record_iter:
             lite_example_ids = dj_pb.LiteExampleIds()
             lite_example_ids.ParseFromString(record)
             example_id_num = len(lite_example_ids.example_id)
             event_time_num = len(lite_example_ids.event_time)
             # Raised explicitly instead of through an ``assert``
             # statement so the check still runs under ``python -O``;
             # the exception type and message are unchanged.
             if example_id_num != event_time_num:
                 raise AssertionError(
                     "the size of example id and event time must the "
                     "same. {} != {}".format(example_id_num,
                                             event_time_num))
             begin_index = lite_example_ids.begin_index
             pairs = zip(lite_example_ids.example_id,
                         lite_example_ids.event_time)
             for offset, (example_id, event_time) in enumerate(pairs):
                 yield ExampleIdVisitor.ExampleIdItem(
                     example_id, event_time, offset + begin_index)
예제 #8
0
    def make_packed_lite_example_ids(self):
        """Return the buffered ids as a ``PackedLiteExampleIds`` message.

        Example ids and event times always go into the inner
        ``LiteExampleIds`` message. The id_type, event_time_deep, type and
        click_id buffers are added only when the id_type buffer is
        non-empty. The inner message is serialized into the packed
        message's ``sered_lite_example_ids`` field.
        """
        lite_example_ids = dj_pb.LiteExampleIds(
            partition_id=self._partition_id,
            begin_index=self._begin_index,
            example_id=self._example_ids,
            event_time=self._event_times,
        )

        has_optional_fields = len(self._id_types) > 0
        if has_optional_fields:
            optional_fields = (
                ('id_type', self._id_types),
                ('event_time_deep', self._event_time_deeps),
                ('type', self._types),
                ('click_id', self._click_ids),
            )
            for field_name, values in optional_fields:
                getattr(lite_example_ids, field_name).extend(values)

        serialized = lite_example_ids.SerializeToString()
        return dj_pb.PackedLiteExampleIds(
            partition_id=self._partition_id,
            begin_index=self._begin_index,
            example_id_num=len(self._example_ids),
            sered_lite_example_ids=serialized)
    def make_packed_lite_example_ids(self):
        """Return the feature buffer as a ``PackedLiteExampleIds`` message.

        Every non-empty list in ``self._feature_buf`` becomes a
        ``tf.train.Feature``: an int64 list when its first value is an
        ``int``, a bytes list otherwise. Empty lists are left out. The
        features are stored in a ``LiteExampleIds`` message that is
        serialized into the packed message.
        """
        feature_map = {}
        for name, values in self._feature_buf.items():
            if len(values) == 0:
                continue
            # The first element decides the encoding of the whole list.
            if isinstance(values[0], int):
                payload = {'int64_list': tf.train.Int64List(value=values)}
            else:
                payload = {'bytes_list': tf.train.BytesList(value=values)}
            feature_map[name] = tf.train.Feature(**payload)

        lite_example_ids = dj_pb.LiteExampleIds(
            partition_id=self._partition_id,
            begin_index=self._begin_index,
            features=tf.train.Features(feature=feature_map))
        serialized = lite_example_ids.SerializeToString()
        return dj_pb.PackedLiteExampleIds(
            partition_id=self._partition_id,
            begin_index=self._begin_index,
            example_id_num=self.__len__(),
            sered_lite_example_ids=serialized)
예제 #10
0
    def _dump_example_ids(self, dumper, start_index, batch_num, batch_size):
        """Add ``batch_num`` sequential packed batches to ``dumper`` and dump.

        Example ids are the decimal text of consecutive indices starting
        at ``start_index``; event times are ``150000000 + index``. Both
        are stored as ``tf.train.Features`` in a ``LiteExampleIds``
        message that is serialized into each packed batch. The last
        generated index is stored in ``self.end_index``.

        Args:
            dumper: object providing ``get_next_index``,
                ``add_example_id_batch``, ``finish_sync_example_id``,
                ``need_dump`` and ``make_example_id_dumper``.
            start_index: index the dumper is expected to be waiting for.
            batch_num: number of batches to add.
            batch_size: number of example ids per batch.
        """
        # The precondition is checked twice, with the arguments in both
        # orders.
        self.assertEqual(start_index, dumper.get_next_index())
        self.assertEqual(dumper.get_next_index(), start_index)
        cursor = start_index
        for _ in range(batch_num):
            batch_begin = cursor
            id_values = []
            time_values = []
            for _ in range(batch_size):
                id_values.append(str(cursor).encode())
                time_values.append(150000000 + cursor)
                self.end_index = cursor
                cursor += 1
            feature_map = {
                'example_id': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=id_values)),
                'event_time': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=time_values)),
            }
            lite_batch = dj_pb.LiteExampleIds(
                partition_id=0,
                begin_index=batch_begin,
                features=tf.train.Features(feature=feature_map))

            packed_batch = dj_pb.PackedLiteExampleIds(
                partition_id=0,
                begin_index=cursor - batch_size,
                example_id_num=batch_size,
                sered_lite_example_ids=lite_batch.SerializeToString())
            dumper.add_example_id_batch(packed_batch)
            self.assertEqual(dumper.get_next_index(), cursor)
        dumper.finish_sync_example_id()
        self.assertTrue(dumper.need_dump())
        with dumper.make_example_id_dumper() as run_dump:
            run_dump()
예제 #11
0
 def make_lite_example_ids(self):
     """Return a ``LiteExampleIds`` message built from the buffered state.

     The message carries the partition id, begin index, example ids and
     event times held on this object.
     """
     fields = {
         'partition_id': self._partition_id,
         'begin_index': self._begin_index,
         'example_id': self._example_ids,
         'event_time': self._event_times,
     }
     return dj_pb.LiteExampleIds(**fields)
예제 #12
0
 def __init__(self, partition_id, begin_index):
     """Create the backing ``LiteExampleIds`` message.

     Args:
         partition_id: value stored in the message's ``partition_id``.
         begin_index: value stored in the message's ``begin_index``.
     """
     header = {'partition_id': partition_id, 'begin_index': begin_index}
     self._lite_example_ids = dj_pb.LiteExampleIds(**header)
예제 #13
0
    def generate_example_id(self, dumper, start_index, item_count):
        """Push locally shuffled, feature-encoded id batches into ``dumper``.

        ``item_count`` is added to ``self.total_example_id_count``; one
        512-entry batch is then produced for every block number in
        ``[start_index // 512, total // 512)``. All six feature keys are
        always emitted; click_id, id_type, event_time_deep and type hold
        values only when ``self.version`` equals ``Version.V2`` and are
        empty lists otherwise.

        Args:
            dumper: object providing ``add_example_id_batch``,
                ``get_next_index``, ``need_dump`` and
                ``make_example_id_dumper``.
            start_index: index from which generation starts.
            item_count: number of ids added to the running total.
        """
        def bytes_feature(values):
            # Wrap a list of bytes values in a tf.train.Feature.
            return tf.train.Feature(
                bytes_list=tf.train.BytesList(value=values))

        def int64_feature(values):
            # Wrap a list of integer values in a tf.train.Feature.
            return tf.train.Feature(
                int64_list=tf.train.Int64List(value=values))

        block_size = 512
        window = 64
        self.total_example_id_count += item_count
        first_block = start_index // block_size
        end_block = self.total_example_id_count // block_size
        for block_no in range(first_block, end_block):
            base = block_no * block_size
            shuffled = list(range(base, base + block_size))
            last_pos = len(shuffled) - 1
            for pos in range(len(shuffled)):
                # Roughly one position in four attempts a swap.
                if random.randint(1, 4) != 1:
                    continue
                draw_a = random.randint(pos - window, pos + window)
                draw_b = random.randint(pos - window, pos + window)
                # Clamp both drawn positions into the valid index range.
                a = min(max(draw_a, 0), last_pos)
                b = min(max(draw_b, 0), last_pos)
                # Swap only when both stored values lie within ``window``
                # of the value that originally sat at ``pos``.
                anchor = base + pos
                if (abs(shuffled[a] - anchor) <= window
                        and abs(shuffled[b] - anchor) <= window):
                    shuffled[a], shuffled[b] = shuffled[b], shuffled[a]

            id_values = [str(value).encode() for value in shuffled]
            time_values = [150000000 + value for value in shuffled]
            click_values = []
            id_type_values = []
            deep_time_values = []
            type_values = []
            if self.version == Version.V2:
                click_values = [('%s_%s' % (value, value)).encode()
                                for value in shuffled]
                id_type_values = [b'IMEI'] * len(shuffled)
                deep_time_values = [150000000 + value + 1
                                    for value in shuffled]
                type_values = [b'1'] * len(shuffled)

            feature_list = tf.train.Features(
                feature={
                    'example_id': bytes_feature(id_values),
                    'event_time': int64_feature(time_values),
                    'click_id': bytes_feature(click_values),
                    'id_type': bytes_feature(id_type_values),
                    'type': bytes_feature(type_values),
                    'event_time_deep': int64_feature(deep_time_values),
                })

            lite_batch = dj_pb.LiteExampleIds(partition_id=0,
                                              begin_index=base,
                                              features=feature_list)
            packed_batch = dj_pb.PackedLiteExampleIds(
                partition_id=0,
                begin_index=base,
                example_id_num=len(shuffled),
                sered_lite_example_ids=lite_batch.SerializeToString())
            dumper.add_example_id_batch(packed_batch)
            self.assertEqual(dumper.get_next_index(), base + block_size)
        self.assertTrue(dumper.need_dump())
        with dumper.make_example_id_dumper() as run_dump:
            run_dump()