Пример #1
0
 def query_vecs(self, records, input_spec, fields, device):
     """Compute query vectors for a list of records.

     The records are turned into per-field columns (one list per name in
     ``fields``), passed through ``spec.apply_spec_batch`` and then fed to
     ``self.ranker.query_vectors(dense=True, ...)``. The whole computation is
     timed under the ``'query_vectors'`` timer key.

     Args:
         records: sequence of mappings; each must contain every name in ``fields``.
         input_spec: input specification forwarded to ``spec.apply_spec_batch``.
         fields: iterable of field names to extract from each record.
         device: device argument forwarded to ``spec.apply_spec_batch``.

     Returns:
         The result of ``query_vectors`` after ``.coalesce()`` has been called
         on it (presumably a sparse tensor -- confirm against the ranker).
     """
     with self.timer.time('query_vectors'):
         # Pivot row-oriented records into column-oriented lists.
         columns = {}
         for name in fields:
             columns[name] = [rec[name] for rec in records]
         model_inputs = spec.apply_spec_batch(columns, input_spec, device)
         vectors = self.ranker.query_vectors(dense=True, **model_inputs).coalesce()
     return vectors
Пример #2
0
 def iter_batches(self, it):
     """Yield spec-applied batches of up to ``self.batch_size`` records from ``it``.

     Each record is a mapping; values are grouped per key into lists, and the
     resulting dict is passed through ``spec.apply_spec_batch`` together with
     ``self.input_spec`` and ``self.device``.

     Args:
         it: an *iterator* of record mappings (it is consumed incrementally
             across batches, so a re-iterable container must be wrapped in
             ``iter()`` by the caller).

     Yields:
         The value returned by ``spec.apply_spec_batch`` for each batch. The
         last batch may hold fewer than ``self.batch_size`` records.
     """
     while True:
         input_data = {}
         pulled_any = False
         # ``range`` is listed first so zip() checks it first and never pulls
         # (and discards) an extra record once the batch is full.
         for _, record in zip(range(self.batch_size), it):
             pulled_any = True
             for k, seq in record.items():
                 input_data.setdefault(k, []).append(seq)
         # zip() ends quietly when ``it`` is exhausted -- no StopIteration ever
         # reaches this frame -- so termination has to be detected explicitly;
         # otherwise a finite iterator would produce empty batches forever.
         if not pulled_any:
             return
         yield spec.apply_spec_batch(input_data, self.input_spec, self.device)
Пример #3
0
    def _batch_iter_docs(self, ranker, device):
        """Yield spec-applied batches of documents from ``self.test_ds``.

        The fields requested are every ``doc_``-prefixed field in the ranker's
        input spec, plus ``doc_id``. Documents are grouped into batches of
        ``self.config['batch_size']``; any documents left over at the end are
        yielded as a final, smaller batch so that none are dropped.

        Args:
            ranker: object whose ``input_spec()`` returns a mapping with a
                ``'fields'`` entry.
            device: device argument forwarded to ``spec.apply_spec_batch``.

        Yields:
            The value returned by ``spec.apply_spec_batch`` for each batch.
        """
        input_spec = ranker.input_spec()
        fields = {f for f in input_spec['fields'] if f.startswith('doc_')} | {'doc_id'}
        batch_size = self.config['batch_size']

        def as_batch(rows):
            # Pivot row-oriented records into per-field columns, then apply the spec.
            columns = {f: [r[f] for r in rows] for f in fields}
            return spec.apply_spec_batch(columns, input_spec, device)

        records = []
        for record in onir.datasets.doc_iter(self.test_ds, fields):
            records.append(record)
            if len(records) == batch_size:
                yield as_batch(records)
                records = []
        # Flush the trailing partial batch; without this the last
        # (document count mod batch_size) documents would be silently skipped.
        if records:
            yield as_batch(records)
Пример #4
0
 def _iter_batches(self, device):
     """Yield spec-applied batches of records from ``self.dataset``.

     Records are requested with every field in ``self.input_spec['fields']``
     plus ``query_id`` and ``doc_id``, using the ``source`` and
     ``run_threshold`` settings from ``self.config`` and with shuffling,
     relevance filtering and infinite iteration switched off
     (``shuf=False``, ``minrel=None``, ``inf=False``).

     Args:
         device: device argument forwarded to ``spec.apply_spec_batch``.

     Yields:
         The value returned by ``spec.apply_spec_batch`` for each chunk of
         ``self.config['batch_size']`` records produced by ``util.chunked``.
     """
     wanted_fields = set(self.input_spec['fields']) | {'query_id', 'doc_id'}
     record_stream = datasets.record_iter(
         self.dataset,
         fields=wanted_fields,
         source=self.config['source'],
         run_threshold=self.config['run_threshold'],
         minrel=None,
         shuf=False,
         random=self.random,
         inf=False,
     )
     chunk_size = self.config['batch_size']
     for chunk in util.chunked(record_stream, chunk_size):
         # Pivot the chunk's records into one list of values per key.
         columns = {}
         for rec in chunk:
             for key, value in rec.items():
                 if key not in columns:
                     columns[key] = []
                 columns[key].append(value)
         yield spec.apply_spec_batch(columns, self.input_spec, device)
Пример #5
0
def main():
    """Time a ranker's forward pass over a dataset and log summary statistics.

    Settings are read from ``onir.config.args()``: ``batch_size`` (default
    512), ``repeat`` (default 5) and ``gpu`` (default ``true``). All records
    are loaded and batched up front; only the transfer of a batch to the
    device and the ranker call (including moving its output back to the CPU)
    fall inside the timed ``'model'`` region. After ``repeat`` passes over the
    data, the max / avg / med / min of the per-pass times are logged, both in
    total and per 1000 records.

    Raises:
        ValueError: if ``repeat`` is less than 1, or if the dataset yields no
            records -- in either case there is nothing to summarise.
    """
    import statistics  # function-scope import keeps this block self-contained

    onir.rankers.base.global_memcache_enable = False
    context = injector.load({
        'vocab': onir.vocab,
        'dataset': onir.datasets.registry.copy(default='random'),
        'ranker': onir.rankers,
    })

    logger = onir.log.easy()
    logger.debug(f'vocab: {context["vocab"].config}')
    logger.debug(f'dataset: {context["dataset"].config}')
    logger.debug(f'ranker: {context["ranker"].config}')

    ds = context['dataset']
    ranker = context['ranker']

    batch_size = int(onir.config.args().get('batch_size', '512'))
    repeat = int(onir.config.args().get('repeat', '5'))
    gpu = onir.config.args().get('gpu', 'true').lower() == 'true'
    device = torch.device('cuda') if gpu else torch.device('cpu')

    logger.debug(f'batch_size: {batch_size}')
    logger.debug(f'repeat: {repeat}')
    logger.debug(f'device: {device}')

    # Fail before the (potentially slow) data load rather than at the very
    # end, where max()/min() of an empty list would raise a less clear error.
    if repeat < 1:
        raise ValueError(f'repeat must be at least 1, got {repeat}')

    input_spec = ranker.input_spec()

    def new_batch():
        return {field: [] for field in input_spec['fields']}

    # Any one field will do for measuring how many records a batch holds.
    some_field = next(iter(input_spec['fields']))
    batches = []
    current = new_batch()
    record_count = 0
    for record in tqdm(ds.iter_records(input_spec['fields']),
                       desc='loading data',
                       leave=False):
        if len(current[some_field]) == batch_size:
            batches.append(spec.apply_spec_batch(current, input_spec))
            current = new_batch()
        for k, v in record.items():
            current[k].append(v)
        record_count += 1

    if record_count == 0:
        # The per-1k figure divides by record_count; bail out with a clear message.
        raise ValueError('dataset produced no records; nothing to benchmark')
    batches.append(spec.apply_spec_batch(current, input_spec))

    if gpu:
        ranker = ranker.to(device)
    ranker.eval()

    times = []
    times_per_1k = []
    for i in range(repeat):
        with torch.no_grad():
            timer = util.HierTimer(gpu_sync=gpu)
            for batch in tqdm(batches, leave=False, ncols=80, desc=str(i)):
                with timer.time('model'):
                    batch = {k: v.to(device) for k, v in batch.items()}
                    ranker(**batch).cpu()
            # Scaled by 1000 and reported as ms (durations presumably in seconds).
            elapsed_ms = timer.durations['model'] * 1000
            elapsed_per_1k = elapsed_ms / record_count * 1000
            logger.debug(
                f'{i} time={elapsed_ms:.2f}ms record_count={record_count} time_per_1k={elapsed_per_1k:.2f}ms'
            )
            times.append(elapsed_ms)
            times_per_1k.append(elapsed_per_1k)

    # statistics.median averages the two middle values for an even number of
    # repeats, instead of reporting the upper-middle one.
    summaries = (
        ('max', max),
        ('avg', statistics.mean),
        ('med', statistics.median),
        ('min', min),
    )
    for label, reduce_fn in summaries:
        logger.info(
            f'{label} time={reduce_fn(times):.2f}ms record_count={record_count} '
            f'time_per_1k={reduce_fn(times_per_1k):.2f}ms'
        )