Exemplo n.º 1
0
def clone_index(createidx=False, test=True):
    """Feed the 'variant' docs of 'myvariant_all_1' into 'myvariant_current_3'.

    The function is a no-op while ``test`` is true (the default): it
    returns ``None`` before importing or touching anything.

    When ``createidx`` is true the target index is first created with 10
    shards and given the 'variant' mapping returned by
    ``mapping.get_mapping()``.  Documents are then pulled from the source
    index through ``ESIndexer.doc_feeder`` and handed to ``do_index`` in
    batches of 10000 with ``update=True``.

    ``es`` and ``do_index`` are not defined here; they are expected to be
    available in the enclosing module.
    """
    if test:
        return
    from utils.es import ESIndexer
    from utils.common import iter_n

    target_index = 'myvariant_current_3'
    batch_size = 10000

    if createidx:
        from mapping import get_mapping
        variant_mapping = get_mapping()
        index_settings = {'settings': {'number_of_shards': 10}}
        es.indices.create(target_index, body=index_settings)
        es.indices.put_mapping(index=target_index,
                               doc_type='variant',
                               body=variant_mapping)

    # Alternative left disabled in the original code:
    # helpers.reindex(es, source_index='myvariant_all',
    #                 target_index= new_idx, chunk_size=10000)
    source_docs = ESIndexer().doc_feeder(index='myvariant_all_1',
                                         doc_type='variant',
                                         step=batch_size)
    index_options = dict(index_name=target_index,
                         doc_type='variant',
                         step=batch_size,
                         verbose=False,
                         update=True)
    for chunk in iter_n(source_docs, batch_size):
        do_index(chunk, **index_options)
Exemplo n.º 2
0
    def doc_iterator(self, genedoc_d, batch=True, step=10000, validate=True):
        """Yield documents from ``genedoc_d``, one by one or in batches.

        :param genedoc_d: either a generator of documents (only supported
            together with ``batch=True``) or a mapping of ``_id -> doc``.
        :param batch: when true, yield lists of up to ``step`` documents;
            otherwise yield documents one at a time.
        :param step: batch size.
        :param validate: when true, call ``validate()`` on each document
            built from a mapping entry.

        For a mapping, each ``doc`` gets its key stored under ``'_id'``
        (the input dict is modified in place) and is then loaded into a
        cleared shallow copy of ``self``, so ``self`` is presumably a
        dict-like document class -- confirm against the enclosing class.

        Empty batches are never emitted from the mapping branch: an empty
        mapping yields nothing, and a document count that is an exact
        multiple of ``step`` no longer ends with a trailing ``[]``.

        NOTE(review): a generator passed with ``batch=False`` falls through
        to the mapping branch and fails on ``.items()``.
        """
        if isinstance(genedoc_d, types.GeneratorType) and batch:
            for doc_li in iter_n(genedoc_d, n=step):
                yield doc_li
            return

        doc_li = []
        for i, (_id, doc) in enumerate(genedoc_d.items(), 1):
            doc['_id'] = _id
            # Reuse self's type by copying it, then replace its contents.
            _doc = copy.copy(self)
            _doc.clear()
            _doc.update(doc)
            if validate:
                _doc.validate()
            if not batch:
                yield _doc
                continue
            doc_li.append(_doc)
            if i % step == 0:
                yield doc_li
                doc_li = []

        # Flush the remainder only if there is one; an unconditional yield
        # here used to hand callers a spurious empty batch.
        if batch and doc_li:
            yield doc_li
Exemplo n.º 3
0
 def _add_docs(ids):
     """Copy the docs for ``ids`` from ``src`` into ``target`` chunk by chunk.

     ``step``, ``src``, ``target``, ``_timestamp``, ``iter_n`` and
     ``timesofar`` are taken from the enclosing scope.  Every fetched doc
     gets ``_timestamp`` stored under ``'_timestamp'`` before the chunk is
     inserted.  After each chunk a line is printed with the running number
     of docs handled and ``timesofar`` of that chunk's start time.
     """
     copied = 0
     for id_chunk in iter_n(ids, step):
         chunk_started = time.time()
         fetched = src.mget_from_ids(id_chunk)
         # enumerate continues the running total across chunks
         for copied, doc in enumerate(fetched, copied + 1):
             doc['_timestamp'] = _timestamp
         target.insert(fetched)
         print('\t{}\t{}'.format(copied, timesofar(chunk_started)))
Exemplo n.º 4
0
def verify_ids(doc_iter, step=100000, index=None, doc_type=None):
    '''Report how many docs from the input iterator/list already exist in the index.

    The ``_id`` of every doc in ``doc_iter`` is looked up with an ``ids``
    query, ``step`` docs per request.  ``index`` and ``doc_type`` fall back
    to ``config.ES_INDEX_NAME`` and ``config.ES_DOC_TYPE`` when falsy.
    ``es`` is not defined here; it is expected to exist at module level.

    After each request the batch's hit total, the cumulative hit total and
    the cumulative number of ids checked are printed.  Returns the list of
    ``_id`` values found in the ``hits`` of the responses.

    NOTE(review): no ``size`` is passed to ``es.search``, so the number of
    hits returned per request is left to the server default; the returned
    list may therefore be shorter than the printed totals -- confirm.
    '''
    if not index:
        index = config.ES_INDEX_NAME
    if not doc_type:
        doc_type = config.ES_DOC_TYPE

    checked_total = 0
    matched_total = 0
    matched_ids = []
    for chunk in iter_n(doc_iter, n=step):
        chunk_ids = [doc['_id'] for doc in chunk]
        query = {'query': {'ids': {"values": chunk_ids}}}
        hits = es.search(index=index, doc_type=doc_type,
                         body=query, _source=False)['hits']
        matched_total += hits['total']
        checked_total += len(chunk_ids)
        print(hits['total'], matched_total, checked_total)
        matched_ids.extend(hit['_id'] for hit in hits['hits'])
    return matched_ids
Exemplo n.º 5
0
def verify_ids(doc_iter, step=100000, index=None, doc_type=None):
    '''Report how many docs from the input iterator/list already exist in the index.

    A client is obtained from ``get_es()`` and the ``_id`` of every doc in
    ``doc_iter`` is looked up with an ``ids`` query, ``step`` docs per
    request.  ``index`` and ``doc_type`` fall back to
    ``config.ES_INDEX_NAME`` and ``config.ES_DOC_TYPE`` when falsy.

    After each request the batch's hit total, the cumulative hit total and
    the cumulative number of ids checked are printed.  Returns the list of
    ``_id`` values found in the ``hits`` of the responses.

    NOTE(review): no ``size`` is passed to ``es.search``, so the number of
    hits returned per request is left to the server default; the returned
    list may therefore be shorter than the printed totals -- confirm.
    '''
    if not index:
        index = config.ES_INDEX_NAME
    if not doc_type:
        doc_type = config.ES_DOC_TYPE
    client = get_es()

    checked_total = 0
    matched_total = 0
    matched_ids = []
    for chunk in iter_n(doc_iter, n=step):
        chunk_ids = [doc['_id'] for doc in chunk]
        query = {'query': {'ids': {"values": chunk_ids}}}
        hits = client.search(index=index, doc_type=doc_type,
                             body=query, _source=False)['hits']
        matched_total += hits['total']
        checked_total += len(chunk_ids)
        print(hits['total'], matched_total, checked_total)
        matched_ids.extend(hit['_id'] for hit in hits['hits'])
    return matched_ids
Exemplo n.º 6
0
def clone_index(createidx=False, test=True):
    """Push the 'variant' docs of 'myvariant_all_1' into 'myvariant_current_3'.

    Returns immediately (``None``) while ``test`` is true, which is the
    default, so nothing is imported or contacted in that case.

    With ``createidx`` set, the destination index is created with 10 shards
    and the 'variant' mapping from ``mapping.get_mapping()`` is installed
    before any document is sent.  Source documents come from
    ``ESIndexer.doc_feeder`` and go to ``do_index`` 10000 at a time with
    ``update=True``.

    ``es`` and ``do_index`` must be provided by the enclosing module.
    """
    if test:
        return
    from utils.es import ESIndexer
    from utils.common import iter_n

    destination = 'myvariant_current_3'
    chunk_size = 10000

    if createidx:
        from mapping import get_mapping
        mapping_body = get_mapping()
        es.indices.create(destination,
                          body={'settings': {'number_of_shards': 10}})
        es.indices.put_mapping(index=destination, doc_type='variant', body=mapping_body)

    # Alternative left disabled in the original code:
    # helpers.reindex(es, source_index='myvariant_all',
    #                 target_index= new_idx, chunk_size=10000)
    indexer = ESIndexer()
    feeder = indexer.doc_feeder(index='myvariant_all_1', doc_type='variant', step=chunk_size)

    for docs in iter_n(feeder, chunk_size):
        do_index(docs,
                 index_name=destination,
                 doc_type='variant',
                 step=chunk_size,
                 verbose=False,
                 update=True)
Exemplo n.º 7
0
    def apply_changes(self, changes):
        """Apply an add / delete / update change set to the target collection.

        ``changes`` is read as a mapping with these keys:

        - ``'source'``: name used to look up the source collection in
          ``self._db``
        - ``'timestamp'``: value stored as ``'_timestamp'`` on every added
          doc and passed as ``extra`` on every update
        - ``'add'``: ids fetched from the source and inserted into the
          target, ``self.step`` ids at a time
        - ``'delete'``: ids removed from the target
        - ``'update'``: diffs passed one by one to ``target.update_diff``

        Each non-empty section is processed in that order and its elapsed
        time is logged; updates also log progress every ``self.step`` diffs.
        """
        step = self.step
        target_collection = self._target_col
        src = GeneDocMongoDBBackend(self._db[changes['source']])
        target = GeneDocMongoDBBackend(target_collection)
        stamp = changes['timestamp']

        overall_start = time.time()

        ids_to_add = changes['add']
        if ids_to_add:
            logging.info("Adding {} new docs...".format(len(ids_to_add)))
            section_start = time.time()
            for id_chunk in iter_n(ids_to_add, step):
                docs = src.mget_from_ids(id_chunk)
                for doc in docs:
                    doc['_timestamp'] = stamp
                target.insert(docs)
            logging.info("done. [{}]".format(timesofar(section_start)))

        ids_to_delete = changes['delete']
        if ids_to_delete:
            logging.info("Deleting {} discontinued docs...".format(len(ids_to_delete)))
            section_start = time.time()
            target.remove_from_ids(ids_to_delete, step=step)
            logging.info("done. [{}]".format(timesofar(section_start)))

        diffs = changes['update']
        if diffs:
            logging.info("Updating {} existing docs...".format(len(diffs)))
            section_start = time.time()
            lap_start = time.time()
            for count, diff in enumerate(diffs, 1):
                target.update_diff(diff, extra={'_timestamp': stamp})
                # progress line after every full `step` diffs
                if count > 1 and count % step == 0:
                    logging.info('\t{}\t{}'.format(count, timesofar(lap_start)))
                    lap_start = time.time()
            logging.info("done. [{}]".format(timesofar(section_start)))

        logging.info("\n")
        logging.info("Finished. %s" % timesofar(overall_start))
Exemplo n.º 8
0
    def apply_changes(self, changes):
        """Bring the target collection in line with a computed change set.

        Keys read from ``changes``:

        - ``'source'``: name of the source collection inside ``self._db``
        - ``'timestamp'``: written as ``'_timestamp'`` to every added doc
          and sent as ``extra`` with every update
        - ``'add'``: ids whose docs are fetched from the source and
          inserted into the target in chunks of ``self.step``
        - ``'delete'``: ids removed from the target
        - ``'update'``: diffs applied individually via
          ``target.update_diff``

        Sections that are empty are skipped.  Timing for each section and
        for the whole run is written through ``logging.info``; the update
        section additionally logs once per ``self.step`` diffs.
        """
        step = self.step
        target_col = self._target_col
        src = GeneDocMongoDBBackend(self._db[changes['source']])
        target = GeneDocMongoDBBackend(target_col)
        ts_value = changes['timestamp']

        run_started = time.time()

        additions = changes['add']
        if additions:
            logging.info("Adding {} new docs...".format(len(additions)))
            phase_started = time.time()
            for batch_ids in iter_n(additions, step):
                fetched = src.mget_from_ids(batch_ids)
                for item in fetched:
                    item['_timestamp'] = ts_value
                target.insert(fetched)
            logging.info("done. [{}]".format(timesofar(phase_started)))

        removals = changes['delete']
        if removals:
            logging.info("Deleting {} discontinued docs...".format(len(removals)))
            phase_started = time.time()
            target.remove_from_ids(removals, step=step)
            logging.info("done. [{}]".format(timesofar(phase_started)))

        updates = changes['update']
        if updates:
            logging.info("Updating {} existing docs...".format(len(updates)))
            phase_started = time.time()
            lap_started = time.time()
            for done, one_diff in enumerate(updates, 1):
                target.update_diff(one_diff, extra={'_timestamp': ts_value})
                if done <= 1 or done % step:
                    continue
                # a full `step` of diffs has been applied since the last lap
                logging.info('\t{}\t{}'.format(done, timesofar(lap_started)))
                lap_started = time.time()
            logging.info("done. [{}]".format(timesofar(phase_started)))

        logging.info("\n")
        logging.info("Finished. %s" % timesofar(run_started))