Exemplo n.º 1
0
async def import_all(folder):
    """Import every exported JSON document found in ``folder``.

    Files whose name starts with ``lazo.`` are written through
    ``add_dataset_to_lazo_storage()``; every other file is treated as a
    dataset document, which is deleted from the index and then added again.
    All dataset documents are imported before any Lazo document.

    A Lazo client is only created when ``LAZO_SERVER_HOST`` is set in the
    environment (``LAZO_SERVER_PORT`` is then read as well); otherwise
    ``None`` is passed to ``delete_dataset_from_index()``.

    Each write is retried once, 10 seconds after an
    ``elasticsearch.TransportError``; a second failure propagates to the
    caller. Progress is printed to standard output.

    :param folder: Path of the directory containing the exported files.
    :raises AssertionError: If the part of a Lazo document's ``_id`` before
        the first ``__.__`` differs from the dataset ID decoded from its
        file name.
    """
    # Local import so this function does not rely on a module-level import
    import asyncio

    es = PrefixedElasticsearch()
    if 'LAZO_SERVER_HOST' in os.environ:
        lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT']))
    else:
        lazo_client = None

    # Split the directory listing up front so progress can be reported
    # against the total of each kind
    dataset_docs = []
    lazo_docs = []
    for name in os.listdir(folder):
        if name.startswith('lazo.'):
            lazo_docs.append(name)
        else:
            dataset_docs.append(name)

    for i, name in enumerate(dataset_docs):
        if i % 50 == 0:
            print(
                "\nImporting to Elasticsearch, %d/%d" % (i, len(dataset_docs)),
                flush=True,
            )
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)

        dataset_id = decode_dataset_id(name)
        try:
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            # If writing can't keep up, needs a real break. Sleep
            # cooperatively: time.sleep() would freeze the whole event loop
            # for the duration of the pause.
            await asyncio.sleep(10)
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        print('.', end='', flush=True)

    for i, name in enumerate(lazo_docs):
        if i % 500 == 0:
            print(
                "\nImporting to Lazo, %d/%d" % (i, len(lazo_docs)),
                flush=True,
            )
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)

        # name[5:] strips the 'lazo.' prefix; rsplit() then drops whatever
        # follows the last '.' of the decoded name
        dataset_id = decode_dataset_id(name[5:]).rsplit('.', 1)[0]
        # The document ID is stored inside the file, not in the body to index
        lazo_es_id = obj.pop('_id')
        assert lazo_es_id.split('__.__')[0] == dataset_id
        try:
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            # Same cooperative pause as for dataset documents
            await asyncio.sleep(10)
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        # Only one progress dot per 10 Lazo documents
        if i % 10 == 0:
            print('.', end='', flush=True)
Exemplo n.º 2
0
async def import_all(folder):
    """Publish the metadata of every exported dataset in ``folder`` to AMQP.

    Connects to the broker described by the ``AMQP_HOST``, ``AMQP_PORT``,
    ``AMQP_USER`` and ``AMQP_PASSWORD`` environment variables, declares the
    fanout exchange ``profile``, and publishes one message per file whose
    name does not start with ``lazo.``. Each message carries the dataset ID
    decoded from the file name and a metadata dict built from the file's
    JSON content. One ``.`` is printed per published message.

    The connection is closed when the function returns or raises.

    :param folder: Path of the directory containing the exported files.
    :raises KeyError: If a document lacks ``name`` or ``materialize``, or if
        one of the AMQP environment variables is unset.
    """
    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    # Leaving the block closes the connection, also on error, instead of
    # leaking it until the process exits
    async with amqp_conn:
        amqp_chan = await amqp_conn.channel()
        amqp_profile_exchange = await amqp_chan.declare_exchange(
            'profile',
            aio_pika.ExchangeType.FANOUT,
        )

        for name in os.listdir(folder):
            if name.startswith('lazo.'):
                continue
            dataset_id = decode_dataset_id(name)
            path = os.path.join(folder, name)
            with open(path, 'r') as fp:
                obj = json.load(fp)
            metadata = dict(name=obj['name'],
                            materialize=obj['materialize'],
                            source=obj.get('source', 'unknown'))
            # Optional fields are only forwarded when present and non-empty
            for key in ('description', 'date', 'manual_annotations'):
                if obj.get(key):
                    metadata[key] = obj[key]
            await amqp_profile_exchange.publish(
                json2msg(dict(id=dataset_id, metadata=metadata)),
                '',
            )
            print('.', end='', flush=True)
Exemplo n.º 3
0
async def import_all(folder):
    """Import exported documents from ``folder`` and announce the datasets.

    Files whose name does not start with ``lazo.`` are dataset documents:
    each is deleted from the index, added again, and then published to the
    topic exchange ``datasets`` with the dataset ID as routing key. Files
    whose name starts with ``lazo.`` are afterwards written through
    ``add_dataset_to_lazo_storage()``.

    Configuration is read from the environment: ``ELASTICSEARCH_HOSTS``
    (comma-separated), ``AMQP_HOST``, ``AMQP_PORT``, ``AMQP_USER``,
    ``AMQP_PASSWORD`` and, optionally, ``LAZO_SERVER_HOST`` with
    ``LAZO_SERVER_PORT``. Without ``LAZO_SERVER_HOST``, ``None`` is passed as
    the Lazo client.

    Each Elasticsearch write is retried once, 10 seconds after an
    ``elasticsearch.TransportError``; a second failure propagates. The AMQP
    connection is closed when the function returns or raises.

    :param folder: Path of the directory containing the exported files.
    :raises AssertionError: If the part of a Lazo document's ``_id`` before
        the first ``__.__`` differs from the dataset ID decoded from its
        file name.
    """
    # Local import so this function does not rely on a module-level import
    import asyncio

    es = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(',')
    )
    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    # Leaving the block closes the connection, also on error, instead of
    # leaking it until the process exits
    async with amqp_conn:
        amqp_chan = await amqp_conn.channel()
        amqp_datasets_exchange = await amqp_chan.declare_exchange(
            'datasets',
            aio_pika.ExchangeType.TOPIC,
        )
        if 'LAZO_SERVER_HOST' in os.environ:
            lazo_client = lazo_index_service.LazoIndexClient(
                host=os.environ['LAZO_SERVER_HOST'],
                port=int(os.environ['LAZO_SERVER_PORT'])
            )
        else:
            lazo_client = None

        print("Importing Elasticsearch data", end='', flush=True)
        for name in os.listdir(folder):
            if name.startswith('lazo.'):
                continue
            path = os.path.join(folder, name)
            with open(path, 'r') as fp:
                obj = json.load(fp)

            dataset_id = decode_dataset_id(name)
            try:
                delete_dataset_from_index(es, dataset_id, lazo_client)
                add_dataset_to_index(es, dataset_id, obj)
            except elasticsearch.TransportError:
                print('X', end='', flush=True)
                # If writing can't keep up, needs a real break. Sleep
                # cooperatively: time.sleep() would freeze the event loop,
                # and with it the AMQP connection's I/O, for 10 seconds.
                await asyncio.sleep(10)
                delete_dataset_from_index(es, dataset_id, lazo_client)
                add_dataset_to_index(es, dataset_id, obj)
            # Announce the document, with its ID merged in, under the
            # dataset ID as routing key
            await amqp_datasets_exchange.publish(
                json2msg(dict(obj, id=dataset_id)),
                dataset_id,
            )
            print('.', end='', flush=True)

        print("Importing Lazo data", end='', flush=True)
        for name in os.listdir(folder):
            if not name.startswith('lazo.'):
                continue
            path = os.path.join(folder, name)
            with open(path, 'r') as fp:
                obj = json.load(fp)

            # name[5:] strips the 'lazo.' prefix; rsplit() then drops
            # whatever follows the last '.' of the decoded name
            dataset_id = decode_dataset_id(name[5:]).rsplit('.', 1)[0]
            # The document ID is stored inside the file, not in the body
            lazo_es_id = obj.pop('_id')
            assert lazo_es_id.split('__.__')[0] == dataset_id
            try:
                add_dataset_to_lazo_storage(es, lazo_es_id, obj)
            except elasticsearch.TransportError:
                print('X', end='', flush=True)
                # Same cooperative pause as for dataset documents
                await asyncio.sleep(10)
                add_dataset_to_lazo_storage(es, lazo_es_id, obj)
            print('.', end='', flush=True)
Exemplo n.º 4
0
 def test_decode(self):
     """Check that an encoded file name decodes to the expected dataset ID."""
     encoded_name = 'datamart__contrived_2Fdataset_23id_3B'
     expected_id = 'datamart_contrived/dataset#id;'
     decoded_id = common.decode_dataset_id(encoded_name)
     self.assertEqual(decoded_id, expected_id)