def import_populate_db():
    es_host = app.config['ELASTICSEARCH']
    es_idx = app.config['ES_IDX']

    es = Elasticsearch(f'{es_host}:9200')
    idx = IndicesClient(es)
    if idx.exists(index=es_idx):
        existing_docs = es.count(index=es_idx).get('count')
        if existing_docs:
            return existing_docs, 0

        app.logger.info("Dropping and recreating index and mapping definition")
        idx.delete(index=es_idx)

    # (Re)create the index so the mapping definition is applied before indexing.
    idx.create(index=es_idx, body=mapping)

    app.logger.info("Populating elasticsearch with documents")
    errors = []
    for pdv in pdvs:
        # Keep the original id for error reporting before stripping it from the body.
        pdv_id = pdv.pop('id', None)
        document_id = ''.join(filter(str.isdigit, pdv.get('document')))
        pdv['document'] = document_id
        try:
            es.index(index=es_idx, body=pdv, id=document_id)
        except Exception as ex:
            app.logger.exception(ex)
            errors.append({'id': pdv_id, 'description': ex.args})
    inserted = len(pdvs) - len(errors)
    return errors, inserted


class ElasticIndiceDriver:
    def __init__(self, client: Elasticsearch):
        self.client = IndicesClient(client)

    def create_index(self, index: str, mapping: dict):
        self.client.create(index=index, body=mapping)

    def clean_index(self, index: str):
        self.client.delete(index=index)
        self.client.delete(index=f'{index}-finished')
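
A minimal usage sketch for the driver above (the local host URL, index name, and mapping are illustrative assumptions, not taken from the original project):

es = Elasticsearch("http://localhost:9200")
driver = ElasticIndiceDriver(es)
driver.create_index("pdvs", {"mappings": {"properties": {"document": {"type": "keyword"}}}})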
Example #3
    @classmethod
    def create(cls, user, **kwargs):
        """Create user index."""
        # Create an index for this user, optionally replacing an existing one.
        client = Elasticsearch(cls.__url__)
        indice = IndicesClient(client)
        if indice.exists(index=user.user_id):
            if kwargs.get('delete_existing'):
                log.warning('Deleting existing index for user %s', user.user_id)
                indice.delete(index=user.user_id)
            else:
                log.warning('Index already exists for user %s', user.user_id)
                return False
        log.info('Creating index for user %s', user.user_id)
        indice.create(index=user.user_id)
        return True
    def recreate_index_model(self, model: Union[type[Gallery], type[Archive]]):

        from elasticsearch.client.indices import IndicesClient

        indices_client = IndicesClient(client=self.es_client)
        index_name = model._meta.es_index_name  # type: ignore
        if indices_client.exists(index=index_name):
            indices_client.delete(index=index_name)
        indices_client.create(index=index_name)
        indices_client.close(index=index_name)
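        # Analysis settings are not dynamic: the index must be closed before
        # put_settings can change them, and reopened afterwards.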
        indices_client.put_settings(
            index=index_name,
            body={
                "index": {
                    "max_result_window": settings.MAX_RESULT_WINDOW
                },
                "analysis": {
                    "filter": {
                        "edge_ngram_filter": {
                            "type": "edge_ngram",
                            "min_gram": 2,
                            "max_gram": 20
                        }
                    },
                    "analyzer": {
                        "edge_ngram_analyzer": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter": ["lowercase", "edge_ngram_filter"]
                        }
                    }
                }
            })
        indices_client.put_mapping(
            body=model._meta.es_mapping,  # type: ignore
            index=index_name,
        )
        indices_client.open(index=index_name)
Example #6
def initialize_elastic_search() -> Tuple[Elasticsearch, IndicesClient]:
    elastic_search = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])
    indices_client = IndicesClient(client=elastic_search)
    try:
        indices_client.create(
            index=INDEX,
            body={
                "mappings": {
                    "properties": {
                        "doc": {
                            "type": "text"
                        },
                        "vector": {
                            "type": "dense_vector",
                            "dims": 768
                        },
                    }
                }
            },
        )
    except RequestError:
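        # The index probably already exists; treat the RequestError as a no-op.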
        pass

    return elastic_search, indices_client
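
A short usage sketch for the helper above (the document text and the zero vector are placeholders; INDEX must already be defined in the module):

es, indices = initialize_elastic_search()
es.index(index=INDEX, body={"doc": "example text", "vector": [0.0] * 768})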
def create_index(esconn, index_name, data_file, shard_count):
    index = IndicesClient(esconn)
    try:
        # Load the index settings/mappings definition from file.
        with open(data_file) as index_json:
            json_body = json.loads(index_json.read())
        # Work out number of shards == no. of data nodes x 2
        print("Setting Index Shard Count to: " + str(shard_count))
        # Update json doc
        json_body["settings"]["index"]["number_of_shards"] = shard_count
        # For single node clusters (shard_count will be 2) - no replicas possible
        if shard_count == 2:
            print("Single node cluster detected - disabling replicas")
            json_body["settings"]["index"]["number_of_replicas"] = 0
        # Create Index and apply any settings & mappings
        idx = index.create(
            index=index_name,
            body=json_body
        )
        if not idx['acknowledged']:
            raise ES_INDEX_ERROR('Failed to create Index. Response: ', idx)
        print("SUCCESS: Created Index: " + index_name)
    except Exception as ex:
        raise ES_PIPELINE_ERROR(ex)
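
A possible invocation of create_index (the host, index name, settings file, and shard count are illustrative assumptions):

es = Elasticsearch(hosts=["localhost:9200"])
create_index(es, "my-index", "index-settings.json", shard_count=4)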
Example #8
def main():
    c_parser = configparser.ConfigParser()
    c_parser.read("config.ini")
    es_config = c_parser["ELASTIC"]
    gtfs_config = c_parser["GTFS"]
    gtfs_path = gtfs_config["gtfs_path"]
    index_prefix = es_config["index_prefix"]
    stops_index = index_prefix + "_stops"
    shapes_index = index_prefix + "_shapes"
    stop_times_index = index_prefix + "_stop_times"
    es = Elasticsearch(
        host=es_config["host"],
        scheme=es_config["scheme"],
        port=es_config.getint("port"),
        http_auth=(es_config["username"], es_config["password"]),
        use_ssl=es_config.getboolean("use_ssl"),
        verify_certs=es_config.getboolean("verify_certs"),
        ca_certs=certifi.where())
    
    with open("mappings/shapes.json", 'r' ) as shapes_mapping_file:
        shapes_mapping = shapes_mapping_file.read()
    
    with open("mappings/stops.json", 'r' ) as stops_mapping_file:
        stops_mapping = stops_mapping_file.read()

    with open("mappings/stop_times.json", 'r') as stop_times_file:
        stop_times_mapping = stop_times_file.read()
    
    indices = IndicesClient(es)
    indices.create(stops_index, body=stops_mapping)
    indices.create(shapes_index, body=shapes_mapping)
    indices.create(stop_times_index, body=stop_times_mapping)
    all_stops = gather_stops(gtfs_path)
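    # Bulk-index the stops; parallel_bulk yields an (ok, item) pair per action.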
    for ok, item in parallel_bulk(es, genbulkactions(stops_index, all_stops.values()), chunk_size=500):
        if not ok:
            print(item)
    
    print("Done with stops")

    all_shapes = gather_shapes(gtfs_path)
    all_trips = gather_trips(gtfs_path)
    all_routes = gather_routes(gtfs_path)
    shapes_to_route = shape_to_route_dict(all_trips.values(), all_routes)
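    # Attach route details to each shape and drop the sequencing helpers before indexing.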
    for shape_id in shapes_to_route.keys():
        all_shapes[shape_id]['route'] = shapes_to_route[shape_id]
        all_shapes[shape_id].pop('start_seq', None)
        all_shapes[shape_id].pop('finish_seq', None)

    for ok, item in parallel_bulk(es, genbulkactions(shapes_index, all_shapes.values()), chunk_size=500):
        if not ok:
            print(item)
    
    print("Done with shapes")
    for trip in all_trips.values():
        route_id = trip.pop("route_id", None)
        if route_id:
            trip['route'] = all_routes[int(route_id)]

    all_stop_times = gather_stop_times(gtfs_path)
    for stop_time in all_stop_times:
        trip_id = stop_time.pop("trip_id", None)
        stop_id = stop_time.pop("stop_id", None)
        if trip_id:
            stop_time['trip'] = all_trips[int(trip_id)]
        if stop_id:
            stop_time['stop'] = all_stops[int(stop_id)]

    for ok, item in parallel_bulk(es, genbulkactions(stop_times_index, all_stop_times), chunk_size=1000):
        if not ok:
            print(item) 

    print("Done with stop times")