Example #1
def insert_documents_to_index(documents, an, index):
    client = Elasticsearch()
    idx = Index(index, using=client)
    if idx.exists():
        idx.delete()

    idx.settings(number_of_shards=1)
    idx.create()

    idx = Index(index, using=client)
    idx.close()  # the index must be closed before analysis settings can change
    idx.analyzer(an)

    client.indices.put_mapping(
        doc_type='document',
        index=index,
        body={'document': {
            'properties': {
                'path': {
                    'type': 'keyword'
                }
            }
        }})

    idx.save()
    idx.open()

    print('Index settings=', idx.get_settings())
    print('Indexing ...')
    bulk(client, documents)
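
Note: the documents argument is handed straight to the bulk() helper, so it can be any iterable of action dicts. A minimal sketch of one way to build it; the generator and its field names are assumptions chosen to match the 'path' keyword mapping above:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def make_actions(paths, index_name):
    # Hypothetical generator of bulk() actions; '_type' matches the
    # pre-7.x 'document' mapping type used above.
    for path in paths:
        yield {
            '_index': index_name,
            '_type': 'document',
            '_source': {
                'path': path,
                'content': open(path, encoding='utf-8').read(),
            },
        }

# insert_documents_to_index(make_actions(['a.txt'], 'docs'), my_analyzer, 'docs')
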
Example #2
def _init_index(index_config, force):
    index = Index(index_config['name'])
    aliases = {}
    for alias_val in index_config['alias']:
        if isinstance(alias_val, str):  # 'basestring' is Python 2-only
            aliases[alias_val] = {}
        else:
            aliases[alias_val['name']] = alias_val['config']
    index.aliases(**aliases)
    if force:
        index.delete(ignore=404)
    try:
        index.create()
    except TransportError as err:
        if err.status_code == 400:  # resource_already_exists_exception
            logger.debug('Index already exists, initializing document')
        else:
            raise
    index.close()

    for document_config in index_config['documents']:
        module_str, class_str = document_config['class'].rsplit('.', 1)
        module = import_module(module_str)
        cls = getattr(module, class_str)
        index.doc_type(cls)
        cls.init()
    index.open()

    return index
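
A sketch of the index_config shape this function appears to expect; every value below is hypothetical:

index_config = {
    'name': 'articles-v1',
    'alias': [
        'articles',                               # plain alias
        {'name': 'articles-ro', 'config': {}},    # alias with extra options
    ],
    'documents': [
        'myapp.search.documents.ArticleDocument',  # dotted path to a doc class
    ],
}
# index = _init_index(index_config, force=False)
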
Example #3
def insert_document_to_index(documents, text_an, index, keep):
    client = Elasticsearch()

    idx = Index(index, using=client)

    if idx.exists() and not keep:
        print('Removing existing index...')
        idx.delete()

    if not idx.exists():
        print('Creating index')
        idx.create()

    idx.close()
    idx.analyzer(text_an)

    client.indices.put_mapping(
        doc_type='document',
        index=index,
        body={'document': {
            'properties': {
                'path': {
                    'type': 'keyword'
                }
            }
        }})

    idx.save()
    idx.open()

    print("Index settings=", idx.get_settings())
    print('Indexing ...')
    bulk(client, documents)
Example #4
def close_index_with_pk(dic_pk):
    index_name = get_index_name_with_pk(dic_pk)
    if index_name == 'mdict-':
        print(dic_pk, 'not exists')
        return
    index = Index(index_name)
    try:
        index.close()
    except NotFoundError as e:
        print(e)
    print('close', dic_pk, index_name)
Example #5
    def init_models(models):
        """
        Init a model and create the index if not existing
        """
        for _, model_obj in models.items():

            # TO BE FIXED waiting for the following issue
            # https://github.com/elastic/elasticsearch-dsl-py/pull/272
            i = Index(model_obj._doc_type.index)
            if i.exists():
                i.close()
            model_obj.init()
            i.open()
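
For context, a hedged sketch of a model this helper could receive, assuming the older elasticsearch-dsl DocType/Meta API implied by _doc_type.index; all names are placeholders:

from elasticsearch_dsl import DocType, Keyword, Text

class Article(DocType):
    title = Text()
    slug = Keyword()

    class Meta:
        # read back by init_models() as Article._doc_type.index
        index = 'articles'

# init_models({'article': Article})
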
Example #6
File: indices.py Project: marcinn/springy
    def initialize(self, using=None):
        """
        Initialize / update doctype
        """
        from .settings import INDEX_DEFAULTS
        meta = dict(INDEX_DEFAULTS)
        meta.update(self._meta.meta or {})

        _idx = DSLIndex(self._meta.document._doc_type.index)
        _idx.settings(**meta)

        if not _idx.exists():
            _idx.create()
        else:
            static_settings = [
                'number_of_shards', 'codec', 'routing_partition_size'
            ]
            not_updateable = [
                'number_of_shards',
            ]

            def filter_out_not_updateable(settings):
                return dict(
                    filter(lambda x: x[0] not in not_updateable,
                           settings.items()))

            idx_dict = _idx.to_dict()
            idx_settings = idx_dict.get('settings') or {}
            idx_analysis = idx_settings.pop('analysis', None) or {}
            idx_static = dict(
                map(lambda x: (x, idx_settings.pop(x)),
                    list(filter(lambda x: x in static_settings,
                                idx_settings))))

            idx_settings = filter_out_not_updateable(idx_settings)
            idx_static = filter_out_not_updateable(idx_static)

            if idx_settings:
                _idx.put_settings(body=idx_settings, preserve_existing=True)

            try:
                _idx.close()
                _idx.put_settings(body={'analysis': idx_analysis},
                                  preserve_existing=True)
                if idx_static:
                    _idx.put_settings(body=idx_static, preserve_existing=True)
            finally:
                _idx.open()

        self._meta.document.init(using=using)
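
The branching above mirrors Elasticsearch's settings rules: number_of_shards is fixed at creation, analysis can only change while the index is closed, and the remaining settings can be updated live. A hypothetical INDEX_DEFAULTS that exercises each branch:

# Hypothetical settings; keys chosen to hit each branch of initialize().
INDEX_DEFAULTS = {
    'number_of_shards': 1,       # not updateable: filtered out on update
    'number_of_replicas': 0,     # dynamic: applied to the open index
    'analysis': {                # requires the close/open cycle above
        'analyzer': {
            'default': {
                'type': 'custom',
                'tokenizer': 'standard',
                'filter': ['lowercase'],
            },
        },
    },
}
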
Example #7
    def handle(self, *args, **options):
        text_analyzer = get_text_analyzer("german")
        elastic_index = Index("mst_debug")
        if not elastic_index.exists():
            elastic_index.create()
        elastic_index.close()
        elastic_index.analyzer(text_analyzer)
        elastic_index.save()
        elastic_index.open()
        elastic_index.flush()

        for word in options["words"]:
            analysis = elastic_index.analyze(body={
                "analyzer": "text_analyzer",
                "text": word
            })
            tokens = [i["token"] for i in analysis["tokens"]]
            self.stdout.write("{} {}\n".format(word, tokens))
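
The _analyze response consumed by that loop has roughly this shape; the token value is an illustrative guess at the German analyzer's output:

analysis = {
    'tokens': [
        {'token': 'haus', 'start_offset': 0, 'end_offset': 6,
         'type': '<ALPHANUM>', 'position': 0},
    ],
}
tokens = [i['token'] for i in analysis['tokens']]  # -> ['haus']
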
Example #8
    def open_spider(self, spider):

        self.client = Elasticsearch()
        ind = Index(self.elastic_db, using=self.client)
        try:
            # Drop index if it exists
            ind.delete()
        except NotFoundError:
            pass
        # then create it
        ind.create()
        ind.close()
        # Configure tokenizer
        my_analyzer = analyzer('default', type='custom',
            tokenizer=tokenizer('standard'),
            filter=['lowercase', 'asciifolding'])
        ind.analyzer(my_analyzer)
        ind.save()
        ind.open()
Example #9
class ElasticSearchIndex:
    def __init__(self,
                 name='qb',
                 similarity='default',
                 bm25_b=None,
                 bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        self.ix.create()
        self.ix.close()
        self.ix.put_settings(
            body={
                'similarity': {
                    'qb_bm25': {
                        'type': 'BM25',
                        'b': self.bm25_b,
                        'k1': self.bm25_k1
                    }
                }
            })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build_large_docs(self,
                         documents: Dict[str, str],
                         use_wiki=True,
                         use_qb=True,
                         rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            wiki_lookup = Wikipedia()
            log.info(
                'Indexing questions and corresponding wikipedia pages as large docs...'
            )
            for page in tqdm.tqdm(documents):
                if use_wiki and page in wiki_lookup:
                    wiki_content = wiki_lookup[page].text
                else:
                    wiki_content = ''

                if use_qb:
                    qb_content = documents[page]
                else:
                    qb_content = ''

                answer = self.answer_doc(page=page,
                                         wiki_content=wiki_content,
                                         qb_content=qb_content)
                answer.save(index=self.name)

    def build_many_docs(self,
                        pages,
                        documents,
                        use_wiki=True,
                        use_qb=True,
                        rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            log.info(
                'Indexing questions and corresponding pages as many docs...')
            if use_qb:
                log.info('Indexing questions...')
                for page, doc in tqdm.tqdm(documents):
                    self.answer_doc(page=page, qb_content=doc).save()

            if use_wiki:
                log.info('Indexing wikipedia...')
                wiki_lookup = Wikipedia()
                for page in tqdm.tqdm(pages):
                    if page in wiki_lookup:
                        content = word_tokenize(wiki_lookup[page].text)
                        for i in range(0, len(content), 200):
                            chunked_content = content[i:i + 200]
                            if len(chunked_content) > 0:
                                self.answer_doc(page=page,
                                                wiki_content=' '.join(
                                                    chunked_content)).save()

    def search(self,
               text: str,
               max_n_guesses: int,
               normalize_score_by_length=False,
               wiki_boost=1,
               qb_boost=1):
        if not self.exists():
            raise ValueError(
                'The index does not exist, you must create it before searching'
            )

        if wiki_boost != 1:
            wiki_field = 'wiki_content^{}'.format(wiki_boost)
        else:
            wiki_field = 'wiki_content'

        if qb_boost != 1:
            qb_field = 'qb_content^{}'.format(qb_boost)
        else:
            qb_field = 'qb_content'

        s = Search(index=self.name)[0:max_n_guesses].query(
            'multi_match', query=text, fields=[wiki_field, qb_field])
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = len(text.split())
        else:
            query_length = 1

        for r in results:
            if r.page in guess_set:
                continue
            guess_set.add(r.page)  # record the page so duplicates are skipped
            guesses.append((r.page, r.meta.score / query_length))
        return guesses
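
A hedged usage sketch of the class above; the page name, document text, and query are placeholders, and a running Elasticsearch plus the project's helpers (create_doctype, Wikipedia) are assumed:

index = ElasticSearchIndex(name='qb', bm25_b=0.75, bm25_k1=1.2)
index.build_large_docs({'Albert_Einstein': 'question text about relativity ...'},
                       use_wiki=False)
for page, score in index.search('theory of relativity', max_n_guesses=10):
    print(page, score)   # (page, normalized score) pairs
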
Example #10
                           filter=args.filter)

    try:
        # Drop index if it exists
        ind = Index(index, using=client)
        ind.delete()
    except NotFoundError:
        pass
    # then create it
    ind.settings(number_of_shards=1)
    ind.create()

    ind = Index(index, using=client)

    # configure default analyzer
    ind.close()  # index must be closed for configuring analyzer
    ind.analyzer(my_analyzer)

    # configure the path field so it is not tokenized and we can do exact match search
    client.indices.put_mapping(
        doc_type='document',
        index=index,
        body={"document": {
            "properties": {
                "path": {
                    "type": "keyword",
                }
            }
        }})

    ind.save()
Example #11
class ElasticSearchIndex:
    def __init__(self, name='qb', similarity='default', bm25_b=None, bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        self.ix.create()
        self.ix.close()
        self.ix.put_settings(body={'similarity': {
            'qb_bm25': {'type': 'BM25', 'b': self.bm25_b, 'k1': self.bm25_k1}}
        })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build_large_docs(self, documents: Dict[str, str], use_wiki=True, use_qb=True, rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            wiki_lookup = Wikipedia()
            log.info('Indexing questions and corresponding wikipedia pages as large docs...')
            for page in tqdm.tqdm(documents):
                if use_wiki and page in wiki_lookup:
                    wiki_content = wiki_lookup[page].text
                else:
                    wiki_content = ''

                if use_qb:
                    qb_content = documents[page]
                else:
                    qb_content = ''

                answer = self.answer_doc(
                    page=page,
                    wiki_content=wiki_content, qb_content=qb_content
                )
                answer.save(index=self.name)

    def build_many_docs(self, pages, documents, use_wiki=True, use_qb=True, rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            log.info('Indexing questions and corresponding pages as many docs...')
            if use_qb:
                log.info('Indexing questions...')
                for page, doc in tqdm.tqdm(documents):
                    self.answer_doc(page=page, qb_content=doc).save()

            if use_wiki:
                log.info('Indexing wikipedia...')
                wiki_lookup = Wikipedia()
                for page in tqdm.tqdm(pages):
                    if page in wiki_lookup:
                        content = word_tokenize(wiki_lookup[page].text)
                        for i in range(0, len(content), 200):
                            chunked_content = content[i:i + 200]
                            if len(chunked_content) > 0:
                                self.answer_doc(page=page, wiki_content=' '.join(chunked_content)).save()

    def search(self, text: str, max_n_guesses: int,
               normalize_score_by_length=False,
               wiki_boost=1, qb_boost=1):
        if not self.exists():
            raise ValueError('The index does not exist, you must create it before searching')

        if wiki_boost != 1:
            wiki_field = 'wiki_content^{}'.format(wiki_boost)
        else:
            wiki_field = 'wiki_content'

        if qb_boost != 1:
            qb_field = 'qb_content^{}'.format(qb_boost)
        else:
            qb_field = 'qb_content'

        s = Search(index=self.name)[0:max_n_guesses].query(
            'multi_match', query=text, fields=[wiki_field, qb_field]
        )
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = len(text.split())
        else:
            query_length = 1

        for r in results:
            if r.page in guess_set:
                continue
            guess_set.add(r.page)  # record the page so duplicates are skipped
            guesses.append((r.page, r.meta.score / query_length))
        return guesses
Example #12
File: ir.py Project: DenisPeskov/QBASR
class IrIndex:
    def __init__(self,
                 name='qb',
                 similarity='default',
                 bm25_b=None,
                 bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        self.ix.create()
        self.ix.close()
        self.ix.put_settings(
            body={
                'similarity': {
                    'qb_bm25': {
                        'type': 'BM25',
                        'b': self.bm25_b,
                        'k1': self.bm25_k1
                    }
                }
            })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build(self,
              qb_docs: Dict[str, str],
              asr_docs: Dict[str, str],
              use_wiki=False,
              use_qb=True,
              use_asr=True,
              rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):  # pylint: disable=invalid-envvar-default
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            # wiki_lookup = Wikipedia()
            log.info('Indexing...')
            for page in tqdm.tqdm(qb_docs):
                wiki_content = ''
                # if use_wiki and page in wiki_lookup:
                #     wiki_content = wiki_lookup[page].text
                # else:
                #     wiki_content = ''

                if use_qb:
                    qb_content = qb_docs[page]
                else:
                    qb_content = ''

                if use_asr:
                    asr_content = asr_docs[page]
                else:
                    asr_content = ''

                answer = self.answer_doc(page=page,
                                         wiki_content=wiki_content,
                                         qb_content=qb_content,
                                         asr_content=asr_content)
                answer.save(index=self.name)

    def search(self,
               text: str,
               max_n_guesses: int,
               normalize_score_by_length=False):
        if not self.exists():
            raise ValueError(
                'The index does not exist, you must create it before searching'
            )

        wiki_field = 'wiki_content'
        qb_field = 'qb_content'
        asr_field = 'asr_content'

        s = Search(index=self.name)[0:max_n_guesses].query(  # pylint: disable=no-member
            'multi_match',
            query=text,
            fields=[wiki_field, qb_field, asr_field])
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = max(1, len(text.split()))
        else:
            query_length = 1

        for r in results:
            if r.page in guess_set:
                continue
            guess_set.add(r.page)  # record the page so duplicates are skipped
            guesses.append({
                'guess': r.page,
                'score': r.meta.score,
                'length': query_length
            })
        if len(guesses) == 0:
            return {'guess': '~~~NOGUESS~~~', 'score': 0, 'length': 1}
        else:
            return guesses[0]
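
Unlike Examples #9 and #11, search here returns only the single best guess as a dict. A hypothetical call, with placeholder documents:

ir = IrIndex(name='qb')
ir.build(qb_docs={'Some_Page': 'question text ...'},
         asr_docs={'Some_Page': 'asr transcript ...'})
best = ir.search('question text', max_n_guesses=5)
# -> {'guess': 'Some_Page', 'score': <float>, 'length': 1}
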
Example #13
def createIndex(isUpdate=True):
    # 5. Create the class for storing Addresses
    address_index_name = address.INDEX  # capture before 'address' is rebound below
    address = Index(address_index_name)
    address.close()

    houses = Index(houses.INDEX)

    @houses.document
    class House(Document, InnerDoc):
        houseId = Keyword()
        house_num = Keyword()
        build_num = Keyword()
        str_num = Keyword()
        postal_code = Keyword()
        ifns_fl = Keyword()
        ifns_ul = Keyword()
        counter = Keyword()

    @address.document
    class Address(Document):
        ao_guid = Keyword(required=True)
        parent_guid = Keyword()

        act_status = Integer()
        curr_status = Integer()
        live_status = Integer()
        oper_status = Integer()

        formal_name = Keyword()
        off_name = Keyword()
        short_name = Keyword()

        region_code = Keyword()
        ao_level = Keyword()

        area_code = Keyword()
        auto_code = Keyword()
        extr_code = Keyword()
        city_ar_code = Keyword()
        city_code = Keyword()
        street_code = Keyword()
        plan_code = Keyword()
        place_code = Keyword()
        sub_ext_code = Keyword()
        plain_code = Keyword()
        code = Keyword()

        okato = Keyword()
        oktmo = Keyword()

        postal_code = Keyword()
        terr_ifns_fl = Keyword()
        terr_ifns_ul = Keyword()
        ifns_fl = Keyword()
        ifns_ul = Keyword()
        norm_doc = Keyword()

        district = Keyword()
        district_type = Keyword()
        settlement = Keyword()
        settlement_type = Keyword()
        street = Keyword()
        street_type = Keyword()

        start_date = Date()
        end_date = Date()
        update_date = Date()

        street_address_suggest = Text(analyzer="autocomplete")
        full_address = Keyword()
        district_full = Keyword()
        settlement_full = Keyword()
        street_full = Keyword()

        houses = Nested(House)

        def add_house(self, house_num, build_num):
            self.houses.append(House(house_num=house_num, build_num=build_num))

        def save(self, **kwargs):
            return super().save(**kwargs)

    Address.init()

    address.open()

    queryAllStreet = {
        "query": {
            "bool": {
                "must": [{
                    "match_all": {}
                }],
                "filter": {
                    "term": {
                        "ao_level": "7"
                    }
                }
            }
        }
    }

    queryByRegion = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "region_code": {
                            "value": "13"
                        }
                    }
                }],
                "filter": {
                    "term": {
                        "ao_level": "7"
                    }
                }
            }
        }
    }

    update_date = str(fiases.fias_data.VERSION_DATE) \
        + fiases.fias_data.DATE_TIME_ZONE

    queryUpdate = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "update_date": {
                            "value": update_date
                        }
                    }
                }],
                "filter": {
                    "term": {
                        "ao_level": "7"
                    }
                }
            }
        }
    }

    # Select all streets
    if isUpdate:
        print("indexing ...")
        scanResStreet = scan(ES,
                             scroll='1h',
                             query=queryUpdate,
                             index=address_index_name)
        ADDR_UPDATE_CNT = Address.search()\
            .query("term", update_date=update_date)\
            .filter("term", ao_level="7").count()
    else:
        print("Full indexing ...")
        scanResStreet = scan(ES,
                             scroll='1h',
                             query=queryAllStreet,
                             index=address_index_name)

        ADDR_UPDATE_CNT = Address.search()\
            .query("term", ao_level="7").count()

    print("ADDR_UPDATE_CNT: ", ADDR_UPDATE_CNT)
    # Update the street_address_suggest index
    addrSearch = Address.search()
    homeSearch = House.search()

    print("address: ", Address.search().count())
    print("houses: ", House.search().count())
    houseList = []
    for address in tqdm(scanResStreet,
                        unit=' address',
                        desc='indexed',
                        total=ADDR_UPDATE_CNT):
        # source = address['_source']
        # Get the street
        street = Address.get(address['_id'])

        try:
            # Find the city
            city = addrSearch.query("match",
                                    ao_guid=street.parent_guid).execute()[0]

            if (not city.parent_guid):
                # For Moscow, St. Petersburg and Sevastopol the region equals the city.
                district = city
            else:
                # Find the region
                district = addrSearch.query(
                    "match", ao_guid=city.parent_guid).execute()[0]
        except Exception:
            print()
            print("Indexing error: ")
            print("city:", city)
            print()
            print("city.parent_guid:", city.parent_guid == False)
            print(address['_id'])
            print(street.short_name + "." + street.off_name.lower().strip() +
                  ", " + city.short_name + "." +
                  city.off_name.lower().strip() + ", " +
                  district.short_name.lower().strip() + "." +
                  district.off_name.lower().strip())
            print()
            continue
        else:
            houses = homeSearch.filter("term", ao_guid=street.ao_guid)
            for house in houses.scan():
                houseList.append(house)
        try:
            if (street.postal_code):
                postal_code = street.postal_code + ', '
            else:
                postal_code = ''
            street.update(
                street_type=street.short_name.strip(),
                street=street.off_name.strip(),
                settlement=city.off_name.strip(),
                settlement_type=city.short_name.strip(),
                district=district.off_name.strip(),
                district_type=district.short_name.strip(),
                street_address_suggest=district.off_name.lower().strip() +
                " " + city.off_name.lower().strip() + " " +
                street.off_name.lower().strip(),
                full_address=postal_code + district.short_name + ' ' +
                district.off_name + ', ' + city.short_name + ' ' +
                city.off_name + ', ' + street.short_name + ' ' +
                street.off_name,
                houses=houseList)
            houseList[:] = []
        except Exception:
            print(house)
    print("finish")
    return ADDR_UPDATE_CNT