Example #1
def setup_index(year):
    index = Index(f'{INDEX_NAME}-{year}')
    index.settings(number_of_shards=2, number_of_replicas=0)
    index.aliases(politicians={})
    index.document(Politicians)
    index.analyzer(brazilian_analyzer)
    index.create()
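
Example #1 relies on an INDEX_NAME constant, a Politicians document class and a brazilian_analyzer defined elsewhere in that project. A minimal sketch of what those pieces could look like (every name and value below is an assumption for illustration, not the original project's code):

from elasticsearch_dsl import Date, Document, Keyword, Text, analyzer, token_filter

INDEX_NAME = 'politicians'  # hypothetical value

# Custom analyzer assembled from standard Elasticsearch building blocks for Portuguese text.
brazilian_analyzer = analyzer(
    'brazilian_analyzer',
    tokenizer='standard',
    filter=['lowercase',
            token_filter('brazilian_stemmer', type='stemmer', language='brazilian')],
)

class Politicians(Document):
    name = Text(analyzer=brazilian_analyzer)
    party = Keyword()
    elected_at = Date()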
Example #2
class BaseSearchTestCase(TestCase):

    def setUp(self):
        from django.conf import settings
        SEARCH = getattr(settings, 'SEARCH')

        connections.create_connection('testing', **SEARCH['default']['connections'])
        self.index = Index(SEARCH['default']['index'], using='testing')
        # This is needed for test_documents, but has side effects in all running tests
        doctypes_list = (
            value for name, value
            in inspect.getmembers(documents)
            if not name.startswith('_') and
            inspect.isclass(value) and
            issubclass(value, DocType) and
            name != DocType.__name__
        )

        for doctype in doctypes_list:
            # Remove assigned index
            doctype._doc_type.index = None
            # Associate docs with test index
            self.index.doc_type(doctype)

        if self.index.exists():
            self.index.delete(ignore=404)
        self.index.create()

        self.search = Search(index=SEARCH['default']['index'])

    def tearDown(self):
        self.index.delete()
        queue = django_rq.get_queue()
        queue.empty()
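
The test case above expects a SEARCH dictionary in the Django settings with 'connections' and 'index' keys under 'default'. A hedged sketch of the settings block it reads (host and index name are placeholders):

# settings.py (sketch)
SEARCH = {
    'default': {
        'connections': {'hosts': ['localhost:9200']},
        'index': 'test-search-index',
    },
}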
Example #3
def test_index_can_be_created_with_settings_and_mappings(write_client):
    i = Index('test-blog', using=write_client)
    i.document(Post)
    i.settings(number_of_replicas=0, number_of_shards=1)
    i.create()

    assert {
        'test-blog': {
            'mappings': {
                'properties': {
                    'title': {'type': 'text', 'analyzer': 'my_analyzer'},
                    'published_from': {'type': 'date'}
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-blog')

    settings = write_client.indices.get_settings(index='test-blog')
    assert settings['test-blog']['settings']['index']['number_of_replicas'] == '0'
    assert settings['test-blog']['settings']['index']['number_of_shards'] == '1'
    assert settings['test-blog']['settings']['index']['analysis'] == {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'keyword'
            }
        }
    }
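
The mapping assertions above only hold if Post declares a 'my_analyzer' custom analyzer on its title field. A plausible definition (a sketch, not necessarily the test suite's exact fixture):

from elasticsearch_dsl import Date, Document, Text, analyzer

my_analyzer = analyzer('my_analyzer', tokenizer='keyword')

class Post(Document):
    title = Text(analyzer=my_analyzer)
    published_from = Date()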
Example #4
def setup_es():
    """Create the ElasticSearch index and configure the mapping.
    """
    client = elasticsearch_config['client']
    index_name = elasticsearch_config['index']

    info = client.info()
    print('ElasticSearch version: {0}'.format(info['version']['number']))

    if client.indices.exists(index_name):
        print('Index "{0}" already exists. To re-create the index, manually '
              'delete the index and run this script again.'.format(index_name))
        print('To delete the index run:')
        print('curl -XDELETE \'http://{0}:{1}/{2}/\''.format(
            elasticsearch_config['host'], elasticsearch_config['port'],
            index_name))
        sys.exit(0)

    index = Index(index_name)
    index.settings(analysis=analysis_settings)

    index.doc_type(SearchArea)
    index.doc_type(SearchBook)
    index.doc_type(SearchImage)
    index.doc_type(SearchOuting)
    index.doc_type(SearchXreport)
    index.doc_type(SearchRoute)
    index.doc_type(SearchTopoMap)
    index.doc_type(SearchUser)
    index.doc_type(SearchWaypoint)
    index.doc_type(SearchArticle)

    index.create()

    print('Index "{0}" created'.format(index_name))
Example #5
File: search.py  Project: Carlosedo/mixees
    def create_index_if_does_not_exist(cls):
        index = Index(cls.INDEX_NAME)
        index.doc_type(cls)

        if not index.connection.indices.exists(cls.INDEX_NAME):
            index.create()
            time.sleep(1)  # It takes some time to create the index
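
Instead of a fixed one-second sleep, the cluster can be asked to confirm that the new index is usable. A sketch, assuming a registered default connection (Example #7 below uses the same cluster.health call):

from elasticsearch_dsl.connections import connections

def wait_for_index(index_name, timeout=30):
    client = connections.get_connection()
    # Blocks until the index reaches at least 'yellow' health or the timeout expires.
    client.cluster.health(index=index_name, wait_for_status='yellow',
                          request_timeout=timeout)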
Example #6
 def create_index_if_not_exists(self, index_name):
     self.index = index_name
     idx = Index(index_name)
     idx.settings(number_of_shards=1, number_of_replicas=1)
     idx.doc_type(LogType)
     try:
         idx.create()
     except:
         pass
Example #7
    def initialize_index(self, delete_if_exists=False):
        """
        Initialize index with mapping in ElasticSearch

        :param delete_if_exists: delete index, if exists
        :return: None
        """

        def update_index_settings():
            """
            Updates index settings for Slovenian lemmatization of words.
            As far as we know, elasticsearch-dsl library does not support
            custom filter settings.

            :return: None
            """
            analysis_settings = {
                "analysis": {
                    "filter": {
                        "lemmagen_filter_sl": {
                            "type": "lemmagen",
                            "lexicon": "sl"
                        }
                    },
                    "analyzer": {
                        "lemmagen_sl": {
                            "type": "custom",
                            "tokenizer": "uax_url_email",
                            "filter": [
                                "lemmagen_filter_sl",
                                "lowercase"
                            ]
                        }
                    }
                }
            }
            self.client.cluster.health(index=self.index_name,
                                       wait_for_status='green',
                                       request_timeout=2)
            self.client.indices.close(index=self.index_name)
            self.client.indices.put_settings(json.dumps(analysis_settings),
                                             index=self.index_name)
            self.client.indices.open(index=self.index_name)

        index = Index(self.index_name, using=self.client)
        if delete_if_exists and index.exists():
            index.delete()

        index.settings(
            # use higher number in production
            number_of_replicas=0
        )

        # register models
        index.doc_type(Document)
        index.create()
        update_index_settings()  # set lemmatizer
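
The docstring's claim that elasticsearch-dsl cannot express custom filters no longer holds in recent versions: the same lemmagen analysis chain can be declared with the token_filter and analyzer helpers and attached before create(), avoiding the close/put_settings/open sequence. A sketch (the 'lemmagen' filter type still requires the corresponding Elasticsearch plugin; the index name is hypothetical):

from elasticsearch_dsl import Index, analyzer, token_filter

lemmagen_filter_sl = token_filter('lemmagen_filter_sl', type='lemmagen', lexicon='sl')
lemmagen_sl = analyzer('lemmagen_sl',
                       tokenizer='uax_url_email',
                       filter=[lemmagen_filter_sl, 'lowercase'])

index = Index('documents')     # hypothetical index name
index.analyzer(lemmagen_sl)    # ends up in the analysis settings when create() runs
index.create()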
Example #8
def es_index_mapping(index_name, doc_type, force=False):
    # Create the index
    index = Index(index_name)
    index.doc_type(doc_type)

    if not index.exists():
        index.create()
    else:
        if force:
            index.upgrade()
Example #9
def create_index(hosts, index):
    i = set_hosts_index(hosts=hosts, index=index)
    logprint('debug', 'creating new index')
    i = Index(index)
    i.create()
    logprint('debug', 'registering doc types')
    i.doc_type(Author)
    i.doc_type(Page)
    i.doc_type(Source)
    logprint('debug', 'DONE')
Example #10
def create_search_index(index_name, doc_types=None, connection='default', delete_if_exists=False):
    index = Index(index_name, using=connection)
    if delete_if_exists:
        index.delete(ignore=404)
    if doc_types:
        for dt in doc_types:
            if isinstance(dt, str):
                dt = get_document_class(dt)
            index.doc_type(dt)
    if not index.exists():
        index.create()
    return index
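
A hypothetical call to the helper above; BlogPost is a made-up document class for illustration:

from elasticsearch_dsl import Document, Text

class BlogPost(Document):  # hypothetical document class
    title = Text()

index = create_search_index('blog-posts', doc_types=[BlogPost], delete_if_exists=True)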
Example #11
 def applyConfig(self):
     try:
         print("Connecting to '%s', index '%s'" % (self.confESHost, self.confESIndex))
         res = connections.create_connection(hosts=[self.confESHost])
         idx = Index(self.confESIndex)
         idx.doc_type(DocHTTPRequestResponse)
         DocHTTPRequestResponse.init()
         try:
             idx.create()
         except:
             pass
     except Exception as e:
         JOptionPane.showMessageDialog(self.panel, "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>" % (str(e)), "Error", JOptionPane.ERROR_MESSAGE)
Example #12
    def test_es_create_documents(self):
        # Index name required.
        with self.assertRaises(SystemExit):
            call_command('es_create_documents')

        # index_name not in settings.
        with self.assertRaises(SystemExit):
            call_command(
                'es_create_documents',
                index_name='barfoo'
            )

        # Index doesn't exist.
        with self.assertRaises(SystemExit):
            call_command(
                'es_create_documents',
                index_name='foobar'
            )

        index = Index('foobar')
        doc_type = Token.get_es_doc_type()
        index.doc_type(doc_type)
        index.create()
        self.refresh()

        # Disable auto indexing while creating objects.
        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        token = Token.objects.create(name="token")
        token_not_indexable = Token.objects.create(name='not_indexable')
        token_raise_exception = Token.objects.create(name='raise_exception')
        settings.TRAMPOLINE['OPTIONS']['disabled'] = False

        # Dry run.
        call_command(
            'es_create_documents',
            index_name='foobar',
            dry_run=True
        )
        self.assertDocDoesntExist(token)
        self.assertDocDoesntExist(token_not_indexable)
        self.assertDocDoesntExist(token_raise_exception)

        call_command(
            'es_create_documents',
            index_name='foobar',
            verbosity=3
        )
        self.assertDocExists(token)
        self.assertDocDoesntExist(token_not_indexable)
        self.assertDocDoesntExist(token_raise_exception)
Example #13
    def test_es_create_alias(self):
        # Index name required.
        with self.assertRaises(SystemExit):
            call_command(
                'es_create_alias',
                target_name='foobar_target'
            )

        # Target name required.
        with self.assertRaises(SystemExit):
            call_command(
                'es_create_alias',
                index_name='foobar'
            )

        # Index doesn't exist.
        with self.assertRaises(SystemExit):
            call_command(
                'es_create_alias',
                index_name='foobar',
                target_name='foobar_target'
            )

        index = Index('foobar_target')
        index.create()
        self.refresh()

        # Alias with same name as index.
        with self.assertRaises(SystemExit):
            call_command(
                'es_create_alias',
                index_name='foobar_target',
                target_name='foobar_target'
            )

        # Dry run.
        call_command(
            'es_create_alias',
            index_name='foobar',
            target_name='foobar_target',
            dry_run=True
        )
        self.assertAliasDoesntExist(index='foobar_target', name='foobar')

        call_command(
            'es_create_alias',
            index_name='foobar',
            target_name='foobar_target'
        )
        self.assertAliasExists(index='foobar_target', name='foobar')
Example #14
File: encyc.py  Project: densho/encyc-front
def create_index():
    index = set_hosts_index()
    logprint('debug', 'creating new index')
    index = Index(settings.DOCSTORE_INDEX)
    index.create()
    logprint('debug', 'creating mappings')
    Author.init()
    Page.init()
    Source.init()
    logprint('debug', 'registering doc types')
    index.doc_type(Author)
    index.doc_type(Page)
    index.doc_type(Source)
    logprint('debug', 'DONE')
Example #15
    def _create_index(self):
        dt = datetime.utcnow()
        dt = dt.strftime('%Y.%m')
        es = connections.get_connection()
        if not es.indices.exists('indicators-{}'.format(dt)):
            index = Index('indicators-{}'.format(dt))
            index.aliases(live={})
            index.doc_type(Indicator)
            index.create()

            m = Mapping('indicator')
            m.field('indicator_ipv4', 'ip')
            m.field('indicator_ipv4_mask', 'integer')
            m.save('indicators-{}'.format(dt))
        return 'indicators-{}'.format(dt)
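
Because every monthly indicators-YYYY.MM index is created with the 'live' alias, searches can target the alias and transparently cover all months. A short sketch using the fields mapped above:

from elasticsearch_dsl import Search

s = Search(index='live').query('term', indicator_ipv4_mask=24)
for hit in s.execute():
    print(hit.meta.index, hit.indicator_ipv4)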
Example #16
def create_indices(endpoint):
    """
    Creates constituent and address indices in PIC
    """
    connections.connections.create_connection(hosts=[endpoint], timeout=360, max_retries=10, retry_on_timeout=True)
    pic_index = Index('pic')
    pic_index.doc_type(Constituent)
    pic_index.doc_type(Address)
    pic_index.delete(ignore=404)

    pic_index.settings(
        number_of_shards=5,
        number_of_replicas=2
    )
    pic_index.create()
Example #17
    def registerExtenderCallbacks(self, callbacks):
        self.callbacks = callbacks
        self.helpers = callbacks.getHelpers()
        callbacks.setExtensionName("Storing HTTP Requests/Responses into ElasticSearch")
        self.callbacks.registerHttpListener(self)
        self.callbacks.registerContextMenuFactory(self)
        self.out = callbacks.getStdout()

        res = connections.create_connection(hosts=[ES_host])
        idx = Index(ES_index)
        idx.doc_type(DocHTTPRequestResponse)
        try:
            idx.create()
        except:
            print("Index already exists")
Example #18
    def test_es_delete_alias(self):
        # Index name required.
        with self.assertRaises(SystemExit):
            call_command(
                'es_delete_alias',
                target_name='foobar_target'
            )

        # Target name required.
        with self.assertRaises(SystemExit):
            call_command(
                'es_delete_alias',
                index_name='foobar'
            )

        # Index doesn't exist.
        with self.assertRaises(SystemExit):
            call_command(
                'es_delete_alias',
                index_name='foobar',
                target_name='foobar_target',
                yes=True
            )

        index = Index('foobar_target')
        index.create()
        self.refresh()

        # Alias doesn't exist.
        with self.assertRaises(SystemExit):
            call_command(
                'es_delete_alias',
                index_name='foobar',
                target_name='foobar_target',
                yes=True
            )

        trampoline_config.connection.indices.put_alias(
            index='foobar_target', name='foobar')
        self.assertAliasExists(index='foobar_target', name='foobar')

        call_command(
            'es_delete_alias',
            index_name='foobar',
            target_name='foobar_target',
            yes=True
        )
        self.assertAliasDoesntExist(index='foobar_target', name='foobar')
Example #19
def _create_index():
    # https://github.com/csirtgadgets/massive-octo-spice/blob/develop/elasticsearch/observables.json
    # http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch.Elasticsearch.bulk
    idx = _current_index()
    es = connections.get_connection()
    if not es.indices.exists(idx):
        index = Index(idx)
        index.aliases(live={})
        index.doc_type(Indicator)
        index.create()

        m = Mapping('indicator')
        m.field('indicator_ipv4', 'ip')
        m.field('indicator_ipv4_mask', 'integer')
        m.field('lasttime', 'date')
        m.save(idx)
    return idx
Example #20
    def _create_index(self):
        # https://github.com/csirtgadgets/massive-octo-spice/blob/develop/elasticsearch/observables.json
        # http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch.Elasticsearch.bulk
        dt = datetime.utcnow()
        dt = dt.strftime('%Y.%m')
        es = connections.get_connection()
        if not es.indices.exists('indicators-{}'.format(dt)):
            index = Index('indicators-{}'.format(dt))
            index.aliases(live={})
            index.doc_type(Indicator)
            index.create()

            m = Mapping('indicator')
            m.field('indicator_ipv4', 'ip')
            m.field('indicator_ipv4_mask', 'integer')
            m.save('indicators-{}'.format(dt))
        return 'indicators-{}'.format(dt)
Example #21
def recreate_index():
    """Delete index if it's there and creates a new one"""
    index = Index(name=get_index_name(), using='default')

    for name, doc_type in get_doctypes().items():
        index.doc_type(doc_type)

    # Delete the index if it exists.
    try:
        index.delete()
    except NotFoundError:
        pass

    # Note: There should be no mapping-conflict race here since the
    # index doesn't exist. Live indexing should just fail.

    # Create the index with the mappings all at once.
    index.create()
Example #22
def test_index_template_works(write_client):
    it = IndexTemplate('test-template', 'test-*')
    it.document(Post)
    it.settings(number_of_replicas=0, number_of_shards=1)
    it.save()

    i = Index('test-blog')
    i.create()

    assert {
        'test-blog': {
            'mappings': {
                'properties': {
                    'title': {'type': 'text', 'analyzer': 'my_analyzer'},
                    'published_from': {'type': 'date'},
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-blog')
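
Since the template above matches 'test-*', any further index whose name fits the pattern inherits the same settings and mappings without extra setup. A quick sketch:

from elasticsearch_dsl import Index

other = Index('test-news', using=write_client)
other.create()
# 'test-news' now carries number_of_shards=1, number_of_replicas=0 and the Post mapping.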
Example #23
def create_index(index_name, doc_classes=None):
    """ Create index and add document classes to it.

    Does NOT check whether index already exists.

    :param index_name: Name of index to be created.
    :param doc_classes: Sequence of document classes which should be
        added to created index. Defaults to None, in which case all
        document classes from document registry are added to new index.
    """
    index = Index(index_name)

    if doc_classes is None:
        doc_classes = get_document_classes().values()

    for doc_cls in doc_classes:
        index.doc_type(doc_cls)

    index.create()
Example #24
    def _create_index(self):
        # https://github.com/csirtgadgets/massive-octo-spice/blob/develop/elasticsearch/observables.json
        # http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch.Elasticsearch.bulk

        # every time we check it does a HEAD req
        if self.last_index_value and (datetime.utcnow() - self.last_index_check) < timedelta(minutes=2):
            return self.last_index_value

        idx = self._current_index()

        if not self.handle.indices.exists(idx):
            index = Index(idx)
            index.aliases(live={})
            index.doc_type(Indicator)
            index.settings(max_result_window=WINDOW_LIMIT)
            index.create()
            self.handle.indices.flush(idx)

        self.last_index_check = datetime.utcnow()
        self.last_index_value = idx
        return idx
Example #25
def test_index_can_be_created_with_settings_and_mappings(write_client):
    i = Index('test-blog', using=write_client)
    i.doc_type(Post)
    i.doc_type(User)
    i.settings(number_of_replicas=0, number_of_shards=1)
    i.create()

    assert {
        'test-blog': {
            'mappings': {
                'post': {
                    'properties': {
                        'title': {'type': 'string', 'analyzer': 'my_analyzer'},
                        'published_from': {'type': 'date', 'format': 'dateOptionalTime',},
                    }
                },
                'user': {
                    'properties': {
                        'username': {'type': 'string', 'index': 'not_analyzed'},
                        'joined_date': {'type': 'date', 'format': 'dateOptionalTime',},
                    }
                },
            }
        }
    } == write_client.indices.get_mapping(index='test-blog')

    settings = write_client.indices.get_settings(index='test-blog')
    assert settings['test-blog']['settings']['index']['number_of_replicas'] == '0'
    assert settings['test-blog']['settings']['index']['number_of_shards'] == '1'
    assert settings['test-blog']['settings']['index']['analysis'] == {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'keyword'
            }
        }
    }
Example #26
    def mitm_request(self, data):
        # Initialize ES connection and index
        res = connections.create_connection(hosts=[args.elasticsearch])
        idx = Index(args.index)
        idx.doc_type(DocHTTPRequestResponse)
        try:
            DocHTTPRequestResponse.init()
            idx.create()
        except:
            pass

        r = HTTPRequest(data)

        # determine url
        if self.is_connect:
            scheme = "https"
        else:
            scheme = "http"
        url = scheme + "://" + self.hostname
        if scheme == "http" and int(self.port) != 80 or scheme == "https" and int(self.port) != 443:
            url += ":" + str(self.port)
        url += self.path

        if args.verbose:
            print(url)

        self.doc = DocHTTPRequestResponse(host=self.hostname, port=int(self.port), protocol=scheme)
        self.doc.meta.index = args.index
        self.doc.request.url = url
        self.doc.request.requestline = r.requestline
        self.doc.request.method = r.command
        self.doc.host = self.hostname
        self.doc.port = int(self.port)
        self.doc.protocol = scheme
            
        return data
Example #27
    def test_es_delete_index(self):
        # Index name required.
        with self.assertRaises(SystemExit):
            call_command('es_delete_index')

        # Index doesn't exist.
        with self.assertRaises(SystemExit):
            call_command(
                'es_delete_index',
                index_name='foobar',
                yes=True
            )

        index = Index('foobar')
        index.create()
        self.refresh()
        self.assertIndexExists('foobar')

        call_command(
            'es_delete_index',
            index_name='foobar',
            yes=True
        )
        self.assertIndexDoesntExist('foobar')
Example #28
)


@nsf.document
class Grant(Document):
    title = Text()
    abstract = Text()
    date = Date()
    division = Keyword()

    class Index:
        name = "nsf"


nsf.delete()
nsf.create()

cur = db.cursor()
cur.execute(
    "select AwardTitle, AbstractNarration, AwardAmount, AwardEffectiveDate, LongName from Award join Division on Award.AwardID = Division.AwardID"
)

Grant.init()

for r in cur.fetchall():
    g = Grant(title=r[0], abstract=r[1], date=r[3], division=r[4])
    g.amount = r[2]
    g.save()

exit()
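
Once the 'nsf' index is populated, the Grant fields can be queried directly. A hedged sketch of a follow-up search (the query text is arbitrary):

from elasticsearch_dsl import Search

s = Search(index='nsf').query('match', abstract='machine learning')
for hit in s[:5]:
    print(hit.division, hit.title)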
Example #29
import logging

from elasticsearch_dsl import DocType, Index, Integer, String
from elasticsearch_dsl.connections import connections
from sortedcontainers import SortedDict

from pe.pe import PE
from pocket_rankings.pocket_rankings import PocketRankings

logger = logging.getLogger(__name__)

connections.create_connection(hosts=['localhost'])

INDEX_NAME = 'poker'

es_index = Index(INDEX_NAME)
# for index in connections.get_connection().indices.get('*'):
#   print(index)
# es_index.delete(ignore=404)
es_index.create(ignore=400)
# logger.info('index truncated')


@es_index.doc_type
class GameAction(DocType):
    site = String(index='not_analyzed')
    game = String(index='not_analyzed')
    vs = Integer()

    player = String(index='not_analyzed')
    amount = Integer()
    pot = Integer()
    pos = Integer()

    preflop_1 = String(index='not_analyzed')
Example #30
class ElasticSearchIndex:
    def __init__(self, name='qb', similarity='default', bm25_b=None, bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        self.ix.create()
        self.ix.close()
        self.ix.put_settings(body={'similarity': {
            'qb_bm25': {'type': 'BM25', 'b': self.bm25_b, 'k1': self.bm25_k1}}
        })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build_large_docs(self, documents: Dict[str, str], use_wiki=True, use_qb=True, rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            wiki_lookup = Wikipedia()
            log.info('Indexing questions and corresponding wikipedia pages as large docs...')
            for page in tqdm.tqdm(documents):
                if use_wiki and page in wiki_lookup:
                    wiki_content = wiki_lookup[page].text
                else:
                    wiki_content = ''

                if use_qb:
                    qb_content = documents[page]
                else:
                    qb_content = ''

                answer = self.answer_doc(
                    page=page,
                    wiki_content=wiki_content, qb_content=qb_content
                )
                answer.save(index=self.name)

    def build_many_docs(self, pages, documents, use_wiki=True, use_qb=True, rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            log.info('Indexing questions and corresponding pages as many docs...')
            if use_qb:
                log.info('Indexing questions...')
                for page, doc in tqdm.tqdm(documents):
                    self.answer_doc(page=page, qb_content=doc).save()

            if use_wiki:
                log.info('Indexing wikipedia...')
                wiki_lookup = Wikipedia()
                for page in tqdm.tqdm(pages):
                    if page in wiki_lookup:
                        content = word_tokenize(wiki_lookup[page].text)
                        for i in range(0, len(content), 200):
                            chunked_content = content[i:i + 200]
                            if len(chunked_content) > 0:
                                self.answer_doc(page=page, wiki_content=' '.join(chunked_content)).save()

    def search(self, text: str, max_n_guesses: int,
               normalize_score_by_length=False,
               wiki_boost=1, qb_boost=1):
        if not self.exists():
            raise ValueError('The index does not exist, you must create it before searching')

        if wiki_boost != 1:
            wiki_field = 'wiki_content^{}'.format(wiki_boost)
        else:
            wiki_field = 'wiki_content'

        if qb_boost != 1:
            qb_field = 'qb_content^{}'.format(qb_boost)
        else:
            qb_field = 'qb_content'

        s = Search(index=self.name)[0:max_n_guesses].query(
            'multi_match', query=text, fields=[wiki_field, qb_field]
        )
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = len(text.split())
        else:
            query_length = 1

        for r in results:
            if r.page in guess_set:
                continue
            else:
                # Record the page so duplicate hits are skipped on later iterations.
                guess_set.add(r.page)
                guesses.append((r.page, r.meta.score / query_length))
        return guesses
Example #31
class BurpExtender(IBurpExtender, IHttpListener, IContextMenuFactory, ITab):
    def registerExtenderCallbacks(self, callbacks):
        self.callbacks = callbacks
        self.helpers = callbacks.getHelpers()
        callbacks.setExtensionName(
            "Storing HTTP Requests/Responses into ElasticSearch")
        self.callbacks.registerHttpListener(self)
        self.callbacks.registerContextMenuFactory(self)
        self.out = callbacks.getStdout()

        self.lastTimestamp = None
        self.confESHost = self.callbacks.loadExtensionSetting(
            "elasticburp.host") or ES_host
        self.confESIndex = self.callbacks.loadExtensionSetting(
            "elasticburp.index") or ES_index
        self.confBurpTools = int(
            self.callbacks.loadExtensionSetting("elasticburp.tools")
            or Burp_Tools)
        saved_onlyresp = self.callbacks.loadExtensionSetting(
            "elasticburp.onlyresp")
        if saved_onlyresp == "True":
            self.confBurpOnlyResp = True
        elif saved_onlyresp == "False":
            self.confBurpOnlyResp = False
        else:
            self.confBurpOnlyResp = bool(
                int(saved_onlyresp or Burp_onlyResponses))

        self.callbacks.addSuiteTab(self)
        self.applyConfig()

    def applyConfig(self):
        try:
            print("Connecting to '%s', index '%s'" %
                  (self.confESHost, self.confESIndex))
            self.es = connections.create_connection(hosts=[self.confESHost])
            self.idx = Index(self.confESIndex)
            self.idx.doc_type(DocHTTPRequestResponse)
            if self.idx.exists():
                self.idx.open()
            else:
                self.idx.create()
            self.callbacks.saveExtensionSetting("elasticburp.host",
                                                self.confESHost)
            self.callbacks.saveExtensionSetting("elasticburp.index",
                                                self.confESIndex)
            self.callbacks.saveExtensionSetting("elasticburp.tools",
                                                str(self.confBurpTools))
            self.callbacks.saveExtensionSetting(
                "elasticburp.onlyresp", str(int(self.confBurpOnlyResp)))
        except Exception as e:
            JOptionPane.showMessageDialog(
                self.panel,
                "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>"
                % (str(e)), "Error", JOptionPane.ERROR_MESSAGE)

    ### ITab ###
    def getTabCaption(self):
        return "ElasticBurp"

    def applyConfigUI(self, event):
        #self.idx.close()
        self.confESHost = self.uiESHost.getText()
        self.confESIndex = self.uiESIndex.getText()
        self.confBurpTools = int(
            (self.uiCBSuite.isSelected() and IBurpExtenderCallbacks.TOOL_SUITE)
            | (self.uiCBTarget.isSelected()
               and IBurpExtenderCallbacks.TOOL_TARGET) |
            (self.uiCBProxy.isSelected() and IBurpExtenderCallbacks.TOOL_PROXY)
            | (self.uiCBSpider.isSelected()
               and IBurpExtenderCallbacks.TOOL_SPIDER)
            | (self.uiCBScanner.isSelected()
               and IBurpExtenderCallbacks.TOOL_SCANNER)
            | (self.uiCBIntruder.isSelected()
               and IBurpExtenderCallbacks.TOOL_INTRUDER)
            | (self.uiCBRepeater.isSelected()
               and IBurpExtenderCallbacks.TOOL_REPEATER)
            | (self.uiCBSequencer.isSelected()
               and IBurpExtenderCallbacks.TOOL_SEQUENCER)
            | (self.uiCBExtender.isSelected()
               and IBurpExtenderCallbacks.TOOL_EXTENDER))
        self.confBurpOnlyResp = self.uiCBOptRespOnly.isSelected()
        self.applyConfig()

    def resetConfigUI(self, event):
        self.uiESHost.setText(self.confESHost)
        self.uiESIndex.setText(self.confESIndex)
        self.uiCBSuite.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_SUITE))
        self.uiCBTarget.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_TARGET))
        self.uiCBProxy.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_PROXY))
        self.uiCBSpider.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_SPIDER))
        self.uiCBScanner.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_SCANNER))
        self.uiCBIntruder.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_INTRUDER))
        self.uiCBRepeater.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_REPEATER))
        self.uiCBSequencer.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_SEQUENCER))
        self.uiCBExtender.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_EXTENDER))
        self.uiCBOptRespOnly.setSelected(self.confBurpOnlyResp)

    def getUiComponent(self):
        self.panel = JPanel()
        self.panel.setLayout(BoxLayout(self.panel, BoxLayout.PAGE_AXIS))

        self.uiESHostLine = JPanel()
        self.uiESHostLine.setLayout(
            BoxLayout(self.uiESHostLine, BoxLayout.LINE_AXIS))
        self.uiESHostLine.setAlignmentX(JPanel.LEFT_ALIGNMENT)
        self.uiESHostLine.add(JLabel("ElasticSearch Host: "))
        self.uiESHost = JTextField(40)
        self.uiESHost.setMaximumSize(self.uiESHost.getPreferredSize())
        self.uiESHostLine.add(self.uiESHost)
        self.panel.add(self.uiESHostLine)

        self.uiESIndexLine = JPanel()
        self.uiESIndexLine.setLayout(
            BoxLayout(self.uiESIndexLine, BoxLayout.LINE_AXIS))
        self.uiESIndexLine.setAlignmentX(JPanel.LEFT_ALIGNMENT)
        self.uiESIndexLine.add(JLabel("ElasticSearch Index: "))
        self.uiESIndex = JTextField(40)
        self.uiESIndex.setMaximumSize(self.uiESIndex.getPreferredSize())
        self.uiESIndexLine.add(self.uiESIndex)
        self.panel.add(self.uiESIndexLine)

        uiToolsLine = JPanel()
        uiToolsLine.setLayout(BoxLayout(uiToolsLine, BoxLayout.LINE_AXIS))
        uiToolsLine.setAlignmentX(JPanel.LEFT_ALIGNMENT)
        self.uiCBSuite = JCheckBox("Suite")
        uiToolsLine.add(self.uiCBSuite)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBTarget = JCheckBox("Target")
        uiToolsLine.add(self.uiCBTarget)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBProxy = JCheckBox("Proxy")
        uiToolsLine.add(self.uiCBProxy)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBSpider = JCheckBox("Spider")
        uiToolsLine.add(self.uiCBSpider)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBScanner = JCheckBox("Scanner")
        uiToolsLine.add(self.uiCBScanner)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBIntruder = JCheckBox("Intruder")
        uiToolsLine.add(self.uiCBIntruder)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBRepeater = JCheckBox("Repeater")
        uiToolsLine.add(self.uiCBRepeater)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBSequencer = JCheckBox("Sequencer")
        uiToolsLine.add(self.uiCBSequencer)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBExtender = JCheckBox("Extender")
        uiToolsLine.add(self.uiCBExtender)
        self.panel.add(uiToolsLine)
        self.panel.add(Box.createRigidArea(Dimension(0, 10)))

        uiOptionsLine = JPanel()
        uiOptionsLine.setLayout(BoxLayout(uiOptionsLine, BoxLayout.LINE_AXIS))
        uiOptionsLine.setAlignmentX(JPanel.LEFT_ALIGNMENT)
        self.uiCBOptRespOnly = JCheckBox(
            "Process only responses (include requests)")
        uiOptionsLine.add(self.uiCBOptRespOnly)
        self.panel.add(uiOptionsLine)
        self.panel.add(Box.createRigidArea(Dimension(0, 10)))

        uiButtonsLine = JPanel()
        uiButtonsLine.setLayout(BoxLayout(uiButtonsLine, BoxLayout.LINE_AXIS))
        uiButtonsLine.setAlignmentX(JPanel.LEFT_ALIGNMENT)
        uiButtonsLine.add(JButton("Apply", actionPerformed=self.applyConfigUI))
        uiButtonsLine.add(JButton("Reset", actionPerformed=self.resetConfigUI))
        self.panel.add(uiButtonsLine)
        self.resetConfigUI(None)

        return self.panel

    ### IHttpListener ###
    def processHttpMessage(self, tool, isRequest, msg):
        if not tool & self.confBurpTools or isRequest and self.confBurpOnlyResp:
            return

        doc = self.genESDoc(msg)
        doc.save()

    ### IContextMenuFactory ###
    def createMenuItems(self, invocation):
        menuItems = list()
        selectedMsgs = invocation.getSelectedMessages()
        if selectedMsgs != None and len(selectedMsgs) >= 1:
            menuItems.append(
                JMenuItem("Add to ElasticSearch Index",
                          actionPerformed=self.genAddToES(
                              selectedMsgs,
                              invocation.getInputEvent().getComponent())))
        return menuItems

    def genAddToES(self, msgs, component):
        def menuAddToES(e):
            progress = ProgressMonitor(component, "Feeding ElasticSearch", "",
                                       0, len(msgs))
            i = 0
            docs = list()
            for msg in msgs:
                if not Burp_onlyResponses or msg.getResponse():
                    docs.append(
                        self.genESDoc(
                            msg, timeStampFromResponse=True).to_dict(True))
                i += 1
                progress.setProgress(i)
            success, failed = bulk(self.es, docs, True, raise_on_error=False)
            progress.close()
            JOptionPane.showMessageDialog(
                self.panel,
                "<html><p style='width: 300px'>Successful imported %d messages, %d messages failed.</p></html>"
                % (success, failed), "Finished",
                JOptionPane.INFORMATION_MESSAGE)

        return menuAddToES

    ### Interface to ElasticSearch ###
    def genESDoc(self, msg, timeStampFromResponse=False):
        httpService = msg.getHttpService()
        doc = DocHTTPRequestResponse(protocol=httpService.getProtocol(),
                                     host=httpService.getHost(),
                                     port=httpService.getPort())
        doc.meta.index = self.confESIndex

        request = msg.getRequest()
        response = msg.getResponse()

        if request:
            iRequest = self.helpers.analyzeRequest(msg)
            doc.request.method = iRequest.getMethod()
            doc.request.url = iRequest.getUrl().toString()

            headers = iRequest.getHeaders()
            for header in headers:
                try:
                    doc.add_request_header(header)
                except:
                    doc.request.requestline = header

            parameters = iRequest.getParameters()
            for parameter in parameters:
                ptype = parameter.getType()
                if ptype == IParameter.PARAM_URL:
                    typename = "url"
                elif ptype == IParameter.PARAM_BODY:
                    typename = "body"
                elif ptype == IParameter.PARAM_COOKIE:
                    typename = "cookie"
                elif ptype == IParameter.PARAM_XML:
                    typename = "xml"
                elif ptype == IParameter.PARAM_XML_ATTR:
                    typename = "xmlattr"
                elif ptype == IParameter.PARAM_MULTIPART_ATTR:
                    typename = "multipartattr"
                elif ptype == IParameter.PARAM_JSON:
                    typename = "json"
                else:
                    typename = "unknown"

                name = parameter.getName()
                value = parameter.getValue()
                doc.add_request_parameter(typename, name, value)

            ctype = iRequest.getContentType()
            if ctype == IRequestInfo.CONTENT_TYPE_NONE:
                doc.request.content_type = "none"
            elif ctype == IRequestInfo.CONTENT_TYPE_URL_ENCODED:
                doc.request.content_type = "urlencoded"
            elif ctype == IRequestInfo.CONTENT_TYPE_MULTIPART:
                doc.request.content_type = "multipart"
            elif ctype == IRequestInfo.CONTENT_TYPE_XML:
                doc.request.content_type = "xml"
            elif ctype == IRequestInfo.CONTENT_TYPE_JSON:
                doc.request.content_type = "json"
            elif ctype == IRequestInfo.CONTENT_TYPE_AMF:
                doc.request.content_type = "amf"
            else:
                doc.request.content_type = "unknown"

            bodyOffset = iRequest.getBodyOffset()
            doc.request.body = request[bodyOffset:].tostring().decode(
                "ascii", "replace")

        if response:
            iResponse = self.helpers.analyzeResponse(response)

            doc.response.status = iResponse.getStatusCode()
            doc.response.content_type = iResponse.getStatedMimeType()
            doc.response.inferred_content_type = iResponse.getInferredMimeType()

            headers = iResponse.getHeaders()
            dateHeader = None
            for header in headers:
                try:
                    doc.add_response_header(header)
                    match = reDateHeader.match(header)
                    if match:
                        dateHeader = match.group(1)
                except:
                    doc.response.responseline = header

            cookies = iResponse.getCookies()
            for cookie in cookies:
                expCookie = cookie.getExpiration()
                expiration = None
                if expCookie:
                    try:
                        expiration = str(
                            datetime.fromtimestamp(expCookie.time / 1000))
                    except:
                        pass
                doc.add_response_cookie(cookie.getName(), cookie.getValue(),
                                        cookie.getDomain(), cookie.getPath(),
                                        expiration)

            bodyOffset = iResponse.getBodyOffset()
            doc.response.body = response[bodyOffset:].tostring().decode(
                "ascii", "replace")

            if timeStampFromResponse:
                if dateHeader:
                    try:
                        doc.timestamp = datetime.fromtimestamp(
                            mktime_tz(parsedate_tz(dateHeader)),
                            tz)  # try to use date from response header "Date"
                        self.lastTimestamp = doc.timestamp
                    except:
                        doc.timestamp = self.lastTimestamp  # fallback: last stored timestamp. Else: now

        return doc
Example #32
def buildIndex():
    """
    buildIndex creates a new covid document index, deleting any existing
    index of the same name.
    It loads a JSON file containing the covid doc metadata corpus and bulk-loads
    it using a generator function.
    """
    doc_index = Index('covid_doc_index')
    if doc_index.exists():
        doc_index.delete()  # Overwrite any previous version
    doc_index.analyzer(
        basic_analyzer
    )  # register your customized analyzer as the default analyzer
    doc_index.create()

    # Open the covid metadata corpus
    with open('covid_comm_use_subset_meta.json', 'r',
              encoding='utf-8') as data_file:
        enum_id = 1
        documents = {}
        for line in data_file:
            try:
                doc = json.loads(line)
                for key in doc.keys():
                    if type(doc.get(key)) is not str:
                        if math.isnan(doc.get(key)):
                            doc.update({key: None})
                    if key == "publish_time" and doc.get(key) and len(
                            doc.get(key)) > 4:
                        match = re.search(year, doc[key]).group(0)
                        if match:
                            doc.update({key: match})
                documents.update({str(enum_id): doc})
                enum_id += 1
            except json.decoder.JSONDecodeError:
                continue
        # load doc metadata from json file into dictionary
        size = len(documents)

    # Action series for bulk loading with helpers.bulk function.
    # Implemented as a generator, to return one document with each call.
    # Note that we include the index name here.
    # The Document type is always 'doc'.
    # Every item to be indexed must have a unique key.
    def actions():
        # enum_id is an enumerated id created when reading the json and used as
        # the key into the covid metadata dictionary.
        for enum_id in range(1, size + 1):
            yield {
                "_index": "covid_doc_index",
                "_type": 'doc',
                "_id": enum_id,
                "title": documents[str(enum_id)].get('title', 'None'),
                "text": documents[str(enum_id)].get('abstract', 'None'),
                "authors": documents[str(enum_id)].get('authors', 'None'),
                "publish_time": documents[str(enum_id)].get('publish_time', int(0000))
            }

    helpers.bulk(es, actions())
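
buildIndex assumes module-level es, basic_analyzer and year objects; a plausible sketch of those globals (hedged, the real module may define them differently):

import re
from elasticsearch import Elasticsearch
from elasticsearch_dsl import analyzer

es = Elasticsearch()
basic_analyzer = analyzer('basic_analyzer',
                          tokenizer='standard',
                          filter=['lowercase', 'stop'])
year = re.compile(r'(19|20)\d{2}')  # pulls a four-digit year out of publish_time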
Example #33
from flask import Flask, render_template, request
import vk_api
from polyglot.text import Text
import re
from datetime import datetime
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Index
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['elasticsearch'], port=9200)
es = Elasticsearch(hosts=['elasticsearch'], port=9200)

post_index = Index('post_index', using=es)
post_index.delete(ignore=404)
post_index.create()

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/search/q')
def search():
    query = request.args.get("search")
    sentiment = request.args.get("sentiment")

    db_history_search = es.search(index='post_index', doc_type='post', q=query)
Example #34
class InsightError(Exception):
    """exception for errors with insights"""


###############################################################################
# Account
###############################################################################


class AccountInsightError(InsightError):
    """an error with account index"""


account_ix = Index('account')
if not account_ix.exists():
    account_ix.create()


class AccountDoc(DocType):
    username = Keyword(required=True, store=True)
    posts_count = Integer(store=True)
    followers_count = Integer(store=True)
    following_count = Integer(store=True)
    bio = Text(store=True)
    website = Keyword(store=True)
    joined_at = Date(store=True)

    # post
    location = Keyword(store=True)
    tags = Keyword(store=True)
    count = Integer(store=True)
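
With the 'account' index and AccountDoc mapping in place, individual documents can be saved and fetched. A short sketch (the username is a placeholder):

doc = AccountDoc(username='example_user', posts_count=0, followers_count=0)
doc.meta.id = doc.username           # reuse the username as the document id
doc.save(index='account')

fetched = AccountDoc.get(id='example_user', index='account')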
Example #35
                       help="ElasticSearch host (default: %(default)s)")
argparser.add_argument("--index",
                       "-i",
                       default="testssl-scan",
                       help="ElasticSearch index (default: %(default)s)")
argparser.add_argument("files",
                       nargs="+",
                       help="List of testssl.sh logs in CSV format")
args = argparser.parse_args()

connections.create_connection(hosts=args.elasticsearch)
idx = Index(args.index)
idx.document(DocTestSSLResult)
DocTestSSLResult.init()
try:
    idx.create()
except:
    pass

csvFiles = args.files
for csvFile in csvFiles:
    try:
        csv = open(csvFile, mode="r", newline="")
    except IOError as e:
        print("Error while opening %s: %s" % (csvFile, e.strerror))

    print("Processing '%s'" % (csvFile))
    doc = DocTestSSLResult(source=csvFile)
    doc.parseCSV(csv)
    csv.close()
    try:
Example #36
def createIndex():
    index = Index(INDEX, using=client)
    index.create()
Example #37
def _create_index(index_name):
    new_index = Index(index_name, using=CONNECTION_ALIAS)
    new_index.delete(ignore=[400, 404])
    new_index.settings(index=DEFAULT_INDEX_SETTING)
    new_index.create()
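
The helper above expects CONNECTION_ALIAS and DEFAULT_INDEX_SETTING constants at module level; hedged example values:

CONNECTION_ALIAS = 'default'
DEFAULT_INDEX_SETTING = {
    'number_of_shards': 1,
    'number_of_replicas': 0,
}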
Example #38
class ElasticInsert(object):
    """此类为工具类,用于将更新的资讯插入至elasticsearch中."""
    def __init__(self, client, index_name: str):
        self.client = client
        self.index_name = index_name
        self.index = Index(name=self.index_name, using=self.client)
        self._init()

    def get_news_ids(self, dt: str):
        """取出给定日期的news_id集合,从而可以判断哪些资讯已插入,从而只插入新资讯."""
        scan_generator = scan(self.client,
                              query={'query': {
                                  'match': {
                                      'dt': dt
                                  }
                              }},
                              index=self.index_name,
                              _source=['news_id'])
        news_ids = set()
        news_id_list = list()
        for item in scan_generator:
            news_id = item['_source']['news_id']
            news_ids.add(news_id)
            news_id_list.append(news_id)
        if len(news_ids) != len(news_id_list):
            logging.warning(
                f"There are {len(news_id_list)-len(news_ids)} news " +
                f"repeatedly inserted.")
        logging.info(
            f"There are {len(news_ids)} news in index {self.index_name} on {dt}"
        )
        return news_ids

    def _init(self):
        self._document()
        if not self.index.exists():
            self.index.create()
            logging.info(f"Create index {self.index_name} successfully.")
        else:
            logging.info(f"Index {self.index_name} already existed.")

    def _document(self):
        @self.index.document
        class News(Document):
            news_id = Keyword()
            title = Text()
            content = Text()
            news_tag = Text()
            source = Keyword()
            info_url = Keyword()
            https_url = Keyword()
            large_pic = Keyword()
            mini_pic = Keyword()
            third_source = Text()
            content_type = Keyword()
            news_type = Keyword()
            is_video = Keyword()
            video_time = Long()
            update_time = Date()
            utc_update_time = Date()
            dt = Date()

        self.News = News
Example #39
File: ir.py  Project: DenisPeskov/QBASR
class IrIndex:
    def __init__(self,
                 name='qb',
                 similarity='default',
                 bm25_b=None,
                 bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        self.ix.create()
        self.ix.close()
        self.ix.put_settings(
            body={
                'similarity': {
                    'qb_bm25': {
                        'type': 'BM25',
                        'b': self.bm25_b,
                        'k1': self.bm25_k1
                    }
                }
            })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build(self,
              qb_docs: Dict[str, str],
              asr_docs: Dict[str, str],
              use_wiki=False,
              use_qb=True,
              use_asr=True,
              rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):  # pylint: disable=invalid-envvar-default
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            # wiki_lookup = Wikipedia()
            log.info('Indexing...')
            for page in tqdm.tqdm(qb_docs):
                wiki_content = ''
                # if use_wiki and page in wiki_lookup:
                #     wiki_content = wiki_lookup[page].text
                # else:
                #     wiki_content = ''

                if use_qb:
                    qb_content = qb_docs[page]
                else:
                    qb_content = ''

                if use_asr:
                    asr_content = asr_docs[page]
                else:
                    asr_content = ''

                answer = self.answer_doc(page=page,
                                         wiki_content=wiki_content,
                                         qb_content=qb_content,
                                         asr_content=asr_content)
                answer.save(index=self.name)

    def search(self,
               text: str,
               max_n_guesses: int,
               normalize_score_by_length=False):
        if not self.exists():
            raise ValueError(
                'The index does not exist, you must create it before searching'
            )

        wiki_field = 'wiki_content'
        qb_field = 'qb_content'
        asr_field = 'asr_content'

        s = Search(index=self.name)[0:max_n_guesses].query(  # pylint: disable=no-member
            'multi_match',
            query=text,
            fields=[wiki_field, qb_field, asr_field])
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = max(1, len(text.split()))
        else:
            query_length = 1

        for r in results:
            if r.page in guess_set:
                continue
            else:
                # Record the page so duplicate hits are skipped on later iterations.
                guess_set.add(r.page)
                guesses.append({
                    'guess': r.page,
                    'score': r.meta.score,
                    'length': query_length
                })
        if len(guesses) == 0:
            return {'guess': '~~~NOGUESS~~~', 'score': 0, 'length': 1}
        else:
            return guesses[0]
Example #40
File: __init__.py  Project: Ryuno-Ki/yari
def index(
    buildroot: Path,
    url: str,
    update=False,
    no_progressbar=False,
):
    # We can confidently use a single host here because we're not searching
    # a cluster.
    connections.create_connection(hosts=[url], retry_on_timeout=True)
    connection = connections.get_connection()
    health = connection.cluster.health()
    status = health["status"]
    if status not in ("green", "yellow"):
        raise click.ClickException(f"status {status} not green or yellow")

    count_todo = 0
    for file in walk(buildroot):
        count_todo += 1

    click.echo(f"Found {count_todo:,} (potential) documents to index")

    if update:
        for name in connection.indices.get_alias():
            if name.startswith(f"{INDEX_ALIAS_NAME}_"):
                document_index = Index(name)
                break
        else:
            raise IndexAliasError(
                f"Unable to find an index called {INDEX_ALIAS_NAME}_*")

    else:
        # Confusingly, `._index` is actually not a private API.
        # It's the documented way you're supposed to reach it.
        document_index = Document._index
        click.echo("Deleting any possible existing index "
                   f"and creating a new one called {document_index._name!r}")
        document_index.delete(ignore=404)
        document_index.create()

    skipped = []

    def generator():
        root = Path(buildroot)
        for doc in walk(root):
            # The reason for specifying the exact index name is that we might
            # be doing an update and if you don't specify it, elasticsearch_dsl
            # will fall back to using whatever Document._meta.Index automatically
            # becomes in this moment.
            search_doc = to_search(doc, _index=document_index._name)
            if search_doc:
                yield search_doc.to_dict(True)
            else:
                # The reason something might be chosen to be skipped is because
                # there's logic that kicks in only when the `index.json` file
                # has been opened and parsed.
                # Keep a count of all of these. It's used to make sure the
                # progressbar, if used, ticks as many times as the estimate
                # count was.
                skipped.append(1)

    def get_progressbar():
        if no_progressbar:
            return VoidProgressBar()
        return click.progressbar(length=count_todo, label="Indexing", width=0)

    count_done = count_worked = count_errors = 0
    count_shards_worked = count_shards_failed = 0
    errors_counter = Counter()
    t0 = time.time()
    with get_progressbar() as bar:
        for success, info in parallel_bulk(
                connection,
                generator(),
                # If the bulk indexing failed, it will by default raise a BulkIndexError.
                # Setting this to 'False' will suppress that.
                raise_on_exception=False,
                # If the bulk operation failed for some other reason, like a ReadTimeoutError,
                # it would raise whatever error occurred by default.
                # We prefer to swallow all errors under the assumption that the holes
                # will hopefully be fixed in the next attempt.
                raise_on_error=False,
        ):
            if success:
                count_shards_worked += info["index"]["_shards"]["successful"]
                count_shards_failed += info["index"]["_shards"]["failed"]
                count_worked += 1
            else:
                count_errors += 1
                errors_counter[info["index"]["error"]] += 1
            count_done += 1
            bar.update(1)

        for skip in skipped:
            bar.update(1)

    # Now when the index has been filled, we need to make sure we
    # correct any previous indexes.
    if update:
        # When you do an update, Elasticsearch will internally delete the
        # previous docs (based on the _id primary key we set).
        # Normally, Elasticsearch will do this when you restart the cluster
        # but that's not something we usually do.
        # See https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-forcemerge.html
        document_index.forcemerge()
    else:
        # Now we're going to bundle the change to set the alias to point
        # to the new index and delete all old indexes.
        # The reason for doing this together in one update is to make it atomic.
        alias_updates = [{
            "add": {
                "index": document_index._name,
                "alias": INDEX_ALIAS_NAME
            }
        }]
        for index_name in connection.indices.get_alias():
            if index_name.startswith(f"{INDEX_ALIAS_NAME}_"):
                if index_name != document_index._name:
                    alias_updates.append(
                        {"remove_index": {
                            "index": index_name
                        }})
                    click.echo(f"Delete old index {index_name!r}")

        connection.indices.update_aliases({"actions": alias_updates})
        click.echo(f"Reassign the {INDEX_ALIAS_NAME!r} alias from old index "
                   f"to {document_index._name}")

    t1 = time.time()
    took = t1 - t0
    rate = count_done / took
    click.echo(f"Took {format_time(took)} to index {count_done:,} documents. "
               f"Approximately {rate:.1f} docs/second")
    click.echo(f"Count shards - successful: {count_shards_worked:,} "
               f"failed: {count_shards_failed:,}")
    click.echo(f"Counts - worked: {count_worked:,} errors: {count_errors:,}")
    if errors_counter:
        click.echo("Most common errors....")
        for error, count in errors_counter.most_common():
            click.echo(f"{count:,}\t{error[:80]}")
Example #41
0
class ElasticSearchIndex:
    def __init__(self,
                 name='qb',
                 similarity='default',
                 bm25_b=None,
                 bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        self.ix.create()
        self.ix.close()
        self.ix.put_settings(
            body={
                'similarity': {
                    'qb_bm25': {
                        'type': 'BM25',
                        'b': self.bm25_b,
                        'k1': self.bm25_k1
                    }
                }
            })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build_large_docs(self,
                         documents: Dict[str, str],
                         use_wiki=True,
                         use_qb=True,
                         rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            wiki_lookup = Wikipedia()
            log.info(
                'Indexing questions and corresponding wikipedia pages as large docs...'
            )
            for page in tqdm.tqdm(documents):
                if use_wiki and page in wiki_lookup:
                    wiki_content = wiki_lookup[page].text
                else:
                    wiki_content = ''

                if use_qb:
                    qb_content = documents[page]
                else:
                    qb_content = ''

                answer = self.answer_doc(page=page,
                                         wiki_content=wiki_content,
                                         qb_content=qb_content)
                answer.save(index=self.name)

    def build_many_docs(self,
                        pages,
                        documents,
                        use_wiki=True,
                        use_qb=True,
                        rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            log.info(
                'Indexing questions and corresponding pages as many docs...')
            if use_qb:
                log.info('Indexing questions...')
                for page, doc in tqdm.tqdm(documents):
                    self.answer_doc(page=page, qb_content=doc).save()

            if use_wiki:
                log.info('Indexing wikipedia...')
                wiki_lookup = Wikipedia()
                for page in tqdm.tqdm(pages):
                    if page in wiki_lookup:
                        content = word_tokenize(wiki_lookup[page].text)
                        for i in range(0, len(content), 200):
                            chunked_content = content[i:i + 200]
                            if len(chunked_content) > 0:
                                self.answer_doc(page=page,
                                                wiki_content=' '.join(
                                                    chunked_content)).save()

    def search(self,
               text: str,
               max_n_guesses: int,
               normalize_score_by_length=False,
               wiki_boost=1,
               qb_boost=1):
        if not self.exists():
            raise ValueError(
                'The index does not exist, you must create it before searching'
            )

        if wiki_boost != 1:
            wiki_field = 'wiki_content^{}'.format(wiki_boost)
        else:
            wiki_field = 'wiki_content'

        if qb_boost != 1:
            qb_field = 'qb_content^{}'.format(qb_boost)
        else:
            qb_field = 'qb_content'

        s = Search(index=self.name)[0:max_n_guesses].query(
            'multi_match', query=text, fields=[wiki_field, qb_field])
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = len(text.split())
        else:
            query_length = 1

        for r in results:
            if r.page in guess_set:
                continue
            guess_set.add(r.page)
            guesses.append((r.page, r.meta.score / query_length))
        return guesses
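
# Standalone sketch (not part of the class above): the close/put_settings/open
# sequence in init() is the general recipe for changing static index settings
# such as similarity modules. A minimal version of the same pattern on a bare
# Index, assuming a local cluster and a hypothetical index name:
from elasticsearch_dsl import Index, connections

connections.create_connection(hosts=['localhost'])

ix = Index('qb_demo')   # hypothetical index name
ix.create()

# Similarity modules are static settings, so the index must be closed first.
ix.close()
ix.put_settings(body={
    'similarity': {
        'qb_bm25': {'type': 'BM25', 'b': 0.3, 'k1': 0.9}
    }
})
ix.open()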
Example #42
0
def create_product_index():
    products = Index('products')
    products.settings(number_of_shards=1, number_of_replicas=0)
    products.doc_type(Product)
    products.delete(ignore=404)
    products.create()
Example #43
0
def buildIndex():
    """
    buildIndex creates a new film index, deleting any existing index of
    the same name.
    It loads a json file containing the movie corpus and does bulk loading
    using a generator function.
    """
    covid_index = Index('covid_19_index')
    if covid_index.exists():
        covid_index.delete()  # Overwrite any previous version
    covid_index.document(Document_COVID_19)
    covid_index.create()

    documents = {}
    # paths = ['CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/',
    #          'CORD-19-research-challenge/comm_use_subset/comm_use_subset/pdf_json/',
    #          'CORD-19-research-challenge/custom_license/custom_license/pdf_json/',
    #          'CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/pdf_json/']
    paths = [
        'CORD-19-research-challenge/comm_use_subset/comm_use_subset/pdf_json/'
    ]

    # Getting all files
    id = 1
    for path in paths:
        for file in os.listdir(path):
            # Open the json film corpus
            fullFilePath = path + file

            with open(fullFilePath) as data:
                currentDoc = json.load(data)
            try:
                abstract = currentDoc['abstract'][0]['text']
            except (KeyError, IndexError):
                abstract = ''

            body_text = currentDoc['body_text']

            # ldaModel = LDA(body_text).performLDA()
            documents[str(id)] = [
                currentDoc['paper_id'], currentDoc['metadata']['title'],
                abstract, body_text
            ]
            id = id + 1

    size = len(documents)

    # Action series for bulk loading with helpers.bulk function.
    # Implemented as a generator, to return one document with each call.
    # Note that we include the index name here.
    # The document type is always '_doc'.
    # Every item to be indexed must have a unique key.
    def actions():
        for mid in range(1, size + 1):
            yield {
                "_index": "covid_19_index",
                "_type": '_doc',
                "_id": documents[str(mid)][0],
                "paper_id": documents[str(mid)][0],
                "title": documents[str(mid)][1],
                "abstract": documents[str(mid)][2],
                "body_text": getBodyText(documents[str(mid)][3])
            }

    helpers.bulk(es, actions())
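
# Follow-up sketch (not part of the snippet above), assuming the same `es`
# client: refresh so the bulk-loaded documents become searchable, then count
# them and run a quick query against the new index.
from elasticsearch_dsl import Search

es.indices.refresh(index='covid_19_index')
print(es.count(index='covid_19_index')['count'], 'documents indexed')

hits = Search(using=es, index='covid_19_index') \
    .query('match', abstract='coronavirus')[:5].execute()
for hit in hits:
    print(round(hit.meta.score, 2), hit.title)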
Example #44
0
from pe.pe import PE
from pocket_rankings.pocket_rankings import PocketRankings


logger = logging.getLogger(__name__)


connections.create_connection(hosts=['localhost'])

INDEX_NAME = 'poker'

es_index = Index(INDEX_NAME)
# for index in connections.get_connection().indices.get('*'):
#   print(index)
# es_index.delete(ignore=404)
es_index.create(ignore=400)
# logger.info('index truncated')


@es_index.doc_type
class GameAction(DocType):
    site = String(index='not_analyzed')
    game = String(index='not_analyzed')
    vs = Integer()

    player = String(index='not_analyzed')
    amount = Integer()
    pot = Integer()
    pos = Integer()

    preflop_1 = String(index='not_analyzed')
Example #45
0
import pandas as pd
from elasticsearch_dsl import connections, Index
from model import Talk
from datetime import datetime

if __name__ == "__main__":

    # Indexes Reset
    connections.create_connection(hosts=['localhost'])
    talks_index = Index(Talk.Index.name)
    talks_index.delete(ignore=404)
    talks_index.create()

    talks_df = pd.read_csv('./data/talks.csv')
    print("Number of talks in the report: {}".format(len(talks_df)))

    for index, row in talks_df.iterrows():
        next_talk = Talk()
        next_talk.title = row['Title']
        next_talk.speakers = row['Speakers'].split('/')
        next_talk.day = row['Day']
        next_talk.place = row['Place']
        next_talk.type = row['Type']
        next_talk.start = datetime.strptime(row['Start'], '%H:%M')
        if next_talk.day.startswith("s"):
            next_talk.start = next_talk.start.replace(year=2019,
                                                      month=10,
                                                      day=5)
        else:
            next_talk.start = next_talk.start.replace(year=2019,
                                                      month=10,
Example #46
0
class TestMixins(BaseTestCase):
    def setUp(self):
        super(TestMixins, self).setUp()
        self.doc_type = Token.get_es_doc_type()
        self.index = Index(self.doc_type._doc_type.index)
        self.index.doc_type(self.doc_type)
        self.index.create()
        self.refresh()

    def tearDown(self):
        super(TestMixins, self).tearDown()
        self.index.delete()

    def test_is_indexable(self):
        self.assertTrue(ESIndexableMixin().is_indexable())

    def test_get_indexable_queryset(self):
        self.assertEqual(str(Token.get_indexable_queryset().query),
                         str(Token.objects.all().query))

    def test_get_es_doc(self):
        token = Token(name='token')
        self.assertIsNone(token.get_es_doc())
        token.save()
        self.assertIsNotNone(token.get_es_doc())

    def test_es_index(self):
        # Asynchronous call.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        token.es_index()
        self.assertDocExists(token)

        # Synchronous call.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        token.es_index(async=False)
        self.assertDocExists(token)

    def test_es_delete(self):
        # Asynchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete()
        self.assertDocDoesntExist(Token, token.pk)

        # Synchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete(async=False)
        self.assertDocDoesntExist(Token, token.pk)

    def test_save(self):
        token = Token(name='token')

        with override_settings(TRAMPOLINE={'OPTIONS': {'disabled': True}}):
            token.save()
            self.assertDocDoesntExist(token)

        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'token')
        self.assertEqual(doc._id, str(token.pk))

        # Update model and synchronise doc.
        token.name = 'kento'
        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'kento')

        # Instance is not indexable.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)

    def test_delete(self):
        token = Token.objects.create(name='token')
        token_id = token.pk
        self.assertDocExists(token)

        with override_settings(TRAMPOLINE={'OPTIONS': {'disabled': True}}):
            token.delete()
            self.assertDocExists(Token, token_id)

        token.save()
        token_id = token.pk
        token.delete()
        self.assertDocDoesntExist(Token, token_id)
Example #47
0
def test_elasticsearch_target_additional_properties(sdc_builder, sdc_executor, elasticsearch):
    """
    Elasticsearch target pipeline, adding additional properties, where specifies every routing with the value of the
    shard's record. It checks if the value of the record-label is added correctly to the property routing at
    ElasticSearch query.
        dev_raw_data_source >> es_target
    """
    # Test static
    index_values = []
    for j in range(4):
        index_values.append(get_random_string(string.ascii_letters, 10).lower())

    raw_data = [{"text": "Record1", "index": index_values[0], "mapping": get_random_string(string.ascii_letters, 10).lower(),
                 "doc_id": get_random_string(string.ascii_letters, 10).lower(), "shard": "record1"},
                {"text": "Record2", "index": index_values[1], "mapping": get_random_string(string.ascii_letters, 10).lower(),
                 "doc_id": get_random_string(string.ascii_letters, 10).lower(), "shard": "record2"},
                {"text": "Record3", "index": index_values[2], "mapping": get_random_string(string.ascii_letters, 10).lower(),
                 "doc_id": get_random_string(string.ascii_letters, 10).lower(), "shard": "record3"},
                {"text": "Record4", "index": index_values[3], "mapping": get_random_string(string.ascii_letters, 10).lower(),
                 "doc_id": get_random_string(string.ascii_letters, 10).lower(), "shard": None}]

    # Build pipeline
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  stop_after_first_batch=True,
                                                                                  raw_data='\n'.join(json.dumps(rec)
                                                                                                     for rec in raw_data))
    es_target = builder.add_stage('Elasticsearch', type='destination')
    es_target.set_attributes(default_operation='INDEX', document_id='${record:value(\'/doc_id\')}',
                             index='${record:value(\'/index\')}', mapping='${record:value(\'/mapping\')}',
                             additional_properties='{\"_routing\":${record:value(\'/shard\')}}')

    dev_raw_data_source >> es_target
    es_target_pipeline = builder.build(title='ES target pipeline').configure_for_environment(elasticsearch)

    sdc_executor.add_pipeline(es_target_pipeline)
    try:
        elasticsearch.connect()

        # Make sure that the index exists properly before running the test
        index = Index(index_values[0])
        index.create()
        assert index.refresh()

        # Run pipeline with additional properties
        sdc_executor.start_pipeline(es_target_pipeline).wait_for_finished()

        es_response = []
        for i in index_values:
            es_search = ESSearch(index=i)
            response = es_search.execute()
            es_response.append(response[0])
            time.sleep(5)

        assert len(es_response) == 4
        for r in es_response:
            assert r
            if r.text == "Record4":
                for attribute in r.meta:
                    assert attribute != "routing"
            else:
                assert r.shard == r.meta.routing

    finally:
        # Clean up test data in ES
        idx = Index(index_values[0])
        idx.delete()
Example #48
0
class TestMixins(BaseTestCase):

    def setUp(self):
        super(TestMixins, self).setUp()
        self.doc_type = Token.get_es_doc_type()
        self.index = Index(self.doc_type._doc_type.index)
        self.index.doc_type(self.doc_type)
        self.index.create()
        self.refresh()

    def tearDown(self):
        super(TestMixins, self).tearDown()
        self.index.delete()

    def test_is_indexable(self):
        self.assertTrue(ESIndexableMixin().is_indexable())

    def test_is_index_update_needed(self):
        self.assertTrue(ESIndexableMixin().is_index_update_needed())

    def test_get_indexable_queryset(self):
        self.assertEqual(
            str(Token.get_indexable_queryset().query),
            str(Token.objects.all().query)
        )

    def test_get_es_doc(self):
        token = Token(name="token")
        self.assertIsNone(token.get_es_doc())
        token.save()
        self.assertIsNotNone(token.get_es_doc())

    def test_auto_doc_type_mapping(self):
        person = Person(first_name="Simion", last_name="Baws")
        person.save()
        doc_type = person.get_es_doc_mapping()
        self.assertEqual(doc_type.first_name, person.first_name)
        self.assertEqual(doc_type.last_name, person.last_name)
        self.assertEqual(
            doc_type.full_name,
            u"{0} {1}".format(person.first_name, person.last_name)
        )

    def test_es_index(self):
        # Asynchronous call.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        token.es_index()
        self.assertDocExists(token)

        # Synchronous call.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        token.es_index(async=False)
        self.assertDocExists(token)

        # Fail silently.
        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        token = Token.objects.create(name='raise_exception')
        settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        token.es_index()
        self.assertDocDoesntExist(token)

        # Hard fail.
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = False
        with self.assertRaises(RuntimeError):
            token.es_index()
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = True

    def test_es_delete(self):
        # Asynchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete()
        self.assertDocDoesntExist(Token, token.pk)

        # Synchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete(async=False)
        self.assertDocDoesntExist(Token, token.pk)

        # Fail silently if document doesn't exist.
        token.es_delete()

        from trampoline import get_trampoline_config
        trampoline_config = get_trampoline_config()

        # Fake delete to raise exception.
        backup_delete = trampoline_config.connection.delete

        def delete_raise_exception(*args, **kwargs):
            raise RuntimeError
        trampoline_config.connection.delete = delete_raise_exception

        # Fail silently
        token.es_delete()

        # Hard fail.
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = False
        with self.assertRaises(RuntimeError):
            token.es_delete()
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = True

        trampoline_config.connection.delete = backup_delete

    def test_save(self):
        token = Token(name='token')

        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        token.save()
        settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        self.assertDocDoesntExist(token)

        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'token')
        self.assertEqual(doc._id, str(token.pk))

        # Update model and synchronise doc.
        token.name = 'kento'
        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'kento')

        # Instance is not indexable.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)

    def test_delete(self):
        token = Token.objects.create(name='token')
        token_id = token.pk
        self.assertDocExists(token)

        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        token.delete()
        settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        self.assertDocExists(Token, token_id)

        token.save()
        token_id = token.pk
        token.delete()
        self.assertDocDoesntExist(Token, token_id)
Example #49
0
connections.create_connection()
es = connections.get_connection()

# Check if index already exists
i = Index(indexName)
index_exists = i.exists()

if not index_exists:
    # Define analyzer
    my_analyzer = analyzer('my_analyzer',
                           type="standard",
                           stopwords='_english_')

    # Create index
    i.analyzer(my_analyzer)
    i.create()
    print('Created index', indexName)
else:
    print('Index', indexName, 'already exists, skipping creation.')

# Index metadata documents
inputFile = 'data/processed/metadata.csv'
count = 0
metaProps = []
metaDoc = {}
with open(inputFile, newline='', encoding='utf-8') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for row in spamreader:
        count += 1
        if (count != 1):
            # create document
Example #50
0
def buildIndex():
    """
    buildIndex creates a new film index, deleting any existing index of
    the same name.
    It loads a json file containing the movie corpus and does bulk loading
    using a generator function.
    """
    film_index = Index('sample_film_index')
    if film_index.exists():
        film_index.delete()  # Overwrite any previous version
    film_index.create()

    # Open the json film corpus
    with open('2018_movies.json', 'r', encoding='utf-8') as data_file:
        # load movies from json file into dictionary
        movies = json.load(data_file)
        size = len(movies)

    if es.indices.exists(index='sample_film_index'):
        es.indices.delete(
            index='sample_film_index')  # Overwrite any previous version

    # Action series for bulk loading with helpers.bulk function.
    # Implemented as a generator, to return one movie with each call.
    # Note that we include the index name here.
    # The Document type is always 'doc'.
    # Every item to be indexed must have a unique key.
    def actions():
        # mid is the movie id (used as the key into the movies dictionary).
        # There is an issue in the corpus: some empty values are the string "[]"
        # instead of an empty list, so the data types are inconsistent and the
        # corpus needs to be cleaned up first.
        for mid in range(1, size + 1):
            if movies[str(mid)]['Starring'] == "[]":
                movies[str(mid)]['Starring'] = []
            if movies[str(mid)]['Country'] == "[]":
                movies[str(mid)]['Country'] = []
            if movies[str(mid)]['Language'] == "[]":
                movies[str(mid)]['Language'] = []
            if movies[str(mid)]['Director'] == "[]":
                movies[str(mid)]['Director'] = []
            if movies[str(mid)]['Running Time'] == "[]":
                movies[str(mid)]['Running Time'] = []
            elif movies[str(mid)]['Running Time'] == "TBA":
                movies[str(mid)]['Running Time'] = []
            elif movies[str(mid)]['Running Time'] == "? minutes":
                movies[str(mid)]['Running Time'] = []
            elif movies[str(mid)]['Running Time'] == "minutes":
                movies[str(mid)]['Running Time'] = []
            if len(movies[str(mid)]['Title']) <= 1:
                movies[str(mid)]['Title'] = "".join(movies[str(mid)]['Title'])
            else:
                movies[str(mid)]['Title'] = ", ".join(
                    movies[str(mid)]['Title'])
            if len(movies[str(mid)]['Starring']) <= 1:
                movies[str(mid)]['Starring'] = "".join(
                    movies[str(mid)]['Starring'])
            else:
                movies[str(mid)]['Starring'] = ", ".join(
                    movies[str(mid)]['Starring'])
            if len(movies[str(mid)]['Director']) <= 1:
                movies[str(mid)]['Director'] = "".join(
                    movies[str(mid)]['Director'])
            else:
                movies[str(mid)]['Director'] = ", ".join(
                    movies[str(mid)]['Director'])
            if len(movies[str(mid)]['Time']) <= 1:
                movies[str(mid)]['Time'] = "".join(movies[str(mid)]['Time'])
            else:
                movies[str(mid)]['Time'] = ", ".join(movies[str(mid)]['Time'])
            if len(movies[str(mid)]['Location']) <= 1:
                movies[str(mid)]['Location'] = "".join(
                    movies[str(mid)]['Location'])
            else:
                movies[str(mid)]['Location'] = ", ".join(
                    movies[str(mid)]['Location'])
            if len(movies[str(mid)]['Language']) <= 1:
                movies[str(mid)]['Language'] = "".join(
                    movies[str(mid)]['Language'])
            else:
                movies[str(mid)]['Language'] = ", ".join(
                    movies[str(mid)]['Language'])
            if len(movies[str(mid)]['Country']) <= 1:
                movies[str(mid)]['Country'] = "".join(
                    movies[str(mid)]['Country'])
            else:
                movies[str(mid)]['Country'] = ", ".join(
                    movies[str(mid)]['Country'])
            if len(movies[str(mid)]['Categories']) <= 1:
                movies[str(mid)]['Categories'] = "".join(
                    movies[str(mid)]['Categories'])
            else:
                movies[str(mid)]['Categories'] = ", ".join(
                    movies[str(mid)]['Categories'])

            yield {
                "_index": "sample_film_index",
                "_type": 'doc',
                "_id": mid,
                "title": movies[str(mid)]['Title'],
                "starring": movies[str(mid)]['Starring'],
                "runtime": movies[str(mid)]['Running Time'],
                # TODO: consider converting 'Running Time' to an integer number of minutes
                # --- Add more fields here ---
                "director": movies[str(mid)]['Director'],
                "location": movies[str(mid)]['Location'],
                "time": movies[str(mid)]['Time'],
                "language": movies[str(mid)]['Language'],
                "categories": movies[str(mid)]['Categories'],
                "country": movies[str(mid)]['Country'],
                "text": movies[str(mid)]['Text'],
            }

    helpers.bulk(es, actions())
Example #51
0
def createIndex():
    index = Index(INDEX, using=client)
    res = index.create()
    print(res)
Example #52
0
class QAManipulate:
    _PLACEHOLDER = object()

    def __init__(self, index_name, **settings):
        self.index_name = index_name
        self._index = Index(index_name)
        self._index.settings(**settings)
        self.connect()

        class Inner(QADuos):
            class Index:
                name = index_name

        self._inner_cls = Inner

    def create_index(self):
        self._index.create()

    def delete_index(self):
        self._index.delete()

    def insert(self, doc_id, a_id=None, a_content=None, q_list=None):
        data = self._inner_cls()
        found = data.get(id=doc_id, ignore=404)
        if found is None:
            data.meta.id = doc_id
            data.a_id, data.a_content, data.q_list = a_id, a_content, self._question_parser(
                q_list)
            return data.save()
        else:
            msg = "ID '{}' of Index '{}' already exists."
            raise ValueError(msg.format(doc_id, self._index._name))

    def update(self,
               doc_id,
               a_id=_PLACEHOLDER,
               a_content=_PLACEHOLDER,
               q_list=_PLACEHOLDER):
        data = self._inner_cls()
        found = data.get(id=doc_id, ignore=404)
        if found is not None:
            param_dict = {}
            if a_id is not QAManipulate._PLACEHOLDER:
                param_dict.update({"a_id": a_id})
            if a_content is not QAManipulate._PLACEHOLDER:
                param_dict.update({"a_content": a_content})
            if q_list is not QAManipulate._PLACEHOLDER:
                param_dict.update({"q_list": self._question_parser(q_list)})
            return found.update(**param_dict)
        else:
            msg = "ID '{}' of Index '{}' does not exist."
            raise ValueError(msg.format(doc_id, self._index._name))

    def delete(self, doc_id):
        data = self._inner_cls()
        found = data.get(id=doc_id, ignore=404)
        if found is not None:
            return found.delete()
        else:
            msg = "ID '{}' of Index '{}' does not exist."
            raise ValueError(msg.format(doc_id, self._index._name))

    def query(self,
              query,
              question_only=True,
              max_rec_cnt=5,
              boost_question=1,
              **kwargs):
        if question_only:
            result = self._search_by_question(query,
                                              max_rec_cnt=max_rec_cnt,
                                              **kwargs)
        else:
            result = self._search_by_qa(query,
                                        boost_question,
                                        max_rec_cnt=max_rec_cnt,
                                        **kwargs)

        df = pd.DataFrame(self._parse_result(result))
        df_temp = df[['doc_id', 'score']].copy().drop_duplicates()
        df_temp['score_softmax'] = self._softmax(df_temp['score'])

        return df.merge(df_temp, on=['doc_id', 'score'], how='left')

    def _search_by_question(self, question, max_rec_cnt, **kwargs):
        s = self._inner_cls().search(**kwargs)

        result = s.query("match",
                         q_list__q_content=question).execute(ignore_cache=True)
        return result[:max_rec_cnt]

    def _search_by_qa(self, qa_string, boost, max_rec_cnt, **kwargs):
        s = self._inner_cls().search(**kwargs)

        boosted_question = "q_list.q_content^{}".format(boost)

        result = s.query("multi_match", query=qa_string, fields=['a_content', boosted_question])\
                  .execute(ignore_cache=True)
        return result[:max_rec_cnt]

    def _raw_search(self, **kwargs):
        s = self._inner_cls().search(**kwargs)
        return s

    @staticmethod
    def connect(hosts=['http://*****:*****@192.168.10.49:9200/'],
                timeout=80):
        connections.create_connection(hosts=hosts, timeout=timeout)

    @staticmethod
    def _parse_result(result):
        for hit in result:
            a_id, a_content, score, doc_id = hit.a_id, hit.a_content, hit.meta.score, hit.meta.id
            for question in hit.q_list:
                q_id = question.q_id
                q_content = question.q_content

                yield {
                    'a_id': a_id,
                    'a_content': a_content,
                    'score': score,
                    'q_id': q_id,
                    'q_content': q_content,
                    'doc_id': doc_id
                }

    @staticmethod
    def _question_parser(question_list):
        result = []
        for question_data in question_list:
            result.append(
                Questions(q_id=question_data['q_id'],
                          q_content=question_data['q_content']))
        return result

    @staticmethod
    def _softmax(score_list):
        score_array = np.sqrt(score_list)
        exp_array = np.exp(score_array)
        factor = score_list[0] / (1 + score_list[0])
        softmax_array = exp_array / exp_array.sum() * factor
        return softmax_array
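
# Quick numeric check of the scaled softmax above (a sketch, assuming numpy
# and a hypothetical descending list of BM25 scores): square-rooting dampens
# the score spread, and the leading factor keeps the weights summing to just
# under 1 rather than exactly 1.
import numpy as np

scores = np.array([12.0, 8.5, 3.2])       # hypothetical BM25 scores, best first
exp_array = np.exp(np.sqrt(scores))
factor = scores[0] / (1 + scores[0])      # ~0.92 for a top score of 12
weights = exp_array / exp_array.sum() * factor
print(weights, weights.sum())             # the weights sum to `factor`, not to 1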
Example #53
0
def test_elasticsearch_target(sdc_builder, sdc_executor, elasticsearch,
                              additional_properties):
    """Test for Elasticsearch target stage. We do so by ingesting data via Dev Raw Data source to
    Elasticsearch stage and then asserting what we ingest to what will be read from Elasticsearch.
    The pipeline looks like:

    Elasticsearch target pipeline:
        dev_raw_data_source >> es_target
    """
    # Test static
    es_index = get_random_string(
        string.ascii_letters,
        10).lower()  # Elasticsearch indexes must be lower case
    es_mapping = get_random_string(string.ascii_letters, 10)
    es_doc_id = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    # Build pipeline
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='TEXT',
                                              stop_after_first_batch=True,
                                              raw_data=raw_str)
    es_target = builder.add_stage('Elasticsearch', type='destination')
    es_target.set_attributes(default_operation='INDEX',
                             document_id=es_doc_id,
                             index=es_index,
                             mapping=es_mapping,
                             additional_properties=additional_properties)

    dev_raw_data_source >> es_target
    es_target_pipeline = builder.build(
        title='ES target pipeline').configure_for_environment(elasticsearch)
    es_target_pipeline.configuration["shouldRetry"] = False

    sdc_executor.add_pipeline(es_target_pipeline)

    try:
        elasticsearch.connect()

        # Make sure that the index exists properly before running the test
        index = Index(es_index)
        index.create()
        assert index.refresh()

        # Run pipeline and read from Elasticsearch to assert
        sdc_executor.start_pipeline(es_target_pipeline).wait_for_finished()

        # Since we are upsert on the same index, map, doc - there should only be one document (index 0)
        es_search = ESSearch(index=es_index)
        es_response = _es_search_with_retry(es_search)
        es_meta = es_response[0].meta

        # assert meta ingest
        assert es_meta['index'] == es_index and es_meta[
            'doc_type'] == es_mapping and es_meta['id'] == es_doc_id
        # assert data ingest
        assert raw_str == es_response[0].text
    finally:
        # Clean up test data in ES
        idx = Index(es_index)
        idx.delete()
Example #54
0
def buildIndex():
    idiom_index = Index('idioms_search')

    if idiom_index.exists():
        idiom_index.delete()
    idiom_index.document(Idiom)
    idiom_index.create()

    # get json object movies
    with open('chengyu_addedfeatures.json', 'r',
              encoding='utf-8') as data_file:
        idioms = json.load(data_file)
        size = len(idioms)
    with open('translations.json', 'r', encoding='utf-8') as translation_file:
        translations = json.load(translation_file)

    def actions():
        for mid in range(1, size + 1):
            pinyin_segmentation = idioms[str(mid)]['Pinyin_segmented']
            segmentation_string = " ".join(pinyin_segmentation)
            animal = idioms[str(mid)]['Animal']
            zodiac = ", ".join(animal)
            english = idioms[str(mid)]['English']
            idioms[str(mid)]['English'] = english.rstrip("\"")
            # print(segmentation_string)
            yield {
                "_index":
                "idioms_search",
                "_type":
                'doc',
                "_id":
                mid,
                "name":
                idioms[str(mid)]['Name'],
                "english":
                idioms[str(mid)]['English'],
                "afterword":
                idioms[str(mid)]['Afterword'],
                "riddle":
                idioms[str(mid)]['Riddle'],
                "source":
                idioms[str(mid)]['Source'],
                "story":
                idioms[str(mid)]['Story'],
                "synonym":
                idioms[str(mid)]['Synonym'],
                "antonym":
                idioms[str(mid)]['Antonym'],
                "desc_translation":
                idioms[str(mid)]['Description_Translations'],
                "source_translation":
                idioms[str(mid)]['Source_Translations'],
                "story_translation":
                idioms[str(mid)]['Story_Translations'],
                "usage_translation":
                idioms[str(mid)]['Usage_Translations'],
                "desc_segmentation":
                translations[str(mid)]['Description_Segmentation'],
                "source_segmentation":
                translations[str(mid)]['Source_Segmentation'],
                "story_segmentation":
                translations[str(mid)]['Story_Segmentation'],
                "usage_segmentation":
                translations[str(mid)]['Usage_Segmentation'],
                "pinyin":
                segmentation_string,
                "zodiac":
                zodiac,
                "sentiment":
                idioms[str(mid)]['Sentiment'],
                "difficulty":
                idioms[str(mid)]['Difficulty'],
                "char_num":
                idioms[str(mid)]['Char_num'],
                # "synonym": idioms[str(mid)]['Synonym']
            }

    helpers.bulk(es, actions())
Example #55
0
http_auth = (os.getenv("USERNAME"), os.getenv("PASSWORD"))
port = os.getenv("PORT")
client = connections.create_connection(hosts=hosts,
                                       http_auth=http_auth,
                                       port=port)

# initiate Redis connection
redis_conn = Redis(os.getenv("REDIS_HOST", "redis"),
                   os.getenv("REDIS_PORT", 6379))

# create indices and mappings
for lang in ["fr"]:  #languages :
    # index named "web-<language code>"
    index = Index('web-%s' % lang)
    if not index.exists():
        index.create()

    # mapping of page
    m = Mapping('page')
    m.field('url', 'keyword')
    m.field('domain', 'keyword')
    m.field('title', 'text', analyzer=languages[lang])
    m.field('description', 'text', analyzer=languages[lang])
    m.field('body', 'text', analyzer=languages[lang])
    m.field('weight', 'long')
    #m.field('thumbnail', 'binary')
    #m.field('keywords', 'completion') # -- TEST -- #
    m.save('web-%s' % lang)

# index for misc mappings
index = Index('web')
Example #56
0
def build_index():
    """
    build_index creates a new article index, deleting any existing index of
    the same name.
    It loads the pickled article corpus and metadata and does bulk loading
    using a generator function.
    """
    article_index = Index(args.index_name)
    if article_index.exists():
        article_index.delete()  # overwrite any previous version
    article_index.document(Article)  # register the document mapping
    article_index.create()

    with open(os.path.join(args.module_dir_path, 'graph.p'), 'rb') as f:
        citation_graph = pickle.load(f)
    pagerank_scores = nx.pagerank(citation_graph)
    ddict = defaultdict(float, pagerank_scores)

    # load articles from data source
    with open(os.path.join(args.module_dir_path, 'articles.p'), 'rb') as f:
        articles = pickle.load(f)

    # build a default dictionary to map titles to ids (for eventual use in citations 'more like this')
    titles_to_ids = {
        v['metadata']['title'].lower(): k
        for k, v in enumerate(articles.values())
    }
    titles_to_ids = defaultdict(
        lambda: -1, titles_to_ids)  # -1 is default value for a key error

    # get anchor text:
    anchor_text_dict = utils.get_anchor_text(articles, titles_to_ids)

    # open ner and metadata dict
    with open(args.meta_ner_path, 'r') as f:
        meta_ner_all = json.load(f)

    # get entity frequencies (to filter out unique entities)
    ent_freqs = utils.get_entity_counts(meta_ner_all)

    def actions():
        for i, article in enumerate(articles.values()):
            sha = article['paper_id']

            # extract contents of entity and metadata dict
            if sha in set(meta_ner_all.keys(
            )):  # entities, source, doi, publish_time, has_full_text, journal
                ents = []
                for type, entlist in meta_ner_all[sha]['entities'].items():
                    if type in entity_types:
                        ents.extend(entlist)
                ents = [ent for ent in ents if ent_freqs[ent] > 1
                        ]  # get only ents that occur > 1 in corpus
                ents_str = utils.untokenize(
                    ents)  # transform to string type for indexing

                publish_time = utils.extract_year(
                    meta_ner_all[sha]["publish_time"])
                journal = meta_ner_all[sha]['journal']
            else:
                publish_time = 0
                ents_str = ''
                journal = ''

            # extract contents of article dict
            title = article['metadata']['title'] if 'title' in article[
                'metadata'].keys() else '(Untitled)'
            cits = article['bib_entries'] if 'bib_entries' in article.keys(
            ) else [{}]
            cits = [{
                "title":
                cit['title'],
                "year":
                cit['year'],
                "in_corpus":
                titles_to_ids[cit['title'].lower()],
                "authors": [{
                    "first": auth['first'],
                    "last": auth["last"]
                } for auth in cit['authors']]
            } for cit in cits.values() if cit['title'] != '']
            authors = [{
                "first": auth['first'],
                "last": auth["last"]
            } for auth in article['metadata']['authors']]
            pr = ddict[article['metadata']['title'].lower()]
            abstract = ' '.join([
                abs['text'] if 'text' in abs.keys() else ''
                for abs in article['abstract']
            ]) if 'abstract' in article.keys() else ''
            anchor_text = ' '.join(
                [cit['text'] for cit in anchor_text_dict[title.lower()]])
            section_dict = defaultdict(list)
            for txt in article['body_text']:
                section = txt['section']
                section_dict[section].append(txt['text'])
            body = [{"name": k, "text": v} for k, v in section_dict.items()]
            cited_by = anchor_text_dict[title.lower()]

            body_text = ' '.join(
                [sect['text'] for sect in article['body_text']])

            # check that article is in English
            in_english = (langid.classify(body_text)[0] == 'en')

            yield {
                "_index": args.index_name,
                "_type": '_doc',
                "_id": i,
                "title": title,
                "id_num": sha,
                "abstract": abstract,
                "body": body,
                "body_text": body_text,
                "authors": authors,
                "publish_time": publish_time,
                "journal": journal,
                "citations": cits,
                "in_english": in_english,
                "pr": pr,
                "anchor_text": anchor_text,
                "cited_by": cited_by,
                "ents": ents_str,
            }

    helpers.bulk(
        es, actions(), raise_on_error=True
    )  # one doc in corpus contains a NAN value and it has to be ignored.
Example #57
0
from elasticsearch_dsl import Index, DocType, Text, analyzer, connections
from doctype import Movies

connections.create_connection(hosts=['localhost'])

single_shard_movies = Index('single_shard_movies')

single_shard_movies.settings(number_of_shards=1, number_of_replicas=0)

single_shard_movies.doc_type(Movies)

multi_shard_movies = single_shard_movies.clone('multi_shard_movies')
multi_shard_movies.settings(number_of_shards=3, number_of_replicas=0)

single_shard_movies.create()
multi_shard_movies.create()
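
# Follow-up sketch (an assumption-laden check, not part of the snippet above):
# clone() copies the settings and registered doc types into a new Index object
# before anything is created, so after both create() calls the shard counts
# should differ. Reading them back from the cluster:
client = connections.get_connection()
settings = client.indices.get_settings(index='single_shard_movies,multi_shard_movies')
for name, cfg in settings.items():
    # Expect 1 shard for the original and 3 for the clone.
    print(name, cfg['settings']['index']['number_of_shards'])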
Example #58
0
    for f in lfiles:
        ftxt = codecs.open(f, "r", encoding='iso-8859-1')

        text = ''
        for line in ftxt:
            text += line
        # Insert operation for a document with fields' path' and 'text'
        ldocs.append({
            '_op_type': 'index',
            '_index': index,
            '_type': 'document',
            'path': f,
            'text': text
        })

    # Working with ElasticSearch
    client = Elasticsearch()
    try:
        # Drop index if it exists
        ind = Index(index, using=client)
        ind.delete()
    except NotFoundError:
        pass
    # then create it
    ind.settings(number_of_shards=1)
    ind.create()

    # Bulk execution of elasticsearch operations (faster than executing all one by one)
    print('Indexing ...')
    bulk(client, ldocs)
Example #59
0
    def __init__(self):
        movies = Index('imdb', using=es)
        movies.doc_type(Movie)
        movies.delete(ignore=404)
        movies.create()