def setup_index(year):
    """Create the per-year politicians index: alias, mapping and analyzer included."""
    yearly_index = Index(f'{INDEX_NAME}-{year}')
    yearly_index.settings(number_of_shards=2, number_of_replicas=0)
    # Expose every yearly index under the shared 'politicians' alias.
    yearly_index.aliases(politicians={})
    yearly_index.document(Politicians)
    yearly_index.analyzer(brazilian_analyzer)
    yearly_index.create()
class BaseSearchTestCase(TestCase):
    """Base TestCase that provisions a throwaway Elasticsearch index per test.

    setUp rebinds every DocType found in ``documents`` to a dedicated test
    index and recreates that index; tearDown removes the index and drains
    the django_rq queue.
    """

    def setUp(self):
        from django.conf import settings
        SEARCH = getattr(settings, 'SEARCH')
        # Dedicated 'testing' connection so tests never touch the default cluster.
        connections.create_connection('testing', **SEARCH['default']['connections'])
        self.index = Index(SEARCH['default']['index'], using='testing')
        # This is needed for test_documents, but has side effects in all running tests
        doctypes_list = (
            value for name, value
            in inspect.getmembers(documents)
            if not name.startswith('_') and
            inspect.isclass(value) and
            issubclass(value, DocType) and
            name != DocType.__name__
        )
        for doctype in doctypes_list:
            # Remove assigned index
            doctype._doc_type.index = None
            # Associate docs with test index
            self.index.doc_type(doctype)
        # Start from a clean slate even if a previous run left the index behind.
        if self.index.exists():
            self.index.delete(ignore=404)
        self.index.create()
        self.search = Search(index=SEARCH['default']['index'])

    def tearDown(self):
        # Drop the test index and any queued indexing jobs.
        self.index.delete()
        queue = django_rq.get_queue()
        queue.empty()
def test_index_can_be_created_with_settings_and_mappings(write_client):
    """Index.create() must push both the document mapping and the settings."""
    i = Index('test-blog', using=write_client)
    i.document(Post)
    i.settings(number_of_replicas=0, number_of_shards=1)
    i.create()
    # The mapping derived from Post should be exactly what the cluster reports.
    assert {
        'test-blog': {
            'mappings': {
                'properties': {
                    'title': {'type': 'text', 'analyzer': 'my_analyzer'},
                    'published_from': {'type': 'date'}
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-blog')
    settings = write_client.indices.get_settings(index='test-blog')
    # Elasticsearch returns settings values as strings.
    assert settings['test-blog']['settings']['index']['number_of_replicas'] == '0'
    assert settings['test-blog']['settings']['index']['number_of_shards'] == '1'
    # The custom analyzer declared on Post's Text field must be installed too.
    assert settings['test-blog']['settings']['index']['analysis'] == {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'keyword'
            }
        }
    }
def setup_es():
    """Create the ElasticSearch index and configure the mapping.

    Exits with status 0 (without touching anything) when the index already
    exists, printing instructions for deleting it manually.
    """
    client = elasticsearch_config['client']
    index_name = elasticsearch_config['index']
    info = client.info()
    print('ElasticSearch version: {0}'.format(info['version']['number']))
    if client.indices.exists(index_name):
        # Refuse to clobber an existing index; tell the operator how to reset.
        print('Index "{0}" already exists. To re-create the index, manually '
              'delete the index and run this script again.'.format(index_name))
        print('To delete the index run:')
        print('curl -XDELETE \'http://{0}:{1}/{2}/\''.format(
            elasticsearch_config['host'], elasticsearch_config['port'],
            index_name))
        sys.exit(0)
    index = Index(index_name)
    index.settings(analysis=analysis_settings)
    # Register every searchable document type before creation so their
    # mappings are part of the index-creation request.
    index.doc_type(SearchArea)
    index.doc_type(SearchBook)
    index.doc_type(SearchImage)
    index.doc_type(SearchOuting)
    index.doc_type(SearchXreport)
    index.doc_type(SearchRoute)
    index.doc_type(SearchTopoMap)
    index.doc_type(SearchUser)
    index.doc_type(SearchWaypoint)
    index.doc_type(SearchArticle)
    index.create()
    print('Index "{0}" created'.format(index_name))
def create_index_if_does_not_exist(cls):
    """Create cls.INDEX_NAME (with cls registered as its doc type) unless present."""
    idx = Index(cls.INDEX_NAME)
    idx.doc_type(cls)
    if idx.connection.indices.exists(cls.INDEX_NAME):
        return
    idx.create()
    # Index creation is asynchronous on the cluster side; give it a moment.
    time.sleep(1)
def create_index_if_not_exists(self, index_name):
    """Remember *index_name* on self and create it with 1 shard / 1 replica.

    Creation errors (typically "index already exists") are ignored so the
    call is idempotent.

    :param index_name: name of the Elasticsearch index to ensure.
    """
    self.index = index_name
    idx = Index(index_name)
    idx.settings(number_of_shards=1, number_of_replicas=1)
    idx.doc_type(LogType)
    try:
        idx.create()
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception while keeping the
        # deliberate best-effort behavior.
        pass
def initialize_index(self, delete_if_exists=False):
    """
    Initialize index with mapping in ElasticSearch

    :param delete_if_exists: delete index, if exists
    :return: None
    """
    def update_index_settings():
        """
        Function updates settings for slovenian lemmatization of words. As far as we know,
        elasticsearch-dsl library does not support custom filter settings.

        :return: None
        """
        analysis_settings = {
            "analysis": {
                "filter": {
                    "lemmagen_filter_sl": {
                        "type": "lemmagen",
                        "lexicon": "sl"
                    }
                },
                "analyzer": {
                    "lemmagen_sl": {
                        "type": "custom",
                        "tokenizer": "uax_url_email",
                        "filter": [
                            "lemmagen_filter_sl",
                            "lowercase"
                        ]
                    }
                }
            }
        }
        # Analysis settings can only be changed on a closed index: wait for
        # the index to be healthy, close it, apply settings, reopen it.
        self.client.cluster.health(index=self.index_name, wait_for_status='green', request_timeout=2)
        self.client.indices.close(index=self.index_name)
        self.client.indices.put_settings(json.dumps(analysis_settings), index=self.index_name)
        self.client.indices.open(index=self.index_name)

    index = Index(self.index_name, using=self.client)
    if delete_if_exists and index.exists():
        index.delete()
    index.settings(
        # use higher number in production
        number_of_replicas=0
    )
    # register models
    index.doc_type(Document)
    index.create()
    update_index_settings()  # set lemmanizer
def es_index_mapping(index_name, doc_type, force=False):
    """Create *index_name* with *doc_type* registered; upgrade an existing index when *force*."""
    target = Index(index_name)
    target.doc_type(doc_type)
    if not target.exists():
        target.create()
    elif force:
        # Existing index: only touch it when explicitly forced.
        target.upgrade()
def create_index(hosts, index):
    """Create a fresh index on *hosts* with all doc types registered.

    Doc types must be attached to the Index object *before* create() so
    their mappings are included in the index-creation request; the previous
    version registered them after create(), leaving the new index without
    mappings.

    :param hosts: Elasticsearch host list passed through to set_hosts_index.
    :param index: name of the index to create.
    """
    i = set_hosts_index(hosts=hosts, index=index)
    logprint('debug', 'registering doc types')
    i = Index(index)
    i.doc_type(Author)
    i.doc_type(Page)
    i.doc_type(Source)
    logprint('debug', 'creating new index')
    i.create()
    logprint('debug', 'DONE')
def create_search_index(index_name, doc_types=None, connection='default', delete_if_exists=False):
    """Build (and optionally recreate) a search index with the given doc types.

    Doc types given as strings are resolved through get_document_class.
    Returns the Index object either way.
    """
    idx = Index(index_name, using=connection)
    if delete_if_exists:
        # ignore=404: deleting a missing index is not an error.
        idx.delete(ignore=404)
    for doc_type in (doc_types or ()):
        if isinstance(doc_type, str):
            doc_type = get_document_class(doc_type)
        idx.doc_type(doc_type)
    if not idx.exists():
        idx.create()
    return idx
def applyConfig(self):
    """Connect to the configured Elasticsearch host and ensure the index exists.

    Any initialization failure is reported to the user via a Swing dialog
    instead of crashing the Burp extension.
    """
    try:
        print("Connecting to '%s', index '%s'" % (self.confESHost, self.confESIndex))
        res = connections.create_connection(hosts=[self.confESHost])
        idx = Index(self.confESIndex)
        idx.doc_type(DocHTTPRequestResponse)
        DocHTTPRequestResponse.init()
        try:
            idx.create()
        except Exception:
            # Was a bare `except:`; narrowed to Exception. create() raising
            # usually just means the index already exists — best effort.
            pass
    except Exception as e:
        JOptionPane.showMessageDialog(self.panel, "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>" % (str(e)), "Error", JOptionPane.ERROR_MESSAGE)
def test_es_create_documents(self):
    """es_create_documents: argument validation, dry-run, and selective indexing."""
    # Index name required.
    with self.assertRaises(SystemExit):
        call_command('es_create_documents')
    # index_name not in settings.
    with self.assertRaises(SystemExit):
        call_command(
            'es_create_documents',
            index_name='barfoo'
        )
    # Index doesn't exist.
    with self.assertRaises(SystemExit):
        call_command(
            'es_create_documents',
            index_name='foobar'
        )
    index = Index('foobar')
    doc_type = Token.get_es_doc_type()
    index.doc_type(doc_type)
    index.create()
    self.refresh()
    # Disable auto indexing while creating objects.
    settings.TRAMPOLINE['OPTIONS']['disabled'] = True
    token = Token.objects.create(name="token")
    token_not_indexable = Token.objects.create(name='not_indexable')
    token_raise_exception = Token.objects.create(name='raise_exception')
    settings.TRAMPOLINE['OPTIONS']['disabled'] = False
    # Dry run.
    call_command(
        'es_create_documents',
        index_name='foobar',
        dry_run=True
    )
    # A dry run must not create any document.
    self.assertDocDoesntExist(token)
    self.assertDocDoesntExist(token_not_indexable)
    self.assertDocDoesntExist(token_raise_exception)
    call_command(
        'es_create_documents',
        index_name='foobar',
        verbosity=3
    )
    # Only the plainly-indexable token is expected to make it into the index.
    self.assertDocExists(token)
    self.assertDocDoesntExist(token_not_indexable)
    self.assertDocDoesntExist(token_raise_exception)
def test_es_create_alias(self):
    """es_create_alias: argument validation, name-collision guard, dry-run, success."""
    # Index name required.
    with self.assertRaises(SystemExit):
        call_command(
            'es_create_alias',
            target_name='foobar_target'
        )
    # Target name required.
    with self.assertRaises(SystemExit):
        call_command(
            'es_create_alias',
            index_name='foobar'
        )
    # Index doesn't exist.
    with self.assertRaises(SystemExit):
        call_command(
            'es_create_alias',
            index_name='foobar',
            target_name='foobar_target'
        )
    index = Index('foobar_target')
    index.create()
    self.refresh()
    # Alias with same name as index.
    with self.assertRaises(SystemExit):
        call_command(
            'es_create_alias',
            index_name='foobar_target',
            target_name='foobar_target'
        )
    # Dry run.
    call_command(
        'es_create_alias',
        index_name='foobar',
        target_name='foobar_target',
        dry_run=True
    )
    # A dry run must not create the alias.
    self.assertAliasDoesntExist(index='foobar_target', name='foobar')
    call_command(
        'es_create_alias',
        index_name='foobar',
        target_name='foobar_target'
    )
    self.assertAliasExists(index='foobar_target', name='foobar')
def create_index():
    """Create the docstore index and initialize the mappings of its doc types."""
    index = set_hosts_index()
    logprint('debug', 'creating new index')
    # NOTE(review): `index` from set_hosts_index() is immediately rebound here.
    index = Index(settings.DOCSTORE_INDEX)
    index.create()
    logprint('debug', 'creating mappings')
    # The .init() calls push each doc type's mapping to the live index.
    Author.init()
    Page.init()
    Source.init()
    logprint('debug', 'registering doc types')
    # NOTE(review): doc_type() after create() does not affect the already-created
    # index's mappings (those came from the init() calls above) — confirm intent.
    index.doc_type(Author)
    index.doc_type(Page)
    index.doc_type(Source)
    logprint('debug', 'DONE')
def _create_index(self):
    """Ensure this month's indicators index exists (alias 'live', explicit IP
    field mappings) and return its name."""
    index_name = 'indicators-{}'.format(datetime.utcnow().strftime('%Y.%m'))
    conn = connections.get_connection()
    if not conn.indices.exists(index_name):
        monthly = Index(index_name)
        monthly.aliases(live={})
        monthly.doc_type(Indicator)
        monthly.create()
        # Extra field mappings not expressed on the Indicator doc type.
        mapping = Mapping('indicator')
        mapping.field('indicator_ipv4', 'ip')
        mapping.field('indicator_ipv4_mask', 'integer')
        mapping.save(index_name)
    return index_name
def create_indices(endpoint):
    """
    Creates constituent and address indices in PIC
    """
    connections.connections.create_connection(hosts=[endpoint], timeout=360, max_retries=10, retry_on_timeout=True)
    index = Index('pic')
    index.doc_type(Constituent)
    index.doc_type(Address)
    # Drop any previous version before recreating (404 = nothing to delete).
    index.delete(ignore=404)
    index.settings(number_of_shards=5, number_of_replicas=2)
    index.create()
def registerExtenderCallbacks(self, callbacks):
    """Burp entry point: register listeners and make sure the ES index exists.

    :param callbacks: IBurpExtenderCallbacks instance provided by Burp.
    """
    self.callbacks = callbacks
    self.helpers = callbacks.getHelpers()
    callbacks.setExtensionName("Storing HTTP Requests/Responses into ElasticSearch")
    self.callbacks.registerHttpListener(self)
    self.callbacks.registerContextMenuFactory(self)
    self.out = callbacks.getStdout()
    res = connections.create_connection(hosts=[ES_host])
    idx = Index(ES_index)
    idx.doc_type(DocHTTPRequestResponse)
    try:
        idx.create()
    except Exception:
        # Was a bare `except:`; narrowed to Exception so interrupts still
        # propagate. create() raising usually means the index exists.
        print("Index already exists")
def test_es_delete_alias(self):
    """es_delete_alias: argument validation, missing-alias guard, success."""
    # Index name required.
    with self.assertRaises(SystemExit):
        call_command(
            'es_delete_alias',
            target_name='foobar_target'
        )
    # Target name required.
    with self.assertRaises(SystemExit):
        call_command(
            'es_delete_alias',
            index_name='foobar'
        )
    # Index doesn't exist.
    with self.assertRaises(SystemExit):
        call_command(
            'es_delete_alias',
            index_name='foobar',
            target_name='foobar_target',
            yes=True
        )
    index = Index('foobar_target')
    index.create()
    self.refresh()
    # Alias doesn't exist.
    with self.assertRaises(SystemExit):
        call_command(
            'es_delete_alias',
            index_name='foobar',
            target_name='foobar_target',
            yes=True
        )
    # Create the alias out-of-band, then delete it through the command.
    trampoline_config.connection.indices.put_alias(
        index='foobar_target', name='foobar')
    self.assertAliasExists(index='foobar_target', name='foobar')
    call_command(
        'es_delete_alias',
        index_name='foobar',
        target_name='foobar_target',
        yes=True
    )
    self.assertAliasDoesntExist(index='foobar_target', name='foobar')
def _create_index():
    """Ensure the current indicators index exists and return its name.

    Adds explicit ip/integer/date field mappings on top of the Indicator
    doc type; new indices also get the 'live' alias.
    """
    # https://github.com/csirtgadgets/massive-octo-spice/blob/develop/elasticsearch/observables.json
    # http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch.Elasticsearch.bulk
    name = _current_index()
    conn = connections.get_connection()
    if not conn.indices.exists(name):
        new_index = Index(name)
        new_index.aliases(live={})
        new_index.doc_type(Indicator)
        new_index.create()
        mapping = Mapping('indicator')
        mapping.field('indicator_ipv4', 'ip')
        mapping.field('indicator_ipv4_mask', 'integer')
        mapping.field('lasttime', 'date')
        mapping.save(name)
    return name
def _create_index(self):
    """Ensure the current month's indicators index exists; return its name."""
    # https://github.com/csirtgadgets/massive-octo-spice/blob/develop/elasticsearch/observables.json
    # http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch.Elasticsearch.bulk
    # Indices are partitioned by month (indicators-YYYY.MM).
    dt = datetime.utcnow()
    dt = dt.strftime('%Y.%m')
    es = connections.get_connection()
    if not es.indices.exists('indicators-{}'.format(dt)):
        index = Index('indicators-{}'.format(dt))
        # All monthly indices are reachable through the 'live' alias.
        index.aliases(live={})
        index.doc_type(Indicator)
        index.create()
        # Extra field mappings not declared on the Indicator doc type.
        m = Mapping('indicator')
        m.field('indicator_ipv4', 'ip')
        m.field('indicator_ipv4_mask', 'integer')
        m.save('indicators-{}'.format(dt))
    return 'indicators-{}'.format(dt)
def recreate_index():
    """Delete index if it's there and creates a new one"""
    index = Index(name=get_index_name(), using='default')
    # Only the doc type classes matter here; the registry keys are unused,
    # so iterate values() directly instead of items().
    for doc_type in get_doctypes().values():
        index.doc_type(doc_type)
    # Delete the index if it exists.
    try:
        index.delete()
    except NotFoundError:
        pass
    # Note: There should be no mapping-conflict race here since the
    # index doesn't exist. Live indexing should just fail.
    # Create the index with the mappings all at once.
    index.create()
def test_index_template_works(write_client):
    """An IndexTemplate's mapping must apply to indices matching its pattern."""
    it = IndexTemplate('test-template', 'test-*')
    it.document(Post)
    it.settings(number_of_replicas=0, number_of_shards=1)
    it.save()
    # 'test-blog' matches 'test-*', so it inherits the template's mapping.
    i = Index('test-blog')
    i.create()
    assert {
        'test-blog': {
            'mappings': {
                'properties': {
                    'title': {'type': 'text', 'analyzer': 'my_analyzer'},
                    'published_from': {'type': 'date'},
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-blog')
def create_index(index_name, doc_classes=None):
    """
    Create index and add document classes to it.

    Does NOT check whether index already exists.

    :param index_name: Name of index to be created.
    :param doc_classes: Sequence of document classes which should be added
        to created index. Defaults to None, in which case all document
        classes from document registry are added to new index.
    """
    new_index = Index(index_name)
    classes = get_document_classes().values() if doc_classes is None else doc_classes
    for document_class in classes:
        new_index.doc_type(document_class)
    new_index.create()
def _create_index(self):
    """Return the current indicators index name, creating the index if needed.

    Caches the last answer for two minutes to avoid hammering the cluster
    with HEAD requests on every call.
    """
    # https://github.com/csirtgadgets/massive-octo-spice/blob/develop/elasticsearch/observables.json
    # http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch.Elasticsearch.bulk
    # every time we check it does a HEAD req
    if self.last_index_value and (datetime.utcnow() - self.last_index_check) < timedelta(minutes=2):
        return self.last_index_value
    idx = self._current_index()
    if not self.handle.indices.exists(idx):
        index = Index(idx)
        index.aliases(live={})
        index.doc_type(Indicator)
        # Raise the pagination ceiling beyond the 10k default.
        index.settings(max_result_window=WINDOW_LIMIT)
        index.create()
        self.handle.indices.flush(idx)
    # Refresh the cache stamp whether or not we had to create the index.
    self.last_index_check = datetime.utcnow()
    self.last_index_value = idx
    return idx
def test_index_can_be_created_with_settings_and_mappings(write_client):
    """Index.create() must push per-doc-type mappings and settings (pre-5.x API)."""
    i = Index('test-blog', using=write_client)
    i.doc_type(Post)
    i.doc_type(User)
    i.settings(number_of_replicas=0, number_of_shards=1)
    i.create()
    # Each registered doc type contributes its own mapping section.
    assert {
        'test-blog': {
            'mappings': {
                'post': {
                    'properties': {
                        'title': {'type': 'string', 'analyzer': 'my_analyzer'},
                        'published_from': {'type': 'date', 'format': 'dateOptionalTime',},
                    }
                },
                'user': {
                    'properties': {
                        'username': {'type': 'string', 'index': 'not_analyzed'},
                        'joined_date': {'type': 'date', 'format': 'dateOptionalTime',},
                    }
                },
            }
        }
    } == write_client.indices.get_mapping(index='test-blog')
    settings = write_client.indices.get_settings(index='test-blog')
    # Elasticsearch reports settings values as strings.
    assert settings['test-blog']['settings']['index']['number_of_replicas'] == '0'
    assert settings['test-blog']['settings']['index']['number_of_shards'] == '1'
    assert settings['test-blog']['settings']['index']['analysis'] == {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'keyword'
            }
        }
    }
def mitm_request(self, data):
    """Record an intercepted HTTP request into Elasticsearch; return *data* unchanged.

    Builds the request URL from the proxy connection state, stores a partially
    filled DocHTTPRequestResponse on self.doc, and passes the raw request
    through untouched.
    """
    # Initialize ES connection and index
    res = connections.create_connection(hosts=[args.elasticsearch])
    idx = Index(args.index)
    idx.doc_type(DocHTTPRequestResponse)
    try:
        DocHTTPRequestResponse.init()
        idx.create()
    except Exception:
        # Was a bare `except:`; narrowed to Exception while keeping the
        # best-effort "index may already exist" behavior.
        pass
    r = HTTPRequest(data)
    # determine url
    if self.is_connect:
        scheme = "https"
    else:
        scheme = "http"
    url = scheme + "://" + self.hostname
    # Only include the port when it is not the scheme's default.
    if scheme == "http" and int(self.port) != 80 or scheme == "https" and int(self.port) != 443:
        url += ":" + str(self.port)
    url += self.path
    if args.verbose:
        print(url)
    self.doc = DocHTTPRequestResponse(host=self.hostname, port=int(self.port), protocol=scheme)
    self.doc.meta.index = args.index
    self.doc.request.url = url
    self.doc.request.requestline = r.requestline
    self.doc.request.method = r.command
    self.doc.host = self.hostname
    self.doc.port = int(self.port)
    self.doc.protocol = scheme
    return data
def test_es_delete_index(self):
    """es_delete_index: argument validation, missing-index guard, deletion."""
    # Index name required.
    with self.assertRaises(SystemExit):
        call_command('es_delete_index')
    # Index doesn't exist.
    with self.assertRaises(SystemExit):
        call_command(
            'es_delete_index',
            index_name='foobar',
            yes=True
        )
    index = Index('foobar')
    index.create()
    self.refresh()
    self.assertIndexExists('foobar')
    # yes=True skips the interactive confirmation prompt.
    call_command(
        'es_delete_index',
        index_name='foobar',
        yes=True
    )
    self.assertIndexDoesntExist('foobar')
) @nsf.document class Grant(Document): title = Text() abstract = Text() date = Date() division = Keyword() class Index: name = "nsf" nsf.delete() nsf.create() cur = db.cursor() cur.execute( "select AwardTitle, AbstractNarration, AwardAmount, AwardEffectiveDate, LongName from Award join Division on Award.AwardID = Division.AwardID" ) Grant.init() for r in cur.fetchall(): g = Grant(title=r[0], abstract=r[1], date=r[3], division=r[4]) g.amount = r[2] g.save() exit()
from sortedcontainers import SortedDict
from pe.pe import PE
from pocket_rankings.pocket_rankings import PocketRankings


logger = logging.getLogger(__name__)

connections.create_connection(hosts=['localhost'])

INDEX_NAME = 'poker'
es_index = Index(INDEX_NAME)
# for index in connections.get_connection().indices.get('*'):
#     print(index)
# es_index.delete(ignore=404)
# ignore=400 makes creation idempotent: "index already exists" is not fatal.
es_index.create(ignore=400)
# logger.info('index truncated')


@es_index.doc_type
class GameAction(DocType):
    # One hand-history action row. Fields use the pre-5.x String type with
    # index='not_analyzed' (exact-match keywords, no tokenization).
    site = String(index='not_analyzed')
    game = String(index='not_analyzed')
    vs = Integer()
    player = String(index='not_analyzed')
    amount = Integer()
    pot = Integer()
    pos = Integer()
    preflop_1 = String(index='not_analyzed')
class ElasticSearchIndex:
    """Wrapper around an elasticsearch-dsl index of quiz-bowl answer documents.

    Each document pairs a Wikipedia page with question ('qb') content; search
    uses a multi_match query over both fields with optional per-field boosts.
    """

    def __init__(self, name='qb', similarity='default', bm25_b=None, bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        # Elasticsearch BM25 defaults (b=.75, k1=1.2) when not overridden.
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        """Delete the index; a missing index is logged, not raised."""
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        """Return True when the backing index exists."""
        return self.ix.exists()

    def init(self):
        """Create the index and install the custom BM25 similarity.

        Similarity settings can only be changed on a closed index, hence
        the create/close/put_settings/open dance.
        """
        self.ix.create()
        self.ix.close()
        self.ix.put_settings(body={'similarity': {
            'qb_bm25': {'type': 'BM25', 'b': self.bm25_b, 'k1': self.bm25_k1}}
        })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build_large_docs(self, documents: Dict[str, str], use_wiki=True, use_qb=True, rebuild_index=False):
        """Index one large document per answer page (wiki text + question text)."""
        # QB_REBUILD_INDEX env var can force a rebuild without the flag.
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()
        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
        wiki_lookup = Wikipedia()
        log.info('Indexing questions and corresponding wikipedia pages as large docs...')
        for page in tqdm.tqdm(documents):
            if use_wiki and page in wiki_lookup:
                wiki_content = wiki_lookup[page].text
            else:
                wiki_content = ''
            if use_qb:
                qb_content = documents[page]
            else:
                qb_content = ''
            answer = self.answer_doc(
                page=page,
                wiki_content=wiki_content, qb_content=qb_content
            )
            answer.save(index=self.name)

    def build_many_docs(self, pages, documents, use_wiki=True, use_qb=True, rebuild_index=False):
        """Index many small documents: one per question, plus wiki text in 200-token chunks."""
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()
        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
        log.info('Indexing questions and corresponding pages as many docs...')
        if use_qb:
            log.info('Indexing questions...')
            for page, doc in tqdm.tqdm(documents):
                self.answer_doc(page=page, qb_content=doc).save()
        if use_wiki:
            log.info('Indexing wikipedia...')
            wiki_lookup = Wikipedia()
            for page in tqdm.tqdm(pages):
                if page in wiki_lookup:
                    # Split wiki text into 200-token chunks, one doc per chunk.
                    content = word_tokenize(wiki_lookup[page].text)
                    for i in range(0, len(content), 200):
                        chunked_content = content[i:i + 200]
                        if len(chunked_content) > 0:
                            self.answer_doc(page=page, wiki_content=' '.join(chunked_content)).save()

    def search(self, text: str, max_n_guesses: int,
               normalize_score_by_length=False,
               wiki_boost=1, qb_boost=1):
        """Return up to max_n_guesses (page, score) pairs for *text*, deduplicated by page.

        :raises ValueError: when the index has not been created yet.
        """
        if not self.exists():
            raise ValueError('The index does not exist, you must create it before searching')
        # '^N' suffix applies a per-field boost in multi_match queries.
        if wiki_boost != 1:
            wiki_field = 'wiki_content^{}'.format(wiki_boost)
        else:
            wiki_field = 'wiki_content'
        if qb_boost != 1:
            qb_field = 'qb_content^{}'.format(qb_boost)
        else:
            qb_field = 'qb_content'
        s = Search(index=self.name)[0:max_n_guesses].query(
            'multi_match', query=text, fields=[wiki_field, qb_field]
        )
        results = s.execute()
        guess_set = set()
        guesses = []
        # Optionally normalize scores by query length (in whitespace tokens).
        if normalize_score_by_length:
            query_length = len(text.split())
        else:
            query_length = 1
        for r in results:
            if r.page in guess_set:
                continue
            else:
                guesses.append((r.page, r.meta.score / query_length))
        return guesses
class BurpExtender(IBurpExtender, IHttpListener, IContextMenuFactory, ITab):
    """Burp Suite (Jython) extension that stores HTTP requests/responses in Elasticsearch.

    Implements the Burp listener/tab interfaces: intercepted messages are
    converted to DocHTTPRequestResponse documents and saved, either live via
    processHttpMessage or in bulk through a context-menu action.
    """

    def registerExtenderCallbacks(self, callbacks):
        # Burp entry point: wire up listeners, load persisted settings, connect.
        self.callbacks = callbacks
        self.helpers = callbacks.getHelpers()
        callbacks.setExtensionName(
            "Storing HTTP Requests/Responses into ElasticSearch")
        self.callbacks.registerHttpListener(self)
        self.callbacks.registerContextMenuFactory(self)
        self.out = callbacks.getStdout()
        self.lastTimestamp = None
        # Persisted settings fall back to module-level defaults when unset.
        self.confESHost = self.callbacks.loadExtensionSetting(
            "elasticburp.host") or ES_host
        self.confESIndex = self.callbacks.loadExtensionSetting(
            "elasticburp.index") or ES_index
        self.confBurpTools = int(
            self.callbacks.loadExtensionSetting("elasticburp.tools") or Burp_Tools)
        saved_onlyresp = self.callbacks.loadExtensionSetting(
            "elasticburp.onlyresp")
        # Setting may have been stored as "True"/"False" or as "0"/"1".
        if saved_onlyresp == "True":
            self.confBurpOnlyResp = True
        elif saved_onlyresp == "False":
            self.confBurpOnlyResp = False
        else:
            self.confBurpOnlyResp = bool(
                int(saved_onlyresp or Burp_onlyResponses))
        self.callbacks.addSuiteTab(self)
        self.applyConfig()

    def applyConfig(self):
        # Connect to ES with the current config, ensure the index exists,
        # and persist the settings; failures go to a Swing error dialog.
        try:
            print("Connecting to '%s', index '%s'" %
                  (self.confESHost, self.confESIndex))
            self.es = connections.create_connection(hosts=[self.confESHost])
            self.idx = Index(self.confESIndex)
            self.idx.doc_type(DocHTTPRequestResponse)
            if self.idx.exists():
                self.idx.open()
            else:
                self.idx.create()
            self.callbacks.saveExtensionSetting("elasticburp.host",
                                                self.confESHost)
            self.callbacks.saveExtensionSetting("elasticburp.index",
                                                self.confESIndex)
            self.callbacks.saveExtensionSetting("elasticburp.tools",
                                                str(self.confBurpTools))
            self.callbacks.saveExtensionSetting(
                "elasticburp.onlyresp", str(int(self.confBurpOnlyResp)))
        except Exception as e:
            JOptionPane.showMessageDialog(
                self.panel,
                "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>"
                % (str(e)), "Error", JOptionPane.ERROR_MESSAGE)

    ### ITab ###
    def getTabCaption(self):
        # Title of the extension's tab inside Burp.
        return "ElasticBurp"

    def applyConfigUI(self, event):
        # Read the UI controls back into config and reconnect.
        #self.idx.close()
        self.confESHost = self.uiESHost.getText()
        self.confESIndex = self.uiESIndex.getText()
        # Build the tool bitmask; `checkbox and FLAG` yields FLAG or False(0).
        self.confBurpTools = int(
            (self.uiCBSuite.isSelected() and IBurpExtenderCallbacks.TOOL_SUITE)
            | (self.uiCBTarget.isSelected() and IBurpExtenderCallbacks.TOOL_TARGET)
            | (self.uiCBProxy.isSelected() and IBurpExtenderCallbacks.TOOL_PROXY)
            | (self.uiCBSpider.isSelected() and IBurpExtenderCallbacks.TOOL_SPIDER)
            | (self.uiCBScanner.isSelected() and IBurpExtenderCallbacks.TOOL_SCANNER)
            | (self.uiCBIntruder.isSelected() and IBurpExtenderCallbacks.TOOL_INTRUDER)
            | (self.uiCBRepeater.isSelected() and IBurpExtenderCallbacks.TOOL_REPEATER)
            | (self.uiCBSequencer.isSelected() and IBurpExtenderCallbacks.TOOL_SEQUENCER)
            | (self.uiCBExtender.isSelected() and IBurpExtenderCallbacks.TOOL_EXTENDER))
        self.confBurpOnlyResp = self.uiCBOptRespOnly.isSelected()
        self.applyConfig()

    def resetConfigUI(self, event):
        # Push the current config values back into the UI controls.
        self.uiESHost.setText(self.confESHost)
        self.uiESIndex.setText(self.confESIndex)
        self.uiCBSuite.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_SUITE))
        self.uiCBTarget.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_TARGET))
        self.uiCBProxy.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_PROXY))
        self.uiCBSpider.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_SPIDER))
        self.uiCBScanner.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_SCANNER))
        self.uiCBIntruder.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_INTRUDER))
        self.uiCBRepeater.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_REPEATER))
        self.uiCBSequencer.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_SEQUENCER))
        self.uiCBExtender.setSelected(
            bool(self.confBurpTools & IBurpExtenderCallbacks.TOOL_EXTENDER))
        self.uiCBOptRespOnly.setSelected(self.confBurpOnlyResp)

    def getUiComponent(self):
        # Build the Swing config panel (host, index, tool checkboxes, options).
        self.panel = JPanel()
        self.panel.setLayout(BoxLayout(self.panel, BoxLayout.PAGE_AXIS))
        self.uiESHostLine = JPanel()
        self.uiESHostLine.setLayout(
            BoxLayout(self.uiESHostLine, BoxLayout.LINE_AXIS))
        self.uiESHostLine.setAlignmentX(JPanel.LEFT_ALIGNMENT)
        self.uiESHostLine.add(JLabel("ElasticSearch Host: "))
        self.uiESHost = JTextField(40)
        self.uiESHost.setMaximumSize(self.uiESHost.getPreferredSize())
        self.uiESHostLine.add(self.uiESHost)
        self.panel.add(self.uiESHostLine)
        self.uiESIndexLine = JPanel()
        self.uiESIndexLine.setLayout(
            BoxLayout(self.uiESIndexLine, BoxLayout.LINE_AXIS))
        self.uiESIndexLine.setAlignmentX(JPanel.LEFT_ALIGNMENT)
        self.uiESIndexLine.add(JLabel("ElasticSearch Index: "))
        self.uiESIndex = JTextField(40)
        self.uiESIndex.setMaximumSize(self.uiESIndex.getPreferredSize())
        self.uiESIndexLine.add(self.uiESIndex)
        self.panel.add(self.uiESIndexLine)
        uiToolsLine = JPanel()
        uiToolsLine.setLayout(BoxLayout(uiToolsLine, BoxLayout.LINE_AXIS))
        uiToolsLine.setAlignmentX(JPanel.LEFT_ALIGNMENT)
        self.uiCBSuite = JCheckBox("Suite")
        uiToolsLine.add(self.uiCBSuite)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBTarget = JCheckBox("Target")
        uiToolsLine.add(self.uiCBTarget)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBProxy = JCheckBox("Proxy")
        uiToolsLine.add(self.uiCBProxy)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBSpider = JCheckBox("Spider")
        uiToolsLine.add(self.uiCBSpider)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBScanner = JCheckBox("Scanner")
        uiToolsLine.add(self.uiCBScanner)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBIntruder = JCheckBox("Intruder")
        uiToolsLine.add(self.uiCBIntruder)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBRepeater = JCheckBox("Repeater")
        uiToolsLine.add(self.uiCBRepeater)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBSequencer = JCheckBox("Sequencer")
        uiToolsLine.add(self.uiCBSequencer)
        uiToolsLine.add(Box.createRigidArea(Dimension(10, 0)))
        self.uiCBExtender = JCheckBox("Extender")
        uiToolsLine.add(self.uiCBExtender)
        self.panel.add(uiToolsLine)
        self.panel.add(Box.createRigidArea(Dimension(0, 10)))
        uiOptionsLine = JPanel()
        uiOptionsLine.setLayout(BoxLayout(uiOptionsLine, BoxLayout.LINE_AXIS))
        uiOptionsLine.setAlignmentX(JPanel.LEFT_ALIGNMENT)
        self.uiCBOptRespOnly = JCheckBox(
            "Process only responses (include requests)")
        uiOptionsLine.add(self.uiCBOptRespOnly)
        self.panel.add(uiOptionsLine)
        self.panel.add(Box.createRigidArea(Dimension(0, 10)))
        uiButtonsLine = JPanel()
        uiButtonsLine.setLayout(BoxLayout(uiButtonsLine, BoxLayout.LINE_AXIS))
        uiButtonsLine.setAlignmentX(JPanel.LEFT_ALIGNMENT)
        uiButtonsLine.add(JButton("Apply", actionPerformed=self.applyConfigUI))
        uiButtonsLine.add(JButton("Reset", actionPerformed=self.resetConfigUI))
        self.panel.add(uiButtonsLine)
        self.resetConfigUI(None)
        return self.panel

    ### IHttpListener ###
    def processHttpMessage(self, tool, isRequest, msg):
        # Skip tools we are not configured for, and requests when in
        # responses-only mode; otherwise index the message immediately.
        if not tool & self.confBurpTools or isRequest and self.confBurpOnlyResp:
            return
        doc = self.genESDoc(msg)
        doc.save()

    ### IContextMenuFactory ###
    def createMenuItems(self, invocation):
        # Offer "Add to ElasticSearch Index" when messages are selected.
        menuItems = list()
        selectedMsgs = invocation.getSelectedMessages()
        if selectedMsgs != None and len(selectedMsgs) >= 1:
            menuItems.append(
                JMenuItem("Add to ElasticSearch Index",
                          actionPerformed=self.genAddToES(
                              selectedMsgs,
                              invocation.getInputEvent().getComponent())))
        return menuItems

    def genAddToES(self, msgs, component):
        # Return a click handler that bulk-indexes *msgs* with a progress bar.
        def menuAddToES(e):
            progress = ProgressMonitor(component, "Feeding ElasticSearch", "",
                                       0, len(msgs))
            i = 0
            docs = list()
            for msg in msgs:
                if not Burp_onlyResponses or msg.getResponse():
                    docs.append(
                        self.genESDoc(
                            msg, timeStampFromResponse=True).to_dict(True))
                i += 1
                progress.setProgress(i)
            success, failed = bulk(self.es, docs, True, raise_on_error=False)
            progress.close()
            JOptionPane.showMessageDialog(
                self.panel,
                "<html><p style='width: 300px'>Successful imported %d messages, %d messages failed.</p></html>"
                % (success, failed), "Finished",
                JOptionPane.INFORMATION_MESSAGE)
        return menuAddToES

    ### Interface to ElasticSearch ###
    def genESDoc(self, msg, timeStampFromResponse=False):
        # Convert a Burp request/response pair into a DocHTTPRequestResponse.
        httpService = msg.getHttpService()
        doc = DocHTTPRequestResponse(protocol=httpService.getProtocol(),
                                     host=httpService.getHost(),
                                     port=httpService.getPort())
        doc.meta.index = self.confESIndex
        request = msg.getRequest()
        response = msg.getResponse()
        if request:
            iRequest = self.helpers.analyzeRequest(msg)
            doc.request.method = iRequest.getMethod()
            doc.request.url = iRequest.getUrl().toString()
            headers = iRequest.getHeaders()
            for header in headers:
                try:
                    doc.add_request_header(header)
                except:
                    # Line without a colon — treat it as the request line.
                    doc.request.requestline = header
            parameters = iRequest.getParameters()
            for parameter in parameters:
                # Map Burp's parameter-type constants to readable names.
                ptype = parameter.getType()
                if ptype == IParameter.PARAM_URL:
                    typename = "url"
                elif ptype == IParameter.PARAM_BODY:
                    typename = "body"
                elif ptype == IParameter.PARAM_COOKIE:
                    typename = "cookie"
                elif ptype == IParameter.PARAM_XML:
                    typename = "xml"
                elif ptype == IParameter.PARAM_XML_ATTR:
                    typename = "xmlattr"
                elif ptype == IParameter.PARAM_MULTIPART_ATTR:
                    typename = "multipartattr"
                elif ptype == IParameter.PARAM_JSON:
                    typename = "json"
                else:
                    typename = "unknown"
                name = parameter.getName()
                value = parameter.getValue()
                doc.add_request_parameter(typename, name, value)
            # Map Burp's content-type constants to readable names.
            ctype = iRequest.getContentType()
            if ctype == IRequestInfo.CONTENT_TYPE_NONE:
                doc.request.content_type = "none"
            elif ctype == IRequestInfo.CONTENT_TYPE_URL_ENCODED:
                doc.request.content_type = "urlencoded"
            elif ctype == IRequestInfo.CONTENT_TYPE_MULTIPART:
                doc.request.content_type = "multipart"
            elif ctype == IRequestInfo.CONTENT_TYPE_XML:
                doc.request.content_type = "xml"
            elif ctype == IRequestInfo.CONTENT_TYPE_JSON:
                doc.request.content_type = "json"
            elif ctype == IRequestInfo.CONTENT_TYPE_AMF:
                doc.request.content_type = "amf"
            else:
                doc.request.content_type = "unknown"
            bodyOffset = iRequest.getBodyOffset()
            doc.request.body = request[bodyOffset:].tostring().decode(
                "ascii", "replace")
        if response:
            iResponse = self.helpers.analyzeResponse(response)
            doc.response.status = iResponse.getStatusCode()
            doc.response.content_type = iResponse.getStatedMimeType()
            doc.response.inferred_content_type = iResponse.getInferredMimeType(
            )
            headers = iResponse.getHeaders()
            dateHeader = None
            for header in headers:
                try:
                    doc.add_response_header(header)
                    match = reDateHeader.match(header)
                    if match:
                        dateHeader = match.group(1)
                except:
                    # Line without a colon — treat it as the status line.
                    doc.response.responseline = header
            cookies = iResponse.getCookies()
            for cookie in cookies:
                expCookie = cookie.getExpiration()
                expiration = None
                if expCookie:
                    try:
                        # Java time is in milliseconds since the epoch.
                        expiration = str(
                            datetime.fromtimestamp(expCookie.time / 1000))
                    except:
                        pass
                doc.add_response_cookie(cookie.getName(), cookie.getValue(),
                                        cookie.getDomain(), cookie.getPath(),
                                        expiration)
            bodyOffset = iResponse.getBodyOffset()
            doc.response.body = response[bodyOffset:].tostring().decode(
                "ascii", "replace")
            if timeStampFromResponse:
                if dateHeader:
                    try:
                        doc.timestamp = datetime.fromtimestamp(
                            mktime_tz(parsedate_tz(dateHeader)),
                            tz)  # try to use date from response header "Date"
                        self.lastTimestamp = doc.timestamp
                    except:
                        doc.timestamp = self.lastTimestamp  # fallback: last stored timestamp. Else: now
        return doc
def buildIndex():
    """
    buildIndex creates a new film index, deleting any existing index of
    the same name. It loads a json file containing the covid doc metadata
    corpus and does bulk loading using a generator function.
    """
    doc_index = Index('covid_doc_index')
    if doc_index.exists():
        doc_index.delete()  # Overwrite any previous version
    doc_index.analyzer(
        basic_analyzer
    )  # register your customized analyzer as the default analyzer
    doc_index.create()
    # Open the covid metadata corpus
    with open('covid_comm_use_subset_meta.json', 'r', encoding='utf-8') as data_file:
        enum_id = 1
        documents = {}
        for line in data_file:
            try:
                doc = json.loads(line)
                for key in doc.keys():
                    # Non-string NaN values (pandas export artifacts) become None.
                    if type(doc.get(key)) is not str:
                        if math.isnan(doc.get(key)):
                            doc.update({key: None})
                    # Reduce full publish_time strings to just the year match.
                    if key == "publish_time" and doc.get(key) and len(
                            doc.get(key)) > 4:
                        match = re.search(year, doc[key]).group(0)
                        if match:
                            doc.update({key: match})
                documents.update({str(enum_id): doc})
                enum_id += 1
            except json.decoder.JSONDecodeError:
                # Skip malformed lines instead of aborting the whole load.
                continue
    # load doc metadata from json file into dictionary
    size = len(documents)

    # Action series for bulk loading with helpers.bulk function.
    # Implemented as a generator, to return one movie with each call.
    # Note that we include the index name here.
    # The Document type is always 'doc'.
    # Every item to be indexed must have a unique key.
    def actions():
        # enum_id is an enumerated id created when reading the json and used as key into covid metadata dictionary)
        for enum_id in range(1, size + 1):
            yield {
                "_index": "covid_doc_index",
                "_type": 'doc',
                "_id": enum_id,
                "title": documents[str(enum_id)].get('title', 'None'),
                "text": documents[str(enum_id)].get('abstract', 'None'),
                "authors": documents[str(enum_id)].get('authors', 'None'),
                "publish_time": documents[str(enum_id)].get('publish_time', int(0000))
            }

    helpers.bulk(es, actions())
from flask import render_template
from flask import request
import vk_api
from polyglot.text import Text
import re
from datetime import datetime
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Index
from elasticsearch_dsl.connections import connections

# Module-level setup: connect to the 'elasticsearch' host and rebuild the
# post index from scratch each time the module is imported (side effect).
connections.create_connection(hosts=['elasticsearch'], port=9200)
es = Elasticsearch(hosts=['elasticsearch'], port=9200)
post_index = Index('post_index', using=es)
post_index.delete(ignore=404)  # 404 ignored: index may not exist yet
post_index.create()

# NOTE(review): `Flask` itself is not imported in the lines visible here —
# presumably imported elsewhere in the file; verify.
app = Flask(__name__)


@app.route('/')
def index():
    # Landing page.
    return render_template('index.html')


@app.route('/search/q')
def search():
    # Free-text query plus a sentiment filter taken from the query string.
    query = request.args.get("search")
    sentiment = request.args.get("sentiment")
    db_history_search = es.search(index='post_index', doc_type='post', q=query)
class InsightError(Exception): """exception for errors with insights""" ############################################################################### # Account ############################################################################### class AccountInsightError(InsightError): """an error with account index""" account_ix = Index('account') if not account_ix.exists(): account_ix.create() class AccountDoc(DocType): username = Keyword(required=True, store=True) posts_count = Integer(store=True) followers_count = Integer(store=True) following_count = Integer(store=True) bio = Text(store=True) website = Keyword(store=True) joined_at = Date(store=True) # post location = Keyword(store=True) tags = Keyword(store=True) count = Integer(store=True)
help="ElasticSearch host (default: %(default)s)") argparser.add_argument("--index", "-i", default="testssl-scan", help="ElasticSearch index (default: %(default)s)") argparser.add_argument("files", nargs="+", help="List of testssl.sh logs in CSV format") args = argparser.parse_args() connections.create_connection(hosts=args.elasticsearch) idx = Index(args.index) idx.document(DocTestSSLResult) DocTestSSLResult.init() try: idx.create() except: pass csvFiles = args.files for csvFile in csvFiles: try: csv = open(csvFile, mode="r", newline="") except IOError as e: print("Error while opening %s: %s" % (csvFile, e.strerror)) print("Processing '%s'" % (csvFile)) doc = DocTestSSLResult(source=csvFile) doc.parseCSV(csv) csv.close() try:
def createIndex():
    """Create the configured index on the shared client connection."""
    Index(INDEX, using=client).create()
def _create_index(index_name):
    """Drop any existing index of this name and recreate it with the
    project's default settings."""
    idx = Index(index_name, using=CONNECTION_ALIAS)
    # 400/404 ignored: a missing (or otherwise undeletable) index is not fatal.
    idx.delete(ignore=[400, 404])
    idx.settings(index=DEFAULT_INDEX_SETTING)
    idx.create()
class ElasticInsert(object):
    """Utility class for inserting newly updated news items into Elasticsearch."""

    def __init__(self, client, index_name: str):
        self.client = client
        self.index_name = index_name
        self.index = Index(name=self.index_name, using=self.client)
        self._init()

    def get_news_ids(self, dt: str):
        """Return the set of news_ids already indexed for the given date, so
        callers can determine which news items are new and insert only those."""
        scan_generator = scan(self.client,
                              query={'query': {
                                  'match': {
                                      'dt': dt
                                  }
                              }},
                              index=self.index_name,
                              _source=['news_id'])
        news_ids = set()
        news_id_list = list()
        for item in scan_generator:
            news_id = item['_source']['news_id']
            news_ids.add(news_id)
            news_id_list.append(news_id)
        # A size mismatch between set and list means duplicates were indexed.
        if len(news_ids) != len(news_id_list):
            logging.warning(
                f"There are {len(news_id_list)-len(news_ids)} news " +
                f"repeatedly inserted.")
        logging.info(
            f"There are {len(news_ids)} news in index {self.index_name} on {dt}"
        )
        return news_ids

    def _init(self):
        # Register the document mapping first, then create the index only if
        # it does not already exist.
        self._document()
        if not self.index.exists():
            self.index.create()
            logging.info(f"Create index {self.index_name} successfully.")
        else:
            logging.info(f"Index {self.index_name} already existed.")

    def _document(self):
        # Declare the News document type bound to this index. The class is
        # stored on the instance so callers can build documents via self.News.
        @self.index.document
        class News(Document):
            news_id = Keyword()
            title = Text()
            content = Text()
            news_tag = Text()
            source = Keyword()
            info_url = Keyword()
            https_url = Keyword()
            large_pic = Keyword()
            mini_pic = Keyword()
            third_source = Text()
            content_type = Keyword()
            news_type = Keyword()
            is_video = Keyword()
            video_time = Long()
            update_time = Date()
            utc_update_time = Date()
            dt = Date()

        self.News = News
class IrIndex:
    """Elasticsearch-backed retrieval index over QB and ASR answer documents.

    Supports (re)building the index with a custom BM25 similarity and
    retrieving the single best guess for a piece of question text.
    """

    def __init__(self, name='qb', similarity='default', bm25_b=None, bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        # Elasticsearch BM25 defaults: b=.75, k1=1.2.
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        """Delete the index, tolerating the case where it does not exist."""
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        """Create the index and install the custom BM25 similarity.

        The index must be closed while similarity settings are updated.
        """
        self.ix.create()
        self.ix.close()
        self.ix.put_settings(
            body={
                'similarity': {
                    'qb_bm25': {
                        'type': 'BM25',
                        'b': self.bm25_b,
                        'k1': self.bm25_k1
                    }
                }
            })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build(self, qb_docs: Dict[str, str], asr_docs: Dict[str, str],
              use_wiki=False, use_qb=True, use_asr=True, rebuild_index=False):
        """Index one document per answer page from QB and ASR sources.

        Only (re)indexes when the index does not exist (or was just deleted
        via rebuild_index / QB_REBUILD_INDEX).
        """
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):  # pylint: disable=invalid-envvar-default
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            log.info('Indexing...')
            for page in tqdm.tqdm(qb_docs):
                # Wiki lookup is disabled in this variant; the field is kept
                # empty so the mapping stays compatible.
                wiki_content = ''
                if use_qb:
                    qb_content = qb_docs[page]
                else:
                    qb_content = ''
                if use_asr:
                    asr_content = asr_docs[page]
                else:
                    asr_content = ''
                answer = self.answer_doc(page=page,
                                         wiki_content=wiki_content,
                                         qb_content=qb_content,
                                         asr_content=asr_content)
                answer.save(index=self.name)

    def search(self, text: str, max_n_guesses: int, normalize_score_by_length=False):
        """Return the top-scoring unique guess for ``text``.

        Returns a dict with 'guess', 'score' and 'length' keys ('length' is
        the query word count used by callers for score normalization), or a
        NOGUESS sentinel dict when nothing matches.
        """
        if not self.exists():
            raise ValueError(
                'The index does not exist, you must create it before searching'
            )
        wiki_field = 'wiki_content'
        qb_field = 'qb_content'
        asr_field = 'asr_content'
        s = Search(index=self.name)[0:max_n_guesses].query(  # pylint: disable=no-member
            'multi_match', query=text,
            fields=[wiki_field, qb_field, asr_field])
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = max(1, len(text.split()))
        else:
            query_length = 1
        for r in results:
            # BUG FIX: pages were never added to guess_set, so the duplicate
            # filter was a no-op and repeated pages slipped through.
            if r.page in guess_set:
                continue
            guess_set.add(r.page)
            guesses.append({
                'guess': r.page,
                'score': r.meta.score,
                'length': query_length
            })
        if len(guesses) == 0:
            return {'guess': '~~~NOGUESS~~~', 'score': 0, 'length': 1}
        else:
            return guesses[0]
def index(
    buildroot: Path,
    url: str,
    update=False,
    no_progressbar=False,
):
    """Bulk-index all built documents under ``buildroot`` into Elasticsearch.

    When ``update`` is true, documents are written into the existing aliased
    index; otherwise a fresh index is created and the alias is atomically
    reassigned to it at the end.
    """
    # We can confidently use a single host here because we're not searching
    # a cluster.
    connections.create_connection(hosts=[url], retry_on_timeout=True)
    connection = connections.get_connection()
    health = connection.cluster.health()
    status = health["status"]
    if status not in ("green", "yellow"):
        raise click.ClickException(f"status {status} not green or yellow")

    # First pass over the build tree just to count documents so the
    # progressbar can have a total length.
    count_todo = 0
    for file in walk(buildroot):
        count_todo += 1

    click.echo(f"Found {count_todo:,} (potential) documents to index")

    if update:
        # Reuse the first index whose name matches the alias prefix.
        for name in connection.indices.get_alias():
            if name.startswith(f"{INDEX_ALIAS_NAME}_"):
                document_index = Index(name)
                break
        else:
            raise IndexAliasError(
                f"Unable to find an index called {INDEX_ALIAS_NAME}_*")
    else:
        # Confusingly, `._index` is actually not a private API.
        # It's the documented way you're supposed to reach it.
        document_index = Document._index
        click.echo("Deleting any possible existing index "
                   f"and creating a new one called {document_index._name!r}")
        document_index.delete(ignore=404)
        document_index.create()

    skipped = []

    def generator():
        root = Path(buildroot)
        for doc in walk(root):
            # The reason for specifying the exact index name is that we might
            # be doing an update and if you don't specify it, elasticsearch_dsl
            # will fall back to using whatever Document._meta.Index automatically
            # becomes in this moment.
            search_doc = to_search(doc, _index=document_index._name)
            if search_doc:
                yield search_doc.to_dict(True)
            else:
                # The reason something might be chosen to be skipped is because
                # there's logic that kicks in only when the `index.json` file
                # has been opened and parsed.
                # Keep a count of all of these. It's used to make sure the
                # progressbar, if used, ticks as many times as the estimate
                # count was.
                skipped.append(1)

    def get_progressbar():
        if no_progressbar:
            return VoidProgressBar()
        return click.progressbar(length=count_todo, label="Indexing", width=0)

    count_done = count_worked = count_errors = 0
    count_shards_worked = count_shards_failed = 0
    errors_counter = Counter()
    t0 = time.time()
    with get_progressbar() as bar:
        for success, info in parallel_bulk(
            connection,
            generator(),
            # If the bulk indexing failed, it will by default raise a BulkIndexError.
            # Setting this to 'False' will suppress that.
            raise_on_exception=False,
            # If the bulk operation failed for some other reason like a ReadTimeoutError
            # it will raise whatever the error but default.
            # We prefer to swallow all errors under the assumption that the holes
            # will hopefully be fixed in the next attempt.
            raise_on_error=False,
        ):
            if success:
                count_shards_worked += info["index"]["_shards"]["successful"]
                count_shards_failed += info["index"]["_shards"]["failed"]
                count_worked += 1
            else:
                count_errors += 1
                errors_counter[info["index"]["error"]] += 1
            count_done += 1
            bar.update(1)

        # Tick the bar once per skipped document so it reaches its estimated
        # total even though those documents were never yielded.
        for skip in skipped:
            bar.update(1)

    # Now when the index has been filled, we need to make sure we
    # correct any previous indexes.
    if update:
        # When you do an update, Elasticsearch will internally delete the
        # previous docs (based on the _id primary key we set).
        # Normally, Elasticsearch will do this when you restart the cluster
        # but that's not something we usually do.
        # See https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-forcemerge.html
        document_index.forcemerge()
    else:
        # Now we're going to bundle the change to set the alias to point
        # to the new index and delete all old indexes.
        # The reason for doing this together in one update is to make it atomic.
        alias_updates = [{
            "add": {
                "index": document_index._name,
                "alias": INDEX_ALIAS_NAME
            }
        }]
        for index_name in connection.indices.get_alias():
            if index_name.startswith(f"{INDEX_ALIAS_NAME}_"):
                if index_name != document_index._name:
                    alias_updates.append(
                        {"remove_index": {
                            "index": index_name
                        }})
                    click.echo(f"Delete old index {index_name!r}")

        connection.indices.update_aliases({"actions": alias_updates})
        click.echo(f"Reassign the {INDEX_ALIAS_NAME!r} alias from old index "
                   f"to {document_index._name}")

    # Summary statistics for the whole run.
    t1 = time.time()
    took = t1 - t0
    rate = count_done / took
    click.echo(f"Took {format_time(took)} to index {count_done:,} documents. "
               f"Approximately {rate:.1f} docs/second")
    click.echo(f"Count shards - successful: {count_shards_worked:,} "
               f"failed: {count_shards_failed:,}")
    click.echo(f"Counts - worked: {count_worked:,} errors: {count_errors:,}")
    if errors_counter:
        click.echo("Most common errors....")
        for error, count in errors_counter.most_common():
            click.echo(f"{count:,}\t{error[:80]}")
class ElasticSearchIndex:
    """Elasticsearch-backed guesser index over QB questions and Wikipedia pages.

    Supports building either one large document per answer page or many
    chunked documents per page, and searching with optional field boosts.
    """

    def __init__(self, name='qb', similarity='default', bm25_b=None, bm25_k1=None):
        self.name = name
        self.ix = Index(self.name)
        self.answer_doc = create_doctype(self.name, similarity)
        # Elasticsearch BM25 defaults: b=.75, k1=1.2.
        if bm25_b is None:
            bm25_b = .75
        if bm25_k1 is None:
            bm25_k1 = 1.2
        self.bm25_b = bm25_b
        self.bm25_k1 = bm25_k1

    def delete(self):
        """Delete the index, tolerating the case where it does not exist."""
        try:
            self.ix.delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info('Could not delete non-existent index.')

    def exists(self):
        return self.ix.exists()

    def init(self):
        """Create the index and install the custom BM25 similarity.

        The index must be closed while similarity settings are updated.
        """
        self.ix.create()
        self.ix.close()
        self.ix.put_settings(
            body={
                'similarity': {
                    'qb_bm25': {
                        'type': 'BM25',
                        'b': self.bm25_b,
                        'k1': self.bm25_k1
                    }
                }
            })
        self.ix.open()
        self.answer_doc.init(index=self.name)

    def build_large_docs(self, documents: Dict[str, str], use_wiki=True,
                         use_qb=True, rebuild_index=False):
        """Index one large document per answer page (QB text + wiki text)."""
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            wiki_lookup = Wikipedia()
            log.info(
                'Indexing questions and corresponding wikipedia pages as large docs...'
            )
            for page in tqdm.tqdm(documents):
                if use_wiki and page in wiki_lookup:
                    wiki_content = wiki_lookup[page].text
                else:
                    wiki_content = ''
                if use_qb:
                    qb_content = documents[page]
                else:
                    qb_content = ''
                answer = self.answer_doc(page=page,
                                         wiki_content=wiki_content,
                                         qb_content=qb_content)
                answer.save(index=self.name)

    def build_many_docs(self, pages, documents, use_wiki=True, use_qb=True,
                        rebuild_index=False):
        """Index many small documents per page (questions + 200-word wiki chunks)."""
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            log.info(
                'Indexing questions and corresponding pages as many docs...')
            if use_qb:
                log.info('Indexing questions...')
                for page, doc in tqdm.tqdm(documents):
                    self.answer_doc(page=page, qb_content=doc).save()

            if use_wiki:
                log.info('Indexing wikipedia...')
                wiki_lookup = Wikipedia()
                for page in tqdm.tqdm(pages):
                    if page in wiki_lookup:
                        # Chunk wiki articles into 200-token documents.
                        content = word_tokenize(wiki_lookup[page].text)
                        for i in range(0, len(content), 200):
                            chunked_content = content[i:i + 200]
                            if len(chunked_content) > 0:
                                self.answer_doc(page=page,
                                                wiki_content=' '.join(
                                                    chunked_content)).save()

    def search(self, text: str, max_n_guesses: int,
               normalize_score_by_length=False, wiki_boost=1, qb_boost=1):
        """Return ``(page, score)`` guesses for ``text``, de-duplicated by page.

        Scores are divided by the query word count when
        ``normalize_score_by_length`` is set. Field boosts are applied with
        the Elasticsearch ``field^boost`` syntax.
        """
        if not self.exists():
            raise ValueError(
                'The index does not exist, you must create it before searching'
            )
        if wiki_boost != 1:
            wiki_field = 'wiki_content^{}'.format(wiki_boost)
        else:
            wiki_field = 'wiki_content'

        if qb_boost != 1:
            qb_field = 'qb_content^{}'.format(qb_boost)
        else:
            qb_field = 'qb_content'

        s = Search(index=self.name)[0:max_n_guesses].query(
            'multi_match', query=text, fields=[wiki_field, qb_field])
        results = s.execute()
        guess_set = set()
        guesses = []
        if normalize_score_by_length:
            query_length = len(text.split())
        else:
            query_length = 1

        for r in results:
            # BUG FIX: pages were never added to guess_set, so the duplicate
            # filter was a no-op and repeated pages slipped through.
            if r.page in guess_set:
                continue
            guess_set.add(r.page)
            guesses.append((r.page, r.meta.score / query_length))
        return guesses
def create_product_index():
    """Recreate the 'products' index from scratch with the Product mapping."""
    idx = Index('products')
    idx.settings(number_of_shards=1, number_of_replicas=0)
    idx.doc_type(Product)
    # Drop any stale version first; a 404 just means it wasn't there.
    idx.delete(ignore=404)
    idx.create()
def buildIndex():
    """Create the covid_19 index, replacing any existing index of the same
    name, then bulk-load all papers found under the configured corpus paths
    using a generator with helpers.bulk.
    """
    film_index = Index('covid_19_index')
    if film_index.exists():
        film_index.delete()  # Overwrite any previous version
    film_index.document(Document_COVID_19)
    film_index.create()

    documents = {}
    paths = [
        'CORD-19-research-challenge/comm_use_subset/comm_use_subset/pdf_json/'
    ]
    # Read every paper, keyed by a sequential id starting at 1.
    id = 1
    for path in paths:
        for file in os.listdir(path):
            # Open the json paper; `with` closes the handle deterministically
            # (the original leaked one file descriptor per paper).
            fullFilePath = path + file
            with open(fullFilePath) as data:
                currentDoc = json.load(data)
            try:
                abstract = currentDoc['abstract'][0]['text']
            except (KeyError, IndexError):
                # Some papers have no abstract (key missing or empty list);
                # only those two failure modes are expected here.
                abstract = ''
            body_text = currentDoc['body_text']
            documents[str(id)] = [
                currentDoc['paper_id'], currentDoc['metadata']['title'],
                abstract, body_text
            ]
            id = id + 1

    size = len(documents)

    # Action series for bulk loading with the helpers.bulk function.
    # Implemented as a generator, returning one document per call.
    # Every item to be indexed must have a unique key; paper_id is the _id.
    def actions():
        for mid in range(1, size + 1):
            yield {
                "_index": "covid_19_index",
                "_type": '_doc',
                "_id": documents[str(mid)][0],
                "paper_id": documents[str(mid)][0],
                "title": documents[str(mid)][1],
                "abstract": documents[str(mid)][2],
                "body_text": getBodyText(documents[str(mid)][3])
            }

    helpers.bulk(es, actions())
from pe.pe import PE
from pocket_rankings.pocket_rankings import PocketRankings

logger = logging.getLogger(__name__)

# Module-level setup: connect to a local cluster and make sure the 'poker'
# index exists. ignore=400 makes an already-existing index a no-op instead
# of an error.
connections.create_connection(hosts=['localhost'])

INDEX_NAME = 'poker'
es_index = Index(INDEX_NAME)
# for index in connections.get_connection().indices.get('*'):
#     print(index)
# es_index.delete(ignore=404)
es_index.create(ignore=400)
# logger.info('index truncated')


@es_index.doc_type
class GameAction(DocType):
    # Per-action hand history fields. String identifiers are not analyzed so
    # they can be matched/filtered exactly.
    site = String(index='not_analyzed')
    game = String(index='not_analyzed')
    vs = Integer()
    player = String(index='not_analyzed')
    amount = Integer()
    pot = Integer()
    pos = Integer()
    preflop_1 = String(index='not_analyzed')
import pandas as pd
from elasticsearch_dsl import connections, Index

from model import Talk
from datetime import datetime

if __name__ == "__main__":
    # Indexes Reset: drop and recreate the talks index before reloading.
    connections.create_connection(hosts=['localhost'])
    talks_index = Index(Talk.Index.name)
    talks_index.delete(ignore=404)  # 404 ignored: index may not exist yet
    talks_index.create()

    talks_df = pd.read_csv('./data/talks.csv')
    print("Number of talks in the report: {}".format(len(talks_df)))
    for index, row in talks_df.iterrows():
        next_talk = Talk()
        next_talk.title = row['Title']
        # Multiple speakers are '/'-separated in the CSV.
        next_talk.speakers = row['Speakers'].split('/')
        next_talk.day = row['Day']
        next_talk.place = row['Place']
        next_talk.type = row['Type']
        # The CSV stores only HH:MM; anchor the time to the event weekend
        # based on the day column (days starting with "s" map to Oct 5 2019).
        next_talk.start = datetime.strptime(row['Start'], '%H:%M')
        if next_talk.day.startswith("s"):
            next_talk.start = next_talk.start.replace(year=2019, month=10, day=5)
        else:
            next_talk.start = next_talk.start.replace(year=2019, month=10,
class TestMixins(BaseTestCase):
    """Tests for the ES indexable mixins against the Token test model."""

    def setUp(self):
        super(TestMixins, self).setUp()
        # Build a dedicated index for Token's doc type before each test.
        self.doc_type = Token.get_es_doc_type()
        self.index = Index(self.doc_type._doc_type.index)
        self.index.doc_type(self.doc_type)
        self.index.create()
        self.refresh()

    def tearDown(self):
        super(TestMixins, self).tearDown()
        self.index.delete()

    def test_is_indexable(self):
        self.assertTrue(ESIndexableMixin().is_indexable())

    def test_get_indexable_queryset(self):
        self.assertEqual(str(Token.get_indexable_queryset().query),
                         str(Token.objects.all().query))

    def test_get_es_doc(self):
        token = Token(name='token')
        # Unsaved instances have no ES document yet.
        self.assertIsNone(token.get_es_doc())
        token.save()
        self.assertIsNotNone(token.get_es_doc())

    def test_es_index(self):
        # Asynchronous call.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        token.es_index()
        self.assertDocExists(token)
        # Synchronous call.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        # NOTE(review): 'async' became a reserved keyword in Python 3.7, so
        # this kwarg only parses on Python <= 3.6 — confirm target runtime.
        token.es_index(async=False)
        self.assertDocExists(token)

    def test_es_delete(self):
        # Asynchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete()
        self.assertDocDoesntExist(Token, token.pk)
        # Synchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete(async=False)
        self.assertDocDoesntExist(Token, token.pk)

    def test_save(self):
        token = Token(name='token')
        # With trampoline disabled, saving must not create a doc.
        with override_settings(TRAMPOLINE={'OPTIONS': {'disabled': True}}):
            token.save()
        self.assertDocDoesntExist(token)
        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'token')
        self.assertEqual(doc._id, str(token.pk))
        # Update model and synchronise doc.
        token.name = 'kento'
        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'kento')
        # Instance is not indexable.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)

    def test_delete(self):
        token = Token.objects.create(name='token')
        token_id = token.pk
        self.assertDocExists(token)
        # With trampoline disabled, deleting must leave the doc in place.
        with override_settings(TRAMPOLINE={'OPTIONS': {'disabled': True}}):
            token.delete()
        self.assertDocExists(Token, token_id)
        token.save()
        token_id = token.pk
        token.delete()
        self.assertDocDoesntExist(Token, token_id)
def test_elasticsearch_target_additional_properties(sdc_builder, sdc_executor, elasticsearch):
    """
    Elasticsearch target pipeline, adding additional properties, where specifies every routing with the value of the
    shard's record. It checks if the value of the record-label is added correctly to the property routing at
    ElasticSearch query.

    dev_raw_data_source >> es_target
    """
    # Test static
    # Four random index names; one record per index. The last record carries a
    # null shard so its _routing should never be set.
    index_values = []
    for j in range(4):
        index_values.append(get_random_string(string.ascii_letters, 10).lower())

    raw_data = [{"text": "Record1", "index": index_values[0],
                 "mapping": get_random_string(string.ascii_letters, 10).lower(),
                 "doc_id": get_random_string(string.ascii_letters, 10).lower(), "shard": "record1"},
                {"text": "Record2", "index": index_values[1],
                 "mapping": get_random_string(string.ascii_letters, 10).lower(),
                 "doc_id": get_random_string(string.ascii_letters, 10).lower(), "shard": "record2"},
                {"text": "Record3", "index": index_values[2],
                 "mapping": get_random_string(string.ascii_letters, 10).lower(),
                 "doc_id": get_random_string(string.ascii_letters, 10).lower(), "shard": "record3"},
                {"text": "Record4", "index": index_values[3],
                 "mapping": get_random_string(string.ascii_letters, 10).lower(),
                 "doc_id": get_random_string(string.ascii_letters, 10).lower(), "shard": None}]

    # Build pipeline
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                 stop_after_first_batch=True,
                                                                                 raw_data='\n'.join(json.dumps(rec) for rec in raw_data))
    es_target = builder.add_stage('Elasticsearch', type='destination')
    # Index/mapping/doc_id are taken per-record; _routing comes from /shard.
    es_target.set_attributes(default_operation='INDEX',
                             document_id='${record:value(\'/doc_id\')}',
                             index='${record:value(\'/index\')}',
                             mapping='${record:value(\'/mapping\')}',
                             additional_properties='{\"_routing\":${record:value(\'/shard\')}}')

    dev_raw_data_source >> es_target
    es_target_pipeline = builder.build(title='ES target pipeline').configure_for_environment(elasticsearch)
    sdc_executor.add_pipeline(es_target_pipeline)

    try:
        elasticsearch.connect()
        # Make sure that the index exists properly before running the test
        index = Index(index_values[0])
        index.create()
        assert index.refresh()

        # Run pipeline with additional properties
        sdc_executor.start_pipeline(es_target_pipeline).wait_for_finished()

        es_response = []
        for i in index_values:
            es_search = ESSearch(index=i)
            response = es_search.execute()
            es_response.append(response[0])
            # Give Elasticsearch a moment between per-index queries.
            time.sleep(5)

        assert len(es_response) == 4
        for r in es_response:
            assert r
            if r.text == "Record4":
                # Null shard: no routing metadata may be present at all.
                for attribute in r.meta:
                    assert attribute != "routing"
            else:
                assert r.shard == r.meta.routing
    finally:
        # Clean up test data in ES
        idx = Index(index_values[0])
        idx.delete()
class TestMixins(BaseTestCase):
    """Tests for the ES indexable mixins: indexing, deletion, auto mapping,
    and the fail_silently / disabled trampoline options."""

    def setUp(self):
        super(TestMixins, self).setUp()
        # Build a dedicated index for Token's doc type before each test.
        self.doc_type = Token.get_es_doc_type()
        self.index = Index(self.doc_type._doc_type.index)
        self.index.doc_type(self.doc_type)
        self.index.create()
        self.refresh()

    def tearDown(self):
        super(TestMixins, self).tearDown()
        self.index.delete()

    def test_is_indexable(self):
        self.assertTrue(ESIndexableMixin().is_indexable())

    def test_is_index_update_needed(self):
        self.assertTrue(ESIndexableMixin().is_index_update_needed())

    def test_get_indexable_queryset(self):
        self.assertEqual(
            str(Token.get_indexable_queryset().query),
            str(Token.objects.all().query)
        )

    def test_get_es_doc(self):
        token = Token(name="token")
        # Unsaved instances have no ES document yet.
        self.assertIsNone(token.get_es_doc())
        token.save()
        self.assertIsNotNone(token.get_es_doc())

    def test_auto_doc_type_mapping(self):
        person = Person(first_name="Simion", last_name="Baws")
        person.save()
        doc_type = person.get_es_doc_mapping()
        self.assertEqual(doc_type.first_name, person.first_name)
        self.assertEqual(doc_type.last_name, person.last_name)
        self.assertEqual(
            doc_type.full_name,
            u"{0} {1}".format(person.first_name, person.last_name)
        )

    def test_es_index(self):
        # Asynchronous call.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        token.es_index()
        self.assertDocExists(token)
        # Synchronous call.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)
        # NOTE(review): 'async' became a reserved keyword in Python 3.7, so
        # this kwarg only parses on Python <= 3.6 — confirm target runtime.
        token.es_index(async=False)
        self.assertDocExists(token)
        # Fail silently.
        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        token = Token.objects.create(name='raise_exception')
        settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        token.es_index()
        self.assertDocDoesntExist(token)
        # Hard fail.
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = False
        with self.assertRaises(RuntimeError):
            token.es_index()
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = True

    def test_es_delete(self):
        # Asynchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete()
        self.assertDocDoesntExist(Token, token.pk)
        # Synchronous call.
        token = Token.objects.create(name='token')
        self.assertDocExists(token)
        token.es_delete(async=False)
        self.assertDocDoesntExist(Token, token.pk)
        # Fail silently if document doesn't exist.
        token.es_delete()

        from trampoline import get_trampoline_config
        trampoline_config = get_trampoline_config()

        # Fake delete to raise exception.
        backup_delete = trampoline_config.connection.delete

        def delete_raise_exception(*args, **kwargs):
            raise RuntimeError
        trampoline_config.connection.delete = delete_raise_exception

        # Fail silently
        token.es_delete()

        # Hard fail.
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = False
        with self.assertRaises(RuntimeError):
            token.es_delete()
        settings.TRAMPOLINE['OPTIONS']['fail_silently'] = True

        # Restore the real delete so later tests are unaffected.
        trampoline_config.connection.delete = backup_delete

    def test_save(self):
        token = Token(name='token')
        # With trampoline disabled, saving must not create a doc.
        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        token.save()
        settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        self.assertDocDoesntExist(token)
        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'token')
        self.assertEqual(doc._id, str(token.pk))
        # Update model and synchronise doc.
        token.name = 'kento'
        token.save()
        doc = token.get_es_doc()
        self.assertEqual(doc.name, 'kento')
        # Instance is not indexable.
        token = Token.objects.create(name='not_indexable')
        self.assertDocDoesntExist(token)

    def test_delete(self):
        token = Token.objects.create(name='token')
        token_id = token.pk
        self.assertDocExists(token)
        # With trampoline disabled, deleting must leave the doc in place.
        settings.TRAMPOLINE['OPTIONS']['disabled'] = True
        token.delete()
        settings.TRAMPOLINE['OPTIONS']['disabled'] = False
        self.assertDocExists(Token, token_id)
        token.save()
        token_id = token.pk
        token.delete()
        self.assertDocDoesntExist(Token, token_id)
# Connect with default settings and grab the low-level client for later calls.
connections.create_connection()
es = connections.get_connection()

# Check if index already exists
i = Index(indexName)
index_exists = i.exists()
if not index_exists:
    # Define analyzer: standard tokenizer with English stopword removal.
    my_analyzer = analyzer('my_analyzer',
                           type="standard",
                           stopwords='_english_')
    # Create index
    i.analyzer(my_analyzer)
    i.create()
    print('Created index', indexName)
else:
    print('Index', indexName, 'already exists, skipping creation.')

# Index metadata documents
inputFile = 'data/processed/metadata.csv'
count = 0
metaProps = []
metaDoc = {}
with open(inputFile, newline='', encoding='utf-8') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for row in spamreader:
        count += 1
        # Skip the header row (count == 1).
        if (count != 1):
            # create document
def buildIndex():
    """buildIndex creates a new film index, deleting any existing index of
    the same name. It loads a json file containing the movie corpus and does
    bulk loading using a generator function.
    """
    film_index = Index('sample_film_index')
    if film_index.exists():
        film_index.delete()  # Overwrite any previous version
    film_index.create()

    # Open the json film corpus
    with open('2018_movies.json', 'r', encoding='utf-8') as data_file:
        # load movies from json file into dictionary
        movies = json.load(data_file)
    size = len(movies)

    # BUG FIX: the original deleted 'sample_film_index' again right here via
    # the raw client, destroying the index (and mapping) created above so the
    # bulk load ran against an auto-created index. That delete is removed.

    def _clean(movie):
        """Normalize one raw corpus record in place.

        The corpus is inconsistent: some empty values are the string "[]"
        instead of an empty list, and 'Running Time' can hold junk strings.
        """
        for field in ('Starring', 'Country', 'Language', 'Director'):
            if movie[field] == "[]":
                movie[field] = []
        if movie['Running Time'] in ("[]", "TBA", "? minutes", "minutes"):
            movie['Running Time'] = []
        # Flatten list-valued fields: single-element lists collapse to the
        # element, longer lists become comma-separated strings.
        for field in ('Title', 'Starring', 'Director', 'Time', 'Location',
                      'Language', 'Country', 'Categories'):
            if len(movie[field]) <= 1:
                movie[field] = "".join(movie[field])
            else:
                movie[field] = ", ".join(movie[field])

    # Action series for bulk loading with helpers.bulk function.
    # Implemented as a generator, to return one movie with each call.
    # Note that we include the index name here.
    # The Document type is always 'doc'.
    # Every item to be indexed must have a unique key.
    def actions():
        # mid is movie id (used as key into movies dictionary)
        for mid in range(1, size + 1):
            movie = movies[str(mid)]
            _clean(movie)
            yield {
                "_index": "sample_film_index",
                "_type": 'doc',
                "_id": mid,
                "title": movie['Title'],
                "starring": movie['Starring'],
                # You would like to convert runtime to integer (in minutes).
                "runtime": movie['Running Time'],
                "director": movie['Director'],
                "location": movie['Location'],
                "time": movie['Time'],
                "language": movie['Language'],
                "categories": movie['Categories'],
                "country": movie['Country'],
                "text": movie['Text'],
            }

    helpers.bulk(es, actions())
def createIndex():
    """Create the configured index and print the creation response."""
    result = Index(INDEX, using=client).create()
    print(result)
class QAManipulate:
    """CRUD and search wrapper around a question/answer Elasticsearch index.

    Each document stores one answer (``a_id``, ``a_content``) plus a list of
    associated questions (``q_list``). Queries return a pandas DataFrame with
    one row per (document, question) pair, plus a per-document
    softmax-normalised score column.
    """

    # Sentinel so update() can tell "argument omitted" apart from an
    # explicit None value.
    _PLACEHOLDER = object()

    def __init__(self, index_name, **settings):
        """Bind to ``index_name``; **settings are forwarded to Index.settings()."""
        self.index_name = index_name
        self._index = Index(index_name)
        self._index.settings(**settings)
        self.connect()

        # Tie the QADuos document class to this index: the nested Index.name
        # picks up index_name from the enclosing scope.
        class Inner(QADuos):
            class Index:
                name = index_name

        self._inner_cls = Inner

    def create_index(self):
        """Create the index in Elasticsearch."""
        self._index.create()

    def delete_index(self):
        """Delete the index from Elasticsearch."""
        self._index.delete()

    def insert(self, doc_id, a_id=None, a_content=None, q_list=None):
        """Insert a new Q/A document under ``doc_id``.

        Raises ValueError if a document with that id already exists.
        NOTE(review): the default q_list=None would reach
        _question_parser(None), which iterates its argument — confirm
        callers always pass a list.
        """
        data = self._inner_cls()
        found = data.get(id=doc_id, ignore=404)
        if found is None:
            data.meta.id = doc_id
            data.a_id, data.a_content, data.q_list = a_id, a_content, self._question_parser(
                q_list)
            return data.save()
        else:
            msg = "ID '{}' of Index '{}' already exists."
            raise ValueError(msg.format(doc_id, self._index._name))

    def update(self, doc_id, a_id=_PLACEHOLDER, a_content=_PLACEHOLDER, q_list=_PLACEHOLDER):
        """Partially update document ``doc_id``; only fields explicitly passed
        (including explicit None) are written.

        Raises ValueError if the document does not exist.
        """
        data = self._inner_cls()
        found = data.get(id=doc_id, ignore=404)
        if found is not None:
            param_dict = {}
            # Only forward the fields the caller actually supplied.
            if a_id is not QAManipulate._PLACEHOLDER:
                param_dict.update({"a_id": a_id})
            if a_content is not QAManipulate._PLACEHOLDER:
                param_dict.update({"a_content": a_content})
            if q_list is not QAManipulate._PLACEHOLDER:
                param_dict.update({"q_list": self._question_parser(q_list)})
            return found.update(**param_dict)
        else:
            msg = "ID '{}' of Index '{}' does not exist."
            raise ValueError(msg.format(doc_id, self._index._name))

    def delete(self, doc_id):
        """Delete document ``doc_id``; raises ValueError if it does not exist."""
        data = self._inner_cls()
        found = data.get(id=doc_id, ignore=404)
        if found is not None:
            return found.delete()
        else:
            msg = "ID '{}' of Index '{}' does not exist."
            raise ValueError(msg.format(doc_id, self._index._name))

    def query(self, query, question_only=True, max_rec_cnt=5, boost_question=1, **kwargs):
        """Search the index and return a DataFrame of scored matches.

        query: free-text query string.
        question_only: if True match question text only, otherwise match
            both answer content and (boosted) question text.
        max_rec_cnt: maximum number of hits to keep.
        boost_question: boost factor on the question field in combined mode.
        kwargs: forwarded to the underlying Search object.
        """
        if question_only:
            result = self._search_by_question(query, max_rec_cnt=max_rec_cnt, **kwargs)
        else:
            result = self._search_by_qa(query, boost_question, max_rec_cnt=max_rec_cnt, **kwargs)
        df = pd.DataFrame(self._parse_result(result))
        # One score per document: dedupe before normalising, then merge the
        # normalised score back onto every (doc, question) row.
        df_temp = df[['doc_id', 'score']].copy().drop_duplicates()
        df_temp['score_softmax'] = self._softmax(df_temp['score'])
        return df.merge(df_temp, on=['doc_id', 'score'], how='left')

    def _search_by_question(self, question, max_rec_cnt, **kwargs):
        """Match against the nested question content field only."""
        s = self._inner_cls().search(**kwargs)
        result = s.query("match", q_list__q_content=question).execute(ignore_cache=True)
        return result[:max_rec_cnt]

    def _search_by_qa(self, qa_string, boost, max_rec_cnt, **kwargs):
        """Multi-match over answer content and boosted question content."""
        s = self._inner_cls().search(**kwargs)
        boosted_question = "q_list.q_content^{}".format(boost)
        result = s.query("multi_match", query=qa_string, fields=['a_content', boosted_question])\
            .execute(ignore_cache=True)
        return result[:max_rec_cnt]

    def _raw_search(self, **kwargs):
        """Return an unexecuted Search object for ad-hoc queries."""
        s = self._inner_cls().search(**kwargs)
        return s

    @staticmethod
    def connect(hosts=['http://*****:*****@192.168.10.49:9200/'], timeout=80):
        """Register the default elasticsearch-dsl connection.

        NOTE(review): mutable default argument — harmless here since the
        list is never mutated, but consider a tuple or None default.
        """
        connections.create_connection(hosts=hosts, timeout=timeout)

    @staticmethod
    def _parse_result(result):
        """Flatten hits into one dict per (document, question) pair."""
        for hit in result:
            a_id, a_content, score, doc_id = hit.a_id, hit.a_content, hit.meta.score, hit.meta.id
            for question in hit.q_list:
                q_id = question.q_id
                q_content = question.q_content
                yield {
                    'a_id': a_id,
                    'a_content': a_content,
                    'score': score,
                    'q_id': q_id,
                    'q_content': q_content,
                    'doc_id': doc_id
                }

    @staticmethod
    def _question_parser(answer_list):
        """Convert a list of {'q_id', 'q_content'} dicts into Questions objects."""
        result = []
        for answer_data in answer_list:
            result.append(
                Questions(q_id=answer_data['q_id'], q_content=answer_data['q_content']))
        return result

    @staticmethod
    def _softmax(score_list):
        """Softmax over sqrt-compressed scores, scaled by s0/(1+s0).

        Not a plain softmax: scores are square-rooted first and the whole
        distribution is damped by a factor derived from the top score, so
        the column sums to less than 1 for low-confidence results.
        """
        score_array = np.sqrt(score_list)
        exp_array = np.exp(score_array)
        # Damping factor from the first (highest) score.
        factor = score_list[0] / (1 + score_list[0])
        softmax_array = exp_array / exp_array.sum() * factor
        return softmax_array
def test_elasticsearch_target(sdc_builder, sdc_executor, elasticsearch, additional_properties):
    """Test for Elasticsearch target stage. We do so by ingesting data via Dev Raw Data source to
    Elasticsearch stage and then asserting what we ingest to what will be read from Elasticsearch.
    The pipeline looks like:

    Elasticsearch target pipeline:
        dev_raw_data_source >> es_target
    """
    # Test static: randomised index/mapping/doc-id so concurrent runs don't collide.
    es_index = get_random_string(
        string.ascii_letters, 10).lower()  # Elasticsearch indexes must be lower case
    es_mapping = get_random_string(string.ascii_letters, 10)
    es_doc_id = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    # Build pipeline: single-batch text source feeding the ES destination.
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='TEXT',
                                              stop_after_first_batch=True,
                                              raw_data=raw_str)
    es_target = builder.add_stage('Elasticsearch', type='destination')
    es_target.set_attributes(default_operation='INDEX',
                             document_id=es_doc_id,
                             index=es_index,
                             mapping=es_mapping,
                             additional_properties=additional_properties)

    dev_raw_data_source >> es_target
    es_target_pipeline = builder.build(
        title='ES target pipeline').configure_for_environment(elasticsearch)
    # Fail fast rather than retrying on pipeline error.
    es_target_pipeline.configuration["shouldRetry"] = False
    sdc_executor.add_pipeline(es_target_pipeline)

    try:
        elasticsearch.connect()
        # Make sure that the index exists properly before running the test
        index = Index(es_index)
        index.create()
        assert index.refresh()

        # Run pipeline and read from Elasticsearch to assert
        sdc_executor.start_pipeline(es_target_pipeline).wait_for_finished()

        # Since we are upsert on the same index, map, doc - there should only be one document (index 0)
        es_search = ESSearch(index=es_index)
        es_response = _es_search_with_retry(es_search)
        es_meta = es_response[0].meta
        # assert meta ingest
        assert es_meta['index'] == es_index and es_meta[
            'doc_type'] == es_mapping and es_meta['id'] == es_doc_id
        # assert data ingest
        assert raw_str == es_response[0].text
    finally:
        # Clean up test data in ES
        idx = Index(es_index)
        idx.delete()
def buildIndex():
    """Rebuild the 'idioms_search' index from the idiom/translation JSON files.

    Drops any existing index of the same name, registers the Idiom document
    mapping, then bulk-loads every idiom record via a generator.
    """
    search_index = Index('idioms_search')
    if search_index.exists():
        search_index.delete()
    search_index.document(Idiom)
    search_index.create()

    # Load the idiom corpus and the companion translation/segmentation data.
    with open('chengyu_addedfeatures.json', 'r', encoding='utf-8') as corpus_file:
        idioms = json.load(corpus_file)
    total = len(idioms)
    with open('translations.json', 'r', encoding='utf-8') as segmentation_file:
        translations = json.load(segmentation_file)

    def actions():
        # Records are keyed "1".."<total>" as strings.
        for doc_id in range(1, total + 1):
            entry = idioms[str(doc_id)]
            segmented = translations[str(doc_id)]
            # Strip a stray trailing double quote from the English gloss
            # (mutates the loaded corpus dict, matching previous behaviour).
            entry['English'] = entry['English'].rstrip("\"")
            yield {
                "_index": "idioms_search",
                "_type": 'doc',
                "_id": doc_id,
                "name": entry['Name'],
                "english": entry['English'],
                "afterword": entry['Afterword'],
                "riddle": entry['Riddle'],
                "source": entry['Source'],
                "story": entry['Story'],
                "synonym": entry['Synonym'],
                "antonym": entry['Antonym'],
                "desc_translation": entry['Description_Translations'],
                "source_translation": entry['Source_Translations'],
                "story_translation": entry['Story_Translations'],
                "usage_translation": entry['Usage_Translations'],
                "desc_segmentation": segmented['Description_Segmentation'],
                "source_segmentation": segmented['Source_Segmentation'],
                "story_segmentation": segmented['Story_Segmentation'],
                "usage_segmentation": segmented['Usage_Segmentation'],
                "pinyin": " ".join(entry['Pinyin_segmented']),
                "zodiac": ", ".join(entry['Animal']),
                "sentiment": entry['Sentiment'],
                "difficulty": entry['Difficulty'],
                "char_num": entry['Char_num'],
            }

    helpers.bulk(es, actions())
# Elasticsearch credentials and port come from the environment.
http_auth = (os.getenv("USERNAME"), os.getenv("PASSWORD"))
port = os.getenv("PORT")
client = connections.create_connection(hosts=hosts, http_auth=http_auth, port=port)

# initiate Redis connection (host/port default to docker-compose service names)
redis_conn = Redis(os.getenv("REDIS_HOST", "redis"), os.getenv("REDIS_PORT", 6379))

# create indices and mappings
# NOTE(review): source formatting was lost; the mapping block is assumed to
# run on every pass (only create() guarded by exists()) — confirm against VCS.
for lang in ["fr"]:  # languages : one index per supported language code
    # index named "web-<language code>"
    index = Index('web-%s' % lang)
    if not index.exists():
        index.create()

    # mapping of page — text fields use the language-specific analyzer
    m = Mapping('page')
    m.field('url', 'keyword')
    m.field('domain', 'keyword')
    m.field('title', 'text', analyzer=languages[lang])
    m.field('description', 'text', analyzer=languages[lang])
    m.field('body', 'text', analyzer=languages[lang])
    m.field('weight', 'long')
    #m.field('thumbnail', 'binary')
    #m.field('keywords', 'completion') # -- TEST -- #
    m.save('web-%s' % lang)

# index for misc mappings
index = Index('web')
def build_index():
    """Create a new article index, deleting any existing index of the same
    name, then bulk-load the article corpus via a generator.

    Combines several data sources per article: pickled citation graph
    (PageRank scores), pickled article dicts, and a JSON file of NER
    entities/metadata. Anchor text and citation links are resolved through
    lowercased titles.
    """
    article_index = Index(args.index_name)
    if article_index.exists():
        article_index.delete()  # overwrite any previous version
    article_index.document(Article)  # register the document mapping
    article_index.create()

    # PageRank over the citation graph; defaultdict gives 0.0 for unknown titles.
    with open(os.path.join(args.module_dir_path, 'graph.p'), 'rb') as f:
        citation_graph = pickle.load(f)
    pagerank_scores = nx.pagerank(citation_graph)
    ddict = defaultdict(float, pagerank_scores)

    # load articles from data source
    with open(os.path.join(args.module_dir_path, 'articles.p'), 'rb') as f:
        articles = pickle.load(f)

    # build a default dictionary to map titles to ids (for eventual use in citations 'more like this')
    titles_to_ids = {
        v['metadata']['title'].lower(): k
        for k, v in enumerate(articles.values())
    }
    titles_to_ids = defaultdict(
        lambda: -1, titles_to_ids)  # -1 is default value for a key error

    # get anchor text:
    anchor_text_dict = utils.get_anchor_text(articles, titles_to_ids)

    # open ner and metadata dict
    with open(args.meta_ner_path, 'r') as f:
        meta_ner_all = json.load(f)

    # get entity frequencies (to filter out unique entities)
    ent_freqs = utils.get_entity_counts(meta_ner_all)

    def actions():
        # One bulk action per article; _id is the enumeration index.
        for i, article in enumerate(articles.values()):
            sha = article['paper_id']
            # extract contents of entity and metadata dict
            if sha in set(meta_ner_all.keys(
            )):  # entities, source, doi, publish_time, has_full_text, journal
                ents = []
                # NOTE(review): 'type' shadows the builtin here.
                for type, entlist in meta_ner_all[sha]['entities'].items():
                    if type in entity_types:
                        ents.extend(entlist)
                ents = [ent for ent in ents if ent_freqs[ent] > 1
                        ]  # get only ents that occur > 1 in corpus
                ents_str = utils.untokenize(
                    ents)  # transform to string type for indexing
                publish_time = utils.extract_year(
                    meta_ner_all[sha]["publish_time"])
                journal = meta_ner_all[sha]['journal']
            else:
                # No NER/metadata record for this article: use neutral defaults.
                publish_time = 0
                ents_str = ''
                journal = ''
            # extract contents of article dict
            title = article['metadata']['title'] if 'title' in article[
                'metadata'].keys() else '(Untitled)'
            cits = article['bib_entries'] if 'bib_entries' in article.keys(
            ) else [{}]
            # Keep only titled citations; in_corpus resolves to an article id or -1.
            cits = [{
                "title": cit['title'],
                "year": cit['year'],
                "in_corpus": titles_to_ids[cit['title'].lower()],
                "authors": [{
                    "first": auth['first'],
                    "last": auth["last"]
                } for auth in cit['authors']]
            } for cit in cits.values() if cit['title'] != '']
            authors = [{
                "first": auth['first'],
                "last": auth["last"]
            } for auth in article['metadata']['authors']]
            pr = ddict[article['metadata']['title'].lower()]
            abstract = ' '.join([
                abs['text'] if 'text' in abs.keys() else ''
                for abs in article['abstract']
            ]) if 'abstract' in article.keys() else ''
            anchor_text = ' '.join(
                [cit['text'] for cit in anchor_text_dict[title.lower()]])
            # Group body paragraphs by their section heading.
            section_dict = defaultdict(list)
            for txt in article['body_text']:
                section = txt['section']
                section_dict[section].append(txt['text'])
            body = [{"name": k, "text": v} for k, v in section_dict.items()]
            cited_by = anchor_text_dict[title.lower()]
            body_text = ' '.join(
                [sect['text'] for sect in article['body_text']])
            # check that article is in English
            in_english = (langid.classify(body_text)[0] == 'en')
            yield {
                "_index": args.index_name,
                "_type": '_doc',
                "_id": i,
                "title": title,
                "id_num": sha,
                "abstract": abstract,
                "body": body,
                "body_text": body_text,
                "authors": authors,
                "publish_time": publish_time,
                "journal": journal,
                "citations": cits,
                "in_english": in_english,
                "pr": pr,
                "anchor_text": anchor_text,
                "cited_by": cited_by,
                "ents": ents_str,
            }

    helpers.bulk(
        es, actions(), raise_on_error=True
    )  # one doc in corpus contains a NAN value and it has to be ignored.
# Demonstrates Index.clone(): the same Movies mapping applied to a
# single-shard and a three-shard index.
from elasticsearch_dsl import Index, DocType, Text, analyzer, connections

from doctype import Movies

# Default connection used by all Index operations below.
connections.create_connection(hosts=['localhost'])

single_shard_movies = Index('single_shard_movies')
single_shard_movies.settings(number_of_shards=1, number_of_replicas=0)
single_shard_movies.doc_type(Movies)

# clone() copies settings and mappings; the shard count is then overridden.
multi_shard_movies = single_shard_movies.clone('multi_shard_movies')
multi_shard_movies.settings(number_of_shards=3, number_of_replicas=0)

single_shard_movies.create()
multi_shard_movies.create()
# Build one bulk 'index' action per input file.
for f in lfiles:
    # Read the whole document; the context manager closes the handle
    # (previously each file object was opened and never closed — a leak),
    # and read() replaces the quadratic line-by-line concatenation.
    with codecs.open(f, "r", encoding='iso-8859-1') as ftxt:
        text = ftxt.read()
    # Insert operation for a document with fields 'path' and 'text'
    ldocs.append({
        '_op_type': 'index',
        '_index': index,
        '_type': 'document',
        'path': f,
        'text': text
    })

# Working with ElasticSearch
client = Elasticsearch()
try:
    # Drop index if it exists
    ind = Index(index, using=client)
    ind.delete()
except NotFoundError:
    # Index did not exist yet; 'ind' is still bound and reused below.
    pass
# then create it
ind.settings(number_of_shards=1)
ind.create()

# Bulk execution of elasticsearch operations (faster than executing all one by one)
print('Indexing ...')
bulk(client, ldocs)
def __init__(self):
    """Recreate the 'imdb' index from scratch and register the Movie mapping."""
    imdb_index = Index('imdb', using=es)
    imdb_index.doc_type(Movie)
    # Drop any stale index first (404 is ignored when it doesn't exist yet).
    imdb_index.delete(ignore=404)
    imdb_index.create()