def PublishSamples(self, samples):
    """Publish samples to the Elasticsearch service."""
    try:
        from elasticsearch import Elasticsearch
    except ImportError:
        raise ImportError('The "elasticsearch" package is required to use '
                          'the Elasticsearch publisher. Please make sure it '
                          'is installed.')

    es = Elasticsearch([self.es_uri])
    if not es.indices.exists(index=self.es_index):
        es.indices.create(index=self.es_index, body=self.mapping)
        logging.info('Created index %s and default mappings', self.es_index)
    for s in samples:
        sample = copy.deepcopy(s)
        # Make the timestamp understandable by ES and humans.
        sample['timestamp'] = self._FormatTimestampForElasticsearch(
            sample['timestamp'])
        # ES keys cannot contain dots.
        sample = self._deDotKeys(sample)
        # Add the sample to the index, using sample_uri as the
        # document's unique _id.
        es.create(index=self.es_index, doc_type=self.es_type,
                  id=sample['sample_uri'], body=json.dumps(sample))
import datetime
import hashlib
import logging

from elasticsearch import Elasticsearch


class ESAlertSender:
    def __init__(self):
        self.es = None
        self.logger = logging.getLogger("ESAlertSender")

    def send_alerts(self, configuration, alerts):
        self.es = Elasticsearch([{"host": configuration["es_host"],
                                  "port": configuration["es_port"]}])
        for alert in self.flatten_alerts(alerts):
            self.insert_es(alert)

    def insert_es(self, alert):
        try:
            alert["@timestamp"] = datetime.datetime.utcnow().isoformat()
            alert["type"] = "reddalert"
            self.es.create(body=alert,
                           id=hashlib.sha1(str(alert).encode('utf-8')).hexdigest(),
                           index="reddalert",
                           doc_type="reddalert")
        except Exception as e:
            self.logger.error(e)

    def flatten_alerts(self, alerts):
        for alert in alerts:
            details = alert[2]
            if isinstance(details, dict):
                base = {"rule": alert[0], "id": alert[1]}
                base.update(details)
                yield base
            else:
                yield {"rule": alert[0], "id": alert[1], "details": details}
class TestSingleDocSigTerms(TestCase):
    def setUp(self):
        super(TestSingleDocSigTerms, self).setUp()
        self.es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port])
        self.ic = IndicesClient(self.es)
        self.index = 'single_doc_sigterms_test'
        self.doc_type = 'test-doc'
        self.field = 'text'
        if self.ic.exists(self.index):
            self.ic.delete(self.index)
        self.ic.create(self.index)
        self.es.create(self.index, self.doc_type,
                       {self.field: 'foo ba knark foo knirk knark foo'},
                       id='doc_1')

    def test_tf_for_doc_id(self):
        sigterms = SingleDocSigTerms(self.es, self.index, self.doc_type,
                                     self.field, None)
        resp = dict(sigterms.tf_for_doc_id('doc_1'))
        self.assertEqual(4, len(resp))
        self.assertEqual(3, resp['foo'])
        self.assertEqual(2, resp['knark'])
        self.assertEqual(1, resp['ba'])
        self.assertEqual(1, resp['knirk'])
def create_user_index():
    # Define our connection string
    conn_string = "host='localhost' dbname='test2' user='******' password='******'"
    # Print the connection string we will use to connect
    print("Connecting to database\n ->%s" % conn_string)

    # Get a connection; if a connection cannot be made an exception is raised here
    conn = psycopg2.connect(conn_string)
    # conn.cursor returns a cursor object; use it to perform queries
    cursor = conn.cursor()
    cursor.execute("select users.id, users.first_name, users.last_name, "
                   "users.username, users.email from users order by users.id;")
    records = cursor.fetchall()

    # Then we connect to an elasticsearch server
    es = Elasticsearch()
    es.indices.create(index='learnapt', ignore=400, body={
        "mappings": {
            "type1": {
                "_source": {"enabled": True},
                "properties": {
                    "user_id": {"type": "integer"},
                    "user_name": {"type": "string", "index": "analyzed"},
                    "user_fullname": {"type": "string", "index": "analyzed"},
                    "user_email": {"type": "string", "index": "analyzed"},
                }
            }
        }
    })

    for record in records:
        user_id = record[0]
        username = record[3]
        full_name = str(record[1]) + ' ' + str(record[2])
        email = str(record[4])
        document = {'user_id': user_id, 'user_name': username,
                    'user_fullname': full_name, 'user_email': email}
        es.create(index='learnapt', doc_type='user', body=document, id=user_id)
        print(document)
    print("Converted!")
def PublishSamples(self, samples):
    """Publish samples to the Elasticsearch service."""
    try:
        from elasticsearch import Elasticsearch
    except ImportError:
        raise ImportError('The "elasticsearch" package is required to use '
                          'the Elasticsearch publisher. Please make sure it '
                          'is installed.')

    es = Elasticsearch([self.es_uri])
    if not es.indices.exists(index=self.es_index):
        # Choose between the old and new mappings based on the version
        # of the Elasticsearch server being used.
        if int(es.info()['version']['number'].split('.')[0]) >= 5:
            es.indices.create(index=self.es_index, body=self.mapping_5_plus)
            logging.info('Created index %s and default mappings for '
                         'elasticsearch version >= 5.0.0', self.es_index)
        else:
            es.indices.create(index=self.es_index, body=self.mapping_before_5)
            logging.info('Created index %s and default mappings for '
                         'elasticsearch version < 5.0.0', self.es_index)
    for s in samples:
        sample = copy.deepcopy(s)
        # Make the timestamp understandable by ES and humans.
        sample['timestamp'] = self._FormatTimestampForElasticsearch(
            sample['timestamp'])
        # ES keys cannot contain dots.
        sample = self._deDotKeys(sample)
        # Add the sample to the index, using sample_uri as the
        # document's unique _id.
        es.create(index=self.es_index, doc_type=self.es_type,
                  id=sample['sample_uri'], body=json.dumps(sample))
def create_item_index():
    # Define our connection string
    conn_string = "host='localhost' dbname='test2' user='******' password='******'"
    # Print the connection string we will use to connect
    print("Connecting to database\n ->%s" % conn_string)

    # Get a connection; if a connection cannot be made an exception is raised here
    conn = psycopg2.connect(conn_string)
    # conn.cursor returns a cursor object; use it to perform queries
    cursor = conn.cursor()
    cursor.execute(
        "select items.id, items.properties, items.lesson_id, items.item_type_id, "
        "items.parent_id, itags.tag_id, tags.slug from items as items "
        "left outer join item_tags as itags on items.id=itags.item_id "
        "left outer join tags on tags.id=itags.tag_id order by items.id")
    records = cursor.fetchall()

    # Then we connect to an elasticsearch server
    es = Elasticsearch()
    es.indices.create(index='learnapt', ignore=400, body={
        "mappings": {
            "type1": {
                "_source": {"enabled": True},
                "properties": {
                    "item_id": {"type": "integer"},
                    "lesson_id": {"type": "integer"},
                    "item_type_id": {"type": "integer"},
                    "item_title": {"type": "string", "index": "analyzed"},
                    "item_link_url": {"type": "string", "index": "analyzed"},
                    "item_tags": {"type": "integer"},
                }
            }
        }
    })

    old_item_id = -1
    old_document = None
    for record in records:
        parent_id = record[4]
        item_id = record[0]
        lesson_id = record[2]
        item_type_id = record[3]
        tag = record[6]
        if record[1] is None:
            continue
        item_contents = record[1].split('", "')
        if item_id == old_item_id and tag is not None:
            # Same item, extra join row: accumulate the tag.
            document['item_tags'].append(tag.replace('-', '_'))
        else:
            document = {'item_id': item_id, 'lesson_id': lesson_id,
                        'item_type_id': item_type_id}
            if tag is not None:
                document['item_tags'] = [tag.replace('-', '_')]
            # Parse the hstore-style "key=>value" pairs in items.properties.
            for item_content in item_contents:
                key_value = item_content.strip().split('=>')
                if len(key_value) < 2 or len(key_value) % 2 != 0:
                    continue
                document['item_' + key_value[0].strip().replace('"', '')] = \
                    key_value[1].strip().replace('"', '')
        if item_id != old_item_id and old_item_id != -1:
            # print('item_id being inserted ' + str(old_item_id))
            es.create(index='learnapt', doc_type='item', body=old_document,
                      id=int(old_item_id))
            try:
                print(old_document['item_tags'])
            except KeyError:
                pass
        old_item_id = item_id
        old_document = document
    # Flush the final buffered document.
    if old_item_id != -1:
        es.create(index='learnapt', doc_type='item', body=old_document,
                  id=int(old_item_id))
    print("Converted!")
def populate(self):
    if self.download():
        es = Elasticsearch(self.es_url)
        with open('%s/%s' % (self.assests_dir, self.l8_metadata_filename), 'r') as f:
            # Read the first line for all the headers
            headers = f.readline().split(',')
            # Read the rest of the document
            rows = f.readlines()
        added_counter = 0
        skipped_counter = 0
        for row in rows:
            fields = row.split(',')
            obj = {}
            for header in headers:
                try:
                    obj[header.replace('\n', '')] = float(
                        fields[headers.index(header)].replace('\n', ''))
                except ValueError:
                    obj[header.replace('\n', '')] = fields[
                        headers.index(header)].replace('\n', '')
            try:
                if not es.exists(index=self.es_main_index,
                                 doc_type=self.es_main_type,
                                 id=obj['sceneID']):
                    es.create(index=self.es_main_index,
                              doc_type=self.es_main_type,
                              id=obj['sceneID'],
                              body=json.dumps(obj),
                              ignore=409)
                    # print('%s-%s created' % (counter, obj['sceneID']))
                    added_counter += 1
                else:
                    skipped_counter += 1
                print('%s added | %s skipped' % (added_counter, skipped_counter),
                      end='\r')
            except ConnectionError:
                print('There was a connection error. Check your Elastic '
                      'Search settings and make sure Elastic Search is '
                      'running.')
                return False
            except Exception:
                print('An unexpected error: %s' % sys.exc_info()[0])
                return False
        print('The update is completed. %s new records were added.' %
              added_counter)
        return True
def index_documents(doc_type, json_path):
    es = Elasticsearch()
    with open(os.path.join(root, json_path)) as json_file:
        documents = json.load(json_file)
    for document in documents:
        es.create(index=index_name, doc_type=doc_type, id=document['_id'],
                  body=document)
import uuid

from elasticsearch import Elasticsearch


def put_data_to_es(host=None, index=None, type=None, doc=None, port=80):
    es = Elasticsearch([{'host': 'search-weblog-domain-hp5lndxriluzpb74bwomzm7ci4.us-east-1.es.amazonaws.com',
                         'port': 80}])
    doc_id = uuid.uuid1().hex
    print(doc_id)
    es.create(index=index, doc_type=type, id=doc_id, body=doc)
def create_form(request):
    form = StudentForm(request.POST or None)
    if form.is_valid():
        es = Elasticsearch()
        data = {'name': form.cleaned_data['name'],
                'analysis': form.cleaned_data['analysis'],
                'rno': form.cleaned_data['rno'],
                'address': form.cleaned_data['address']}
        es.create(index='student', doc_type='info', body=data,
                  id=form.cleaned_data['rno'])
        form.save()
        messages.success(request, "record added")
        return HttpResponseRedirect('/student/home/')
    context = {'form': form}
    return render(request, 'create_form.html', context)
class ElasticSearch(Endpoint):
    """
    ElasticSearch Endpoint implementation

    :param url: URL of the Endpoint
    :type url: str
    :param auth: Authentication information
    :type auth: (str, str)
    :param port: Port of the endpoint
    :type port: int
    """

    def register(self):
        """
        Register the endpoint with init resources

        :return: Endpoint
        """
        self.endpoint = ES(self.url, auth=self.auth, port=self.port)
        return self.endpoint

    def create(self, name, settings):
        """
        Create an index

        :param name: Name of the index
        :param settings: Settings for the index
        :return: Bool
        """
        return self.endpoint.create(name, body=settings)

    def exists(self, name):
        """
        Check if an index exists

        :param name: Name of the index
        :return: Indication of existence as boolean
        :rtype: Bool
        """
        return self.endpoint.exists(name)
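# Usage sketch (illustrative, not from the original source): it assumes a
# concrete `Endpoint` base class whose constructor stores `url`, `auth` and
# `port`, and that `ES` is the wrapped client class imported elsewhere.
#
#   endpoint = ElasticSearch(url="localhost", auth=("user", "secret"), port=9200)
#   endpoint.register()
#   if not endpoint.exists("metrics"):
#       endpoint.create("metrics", settings={"settings": {"number_of_shards": 1}})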
def test_elasticsearch(self):
    haproxy_ip = None
    for output in self.stack_info.outputs:
        if output['output_key'] == 'minion-haproxy-ip':
            haproxy_ip = output['output_value']
    if haproxy_ip is None:
        raise Exception("Unable to find IP of stack VM")
    es = Elasticsearch([haproxy_ip])
    doc = {
        "first_name": "Daniel",
        "last_name": "Curran",
        "age": 25,
        "about": "I like to compute",
        "interests": ["computers"],
    }
    es.create(index='megacorp', doc_type='employee', id='1', body=doc)
    test_val = es.get(index='megacorp', doc_type='employee', id='1')
    return_doc = test_val['_source']
    self.assertTrue(test_val['found'])
    self.assertEqual(doc, return_doc)
class ElasticSearchEventsRepository(abstract_repository.AbstractRepository):
    def __init__(self):
        super(ElasticSearchEventsRepository, self).__init__()
        self.conf = cfg.CONF.elasticsearch
        self.es = Elasticsearch(
            hosts=self.conf.hosts,
            sniff_on_start=self.conf.sniff_on_start,
            sniff_on_connection_fail=self.conf.sniff_on_connection_fail,
            sniffer_timeout=self.conf.sniffer_timeout,
            max_retries=self.conf.max_retries
        )

    def process_message(self, message):
        return utils.parse_events_message(message)

    def write_batch(self, data_points):
        for data_point in data_points:
            (project_id, timestamp, event_type, payload, dimensions) = data_point
            index = '%s-%s-%s' % (
                self.conf.index_name, project_id,
                ElasticSearchEventsRepository._normalize_timestamp(timestamp))
            body = {
                'project_id': project_id,
                '@timestamp': timestamp,
                'event_type': event_type,
                'payload': payload,
                'dimensions': dimensions
            }
            self.es.create(
                index=index,
                doc_type='event',
                body=ujson.dumps(body)
            )

    @staticmethod
    def _normalize_timestamp(timestamp):
        d = None
        if timestamp and len(timestamp) >= 10:
            try:
                d = datetime.strptime(timestamp[0:10], '%Y-%m-%d')
            except ValueError as e:
                LOG.warning("Unable to parse timestamp '%s' - %s"
                            % (timestamp, str(e)))
        if not d:
            d = datetime.today()
        return d.strftime('%Y-%m-%d')
def insert_id_score(infile):
    es = Elasticsearch(["localhost:9200"])
    with codecs.open(infile, 'r', 'utf-8') as infp:
        cnt = 0
        for line in infp:
            if not line.strip():
                continue
            row = line.strip().split('\t')
            poiid = row[0]
            raw_cscore = int10(row[1])
            cscore = float(row[2])
            score = {}
            score['raw_cscore'] = raw_cscore
            score['cscore'] = cscore
            cnt += 1
            score['time'] = cnt
            es.create(index=indexname, doc_type=typename, id=poiid, body=score)
class BlogspyderPipeline(object):
    def __init__(self):
        self.itemDic = {"pageUrl": "", "pageID": "", "pageTitle": "",
                        "pageContent": "", "pageRank": ""}
        self.es = Elasticsearch("localhost")
        self.buffer_userName = '******'
        self.buffer_pageRank = -1.0
        self.default_pageRank = 0.00000001

    def __del__(self):
        # self.file is never created in __init__; guard to avoid AttributeError.
        if hasattr(self, 'file'):
            self.file.close()

    def getPageRankByUsername(self, userName):
        sql = "select _2 from pageran8 where _1 = '{0}' ".format(userName)
        con = MySQLdb.connect("brian1", "brian", "general", "csdn")
        cur = con.cursor()
        try:
            cur.execute(sql)
            (results,) = cur.fetchone()
            return results
        except Exception:
            print("fetch pageRank by name({0}) failed".format(userName))
        finally:
            if con:
                con.close()

    def process_item(self, item, spider):
        self.itemDic["pageUrl"] = item["pageUrl"]
        self.itemDic["pageID"] = item["pageID"]
        self.itemDic["pageTitle"] = item["pageTitle"].encode("utf-8")
        self.itemDic["pageContent"] = " ".join(
            jieba.cut(item["pageContent"].encode("utf-8")))
        # pagerank info
        userName = item["pageUrl"].split('/')[3]
        if userName == self.buffer_userName:
            self.itemDic["pageRank"] = self.buffer_pageRank
        else:
            pr = self.getPageRankByUsername(userName)
            self.itemDic["pageRank"] = pr if pr != '' else self.default_pageRank
            self.buffer_userName = userName
            self.buffer_pageRank = pr
        self.es.create(index="blog", doc_type="csdn", body=self.itemDic)
        return item
class Outputer(BaseOutputer):
    '''
    Send output to elasticsearch
    '''
    def __init__(self, uri, index, doc_type, *args, **kwargs):
        super(Outputer, self).__init__(*args, **kwargs)
        self.es = Elasticsearch([uri])  # pylint: disable=C0103
        self.index = index
        self.doc_type = doc_type
        self.es.indices.create(index=self.index, ignore=400)

    def output(self, event):
        self.es.create(index=self.index, doc_type=self.doc_type, body=event)

    def shutdown(self):
        self.es.indices.flush(self.index)
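# Illustrative usage only: the BaseOutputer parent and the event shape are
# assumptions, not from the original source.
#
#   out = Outputer("http://localhost:9200", "events", "event")
#   out.output({"msg": "something happened"})
#   out.shutdown()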
def main():
    es = Elasticsearch()
    dashboards = {}
    for hit in es.search(index="kibana-int", doc_type="dashboard",
                         size=1000)["hits"]["hits"]:
        dashboards[hit["_id"]] = hit["_source"]

    for id_, data in dashboards.items():
        dashboard = json.loads(data["dashboard"])
        # Here the modification takes place
        if dashboard["index"]["pattern"] == ORIG_INDEX_PATTERN:
            dashboard["index"]["pattern"] = NEW_INDEX_PATTERN
        dashboards[id_]["dashboard"] = json.dumps(dashboard)

    for id_, data in dashboards.items():
        es.delete(index="kibana-int", doc_type="dashboard", id=id_)
        es.create(index="kibana-int", doc_type="dashboard", id=id_, body=data)
def push():
    host = os.environ.get('ELASTICSEARCH_HOST', 'localhost')
    connection = Elasticsearch([host])
    # Delete old markers or do initial setup
    try:
        print(connection.delete_by_query(index=[INDEX], doc_type=DOC_TYPE, q='*'))
    except NotFoundError:
        set_mapping()
    if True:  # real 0m9.839s
        bulk(connection, get_bulk_ready_data())
    else:  # real 0m30.341s
        for row in get_data():
            connection.create(
                index=INDEX,
                doc_type=DOC_TYPE,
                body=row,
                id=row['atlas_number'],
            )
class DataStore:
    def __init__(self, host, port, username=None, password=None,
                 use_ssl=False, default_index=None, default_doctype=None):
        self.index = default_index
        self.doc_type = default_doctype
        if username and password:
            self.es_connection = Elasticsearch(
                host=host, port=port,
                http_auth=username + ":" + password,
                use_ssl=use_ssl)
        else:
            self.es_connection = Elasticsearch(host=host, port=port,
                                               use_ssl=use_ssl)
        if not self.es_connection.ping():
            self.es_connection = None
            raise DataStoreException("Connection to ElasticSearch failed.")

    def store(self, body):
        try:
            self.es_connection.create(
                body=body,
                id=hashlib.sha1(str(body).encode('utf-8')).hexdigest(),
                index=self.index,
                doc_type=self.doc_type)
        except ElasticsearchException as e:
            raise DataStoreException(
                "Exception while storing data in Elastic Search: " + str(e))
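# Illustrative usage (assumptions: a reachable cluster on localhost:9200 and
# the index/doc_type names shown here, which are not from the original source):
#
#   store = DataStore("localhost", 9200, default_index="events",
#                     default_doctype="event")
#   store.store({"source": "scanner", "severity": "low"})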
def sns_handler(request):
    messageType = request.META['HTTP_X_AMZ_SNS_MESSAGE_TYPE']
    parsed_body = json.loads(request.body)
    if messageType == "SubscriptionConfirmation":
        url = parsed_body["SubscribeURL"]
        serialized_data = urllib.request.urlopen(url).read()
    elif messageType == "Notification":
        message = parsed_body["Message"]
        j_msg = json.loads(message)
        print(type(j_msg['coordinates']))
        print(j_msg['coordinates'])
        j_msg['coordinates'] = j_msg['coordinates']['coordinates']
        print(j_msg)
        message = str(json.dumps(j_msg))
        print(message)
        pusher_client = Pusher(
            app_id='xxx',
            key='xxx',
            secret='xxx',
            ssl=True
        )
        pusher_client.trigger('test_channel', 'my_event', {'message': message})
        es = Elasticsearch(
            ['xxx'],
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection
        )
        es.create(index="tweets", doc_type="tweet", body=j_msg)
    return HttpResponse('', status=200)
def create_test_data():
    # Add some test data to Elasticsearch.
    es = Elasticsearch(es_url)
    es.indices.delete(index='companiontest', ignore=[404])
    es.indices.create(index='companiontest', body={
        'index': {
            'number_of_shards': 1,
            'number_of_replicas': 0
        }
    })
    # Create 3 "simple" doc types and 1 "advanced"
    es.create(index='companiontest', doc_type='simple', id='foo', body={
        'timestamp': datetime.datetime(2015, 1, 1),
        'id': 'foo'
    })
    es.create(index='companiontest', doc_type='simple', id='bar', body={
        'timestamp': datetime.datetime(2015, 1, 2),
        'id': 'bar'
    })
    es.create(index='companiontest', doc_type='simple', id='baz', body={
        'timestamp': datetime.datetime(2015, 1, 3),
        'id': 'baz'
    })
    es.create(index='companiontest', doc_type='advanced', id='foo', body={
        'timestamp': datetime.datetime(2015, 1, 1),
        'id': 'foo'
    })
    es.indices.refresh(index='companiontest')
class ElasticsearchService(object):
    def __init__(self, host, port):
        self._es = Elasticsearch([{'host': host, 'port': port}])

    def search(self, *args, **kwargs):
        return self._es.search(*args, **kwargs)

    def create(self, *args, **kwargs):
        return self._es.create(*args, **kwargs)

    def get(self, *args, **kwargs):
        return self._es.get(*args, **kwargs)

    def exists(self, *args, **kwargs):
        return self._es.exists(*args, **kwargs)

    def msearch(self, *args, **kwargs):
        return self._es.msearch(*args, **kwargs)
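# Hedged example of how this thin delegate might be used; the index name and
# query below are invented for illustration:
#
#   service = ElasticsearchService("localhost", 9200)
#   service.create(index="notes", doc_type="note", id="1", body={"text": "hi"})
#   hits = service.search(index="notes", body={"query": {"match_all": {}}})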
class ElasticSearchHandler:
    def __init__(self, host=None):
        if not host:
            host = os.getenv("ES_HOST")
        self.es_handle = Elasticsearch(hosts=host)

    def check_index(self, index):
        return self.es_handle.indices.exists(index)

    def create_index(self, index):
        if not self.check_index(index):
            self.es_handle.indices.create(index)

    def add_to_es(self, index, document_type, body):
        return self.es_handle.create(index=index, doc_type=document_type,
                                     body=body)

    def get_from_es(self, index, id):
        return self.es_handle.get(index, id=id)

    def delete_by_id(self, index, document_type, id):
        self.es_handle.delete(index, document_type, id)
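# A short usage sketch (the host and index names are assumptions, not from
# the original source):
#
#   handler = ElasticSearchHandler(host="localhost:9200")
#   handler.create_index("logs")
#   handler.add_to_es("logs", "entry", {"message": "indexed"})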
def update_from_ldap(server, username, password, schema, pull):
    pid = os.getpid()
    print('[%5d] Acquire lock...' % pid, file=sys.stderr)
    lock = filelock.FileLock("/tmp/ldap_update.lock")
    try:
        with lock.acquire(timeout=10):
            print('[%5d] Got lock, now running ldap update...' % pid,
                  file=sys.stderr)
            print('[%5d] Server: %s, Username: %s, Schema: %s' % (
                pid, server, username, schema), file=sys.stderr)
            if pull:
                os.environ['LDAP_USERNAME'] = username
                os.environ['LDAP_PASSWORD'] = password
                os.environ['LDAP_SERVER'] = server
                os.environ['LDAP_BASE_DN'] = schema
                command = '/usr/bin/python /app/scripts/ldapdump.py'
                print('[%5d] Execute: %s' % (pid, command), file=sys.stderr)
                ret = os.system(command)
                os.environ['LDAP_USERNAME'] = ''
                os.environ['LDAP_PASSWORD'] = ''
                os.environ['LDAP_SERVER'] = ''
                os.environ['LDAP_BASE_DN'] = ''
                if ret != 0:
                    print('[%5d] Dump failed, aborting' % pid, file=sys.stderr)
                    return None
                command = '/usr/bin/python /app/scripts/ldapmunge.py'
                print('[%5d] Execute: %s' % (pid, command), file=sys.stderr)
                if 0 != os.system(command):
                    print('[%5d] Munge failed, aborting' % pid, file=sys.stderr)
                    return None
            records = json.load(open(USER_JSON_FILENAME))
            total_records = len(records)
            # The Elasticsearch URL and the index-upload command are
            # credential-scrubbed in the source; the "*****" fragments are
            # kept as-is rather than guessed at.
            es = Elasticsearch("http://*****:*****")
            command = '''... "@/app/scripts/index.json"'''
            if 0 != os.system(command):
                print('[%5d] Upload of index failed, aborting' % pid,
                      file=sys.stderr)
                return None
            print("[%5d] Uploading %d indices to elasticsearch..." % (
                pid, total_records), file=sys.stderr)
            for record in records:
                username = record['username']
                es.create(index=INDEX_NAME, doc_type=DOC_TYPE, body=record,
                          id=username)
            print('[%5d] Done.' % pid, file=sys.stderr)
    except filelock.Timeout:
        print('[%5d] Failed to acquire lock, skipping task.' % pid,
              file=sys.stderr)
    return None
class IndexerUtils:

    add_template = """
        if (ctx._source.lastin.indexOf({grp}) < 0) {{
            ctx._source.lastin.add({grp});
        }}
        if (ctx._source.groups.indexOf({grp}) < 0) {{
            ctx._source.groups.add({grp});
        }}
        """

    del_template = """
        if (ctx._source.lastin.indexOf({grp}) >= 0) {{
            ctx._source.lastin.remove(ctx._source.lastin.indexOf({grp}));
        }}
        if (ctx._source.groups.indexOf({grp}) >= 0) {{
            ctx._source.groups.remove(ctx._source.groups.indexOf({grp}));
        }}
        """

    def __init__(self, config):
        self.log = logging.getLogger('indexrunner')
        self.ws = WorkspaceAdminUtil(config)
        self.elasticsearch = Elasticsearch([config['elastic-host']])
        self.esbase = config['elastic-base']
        mapfile = config.get('mapping-file')
        self.log.info("Mapping File: %s" % mapfile)
        self.mapping = self._read_mapfile(mapfile)
        if 'workspace-admin-token' in config:
            token = config['workspace-admin-token']
        else:
            token = os.environ.get('KB_AUTH_TOKEN')
        self.method_runner = MethodRunner(config, token=token)
        self.ep = EventProducer(config)
        # TODO: access and data specs are not used?
        with open('specs/mapping.yml') as f:
            self.mapping_spec = yaml.load(f, Loader=yaml.SafeLoader)

    def _read_mapfile(self, mapfile):
        with open(mapfile) as f:
            d = f.read()
        mapping = yaml.load(d, Loader=yaml.SafeLoader)['types']
        for type in mapping.keys():
            for index in mapping[type]:
                name = index['index_name']
                index['index_name'] = '%s.%s' % (self.esbase, name)
        return mapping

    def process_event(self, evt):
        """
        Process a single workspace or indexer event.
        """
        etype = evt['evtype']
        ws = evt['wsid']
        if evt['ver']:
            evt['upa'] = '%d/%s/%d' % (evt['wsid'], evt['objid'], evt['ver'])
        if etype in ['NEW_VERSION', 'NEW_ALL_VERSIONS']:
            self.new_object_version(evt)
        elif 'PUBLISH' in etype:
            self.publish(evt['wsid'])
        elif etype.startswith('DELETE_'):
            self.delete(evt)
        elif etype == 'COPY_ACCESS_GROUP':
            self._index_workspace(ws)
        elif etype == 'RENAME_ALL_VERSIONS':
            self.log.warning("Warning rename not implemented.")
        elif etype in ['REINDEX_WORKSPACE']:
            # Pseudo event
            self._index_workspace(ws)
        else:
            self.log.error("Can't process evtype " + evt['evtype'])

    def _index_workspace(self, wsid):
        """
        List the workspace and generate an index event for each object.
        """
        min = 0
        while True:
            objs = self.ws.list_objects({'ids': [wsid], 'minObjectID': min,
                                         'limit': _MAX_LIST})
            self.ep.index_objects(objs)
            if len(objs) <= _MAX_LIST:
                break
            min = objs[-1][0] + 1

    def _create_obj_rec(self, upa):
        (wsid, objid, vers) = self._split_upa(upa)
        req = {'objects': [{'ref': upa}], 'no_data': 1}
        obj = self.ws.get_objects2(req)['data'][0]
        info = obj['info']
        (otype, over) = info[2].split('-')
        fmt = "%Y-%m-%dT%H:%M:%S%z"
        ts = int(datetime.datetime.strptime(info[3], fmt).timestamp() * 1000)
        wsinfo = self._get_ws_info(wsid)
        # Don't index temporary narratives
        if wsinfo['temp']:
            return None
        prov = self._get_prov(obj)
        # TODO stags, copier, prv_cmt, time
        rec = {
            "guid": f"WS:{upa}",
            "otype": None,
            "otypever": 999,
            "stags": [],
            "oname": info[1],
            "creator": info[5],
            "copier": None,
            "prv_mod": prov['prv_mod'],
            "prv_meth": prov['prv_meth'],
            "prv_ver": prov['prv_ver'],
            "prv_cmt": None,
            "md5": info[8],
            "timestamp": ts,
            "prefix": "WS:%d/%d" % (wsid, objid),
            "str_cde": "WS",
            "accgrp": wsid,
            "version": vers,
            "islast": False,
            "public": wsinfo['public'],
            "shared": wsinfo['shared'],
            "ojson": "{}",
            "pjson": None
        }
        return rec

    # TODO: should we just add a file handler for this?
    def _log_error(self, event, index, err):
        mes = {'event': event, 'index': index, 'error': str(type(err))}
        with open('error.log', 'a') as f:
            f.write(json.dumps(mes))
            f.write('\n')

    def _access_rec(self, wsid, objid, vers, public=False):
        rec = {
            "extpub": [],
            "groups": [-2, wsid],
            "lastin": [-2, wsid],
            "pguid": f"WS:{wsid}/{objid}/{vers}",
            "prefix": f"WS:{wsid}/{objid}",
            "version": vers
        }
        if public:
            rec['lastin'].append(-1)
            rec['groups'].append(-1)
        # "type": "access"
        return rec

    def _get_id(self, upa):
        """
        Return the elastic id.
        """
        if not re.match(r'^\d+/\d+/\d+$', upa):
            raise ValueError(f"'{upa}' is not an upa")
        return f"WS:{upa.replace('/', ':')}"

    def _get_prov(self, obj):
        ret = {
            "prv_mod": None,
            "prv_meth": None,
            "prv_ver": None,
            "prv_cmt": None,
        }
        if 'provenance' not in obj or len(obj['provenance']) == 0:
            return ret
        prov = obj['provenance'][0]
        if 'service' in prov:
            ret['prv_mod'] = prov['service']
        if 'method' in prov:
            ret['prv_meth'] = prov['method']
        if 'script' in prov:
            ret['prv_mod'] = 'legacy_transform'
            ret['prv_meth'] = prov['script']
        if 'service_ver' in prov:
            ret['prv_ver'] = prov['service_ver']
        elif 'script_ver' in prov:
            ret['prv_ver'] = prov['script_ver']
        if 'description' in prov:
            ret['prv_cmt'] = prov['description']
        return ret

    def _put_es_data_record(self, index, upa, doc):
        """
        Add an ES data record. Only works if the object hasn't been indexed
        before; will throw an error if it has.
        """
        eid = self._get_id(upa)
        res = self.elasticsearch.create(index=index, parent=eid,
                                        doc_type='data', id=eid, routing=eid,
                                        body=doc, refresh=True)
        return res

    def _get_ws_info(self, wsid):
        info = self.ws.get_workspace_info({'id': wsid})
        meta = info[8]
        # Don't index temporary narratives
        temp = (meta.get('is_temporary') == 'true')
        public = (info[6] != 'n')
        # TODO
        shared = False
        return {
            'wsid': wsid,
            'info': info,
            'meta': meta,
            'temp': temp,
            'public': public,
            'shared': shared
        }

    def publish(self, wsid):
        """
        This updates the visibility of objects when a workspace is made
        public.
        """
        # Find each index
        wsinfo = self._get_ws_info(wsid)
        public = wsinfo['public']
        if public:
            script = self.add_template.format(grp="-1")
        else:
            script = self.del_template.format(grp="-1")
        aq = {
            "query": {
                "prefix": {
                    "prefix": f"WS:{wsid:d}/"
                }
            },
            "script": {
                "source": script
            }
        }
        filt = {
            "bool": {
                "filter": [{
                    "term": {
                        "public": not public
                    }
                }, {
                    "term": {
                        "accgrp": wsid
                    }
                }]
            }
        }
        publics = "true" if public else "false"
        dq = {
            "query": filt,
            "script": {
                "source": f"ctx._source.public={publics}"
            }
        }
        active_indexes = self._get_all_active_indexes()
        for index in active_indexes:
            self.elasticsearch.update_by_query(index=index, doc_type='access',
                                               body=aq, ignore=[400, 404],
                                               refresh=True)
            self.elasticsearch.update_by_query(index=index, doc_type='data',
                                               body=dq, ignore=[400, 404],
                                               refresh=True)

    def _get_all_active_indexes(self):
        indexes = (index['index_name']
                   for oindex in self.mapping
                   for index in self.mapping[oindex])
        return self.elasticsearch.indices.get(','.join(indexes),
                                              ignore_unavailable=True)

    def delete(self, event):
        # Find each index
        id = self._get_id(event['upa'])
        active_indexes = self._get_all_active_indexes()
        q = {'query': {'parent_id': {'type': 'data', 'id': id}}}
        for index in active_indexes:
            self.elasticsearch.delete_by_query(index=index, doc_type='data',
                                               routing=id, body=q,
                                               ignore=[400, 404], refresh=True)
            self.elasticsearch.delete(index=index, doc_type='access', id=id,
                                      ignore=404, refresh=True)

    def _update_es_access(self, index, wsid, objid, vers, upa):
        # Should pass a wsid but just in case...
        wsinfo = self._get_ws_info(wsid)
        if wsinfo['temp']:
            return None
        public = wsinfo['public']
        doc = self._access_rec(wsid, objid, vers, public=public)
        eid = self._get_id(upa)
        res = self.elasticsearch.index(index=index, doc_type='access',
                                       id=eid, body=doc, refresh=True)
        return res

    def _split_upa(self, upa):
        return [int(x) for x in upa.split('/')]

    def _get_indexes(self, otype):
        pieces = otype.split('.')
        if not pieces:
            raise RuntimeError(f"Invalid workspace type: {otype}")
        generic = pieces[0] + ".*"
        if otype in self.mapping:
            return self.mapping[otype]
        elif generic in self.mapping:
            return self.mapping[generic]
        return self.mapping['Other']

    def _ensure_mapping_exists(self, oindex, objschema):
        """Ensures a mapping exists in ES for 'index_name'."""
        index = oindex['index_name']
        res = self.elasticsearch.indices.exists(index=index)
        if not res:
            schema = self.mapping_spec
            if oindex.get('raw'):
                schema = {'mappings': {'data': {'properties': objschema}}}
            elif objschema is not None:
                schema['mappings']['data']['properties']['key'] = \
                    {'properties': objschema}
            self.elasticsearch.indices.create(index=index, body=schema)

    def _run_module(self, oindex, upa):
        params = {'upa': upa}
        (module, method) = oindex['index_method'].split('.')
        resp = self.method_runner.run(module, method, params)[0]
        self.method_runner.cleanup()
        return resp

    def _update_islast(self, index, wsid, objid, vers):
        prefix = f"WS:{wsid:d}/{objid}"
        doc = {
            "query": {
                "bool": {
                    "filter": [{
                        "term": {
                            "prefix": prefix
                        }
                    }]
                }
            },
            "script": {
                "source": "ctx._source.islast = (ctx._source.version == params.lastver)",
                "params": {
                    "lastver": int(vers)
                }
            }
        }
        self.elasticsearch.update_by_query(index, 'data', doc, refresh=True)

    def _new_raw_version_index(self, event, oindex):
        """
        This handles indexing an object where the callout is expected to
        return an entire ElasticSearch record for storage.
        """
        upa = event['upa']
        index = oindex['index_name']
        eid = self._get_id(upa)
        res = self.elasticsearch.get(index=index, doc_type='data', id=eid,
                                     ignore=404)
        if res.get('status') != 404 and res['found']:
            self.log.info("%s already indexed in %s" % (eid, index))
            return
        resp = self._run_module(oindex, upa)
        if resp.get('data') is None:
            raise ValueError(
                f"{oindex['index_method']} did not return 'data' for {event}")
        self._ensure_mapping_exists(oindex, resp['schema'])
        doc = resp['data']
        self.elasticsearch.create(index=index, doc_type='data', id=eid,
                                  body=doc, refresh=True)

    def _new_object_version_index(self, event, oindex):
        """
        This handles indexing a specific object version.
        The callout should return a structure with a 'data'.
        """
        wsid = event['wsid']
        objid = event['objid']
        vers = event['ver']
        upa = event['upa']
        index = oindex['index_name']
        eid = self._get_id(upa)
        res = self.elasticsearch.get(index=index, doc_type='access', id=eid,
                                     ignore=404)
        if res.get('status') != 404 and res['found']:
            self.log.info("%s already indexed in %s" % (eid, index))
            return
        doc = self._create_obj_rec(upa)
        extra = {}
        if 'default_indexer' not in oindex['index_method']:
            extra = self._run_module(oindex, upa)
        self._ensure_mapping_exists(oindex, extra.get('schema'))
        if extra.get('data') is not None:
            doc['key'] = extra['data']
            if 'objdata' in extra:
                od = doc['key'].pop('objdata')
                doc['ojson'] = json.dumps(od)
            else:
                doc['ojson'] = json.dumps(doc['key'])
        else:
            self.log.warning(
                f"{oindex['index_method']} did not return 'data' for {event}")
        self._update_es_access(index, wsid, objid, vers, upa)
        self._put_es_data_record(index, upa, doc)
        oid = f'{wsid:d}/{objid}'
        info = self.ws.get_object_info3({'objects': [{'ref': oid}]})['infos'][0]
        if info[4] == vers:
            self._update_islast(index, wsid, objid, vers)

    def _new_object_version_multi_index(self, event, oindex):
        """
        This handles indexing multiple sub-objects for a specific object
        version. The callout should return a structure with a 'documents'
        list of dictionaries.
        """
        wsid = event['wsid']
        objid = event['objid']
        vers = event['ver']
        upa = event['upa']
        index = oindex['index_name']
        # Check if any exists
        eid = self._get_id(upa)
        res = self.elasticsearch.get(index=index, doc_type='access', id=eid,
                                     ignore=404)
        if res.get('status') != 404 and res['found']:
            self.log.info(f"{eid} already indexed in {index}")
            return
        wsinfo = self._get_ws_info(wsid)
        if wsinfo['temp']:
            return None
        public = wsinfo['public']
        adoc = self._access_rec(wsid, objid, vers, public=public)
        pdoc = self._create_obj_rec(upa)
        extra = self._run_module(oindex, upa)
        parent = extra['parent']
        self._ensure_mapping_exists(oindex, extra['schema'])
        pdoc['pjson'] = json.dumps(parent)
        pguid = self._get_id(upa)
        bdoc = []
        ct = 0
        for row in extra['documents']:
            doc = pdoc.copy()
            doc['key'] = {**parent, **row}
            guid = row.pop('guid')
            if not guid.startswith('WS:'):
                guid = "WS:" + guid
            # Tear apart the name so we get just the last portion
            ele = guid.replace('/', ':').split(':')
            # Build the feature ID from everything past the UPA
            fid = '/'.join(ele[4:])
            guid = 'WS:%s:feature/%s' % (upa, fid)
            doc['guid'] = guid
            if 'objdata' in doc['key']:
                od = doc['key'].pop('objdata')
                doc['ojson'] = json.dumps(od)
            else:
                doc['ojson'] = json.dumps(doc['key'])
            rec = {
                '_id': guid,
                '_source': doc,
                '_index': index,
                '_parent': pguid,
                '_type': 'data'
            }
            bdoc.append(rec)
            ct += 1
            if ct > BULK_MAX:
                bulk(self.elasticsearch, bdoc)
                bdoc = []
                ct = 0
        if ct > 0:
            bulk(self.elasticsearch, bdoc)
        self._update_es_access(index, wsid, objid, vers, upa)
        oid = f'{wsid:d}/{objid}'
        info = self.ws.get_object_info3({'objects': [{'ref': oid}]})['infos'][0]
        if info[4] == vers:
            self._update_islast(index, wsid, objid, info[4])

    def new_object_version(self, event):
        # For a NEW_ALL_VERSIONS we will just index the latest version
        # if event['evtype'] == 'NEW_ALL_VERSIONS':
        upa = f"{event['wsid']}/{event['objid']}"
        info = self.ws.get_object_info3({'objects': [{'ref': upa}]})['infos'][0]
        vers = info[4]
        event['ver'] = vers
        (event['objtype'], event['objtypever']) = info[2].split('-')
        event['upa'] = f'{upa}/{vers}'
        indexes = self._get_indexes(event['objtype'])
        for oindex in indexes:
            try:
                if oindex.get('multi'):
                    self._new_object_version_multi_index(event, oindex)
                elif oindex.get('raw'):
                    self._new_raw_version_index(event, oindex)
                else:
                    self._new_object_version_index(event, oindex)
            except Exception as e:
                self.log.error("Failed for index")
                self._log_error(event, oindex, e)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                self.log.info("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
                self.log.info("*** print_exception:")
                traceback.print_exception(exc_type, exc_value, exc_traceback,
                                          limit=2, file=sys.stdout)
        self.log.info("Completed new object version")
        return True
# print(config)
try:
    connection = Elasticsearch(
        config['elasticsearch_hosts'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    connection.create(index=config['default_index'], doc_type='group', id='2',
                      body={
                          "gid": 234,
                          "owner": "bemineni",
                          "name": "Sammy",
                          "grp_hash": "456678",
                          "description": "Sammy group"
                      })
    data = connection.get_source(index=config['default_index'],
                                 doc_type="group", id='2')
    print(json.dumps(data, indent=4, sort_keys=True))
except Exception:
    print("Failed to add item")
    print("Test failed")
    traceback.print_exc()
    ret = 1
finally:
    print(test_name + " Test complete")
es = Elasticsearch([
    ...  # host list truncated in the source
], port='9200', timeout=25)

# Delete the index
result = es.indices.delete(index='actest', ignore=[400, 404])
print(result)

# Create the index
result = es.indices.create(index='actest', ignore=400)
print(result)

# Insert a document
data = {
    'title': '美国留给伊拉克的是个烂摊子吗',
    'url': 'http://view.news.qq.com/zt2011/usa_iraq/index.htm'
}
result = es.create(index='actest', doc_type='politics', id=1, body=data)
# result = es.index(index='actest', doc_type='politics', body=data)
# es.index() creates a document without requiring an explicit id
print(result)

# Update the document
data = {
    'title': '美国留给伊拉克的是个烂摊子吗',
    'url': 'http://view.news.qq.com/zt2011/usa_iraq/index.htm',
    'date': '2011-12-16',
    'status': 0
}
result = es.update(index='actest', doc_type='politics', body={'doc': data}, id=1)
# For es.update(), the body must be wrapped in a 'doc' key
# result = es.index(index='actest', doc_type='politics', body=data, id=1)
import json
import os

from elasticsearch import Elasticsearch

with open('rakuten_books.json', 'r') as f:
    rakuten_books = json.load(f)

es = Elasticsearch(http_auth=('elastic', os.environ.get('ES_PW')))

for rakuten_book in rakuten_books:
    rakuten_book_item = rakuten_book['Item']
    try:
        es.create(index='book', id=rakuten_book_item['isbn'],
                  body=rakuten_book_item)
    except Exception:
        # Ignore failures, e.g. documents that already exist.
        pass
    print('{} created'.format(rakuten_book_item['title']))
def getItem(self):
    # Fetch the list of negative keywords
    negative = NegativeKeyWords()
    negKwList = negative.getNegativeKeyWordsList()
    # Connect to the Elasticsearch server
    es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
    # Get the URLs of all detail pages
    url_list = self.getPageInfoUrl()
    MysqlHelper.excuteUpdate("tb_spider", {"Fstate": 1},
                             "Fid={}".format(self.spiderId))
    for url in url_list:
        spider = MysqlHelper.excuteFindOne(
            "select Fnum from tb_spider where Fid={}".format(self.spiderId))
        num = int(spider["Fnum"])
        num += 1
        # Page source
        text = self.getHtml(url).text
        # Build a selector so XPath can extract the fields we need
        selector = etree.HTML(text, parser=None, base_url=None)
        # Title
        title = selector.xpath(u'//span[@class="s_title"]/span/text()')
        # Author
        p_Fauthor = (u"<a href=\".*?\" target=\"_blank\" class=\"js-vip-check\" "
                     u"uid=\".*?\" uname=\".*?\">([\s\S]*?)</a>")
        author = re.findall(p_Fauthor, text)
        # Post time
        p_date = u"<span>时间:([\s\S]*?) </span>"
        date = re.findall(p_date, text)
        # Content
        p_content = u"<div class=\"bbs-content clearfix\">([\s\S]*?)</div>"
        content = re.findall(p_content, text)
        data = {}
        data["Ftitle"] = title[0] if title else ""
        data["Fdate"] = date[0] if date else "Null"
        data["Fcontent"] = content[0] if content else ""
        data["Flink"] = url
        data["Ftype"] = "论坛"  # "forum"
        data["Fsource"] = "天涯社区"  # "Tianya Club"
        data["FcreateTime"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(time.time()))
        data["Fauthor"] = author[0] if author else ""

        # Check whether the content contains a negative keyword
        def isNegative(key):
            return bool(re.findall(key, data["Fcontent"]))

        isNegKeyWor = map(isNegative, negKwList)
        # 0 = positive, 1 = negative
        if True in isNegKeyWor:
            data["isNegative"] = 1
        else:
            data["isNegative"] = 0
        try:
            n = random.randint(1, 9999)
            b = random.randint(1, 9999)
            id = n + b
            # Insert into ES; the index is analogous to a MySQL database
            # name, and doc_type to a table name
            istrue = es.create(index="scdel_index", id=id,
                               doc_type="tb_data", body=data)["created"]
            print(istrue)
        except Exception as ex:
            print(ex)
            istrue = False
        if istrue:
            MysqlHelper.excuteUpdate("tb_spider", {"Fnum": num},
                                     "Fid={}".format(self.spiderId))
        time.sleep(1)
from json import loads

from elasticsearch import Elasticsearch
from kafka import KafkaConsumer

i = 2200
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
consumer = KafkaConsumer(
    'twitter_stream',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    enable_auto_commit=False,
    group_id='ElasticConsumerE',
    value_deserializer=lambda x: loads(x.decode('utf-8')))

for message in consumer:
    message = message.value
    es.create(index='idx_twp_con', doc_type='twitter_twp_con', id=i,
              body=message)
    i = i + 1
    print('{} indexed.'.format(i))
def crawler(self):
    # es = Elasticsearch(['35.227.82.63:9200'])
    es = Elasticsearch()
    f = open("url.txt", 'w')
    if es.ping():
        r = redis.StrictRedis(host='35.227.82.63', port=6379)
        seen = []
        url_queue = queue.Queue()
        url_queue.put('http://en.wikipedia.org/wiki/Catholic_Church')
        url_queue.put('http://en.wikipedia.org/wiki/Christianity')
        url_queue.put('http://en.wikipedia.org/wiki/College_of_Cardinals')
        url_queue.put('https://en.wikipedia.org/wiki/Hierarchy_of_the_Catholic_Church')
        url_queue.put('https://en.wikipedia.org/wiki/Papal_legate')
        url_queue.put('http://www.bible.ca/catholic-church-hierarchy-organization.htm')
        url_queue.put('https://www.britannica.com/topic/Roman-Catholicism/Structure-of-the-church')
        seen.append('http://en.wikipedia.org/wiki/Christianity')
        seen.append('http://en.wikipedia.org/wiki/College_of_Cardinals')
        seen.append('https://en.wikipedia.org/wiki/Hierarchy_of_the_Catholic_Church')
        seen.append('https://en.wikipedia.org/wiki/Papal_legate')
        seen.append('http://www.bible.ca/catholic-church-hierarchy-organization.htm')
        seen.append('https://www.britannica.com/topic/Roman-Catholicism/Structure-of-the-church')
        count = 0
        tempurl = []
        while not url_queue.empty() and count <= 1000:
            url = url_queue.get()
            seedUrl = self.getSeedUrl(url)
            if not self.robotCheck(url, seedUrl):
                continue
            response = requests.get(url)
            Httptype = response.headers['Content-Type']
            if "text/html" in Httptype:
                count += 1
                headers = dict(response.headers)
                soup = BeautifulSoup(response.content, 'html.parser')
                seen.append(url)
                r.set(url, "visited")
                out_links = []
                print("init passed")
                data = self.getCleanText(str(soup.get_text))
                rawHtml = str(soup.prettify())
                for a in soup.find_all('a'):
                    temp = a.attrs
                    if 'title' in temp.keys():
                        score = self.checkKeywords(temp['title'])
                        if score > 0:
                            link = temp['href']
                            link = self.canonicalizeUrl(link, seedUrl)
                            if link not in out_links and link != url:
                                out_links.append(link)
                                if link not in seen and count <= 1000:
                                    tempurl.append([link, score])
                                    seen.append(link)
                print("outlink passed")
                es.create(index="ap_dataset", doc_type="document", id=count,
                          body={
                              "url": url,
                              "inlinks": url,
                              "outlinks": out_links,
                              "text": data,
                              "raw": rawHtml,
                              "headers": headers
                          })
                print(url + " crawled " + str(count))
                f.writelines(url + " crawled " + str(count) + '\n')
                if url_queue.empty():
                    tempurl = self.sortUrl(tempurl)
                    for x in tempurl:
                        url_queue.put(x[0])
                    tempurl = []
    f.close()
class ElasticCorpus(BaseCorpus): """ ElasticSearch connection corpus """ def __init__(self): """ Basic creator """ super(self.__class__, self).__init__() self.es=None self.query_filter="" self.ALL_FILES=[] self.TEST_FILES=[] self.FILES_TO_IGNORE=[] self.metadata_index=None self.paths.fullLuceneIndex="index_" self.max_results=sys.maxint def connectCorpus(self, base_directory, endpoint={"host":"localhost", "port":9200}, initializing_corpus=False,suppress_error=False): """ If DB has been created, connect to it. Icf not, initialize it first. Args: base_directory: root dir of this corpus initializing_corpus: if True, create DB and directories suppress_error: if true, db doesn't complain if it's connected already """ self.endpoint=endpoint self.setPaths(ensureTrailingBackslash(base_directory)) if initializing_corpus: self.createAndInitializeDatabase() self.connectToDB(suppress_error) def createAndInitializeDatabase(self): """ Ensures that the directory structure is in place and creates the SQLite database and tables """ def createTable(name, settings, properties): """ """ if not self.es.indices.exists(index=index_equivalence[name]["index"]): self.es.indices.create( index=index_equivalence[name]["index"], body={"settings":settings,"mappings":{index_equivalence[name]["type"]:{"properties":properties}}}) settings={ "number_of_shards" : 2, "number_of_replicas" : 0 } properties={ "guid": {"type":"string", "index":"not_analyzed"}, "metadata": {"type":"nested"}, "norm_title": {"type":"string", "index":"not_analyzed"}, "author_ids":{"type":"string", "index":"not_analyzed", "store":True}, "num_in_collection_references": {"type":"integer"}, "num_resolvable_citations": {"type":"integer"}, "num_inlinks": {"type":"integer"}, "collection_id": {"type":"string", "index":"not_analyzed", "store":True}, "import_id": {"type":"string", "index":"not_analyzed", "store":True}, "time_created": {"type":"date"}, "time_modified": {"type":"date"}, "has_scidoc": {"type":"boolean","index":"not_analyzed", "store":True}, "flags": {"type":"string","index":"not_analyzed", "store":True}, # This is all now accessed through the nested metadata ## "filename": {"type":"string", "index":"not_analyzed", "store":True}, ## "corpus_id": {"type":"string", "index":"not_analyzed"}, ## "title": {"type":"string", "store":True},## "surnames": {"type":"string"}, ## "year": {"type":"integer"}, ## "in_collection_references": {"type":"string", "index":"not_analyzed", "store":True}, ## "inlinks": {"type":"string", "index":"not_analyzed", "store":True}, } createTable("papers", settings, properties) properties={ "scidoc": {"type":"string", "index": "no", "store":True}, "guid": {"type":"string", "index": "not_analyzed", "store":True}, "time_created": {"type":"date"}, "time_modified": {"type":"date"}, } createTable("scidocs", settings, properties) settings={ "number_of_shards" : 1, "number_of_replicas" : 1 } properties={ "data": {"type":"string", "index": "no", "store":True}, "time_created": {"type":"date"}, "time_modified": {"type":"date"}, } createTable("cache", settings, properties) properties={ "link":{"type":"nested"}, ## "guid_from": {"type":"string", "index":"not_analyzed", "store":True}, ## "guid_to": {"type":"string", "index":"not_analyzed", "store":True}, ## "authors_from": {"type":"string", "index":"not_analyzed", "store":True}, ## "authors_to": {"type":"string", "index":"not_analyzed", "store":True}, ## "self_citation": {"type":"boolean", "index":"not_analyzed", "store":True}, ## "year_from": {"type":"integer", "index":"not_analyzed", 
"store":True}, ## "year_to": {"type":"integer", "index":"not_analyzed", "store":True}, ## "numcitations": {"type":"integer", "index":"not_analyzed", "store":True}, "time_created": {"type":"date"}} createTable("links", settings, properties) properties={ ## "author_id": {"type":"string", "index":"not_analyzed", "store":True}, "author": {"type":"nested"}, ## "given": {"type":"string", "index":"analyzed", "store":True}, ## "middle": {"type":"string", "index":"analyzed", "store":True}, ## "family": {"type":"string", "index":"analyzed", "store":True}, ## "papers": {"type":"string", "index":"not_analyzed", "store":True}, ## "papers_first_author": {"type":"string", "index":"not_analyzed", "store":True}, ## "papers_last_author": {"type":"string", "index":"not_analyzed", "store":True}, ## "affiliations": {"type":"nested", "index":"not_analyzed", "store":True}, "time_created": {"type":"date"} } createTable("authors", settings, properties) properties={ "venue": {"type":"nested"}, "time_created": {"type":"date"}, "norm_title": {"type":"string", "index":"not_analyzed"}, } createTable("venues", settings, properties) properties={ "missing": {"type":"nested"}, "time_created": {"type":"date"}, "norm_title": {"type":"string", "index":"not_analyzed"}, } createTable("missing_references", settings, properties) def connectedToDB(self): """ returns True if connected to DB, False otherwise """ return self.es is not None def getRetrievalIndexPath(self, guid, index_filename, full_corpus=False): """ Returns the path to the Lucene index for a test file in the corpus if full_corpus is True, this is the general index for that method else when using Citation Resolution (resolving only from the references at the bottom of a paper) it is the specific index for that file guid """ if full_corpus: return "idx_"+index_filename ## return index_filename else: guid=guid.lower() return "idx_"+guid+"_"+index_filename def getRecord(self, rec_id, table="papers", source=None): """ Abstracts over getting data from a row in the db. Returns all the fields of the record for one type of table, or those specified in source. :param rec_id: id of the record :param table: table alias, e.g. ["papers", "scidocs"] :param source: fields to return """ self.checkConnectedToDB() if table not in index_equivalence: raise ValueError("Unknown record type") try: res=self.es.get( index=index_equivalence[table]["index"], doc_type=index_equivalence[table]["type"], id=rec_id, _source=source ) except: raise ValueError("Not found: %s in index %s" % (rec_id,index_equivalence[table]["index"])) if not res: raise IndexError("Can't find record with id %s" % rec_id) return res["_source"] def setRecord(self, rec_id, body, table="papers", op_type="update"): """ Abstracts over setting getting data for a row in the db. :param rec_id: id of the record :param table: table alias, e.g. 
["papers", "scidocs"] :param body: data to set """ self.checkConnectedToDB() if table not in index_equivalence: raise ValueError("Unknown record type") ##~ try: if op_type == "update": body={"doc":body} self.es.update( index=index_equivalence[table]["index"], doc_type=index_equivalence[table]["type"], id=rec_id, body=body ) elif op_type in ["index", "create"]: self.es.index( index=index_equivalence[table]["index"], doc_type=index_equivalence[table]["type"], op_type=op_type, id=rec_id, body=body ) else: raise ValueError("Unkown op_type %s" % op_type) ## except: ## raise ValueError("Error writing record: %s in index %s : %s" % (rec_id,index_equivalence[table]["index"], str(sys.exc_info[:2]))) return def getRecordField(self, rec_id, table="papers"): """ Abstracts over getting data from a row in the db. Returns one field for one type of table. All other "getter" functions like getMetadataByGUID and loadSciDoc are aliases for this function """ return self.getRecord(rec_id, table,source=index_equivalence[table]["source"])[index_equivalence[table]["source"]] def recordExists(self, rec_id, table="papers"): """ Returns True if the specified record exists in the given table, False otherwise. """ self.checkConnectedToDB() return self.es.exists( id=rec_id, index=index_equivalence[table]["index"], doc_type=index_equivalence[table]["type"], ) def SQLQuery(self, query): """ Runs a SQL Query, returning a dict per result with the fields required. :param query: SQL query :type query: string """ uri="http://%s:%s/_sql/_explain?sql=%s" % (self.endpoint["host"],self.endpoint["port"],query) response = requests.get(uri) dsl_query = json.loads(response.text) if "error" in dsl_query: raise ConnectionError("Error in query: " + str(dsl_query["error"]["root_cause"])) dsl_query["body"]={"query":dsl_query.pop("query")} dsl_query["from_"]=dsl_query.pop("from") dsl_query["_source_include"]=dsl_query["_source"]["includes"] dsl_query["_source_exclude"]=dsl_query["_source"]["excludes"] dsl_query.pop("_source") match=re.search(r"select.+?from[\s\"\']+([\w,]+)", query, flags=re.IGNORECASE) if match: table_name=match.group(1) else: table_name="papers" dsl_query["index"]=index_equivalence[table_name]["index"] dsl_query["doc_type"]=index_equivalence[table_name]["type"] tmp_max=self.max_results ## self.max_results=dsl_query["size"] if "size" in dsl_query: del dsl_query["size"] results=self.unlimitedQuery(**dsl_query) self.max_results=tmp_max results=[r["_source"] for r in results] if len(dsl_query["_source_include"]) == 1: results=[r[dsl_query["_source_include"][0]] for r in results] return results def cachedJsonExists(self, type, guid, params=None): """ True if the cached JSON associated with the given parameters exists """ self.checkConnectedToDB() return self.es.exists( index=ES_INDEX_CACHE, doc_type=ES_TYPE_CACHE, id=self.cachedDataIDString(type, guid, params) ) def saveCachedJson(self, path, data): """ Save anything as JSON :param path: unique ID of resource to load :param data: json-formatted string or any data """ self.checkConnectedToDB() timestamp=datetime.datetime.now() self.es.index( index=ES_INDEX_CACHE, doc_type=ES_TYPE_CACHE, id=path, op_type="index", body={ "data": json.dumps(data), "time_created": timestamp, "time_modified": timestamp, } ) def loadCachedJson(self,path): """ Load precomputed JSON :param path: unique ID of resource to load """ return json.loads(self.getRecordField(path,"cache")) def loadSciDoc(self,guid, ignore_errors=None): """ If a SciDocJSON file exists for guid, it returns it, otherwise None """ 
data=json.loads(self.getRecordField(guid,"scidocs")) return SciDoc(data, ignore_errors=ignore_errors) def saveSciDoc(self,doc): """ Saves the document as JSON in the index """ self.checkConnectedToDB() attempts=0 while attempts < 3: try: timestamp=datetime.datetime.now() self.es.index( index=ES_INDEX_SCIDOCS, doc_type=ES_TYPE_SCIDOC, id=doc["metadata"]["guid"], op_type="index", body={ "scidoc": json.dumps(doc.data), "guid":doc["metadata"]["guid"], "time_created": timestamp, "time_modified": timestamp, } ) break except ConnectionTimeout: attempts+=1 def connectToDB(self, suppress_error=False): """ Connects to database """ self.es = Elasticsearch([self.endpoint], timeout=60) self.es.retry_on_timeout=True def getMetadataByGUID(self,guid): """ Returns a paper's metadata by GUID """ return self.getRecordField(guid, "papers") def getMetadataByField(self,field,value): """ Returns a paper's metadata by any other field """ self.checkConnectedToDB() query=self.filterQuery("%s:\"%s\"" % (field,value)) res=self.es.search( index=ES_INDEX_PAPERS, doc_type=ES_TYPE_PAPER, _source="metadata", size=1, q=query) hits=res["hits"]["hits"] if len(hits) == 0: return None return hits[0]["_source"]["metadata"] def getStatistics(self, guid): """ Easy method to get a paper's statistics """ return self.getRecord(guid, "papers", "statistics")["statistics"] def setStatistics(self, guid, stats): """ Easy method to set a paper's statistics """ return self.setRecord(guid, {"statistics":stats}, "papers", op_type="update") def filterQuery(self, query, table="papers"): """ Adds a global filter to the query so it only matches the selected collection, date, etc. :param query: string """ if table !="papers": raise NotImplementedError if self.query_filter != "": return self.query_filter+" ("+query+")" else: return query def listFieldByField(self,field1,field2,value,table="papers",max_results=100): """ Returns a list: for each paper, field1 if field2==value """ self.checkConnectedToDB() if table not in index_equivalence: raise ValueError("Unknown record type") query=self.filterQuery("%s:\"%s\"" % (field2,value)) hits=self.unlimitedQuery( q=query, index=index_equivalence[table]["index"], doc_type=index_equivalence[table]["type"], _source=field1, ) return [r["_source"][field1] for r in hits] def isNestedQuery(self, query_string): """ Returns True if a nested field is found in the query string, e.g. 
author.name :param query_string: query string :returns: boolean """ query_without_quotes=re.sub(r"[^\\]\".*?[^\\]\"","",query_string) nested_query=re.search(r"[a-zA-Z]\.[a-zA-Z]",query_without_quotes) is not None return nested_query def abstractNestedResults(self, query_string, hits, field=None): """ Selects results from elasticsearch API """ if self.isNestedQuery(field): if field: return [r["_source"]["metadata"][field] for r in hits] else: return [r["_source"] for r in hits] else: if field: if field.startswith("_"): return [r[field] for r in hits] else: return [r["_source"][field] for r in hits] else: return [r["_source"] for r in hits] def listRecords(self, conditions=None, field="guid", max_results=sys.maxint, table="papers"): """ This is the equivalent of a SELECT clause """ self.checkConnectedToDB() es_index=index_equivalence[table]["index"] es_type=index_equivalence[table]["type"] if conditions: query=self.filterQuery(conditions) else: ## query=self.filterQuery(field+":*") query=self.filterQuery("*:*") prev_max_results=self.max_results self.max_results=max_results hits=self.unlimitedQuery( q=query, index=es_index, doc_type=es_type, _source=field, ) self.max_results=prev_max_results return self.abstractNestedResults(query, hits, field) def listPapers(self,conditions=None,field="guid", max_results=sys.maxint): """ Return a list of GUIDs in papers table where [conditions] """ return self.listRecords(conditions, field, max_results, "papers") def runSingleValueQuery(self,query): raise NotImplementedError def addAuthor(self, author): """ Make sure author is in database """ self.checkConnectedToDB() author["author_id"]=self.generateAuthorID self.updateAuthor(author,"create") def mergeAuthorDetails(self, author_record, new_author_data): """ """ def findAffiliation(aff_list, new_aff): """ """ if new_aff.get("name","") in ["",None]: return None for aff in aff_list: if aff.get("name","")==new_aff["name"]: return aff def mergeList(new_list, record_list): """ Adds the missing papers from the new_list to to the record_list """ for paper in new_list: if paper not in record_list: record_list.append(paper) #TODO Fuzzywuzzy this! 
for aff in new_author_data: match=findAffiliation(author_record["affiliation"],aff) if match: mergeList(aff.get("papers",[]), match["papers"]) else: author_record["affiliation"].append(aff) mergeList(new_author_data["papers"], author_record["papers"]) mergeList(new_author_data["papers_first_author"], author_record["papers_first_author"]) mergeList(new_author_data["papers_last_author"], author_record["papers_last_author"]) def updateAuthorsFromPaper(self, metadata): """ Make sure authors are in database :param metadata: a paper's metadata, with an "authors" key """ self.checkConnectedToDB() for index, new_author in enumerate(metadata["authors"]): creating_new_record=False author_record=self.matcher.matchAuthor(new_author) if not author_record: creating_new_record=True author_record=copy.deepcopy(new_author) author_record["author_id"]=self.generateAuthorID() author_record["papers"]=[] author_record["papers_first_author"]=[] author_record["papers_last_author"]=[] author_record["num_papers"]=0 if metadata["guid"] not in author_record["papers"]: author_record["papers"].append(metadata["guid"]) if index==0: author_record["papers_first_author"].append(metadata["guid"]) if index==len(metadata["authors"]): author_record["papers_last_author"].append(metadata["guid"]) author_record["num_papers"]=len(author_record["papers"]) if not creating_new_record: self.mergeAuthorDetails(author_record, new_author) self.updateAuthor(author_record, op_type="create" if creating_new_record else "index") def updateVenuesFromPaper(self, metadata): """ Progressive update of venues """ raise NotImplementedError ## res=self.es.search( ## index=ES_INDEX_VENUES, ## doc_type=ES_TYPE_VENUE, ## _source=field, ## q="guid:*") ## ## return [r["_source"] for r in hits] def updateAuthor(self, author, op_type="index"): """ Updates an existing author in the db :param author: author data :param op_type: one of ["index", "create"] """ self.checkConnectedToDB() timestamp=datetime.datetime.now() body={ "author":author, } author["time_updated"]=timestamp if op_type=="create": body["time_created"]=timestamp self.es.index( index=ES_INDEX_AUTHORS, doc_type=ES_TYPE_AUTHOR, op_type=op_type, id=author["author_id"], body=body ) def addPaper(self, metadata, check_existing=True, has_scidoc=True): """ Add paper metadata to database """ op_type="create" if check_existing else "index" self.updatePaper(metadata, op_type, has_scidoc) if self.AUTO_ADD_AUTHORS: self.updateAuthorsFromPaper(metadata) def updatePaper(self, metadata, op_type="index", has_scidoc=None): """ Updates an existing record in the db :param metadata: metadata of paper :param op_type: one of ["index", "create"] :param has_scidoc: True if SciDoc for this paper exists in scidocs \ index, False otherwise """ self.checkConnectedToDB() timestamp=datetime.datetime.now() body={"guid": metadata["guid"], "metadata": metadata, "norm_title": metadata["norm_title"], "num_in_collection_references": metadata.get("num_in_collection_references",0), "num_resolvable_citations": metadata.get("num_resolvable_citations",0), "num_inlinks": len(metadata.get("inlinks",[])), "time_modified": timestamp, ## "corpus_id": metadata["corpus_id"], ## "filename": metadata["filename"], ## "collection_id": metadata["collection_id"], ## "import_id": metadata["import_id"], ## "title": metadata["title"], ## "surnames": metadata["surnames"], ## "year": metadata["year"], } if has_scidoc is not None: body["has_scidoc"]=has_scidoc if op_type=="create": body["time_created"]=timestamp if op_type=="update": body={"doc":body} 
            try:
                self.es.update(
                    index=ES_INDEX_PAPERS,
                    doc_type=ES_TYPE_PAPER,
                    id=metadata["guid"],
                    body=body
                )
            except TransportError:
                # retry once after a refresh, in case the document is not yet visible
                self.es.indices.refresh(index=ES_INDEX_PAPERS)
                self.es.update(
                    index=ES_INDEX_PAPERS,
                    doc_type=ES_TYPE_PAPER,
                    id=metadata["guid"],
                    body=body
                )
        else:
            self.es.index(
                index=ES_INDEX_PAPERS,
                doc_type=ES_TYPE_PAPER,
                op_type=op_type,
                id=metadata["guid"],
                body=body
            )

    def addLink(self, GUID_from, GUID_to, authors_from, authors_to, year_from, year_to, numcitations):
        """
        Add a link in the citation graph.
        """
        self.checkConnectedToDB()

        self.es.create(
            index=ES_INDEX_LINKS,
            doc_type=ES_TYPE_LINK,
            body={
                "guid_from": GUID_from,
                "guid_to": GUID_to,
                "authors_from": authors_from,
                # Bug fix: the original stored authors_from in this field too.
                "authors_to": authors_to,
                "year_from": year_from,
                "year_to": year_to,
                "numcitations": numcitations,
                "time_created": datetime.datetime.now(),
            })

    def addMissingPaper(self, metadata):
        """
        Inserts known data about a paper with no SciDoc
        """
##        self.addPaper(metadata,check_existing=True,has_scidoc=False)
        raise NotImplementedError

    def createDBindeces(self):
        """
        Call this after importing the metadata into the corpus and before
        matching in-collection references: it should speed up search
        """
        self.checkConnectedToDB()
        for index in ES_ALL_INDECES:
            # Bug fix: the client attribute is `indices` (the original had
            # `indeces`), and optimize() lives on the indices client.
            if self.es.indices.exists(index=index):
                self.es.indices.optimize(index=index)

    def deleteAll(self, record_type):
        """
        WARNING! This function deletes all the records in a given "table" or
        of a given type.

        :param record_type: one of ["papers","links","authors","scidocs","cache"]
        """
        self.checkConnectedToDB()
        if record_type not in index_equivalence:
            raise ValueError("Unknown record type")

        es_table = index_equivalence[record_type]["index"]

        if self.es.indices.exists(index=es_table):
            print("Deleting ALL files in %s" % es_table)
            # ignore 404 and 400
            self.deleteIndex(es_table)
        self.createAndInitializeDatabase()

    def deleteIndex(self, pattern):
        """
        Deletes all indexes matching the pattern.

        Warning! Use only if you know exactly what you are doing!
        """
        self.es.indices.delete(index=pattern, ignore=[400, 404])

    def deleteByQuery(self, record_type, query):
        """
        Delete the entries from a table that match the query.

        :param record_type: one of the tables that exist, e.g. ["papers","links","authors","scidocs","cache"]
        :type record_type: string
        :param query: a query to select documents to delete
        :type query: string
        """
        self.checkConnectedToDB()
        if not self.es.indices.exists(index=index_equivalence[record_type]["index"]):
            return

        es_table = index_equivalence[record_type]["index"]
        es_type = index_equivalence[record_type]["type"]

        to_delete = self.unlimitedQuery(
            index=es_table,
            doc_type=es_type,
            q=query)

        self.bulkDelete([item["_id"] for item in to_delete])

    def bulkDelete(self, id_list, table="papers"):
        """
        Deletes all entries in id_list from the given table that match on id.
        """
        self.checkConnectedToDB()
        if not self.es.indices.exists(index=index_equivalence[table]["index"]):
            return

        es_table = index_equivalence[table]["index"]
        es_type = index_equivalence[table]["type"]

        bulk_commands = []
        for item in id_list:
            bulk_commands.append("{ \"delete\" : { \"_id\" : \"%s\" } }" % item)

        if len(bulk_commands) > 0:
            self.es.bulk(
                body="\n".join(bulk_commands),
                index=es_table,
                doc_type=es_type,
            )

    def unlimitedQuery(self, *args, **kwargs):
        """
        Wraps elasticsearch querying to enable auto scroll for retrieving
        large amounts of results

        It does more or less what elasticsearch.helpers.scan does, only this
        one actually works.
""" scroll_time="20m" size=min(self.max_results,10000) res=self.es.search( *args, size=size, search_type="scan", scroll=scroll_time, **kwargs ) results = res['hits']['hits'] scroll_size = res['hits']['total'] while (scroll_size > 0) and len(results) < self.max_results: try: scroll_id = res['_scroll_id'] rs = self.es.scroll(scroll_id=scroll_id, scroll=scroll_time) res=rs results.extend(rs['hits']['hits']) scroll_size = len(rs['hits']['hits']) except: break return results[:self.max_results] def setCorpusFilter(self, collection_id=None, import_id=None, date=None): """ Sets the filter query to limit all queries to a collection (corpus) or an import date :param collection: identifier of corpus, e.g. "ACL" or "PMC". This is set at import time. :type collection:basestring :param import: identifier of import, e.g. "initial" :type import:basestring :param date: comparison with a date, e.g. ">[date]", "<[date]", :type collection:basestring """ query_items=[] if collection_id: query_items.append("metadata.collection_id:\"%s\"" % collection_id) if import_id: query_items.append("metadata.import_id:\"%s\"" % import_id) if date: query_items.append("time_created:%s" % date) self.query_filter=" AND ".join(query_items)+" AND "
class ElasticSearchClient: """ Class used as a client to the Elasticsearch server. """ def __init__(self, host, port, username, password, indexname): """ Initializes this Elasticsearch Client. :param host: the HTTP address of the Elasticsearch server. :param port: the HTTP port of the Elasticsearch server. :param username: the username for connecting to the index. :param password: the password for connecting to the index. :param indexname: the name of the Elasticsearch index. """ self.indexname = indexname self.client = Elasticsearch(connection_class = SafeRequestsHttpConnection, host = host, port = int(port), http_auth = [username, password]) self.snapshotclient = SnapshotClient(self.client) self.indicesclient = IndicesClient(self.client) def delete_index_and_mappings(self): """ Deletes the index and all its mappings. """ try: self.client.indices.delete(index = self.indexname) except NotFoundError: pass def create_index_and_mappings(self, update_mappings = False): """ Creates or updates the index and its mappings. :param update_mappings: boolean denoting whether the mappings should be created (False) or updated (True). """ if not self.client.indices.exists(self.indexname): self.client.indices.create(index = self.indexname, body = load_file_to_json("properties/indexsettings.json")) mappings = {} if self.indexname in self.client.indices.get_mapping(self.indexname): mappings = self.client.indices.get_mapping(self.indexname)[self.indexname]['mappings'] if update_mappings: self.client.indices.close(self.indexname) if 'files' not in mappings or update_mappings: self.client.indices.put_mapping(index = self.indexname, doc_type = 'files', body = load_file_to_json("properties/filesproperties.json")) if 'projects' not in mappings or update_mappings: self.client.indices.put_mapping(index = self.indexname, doc_type = 'projects', body = load_file_to_json("properties/projectsproperties.json")) if update_mappings: self.client.indices.open(self.indexname) def has_project(self, project_id): """ Checks if the index contains a project. :param project_id: the id of the project to check if it is contained in the index. :returns: True if the index contains the project, or False otherwise. """ return self.client.exists(index = self.indexname, doc_type = 'projects', id = project_id) def has_file(self, file_id): """ Checks if the index contains a file. :param file_id: the id of the file to check if it is contained in the index. :returns: True if the index contains the file, or False otherwise. """ return self.client.exists(index = self.indexname, doc_type = 'files', id = file_id) def create_project(self, project): """ Creates a project in the index. :param project: the data of the project in JSON format. """ self.client.create(index = self.indexname, doc_type = 'projects', id = project['fullname'], body = project) def create_file(self, afile): """ Creates a file in the index. :param afile: the data of the file in JSON format. """ self.client.create(index = self.indexname, doc_type = 'files', id = afile['fullpathname'], parent = afile['project'], body = afile) def update_file(self, afile): """ Updates a file in the index. :param afile: the data of the file in JSON format. """ self.client.update(index = self.indexname, doc_type = 'files', id = afile['fullpathname'], parent = afile['project'], body = {'doc': afile}) def delete_file(self, afile_id): """ Deletes a file from the index. :param afile_id: the id of the file to be deleted. 
""" self.client.delete(index = self.indexname, doc_type = 'files', id = afile_id, routing = '/'.join(afile_id.split('/')[0:2])) def delete_project(self, project_id): """ Deletes a project from the index. Note that this function also deletes all the files of the project. :param project_id: the id of the project to be deleted. """ self.client.delete_by_query(index = self.indexname, doc_type = 'files', body = {"query": { "bool": { "must": { "match_all": {} }, "filter": { "term": { "_routing": project_id } } } } }) self.client.delete(index = self.indexname, doc_type = 'projects', id = project_id) def get_project_fileids_and_shas(self, project_id): """ Returns all the files and their corresponding shas for a project. :param project_id: the id of the project of which the files and the shas are returned. :returns: a dict containing the files of the project as keys and their shas as values. """ sourcefiles = self.client.search(index = self.indexname, doc_type = 'files', body = {"query": { "term" : { "_routing": project_id } } }, routing = project_id, size = 100000000)['hits']['hits'] # Limitation! Each project must have no more than 100000000 files fileidsandshas = {} for afile in sourcefiles: fileidsandshas[afile['_id']] = afile['_source']['sha'] return fileidsandshas def execute_query(self, query, doc_type = 'files'): """ Executes a query on the index. :param query: the body of the query. :param doc_type: the document type to which the query is executed, either 'projects' or 'files'. :returns: the response of the query. """ return self.client.search(index = self.indexname, doc_type = doc_type, body = query) def test_analyzer(self, analyzer, text): """ Tests an analyzer of the index. :param analyzer: the analyzer to be tested. :param text: the text to be analyzed as a test. :returns: the analyzed text. """ result = self.indicesclient.analyze(index = self.indexname, analyzer = analyzer, body = text) return [r['token'] for r in result['tokens']] def backup(self, backupdir): """ Backups the index. :param backupdir: the directory used to backup the index. """ repositoryname = os.path.basename("backup" + self.indexname) try: self.snapshotclient.get_repository(repository = repositoryname) except: self.snapshotclient.create_repository(repository = repositoryname, body = {"type": "fs", "settings": {"location": backupdir + os.sep + self.indexname}}) try: self.snapshotclient.get(repository = repositoryname, snapshot = self.indexname + "snapshot") except: self.snapshotclient.create(repository = repositoryname, snapshot = self.indexname + "snapshot", body = {"indices": self.indexname}, wait_for_completion = True) def delete_backup(self): """ Removes any backups of the index. If there are no backups, this function does nothing. """ repositoryname = os.path.basename("backup" + self.indexname) try: self.snapshotclient.delete(repository = repositoryname, snapshot = self.indexname + "snapshot") except: pass def restore_backup(self): """ Restores a backup of the index. """ repositoryname = os.path.basename("backup" + self.indexname) if not self.client.indices.exists(self.indexname): self.client.indices.create(index = self.indexname, body = load_file_to_json("properties/indexsettings.json")) self.client.indices.close(self.indexname) self.snapshotclient.restore(repository = repositoryname, snapshot = self.indexname + "snapshot", body = {"indices": self.indexname}, wait_for_completion = True) self.client.indices.open(self.indexname) def flush(self): """ Flushes the index. 
""" self.indicesclient.flush(index = self.indexname)
class LearnerAPITestMixin(CsvViewMixin):
    """Manages an elasticsearch index for testing the learner API."""

    filename_slug = 'learners'

    def setUp(self):
        """Creates the index and defines a mapping."""
        super(LearnerAPITestMixin, self).setUp()
        self._es = Elasticsearch([settings.ELASTICSEARCH_LEARNERS_HOST])
        management.call_command('create_elasticsearch_learners_indices')
        self.addCleanup(lambda: management.call_command(
            'delete_elasticsearch_learners_indices'))

    def _create_learner(
            self,
            username,
            course_id,
            name=None,
            email=None,
            enrollment_mode='honor',
            segments=None,
            cohort='Team edX',
            discussion_contributions=0,
            problems_attempted=0,
            problems_completed=0,
            problem_attempts_per_completed=None,
            attempt_ratio_order=0,
            videos_viewed=0,
            enrollment_date='2015-01-28',
            user_id=None,
            language=None,
            location=None,
            year_of_birth=None,
            level_of_education=None,
            gender=None,
            mailing_address=None,
            city=None,
            country=None,
            goals=None,
    ):
        """Create a single learner roster entry in the elasticsearch index."""
        body = {
            'username': username,
            'course_id': course_id,
            'name': name if name is not None else username,
            'email': email if email is not None else '{}@example.com'.format(username),
            'enrollment_mode': enrollment_mode,
            'discussion_contributions': discussion_contributions,
            'problems_attempted': problems_attempted,
            'problems_completed': problems_completed,
            'attempt_ratio_order': attempt_ratio_order,
            'videos_viewed': videos_viewed,
            'enrollment_date': enrollment_date,
            "user_id": user_id,
            "language": language,
            "location": location,
            "year_of_birth": year_of_birth,
            "level_of_education": level_of_education,
            "gender": gender,
            "mailing_address": mailing_address,
            "city": city,
            "country": country,
            "goals": goals,
        }

        # Omit null fields from the index. Otherwise, they would be stored with an
        # explicit null value, and we want to test the case when they're not returned.
        optional_fields = [('segments', segments), ('cohort', cohort),
                           ('problem_attempts_per_completed', problem_attempts_per_completed)]
        for optional_field in optional_fields:
            if optional_field[1]:
                body[optional_field[0]] = optional_field[1]

        self._es.create(index=settings.ELASTICSEARCH_LEARNERS_INDEX,
                        doc_type='roster_entry', body=body)

    def create_learners(self, learners):
        """
        Creates multiple learner roster entries.

        `learners` is a list of dicts, each representing a learner which must
        at least contain the keys 'username' and 'course_id'. Other learner
        fields can be provided as additional keys in the dict - see the
        mapping defined in `setUp`.
        """
        for learner in learners:
            self._create_learner(**learner)
        self._es.indices.refresh(index=settings.ELASTICSEARCH_LEARNERS_INDEX)

    def create_update_index(self, date=None):
        """
        Creates an index with the date of when the learner index was updated.
        """
        self._es.create(index=settings.ELASTICSEARCH_LEARNERS_UPDATE_INDEX, doc_type='marker', body={
            'date': date,
            'target_index': settings.ELASTICSEARCH_LEARNERS_INDEX,
        })
        self._es.indices.refresh(
            index=settings.ELASTICSEARCH_LEARNERS_UPDATE_INDEX)

    def expected_page_url(self, course_id, page, page_size):
        """
        Returns a paginated URL for the given parameters.

        As with PageNumberPagination, if page=1, it's omitted from the query string.
""" if page is None: return None course_q = urlencode({'course_id': course_id}) page_q = '&page={}'.format(page) if page and page > 1 else '' page_size_q = '&page_size={}'.format( page_size) if page_size > 0 else '' return 'http://testserver/api/v0/learners/?{course_q}{page_q}{page_size_q}'.format( course_q=course_q, page_q=page_q, page_size_q=page_size_q, )
class Connector:
    def __init__(self, prEndpoint=None, esEndpoint=None, dmonPort=5001, MInstancePort=9200,
                 index="logstash-*", prKafkaEndpoint=None, prKafkaPort=9092, prKafkaTopic='edetopic'):
        if esEndpoint is None:
            self.esInstance = None
        else:
            self.esInstance = Elasticsearch(esEndpoint)
            self.esEndpoint = esEndpoint
            self.dmonPort = dmonPort
            self.esInstanceEndpoint = MInstancePort
            self.myIndex = index
            logger.info('[{}] : [INFO] EDE ES backend Defined at: {} with port {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), esEndpoint, MInstancePort))
        if prEndpoint is not None:
            self.prEndpoint = prEndpoint
            self.MInstancePort = MInstancePort
            logger.info('[{}] : [INFO] EDE PR backend Defined at: {} with port {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), prEndpoint, MInstancePort))
        self.dataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
        if prKafkaEndpoint is None:
            self.producer = None
            logger.warning('[{}] : [WARN] EDE Kafka reporter not set'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        else:
            self.prKafkaTopic = prKafkaTopic
            try:
                self.producer = KafkaProducer(
                    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                    bootstrap_servers=["{}:{}".format(prKafkaEndpoint, prKafkaPort)],
                    retries=5)
                logger.info('[{}] : [INFO] EDE Kafka reporter initialized to server {}:{}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), prKafkaEndpoint, prKafkaPort))
            except Exception as inst:
                logger.error('[{}] : [ERROR] EDE Kafka reporter failed with {} and {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
                self.producer = None

    def pr_health_check(self):
        pr_target_health = '/-/healthy'
        pr_target_ready = '/-/ready'
        try:
            resp_h = requests.get("http://{}:{}{}".format(self.prEndpoint, self.MInstancePort, pr_target_health))
            resp_r = requests.get("http://{}:{}{}".format(self.prEndpoint, self.MInstancePort, pr_target_ready))
        except Exception as inst:
            logger.error('[{}] : [ERROR] Exception has occurred while connecting to PR endpoint with type {} at arguments {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            sys.exit(2)
        if resp_h.status_code != 200:
            logger.error('[{}] : [ERROR] PR endpoint health is bad, exiting'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            sys.exit(2)
        if resp_r.status_code != 200:
            logger.error('[{}] : [ERROR] PR endpoint not ready to serve traffic'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            sys.exit(2)
        logger.info('[{}] : [INFO] PR endpoint healthcheck pass'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        return resp_h.status_code, resp_r.status_code

    def pr_status(self, status_type=None):
        """
        Get status of prometheus

        TODO: check runtimeinfo and flags
        :param status_type: supported status types
        :return:
        """
        # Bug fix: the parameter was originally named `type`, which shadowed the
        # builtin and made the `type(inst)` call in the except block crash.
        supported = ['runtimeinfo', 'config', 'flags']
        if status_type is None:
            pr_target_string = '/api/v1/status/config'
        elif status_type in supported:
            pr_target_string = '/api/v1/status/{}'.format(status_type)
        else:
            logger.error('[{}] : [ERROR] unsupported status type {}, supported types are {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), status_type, supported))
            sys.exit(1)
        try:
            resp = requests.get("http://{}:{}{}".format(self.prEndpoint, self.MInstancePort, pr_target_string))
        except Exception as inst:
            logger.error('[{}] : [ERROR] Exception has occurred while connecting to PR endpoint with type {} at arguments {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            sys.exit(2)
        return resp.json()

    def pr_targets(self):
        """
        Get Monitored Target Info

        :return: Targets Dict
        """
        pr_target_string = '/api/v1/targets'
        try:
            resp = requests.get("http://{}:{}{}".format(self.prEndpoint, self.MInstancePort, pr_target_string))
        except Exception as inst:
            logger.error('[{}] : [ERROR] Exception has occurred while connecting to PR endpoint with type {} at arguments {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            sys.exit(2)
        return resp.json()

    def pr_labels(self, label=None):
        if label is None:
            pr_target_string = '/api/v1/labels'
        else:
            pr_target_string = '/api/v1/label/{}/values'.format(label)
        try:
            resp = requests.get("http://{}:{}{}".format(self.prEndpoint, self.MInstancePort, pr_target_string))
        except Exception as inst:
            logger.error('[{}] : [ERROR] Exception has occurred while connecting to PR endpoint with type {} at arguments {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            sys.exit(2)
        return resp.json()

    def pr_query(self, query):
        """
        Query Monitoring Data From PR backend

        :param query: Query string for PR backend
        :return: Monitoring Data
        """
        try:
            url = '/api/v1/query'
            resp = requests.get('http://{}:{}{}'.format(self.prEndpoint, self.MInstancePort, url), params=query)
        except Exception as inst:
            logger.error('[{}] : [ERROR] Exception has occurred while connecting to PR endpoint with type {} at arguments {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            sys.exit(2)
        return resp.json()

    def query(self, queryBody, allm=True, dMetrics=[], debug=False):
        # self.__check_valid_es()
        res = self.esInstance.search(index=self.myIndex, body=queryBody, request_timeout=230)
        if debug:
            print("%---------------------------------------------------------%")
            print("Raw JSON Output")
            print(res)
            print(("%d documents found" % res['hits']['total']))
            print("%---------------------------------------------------------%")
        termsList = []
        termValues = []
        ListMetrics = []
        for doc in res['hits']['hits']:
            if not allm:
                if not dMetrics:
                    sys.exit("dMetrics argument not set. Please supply valid list of metrics!")
                for met in dMetrics:
                    # prints the values of the metrics defined in the metrics list
                    if debug:
                        print("%---------------------------------------------------------%")
                        print("Parsed Output -> ES doc id, metrics, metrics values.")
                        print(("doc id %s) metric %s -> value %s" % (doc['_id'], met, doc['_source'][met])))
                        print("%---------------------------------------------------------%")
                    termsList.append(met)
                    termValues.append(doc['_source'][met])
                dictValues = dict(list(zip(termsList, termValues)))
            else:
                for terms in doc['_source']:
                    # prints the values of the metrics defined in the metrics list
                    if debug:
                        print("%---------------------------------------------------------%")
                        print("Parsed Output -> ES doc id, metrics, metrics values.")
) print(("doc id %s) metric %s -> value %s" % (doc['_id'], terms, doc['_source'][terms]))) print( "%---------------------------------------------------------%" ) termsList.append(terms) termValues.append(doc['_source'][terms]) dictValues = dict(list(zip(termsList, termValues))) ListMetrics.append(dictValues) return ListMetrics, res def info(self): # self.__check_valid_es() try: res = self.esInstance.info() except Exception as inst: logger.error( '[%s] : [ERROR] Exception has occured while connecting to ES dmon with type %s at arguments %s', datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args) sys.exit(2) return res def roles(self): # self.__check_valid_es() nUrl = "http://%s:%s/dmon/v1/overlord/nodes/roles" % (self.esEndpoint, self.dmonPort) logger.info( '[%s] : [INFO] dmon get roles url -> %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl) try: rRoles = requests.get(nUrl) except Exception as inst: logger.error( '[%s] : [ERROR] Exception has occured while connecting to dmon with type %s at arguments %s', datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args) sys.exit(2) rData = rRoles.json() return rData def createIndex(self, indexName): # self.__check_valid_es() try: self.esInstance.create(index=indexName, ignore=400) logger.info( '[%s] : [INFO] Created index %s', datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName) except Exception as inst: logger.error( '[%s] : [ERROR] Failed to created index %s with %s and %s', datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst), inst.args) def closeIndex(self, indexName): try: self.esInstance.close(index=indexName) logger.info( '[%s] : [INFO] Closed index %s', datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName) except Exception as inst: logger.error( '[%s] : [ERROR] Failed to close index %s with %s and %s', datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst), inst.args) def deleteIndex(self, indexName): try: res = self.esInstance.indices.delete(index=indexName, ignore=[400, 404]) logger.info( '[%s] : [INFO] Deleted index %s', datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName) except Exception as inst: logger.error( '[%s] : [ERROR] Failed to delete index %s with %s and %s', datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst), inst.args) return 0 return res def openIndex(self, indexName): res = self.esInstance.indices.open(index=indexName) logger.info( '[%s] : [INFO] Open index %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName) return res def getIndex(self, indexName): res = self.esInstance.indices.get(index=indexName, human=True) return res def getIndexSettings(self, indexName): res = self.esInstance.indices.get_settings(index=indexName, human=True) return res def clusterHealth(self): res = self.esInstance.cluster.health(request_timeout=15) return res def clusterSettings(self): res = self.esInstance.cluster.get_settings(request_timeout=15) return res def clusterState(self): res = self.esInstance.cluster.stats(human=True, request_timeout=15) return res def nodeInfo(self): res = self.esInstance.nodes.info(request_timeout=15) return res def nodeState(self): res = self.esInstance.nodes.stats(request_timeout=15) return res def getStormTopology(self): nUrl = "http://%s:%s/dmon/v1/overlord/detect/storm" % (self.esEndpoint, self.dmonPort) 
        logger.info('[%s] : [INFO] dmon get storm topology url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rStormTopology = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rData = rStormTopology.json()
        return rData

    def pushAnomalyES(self, anomalyIndex, doc_type, body):
        try:
            res = self.esInstance.index(index=anomalyIndex, doc_type=doc_type, body=body)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while pushing anomaly with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(2)
        return res

    def pushAnomalyKafka(self, body):
        if self.producer is None:
            logger.warning('[{}] : [WARN] Kafka reporter not defined, skipping reporting'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        else:
            try:
                self.producer.send(self.prKafkaTopic, body)
                # self.producer.flush()
                logger.info('[{}] : [INFO] Anomalies reported to kafka topic {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), self.prKafkaTopic))
            except Exception as inst:
                logger.error('[{}] : [ERROR] Failed to report anomalies to kafka topic {} with {} and {}'.format(
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), self.prKafkaTopic, type(inst), inst.args))
        return 0

    def getModel(self):
        return "getModel"

    def pushModel(self):
        return "push model"

    def localData(self, data):
        data_loc = os.path.join(self.dataDir, data)
        try:
            df = pd.read_csv(data_loc)
        except Exception as inst:
            logger.error('[{}] : [ERROR] Cannot load local data with {} and {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args))
            sys.exit(2)
        logger.info('[{}] : [INFO] Loading local data from {} with shape {}'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), data_loc, df.shape))
        return df

    def getInterval(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/aux/interval" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get interval url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rInterval = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(2)
        rData = rInterval.json()
        return rData

    def aggQuery(self, queryBody):
        # Set timeout via the env variable ADP_TIMEOUT; if not set, use default 60
        adt_timeout = os.environ['ADP_TIMEOUT'] = os.getenv('ADP_TIMEOUT', str(60))
        # print "QueryString -> {}".format(queryBody)
        try:
            res = self.esInstance.search(index=self.myIndex, body=queryBody, request_timeout=float(adt_timeout))
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception while executing ES query with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(2)
        return res

    def getNodeList(self):
        '''
        :return: -> returns the list of registered nodes from dmon
        '''
        nUrl = "http://%s:%s/dmon/v1/observer/nodes" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get node url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rdmonNode = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(2)
        rdata = rdmonNode.json()
        nodes = []
        for e in rdata['Nodes']:
            for k in e:
                nodes.append(k)
        return nodes

    def getDmonStatus(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/core/status" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get core status url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rdmonStatus = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(2)
        return rdmonStatus.json()
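
# --- Usage sketch (illustrative, not from the original module). Endpoints and
# the Prometheus expression are placeholders; pr_query() forwards its argument
# as HTTP query parameters to the /api/v1/query endpoint shown above.
conn = Connector(esEndpoint='localhost', prEndpoint='localhost', MInstancePort=9090)
conn.pr_health_check()
up_metrics = conn.pr_query({'query': 'up'})
targets = conn.pr_targets()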
class Tags(object): config_name = 'dossier.tags' @classmethod def configured(cls): return cls(**yakonfig.get_global_config('dossier.tags')) def __init__(self, hosts=None, namespace=None, type_prefix='', shards=10, replicas=0, tag_delimiter='/'): if hosts is None: raise yakonfig.ProgrammerError( 'Tags needs at least one host specified.') if namespace is None: raise yakonfig.ProgrammerError('Tags needs a namespace defined.') self.conn = Elasticsearch(hosts=hosts, timeout=60, request_timeout=60) self.index = 'tags_%s' % namespace self.type_tag = '%stag' % type_prefix self.type_assoc = '%sassociation' % type_prefix self.shards = shards self.replicas = replicas self.delim = tag_delimiter created1 = self._create_index() created2 = self._create_mappings() if created1 or created2: # It is possible to create an index and quickly launch a request # that will fail because the index hasn't been set up yet. Usually, # you'll get a "no active shards available" error. # # Since index creation is a very rare operation (it only happens # when the index doesn't already exist), we sit and wait for the # cluster to become healthy. self.conn.cluster.health(index=self.index, wait_for_status='yellow') def add(self, assoc): self._validate_association(assoc) tag = self._normalize_tag(assoc['tag']) if len(tag) == 0: return self.conn.create(index=self.index, doc_type=self.type_assoc, body=assoc) # Start with creating the full tag and continue creating parent tags # until one exists or until we hit root. This lets us save some # round trips in the common case (the tag is already created). parts = tag.split(self.delim) while len(parts) > 0: tag = self.delim.join(parts) doc_tag = { 'tag': tag, 'parent': self.delim.join(parts[:-1]), 'name': parts[-1], } try: self.conn.create(index=self.index, doc_type=self.type_tag, id=tag, body=doc_tag) except ConflictError as e: # Yay for brittle substring search for error detection! if 'DocumentAlreadyExistsException' in e.error: break raise parts = parts[:-1] def list(self, parent_tag): parent_tag = self._normalize_tag(parent_tag) return self._term_query(self.type_tag, 'parent', parent_tag) def suggest(self, parent, prefix, limit=100): if prefix == '': # No sense in issuing a request when we already know the answer. return [] body = { 'tag': { 'text': prefix, 'completion': { 'field': 'name.suggest', 'size': limit, 'context': { 'parent': parent, }, }, }, } hits = self.conn.suggest(index=self.index, body=body) if 'tag' not in hits: return [] return map(lambda hit: hit['text'], hits['tag'][0]['options']) def assocs_by_tag(self, tag): tag = self._normalize_tag(tag) return self._term_query(self.type_assoc, 'tag', tag) def assocs_by_url(self, url): return self._term_query(self.type_assoc, 'url', url) def assocs_by_stream_id(self, stream_id): return self._term_query(self.type_assoc, 'stream_id', stream_id) def sync(self): '''Tells ES to tell Lucene to do an fsync. This guarantees that any previous calls to ``add`` will be flushed to disk and available in subsequent searches. Generally, this should only be used in test code. ''' self.conn.indices.refresh(index=self.index) def delete_all(self): '''Deletes all tag data. This does not destroy the ES index, but instead only deletes all tags with the configured doc types. 
        '''
        try:
            self.conn.indices.delete_mapping(index=self.index,
                                             doc_type=self.type_tag)
        except TransportError:
            # Bug fix: the message placeholders are "type %r in index %r", so the
            # type must come first; the original passed the arguments swapped.
            logger.warn('type %r in index %r already deleted',
                        self.type_tag, self.index, exc_info=True)

        try:
            self.conn.indices.delete_mapping(index=self.index,
                                             doc_type=self.type_assoc)
        except TransportError:
            logger.warn('type %r in index %r already deleted',
                        self.type_assoc, self.index, exc_info=True)

    def _create_index(self):
        'Create the index'
        # This can race, but that should be OK.
        # Worst case, we initialize with the same settings more than
        # once.
        if self.conn.indices.exists(index=self.index):
            return False
        try:
            self.conn.indices.create(
                index=self.index, timeout=60, request_timeout=60,
                body={
                    'settings': {
                        'number_of_shards': self.shards,
                        'number_of_replicas': self.replicas,
                    },
                })
        except TransportError:
            # Hope that this is an "index already exists" error...
            logger.warn('index already exists? OK', exc_info=True)
        return True

    def _create_mappings(self):
        'Create the field type mapping.'
        created1 = self._create_tag_mapping()
        created2 = self._create_assoc_mapping()
        return created1 or created2

    def _create_tag_mapping(self):
        mapping = self.conn.indices.get_mapping(index=self.index,
                                                doc_type=self.type_tag)
        if len(mapping) > 0:
            return False
        self.conn.indices.put_mapping(
            index=self.index, doc_type=self.type_tag,
            timeout=60, request_timeout=60,
            body={
                self.type_tag: {
                    'dynamic': False,
                    'properties': {
                        'parent': {
                            'type': 'string',
                            'index': 'not_analyzed',
                        },
                        'name': {
                            'type': 'string',
                            'index': 'not_analyzed',
                            'fields': {
                                'suggest': {
                                    'type': 'completion',
                                    'index_analyzer': 'simple',
                                    'search_analyzer': 'simple',
                                    'payloads': False,
                                    'preserve_separators': True,
                                    'preserve_position_increments': True,
                                    'max_input_length': 256,
                                    'context': {
                                        'parent': {
                                            'type': 'category',
                                            'path': 'parent',
                                        },
                                    },
                                },
                            },
                        },
                        'tag': {
                            'type': 'string',
                            'index': 'not_analyzed',
                        },
                    },
                },
            })
        return True

    def _create_assoc_mapping(self):
        mapping = self.conn.indices.get_mapping(index=self.index,
                                                doc_type=self.type_assoc)
        if len(mapping) > 0:
            return False
        self.conn.indices.put_mapping(
            index=self.index, doc_type=self.type_assoc,
            timeout=60, request_timeout=60,
            body={
                self.type_assoc: {
                    'dynamic': False,
                    'properties': {
                        'url': {'type': 'string', 'index': 'not_analyzed'},
                        'text': {'type': 'string', 'index': 'analyzed'},
                        'tag': {'type': 'string', 'index': 'not_analyzed'},
                        'stream_id': {'type': 'string', 'index': 'not_analyzed'},
                        'hash': {'type': 'string', 'index': 'not_analyzed'},
                        'timestamp': {'type': 'integer', 'index': 'not_analyzed'},
                        'xpath': {
                            'type': 'object',
                            'dynamic': False,
                            'properties': {
                                'start_node': {'type': 'string', 'index': 'no'},
                                'start_idx': {'type': 'integer', 'index': 'no'},
                                'end_node': {'type': 'string', 'index': 'no'},
                                'end_idx': {'type': 'integer', 'index': 'no'},
                            },
                        },
                    },
                },
            })
        return True

    def _validate_association(self, assoc):
        def check_field(d, (name, ty), prefix=''):
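
# --- Usage sketch (illustrative, not from the original module). Hosts and
# namespace are placeholders. Note that _validate_association() is truncated
# above, so the exact required association fields are not fully shown; the
# fields below follow the association mapping in _create_assoc_mapping().
tags = Tags(hosts=['localhost:9200'], namespace='demo')
tags.add({
    'tag': 'sports/football',
    'url': 'http://example.com/article',
    'stream_id': 'stream-1',
    'hash': 'abc123',
    'text': 'example snippet',
    'timestamp': 0,
})
tags.sync()                      # force a refresh so the queries below see the add
print(tags.assocs_by_tag('sports/football'))
print(tags.list('sports'))       # child tags of 'sports'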
print("Done with delete!") es.indices.create("books", {}) print("starting book population...") id = 0 for k in range(0, df.shape[0]): # parse book data book = df.iloc[k, :] author_arr = book.authors.split(", ") bookInfo = { "title": book.title, "authors": author_arr, 'pages': book.pages, 'isbn13': str(book['isbn13']), 'quantity': 2 } # create new book entry q = random.randint(1, 5) es.create("books", id, { "title": bookInfo['title'], "authors": bookInfo['authors'], "pages": bookInfo['pages'], "isbn": bookInfo['isbn13'], "quantity": q }, doc_type="_doc") id = id + 1 print("done")
class ElasticsearchIndex():
    """ Manager for the Elasticsearch index """

    def __init__(self, url=None):
        # establish the connection
        if url:
            print "[WARNING] ignoring config %s, using url %s" % (settings.SEARCH_INDEX['url'], url)
            self.es = Elasticsearch(url)
        else:
            self.es = Elasticsearch(settings.SEARCH_INDEX['url'])

        es_conf = {
            "settings": {
                "analysis": {
                    "filter": {
                        "english_stop": {"type": "stop", "stopwords": "_english_"},
                        "light_english_stemmer": {"type": "stemmer", "language": "light_english"},
                        "english_possessive_stemmer": {"type": "stemmer", "language": "english"},
                        "light_spanish_stemmer": {"type": "stemmer", "language": "light_spanish"},
                        "spanish_possessive_stemmer": {"type": "stemmer", "language": "spanish"}
                    },
                    "analyzer": {
                        "case_insensitive_sort": {
                            "tokenizer": "keyword",
                            "filter": ["lowercase"]
                        },
                        "english": {
                            "tokenizer": "standard",
                            "filter": [
                                "english_possessive_stemmer",
                                "lowercase",
                                "english_stop",
                                "light_english_stemmer",
                                "asciifolding"
                            ]
                        },
                        "spanish": {
                            "tokenizer": "standard",
                            "filter": [
                                "spanish_possessive_stemmer",
                                "lowercase",
                                "light_spanish_stemmer",
                            ]
                        }
                    }
                }
            }
        }

        # create the index if it does not exist
        # (ignore error 400, "index already exists")
        indices = self.es.indices.create(index=settings.SEARCH_INDEX['index'], body=es_conf, ignore=400)

        # first time the index is pushed
        try:
            if indices['acknowledged']:
                for doc_type in ["ds", "dt", "vz"]:
                    self.es.indices.put_mapping(index=settings.SEARCH_INDEX['index'],
                                                doc_type=doc_type,
                                                body=self.__get_mapping(doc_type))
                for finder in DatalPluginPoint.get_active_with_att('finder'):
                    self.es.indices.put_mapping(index=settings.SEARCH_INDEX['index'],
                                                doc_type=finder.doc_type,
                                                body=self.__get_mapping(finder.doc_type))
        # the index already exists
        except KeyError:
            pass

        self.logger = logging.getLogger(__name__)

    def __get_mapping(self, doc_type):
        if doc_type == "ds":
            return self.__get_datastream_mapping()
        elif doc_type == "dt":
            return self.__get_dataset_mapping()
        elif doc_type == "vz":
            return self.__get_visualization_mapping()

        for finder in DatalPluginPoint.get_active_with_att('finder'):
            if finder.doc_type == doc_type:
                return finder.get_mapping()

    def __get_datastream_mapping(self):
        return {"ds": {
            "properties": {
                "categories": {
                    "properties": {
                        "id": {"type": "string"},
                        "name": {"type": "string", "index": "not_analyzed"}
                    }
                },  # categories
                "meta_text": {
                    "properties": {
                        "field_name": {"type": "string"},
                        "field_value": {"type": "string"}
                    }
                },  # meta_text
                "docid": {"type": "string"},
                "fields": {
                    "properties": {
                        "account_id": {"type": "long"},
                        "datastream__revision_id": {"type": "long"},
                        "datastream_id": {"type": "long"},
                        "resource_id": {"type": "long"},
                        "revision_id": {"type": "long"},
                        "description": {"type": "string"},
                        "end_point": {"type": "string"},
                        "owner_nick": {"type": "string"},
                        "parameters": {"type": "string"},
                        "tags": {"type": "string"},
                        "text": {
                            "type": "string",
                            "fields": {
                                "text_lower_sort": {"type": "string", "analyzer": "case_insensitive_sort"},
                                "text_english_stemmer": {"type": "string", "analyzer": "english"},
                                "text_spanish_stemmer": {"type": "string", "analyzer": "spanish"}
                            },
                            "properties": {
                                "text_english": {"type": "string", "analyzer": "english"},
                                "text_spanish": {"type": "string", "analyzer": "spanish"}
                            },
                        },
                        "created_at": {"type": "long"},
                        "timestamp": {"type": "long"},
                        "hits": {"type": "integer"},
                        "web_hits": {"type": "integer"},
                        "api_hits": {"type": "integer"},
                        "title": {"type": "string", "fields": {"title_lower_sort": {"type": "string",
"analyzer": "case_insensitive_sort"}} }, "type" : { "type" : "string" } } } # fields } } } def __get_dataset_mapping(self): return {"dt" : { "properties" : { "categories" : { "properties" : { "id" : { "type" : "string" }, "name" : { "type" : "string", "index" : "not_analyzed" } } }, # categories "meta_text" : { "properties" : { "field_name" : { "type" : "string" }, "field_value" : { "type" : "string"} } }, # meta_text "docid" : { "type" : "string" }, "fields" : { "properties" : { "account_id" : { "type" : "long" }, "datasetrevision_id" : { "type" : "long" }, "dataset_id" : { "type" : "long" }, "resource_id" : { "type" : "long" }, "revision_id" : { "type" : "long" }, "description" : { "type" : "string" }, "end_point" : { "type" : "string" }, "owner_nick" : { "type" : "string" }, "parameters" : { "type" : "string" }, "tags" : { "type" : "string" }, "text" : { "type" : "string", "fields": { "text_lower_sort": {"type":"string", "analyzer": "case_insensitive_sort"}, "text_english_stemmer": {"type":"string", "analyzer": "english"}, "text_spanish_stemmer": {"type":"string", "analyzer": "spanish"} }, "properties": { "text_english": {"type":"string", "analyzer": "english"}, "text_spanish": {"type":"string", "analyzer": "spanish"} }, }, "created_at" : { "type" : "long" }, "timestamp" : { "type" : "long" }, "hits" : { "type" : "integer" }, "web_hits" : { "type" : "integer" }, "api_hits" : { "type" : "integer" }, "title" : { "type" : "string" , "fields": {"title_lower_sort": {"type":"string", "analyzer": "case_insensitive_sort"}} }, "type" : { "type" : "string" } } } # fields } } } def __get_visualization_mapping(self): return {"vz" : { "properties" : { "categories" : { "properties" : { "id" : { "type" : "string" }, "name" : { "type" : "string", "index" : "not_analyzed" } } }, # categories "meta_text" : { "properties" : { "field_name" : { "type" : "string" }, "field_value" : { "type" : "string"} } }, # meta_text "docid" : { "type" : "string" }, "fields" : { "properties" : { "account_id" : { "type" : "long" }, "resource_id" : { "type" : "long" }, "revision_id" : { "type" : "long" }, "visualization_revision_id" : { "type" : "long" }, "visualization_id" : { "type" : "long" }, "description" : { "type" : "string" }, "end_point" : { "type" : "string" }, "owner_nick" : { "type" : "string" }, "parameters" : { "type" : "string" }, "tags" : { "type" : "string" }, "text" : { "type" : "string", "fields": { "text_lower_sort": {"type":"string", "analyzer": "case_insensitive_sort"}, "text_english_stemmer": {"type":"string", "analyzer": "english"}, "text_spanish_stemmer": {"type":"string", "analyzer": "spanish"} }, "properties": { "text_english": {"type":"string", "analyzer": "english"}, "text_spanish": {"type":"string", "analyzer": "spanish"} }, }, "hits" : { "type" : "integer" }, "web_hits" : { "type" : "integer" }, "api_hits" : { "type" : "integer" }, "created_at" : { "type" : "long" }, "timestamp" : { "type" : "long" }, "title" : { "type" : "string" , "fields": {"title_lower_sort": {"type":"string", "analyzer": "case_insensitive_sort"}} }, "type" : { "type" : "string" } } } # fields } } } def indexit(self, document): """add document to index :param document: """ if document: # self.logger.info('Elasticsearch: Agregar al index %s' % str(document)) try: return self.es.create( index=settings.SEARCH_INDEX['index'], body=document, doc_type=document['fields']['type'], id=document['docid']) except: return self.es.index( index=settings.SEARCH_INDEX['index'], body=document, doc_type=document['fields']['type'], 
                    id=document['docid'])
        return False

    def count(self, doc_type=None):
        """Return the number of documents in the index; doc_type (optional)
        filters by document type"""
        if doc_type:
            return self.es.count(index=settings.SEARCH_INDEX['index'], doc_type=doc_type)['count']
        else:
            return self.es.count(index=settings.SEARCH_INDEX['index'])['count']

    def delete_document(self, document):
        """delete by ID"""
        try:
            output = self.es.delete(index=settings.SEARCH_INDEX['index'],
                                    id=document['docid'],
                                    doc_type=document['type'])
            return output
        except NotFoundError:
            self.logger.error("ERROR NotFound: ID %s not found in index" % document['docid'])
            return {u'found': False, u'document': document, u'index': settings.SEARCH_INDEX['index']}
        except KeyError:
            self.logger.error("ERROR KeyError: Document error (doc: %s)" % str(document))
        except TypeError:
            self.logger.error("ERROR TypeError: Document error (doc: %s)" % str(document))
        return False

    def __filterDeleted(self, item):
        return item['found']

    def __filterNotDeleted(self, item):
        return not item['found']

    def flush_index(self):
        return self.es.indices.delete(index=settings.SEARCH_INDEX['index'], ignore=[400, 404])

    def delete_documents(self, documents):
        """Delete from a list. Return [list(deleted), list(notdeleted)]

        :param documents:
        """
        result = map(self.delete_document, documents)
        documents_deleted = filter(self.__filterDeleted, result)
        documents_not_deleted = filter(self.__filterNotDeleted, result)
        return [documents_deleted, documents_not_deleted]

    def search(self, doc_type, query, fields="*"):
        """Search by query

        :param doc_type:
        :param query:
        :param fields:
        """
        try:
            return self.es.search(index=settings.SEARCH_INDEX['index'],
                                  doc_type=doc_type,
                                  body=query,
                                  _source_include=fields)
        # re-raise unchanged; this fixes the original's invalid
        # `raise NotFoundError,(e)` syntax
        except (RequestError, NotFoundError):
            raise
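
# --- Usage sketch (illustrative, not from the original module). The document
# shape follows what indexit() and the "ds" mapping above expect; all values
# are placeholders.
index = ElasticsearchIndex()
doc = {
    "docid": "REVID-1",
    "fields": {"type": "ds", "title": "Example datastream", "text": "example"},
    "categories": {"id": "1", "name": "Examples"},
}
index.indexit(doc)
print index.count(doc_type="ds")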
class Worker(Process):
    def __init__(self, work_queue):
        super(Worker, self).__init__()
        self.api_client = APIClient('http://%s:9200' % es_hosts[random.randint(0, len(es_hosts) - 1)].get('host'))
        self.work_queue = work_queue
        self.es = Elasticsearch(es_hosts)
        self.sentence_list = loremipsum.get_sentences(1000)
        self.re_first_word = re.compile('([A-z]+)')

    def run(self):
        print 'Starting %s ' % self.name
        counter = 0
        batch = []
        while True:
            index_batch_size = args.get('batch_size')
            task = self.work_queue.get(timeout=600)
            counter += 1
            document = self.generate_document(task['field_count'])
            flattened_doc = self.process_document(document, task['type'], task['uuid'], task['uuid'])
            index_type_tuple = (task['index'], task['type'])
            # self.handle_document(task['index'], task['type'], task['uuid'], flattened_doc)
            batch.append((index_type_tuple, flattened_doc))
            if len(batch) >= index_batch_size:
                self.handle_batch(batch)
                batch = []
            self.work_queue.task_done()

    def generate_document(self, fields):
        doc = {}
        my_bool = True
        for i in xrange(fields):
            sentence_index = random.randint(0, max((fields / 2) - 1, 1))
            sentence = self.sentence_list[sentence_index]
            if random.random() >= .5:
                key = self.re_first_word.findall(sentence)[1]
            else:
                key = self.re_first_word.findall(sentence)[1] + str(i)
            field_type = random.random()
            if field_type <= 0.3:
                doc[key] = sentence
            elif field_type <= 0.5:
                doc[key] = random.randint(1, 1000000)
            elif field_type <= 0.6:
                doc[key] = random.random() * 1000000000
            # Bug fix: the original used `==` for the two branches below, which a
            # float from random.random() virtually never satisfies, so booleans
            # and nested documents were effectively never generated.
            elif field_type <= 0.7:
                doc[key] = my_bool
                my_bool = not my_bool
            elif field_type <= 0.8:
                doc[key] = self.generate_document(max(fields / 5, 1))
            elif field_type <= 1.0:
                doc['mylocation'] = self.generate_location()
        return doc

    @staticmethod
    def get_fields(document, base_name=None):
        fields = []
        for name, value in document.iteritems():
            if base_name:
                field_name = '%s.%s' % (base_name, name)
            else:
                field_name = name
            if isinstance(value, dict):
                fields += Worker.get_fields(value, field_name)
            else:
                value_name = None
                if isinstance(value, basestring):
                    value_name = 'string'
                elif isinstance(value, bool):
                    value_name = 'boolean'
                elif isinstance(value, (int, long)):
                    value_name = 'long'
                elif isinstance(value, float):
                    value_name = 'double'
                if value_name:
                    field = {
                        'name': field_name,
                        value_name: value
                    }
                else:
                    field = {
                        'name': field_name,
                        'string': str(value)
                    }
                fields.append(field)
        return fields

    @staticmethod
    def process_document(document, doc_type, application_id, uuid):
        response = {
            'entityId': uuid,
            'entityVersion': '1',
            'entityType': doc_type,
            'applicationId': application_id,
            'fields': Worker.get_fields(document)
        }
        return response

    def handle_document(self, index, doc_type, uuid, document):
        res = self.es.create(index=index, doc_type=doc_type, id=uuid, body=document)
        print res

    def generate_location(self):
        response = {}
        lat = random.random() * 90.0
        lon = random.random() * 180.0
        # Likely intent: flip each sign with 50% probability. The original
        # compared `lon > .5` / `lat > .5`, which made the latitude sign almost
        # always negative (lon ranges over 0..180).
        lat_neg_true = random.random() > .5
        lon_neg_true = random.random() > .5
        lat = lat * -1.0 if lat_neg_true else lat
        lon = lon * -1.0 if lon_neg_true else lon
        response['location'] = {
            'lat': lat,
            'lon': lon
        }
        return response

    def handle_batch(self, batch):
        print 'HANDLE BATCH size=%s' % len(batch)
        # self.api_client.define_type_mapping(index, doc_type)
        self.api_client.index_batch(batch)
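
# --- Usage sketch (illustrative, not from the original script). `es_hosts` and
# `args` are module-level names the Worker relies on in the original; the queue
# payload below mirrors the keys Worker.run() reads from each task.
from multiprocessing import JoinableQueue

work_queue = JoinableQueue()
work_queue.put({'index': 'test_index', 'type': 'test_type',
                'uuid': 'doc-0001', 'field_count': 20})
worker = Worker(work_queue)
worker.start()
work_queue.join()  # blocks until task_done() has been called for every task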
import json import os import progressbar from elasticsearch import Elasticsearch es = Elasticsearch(hosts="kiddd.science:19200") with open("qidian.txt") as f: lines = f.readlines() for line in progressbar.progressbar(lines, redirect_stdout=True): book = json.loads(line) path = 'books_qidian/' + book['bid'] if os.path.exists(path + '.txt'): book['download'] = 'qidian/' + book['bid'] + '.txt' elif os.path.exists(path + '.epub'): book['download'] = 'qidian/' + book['bid'] + '.epub' else: print('ERROR, there is no %s' % book['title']) continue del book['bid'] try: es.create(index='ebooks', id=book['download'], doc_type='book', body=book) except Exception as e: print(e)
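
# Hedged follow-up: spot-check that the import above landed in the index
# (same index and doc_type names used in the loop).
res = es.search(index='ebooks', doc_type='book',
                body={'query': {'match_all': {}}, 'size': 1})
print('indexed documents: %s' % res['hits']['total'])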
class ES:
    def __init__(self, settings=None, logging=None):
        self.settings = settings
        self.logging = logging

        # Initialize per-instance state here rather than as class attributes:
        # the original defined grok_filters and bulk_actions at class level,
        # where the mutable values are shared between all instances.
        self.index = None
        self.conn = None
        self.grok_filters = dict()
        self.notifier = None
        self.bulk_actions = []

        if self.settings.config.getboolean("notifier", "email_notifier"):
            self.notifier = Notifier(settings, logging)

    def init_connection(self):
        self.conn = Elasticsearch(
            [self.settings.config.get("general", "es_url")],
            use_ssl=False,
            timeout=self.settings.config.getint("general", "es_timeout"),
            verify_certs=False,
            retry_on_timeout=True)

        if self.conn.ping():
            self.logging.logger.info(
                "connected to Elasticsearch on host %s" %
                (self.settings.config.get("general", "es_url")))
        else:
            self.logging.logger.error(
                "could not connect to host %s. Exiting!" %
                (self.settings.config.get("general", "es_url")))

        return self.conn

    def scan(self, index, bool_clause=None, sort_clause=None, query_fields=None, search_query=None):
        preserve_order = True if sort_clause is not None else False
        return eshelpers.scan(
            self.conn,
            request_timeout=self.settings.config.getint("general", "es_timeout"),
            index=index,
            query=build_search_query(bool_clause=bool_clause,
                                     sort_clause=sort_clause,
                                     search_range=self.settings.search_range,
                                     query_fields=query_fields,
                                     search_query=search_query),
            size=self.settings.config.getint("general", "es_scan_size"),
            scroll=self.settings.config.get("general", "es_scroll_time"),
            preserve_order=preserve_order,
            raise_on_error=False)

    def count_documents(self, index, bool_clause=None, query_fields=None, search_query=None):
        res = self.conn.search(
            index=index,
            body=build_search_query(bool_clause=bool_clause,
                                    search_range=self.settings.search_range,
                                    query_fields=query_fields,
                                    search_query=search_query),
            size=self.settings.config.getint("general", "es_scan_size"),
            scroll=self.settings.config.get("general", "es_scroll_time"))
        return res["hits"]["total"]

    def filter_by_query_string(self, query_string=None):
        bool_clause = {"filter": [{"query_string": {"query": query_string}}]}
        return bool_clause

    def filter_by_dsl_query(self, dsl_query=None):
        dsl_query = json.loads(dsl_query)

        if isinstance(dsl_query, list):
            bool_clause = {"filter": []}
            for query in dsl_query:
                bool_clause["filter"].append(query["query"])
        else:
            bool_clause = {"filter": [dsl_query["query"]]}
        return bool_clause

    # This is part of housekeeping, so we should not access non-thread-safe
    # objects, such as logging progress to the console using ticks!
    def remove_all_whitelisted_outliers(self):
        from helpers.outlier import Outlier  # import goes here to avoid issues with singletons & circular requirements ...
        # TODO: fix this
        outliers_filter_query = {"filter": [{"term": {"tags": "outlier"}}]}
        total_docs_whitelisted = 0

        idx = self.settings.config.get("general", "es_index_pattern")
        total_nr_outliers = self.count_documents(index=idx, bool_clause=outliers_filter_query)
        self.logging.logger.info("going to analyze %s outliers and remove all whitelisted items",
                                 "{:,}".format(total_nr_outliers))

        for doc in self.scan(index=idx, bool_clause=outliers_filter_query):
            total_outliers = int(doc["_source"]["outliers"]["total_outliers"])

            # Generate all outlier objects for this document
            total_whitelisted = 0
            for i in range(total_outliers):
                outlier_type = doc["_source"]["outliers"]["type"][i]
                outlier_reason = doc["_source"]["outliers"]["reason"][i]
                outlier_summary = doc["_source"]["outliers"]["summary"][i]

                outlier = Outlier(outlier_type=outlier_type, outlier_reason=outlier_reason,
                                  outlier_summary=outlier_summary)
                if outlier.is_whitelisted(additional_dict_values_to_check=doc):
                    total_whitelisted += 1

            # If all outliers for this document are whitelisted, remove them all. If not,
            # don't touch the document. This is a limitation in the way our outliers are
            # stored: if not ALL of them are whitelisted, we can't remove just the
            # whitelisted ones from the Elasticsearch event, as they are stored as array
            # elements and potentially contain observations that should be removed, too.
            # In this case, just don't touch the document.
            if total_whitelisted == total_outliers:
                total_docs_whitelisted += 1
                doc = remove_outliers_from_document(doc)

                self.conn.delete(index=doc["_index"], doc_type=doc["_type"], id=doc["_id"], refresh=True)
                self.conn.create(index=doc["_index"], doc_type=doc["_type"], id=doc["_id"],
                                 body=doc["_source"], refresh=True)

        return total_docs_whitelisted

    def remove_all_outliers(self):
        idx = self.settings.config.get("general", "es_index_pattern")

        must_clause = {"filter": [{"term": {"tags": "outlier"}}]}
        total_outliers = self.count_documents(index=idx, bool_clause=must_clause)

        query = build_search_query(bool_clause=must_clause, search_range=self.settings.search_range)

        script = {
            "source": "ctx._source.remove(\"outliers\"); ctx._source.tags.remove(ctx._source.tags.indexOf(\"outlier\"))",
            "lang": "painless"
        }

        query["script"] = script

        if total_outliers > 0:
            self.logging.logger.info("wiping %s existing outliers", "{:,}".format(total_outliers))
            self.conn.update_by_query(index=idx, body=query, refresh=True, wait_for_completion=True)
            self.logging.logger.info("wiped outlier information of " + "{:,}".format(total_outliers) + " documents")
        else:
            self.logging.logger.info("no existing outliers were found, so nothing was wiped")

    def process_outliers(self, doc=None, outliers=None, should_notify=False):
        for outlier in outliers:
            if outlier.is_whitelisted(additional_dict_values_to_check=doc):
                if self.settings.config.getboolean("general", "print_outliers_to_console"):
                    self.logging.logger.info(outlier.outlier_dict["summary"] + " [whitelisted outlier]")
            else:
                if self.settings.config.getboolean("general", "es_save_results"):
                    self.save_outlier(doc=doc, outlier=outlier)

                if should_notify:
                    self.notifier.notify_on_outlier(doc=doc, outlier=outlier)

                if self.settings.config.getboolean("general", "print_outliers_to_console"):
                    self.logging.logger.info("outlier - " + outlier.outlier_dict["summary"])

    def add_bulk_action(self, action):
        self.bulk_actions.append(action)
        if len(self.bulk_actions) > BULK_FLUSH_SIZE:
            self.flush_bulk_actions()

    def flush_bulk_actions(self, refresh=False):
        if len(self.bulk_actions) == 0:
            return
        eshelpers.bulk(self.conn,
self.bulk_actions, stats_only=True, refresh=refresh) self.bulk_actions = [] def save_outlier(self, doc=None, outlier=None): # add the derived fields as outlier observations derived_fields = self.extract_derived_fields(doc["_source"]) for derived_field, derived_value in derived_fields.items(): outlier.outlier_dict["derived_" + derived_field] = derived_value doc = add_outlier_to_document(doc, outlier) action = { '_op_type': 'update', '_index': doc["_index"], '_type': doc["_type"], '_id': doc["_id"], 'retry_on_conflict': 10, 'doc': doc["_source"] } self.add_bulk_action(action) def extract_derived_fields(self, doc_fields): derived_fields = dict() for field_name, grok_pattern in self.settings.config.items( "derivedfields"): if helpers.utils.dict_contains_dotkey(doc_fields, field_name, case_sensitive=False): if grok_pattern in self.grok_filters.keys(): grok = self.grok_filters[grok_pattern] else: grok = Grok(grok_pattern) self.grok_filters[grok_pattern] = grok match_dict = grok.match( helpers.utils.get_dotkey_value(doc_fields, field_name, case_sensitive=False)) if match_dict: for match_dict_k, match_dict_v in match_dict.items(): derived_fields[match_dict_k] = match_dict_v return derived_fields def extract_fields_from_document(self, doc, extract_derived_fields=False): doc_fields = doc["_source"] if extract_derived_fields: derived_fields = self.extract_derived_fields(doc_fields) for k, v in derived_fields.items(): doc_fields[k] = v return doc_fields
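
# --- Usage sketch (illustrative, not from the original module). `my_settings`
# and `my_logging` stand for the wrapper objects the constructor above expects
# (a config-parser holder and a logger holder), not the standard-library modules.
es = ES(settings=my_settings, logging=my_logging)  # hypothetical instances
es.init_connection()
idx = my_settings.config.get("general", "es_index_pattern")
for doc in es.scan(index=idx):
    fields = es.extract_fields_from_document(doc, extract_derived_fields=True)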
class DPRIndex(DocumentChunker):
    '''
    Class for indexing and searching documents, using a combination of
    vectors produced by DPR and keyword matching from Elastic TF-IDF. As a
    subclass of DocumentChunker, this class automatically handles document
    chunking as well.
    '''

    INDEX_NAME = 'dense-passage-retrieval'
    D = 768

    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    context_model = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    question_model = DPRQuestionEncoder.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base', return_dict=True)

    def __init__(self, documents: List[DPRDocument]):
        # was: super(DocumentChunker).__init__(), which builds an unbound
        # super object and never runs the parent initializer
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.device == 'cuda':
            # the original moved an undefined self.reader_model to the GPU;
            # the encoders this class actually defines are moved instead
            self.context_model = self.context_model.cuda()
            self.question_model = self.question_model.cuda()
        self.faiss_index = faiss.IndexFlatIP(self.D)
        self._setup_elastic_index()
        self._build_index(documents)

    def _setup_elastic_index(self):
        '''Sets up the Elastic index. Deletes old ones if needed.'''
        self.es = Elasticsearch()
        if self.es.indices.exists(self.INDEX_NAME):
            logging.warning(f'Deleting old index for {self.INDEX_NAME}.')
            self.es.indices.delete(self.INDEX_NAME)
        self.es.indices.create(index=self.INDEX_NAME)

    def _build_index(self, documents):
        '''
        Initializes the data structure to keep track of which chunks
        correspond to which documents.
        '''
        self.documents = documents
        self.doc_bodies = [doc.body for doc in self.documents]
        self.chunks = []
        self.chunk_index = {}          # {chunk: document}
        self.inverse_chunk_index = {}  # {document: [chunks]}
        chunk_counter = 0
        for doc_counter, doc_body in tqdm(enumerate(self.doc_bodies),
                                          total=len(self.doc_bodies)):
            self.inverse_chunk_index[doc_counter] = []
            chunked_docs = self.chunk_document(doc_body)
            self.chunks.extend(chunked_docs)
            for chunked_doc in chunked_docs:
                chunk_embedding = self.embed_context(chunked_doc)
                self.faiss_index.add(chunk_embedding)
                self.es.create(self.INDEX_NAME, id=chunk_counter,
                               body={'chunk': chunked_doc})
                self.chunk_index[chunk_counter] = doc_counter
                self.inverse_chunk_index[doc_counter].append(chunk_counter)
                chunk_counter += 1
        self.total_docs = len(self.documents)
        self.total_chunks = len(self.chunks)

    def embed_question(self, question: str):
        '''Embed the question in vector space with the question encoder.'''
        input_ids = self.question_tokenizer(
            question, return_tensors='pt')['input_ids'].to(self.device)
        embeddings = self.question_model(input_ids).pooler_output.detach().cpu().numpy()
        return embeddings

    def embed_context(self, context: str):
        '''Embed the context (doc) in vector space with the context encoder.'''
        input_ids = self.context_tokenizer(
            context, return_tensors='pt')['input_ids'].to(self.device)
        embeddings = self.context_model(input_ids).pooler_output.detach().cpu().numpy()
        return embeddings

    def search_dense_index(self, question: str, k: int = 5):
        '''
        Search the vector index by encoding the question and then performing
        nearest neighbor on the FAISS index of context vectors.

        Args:
            question (str): The natural language question, e.g. `who is bill gates?`
            k (int): The number of documents to return from the index.
        '''
        if k > self.total_chunks:
            k = self.total_chunks
        question_embedding = self.embed_question(question)
        dists, chunk_ids = self.faiss_index.search(question_embedding, k=k)
        dists, chunk_ids = list(dists[0]), list(chunk_ids[0])
        dists = list(map(float, dists))  # For Flask
        structured_response = []
        for dist, chunk_id in zip(dists, chunk_ids):
            chunk = self.chunks[chunk_id]
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            blob = {
                'document': document,
                'document_id': document_id,
                'chunk': chunk,
                'chunk_id': int(chunk_id),  # For Flask
                'faiss_dist': dist
            }
            structured_response.append(blob)
        return structured_response

    def search_sparse_index(self, query):
        body = {'size': 10, 'query': {'match': {'chunk': query}}}
        results = self.es.search(index=self.INDEX_NAME, body=body)
        hits = results['hits']['hits']
        return hits

    def _merge_results(self, sparse_results, dense_results):
        '''Merges the results of sparse and dense retrieval.'''
        results_index = {}
        for sparse_result in sparse_results:
            id, score = sparse_result['_id'], sparse_result['_score']
            id = int(id)
            results_index[id] = {'elastic_score': score}
        for dense_result in dense_results:
            id, score = dense_result['chunk_id'], dense_result['faiss_dist']
            if id in results_index:
                results_index[id]['faiss_dist'] = score
            else:
                results_index[id] = {'faiss_dist': score}
        results = []
        for chunk_id, scores in results_index.items():
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            chunk = self.chunks[chunk_id]
            doc_profile = document.to_dict()
            result = {
                'chunk_id': chunk_id,
                'chunk': chunk,
                'document_id': document_id,
                'document': doc_profile,
                'scores': scores
            }
            results.append(result)
        return results

    def search_dual_index(self, query: str):
        '''Search both the sparse and dense indices and merge the results.'''
        sparse_result = self.search_sparse_index(query)
        dense_result = self.search_dense_index(query)
        merged_results = self._merge_results(sparse_result, dense_result)
        return merged_results
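
# --- Minimal stand-alone sketch of the FAISS side used above: an inner-product
# index over L2-normalized vectors behaves like cosine similarity. Random
# vectors stand in for DPR embeddings; the dimensions match the class above.
import numpy as np
import faiss

D = 768
index = faiss.IndexFlatIP(D)
vecs = np.random.rand(10, D).astype('float32')
faiss.normalize_L2(vecs)
index.add(vecs)

query = vecs[:1]  # query with a known vector; its own similarity should be ~1.0
dists, ids = index.search(query, 3)
print(ids[0], dists[0])
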
class PreselectionServer:
    def __init__(self, address):
        self.es = Elasticsearch(address)

    def index_documents(self, dfUsr):
        df = dfUsr
        means = df.groupby(['userID'], as_index=False, sort=False) \
            .mean() \
            .loc[:, ['userID', 'rating']] \
            .rename(columns={'rating': 'ratingMean'})
        df = pd.merge(df, means, on='userID', how="left", sort=False)
        df['ratingNormal'] = df['rating'] - df['ratingMean']
        ratings = df.loc[:, ['userID', 'movieID', 'ratingNormal']] \
            .rename(columns={'ratingNormal': 'rating'}) \
            .pivot_table(index='userID', columns='movieID', values='rating') \
            .fillna(0)

        print("Indexing users...")
        index_users = [{
            "_index": "users",
            "_type": "user",
            "_id": index,
            "_source": {
                'ratings': row[row > 0]
                    .sort_values(ascending=False)
                    .index.values.tolist()
            }
        } for index, row in ratings.iterrows()]
        helpers.bulk(self.es, index_users)
        print("Done")

        print("Indexing movies...")
        index_movies = [{
            "_index": "movies",
            "_type": "movie",
            "_id": column,
            "_source": {
                "whoRated": ratings[column][ratings[column] > 0]
                    .sort_values(ascending=False)
                    .index.values.tolist()
            }
        } for column in ratings]
        helpers.bulk(self.es, index_movies)
        print("Done")

    def get_movies_liked_by_user(self, user_id, index='users'):
        user_id = int(user_id)
        result = self.es.get(index=index, doc_type="user", id=user_id)["_source"]
        return result

    def get_users_that_like_movie(self, movie_id, index='movies'):
        movie_id = int(movie_id)
        return self.es.get(index=index, doc_type="movie", id=movie_id)["_source"]

    def get_movie_recommendations(self, user_id, index='users'):
        movies_liked_by_user = self.get_movies_liked_by_user(user_id)
        user_id = int(user_id)
        # the stored {'ratings': [...]} dict doubles as a terms query on the
        # "ratings" field
        users_who_rated_at_least_one = self.es.search(
            index=index,
            body={'query': {'terms': movies_liked_by_user}})["hits"]["hits"]
        unique_movies = set()
        for ratings in users_who_rated_at_least_one:
            # ES returns _id as a string, so cast before comparing to the int id
            if int(ratings["_id"]) != user_id:
                ratings = ratings["_source"]["ratings"]
                for rating in ratings:
                    if rating not in movies_liked_by_user["ratings"]:
                        unique_movies.add(rating)
        return list(unique_movies)

    def get_user_recommendations(self, movie_id, index='movies'):
        users_who_liked_the_movie = self.get_users_that_like_movie(movie_id)
        movie_id = int(movie_id)
        movies_rated_by_at_least_one = self.es.search(
            index=index,
            body={'query': {'terms': users_who_liked_the_movie}})["hits"]["hits"]
        unique_users = set()
        for ratings in movies_rated_by_at_least_one:
            # same string-vs-int caveat as above
            if int(ratings["_id"]) != movie_id:
                ratings = ratings["_source"]["whoRated"]
                for rating in ratings:
                    if rating not in users_who_liked_the_movie["whoRated"]:
                        unique_users.add(rating)
        return list(unique_users)

    def add_user_document(self, user_id, movies, user_index='users', movie_index='movies'):
        user_id = int(user_id)
        movies = list(set(movies))
        to_update = [
            self.es.get(index=movie_index, id=movie_id, doc_type='movie')
            for movie_id in movies
        ]
        if len(to_update) != len(movies):
            raise Exception("One or more movies unknown")
        for movie_document in to_update:
            users = movie_document["_source"]["whoRated"]
            users.append(user_id)
            users = list(set(users))
            self.es.update(index=movie_index,
                           id=movie_document["_id"],
                           doc_type='movie',
                           body={"doc": {"whoRated": users}})
        self.es.create(index=user_index,
                       id=user_id,
                       body={"ratings": movies},
                       doc_type='user')

    def update_user_document(self, user_id, movies, user_index='users', movie_index='movies'):
        user_id = int(user_id)
        movies = list(set(movies))
        to_update = self.es.get(index=user_index, id=user_id, doc_type='user')
        old_movies = to_update['_source']['ratings']
        movies_to_add_user = np.setdiff1d(movies, old_movies)
        movies_to_remove_user = np.setdiff1d(old_movies, movies)
        for movie_to_remove_user in movies_to_remove_user:
            movie_document = self.es.get(index=movie_index,
                                         id=movie_to_remove_user,
                                         doc_type='movie')
            users_who_liked_movie = movie_document["_source"]["whoRated"]
            users_who_liked_movie.remove(user_id)
            users_who_liked_movie = list(set(users_who_liked_movie))
            self.es.update(index=movie_index,
                           id=movie_to_remove_user,
                           doc_type='movie',
                           body={"doc": {"whoRated": users_who_liked_movie}})
        for movie_to_add_user in movies_to_add_user:
            movie_document = self.es.get(index=movie_index,
                                         id=movie_to_add_user,
                                         doc_type='movie')
            users_who_liked_movie = movie_document["_source"]["whoRated"]
            users_who_liked_movie.append(user_id)
            users_who_liked_movie = list(set(users_who_liked_movie))
            self.es.update(index=movie_index,
                           id=movie_to_add_user,
                           doc_type='movie',
                           body={"doc": {"whoRated": users_who_liked_movie}})
        self.es.update(index=user_index,
                       id=user_id,
                       body={"doc": {"ratings": movies}},
                       doc_type="user")

    def get_all_index(self):
        return self.es.indices.get_alias("*")
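
# --- Sketch of the terms-query trick the class above relies on: a stored
# {"ratings": [...]} document placed under "query" -> "terms" reads as "match
# documents whose ratings field contains any of these IDs". Assumes a local
# cluster; the index name and IDs are made up.
from elasticsearch import Elasticsearch

es = Elasticsearch(['localhost:9200'])
liked = {'ratings': [1, 5, 9]}
hits = es.search(index='users', body={'query': {'terms': liked}})['hits']['hits']
print(len(hits))
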
import random
import time
import json
from datetime import datetime

from elasticsearch import Elasticsearch

from make_file_io import makeFileIo

es = Elasticsearch(
    hosts=["ec2-54-180-123-238.ap-northeast-2.compute.amazonaws.com"])
# es = Elasticsearch()
# es.indices.create(index="io_log")

doc = makeFileIo("poc1.mobis.com", "", "FR_CAM", "/ifs/raw_10/FR_CAM",
                 "inactive", "2019-10-10 10:10:10")
res = es.create(index="io_log", id=doc['datetime'], body=doc)
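
# --- Note on the create/index distinction the scripts in this file rely on:
# create() fails with a 409 ConflictError if the id already exists, while
# index() overwrites. Sketch against a local cluster with a made-up index.
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ConflictError

es = Elasticsearch()
es.index(index="demo", id="1", body={"v": 1})       # upsert-style write
try:
    es.create(index="demo", id="1", body={"v": 2})  # raises: id already exists
except ConflictError:
    pass
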
# imports inferred from usage below (neo4jrestclient provides GraphDatabase
# and client.Node with this API); they were missing from the snippet
import base64

from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from neo4jrestclient.client import GraphDatabase
from neo4jrestclient import client


class pdfGraph():
    """Create and manage the PDF graph in Neo4j and index in Elasticsearch"""

    db_path = "http://localhost:7474/db/data/"
    db = None
    pdf_documents = None
    authors = None
    keywords = None
    es_cluster = [{'host': 'localhost', 'port': 9200}]
    es = None
    es_ixc = None

    def __init__(self):
        """setup Neo4j database connection and node labels and
        Elasticsearch mapping attachments index"""
        self.db = GraphDatabase(self.db_path)
        self.pdf_documents = self.db.labels.create("PDFDocument")
        self.authors = self.db.labels.create("Author")
        self.keywords = self.db.labels.create("Keyword")
        self.es = Elasticsearch(self.es_cluster)
        self.es_ixc = IndicesClient(self.es)
        self.es_ixc.create(
            index="pdf_documents",
            body={
                'mappings': {
                    'pdf': {
                        'properties': {
                            'url': {'type': "string"},
                            'pdf_file': {'type': "attachment"}
                        }
                    }
                }
            }
        )

    def createNodesAndIx(self, doc_url, doc_info, doc_metadata, doc_data):
        """Given document details create nodes and relationships for
        documents, authors and keywords and store the related documents
        for indexing and search"""
        # not all pdf docs have all fields so we need to check for existence
        check_for = lambda n, d: d[n] if (n in d) else ''

        author = check_for('Author', doc_info[0])
        # create an author node if one doesn't already exist
        # (the original compared strings with "is not ''", which relies on
        # interning; != is the correct comparison)
        if author != '':
            author_node = self.authorExists(author)
            if author_node is None:
                author_node = self.createAuthor(author)

        # create keyword nodes if they don't already exist
        if check_for('pdf', doc_metadata) != '':
            keywords = check_for('Keywords', doc_metadata['pdf'])
        else:
            keywords = ''
        if keywords != '':
            keyword_nodes = []
            for keyword in map(lambda x: x.strip(" '\""), keywords.split(",")):
                keyword_node = self.keywordExists(keyword)
                if keyword_node is None:
                    keyword_node = self.createKeyword(keyword)
                keyword_nodes.append(keyword_node)

        # create the document node
        pdf_node = self.db.nodes.create(
            url=doc_url,
            info=repr(doc_info),
            metadata=repr(doc_metadata),
            title=check_for('Title', doc_info[0])
        )
        self.pdf_documents.add(pdf_node)

        # create relationships b/w document, author and keywords
        if author != '':
            pdf_node.relationships.create("AUTHORED_BY", author_node)
        if keywords != '':
            for keyword_node in keyword_nodes:
                pdf_node.relationships.create("HAS_KEYWORD", keyword_node)

        # add the document for full-text search to ES using Neo4j id
        self.es.create(
            index="pdf_documents",
            doc_type="pdf",
            id=pdf_node.id,
            body={
                'url': doc_url,
                'pdf_file': base64.b64encode(doc_data.getvalue())
            }
        )

    def authorExists(self, author):
        """Check for an existing author node"""
        r = self.db.query(
            'match (a:Author) where a.name = "' + author + '" return a',
            returns=(client.Node)
        )
        return r[0][0] if (len(r) > 0) else None

    def createAuthor(self, author):
        """Create an author node"""
        an_author = self.db.nodes.create(name=author)
        self.authors.add(an_author)
        return an_author

    def keywordExists(self, keyword):
        """Check for an existing keyword node"""
        r = self.db.query(
            'match (k:Keyword) where k.name = "' + keyword + '" return k',
            returns=(client.Node)
        )
        return r[0][0] if (len(r) > 0) else None

    def createKeyword(self, keyword):
        """Create a keyword node"""
        a_keyword = self.db.nodes.create(name=keyword)
        self.keywords.add(a_keyword)
        return a_keyword
"as": { "Organization": { "name": row[5] }, "number": row[4].replace('AS', ''), "as_type": row[7], "domain": row[6] } }, "ip_range": { "Organization": { "name": row[1], "domain": row[2], "type": row[3], }, "num_start": num_start, "num_end": num_end } } # ドキュメントの登録 es.create(index='ipinfo_internal-0001', id=str(uuid.uuid4()), body=document) if i % 5000 == 0: print('{:,}'.format(i)) i += 1 es.close()
class InventoryExporter(BaseExporter):
    name = "elasticsearch aggregations exporter"

    def __init__(self, config_g):
        super().__init__(config_g)
        error_msgs = []
        self.es_hosts = config_g["es_hosts"]
        if not self.es_hosts:
            error_msgs.append("Missing es_hosts")
        if not isinstance(self.es_hosts, list):
            error_msgs.append("es_hosts must be a list")
        if error_msgs:
            raise ImproperlyConfigured("{} in {}".format(", ".join(error_msgs), self.name))

    def iter_machine_snapshots(self):
        for serial_number, machine_snapshots in self.get_ms_query().fetch(paginate=False, for_filtering=True):
            for machine_snapshot in machine_snapshots:
                yield machine_snapshot

    def get_es_client(self):
        self._es = Elasticsearch(hosts=self.es_hosts)
        self._es_version = [
            int(i) for i in self._es.info()["version"]["number"].split(".")
        ]
        if self._es_version < [7]:
            # the original format string was missing its .format() argument
            raise ValueError(
                "Inventory exporter {} not compatible with ES < 7.0".format(self.name))
        # lifecycle
        _esilm = IlmClient(self._es)
        _esilm.put_lifecycle(ES_LIFECYCLE_POLICY_NAME, ES_LIFECYCLE_POLICY)
        # template
        self._es.indices.put_template(ES_TEMPLATE_NAME, ES_TEMPLATE)
        # create index
        for i in range(10):
            existing_indices = self._es.indices.get(ES_INDEX_PATTERN).keys()
            if not len(existing_indices):
                current_index_name = ES_INDEX_PATTERN.replace("*", "000001")
                try:
                    self._es.indices.create(
                        current_index_name,
                        {"aliases": {ES_ALIAS: {"is_write_index": True}}})
                except RequestError:
                    # probably a race with another worker
                    pass
                else:
                    break
        return ES_ALIAS

    def run(self):
        timestamp = timezone.now().isoformat()
        index_name = self.get_es_client()
        for source in Source.objects.current_machine_snapshot_sources():
            ms_query = self.get_ms_query()
            source_d = {
                "id": source.pk,
                "module": source.module,
                "name": source.name,
                "display_name": source.get_display_name()
            }
            ms_query.force_filter(SourceFilter, hidden_value=source.pk)
            for f, f_links, _, _ in ms_query.grouping_links():
                filter_d = {"title": f.title, "slug": f.get_query_kwarg()}
                for label, f_count, _, _, _ in f_links:
                    if label == "\u2400":
                        label = "NULL"
                    elif not isinstance(label, str):
                        label = str(label)
                    doc = {
                        "source": source_d,
                        "filter": filter_d,
                        "value": label,
                        "count": f_count,
                        "@timestamp": timestamp
                    }
                    doc_id = str(uuid.uuid4())
                    self._es.create(index_name, doc_id, doc)
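
# --- Hedged sketch of the bootstrap pattern above with made-up names: create
# an ILM policy, an index template, then a first write index behind an alias
# so rollover can take over. Assumes elasticsearch-py 7.x and a local cluster.
from elasticsearch import Elasticsearch
from elasticsearch.client import IlmClient

es = Elasticsearch(["localhost:9200"])
IlmClient(es).put_lifecycle("demo-policy", {
    "policy": {"phases": {"hot": {"actions": {"rollover": {"max_size": "10gb"}}}}}
})
es.indices.put_template("demo-template", {
    "index_patterns": ["demo-*"],
    "settings": {"index.lifecycle.name": "demo-policy",
                 "index.lifecycle.rollover_alias": "demo"},
})
if not es.indices.exists("demo-000001"):
    es.indices.create("demo-000001",
                      {"aliases": {"demo": {"is_write_index": True}}})
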
"properties": { "text": { "type": "text", "analyzer": "my_analyzer" } } } } }) #%% 5 Load the data to the ES index resource_path = 'resources/ustawy' for filename in os.listdir(resource_path): with open(resource_path + '/' + filename, 'r') as document: legislation = document.read() es.create("my_index", "legislation", filename, {"text": legislation}) #%% 6 number of legislative acts containing the word ustawa es.search(index="my_index", doc_type="legislation", body={"query": { "match": { "text": { "query": "ustawa" } } }})["hits"]["total"] #%% 7 containing the words kodeks postępowania cywilnego es.search(index="my_index", doc_type="legislation",
class CabiAnalyzer:
    """
    Class to manage analyzing cabi data
    """
    # d2014_4 = genfromtxt(join(DATA_DIR, "2010-4th-quarter.csv"),
    #                      dtype=None, delimiter=",")

    def __init__(self, data_dir, index_name):
        """
        create a connection to ES, make an index that is specified by the user,
        scan through a list of files and begin to add a line from each to the
        database as individual documents

        :param data_dir: directory with csv files in it
        :param index_name: elasticsearch index name
        :return:
        """
        self.data_dir = data_dir
        self.data_list = list()
        self.es = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
        self.index_name = index_name
        # self.clear_elasticsearch(self.index_name)
        # Debating whether or not we should automatically delete all that data.
        self.es.indices.create(index=self.index_name, ignore=400)
        self.process_data()

    # def get_unique_stations(self):
    #     """
    #     Define unique stations by running through all the data.
    #     """
    #     pass

    def process_data(self):
        """
        run a loop against parse file and put each file into ES
        :return:
        """
        self.data_list = self.get_data_list()
        if len(self.data_list) == 0:
            print "No data to read."
        for i in xrange(0, len(self.data_list)):
            self.parse_file(i)

    def parse_file(self, file_index):
        """
        after we've gathered a list of files, we pass an index to this function
        it goes into the list and grabs that file then parses it line by line
        :param file_index: int from 0 to len(self.data_list)
        :return:
        """
        count = 0
        this_file = self.data_list[file_index]
        for line in open(this_file, 'r'):
            if count == 0:
                # skip the CSV header row
                count += 1
                continue
            in_data = line.strip().split(",")
            seconds = timestr_to_sec(in_data[0])
            fromtime = datetime.strptime(in_data[1], '%m/%d/%Y %H:%M')
            fintime = datetime.strptime(in_data[2], '%m/%d/%Y %H:%M')
            # print in_data
            start_stn_num = re.findall("([0-9]{5})", in_data[3])[0]
            start_stn_addr = in_data[3].split("(")[0].strip()
            fin_stn_num = re.findall("([0-9]{5})", in_data[4])[0]
            fin_stn_addr = in_data[4].split("(")[0].strip()
            bike_id = in_data[5]
            user_type = in_data[6]
            add_data = {"triplength": seconds,
                        "starttime": fromtime,
                        "fintime": fintime,
                        "start_stn_num": start_stn_num,
                        "start_stn_addr": start_stn_addr,
                        "fin_stn_num": fin_stn_num,
                        "fin_stn_addr": fin_stn_addr,
                        "bike_id": bike_id,
                        "user_type": user_type,
                        "from_to_quick": start_stn_num + "_" + fin_stn_num
                        }
            self.es.create(self.index_name, "rides", add_data)
            count += 1
        print "Inserted " + str(count) + " objects."

    def get_data_list(self):
        """
        Go to the target directory. Find and return an array of files that we
        can then analyze.
        """
        ret_vals = list()
        tgt_dir = self.data_dir
        for c_file in listdir(tgt_dir):
            if isfile(join(tgt_dir, c_file)):
                if c_file[-3:].lower() == 'csv':
                    ret_vals.append(join(tgt_dir, c_file))
        return ret_vals

    def clear_elasticsearch(self, index_name):
        """
        prep for a new data entry by clearing the entire elasticsearch index
        :return:
        """
        self.es.indices.delete(index=index_name, ignore=[400, 404])
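
# --- Hypothetical driver for the class above: point it at a directory of
# Capital Bikeshare CSV exports and an index name of your choosing (both
# values here are made up). Instantiation triggers indexing via __init__.
if __name__ == '__main__':
    analyzer = CabiAnalyzer("data/cabi", "cabi-rides")
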
'''
Created on Jun 25, 2014

@author: jimhorng
'''
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=[{'host': '192.168.69.41', 'port': 9200}])

body = {
    "size": 0,
    "aggs": {
        "test_aggr": {
            "terms": {
                "field": "user_id"
            }
        }
    }
}

res = es.search(index="mongo", doc_type='device', body=body)

user_owned_nas_qty = res['aggregations']['test_aggr']['buckets']

for record in user_owned_nas_qty:
    record['user_id'] = record.pop('key')
    record['owned_nas_qty'] = record.pop('doc_count')
    es.create(index="result_temp", doc_type="user_owned_nas_qty", body=record)

print(user_owned_nas_qty)
class ESData(object):
    def __init__(self):
        es_addr = get_es_address()
        self.client = Elasticsearch([es_addr])

    def _kibana_request(self, url, data):
        headers = {
            'content-type': 'application/json',
            'kbn-xsrf': True
        }
        data = json.dumps(data)
        kibana_url = settings.KIBANA_URL + url
        req = urllib2.Request(kibana_url, data, headers=headers)
        urllib2.urlopen(req)
        return req

    def _kibana_remove(self, _type, body):
        i = 0
        ids = []
        if get_es_major_version() >= 6:
            body['query']['query_string']['query'] += ' type:%s' % _type
            _type = 'doc'
        while True:
            res = self.client.search(index='.kibana', from_=i, doc_type=_type,
                                     body=body, request_cache=False)
            if len(res['hits']['hits']) == 0:
                break
            i += 10
            _ids = [hit['_id'] for hit in res['hits']['hits']]
            ids += _ids
        for _id in ids:
            self.client.delete(index='.kibana', doc_type=_type, id=_id, refresh=True)

    def _kibana_export_obj(self, dest, _type, body):
        i = 0
        dest = os.path.join(dest, _type)
        os.makedirs(dest)
        while True:
            if get_es_major_version() < 6:
                res = self.client.search(index='.kibana', from_=i, doc_type=_type, body=body)
            else:
                res = self.client.search(index='.kibana', from_=i, body=body)
            if len(res['hits']['hits']) == 0:
                break
            i += 10
            for hit in res['hits']['hits']:
                _id = hit['_id']
                filename = os.path.join(dest, _id)
                filename += '.json'
                if get_es_major_version() < 6:
                    res = self.client.get(index='.kibana', doc_type=_type, id=_id)
                else:
                    res = self.client.get(index='.kibana', doc_type='doc', id=_id)
                with open(filename, 'w') as f:
                    f.write(json.dumps(res['_source'], separators=(',', ':')))

    def kibana_export(self, full=False):
        dest = tempfile.mkdtemp()
        _types = ('search', 'visualization', 'dashboard')
        if full:
            _types = _types + ('index-pattern',)
        for _type in _types:
            if get_es_major_version() < 6:
                if full:
                    body = {'query': {'match_all': {}}}
                else:
                    body = {
                        'query': {
                            'query_string': {
                                'query': 'NOT title: SN *'
                            }
                        }
                    }
            else:
                if full:
                    body = {
                        'query': {
                            'query_string': {
                                'query': 'type: %s' % _type
                            }
                        }
                    }
                else:
                    body = {
                        'query': {
                            'query_string': {
                                'query': 'type: %s AND NOT title: SN *' % _type
                            }
                        }
                    }
            self._kibana_export_obj(dest, _type, body)
        f = tempfile.NamedTemporaryFile(delete=False)
        tar_name = 'scirius-dashboards-%s' % strftime('%Y%m%d%H%M')
        tar = tarfile.open(mode='w:bz2', fileobj=f)
        tar.add(dest, tar_name)
        tar.close()
        rmtree(dest)
        f.close()
        tar_name += '.tar.bz2'
        return tar_name, f.name

    def _create_kibana_mappings(self):
        if not self.client.indices.exists('.kibana'):
            self.client.indices.create(index='.kibana',
                                       body={"mappings": get_kibana_mappings()})
            self.client.indices.refresh(index='.kibana')
        elif "visualization" not in str(self.client.indices.get_mapping(index='.kibana')):
            self.client.indices.delete(index='.kibana')
            self.client.indices.create(index='.kibana',
                                       body={"mappings": get_kibana_mappings()})
            self.client.indices.refresh(index='.kibana')

    def _kibana_inject(self, _type, _file):
        with open(_file) as f:
            content = f.read()
        name = _file.rsplit('/', 1)[1]
        name = name.rsplit('.', 1)[0]
        if get_es_major_version() < 6:
            doc_type = _type
        else:
            doc_type = 'doc'
        self.client.create(index='.kibana', doc_type=doc_type, id=name,
                           body=content, refresh=True)

    def _kibana_set_default_index(self, idx):
        if get_es_major_version() < 6:
            res = self.client.search(index='.kibana', doc_type='config',
                                     body={'query': {'match_all': {}}},
                                     request_cache=False)
        else:
            body = {'query': {'query_string': {'query': 'type: config'}}}
            res = self.client.search(index='.kibana', doc_type='doc', body=body,
                                     request_cache=False)
        for hit in res['hits']['hits']:
            content = hit['_source']
            content['defaultIndex'] = idx
            if get_es_major_version() < 6:
                self.client.update(index='.kibana', doc_type='config',
                                   id=hit['_id'], body={'doc': content},
                                   refresh=True)
            else:
                self.client.update(index='.kibana', doc_type='doc',
                                   id=hit['_id'], body=content, refresh=True)
        else:
            if get_es_major_version() >= 6:
                self._kibana_request('/api/kibana/settings/defaultIndex',
                                     {'value': 'logstash-*'})
            else:
                print >> sys.stderr, "Warning: unknown ES version, not setting Kibana's defaultIndex"

    def _get_kibana_files(self, source, _type):
        files = []
        path = os.path.join(source, _type)
        if not os.path.isdir(path):
            return []
        for _file in os.listdir(path):
            if not _file.endswith('.json'):
                continue
            _file = os.path.join(path, _file)
            files.append(_file)
        return files

    def _get_kibana_subdirfiles(self, _type):
        files = []
        for _dir in os.listdir(settings.KIBANA_DASHBOARDS_PATH):
            src_path = os.path.join(settings.KIBANA_DASHBOARDS_PATH, _dir)
            if os.path.isdir(src_path):
                files += self._get_kibana_files(src_path, _type)
        return files

    def kibana_import_fileobj(self, fileobj):
        tar = tarfile.open(mode='r:bz2', fileobj=fileobj)
        tmpdir = tempfile.mkdtemp()
        tar.extractall(tmpdir)
        tar.close()
        subdirs = os.listdir(tmpdir)
        if len(subdirs) != 1:
            raise Exception('Archive does not appear to contain dashboards, visualizations or searches')
        source = os.path.join(tmpdir, subdirs[0])
        self._create_kibana_mappings()
        count = 0
        for _type in ('search', 'visualization', 'dashboard'):
            source_files = self._get_kibana_files(source, _type)
            count += len(source_files)
            for _file in source_files:
                self._kibana_inject(_type, _file)
        rmtree(tmpdir)
        if count == 0:
            raise Exception('No data loaded')
        return count

    def kibana_clear(self):
        body = {
            'query': {
                'query_string': {
                    'query': 'NOT title: SN *'
                }
            }
        }
        _types = ('search', 'visualization', 'dashboard')
        for _type in _types:
            self._kibana_remove(_type, body)

    def kibana_reset(self):
        self._create_kibana_mappings()
        if not os.path.isdir(settings.KIBANA_DASHBOARDS_PATH):
            raise Exception('Please make sure Kibana dashboards are installed at %s' % settings.KIBANA_DASHBOARDS_PATH)
        if self._get_kibana_subdirfiles('index-pattern') == []:
            raise Exception('Please make sure Kibana dashboards are installed at %s: no index-pattern found' % settings.KIBANA_DASHBOARDS_PATH)
        self._kibana_remove('dashboard', {'query': {'query_string': {'query': 'SN*'}}})
        self._kibana_remove('visualization', {'query': {'query_string': {'query': 'SN*'}}})
        self._kibana_remove('search', {'query': {'query_string': {'query': 'SN*'}}})
        self._kibana_remove('index-pattern', {'query': {'query_string': {'query': '*'}}})
        for _type in ('index-pattern', 'search', 'visualization', 'dashboard'):
            for _file in self._get_kibana_subdirfiles(_type):
                self._kibana_inject(_type, _file)
        if get_es_major_version() >= 6:
            self._kibana_request('/api/spaces/space', KIBANA6_NAMESPACE)
        self._kibana_set_default_index(u'logstash-*')

    def _get_indexes(self):
        res = self.client.indices.stats()
        indexes = res['indices'].keys()
        try:
            indexes.remove('.kibana')
        except ValueError:
            pass
        return indexes

    def es_clear(self):
        indexes = self._get_indexes()
        self.client.indices.delete(index=indexes)
        return len(indexes)

    def wait_until_up(self):
        for i in xrange(1024):
            try:
                ret = self.client.cluster.health(wait_for_status='green',
                                                 request_timeout=15 * 60)
                if ret.get('status') == 'green':
                    break
                sleep(10)
            except ConnectionError:
                pass
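
# --- Hypothetical round-trip with the ESData class above (it depends on the
# surrounding scirius settings, so this only runs in that environment): export
# the current Kibana objects to a bzip2 tarball, then re-import the archive.
esd = ESData()
tar_name, tar_path = esd.kibana_export(full=True)
with open(tar_path, 'rb') as fileobj:
    esd.kibana_import_fileobj(fileobj)
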
class ESAgent(object):
    """ESAgent: wrapper class for ES operations"""

    def __init__(self, *args, **kwargs):
        """
        init from settings key/value of ES
        :param args:
        :param kwargs:
        :return:
        """
        settings = kwargs['settings']
        self.agent = Elasticsearch(hosts=settings['ES_HOST'])
        for key in settings:
            if key.upper().startswith('ES_'):
                self.__setattr__(key.upper(), settings[key])
        self.indices = [self.ES_INDEX, self.ES_REF_INDEX,
                        self.ES_TIMESERIES_INDEX, self.ES_CONFIG_INDEX]

    def initialize(self):  # renamed from the misspelled "initalize"
        for index in self.indices:
            body = ''
            if index == self.ES_INDEX:
                body = {
                    'mappings': {
                        'comment': {
                            'properties': {
                                'content': {  # object
                                    'properties': {
                                        'symbol': {
                                            'type': 'string',
                                            'index': 'not_analyzed'
                                        }
                                    }
                                }
                            }
                        }
                    },
                    'aliases': {'all_xueqiu': {}}
                }
            self.agent.indices.create(index, body=body, ignore=400)  # ignore "index exists"

    def clean(self):
        for index in self.indices:
            self.agent.indices.delete(index=index, ignore=(400, 404))

    def exist_indices(self, index):
        return self.agent.indices.exists(index=index)

    def add_user(self, uid):
        pass

    def remove_user(self, uid):
        pass

    def get_watched_symbol(self):
        doc = self.agent.search(index=self.ES_CONFIG_INDEX, doc_type='watch_symbols', body={
            'size': 100,
            'query': {
                'match_all': {}
            },
            'sort': [
                {'symbol': {'order': 'asc'}}
            ]
        })
        if 'status' not in doc and doc['hits']['total'] > 0:
            return [item['_source']['symbol'] for item in doc['hits']['hits']]
        else:
            return []

    def add_symbol_to_watch(self, symbol):
        self.agent.create(index=self.ES_CONFIG_INDEX, doc_type="watch_symbols", id=symbol, body={
            'symbol': symbol
        }, ignore=(409))

    def remove_symbol_to_watch(self, symbol):
        self.agent.delete(index=self.ES_CONFIG_INDEX, doc_type='watch_symbols',
                          id=symbol, ignore=(404))

    # XueQiu Comment API
    def get_last_comment_id(self, symbol):
        doc = self.agent.search(index=self.ES_INDEX, doc_type='comment', body={
            'size': 1,
            'query': {
                'match': {'content.symbol': symbol}  # not_analyzed field behaves like a term match
            },
            'sort': [
                {'content.id': {'order': 'desc'}}
            ]
        }, ignore=(400, 404))
        if 'status' not in doc and doc['hits']['total'] > 0:
            return doc['hits']['hits'][0]['_source']['content']['id']
        else:
            return 0

    def create_comment(self, doc):
        self.agent.create(index=self.ES_INDEX, doc_type="comment",
                          id=doc['content']['id'], body=doc, ignore=(409))

    def update_comment_with_author(self, id, body):
        self.agent.update(index=self.ES_INDEX, doc_type='comment',
                          id=id, body=body, ignore=(400))

    def create_reference(self, doc):
        self.agent.create(index=self.ES_REF_INDEX, doc_type='instrument',
                          id=doc['code'], body=doc, ignore=(400))
def main():
    # Connect to localhost:9200 by default.
    client = MongoClient(port=27017)
    db = client["mydatabase"]
    highest_previous_primary_key = 1
    highest_previous_primary_key2 = 1
    mycol1 = db['tweets_test']
    mycol = db['tweets_text_sentiment']
    emoji_sentiment = db['tweets_emoji_sentiment']
    es = Elasticsearch()
    es.indices.delete(index="new_tweets5", ignore=404)
    es.indices.create(
        index="new_tweets5",
        body={
            'mappings': {
                "tweet": {
                    'properties': {
                        'text': {'type': 'text'},
                        'timestamp': {'type': 'date'},
                        'country': {'type': 'text'},
                        'textSentScore': {
                            'type': 'text',
                            'fields': {
                                'raw': {
                                    'type': 'keyword'
                                }
                            }
                        },
                        'location': {'type': "geo_point"}
                    }
                },
            },
            'settings': {
                'analysis': {
                    'analyzer': {
                        'custom_english_analyzer': {
                            'type': 'english',
                            'stopwords': ['made', '_english_']
                        }
                    }
                }
            }
        },
    )
    # es.indices.delete(index="emojitweets1", ignore=404)
    # es.indices.create(
    #     index="emojitweets1",
    #     body={
    #         'mappings': {
    #             "tweetEmoji": {
    #                 'properties': {
    #                     'timestamp': {'type': 'date'},
    #                     'emoji': {'type': 'text'},
    #                     'country': {'type': 'text'},
    #                     'emojiSent': {'type': 'text'}
    #                 }
    #             }
    #         },
    #         'settings': {
    #             'analysis': {
    #                 'analyzer': {
    #                     'custom_english_analyzer': {
    #                         'type': 'english',
    #                         'stopwords': ['made', '_english_']
    #                     }
    #                 }
    #             }
    #         }
    #     },
    # )

    count = 0
    count2 = 0
    while True:
        cursor = mycol.find({}, no_cursor_timeout=True)
        for msg in cursor:
            # print(msg)
            count += 1
            current_primary_key = int(str(msg['_id'])[-6:], 16)
            if current_primary_key > highest_previous_primary_key:
                print(count)
                action = {
                    "index": "new_tweets5",
                    "type": "tweet",
                    'text': msg["text"],
                    'timestamp': msg["created_at"],
                    'country': msg["country"],
                    'textSentScore': msg['sentimentScoreText'],
                    'location': msg['location']
                }
                es.create(index="new_tweets5", doc_type="tweet", id=count, body=action)
                # print(msg["created_at"])
                highest_previous_primary_key = current_primary_key
            'died': author_result[7],
        }

        # TODO implement parent-child relationship between books and pages
        content_page = {
            '_id': str(author_result[0]) + '-' + str(row[0]),
            'book_id': author_result[0],
            'page_body': row[1],
            'volume': row[2],
            'page_number': row[3],
            'book': content
        }

        response = client.create(
            index='shamela',
            # REVIEW the id for the page needs looking at.
            id=str(author_result[0]) + '-' + str(row[0]),
            body=content_page,
            # parent='books'
            doc_type='pages'
        )
        # print response

    print "[" + str(count) + "] completed inserting '" + author_result[1] + "' into index"
    count = count + 1

conn.close()
class APIDatabase:
    def __init__(self, elastic_index='address-book', *args, **kwargs):
        # calls Elasticsearch() to connect to the database and creates the
        # index for the address book if needed
        import json
        from elasticsearch import Elasticsearch
        from elasticsearch import exceptions as es_exceptions

        # hold on to the exceptions so they can be recognized in the
        # try...except blocks later
        self.es_exceptions = es_exceptions

        # host and port information for Elasticsearch() is in a separate json file
        try:
            with open('./elastic_host_config.json') as f:
                elastic_host_info = json.load(f)
        except FileNotFoundError:
            elastic_host_info = {'host': 'localhost', 'port': 9200}

        self.database = Elasticsearch(elastic_host_info, *args, **kwargs)
        self.elastic_index = elastic_index
        # ensure the Elasticsearch index exists
        self.database.indices.create(index=elastic_index, ignore=400)

    def get_contact_by_query(self, page_size, page_num, query_string):
        # searches the data store using query_string and returns page_size
        # entries starting on page page_num
        if page_size < 0:
            return {
                'error': 'pageSize must be a nonnegative integer',
                'status': 400
            }
        elif page_num < 0:
            return {
                'error': 'page must be a nonnegative integer',
                'status': 400
            }
        try:
            result = self.database.search(index=self.elastic_index,
                                          from_=page_num,
                                          q=query_string,
                                          size=page_size)
            return [
                contact['_source']['doc'] for contact in result['hits']['hits']
            ]
        except self.es_exceptions.RequestError as err:
            return {
                'error': err.info['error']['root_cause'][0]['reason'],
                'status': err.status_code
            }

    def get_contact_by_name(self, name):
        # returns the contact with the given name
        try:
            return self.database.get_source(index=self.elastic_index, id=name)['doc']
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}

    def create_contact(self, contact_details):
        # creates a contact with the given contact_details (which includes a name)
        try:
            self.database.create(index=self.elastic_index,
                                 id=contact_details['name'],
                                 body={'doc': contact_details})
            return {'message': 'created', 'status': 200}
        except self.es_exceptions.ConflictError:
            return {'error': 'contact already exists', 'status': 409}

    def update_contact(self, name, contact_details):
        # updates a contact with the new contact_details
        try:
            result = self.database.update(
                index=self.elastic_index,
                id=name,
                body={'doc': {
                    'doc': contact_details
                }})
            return {'message': result['result'], 'status': 200}
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}

    def delete_contact(self, name):
        # deletes the contact with the given name
        try:
            self.database.delete(index=self.elastic_index, id=name)
            return {'message': 'deleted', 'status': 200}
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}
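
# --- Hypothetical usage of APIDatabase above, assuming Elasticsearch is
# reachable with the configured host settings; the contact data is made up.
db = APIDatabase(elastic_index='address-book-test')
print(db.create_contact({'name': 'Ada Lovelace', 'email': 'ada@example.com'}))
print(db.get_contact_by_name('Ada Lovelace'))
print(db.update_contact('Ada Lovelace', {'name': 'Ada Lovelace', 'phone': '555-0100'}))
print(db.delete_contact('Ada Lovelace'))
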
class ElasticSearchReporting(Report):
    """Stores report in Elasticsearch."""

    def connect(self):
        """Connect to Elasticsearch.
        @raise CuckooReportError: if unable to connect.
        """
        hosts = []
        for host in self.options.get("hosts", "127.0.0.1:9200").split(","):
            if host.strip():
                hosts.append(host.strip())

        self.index = self.options.get("index", "cuckoo")
        self.type_ = self.options.get("type", "cuckoo")

        try:
            self.es = Elasticsearch(hosts)
        except TypeError:
            raise CuckooReportError(
                "Elasticsearch connection hosts must be host:port or host"
            )
        except (ConnectionError, ConnectionTimeout) as e:
            raise CuckooReportError("Cannot connect to Elasticsearch: %s" % e)

    def do_index(self, obj):
        index = "%s-%d" % (self.index, self.task["id"])
        try:
            self.es.create(index=index, doc_type=self.type_, body=obj)
        except Exception as e:
            raise CuckooReportError(
                "Failed to save results in ElasticSearch for "
                "task #%d: %s" % (self.task["id"], e)
            )
        self.idx += 1

    def process_summary(self, results):
        """Index the behavioral summary."""
        summary = results.get("behavior", {}).get("summary")
        if summary:
            self.do_index(summary)

    def process_behavior(self, results, paginate=100):
        """Index the behavioral data."""
        for process in results.get("behavior", {}).get("processes", []):
            page, calls = 0, []
            for call in process["calls"]:
                calls.append(call)
                if len(calls) == paginate:
                    self.do_index({
                        "process": {
                            "pid": process["pid"],
                            "page": page,
                            "calls": calls,
                        },
                    })
                    page += 1
                    calls = []
            if calls:
                self.do_index({
                    "process": {
                        "pid": process["pid"],
                        "page": page,
                        "calls": calls,
                    },
                })

    def run(self, results):
        """Index the Cuckoo report into ElasticSearch.
        @param results: analysis results dictionary.
        @raise CuckooReportError: if the connection or reporting failed.
        """
        if not HAVE_ELASTIC:
            raise CuckooDependencyError(
                "Unable to import elasticsearch (install with "
                "`pip install elasticsearch`)"
            )

        self.connect()
        self.idx = 0

        # Index the summary.
        self.process_summary(results)

        # Index the API calls.
        if self.options.get("calls", True):
            self.process_behavior(results)
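
# --- Stand-alone illustration of the pagination scheme in process_behavior()
# above: buffer items and emit a page every `paginate` items, plus a final
# partial page. Pure Python, no ES required.
def paginate_calls(calls, paginate=100):
    page, buf = 0, []
    for call in calls:
        buf.append(call)
        if len(buf) == paginate:
            yield {"page": page, "calls": buf}
            page += 1
            buf = []
    if buf:  # flush the trailing partial page
        yield {"page": page, "calls": buf}

assert len(list(paginate_calls(range(250), 100))) == 3  # 100 + 100 + 50
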
""" 你的 APPID AK SK """ APP_ID = '23771318' API_KEY = '0wqWkZ0Ww50uz8hZu5G3WEgG' SECRET_KEY = 'bPmhDxHDZQZb0GzGQoHWEE9QjGYhGta6' client = AipOcr(APP_ID, API_KEY, SECRET_KEY) i = open('denggao.png', 'rb') img = i.read() message = client.basicGeneral(img) for i in message.get('words_result'): print(i.get('words')) #识别结果替换为变量 a = i.get('words') #生成uuid import datetime import string import random from uuid import * nod_uuid = lambda x: str( uuid5( NAMESPACE_X500, str(x) + str(datetime.datetime.now()) + ''.join( random.sample(string.ascii_letters + string.digits, 8)))) print(nod_uuid('')) #创建索引 print(es.create(index='a1', id=nod_uuid(''), body={"内容": a}))
import sys, json, yaml
from elasticsearch import Elasticsearch


def read_json(filename):
    f = open(filename, 'r')
    jsonData = json.load(f, "utf-8")
    text = json.dumps(jsonData)
    f.close()
    return text, jsonData


if __name__ == "__main__":
    # count, index = load_sample()
    p_text, p_json = read_json("output/one_json_time_series_patient.json")
    es = Elasticsearch()
    index = "oa"
    doc_type = "Patient"
    i = 1
    setting = yaml.load(open('elastic_search/mapping.yaml'))
    properties = setting["mappings"]["Patient"]["properties"].keys()
    # the mapping belongs to the index, not to a document: the original called
    # es.create(), which would store the mapping yaml as a regular document
    print es.indices.create(index=index, body=setting)
    for p in xrange(len(p_json)):
        # ES documents must be JSON objects, not bare arrays, so the token
        # list is wrapped in a field here
        es.index(index=index, doc_type=doc_type, id=i,
                 body={"Plan": p_json["%s" % p]["0"]["Plan"].split(" ")})
        i += 1
def splitDataset():
    es = Elasticsearch(timeout=60)

    # Find all the tweets from the trump index
    authors = es.search(index="trump", size=100000, from_=0,
                        _source_include="user",
                        body={"query": {"match_all": {}}})

    # Iterate through the authors of the tweets
    clicker = 0
    for hit in authors['hits']['hits']:
        source = hit["_source"]
        author = source["user"]

        # Find 2 tweets by the same author
        tweets = es.search(index="trump", size=2,
                           body={"query": {"term": {"user": author}}})
        count = tweets['hits']['total']
        if count >= 6:
            # Create two different tweets
            t1 = tweets['hits']['hits'][0]
            t2 = tweets['hits']['hits'][1]

            # Insert tweet 1 into the 140set index
            op1 = es.create(id=t1['_id'],
                            index="140set",
                            doc_type="doc",
                            body={
                                'author': t1['_source']['user'],
                                'message': t1['_source']['message']
                            },
                            ignore=[403, 409])

            # Insert tweet 2 (including the message of t1) into the 280set index
            op2 = es.create(id=t2['_id'],
                            index="280set",
                            doc_type="doc",
                            body={
                                'author': t2['_source']['user'],
                                'message': t1['_source']['message'] + ' ' + t2['_source']['message']
                            },
                            ignore=[403, 409])

            # Every 100 tweets generated, print a message
            clicker += 1
            if (clicker % 100) == 0:
                print("Generating tweet sets...")

# splitDataset()
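
# --- The search above asks for 100,000 hits in one request, which a default
# cluster caps at index.max_result_window (10,000). A scan/scroll sketch that
# streams all matches instead, assuming the same local "trump" index exists:
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(timeout=60)
for hit in helpers.scan(es, index="trump",
                        query={"query": {"match_all": {}}}):
    print(hit["_source"]["user"])
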