Example #1
  def PublishSamples(self, samples):
    """Publish samples to Elasticsearch service"""
    try:
      from elasticsearch import Elasticsearch
    except ImportError:
      raise ImportError('The "elasticsearch" package is required to use '
                        'the Elasticsearch publisher. Please make sure it '
                        'is installed.')

    es = Elasticsearch([self.es_uri])
    if not es.indices.exists(index=self.es_index):
      es.indices.create(index=self.es_index, body=self.mapping)
      logging.info('Create index %s and default mappings', self.es_index)
    for s in samples:
      sample = copy.deepcopy(s)
      # Make timestamp understandable by ES and human.
      sample['timestamp'] = self._FormatTimestampForElasticsearch(
          sample['timestamp']
      )
      # Keys cannot have dots for ES
      sample = self._deDotKeys(sample)
      # Add the sample to the index (es_index) under the result type (es_type),
      # using sample_uri as each ES document's unique _id.
      es.create(index=self.es_index, doc_type=self.es_type,
                id=sample['sample_uri'], body=json.dumps(sample))
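The helper self._deDotKeys is referenced above but not shown. A minimal sketch of what such a helper presumably does (the underscore replacement is an assumption, not taken from the source):

  def _deDotKeys(self, sample):
    # Hypothetical helper: Elasticsearch field names may not contain dots,
    # so replace them (here with underscores) in every key.
    return {key.replace('.', '_'): value for key, value in sample.items()}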
Example #2
class ESAlertSender:
    def __init__(self):
        self.es = None
        self.logger = logging.getLogger("ESAlertSender")

    def send_alerts(self, configuration, alerts):
        self.es = Elasticsearch([{"host": configuration["es_host"], "port": configuration["es_port"]}])
        for alert in self.flatten_alerts(alerts):
            self.insert_es(alert)

    def insert_es(self, alert):
        try:
            alert["@timestamp"] = datetime.datetime.utcnow().isoformat()
            alert["type"] = "reddalert"
            self.es.create(body=alert, id=hashlib.sha1(str(alert)).hexdigest(), index="reddalert", doc_type="reddalert")
        except Exception as e:
            self.logger.error(e)

    def flatten_alerts(self, alerts):
        for alert in alerts:
            details = alert[2]
            if isinstance(details, dict):
                base = {"rule": alert[0], "id": alert[1]}
                base.update(details)
                yield base
            else:
                yield {"rule": alert[0], "id": alert[1], "details": details}
Example #3
class TestSingleDocSigTerms(TestCase):
    def setUp(self):
        super(TestSingleDocSigTerms, self).setUp()

        self.es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port])
        self.ic = IndicesClient(self.es)
        self.index = 'single_doc_sigterms_test'
        self.doc_type = 'test-doc'
        self.field = 'text'

        if self.ic.exists(self.index):
            self.ic.delete(self.index)

        self.ic.create(self.index)
        self.es.create(index=self.index, doc_type=self.doc_type, body={self.field: 'foo ba knark foo knirk knark foo'}, id='doc_1')

    def test_tf_for_doc_id(self):
        sigterms = SingleDocSigTerms(self.es, self.index, self.doc_type, self.field, None)

        resp = dict(sigterms.tf_for_doc_id('doc_1'))
        self.assertEquals(4, len(resp))
        self.assertEquals(3, resp['foo'])
        self.assertEquals(2, resp['knark'])
        self.assertEquals(1, resp['ba'])
        self.assertEquals(1, resp['knirk'])
Example #4
def create_user_index():
    # Define our connection string
    conn_string = "host='localhost' dbname='test2' user='******' password='******'"

    # Print the connection string we will use to connect
    print "Connecting to database\n\t->%s" % (conn_string)

    # Get a connection; if a connection cannot be made an exception is raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor returns a cursor object; use it to perform queries
    cursor = conn.cursor()
    cursor.execute("select users.id, users.first_name, users.last_name, users.username, users.email from users order by users.id;")
    records = cursor.fetchall()
    # Then we connect to an Elasticsearch server
    es = Elasticsearch()
    es.indices.create(index='learnapt', ignore=400, body={"mappings" : {"type1" : {"_source" : { "enabled" : 'true' }, \
            "properties" : { \
                "user_id" : { "type" : "integer", "index" : "analyzed" }, \
                "user_name" : { "type" : "string", "index" : "analyzed" }, \
                "user_fullname" : { "type" : "string", "index" : "analyzed" }, \
                "user_email" : { "type" : "string", "index" : "analyzed" }}}}})
    old_lesson_id=-1
    for record in records:
        userid=record[0]
        username=record[3]
        full_name=str(record[1]) + ' '+str(record[2])
        email=str(record[4])
        document={'user_id':userid,'user_name':username,'user_fullname':full_name,'user_email':email}
        es.create(index='learnapt', doc_type='user', body=document, id=userid)
        print document
    print "Converted!\n "
Example #5
  def PublishSamples(self, samples):
    """Publish samples to Elasticsearch service"""
    try:
      from elasticsearch import Elasticsearch
    except ImportError:
      raise ImportError('The "elasticsearch" package is required to use '
                        'the Elasticsearch publisher. Please make sure it '
                        'is installed.')

    es = Elasticsearch([self.es_uri])
    if not es.indices.exists(index=self.es_index):
      # Choose whether to use the old or new mappings based on
      # the version of Elasticsearch that is being used.
      if int(es.info()['version']['number'].split('.')[0]) >= 5:
        es.indices.create(index=self.es_index, body=self.mapping_5_plus)
        logging.info('Create index %s and default mappings for'
                     ' elasticsearch version >= 5.0.0',
                     self.es_index)
      else:
        es.indices.create(index=self.es_index, body=self.mapping_before_5)
        logging.info('Create index %s and default mappings for'
                     ' elasticsearch version < 5.0.0',
                     self.es_index)
    for s in samples:
      sample = copy.deepcopy(s)
      # Make timestamp understandable by ES and human.
      sample['timestamp'] = self._FormatTimestampForElasticsearch(
          sample['timestamp']
      )
      # Keys cannot have dots for ES
      sample = self._deDotKeys(sample)
      # Add the sample to the index (es_index) under the result type (es_type),
      # using sample_uri as each ES document's unique _id.
      es.create(index=self.es_index, doc_type=self.es_type,
                id=sample['sample_uri'], body=json.dumps(sample))
Example #6
def create_item_index():
    # Define our connection string
    conn_string = "host='localhost' dbname='test2' user='******' password='******'"

    # Print the connection string we will use to connect
    print "Connecting to database\n\t->%s" % (conn_string)

    # Get a connection; if a connection cannot be made an exception is raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor returns a cursor object; use it to perform queries
    cursor = conn.cursor()
    cursor.execute("select items.id, items.properties, items.lesson_id, items.item_type_id, items.parent_id, itags.tag_id, tags.slug from items as items left outer join item_tags as itags on items.id=itags.item_id left outer join tags on tags.id=itags.tag_id order by items.id")
    records = cursor.fetchall()
    # Then we connect to an Elasticsearch server
    es = Elasticsearch()
    es.indices.create(index='learnapt', ignore=400, body={"mappings" : {"type1" : {"_source" : { "enabled" : 'true' }, \
            "properties" : { \
                "item_id" : { "type" : "integer", "index" : "analyzed" }, \
                "lesson_id" : { "type" : "integer", "index" : "analyzed" }, \
                "item_type_id" : { "type" : "integer", "index" : "analyzed" }, \
                "item_title" : { "type" : "string", "index" : "analyzed" }, \
                "item_link_url" : { "type" : "string", "index" : "analyzed" }, \
                "item_tags" : { "type" : "integer", "index" : "analyzed" } \
                }}}})
    old_item_id=-1
    for record in records:
        parent_id=record[4]
        item_id=record[0]
        lesson_id=record[2]
        item_type_id=record[3]
        tag=record[6]
        if record[1] is None:
            continue
        item_contents=record[1].split('", "')
        if item_id == old_item_id and tag is not None:
            document['item_tags'].append(tag.replace('-','_'))
        else:
            document={'item_id':item_id,'lesson_id':lesson_id,'item_type_id':item_type_id}
            if tag is not None:
                document['item_tags']=[tag.replace('-','_')]
        for item_content in item_contents:
            key_value=item_content.strip().split('=>')
            if len(key_value) < 2 or len(key_value) % 2 != 0:
                continue
            document['item_'+key_value[0].strip().replace('"','')]=key_value[1].strip().replace('"','')
        if item_id != old_item_id and old_item_id != -1:
#            print 'item_id being inserted '+str(old_item_id)
            es.create(index='learnapt', doc_type='item', body=old_document, id=int(old_item_id))
            try:
                print old_document['item_tags']
            except:
                pass
        old_item_id=item_id
        old_document=document
    print "Converted!\n "
Example #7
    def populate(self):
        if self.download():
            es = Elasticsearch(self.es_url)

            f = open('%s/%s' % (self.assests_dir, self.l8_metadata_filename),
                     'r')

            # Read the first line for all the headers
            headers = f.readline().split(',')

            # Read the rest of the document
            rows = f.readlines()
            added_counter = 0
            skipped_counter = 0
            for row in rows:
                fields = row.split(',')
                obj = {}
                for header in headers:
                    try:
                        obj[header.replace('\n', '')] = float(fields[
                            headers.index(header)].replace('\n', ''))
                    except ValueError:
                        obj[header.replace('\n', '')] = fields[
                            headers.index(header)].replace('\n', '')
                try:
                    if not es.exists(
                            index=self.es_main_index,
                            doc_type=self.es_main_type,
                            id=obj['sceneID']):
                        es.create(
                            index=self.es_main_index,
                            doc_type=self.es_main_type,
                            id=obj['sceneID'],
                            body=json.dumps(obj),
                            ignore=409)
                        # print('%s-%s created' % (counter, obj['sceneID']))
                        added_counter += 1

                    else:
                        skipped_counter += 1

                    print('%s added | %s skipped' % (added_counter, skipped_counter), end='\r')

                except ConnectionError:
                    print('There was a connection error. Check your Elastic' +
                          ' Search settings and make sure Elastic Search is' +
                          ' running.')
                    return False
                except:
                    print('An unexpected error occurred: %s' % (sys.exc_info()[0]))
                    return False

            print('The update is completed. %s new records were added.' %
                  added_counter)

            return True
Example #8
def index_documents(doc_type, json_path):
    es = Elasticsearch()
    json_file = open(os.path.join(root, json_path))
    documents = json.load(json_file)

    for document in documents:
        es.create(index=index_name,
                  doc_type=doc_type,
                  id=document['_id'],
                  body=document)
Example #9
def put_data_to_es(host=None, index=None, type=None, doc=None, port=80):
    es = Elasticsearch([{'host': 'search-weblog-domain-hp5lndxriluzpb74bwomzm7ci4.us-east-1.es.amazonaws.com', 'port':80}])
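    # Note: the host and port parameters of this function are not used below;
    # the client always connects to the hard-coded Amazon ES endpoint above.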

    id = uuid.uuid1().get_hex()

    print id

    es.create(index=index, doc_type=type, id=id, body=doc)

    return
Example #10
def create_form(request):
    form = StudentForm(request.POST or None)
    if form is not None:
        es = Elasticsearch()

        if form.is_valid():

            data = {'name': form.cleaned_data['name'], 'analysis': form.cleaned_data['analysis']
                    , 'rno': form.cleaned_data['rno'], 'address': form.cleaned_data['address']}
            es.create('student', doc_type='info', body=data, id=form.cleaned_data['rno'])
            form.save()
            messages.success(request, "record added")
            return HttpResponseRedirect('/student/home/')
    context = {'form': form}
    return render(request, 'create_form.html', context)
Example #11
File: ingest.py  Project: Capitains/Flint
class ElasticSearch(Endpoint):
    """ ElasticSearch Endpoint implementation

    :param url: URL of the Endpoint
    :type url: str
    :param auth: Authentification information
    :type auth: (str, str)
    :param port: Port of the endpoint
    :type port: int
    """

    def register(self):
        """ Register the endpoint with init resources
        :return: Endpoint
        """
        self.endpoint = ES(self.url, auth=self.auth, port=self.port)
        return self.endpoint

    def create(self, name, settings):
        """ Create an index

        :param name: Name of the index
        :param settings: Setting for the index
        :return: Bool
        """
        return self.endpoint.create(name, body=settings)

    def exists(self, name):
        """ Check if an index exists

        :param name: Name of the index to be created
        :return: Indication of existence as boolean
        :rtype: Bool
        """
        return self.endpoint.exists(name)
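A brief usage sketch for this wrapper (the URL, credentials, and settings below are placeholders, and the Endpoint base class is assumed to accept the url/auth/port arguments described in the docstring):

endpoint = ElasticSearch(url="http://localhost", auth=("user", "secret"), port=9200)
endpoint.register()
if not endpoint.exists("texts"):
    endpoint.create("texts", {"settings": {"number_of_shards": 1}})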
Example #12
    def test_elasticsearch(self):
        haproxy_ip = None
        for output in self.stack_info.outputs:
            if output['output_key'] == 'minion-haproxy-ip':
                haproxy_ip = output['output_value']

        if haproxy_ip is None:
            raise Exception("Unable to find IP of stack VM")

        es = Elasticsearch([haproxy_ip])
        doc = { "first_name" :  "Daniel", "last_name" : "Curran", "age" : 25, "about": "I like to compute", "interests":  [ "computers" ]}
        es.create( index='megacorp', doc_type='employee', id='1', body=doc )
        test_val = es.get(index='megacorp', doc_type='employee', id='1')
        return_doc = test_val['_source']
        self.assertTrue(test_val['found'])
        self.assertTrue(doc == return_doc)
Example #13
class ElasticSearchEventsRepository(abstract_repository.AbstractRepository):
    def __init__(self):
        super(ElasticSearchEventsRepository, self).__init__()
        self.conf = cfg.CONF.elasticsearch
        self.es = Elasticsearch(
            hosts=self.conf.hosts,
            sniff_on_start=self.conf.sniff_on_start,
            sniff_on_connection_fail=self.conf.sniff_on_connection_fail,
            sniffer_timeout=self.conf.sniffer_timeout,
            max_retries=self.conf.max_retries
        )

    def process_message(self, message):
        return utils.parse_events_message(message)

    def write_batch(self, data_points):
        for data_point in data_points:
            (project_id, timestamp, event_type, payload, dimensions) = data_point

            index = '%s-%s-%s' % (self.conf.index_name, project_id,
                                  ElasticSearchEventsRepository._normalize_timestamp(timestamp))

            body = {
                'project_id': project_id,
                '@timestamp': timestamp,
                'event_type': event_type,
                'payload': payload,
                'dimensions': dimensions
            }

            self.es.create(
                index=index,
                doc_type='event',
                body=ujson.dumps(body)
            )

    @staticmethod
    def _normalize_timestamp(timestamp):
        d = None
        if timestamp and len(timestamp) >= 10:
            try:
                d = datetime.strptime(timestamp[0:10], '%Y-%m-%d')
            except ValueError as e:
                LOG.warning("Unable to parse timestamp '%s' - %s" % (timestamp, str(e)))
        if not d:
            d = datetime.today()
        return d.strftime('%Y-%m-%d')
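For illustration, the per-day index naming produced by write_batch above (the "events" base name and the project id are placeholders standing in for conf.index_name and a real project):

suffix = ElasticSearchEventsRepository._normalize_timestamp('2015-01-01T10:00:00Z')
index = '%s-%s-%s' % ('events', 'project-42', suffix)
# index == 'events-project-42-2015-01-01'; unparseable timestamps fall back to today's date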
Example #14
def insert_id_score(infile):
    es = Elasticsearch(["localhost:9200"])
    with codecs.open(infile, 'r', 'utf-8') as infp:
        cnt=0
        for line in infp:
            if not line.strip():
                continue
            row = line.strip().split('\t')
            poiid = row[0]
            raw_cscore = int10(row[1])
            cscore = float(row[2])
            score = {}
            score['raw_cscore'] = raw_cscore
            score['cscore'] = cscore
            cnt += 1
            score['time'] = cnt
            es.create(index=indexname, doc_type=typename, id=poiid, body=score)
Example #15
class BlogspyderPipeline(object):
    def __init__(self):
      
        self.itemDic = {"pageUrl":"" ,"pageID":"","pageTitle":"","pageContent":"","pageRank":"" }
        self.es = Elasticsearch("localhost")
        self.buffer_userName = '******'
        self.buffer_pageRank = -1.0
        self.default_pageRank = 0.00000001
    def __del__(self):
        self.file.close()

    def getPageRankByUsername(self, userName):
        sql = "select _2 from pageran8 where _1 = '{0}' ".format(userName)
        con = MySQLdb.connect("brian1", "brian", "general", "csdn")
        cur = con.cursor()
        try:
            cur.execute(sql)
            (results,) = cur.fetchone()
            return results
        except:
            print "fetch pageValue by name({0}) failed".format(userName)
        finally:
            if con:
                con.close()
      
    def process_item(self, item, spider):
        self.itemDic["pageUrl"] = item["pageUrl"]
        self.itemDic["pageID"] = item["pageID"]
        self.itemDic["pageTitle"] = item["pageTitle"].encode("utf-8")
        self.itemDic["pageContent"] = " ".join(jieba.cut(item["pageContent"].encode("utf-8")))
        # pagerank info 
        userName = item["pageUrl"].split('/')[3]     
        if userName == self.buffer_userName:            
            self.itemDic["pageRank"] = self.buffer_pageRank
        else:
            pr = self.getPageRankByUsername(userName)           
            self.itemDic["pageRank"] =  pr if pr != ''  else self.default_pageRank
            self.buffer_userName = userName
            self.buffer_pageRank = pr       

        self.es.create(index="blog", doc_type="csdn",body=self.itemDic)
        return item
Example #16
class Outputer(BaseOutputer):
    '''
    Send output to elasticsearch
    '''
    def __init__(self,
                 uri,
                 index,
                 doc_type,
                 *args,
                 **kwargs):
        super(Outputer, self).__init__(*args, **kwargs)
        self.es = Elasticsearch([uri])  # pylint: disable=C0103
        self.index = index
        self.doc_type = doc_type
        self.es.indices.create(index=self.index, ignore=400)

    def output(self, event):
        self.es.create(index=self.index, doc_type=self.doc_type, body=event)

    def shutdown(self):
        self.es.indices.flush(self.index)
Example #17
def main():
    es = Elasticsearch()
    # query =
    dashboards = {}

    for hit in es.search(index="kibana-int", doc_type="dashboard", size=1000)["hits"]["hits"]:
        data = hit["_source"]
        dashboards[hit["_id"]] = hit["_source"]


    for id_, data in dashboards.items():
        dashboard = json.loads(data["dashboard"])

        # Here the modification takes place
        if dashboard["index"]["pattern"] == ORIG_INDEX_PATTERN:
            dashboard["index"]["pattern"] = NEW_INDEX_PATTERN

        dashboards[id_]["dashboard"] = json.dumps(dashboard)

    for id_, data in dashboards.items():
        es.delete(index="kibana-int", doc_type="dashboard", id=id_)
        es.create(index="kibana-int", doc_type="dashboard", id=id_, body=data)
Example #18
def push():
    host = os.environ.get('ELASTICSEARCH_HOST', 'localhost')
    connection = Elasticsearch([host])

    # Delete old markers or do initial setup
    try:
        print(connection.delete_by_query(index=[INDEX], doc_type=DOC_TYPE, q='*'))
    except NotFoundError:
        set_mapping()

    if True:
        # real  0m9.839s
        bulk(connection, get_bulk_ready_data())
    else:
        # real  0m30.341s
        for row in get_data():
            connection.create(
                index=INDEX,
                doc_type=DOC_TYPE,
                body=row,
                id=row['atlas_number'],
            )
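The helpers get_data() and get_bulk_ready_data() are not shown in this snippet. A hedged sketch of what the bulk-ready generator presumably yields, using the action format accepted by elasticsearch.helpers.bulk:

def get_bulk_ready_data():
    # Hypothetical implementation: wrap each row from get_data() in a bulk action.
    for row in get_data():
        yield {
            '_index': INDEX,
            '_type': DOC_TYPE,
            '_id': row['atlas_number'],
            '_source': row,
        }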
Example #19
class DataStore:
    def __init__(
        self, host, port, username=None, password=None, use_ssl=False, default_index=None, default_doctype=None
    ):
        self.index = default_index
        self.doc_type = default_doctype
        if username and password:
            self.es_connection = Elasticsearch(
                host=host, port=port, http_auth=username + ":" + password, use_ssl=use_ssl
            )
        else:
            self.es_connection = Elasticsearch(host=host, port=port, use_ssl=use_ssl)
        if not self.es_connection.ping():
            raise DataStoreException("Connection to ElasticSearch failed.")

    def store(self, body):
        try:
            self.es_connection.create(
                body=body, id=hashlib.sha1(str(body)).hexdigest(), index=self.index, doc_type=self.doc_type
            )
        except ElasticsearchException as e:
            raise DataStoreException("Exception while storing data in Elastic Search: " + str(e))
Example #20
def sns_handler(request):
	
	messageType = request.META['HTTP_X_AMZ_SNS_MESSAGE_TYPE']
	parsed_body = json.loads(request.body)
	if messageType == "SubscriptionConfirmation":
		url = parsed_body["SubscribeURL"]
		serialized_data = urllib2.urlopen(url).read()
	elif messageType == "Notification":
		message = parsed_body["Message"]
		j_msg = json.loads(message)
		print (type(j_msg['coordinates']))
		print (j_msg['coordinates'])
		j_msg['coordinates'] = j_msg['coordinates']['coordinates']
		print(j_msg)
		message = str(json.dumps(j_msg))
		print(message)

		pusher_client = Pusher(
	  		app_id='xxx',
	  		key='xxx',
	  		secret='xxx',
	  		ssl=True
		)
		pusher_client.trigger('test_channel', 'my_event', {'message': message})
		
		es = Elasticsearch(
  			[
  			'xxx'
			  ],
			  use_ssl=True,
			  verify_certs=True,
			  connection_class = RequestsHttpConnection
		)
		es.create(index="tweets", doc_type="tweet", body=j_msg)
  			
	return HttpResponse('', status=200)
Example #21
def create_test_data():
    # Add some test data to Elasticsearch.
    es = Elasticsearch(es_url)
    es.indices.delete(index='companiontest', ignore=[404])
    es.indices.create(index='companiontest',
                      body={
                          'index': {
                              'number_of_shards': 1,
                              'number_of_replicas': 0
                          }
                      })

    # Create 3 "simple" doc types and 1 "advanced"
    es.create(index='companiontest',
              doc_type='simple',
              id='foo',
              body={
                  'timestamp': datetime.datetime(2015, 1, 1),
                  'id': 'foo'
              })
    es.create(index='companiontest',
              doc_type='simple',
              id='bar',
              body={
                  'timestamp': datetime.datetime(2015, 1, 2),
                  'id': 'bar'
              })
    es.create(index='companiontest',
              doc_type='simple',
              id='baz',
              body={
                  'timestamp': datetime.datetime(2015, 1, 3),
                  'id': 'baz'
              })

    es.create(index='companiontest',
              doc_type='advanced',
              id='foo',
              body={
                  'timestamp': datetime.datetime(2015, 1, 1),
                  'id': 'foo'
              })

    es.indices.refresh(index='companiontest')
Example #22
class ElasticsearchService(object):
    def __init__(self, host, port):
        self._es = Elasticsearch([{'host': host, 'port': port}])

    def search(self, *args, **kwargs):
        return self._es.search(*args, **kwargs)

    def create(self, *args, **kwargs):
        return self._es.create(*args, **kwargs)

    def get(self, *args, **kwargs):
        return self._es.get(*args, **kwargs)

    def exists(self, *args, **kwargs):
        return self._es.exists(*args, **kwargs)

    def msearch(self, *args, **kwargs):
        return self._es.msearch(*args, **kwargs)
Example #23
class ElasticSearchHandler:
    def __init__(self, host=None):
        if not host:
            host = os.getenv("ES_HOST")
        self.es_handle = Elasticsearch(hosts=host)

    def check_index(self, index):
        return self.es_handle.indices.exists(index)

    def create_index(self, index):
        if not self.check_index(index):
            self.es_handle.indices.create(index)

    def add_to_es(self, index, document_type, body):
        return self.es_handle.create(index=index, doc_type=document_type, body=body)

    def get_from_es(self, index, id):
        return self.es_handle.get(index, id=id)


    def delete_by_id(self, index, document_type, id):
        self.es_handle.delete(index, document_type, id)
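A short usage sketch for this handler (index and document values are placeholders; when no host is passed, ES_HOST is read from the environment):

handler = ElasticSearchHandler(host="localhost:9200")
handler.create_index("notes")
resp = handler.add_to_es("notes", "note", {"text": "hello", "tags": ["demo"]})
doc = handler.get_from_es("notes", resp["_id"])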
Example #24
def update_from_ldap(server, username, password, schema, pull):
    pid = os.getpid()

    print >> sys.stderr, '[%5d] Acquire lock...' % pid

    lock = filelock.FileLock("/tmp/ldap_update.lock")

    try:
        with lock.acquire(timeout=10):
            print >> sys.stderr, '[%5d] Got lock, now running ldap update...' % pid
            print >> sys.stderr, '[%5d] Server: %s, Username: %s, Schema: %s' % (
                pid, server, username, schema)

            if pull:
                os.environ['LDAP_USERNAME'] = username
                os.environ['LDAP_PASSWORD'] = password
                os.environ['LDAP_SERVER'] = server
                os.environ['LDAP_BASE_DN'] = schema

                command = '/usr/bin/python /app/scripts/ldapdump.py'

                print >> sys.stderr, '[%5d] Execute: %s' % (pid, command)

                ret = os.system(command)

                os.environ['LDAP_USERNAME'] = ''
                os.environ['LDAP_PASSWORD'] = ''
                os.environ['LDAP_SERVER'] = ''
                os.environ['LDAP_BASE_DN'] = ''

                if ret != 0:
                    print >> sys.stderr, '[%5d] Dump failed, aborting' % (pid)
                    return None

            command = '/usr/bin/python /app/scripts/ldapmunge.py'
            print >> sys.stderr, '[%5d] Execute: %s' % (pid, command)
            if 0 != os.system(command):
                print >> sys.stderr, '[%5d] Munge failed, aborting' % (pid)
                return None

            records = json.load(open(USER_JSON_FILENAME))
            total_records = len(records)

            es = Elasticsearch("http://*****:*****@/app/scripts/index.json"'''
            if 0 != os.system(command):
                print >> sys.stderr, '[%5d] Upload of index failed, aborting' % (
                    pid)
                return None

            print >> sys.stderr, "[%5d] Uploading %d indices to elasticsearch..." % (
                pid, total_records)
            for record in records:
                username = record['username']
                es.create(index=INDEX_NAME,
                          doc_type=DOC_TYPE,
                          body=record,
                          id=username)

            print >> sys.stderr, '[%5d] Done.' % pid
    except filelock.Timeout:
        print >> sys.stderr, '[%5d] Failed to acquire lock, skipping task.' % pid

    return None
Example #25
class IndexerUtils:
    add_template = """
        if (ctx._source.lastin.indexOf({grp}) < 0) {{
            ctx._source.lastin.add({grp});
        }}
        if (ctx._source.groups.indexOf({grp}) < 0) {{
          ctx._source.groups.add({grp});
        }}
        """

    del_template = """
        if (ctx._source.lastin.indexOf({grp}) >= 0) {{
          ctx._source.lastin.remove(ctx._source.lastin.indexOf({grp}));
        }}
        if (ctx._source.groups.indexOf({grp}) >= 0) {{
          ctx._source.groups.remove(ctx._source.groups.indexOf({grp}));
        }}
        """

    def __init__(self, config):
        self.log = logging.getLogger('indexrunner')
        self.ws = WorkspaceAdminUtil(config)
        self.elasticsearch = Elasticsearch([config['elastic-host']])
        self.esbase = config['elastic-base']
        mapfile = config.get('mapping-file')
        self.log.info("Mapping File: %s" % (mapfile))
        self.mapping = self._read_mapfile(mapfile)

        if 'workspace-admin-token' in config:
            token = config['workspace-admin-token']
        else:
            token = os.environ.get('KB_AUTH_TOKEN')
        self.method_runner = MethodRunner(config, token=token)
        self.ep = EventProducer(config)
        # TODO: access and data specs are not used?
        with open('specs/mapping.yml') as f:
            self.mapping_spec = yaml.load(f)

    def _read_mapfile(self, mapfile):
        with open(mapfile) as f:
            d = f.read()
        mapping = yaml.load(d)['types']
        for type in mapping.keys():
            for index in mapping[type]:
                name = index['index_name']
                index['index_name'] = '%s.%s' % (self.esbase, name)
        return mapping

    def process_event(self, evt):
        """
        Process a single workspace or indexer event
        """
        etype = evt['evtype']
        ws = evt['wsid']
        if evt['ver']:
            evt['upa'] = '%d/%s/%d' % (evt['wsid'], evt['objid'], evt['ver'])
        if etype in ['NEW_VERSION', 'NEW_ALL_VERSIONS']:
            self.new_object_version(evt)
        elif 'PUBLISH' in etype:
            self.publish(evt['wsid'])
        elif etype.startswith('DELETE_'):
            self.delete(evt)
        elif etype == 'COPY_ACCESS_GROUP':
            self._index_workspace(ws)
        elif etype == 'RENAME_ALL_VERSIONS':
            self.log.warning("Warning rename not implemented.")
        elif etype in ['REINDEX_WORKSPACE']:
            # Pseudo event
            self._index_workspace(ws)
        else:
            self.log.error("Can't process evtype " + evt['evtype'])
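    # For illustration only (an assumption, not from the source): the event dicts
    # handled above are expected to carry at least keys such as
    #   {'evtype': 'NEW_VERSION', 'wsid': 42, 'objid': '7', 'ver': 3,
    #    'objtype': 'SomeModule.SomeType', 'objtypever': '1'}
    # with 'upa' derived above as "wsid/objid/ver" whenever 'ver' is set.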

    def _index_workspace(self, wsid):
        """
        List the workspace and generate an index event for each object.
        """
        min = 0
        while True:
            objs = self.ws.list_objects({
                'ids': [wsid],
                'minObjectID': min,
                'limit': _MAX_LIST
            })
            self.ep.index_objects(objs)
            if len(objs) <= _MAX_LIST:
                break
            min = objs[-1][0] + 1

    def _create_obj_rec(self, upa):
        (wsid, objid, vers) = self._split_upa(upa)
        req = {'objects': [{'ref': upa}], 'no_data': 1}
        obj = self.ws.get_objects2(req)['data'][0]
        info = obj['info']
        (otype, over) = info[2].split('-')
        fmt = "%Y-%m-%dT%H:%M:%S%z"
        ts = int(datetime.datetime.strptime(info[3], fmt).timestamp() * 1000)
        wsinfo = self._get_ws_info(wsid)
        # Don't index temporary narratives
        if wsinfo['temp']:
            return None

        prov = self._get_prov(obj)
        # TODO stags, copier, prv_cmt, time

        rec = {
            "guid": f"WS:{upa}",
            "otype": None,
            "otypever": 999,
            "stags": [],
            "oname": info[1],
            "creator": info[5],
            "copier": None,
            "prv_mod": prov['prv_mod'],
            "prv_meth": prov['prv_meth'],
            "prv_ver": prov['prv_ver'],
            "prv_cmt": None,
            "md5": info[8],
            "timestamp": ts,
            "prefix": "WS:%d/%d" % (wsid, objid),
            "str_cde": "WS",
            "accgrp": wsid,
            "version": vers,
            "islast": False,
            "public": wsinfo['public'],
            "shared": wsinfo['shared'],
            "ojson": "{}",
            "pjson": None
        }

        return rec

    # TODO: should we just add a file handler for this?
    def _log_error(self, event, index, err):
        mes = {'event': event, 'index': index, 'error': str(type(err))}
        with open('error.log', 'a') as f:
            f.write(json.dumps(mes))
            f.write('\n')

    def _access_rec(self, wsid, objid, vers, public=False):
        rec = {
            "extpub": [],
            "groups": [-2, wsid],
            "lastin": [-2, wsid],
            "pguid": f"WS:{wsid}/{objid}/{vers}",
            "prefix": f"WS:{wsid}/{objid}",
            "version": vers
        }
        if public:
            rec['lastin'].append(-1)
            rec['groups'].append(-1)
        # type": "access"
        return rec

    def _get_id(self, upa):
        """
        Return the elastic id
        """
        if not re.match(r'^\d+/\d+/\d+$', upa):
            raise ValueError(f"'{upa}' is not an upa")
        return f"WS:{upa.replace('/', ':')}"

    def _get_prov(self, obj):
        ret = {
            "prv_mod": None,
            "prv_meth": None,
            "prv_ver": None,
            "prv_cmt": None,
        }
        if 'provenance' not in obj or len(obj['provenance']) == 0:
            return ret
        prov = obj['provenance'][0]
        if 'service' in prov:
            ret['prv_mod'] = prov['service']

        if 'method' in prov:
            ret['prv_meth'] = prov['method']
        if 'script' in prov:
            ret['prv_mod'] = 'legacy_transform'
            ret['prv_meth'] = prov['script']

        if 'service_ver' in prov:
            ret['prv_ver'] = prov['service_ver']
        elif 'script_ver' in prov:
            ret['prv_ver'] = prov['script_ver']

        if 'description' in prov:
            ret['prv_cmt'] = prov['description']
        return ret

    def _put_es_data_record(self, index, upa, doc):
        """
        Add an ES data record.
        Only works if the object hasn't been indexed before.  Will throw an
        error if it has
        """
        eid = self._get_id(upa)
        res = self.elasticsearch.create(index=index,
                                        parent=eid,
                                        doc_type='data',
                                        id=eid,
                                        routing=eid,
                                        body=doc,
                                        refresh=True)
        return res

    def _get_ws_info(self, wsid):
        info = self.ws.get_workspace_info({'id': wsid})
        meta = info[8]
        # Don't index temporary narratives
        temp = (meta.get('is_temporary') == 'true')
        public = (info[6] != 'n')

        # TODO
        shared = False

        return {
            'wsid': wsid,
            'info': info,
            'meta': meta,
            'temp': temp,
            'public': public,
            'shared': shared
        }

    def publish(self, wsid):
        """This updates the visibility of objects when a workspace is made public"""
        # Find each index
        wsinfo = self._get_ws_info(wsid)
        public = wsinfo['public']

        if public:
            script = self.add_template.format(grp="-1")
        else:
            script = self.del_template.format(grp="-1")

        aq = {
            "query": {
                "prefix": {
                    "prefix": f"WS:{wsid:d}/"
                }
            },
            "script": {
                "source": script
            }
        }

        filt = {
            "bool": {
                "filter": [{
                    "term": {
                        "public": not public
                    }
                }, {
                    "term": {
                        "accgrp": wsid
                    }
                }]
            }
        }
        publics = "false"
        if public:
            publics = "true"
        dq = {
            "query": filt,
            "script": {
                "source": f"ctx._source.public={publics}"
            }
        }
        active_indexes = self._get_all_active_indexes()
        for index in active_indexes:
            self.elasticsearch.update_by_query(index=index,
                                               doc_type='access',
                                               body=aq,
                                               ignore=[400, 404],
                                               refresh=True)
            self.elasticsearch.update_by_query(index=index,
                                               doc_type='data',
                                               body=dq,
                                               ignore=[400, 404],
                                               refresh=True)

    def _get_all_active_indexes(self):
        indexes = (index['index_name'] for oindex in self.mapping
                   for index in self.mapping[oindex])
        return self.elasticsearch.indices.get(','.join(indexes),
                                              ignore_unavailable=True)

    def delete(self, event):
        # Find each index
        id = self._get_id(event['upa'])
        active_indexes = self._get_all_active_indexes()
        q = {'query': {'parent_id': {'type': 'data', 'id': id}}}
        for index in active_indexes:
            self.elasticsearch.delete_by_query(index=index,
                                               doc_type='data',
                                               routing=id,
                                               body=q,
                                               ignore=[400, 404],
                                               refresh=True)
            self.elasticsearch.delete(index=index,
                                      doc_type='access',
                                      id=id,
                                      ignore=404,
                                      refresh=True)

    def _update_es_access(self, index, wsid, objid, vers, upa):
        # Should pass a wsid but just in case...
        wsinfo = self._get_ws_info(wsid)
        if wsinfo['temp']:
            return None
        public = wsinfo['public']
        doc = self._access_rec(wsid, objid, vers, public=public)
        eid = self._get_id(upa)
        res = self.elasticsearch.index(index=index,
                                       doc_type='access',
                                       id=eid,
                                       body=doc,
                                       refresh=True)
        return res

    def _split_upa(self, upa):
        return [int(x) for x in upa.split('/')]

    def _get_indexes(self, otype):
        pieces = otype.split('.')
        if not pieces:
            raise RuntimeError(f"Invalid workspace type: {otype}")
        generic = pieces[0] + ".*"
        if otype in self.mapping:
            return self.mapping[otype]
        elif generic in self.mapping:
            return self.mapping[generic]
        return self.mapping['Other']

    def _ensure_mapping_exists(self, oindex, objschema):
        """Ensures a mapping exists in ES for 'index_name'"""
        index = oindex['index_name']
        res = self.elasticsearch.indices.exists(index=index)
        if not res:
            schema = self.mapping_spec
            if oindex.get('raw'):
                schema = {'mappings': {'data': {'properties': objschema}}}
            elif objschema is not None:
                schema['mappings']['data']['properties']['key'] = \
                    {'properties': objschema}
            self.elasticsearch.indices.create(index=index, body=schema)

    def _run_module(self, oindex, upa):
        params = {'upa': upa}
        (module, method) = oindex['index_method'].split('.')
        resp = self.method_runner.run(module, method, params)[0]
        self.method_runner.cleanup()
        return resp

    def _update_islast(self, index, wsid, objid, vers):
        prefix = f"WS:{wsid:d}/{objid}"
        doc = {
            "query": {
                "bool": {
                    "filter": [{
                        "term": {
                            "prefix": prefix
                        }
                    }]
                }
            },
            "script": {
                "source":
                "ctx._source.islast = (ctx._source.version == params.lastver)",
                "params": {
                    "lastver": int(vers)
                }
            }
        }
        self.elasticsearch.update_by_query(index, 'data', doc, refresh=True)

    def _new_raw_version_index(self, event, oindex):
        """This handles indexing an object where the callout is expected to
        return an entire ElasticSearch reccord for storage"""
        upa = event['upa']
        index = oindex['index_name']
        eid = self._get_id(upa)
        res = self.elasticsearch.get(index=index,
                                     doc_type='data',
                                     id=eid,
                                     ignore=404)
        if res.get('status') != 404 and res['found']:
            self.log.info("%s already indexed in %s" % (eid, index))
            return

        resp = self._run_module(oindex, upa)
        if resp.get('data') is None:
            raise ValueError(
                f"{oindex['index_method']} did not return 'data' for {event}")
        self._ensure_mapping_exists(oindex, resp['schema'])
        doc = resp['data']
        self.elasticsearch.create(index=index,
                                  doc_type='data',
                                  id=eid,
                                  body=doc,
                                  refresh=True)

    def _new_object_version_index(self, event, oindex):
        """
        This handles indexing a specific object version.
        The callout should return a structure with a 'data' key.
        """
        wsid = event['wsid']
        objid = event['objid']
        vers = event['ver']
        upa = event['upa']
        index = oindex['index_name']

        eid = self._get_id(upa)
        res = self.elasticsearch.get(index=index,
                                     doc_type='access',
                                     id=eid,
                                     ignore=404)
        if res.get('status') != 404 and res['found']:
            self.log.info("%s already indexed in %s" % (eid, index))
            return

        doc = self._create_obj_rec(upa)
        extra = {}
        if 'default_indexer' not in oindex['index_method']:
            extra = self._run_module(oindex, upa)
        self._ensure_mapping_exists(oindex, extra.get('schema'))

        if extra.get('data') is not None:
            doc['key'] = extra['data']
            if 'objdata' in extra:
                od = doc['key'].pop('objdata')
                doc['ojson'] = json.dumps(od)
            else:
                doc['ojson'] = json.dumps(doc['key'])

        else:
            self.log.warning(
                f"{oindex['index_method']} did not return 'data' for {event}")
        self._update_es_access(index, wsid, objid, vers, upa)
        self._put_es_data_record(index, upa, doc)
        oid = f'{wsid:d}/{objid}'
        info = self.ws.get_object_info3({'objects': [{
            'ref': oid
        }]})['infos'][0]
        if info[4] == vers:
            self._update_islast(index, wsid, objid, vers)

    def _new_object_version_multi_index(self, event, oindex):
        """
        This handles indexing multiple sub-objects for a specific object version.
        The callout should return a structure with a 'features' that
        is a list of dictionary keys
        """
        wsid = event['wsid']
        objid = event['objid']
        vers = event['ver']
        upa = event['upa']
        index = oindex['index_name']

        # Check if any exists
        eid = self._get_id(upa)
        res = self.elasticsearch.get(index=index,
                                     doc_type='access',
                                     id=eid,
                                     ignore=404)
        if res.get('status') != 404 and res['found']:
            self.log.info(f"{eid} already indexed in {index}")
            return
        wsinfo = self._get_ws_info(wsid)
        if wsinfo['temp']:
            return None
        public = wsinfo['public']
        adoc = self._access_rec(wsid, objid, vers, public=public)

        pdoc = self._create_obj_rec(upa)
        extra = self._run_module(oindex, upa)
        parent = extra['parent']
        self._ensure_mapping_exists(oindex, extra['schema'])
        pdoc['pjson'] = json.dumps(parent)
        pguid = self._get_id(upa)
        recs = []
        bdoc = []
        ct = 0
        for row in extra['documents']:
            doc = pdoc.copy()
            doc['key'] = {**parent, **row}

            guid = row.pop('guid')
            if not guid.startswith('WS:'):
                guid = "WS:" + guid
            # Tear apart the name so we get just the
            # last portion
            ele = guid.replace('/', ':').split(':')
            # build the feature ID from everything past the UPA
            fid = '/'.join(ele[4:])
            guid = 'WS:%s:feature/%s' % (upa, fid)
            doc['guid'] = guid
            if 'objdata' in doc['key']:
                od = doc['key'].pop('objdata')
                doc['ojson'] = json.dumps(od)
            else:
                doc['ojson'] = json.dumps(doc['key'])
            rec = {
                '_id': guid,
                '_source': doc,
                '_index': index,
                '_parent': pguid,
                '_type': 'data'
            }
            bdoc.append(rec)
            ct += 1
            if ct > BULK_MAX:
                bulk(self.elasticsearch, bdoc)
                bdoc = []
                ct = 0

        if ct > 0:
            bulk(self.elasticsearch, bdoc)

        self._update_es_access(index, wsid, objid, vers, upa)
        oid = f'{wsid:d}/{objid}'
        info = self.ws.get_object_info3({'objects': [{
            'ref': oid
        }]})['infos'][0]
        if info[4] == vers:
            self._update_islast(index, wsid, objid, info[4])

    def new_object_version(self, event):
        # For a NEW ALL VERSION we will just index the latest versions
        #
        if event['evtype'] == 'NEW_ALL_VERSIONS':
            upa = f"{event['wsid']}/{event['objid']}"
            info = self.ws.get_object_info3({'objects': [{
                'ref': upa
            }]})['infos'][0]
            vers = info[4]
            event['ver'] = vers
            (event['objtype'], event['objtypever']) = info[2].split('-')
            event['upa'] = f'{upa}/{vers}'

        indexes = self._get_indexes(event['objtype'])
        for oindex in indexes:
            try:
                if oindex.get('multi'):
                    self._new_object_version_multi_index(event, oindex)
                elif oindex.get('raw'):
                    self._new_raw_version_index(event, oindex)
                else:
                    self._new_object_version_index(event, oindex)
            except Exception as e:
                self.log.error("Failed for index")
                self._log_error(event, oindex, e)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                self.log.info("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
                self.log.info("*** print_exception:")
                traceback.print_exception(exc_type,
                                          exc_value,
                                          exc_traceback,
                                          limit=2,
                                          file=sys.stdout)
        self.log.info("Completed new object version")
        return True
Example #26
	#print(config)
	try:
		connection = Elasticsearch( config['elasticsearch_hosts'],
						# sniff before doing anything 
						sniff_on_start=True,
						# refresh nodes after a node fails to respond
						sniff_on_connection_fail=True,
						# and also every 60 seconds
						sniffer_timeout=60)
		
		

		connection.create(index=config['default_index'],doc_type='group',id='2',
							body={ 
							"gid": 234, 
							"owner": "bemineni", 
							"name": "Sammy", 
							"grp_hash": "456678", 
							"description": "Sammy group"
							})
		data = connection.get_source(index=config['default_index'],doc_type="group",id='2')
		print(json.dumps(data,indent=4,sort_keys=True))

		
	except Exception as e:
		print("Failed to add item")
		print("Test failed")
		traceback.print_exc()
		ret = 1
	finally:
		print(test_name + " Test complete")
Example #27
], port='9200', timeout=25)

# Delete the index
result = es.indices.delete(index='actest', ignore=[400, 404])
print(result)

# Create the index
result = es.indices.create(index='actest', ignore=400)
print(result)

# Insert data
data = {
    'title': '美国留给伊拉克的是个烂摊子吗',
    'url': 'http://view.news.qq.com/zt2011/usa_iraq/index.htm'
}
result = es.create(index='actest', doc_type='politics', id=1, body=data)
# result = es.index(index='actest', doc_type='politics', body=data)  # the index method does not require an id to create a document
print(result)

# Update the data
data = {
    'title': '美国留给伊拉克的是个烂摊子吗',
    'url': 'http://view.news.qq.com/zt2011/usa_iraq/index.htm',
    'date': '2011-12-16',
    'status': 0
}
result = es.update(index='actest',
                   doc_type='politics',
                   body={'doc': data},
                   id=1)  # when updating with .update, the body must be wrapped in a 'doc' key
# result = es.index(index='actest', doc_type='politics', body=data, id=1)
Example #28
import json
from elasticsearch import Elasticsearch
import os

with open('rakuten_books.json', 'r') as f:
    rakuten_books = json.load(f)

es = Elasticsearch(http_auth=('elastic', os.environ.get('ES_PW')))

for rakuten_book in rakuten_books:
    rakuten_book_item = rakuten_book['Item']
    try:
        es.create(index='book',
                  id=rakuten_book_item['isbn'],
                  body=rakuten_book_item)
    except Exception:
        # Ignore failures, e.g. when a book with this ISBN has already been indexed.
        pass
    print('{} created'.format(rakuten_book_item['title']))
Example #29
    def getItem(self):
        # Get the list of negative keywords
        negative = NegativeKeyWords()
        negKwList = negative.getNegativeKeyWordsList()
        # Get the ES search engine connection
        es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
        url_list = self.getPageInfoUrl()  # Get the URLs of all info pages
        MysqlHelper.excuteUpdate("tb_spider", {"Fstate": 1},
                                 "Fid={}".format(self.spiderId))
        for url in url_list:
            spider = MysqlHelper.excuteFindOne(
                "select Fnum from tb_spider where Fid={}".format(
                    self.spiderId))
            num = int(spider["Fnum"])
            num += 1
            # Page source
            text = self.getHtml(url).text
            # Create a selector and use XPath to match the required data
            selector = etree.HTML(text, parser=None, base_url=None)
            # Title
            title = selector.xpath(u'//span[@class="s_title"]/span/text()')
            # Post author
            p_Fauthor = u"<a href=\".*?\" target=\"_blank\" class=\"js-vip-check\" uid=\".*?\" uname=\".*?\">([\s\S]*?)</a>"
            author = re.findall(p_Fauthor, text)
            # Post time
            p_date = u"<span>时间:([\s\S]*?) </span>"
            date = re.findall(p_date, text)
            # Content
            p_content = u"<div class=\"bbs-content clearfix\">([\s\S]*?)</div>"
            content = re.findall(p_content, text)

            data = {}
            data["Ftitle"] = title[0] if title else ""
            data["Fdate"] = date[0] if date else "Null"
            data["Fcontent"] = content[0] if content else ""
            data["Flink"] = url
            data["Ftype"] = "论坛"
            data["Fsource"] = "天涯社区"
            data["FcreateTime"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime(time.time()))
            data["Fauthor"] = author[0] if author else ""

            # Determine whether this is negative information
            def isNegative(key):
                if re.findall(key, data["Fcontent"]):
                    return True
                else:
                    return False

            isNegKeyWor = map(isNegative, negKwList)
            # 0 = positive, 1 = negative
            if True in isNegKeyWor:
                data["isNegative"] = 1
            else:
                data["isNegative"] = 0
            try:
                n = random.randint(1, 9999)
                b = random.randint(1, 9999)
                id = n + b
                # Insert data into ES. index is like a MySQL database name, doc_type like a MySQL table name
                istrue = es.create(index="scdel_index",
                                   id=id,
                                   doc_type="tb_data",
                                   body=data)["created"]
                print istrue
            except Exception as ex:
                print ex
                istrue = False
            if istrue:
                MysqlHelper.excuteUpdate("tb_spider", {"Fnum": num},
                                         "Fid={}".format(self.spiderId))
            time.sleep(1)
Example #30
from kafka import KafkaConsumer
from json import loads
import pandas as pd
from elasticsearch import Elasticsearch

i=2200
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
consumer = KafkaConsumer(
    'twitter_stream',
     bootstrap_servers=['localhost:9092'],
     auto_offset_reset='earliest',
     enable_auto_commit=False,
     group_id='ElasticConsumerE',
     value_deserializer=lambda x: loads(x.decode('utf-8')))

for message in consumer:
    message=message.value
    # message=json.dumps()
    # print(message['user']['screen_name'])
    es.create(index='idx_twp_con', doc_type='twitter_twp_con', id=i, body=message)
    i = i+1
    print('{} indexed.'.format(i))
Example #31
 def crawler(self):
     # es = Elasticsearch(['35.227.82.63:9200'])
     es = Elasticsearch()
     f = open("url.txt", 'w')
     if es.ping():
         r = redis.StrictRedis(host='35.227.82.63', port=6379)
         seen = []
         url_queue = queue.Queue()
         url_queue.put('http://en.wikipedia.org/wiki/Catholic_Church')
         url_queue.put('http://en.wikipedia.org/wiki/Christianity')
         url_queue.put('http://en.wikipedia.org/wiki/College_of_Cardinals')
         url_queue.put(
             'https://en.wikipedia.org/wiki/Hierarchy_of_the_Catholic_Church'
         )
         url_queue.put('https://en.wikipedia.org/wiki/Papal_legate')
         url_queue.put(
             'http://www.bible.ca/catholic-church-hierarchy-organization.htm'
         )
         url_queue.put(
             'https://www.britannica.com/topic/Roman-Catholicism/Structure-of-the-church'
         )
         seen.append('http://en.wikipedia.org/wiki/Christianity')
         seen.append('http://en.wikipedia.org/wiki/College_of_Cardinals')
         seen.append(
             'https://en.wikipedia.org/wiki/Hierarchy_of_the_Catholic_Church'
         )
         seen.append('https://en.wikipedia.org/wiki/Papal_legate')
         seen.append(
             'http://www.bible.ca/catholic-church-hierarchy-organization.htm'
         )
         seen.append(
             'https://www.britannica.com/topic/Roman-Catholicism/Structure-of-the-church'
         )
         count = 0
         tempurl = []
         while not url_queue.empty() and count <= 1000:
             url = url_queue.get()
             seedUrl = self.getSeedUrl(url)
             if not self.robotCheck(url, seedUrl):
                 continue
             response = requests.get(url)
             Httptype = (response.headers['Content-Type'])
             if "text/html" in Httptype:
                 count += 1
                 headers = dict(response.headers)
                 soup = BeautifulSoup(response.content, 'html.parser')
                 seen.append(url)
                 r.set(url, "visited")
                 # data = []
                 out_links = []
                 print("init passed")
                 # for p in soup.find_all("p"):
                 #     data.append(self.getCleanText(str(p)))
                 data = self.getCleanText(str(soup.get_text()))
                 rawHtml = str(soup.prettify())
                 for a in soup.find_all('a'):
                     temp = a.attrs
                     if 'title' in temp.keys():
                         score = self.checkKeywords(temp['title'])
                         # if self.checkKeywords(temp['title']):
                         if score > 0:
                             link = temp['href']
                             link = self.canonicalizeUrl(link, seedUrl)
                             if link not in out_links and link != url:
                                 out_links.append(link)
                             # if r.get(link) == None and count<=1000:
                             if link not in seen and count <= 1000:
                                 # url_queue.put(link)
                                 tempurl.append([link, score])
                                 seen.append(link)
                 print("outlink passed")
                 es.create("ap_dataset",
                           "document",
                           count,
                           body={
                               "url": url,
                               "inlinks": url,
                               "outlinks": out_links,
                               "text": data,
                               "raw": rawHtml,
                               "headers": (headers)
                           })
                 print(url + " crawled " + str(count))
                 f.writelines(url + " crawled " + str(count) + '\n')
                 if url_queue.empty():
                     tempurl = self.sortUrl(tempurl)
                     for x in tempurl:
                         url_queue.put(x[0])
                     tempurl = []
     f.close()
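self.sortUrl is not shown in this snippet; the refill step above only needs the collected [link, score] pairs ordered so that the highest-scoring links are crawled first. A minimal sketch of that ordering under that assumption (the helper name here is illustrative):

def sort_urls_by_score(tempurl):
    # tempurl holds [link, score] pairs gathered since the last refill;
    # return them highest score first so the most promising links are re-queued first.
    return sorted(tempurl, key=lambda pair: pair[1], reverse=True)

# The queue refill above would then be:
#   for link, _ in sort_urls_by_score(tempurl):
#       url_queue.put(link)
print(sort_urls_by_score([["a", 1], ["b", 3], ["c", 2]]))  # [['b', 3], ['c', 2], ['a', 1]]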
Example #32
0
class ElasticCorpus(BaseCorpus):
    """
        ElasticSearch connection corpus
    """
    def __init__(self):
        """
            Basic creator
        """
        super(self.__class__, self).__init__()
        self.es=None

        self.query_filter=""

        self.ALL_FILES=[]
        self.TEST_FILES=[]
        self.FILES_TO_IGNORE=[]
        self.metadata_index=None
        self.paths.fullLuceneIndex="index_"
        self.max_results=sys.maxint

    def connectCorpus(self, base_directory, endpoint={"host":"localhost", "port":9200}, initializing_corpus=False,suppress_error=False):
        """
            If DB has been created, connect to it. If not, initialize it first.

            Args:
                base_directory: root dir of this corpus
                initializing_corpus: if True, create DB and directories
                suppress_error: if true, db doesn't complain if it's connected already
        """
        self.endpoint=endpoint
        self.setPaths(ensureTrailingBackslash(base_directory))

        if initializing_corpus:
            self.createAndInitializeDatabase()
        self.connectToDB(suppress_error)

    def createAndInitializeDatabase(self):
        """
            Ensures that the directory structure is in place and creates
            the Elasticsearch indices and their mappings
        """

        def createTable(name, settings, properties):
            """
            """
            if not self.es.indices.exists(index=index_equivalence[name]["index"]):
                self.es.indices.create(
                    index=index_equivalence[name]["index"],
                    body={"settings":settings,"mappings":{index_equivalence[name]["type"]:{"properties":properties}}})

        settings={
            "number_of_shards" : 2,
            "number_of_replicas" : 0
        }
        properties={
            "guid": {"type":"string", "index":"not_analyzed"},
            "metadata": {"type":"nested"},
            "norm_title": {"type":"string", "index":"not_analyzed"},
            "author_ids":{"type":"string", "index":"not_analyzed", "store":True},
            "num_in_collection_references": {"type":"integer"},
            "num_resolvable_citations": {"type":"integer"},
            "num_inlinks": {"type":"integer"},
            "collection_id": {"type":"string", "index":"not_analyzed", "store":True},
            "import_id": {"type":"string", "index":"not_analyzed", "store":True},
            "time_created": {"type":"date"},
            "time_modified": {"type":"date"},
            "has_scidoc": {"type":"boolean","index":"not_analyzed", "store":True},
            "flags": {"type":"string","index":"not_analyzed", "store":True},
            # This is all now accessed through the nested metadata
##            "filename": {"type":"string", "index":"not_analyzed", "store":True},
##            "corpus_id": {"type":"string", "index":"not_analyzed"},
##            "title": {"type":"string", "store":True},##            "surnames": {"type":"string"},
##            "year": {"type":"integer"},
##            "in_collection_references": {"type":"string", "index":"not_analyzed", "store":True},
##            "inlinks": {"type":"string", "index":"not_analyzed", "store":True},
            }
        createTable("papers", settings, properties)


        properties={
            "scidoc": {"type":"string", "index": "no", "store":True},
            "guid": {"type":"string", "index": "not_analyzed", "store":True},
            "time_created": {"type":"date"},
            "time_modified": {"type":"date"},
            }
        createTable("scidocs", settings, properties)

        settings={
            "number_of_shards" : 1,
            "number_of_replicas" : 1
        }
        properties={
            "data": {"type":"string", "index": "no", "store":True},
            "time_created": {"type":"date"},
            "time_modified": {"type":"date"},
            }
        createTable("cache", settings, properties)

        properties={
            "link":{"type":"nested"},
##                "guid_from": {"type":"string", "index":"not_analyzed", "store":True},
##                "guid_to": {"type":"string", "index":"not_analyzed", "store":True},
##                "authors_from": {"type":"string", "index":"not_analyzed", "store":True},
##                "authors_to": {"type":"string", "index":"not_analyzed", "store":True},
##                "self_citation": {"type":"boolean", "index":"not_analyzed", "store":True},
##                "year_from": {"type":"integer", "index":"not_analyzed", "store":True},
##                "year_to": {"type":"integer", "index":"not_analyzed", "store":True},
##                "numcitations": {"type":"integer", "index":"not_analyzed", "store":True},
            "time_created": {"type":"date"}}

        createTable("links", settings, properties)

        properties={
##                "author_id": {"type":"string", "index":"not_analyzed", "store":True},
            "author": {"type":"nested"},
##                "given": {"type":"string", "index":"analyzed", "store":True},
##                "middle": {"type":"string", "index":"analyzed", "store":True},
##                "family": {"type":"string", "index":"analyzed", "store":True},
##                "papers": {"type":"string", "index":"not_analyzed", "store":True},
##                "papers_first_author": {"type":"string", "index":"not_analyzed", "store":True},
##                "papers_last_author": {"type":"string", "index":"not_analyzed", "store":True},
##                "affiliations": {"type":"nested", "index":"not_analyzed", "store":True},
            "time_created": {"type":"date"}
            }
        createTable("authors", settings, properties)

        properties={
            "venue": {"type":"nested"},
            "time_created": {"type":"date"},
            "norm_title": {"type":"string", "index":"not_analyzed"},
            }
        createTable("venues", settings, properties)

        properties={
            "missing": {"type":"nested"},
            "time_created": {"type":"date"},
            "norm_title": {"type":"string", "index":"not_analyzed"},
            }
        createTable("missing_references", settings, properties)


    def connectedToDB(self):
        """
            returns True if connected to DB, False otherwise
        """
        return self.es is not None

    def getRetrievalIndexPath(self, guid, index_filename, full_corpus=False):
        """
            Returns the path to the Lucene index for a test file in
            the corpus

            if full_corpus is True, this is the general index for that method
                else
            when using Citation Resolution (resolving only from the
            references at the bottom of a paper) it is the specific index for
            that file guid
        """
        if full_corpus:
            return "idx_"+index_filename
##            return index_filename
        else:
            guid=guid.lower()
            return "idx_"+guid+"_"+index_filename

    def getRecord(self, rec_id, table="papers", source=None):
        """
            Abstracts over getting data from a row in the db. Returns all the
            fields of the record for one type of table, or those specified in source.

            :param rec_id: id of the record
            :param table: table alias, e.g. ["papers", "scidocs"]
            :param source: fields to return
        """
        self.checkConnectedToDB()

        if table not in index_equivalence:
            raise ValueError("Unknown record type")

        try:
            res=self.es.get(
                index=index_equivalence[table]["index"],
                doc_type=index_equivalence[table]["type"],
                id=rec_id,
                _source=source
                )
        except:
            raise ValueError("Not found: %s in index %s" % (rec_id,index_equivalence[table]["index"]))

        if not res:
            raise IndexError("Can't find record with id %s" % rec_id)
        return res["_source"]

    def setRecord(self, rec_id, body, table="papers", op_type="update"):
        """
            Abstracts over setting data for a row in the db.

            :param rec_id: id of the record
            :param table: table alias, e.g. ["papers", "scidocs"]
            :param body: data to set
        """
        self.checkConnectedToDB()

        if table not in index_equivalence:
            raise ValueError("Unknown record type")

##~        try:
        if op_type == "update":
            body={"doc":body}
            self.es.update(
                            index=index_equivalence[table]["index"],
                            doc_type=index_equivalence[table]["type"],
                            id=rec_id,
                            body=body
                            )
        elif op_type in ["index", "create"]:
            self.es.index(
                index=index_equivalence[table]["index"],
                doc_type=index_equivalence[table]["type"],
                op_type=op_type,
                id=rec_id,
                body=body
                )
        else:
            raise ValueError("Unkown op_type %s" % op_type)
##        except:
##            raise ValueError("Error writing record: %s in index %s : %s" % (rec_id,index_equivalence[table]["index"], str(sys.exc_info[:2])))

        return

    def getRecordField(self, rec_id, table="papers"):
        """
            Abstracts over getting data from a row in the db. Returns one field
            for one type of table.

            All other "getter" functions like getMetadataByGUID and loadSciDoc
            are aliases for this function
        """
        return self.getRecord(rec_id, table,source=index_equivalence[table]["source"])[index_equivalence[table]["source"]]

    def recordExists(self, rec_id, table="papers"):
        """
            Returns True if the specified record exists in the given table, False
            otherwise.
        """
        self.checkConnectedToDB()

        return self.es.exists(
            id=rec_id,
            index=index_equivalence[table]["index"],
            doc_type=index_equivalence[table]["type"],
            )

    def SQLQuery(self, query):
        """
            Runs a SQL Query, returning a dict per result with the fields required.

            :param query: SQL query
            :type query: string
        """
        uri="http://%s:%s/_sql/_explain?sql=%s" % (self.endpoint["host"],self.endpoint["port"],query)
        response = requests.get(uri)
        dsl_query = json.loads(response.text)

        if "error" in dsl_query:
            raise ConnectionError("Error in query: " + str(dsl_query["error"]["root_cause"]))

        dsl_query["body"]={"query":dsl_query.pop("query")}
        dsl_query["from_"]=dsl_query.pop("from")
        dsl_query["_source_include"]=dsl_query["_source"]["includes"]
        dsl_query["_source_exclude"]=dsl_query["_source"]["excludes"]
        dsl_query.pop("_source")

        match=re.search(r"select.+?from[\s\"\']+([\w,]+)", query, flags=re.IGNORECASE)
        if match:
            table_name=match.group(1)
        else:
            table_name="papers"

        dsl_query["index"]=index_equivalence[table_name]["index"]
        dsl_query["doc_type"]=index_equivalence[table_name]["type"]

        tmp_max=self.max_results
##        self.max_results=dsl_query["size"]
        if "size" in dsl_query:
            del dsl_query["size"]
        results=self.unlimitedQuery(**dsl_query)
        self.max_results=tmp_max

        results=[r["_source"] for r in results]
        if len(dsl_query["_source_include"]) == 1:
            results=[r[dsl_query["_source_include"][0]] for r in results]

        return results

    def cachedJsonExists(self, type, guid, params=None):
        """
            True if the cached JSON associated with the given parameters exists
        """
        self.checkConnectedToDB()

        return self.es.exists(
            index=ES_INDEX_CACHE,
            doc_type=ES_TYPE_CACHE,
            id=self.cachedDataIDString(type, guid, params)
            )

    def saveCachedJson(self, path, data):
        """
            Save anything as JSON

            :param path: unique ID of resource to load
            :param data: json-formatted string or any data
        """
        self.checkConnectedToDB()

        timestamp=datetime.datetime.now()
        self.es.index(
            index=ES_INDEX_CACHE,
            doc_type=ES_TYPE_CACHE,
            id=path,
            op_type="index",
            body={
                "data": json.dumps(data),
                "time_created": timestamp,
                "time_modified": timestamp,
                }
            )

    def loadCachedJson(self,path):
        """
            Load precomputed JSON

            :param path: unique ID of resource to load
        """
        return json.loads(self.getRecordField(path,"cache"))

    def loadSciDoc(self,guid, ignore_errors=None):
        """
            If a SciDocJSON file exists for guid, it returns it, otherwise None
        """
        data=json.loads(self.getRecordField(guid,"scidocs"))
        return SciDoc(data, ignore_errors=ignore_errors)

    def saveSciDoc(self,doc):
        """
            Saves the document as JSON in the index
        """
        self.checkConnectedToDB()

        attempts=0
        while attempts < 3:
            try:
                timestamp=datetime.datetime.now()
                self.es.index(
                    index=ES_INDEX_SCIDOCS,
                    doc_type=ES_TYPE_SCIDOC,
                    id=doc["metadata"]["guid"],
                    op_type="index",
                    body={
                        "scidoc": json.dumps(doc.data),
                        "guid":doc["metadata"]["guid"],
                        "time_created": timestamp,
                        "time_modified": timestamp,
                        }
                    )
                break
            except ConnectionTimeout:
                attempts+=1

    def connectToDB(self, suppress_error=False):
        """
            Connects to database
        """
        self.es = Elasticsearch([self.endpoint], timeout=60)
        self.es.retry_on_timeout=True

    def getMetadataByGUID(self,guid):
        """
            Returns a paper's metadata by GUID
        """
        return self.getRecordField(guid, "papers")

    def getMetadataByField(self,field,value):
        """
            Returns a paper's metadata by any other field
        """
        self.checkConnectedToDB()

        query=self.filterQuery("%s:\"%s\"" % (field,value))

        res=self.es.search(
            index=ES_INDEX_PAPERS,
            doc_type=ES_TYPE_PAPER,
            _source="metadata",
            size=1,
            q=query)

        hits=res["hits"]["hits"]
        if len(hits) == 0:
            return None

        return hits[0]["_source"]["metadata"]

    def getStatistics(self, guid):
        """
            Easy method to get a paper's statistics
        """
        return self.getRecord(guid, "papers", "statistics")["statistics"]

    def setStatistics(self, guid, stats):
        """
            Easy method to set a paper's statistics
        """
        return self.setRecord(guid, {"statistics":stats}, "papers", op_type="update")

    def filterQuery(self, query, table="papers"):
        """
            Adds a global filter to the query so it only matches the selected
            collection, date, etc.

            :param query: string
        """
        if table !="papers":
            raise NotImplementedError

        if self.query_filter != "":
            return self.query_filter+" ("+query+")"
        else:
            return query


    def listFieldByField(self,field1,field2,value,table="papers",max_results=100):
        """
            Returns a list: for each paper, field1 if field2==value
        """
        self.checkConnectedToDB()

        if table not in index_equivalence:
            raise ValueError("Unknown record type")

        query=self.filterQuery("%s:\"%s\"" % (field2,value))

        hits=self.unlimitedQuery(
                q=query,
                index=index_equivalence[table]["index"],
                doc_type=index_equivalence[table]["type"],
                _source=field1,

        )

        return [r["_source"][field1] for r in hits]


    def isNestedQuery(self, query_string):
        """
            Returns True if a nested field is found in the query string, e.g.
            author.name

            :param query_string: query string
            :returns: boolean
        """
        query_without_quotes=re.sub(r"[^\\]\".*?[^\\]\"","",query_string)
        nested_query=re.search(r"[a-zA-Z]\.[a-zA-Z]",query_without_quotes) is not None
        return nested_query

    def abstractNestedResults(self, query_string, hits, field=None):
        """
            Extracts the requested field from a list of Elasticsearch hits, handling nested metadata fields
        """
        if field and self.isNestedQuery(field):
            if field:
                return [r["_source"]["metadata"][field] for r in hits]
            else:
                return [r["_source"] for r in hits]
        else:
            if field:
                if field.startswith("_"):
                    return [r[field] for r in hits]
                else:
                    return [r["_source"][field] for r in hits]
            else:
                return [r["_source"] for r in hits]

    def listRecords(self, conditions=None, field="guid", max_results=sys.maxint, table="papers"):
        """
            This is the equivalent of a SELECT clause
        """
        self.checkConnectedToDB()

        es_index=index_equivalence[table]["index"]
        es_type=index_equivalence[table]["type"]

        if conditions:
            query=self.filterQuery(conditions)
        else:
##            query=self.filterQuery(field+":*")
            query=self.filterQuery("*:*")

        prev_max_results=self.max_results
        self.max_results=max_results

        hits=self.unlimitedQuery(
                q=query,
                index=es_index,
                doc_type=es_type,
                _source=field,
        )
        self.max_results=prev_max_results

        return self.abstractNestedResults(query, hits, field)

    def listPapers(self,conditions=None,field="guid", max_results=sys.maxint):
        """
            Return a list of GUIDs in papers table where [conditions]
        """
        return self.listRecords(conditions, field, max_results, "papers")

    def runSingleValueQuery(self,query):
        raise NotImplementedError

    def addAuthor(self, author):
        """
            Make sure author is in database
        """
        self.checkConnectedToDB()

        author["author_id"]=self.generateAuthorID
        self.updateAuthor(author,"create")

    def mergeAuthorDetails(self, author_record, new_author_data):
        """
        """
        def findAffiliation(aff_list, new_aff):
            """
            """
            if new_aff.get("name","") in ["",None]:
                return None

            for aff in aff_list:
                if aff.get("name","")==new_aff["name"]:
                    return aff

        def mergeList(new_list, record_list):
            """
                Adds the missing papers from the new_list to the record_list
            """
            for paper in new_list:
                if paper not in record_list:
                    record_list.append(paper)

        #TODO Fuzzywuzzy this!
        # iterate over the new author's affiliations, not over the dict's keys
        for aff in new_author_data.get("affiliation",[]):
            match=findAffiliation(author_record["affiliation"],aff)
            if match:
                mergeList(aff.get("papers",[]), match["papers"])
            else:
                author_record["affiliation"].append(aff)

        mergeList(new_author_data["papers"], author_record["papers"])
        mergeList(new_author_data["papers_first_author"], author_record["papers_first_author"])
        mergeList(new_author_data["papers_last_author"], author_record["papers_last_author"])

    def updateAuthorsFromPaper(self, metadata):
        """
            Make sure authors are in database

            :param metadata: a paper's metadata, with an "authors" key
        """
        self.checkConnectedToDB()

        for index, new_author in enumerate(metadata["authors"]):
            creating_new_record=False
            author_record=self.matcher.matchAuthor(new_author)
            if not author_record:
                creating_new_record=True
                author_record=copy.deepcopy(new_author)
                author_record["author_id"]=self.generateAuthorID()
                author_record["papers"]=[]
                author_record["papers_first_author"]=[]
                author_record["papers_last_author"]=[]
                author_record["num_papers"]=0

            if metadata["guid"] not in author_record["papers"]:
                author_record["papers"].append(metadata["guid"])
                if index==0:
                    author_record["papers_first_author"].append(metadata["guid"])
                if index==len(metadata["authors"])-1:
                    author_record["papers_last_author"].append(metadata["guid"])
            author_record["num_papers"]=len(author_record["papers"])

            if not creating_new_record:
                self.mergeAuthorDetails(author_record, new_author)

            self.updateAuthor(author_record, op_type="create" if creating_new_record else "index")

    def updateVenuesFromPaper(self, metadata):
        """
            Progressive update of venues
        """
        raise NotImplementedError
##        res=self.es.search(
##            index=ES_INDEX_VENUES,
##            doc_type=ES_TYPE_VENUE,
##            _source=field,
##            q="guid:*")
##
##        return [r["_source"] for r in hits]

    def updateAuthor(self, author, op_type="index"):
        """
            Updates an existing author in the db

            :param author: author data
            :param op_type: one of ["index", "create"]
        """
        self.checkConnectedToDB()

        timestamp=datetime.datetime.now()

        body={
            "author":author,
        }

        author["time_updated"]=timestamp

        if op_type=="create":
            body["time_created"]=timestamp

        self.es.index(
            index=ES_INDEX_AUTHORS,
            doc_type=ES_TYPE_AUTHOR,
            op_type=op_type,
            id=author["author_id"],
            body=body
            )

    def addPaper(self, metadata, check_existing=True, has_scidoc=True):
        """
            Add paper metadata to database
        """
        op_type="create" if check_existing else "index"
        self.updatePaper(metadata, op_type, has_scidoc)
        if self.AUTO_ADD_AUTHORS:
            self.updateAuthorsFromPaper(metadata)

    def updatePaper(self, metadata, op_type="index", has_scidoc=None):
        """
            Updates an existing record in the db

            :param metadata: metadata of paper
            :param op_type: one of ["index", "create"]
            :param has_scidoc: True if SciDoc for this paper exists in scidocs \
                index, False otherwise
        """
        self.checkConnectedToDB()

        timestamp=datetime.datetime.now()
        body={"guid": metadata["guid"],
                "metadata": metadata,
                "norm_title": metadata["norm_title"],
                "num_in_collection_references": metadata.get("num_in_collection_references",0),
                "num_resolvable_citations": metadata.get("num_resolvable_citations",0),
                "num_inlinks": len(metadata.get("inlinks",[])),
                "time_modified": timestamp,
##                "corpus_id": metadata["corpus_id"],
##                "filename": metadata["filename"],
##                 "collection_id": metadata["collection_id"],
##                "import_id": metadata["import_id"],
##                "title": metadata["title"],
##                "surnames": metadata["surnames"],
##                "year": metadata["year"],
                  }

        if has_scidoc is not None:
            body["has_scidoc"]=has_scidoc

        if op_type=="create":
            body["time_created"]=timestamp

        if op_type=="update":
            body={"doc":body}
            try:
                self.es.update(
                    index=ES_INDEX_PAPERS,
                    doc_type=ES_TYPE_PAPER,
                    id=metadata["guid"],
                    body=body
                    )
            except TransportError as e:
                self.es.indices.refresh(index=ES_INDEX_PAPERS)
                self.es.update(
                    index=ES_INDEX_PAPERS,
                    doc_type=ES_TYPE_PAPER,
                    id=metadata["guid"],
                    body=body
                    )

        else:
            self.es.index(
                index=ES_INDEX_PAPERS,
                doc_type=ES_TYPE_PAPER,
                op_type=op_type,
                id=metadata["guid"],
                body=body
                )

    def addLink(self,GUID_from,GUID_to,authors_from,authors_to,year_from,year_to,numcitations):
        """
            Add a link in the citation graph.
        """
        self.checkConnectedToDB()

        self.es.create(
            index=ES_INDEX_LINKS,
            doc_type=ES_TYPE_LINK,
            body={
                "guid_from": GUID_from,
                "guid_to": GUID_to,
                "authors_from": authors_from,
                "authors_to": authors_from,
                "year_from": year_from,
                "year_to": year_to,
                "numcitations": numcitations,
                "time_created": datetime.datetime.now(),
            })

    def addMissingPaper(self, metadata):
        """
            Inserts known data about a paper with no SciDoc
        """
##        self.addPaper(metadata,check_existing=True,has_scidoc=False)
        raise NotImplementedError

    def createDBindeces(self):
        """
            Call this after importing the metadata into the corpus and before
            matching in-collection references, it should speed up search
        """
        self.checkConnectedToDB()

        for index in ES_ALL_INDECES:
            if self.es.indices.exists(index=index):
                self.es.optimize(index=index)
        pass

    def deleteAll(self, record_type):
        """
            WARNING! This function deletes all the records in a given "table" or
            of a given type.

            :param record_type: one of ["papers","links","authors","scidocs","cache"]

        """
        self.checkConnectedToDB()

        if record_type not in index_equivalence:
            raise ValueError("Unknown record type")

        es_table=index_equivalence[record_type]["index"]

        if self.es.indices.exists(index=es_table):
            print("Deleting ALL files in %s" % es_table)
            # ignore 404 and 400
            self.deleteIndex(es_table)
            self.createAndInitializeDatabase()

    def deleteIndex(self, pattern):
        """
            Deletes all indexes matching the pattern.

            Warning! Use only if you know exactly what you are doing!
        """
        self.es.indices.delete(index=pattern, ignore=[400, 404])

    def deleteByQuery(self, record_type, query):
        """
            Delete the entries from a table that match the query.

            :param record_type: one of the tables that exist, e.g. ["papers","links","authors","scidocs","cached"]
            :type record_type: string
            :param query: a query to select documents to delete
            :type query: string
        """
        self.checkConnectedToDB()

        if not self.es.indices.exists(index=index_equivalence[record_type]["index"]):
            return

        es_table=index_equivalence[record_type]["index"]
        es_type=index_equivalence[record_type]["type"]

        to_delete=self.unlimitedQuery(
            index=es_table,
            doc_type=es_type,
            q=query)

        self.bulkDelete([item["_id"] for item in to_delete])

    def bulkDelete(self, id_list, table="papers"):
        """
            Deletes all entries in id_list from the given table that match on id.


        """
        self.checkConnectedToDB()

        if not self.es.indices.exists(index=index_equivalence[table]["index"]):
            return

        es_table=index_equivalence[table]["index"]
        es_type=index_equivalence[table]["type"]

        bulk_commands=[]
        for item in id_list:
            bulk_commands.append( "{ \"delete\" : {  \"_id\" : \"%s\" } }" % item )

        if len(bulk_commands) > 0:
            self.es.bulk(
                body="\n".join(bulk_commands),
                index=es_table,
                doc_type=es_type,
            )

    def unlimitedQuery(self, *args, **kwargs):
        """
            Wraps elasticsearch querying to enable auto scroll for retrieving
            large amounts of results

            It does more or less what elasticsearch.helpers.scan does, only this
            one actually works.
        """
        scroll_time="20m"

        size=min(self.max_results,10000)

        res=self.es.search(
            *args,
            size=size,
            search_type="scan",
            scroll=scroll_time,
            **kwargs
            )

        results = res['hits']['hits']
        scroll_size = res['hits']['total']
        while (scroll_size > 0) and len(results) < self.max_results:
            try:
                scroll_id = res['_scroll_id']
                rs = self.es.scroll(scroll_id=scroll_id, scroll=scroll_time)
                res=rs
                results.extend(rs['hits']['hits'])
                scroll_size = len(rs['hits']['hits'])
            except:
                break

        return results[:self.max_results]

    def setCorpusFilter(self, collection_id=None, import_id=None, date=None):
        """
            Sets the filter query to limit all queries to a collection (corpus)
            or an import date

            :param collection_id: identifier of corpus, e.g. "ACL" or "PMC". This is set at import time.
            :type collection_id: basestring
            :param import_id: identifier of import, e.g. "initial"
            :type import_id: basestring
            :param date: comparison with a date, e.g. ">[date]", "<[date]"
            :type date: basestring
        """
        query_items=[]
        if collection_id:
            query_items.append("metadata.collection_id:\"%s\"" % collection_id)
        if import_id:
            query_items.append("metadata.import_id:\"%s\"" % import_id)
        if date:
            query_items.append("time_created:%s" % date)

        # Only add the trailing AND when there is something to filter on.
        self.query_filter=(" AND ".join(query_items)+" AND ") if query_items else ""
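A minimal usage sketch of ElasticCorpus, assuming a local Elasticsearch node; the corpus directory, collection id, and result limit are illustrative:

corpus = ElasticCorpus()
corpus.connectCorpus("/tmp/corpus",
                     endpoint={"host": "localhost", "port": 9200},
                     initializing_corpus=True)

# Restrict subsequent queries to one collection, then list a few paper GUIDs.
corpus.setCorpusFilter(collection_id="ACL")
guids = corpus.listPapers(max_results=10)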
Example #33
0
class ElasticSearchClient:
	"""
	Class used as a client to the Elasticsearch server.
	"""
	def __init__(self, host, port, username, password, indexname):
		"""
		Initializes this Elasticsearch Client.
		
		:param host: the HTTP address of the Elasticsearch server.
		:param port: the HTTP port of the Elasticsearch server.
		:param username: the username for connecting to the index.
		:param password: the password for connecting to the index.
		:param indexname: the name of the Elasticsearch index.
		"""
		self.indexname = indexname
		self.client = Elasticsearch(connection_class = SafeRequestsHttpConnection, host = host, port = int(port), http_auth = [username, password])
		self.snapshotclient = SnapshotClient(self.client)
		self.indicesclient = IndicesClient(self.client)

	def delete_index_and_mappings(self):
		"""
		Deletes the index and all its mappings.
		"""
		try:
			self.client.indices.delete(index = self.indexname)
		except NotFoundError:
			pass

	def create_index_and_mappings(self, update_mappings = False):
		"""
		Creates or updates the index and its mappings.
		
		:param update_mappings: boolean denoting whether the mappings should be created (False) or updated (True).
		"""
		if not self.client.indices.exists(self.indexname):
			self.client.indices.create(index = self.indexname, body = load_file_to_json("properties/indexsettings.json"))
		mappings = {}
		if self.indexname in self.client.indices.get_mapping(self.indexname):
			mappings = self.client.indices.get_mapping(self.indexname)[self.indexname]['mappings']
		if update_mappings:
			self.client.indices.close(self.indexname)
		if 'files' not in mappings or update_mappings:
			self.client.indices.put_mapping(index = self.indexname, doc_type = 'files',
				body = load_file_to_json("properties/filesproperties.json"))
		if 'projects' not in mappings or update_mappings:
			self.client.indices.put_mapping(index = self.indexname, doc_type = 'projects',
				body = load_file_to_json("properties/projectsproperties.json"))
		if update_mappings:
			self.client.indices.open(self.indexname)

	def has_project(self, project_id):
		"""
		Checks if the index contains a project.
		
		:param project_id: the id of the project to check if it is contained in the index.
		:returns: True if the index contains the project, or False otherwise.
		"""
		return self.client.exists(index = self.indexname, doc_type = 'projects', id = project_id)

	def has_file(self, file_id):
		"""
		Checks if the index contains a file.
		
		:param file_id: the id of the file to check if it is contained in the index.
		:returns: True if the index contains the file, or False otherwise.
		"""
		return self.client.exists(index = self.indexname, doc_type = 'files', id = file_id)

	def create_project(self, project):
		"""
		Creates a project in the index.
		
		:param project: the data of the project in JSON format.
		"""
		self.client.create(index = self.indexname, doc_type = 'projects', id = project['fullname'], body = project)

	def create_file(self, afile):
		"""
		Creates a file in the index.
		
		:param afile: the data of the file in JSON format.
		"""
		self.client.create(index = self.indexname, doc_type = 'files', id = afile['fullpathname'], parent = afile['project'], body = afile)

	def update_file(self, afile):
		"""
		Updates a file in the index.
		
		:param afile: the data of the file in JSON format.
		"""
		self.client.update(index = self.indexname, doc_type = 'files', id = afile['fullpathname'], parent = afile['project'], body = {'doc': afile})

	def delete_file(self, afile_id):
		"""
		Deletes a file from the index.
		
		:param afile_id: the id of the file to be deleted.
		"""
		self.client.delete(index = self.indexname, doc_type = 'files', id = afile_id, routing = '/'.join(afile_id.split('/')[0:2]))

	def delete_project(self, project_id):
		"""
		Deletes a project from the index. Note that this function also deletes all the files of the project.
		
		:param project_id: the id of the project to be deleted.
		"""
		self.client.delete_by_query(index = self.indexname, doc_type = 'files', body = {"query": { "bool": { "must": { "match_all": {} }, "filter": { "term": { "_routing": project_id } } } } })
		self.client.delete(index = self.indexname, doc_type = 'projects', id = project_id)

	def get_project_fileids_and_shas(self, project_id):
		"""
		Returns all the files and their corresponding shas for a project.
		
		:param project_id: the id of the project of which the files and the shas are returned.
		:returns: a dict containing the files of the project as keys and their shas as values.
		"""
		sourcefiles = self.client.search(index = self.indexname, doc_type = 'files',
			body = {"query": { "term" : { "_routing": project_id } } }, routing = project_id, size = 100000000)['hits']['hits']  # Limitation! Each project must have no more than 100000000 files
		fileidsandshas = {}
		for afile in sourcefiles:
			fileidsandshas[afile['_id']] = afile['_source']['sha']
		return fileidsandshas

	def execute_query(self, query, doc_type = 'files'):
		"""
		Executes a query on the index.
		
		:param query: the body of the query.
		:param doc_type: the document type to which the query is executed, either 'projects' or 'files'.
		:returns: the response of the query.
		"""
		return self.client.search(index = self.indexname, doc_type = doc_type, body = query)

	def test_analyzer(self, analyzer, text):
		"""
		Tests an analyzer of the index.
		
		:param analyzer: the analyzer to be tested.
		:param text: the text to be analyzed as a test.
		:returns: the analyzed text.
		"""
		result = self.indicesclient.analyze(index = self.indexname, analyzer = analyzer, body = text)
		return [r['token'] for r in result['tokens']]

	def backup(self, backupdir):
		"""
		Backs up the index.
		
		:param backupdir: the directory used to backup the index.
		"""
		repositoryname = os.path.basename("backup" + self.indexname)
		try:
			self.snapshotclient.get_repository(repository = repositoryname)
		except:
			self.snapshotclient.create_repository(repository = repositoryname, body = {"type": "fs", "settings": {"location": backupdir + os.sep + self.indexname}})
		try:
			self.snapshotclient.get(repository = repositoryname, snapshot = self.indexname + "snapshot")
		except:
			self.snapshotclient.create(repository = repositoryname, snapshot = self.indexname + "snapshot", body = {"indices": self.indexname}, wait_for_completion = True)

	def delete_backup(self):
		"""
		Removes any backups of the index. If there are no backups, this function does nothing.
		"""
		repositoryname = os.path.basename("backup" + self.indexname)
		try:
			self.snapshotclient.delete(repository = repositoryname, snapshot = self.indexname + "snapshot")
		except:
			pass

	def restore_backup(self):
		"""
		Restores a backup of the index.
		"""
		repositoryname = os.path.basename("backup" + self.indexname)
		if not self.client.indices.exists(self.indexname):
			self.client.indices.create(index = self.indexname, body = load_file_to_json("properties/indexsettings.json"))
		self.client.indices.close(self.indexname)
		self.snapshotclient.restore(repository = repositoryname, snapshot = self.indexname + "snapshot", body = {"indices": self.indexname}, wait_for_completion = True)
		self.client.indices.open(self.indexname)

	def flush(self):
		"""
		Flushes the index.
		"""
		self.indicesclient.flush(index = self.indexname)
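A usage sketch for the client above, assuming a reachable Elasticsearch server and the properties/*.json mapping files referenced in create_index_and_mappings; the credentials, index name, and any project fields other than 'fullname' are illustrative:

client = ElasticSearchClient("localhost", 9200, "elastic", "changeme", "sourcecode")
client.create_index_and_mappings()

project = {"fullname": "example/project", "description": "demo entry"}
if not client.has_project(project["fullname"]):
	client.create_project(project)

# Run a full-text query against the 'projects' type.
hits = client.execute_query({"query": {"match": {"description": "demo"}}}, doc_type = 'projects')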
Example #34
0
class LearnerAPITestMixin(CsvViewMixin):
    """Manages an elasticsearch index for testing the learner API."""
    filename_slug = 'learners'
    def setUp(self):
        """Creates the index and defines a mapping."""
        super(LearnerAPITestMixin, self).setUp()
        self._es = Elasticsearch([settings.ELASTICSEARCH_LEARNERS_HOST])
        management.call_command('create_elasticsearch_learners_indices')
        self.addCleanup(lambda: management.call_command(
            'delete_elasticsearch_learners_indices'))

    def _create_learner(
        self,
        username,
        course_id,
        name=None,
        email=None,
        enrollment_mode='honor',
        segments=None,
        cohort='Team edX',
        discussion_contributions=0,
        problems_attempted=0,
        problems_completed=0,
        problem_attempts_per_completed=None,
        attempt_ratio_order=0,
        videos_viewed=0,
        enrollment_date='2015-01-28',
        user_id=None,
        language=None,
        location=None,
        year_of_birth=None,
        level_of_education=None,
        gender=None,
        mailing_address=None,
        city=None,
        country=None,
        goals=None,
    ):
        """Create a single learner roster entry in the elasticsearch index."""
        body = {
            'username': username,
            'course_id': course_id,
            'name': name if name is not None else username,
            'email': email if email is not None else '{}@example.com'.format(username),
            'enrollment_mode': enrollment_mode,
            'discussion_contributions': discussion_contributions,
            'problems_attempted': problems_attempted,
            'problems_completed': problems_completed,
            'attempt_ratio_order': attempt_ratio_order,
            'videos_viewed': videos_viewed,
            'enrollment_date': enrollment_date,
            'user_id': user_id,
            'language': language,
            'location': location,
            'year_of_birth': year_of_birth,
            'level_of_education': level_of_education,
            'gender': gender,
            'mailing_address': mailing_address,
            'city': city,
            'country': country,
            'goals': goals,
        }

        # Keep null optional fields out of the index.  Otherwise, they would be stored
        # with an explicit null value, and we want to test the case when they're not returned.
        optional_fields = [('segments', segments), ('cohort', cohort),
                           ('problem_attempts_per_completed',
                            problem_attempts_per_completed)]
        for optional_field in optional_fields:
            if optional_field[1]:
                body[optional_field[0]] = optional_field[1]

        self._es.create(index=settings.ELASTICSEARCH_LEARNERS_INDEX,
                        doc_type='roster_entry',
                        body=body)

    def create_learners(self, learners):
        """
        Creates multiple learner roster entries.  `learners` is a list of
        dicts, each representing a learner which must at least contain
        the keys 'username' and 'course_id'.  Other learner fields can
        be provided as additional keys in the dict - see the mapping
        defined in `setUp`.
        """
        for learner in learners:
            self._create_learner(**learner)
        self._es.indices.refresh(index=settings.ELASTICSEARCH_LEARNERS_INDEX)

    def create_update_index(self, date=None):
        """
        Creates an index recording the date when the learner index was updated.
        """
        self._es.create(index=settings.ELASTICSEARCH_LEARNERS_UPDATE_INDEX,
                        doc_type='marker',
                        body={
                            'date': date,
                            'target_index':
                            settings.ELASTICSEARCH_LEARNERS_INDEX,
                        })
        self._es.indices.refresh(
            index=settings.ELASTICSEARCH_LEARNERS_UPDATE_INDEX)

    def expected_page_url(self, course_id, page, page_size):
        """
        Returns a paginated URL for the given parameters.
        As with PageNumberPagination, if page=1, it's omitted from the query string.
        """
        if page is None:
            return None
        course_q = urlencode({'course_id': course_id})
        page_q = '&page={}'.format(page) if page and page > 1 else ''
        page_size_q = '&page_size={}'.format(
            page_size) if page_size > 0 else ''
        return 'http://testserver/api/v0/learners/?{course_q}{page_q}{page_size_q}'.format(
            course_q=course_q,
            page_q=page_q,
            page_size_q=page_size_q,
        )
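For reference, the only transformation expected_page_url applies is urlencode on the course id; a quick standalone check of what that produces, assuming Python 3's urllib.parse (the test suite's module-level imports are not shown here):

from urllib.parse import urlencode

print(urlencode({'course_id': 'edX/DemoX/Demo_Course'}))
# course_id=edX%2FDemoX%2FDemo_Course

# With page_size=25, the formatting above therefore yields for page 1
#   http://testserver/api/v0/learners/?course_id=edX%2FDemoX%2FDemo_Course&page_size=25
# and for page 2
#   http://testserver/api/v0/learners/?course_id=edX%2FDemoX%2FDemo_Course&page=2&page_size=25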
Example #35
0
class Connector:
    def __init__(self,
                 prEndpoint=None,
                 esEndpoint=None,
                 dmonPort=5001,
                 MInstancePort=9200,
                 index="logstash-*",
                 prKafkaEndpoint=None,
                 prKafkaPort=9092,
                 prKafkaTopic='edetopic'):
        if esEndpoint is None:
            self.esInstance = None
        else:
            self.esInstance = Elasticsearch(esEndpoint)
            self.esEndpoint = esEndpoint
            self.dmonPort = dmonPort
            self.esInstanceEndpoint = MInstancePort
            self.myIndex = index
            logger.info(
                '[{}] : [INFO] EDE ES backend Defined at: {} with port {}'.
                format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), esEndpoint,
                    MInstancePort))
        if prEndpoint is None:
            pass
        else:
            self.prEndpoint = prEndpoint
            self.MInstancePort = MInstancePort
            logger.info(
                '[{}] : [INFO] EDE PR backend Defined at: {} with port {}'.
                format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), prEndpoint,
                    MInstancePort))
            self.dataDir = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), 'data')
        if prKafkaEndpoint is None:
            self.producer = None
            logger.warning('[{}] : [WARN] EDE Kafka reporter not set'.format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        else:
            self.prKafkaTopic = prKafkaTopic
            try:
                self.producer = KafkaProducer(
                    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                    bootstrap_servers=[
                        "{}:{}".format(prKafkaEndpoint, prKafkaPort)
                    ],
                    retries=5)
                logger.info(
                    '[{}] : [INFO] EDE Kafka reporter initialized to server {}:{}'
                    .format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        prKafkaEndpoint, prKafkaPort))
            except Exception as inst:
                logger.error(
                    '[{}] : [ERROR] EDE Kafka reporter failed with {} and {}'.
                    format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        type(inst), inst.args))
                self.producer = None

    def pr_health_check(self):
        pr_target_health = '/-/healthy'
        pr_target_ready = '/-/ready'
        try:
            resp_h = requests.get("http://{}:{}{}".format(
                self.prEndpoint, self.MInstancePort, pr_target_health))
            resp_r = requests.get("http://{}:{}{}".format(
                self.prEndpoint, self.MInstancePort, pr_target_ready))
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Exception has occurred while connecting to PR endpoint with type {} at arguments {}'
                .format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                    inst.args))
            sys.exit(2)
        if resp_h.status_code != 200:
            logger.error(
                '[{}] : [ERROR] PR endpoint health is bad, exiting'.format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            sys.exit(2)
        if resp_r.status_code != 200:
            logger.error(
                '[{}] : [ERROR] PR endpoint not ready to serve traffic'.format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S')))
            sys.exit(2)
        logger.info('[{}] : [INFO] PR endpoint healthcheck pass'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        return resp_h.status_code, resp_r.status_code

    def pr_status(self, type=None):
        """
        Get status of prometheus

        TODO: check runtimeinfo and flags
        :param type: supported status types
        :return:
        """
        supported = ['runtimeinfo', 'config', 'flags']
        if type is None:
            pr_target_string = '/api/v1/status/config'
        elif type in supported:
            pr_target_string = '/api/v1/status/{}'.format(type)
        else:
            logger.error(
                '[{}] : [ERROR] unsupported status type {}, supported types are {}'
                .format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), type,
                    supported))
            sys.exit(1)
        try:
            resp = requests.get("http://{}:{}{}".format(
                self.prEndpoint, self.MInstancePort, pr_target_string))
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Exception has occurred while connecting to PR endpoint with type {} at arguments {}'
                .format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), inst.__class__,
                    inst.args))
            sys.exit(2)
        return resp.json()

    def pr_targets(self):
        """
        Get Monitored Target Info
        :return: Targets Dict
        """
        pr_target_string = '/api/v1/targets'
        try:
            resp = requests.get("http://{}:{}{}".format(
                self.prEndpoint, self.MInstancePort, pr_target_string))
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Exception has occurred while connecting to PR endpoint with type {} at arguments {}'
                .format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                    inst.args))
            sys.exit(2)
        return resp.json()

    def pr_labels(self, label=None):
        if label is None:
            pr_target_string = '/api/v1/labels'
        else:
            pr_target_string = '/api/v1/label/{}/values'.format(label)
        try:
            resp = requests.get("http://{}:{}{}".format(
                self.prEndpoint, self.MInstancePort, pr_target_string))
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Exception has occurred while connecting to PR endpoint with type {} at arguments {}'
                .format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                    inst.args))
            sys.exit(2)
        return resp.json()

    def pr_query(self, query):
        """
        Query monitoring data from the PR backend
        :param query: Query string for PR backend
        :return: Monitoring Data
        """
        try:
            url = '/api/v1/query'
            resp = requests.get('http://{}:{}{}'.format(
                self.prEndpoint, self.MInstancePort, url),
                                params=query)
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Exception has occurred while connecting to PR endpoint with type {} at arguments {}'
                .format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                    inst.args))
            sys.exit(2)
        return resp.json()

    def query(self, queryBody, allm=True, dMetrics=[], debug=False):
        # self.__check_valid_es()
        res = self.esInstance.search(index=self.myIndex,
                                     body=queryBody,
                                     request_timeout=230)
        if debug:
            print(
                "%---------------------------------------------------------%")
            print("Raw JSON Ouput")
            print(res)
            print(("%d documents found" % res['hits']['total']))
            print(
                "%---------------------------------------------------------%")
        termsList = []
        termValues = []
        ListMetrics = []
        for doc in res['hits']['hits']:
            if not allm:
                if not dMetrics:
                    sys.exit(
                        "dMetrics argument not set. Please supply valid list of metrics!"
                    )
                for met in dMetrics:
                    # prints the values of the metrics defined in the metrics list
                    if debug:
                        print(
                            "%---------------------------------------------------------%"
                        )
                        print(
                            "Parsed Output -> ES doc id, metrics, metrics values."
                        )
                        print(("doc id %s) metric %s -> value %s" %
                               (doc['_id'], met, doc['_source'][met])))
                        print(
                            "%---------------------------------------------------------%"
                        )
                    termsList.append(met)
                    termValues.append(doc['_source'][met])
                dictValues = dict(list(zip(termsList, termValues)))
            else:
                for terms in doc['_source']:
                    # prints the values of the metrics defined in the metrics list
                    if debug:
                        print(
                            "%---------------------------------------------------------%"
                        )
                        print(
                            "Parsed Output -> ES doc id, metrics, metrics values."
                        )
                        print(("doc id %s) metric %s -> value %s" %
                               (doc['_id'], terms, doc['_source'][terms])))
                        print(
                            "%---------------------------------------------------------%"
                        )
                    termsList.append(terms)
                    termValues.append(doc['_source'][terms])
                    dictValues = dict(list(zip(termsList, termValues)))
            ListMetrics.append(dictValues)
        return ListMetrics, res

    def info(self):
        # self.__check_valid_es()
        try:
            res = self.esInstance.info()
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Exception has occurred while connecting to ES dmon with type %s at arguments %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                inst.args)
            sys.exit(2)
        return res

    def roles(self):
        # self.__check_valid_es()
        nUrl = "http://%s:%s/dmon/v1/overlord/nodes/roles" % (self.esEndpoint,
                                                              self.dmonPort)
        logger.info(
            '[%s] : [INFO] dmon get roles url -> %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            nUrl)
        try:
            rRoles = requests.get(nUrl)
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                inst.args)
            sys.exit(2)
        rData = rRoles.json()
        return rData

    def createIndex(self, indexName):
        # self.__check_valid_es()
        try:
            # Index creation lives on the indices client, not the document API.
            self.esInstance.indices.create(index=indexName, ignore=400)
            logger.info(
                '[%s] : [INFO] Created index %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Failed to create index %s with %s and %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName,
                type(inst), inst.args)

    def closeIndex(self, indexName):
        try:
            self.esInstance.indices.close(index=indexName)
            logger.info(
                '[%s] : [INFO] Closed index %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Failed to close index %s with %s and %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName,
                type(inst), inst.args)

    def deleteIndex(self, indexName):
        try:
            res = self.esInstance.indices.delete(index=indexName,
                                                 ignore=[400, 404])
            logger.info(
                '[%s] : [INFO] Deleted index %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Failed to delete index %s with %s and %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName,
                type(inst), inst.args)
            return 0
        return res

    def openIndex(self, indexName):
        res = self.esInstance.indices.open(index=indexName)
        logger.info(
            '[%s] : [INFO] Opened index %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            indexName)
        return res

    def getIndex(self, indexName):
        res = self.esInstance.indices.get(index=indexName, human=True)
        return res

    def getIndexSettings(self, indexName):
        res = self.esInstance.indices.get_settings(index=indexName, human=True)
        return res

    def clusterHealth(self):
        res = self.esInstance.cluster.health(request_timeout=15)
        return res

    def clusterSettings(self):
        res = self.esInstance.cluster.get_settings(request_timeout=15)
        return res

    def clusterState(self):
        res = self.esInstance.cluster.stats(human=True, request_timeout=15)
        return res

    def nodeInfo(self):
        res = self.esInstance.nodes.info(request_timeout=15)
        return res

    def nodeState(self):
        res = self.esInstance.nodes.stats(request_timeout=15)
        return res

    def getStormTopology(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/detect/storm" % (self.esEndpoint,
                                                               self.dmonPort)
        logger.info(
            '[%s] : [INFO] dmon get storm topology url -> %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            nUrl)
        try:
            rStormTopology = requests.get(nUrl)
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                inst.args)
            print("Can't connect to dmon at %s port %s" %
                  (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rData = rStormTopology.json()
        return rData

    def pushAnomalyES(self, anomalyIndex, doc_type, body):
        try:
            res = self.esInstance.index(index=anomalyIndex,
                                        doc_type=doc_type,
                                        body=body)
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Exception has occurred while pushing anomaly with type %s at arguments %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                inst.args)
            sys.exit(2)
        return res

    def pushAnomalyKafka(self, body):
        if self.producer is None:
            logger.warning(
                '[{}] : [WARN] Kafka reporter not defined, skipping reporting'.
                format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        else:
            try:
                self.producer.send(self.prKafkaTopic, body)
                # self.producer.flush()
                logger.info(
                    '[{}] : [INFO] Anomalies reported to kafka topic {}'.
                    format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        self.prKafkaTopic))
            except Exception as inst:
                logger.error(
                    '[{}] : [ERROR] Failed to report anomalies to kafka topic {} with {} and {}'
                    .format(
                        datetime.fromtimestamp(
                            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        self.prKafkaTopic, type(inst), inst.args))
        return 0

    def getModel(self):
        return "getModel"

    def pushModel(self):
        return "push model"

    def localData(self, data):
        data_loc = os.path.join(self.dataDir, data)
        try:
            df = pd.read_csv(data_loc)
        except Exception as inst:
            logger.error(
                '[{}] : [ERROR] Cannot load local data with  {} and {}'.format(
                    datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                    inst.args))
            sys.exit(2)
        logger.info(
            '[{}] : [INFO] Loading local data from {} with shape {}'.format(
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), data_loc,
                df.shape))
        return df

    def getInterval(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/aux/interval" % (self.esEndpoint,
                                                               self.dmonPort)
        logger.info(
            '[%s] : [INFO] dmon get interval url -> %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            nUrl)
        try:
            rInterval = requests.get(nUrl)
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                inst.args)
            sys.exit(2)
        rData = rInterval.json()
        return rData

    def aggQuery(self, queryBody):
        adt_timeout = os.environ['ADP_TIMEOUT'] = os.getenv(
            'ADP_TIMEOUT', str(60)
        )  # Timeout is read from the ADP_TIMEOUT env variable; defaults to 60 if unset
        # print "QueryString -> {}".format(queryBody)
        try:
            res = self.esInstance.search(index=self.myIndex,
                                         body=queryBody,
                                         request_timeout=float(adt_timeout))
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Exception while executing ES query with %s and %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                inst.args)
            sys.exit(2)
        return res

    def getNodeList(self):
        '''
        :return: -> returns the list of registered nodes from dmon
        '''
        nUrl = "http://%s:%s/dmon/v1/observer/nodes" % (self.esEndpoint,
                                                        self.dmonPort)
        logger.info(
            '[%s] : [INFO] dmon get node url -> %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            nUrl)
        try:
            rdmonNode = requests.get(nUrl)
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                inst.args)
            sys.exit(2)
        rdata = rdmonNode.json()
        nodes = []
        for e in rdata['Nodes']:
            for k in e:
                nodes.append(k)
        return nodes

    def getDmonStatus(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/core/status" % (self.esEndpoint,
                                                              self.dmonPort)
        logger.info(
            '[%s] : [INFO] dmon get core status url -> %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            nUrl)
        try:
            rdmonStatus = requests.get(nUrl)
        except Exception as inst:
            logger.error(
                '[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                inst.args)
            sys.exit(2)
        return rdmonStatus.json()
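
The connector above is driven by handing aggQuery a standard Elasticsearch query DSL body and post-processing the hits, much like the metric-parsing loop at the top of this excerpt. A minimal usage sketch follows; the class name DmonConnector and its constructor arguments are assumptions, since the class header and __init__ are not part of this excerpt.

# Usage sketch only: `DmonConnector` and its constructor signature are assumed,
# as the class header is not shown in the excerpt above.
from datetime import datetime, timedelta

connector = DmonConnector(esEndpoint='127.0.0.1', dmonPort=5001)  # hypothetical

now = datetime.utcnow()
query_body = {
    "size": 100,
    "query": {
        "range": {
            "@timestamp": {
                "gte": (now - timedelta(minutes=10)).isoformat(),
                "lte": now.isoformat(),
            }
        }
    },
}

# aggQuery() runs the body against self.myIndex using the ADP_TIMEOUT request timeout.
res = connector.aggQuery(query_body)
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_source'])
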
Example #36
0
class Tags(object):
    config_name = 'dossier.tags'

    @classmethod
    def configured(cls):
        return cls(**yakonfig.get_global_config('dossier.tags'))

    def __init__(self,
                 hosts=None,
                 namespace=None,
                 type_prefix='',
                 shards=10,
                 replicas=0,
                 tag_delimiter='/'):
        if hosts is None:
            raise yakonfig.ProgrammerError(
                'Tags needs at least one host specified.')
        if namespace is None:
            raise yakonfig.ProgrammerError('Tags needs a namespace defined.')
        self.conn = Elasticsearch(hosts=hosts, timeout=60, request_timeout=60)
        self.index = 'tags_%s' % namespace
        self.type_tag = '%stag' % type_prefix
        self.type_assoc = '%sassociation' % type_prefix
        self.shards = shards
        self.replicas = replicas
        self.delim = tag_delimiter

        created1 = self._create_index()
        created2 = self._create_mappings()
        if created1 or created2:
            # It is possible to create an index and quickly launch a request
            # that will fail because the index hasn't been set up yet. Usually,
            # you'll get a "no active shards available" error.
            #
            # Since index creation is a very rare operation (it only happens
            # when the index doesn't already exist), we sit and wait for the
            # cluster to become healthy.
            self.conn.cluster.health(index=self.index,
                                     wait_for_status='yellow')

    def add(self, assoc):
        self._validate_association(assoc)
        tag = self._normalize_tag(assoc['tag'])
        if len(tag) == 0:
            return
        self.conn.create(index=self.index,
                         doc_type=self.type_assoc,
                         body=assoc)

        # Start with creating the full tag and continue creating parent tags
        # until one exists or until we hit root. This lets us save some
        # round trips in the common case (the tag is already created).
        parts = tag.split(self.delim)
        while len(parts) > 0:
            tag = self.delim.join(parts)
            doc_tag = {
                'tag': tag,
                'parent': self.delim.join(parts[:-1]),
                'name': parts[-1],
            }
            try:
                self.conn.create(index=self.index,
                                 doc_type=self.type_tag,
                                 id=tag,
                                 body=doc_tag)
            except ConflictError as e:
                # Yay for brittle substring search for error detection!
                if 'DocumentAlreadyExistsException' in e.error:
                    break
                raise
            parts = parts[:-1]

    def list(self, parent_tag):
        parent_tag = self._normalize_tag(parent_tag)
        return self._term_query(self.type_tag, 'parent', parent_tag)

    def suggest(self, parent, prefix, limit=100):
        if prefix == '':
            # No sense in issuing a request when we already know the answer.
            return []
        body = {
            'tag': {
                'text': prefix,
                'completion': {
                    'field': 'name.suggest',
                    'size': limit,
                    'context': {
                        'parent': parent,
                    },
                },
            },
        }
        hits = self.conn.suggest(index=self.index, body=body)
        if 'tag' not in hits:
            return []
        return map(lambda hit: hit['text'], hits['tag'][0]['options'])

    def assocs_by_tag(self, tag):
        tag = self._normalize_tag(tag)
        return self._term_query(self.type_assoc, 'tag', tag)

    def assocs_by_url(self, url):
        return self._term_query(self.type_assoc, 'url', url)

    def assocs_by_stream_id(self, stream_id):
        return self._term_query(self.type_assoc, 'stream_id', stream_id)

    def sync(self):
        '''Forces an index refresh.

        This guarantees that any previous calls to ``add`` will be
        visible in subsequent searches.

        Generally, this should only be used in test code.
        '''
        self.conn.indices.refresh(index=self.index)

    def delete_all(self):
        '''Deletes all tag data.

        This does not destroy the ES index, but instead only
        deletes all tags with the configured doc types.
        '''
        try:
            self.conn.indices.delete_mapping(index=self.index,
                                             doc_type=self.type_tag)
        except TransportError:
            logger.warn('type %r in index %r already deleted',
                        self.index,
                        self.type_tag,
                        exc_info=True)
        try:
            self.conn.indices.delete_mapping(index=self.index,
                                             doc_type=self.type_assoc)
        except TransportError:
            logger.warn('type %r in index %r already deleted',
                        self.index,
                        self.type_assoc,
                        exc_info=True)

    def _create_index(self):
        'Create the index'
        # This can race, but that should be OK.
        # Worst case, we initialize with the same settings more than
        # once.
        if self.conn.indices.exists(index=self.index):
            return False
        try:
            self.conn.indices.create(index=self.index,
                                     timeout=60,
                                     request_timeout=60,
                                     body={
                                         'settings': {
                                             'number_of_shards': self.shards,
                                             'number_of_replicas':
                                             self.replicas,
                                         },
                                     })
        except TransportError:
            # Hope that this is an "index already exists" error...
            logger.warn('index already exists? OK', exc_info=True)
        return True

    def _create_mappings(self):
        'Create the field type mapping.'
        created1 = self._create_tag_mapping()
        created2 = self._create_assoc_mapping()
        return created1 or created2

    def _create_tag_mapping(self):
        mapping = self.conn.indices.get_mapping(index=self.index,
                                                doc_type=self.type_tag)
        if len(mapping) > 0:
            return False
        self.conn.indices.put_mapping(
            index=self.index,
            doc_type=self.type_tag,
            timeout=60,
            request_timeout=60,
            body={
                self.type_tag: {
                    'dynamic': False,
                    'properties': {
                        'parent': {
                            'type': 'string',
                            'index': 'not_analyzed',
                        },
                        'name': {
                            'type': 'string',
                            'index': 'not_analyzed',
                            'fields': {
                                'suggest': {
                                    'type': 'completion',
                                    'index_analyzer': 'simple',
                                    'search_analyzer': 'simple',
                                    'payloads': False,
                                    'preserve_separators': True,
                                    'preserve_position_increments': True,
                                    'max_input_length': 256,
                                    'context': {
                                        'parent': {
                                            'type': 'category',
                                            'path': 'parent',
                                        },
                                    },
                                },
                            },
                        },
                        'tag': {
                            'type': 'string',
                            'index': 'not_analyzed',
                        },
                    },
                },
            })
        return True

    def _create_assoc_mapping(self):
        mapping = self.conn.indices.get_mapping(index=self.index,
                                                doc_type=self.type_assoc)
        if len(mapping) > 0:
            return False
        self.conn.indices.put_mapping(index=self.index,
                                      doc_type=self.type_assoc,
                                      timeout=60,
                                      request_timeout=60,
                                      body={
                                          self.type_assoc: {
                                              'dynamic': False,
                                              'properties': {
                                                  'url': {
                                                      'type': 'string',
                                                      'index': 'not_analyzed'
                                                  },
                                                  'text': {
                                                      'type': 'string',
                                                      'index': 'analyzed'
                                                  },
                                                  'tag': {
                                                      'type': 'string',
                                                      'index': 'not_analyzed'
                                                  },
                                                  'stream_id': {
                                                      'type': 'string',
                                                      'index': 'not_analyzed'
                                                  },
                                                  'hash': {
                                                      'type': 'string',
                                                      'index': 'not_analyzed'
                                                  },
                                                  'timestamp': {
                                                      'type': 'integer',
                                                      'index': 'not_analyzed'
                                                  },
                                                  'xpath': {
                                                      'type': 'object',
                                                      'dynamic': False,
                                                      'properties': {
                                                          'start_node': {
                                                              'type': 'string',
                                                              'index': 'no'
                                                          },
                                                          'start_idx': {
                                                              'type':
                                                              'integer',
                                                              'index': 'no'
                                                          },
                                                          'end_node': {
                                                              'type': 'string',
                                                              'index': 'no'
                                                          },
                                                          'end_idx': {
                                                              'type':
                                                              'integer',
                                                              'index': 'no'
                                                          },
                                                      },
                                                  },
                                              },
                                          },
                                      })
        return True

    def _validate_association(self, assoc):
        def check_field(d, (name, ty), prefix=''):
Example #37
0
    print("Done with delete!")

    es.indices.create("books", {})
    print("starting book population...")
    id = 0
    for k in range(0, df.shape[0]):
        # parse book data
        book = df.iloc[k, :]
        author_arr = book.authors.split(", ")
        bookInfo = {
            "title": book.title,
            "authors": author_arr,
            'pages': book.pages,
            'isbn13': str(book['isbn13']),
            'quantity': 2
        }
        # create new book entry
        q = random.randint(1, 5)
        es.create("books",
                  id, {
                      "title": bookInfo['title'],
                      "authors": bookInfo['authors'],
                      "pages": bookInfo['pages'],
                      "isbn": bookInfo['isbn13'],
                      "quantity": q
                  },
                  doc_type="_doc")
        id = id + 1

    print("done")
Example #38
0
class ElasticsearchIndex():
    """ Gestor para indice elasticsearch"""
    
    def __init__(self, url=None):

        # establish the connection
        if url:
            print "[WARNING] ignoring config %s, using url %s" % (settings.SEARCH_INDEX['url'], url)
            self.es = Elasticsearch(url)
        else:
            self.es = Elasticsearch(settings.SEARCH_INDEX['url'])
 
        es_conf= { "settings": {
                "analysis": {
                  "filter": {
                    "english_stop": {
                      "type":       "stop",
                      "stopwords":  "_english_"
                    },
                    "light_english_stemmer": {
                      "type":       "stemmer",
                      "language":   "light_english"
                    },
                    "english_possessive_stemmer": {
                      "type":       "stemmer",
                      "language":   "english"
                    },
 
                    "light_spanish_stemmer": {
                      "type":       "stemmer",
                      "language":   "light_spanish"
                    },
                    "spanish_possessive_stemmer": {
                      "type":       "stemmer",
                      "language":   "spanish"
                    }
                  },
                    "analyzer": {
                        "case_insensitive_sort": {
                            "tokenizer": "keyword",
                            "filter":  [ "lowercase" ]
                        },
                        "english": {
                          "tokenizer":  "standard",
                          "filter": [
                            "english_possessive_stemmer",
                            "lowercase",
                            "english_stop",
                            "light_english_stemmer",
                            "asciifolding"
                          ]
                        },
                        "spanish": {
                          "tokenizer":  "standard",
                          "filter": [
                            "spanish_possessive_stemmer",
                            "lowercase",
                            "light_spanish_stemmer",
                          ]
                        }
 
                    }
                }               
            } }

        # create the index if it does not exist yet
        # (an already existing index is ignored via ignore=400)
        indices = self.es.indices.create(index=settings.SEARCH_INDEX['index'], body=es_conf, ignore=400)

        # first time the index is created: push the mappings
        try:
            if indices['acknowledged']:
                for doc_type in ["ds","dt","vz"]:
                    self.es.indices.put_mapping(index=settings.SEARCH_INDEX['index'], doc_type=doc_type, body=self.__get_mapping(doc_type))
                for finder in DatalPluginPoint.get_active_with_att('finder'):
                    self.es.indices.put_mapping(index=settings.SEARCH_INDEX['index'], doc_type=finder.doc_type, body=self.__get_mapping(finder.doc_type))
        # the index already exists
        except KeyError:
            pass

        self.logger = logging.getLogger(__name__)

    def __get_mapping(self, doc_type):
        if doc_type == "ds":
            return self.__get_datastream_mapping()
        elif doc_type == "dt":
            return self.__get_dataset_mapping()
        elif doc_type == "vz":
            return self.__get_visualization_mapping()

        for finder in DatalPluginPoint.get_active_with_att('finder'):
            if finder.doc_type == doc_type:
                return finder.get_mapping()

    def __get_datastream_mapping(self):
        return {"ds" : {
                "properties" : {
                  "categories" : {
                    "properties" : {
                      "id" : { "type" : "string" },
                      "name" : { "type" : "string", 
                                 "index" : "not_analyzed" }
                    }
                  }, # categories

                  "meta_text" : {
                    "properties" : {
                      "field_name" : { "type" : "string" },
                      "field_value" : { "type" : "string"}
                    }
                  }, # meta_text
                  "docid" : { "type" : "string" },
                  "fields" : {
                    "properties" : {
                      "account_id" : { "type" : "long" },
                      "datastream__revision_id" : { "type" : "long" },
                      "datastream_id" : { "type" : "long" },
                      "resource_id" : { "type" : "long" },
                      "revision_id" : { "type" : "long" },
                      "description" : { "type" : "string" },
                      "end_point" : { "type" : "string" },
                      "owner_nick" : { "type" : "string" },
                      "parameters" : { "type" : "string" },
                      "tags" : { "type" : "string" },
                      "text" : {
                        "type" : "string",
                        "fields": {
                                "text_lower_sort": {"type":"string", "analyzer": "case_insensitive_sort"},
                                "text_english_stemmer": {"type":"string", "analyzer": "english"},
                                "text_spanish_stemmer": {"type":"string", "analyzer": "spanish"}
                                },
                        "properties": { 
                                "text_english": {"type":"string", "analyzer": "english"},
                                "text_spanish": {"type":"string", "analyzer": "spanish"}
                        },
                      },
                      "created_at" : { "type" : "long" },
                      "timestamp" : { "type" : "long" },
                      "hits" : { "type" : "integer" },
                      "web_hits" : { "type" : "integer" },
                      "api_hits" : { "type" : "integer" },
                      "title" : { "type" : "string" ,
                        "fields": {"title_lower_sort": {"type":"string", "analyzer": "case_insensitive_sort"}}
                          },
                      "type" : { "type" : "string" }
                    }
                  } # fields
                }
              }
        }

    def __get_dataset_mapping(self):
        return {"dt" : {
                "properties" : {
                  "categories" : {
                    "properties" : {
                      "id" : { "type" : "string" },
                      "name" : { "type" : "string",
                                 "index" : "not_analyzed" }
                    }
                  }, # categories
                  "meta_text" : {
                    "properties" : {
                      "field_name" : { "type" : "string" },
                      "field_value" : { "type" : "string"}
                    }
                  }, # meta_text
                  "docid" : { "type" : "string" },
                  "fields" : {
                    "properties" : {
                      "account_id" : { "type" : "long" },
                      "datasetrevision_id" : { "type" : "long" },
                      "dataset_id" : { "type" : "long" },
                      "resource_id" : { "type" : "long" },
                      "revision_id" : { "type" : "long" },
                      "description" : { "type" : "string" },
                      "end_point" : { "type" : "string" },
                      "owner_nick" : { "type" : "string" },
                      "parameters" : { "type" : "string" },
                      "tags" : { "type" : "string" },
                      "text" : {
                        "type" : "string",
                        "fields": {
                                "text_lower_sort": {"type":"string", "analyzer": "case_insensitive_sort"},
                                "text_english_stemmer": {"type":"string", "analyzer": "english"},
                                "text_spanish_stemmer": {"type":"string", "analyzer": "spanish"}
                                },
                        "properties": { 
                                "text_english": {"type":"string", "analyzer": "english"},
                                "text_spanish": {"type":"string", "analyzer": "spanish"}
                        },
                      },
 
                      "created_at" : { "type" : "long" },
                      "timestamp" : { "type" : "long" },
                      "hits" : { "type" : "integer" },
                      "web_hits" : { "type" : "integer" },
                      "api_hits" : { "type" : "integer" },
                      "title" : { "type" : "string" ,
                        "fields": {"title_lower_sort": {"type":"string", "analyzer": "case_insensitive_sort"}}
                          },
                      "type" : { "type" : "string" }
                    }
                  } # fields
                }
              }
        }
 
    def __get_visualization_mapping(self):
        return {"vz" : {
                "properties" : {
                  "categories" : {
                    "properties" : {
                      "id" : { "type" : "string" },
                      "name" : { "type" : "string",
                                 "index" : "not_analyzed" }
                    }
                  }, # categories
                  "meta_text" : {
                    "properties" : {
                      "field_name" : { "type" : "string" },
                      "field_value" : { "type" : "string"}
                    }
                  }, # meta_text
                  "docid" : { "type" : "string" },
                  "fields" : {
                    "properties" : {
                      "account_id" : { "type" : "long" },
                      "resource_id" : { "type" : "long" },
                      "revision_id" : { "type" : "long" },
                      "visualization_revision_id" : { "type" : "long" },
                      "visualization_id" : { "type" : "long" },
                      "description" : { "type" : "string" },
                      "end_point" : { "type" : "string" },
                      "owner_nick" : { "type" : "string" },
                      "parameters" : { "type" : "string" },
                      "tags" : { "type" : "string" },
                      "text" : {
                        "type" : "string",
                        "fields": {
                                "text_lower_sort": {"type":"string", "analyzer": "case_insensitive_sort"},
                                "text_english_stemmer": {"type":"string", "analyzer": "english"},
                                "text_spanish_stemmer": {"type":"string", "analyzer": "spanish"}
                                },
                        "properties": { 
                                "text_english": {"type":"string", "analyzer": "english"},
                                "text_spanish": {"type":"string", "analyzer": "spanish"}
                        },
                      },
 
                      "hits" : { "type" : "integer" },
                      "web_hits" : { "type" : "integer" },
                      "api_hits" : { "type" : "integer" },
                      "created_at" : { "type" : "long" },
                      "timestamp" : { "type" : "long" },
                      "title" : { "type" : "string" ,
                        "fields": {"title_lower_sort": {"type":"string", "analyzer": "case_insensitive_sort"}}
                          },
                      "type" : { "type" : "string" }
                    }
                  } # fields
                }
              }
        }

    def indexit(self, document):
        """add document to index
        :param document:
        """

        if document:
            # self.logger.info('Elasticsearch: adding %s to the index' % str(document))
            try:
                return self.es.create(
                    index=settings.SEARCH_INDEX['index'],
                    body=document,
                    doc_type=document['fields']['type'],
                    id=document['docid'])
            except:
                return self.es.index(
                    index=settings.SEARCH_INDEX['index'],
                    body=document,
                    doc_type=document['fields']['type'],
                    id=document['docid'])


        return False
        
    def count(self, doc_type=None):
        """return %d of documents in index, doc_type (opt) filter this document type"""

        if doc_type:
            return self.es.count(index=settings.SEARCH_INDEX['index'], doc_type=doc_type)['count']
        else:
            return self.es.count(index=settings.SEARCH_INDEX['index'])['count']
        
    def delete_document(self, document):
        """delete by ID"""

        try:
            output = self.es.delete(index=settings.SEARCH_INDEX['index'], id=document['docid'], doc_type=document['type'])
            return output
        except NotFoundError:
            self.logger.error("ERROR NotFound: ID %s not found in index" % document['docid'])
            return {u'found': False, u'documment': document, u'index': settings.SEARCH_INDEX['index']}
        except KeyError:
            self.logger.error("ERROR KeyError: Document error (doc: %s)" % str(document))
        except TypeError:
            self.logger.error("ERROR TypeError: Document error (doc: %s)" % str(document))

        return False

    def __filterDeleted(self, item):
        return item['found']

    def __filterNotDeleted(self, item):
        return not item['found']

    def flush_index(self):
        return self.es.indices.delete(index=settings.SEARCH_INDEX['index'], ignore=[400, 404])

    def delete_documents(self, documents):
        """Delete from a list. Return [list(deleted), list(notdeleted)]
        :param documents:
        """
        result = map(self.delete_document, documents)

        documents_deleted = filter(self.__filterDeleted, result)
        documents_not_deleted = filter(self.__filterNotDeleted, result)

        return [documents_deleted, documents_not_deleted]

    def search(self, doc_type, query, fields="*" ):
        """Search by query
        :param doc_type:
        :param query:
        :param fields:
        """

        try:
            return self.es.search(index=settings.SEARCH_INDEX['index'], doc_type=doc_type, body=query, _source_include=fields)
        except RequestError, e:
            raise RequestError(e)
        except NotFoundError, e:
            raise NotFoundError(e)
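
A usage sketch for the ElasticsearchIndex class above follows. The document layout mirrors the "ds" mapping defined earlier (a docid plus a fields sub-document whose type selects the ES doc_type); the concrete values, the query, and the configured settings.SEARCH_INDEX are illustrative assumptions.

# Usage sketch: document shape follows the "ds" mapping above; the values are
# illustrative and settings.SEARCH_INDEX is assumed to be configured.
index = ElasticsearchIndex()

document = {
    "docid": "DS::42",
    "type": "ds",                      # read back by delete_document()
    "categories": {"id": "7", "name": "Environment"},
    "fields": {
        "type": "ds",                  # indexit() uses this as the doc_type
        "title": "Monthly rainfall",
        "description": "Rainfall per station, aggregated monthly",
        "tags": "weather rainfall",
        "text": "Monthly rainfall per station",
        "account_id": 1,
    },
}

index.indexit(document)

query = {"query": {"query_string": {"query": "rainfall"}}}
results = index.search(doc_type="ds", query=query)
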
class Worker(Process):
    def __init__(self, work_queue):
        super(Worker, self).__init__()
        self.api_client = APIClient('http://%s:9200' % es_hosts[random.randint(0, len(es_hosts) - 1)].get('host'))
        self.work_queue = work_queue
        self.es = Elasticsearch(es_hosts)
        self.sentence_list = loremipsum.get_sentences(1000)
        self.re_first_word = re.compile('([A-z]+)')

    def run(self):
        print 'Starting %s ' % self.name
        counter = 0

        batch = []

        while True:
            index_batch_size = args.get('batch_size')
            task = self.work_queue.get(timeout=600)
            counter += 1

            document = self.generate_document(task['field_count'])
            flattened_doc = self.process_document(document,
                                                  task['type'],
                                                  task['uuid'],
                                                  task['uuid'])

            index_type_tuple = (task['index'], task['type'])

            # self.handle_document(task['index'], task['type'], task['uuid'], flattened_doc)

            batch.append((index_type_tuple, flattened_doc))

            if len(batch) >= index_batch_size:
                self.handle_batch(batch)
                batch = []

            self.work_queue.task_done()

    def generate_document(self, fields):

        doc = {}

        my_bool = True

        for i in xrange(fields):
            sentence_index = random.randint(0, max((fields / 2) - 1, 1))
            sentence = self.sentence_list[sentence_index]

            if random.random() >= .5:
                key = self.re_first_word.findall(sentence)[1]
            else:
                key = self.re_first_word.findall(sentence)[1] + str(i)

            field_type = random.random()

            if field_type <= 0.3:
                doc[key] = sentence

            elif field_type <= 0.5:
                doc[key] = random.randint(1, 1000000)

            elif field_type <= 0.6:
                doc[key] = random.random() * 1000000000

            elif field_type <= 0.7:
                doc[key] = my_bool
                my_bool = not my_bool

            elif field_type <= 0.8:
                doc[key] = self.generate_document(max(fields / 5, 1))

            elif field_type <= 1.0:
                doc['mylocation'] = self.generate_location()

        return doc

    @staticmethod
    def get_fields(document, base_name=None):
        fields = []

        for name, value in document.iteritems():
            if base_name:
                field_name = '%s.%s' % (base_name, name)
            else:
                field_name = name

            if isinstance(value, dict):
                fields += Worker.get_fields(value, field_name)
            else:
                value_name = None
                if isinstance(value, basestring):
                    value_name = 'string'

                elif isinstance(value, bool):
                    value_name = 'boolean'

                elif isinstance(value, (int, long)):
                    value_name = 'long'

                elif isinstance(value, float):
                    value_name = 'double'

                if value_name:
                    field = {
                        'name': field_name,
                        value_name: value
                    }
                else:
                    field = {
                        'name': field_name,
                        'string': str(value)
                    }

                fields.append(field)

        return fields


    @staticmethod
    def process_document(document, doc_type, application_id, uuid):
        response = {
            'entityId': uuid,
            'entityVersion': '1',
            'entityType': doc_type,
            'applicationId': application_id,
            'fields': Worker.get_fields(document)
        }

        return response

    def handle_document(self, index, doc_type, uuid, document):

        res = self.es.create(index=index,
                             doc_type=doc_type,
                             id=uuid,
                             body=document)

        print res

    def generate_location(self):
        response = {}

        lat = random.random() * 90.0
        lon = random.random() * 180.0

        lat_neg_true = True if lon > .5 else False
        lon_neg_true = True if lat > .5 else False

        lat = lat * -1.0 if lat_neg_true else lat
        lon = lon * -1.0 if lon_neg_true else lon

        response['location'] = {
            'lat': lat,
            'lon': lon
        }

        return response

    def handle_batch(self, batch):
        print 'HANDLE BATCH size=%s' % len(batch)
        # self.api_client.define_type_mapping(index, doc_type)
        self.api_client.index_batch(batch)
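
The Worker above pulls task dictionaries off a multiprocessing queue; a driver sketch that feeds it could look like the following. The module-level globals the class relies on (es_hosts, args) and the APIClient helper are assumed to be defined elsewhere in the original script, and the task keys mirror what run() reads from the queue.

# Driver sketch for the Worker class above. es_hosts, args and APIClient are
# assumed to exist at module level, exactly as the class itself expects.
import uuid
from multiprocessing import JoinableQueue

if __name__ == '__main__':
    work_queue = JoinableQueue()
    workers = [Worker(work_queue) for _ in range(4)]
    for worker in workers:
        worker.start()

    for _ in range(10000):
        work_queue.put({
            'index': 'loadtest',
            'type': 'entity',
            'uuid': str(uuid.uuid4()),
            'field_count': 50,
        })

    work_queue.join()   # block until every task has been acknowledged
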
Example #40
0
import json
import os

import progressbar
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts="kiddd.science:19200")
with open("qidian.txt") as f:
    lines = f.readlines()
    for line in progressbar.progressbar(lines, redirect_stdout=True):
        book = json.loads(line)

        path = 'books_qidian/' + book['bid']
        if os.path.exists(path + '.txt'):
            book['download'] = 'qidian/' + book['bid'] + '.txt'
        elif os.path.exists(path + '.epub'):
            book['download'] = 'qidian/' + book['bid'] + '.epub'
        else:
            print('ERROR, there is no %s' % book['title'])
            continue
        del book['bid']
        try:
            es.create(index='ebooks',
                      id=book['download'],
                      doc_type='book',
                      body=book)
        except Exception as e:
            print(e)
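
Because the loop above issues one create() call per book, a batched variant with elasticsearch.helpers.bulk is usually noticeably faster. The sketch below keeps the same index, doc_type and id layout; it is not part of the original script, and prepared_books stands for a hypothetical list of book dicts prepared the same way as in the loop.

# Batched alternative to the per-document create() calls above, sketched with
# elasticsearch.helpers.bulk; same index/doc_type/id layout as the loop.
from elasticsearch import helpers

def book_actions(books):
    for book in books:
        yield {
            '_op_type': 'create',      # fail on duplicate ids, like create()
            '_index': 'ebooks',
            '_type': 'book',
            '_id': book['download'],
            '_source': book,
        }

# helpers.bulk(es, book_actions(prepared_books), raise_on_error=False)
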
Example #41
0
class ES:
    index = None
    conn = None
    settings = None

    grok_filters = dict()

    notifier = None

    bulk_actions = []

    def __init__(self, settings=None, logging=None):
        self.settings = settings
        self.logging = logging

        if self.settings.config.getboolean("notifier", "email_notifier"):
            self.notifier = Notifier(settings, logging)

    def init_connection(self):
        self.conn = Elasticsearch(
            [self.settings.config.get("general", "es_url")],
            use_ssl=False,
            timeout=self.settings.config.getint("general", "es_timeout"),
            verify_certs=False,
            retry_on_timeout=True)

        if self.conn.ping():
            self.logging.logger.info(
                "connected to Elasticsearch on host %s" %
                (self.settings.config.get("general", "es_url")))
        else:
            self.logging.logger.error(
                "could not connect to to host %s. Exiting!" %
                (self.settings.config.get("general", "es_url")))

        return self.conn

    def scan(self,
             index,
             bool_clause=None,
             sort_clause=None,
             query_fields=None,
             search_query=None):
        preserve_order = True if sort_clause is not None else False
        return eshelpers.scan(
            self.conn,
            request_timeout=self.settings.config.getint(
                "general", "es_timeout"),
            index=index,
            query=build_search_query(bool_clause=bool_clause,
                                     sort_clause=sort_clause,
                                     search_range=self.settings.search_range,
                                     query_fields=query_fields,
                                     search_query=search_query),
            size=self.settings.config.getint("general", "es_scan_size"),
            scroll=self.settings.config.get("general", "es_scroll_time"),
            preserve_order=preserve_order,
            raise_on_error=False)

    def count_documents(self,
                        index,
                        bool_clause=None,
                        query_fields=None,
                        search_query=None):
        res = self.conn.search(
            index=index,
            body=build_search_query(bool_clause=bool_clause,
                                    search_range=self.settings.search_range,
                                    query_fields=query_fields,
                                    search_query=search_query),
            size=self.settings.config.getint("general", "es_scan_size"),
            scroll=self.settings.config.get("general", "es_scroll_time"))
        return res["hits"]["total"]

    def filter_by_query_string(self, query_string=None):
        bool_clause = {"filter": [{"query_string": {"query": query_string}}]}
        return bool_clause

    def filter_by_dsl_query(self, dsl_query=None):
        dsl_query = json.loads(dsl_query)

        if isinstance(dsl_query, list):
            bool_clause = {"filter": []}
            for query in dsl_query:
                bool_clause["filter"].append(query["query"])
        else:
            bool_clause = {"filter": [dsl_query["query"]]}
        return bool_clause

    # this is part of housekeeping, so we should not access non-thread-safe objects, such as logging progress to the console using ticks!
    def remove_all_whitelisted_outliers(self):
        from helpers.outlier import Outlier  # import goes here to avoid issues with singletons & circular requirements ... //TODO: fix this

        outliers_filter_query = {"filter": [{"term": {"tags": "outlier"}}]}
        total_docs_whitelisted = 0

        idx = self.settings.config.get("general", "es_index_pattern")
        total_nr_outliers = self.count_documents(
            index=idx, bool_clause=outliers_filter_query)
        self.logging.logger.info(
            "going to analyze %s outliers and remove all whitelisted items",
            "{:,}".format(total_nr_outliers))

        for doc in self.scan(index=idx, bool_clause=outliers_filter_query):
            total_outliers = int(doc["_source"]["outliers"]["total_outliers"])
            # Generate all outlier objects for this document
            total_whitelisted = 0

            for i in range(total_outliers):
                outlier_type = doc["_source"]["outliers"]["type"][i]
                outlier_reason = doc["_source"]["outliers"]["reason"][i]
                outlier_summary = doc["_source"]["outliers"]["summary"][i]

                outlier = Outlier(outlier_type=outlier_type,
                                  outlier_reason=outlier_reason,
                                  outlier_summary=outlier_summary)
                if outlier.is_whitelisted(additional_dict_values_to_check=doc):
                    total_whitelisted += 1

            # if all outliers for this document are whitelisted, remove them all. If not, don't touch the document.
            # this is a limitation in the way our outliers are stored: if not ALL of them are whitelisted, we can't remove just the whitelisted ones
            # from the Elasticsearch event, as they are stored as array elements and potentially contain observations that should be removed, too.
            # In this case, just don't touch the document.
            if total_whitelisted == total_outliers:
                total_docs_whitelisted += 1
                doc = remove_outliers_from_document(doc)

                self.conn.delete(index=doc["_index"],
                                 doc_type=doc["_type"],
                                 id=doc["_id"],
                                 refresh=True)
                self.conn.create(index=doc["_index"],
                                 doc_type=doc["_type"],
                                 id=doc["_id"],
                                 body=doc["_source"],
                                 refresh=True)

        return total_docs_whitelisted

    def remove_all_outliers(self):
        idx = self.settings.config.get("general", "es_index_pattern")

        must_clause = {"filter": [{"term": {"tags": "outlier"}}]}
        total_outliers = self.count_documents(index=idx,
                                              bool_clause=must_clause)

        query = build_search_query(bool_clause=must_clause,
                                   search_range=self.settings.search_range)

        script = {
            "source":
            "ctx._source.remove(\"outliers\"); ctx._source.tags.remove(ctx._source.tags.indexOf(\"outlier\"))",
            "lang": "painless"
        }

        query["script"] = script

        if total_outliers > 0:
            self.logging.logger.info("wiping %s existing outliers",
                                     "{:,}".format(total_outliers))
            self.conn.update_by_query(index=idx,
                                      body=query,
                                      refresh=True,
                                      wait_for_completion=True)
            self.logging.logger.info("wiped outlier information of " +
                                     "{:,}".format(total_outliers) +
                                     " documents")
        else:
            self.logging.logger.info(
                "no existing outliers were found, so nothing was wiped")

    def process_outliers(self, doc=None, outliers=None, should_notify=False):
        for outlier in outliers:
            if outlier.is_whitelisted(additional_dict_values_to_check=doc):
                if self.settings.config.getboolean(
                        "general", "print_outliers_to_console"):
                    self.logging.logger.info(outlier.outlier_dict["summary"] +
                                             " [whitelisted outlier]")
            else:
                if self.settings.config.getboolean("general",
                                                   "es_save_results"):
                    self.save_outlier(doc=doc, outlier=outlier)

                if should_notify:
                    self.notifier.notify_on_outlier(doc=doc, outlier=outlier)

                if self.settings.config.getboolean(
                        "general", "print_outliers_to_console"):
                    self.logging.logger.info("outlier - " +
                                             outlier.outlier_dict["summary"])

    def add_bulk_action(self, action):
        self.bulk_actions.append(action)
        if len(self.bulk_actions) > BULK_FLUSH_SIZE:
            self.flush_bulk_actions()

    def flush_bulk_actions(self, refresh=False):
        if len(self.bulk_actions) == 0:
            return
        eshelpers.bulk(self.conn,
                       self.bulk_actions,
                       stats_only=True,
                       refresh=refresh)
        self.bulk_actions = []

    def save_outlier(self, doc=None, outlier=None):
        # add the derived fields as outlier observations
        derived_fields = self.extract_derived_fields(doc["_source"])
        for derived_field, derived_value in derived_fields.items():
            outlier.outlier_dict["derived_" + derived_field] = derived_value

        doc = add_outlier_to_document(doc, outlier)

        action = {
            '_op_type': 'update',
            '_index': doc["_index"],
            '_type': doc["_type"],
            '_id': doc["_id"],
            'retry_on_conflict': 10,
            'doc': doc["_source"]
        }
        self.add_bulk_action(action)

    def extract_derived_fields(self, doc_fields):
        derived_fields = dict()
        for field_name, grok_pattern in self.settings.config.items(
                "derivedfields"):
            if helpers.utils.dict_contains_dotkey(doc_fields,
                                                  field_name,
                                                  case_sensitive=False):
                if grok_pattern in self.grok_filters.keys():
                    grok = self.grok_filters[grok_pattern]
                else:
                    grok = Grok(grok_pattern)
                    self.grok_filters[grok_pattern] = grok

                match_dict = grok.match(
                    helpers.utils.get_dotkey_value(doc_fields,
                                                   field_name,
                                                   case_sensitive=False))

                if match_dict:
                    for match_dict_k, match_dict_v in match_dict.items():
                        derived_fields[match_dict_k] = match_dict_v

        return derived_fields

    def extract_fields_from_document(self, doc, extract_derived_fields=False):
        doc_fields = doc["_source"]

        if extract_derived_fields:
            derived_fields = self.extract_derived_fields(doc_fields)

            for k, v in derived_fields.items():
                doc_fields[k] = v

        return doc_fields
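
extract_derived_fields above walks the "derivedfields" config section and applies one Grok pattern per configured field; the sketch below shows what a single extraction boils down to. The Grok class is assumed to be pygrok-compatible (its import is not shown in this excerpt), and the pattern and log line are purely illustrative.

# Single derived-field extraction, sketched. Grok is assumed to behave like
# pygrok's Grok (pattern compiled once, .match() returns a dict or None).
grok = Grok('%{IPORHOST:client_ip} - %{WORD:method} %{URIPATHPARAM:request}')
match_dict = grok.match('10.0.0.5 - GET /api/v1/status')
# match_dict -> {'client_ip': '10.0.0.5', 'method': 'GET', 'request': '/api/v1/status'}
# save_outlier() then stores each pair on the outlier as 'derived_<field>'.
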
Example #42
0
class DPRIndex(DocumentChunker):
    '''
    Class for indexing and searching documents, using a combination of
    vectors producted by DPR and keyword matching from Elastic TF-IDF. As a
    subclass of DocumentChunker, this class automatically handles document
    chunking as well.
    '''

    INDEX_NAME = 'dense-passage-retrieval'
    D = 768
    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    context_model = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    question_model = DPRQuestionEncoder.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base', return_dict=True)

    def __init__(self, documents: List[DPRDocument]):
        super(DPRIndex, self).__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.device == 'cuda':
            self.reader_model = self.reader_model.cuda()
        self.faiss_index = faiss.IndexFlatIP(self.D)
        self._setup_elastic_index()
        self._build_index(documents)

    def _setup_elastic_index(self):
        '''Sets up the Elastic Index. Deletes old ones if needed.'''
        self.es = Elasticsearch()
        if self.es.indices.exists(self.INDEX_NAME):
            logging.warning(f'Deleting old index for {self.INDEX_NAME}.')
            self.es.indices.delete(self.INDEX_NAME)
        self.es.indices.create(index=self.INDEX_NAME)

    def _build_index(self, documents):
        '''
        Initializes the data structure to keep track of which chunks
        correspond to which documents.
        '''
        self.documents = documents
        self.doc_bodies = [doc.body for doc in self.documents]
        self.chunks = []
        self.chunk_index = {}  # {chunk: document}
        self.inverse_chunk_index = {}  # {document: [chunks]}
        chunk_counter = 0
        for doc_counter, doc_body in tqdm(enumerate(self.doc_bodies),
                                          total=len(self.doc_bodies)):
            self.inverse_chunk_index[doc_counter] = []
            chunked_docs = self.chunk_document(doc_body)
            self.chunks.extend(chunked_docs)
            for chunked_doc in chunked_docs:
                chunk_embedding = self.embed_context(chunked_doc)
                self.faiss_index.add(chunk_embedding)
                self.es.create(self.INDEX_NAME,
                               id=chunk_counter,
                               body={'chunk': chunked_doc})
                self.chunk_index[chunk_counter] = doc_counter
                self.inverse_chunk_index[doc_counter].append(chunk_counter)
                chunk_counter += 1
        self.total_docs = len(self.documents)
        self.total_chunks = len(self.chunks)

    def embed_question(self, question: str):
        '''Embed the question in vector space with the question encoder.'''
        input_ids = self.question_tokenizer(question,
                                            return_tensors='pt')['input_ids']
        embeddings = self.question_model(
            input_ids).pooler_output.detach().numpy()
        return embeddings

    def embed_context(self, context: str):
        '''Embed the context (doc) in vector space with the context encoder.'''
        input_ids = self.context_tokenizer(context,
                                           return_tensors='pt')['input_ids']
        embeddings = self.context_model(
            input_ids).pooler_output.detach().numpy()
        return embeddings

    def search_dense_index(self, question: str, k: int = 5):
        '''
        Search the vector index by encoding the question and then performing
        nearest neighbor on the FAISS index of context vectors.

        Args:
            question (str):
                The natural language question, e.g. `who is bill gates?`
            k (int):
                The number of documents to return from the index.
        '''
        if k > self.total_chunks:
            k = self.total_chunks
        question_embedding = self.embed_question(question)
        dists, chunk_ids = self.faiss_index.search(question_embedding, k=k)
        dists, chunk_ids = list(dists[0]), list(chunk_ids[0])
        dists = list(map(float, dists))  # For Flask
        structured_response = []
        for dist, chunk_id in zip(dists, chunk_ids):
            chunk = self.chunks[chunk_id]
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            blob = {
                'document': document,
                'document_id': document_id,
                'chunk': chunk,
                'chunk_id': int(chunk_id),  # For Flask
                'faiss_dist': dist
            }
            structured_response.append(blob)
        return structured_response

    def search_sparse_index(self, query):
        body = {'size': 10, 'query': {'match': {'chunk': query}}}
        results = self.es.search(index=self.INDEX_NAME, body=body)
        hits = results['hits']['hits']
        return hits

    def _merge_results(self, sparse_results, dense_results):
        '''Merges the results of sparse and dense retrieval.'''
        results_index = {}
        for sparse_result in sparse_results:
            id, score = sparse_result['_id'], sparse_result['_score']
            id = int(id)
            results_index[id] = {'elastic_score': score}
        for dense_result in dense_results:
            id, score = dense_result['chunk_id'], dense_result['faiss_dist']
            if id in results_index:
                results_index[id]['faiss_dist'] = score
            else:
                results_index[id] = {'faiss_dist': score}
        results = []
        for chunk_id, scores in results_index.items():
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            chunk = self.chunks[chunk_id]
            doc_profile = document.to_dict()
            result = {
                'chunk_id': chunk_id,
                'chunk': chunk,
                'document_id': document_id,
                'document': doc_profile,
                'scores': scores
            }
            results.append(result)
        return results

    def search_dual_index(self, query: str):
        '''Search both the sparse and dense indices and merge the results.'''
        sparse_result = self.search_sparse_index(query)
        dense_result = self.search_dense_index(query)
        merged_results = self._merge_results(sparse_result, dense_result)
        return merged_results
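A rough usage sketch for DPRIndex, assuming the class above lives in a module named dpr_index and that DPRDocument takes the document text as a body argument (both the module name and the constructor signature are assumptions, not taken from the snippet):

# Hypothetical module name and DPRDocument constructor; adjust to the real project layout.
from dpr_index import DPRDocument, DPRIndex

docs = [
    DPRDocument(body='Bill Gates co-founded Microsoft in 1975.'),
    DPRDocument(body='FAISS is a library for efficient similarity search.'),
]
index = DPRIndex(docs)  # builds the FAISS index and the Elasticsearch index

# Dense-only, sparse-only, and merged retrieval over the same chunks.
print(index.search_dense_index('who founded microsoft?', k=2))
print(index.search_sparse_index('microsoft'))
print(index.search_dual_index('who founded microsoft?'))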
示例#43
0
class PreselectionServer:
    def __init__(self, address):
        self.es = Elasticsearch(address)

    def index_documents(self, dfUsr):
        df = dfUsr

        means = df.groupby(['userID'], as_index=False, sort=False) \
            .mean() \
            .loc[:, ['userID', 'rating']] \
            .rename(columns={'rating': 'ratingMean'})

        df = pd.merge(df, means, on='userID', how="left", sort=False)
        df['ratingNormal'] = df['rating'] - df['ratingMean']

        ratings = df.loc[:, ['userID', 'movieID', 'ratingNormal']] \
            .rename(columns={'ratingNormal': 'rating'}) \
            .pivot_table(index='userID', columns='movieID', values='rating') \
            .fillna(0)

        print("Indexing users...")
        index_users = [{
            "_index": "users",
            "_type": "user",
            "_id": index,
            "_source": {
                'ratings': row[row > 0] \
                .sort_values(ascending=False) \
                .index.values.tolist()
            }
        } for index, row in ratings.iterrows()]
        helpers.bulk(self.es, index_users)
        print("Done")

        print("Indexing movies...")
        index_movies = [{
            "_index": "movies",
            "_type": "movie",
            "_id": column,
            "_source": {
                "whoRated": ratings[column][ratings[column] > 0] \
                        .sort_values(ascending=False) \
                        .index.values.tolist()
                        }
        } for column in ratings]
        helpers.bulk(self.es, index_movies)
        print("Done")

    def get_movies_liked_by_user(self, user_id, index='users'):
        user_id = int(user_id)
        result = self.es.get(index=index, doc_type="user",
                             id=user_id)["_source"]
        return result

    def get_users_that_like_movie(self, movie_id, index='movies'):
        movie_id = int(movie_id)
        return self.es.get(index=index, doc_type="movie",
                           id=movie_id)["_source"]

    def get_movie_recommendations(self, user_id, index='users'):
        movies_liked_by_user = self.get_movies_liked_by_user(user_id)
        user_id = int(user_id)

        users_who_rated_at_least_one = self.es.search(
            index=index, body={'query': {
                'terms': movies_liked_by_user
            }})["hits"]["hits"]

        unique_movies = set()
        for ratings in users_who_rated_at_least_one:
            if ratings["_id"] != user_id:
                ratings = ratings["_source"]["ratings"]
                for rating in ratings:
                    if rating not in movies_liked_by_user["ratings"]:
                        unique_movies.add(rating)

        return list(unique_movies)

    def get_user_recommendations(self, movie_id, index='movies'):
        users_who_liked_the_movie = self.get_users_that_like_movie(movie_id)
        movie_id = int(movie_id)

        movies_rated_by_at_least_one = self.es.search(
            index=index, body={'query': {
                'terms': users_who_liked_the_movie
            }})["hits"]["hits"]

        unique_users = set()
        for ratings in movies_rated_by_at_least_one:
            if ratings["_id"] != movie_id:
                ratings = ratings["_source"]["whoRated"]
                for rating in ratings:
                    if rating not in users_who_liked_the_movie["whoRated"]:
                        unique_users.add(rating)

        return list(unique_users)

    def add_user_document(self,
                          user_id,
                          movies,
                          user_index='users',
                          movie_index='movies'):
        user_id = int(user_id)
        movies = list(set(movies))
        to_update = [
            self.es.get(index=movie_index, id=movie_id, doc_type='movie')
            for movie_id in movies
        ]

        if len(to_update) != len(movies):
            raise Exception("One or more movies unknown")

        for movie_document in to_update:
            users = movie_document["_source"]["whoRated"]
            users.append(user_id)
            users = list(set(users))
            self.es.update(index=movie_index,
                           id=movie_document["_id"],
                           doc_type='movie',
                           body={"doc": {
                               "whoRated": users
                           }})

        self.es.create(index=user_index,
                       id=user_id,
                       body={"ratings": movies},
                       doc_type='user')

    def update_user_document(self,
                             user_id,
                             movies,
                             user_index='users',
                             movie_index='movies'):
        user_id = int(user_id)

        movies = list(set(movies))
        to_update = self.es.get(index=user_index, id=user_id, doc_type='user')
        old_movies = to_update['_source']['ratings']

        movies_to_add_user = np.setdiff1d(movies, old_movies)
        movies_to_remove_user = np.setdiff1d(old_movies, movies)

        for movie_to_remove_user in movies_to_remove_user:
            movie_document = self.es.get(index=movie_index,
                                         id=movie_to_remove_user,
                                         doc_type='movie')
            users_who_liked_movie = movie_document["_source"]["whoRated"]
            users_who_liked_movie.remove(user_id)
            users_who_liked_movie = list(set(users_who_liked_movie))
            self.es.update(index=movie_index,
                           id=movie_to_remove_user,
                           doc_type='movie',
                           body={"doc": {
                               "whoRated": users_who_liked_movie
                           }})

        for movie_to_add_user in movies_to_add_user:
            movie_document = self.es.get(index=movie_index,
                                         id=movie_to_add_user,
                                         doc_type='movie')
            users_who_liked_movie = movie_document["_source"]["whoRated"]
            users_who_liked_movie.append(user_id)
            users_who_liked_movie = list(set(users_who_liked_movie))
            self.es.update(index=movie_index,
                           id=movie_to_add_user,
                           doc_type='movie',
                           body={"doc": {
                               "whoRated": users_who_liked_movie
                           }})

        self.es.update(index=user_index,
                       id=user_id,
                       body={"doc": {
                           "ratings": movies
                       }},
                       doc_type="user")

    def get_all_index(self):
        return self.es.indices.get_alias("*")
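A small end-to-end sketch for PreselectionServer, assuming the class above is importable from a module named preselection (hypothetical) and that the input DataFrame carries the userID, movieID and rating columns index_documents() expects:

# Hypothetical module name; the column names match what index_documents() reads.
import pandas as pd
from preselection import PreselectionServer

ratings = pd.DataFrame({
    'userID':  [1, 1, 2, 2, 3],
    'movieID': [10, 20, 10, 30, 20],
    'rating':  [5.0, 3.0, 4.0, 5.0, 4.5],
})

server = PreselectionServer('http://localhost:9200')
server.index_documents(ratings)

print(server.get_movies_liked_by_user(1))    # {'ratings': [...]}
print(server.get_users_that_like_movie(10))  # {'whoRated': [...]}
print(server.get_movie_recommendations(1))   # movie ids liked by similar users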
示例#44
0
import random
from make_file_io import makeFileIo

import time
from datetime import datetime
from elasticsearch import Elasticsearch
import json

es = Elasticsearch(
    hosts=["ec2-54-180-123-238.ap-northeast-2.compute.amazonaws.com"])
# es = Elasticsearch()

# es.indices.create(index="io_log")

doc = makeFileIo("poc1.mobis.com", "", "FR_CAM", "/ifs/raw_10/FR_CAM",
                 "inactive", "2019-10-10 10:10:10")

res = es.create(index="io_log", id=doc['datetime'], body=doc)
示例#45
0
class pdfGraph():
    """Create and manage the PDF graph in Neo4j and index in Elasticsearch"""

    db_path = "http://localhost:7474/db/data/"
    db = None
    pdf_documents = None
    authors = None
    keywords = None
    es_cluster = [{'host': 'localhost', 'port': 9200}]
    es = None
    es_ixc = None

    def __init__(self):
        """ setup Neo4j database connection and node labels
            and Elasticsearch mapping attachments index """

        self.db = GraphDatabase(self.db_path)
        self.pdf_documents = self.db.labels.create("PDFDocument")
        self.authors = self.db.labels.create("Author")
        self.keywords = self.db.labels.create("Keyword")

        self.es = Elasticsearch(self.es_cluster)
        self.es_ixc = IndicesClient(self.es)
        self.es_ixc.create(
            index="pdf_documents",
            body={
                'mappings': {
                    'pdf': {
                        'properties': {
                            'url': {'type': "string"},
                            'pdf_file': {'type': "attachment"}
                        }
                    }
                }
            }
        )

    def createNodesAndIx(self, doc_url, doc_info, doc_metadata, doc_data):
        """Given document details create nodes and relationships for documents,
        authors and keywords and store the related documents for indexing and
        search"""

        # not all pdf docs have all fields so we need to check for existence
        check_for = lambda n, d: d[n] if (n in d) else ''
        author = check_for('Author', doc_info[0])
        # create an author node if one doesn't already exists
        if author != '':
            author_node = self.authorExists(author)
            if author_node is None:
                author_node = self.createAuthor(author)
        # create keyword nodes if they don't already exist
        if check_for('pdf', doc_metadata) != '':
            keywords = check_for('Keywords', doc_metadata['pdf'])
        else:
            keywords = ''
        if keywords != '':
            keyword_nodes = []
            for keyword in map(lambda x: x.strip(" '\""), keywords.split(",")):
                keyword_node = self.keywordExists(keyword)
                if keyword_node is None:
                    keyword_node = self.createKeyword(keyword)
                keyword_nodes.append(keyword_node)
        # create the document node
        pdf_node = self.db.nodes.create(
            url=doc_url,
            info=repr(doc_info),
            metadata=repr(doc_metadata),
            title=check_for('Title', doc_info[0])
        )
        self.pdf_documents.add(pdf_node)
        # create relationships b/w document, author and keywords
        if author != '':
            pdf_node.relationships.create("AUTHORED_BY", author_node)
        if keywords != '':
            for keyword_node in keyword_nodes:
                pdf_node.relationships.create("HAS_KEYWORD", keyword_node)
        # add the document for full-text search to ES using Neo4j id
        self.es.create(
            index="pdf_documents",
            doc_type="pdf",
            id=pdf_node.id,
            body={
                'url': doc_url,
                'pdf_file': base64.b64encode(doc_data.getvalue())
            }
        )

    def authorExists(self, author):
        """Check for an existing author node"""

        r = self.db.query(
            'match (a:Author) where a.name = "' + author + '" return a',
            returns=(client.Node)
        )
        return r[0][0] if (len(r) > 0) else None

    def createAuthor(self, author):
        """Create an author node"""

        an_author = self.db.nodes.create(name=author)
        self.authors.add(an_author)
        return an_author

    def keywordExists(self, keyword):
        """Check for an existing keyword node"""

        r = self.db.query(
            'match (k:Keyword) where k.name = "' + keyword + '" return k',
            returns=(client.Node)
        )
        return r[0][0] if (len(r) > 0) else None

    def createKeyword(self, keyword):
        """Create a keyword node"""

        a_keyword = self.db.nodes.create(name=keyword)
        self.keywords.add(a_keyword)
        return a_keyword
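A hedged sketch of pushing one PDF through pdfGraph, assuming the class above is importable from a module named pdf_graph (hypothetical), that a local PDF file exists, and that doc_info/doc_metadata follow the shapes createNodesAndIx() reads (a list whose first element holds Author/Title, and a dict with an optional 'pdf' -> 'Keywords' entry); the 'attachment' mapping additionally requires the legacy mapper-attachments plugin on the Elasticsearch side:

# Hypothetical module name and sample metadata shapes.
import io
from pdf_graph import pdfGraph

graph = pdfGraph()

doc_url = 'http://example.com/sample.pdf'
doc_info = [{'Author': 'Jane Doe', 'Title': 'A Sample Paper'}]
doc_metadata = {'pdf': {'Keywords': "graphs, 'search', indexing"}}
with open('sample.pdf', 'rb') as f:
    doc_data = io.BytesIO(f.read())

graph.createNodesAndIx(doc_url, doc_info, doc_metadata, doc_data)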
示例#46
0
                "as": {
                    "Organization": {
                        "name": row[5]
                    },
                    "number": row[4].replace('AS', ''),
                    "as_type": row[7],
                    "domain": row[6]
                }
            },
            "ip_range": {
                "Organization": {
                    "name": row[1],
                    "domain": row[2],
                    "type": row[3],
                },
                "num_start": num_start,
                "num_end": num_end
            }
        }

        # Register the document
        es.create(index='ipinfo_internal-0001',
                  id=str(uuid.uuid4()),
                  body=document)

        if i % 5000 == 0:
            print('{:,}'.format(i))
        i += 1

es.close()
示例#47
0
class InventoryExporter(BaseExporter):
    name = "elasticsearch aggregations exporter"

    def __init__(self, config_g):
        super().__init__(config_g)
        error_msgs = []
        self.es_hosts = config_g["es_hosts"]
        if not self.es_hosts:
            error_msgs.append("Missing es_hosts")
        if not isinstance(self.es_hosts, list):
            error_msgs.append("es_hosts must be a list")
        if error_msgs:
            raise ImproperlyConfigured("{} in {}".format(
                ", ".join(error_msgs), self.name))

    def iter_machine_snapshots(self):
        for serial_number, machine_snapshots in self.get_ms_query().fetch(
                paginate=False, for_filtering=True):
            for machine_snapshot in machine_snapshots:
                yield machine_snapshot

    def get_es_client(self):
        self._es = Elasticsearch(hosts=self.es_hosts)
        self._es_version = [
            int(i) for i in self._es.info()["version"]["number"].split(".")
        ]
        if self._es_version < [7]:
            raise ValueError(
                "Inventory exporter {} not compatible with ES < 7.0")
        # lifecycle
        _esilm = IlmClient(self._es)
        _esilm.put_lifecycle(ES_LIFECYCLE_POLICY_NAME, ES_LIFECYCLE_POLICY)
        # template
        self._es.indices.put_template(ES_TEMPLATE_NAME, ES_TEMPLATE)
        # create index
        for i in range(10):
            existing_indices = self._es.indices.get(ES_INDEX_PATTERN).keys()
            if not len(existing_indices):
                current_index_name = ES_INDEX_PATTERN.replace("*", "000001")
                try:
                    self._es.indices.create(
                        current_index_name,
                        {"aliases": {
                            ES_ALIAS: {
                                "is_write_index": True
                            }
                        }})
                except RequestError:
                    # probably race
                    pass
                else:
                    break
        return ES_ALIAS

    def run(self):
        timestamp = timezone.now().isoformat()
        index_name = self.get_es_client()
        for source in Source.objects.current_machine_snapshot_sources():
            ms_query = self.get_ms_query()
            source_d = {
                "id": source.pk,
                "module": source.module,
                "name": source.name,
                "display_name": source.get_display_name()
            }
            ms_query.force_filter(SourceFilter, hidden_value=source.pk)
            for f, f_links, _, _ in ms_query.grouping_links():
                filter_d = {"title": f.title, "slug": f.get_query_kwarg()}
                for label, f_count, _, _, _ in f_links:
                    if label == "\u2400":
                        label = "NULL"
                    elif not isinstance(label, str):
                        label = str(label)
                    doc = {
                        "source": source_d,
                        "filter": filter_d,
                        "value": label,
                        "count": f_count,
                        "@timestamp": timestamp
                    }
                    doc_id = str(uuid.uuid4())
                    self._es.create(index_name, doc_id, doc)
示例#48
0
                "properties": {
                    "text": {
                        "type": "text",
                        "analyzer": "my_analyzer"
                    }
                }
            }
        }
    })

#%% 5 Load the data to the ES index
resource_path = 'resources/ustawy'
for filename in os.listdir(resource_path):
    with open(resource_path + '/' + filename, 'r') as document:
        legislation = document.read()
        es.create("my_index", "legislation", filename, {"text": legislation})

#%% 6 number of legislative acts containing the word ustawa
es.search(index="my_index",
          doc_type="legislation",
          body={"query": {
              "match": {
                  "text": {
                      "query": "ustawa"
                  }
              }
          }})["hits"]["total"]

#%% 7 containing the words kodeks postępowania cywilnego
es.search(index="my_index",
          doc_type="legislation",
示例#49
0
class CabiAnalyzer:
    """ 
        Class to manage analyzing cabi data 
    """
    #   d2014_4 = genfromtxt(join(DATA_DIR, "2010-4th-quarter.csv"),
    #                         dtype=None, delimiter=",")

    def __init__(self, data_dir, index_name):
        """
        create a connection to ES, make an index that is specified by the
        user, scan through a list of files and begin to add a line from each
        to the database as individual documents
        :param data_dir: directory with csv files in it
        :param index_name: elasticsearch index name
        :return:
        """
        self.data_dir = data_dir
        self.data_list = list()
        self.es = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
        self.index_name = index_name
        # self.clear_elasticsearch(self.index_name)
        # Debating whether or not we should automatically delete all that data.
        self.es.indices.create(index=self.index_name, ignore=400)
        self.process_data()

    # def get_unique_stations(self):
    #     """
    #         Define unique stations by running through all the data.
    #     """
    #     pass

    def process_data(self):
        """
            run a loop against parse file and put each file into ES
        :return:
        """
        self.data_list = self.get_data_list()
        if len(self.data_list) == 0:
            print "No data to read."
        for i in xrange(0, len(self.data_list)):
            self.parse_file(i)

    def parse_file(self, file_index):
        """
        after we've gathered a list of files, we pass an index to this function
        it goes into the list and grabs that file then parses it line by line
        :param file_index: int from 0 to len(self.data_list)
        :return:
        """
        count = 0
        this_file = self.data_list[file_index]
        for line in open(this_file, 'r'):
            if count == 0:
                count += 1
                continue
            in_data = line.strip().split(",")
            seconds = timestr_to_sec(in_data[0])
            fromtime = datetime.strptime(in_data[1], '%m/%d/%Y %H:%M')
            fintime = datetime.strptime(in_data[2], '%m/%d/%Y %H:%M')
            # print in_data
            start_stn_num = re.findall("([0-9]{5})", in_data[3])[0]
            start_stn_addr = in_data[3].split("(")[0].strip()

            fin_stn_num = re.findall("([0-9]{5})", in_data[4])[0]
            fin_stn_addr = in_data[4].split("(")[0].strip()

            bike_id = in_data[5]
            user_type = in_data[6]

            add_data = {"triplength": seconds,
                        "starttime": fromtime,
                        "fintime": fintime,
                        "start_stn_num": start_stn_num,
                        "start_stn_addr": start_stn_addr,
                        "fin_stn_num": fin_stn_num,
                        "fin_stn_addr": fin_stn_addr,
                        "bike_id": bike_id,
                        "user_type": user_type,
                        "from_to_quick": start_stn_num + "_" + fin_stn_num
            }
            self.es.create(self.index_name, "rides", add_data)
            count += 1
        print "Inserted " + str(count) + " objects."

    def get_data_list(self):
        """
            Go to the target directory.  Find and return an array of files
            that we can then analyze.
        """
        ret_vals = list()
        tgt_dir = self.data_dir
        for c_file in listdir(tgt_dir):
            if isfile(join(tgt_dir, c_file)):
                if c_file[-3:].lower() == 'csv':
                    ret_vals.append(join(tgt_dir, c_file))
        return ret_vals

    def clear_elasticsearch(self, index_name):
        """
        prep for a new data entry by clearing the entire elasticsearch index
        :return:
        """
        self.es.indices.delete(index=index_name, ignore=[400, 404])
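Instantiating the class is enough to ingest a directory of CSVs, since __init__() calls process_data() itself; a minimal sketch, assuming the class above is in scope and the paths below are placeholders:

# Placeholder directory and index name; one document is created per ride row.
analyzer = CabiAnalyzer(data_dir="data/cabi", index_name="cabi-rides")

# To wipe the index before a fresh run (left commented out in __init__ above):
# analyzer.clear_elasticsearch("cabi-rides")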
示例#50
0

def read_json(filename):
    f = open(filename, 'r')
    jsonData = json.load(f, "utf-8")
    text = json.dumps(jsonData)
    f.close()
    return text, jsonData


if __name__ == "__main__":
    #count, index = load_sample()

    p_text, p_json = read_json("output/one_json_time_series_patient.json")

    es = Elasticsearch()
    index = "oa"
    doc_type = "Patient"
    i = 1

    setting = yaml.load(open('elastic_search/mapping.yaml'))
    properties = setting["mappings"]["Patient"]["properties"].keys()
    print es.create(index=index, doc_type=doc_type, body=setting)

    for p in xrange(len(p_json)):
        es.index(index=index,
                 doc_type=doc_type,
                 id=i,
                 body=p_json["%s" % p]["0"]["Plan"].split(" "))
        i += 1
示例#51
0
'''
Created on Jun 25, 2014

@author: jimhorng
'''
from elasticsearch import Elasticsearch
es = Elasticsearch(hosts=[{'host': '192.168.69.41', 'port': 9200}])

body = {
    "size": 0,
    "aggs" : {
        "test_aggr" : {
            "terms" : {
                "field" : "user_id"
            }
        }
    }
}


res = es.search(index="mongo", doc_type='device', body=body)

user_owned_nas_qty = res['aggregations']['test_aggr']['buckets']

for record in user_owned_nas_qty:
    record['user_id'] = record.pop('key')
    record['owned_nas_qty'] = record.pop('doc_count')
    es.create(index="result_temp", doc_type="user_owned_nas_qty", body=record)

print(user_owned_nas_qty)
示例#52
0
class ESData(object):
    def __init__(self):
        es_addr = get_es_address()
        self.client = Elasticsearch([es_addr])

    def _kibana_request(self, url, data):
        headers = {
            'content-type': 'application/json',
            'kbn-xsrf': True
        }
        data = json.dumps(data)
        kibana_url = settings.KIBANA_URL + url
        req = urllib2.Request(kibana_url, data, headers=headers)
        urllib2.urlopen(req)
        return req

    def _kibana_remove(self, _type, body):
        i = 0
        ids = []

        if get_es_major_version() >= 6:
            body['query']['query_string']['query'] += ' type:%s' % _type
            _type = 'doc'

        while True:
            res = self.client.search(index='.kibana', from_=i, doc_type=_type, body=body, request_cache=False)
            if len(res['hits']['hits']) == 0:
                break
            i += 10

            _ids = [hit['_id'] for hit in res['hits']['hits']]
            ids += _ids

        for _id in ids:
            self.client.delete(index='.kibana', doc_type=_type, id=_id, refresh=True)


    def _kibana_export_obj(self, dest, _type, body):
        i = 0

        dest = os.path.join(dest, _type)
        os.makedirs(dest)

        while True:
            if get_es_major_version() < 6:
                res = self.client.search(index='.kibana', from_=i, doc_type=_type, body=body)
            else:
                res = self.client.search(index='.kibana', from_=i, body=body)

            if len(res['hits']['hits']) == 0:
                break
            i += 10

            for hit in res['hits']['hits']:

                _id = hit['_id']
                filename = os.path.join(dest, _id)
                filename += '.json'

                if get_es_major_version() < 6:
                    res = self.client.get(index='.kibana', doc_type=_type, id=_id)
                else:
                    res = self.client.get(index='.kibana', doc_type='doc', id=_id)

                with open(filename, 'w') as f:
                    f.write(json.dumps(res['_source'], separators= (',', ':')))

    def kibana_export(self, full=False):
        dest = tempfile.mkdtemp()
        _types = ('search', 'visualization', 'dashboard')

        if full:
            _types = _types + ('index-pattern',)

        for _type in _types:
            if get_es_major_version() < 6:
                if full:
                    body = {'query': {'match_all': {}}}
                else:
                    body = {
                        'query': {
                            'query_string': {
                                'query': 'NOT title: SN *'
                            }
                        }
                    }
            else:
                if full:
                    body = {
                        'query': {
                            'query_string': {
                                'query': 'type: %s' % _type
                            }
                        }
                    }
                else:
                    body = {
                        'query': {
                            'query_string': {
                                'query': 'type: %s AND NOT title: SN *' % _type
                            }
                        }
                    }
            self._kibana_export_obj(dest, _type, body)

        f = tempfile.NamedTemporaryFile(delete=False)
        tar_name = 'scirius-dashboards-%s' % strftime('%Y%m%d%H%M')
        tar = tarfile.open(mode='w:bz2', fileobj=f)
        tar.add(dest, tar_name)
        tar.close()
        rmtree(dest)
        f.close()
        tar_name += '.tar.bz2'
        return tar_name, f.name

    def _create_kibana_mappings(self):
        if not self.client.indices.exists('.kibana'):
            self.client.indices.create(index='.kibana',body={ "mappings": get_kibana_mappings() })
            self.client.indices.refresh(index='.kibana')
        elif not "visualization" in str(self.client.indices.get_mapping(index='.kibana')):
            self.client.indices.delete(index='.kibana')
            self.client.indices.create(index='.kibana',body={ "mappings": get_kibana_mappings() })
            self.client.indices.refresh(index='.kibana')

    def _kibana_inject(self, _type, _file):
        with open(_file) as f:
            content = f.read()
        name = _file.rsplit('/', 1)[1]
        name = name.rsplit('.', 1)[0]
        if get_es_major_version() < 6:
            doc_type = _type
        else:
            doc_type = 'doc'
        self.client.create(index='.kibana', doc_type=doc_type, id=name, body=content, refresh=True)

    def _kibana_set_default_index(self, idx):
        if get_es_major_version() < 6:
            res = self.client.search(index='.kibana', doc_type='config', body={'query': {'match_all': {}}}, request_cache=False)
        else:
            body = {'query': {'query_string': {'query': 'type: config'}}}
            res = self.client.search(index='.kibana', doc_type='doc', body=body, request_cache=False)

        for hit in res['hits']['hits']:
            content = hit['_source']
            content['defaultIndex'] = idx

            if get_es_major_version() < 6:
                self.client.update(index='.kibana', doc_type='config', id=hit['_id'], body={'doc': content}, refresh=True)
            else:
                self.client.update(index='.kibana', doc_type='doc', id=hit['_id'], body=content, refresh=True)
        if not res['hits']['hits']:
            # No config document was found: fall back to the Kibana API or warn.
            if get_es_major_version() >= 6:
                self._kibana_request('/api/kibana/settings/defaultIndex', {'value': 'logstash-*'})
            else:
                print >> sys.stderr, "Warning: unknown ES version, not setting Kibana's defaultIndex"

    def _get_kibana_files(self, source, _type):
        files = []
        path = os.path.join(source, _type)
        if not os.path.isdir(path):
            return []
        for _file in os.listdir(path):
            if not _file.endswith('.json'):
                continue
            _file = os.path.join(path, _file)
            files.append(_file)
        return files

    def _get_kibana_subdirfiles(self, _type):
        files = []
        for _dir in os.listdir(settings.KIBANA_DASHBOARDS_PATH):
            src_path = os.path.join(settings.KIBANA_DASHBOARDS_PATH, _dir)
            if os.path.isdir(src_path):
                files += self._get_kibana_files(src_path, _type)
        return files

    def kibana_import_fileobj(self, fileobj):
        tar = tarfile.open(mode='r:bz2', fileobj=fileobj)
        tmpdir = tempfile.mkdtemp()
        tar.extractall(tmpdir)
        tar.close()

        subdirs = os.listdir(tmpdir)
        if len(subdirs) != 1:
            raise Exception('Archive does not appear to contain dashboards, visualizations or searches')
        source = os.path.join(tmpdir, subdirs[0])

        self._create_kibana_mappings()

        count = 0
        for _type in ('search', 'visualization', 'dashboard'):
            source_files = self._get_kibana_files(source, _type)
            count += len(source_files)
            for _file in source_files:
                self._kibana_inject(_type, _file)
        rmtree(tmpdir)

        if count == 0:
            raise Exception('No data loaded')

        return count

    def kibana_clear(self):
        body = {
            'query': {
                'query_string': {
                    'query': 'NOT title: SN *'
                }
            }
        }

        _types = ('search', 'visualization', 'dashboard')
        for _type in _types:
            self._kibana_remove(_type, body)

    def kibana_reset(self):

        self._create_kibana_mappings()

        if not os.path.isdir(settings.KIBANA_DASHBOARDS_PATH):
            raise Exception('Please make sure Kibana dashboards are installed at %s' % settings.KIBANA_DASHBOARDS_PATH)

        if self._get_kibana_subdirfiles('index-pattern') == []:
            raise Exception('Please make sure Kibana dashboards are installed at %s: no index-pattern found' % settings.KIBANA_DASHBOARDS_PATH)

        self._kibana_remove('dashboard', {'query': {'query_string': {'query': 'SN*'}}})
        self._kibana_remove('visualization', {'query': {'query_string': {'query': 'SN*'}}})
        self._kibana_remove('search', {'query': {'query_string': {'query': 'SN*'}}})
        self._kibana_remove('index-pattern', {'query': {'query_string': {'query': '*'}}})

        for _type in ('index-pattern', 'search', 'visualization', 'dashboard'):
            for _file in self._get_kibana_subdirfiles(_type):
                self._kibana_inject(_type, _file)

        if get_es_major_version() >= 6:
            self._kibana_request('/api/spaces/space', KIBANA6_NAMESPACE)

        self._kibana_set_default_index(u'logstash-*')

    def _get_indexes(self):
        res = self.client.indices.stats()
        indexes = res['indices'].keys()
        try:
            indexes.remove('.kibana')
        except ValueError:
            pass
        return indexes

    def es_clear(self):
        indexes = self._get_indexes()
        self.client.indices.delete(index=indexes)
        return len(indexes)

    def wait_until_up(self):
        for i in xrange(1024):
            try:
                ret = self.client.cluster.health(wait_for_status='green', request_timeout=15 * 60)
                if ret.get('status') == 'green':
                    break
                sleep(10)
            except ConnectionError:
                pass
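A brief round-trip sketch for ESData, assuming the surrounding scirius settings and helpers (get_es_address, get_kibana_mappings, ...) are available as in the original project:

# Export the current Kibana objects, then re-import them from the tarball.
esd = ESData()
tar_name, tar_path = esd.kibana_export(full=True)

with open(tar_path, 'rb') as f:
    count = esd.kibana_import_fileobj(f)
print("imported %d objects from %s" % (count, tar_name))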
示例#53
0
File: esagent.py  Project: cnwarden/mico
class ESAgent(object):
    """
    ESAgent wrapper class for ES operations
    """
    def __init__(self, *args, **kwargs):
        """
        init from settings key/value of ES
        :param args:
        :param kwargs:
        :return:
        """
        settings = kwargs['settings']
        self.agent = Elasticsearch(hosts=settings['ES_HOST'])

        for key in settings:
            if key.upper().startswith('ES_'):
                self.__setattr__(key.upper(), settings[key])

        self.indices = [self.ES_INDEX, self.ES_REF_INDEX, self.ES_TIMESERIES_INDEX, self.ES_CONFIG_INDEX]

    def initalize(self):
        for index in self.indices:
            body = ''
            if index == self.ES_INDEX:
                body = {
                    'mappings':{
                      'comment':{
                          'properties':{
                              'content':{
                                    # object
                                    'properties':{
                                        'symbol': { 'type':'string', 'index':'not_analyzed' }
                                    }
                              }
                          }
                      }
                    },
                    'aliases':{'all_xueqiu':{}}
                }
            self.agent.indices.create(index,
                                      body=body,
                                      ignore=400) #ignore indices exists

    def clean(self):
        for index in self.indices:
            self.agent.indices.delete(index=index, ignore=(400, 404))

    def exist_indices(self, index):
        return self.agent.indices.exists(index=index)

    def add_user(self, uid):
        pass

    def remove_user(self, uid):
        pass

    def get_watched_symbol(self):
        doc = self.agent.search(index=self.ES_CONFIG_INDEX,
                                doc_type='watch_symbols',
                                body={
                                    'size': 100,
                                    'query':{
                                        'match_all': {}
                                    },
                                    'sort':[
                                        {
                                            'symbol':{'order': 'asc'}
                                        }
                                    ]
                                })
        if not doc.has_key('status') and doc['hits']['total'] > 0:
            return [item['_source']['symbol'] for item in doc['hits']['hits']]
        else:
            return []

    def add_symbol_to_watch(self, symbol):
        self.agent.create(index=self.ES_CONFIG_INDEX,
                          doc_type="watch_symbols",
                          id=symbol,
                          body={
                              'symbol':symbol
                          },
                          ignore=(409))

    def remove_symbol_to_watch(self, symbol):
        self.agent.delete(index=self.ES_CONFIG_INDEX,
                          doc_type='watch_symbols',
                          id=symbol,
                          ignore=(404))

    # XueQiu Comment API
    def get_last_comment_id(self, symbol):
        doc = self.agent.search(index=self.ES_INDEX,
                          doc_type='comment',
                          body={
                              'size': 1,
                              'query': {
                                  'match':
                                      { 'content.symbol': symbol }  # not_analyzed this field like term
                              },
                              'sort': [
                                  {'content.id': {'order': 'desc'}}
                              ]
                          },
                          ignore=(400, 404))
        if not doc.has_key('status') and doc['hits']['total'] > 0:
            return doc['hits']['hits'][0]['_source']['content']['id']
        else:
            return 0

    def create_comment(self, doc):
        self.agent.create(index=self.ES_INDEX, doc_type="comment", id=doc['content']['id'], body=doc, ignore=(409))


    def update_comment_with_author(self, id, body):
        self.agent.update(index=self.ES_INDEX, doc_type='comment', id=id, body=body, ignore=(400))


    def create_reference(self, doc):
        self.agent.create(index=self.ES_REF_INDEX, doc_type='instrument', id=doc['code'], body=doc, ignore=(400))
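A minimal wiring sketch for ESAgent, assuming a settings dict that provides the ES_* keys the constructor reads (the host and index names below are placeholders; initalize() is called with the spelling used in the class above):

# Placeholder settings; every ES_* key also becomes an attribute on the agent.
settings = {
    'ES_HOST': 'localhost:9200',
    'ES_INDEX': 'xueqiu-comments',
    'ES_REF_INDEX': 'xueqiu-ref',
    'ES_TIMESERIES_INDEX': 'xueqiu-ts',
    'ES_CONFIG_INDEX': 'xueqiu-config',
}

agent = ESAgent(settings=settings)
agent.initalize()                      # creates the indices, ignoring "already exists"
agent.add_symbol_to_watch('SH600036')
print(agent.get_watched_symbol())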
示例#54
0
def main():
    # Connect to MongoDB on localhost:27017.
    client = MongoClient(port=27017)
    db = client["mydatabase"]
    highest_previous_primary_key = 1
    highest_previous_primary_key2 = 1
    mycol1 = db['tweets_test']
    mycol = db['tweets_text_sentiment']
    emoji_sentiment = db['tweets_emoji_sentiment']

    es = Elasticsearch()

    es.indices.delete(index="new_tweets5", ignore=404)
    es.indices.create(
        index="new_tweets5",
        body={
            'mappings': {
                "tweet": {
                    'properties': {
                        'text': {'type': 'text'},
                        'timestamp': {'type': 'date'},
                        'country': {'type': 'text'},
                        'textSentScore': {
                            'type': 'text',
                            'fields': {
                                'raw':{
                                    'type': 'keyword'
                                }
                            }
                        },
                        'location':{'type': "geo_point" }
                    }
                },


            },
            'settings': {
                'analysis': {
                    'analyzer': {
                        'custom_english_analyzer': {
                            'type': 'english',
                            'stopwords': ['made', '_english_']
                        }
                    }
                }
            }
        },
    )

    # es.indices.delete(index="emojitweets1", ignore=404)
    # es.indices.create(
    #     index="emojitweets1",
    #     body={
    #         'mappings': {
    #             "tweetEmoji": {
    #                 'properties': {
    #                     'timestamp': {'type': 'date'},
    #                     'emoji' : {'type': 'text'},
    #                     'country': {'type': 'text'},
    #                     'emojiSent': {'type': 'text'}
    #                 }
    #             }
    #         },
    #         'settings': {
    #             'analysis': {
    #                 'analyzer': {
    #                     'custom_english_analyzer': {
    #                         'type': 'english',
    #                         'stopwords': ['made', '_english_']
    #                     }
    #                 }
    #             }
    #         }
    #     },
    # )

    count = 0
    count2 = 0
    while True:
        cursor = mycol.find({}, no_cursor_timeout=True)
        for msg in cursor:
            # print(msg)
            count += 1
            current_primary_key = int(str(msg['_id'])[-6:],16)
            if current_primary_key > highest_previous_primary_key:
                print(count)
                action = {
                    "index": "new_tweets5",
                    "type": "tweet",
                    'text' : msg["text"],
                    'timestamp': msg["created_at"],
                    'country': msg["country"],
                    'textSentScore': msg['sentimentScoreText'],
                    'location': msg['location']
                }
                es.create(index = "new_tweets5", doc_type = "tweet", id = count, body = action)
                #print(msg["created_at"])
                highest_previous_primary_key = current_primary_key
示例#55
0
File: es.py  Project: shaybix/Raazi
                            'died': author_result[7],
                            }

                # TODO implement parent-child relationship between books and pages


                content_page = {
                                '_id': str(author_result[0]) + '-' + str(row[0]),
                                'book_id': author_result[0],
                                'page_body': row[1],
                                'volume': row[2],
                                'page_number': row[3],
                                'book': content
                                }

                response = client.create(
                                index='shamela',
                                # REVIEW the id for the page needs looking at.
                                id=str(author_result[0]) + '-' + str(row[0]),
                                body=content_page,
                                # parent='books'
                                doc_type='pages'
                                )

                # print response

            print "[" + str(count) + "] completed inserting '" + author_result[1] + "' into index"
            count = count + 1

            conn.close()
示例#56
0
class APIDatabase:
    def __init__(self, elastic_index='address-book', *args, **kwargs):
        # calls Elasticsearch() to connect to the database and creates the index for the address book if needed

        import json
        from elasticsearch import Elasticsearch
        from elasticsearch import exceptions as es_exceptions

        # hold on to the exceptions so they can be recognized in the try...except blocks later
        self.es_exceptions = es_exceptions

        # host and port information for Elasticsearch() is in a separate json file
        try:
            with open('./elastic_host_config.json') as f:
                elastic_host_info = json.load(f)
        except FileNotFoundError:
            elastic_host_info = {'host': 'localhost', 'port': 9200}

        self.database = Elasticsearch(elastic_host_info, *args, **kwargs)
        self.elastic_index = elastic_index

        # ensure the Elasticsearch index exists
        self.database.indices.create(index=elastic_index, ignore=400)

    def get_contact_by_query(self, page_size, page_num, query_string):
        # searches the data store using query_string and returns page_size entries starting on page page_num

        if page_size < 0:
            return {
                'error': 'pageSize must be a nonnegative integer',
                'status': 400
            }
        elif page_num < 0:
            return {
                'error': 'page must be a nonnegative integer',
                'status': 400
            }

        try:
            result = self.database.search(index=self.elastic_index,
                                          from_=page_num,
                                          q=query_string,
                                          size=page_size)
            return [
                contact['_source']['doc'] for contact in result['hits']['hits']
            ]
        except self.es_exceptions.RequestError as err:
            return {
                'error': err.info['error']['root_cause'][0]['reason'],
                'status': err.status_code
            }

    def get_contact_by_name(self, name):
        # returns the contact with the given name

        try:
            return self.database.get_source(index=self.elastic_index,
                                            id=name)['doc']
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}

    def create_contact(self, contact_details):
        # creates a contact with the given contact_details (which includes a name)

        try:
            self.database.create(index=self.elastic_index,
                                 id=contact_details['name'],
                                 body={'doc': contact_details})
            return {'message': 'created', 'status': 200}
        except self.es_exceptions.ConflictError:
            return {'error': 'contact already exists', 'status': 409}

    def update_contact(self, name, contact_details):
        # updates a contact with the new contact_details

        try:
            result = self.database.update(
                index=self.elastic_index,
                id=name,
                body={'doc': {
                    'doc': contact_details
                }})
            return {'message': result['result'], 'status': 200}
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}

    def delete_contact(self, name):
        # deletes the contact with the given name

        try:
            self.database.delete(index=self.elastic_index, id=name)
            return {'message': 'deleted', 'status': 200}
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}
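A short usage sketch for APIDatabase, assuming the class above is importable from a module named api_database (hypothetical) and an Elasticsearch node is reachable with the configured host settings:

# Hypothetical module name; the payload shapes match what the class stores.
from api_database import APIDatabase

db = APIDatabase(elastic_index='address-book')

print(db.create_contact({'name': 'Ada Lovelace', 'phone': '555-0100'}))
print(db.get_contact_by_name('Ada Lovelace'))
print(db.get_contact_by_query(page_size=10, page_num=0, query_string='doc.name:Ada*'))
print(db.update_contact('Ada Lovelace', {'phone': '555-0199'}))
print(db.delete_contact('Ada Lovelace'))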
示例#57
0
class ElasticSearchReporting(Report):
    """Stores report in Elasticsearch."""

    def connect(self):
        """Connect to Elasticsearch.
        @raise CuckooReportError: if unable to connect.
        """
        hosts = []
        for host in self.options.get("hosts", "127.0.0.1:9200").split(","):
            if host.strip():
                hosts.append(host.strip())

        self.index = self.options.get("index", "cuckoo")
        self.type_ = self.options.get("type", "cuckoo")

        try:
            self.es = Elasticsearch(hosts)
        except TypeError:
            raise CuckooReportError(
                "Elasticsearch connection hosts must be host:port or host"
            )
        except (ConnectionError, ConnectionTimeout) as e:
            raise CuckooReportError("Cannot connect to Elasticsearch: %s" % e)

    def do_index(self, obj):
        index = "%s-%d" % (self.index, self.task["id"])

        try:
            self.es.create(index=index, doc_type=self.type_, body=obj)
        except Exception as e:
            raise CuckooReportError(
                "Failed to save results in ElasticSearch for "
                "task #%d: %s" % (self.task["id"], e)
            )

        self.idx += 1

    def process_summary(self, results):
        """Index the behavioral summary."""
        summary = results.get("behavior", {}).get("summary")
        if summary:
            self.do_index(summary)

    def process_behavior(self, results, paginate=100):
        """Index the behavioral data."""
        for process in results.get("behavior", {}).get("processes", []):
            page, calls = 0, []
            for call in process["calls"]:
                calls.append(call)

                if len(calls) == paginate:
                    self.do_index({
                        "process": {
                            "pid": process["pid"],
                            "page": page,
                            "calls": calls,
                        },
                    })

                    page += 1
                    calls = []

            if calls:
                self.do_index({
                    "process": {
                        "pid": process["pid"],
                        "page": page,
                        "calls": calls,
                    },
                })

    def run(self, results):
        """Index the Cuckoo report into ElasticSearch.
        @param results: analysis results dictionary.
        @raise CuckooReportError: if the connection or reporting failed.
        """
        if not HAVE_ELASTIC:
            raise CuckooDependencyError(
                "Unable to import elasticsearch (install with "
                "`pip install elasticsearch`)"
            )

        self.connect()
        self.idx = 0

        # Index the summary.
        self.process_summary(results)

        # Index the API calls.
        if self.options.get("calls", True):
            self.process_behavior(results)
示例#58
0
""" 你的 APPID AK SK """
APP_ID = '23771318'
API_KEY = '0wqWkZ0Ww50uz8hZu5G3WEgG'
SECRET_KEY = 'bPmhDxHDZQZb0GzGQoHWEE9QjGYhGta6'

client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

i = open('denggao.png', 'rb')
img = i.read()
message = client.basicGeneral(img)
for i in message.get('words_result'):
    print(i.get('words'))
# Store the recognition result in a variable
a = i.get('words')
# Generate a UUID
import datetime
import string
import random
from uuid import *

nod_uuid = lambda x: str(
    uuid5(
        NAMESPACE_X500,
        str(x) + str(datetime.datetime.now()) + ''.join(
            random.sample(string.ascii_letters + string.digits, 8))))

print(nod_uuid(''))

# Create the document in the index (es is assumed here to be a local Elasticsearch client)
from elasticsearch import Elasticsearch
es = Elasticsearch()
print(es.create(index='a1', id=nod_uuid(''), body={"内容": a}))
示例#59
0
import sys, json, yaml
from elasticsearch import Elasticsearch

def read_json(filename):
    f = open(filename, 'r')
    jsonData = json.load(f, "utf-8")
    text = json.dumps(jsonData)
    f.close()
    return text, jsonData


if __name__ == "__main__":
    #count, index = load_sample()

    p_text, p_json = read_json("output/one_json_time_series_patient.json")

    es = Elasticsearch()
    index = "oa"
    doc_type = "Patient"
    i = 1

    setting = yaml.load(open('elastic_search/mapping.yaml'))
    properties = setting["mappings"]["Patient"]["properties"].keys()
    print es.create(index=index, doc_type=doc_type, body=setting)

    for p in xrange(len(p_json)):
        es.index(index=index, doc_type=doc_type, id=i,
                 body=p_json["%s" % p]["0"]["Plan"].split(" "))
        i += 1
示例#60
0
def splitDataset():
    es = Elasticsearch(timeout=60)

    #Find all the tweets from the trump index
    authors = es.search(index="trump",
                        size=100000,
                        from_=0,
                        _source_include="user",
                        body={"query": {
                            "match_all": {}
                        }})

    #Iterate through the authors of the tweets
    clicker = 0
    for hit in authors['hits']['hits']:
        source = hit["_source"]
        author = source["user"]

        #Find 2 tweets by the same author
        tweets = es.search(index="trump",
                           size=2,
                           body={"query": {
                               "term": {
                                   "user": author
                               }
                           }})

        count = tweets['hits']['total']
        if count >= 6:

            #Create two different tweets
            t1 = tweets['hits']['hits'][0]
            t2 = tweets['hits']['hits'][1]

            #Insert tweet 1 into the 140set index
            op1 = es.create(id=t1['_id'],
                            index="140set",
                            doc_type="doc",
                            body={
                                'author': t1['_source']['user'],
                                'message': t1['_source']['message']
                            },
                            ignore=[403, 409])

            #Insert tweet 2 (including the message of t1) into the 280set index
            op2 = es.create(id=t2['_id'],
                            index="280set",
                            doc_type="doc",
                            body={
                                'author':
                                t2['_source']['user'],
                                'message':
                                t1['_source']['message'] + ' ' +
                                t2['_source']['message']
                            },
                            ignore=[403, 409])

            #Every 100 tweets generated, print a message
            clicker += 1
            if (clicker % 100) == 0:
                print("Generating tweet sets...")


#splitDataset()