Example #1
class NewsDelInfo(object):

    def __init__(self):
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, ignore=404)

    def es_ping(self):
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR, ignore=404)

    def run(self):
        while True:
            rowkey = self.redis_con.get_yy_rowkey("es:news:del:info")
            _id = trans_md5(rowkey)
            self.es_ping()
            try:
                boo = self.es.exists(index="xw_info",doc_type="sino",id=_id)
                if boo:
                    self.es.delete(index="xw_info",doc_type="sino",id=_id)
            except Exception as e:
                log_info = "news info delete error %s" %str(e)
                logging.error(log_info)
                boo = self.es.exists(index="xw_info", doc_type="sino", id=_id)
                if boo:
                    self.es.delete(index="xw_info", doc_type="sino", id=_id)
def test_elkhost(eshost):
    try:
        es_conn = Elasticsearch(eshost)
        es_conn.exists(index="logeureka-*", doc_type="_all", id=1)
    except:
        print("ELK host not ready", eshost)
        print("Error:",sys.exc_info()[0])        
        sys.exit(3)
def deleteESItem(elasticsearchDomain, documentId):
    host = elasticsearchDomain

    if (documentId):
        service = 'es'
        ss = boto3.Session()
        credentials = ss.get_credentials()
        region = ss.region_name

        awsauth = AWS4Auth(credentials.access_key,
                           credentials.secret_key,
                           region,
                           service,
                           session_token=credentials.token)

        es = Elasticsearch(hosts=[{
            'host': host,
            'port': 443
        }],
                           http_auth=awsauth,
                           use_ssl=True,
                           verify_certs=True,
                           connection_class=RequestsHttpConnection)

        if es.exists(index="textract", doc_type="document", id=documentId):
            es.delete(index="textract", doc_type="document", id=documentId)
            print("Deleted document: {}".format(documentId))
def validate_split(es: Elasticsearch, resolution_updates: Dict[str, Any],
                   config: dict) -> None:
    prev_res_num = 0
    prev_para_num = None
    for res in resolution_updates['resolutions']:
        res_num = int(res.metadata['id'].split('resolution-')[1])
        if res_num != prev_res_num + 1:
            raise ValueError(
                f'invalid sequence of resolution numbers: prev num {prev_res_num}, curr num: {res_num}'
            )
        prev_res_num = res_num
        for para in res.paragraphs:
            para_num = int(para.metadata['id'].split('-para-')[1])
            if prev_para_num is None:
                pass
            elif para_num != prev_para_num + 1:
                raise ValueError(
                    f'invalid sequence of paragraph numbers: prev num {prev_para_num}, curr num: {para_num}'
                )
            prev_para_num = para_num
    for match in resolution_updates['remove_matches']:
        match_id = make_hash_id(match)
        if not es.exists(index=config['phrase_match_index'], id=match_id):
            message = f'unknown phrase match id {match_id} (text id: {match.text_id}), phrase match cannot be removed'
            raise ValueError(message)
    return None
Example #5
def annotate(config, documentId):
  if "getPosTags" in config and config["getPosTags"] == False: return
  esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
  corpusIndex = config["corpus"]["index"]
  corpusType = config["corpus"]["type"]
  corpusFields = config["corpus"]["text_fields"]
  processorIndex = config["processor"]["index"]
  processorType = config["processor"]["type"]
  document = esClient.get(index=corpusIndex, doc_type=corpusType, id = documentId, fields=corpusFields)
  content = ""
  if "fields" in document:
    for field in corpusFields:
      if field in document["fields"]:
        if type(document["fields"][field]) is list:
          for element in document["fields"][field]:
            content += element + ". "
        else:
          content += document["fields"][field] + ". "
      
  annotatedDocument = {}
  sentences = nltk.sent_tokenize(content)
  posTaggedSentences = []
  for sentence in sentences:
    sentence = sentence.strip()
    if len(sentence) > 1:
      sentence = sentence.replace("-", " ")
      sentenceWords = nltk.word_tokenize(sentence.lower())
      sentenceWords = map(lambda x: x.replace(".", ""), sentenceWords)
      posTags = nltk.pos_tag(sentenceWords)
      posTaggedSentences.append(posTags)
  if esClient.exists(index=processorIndex, doc_type=processorType, id=document["_id"]):
    annotatedDocument = esClient.get(index=processorIndex, doc_type=processorType, id=document["_id"])["_source"]
  annotatedDocument["pos_tagged_sentences"] = posTaggedSentences
  esClient.index(index=processorIndex, doc_type=processorType, id=document["_id"], body=annotatedDocument)
  config["logger"].info("pos-processor: Annotated document '" + document["_id"] + "'")
Example #6
class ElasticSearch(object):
    def check_node_status(self):
        res = requests.get('http://localhost:9200')
        if res.status_code == 200:
            return (res.content)
        return None

    def connect_es(self):
        self.es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

    def add_document(self, id, data):
        self.es.index(index='green_bond', doc_type='report', id=id, body=data)

    def update_document(self, index, doc, id, data):
        self.es.delete(index=index, doc_type=doc, id=id)
        self.add_document(id, data)

    def check_document_exists(self, index, doc, id):
        return self.es.exists(index=index, doc_type=doc, id=id)

    def find_document(self, index, doc, id):
        return self.es.get(index=index, doc_type=doc, id=id)

    def get_all_document(self, index, doc):
        return self.es.search(index=index, doc_type=doc, size=1000, pretty=1)
    def get(self, job_listing_id):
        print("Request for job listing with id: " + job_listing_id)

        es = Elasticsearch(hosts=["elastic"])
        if (es.exists(index='joblistings',
                      doc_type='job-listing',
                      id=job_listing_id)):
            print('Found the document in ElasticSearch')
            doc = es.get(index='joblistings',
                         doc_type='job-listing',
                         id=job_listing_id)
            return doc['_source']

        print('Not found in ElasticSearch, trying a scrape')
        with ClusterRpcProxy(CONFIG) as rpc:
            listing = rpc.stack_overflow_job_listings_scraping_microservice.get_job_listing_info(
                job_listing_id)
            print(
                "Microservice returned with a result - storing in ElasticSearch"
            )
            es.index(index='joblistings',
                     doc_type='job-listing',
                     id=job_listing_id,
                     body=listing)
            return listing
Example #8
def setup_index():
    """Sets up index and mapping if needed"""
    try:
        es = Elasticsearch(get_elasticsearch_endpoint()).indices

        # create index with mapping if needed
        if not es.exists(index=get_elasticsearch_index()):
            es.create(
                index=get_elasticsearch_index(),
                body='{"mappings": {"properties": {"filter": {' +
                '"type":  "keyword"},"autocomplete": {"type": "completion"' +
                ',"contexts": [{"name": "filter","type": "category","path":' +
                ' "filter"}]}}}}')
        # add mapping if needed
        elif len(
                es.get_field_mapping(fields='autocomplete',
                                     index=get_elasticsearch_index())
            [get_elasticsearch_index()]['mappings']) == 0:
            es.put_mapping(
                index=get_elasticsearch_index(),
                body='{"mappings": {"properties": {"filter": {' +
                '"type":  "keyword"},"autocomplete": {"type": "completion"' +
                ',"contexts": [{"name": "filter","type": "category","path":' +
                ' "filter"}]}}}}')

        # continue workflow
        check_records_to_load_into_xse()
    except Exception as e:
        logging.error(e)
        raise SystemExit('Exiting! Connection error with elastic search')
Example #9
def __check_botometer(screen_name:str):
    """Calls ES API to get cached BotOMeter API scoring about a screen name if is a Bot or Not
    
    Arguments:
        screen_name {str} -- Twitter Screen Name
    
    Returns:
        [dict] -- BotOMeter API Response
    """    
    settings = Settings()

    es = Elasticsearch(settings.ELASTICSEARCH_URL)

    logger.debug("Checking ES Botometer Info 🤖: %s" % screen_name)
    if es.exists(index=settings.ELASTICSEARCH_BOT_INDEX, doc_type='res', id=screen_name):
        logger.debug("Botometer Info 🤖found for: %s" % screen_name)
        res = es.get(index=settings.ELASTICSEARCH_BOT_INDEX, doc_type='res', id=screen_name)
        delta = arrow.utcnow() - arrow.get(res['_source']['updated_at'])
        
        if MAX_DELTA_BOTOMETER < delta.total_seconds():
            logger.debug("Deprecated ES Botometer Info 🤖: %s" % screen_name)
            return False
        else:
            logger.debug("Found ES Botometer Info 🤖: %s" % screen_name)
            return res['_source']
    else:
        logger.debug("NOT Found ES Botometer Info 🤖: %s" % screen_name)
        return False
Example #10
def load_answers(es: Elasticsearch, data_f):
    with open(data_f, 'r', encoding="utf8") as csvfile:  # open data file
        reader = csv.reader(csvfile, delimiter=',',
                            quotechar='"')  # setup reader with delimiter
        for row in reader:  # loop over rows in csv
            if len(row) != 8:  # check if row is valid
                continue
            try:  # try to setup the data for Elasticsearch, if it fails just continue with the next row
                index = int(row[0])
                if es.exists(
                        'goeievraag', 'answers', index
                ):  # if data already loaded into elasticsearch -> continue
                    continue
                data = {  # create json payload
                    "answerId": index,
                    "date": datetime.datetime.strptime(row[1],
                                                       '%Y-%m-%d %H:%M:%S'),
                    "userId": int(row[2]),
                    "questionId": int(row[3]),
                    "answer": str(row[4]),
                    "thumbsDown": int(row[5]),
                    "thumbsUp": int(row[6]),
                    "isBestAnswer": to_boolean(row[7])
                }
            except ValueError:
                print('Invalid answer',
                      row[0])  # log to command line which answers are invalid
                continue
            es.create('goeievraag', 'answers', index,
                      data)  # send data to elasticsearch
Example #11
File: ingest.py Project: Capitains/Flint
class ElasticSearch(Endpoint):
    """ ElasticSearch Endpoint implementation

    :param url: URL of the Endpoint
    :type url: str
    :param auth: Authentication information
    :type auth: (str, str)
    :param port: Port of the endpoint
    :type port: int
    """

    def register(self):
        """ Register the endpoint with init resources
        :return: Endpoint
        """
        self.endpoint = ES(self.url, auth=self.auth, port=self.port)
        return self.endpoint

    def create(self, name, settings):
        """ Create an index

        :param name: Name of the index
        :param settings: Setting for the index
        :return: Bool
        """
        return self.endpoint.create(name, body=settings)

    def exists(self, name):
        """ Check if an index exists

        :param name: Name of the index to be created
        :return: Indication of existence as boolean
        :rtype: Bool
        """
        return self.endpoint.exists(name)
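
A minimal usage sketch for the Endpoint wrapper above (not part of the original example); the constructor arguments are assumptions based on the class docstring, and the index name and settings are made up:

# Hypothetical wiring: assumes the Endpoint base class stores url, auth and port
# as described in the docstring.
endpoint = ElasticSearch(url="localhost", auth=("user", "secret"), port=9200)
endpoint.register()                     # builds the underlying ES client
if not endpoint.exists("texts"):        # index-level existence check
    endpoint.create("texts", settings={"settings": {"number_of_shards": 1}})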
Example #12
class DB:

    doc_type = 'article'

    def __init__(self, host='localhost', port=9200, index_name='articles'):
        self.es = Elasticsearch([{'host': host, 'port': port}])
        self.index_name = index_name
        self.logger = logging.getLogger(__name__)
        self.es.ping()
        self.create_index_if_not_exists()

    def create_index_if_not_exists(self):
        if self.es.indices.exists(self.index_name):
            return
        configuration = {
            "mappings": {
                self.doc_type: get_resource('elasticsearch/index_mapping.json')
            }
        }
        self.es.indices.create(index=self.index_name, body=configuration)
        self.logger.debug("Elasticsearch index %s created." % self.index_name)

    def index_document(self, document, id_):
        self.es.create(body=document,
                       id=id_,
                       index=self.index_name,
                       doc_type=self.doc_type)

    def id_exists(self, id_):
        return self.es.exists(id=id_,
                              index=self.index_name,
                              doc_type=self.doc_type)
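
A minimal usage sketch for the DB class above (not part of the original example); it assumes a local Elasticsearch node and that get_resource can load elasticsearch/index_mapping.json:

db = DB(host='localhost', port=9200, index_name='articles')
doc_id = 'article-1'  # hypothetical document id
if not db.id_exists(doc_id):
    db.index_document({'title': 'Hello', 'body': 'First article'}, doc_id)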
Example #13
def index_tweets():
    es = Elasticsearch(["http://mixednode1:9200"], use_ssl=False)
    inputs = glob("filtered/*/*.json")
    logging.info(inputs)
    for filename in inputs:
        logging.info("going to index %s" % filename)
        with open(filename, 'r') as input:
            docs = json.loads(input.read())
            if len(docs) == 0:
                continue
            if es.exists(index="tweets", id=docs[0]["id"]):
                logging.info("Skipping")
                continue
            total = len(docs)
            i = 0
            index_doc = []
            for doc in docs:
                #es.index(index="tweets", doc_type="opinion", id=doc["id"], body=doc)
                i += 1
                doc["created_at"] = parse_date(doc["created_at"])
                index_doc.append({
                    "_index": "tweets",
                    "_type": "tweet",
                    "_id": doc["id"],
                    "_source": doc
                })
                if len(index_doc) == 500:
                    logging.info("indexing %s/%s" % (i, total))
                    helpers.bulk(es, index_doc)
                    index_doc = []
            if len(index_doc) != 0:
                logging.info("indexing %s/%s" % (i, total))
                helpers.bulk(es, index_doc)
Example #15
def get_rdap_asn(asn):
    es = Elasticsearch()
    does_exist = es.exists(index='whois', doc_type='asn_rdap', id=asn)
    print does_exist
    if does_exist is True:
        status = 200
        print "Found it!"
        get_record = es.get(index='rdap', doc_type='asn', id=asn)
        results = jsonify(get_record['_source'])
    else:
        try:
            url = 'http://hailey.opendnsbl.net:8080/rdapbootstrap/autnum/%s' % asn
            r = requests.get(url)
            status = 200
            b = r.json()
            #c = json.loads(b)
            #d = c['entities']
            #print d
            #e = json.dumps(c)
            #es.index(index='rwhois', doc_type='asn', id=asn, body=json.dumps(b))
            results = jsonify(b)
        except Exception as e:
            print e
            results_raw = jsonify({'status': "not_found"})
            status = 404
            results = jsonify({'status': "not_found"})
    return results, status
Example #17
def retrieve_resolution_by_id(es: Elasticsearch, resolution_id: str,
                              config: dict) -> Union[Resolution, None]:
    if es.exists(index=config['resolution_index'], id=resolution_id):
        response = es.get(index=config['resolution_index'], id=resolution_id)
        return json_to_republic_resolution(response['_source'])
    else:
        return None
Example #18
class ElasticsearchService(object):
    def __init__(self, host, port):
        self._es = Elasticsearch([{'host': host, 'port': port}])

    def search(self, *args, **kwargs):
        return self._es.search(*args, **kwargs)

    def create(self, *args, **kwargs):
        return self._es.create(*args, **kwargs)

    def get(self, *args, **kwargs):
        return self._es.get(*args, **kwargs)

    def exists(self, *args, **kwargs):
        return self._es.exists(*args, **kwargs)

    def msearch(self, *args, **kwargs):
        return self._es.msearch(*args, **kwargs)

    def index(self, *args, **kwargs):
        return self._es.index(*args, **kwargs)

    def update(self, *args, **kwargs):
        return self._es.update(*args, **kwargs)

    def delete(self, *args, **kwargs):
        return self._es.delete(*args, **kwargs)

    def put_template(self, *args, **kwargs):
        return self._es.indices.put_template(*args, **kwargs)
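
A minimal usage sketch for the ElasticsearchService wrapper above (not part of the original example); it assumes a local node and an Elasticsearch 7.x-style client (no doc_type), with a made-up index name and document:

svc = ElasticsearchService('localhost', 9200)
if not svc.exists(index='articles', id='42'):
    svc.index(index='articles', id='42', body={'title': 'Hello'})
print(svc.get(index='articles', id='42')['_source'])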
def lambda_handler(event, context):
	id = event['id']
	user = event['user']
	ask = event['ask']

	host = os.environ["NAME_ES_DOMAIN"]

	if "https" in host :
		es = Elasticsearch(
	        [host],
	        use_ssl=True,
	        verify_certs=True,
	        connection_class=RequestsHttpConnection
	    )
	else:
		es = Elasticsearch([host])

	user['id'] = id

	datas = {
		"text": ask['text'],
		"tags": ask['tags'],
		"lang": user['lang'] if "lang" in user else "FR",
		"user": user,
		"date": datetime.date.today(),
		"timestamp": int(time.time())
	}

	if not es.exists(index="questions", doc_type="question", id=int(ask['id'])):
		es.index(index="questions", doc_type="question", id=int(ask['id']), body=datas)

	return True
Example #20
class PeragroClient():
    """
    An audio search client
    """
    def __init__(self):
        """
        initialize client object with elasticsearch object
        """
        self.es = Elasticsearch()

    def set_index(self, index):
        """
        set index for to lookup in elasticsearch

        Input:
            -index: an elasticsearch index
        """
        self.index = index

    def get_sound(self, id_):
        """
        Get sound by its id

        input:
            -id: id of sound

        output:
            -sound: sound details if it exists otherwise None

        Usage:

        >>> id = "X2VFAB12GH"
        >>> sound = c.get_sound(id)
        """
        if self.es.exists(index=self.index, doc_type='_all', id=id_):
            res = self.es.get(index=self.index, id=id_)
            return res
        else:
            return None

    def text_search(self, query):
        """
        Get sound results based on text query.
        It also has support for field queries.

        Usage:

        >>> query = "tum hi ho"
        >>> sounds = c.text_search(query)

        >>> # OR field query
        >>> query = "tags:'interscope' genre:'hip hop'"
        >>> sounds = c.text_search(query)
        """
        # print self.index
        # print self.es.search(index=self.index)
        res = self.es.search(index=self.index, q=query)
        print("Got %d Hits:" % res['hits']['total'])
        return res
Example #21
class EventoConsumer(AbstractBaseConsumer):

    """
        consumes the messages in cc-zem (json structure based on data of dalite repository)
        and should create on the fly the elasticsearch doc for the zem index
        aim: make it rather simple and usable for prototyping the zem frontend
    """

    def createDoc(self, message):
        course = json.loads(message)

        transformations = EventoESTransformation(course, self.edu_utilities)
        transformations.set_configuration(self.configuration.configuration)
        transformations.make_structure()
        result = transformations.es_structure

        return result


    def __init__(self, config_path: str, configrepshare: str = None, **kwargs):
        super().__init__(config_path, ConsumerConfig, **kwargs)
        self._initialize()


    def _initialize(self):
        if self.configuration["ES"]["active"]:
            self.es = Elasticsearch((self.configuration["ES"]["hosts"]).split("#"),
                                    index=self.configuration["ES"]["index"])
            self.indexClient = self.es.indices
            self.dI = index = self.configuration["ES"]["index"]

        self.all_docs = []
        self.edu_utilities = EduplatformUtilities(self.configuration.configuration)


    def _index_doc(self,key, message):
        if self.configuration["ES"]["active"]:
            doc = self.createDoc(message)
            # bug in update: https://github.com/elastic/elasticsearch/issues/41625
            #response = self.es.update(index="zem",id=key,body=doc) if self.es.exists(index="zem",id=key) else self.es.create(index="zem",id=key,body=doc)
            if not self.es.exists(index=self.dI, id=doc["id"]):
                response = self.es.create(index=self.dI, id=doc["id"], body=doc)

    def _append_doc_for_dump(self,key, message):
        self.all_docs.append(json.loads(message))



    def process(self):

        #test = self.indexClient.get_mapping(index=self.dI)
        message = next(self._consumer,None)

        while (message is not None):
            value = message.value.decode('utf-8')
            key = message.key.decode('utf-8')
            self._index_doc(key, value)
            #self._append_doc_for_dump(key,value)
            message = next(self._consumer,None)
Example #22
    def populate(self):
        if self.download():
            es = Elasticsearch(self.es_url)

            f = open('%s/%s' % (self.assests_dir, self.l8_metadata_filename),
                     'r')
            # Read the first line for all the headers
            headers = f.readline().split(',')

            # Read the rest of the document
            rows = f.readlines()
            added_counter = 0
            skipped_counter = 0
            for row in rows:
                fields = row.split(',')
                obj = {}
                for header in headers:
                    try:
                        obj[header.replace('\n', '')] = float(
                            fields[headers.index(header)].replace('\n', ''))
                    except ValueError:
                        obj[header.replace(
                            '\n', '')] = fields[headers.index(header)].replace(
                                '\n', '')
                try:
                    if not es.exists(index=self.es_main_index,
                                     doc_type=self.es_main_type,
                                     id=obj['sceneID']):
                        es.create(index=self.es_main_index,
                                  doc_type=self.es_main_type,
                                  id=obj['sceneID'],
                                  body=json.dumps(obj),
                                  ignore=409)
                        # print('%s-%s created' % (counter, obj['sceneID']))
                        added_counter += 1
                        print('%s new records added' % added_counter, end='\r')
                    else:
                        skipped_counter += 1

                    # New meta data is added to the top of the document.
                    # When the script starts to see existing records, it means
                    # that all new records are added and it's safe to break
                    # the loop.
                    if skipped_counter > 10:
                        break

                except ConnectionError:
                    print('There was a connection error. Check your Elastic' +
                          ' Search setting and make sure Elastic Search is' +
                          'running.')
                    return False
                except:
                    print('An expected error: %s' % (sys.exc_info()[0]))
                    return False

            print('The update is completed. %s new records were added.' %
                  added_counter)

            return True
Example #23
def _update_status_ES(status_id: int, json_data: dict):
    settings = Settings()
    es = Elasticsearch(settings.ELASTICSEARCH_URL)

    if es.exists(index=settings.ELASTICSEARCH_STATUS_INDEX, doc_type='status', id=status_id):
        _index_status_ES(status_id, json_data)
    else:
        return None
Example #24
class SyncElasticSearch(object):
    host = settings.NAME_ES_DOMAIN
    index = None
    doc_type = None

    def __init__(self, id):
        self.id = id
        if "https" in self.host:
            self.es = Elasticsearch([self.host],
                                    use_ssl=True,
                                    verify_certs=True,
                                    connection_class=RequestsHttpConnection)
        else:
            self.es = Elasticsearch([self.host])

    def transform_user_to_dict(self, instance):
        return {
            "first_name": instance.first_name,
            "last_name": instance.last_name,
            "id": instance.id,
            "username": instance.username,
            "photo": str(instance.photo.url)
        }

    def create(self):
        self.es.index(index=self.index,
                      doc_type=self.doc_type,
                      id=int(self.id),
                      body=self.object)

    def delete(self):
        if self.es.exists(index=self.index,
                          doc_type=self.doc_type,
                          id=int(self.id)):
            self.es.delete(index=self.index,
                           doc_type=self.doc_type,
                           id=int(self.id))

    def update(self):
        if self.es.exists(index=self.index,
                          doc_type=self.doc_type,
                          id=int(self.id)):
            self.es.update(index=self.index,
                           doc_type=self.doc_type,
                           id=int(self.id),
                           body={"doc": self.object})
Example #25
    def process_item(self, item, spider):
        # Get the sentiment classification
        q = lstm_predict(item['content'])
        # q = 0
        es2 = Elasticsearch(hosts=['192.168.3.15'])
        # Store in MySQL and also in ES
        res2 = es2.exists(index="spider",
                          doc_type='article',
                          id=item['article_id'])
        if res2 is not True:
            try:
                # Insert the data
                self.cursor.execute(
                    "INSERT INTO weibo (id,article_id,content,url,media,publish_time,create_time,qinggan,comm_num,read_num,fav_num,env_num,user_id,user_name) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    ('0', str(item['article_id']), str(item['content']),
                     str(item['url']), "微博", item['time'], item['create_time'],
                     str(q), str(item['comm_num']), str(item['read_num']),
                     str(item['fav_num']), str(item['env_num']),
                     str(item['user_id']), item['user_name']))
                self.connect.commit()
                print('mysql: one row inserted successfully')
            except Exception as e:
                # Print an error log when something goes wrong
                print('mysql error', e)

            # Store the data in ES
            es = connections.create_connection(Sina_type._doc_type.using)
            try:
                print("04%" * 30)
                art = Sina_type()
                content = ''.join(item['content']).replace(
                    u'\u3000',
                    u' ').replace(u'\xa0',
                                  u' ').replace(u'\\u3000',
                                                u' ').replace(u'\\xa0', u' ')
                art.content = remove_tags(content)
                art.media = '微博'
                art.publish_time = item['time']
                art.create_time = item['create_time']
                art.url = item['url']
                art.qinggan = q
                art.comm_num = int(item['comm_num'])
                art.read_num = int(item['read_num'])
                art.fav_num = int(item['fav_num'])
                art.env_num = int(item['env_num'])
                art.hot_value = int(item['comm_num']) + int(
                    item['read_num']) + int(item['fav_num']) + int(
                        item['env_num'])
                art.user_id = item['user_id']
                art.user_name = item['user_name']
                art.meta.id = item['article_id']

                art.save()
                print("elasticsearch 存入一条数据", item['article_id'])
            except Exception as e:
                print(e)
                print("03" * 30)
        return item
Example #26
    def connect(self, host, port, wait=5):
        """
        Tries to connect to the given elasticsearch
        database
        Returns True on success, False otherwise
        """
        start = time.time()
        while True:
            try:
                es = Elasticsearch(hosts=host, port=port)
                es.exists(index="test", id=1)
            except exceptions.ConnectionError:
                if time.time() - start > wait:
                    break
            else:
                self.es = es
                return True
        return False
Example #27
def get_meeting_by_date(es: Elasticsearch,
                        date: RepublicDate) -> Union[None, Meeting]:
    # pre-session ID for old index
    doc_id = f'meeting-{date.isoformat()}-session-1'
    if es.exists(index=index, doc_type=doc_type, id=doc_id):
        response = es.get(index=index, doc_type=doc_type, id=doc_id)
        return session_from_json(response['_source'])
    else:
        return None
Example #28
def get_whois_ip(ip, refresh=None):
    es = Elasticsearch()
    print repr(ip)
    id_num = str(ip).replace(".", "0")
    does_exist = es.exists(index='rwhois2', doc_type='ipaddr', id=id_num)
    print does_exist
    if does_exist is True and refresh is None:
        status = 200
        print "Found it!"
        get_record = es.get(index='rwhois2', doc_type='ipaddr', id=id_num)
        results = jsonify(get_record['_source'])
    elif does_exist is True and refresh is not None:
        status = 200
        print "Forcing refresh!"
        es.delete(index='rwhois2', doc_type='ipaddr', id=id_num)
        try:
            ipwhois.net.socks.setdefaultproxy(ipwhois.net.socks.SOCKS5, "localhost")
            obj = IPWhois(ip)
            try:
                results_raw = obj.lookup_whois(get_referral=True, inc_nir=True)
            except:
                results_raw = obj.lookup_whois()
            status = 200
            results = jsonify(results_raw)
            es.index(index='rwhois2', doc_type='ipaddr', id=id_num, body=results_raw)
        except Exception as e:
            print e
            results = jsonify({'status': "not_found"})
            status = 404
    else:
        try:
            obj = IPWhois(ip)
            try:
                results_raw = obj.lookup_whois(get_referral=True)
            except:
                results_raw = obj.lookup_whois()
            status = 200
            results = jsonify(results_raw)
            id_num = str(ip).replace(".", "0")
            print results
            try:
                es.index(index='rwhois2', doc_type='ipaddr', id=id_num, body=results_raw)
            except Exception as e:
                print "Elasticsearch encountered a problem ", e
        except Exception as e:
            #print results_raw
            print e
            results_raw = jsonify({'status': "not_found"})
            status = 404
            results = jsonify({'status': "not_found"})
    return results, status
def get_inventory_metadata(es: Elasticsearch, inventory_num: int, config: dict) -> Union[dict, None]:
    if not es.exists(index=config["inventory_index"],
                     doc_type=config["inventory_doc_type"],
                     id=inventory_num):
        return None
    response = es.get(index=config["inventory_index"],
                      doc_type=config["inventory_doc_type"],
                      id=inventory_num)
    return response["_source"]
Example #30
    def populate(self):
        if self.download():
            es = Elasticsearch(self.es_url)

            f = open('%s/%s' % (self.assests_dir, self.l8_metadata_filename),
                     'r')

            # Read the first line for all the headers
            headers = f.readline().split(',')

            # Read the rest of the document
            rows = f.readlines()
            added_counter = 0
            skipped_counter = 0
            for row in rows:
                fields = row.split(',')
                obj = {}
                for header in headers:
                    try:
                        obj[header.replace('\n', '')] = float(fields[
                            headers.index(header)].replace('\n', ''))
                    except ValueError:
                        obj[header.replace('\n', '')] = fields[
                            headers.index(header)].replace('\n', '')
                try:
                    if not es.exists(
                            index=self.es_main_index,
                            doc_type=self.es_main_type,
                            id=obj['sceneID']):
                        es.create(
                            index=self.es_main_index,
                            doc_type=self.es_main_type,
                            id=obj['sceneID'],
                            body=json.dumps(obj),
                            ignore=409)
                        # print('%s-%s created' % (counter, obj['sceneID']))
                        added_counter += 1

                    else:
                        skipped_counter += 1

                    print('%s added | %s skipped' % (added_counter, skipped_counter), end='\r')

                except ConnectionError:
                    print('There was a connection error. Check your Elastic' +
                          ' Search setting and make sure Elastic Search is' +
                          'running.')
                    return False
                except:
                    print('An expected error: %s' % (sys.exc_info()[0]))
                    return False

            print('The update is completed. %s new records were added.' %
                  added_counter)

            return True
Example #31
class send_data:

    def __init__(self):
        self.es = Elasticsearch([{'host': 'localhost', 'port': '9200'}])

    def read_from_file(self, text_file):
        # counter = 0

        with open(text_file, 'r') as file_read:
            next(file_read)

            while True:
                line = file_read.readline()
                # line = line.strip('\n')
                record = line.split('###')
                # print(record)

                self.send_to_index(record)

                # if not line or counter == 5:
                #     break
                # counter += 1
                if not line:
                    break

    """
    Sends data to ElasticSearch for indexing
    """
    def send_to_index(self,a_webpage):

        webpage_id = int(a_webpage[1])
        img_link = a_webpage[2]
        web_link = a_webpage[3]
        webpage_title = a_webpage[4]
        web_content = a_webpage[5]

        # print(webpage_id,'\t', img_link,'\t', web_link,'\t', webpage_title)
        if not self.es.exists(index="final_kellyhe", id=webpage_id):
            doc = {'title': webpage_title, 'content': web_content, 'image': img_link, 'link':web_link}
            self.es.index(index='final_kellyhe', id=webpage_id, body=doc)
            print('\tSuccess! ID is:  ', webpage_id)
        else:
            print('Duplicate webpage id exist! ID is: ', webpage_id, webpage_title, web_link)
            exit()

    """
    Deletes an index
    """
    def delete_an_index(self, index_name):

        if self.es.indices.exists(index= index_name):
            print('index name exists!')
            self.es.indices.delete(index=index_name)
            print('index deleted!')
        else:
            print('index name NOT exists!')
Example #32
def check_id_in_es(es: Elasticsearch, index: str, id: str):
    """
    Check if the news has already been indexed in ElasticSearch

    :param es: ElasticSearch connection
    :param index: index in elastic
    :param id: url of the news
    :return: true if already exists, false otherwise
    """
    return es.exists(index, id)
Example #33
def retrieve_page_by_id(es: Elasticsearch, page_id: str,
                        config) -> Union[PageXMLPage, None]:
    if not es.exists(index=config['page_index'], id=page_id):
        return None
    response = es.get(index=config['page_index'], id=page_id)
    if '_source' in response:
        page_doc = json_to_pagexml_page(response['_source'])
        return page_doc
    else:
        return None
Example #34
def _get_status_ES(status_id: int):

    settings = Settings()
    es = Elasticsearch(settings.ELASTICSEARCH_URL)

    if es.exists(index=settings.ELASTICSEARCH_STATUS_INDEX, doc_type='status', id=status_id):
        res = es.get(index=settings.ELASTICSEARCH_STATUS_INDEX, doc_type='status', id=status_id)
        return res['_source']
    else:
        return None
Example #35
def main():
    global resh
    es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
    temp_list = []
    if system("wc -w dump.pickle > /dev/null 2>&1 &") == 0:
        pickle_in = open("dump.pickle", "rb")
        indexing = pickle.load(pickle_in)
        system("rm dump.pickle > /dev/null 2>&1 &")
        for i in indexing:
            temps = i
            i = re.sub("[^a-zA-Z]", "", i)
            str = ""
            i = str.join(i)
            if i == "":
                pass
            else:
                resh = es.exists(index='storage', doc_type='dbs', id=i.lower())
        #print(i)
            if resh == True:
                try:
                    #print ("i is {}".format(i))
                    res = es.get(index='storage', doc_type='dbs', id=i.lower())
                    temp_list = res['_source'][i.lower()]
                    for m in indexing[temps]:
                        temp_list.append(m)
                    data = {
                        i.lower(): temp_list,
                    }
                    res = es.index(index="storage",
                                   doc_type='dbs',
                                   body=data,
                                   id=i.lower())
                    if res['_shards']['successful'] == 1:
                        print("modified")
                    else:
                        print("ERROR!!!")
                except:
                    pass
            elif resh == False:
                data = {
                    i.lower(): indexing[temps],
                }
                try:
                    res = es.index(index="storage",
                                   doc_type='dbs',
                                   body=data,
                                   id=i.lower())
                    if res['_shards']['successful'] == 1:
                        print("Uploaded")
                    else:
                        print("not uploaded")
                except:
                    pass
    else:
        pass
Example #36
def retrieve_inventory_metadata(es: Elasticsearch, inventory_num: int, config):
    if not es.exists(index=config['inventory_index'],
                     doc_type=config['inventory_doc_type'],
                     id=inventory_num):
        raise ValueError(
            'No inventory metadata available for inventory num {}'.format(
                inventory_num))
    response = es.get(index=config['inventory_index'],
                      doc_type=config['inventory_doc_type'],
                      id=inventory_num)
    return response['_source']
	def get_lastmodified(self, docid, parameters = {}):
	
		es = Elasticsearch()

		doc_exists = es.exists(index=self.config['index'], doc_type="document", id=docid)

		# if doc with id exists in index, read modification date
		if doc_exists:	
			doc = es.get(index=self.config['index'], doc_type="document", id=docid, _source=False, fields="file_modified_dt")
			last_modified = doc['fields']['file_modified_dt'][0]
		else:
			last_modified=None
			
		return last_modified
Example #38
class ElasticsearchService(object):
    def __init__(self, host, port):
        self._es = Elasticsearch([{'host': host, 'port': port}])

    def search(self, *args, **kwargs):
        return self._es.search(*args, **kwargs)

    def create(self, *args, **kwargs):
        return self._es.create(*args, **kwargs)

    def get(self, *args, **kwargs):
        return self._es.get(*args, **kwargs)

    def exists(self, *args, **kwargs):
        return self._es.exists(*args, **kwargs)

    def msearch(self, *args, **kwargs):
        return self._es.msearch(*args, **kwargs)
Example #39
def get_rdap_ip(ip):
    es = Elasticsearch()
    does_exist = es.exists(index='rdap', doc_type='ipaddr', id=ip)
    print does_exist
    if does_exist is True:
        status = 200
        print "Found it!"
        get_record = es.get(index='rdap', doc_type='ipaddr', id=ip)
        results = jsonify(get_record['_source'])
    else:
        try:
            obj = IPWhois(ip)
            results_raw = obj.lookup_rdap(depth=1)
            status = 200
            results = jsonify(results_raw)
            es.index(index='rdap', doc_type='ipaddr', id=ip, body=json.dumps(results_raw))
        except Exception as e:
            print e
            results = jsonify({'status': "not_found"})
            status = 404
            results = jsonify({'status': "not_found"})
    return results, status
Example #40
def get_whois_domain(domain, refresh=None):
    es = Elasticsearch()
    id_num = domain
    does_exist = es.exists(index='domain', doc_type='domain', id=domain)
    print does_exist
    if does_exist is True and refresh is None:
        status = 200
        print "Found it!"
        get_record = es.get(index='domain', doc_type='domain', id=domain)
        results = jsonify(get_record['_source'])
    elif does_exist is True and refresh is not None:
        status = 200
        print "Forcing refresh!"
        es.delete(index='domain', doc_type='domain', id=domain)
        try:
            obj = whois.whois(domain)
            status = 200
            results = jsonify(obj)
            es.index(index='domain', doc_type='domain', id=domain, body=obj)
        except Exception as e:
            print e
            results_raw = jsonify({'status': "not_found"})
            status = 404
    else:
        try:
            obj = whois.whois(domain)
            status = 200
            results = jsonify(obj)
            es.index(index='domain', doc_type='domain', id=domain, body=obj)
        except Exception as e:
            print e
            results_raw = jsonify({'status': "not_found"})
            status = 404
            results = jsonify({'status': "not_found"})
    return results, status
Example #41
class Search(object):
    """Search Repository"""

    def __init__(self, config):
        if 'ELASTICSEARCH' in config:
            options = {"host": config["ELASTICSEARCH"]["host"],
                       "port": config["ELASTICSEARCH"]["port"]}
            if 'url_prefix' in config["ELASTICSEARCH"]:
                options['url_prefix'] = config["ELASTICSEARCH"]['url_prefix']
            self.search_index = Elasticsearch(options)
        self.triplestore = TripleStore(config)
        self.body = None

    def __get_id_or_value__(self, value):
        """Helper function takes a dict with either a value or id and returns
        the dict value

        Args:
	    value(dict)
        Returns:
	    string or None
        """
        if [str, float, int, bool].count(type(value)) > 0:
            return value 
        elif '@value' in value:
            return value.get('@value')
        elif '@id' in value:
            result = self.triplestore.__get_id__(value.get('@id'))
            if len(result) > 0:
                return result[0]['uuid']['value']
            return value.get('@id')
        return value

    def __generate_body__(self, graph, prefix=None):
        """Internal method generates the body for indexing into Elastic search
        based on the JSON-LD serializations of the Fedora Commons Resource graph.

        Args:
            graph -- rdflib.Graph of Resource
            prefix -- Prefix filter, will only index if object starts with a prefix,
                      default is None to index everything.
        """
        self.body = dict()
        graph_json = json.loads(
            graph.serialize(
                format='json-ld',
                context=CONTEXT).decode())
        if '@graph' in graph_json:
            for graph in graph_json.get('@graph'):
                # Index only those graphs that have been created in the
                # repository
                if 'fedora:created' in graph:
                    for key, val in graph.items():
                        if key in [
                            'fedora:lastModified',
                            'fedora:created',
                            'fedora:uuid'
                        ]:
                            self.__set_or_expand__(key, val)
                        elif key.startswith('@type'):
                            for name in val:
                                #! prefix should be a list 
                                if prefix:
                                    if name.startswith(prefix):
                                        self.__set_or_expand__('type', name)
                                else:
                                    self.__set_or_expand__('type', name)
                        elif key.startswith('@id'):
                            self.__set_or_expand__('fedora:hasLocation', val)
                        elif not key.startswith('fedora') and not key.startswith('owl'):
                            self.__set_or_expand__(key, val) 


    def __index__(self, subject, graph, doc_type, index, prefix=None): 
        self.__generate_body__(graph, prefix)
        doc_id = str(graph.value(
                     subject=subject,
                     predicate=FEDORA.uuid))
        self.__generate_suggestion__(subject, graph, doc_id)
        self.search_index.index(
            index=index,
            doc_type=doc_type,
            id=doc_id,
            body=self.body)

    def __set_or_expand__(self, key, value):
        """Helper method takes a key and value and either creates a key
        with either a list or appends an existing key-value to the value

        Args:
            key
            value
        """
        if key not in self.body:
           self.body[key] = []
        if type(value) == list:
            for row in value:
                self.body[key].append(self.__get_id_or_value__(row))
        else:
            self.body[key].append(self.__get_id_or_value__(value))

    def __update__(self, **kwargs):
        """Helper method updates a stored document in Elastic Search and Fuseki. 
        Method must have doc_id 

        Keyword args:
            doc_id -- Elastic search document ID
            field -- Field name to update index, raises exception if None
            value -- Field value to update index, raises exception if None
        """
        doc_id, doc_type, index = kwargs.get('doc_id'), None, None
        if not doc_id:
            raise falcon.HTTPMissingParam("doc_id")
        field = kwargs.get('field')
        if not field:
            raise falcon.HTTPMissingParam("field")
        value = kwargs.get('value')
        if not value:
            raise falcon.HTTPMissingParam("field")
        for row in self.search_index.indices.stats()['indices'].keys():
            # Doc id should be unique across all indices 
            if self.search_index.exists(index=row, id=doc_id): 
                result = self.search_index.get(index=row, id=doc_id)
                doc_type = result['_type']
                index=row
                break
        if doc_type is None or index is None:
            raise falcon.HTTPNotFound()                 
        self.search_index.update(
            index=index,
            doc_type=doc_type,
            id=doc_id,
            body={"doc": {
                field: self.__get_id_or_value__(value)
            }})
        result = self.triplestore.__get_subject__(uuid=doc_id)
        if len(result) == 1:
            self.triplestore.__update_triple__(
                result[0]['subject']['value'], 
                field, 
                value)         
            

    def on_get(self, req, resp):
        """Method takes a a phrase, returns the expanded result.

        Args:
            req -- Request
            resp -- Response
        """
        phrase = req.get_param('phrase') or '*'
        size = req.get_param('size') or 25
        resource_type = req.get_param('resource') or None
        if resource_type:
            resp.body = json.dumps(self.search_index.search(
                q=phrase,
                doc_type=resource_type,
                size=size))
        else:
            resp.body = json.dumps(self.search_index.search(
                q=phrase,
                size=size))
        resp.status = falcon.HTTP_200

    def on_patch(self, req, resp):
        """Method takes either sparql statement or predicate and object 
        and updates the Resource.

        Args:
            req -- Request
            resp -- Response
        """
        doc_uuid = req.get_param('uuid')
        if not doc_uuid:
            raise falcon.HTTPMissingParam('uuid')
        predicate = req.get_param('predicate') or None
        if not predicate:
            raise falcon.HTTPMissingParam('predicate')
        object_ = req.get_param('object') or None
        if not object_:
            raise falcon.HTTPMissingParam('object')
        doc_type = req.get_param('doc_type') or None
        if self.__update__(
            doc_id=doc_uuid,
            doc_type=doc_type,
            field=predicate,
            value=object_):
            resp.status = falcon.HTTP_202
            resp.body = json.dumps(True)
        else:
            raise falcon.HTTPInternalServerError(
                "Error with PATCH for {}".format(doc_uuid),
                "Failed setting {} to {}".format(
                    predicate,
                    object_))
Example #42
        index_exist = es.indices.exists(index="activity")
        if not index_exist:
            es.indices.create(index="activity", ignore=400)
    except Exception,r:
        print Exception,":",r

    s_re = scan(es, query={"query":{"match_all":{}},"size":1000}, index="20130901",doc_type='bci')
    bulk_action = [] # new uid record to es
    count_index = 0
    while 1:
        try:
            item = s_re.next()['_source']
        except:
            break
        user_id = item['user']
        doc_exist = es.exists(index="activity", id=user_id)
        if not doc_exist:
            activity_info = {}
            activity_info['uid'] = user_id
            activity_info['max_index'] = item['user_index']
            activity_info['min_index'] = item['user_index']
            activity_info['index_number'] = 1
            activity_info['lower_than_average_number'] = 0
            activity_info['remove'] = 0 # 0 denotes not remove
            xdata = expand_index_action(activity_info)
            bulk_action.extend([xdata[0], xdata[1]])
            count_index += 1
            if count_index % 2000 == 0:
                test_speed(es, count_index, bulk_action)
                bulk_action = []
Example #43
File: test.py Project: prodja/djascralog
#! coding: utf-8
from elasticsearch import Elasticsearch
import httplib2
from os import getcwd
#test elastic in python
# product - parsed, into json -> (if _source is empty) elastic document - fill _source in the doc
#https://elasticsearch-py.readthedocs.org/en/master/api.html
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
doc='{"name":"one_name", "arrs":{"el1":1,"els2":2}}'
#res = es.index(index="gearbest_index", doc_type='product_type', id=5, body=doc)
#res=es.create(index="gearbest_index",doc_type='product_type',body=doc,id=1)
b=es.exists(index="gearbest_index",doc_type="product_type",id=1)
if(b==True):
	print 'OK' #111
Example #44
__author__ = 'terry'

import sys
from elasticsearch import Elasticsearch
import time

if __name__ == '__main__':

    time.sleep(5)
    # create a connection to the Elasticsearch database
    client = Elasticsearch(['pureelk-elasticsearch:9200'], retry_on_timeout=True)

    if client.exists(index='.kibana', doc_type='index-pattern',id='pureelk-global-arrays'):
        sys.exit(0)
    else:
        sys.exit(1)
Example #45
class Archiver(object):
    """ A mailman 3 archiver that forwards messages to pony mail. """
    if config.has_section('mailman') and config.has_option('mailman', 'plugin'):
        implementer(IArchiver)

    # This is a list of the headers we're interested in publishing.
    keys = [
        "archived-at",
        "delivered-to",
        "from",
        "cc",
        "to",
        "date",
        "in-reply-to",
        "message-id",
        "subject",
        "x-message-id-hash",
        "references",
        "x-mailman-rule-hits",
        "x-mailman-rule-misses",
    ]

    def __init__(self, parseHTML=False):
        """ Just initialize ES. """
        self.html = parseHTML
        if parseHTML:
            import html2text
            self.html2text = html2text.html2text
        self.dbname = config.get("elasticsearch", "dbname")
        ssl = config.get("elasticsearch", "ssl", fallback="false").lower() == 'true'
        self.consistency = config.get('elasticsearch', 'write', fallback='quorum')
        self.cropout = config.get("debug", "cropout", fallback=None)
        uri = config.get("elasticsearch", "uri", fallback="")
        dbs = [
            {
                'host': config.get("elasticsearch", "hostname"),
                'port': int(config.get("elasticsearch", "port")),
                'use_ssl': ssl,
                'url_prefix': uri,
                'http_auth': auth
            }]
        # Backup ES?
        backup = config.get("elasticsearch", "backup", fallback="")
        if backup != "":
            dbs.append(
                {
                'host': backup,
                'port': int(config.get("elasticsearch", "port")),
                'use_ssl': ssl,
                'url_prefix': uri,
                'http_auth': auth
            }
            )
        self.es = Elasticsearch(dbs,
            max_retries=5,
            retry_on_timeout=True
            )

    def msgfiles(self, msg):
        attachments = []
        contents = {}
        for part in msg.walk():
            part_meta, part_file = parse_attachment(part)
            if part_meta:
                attachments.append(part_meta)
                contents[part_meta['hash']] = part_file
        return attachments, contents
    
    
    def msgbody(self, msg):
        body = None
        firstHTML = None
        if msg.is_multipart():
            for part in msg.walk():
                try:
                    if part.is_multipart(): 
                        for subpart in part.walk():
                            if subpart.get_content_type() == 'text/plain' and not body:
                                body = subpart.get_payload(decode=True)
                            if subpart.get_content_type() == 'text/enriched' and not body:
                                body = subpart.get_payload(decode=True)
                            elif subpart.get_content_type() == 'text/html' and self.html and not firstHTML:
                                firstHTML = subpart.get_payload(decode=True)
            
                    elif part.get_content_type() == 'text/plain' and not body:
                        body = part.get_payload(decode=True)
                    elif part.get_content_type() == 'text/html' and self.html and not firstHTML:
                        firstHTML = part.get_payload(decode=True)
                except Exception as err:
                    print(err)
        elif msg.get_content_type() == 'text/plain':
            body = msg.get_payload(decode=True)
        elif msg.get_content_type() == 'text/enriched':
            body = msg.get_payload(decode=True)
        elif msg.get_content_type() == 'text/html' and self.html and not firstHTML:
            firstHTML = msg.get_payload(decode=True)
            
        # this requires a GPL lib, user will have to install it themselves
        if firstHTML and (not body or len(body) <= 1 or (iBody and str(body).find(str(iBody)) != -1)):
            body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
    
        for charset in pm_charsets(msg):
            try:
                body = body.decode(charset) if type(body) is bytes else body
            except:
                body = body.decode('utf-8', errors='replace') if type(body) is bytes else body
                
        return body    

    def compute_updates(self, lid, private, msg):
        """Determine what needs to be sent to the archiver.

        :param lid: The list id
        :param msg: The message object.

        :return None if the message could not be parsed
        """

        ojson = None
        if not lid:
            lid= msg.get('list-id')
        if self.cropout:
            crops = self.cropout.split(" ")
            # Regex replace?
            if len(crops) == 2:
                lid = re.sub(crops[0], crops[1], lid)
            # Standard crop out?
            else:
                lid = lid.replace(self.cropout, "")
        
        defaultEmptyString = lambda value: value and str(value) or ""
        msg_metadata = dict([(k, defaultEmptyString(msg.get(k))) for k in self.keys])
        mid = hashlib.sha224(str("%s-%s" % (lid, msg_metadata['archived-at'])).encode('utf-8')).hexdigest() + "@" + (lid if lid else "none")
        for key in ['to','from','subject','message-id']:
            try:
                hval = ""
                if msg_metadata.get(key):
                    for t in email.header.decode_header(msg_metadata[key]):
                        if t[1] == None or t[1].find("8bit") != -1:
                            hval += t[0].decode('utf-8') if type(t[0]) is bytes else t[0]
                        else:
                            hval += t[0].decode(t[1],errors='ignore')
                    msg_metadata[key] = hval
            except Exception as err:
                print("Could not decode headers, ignoring..: %s" % err)
        if not msg_metadata.get('message-id'):
            msg_metadata['message-id'] = mid
        mdate = None
        uid_mdate = 0 # mdate for UID generation
        try:
            mdate = email.utils.parsedate_tz(msg_metadata.get('date'))
            uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
        except:
            pass
        if not mdate and msg_metadata.get('archived-at'):
            mdate = email.utils.parsedate_tz(msg_metadata.get('archived-at'))
        elif not mdate:
            print("Date (%s) seems totally wrong, setting to _now_ instead." % mdate)
            mdate = time.gmtime() # Get a standard 9-tuple
            mdate = mdate + (0, ) # Fake a TZ (10th element)
        mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email.utils.mktime_tz(mdate)))
        body = self.msgbody(msg)
        try:
            if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
                body = convertToWrapped(body, character_set="utf-8")
            if isinstance(body, str):
                body = body.encode('utf-8')
        except Exception as err:
            try:
                body = body.decode(chardet.detect(body)['encoding'])
            except Exception as err:
                try:
                    body = body.decode('latin-1')
                except:
                    try:
                        if isinstance(body, str):
                            body = body.encode('utf-8')
                    except:
                        body = None

        attachments, contents = self.msgfiles(msg)
        irt = ""
        if body is not None or attachments:
            pmid = mid
            try:
                # Use full message as bytes for mid?
                if archiver_generator == "full":
                    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
                elif archiver_generator == "medium":
                    xbody = body if type(body) is bytes else body.encode('ascii', 'ignore')
                    xbody += bytes(lid, encoding='ascii')
                    xbody += bytes(mdatestring, encoding='ascii')
                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
                else:
                    # Or revert to the old way?
                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
            except Exception as err:
                if logger:
                    logger.warn("Could not generate MID: %s" % err)
                mid = pmid
            if 'in-reply-to' in msg_metadata:
                try:
                    try:
                        irt = "".join(msg_metadata['in-reply-to'])
                    except:
                        irt = msg_metadata.get('in-reply-to').__str__()
                except:
                    irt = ""
            ojson = {
                'from_raw': msg_metadata['from'],
                'from': msg_metadata['from'],
                'to': msg_metadata['to'],
                'subject': msg_metadata['subject'],
                'message-id': msg_metadata['message-id'],
                'mid': mid,
                'cc': msg_metadata.get('cc'),
                'epoch': email.utils.mktime_tz(mdate),
                'list': lid,
                'list_raw': lid,
                'date': mdatestring,
                'private': private,
                'references': msg_metadata['references'],
                'in-reply-to': irt,
                'body': body.decode('utf-8', 'replace') if type(body) is bytes else body,
                'attachments': attachments
            }

        self.msg_metadata = msg_metadata
        self.irt = irt

        return  ojson, contents
            
    def archive_message(self, mlist, msg):
        """Send the message to the archiver.

        :param mlist: The IMailingList object.
        :param msg: The message object.

        :return (lid, mid)
        """

        lid = normalize_lid(mlist.list_id)

        private = False
        if hasattr(mlist, 'archive_public') and mlist.archive_public == True:
            private = False
        elif hasattr(mlist, 'archive_public') and mlist.archive_public == False:
            private = True
        elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public:
            private = True

        ojson, contents = self.compute_updates(lid, private, msg)
        if not ojson:
            id = msg.get('message-id') or msg.get('Subject') or msg.get("Date")
            raise Exception("Could not parse message %s for %s" % (id,lid))

        if args.dry:
            print("**** Dry run, not saving message to database *****")
            return lid, ojson['mid']

        msg_metadata = self.msg_metadata
        irt = self.irt

        if contents:
            for key in contents:
                self.es.index(
                    index=self.dbname,
                    doc_type="attachment",
                    id=key,
                    body = {
                        'source': contents[key]
                    }
                )
    
        self.es.index(
            index=self.dbname,
            doc_type="mbox",
            id=ojson['mid'],
            consistency = self.consistency,
            body = ojson
        )
        
        self.es.index(
            index=self.dbname,
            doc_type="mbox_source",
            id=ojson['mid'],
            consistency = self.consistency,
            body = {
                "message-id": msg_metadata['message-id'],
                "source": self.mbox_source(msg)
            }
        )
        
        # If MailMan and list info is present, save/update it in ES:
        if hasattr(mlist, 'description') and hasattr(mlist, 'list_name') and mlist.description and mlist.list_name:
            self.es.index(
                index=self.dbname,
                doc_type="mailinglists",
                id=lid,
                consistency = self.consistency,
                body = {
                    'list': lid,
                    'name': mlist.list_name,
                    'description': mlist.description,
                    'private': private
                }
            )
        
        if logger:
            logger.info("Pony Mail archived message %s successfully" % mid)
        oldrefs = []
        
        # Is this a direct reply to a pony mail email?
        if irt != "":
            dm = re.search(r"pony-([a-f0-9]+)-([a-f0-9]+)@", irt)
            if dm:
                cid = dm.group(1)
                mid = dm.group(2)
                if self.es.exists(index = self.dbname, doc_type = 'account', id = cid):
                    doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid)
                    if doc:
                        oldrefs.append(cid)
                        # N.B. no index is supplied, so ES will generate one
                        self.es.index(
                            index=self.dbname,
                            doc_type="notifications",
                            consistency = self.consistency,
                            body = {
                                'type': 'direct',
                                'recipient': cid,
                                'list': lid,
                                'private': private,
                                'date': ojson['date'],
                                'from': msg_metadata['from'],
                                'to': msg_metadata['to'],
                                'subject': msg_metadata['subject'],
                                'message-id': msg_metadata['message-id'],
                                'in-reply-to': irt,
                                'epoch': ojson['epoch'],
                                'mid': mid,
                                'seen': 0
                            }
                        )
                        if logger:
                            logger.info("Notification sent to %s for %s" % (cid, mid))

        # Are there indirect replies to pony emails?
        if msg_metadata.get('references'):
            for im in re.finditer(r"pony-([a-f0-9]+)-([a-f0-9]+)@", msg_metadata.get('references')):
                cid = im.group(1)
                mid = im.group(2)
                if self.es.exists(index = self.dbname, doc_type = 'account', id = cid):
                    doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid)
                    
                    # does the user want to be notified of indirect replies?
                    if doc and 'preferences' in doc['_source'] and doc['_source']['preferences'].get('notifications') == 'indirect' and not cid in oldrefs:
                        oldrefs.append(cid)
                        # N.B. no index is supplied, so ES will generate one
                        self.es.index(
                            index=self.dbname,
                            consistency = self.consistency,
                            doc_type="notifications",
                            body = {
                                'type': 'indirect',
                                'recipient': cid,
                                'list': lid,
                                'private': private,
                                'date': ojson['date'],
                                'from': msg_metadata['from'],
                                'to': msg_metadata['to'],
                                'subject': msg_metadata['subject'],
                                'message-id': msg_metadata['message-id'],
                                'in-reply-to': irt,
                                'epoch': ojson['epoch'],
                                'mid': mid,
                                'seen': 0
                            }
                        )
                        if logger:
                            logger.info("Notification sent to %s for %s" % (cid, mid))
        return lid, ojson['mid']

    def mbox_source(self, msg):
        # Common method shared with import-mbox
        policy = msg.policy.clone(max_line_length=0) # don't wrap headers
        return msg.as_bytes(policy=policy).decode('utf-8', errors='replace')

    def list_url(self, mlist):
        """ Required by MM3 plugin API
        """
        return None

    def permalink(self, mlist, msg):
        """ Required by MM3 plugin API
        """
        return None
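
The three message-id generators referenced in compute_updates ("full", "medium" and the legacy scheme) are easiest to read in isolation. A hedged sketch of the "medium" variant, pulled out as a standalone function; the function name is ours, the hashing recipe mirrors the code above:

import hashlib

def medium_mid(body, lid, mdatestring):
    # sha224 over body + list id + date string, as in the "medium" branch above.
    xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
    xbody += bytes(lid, encoding='ascii')
    xbody += bytes(mdatestring, encoding='ascii')
    return "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)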
Example #46
0
class Archiver(object):
    """ A mailman 3 archiver that forwards messages to pony mail. """
    if __name__ != '__main__':
        implementer(IArchiver)
    name = "ponymail"

    # This is a list of the headers we're interested in publishing.
    keys = [
        "archived-at",
        "delivered-to",
        "from",
        "cc",
        "to",
        "date",
        "in-reply-to",
        "message-id",
        "subject",
        "x-message-id-hash",
        "references",
        "x-mailman-rule-hits",
        "x-mailman-rule-misses",
    ]

    def __init__(self):
        """ Just initialize ES. """
        global config, auth, parseHTML
        ssl = False
        self.cropout = None
        self.html = parseHTML
        self.dbname = config.get("elasticsearch", "dbname")
        self.consistency = 'quorum'
        if config.has_option("elasticsearch", "ssl") and config.get("elasticsearch", "ssl").lower() == 'true':
            ssl = True
        if config.has_option("elasticsearch", "write") and config.get("elasticsearch", "write") != "":
            self.consistency = config.get('elasticsearch', 'write')
        if config.has_option("debug", "cropout") and config.get("debug", "cropout") != "":
            self.cropout = config.get("debug", "cropout")
        uri = ""
        if config.has_option("elasticsearch", "uri") and config.get("elasticsearch", "uri") != "":
            uri = config.get("elasticsearch", "uri")
        dbs = [
            {
                'host': config.get("elasticsearch", "hostname"),
                'port': int(config.get("elasticsearch", "port")),
                'use_ssl': ssl,
                'url_prefix': uri,
                'http_auth': auth
            }]
        # Backup ES?
        if config.has_option("elasticsearch", "backup") and config.get("elasticsearch", "backup") != "":
            backup = config.get("elasticsearch", "backup")
            dbs.append(
                {
                'host': config.get("elasticsearch", "backup"),
                'port': int(config.get("elasticsearch", "port")),
                'use_ssl': ssl,
                'url_prefix': uri,
                'http_auth': auth
            }
            )
        self.es = Elasticsearch(dbs,
            max_retries=5,
            retry_on_timeout=True
            )

    def msgfiles(self, msg):
        attachments = []
        contents = {}
        if msg.is_multipart():    
            for part in msg.walk():
                part_meta, part_file = parse_attachment(part)
                if part_meta:
                    attachments.append(part_meta)
                    contents[part_meta['hash']] = part_file
        return attachments, contents
    
    
    def msgbody(self, msg):
        body = None
        firstHTML = None
        if msg.is_multipart():
            for part in msg.walk():
                try:
                    if part.is_multipart(): 
                        for subpart in part.walk():
                            if subpart.get_content_type() == 'text/plain' and not body:
                                body = subpart.get_payload(decode=True)
                            elif subpart.get_content_type() == 'text/html' and self.html and not firstHTML:
                                firstHTML = subpart.get_payload(decode=True)
            
                    elif part.get_content_type() == 'text/plain' and not body:
                        body = part.get_payload(decode=True)
                    elif part.get_content_type() == 'text/html' and self.html and not firstHTML:
                        firstHTML = part.get_payload(decode=True)
                except Exception as err:
                    print(err)
        elif msg.get_content_type() == 'text/plain':
            body = msg.get_payload(decode=True)
        elif msg.get_content_type() == 'text/html' and self.html and not firstHTML:
            firstHTML = msg.get_payload(decode=True)
            
        # this requires a GPL lib, user will have to install it themselves
        if firstHTML and (not body or len(body) <= 1):
            body = html2text.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
    
        for charset in pm_charsets(msg):
            try:
                body = body.decode(charset) if type(body) is bytes else body
            except:
                body = body.decode('utf-8', errors='replace') if type(body) is bytes else body
                
        return body    

    def archive_message(self, mlist, msg):
        """Send the message to the archiver.

        :param mlist: The IMailingList object.
        :param msg: The message object.
        """

        lid = None
        m = re.search(r"(<.+>)", mlist.list_id.replace("@", "."))
        if m:
            lid = m.group(1)
        else:
            lid = "<%s>" % mlist.list_id.strip("<>").replace("@", ".")
        if self.cropout:
            crops = self.cropout.split(" ")
            # Regex replace?
            if len(crops) == 2:
                lid = re.sub(crops[0], crops[1], lid)
            # Standard crop out?
            else:
                lid = lid.replace(self.cropout, "")
        
        format = lambda value: value and str(value) or ""
        msg_metadata = dict([(k, format(msg.get(k))) for k in self.keys])
        mid = hashlib.sha224(str("%s-%s" % (lid, msg_metadata['archived-at'])).encode('utf-8')).hexdigest() + "@" + (lid if lid else "none")
        for key in ['to','from','subject','message-id']:
            try:
                hval = ""
                if msg_metadata.get(key):
                    for t in email.header.decode_header(msg_metadata[key]):
                        if t[1] == None or t[1].find("8bit") != -1:
                            hval += t[0].decode('utf-8') if type(t[0]) is bytes else t[0]
                        else:
                            hval += t[0].decode(t[1],errors='ignore')
                    msg_metadata[key] = hval
            except Exception as err:
                print("Could not decode headers, ignoring..: %s" % err)
        if not msg_metadata.get('message-id'):
            msg_metadata['message-id'] = mid
        mdate = None
        uid_mdate = 0 # mdate for UID generation
        try:
            mdate = email.utils.parsedate_tz(msg_metadata.get('date'))
            uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
        except:
            pass
        if not mdate and msg_metadata.get('archived-at'):
            mdate = email.utils.parsedate_tz(msg_metadata.get('archived-at'))
        elif not mdate:
            print("Date seems totally wrong, setting to _now_ instead.")
            mdate = time.gmtime()
        mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(email.utils.mktime_tz(mdate)))
        body = self.msgbody(msg)
        try:
            if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
                body = convertToWrapped(body, character_set="utf-8")
            if isinstance(body, str):
                body = body.encode('utf-8')
        except Exception as err:
            try:
                body = body.decode(chardet.detect(body)['encoding'])
            except Exception as err:
                try:
                    body = body.decode('latin-1')
                except:
                    try:
                        if isinstance(body, str):
                            body = body.encode('utf-8')
                    except:
                        body = None
        if body:
            attachments, contents = self.msgfiles(msg)
            private = False
            if hasattr(mlist, 'archive_public') and mlist.archive_public == True:
                private = False
            elif hasattr(mlist, 'archive_public') and mlist.archive_public == False:
                private = True
            elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public:
                private = True
            pmid = mid
            try:
                mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
            except Exception as err:
                if logger:
                    logger.warn("Could not generate MID: %s" % err)
                mid = pmid
            irt = ""
            if 'in-reply-to' in msg_metadata:
                try:
                    try:
                        irt = "".join(msg_metadata['in-reply-to'])
                    except:
                        irt = msg_metadata.get('in-reply-to').__str__()
                except:
                    irt = ""
            ojson = {
                'from_raw': msg_metadata['from'],
                'from': msg_metadata['from'],
                'to': msg_metadata['to'],
                'subject': msg_metadata['subject'],
                'message-id': msg_metadata['message-id'],
                'mid': mid,
                'cc': msg_metadata.get('cc'),
                'epoch': email.utils.mktime_tz(mdate),
                'list': lid,
                'list_raw': lid,
                'date': mdatestring,
                'private': private,
                'references': msg_metadata['references'],
                'in-reply-to': irt,
                'body': body.decode('utf-8', 'replace') if type(body) is bytes else body,
                'attachments': attachments
            }
            
            if contents:
                for key in contents:
                    self.es.index(
                        index=self.dbname,
                        doc_type="attachment",
                        id=key,
                        body = {
                            'source': contents[key]
                        }
                    )
        
            self.es.index(
                index=self.dbname,
                doc_type="mbox",
                id=mid,
                consistency = self.consistency,
                body = ojson
            )
            
            self.es.index(
                index=self.dbname,
                doc_type="mbox_source",
                id=mid,
                consistency = self.consistency,
                body = {
                    "message-id": msg_metadata['message-id'],
                    "source": msg.as_string()
                }
            )
            
            # If MailMan and list info is present, save/update it in ES:
            if hasattr(mlist, 'description') and hasattr(mlist, 'list_name') and mlist.description and mlist.list_name:
                self.es.index(
                    index=self.dbname,
                    doc_type="mailinglists",
                    id=lid,
                    consistency = self.consistency,
                    body = {
                        'list': lid,
                        'name': mlist.list_name,
                        'description': mlist.description,
                        'private': private
                    }
                )
            
            if logger:
                logger.info("Pony Mail archived message %s successfully" % mid)
            oldrefs = []
            
            # Is this a direct reply to a pony mail email?
            if irt != "":
                dm = re.search(r"pony-([a-f0-9]+)-([a-f0-9]+)@", irt)
                if dm:
                    cid = dm.group(1)
                    mid = dm.group(2)
                    if self.es.exists(index = self.dbname, doc_type = 'account', id = cid):
                        doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid)
                        if doc:
                            oldrefs.append(cid)
                            self.es.index(
                                index=self.dbname,
                                doc_type="notifications",
                                consistency = self.consistency,
                                body = {
                                    'type': 'direct',
                                    'recipient': cid,
                                    'list': lid,
                                    'private': private,
                                    'date': msg_metadata['date'],
                                    'from': msg_metadata['from'],
                                    'to': msg_metadata['to'],
                                    'subject': msg_metadata['subject'],
                                    'message-id': msg_metadata['message-id'],
                                    'in-reply-to': irt,
                                    'epoch': email.utils.mktime_tz(mdate),
                                    'mid': mid,
                                    'seen': 0
                                }
                            )
                            if logger:
                                logger.info("Notification sent to %s for %s" % (cid, mid))

            # Are there indirect replies to pony emails?
            if msg_metadata.get('references'):
                for im in re.finditer(r"pony-([a-f0-9]+)-([a-f0-9]+)@", msg_metadata.get('references')):
                    cid = im.group(1)
                    mid = im.group(2)
                    if self.es.exists(index = self.dbname, doc_type = 'account', id = cid):
                        doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid)
                        
                        # does the user want to be notified of indirect replies?
                        if doc and 'preferences' in doc['_source'] and doc['_source']['preferences'].get('notifications') == 'indirect' and not cid in oldrefs:
                            oldrefs.append(cid)
                            self.es.index(
                                index=self.dbname,
                                consistency = self.consistency,
                                doc_type="notifications",
                                body = {
                                    'type': 'indirect',
                                    'recipient': cid,
                                    'list': lid,
                                    'private': private,
                                    'date': msg_metadata['date'],
                                    'from': msg_metadata['from'],
                                    'to': msg_metadata['to'],
                                    'subject': msg_metadata['subject'],
                                    'message-id': msg_metadata['message-id'],
                                    'in-reply-to': irt,
                                    'epoch': email.utils.mktime_tz(mdate),
                                    'mid': mid,
                                    'seen': 0
                                }
                            )
                            if logger:
                                logger.info("Notification sent to %s for %s" % (cid, mid))
        return lid
            
    def list_url(self, mlist):
        """ Gots
            to
            be
            here
        """
        return None

    def permalink(self, mlist, msg):
        """ Gots
            to
            be
            here
        """
        return None
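
The list-id normalisation at the top of archive_message is worth seeing on its own. A hedged sketch that mirrors the regex logic shown above; the function name is invented:

import re

def normalize_list_id(list_id):
    # "<dev.lists.example.org>" stays as-is; "dev@lists.example.org" becomes
    # "<dev.lists.example.org>".
    candidate = list_id.replace("@", ".")
    m = re.search(r"(<.+>)", candidate)
    if m:
        return m.group(1)
    return "<%s>" % list_id.strip("<>").replace("@", ".")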
Example #47
0
class IndexMgr(object):
    index_pattern = "result_*"
    doc_feedback = "feedback"
    doc_percent = "percent"
    doc_module = "module"

    def __init__(self, *args, **kwargs):
        self.server = kwargs.get("host", "localhost")
        self.es = Elasticsearch([{"host": self.server}])

    def feedback_create(self):
        self.es.indices.delete(index=self.current(), ignore=[400, 404])
        self.es.indices.create(index=self.current(), ignore=[400])

        mapping = {
            "feedback": {
                "_timestamp": {"enabled": "true", "path": "tsapi.received"},
                "properties": {
                    "tsapi.product.name": {"type": "string", "index": "not_analyzed"},
                    "md5": {"type": "string", "index": "not_analyzed"},
                    "tag": {"type": "string", "index": "not_analyzed"},
                    "autoflag": {"type": "string", "index": "not_analyzed"},
                    "filetype": {"type": "string", "index": "not_analyzed"},
                    "assessment": {"type": "string", "index": "not_analyzed"},
                    "hit_modules.name": {"type": "string", "index": "not_analyzed"},
                },
            }
        }
        rt = self.es.indices.put_mapping(index=self.current(), doc_type=self.doc_feedback, body=mapping)
        mapping = {"percent": {"properties": {"timestamp": {"type": "date"}}}}

        rt = self.es.indices.put_mapping(index=self.current(), doc_type=self.doc_percent, body=mapping)
        mapping = {"module": {"properties": {"created": {"type": "date"}}}}
        rt = self.es.indices.put_mapping(index=self.current(), doc_type=self.doc_module, body=mapping)
        mp = self.es.indices.get_mapping(index=self.current())
        print "Index created", rt, "\n"

    def delete(self):
        self.es.indices.delete(index="result_*", ignore=[400, 404])

    def module_insert(self, idd, doc):
        try:
            res = self.es.index(index=self.current(), doc_type=self.doc_module, id=idd, body=doc)
        except elasticsearch.ElasticsearchException as e:
            print "Insert -", e.info

    def percent_insert(self, idd, doc):
        try:
            res = self.es.index(index=self.current(), doc_type=self.doc_percent, id=idd, body=doc)
        except elasticsearch.ElasticsearchException as e:
            print "Insert -", e.info

    def feedback_get(self, idd):
        doc = None
        try:
            doc = self.es.get(index=self.current(), doc_type=self.doc_feedback, id=idd)
        except elasticsearch.ElasticsearchException as e:
            print "Get -", e.info
        return doc

    def feedback_exists(self, idd):
        IsExists = False
        try:
            IsExists = self.es.exists(index=self.current(), doc_type=self.doc_feedback, id=idd)
        except elasticsearch.ElasticsearchException as e:
            print "Exists -", e.info
        except:
            pass
        return IsExists

    def feedback_insert(self, idd, doc):
        res = None
        try:
            res = self.es.index(index=self.current(), doc_type=self.doc_feedback, id=idd, body=doc)
        except elasticsearch.ElasticsearchException as e:
            print "Insert -", e.info
        return res

    def feedback_update(self, idd, doc):
        res = None
        try:
            res = self.es.update(index=self.current(), doc_type=self.doc_feedback, id=idd, body=doc)
        except elasticsearch.ElasticsearchException as e:
            print "Update (%s-%s)-" % (idd, doc), e.info
        return res

    def feedback_search(self, content, sz=10):
        res = None
        try:
            res = self.es.search(index=self.current(), doc_type=self.doc_feedback, body=content, size=sz)
        except elasticsearch.ElasticsearchException as e:
            print e.info
        return res

    def build_query(self, **kwargs):
        query = {"query": {"filtered": {"filter": {"bool": {"must": [], "must_not": []}}}}}
        must = query["query"]["filtered"]["filter"]["bool"]["must"]
        must_not = query["query"]["filtered"]["filter"]["bool"]["must_not"]

        dt_range = kwargs.get("daterange", None)
        vt_result = kwargs.get("vt_result", None)
        vt_detected = kwargs.get("vt_detected", None)
        tscp_score = kwargs.get("tscp_score", None)
        tag = kwargs.get("tag", None)
        assessment = kwargs.get("assessment", None)
        autoflag = kwargs.get("autoflag", None)
        not_gt_cca = kwargs.get("not_gt_cca", None)

        if dt_range:
            must.append({"range": dt_range})
        if vt_result == 0 or vt_result == 1:
            must.append({"term": {"virustotal.result": vt_result}})
        if vt_detected >= 0:
            must.append({"term": {"virustotal.value": vt_detected}})
        if tscp_score >= 0:
            must.append({"term": {"threatscope.score": tscp_score}})
        if tag:
            must.append({"term": {"tag": tag}})
        if assessment:
            must.append({"term": {"assessment": assessment}})
        if autoflag:
            must.append({"term": {"autoflag": autoflag}})

        if not_gt_cca >= 0:
            #      must.append({"exists" : {"field": "threatscope.score"}})
            must_not.append({"range": {"cca_result_count": {"gt": 0}}})

        return query

    def get_count(self, **kwargs):
        res = self.feedback_search(self.build_query(**kwargs), 0)
        return int(res["hits"]["total"])

    def get_rules_count(self, **kwargs):
        agg = {"aggs": {"modules": {"terms": {"field": "hit_modules.name"}}}}
        if len(kwargs) > 0:
            agg["query"] = self.build_query(**kwargs)["query"]

        res = self.feedback_search(agg, 0)
        if res and res["hits"]["total"] > 0:
            return res["aggregations"]["modules"]["buckets"]
        return []

    def current(self):
        index = "result_{0}".format(date.today().strftime("%Y%m%d"))
        return index
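
A hedged usage sketch for the IndexMgr above; the date-range field name comes from the feedback mapping shown, the filter values are invented for illustration:

mgr = IndexMgr(host="localhost")
query_args = {
    "daterange": {"tsapi.received": {"gte": "now-7d/d", "lte": "now/d"}},
    "tag": "suspicious",
}
print(mgr.get_count(**query_args))        # total matching feedback docs
print(mgr.get_rules_count(**query_args))  # per-module aggregation buckets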
Example #48
0
class AnnotationWorker:
  
  def __init__(self, config):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 6000
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.worker = DurableChannel(self.workerName, config)
    self.dispatchers = {}

  def annotate(self):
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          self.worker.send(content="kill", to=self.workerName)
          continue
      elif message["content"]["type"] == "annotate":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        documentId = message["content"]["documentId"]
        document = self.esClient.get(index=self.corpusIndex, doc_type=self.corpusType, id = documentId, fields=self.corpusFields)
        if "fields" in document:  
          for field in self.corpusFields:
            shingles = []
            if field in document["fields"]:
              if type(document["fields"][field]) is list:
                for element in document["fields"][field]:
                  if len(element) > 0:
                    shingleTokens = self.esClient.indices.analyze(index=self.analyzerIndex, body=element, analyzer="analyzer_shingle")
                    shingles += shingleTokens["tokens"]
              else:
                if len(document["fields"][field]) > 0:
                  shingles = self.esClient.indices.analyze(index=self.analyzerIndex, body=document["fields"][field], analyzer="analyzer_shingle")["tokens"]
              shingles = map(self.__replaceUnderscore, shingles)
              shingles = filter(self.__filterTokens, shingles)
            if shingles != None and len(shingles) > 0:
              for shingle in shingles:
                phrase = shingle["token"]
                key = self.__keyify(phrase)
                if len(key) > 0:
                  data = {"phrase": phrase,"phrase__not_analyzed": phrase,"document_id": document["_id"]}
                  if not self.esClient.exists(index=self.processorIndex, doc_type=self.processorPhraseType, id=key):
                    self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=key, body=data)
        sleep(1)
        for processorInstance in self.config["processor_instances"]:
          processorInstance.annotate(self.config, documentId)
        self.worker.reply(message, {"documentId": documentId, "status" : "processed", "type" : "reply"}, self.timeout)

    self.logger.info("Terminating annotation worker")

  def unregisterDispatcher(self, dispatcher, message):
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)

    if len(self.dispatchers) == 0:
      self.worker.send(content="kill", to=self.workerName)

  def __keyify(self, phrase):
    phrase = phrase.strip()
    if len(phrase) == 0:
      return ""
    key = re.sub("[^A-Za-z0-9]", " ", phrase)
    key = " ".join(key.split())
    key = key.lower()
    key = "-".join(key.split())
    return key

  def __replaceUnderscore(self,shingle):
    token = shingle["token"]
    token = token.replace("_","")
    token = re.sub('\s+', ' ', token).strip()
    shingle["token"] = token
    return shingle
    
  def __filterTokens(self, shingle):
    global esStopWords
    tokens = shingle["token"].split(" ")
    firstToken = tokens[0]
    lastToken = tokens[-1]
    isValid = True
    isValid = (isValid and lastToken != None)
    isValid = (isValid and len(lastToken) > 1)
    isValid = (isValid and not firstToken.replace(".","",1).isdigit())
    isValid = (isValid and not lastToken.replace(".","",1).isdigit())
    isValid = (isValid and firstToken not in esStopWords)
    isValid = (isValid and lastToken not in esStopWords)
    return isValid
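
The per-phrase deduplication in annotate() reduces to an exists-then-index pattern keyed on the normalised phrase. A hedged, standalone sketch of that pattern; the index and type names are illustrative stand-ins for the configured processor index and phrase type:

from elasticsearch import Elasticsearch

es = Elasticsearch("localhost:9200")
key = "machine-learning"
data = {"phrase": "machine learning",
        "phrase__not_analyzed": "machine learning",
        "document_id": "doc-1"}
if not es.exists(index="processor", doc_type="processor__phrase", id=key):
    es.index(index="processor", doc_type="processor__phrase", id=key, body=data)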
Example #49
0
class ModelRegistry(object):

    BATCH_SIZE = 100

    def __init__(self, es=None):
        if es is None:
            logging.info('Attempting to connect to ES: {0}'.format(OS_ELASTICSEARCH_ADDRESS))
            self.es = Elasticsearch(hosts=[OS_ELASTICSEARCH_ADDRESS])
            logging.info('Successful connection to ES')
        else:
            self.es = es

    @staticmethod
    def table_name_for_package(datapackage_owner, datapackage_name):
        return model_name(datapackage_owner, datapackage_name)

    def save_model(self, name, datapackage_url, datapackage, model, dataset_name, author):
        """
        Save a model in the registry
        :param name: name for the model
        :param datapackage_url: origin URL for the datapackage which is the source for this model
        :param datapackage: datapackage object from which this model was derived
        :param model: model to save
        """
        document = {
            # Fields used by babbage API
            'id': name,
            'model': model,
            'package': datapackage,
            'origin_url': datapackage_url,

            # Extra fields available in search
            'dataset': dataset_name,
            'author': author
        }
        self.es.index(index='packages', doc_type='package', body=document, id=name)
        # Make sure that the data is saved
        self.es.indices.flush('packages')

    def list_models(self):
        """
        List all available models in the DB
        :return: A generator yielding strings (one per model)
        """
        try:
            count = self.es.count(index='packages', doc_type='package', q='*')['count']
            from_ = 0
            while from_ < count:
                ret = self.es.search(index='packages', doc_type='package', q='*',
                                     size=self.BATCH_SIZE, from_=from_, _source=PACKAGE_FIELDS)
                for hit in ret.get('hits',{}).get('hits',[]):
                    yield hit['_source']['id']
                from_ += self.BATCH_SIZE
        except NotFoundError:
            return

    def has_model(self, name):
        """
        Check if a model exists in the registry
        :param name: model name to test
        :return: True if yes
        """
        return self.es.exists(index='packages', doc_type='package', id=name)

    def get_model(self, name):
        """
        Return the model associated with a specific name.
        Raises KeyError in case the model doesn't exist.
        :param name: model name to fetch
        :return: Python object representing the model
        """
        try:
            ret = self.es.get(index='packages', doc_type='package', id=name, _source=PACKAGE_FIELDS)
            if ret['found']:
                return ret['_source']['model']
            raise KeyError(name)
        except NotFoundError:
            raise KeyError(name)

    def get_package(self, name):
        """
        Return the original package contents associated with a specific name.
        Raises KeyError in case the model doesn't exist.
        :param name: model name to fetch
        :return: Python object representing the package
        """
        try:
            rec = self.es.get(index='packages', doc_type='package', id=name, _source=PACKAGE_FIELDS)
            if rec['found']:
                ret = rec['_source']['package']
                ret['__origin_url'] = rec['_source']['origin_url']
                return ret
            raise KeyError(name)
        except NotFoundError:
            raise KeyError(name)
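
A hedged usage sketch for the ModelRegistry above; the owner, dataset name, datapackage and model contents are invented purely for illustration:

registry = ModelRegistry()
name = ModelRegistry.table_name_for_package('example-owner', 'example-dataset')
registry.save_model(
    name,
    'http://example.org/datapackage.json',  # origin URL (illustrative)
    {'name': 'example-dataset'},            # datapackage object (illustrative)
    {'fact_table': name},                   # babbage model (illustrative)
    'example-dataset',
    'example-owner'
)
if registry.has_model(name):
    model = registry.get_model(name)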
Example #50
0
class Docstore():
    hosts = None
    indexname = None
    facets = None
    es = None

    def __init__(self, hosts=config.DOCSTORE_HOST, index=config.DOCSTORE_INDEX, connection=None):
        self.hosts = hosts
        self.indexname = index
        if connection:
            self.es = connection
        else:
            self.es = Elasticsearch(hosts, timeout=config.DOCSTORE_TIMEOUT)
    
    def __repr__(self):
        return "<%s.%s %s:%s>" % (
            self.__module__, self.__class__.__name__, self.hosts, self.indexname
        )
    
    def print_configs(self):
        print('CONFIG_FILES:           %s' % config.CONFIG_FILES)
        print('')
        print('DOCSTORE_HOST:          %s' % config.DOCSTORE_HOST)
        print('DOCSTORE_INDEX:         %s' % config.DOCSTORE_INDEX)
        print('')
    
    def health(self):
        return self.es.cluster.health()
    
    def index_exists(self, index):
        """
        """
        return self.es.indices.exists(index=index)
    
    def status(self):
        """Returns status information from the Elasticsearch cluster.
        
        >>> docstore.Docstore().status()
        {
            u'indices': {
                u'ddrpublic-dev': {
                    u'total': {
                        u'store': {
                            u'size_in_bytes': 4438191,
                            u'throttle_time_in_millis': 0
                        },
                        u'docs': {
                            u'max_doc': 2664,
                            u'num_docs': 2504,
                            u'deleted_docs': 160
                        },
                        ...
                    },
                    ...
                }
            },
            ...
        }
        """
        return self.es.indices.stats()
    
    def index_names(self):
        """Returns list of index names
        """
        return [name for name in self.status()['indices'].keys()]
     
    def aliases(self):
        """
        @param hosts: list of dicts containing host information.
        """
        return _parse_cataliases(
            self.es.cat.aliases(h=['index','alias'])
        )
    
    def delete_alias(self, alias, index):
        """Remove specified alias.
        
        @param alias: Name of the alias
        @param index: Name of the alias' target index.
        """
        logger.debug('deleting alias %s -> %s' % (alias, index))
        alias = make_index_name(alias)
        index = make_index_name(index)
        if alias not in [alias for index,alias in self.aliases()]:
            logger.error('Alias does not exist: "%s".' % alias)
            return
        result = self.es.indices.delete_alias(index=index, name=alias)
        logger.debug(result)
        logger.debug('DONE')
        return result
    
    def create_alias(self, alias, index):
        """Point alias at specified index; create index if doesn't exist.
        
        IMPORTANT: There should only ever be ONE alias per index.
        Existing aliases are deleted before specified one is created.
        
        @param alias: Name of the alias
        @param index: Name of the alias' target index.
        """
        logger.debug('creating alias %s -> %s' % (alias, index))
        alias = make_index_name(alias)
        index = make_index_name(index)
        # delete existing alias
        for i,a in self.aliases():
            removed = ''
            if a == alias:
                self.es.indices.delete_alias(
                    # NOTE: "i" is probably not the arg "index".  That's what
                    #       we want. We only want the arg "index".
                    index=i,
                    name=alias
                )
                removed = ' (removed)'
            print('%s -> %s%s' % (a,i,removed))
        result = self.es.indices.put_alias(index=index, name=alias, body='')
        logger.debug(result)
        logger.debug('DONE')
        return result
     
    def target_index(self, alias):
        """Get the name of the index to which the alias points
        
        >>> es.cat.aliases(h=['alias','index'])
        u'documents0 wd5000bmv-2 \n'
        
        @param alias: Name of the alias
        @returns: name of target index
        """
        alias = make_index_name(alias)
        target = []
        for i,a in _parse_cataliases(self.es.cat.aliases(h=['index','alias'])):
            if a == alias:
                target = i
        return target
     
    def create_index(self, index=None):
        """Creates the specified index if it does not already exist.
        
        @returns: JSON dict with status codes and responses
        """
        if not index:
            index = self.indexname
        logger.debug('creating new index: %s' % index)
        body = {
            'settings': {},
            'mappings': {}
            }
        status = self.es.indices.create(index=index, body=body)
        logger.debug(status)
        statuses = self.init_mappings()
        self.model_fields_lists()
        logger.debug('DONE')
     
    def delete_index(self, index=None):
        """Delete the specified index.
        
        @returns: JSON dict with status code and response
        """
        if not index:
            index = self.indexname
        logger.debug('deleting index: %s' % index)
        if self.index_exists(index):
            status = self.es.indices.delete(index=index)
        else:
            status = '{"status":500, "message":"Index does not exist"}'
        logger.debug(status)
        return status
    
    def init_mappings(self):
        """Initializes mappings for Elasticsearch objects
        
        Mappings for objects in (ddr-defs)repo_models.elastic.ELASTICSEARCH_CLASSES
                
        @returns: JSON dict with status code and response
        """
        logger.debug('registering doc types')
        statuses = []
        for class_ in ELASTICSEARCH_CLASSES['all']:
            logger.debug('- %s' % class_['doctype'])
            print('- %s' % class_)
            status = class_['class'].init(index=self.indexname, using=self.es)
            statuses.append( {'doctype':class_['doctype'], 'status':status} )
        return statuses

    def model_fields_lists(self):
        """
        Lists of class-specific fields for each class, in order,
        so documents may be emitted as OrderedDicts with fields in order.
        HOSTS:PORT/INDEX/modelfields/collection/
        HOSTS:PORT/INDEX/modelfields/entity/
        HOSTS:PORT/INDEX/modelfields/segment/
        HOSTS:PORT/INDEX/modelfields/file/
        
        identifier.MODEL_REPO_MODELS
        Identifier.fields_module
        """
        DOCTYPE = 'esobjectfields'
        EXCLUDED = [
            'id', 'title', 'description',
        ]
        for model in MODEL_REPO_MODELS.keys():
            module = module_for_name(MODEL_REPO_MODELS[model]['module'])
            fields = [
                f['name'] for f in module.FIELDS
                if f['elasticsearch']['public'] and (f['name'] not in EXCLUDED)
            ]
            data = {
                'model': model,
                'fields': fields,
            }
            self.post_json(
                doc_type=DOCTYPE,
                document_id=model,
                json_text=json.dumps(data),
            )
    
    def get_mappings(self, raw=False):
        """Get mappings for ESObjects
        
        @param raw: boolean Use lower-level function to get all mappings
        @returns: str JSON
        """
        if raw:
            return self.es.indices.get_mapping(self.indexname)
        return {
            class_['doctype']: elasticsearch_dsl.Mapping.from_es(
                index=self.indexname,
                doc_type=class_['doctype'],
                using=self.es,
            ).to_dict()
            for class_ in ELASTICSEARCH_CLASSES['all']
        }
    
    def post_vocabs(self, path=config.VOCABS_URL):
        """Posts ddr-vocab facets,terms to ES.
        
        curl -XPUT 'http://localhost:9200/meta/facet/format' -d '{ ... }'
        >>> elasticsearch.post_facets(
            '192.168.56.120:9200', 'meta',
            '/opt/ddr-local/ddr-vocab'
            )
        
        @param path: Absolute path to dir containing facet files.
        @returns: JSON dict with status code and response
        """
        logger.debug('index_facets(%s, %s)' % (self.indexname, path))
        vocabs = vocab.get_vocabs(path)
        
        # get classes from ddr-defs
        Facet = ELASTICSEARCH_CLASSES_BY_MODEL['facet']
        FacetTerm = ELASTICSEARCH_CLASSES_BY_MODEL['facetterm']
        
        # push facet data
        statuses = []
        for v in vocabs.keys():
            fid = vocabs[v]['id']
            facet = Facet()
            facet.meta.id = fid
            facet.id = fid
            facet.model = 'facet'
            facet.links_html = fid
            facet.links_json = fid
            facet.links_children = fid
            facet.title = vocabs[v]['title']
            facet.description = vocabs[v]['description']
            logging.debug(facet)
            status = facet.save(using=self.es, index=self.indexname)
            statuses.append(status)
            
            for t in vocabs[v]['terms']:
                tid = t.get('id')
                facetterm_id = '-'.join([
                    str(fid),
                    str(tid),
                ])
                term = FacetTerm()
                term.meta.id = facetterm_id
                term.facet = fid
                term.term_id = tid
                term.links_html = facetterm_id
                term.links_json = facetterm_id
                # TODO doesn't handle location_geopoint
                for field in FacetTerm._doc_type.mapping.to_dict()[
                        FacetTerm._doc_type.name]['properties'].keys():
                    if t.get(field):
                        setattr(term, field, t[field])
                term.id = facetterm_id  # overwrite term.id from original
                logging.debug(term)
                status = term.save(using=self.es, index=self.indexname)
                statuses.append(status)
        
        forms_choices = {
            'topics-choices': vocab.topics_choices(
                vocab.get_vocabs(config.VOCABS_URL)['topics'],
                ELASTICSEARCH_CLASSES_BY_MODEL['facetterm']
            ),
            'facility-choices': vocab.form_vocab_choices(
                vocab.get_vocabs(config.VOCABS_URL)['facility'],
                'facility'
            ),
            'format-choices': vocab.form_vocab_choices(
                vocab.get_vocabs(config.VOCABS_URL)['format'],
                'format'
            ),
            'genre-choices': vocab.form_vocab_choices(
                vocab.get_vocabs(config.VOCABS_URL)['genre'],
                'genre'
            ),
            'rights-choices': vocab.form_vocab_choices(
                vocab.get_vocabs(config.VOCABS_URL)['rights'],
                'rights'
            ),
        }
        self.post_json('forms', 'forms-choices', forms_choices)
        return statuses
    
    def facet_terms(self, facet, order='term', all_terms=True, model=None):
        """Gets list of terms for the facet.
        
        $ curl -XGET 'http://192.168.56.101:9200/ddr/entity/_search?format=yaml' -d '{
          "fields": ["id"],
          "query": { "match_all": {} },
          "facets": {
            "genre_facet_result": {
              "terms": {
                "order": "count",
                "field": "genre"
              }
            }
          }
        }'
        Sample results:
            {
              u'_type': u'terms',
              u'missing': 203,
              u'total': 49,
              u'other': 6,
              u'terms': [
                {u'term': u'photograph', u'count': 14},
                {u'term': u'ephemera', u'count': 6},
                {u'term': u'advertisement', u'count': 6},
                {u'term': u'book', u'count': 5},
                {u'term': u'architecture', u'count': 3},
                {u'term': u'illustration', u'count': 2},
                {u'term': u'fieldnotes', u'count': 2},
                {u'term': u'cityscape', u'count': 2},
                {u'term': u'blank_form', u'count': 2},
                {u'term': u'portrait', u'count': 1}
              ]
            }
        
        @param facet: Name of field
        @param order: term, count, reverse_term, reverse_count
        @param model: (optional) Type of object ('collection', 'entity', 'file')
        @returns raw output of facet query
        """
        payload = {
            "fields": ["id"],
            "query": { "match_all": {} },
            "facets": {
                "results": {
                    "terms": {
                        "size": MAX_SIZE,
                        "order": order,
                        "all_terms": all_terms,
                        "field": facet
                    }
                }
            }
        }
        results = self.es.search(index=self.indexname, doc_type=model, body=payload)
        return results['facets']['results']
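
    # Hedged usage sketch (not in the original source): `ds` is an assumed, already
    # connected Docstore instance and 'genre' an existing facet field. Note this
    # relies on the legacy pre-2.0 facets API shown in the docstring above.
    #
    #   ds = Docstore()
    #   genres = ds.facet_terms('genre', order='count', model='entity')
    #   for bucket in genres['terms']:
    #       print('%s: %s' % (bucket['term'], bucket['count']))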

    def _repo_org(self, path, doctype, remove=False):
        """
        seealso DDR.models.common.DDRObject.to_esobject
        """
        # get and validate file
        data = load_json(path)
        if (not (data.get('id') and data.get('repo'))):
            raise Exception('Data file is not well-formed.')
        oi = Identifier(id=data['id'])
        d = OrderedDict()
        d['id'] = oi.id
        d['model'] = oi.model
        d['parent_id'] = oi.parent_id(stubs=1)
        # links
        d['links_html'] = oi.id
        d['links_json'] = oi.id
        d['links_img'] = '%s/logo.png' % oi.id
        d['links_thumb'] = '%s/logo.png' % oi.id
        d['links_parent'] = oi.parent_id(stubs=1)
        d['links_children'] = oi.id
        # title,description
        d['title'] = data['title']
        d['description'] = data['description']
        d['url'] = data['url']
        # ID components (repo, org, cid, ...) as separate fields
        idparts = deepcopy(oi.idparts)
        idparts.pop('model')
        for k in ID_COMPONENTS:
            d[k] = '' # ensure all fields present
        for k,v in idparts.iteritems():
            d[k] = v
        # add/update
        if remove and self.exists(doctype, oi):
            results = self.es.delete(
                index=self.indexname, doc_type=doctype, id=oi.id
            )
        else:
            results = self.es.index(
                index=self.indexname, doc_type=doctype, id=oi.id, body=d
            )
        return results
    
    def repo(self, path, remove=False):
        """Add/update or remove base repository metadata.
        
        @param path: str Absolute path to repository.json
        @param remove: bool Remove record from ES
        @returns: dict
        """
        return self._repo_org(path, 'repository', remove)
    
    def org(self, path, remove=False):
        """Add/update or remove base organization metadata.
        
        @param path: str Absolute path to organization.json
        @param remove: bool Remove record from ES
        @returns: dict
        """
        return self._repo_org(path, 'organization', remove)
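
    # Hedged usage sketch (the paths are illustrative, not from the original source):
    #
    #   ds = Docstore()
    #   ds.repo('/var/www/media/ddr/ddr/repository.json')
    #   ds.org('/var/www/media/ddr/ddr-densho/organization.json', remove=True)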
    
    def narrators(self, path):
        """Add/update or remove narrators metadata.
        
        @param path: str Absolute path to narrators.json
        @returns: dict
        """
        DOC_TYPE = 'narrator'
        data = load_json(path)
        for document in data['narrators']:
            document['model'] = 'narrator'
            has_published = document.get('has_published', '')
            if has_published.isdigit():
                has_published = int(has_published)
            if has_published:
                result = self.post_json(DOC_TYPE, document['id'], json.dumps(document))
                logging.debug('%s %s', document['id'], result)
            else:
                logging.debug('%s not published' % document['id'])
                if self.get(DOC_TYPE, document['id'], fields=[]):
                    self.delete(document['id'])
    
    def post_json(self, doc_type, document_id, json_text):
        """POST the specified JSON document as-is.
        
        @param doc_type: str
        @param document_id: str
        @param json_text: str JSON-formatted string
        @returns: dict Status info.
        """
        logger.debug('post_json(%s, %s, %s)' % (
            self.indexname, doc_type, document_id
        ))
        return self.es.index(
            index=self.indexname, doc_type=doc_type, id=document_id, body=json_text
        )

    def post(self, document, public_fields=[], additional_fields={}, parents={}, force=False):
        """Add a new document to an index or update an existing one.
        
        This function can produce ElasticSearch documents in two formats:
        - old-style list-of-dicts used in the DDR JSON files.
        - normal dicts used by ddr-public.
        
        DDR metadata JSON files are structured as a list of fieldname:value dicts.
        This is done so that the fields are always in the same order, making it
        possible to easily see the difference between versions of a file.
        [IMPORTANT: documents MUST contain an 'id' field!]
        
        In ElasticSearch, documents are structured in a normal dict so that faceting
        works properly.
        
        curl -XPUT 'http://localhost:9200/ddr/collection/ddr-testing-141' -d '{ ... }'
        
        @param document: Collection,Entity,File The object to post.
        @param public_fields: list
        @param additional_fields: dict
        @param parents: dict Basic metadata for parent documents.
        @param force: boolean Bypass status and public checks.
        @returns: JSON dict with status code and response
        """
        logger.debug('post(%s, %s, %s)' % (
            self.indexname, document, force
        ))

        if force:
            publishable = True
            public = False
        else:
            if not parents:
                parents = _parents_status([document.identifier.path_abs()])
            publishable = _publishable([document.identifier.path_abs()], parents)
            public = True
        if not publishable:
            return {'status':403, 'response':'object not publishable'}

        d = document.to_esobject(public_fields=public_fields, public=public)
        logger.debug('saving')
        status = d.save(using=self.es, index=self.indexname)
        logger.debug(str(status))
        return status
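
    # Hedged usage sketch (not in the original source): `document` is assumed to be a
    # Collection/Entity/File object; force=True bypasses the publishable/public checks.
    #
    #   ds = Docstore()
    #   status = ds.post(document, force=True)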
    
    def post_multi(self, path, recursive=False, force=False):
        """Publish (index) specified document and (optionally) its children.
        
        After receiving a list of metadata files, index() iterates through the
        list several times.  The first pass weeds out paths to objects that can
        not be published (e.g. object or its parent is unpublished).
        
        In the final pass, a list of public/publishable fields is chosen based
        on the model.  Additional fields not in the model (e.g. parent ID, parent
        organization/collection/entity ID) are packaged.  Then everything is sent
        off to post().
        
        @param path: Absolute path to directory containing object metadata files.
        @param recursive: Whether or not to recurse into subdirectories.
        @param force: boolean Just publish the damn collection already.
        @returns: dict with total, skipped, and successful counts plus a list of bad paths
        """
        logger.debug('post_multi(%s, %s, %s, %s)' % (self.indexname, path, recursive, force))
        
        publicfields = _public_fields()
        
        # process a single file if requested
        if os.path.isfile(path):
            paths = [path]
        else:
            # files listed first, then entities, then collections
            paths = util.find_meta_files(path, recursive, files_first=1)
        
        # Store value of public,status for each collection,entity.
        # Values will be used by entities and files to inherit these values
        # from their parent.
        parents = _parents_status(paths)
        
        # Determine if paths are publishable or not
        paths = _publishable(paths, parents, force=force)
        
        skipped = 0
        successful = 0
        bad_paths = []
        
        num = len(paths)
        for n,path in enumerate(paths):
            oi = path.get('identifier')
            # TODO write logs instead of print
            print('%s | %s/%s %s %s %s' % (
                datetime.now(config.TZ), n+1, num, path['action'], oi.id, path['note'])
            )
            
            if not oi:
                path['note'] = 'No identifier'
                bad_paths.append(path)
                continue
            try:
                document = oi.object()
            except Exception as err:
                path['note'] = 'Could not instantiate: %s' % err
                bad_paths.append(path)
                continue
            if not document:
                path['note'] = 'No document'
                bad_paths.append(path)
                continue
            
            # see if document exists
            existing_v = None
            d = self.get(oi.model, oi.id)
            if d:
                existing_v = d.meta.version
            
            # post document
            if path['action'] == 'POST':
                created = self.post(document, parents=parents, force=True)
                # force=True bypasses _publishable in post() function
            # delete previously published items now marked incomplete/private
            elif existing_v and (path['action'] == 'SKIP'):
                print('%s | %s/%s DELETE' % (datetime.now(config.TZ), n+1, num))
                self.delete(oi.id)
            
            if path['action'] == 'SKIP':
                skipped += 1
                continue
            
            # version is incremented with each update
            posted_v = None
            # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment'
            es_model = ELASTICSEARCH_CLASSES_BY_MODEL[oi.model]._doc_type.name
            d = self.get(es_model, oi.id)
            if d:
                posted_v = d.meta.version

            # success: created, or version number incremented
            status = 'ERROR - unspecified'
            if posted_v and not existing_v:
                status = 'CREATED'
                successful += 1
            elif (existing_v and posted_v) and (existing_v < posted_v):
                status = 'UPDATED'
                successful += 1
            elif not posted_v:
                status = 'ERROR: not created'
                bad_paths.append(path)
                print(status)
            
        logger.debug('INDEXING COMPLETED')
        return {'total':len(paths), 'skipped':skipped, 'successful':successful, 'bad':bad_paths}
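
    # Hedged usage sketch (the path is illustrative): index a collection and its
    # children, then report how many documents were indexed and which paths failed.
    #
    #   ds = Docstore()
    #   report = ds.post_multi('/var/www/media/ddr/ddr-testing-141', recursive=True)
    #   print('%s successful, %s bad' % (report['successful'], len(report['bad'])))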
     
    def exists(self, model, document_id):
        """
        @param model:
        @param document_id:
        """
        return self.es.exists(index=self.indexname, doc_type=model, id=document_id)
     
    def get(self, model, document_id, fields=None):
        """
        @param model:
        @param document_id:
        @param fields: list Only return these fields
        """
        if self.exists(model, document_id):
            ES_Class = ELASTICSEARCH_CLASSES_BY_MODEL[model]
            return ES_Class.get(document_id, using=self.es, index=self.indexname)
        return None

    def count(self, doctypes=[], query={}):
        """Executes a query and returns number of hits.
        
        The "query" arg must be a dict that conforms to the Elasticsearch query DSL.
        See docstore.search_query for more info.
        
        @param doctypes: list Type of object ('collection', 'entity', 'file')
        @param query: dict The search definition using Elasticsearch Query DSL
        @returns raw ElasticSearch query output
        """
        logger.debug('count(index=%s, doctypes=%s, query=%s' % (
            self.indexname, doctypes, query
        ))
        if not query:
            raise Exception("Can't do an empty search. Give me something to work with here.")
        
        doctypes = ','.join(doctypes)
        logger.debug(json.dumps(query))
        
        return self.es.count(
            index=self.indexname,
            doc_type=doctypes,
            body=query,
        )
    
    def delete(self, document_id, recursive=False):
        """Delete a document and optionally its children.
        
        @param document_id:
        @param recursive: True or False
        """
        identifier = Identifier(id=document_id)
        if recursive:
            if identifier.model == 'collection': doc_type = 'collection,entity,file'
            elif identifier.model == 'entity': doc_type = 'entity,file'
            elif identifier.model == 'file': doc_type = 'file'
            query = 'id:"%s"' % identifier.id
            try:
                return self.es.delete_by_query(
                    index=self.indexname, doc_type=doc_type, q=query
                )
            except TransportError:
                pass
        else:
            try:
                return self.es.delete(
                    index=self.indexname, doc_type=identifier.model, id=identifier.id
                )
            except TransportError:
                pass
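
    # Hedged usage sketch (the ID is illustrative): a recursive delete on a collection
    # also removes its entity and file documents via delete_by_query.
    #
    #   ds = Docstore()
    #   ds.delete('ddr-testing-141', recursive=True)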

    def search(self, doctypes=[], query={}, sort=[], fields=[], from_=0, size=MAX_SIZE):
        """Executes a query, get a list of zero or more hits.
        
        The "query" arg must be a dict that conforms to the Elasticsearch query DSL.
        See docstore.search_query for more info.
        
        @param doctypes: list Type of object ('collection', 'entity', 'file')
        @param query: dict The search definition using Elasticsearch Query DSL
        @param sort: list of (fieldname,direction) tuples
        @param fields: str
        @param from_: int Index of document from which to start results
        @param size: int Number of results to return
        @returns raw ElasticSearch query output
        """
        logger.debug('search(index=%s, doctypes=%s, query=%s, sort=%s, fields=%s, from_=%s, size=%s' % (
            self.indexname, doctypes, query, sort, fields, from_, size
        ))
        if not query:
            raise Exception("Can't do an empty search. Give me something to work with here.")
        
        doctypes = ','.join(doctypes)
        logger.debug(json.dumps(query))
        _clean_dict(sort)
        sort_cleaned = _clean_sort(sort)
        fields = ','.join(fields)
        
        results = self.es.search(
            index=self.indexname,
            doc_type=doctypes,
            body=query,
            sort=sort_cleaned,
            from_=from_,
            size=size,
            _source_include=fields,
        )
        return results
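
    # Hedged usage sketch (the query body is illustrative): pass a Query DSL dict plus
    # optional (field, direction) sort tuples; `fields` limits the returned _source.
    #
    #   ds = Docstore()
    #   hits = ds.search(
    #       doctypes=['entity'],
    #       query={'query': {'match': {'title': 'Seattle'}}},
    #       sort=[('id', 'asc')],
    #       fields=['id', 'title'],
    #   )
    #   for hit in hits['hits']['hits']:
    #       print(hit['_id'])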
    
    def reindex(self, source, dest):
        """Copy documents from one index to another.
        
        @param source: str Name of source index.
        @param dest: str Name of destination index.
        @returns: dict Raw response from the reindex operation
        """
        logger.debug('reindex(%s, %s)' % (source, dest))
        
        if self.index_exists(source):
            logger.info('Source index exists: %s' % source)
        else:
            return '{"status":500, "message":"Source index does not exist"}'
        
        if self.index_exists(dest):
            logger.info('Destination index exists: %s' % dest)
        else:
            return '{"status":500, "message":"Destination index does not exist"}'
        
        version = self.es.info()['version']['number']
        logger.debug('Elasticsearch version %s' % version)
        
        # Compare numerically; a plain string comparison misorders versions like '10.0' vs '2.3'.
        if tuple(int(n) for n in version.split('.')[:2]) >= (2, 3):
            logger.debug('new API')
            body = {
                "source": {"index": source},
                "dest": {"index": dest}
            }
            results = self.es.reindex(
                body=json.dumps(body),
                refresh=None,
                requests_per_second=0,
                timeout='1m',
                wait_for_active_shards=1,
                wait_for_completion=False,
            )
        else:
            logger.debug('pre-2.3 legacy API')
            from elasticsearch import helpers
            results = helpers.reindex(
                self.es, source, dest,
                #query=None,
                #target_client=None,
                #chunk_size=500,
                #scroll=5m,
                #scan_kwargs={},
                #bulk_kwargs={}
            )
        return results
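
    # Hedged usage sketch (index names are illustrative): both indices must already
    # exist; on 2.3+ clusters this triggers a server-side _reindex, while older
    # clusters fall back to the client-side helpers.reindex copy.
    #
    #   ds = Docstore()
    #   result = ds.reindex('ddr-20170101', 'ddr-20180101')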
	if jobIDMatch:
		jobID = re.search("\d+\.\d+", jobIDMatch.group())
		jO1 = jobObject(jobID.group(), dateTime.isoformat()) # create new job object with jobID and the time the job started at
		jobDict[jO1.jobID] = jO1 # put this jobObject in the dict under its jobID
	else:
		jobIDMatch =  re.search("\(\d+\.\d+\)", line)
		if(jobIDMatch): # check if logline is related to a job process
			jobID = re.search("\d+\.\d+", jobIDMatch.group())
			if jobID.group() in jobDict:
				jO1 = jobDict[jobID.group()]
				jobTerminateMatch = re.search("Job \d+\.\d+ terminated", line)
				if not jobTerminateMatch:
					jobTerminateMatch = re.search("terminating job \d+\.\d+", line)
				if jobTerminateMatch:
					jO1.setEndTime(dateTime.isoformat())
					if es.exists(index = "htcondor", doc_type = "mongoData", id = jO1.jobID):
						print "jobID: " + str(jobID.group())
						es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.jobShadowStartTime = dateTime", "params" : { "dateTime" : jO1.jobShadowStartTime } } )
						es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.jobEndTime = dateTime", "params" : { "dateTime" : jO1.jobEndTime } })
						es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.jobTimeSHADOW = diff", "params" : { "diff" : jO1.jobTimeShadow } })
						res = es.get(index = "htcondor", doc_type = "mongoData", id = jO1.jobID)
						if "QDate" in res["_source"]:
							jO1.setLagTime((res["_source"])["QDate"])	
							es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.lagTimeSecondsSHADOW = diff", "params" : { "diff" : jO1.lagTimeShadow } })
							es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.lagTimeMinutesSHADOW = diff", "params" : { "diff" : jO1.lagTimeShadow/60.0 } })
							es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.lagTimeHoursSHADOW = diff", "params" : { "diff" : jO1.lagTimeShadow/3600.0 } })
						es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.jobMessages = jM", "params" : { "jM" : jO1.jobMessages } })
				else: # creates new jobMessages entry that includes the jobTimestamp and jobMessage or appends the jobMessage if key already exists
					messageMatch = re.search(": .*", line)
					message = messageMatch.group(0)[2:]
					if dateTime.isoformat() in jO1.jobMessages:
Example #52
0
class IndexMgr(object):
    index_pattern='event_*'
    doc_feedback='log'
    def __init__(self, *args, **kwargs):
        self.server = kwargs.get('host', 'localhost')
        self.es = Elasticsearch([{'host': self.server}])

    def feedback_create(self):
        self.es.indices.delete(index=self.current(), ignore=[400, 404])
        self.es.indices.create(index=self.current(), ignore=[400])

        mapping = {
            "log" : {
                "_timestamp": {
                    "enabled": True
                },
                "properties" : {
                    "vendor" : {
                        "type" : "string",
                        "index" : "not_analyzed"
                    },
                    "URL" : {
                        "type" : "string",
                        "index" : "not_analyzed"
                    },
                    "source" : {
                        "type" : "string",
                        "index" : "not_analyzed"
                    },
                    "appid_action" : {
                        "type" : "string",
                        "index" : "not_analyzed"
                    }
                }
            }
        }

        rt=self.es.indices.put_mapping(index=self.current(), doc_type=self.doc_feedback, body=mapping)
        mp=self.es.indices.get_mapping(index=self.current())
        print 'Index created', rt, '\n'

    def delete(self):
        self.es.indices.delete(index=self.index_pattern, ignore=[400, 404])

    def feedback_get(self, idd):
        doc=None
        try:
            doc = self.es.get(index=self.current(), doc_type=self.doc_feedback, id=idd)
        except elasticsearch.ElasticsearchException as e:
            print 'Get -', e.info
        return doc

    def feedback_exists(self, idd):
        IsExists = False
        try:
            IsExists = self.es.exists(index=self.current(), doc_type=self.doc_feedback, id=idd)
        except elasticsearch.ElasticsearchException as e:
            print 'Exists -', e.info
        except:
            pass
        return IsExists

    def feedback_insert(self, idd, doc):
        res = None
        try:
            res = self.es.index(index=self.current(), doc_type=self.doc_feedback, id=idd, body=doc)
        except elasticsearch.ElasticsearchException as e:
            print 'Insert -', e.info

        return res

    def current(self):
        index = 'event_{0}'.format(date.today().strftime('%Y%m%d'))
        return index
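
# A minimal usage sketch, not part of the original snippet; the host, document id,
# and field values below are illustrative.
#
#   mgr = IndexMgr(host='localhost')
#   mgr.feedback_create()
#   doc = {'vendor': 'acme', 'URL': 'http://example.com/x', 'source': 'syslog', 'appid_action': 'allow'}
#   mgr.feedback_insert('feedback-1', doc)
#   found = mgr.feedback_exists('feedback-1')  # True once the document is indexed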
Example #53
0
class Docstore():

    def __init__(self, hosts=settings.DOCSTORE_HOSTS, index=settings.DOCSTORE_INDEX, connection=None):
        self.hosts = hosts
        self.indexname = index
        if connection:
            self.es = connection
        else:
            self.es = Elasticsearch(hosts)
    
    def health(self):
        return self.es.cluster.health()
    
    def index_exists(self, index):
        return self.es.indices.exists(index=index)
     
    def exists(self, model, document_id):
        """
        @param model:
        @param document_id:
        """
        return self.es.exists(index=self.indexname, doc_type=model, id=document_id)
     
    def get(self, model, document_id, fields=None):
        """
        @param model:
        @param document_id:
        @param fields: list Only return these fields
        """
        if self.exists(model, document_id):
            ES_Class = ELASTICSEARCH_CLASSES_BY_MODEL[model]
            return ES_Class.get(document_id, using=self.es, index=self.indexname)
        return None

    def count(self, doctypes=[], query={}):
        """Executes a query and returns number of hits.
        
        The "query" arg must be a dict that conforms to the Elasticsearch query DSL.
        See docstore.search_query for more info.
        
        @param doctypes: list Type of object ('collection', 'entity', 'file')
        @param query: dict The search definition using Elasticsearch Query DSL
        @returns raw ElasticSearch query output
        """
        logger.debug('count(index=%s, doctypes=%s, query=%s' % (
            self.indexname, doctypes, query
        ))
        if not query:
            raise Exception("Can't do an empty search. Give me something to work with here.")
        
        doctypes = ','.join(doctypes)
        logger.debug(json.dumps(query))
        
        return self.es.count(
            index=self.indexname,
            doc_type=doctypes,
            body=query,
        )
    
    def search(self, doctypes=[], query={}, sort=[], fields=[], from_=0, size=MAX_SIZE):
        """Executes a query, get a list of zero or more hits.
        
        The "query" arg must be a dict that conforms to the Elasticsearch query DSL.
        See docstore.search_query for more info.
        
        @param doctypes: list Type of object ('collection', 'entity', 'file')
        @param query: dict The search definition using Elasticsearch Query DSL
        @param sort: list of (fieldname,direction) tuples
        @param fields: str
        @param from_: int Index of document from which to start results
        @param size: int Number of results to return
        @returns raw ElasticSearch query output
        """
        logger.debug('search(index=%s, doctypes=%s, query=%s, sort=%s, fields=%s, from_=%s, size=%s' % (
            self.indexname, doctypes, query, sort, fields, from_, size
        ))
        if not query:
            raise Exception("Can't do an empty search. Give me something to work with here.")
        
        doctypes = ','.join(doctypes)
        logger.debug(json.dumps(query))
        _clean_dict(sort)
        sort_cleaned = _clean_sort(sort)
        fields = ','.join(fields)
        
        results = self.es.search(
            index=self.indexname,
            doc_type=doctypes,
            body=query,
            sort=sort_cleaned,
            from_=from_,
            size=size,
            _source_include=fields,
        )
        return results
Example #54
0
# Assumed setup, not part of the original snippet (the client, the Faker instance,
# and the index/doc-type names are illustrative):
import random
from elasticsearch import Elasticsearch
from faker import Faker
import phonenumbers
import pycountry
es = Elasticsearch()
INDEX, DOC = 'people', 'person'
fake = Faker()
# try:
#     es.indices.delete(index=INDEX)
# except:
#     pass
es.indices.create(index=INDEX, ignore=400)
for r in range(0,500):
    phone = fake.phone_number().split("x")[0]
    Id =   str(int("".join([ c if c.isdigit() else "" for c in phone])))
    body = dict(phone=Id,
                name=fake.first_name(),
                age=random.choice(range(18,35)),
                gender=random.choice(['male','female']),
                location=fake.city(),
                status=random.choice([1,0]),
                status_message=" ".join(fake.text().split()[:10]))
    p = ("+%s" % Id.strip())
    try:
        phone_number = phonenumbers.parse(p, None)
        locale_code = phonenumbers.region_code_for_country_code(phone_number.country_code)
        country=pycountry.countries.get(alpha2=locale_code)
        body['country_name'] = country.name
        body['locale_code'] = locale_code
        if not (es.exists(index=INDEX, doc_type=DOC, id=Id)):
            es.index(index=INDEX, doc_type=DOC, id=Id, body=body)
    except Exception as e:
        print "Error :%s" % str(e)


Example #55
0
                                        "person either by simply appending the fields of the former to the fields of "
                                        "the latter or by nesting the fields of the former into the document of type"
                                        "person.")
p.add_argument('--index', metavar='<str>', dest='index', type=str, required=True, help='Name of index')
p.add_argument('--node', metavar='<str>', dest='node', type=str, required=True, help='Url and port of node')
args = p.parse_args()

es = Elasticsearch([args.node])
same_as = es.search(index=args.index,
                    doc_type='person',
                    _source=['owl:sameAs'],
                    body='{"query":{"exists":{"field":"owl:sameAs"}}}',
                    size=1000)['hits']['hits']
ref_viaf = dict()
for e in same_as:
    if es.exists(index=args.index, doc_type='person', id=e['_id']):
        print('Retrieving document ' + e['_source']['owl:sameAs'] + ' in order to add it to document ' + e['_id'] + '.')
        query_body = '{"query":{"ids":{"values":["' + e['_source']['owl:sameAs'][21:] + '"]}}}'
        viaf_entry = es.search(index=args.index,
                           doc_type='viaf',
                           _source=True,
                           body=query_body)['hits']['hits'][0]['_source']
        inner_viaf = es.get(index=args.index,
                              doc_type='person',
                              id=e['_id'],
                              _source = True)['_source']
        embedded_viaf = es.get(index=args.index,
                              doc_type='person',
                              id=e['_id'],
                              _source = True)['_source']
        print('Updating document ' + e['_id'] + '.')