Example #1
def reset_and_fill_all_indices():
    all_orgs = [
        'explaain',
        'Matthew_Rusk_62352643',
        'yc',
        'Andrew_Davies_29862274',
        'financialtimes',
    ]
    all_index_names = [org.lower() + '__cards' for org in all_orgs] + [
        org.lower() + '__files' for org in all_orgs
    ] + ['organisations', 'sources', 'users']

    all_card_indices = [db.Cards(org) for org in all_orgs]
    all_file_indices = [db.Files(org) for org in all_orgs]
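    # NOTE: card indices are built but excluded from all_indices below (see the commented-out line)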
    # all_indices = all_card_indices + all_file_indices + [db.Organisations(), db.Sources(), db.Users()]
    all_indices = all_file_indices + [
        db.Organisations(), db.Sources(),
        db.Users()
    ]

    template_type = 'cards'
    template = templates.get_template(template_type)
    pp.pprint(
        es_client.IndicesClient(es).put_template(name=template_type,
                                                 body=template))
    # for org in all_orgs:
    #   index_name = org.lower() + '__cards'
    #   print(index_name)
    # print(json.dumps(es_client.IndicesClient(es).get_mapping(index=index_name, doc_type='card'), indent=2))
    # es_client.IndicesClient(es).close(index=index_name)
    # try:
    #   es_client.IndicesClient(es).put_mapping(index=index_name, doc_type='card', body=cards_template['mappings']['card'])
    #   es_client.IndicesClient(es).put_settings(index=index_name, body=cards_template['settings'])
    # except Exception as e:
    #   print(e)
    # es_client.IndicesClient(es).open(index=index_name)

    template_type = 'files'
    template = templates.get_template(template_type)
    pp.pprint(
        es_client.IndicesClient(es).put_template(name=template_type,
                                                 body=template))
    # for org in all_orgs:
    #   index_name = org.lower() + '__files'
    #   print(index_name)
    # es_client.IndicesClient(es).put_mapping(index=index_name, doc_type='file', body=files_template['mappings']['file'])

    # for index_name in all_index_names:
    #   print(index_name)
    #   if es_client.IndicesClient(es).exists(index=index_name):
    #     es_client.IndicesClient(es).delete(index=index_name)
    #   es_client.IndicesClient(es).create(index=index_name)
    for index in all_indices:
        copy_docs_from_algolia(index=index)
Example #2
    def create(self):
        # create an IndicesClient instance
        indexES = client.IndicesClient(self.es)
        if self.es.indices.exists(index=self.indexNameES):
            #logger.info('index %s already exists', self.indexNameES)
            # the index already exists, but that does not mean the type does
            if self.es.indices.exists_type(index=self.indexNameES,
                                           doc_type=[self.typeNameES]):
                #logger.info('type %s already exists', self.typeNameES)
                # the type already exists; nothing to do
                pass
            else:
                # the type does not exist; create it with the mapping to apply
                #logger.info('type %s does not exist, creating it', self.typeNameES)
                indexES.put_mapping(doc_type=self.typeNameES,
                                    body=self.docMapping,
                                    update_all_types=True)
        else:
            # neither the index nor the type exists (a type cannot exist
            # without an index), so create both
            #logger.info('index %s and type %s do not exist, creating them', self.indexNameES, self.typeNameES)
            indexES.create(index=self.indexNameES)
            # apply the mapping, which is scoped to this index/type
            indexES.put_mapping(doc_type=self.typeNameES,
                                body=self.docMapping,
                                update_all_types=True)
Example #3
    def calc_jm(self, query, queryNo, lamb, sumDocLength):
        jm_scores = {}
        fjm = open("Results/jm_output.txt",'a')
        queryArray = []
        ic = client.IndicesClient(self.es)
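        # Tokenize the raw query string with the index's custom my_english analyzer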
        analyzedResult = ic.analyze(index="ap_dataset",analyzer="my_english",body=query)
        tokenLength = len(analyzedResult['tokens'])
        for i in range(tokenLength):
            queryArray.append(str(analyzedResult['tokens'][i]['token']))

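        # Rank hits with a server-side Groovy script; the script name getJM and the
        # 'lamb' parameter suggest Jelinek-Mercer language-model smoothing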
        queryBody = {
            "query": {
                "function_score": {
                    "query": {"match": {"text": query}},
                    "functions": [{
                        "script_score": {
                            "script": "getJM",
                            "lang": "groovy",
                            "params": {
                                "query": queryArray,
                                "field": "text",
                                "lamb": lamb,
                                "sumdoclength": sumDocLength
                            }
                        }
                    }],
                    "boost_mode": "replace"
                }
            },
            "fields": ["stream_id"]
        }
        jmResult = self.es.search(index="ap_dataset",
                                  doc_type="document",
                                  size=self.search_size,
                                  analyzer="my_english",
                                  body=queryBody)

        resultSize = len(jmResult['hits']['hits'])
        rank = 1
        for i in range(resultSize):

            docId = str(jmResult['hits']['hits'][i]['_id'])
            score = jmResult['hits']['hits'][i]['_score']
            if score != 0:
                fjm.write(queryNo + " Q0 " + docId + " " + str(rank) + " " + str(score) + " Exp\n")
                jm_scores[docId] = score
                rank = rank + 1

        fjm.close()
        return jm_scores
Example #4
    def calc_okapi_bm(self, query, queryNo, avgDocLength, nDocs):
        okapi_bm_scores = {}
        fokapiBM = open("Results/okapiBM_output.txt",'a')
        queryArray = []
        ic = client.IndicesClient(self.es)
        analyzedResult = ic.analyze(index="ap_dataset",analyzer="my_english",body=query)
        tokenLength = len(analyzedResult['tokens'])
        for i in range(tokenLength):
            queryArray.append(str(analyzedResult['tokens'][i]['token']))

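        # getOkapiBM is a server-side Groovy script; the name and parameters suggest
        # Okapi BM (BM25-style) scoring using average document length and document count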
        queryBody = {"query": {"function_score": {"query": {"match": {"text": query}},
            "functions":[{"script_score": {"script": "getOkapiBM", "lang": "groovy",
                "params": {"query": queryArray, "field":"text", "avgLength": avgDocLength, "ndocs" : nDocs}}}],
            "boost_mode": "replace"}}, "fields":["stream_id"]}
        okapiBMResult = self.es.search(index="ap_dataset", doc_type="document", size=self.search_size,
                                       analyzer="my_english", body=queryBody)

        resultSize = len(okapiBMResult['hits']['hits'])
        rank = 1
        for i in range(resultSize):

            docId = str(okapiBMResult['hits']['hits'][i]['_id'])
            score = okapiBMResult['hits']['hits'][i]['_score']
            if score != 0:
                fokapiBM.write(queryNo + " Q0 " + docId + " " + str(rank) + " " + str(score) + " Exp\n")
                okapi_bm_scores[docId] = score
                rank += 1
        fokapiBM.close()
        return okapi_bm_scores
Example #5
    def __init__(self, cfg):
        es = Elasticsearch([{'host': cfg["host"], 'port': cfg["port"]}])
        esClient = client.IndicesClient(es)
        self._es = es
        self._esc = esClient
        self._index = cfg["index"]
Example #6
    def get_size(self, search_service: str = 'elasticsearch'):
        stats = es_client.IndicesClient(es).stats(
            index=self.get_index_name('elasticsearch'))
        # document count on primary shards only (counting replicas would double-count)
        size = stats.get('_all', {}).get('primaries', {}).get('docs', {}).get('count', None)
        return size
Example #7
    def __init__(self, delete_index=True):

        self.es = Elasticsearch()
        if delete_index:
            self.es.indices.delete(index='cnn_news', ignore=[400, 404])
            es_analyzer = client.IndicesClient(self.es)
            es_analyzer.create(index='cnn_news', body=ANALYZER_PARAMS)
Example #8
    def get_index_properties(self):
        """This is now interpreted as get_settings() for Algolia and get_mapping() for ElasticSearch.
        It returns both, as {
          'algolia': {},
          'elasticsearch': {}
        }
        """
        algolia_settings = None
        elasticsearch_mapping = None
        if UsingAlgolia:
            try:
                algolia_settings = self.index.get_settings()
            except Exception as e:
                print(
                    'Algolia: Couldn\'t get settings from index "' +
                    self.get_index_name('algolia') + '". ', e)
                sentry.captureException()
        try:
            elasticsearch_mapping = es_client.IndicesClient(es).get_mapping(
                index=self.get_index_name('elasticsearch'),
                doc_type=self.doc_type)
        except Exception as e:
            print(
                'ElasticSearch: Couldn\'t get mapping from index "' +
                self.get_index_name('elasticsearch') + '". ', e)
            sentry.captureException()
        properties = {
            'algolia': algolia_settings,
            'elasticsearch': elasticsearch_mapping
        }
        return properties
Example #9
def create_index():
    """Create the index."""

    es = Elasticsearch()
    ic = client.IndicesClient(es)

    # Delete the index if it already exists
    try:
        ic.delete(index="earthquake")
    except Exception:
        pass

    # Create the index
    ic.create(
        index="earthquake",
        body={
            "mappings": {
                "properties": {
                    "time":     {"type": "date"},       # origin time of the quake
                    "level":    {"type": "float"},      # magnitude
                    "geo":      {"type": "geo_point"},  # geographic coordinates
                    "deep":     {"type": "float"},      # depth
                    "location": {"type": "text"},       # location description
                    "source":   {"type": "keyword"}     # data source
                }
            }
        }
    )
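A hedged usage sketch: once create_index() has run, a document matching the mapping above could be indexed like this (all field values are invented for illustration):

from datetime import datetime
from elasticsearch import Elasticsearch

es = Elasticsearch()
create_index()
es.index(index="earthquake", body={
    "time": datetime(2021, 5, 21, 21, 48),  # stored in the "date" field
    "level": 6.4,
    "geo": {"lat": 25.67, "lon": 99.87},    # geo_point accepts a lat/lon dict
    "deep": 8.0,
    "location": "Yangbi, Yunnan",
    "source": "CENC"
})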
Example #10
    def __init__(self, host=None, port=None, index=None, index_suffix=None):
        self.host = (host or getattr(conf, 'elasticsearch_host', None)
                     or 'localhost')
        self.port = (port or getattr(conf, 'elasticsearch_port', None) or 9200)
        self.index = (index or getattr(conf, 'elasticsearch_index', None)
                      or 'repoxplorer')
        if index_suffix:
            self.index += "-%s" % index_suffix
        if (getattr(conf, 'elasticsearch_user', None)
                and getattr(conf, 'elasticsearch_password', None)):
            self.http_auth = "%s:%s" % (
                getattr(conf, 'elasticsearch_user', None),
                getattr(conf, 'elasticsearch_password', None))
            # NOTE(dpawlik) Opendistro is using self signed certs,
            # so verify_certs is set to False.
            self.es = client.Elasticsearch(
                [{"host": self.host,
                  "port": self.port,
                  "http_auth": self.http_auth,
                  "use_ssl": True,
                  "verify_certs": False,
                  "ssl_show_warn": True}],
                timeout=60)
        else:
            self.es = client.Elasticsearch(
                [{"host": self.host, "port": self.port}],
                timeout=60)
        self.ic = client.IndicesClient(self.es)
        if not self.ic.exists(index=self.index):
            self.ic.create(index=self.index)
            # Give some time to have the index fully created
            time.sleep(1)
Example #11
def create_index():
    es = Elasticsearch()
    ic = client.IndicesClient(es)

    # Check whether the index exists
    if not ic.exists(index="poetry"):
        # Create the index
        doc = {
            "mappings": {
                "properties": {
                    "title": {
                        "type": "keyword"
                    },
                    "epigraph": {
                        "type": "keyword"
                    },
                    "dynasty": {
                        "type": "keyword"
                    },
                    "author": {
                        "type": "keyword"
                    },
                    "content": {
                        "type": "text"
                    }
                }
            }
        }

        ic.create(index='poetry', body=doc)
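Since title, epigraph, dynasty, and author are keyword fields they match only exact values, while content is analyzed for full-text search. A hedged query sketch (the field values are invented):

es = Elasticsearch()
es.search(index="poetry", body={
    "query": {
        "bool": {
            "filter": [{"term": {"author": "李白"}}],  # exact match on a keyword field
            "must": [{"match": {"content": "月"}}]     # analyzed match on the text field
        }
    }
})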
Example #12
    def calc_frequencies(self, query, docno):
        ic = client.IndicesClient(self.es)
        sum_tf = 0
        sum_df = 0
        sum_ttf = 0

        query_array = []

        analyzed_result = ic.analyze(index="ap_dataset",analyzer="my_english",body=query)
        token_length = len(analyzed_result['tokens'])

        for i in range(token_length):
            query_array.append(str(analyzed_result['tokens'][i]['token']))

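        # Fetch per-term statistics (term_freq, doc_freq, ttf) for this document's 'text' field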
        res = self.es.termvector(index="ap_dataset", doc_type="document", id=docno, term_statistics=True)

        term_dict = res['term_vectors']['text']['terms']

        for term in query_array:
            if term in term_dict:
                sum_tf += term_dict[term]['term_freq']
                sum_df += term_dict[term]['doc_freq']
                sum_ttf += term_dict[term]['ttf']

        return sum_tf, sum_df, sum_ttf
Example #13
    def create_mapping(self):
        """ Creates a mapping specific to the current index AND doc_type.
        No need to check for existence of either based on previous functions"""

        indice = client.IndicesClient(self.es)

        indice.put_mapping(index=self.es_main_index,
                           doc_type=self.es_main_type,
                           body=self.mapping)
Example #14
    def create_index(self):
        """ Check whether index exists, if not create it """

        indice = client.IndicesClient(self.es)

        if not indice.exists(self.es_main_index):
            indice.create(index=self.es_main_index)

        return True
Example #15
    def __init__(self, config):
        if config['is_es_cloud']:
            self.es = Elasticsearch(cloud_id=config['es_cloud_host'],
                                    http_auth=(config['username'],
                                               config['password']))
        else:
            self.es = Elasticsearch(host=config['es_host'],
                                    port=config['es_port'])

        self.indicesClient = client.IndicesClient(self.es)
Example #16
    def __set_es(self):
        es = Elasticsearch(
            hosts=[{'host': self.host, 'port': self.theport}],
            http_auth=self.awsauth,
            connection_class=RequestsHttpConnection)
        if self.__check_es_status(es):
            return es, client.IndicesClient(es)
        else:
            raise Exception("Elasticsearch is not reachable")
Example #17
    def list_indices(self, search_service: str = 'elasticsearch'):
        if search_service == 'algolia':
            return self.client.list_indexes()
        else:
            indices = es_client.IndicesClient(es).get(index='_all')
            # @NOTE: Currently only produces property 'name' for each index
            if not indices:
                return None
            # skip system indices, whose names start with '.'
            return {
                'items': [{'name': key} for key in indices if key[0] != '.']
            }
Example #18
def index(es, ref_gen, table_name, testing=False, file_len=0):
    '''
    Insert values from ref_gen in the Elasticsearch index
    
    INPUT:
        - ref_gen: a pandas DataFrame generator (ref_gen=pd.read_csv(file_path, chunksize=XXX))
        - table_name: name of the Elasticsearch index to insert to
        - (testing): whether or not to refresh the index at each insertion
        - (file_len): original file length, used to display the estimated time remaining
    '''

    ic = client.IndicesClient(es)

    # For efficiency, reset refresh interval
    # see https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-update-settings.html
    if not testing:
        low_refresh = {"index": {"refresh_interval": "-1"}}
        ic.put_settings(low_refresh, table_name)

    # Bulk insert
    logging.info('Started indexing')
    i = 0
    t_start = time.time()
    for ref_tab in ref_gen:
        ref_tab = pre_process_tab(ref_tab)
        body = ''
        for key, doc in ref_tab.where(ref_tab.notnull(),
                                      None).to_dict('index').items():
            #TODO: make function that limits bulk size
            index_order = json.dumps({
                "index": {
                    "_index": table_name,
                    "_type": 'structure',
                    "_id": str(key)
                }
            })
            body += index_order + '\n'
            body += json.dumps(doc) + '\n'
        es.bulk(body)
        i += len(ref_tab)

        # Display progress
        t_cur = time.time()
        eta = (file_len - i) * (t_cur - t_start) / i
        logging.info('Indexed {0} rows / ETA: {1} s'.format(i, eta))

    # Back to default refresh
    if not testing:
        default_refresh = {"index": {"refresh_interval": "1s"}}
        ic.put_settings(default_refresh, table_name)
        es.indices.refresh(index=table_name)
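A minimal driver sketch for index() above, following the docstring's own hint for ref_gen; the file name, chunk size, and row count are hypothetical:

import pandas as pd
from elasticsearch import Elasticsearch

es = Elasticsearch()
ref_gen = pd.read_csv('reference.csv', chunksize=10000)
index(es, ref_gen, table_name='reference', file_len=1000000)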
Example #19
    def _indexDocument(self, text):

        host = self.elasticsearchDomain

        if text:
            service = 'es'
            ss = boto3.Session()
            credentials = ss.get_credentials()
            region = ss.region_name

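            # AWS4Auth signs each request with SigV4 so the AWS-managed Elasticsearch domain accepts it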
            awsauth = AWS4Auth(credentials.access_key,
                               credentials.secret_key,
                               region,
                               service,
                               session_token=credentials.token)

            es = Elasticsearch(hosts=[{
                'host': host,
                'port': 443
            }],
                               http_auth=awsauth,
                               use_ssl=True,
                               verify_certs=True,
                               connection_class=RequestsHttpConnection)

            es_index_client = client.IndicesClient(es)

            document = {
                "documentId": "{}".format(self.documentId),
                "name": "{}".format(self.objectName),
                "bucket": "{}".format(self.bucketName),
                "content": text
            }

            if not es_index_client.exists(index='textract'):
                print("Index 'textract' does not exist, creating...")
                es_index_client.create(
                    index='textract',
                    body={'settings': {
                        'index': {
                            "number_of_shards": 2
                        }
                    }})

            es.index(index="textract",
                     doc_type="document",
                     id=self.documentId,
                     body=document)

            print("Indexed document: {}".format(self.objectName))
Example #20
def template_es():
    # This function puts an index template in place to define the data structure
    print('Create mapping for Elasticsearch')
    connect = connect_to_es()
    if connect:
        interface = client.IndicesClient(connect)
        body = '{"order":0,"template":"*","settings":{},"mappings":{"_default_":{"dynamic_templates":[' \
               '{"string_fields":{"mapping":{"index":"analyzed","type":"string","fields":{"raw":{' \
               '"index":"not_analyzed","type":"string"}}},"match_mapping_type":"string","match":"*"}}]' \
               ',"_all":{"enabled":true}}},"aliases":{}}'
        template = interface.exists_template(ELASTICSEARCH['template'])
        if template:
            print('Mapping exists, using it.')
        else:
            print('Creating map for use!')
            interface.put_template(name=ELASTICSEARCH['template'], body=body)
Example #21
    def __init__(self, host=None, port=None, index=None, index_suffix=None):
        self.host = (host or getattr(conf, 'elasticsearch_host', None)
                     or 'localhost')
        self.port = (port or getattr(conf, 'elasticsearch_port', None) or 9200)
        self.index = (index or getattr(conf, 'elasticsearch_index', None)
                      or 'repoxplorer')
        if index_suffix:
            self.index += "-%s" % index_suffix
        self.es = client.Elasticsearch(
            [{"host": self.host, "port": self.port}],
            timeout=60)
        self.ic = client.IndicesClient(self.es)
        if not self.ic.exists(index=self.index):
            self.ic.create(index=self.index)
            # Give some time to have the index fully created
            time.sleep(1)
Example #22
def test_es_types(init_interpreter, doc_type):
    """
    Check that no field have "text" type
    """
    interpreter = init_interpreter
    parser = interpreter[doc_type].parser
    es = Elasticsearch([{
        "host": config.ES["es.nodes"],
        "port": config.ES["es.port"]
    }])

    indices = client.IndicesClient(es)
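    # resolve the alias registered under the parser's name to the concrete index behind it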
    index_name = list(indices.get_alias(name=parser.name).keys())[0]

    mapping = indices.get_mapping(index=index_name)

    for k, t in list(
            mapping[index_name]["mappings"][doc_type]["properties"].items()):
        assert t["type"] != "text"
Example #23
    def check_type(self):
        """ Checks if the type already exists, if it does, first delete
        the type, then start uploading data """

        indice = client.IndicesClient(self.es)
        print(self.es_main_index)
        if indice.exists_type(index=self.es_main_index,
                              doc_type=self.es_main_type):
            print('Scenario %s already exists, deleting the current one' %
                  self.es_main_type)
            indice.delete_mapping(index=self.es_main_index,
                                  doc_type=self.es_main_type)

            print('Waiting for 10 seconds to ensure the current type is ' +
                  'deleted.')
            time.sleep(10)

        return
Example #24
    def __init__(self, aws_access_code, aws_secret_code):
        self.es = None
        self.client = None

        host = 'search-bueventshowcase-2vvvoxbm5u73pdzuqois4u2aru.us-east-2.es.amazonaws.com'
        awsauth = AWS4Auth(aws_access_code, aws_secret_code, 'us-east-2', 'es')
        try:
            # this is hard coded
            self.indexName = 'defaultevents'
            self.es = Elasticsearch(hosts=[{
                'host': host,
                'port': 443
            }],
                                    http_auth=awsauth,
                                    use_ssl=True,
                                    verify_certs=True,
                                    connection_class=RequestsHttpConnection)
            self.client = client.IndicesClient(self.es)
        except Exception:
            print("Connection to Elasticsearch host %s failed." % host)
Example #25
    def __init__(self, items=None, **kwargs):

        self.bulk_chunk_size = kwargs.get('bulk_chunk_size',
                                          config.bulk_chunk_size)

        self._sort = []

        self.results_per_page = kwargs.get('results_per_page',
                                           config.results_per_page)

        self._querybody = querybuilder.QueryBody()  # sets up the new query bodies

        if kwargs.get('base_obj'):
            self.base_obj = kwargs.get('base_obj')
        else:
            try:
                self.base_obj = self.__class__.__model__
            except AttributeError:
                raise AttributeError(
                    'Base object must contain a model or pass base_obj')

        self._es = Elasticsearch(config.dsn)
        self._esc = client.IndicesClient(self._es)

        if '__index__' in dir(self.base_obj):
            idx = self.base_obj.__index__
        else:
            idx = config.default_index

        self._search_params = []
        self._raw = {}
        self.idx = idx
        self.type = self.base_obj.__type__
        self._special_body = {}
        self._items = items if items is not None else []  # special list of items that can be committed in bulk

        # these values are used in the _build_body() to determine where additional _build_body()
        # options should exist. Defaults to and/must
        self._last_top_level_boolean = None
        self._last_boolean = None
Example #26
    def calc_okapi_tf(self, query, query_no, avg_doc_length):
        """
        Calculates the OkapiTf scores
        :param query: str
        :param query_no: str (used verbatim in the output line)
        :param avg_doc_length: float
        :return: okapi_tf_scores: dict mapping doc id to score
        """
        okapi_tf_scores = {}
        f_okapi_tf = open("Results/okapi_tf_output.txt",'a')
        query_array = []
        ic = client.IndicesClient(self.es)

        analyzed_result = ic.analyze(index="ap_dataset",analyzer="my_english",body=query)
        token_length = len(analyzed_result['tokens'])
        for i in range(token_length):
            query_array.append(str(analyzed_result['tokens'][i]['token']))

        query_body = {"query":
                          {"function_score": {"query": {"match": {"text": query}},
                                              "functions": [
                                                  {"script_score":
                                                      {"script": "getOkapiTF", "lang": "groovy",
                                                       "params": {"query": query_array, "field": "text",
                                                                  "avgLength": avg_doc_length}}}],
                                              "boost_mode": "replace"}}, "fields":["stream_id"]}

        okapi_result = self.es.search(index="ap_dataset", doc_type="document", size=self.search_size,
                                      analyzer="my_english", body=query_body)
        result_size = len(okapi_result['hits']['hits'])

        rank = 1
        for i in range(result_size):
            doc_id = str(okapi_result['hits']['hits'][i]['_id'])
            score = okapi_result['hits']['hits'][i]['_score']
            if score != 0:
                f_okapi_tf.write(query_no + " Q0 " + doc_id + " " + str(rank) + " " + str(score) + " Exp\n")
                okapi_tf_scores[doc_id] = score
                rank += 1
        f_okapi_tf.close()
        return okapi_tf_scores
Example #27
    def delete(self, type):
        # make sure index exists
        indice = client.IndicesClient(self.es)
        try:
            if indice.exists(self.es_main_index):
                # if type is 'all' delete everything
                if type == 'all':
                    try:
                        self.es.delete_by_query(index=self.es_main_index,
                                                body=match_all,
                                                conflicts='proceed')
                        print('Deleted ' + self.es_main_index)
                        return True
                    except ConnectionError:
                        print('There was a connection error. Check your '
                              'Elasticsearch settings and make sure '
                              'Elasticsearch is running.')
                        return False
                elif type:
                    try:
                        if indice.exists_type(index=self.es_main_index,
                                              doc_type=type):
                            self.es.delete_by_query(index=self.es_main_index,
                                                    doc_type=type,
                                                    body=match_all,
                                                    conflicts='proceed')
                            print('Deleted ' + self.es_main_index + '/' + type)
                            return True
                    except ConnectionError:
                        print('There was a connection error. Check your '
                              'Elasticsearch settings and make sure '
                              'Elasticsearch is running.')
                        return False
        except TransportError:
            print('Incorrect username or password')
            return False
Example #28
def create_index(es,
                 table_name,
                 columns_to_index,
                 default_analyzer='keyword',
                 analyzer_index_settings=None,
                 force=False):
    '''
    Create a new empty Elasticsearch index (used to host documents)
    
    INPUT:
        - es: an Elasticsearch connection
        - table_name: name of the index in Elasticsearch
        - columns_to_index: dict containing the columns to index and as values
            the analyzers to use in addition to the default analyzer
            
            Ex: {'col1': {'analyzerA', 'analyzerB'}, 
                 'col2': {}, 
                 'col3': 'analyzerB'}
        - force: whether or not to delete and re-create an index if the name 
                is already associated to an existing index
    
    '''

    ic = client.IndicesClient(es)

    if ic.exists(table_name) and force:
        ic.delete(table_name)

    if not ic.exists(table_name):
        index_settings = gen_index_settings(default_analyzer, columns_to_index,
                                            analyzer_index_settings)
        try:
            ic.create(table_name, body=json.dumps(index_settings))
        except Exception as e:
            new_message = e.__str__() + '\n\n(MERGE MACHINE)--> This may be due to ' \
                            'ES resource not being available. ' \
                            'Run es_gen_resource.py (in sudo) for this to work'
            raise Exception(new_message)
Example #29
    def indexDocument(self, text, entitiesToIndex):
        
        if(self.elasticsearchDomain):

            host = self.elasticsearchDomain

            if(text):
                service = 'es'
                ss = boto3.Session()
                credentials = ss.get_credentials()
                region = ss.region_name

                awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                                region, service, session_token=credentials.token)

                es = Elasticsearch(
                    hosts=[{'host': host, 'port': 443}],
                    http_auth=awsauth,
                    use_ssl=True,
                    verify_certs=True,
                    connection_class=RequestsHttpConnection
                )

                es_index_client = client.IndicesClient(es)

                document = {
                    "documentId": "{}".format(self.documentId),
                    "name": "{}".format(self.objectName),
                    "bucket": "{}".format(self.bucketName),
                    "content": text
                }

                # add comprehend entities while indexing the document
                for key, val in entitiesToIndex.items():
                    key = key.lower()
                    if(key == "date"):
                        for date in val:
                            date_object = format_date(date)
                            if(date_object!= UNSUPPORTED_DATE_FORMAT):
                                if(key not in document):
                                    document[key] = []
                                document[key].append(date_object.strftime("%Y-%m-%d"))
                        print("Document with Converted dates: {}".format(document))
                    else:
                        document[key] = val
                    
                try:
                    if not es_index_client.exists(index='textract'):
                        print("Index 'textract' does not exist, creating...")
                        es_index_client.create(
                            index="textract",
                            body={
                                "settings": {
                                    "index": {
                                        "number_of_shards": 2
                                    }
                                },
                                "mappings": {
                                    "document": {
                                        "properties": {
                                            "date": {
                                                "type": "date",
                                                "format": "M'/'dd'/'yyyy||date||year||year_month||dd MMM yyyy||dd'/'MM'/'yyyy||yyyy'/'MM'/'dd||dd'/'MM'/'YY||year_month_day||MM'/'dd'/'yy||dd MMM||MM'/'yyyy||M-dd-yyyy||MM'/'dd'/'yyyy||M||d'/'MM'/'yyyy||MM'/'dd'/'yy"
                                            }
                                        }
                                    }
                                }
                            })

                    es.index(index="textract", doc_type="document",
                            id=self.documentId, body=json.loads(json.dumps(document)))

                    print("Indexed document: {}".format(self.objectName))
                except Exception as E:
                    print("Failed to create index with desired mapping {}".format(E))
        else:
            print("Document not indexed {}".format(self.elasticsearchDomain))
Example #30
def write_to_es(file_name):
    """
    input parmeter : filename
    Write user documents of a logfile into ES
    """
    try:
        # Get file in folder
        log_dir = os.path.join('logs', file_name)
        f = open(log_dir, 'r')
        # read file
        try:
            # Find all indices
            index_client = client.IndicesClient(es_con)
            print(index_client)
            index_name = 'demo_1'
            #check Index exist or not on list of indices
            if not index_client.exists(index=index_name):
                # create new mapping for new index
                body_dict = {
                    "mappings": {
                        "user": {
                            "dynamic_templates": [{
                                "string_template": {
                                    "match_mapping_type": "string",
                                    "mapping": {
                                        "index": "not_analyzed",
                                        "type": "string"
                                    },
                                    "match": "*"
                                }
                            }]
                        }
                    }
                }
                # create new index
                index_client.create(index=index_name, body=body_dict)
                # Refresh Index
                index_client.refresh(index=index_name)
            es_doc_list = []
            # get all user doc's one by one from logfile
            for each_dict in f:
                try:
                    user_dict = json.loads(each_dict)
                    uid = int(user_dict['uid'])
                    # Update the datetime of the user doc on each action
                    user_dict['updated'] = datetime.now()
                    try:
                        # check user exist or not
                        uid_exists = es_con.exists(index=index_name,
                                                   doc_type="user",
                                                   id=uid)
                    except:
                        uid_exists = None

                    if uid_exists:
                        # update user doc
                        es_doc = {
                            "_op_type": "update",
                            "_index": index_name,
                            "_type": "user",
                            "_id": uid,
                            "script":
                            "ctx._source['name']=name\n ctx._source['age'] = age\n ctx._source['gender'] = gender\n ctx._source['mobile'] = mobile\n ctx._source.events.add(events)\n ctx._source['updated'] = updated",
                            "params": {
                                "name": user_dict['name'],
                                "age": user_dict['age'],
                                "gender": user_dict['gender'],
                                "mobile": user_dict['mobile'],
                                "events": user_dict['events'],
                                "updated": user_dict['updated']
                            }
                        }

                    else:
                        # create new user doc
                        es_doc = {
                            "_index": index_name,
                            "_type": "user",
                            "_id": uid,
                            "_source": user_dict
                        }

                    es_doc_list.append(es_doc)
                    # Insert document on every BULK_INSERT_SIZE
                    if (len(es_doc_list) == BULK_INSERT_SIZE):
                        helpers.bulk(es_con, es_doc_list)
                        es_doc_list = []

                except ValueError as e:
                    print(e)
                    pass

            # Insert remain documents
            if es_doc_list:
                helpers.bulk(es_con, es_doc_list)
                es_doc_list = []

        except (ImproperlyConfigured, ElasticsearchException) as e:
            print(e)
            pass
        f.close()
    except IOError as e:
        print(e)
        pass