Example #1
    def test_initial_search_error(self):
        with patch.object(self, "client") as client_mock:
            client_mock.search.return_value = {
                "_scroll_id": "dummy_id",
                "_shards": {"successful": 4, "total": 5},
                "hits": {"hits": [{"search_data": 1}]},
            }
            client_mock.scroll.side_effect = self.mock_scroll_responses

            data = list(
                helpers.scan(
                    self.client, index="test_index", size=2, raise_on_error=False
                )
            )
            self.assertEqual(data, [{"search_data": 1}, {"scroll_data": 42}])

            client_mock.scroll.side_effect = self.mock_scroll_responses
            with self.assertRaises(ScanError):
                data = list(
                    helpers.scan(
                        self.client, index="test_index", size=2, raise_on_error=True
                    )
                )
                self.assertEqual(data, [{"search_data": 1}])
                client_mock.scroll.assert_not_called()
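
Both tests above lean on a self.mock_scroll_responses fixture that is not shown in this snippet. A minimal sketch of what such a fixture could look like (the payloads are assumptions chosen to match the assertions: one scroll page with a partial shard failure, then an empty page):

    def setUp(self):
        super().setUp()
        # First scroll: only 4/5 shards succeeded but a hit is still returned;
        # second scroll: no hits, so the scan terminates.
        self.mock_scroll_responses = [
            {
                "_scroll_id": "dummy_id",
                "_shards": {"successful": 4, "total": 5, "skipped": 0},
                "hits": {"hits": [{"scroll_data": 42}]},
            },
            {
                "_scroll_id": "dummy_id",
                "_shards": {"successful": 4, "total": 5, "skipped": 0},
                "hits": {"hits": []},
            },
        ]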
Example #2
    def test_clear_scroll(self):
        bulk = []
        for x in range(4):
            bulk.append({"index": {"_index": "test_index", "_type": "_doc"}})
            bulk.append({"value": x})
        self.client.bulk(bulk, refresh=True)

        with patch.object(
            self.client, "clear_scroll", wraps=self.client.clear_scroll
        ) as spy:
            list(helpers.scan(self.client, index="test_index", size=2))
            spy.assert_called_once()

            spy.reset_mock()
            list(
                helpers.scan(self.client, index="test_index", size=2, clear_scroll=True)
            )
            spy.assert_called_once()

            spy.reset_mock()
            list(
                helpers.scan(
                    self.client, index="test_index", size=2, clear_scroll=False
                )
            )
            spy.assert_not_called()
Example #3
    def test_logger(self, logger_mock):
        bulk = []
        for x in range(4):
            bulk.append({"index": {"_index": "test_index", "_type": "_doc"}})
            bulk.append({"value": x})
        self.client.bulk(bulk, refresh=True)

        with patch.object(self.client, "scroll") as scroll_mock:
            scroll_mock.side_effect = self.mock_scroll_responses
            list(
                helpers.scan(
                    self.client,
                    index="test_index",
                    size=2,
                    raise_on_error=False,
                    clear_scroll=False,
                )
            )
            logger_mock.warning.assert_called()

            scroll_mock.side_effect = self.mock_scroll_responses
            try:
                list(
                    helpers.scan(
                        self.client,
                        index="test_index",
                        size=2,
                        raise_on_error=True,
                        clear_scroll=False,
                    )
                )
            except ScanError:
                pass
            logger_mock.warning.assert_called()
Example #4
File: ESio.py Project: youyou35/swallow
    def scan_and_queue(self,p_queue,p_index,p_query={},p_doctype=None,p_scroll_time='5m',p_timeout='1m'):
        """Reads docs from an es index according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_scroll_time:    Time for scroll method
            p_timeout:        Timeout - After this period, scan context is closed
            p_index:        Index where items are picked from
            p_doctype:        DocType of the items
            p_query:        ElasticSearch query for scanning the index
        """
        try:
            param = [{'host':self.host,'port':self.port}]
            es = Elasticsearch(param)
            logger.info('Connected to ES Server for reading: %s',json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server for reading: %s',json.dumps(param))
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            if p_doctype is not None:
                documents = helpers.scan(client=es, query=p_query, size=1000, scroll=p_scroll_time, index=p_index, doc_type=p_doctype, timeout=p_timeout)
            else:
                documents = helpers.scan(client=es, query=p_query, size=1000, scroll= p_scroll_time, index=p_index, timeout=p_timeout)
            for doc in documents:
                logger.debug(doc)
                p_queue.put(doc)
        except Exception as e:
            logger.info("Error while scanning ES index %s with query %s",p_index,p_query)
Example #5
    def test_scroll_error(self):
        bulk = []
        for x in range(4):
            bulk.append({"index": {"_index": "test_index", "_type": "_doc"}})
            bulk.append({"value": x})
        self.client.bulk(bulk, refresh=True)

        with patch.object(self.client, "scroll") as scroll_mock:
            scroll_mock.side_effect = self.mock_scroll_responses
            data = list(
                helpers.scan(
                    self.client,
                    index="test_index",
                    size=2,
                    raise_on_error=False,
                    clear_scroll=False,
                )
            )
            self.assertEqual(len(data), 3)
            self.assertEqual(data[-1], {"scroll_data": 42})

            scroll_mock.side_effect = self.mock_scroll_responses
            with self.assertRaises(ScanError):
                data = list(
                    helpers.scan(
                        self.client,
                        index="test_index",
                        size=2,
                        raise_on_error=True,
                        clear_scroll=False,
                    )
                )
            self.assertEqual(len(data), 3)
            self.assertEqual(data[-1], {"scroll_data": 42})
Example #6
 def test_general_kwargs_forwarded_to_search(self):
     inexistent_index = 'test_index_123'
     self.assertRaises(
         NotFoundError,
         lambda: list(helpers.scan(self.client, index=inexistent_index, doc_type="answers", size=2))
     )
     global_kwargs = {'ignore': 404}
     list(helpers.scan(self.client, index=inexistent_index, doc_type="answers", size=2, global_kwargs=global_kwargs))
Example #7
def generate_all_doc(_es_instance, _my_index, _my_type="_all"):
	if _my_type == "_all":
		for _doc in scan(_es_instance, index=_my_index,
		                 query={"query": {"match_all": {}}}):
			yield _doc
	else:
		for _doc in scan(_es_instance, index=_my_index, doc_type=_my_type,
		               query={"query": {"match_all": {}}}):
			yield _doc
Example #8
 def test_migrate(self):
     real_names = self.client.indices_manager.real_names('slingshot')
     docs = list(scan(self.client, index='slingshot'))
     self.assertEqual(3, len(docs))
     self.client.indices_manager.migrate('slingshot', CONFIG)
     self.assertNotEqual(real_names, self.client.indices_manager.real_names('slingshot'))
     self.client.indices.refresh('slingshot')
     docs = list(scan(self.client, index='slingshot'))
     self.assertEqual(3, len(docs))
     self.client.indices.refresh('slingshot')
Example #9
 def test_general_kwargs_forwarded_to_scroll(self):
     with self.assertRaises(NotFoundError):
         for page in helpers.scan(self.client, index="test_index", doc_type="answers", size=2):
             # Deleting the index after first request was done makes sure
             # we test the scroll method.
             self.client.indices.delete('test_index', ignore=404)
     self.setUp()
     # Still raises a scanning error, but gets to that point only because
     # ignore=404 was forwarded to scroll.
     with self.assertRaises(helpers.ScanError):
         for page in helpers.scan(self.client, index="test_index", doc_type="answers", size=2, global_kwargs={'ignore': 404}):
             self.client.indices.delete('test_index', ignore=404)
Example #10
def generate_all_doc_list(_es_instance, _my_index, _my_type="_all"):
	_docs_lst = []
	if _my_type == "_all":
		for _doc in scan(_es_instance, index=_my_index,
		                 query={"query": {"match_all": {}}}):
			_docs_lst.append(_doc)
	else:
		for _doc in scan(_es_instance, index=_my_index, doc_type=_my_type,
		                 query={"query": {"match_all": {}}}):
			_docs_lst.append(_doc)

	return _docs_lst
Example #11
File: ESio.py Project: pagesjaunes/swallow
    def scan_and_queue(self, p_queue, p_index, p_query={}, p_doctype=None, p_scroll_time='5m', p_timeout='1m', p_size=100, p_overall_timeout=30, p_nbmax_retry=3):
        """Reads docs from an es index according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_scroll_time:    Time for scroll method
            p_timeout:        Timeout - After this period, scan context is closed
            p_index:        Index where items are picked from
            p_doctype:        DocType of the items
            p_query:        ElasticSearch query for scanning the index
        """
        logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        try:
            param = [{'host': self.host, 'port': self.port, 'timeout': p_overall_timeout, 'max_retries': p_nbmax_retry, 'retry_on_timeout': True}]
            if self.proxy is None:
                es = Elasticsearch(param)
            else:
                es = Elasticsearch(param, connection_class=MyConnection, proxies={'http': self.proxy})
            es.ping()
            logger_mp.info('Connected to ES Server for reading: {0}'.format(json.dumps(param)))
        except Exception as e:
            logger_mp.error('Connection failed to ES Server for reading: {0}'.format(json.dumps(param)))
            logger_mp.error(e)

        try:
            if not self.scroll_docs:
                if p_doctype is not None:
                    self.scroll_docs = helpers.scan(client=es, query=p_query, size=p_size, scroll=p_scroll_time, index=p_index, doc_type=p_doctype, timeout=p_timeout)
                else:
                    self.scroll_docs = helpers.scan(client=es, query=p_query, size=p_size, scroll=p_scroll_time, index=p_index, timeout=p_timeout)

            start = time.time()
            for doc in self.scroll_docs:
                p_queue.put(doc)

                elapsed = time.time() - start

                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    nb_items = self.counters['nb_items_scanned'].value
                    self.counters['scan_time'].value += elapsed

                    if nb_items % self.counters['log_every'] == 0:
                        logger_mp.info("Scan : {0} items".format(nb_items))
                        logger_mp.debug("   -> Avg scan time : {0}ms".format(1000 * self.counters['scan_time'].value / nb_items))

                    # Start timers reinit
                    start = time.time()

        except Exception as e:
            logger_mp.info("Error while scanning ES index %s with query %s", p_index, p_query)
            with self.counters['nb_items_error'].get_lock():
                self.counters['nb_items_error'].value += 1
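
The worker above assumes self.counters is a dict of process-safe counters shared between workers; a minimal sketch of such a setup (only the key names come from the code, the construction itself is an assumption):

import multiprocessing

counters = {
    'nb_items_scanned': multiprocessing.Value('i', 0),   # items read from ES
    'scan_time': multiprocessing.Value('d', 0.0),        # cumulative seconds spent scanning
    'nb_items_error': multiprocessing.Value('i', 0),     # scan failures
    'log_every': 1000,                                    # plain int, only read by the workers
}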
Example #12
File: suburban.py Project: igoral5/synchro
 def load_old(self):
     query = {'query': {'prefix': {'_id': '%d:' % group_code}}}
     try:
         res1 = es_client.count(index=name_index, doc_type='geometry', body=query)
         res2 = es_client.count(index=name_index, doc_type='route', body=query)
         if res1['count'] > res2['count']:
             for geometry in scan(es_client, query, '10m', index=name_index, doc_type='geometry'):
                 self.es_geometry[geometry['_id']] = geometry['_source']['points']
         else:
             for route in scan(es_client, query, '10m', index=name_index, doc_type='route'):
                 self.es_geometry[route['_id']] = route['_source']['geometry']
     except:
         logger.error(u'Error reading geometry from ElasticSearch')
Example #13
    def doc_feeder(self, index_type=None, index_name=None, step=10000, verbose=True, query=None, scroll='10m', **kwargs):
        conn = self.conn
        index_name = index_name or self.ES_INDEX_NAME
        doc_type = index_type or self.ES_INDEX_TYPE

        n = self.count(query=query)['count']
        cnt = 0
        t0 = time.time()
        if verbose:
            print('\ttotal docs: {}'.format(n))

        _kwargs = kwargs.copy()
        _kwargs.update(dict(size=step, index=index_name, doc_type=doc_type))
        res = helpers.scan(conn, query=query, scroll=scroll, **_kwargs)
        t1 = time.time()
        for doc in res:
            if verbose and cnt % step == 0:
                if cnt != 0:
                    print('done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1)))
                print('\t{}-{}...'.format(cnt+1, min(cnt+step, n)), end='')
                t1 = time.time()
            yield doc
            cnt += 1
        if verbose:
            print('done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1)))
            print("Finished! [{}]".format(timesofar(t0)))
Example #14
  def searchTweets(keyword, latlondist):
    #Variables that contains the user credentials to access Twitter API 
    if TwitterHelper.AWS_ACCESS_KEY == None:
      raise KeyError("Please set the AWS_ACCESS_KEY env. variable")
    
    if TwitterHelper.AWS_SECRET_KEY == None:
      raise KeyError("Please set the AWS_SECRET_KEY env. variable")

    s = Search()
    if latlondist != None:
      locJson = json.loads(latlondist)
      s = s.query({"filtered" : {"query" : {"match_all" : {}}, "filter" : {"geo_distance" : {"distance" : locJson['dist'], "location" : {"lat" : locJson['lat'], "lon" : locJson['lon']}}}}})

    if keyword != None:
      q = Q("match_phrase", text = keyword)
      s = s.query(q)
    
    scanResp = None
    scanResp = helpers.scan(client = TwitterHelper.ES, query = s.to_dict(), scroll = "1m", index = "tweets", timeout = "1m")

    arr = []
    for resp in scanResp:
      hit = resp['_source']
      d = {}
      d['name'] = hit['name']
      d['text'] = hit['text']
      d['sentiment'] = hit['sentiment']
      d['lat'] = hit['location']['lat']
      d['lon'] = hit['location']['lon']
      arr.append(d)
    allD = {}
    allD['tweets'] = arr
    mapInput = json.dumps(allD)
    return mapInput
Example #15
def query_and_dump_results(args):
    es = Elasticsearch([args.hostname + ':' + str(args.port)])

    query = '{"query":{"match_all":{}}}'
    if args.query is not None:
        query = args.query

    doc_type = None
    if args.doc_type is not None:
        doc_type = args.doc_type

    target = "output.csv"
    if args.target is not None:
        target = args.target

    res = es.count(index=args.index, body=query)
    nhits = res['count']

    counter = 0
    bar = progressbar.ProgressBar(max_value=nhits)

    res = helpers.scan(es, index=args.index, query=query, doc_type=doc_type)
    fields = args.fields.split(',')
    with open(target, 'w') as csvfile:
        datawriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
        datawriter.writerow(fields)
        for item in res:
            item = item['_source']
            datawriter.writerow([get_var(item, field) for field in fields])

            counter += 1
            bar.update(counter)
        bar.finish()
Example #16
def get_venues_by_location(location, radius, hour_for_cache):
    """Return a list of venues oid that are in location ('latitude,longitude')
    in the given radius. Cache results for one hour.
    """
    lat_lon = location.split(',')
    body = {
        "query": {
            "filtered": {
                "query": {"match_all": {}},
                "filter": {
                    "geo_distance": {
                        "distance": str(radius) + 'km',
                        "location": {
                            "lat": float(lat_lon[0]),
                            "lon": float(lat_lon[1])
                        }
                    }
                }
            }
        },
        "_source": {
            "include": ["oid"]
        }
    }
    try:
        result = scan(
            es,
            index='lac',
            doc_type='geo_location',
            query=body,
            size=500)
        return [v['_source']['oid'] for v in result]
    except Exception as e:
        log.exception(e)
        return []
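
The docstring mentions one-hour caching, but no cache appears in the snippet, so hour_for_cache is presumably consumed by a memoizing decorator defined elsewhere. One way such a pattern is commonly written (a sketch, not the project's actual implementation):

from functools import lru_cache
from datetime import datetime

@lru_cache(maxsize=4096)
def venues_by_location_cached(location, radius, hour_for_cache):
    # hour_for_cache only takes part in the cache key, so entries are
    # effectively recomputed once per hour.
    return get_venues_by_location(location, radius, hour_for_cache)

# venues = venues_by_location_cached('41.38,2.17', 5, datetime.utcnow().hour)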
Example #17
def main():
    s_re = scan(es_user_portrait, query={'query':{'match_all':{}}, 'size':1}, index=index_name, doc_type=index_type)
    count = 0
    while True:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            task_name = scan_re['task_name']
            history_status = json.loads(scan_re['history_status'])
            #iter history status
            new_history_status = []
            for history_item in history_status:
                history_item_last = history_item[-1]
                if history_item_last == u'':
                    new_history_item = history_item[:-1]
                    new_history_item.append("0")
                    new_history_status.append(new_history_item)
                else:
                    new_history_status.append(history_item)
                    new_history_item = history_item
            print 'new_history_status:', new_history_status
            es_user_portrait.update(index=index_name, doc_type=index_type, \
                    id=task_name, body={'doc':{'history_status': json.dumps(new_history_status)}})
        except StopIteration:
            print 'all done'
            break
        except Exception as e:
            raise e
    print 'count:', count
Example #18
File: fb_trend.py Project: curtisbeck/bin
 def facebook_trend_corpus(self):
     return scan(self._es,
                 query={"query": {"match_all": {}}},
                 index="signals_time_series_20160601",
                 doc_type="facebookTrend",
                 _source=['url', 'parentPageId']
                 )
Example #19
def scan_topic2redis():
    count = 0
    s_re = scan(es_user_portrait, query={'query':{'match_all': {}}, 'size':1000}, index=portrait_index_name, doc_type=portrait_index_type)
    start_ts = time.time()
    hmset_dict = {}
    while True:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            uid = scan_re['uid']
            topic_ch_string = scan_re['topic_string']
            topic_ch_list = topic_ch_string.split('&')
            topic_en_string = [topic_ch2en_dict[item] for item in topic_ch_list]
            hmset_dict[uid] = json.dumps(topic_en_string)
            if count % 1000 == 0 and count != 0:
                R_TOPIC.hmset(r_topic_name, hmset_dict)
                end_ts = time.time()
                print '%s sec count 1000' % (end_ts - start_ts)
        except StopIteration:
            if hmset_dict:
                R_TOPIC.hmset(r_topic_name, hmset_dict)
                hmset_dict = {}
            break
        except Exception as e:
            raise e
            break
    if hmset_dict:
        R_TOPIC.hmset(r_topic_name, hmset_dict)
    print 'all count:', count
Example #20
 def __iter__(self, item_type=None):
     query = {
         'fields': ['uuid'],
         'filter': {'term': {'item_type': item_type}} if item_type else {'match_all': {}},
     }
     for hit in scan(self.es, query=query):
         yield hit['fields']['uuid'][0]
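
This example uses the pre-2.x query DSL (top-level 'fields' and 'filter'). On current clusters the same iteration would roughly be written with _source filtering instead (an approximate equivalent, not part of the original project):

from elasticsearch.helpers import scan

def iter_uuids(es, item_type=None):
    query = {
        '_source': ['uuid'],
        'query': {'term': {'item_type': item_type}} if item_type else {'match_all': {}},
    }
    for hit in scan(es, query=query):
        yield hit['_source']['uuid']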
Example #21
def update_attribute_day():
    # scan the user_portrait and bulk action to update
    status = False
    results = {}
    count = 0
    index_name = "user_portrait"
    index_type = "user"
    s_re = scan(es, query={"query": {"match_all": {}}, "size": 1000}, index=index_name, doc_type=index_type)
    while True:
        bulk_action = []
        while True:
            try:
                scan_re = s_re.next()["_source"]
                count += 1
            except StopIteration:
                print "all done"
                if bulk_action:
                    # print 'bulk_action:', bulk_action
                    status = save_user_results(bulk_action)
                    # print 'status:', status
                sys.exit(0)
            except Exception, r:
                print Exception, r
                sys.exit(0)
            uid = scan_re["uid"]
            user_info = {"uid": uid}
            evaluate_result = get_evaluate_index(user_info, status="update")
            results = {}
            results = dict(results, **evaluate_result)
            action = {"update": {"_id": str(uid)}}
            bulk_action.extend([action, {"doc": results}])
Example #22
File: input.py Project: redref/tantale
    def freshness_iterator(
        self, query, outdated_status, prefix, start_time, timeout
    ):
        """
        Make a scan query then manipulate hits
        Yield on all modified hits
        """
        self.elasticclient.indices.refresh(
            index=self.status_index, ignore_unavailable=True)

        start_time = start_time * 1000
        now = int(time.time()) * 1000

        for hit in helpers.scan(
            self.elasticclient,
            index=self.status_index,
            size=self.batch_size,
            query=query,
            scroll="%ss" % timeout,
        ):
            # Startup grace_time handle
            if 'last_check' not in hit['_source']:
                hit['_source']['last_check'] = hit['_source']['timestamp']

            if (
                hit['_source']['last_check'] < start_time and
                now < (
                    start_time + hit['_source']['last_check'] -
                    hit['_source']['freshness'])
            ):
                continue

            hit['_op_type'] = 'update'
            hit['doc'] = {}

            # Update status only if OK before
            if hit['_source']['status'] == 0:
                hit['doc']['status'] = outdated_status
            else:
                hit['doc']['status'] = hit['_source']['status']

            hit['doc']['timestamp'] = int(time.time()) * 1000

            hit['doc']['output'] = prefix + hit['_source']['output']

            # Build a log entry
            log = {}
            for field in Check.log_fields:
                if field in ('timestamp', 'status', 'output'):
                    log[field] = hit['doc'][field]
                else:
                    log[field] = hit['_source'][field]

            del hit['_source']

            yield hit

            # Update OK
            # Forward update to _send_to_logs
            self.logs.append(log)
Example #23
def main(flag):
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    r = redis.StrictRedis()

    if flag == '1':
        body = {'query': {'match_all': {}}}
        result = scan(es, index='throwtable', doc_type='implementation',
            query=body)
        for p in result:
            r.sadd('pkgs', p['_source']['instruction']['package'])
    else:
        result = json.load(open('list.json'))
        for p in result:
            r.sadd('pkgs', p)

    samples = r.srandmember('pkgs', 100)
    for pkgName in samples:
        r.sadd('samples', pkgName)
        continue
        print '================'
        printPkgContent(pkgName)
        result = []
        while True:
            answer = getUserInput("Article? ")
            if answer == '':
                break
            page = queryWikipedia(answer)
            answer = getUserInput("Do you mean: %s page? ")
            if answer is True:
                result.append(page)
                break
        if len(result) != 0:
            r.sadd('%s:map' % pkgName, *result)
Example #24
def scan_es2redis_month():
    count = 0
    s_re = scan(es_user_portrait, query={'query':{'match_all': {}}, 'size':1000}, index=portrait_index_name, doc_type=portrait_index_type)
    start_ts = time.time()
    user_info = {}
    while True:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            uid = scan_re['uid']
            user_info[uid] = {'fansnum':scan_re['fansnum'], 'topic_string':scan_re['topic_string']}
            update_month_redis.lpush(UPDATE_MONTH_REDIS_KEY, json.dumps(user_info))
            user_info = {}
            if count % 1000 == 0 and count != 0:
                end_ts = time.time()
                print '%s sec count 1000' % (end_ts - start_ts)
                start_ts = end_ts
        except StopIteration:
            print 'all done'
            if user_info:
                update_month_redis.lpush(UPDATE_MONTH_REDIS_KEY, json.dumps(user_info))
                user_info = {}
            break
        except Exception, r:
            raise r
            break
Example #25
    def copy(self, source_index, target_index, transform=None, ignore_types=None):
        if source_index == target_index:
            raise SameIndex("source_index and target_index must be different")

        if not self.client.indices.exists(source_index):
            raise IndexDoesNotExist("source_index '{}' does not exist".format(source_index))

        if not self.client.indices.exists(target_index):
            raise IndexDoesNotExist("target_index '{}' does not exist".format(source_index))

        transform = transform or (lambda doc: doc)
        ignore_types = ignore_types or []
        hits = helpers.scan(self.client, index=source_index)

        def _process_hits(hits, index):
            for doc in hits:
                if doc['_type'] in ignore_types:
                    continue
                doc['_index'] = index
                doc['_op_type'] = 'create'
                doc = transform(doc)
                if not doc:
                    continue
                yield doc

        return helpers.bulk(self.client, _process_hits(hits, target_index), chunk_size=1000, stats_only=True)
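
A hypothetical call to the copy() helper above, dropping one document type and stripping a field on the way (the transform function, index names and the indexer object are illustrative, not from the original project):

def strip_internal_fields(doc):
    # Remove a field before the document is re-indexed into the target index.
    doc['_source'].pop('internal_notes', None)
    return doc

# success_count, error_count = indexer.copy(
#     'events_v1', 'events_v2',
#     transform=strip_internal_fields,
#     ignore_types=['tombstone'],
# )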
Example #26
    def search(self, query_string, search_type='default'):
        '''Run a Query String query and return a list of sample_ids associated
        with the matches. Run the query against all document types.
        '''
        if search_type == 'advanced':
            query = self.build_query(query_string)
        else:
            es_reserved_chars_re = r'([\+\-=\>\<\!\(\)\{\}\[\]\^\"\~\*\?\:\\/ ])'
            query_string = re.sub(es_reserved_chars_re, r'\\\g<1>', query_string)
            if search_type == 'default':
                query = self.build_query("*" + query_string + "*")
            elif search_type == 'exact':
                query = self.build_query("\"" + query_string + "\"")
            else:
                print('Unknown search type!')
                return None
        result = helpers.scan(
            self.es, query=query, index=self.index
        )

        matches = []
        for r in result:
            if r.get('_source', {}).get('doc_type', {}) == 'sample':
                field = '_id'
            else:
                field = '_routing'
            matches.append(r[field])
        return tuple(set(matches))
Example #27
File: search.py Project: azerbini/eamena
    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index, doc_type, and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        body = kwargs.pop('body', None)
        if body != None:
            try:
                data = []
                refresh = kwargs.pop('refresh', False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit['_op_type'] = 'delete'
                    data.append(hit)

                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to delete document by query: %s \nException detail: %s\n' % (datetime.now(), body, detail))
                raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to delete document: %s \nException detail: %s\n' % (datetime.now(), body, detail))
                raise detail
Example #28
 def _stream_search(self, *args, **kwargs):
     """Helper method for iterating over ES search results."""
     for hit in scan(
         self.elastic, query=kwargs.pop("body", None), scroll="10m", **kwargs
     ):
         hit["_source"]["_id"] = hit["_id"]
         yield hit["_source"]
Example #29
File: suburban.py Project: igoral5/synchro
 def load_old(self):
     query = {'query': {'prefix': { '_id': '%s:' % group_code }}}
     try:
         for station in scan(es_client, query, '10m', index=name_index, doc_type='station'):
             self.es_location[station['_id']] = {'location': {'long': station['_source']['location'][0], 'lat': station['_source']['location'][1] }}
     except:
         logger.error(u'Error reading stops from ElasticSearch')
Example #30
def scan_documents(old_es, new_es, old_index, new_index):
    """
    Scan for matching documents

     In order to match the two indices without having to deal with ordering issues,
     we pull a set of documents from the old ES index, and then try to find matching
     documents with the same _id in the new ES index. This process is batched to avoid
     making individual network calls to the new ES index.
    """

    matching = 0
    total = 0
    old_iter = scan(old_es, index=old_index)
    for old_elts in grouper(old_iter, SCAN_ITER_STEP):

        old_elt_ids = []
        old_elt_docs = {}
        for elt in old_elts:
            if elt is not None:
                old_elt_ids.append({'_id': elt['_id']})
                old_elt_docs[elt['_id']] = elt

        matching += find_matching_ids(new_es, new_index, old_elt_ids, old_elt_docs)
        total += len(old_elt_ids)
        if total % 100 == 0:
            print 'processed {} items'.format(total)

    ratio = float(matching)/total
    print "{}: scanned documents matching ({} out of {}, {:.6}%)".format(
        'OK' if ratio > SCAN_MATCH_THRESHOLD else 'FAILURE', matching, total, ratio * 100
    )
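
scan_documents assumes a grouper() helper that is not shown; the standard itertools recipe is a reasonable guess for it (izip_longest on the Python 2 this snippet targets, zip_longest on Python 3):

from itertools import izip_longest  # itertools.zip_longest on Python 3

def grouper(iterable, n, fillvalue=None):
    # Collect data into fixed-length chunks; the last chunk is padded with fillvalue.
    args = [iter(iterable)] * n
    return izip_longest(*args, fillvalue=fillvalue)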
Example #31
            search_body = {
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"platform.keyword": platform}},
                            # {"term": {"releaser.keyword": releaser}},
                                {"term": {"releaser_id_str": doc_id}},
                            {"range": {"release_time": {"gte": re_s_t, "lt": re_e_t}}},
                          {"range": {"fetch_time": {"gte": re_s_t}}}
                        ]
                    }
                }
            }

            # scan_re = scan(client=es, index='crawler-data-raw', doc_type='doc', query=search_body, scroll='3m')
            scan_re = scan(client=es, index='short-video-all-time-url', doc_type='all-time-url', query=search_body, scroll='3m')
            for one_scan in scan_re:
                doc_id = cal_doc_id(one_scan["_source"]["platform"], url=one_scan["_source"]["url"], doc_id_type='all-time-url', data_dict=one_scan["_source"])
                find_exist = {
                    "query": {
                        "bool": {
                            "filter": [
                                {"term": {"_id": doc_id}}
                            ]
                        }
                    }
                }
                search_re = es.search(index='short-video-weekly', doc_type=weekly_doc_type_name,
                                      body=find_exist)
                if search_re['hits']['total'] == 0:
                    re_list.append(one_scan['_source'])
Example #32
    def _es_results(self, query_compiler, collector):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size, sort_params = Operations._query_params_to_size_and_sort(
            query_params)

        script_fields = query_params.script_fields
        query = Query(query_params.query)

        body = query.to_search_body()
        if script_fields is not None:
            body["script_fields"] = script_fields

        # Only return requested field_names
        _source = query_compiler.get_field_names(include_scripted_fields=False)
        if _source:
            # For query_compiler._client.search we could add _source
            # as a parameter, or add this value in body.
            #
            # If _source is a parameter it is encoded into to the url.
            #
            # If _source is a large number of fields (1000+) then this can result in an
            # extremely long url and a `too_long_frame_exception`. Therefore, add
            # _source to the body rather than as a _source parameter
            body["_source"] = _source
        else:
            body["_source"] = False

        es_results = None

        # If size=None use scan not search - then post sort results when in df
        # If size>10000 use scan
        is_scan = False
        if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
            if size > 0:
                try:

                    es_results = query_compiler._client.search(
                        index=query_compiler._index_pattern,
                        size=size,
                        sort=sort_params,
                        body=body,
                    )
                except Exception:
                    # Catch all ES errors and print debug (currently to stdout)
                    error = {
                        "index": query_compiler._index_pattern,
                        "size": size,
                        "sort": sort_params,
                        "body": body,
                    }
                    print("Elasticsearch error:", error)
                    raise
        else:
            is_scan = True
            es_results = scan(
                client=query_compiler._client,
                index=query_compiler._index_pattern,
                query=body,
            )
            # create post sort
            if sort_params is not None:
                post_processing.append(SortFieldAction(sort_params))

        if is_scan:
            while True:
                partial_result, df = query_compiler._es_results_to_pandas(
                    es_results, collector.batch_size(),
                    collector.show_progress)
                df = self._apply_df_post_processing(df, post_processing)
                collector.collect(df)
                if not partial_result:
                    break
        else:
            partial_result, df = query_compiler._es_results_to_pandas(
                es_results)
            df = self._apply_df_post_processing(df, post_processing)
            collector.collect(df)
Example #33
for i in aliases:
    if index in aliases[i]['aliases']:
        index = i
'''
Fetch the mapping in order to create the header
'''
mapping = es.indices.get_mapping(
    index=index,
    doc_type=doc_type)[index]['mappings'][doc_type]['properties'].keys()
'''
Set handler to elasticsearch
'''
scanResp = helpers.scan(client=es,
                        query=query,
                        scroll="10m",
                        index=index,
                        size=size,
                        doc_type=doc_type,
                        clear_scroll=False,
                        request_timeout=300)

with open(output_files, 'w') as f:
    counter = 0
    if fields == "all":
        w = csv.DictWriter(f,
                           mapping,
                           delimiter=delimiter,
                           quoting=csv.QUOTE_MINIMAL)
    else:
        fields = fields.split(",")
        w = csv.DictWriter(f, [i for i in mapping if i in fields],
                           delimiter=delimiter,
Example #34
df_movies = pd.read_csv(r".\data\movies.csv", engine="python")
df_ratings = pd.read_csv(r".\data\ratings.csv",
                         engine="python").drop("timestamp", axis=1)
all_genres = []

for i in range(len(df_movies)):
    temp = df_movies.loc[i, "genres"].split("|")
    for j in range(len(temp)):
        if (temp[j] not in all_genres):
            all_genres.append(temp[j])
#print (all_genres)

# In[ ]:

results = helpers.scan(es, index='ratings', query={"query": {"match_all": {}}})
res_set = set()
for item in results:
    popo = item['_source']['userId']
    res_set.add(int(popo))
#print(res_set)

# In[ ]:

df3 = df_ratings.drop("movieId", axis=1)
df3.groupby('userId').first()
df3.drop("rating", axis=1)

lstset = list(res_set)

data = {'userId': lstset}
Example #35
es = Elasticsearch('123.123.123.123:9201')

sres = helpers.scan(
    es,
    index="webhook-*",
    preserve_order=True,
    query={
        "query": {
            "bool": {
                "must": [{
                    "query_string": {
                        "query":
                        "_type:notify  AND notify_number:1 AND paid_time:{2018-04-23T04:00 TO * }",
                        "analyze_wildcard": True
                    }
                }, {
                    "range": {
                        "@timestamp": {
                            "gte": 1524457200000,
                            "lte": 1524461100000,
                            "format": "epoch_millis"
                        }
                    }
                }],
                "must_not": []
            }
        }
    },
    scroll="300s")

dd = {}
Example #36
def fb_count2flow_text():

    index_name = facebook_count_index_name_pre + '2017-10-12'

    query_body = {'query': {'match_all': {}}}

    scan_results = scan(es,
                        index=index_name,
                        doc_type=facebook_count_index_type,
                        query=query_body,
                        size=1000)

    count = 0
    t1 = time.time()

    while 1:

        try:

            body_dict = {}

            data = scan_results.next()
            item = data['_source']
            body_dict['comment'] = item['comment']
            body_dict['favorite'] = item['favorite']
            body_dict['share'] = item['share']

            body_dict['update_time'] = item['update_time']

            start_ts = datetime2ts('2017-10-10')
            end_ts = datetime2ts('2017-10-25')

            day_num = (end_ts - start_ts) / (24 * 3600) + 1

            count += 1
            if count % 1000 == 0:
                print 'fb..', count
                t2 = time.time()

                print 'time cost..', t2 - t1
                t1 = t2

            for i in range(day_num):

                timestamp = start_ts + i * 24 * 3600
                date = ts2datetime(timestamp)

                flow_text_index_name = facebook_flow_text_index_name_pre + date

                _id = item['fid']
                try:
                    es.update(index=flow_text_index_name,doc_type=facebook_flow_text_index_type,\
                    id=_id,body={'doc':body_dict})

                    # count += 1

                    # if count % 1000 == 0:
                    # 	print 'fb..',count

                except:
                    continue

        except StopIteration:
            break
Example #37
def tw_flow_text():

    start_ts = datetime2ts('2017-10-10')
    end_ts = datetime2ts('2017-10-25')

    day_num = (end_ts - start_ts) / (24 * 3600) + 1

    count = 0

    for i in range(day_num):
        timestamp = start_ts + i * 24 * 3600
        date = ts2datetime(timestamp)

        index_name = twitter_flow_text_index_name_pre + date

        query_body = {'query': {'match_all': {}}}

        scan_results = scan(es,
                            index=index_name,
                            doc_type=twitter_flow_text_index_type,
                            query=query_body,
                            size=1000)

        bulk_action = []

        while 1:

            try:

                body_dict = {}

                data = scan_results.next()
                item = data['_source']
                body_dict['comment'] = 0
                body_dict['favorite'] = 0
                body_dict['share'] = 0

                body_dict['update_time'] = item['timestamp']

                #flow_text_index_name = twitter_flow_text_index_name_pre + ts2datetime(item['timestamp'])

                _id = item['tid']

                action = {'update': {'_id': _id}}

                bulk_action.extend([action, {'doc': body_dict}])

                count += 1

                if count % 100 == 0:

                    print 'tw..', count
                    es.bulk(bulk_action,
                            index=index_name,
                            doc_type=twitter_flow_text_index_type,
                            timeout=100)

            except StopIteration:
                break

        if bulk_action:
            es.bulk(bulk_action,
                    index=index_name,
                    doc_type=twitter_flow_text_index_type,
                    timeout=100)
Example #38
 def read(self, index, query):
     return [[doc["_id"], doc["_source"]]
             for doc in scan(self.conn, index=index, query=query)]
Example #39
def build_orange_table_from_es_logs(
        mongo_query,
        valid_keys=None,
        prune_null_resources=True,
        all_logs_index='flat-all-log-entries',
        unique_logs_index='flat-unique-log-entries'):
    field_values = {}
    single_value_columns = set()  # TODO: return values that are always true, add them to Rules
    records = 0
    key_value_counter = Counter()
    paginator = helpers.scan(es,
                             query={"query": {
                                 "match_all": {}
                             }},
                             index=unique_logs_index,
                             doc_type='doc')
    for hit in paginator:
        records += 1
        # if records % 1000 == 0:
        #     print('Records : ' + str(records))
        for key, value in hit['_source'].items():
            if key == '_id' or (valid_keys and key not in valid_keys):
                continue
            RuleUtils.addMulti(field_values, key, value)
            key_value_counter.update(['%s=%s' % (key, value)])

    for k, v in dict(key_value_counter).items():
        if v == records:
            single_value_columns.add(
                k)  # ignore fields that always have the same value
            field_name = k.split('=')[0]
            field_values.pop(field_name)

    orange_columns = []
    for key, value in field_values.items():
        # if len(value) == 1 and records > 1:
        #     single_value_columns.add('%s=%s' % (key, value.pop()))#ignore fields that always have the same value
        #     continue
        for elem in value:
            if not isinstance(elem, str):
                value.remove(elem)
                value.add(str(elem))
        try:
            column = DiscreteVariable(key, values=value)
        except Exception as ex:
            traceback.print_exc()
            print(value)
            continue  # skip this field: no usable column could be built
        orange_columns.append(column)
    # if use_resources:
    #     resource_encoder = OrangeTableResourceColumnGenerator(mongo_query)
    #     resource_columns = resource_encoder.get_table_columns()
    #     orange_columns.extend(resource_columns)
    # else:
    resource_encoder = None
    domain = Domain(orange_columns)

    records = 0
    table = Table(domain)
    paginator = helpers.scan(es,
                             query={"query": {
                                 "match_all": {}
                             }},
                             index=all_logs_index,
                             doc_type='doc')
    for hit in paginator:
        instance = createInstance(domain, hit['_source'], resource_encoder,
                                  prune_null_resources)
        table.append(instance)
        records += 1
        # if records % 1000 == 0:
        #     print('Records : ' + str(records))
    # print('Built Table: %d recrods' % len(table))
    return table, single_value_columns
Example #40
    def tag_positive_terms(self):
        '''
            get documents without a sentiment tag that match phrase with slop:
              - protect|support|keep|need net neutrality
              - let the new neutrality stand
            for a broader result set than regex in analyze
        '''
        query = {
            "_source": "text_data",
            "query": {
                "bool": {
                    "filter": {
                        "bool": {
                            "should": [],
                            "must": [{
                                "term": {
                                    "analysis.source": "unknown"
                                }
                            }],
                            "must_not": [{
                                "exists": {
                                    "field": "analysis.titleii"
                                }
                            }, {
                                "exists": {
                                    "field": "analysis.sentiment_manual"
                                }
                            }, {
                                "exists": {
                                    "field":
                                    "analysis.sentiment_sig_terms_ordered"
                                }
                            }]
                        }
                    }
                }
            }
        }

        phrases = [
            'essential net neutrality', 'keep net neutrality',
            'maintain net neutrality', 'need net neutrality',
            'preserve net neutrality', 'protect net neutrality',
            'save net neutrality', 'support net neutrality',
            'support title 2', 'support title II',
            'let the new neutrality stand',
            'net neutrality rules are extremely important',
            'net neutrality is important'
        ]
        for phrase in phrases:
            subq = {
                "match_phrase": {
                    "text_data": {
                        "query": phrase,
                        "slop": 3
                    }
                }
            }
            query['query']['bool']['filter']['bool']['should'].append(subq)
        print(json.dumps(query))
        resp = self.es.search(index='fcc-comments', body=query, size=0)
        total = resp['hits']['total']
        print('tagging %s / %s matches' % (self.limit, total))
        docs = []
        for doc in scan(self.es, index='fcc-comments', query=query, size=1000):
            docs.append(
                lib.bulk_update_doc(doc['_id'],
                                    {'source': 'es_terms_positive'}))
            if not len(docs) % 1000:
                print(
                    '\tfetched %s\n%s\t%s' %
                    (len(docs), doc['_id'], doc['_source']['text_data'][:400]))
            if len(docs) == self.limit:
                break

        print('indexing %s' % (len(docs)))
        tagged = lib.bulk_update(self.es, docs)
        print('tagged %s / %s matches' % (tagged, total))
        return tagged
Example #41
def scroll_over_all_docs(_index):
    for hit in helpers.scan(es, index=_index):
        populate_dict_of_duplicate_docs(hit)
Example #42
def interpolate(a):
    doc = a['_source']['__meta__']['financials']
    for k, v in doc.iteritems():
        if v == None and k == 'assets':
            doc['assets'] = {}
            doc['assets']['value'] = _assets(doc)
        elif v == None and k == 'liabilities':
            doc['liabilities'] = {}
            doc['liabilities']['value'] = _liabilities(doc)
        elif v == None and k == 'stockholdersEquity':
            doc['stockholdersEquity'] = {}
            doc['stockholdersEquity']['value'] = _stockholdersEquity(doc)
        elif v == None and k == 'liabilitiesAndStockholdersEquity':
            doc['liabilitiesAndStockholdersEquity'] = {}
            doc['liabilitiesAndStockholdersEquity'][
                'value'] = _liabilitiesAndStockholdersEquity(doc)
        else:
            pass
    doc['interpolated'] = True
    return a


# --
# run

for a in scan(client, index=config['aq_forms_enrich']['index'], query=query):
    s = interpolate(a)
    client.index(index=config['aq_forms_enrich']['index'],
                 doc_type=config['aq_forms_enrich']['_type'],
                 body=s['_source'],
                 id=s['_id'])
Example #43
def grab(args):
    """
    Find index pattern, iterate through documents, collecting entries. If
    fields is set we will narrow our results to only include the specified
    fields. Otherwise, all fields will be returned, using the first document
    to determine the fields.
    """

    # Setup client
    username = args.username
    password = args.password
    if not args.password:
        password = getpass()
    auth = (username, password)

    es = Elasticsearch(args.host,
                       use_ssl='https' in args.host,
                       verify_certs=True,
                       http_auth=auth)

    query_string = args.query

    # Build query
    query = {
        'query': {
            'bool': {
                'must': [{
                    'range': {
                        '@timestamp': {
                            'gte': args.range_from,
                            'lte': args.range_to
                        }
                    }
                }]
            }
        }
    }

    if query_string:
        query['query']['bool']['must'].append(
            {'query_string': {
                'query': query_string
            }})
    else:
        query['query']['bool']['must'].append({'match_all': {}})

    args.total = int(args.total)
    # Search for records
    kwargs = {}
    kwargs['index'] = args.index
    kwargs['size'] = args.total
    if args.fields:
        kwargs['_source_includes'] = args.fields

    try:
        results = None
        # Scan if we're looking for more than 500 results. Note that size
        # for scan denotes number of entries retrieved each call.
        if kwargs['size'] > 500:
            kwargs['size'] = 500
            kwargs['query'] = query
            results = scan(es, **kwargs)
        else:
            kwargs['body'] = query
            results = es.search(**kwargs)
            results = results['hits']['hits']

        def flatten(d, path=None):
            """
            Returns list of fields and their path recursively separated by dot
            notation.
            """
            l = []
            path = path or []
            for key, value in d.items():
                if isinstance(value, dict):
                    for item in flatten(value, [*path, key]):
                        l.append(item)
                else:
                    if isinstance(value, list):
                        l.append(('.'.join([*path, key]), ','.join(value)))
                    else:
                        l.append(('.'.join([*path, key]), value))

            return l

        # Flatten all records to dot notation
        records = []
        for i, hit in enumerate(results):
            if i >= args.total:
                break
            if 'sort' in hit:
                del hit['sort']
            if args.only_source:
                records.append(dict(flatten(hit['_source'])))
            else:
                records.append(dict(flatten(hit)))

    except:
        raise SystemExit(
            f'Error connecting to ElasticSearch at "{args.host}". Please ensure that ElasticSearch is running, and your credentials are correct.'
        )

    if len(records) == 0:
        raise SystemExit('Query returned no results')

    # Setup CSV field names
    field_names = set()
    for item in records:
        for key in item.keys():
            field_names.add(key)

    # Save as CSV
    with open(args.output, 'w') as f:
        writer = csv.DictWriter(f, sorted(field_names), extrasaction='ignore')
        if not args.no_header:
            writer.writeheader()
        writer.writerows(records)
Example #44
 def get_hits(self, index, query=None):
     return scan(self.es, query=query, index=index)
Example #45
def create_dump(ctx, index):
    """
    Create a dump of an index. If you don't provide an ``--index`` option,
    you will be prompted with a list of available index names. Dumps are
    stored as a gzipped txt file in ``settings.DUMPS_DIR/<index_name>/<
    timestamp>_<index-name>.gz``, and a symlink ``<index-name>_latest.gz`` is
    created, pointing to the last created dump.

    :param ctx: Click context, so we can issue other management commands
    :param index: name of the index you want to create a dump for
    """
    if not index:
        available_idxs = ctx.invoke(available_indices)
        if not available_idxs:
            return
        index = click.prompt('Name of index to dump')

        if index not in available_idxs:
            click.secho('"%s" is not an available index' % index, fg='red')
            return

    match_all = {'query': {'match_all': {}}}

    total_docs = es.count(index=index).get('count')

    path = _create_path(path=os.path.join(DUMPS_DIR, index))
    dump_name = '%(index_name)s_%(timestamp)s.gz' % {
        'index_name': index,
        'timestamp': datetime.now().strftime('%Y%m%d%H%M%S')
    }
    new_dump = os.path.join(path, dump_name)

    with gzip.open(new_dump, 'wb') as g:
        with click.progressbar(es_helpers.scan(es, query=match_all, scroll='1m',
                                               index=index),
                               length=total_docs) as documents:
            for doc in documents:
                g.write('%s\n' % json.dumps(doc))

    click.secho('Generating checksum', fg='green')
    checksum = _checksum_file(new_dump)
    checksum_path = os.path.join(DUMPS_DIR, index, '%s.sha1' % dump_name)

    with open(checksum_path, 'w') as f:
        f.write(checksum)

    click.secho('Created dump "%s" (checksum %s)' % (dump_name, checksum),
                fg='green')


    latest = os.path.join(path, '%s_latest.gz' % index)
    try:
        os.unlink(latest)
    except OSError:
        click.secho('First time creating dump, skipping unlinking',
                    fg='yellow')
    os.symlink(new_dump, latest)
    click.secho('Created symlink "%s_latest.gz" to "%s"' % (index, new_dump),
                fg='green')

    latest_checksum = os.path.join(os.path.dirname(checksum_path), '%s_latest.gz.sha1' % index)
    try:
        os.unlink(latest_checksum)
    except OSError:
        click.secho('First time creating dump, skipping unlinking checksum',
                    fg='yellow')
    os.symlink(checksum_path, latest_checksum)
    click.secho('Created symlink "%s_latest.gz.sha1" to "%s"' % (index, checksum_path),
                fg='green')
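
A hedged companion sketch for reading one of the dumps produced above back into a cluster with helpers.bulk (the function name, path and target index are placeholders, not part of the original command):

import gzip
import json

from elasticsearch import Elasticsearch, helpers

def restore_dump(path, target_index, es=None):
    """Stream a gzipped dump (one scan hit per line) into target_index."""
    es = es or Elasticsearch()

    def actions():
        with gzip.open(path, 'rt') as f:
            for line in f:
                hit = json.loads(line)
                yield {
                    '_index': target_index,
                    '_id': hit.get('_id'),
                    '_source': hit.get('_source', {}),
                }

    return helpers.bulk(es, actions())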
Example #46
import argparse

__author__ = 'bejar'

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--index', default=None, required=True, help='Index to search')
    parser.add_argument('--alpha', action='store_true', default=False, help='Sort words alphabetically')
    args = parser.parse_args()

    index = args.index

    try:
        client = Elasticsearch()
        voc = {}
        sc = scan(client, index=index, doc_type='document', query={"query" : {"match_all": {}}})
        for s in sc:
            tv = client.termvectors(index=index, doc_type='document', id=s['_id'], fields=['text'])
            if 'text' in tv['term_vectors']:
                for t in tv['term_vectors']['text']['terms']:
                    if t in voc:
                        voc[t] += tv['term_vectors']['text']['terms'][t]['term_freq']
                    else:
                        voc[t] = tv['term_vectors']['text']['terms'][t]['term_freq']
        lpal = []

        for v in voc:
            lpal.append((v.encode("utf8", "ignore"), voc[v]))


        for pal, cnt in sorted(lpal, key=lambda x: x[0 if args.alpha else 1]):
Example #47
File: espy.py Project: pmkhant/ESPY
def get_source(index, type_):
    """ Get _source of an index. """
    if es.indices.exists(index):
        res = helpers.scan(es, index=index, doc_type=type_)
        return res
Example #48
def do_search(opts, host, protocol, port, LOGGER):
    query = \
        {
            'query': {
                'bool': {
                    'must': []
                }
            },
            'aggs': {
                'groupby': {
                    'terms': {
                        'field': 'module.keyword',
                        'size': 10000
                    },
                    'aggs': {
                        'latest-revision': {
                            'max': {
                                'field': 'revision'
                            }
                        }
                    }
                }
            }
        }
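    # The terms aggregation buckets hits by module name (up to 10000 modules) and the
    # nested max aggregation records the latest revision seen for each module.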

    es = Elasticsearch([{'host': '{}'.format(host), 'port': port}])
    search_term = opts['search']

    if 'case-sensitive' in opts and opts['case-sensitive']:
        case_sensitive = 'sensitive'
    else:
        case_sensitive = 'lowercase'
        search_term = search_term.lower()

    #    case_sensitivity = 'BINARY '

    sts = __search_fields
    if 'search-fields' in opts:
        sts = opts['search-fields']

    if 'type' in opts and opts['type'] == 'regex':
        term_regex = 'regexp'
    else:
        term_regex = 'term'

    should = {'bool': {'should': []}}
    request_number = 1
    if 'request-number' in opts:
        request_number = opts['request-number']
    for field in sts:
        if field in __search_fields:
            if field == 'module':
                field_term = field
                final_term = search_term.lower()
            else:
                field_term = '{}.{}'.format(field, case_sensitive)
                final_term = search_term

            term = {term_regex: {field_term: final_term}}
            should['bool']['should'].append(term)

    queries = []
    if 'schema-types' in opts:
        for st in opts['schema-types']:
            if st in __schema_types:
                queries.append(st)
    must = {'terms': {'statement': queries}}
    query['query']['bool']['must'].append(must)
    query['query']['bool']['must'].append(should)
    LOGGER.info('query:  {}'.format(query))
    limit_reacher = LimitReacher()
    search = scan(es,
                  LOGGER,
                  limit_reacher,
                  query,
                  scroll=u'2m',
                  scroll_limit=2 * request_number,
                  index='yindex',
                  doc_type='modules')
    LOGGER.info(search)

    filter_list = __node_data.keys()
    results = []
    rows = search

    if 'filter' in opts and 'node' in opts['filter']:
        filter_list = opts['filter']['node']
    LOGGER.info('filter list:   {}'.format(filter_list))
    latest_revisions = {}

    all_revisions = True
    if 'latest-revisions' in opts and opts['latest-revisions'] is True:
        all_revisions = False
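        # Re-run the query with size=0 to fetch only the aggregation results and
        # remember the latest revision per module; rows for older revisions are skipped below.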
        aggregations = es.search(index='yindex',
                                 doc_type='modules',
                                 body=query,
                                 size=0)['aggregations']['groupby']['buckets']
        for agg in aggregations:
            latest_revisions[agg['key']] = agg['latest-revision'][
                'value_as_string'].split('T')[0]
    for row in rows:
        r = row['_source']
        if all_revisions or r['revision'] == latest_revisions[r['module']]:
            module = {
                'name': r['module'],
                'revision': r['revision'],
                'organization': r['organization']
            }
            result = {'module': module}
            result['node'] = {}
            for nf in filter_list:
                if nf in __node_data:
                    result['node'][nf] = r[__node_data[nf]]

            results.append(result)
    return (results, limit_reacher.limit_reached)
예제 #49
0
파일: query.py 프로젝트: tpreusse/aleph
    def scan(self):
        """Return an iterator over the whole result set, unpaginated and
        without aggregations."""
        body = {'query': self.get_query(), '_source': self.RETURN_FIELDS}
        return scan(es, index=es_index, doc_type=self.DOC_TYPES, query=body)
예제 #50
0
    def generate_graph(app,
                       app_type='logs',
                       log_type='raw',
                       targets=[],
                       events=[],
                       time_range=['now-1h', 'now'],
                       size=20):
        """
        Return all elements from an application, possible matching against
        a specific event type (e.g. click, mouseover, etc)
        """
        # @TODO ref_url filter

        must_not_query = [{
            "term": {
                "type": "mousedown"
            }
        }, {
            "term": {
                "type": "mouseup"
            }
        }]

        filter_query = [
            {
                "term": {
                    "logType": log_type
                },
            },
        ]

        # Filtering
        should_query = []
        must_query = []

        # Include these events in the request
        if events:
            include_events = {"terms": {"type": events}}
            filter_query.append(include_events)

        target_in = targets[0]
        target_out = targets[1]

        if target_in:
            include_targets = {"terms": {"target": target_in}}

            filter_query.append(include_targets)

        # Remove these elementids from result set
        for target in target_out:
            res = {"term": {"target": target}}
            must_not_query.append(res)

        # Finish off should query
        # must_query.append({"bool": {"should": should_query}})

        # Sort By Time
        sort_query = [{"clientTime": {"order": "asc"}}]

        # Timestamp range - date math
        timestamp_query = {
            "range": {
                "@timestamp": {
                    "gte": time_range[0],
                    "lte": time_range[1]
                }
            }
        }
        filter_query.append(timestamp_query)

        agg_query = dict()

        # Get all unique sessions
        session_query = {"terms": {"field": "sessionID", "min_doc_count": 1}}

        agg_query['sessions'] = session_query

        # Generating all top targets and breakdowns by type, including path_length
        target_query = {
            "terms": {
                "field": "target",
                "min_doc_count": 1,
                "size": size
            },
            "aggs": {
                "events": {
                    "terms": {
                        "field": "type",
                        "min_doc_count": 1,
                        "size": size
                    }
                },
                "top_target": {
                    "top_hits": {
                        "script_fields": {
                            "path_length": {
                                "script": {
                                    "lang": "painless",
                                    "inline": "doc['path'].length;"
                                }
                            }
                        },
                        "size": 1
                    }
                }
            }
        }

        agg_query['targets'] = target_query

        # Main query
        query = {
            "sort": sort_query,
            "query": {
                "bool": {
                    # "must": must_query,
                    # "should": should_query,
                    "filter": filter_query,
                    "must_not": must_not_query,
                    # "minimum_should_match": len(should_query) - 1
                }
            },
            "_source": {
                "includes": ['*'],
            },
            "script_fields": {
                "path_length": {
                    "script": {
                        "lang": "painless",
                        "inline": "doc['path'].length;"
                    }
                }
            },
            "aggregations": agg_query
        }

        # return query
        # Process Aggregate Results
        response = es.search(app, doc_type=app_type, body=query, size=0)
        # Only want to look at aggregations
        sessions = response['aggregations']['sessions']['buckets']
        # allSessions = { x['key']: [] for x in sessions }
        # intervalSessions = { x['key']: [] for x in sessions }

        # Deal with bar chart
        allTargets = response['aggregations']['targets']['buckets']

        # Re-execute query to get all hits
        iter = helpers.scan(es,
                            query=query,
                            index=app,
                            doc_type=app_type,
                            preserve_order=True)

        allSessions = dict()
        # Store all hits in the user's bucket.
        for elem in iter:
            data = elem['_source']
            data['pathLength'] = elem['fields']['path_length'][0]
            if 'sessionID' in data:
                sessionID = data['sessionID']
                if sessionID in allSessions:
                    allSessions[sessionID].append(data)
                else:
                    allSessions[sessionID] = [data]

        # This fixes the sequence/interval logging that was produced by
        # UserALE.js v0.2.0.
        # Self-loops could also be removed here: (html->html->html->window) := (html->window)
        intervalSessions = dict()
        for sessionID in allSessions:
            data = allSessions[sessionID]
            newData = []
            intervalLog = []
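            # Pair each event with its successor so transitions between targets
            # (graph edges) can be derived.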
            pairs = zip(data, data[1:])

            for curr, next in pairs:
                target1 = curr['target']
                event1 = curr['type']
                target2 = next['target']
                event2 = next['type']
                if target1 != target2:  # ignore self-loops
                    targetChange = int(True)
                    eventChange = int(False)
                    if event1 != event2:
                        eventChange = int(True)

                    # Starting over no matter what
                    # Based off of curr, update the log
                    curr['targetChange'] = targetChange
                    curr['typeChange'] = eventChange
                    curr['intervalCount'] = len(
                        intervalLog)  # some number maybe 0
                    if len(intervalLog) >= 2:
                        # Calculate duration
                        curr['duration'] = intervalLog[-1]['clientTime'] - \
                                           intervalLog[0]['clientTime']
                    else:
                        curr['duration'] = 0
                    newData.append(curr)
                    intervalLog = []
                # else:
                #     # They are the same
                #     targetChange = int(False)
                #     eventChange = int(False)
                #     if event1 != event2:
                #         eventChange = int(True)
                #         # starting over
                #         curr['targetChange'] = targetChange
                #         curr['typeChange'] = eventChange
                #         curr['intervalCount'] = len(intervalLog)
                #         # if len(intervalLog) >= 2:
                #         #     # Calculate duration
                #         #     curr['duration'] = intervalLog[-1:]['clientTime'] - \
                #         #                        intervalLog[0]['clientTime']
                #         # else:
                #         #     curr['duration'] = 0
                #         newData.append(curr)
                #         intervalLog = []
                #     else:
                #         # increase counter
                #         intervalLog.append(curr)
            intervalSessions[sessionID] = newData

        # return intervalSessions
        newSessions = []

        # Generate all edges tied to a user
        # [ edge list, edge list, ... ]
        for k, v in intervalSessions.items():
            pairs = pairwise(v)  # list of edges for a user
            newSessions.append(pairs)

        # Node Map
        node_list = []  # Need to keep 0-based index for sankey diagram
        links = []  # Aggregate sequence list
        node_map = []  # Final node map {"name": "foo", "id": 0"}

        # Align the sequences
        alignment = itertools.izip_longest(*newSessions)
        src_ids = {}
        target_ids = {}

        for i, step in enumerate(alignment):
            # print(i)
            c = collections.Counter()
            visitedLinks = []
            # visitedLinksUnique = set([])
            nodenames = set([])

            for edge in step:  # for a single step look at all links
                if edge:
                    node1 = edge[0]
                    node2 = edge[1]
                    session = node1['sessionID']
                    nodename1 = node1['target']
                    nodename2 = node2['target']

                    seqID = '%s->%s' % (nodename1, nodename2)
                    #print(seqID)

                    if nodename1 != nodename2:  #double check again for self-loops
                        #print(node1)
                        link = {
                            'sequenceID': seqID,
                            'sourceName': nodename1,
                            'targetName': nodename2,
                            'type': node1['type'],
                            'duration': node1['duration'],
                            'pathLength': len(node1['path']) if node1['path'] is not None else 0,
                            'targetChange': node1['targetChange'],
                            'typeChange': node1['typeChange']
                        }
                        visitedLinks.append(link)

            # Done with visits in a step. Now calculate counts
            counts = collections.Counter(k['sequenceID'] for k in visitedLinks
                                         if k.get('sequenceID'))
            # print(counts)
            visitedLinksUnique = {v['sequenceID']: v
                                  for v in visitedLinks}.values()
            # print(visitedLinksUnique)

            # Visit unique links and generate src/targetid
            if len(node_map) == 0:
                for link in visitedLinksUnique:
                    # Add all sources
                    if link['sourceName'] not in src_ids:
                        node_map.append({"name": link['sourceName']})
                        src_ids[link['sourceName']] = len(node_map) - 1

                    # Add all targets
                    if link['targetName'] not in target_ids:
                        node_map.append({"name": link['targetName']})
                        target_ids[link['targetName']] = len(node_map) - 1

            else:
                src_ids = target_ids  # sources were previous targets
                target_ids = {}
                for link in visitedLinksUnique:
                    # Add all sources
                    # if link['sourceName'] not in src_ids.values():
                    #     node_map.append(link['sourceName'])
                    #     src_ids[len(node_map)-1] = link['sourceName']

                    # Add all targets
                    if link['targetName'] not in target_ids:
                        node_map.append({"name": link['targetName']})
                        target_ids[link['targetName']] = len(node_map) - 1

            for link in visitedLinksUnique:
                # Perform lookup for ids
                # Perform lookup for counts
                link['source'] = src_ids[link['sourceName']]
                link['target'] = target_ids[link['targetName']]
                link['value'] = counts[link['sequenceID']]

                links.append(link)

        # for step in alignment:
        #     # step through every users sequence
        #     c = collections.Counter()
        #     visitedLinks = []
        #     nodenames = set([])
        #
        #     # Process all the edges
        #     for edge in step:
        #         if edge:
        #             node1 = edge[0]
        #             node2 = edge[1]
        #
        #             nodename1 = node1['target']
        #             nodename2 = node2['target']
        #
        #             # Add src and targetids
        #             nodenames.add(nodename1)
        #             nodenames.add(nodename2)
        #
        #             # Generate sequence ID
        #             seqID = '%s->%s' % (nodename1, nodename2)
        #
        #             # @todo Ensure src and target are not the same (self-loop)
        #             if nodename1 != nodename2:
        #                 link = {
        #                     'sequenceID': seqID,
        #                     'sourceName': nodename1,
        #                     'targetName': nodename2,
        #                     'type': node1['type'],
        #                     # 'duration': node1['duration'],
        #                     'pathLength': len(node1['path']),
        #                     'targetChange': node1['targetChange'],
        #                     'typeChange': node1['typeChange']
        #                 }
        #                 visitedLinks.append(link)
        #
        #     # How many users visited a sequence at this step
        #     counts = collections.Counter(k['sequenceID'] for k in visitedLinks if k.get('sequenceID'))
        #     # print(counts)
        #     # Append into growing node_list
        #     map(lambda x: node_list.append(x), nodenames)
        #
        #     # map(lambda x: node_map.append({ "name": x}
        #     #                                 "id": len(node_list) - 1 - node_list[::-1].index(x)}), nodenames)
        #
        #     map(lambda x: node_map.append({ "name": x}), nodenames)
        #                                     # "id": len(node_list) - 1 - node_list[::-1].index(x)}), nodenames)
        #     for v in visitedLinks:
        #         # Pass through and update count, also generate src and target id
        #         v['value'] = counts[v['sequenceID']]
        #         # Last occurence is the src and target id
        #         v['source'] = len(node_list) -1 - node_list[::-1].index(v['sourceName'])
        #         v['target'] = len(node_list) -1 - node_list[::-1].index(v['targetName'])
        #         links.append(v)

        # Save everything
        res = dict()
        res['histogram'] = generate_bargraph(allTargets)
        # res['sankey'] = {
        #     # 'sessions': sessions,
        #     'links': links,
        #     'nodes': node_map
        # }

        res['nodes'] = node_map
        res['links'] = links
        res['sessions'] = sessions
        # with open('sankey.json', 'w') as outfile:
        #     json.dump(res, outfile, sort_keys=False, indent=4)

        # with open('data.txt', 'w') as outfile:
        #     json.dump(intervalSessions, outfile, indent=4, sort_keys=False)
        #
        # with open('query.json', 'w') as outfile:
        #     json.dump(query, outfile, indent=4, sort_keys=False)
        # Iterate first to get nodes
        # pairs = pairwise(iter)
        #
        # nodes = []
        # links = []

        # for p in pairs:
        #     node1 = p[0]['_source']
        #     node2 = p[1]['_source']

        #     # Append nodes to list
        #     nodes.append(node1['target'])
        #     nodes.append(node2['target'])

        # Iterate again to get edges
        # pairs = pairwise(iter2)

        # srcID = targetID = None
        # for p in pairs:
        #     node1 = p[0]['_source']
        #     node2 = p[1]['_source']
        #
        #     # Append nodes to list
        #     nodes.append(node1['target'])
        #     # nodes.append(node2['target'])
        #
        #     srcID = len(nodes) - 1
        #     targetID = len(nodes)
        #
        #     # if (node1['target'] != node2['target']):
        #     # Append links to list (remove self-loops)
        #     link = {
        #         'sourceID': srcID,
        #         'targetID': targetID,
        #         'sourceName': node1['target'],
        #         'targetName': node2['target'],
        #         'type': node1['type'],
        #         'duration': node1['duration'],
        #         'value': node1['count'],
        #         'pathLength': len(node1['path']),
        #         'targetChange': int(node1['targetChange']),
        #         'typeChange': int(node1['typeChange'])
        #     }
        #     links.append(link)
        #
        # # Get all unique nodes
        # # node_names = np.unique(nodes).tolist()
        # node_list = []
        #
        # for indx, name in enumerate(nodes):
        #     n = {'id': indx, 'name': name}
        #     node_list.append(n)
        #
        # # Remove self-loops
        # newLinks = []
        # for indx, elem in enumerate(links):
        #     srcID = elem['sourceID']
        #     targetID = elem['targetID']
        #
        #     if srcID != targetID:
        #         newLinks.append(elem)
        #

        #
        return res
예제 #51
0
from elasticsearch import Elasticsearch
from elasticsearch import helpers

es = Elasticsearch("http://127.0.0.1:9200/")
print(es.info())
# helpers.bulk() and helpers.scan() both require at least a client argument; called
# with no arguments they raise TypeError. Minimal usage sketch ("my-index" is a
# placeholder index name):
actions = [{"_index": "my-index", "_source": {"value": i}} for i in range(3)]
helpers.bulk(es, actions)
for hit in helpers.scan(es, index="my-index", query={"query": {"match_all": {}}}):
    print(hit["_source"])

예제 #52
0
# Import the installed libraries
from elasticsearch import Elasticsearch, helpers

import Bert_clear_title
# Model downloaded from https://www.kaggle.com/terrychanorg/bertcleartitlemodel
TClear = Bert_clear_title.Marker(
    model_path="/mnt/data/dev/model/Bert_clear_title/model/")
TClear.load_model()

es = Elasticsearch('127.0.0.1:9200')
index_v = "terry-index"
index_v = "scrapy_search-2020-11"
doc_type_v = "items"
query = {"query": {"match_all": {}}}
scanResp = helpers.scan(client=es,
                        query=query,
                        scroll="10m",
                        index=index_v,
                        doc_type=doc_type_v,
                        timeout="10m")

items = []
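# Titles for which the model returns no mark data (presumably nothing left to clean)
# are collected into items.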
for i, resp in enumerate(scanResp):
    print("\n" * 2)
    qid = resp['_id']
    # print(resp)
    # print(resp['_source']['title'])
    one = TClear.pre(resp['_source']['title'])

    # print(TClear.get_mark_data(one[0]))
    if len(TClear.get_mark_data(one[0])) == 0:
        items.append(resp['_source']['title'])
    else:
예제 #53
0
import os

import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch, helpers

# `elo` is assumed to be a rating object created elsewhere in the surrounding module.
es = Elasticsearch(
    hosts=[{
        'host': os.environ["ES_HOST"],
        'port': os.environ["ES_PORT"]
    }],
    http_auth=(
        os.environ["ES_USER"],
        os.environ["ES_PASS"]
    )
)

data = pd.DataFrame([thing["_source"] for thing in list(
    helpers.scan(
        es,
        index="assessment",
        query={"query": {"match_all": {}}}
    ))
])

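# Register every candidate with the Elo rater, then replay each recorded match.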
for candidate in np.unique(data[['candidate_a', 'candidate_b']]):
    elo.addPlayer(candidate)

for candidate_a, candidate_b, winner in data.values:
    elo.recordMatch(candidate_a, candidate_b, winner=winner)

n_classifiers_options = [32, 64, 128, 256, 512]
n_clusters_options = [8, 16, 32, 64, 128, 256]
results = pd.DataFrame({
    str(n_classifiers): {
        str(n_clusters): None
예제 #54
0
import json

import redis
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

es = Elasticsearch("219.224.134.213:9200", timeout=600)
user_keys = [
    "domain", "uid", "importance", "influence", "activity_geo", "uname",
    "hashtag", "fansnum", "tendency", "photo_url", "statusnum", "gender",
    "topic_string", "activeness", "location", "friendsnum",
    "character_sentiment", "character_text"
]
es_scan = scan(es,
               query={
                   "query": {
                       "match_all": {}
                   },
                   "size": 1000
               },
               index="user_portrait_1222",
               doc_type="user")
f = open("user_portrait.txt", "wb")
while 1:
    try:
        k = es_scan.next()
        scan_re = k["_source"]
        user_dict = dict()
        for key in user_keys:
            user_dict[key] = scan_re[key]
        f.write(json.dumps(user_dict) + "\n")
    except StopIteration:
        print "all done"
        break
f.close()
예제 #55
0
def add_zcta_zip_to_index(index, doc_type, loc_field, id_field, prefix=None):
    # Iterates through ZCTA zip code polygons and updates index records with the
    # zip code they are contained within; if there is no lat/lng, 'NA' is written.
    # Input: index name, doc_type.
    # Searches the ES collisions data and updates records using a geo_shape lookup.
    es_url = 'http://%s:%s@%s:9200' % (ES_username, ES_password, ES_url)
    es = Elasticsearch(es_url)

    proj = Proj(init='epsg:2263')  # NAD83 / New York Long Island State Plane projection

    if prefix:
        zip_field1 = prefix + '_ZCTA_ZIP'
        zip_field2 = prefix + '_ZCTA_ZIP_NoSuffix'
    else:
        zip_field1 = 'ZCTA_ZIP'
        zip_field2 = 'ZCTA_ZIP_NoSuffix'

    try:
        mapping = {}
        mapping['properties'] = {}

        #set the ZCTA zip fieldmapping
        mapping['properties'][zip_field1] = {'type': 'string'}
        mapping['properties'][zip_field2] = {'type': 'string'}

        #use cURL to put the mapping
        p = subprocess.Popen([
            'curl',
            '%s/%s/_mapping/%s' % (es_url, index, doc_type), '-d',
            '%s' % json.dumps(mapping)
        ],
                             stderr=subprocess.PIPE)
        out, err = p.communicate()
        if err: print '\n' + err

    except Exception as e:
        #do not try to recreate the index
        print "Error creating index:"
        print e

    idx = 0
    updates = []
    for result in helpers.scan(es, index=index, doc_type=doc_type):
        idx += 1
        _id = result['_id']
        #Add placeholder for ZCTA zip code
        result['_source'][zip_field1] = 'NA'
        result['_source'][zip_field2] = 'NA'

        if loc_field in result['_source']:
            query = '''{
                        "query":{
                                "bool":{
                                        "must":{"match_all": {}},
                                        "filter":{
                                                "geo_shape":{
                                                        "coords":{
                                                                "indexed_shape": {
                                                                        "index": "%s",
                                                                        "type": "%s",
                                                                        "id": "%s",
                                                                        "path": "%s"
                                                                        },
                                                                "relation": "intersects"
                                                                }
                                                        }
                                                }
                                        }
                                }
                        }''' % (index, doc_type, _id, loc_field)
            max_area = 0
            max_zip = False
            #query the zip codes, finding all zip shapes that contain the current colision
            for shape in helpers.scan(es,
                                      query=query,
                                      index='nyc_zip_codes',
                                      doc_type='zip_codes'):
                coords = [
                    proj(lng, lat) for lng, lat in shape['_source']['coords']
                    ['coordinates'][0]
                ]
                poly = Polygon(coords)
                if poly.area > max_area:
                    #get the largest zip code by geographic area
                    max_area = poly.area
                    max_zip = shape['_id']
            if max_zip:
                result['_source'][zip_field1] = max_zip
                result['_source'][zip_field2] = max_zip.split('-')[0]
        updates.append(result['_source'])

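        # Flush the accumulated updates to Elasticsearch in batches of 10,000 records.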
        if idx >= 10000:
            upload_to_Elasticsearch.update_ES_records_curl(updates,
                                                           index=index,
                                                           doc_type=doc_type,
                                                           id_field=id_field)
            idx = 0
            updates = []

    #upload the remaining records
    upload_to_Elasticsearch.update_ES_records_curl(updates,
                                                   index=index,
                                                   doc_type=doc_type,
                                                   id_field=id_field)
예제 #56
0
    def stream_all_docs(self):
        return helpers.scan(
            client=self.es,
            index=self.index_name,
            query={"query": {"match_all": {}}},
            scroll="30s",
            request_timeout=30,
        )
예제 #57
0
def delete_collection_entities():
    q = {'query': {'exists': {'field': 'collection_id'}}}
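    # Each matching entity is deleted with an individual request; for large
    # collections, bulk delete actions would reduce round trips.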
    for ent in scan(es, query=q, index=es_index, doc_type=[TYPE_ENTITY]):
        es.delete(index=es_index, doc_type=TYPE_ENTITY, id=ent.get('_id'))
예제 #58
0
def analyze_git(es_read, es_write, es_read_index, es_write_index, git_enrich,
                size, incremental):

    query = {"match_all": {}}
    sort = [{"metadata__timestamp": {"order": "asc"}}]

    if incremental.lower() == 'true':
        search = Search(using=es_write, index=es_write_index)
        # from:to parameters (=> from: 0, size: 0)
        search = search[0:0]
        search = search.aggs.metric('max_date',
                                    'max',
                                    field='metadata__timestamp')

        try:
            response = search.execute()

            if response.to_dict()['aggregations']['max_date']['value'] is None:
                msg = "No data for 'metadata__timestamp' field found in "
                msg += es_write_index + " index"
                logging.warning(msg)
                init_write_index(es_write, es_write_index)

            else:
                # Incremental case: retrieve items from last item in ES write index
                max_date = response.to_dict(
                )['aggregations']['max_date']['value_as_string']
                max_date = date_parser.parse(max_date).isoformat()

                logging.info("Starting retrieval from: " + max_date)
                query = {"range": {"metadata__timestamp": {"gte": max_date}}}

        except NotFoundError:
            logging.warning("Index not found: " + es_write_index)
            init_write_index(es_write, es_write_index)

    else:
        init_write_index(es_write, es_write_index)

    search_query = {"query": query, "sort": sort}

    logging.info(search_query)

    logging.info("Start reading items...")

    commits = []
    cont = 0

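    # Hits are buffered and flushed through eventize_and_enrich/upload_data in
    # batches of `size` commits.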
    for hit in helpers.scan(es_read,
                            search_query,
                            scroll='300m',
                            index=es_read_index,
                            preserve_order=True):

        cont = cont + 1

        item = hit["_source"]
        commits.append(item)
        logging.debug("[Hit] metadata__timestamp: " +
                      item['metadata__timestamp'])

        if cont % size == 0:
            logging.info("Total Items read: " + str(cont))

            events_df = eventize_and_enrich(commits, git_enrich)
            upload_data(events_df, es_write_index, es_write)

            commits = []
            events_df = None

    # In case we have some commits pending, process them
    if len(commits) > 0:
        logging.info("Total Items read: " + str(cont))
        events_df = eventize_and_enrich(commits, git_enrich)
        upload_data(events_df, es_write_index, es_write)
예제 #59
0
        body: dict = {"query": {"bool": {}}}

        if filters:
            filter_clause = []
            for key, values in filters.items():
                filter_clause.append(
                    {
                        "terms": {key: values}
                    }
                )
            body["query"]["bool"]["filter"] = filter_clause

        if only_documents_without_embedding:
            body["query"]["bool"] = {"must_not": {"exists": {"field": self.embedding_field}}}

        result = scan(self.client, query=body, index=index, size=batch_size, scroll="1d")
        yield from result

    def query(
        self,
        query: Optional[str],
        filters: Optional[Dict[str, List[str]]] = None,
        top_k: int = 10,
        custom_query: Optional[str] = None,
        index: Optional[str] = None,
    ) -> List[Document]:
        """
        Scan through documents in DocumentStore and return a small number of documents
        that are most relevant to the query as defined by the BM25 algorithm.

        :param query: The query
예제 #60
0
    delete_index()

if args.create:

    create_index()

if args.insert:

    insert(args.insert[0], args.update, args.expires, args.depth)

if args.kexport:

    # Export dashboards, searches, and visualizations.
    kibana_export = list(
        scan(client=es,
             index='.kibana',
             doc_type='dashboard,search,visualization'))

    # Export the ipwhois index pattern.
    kibana_idx_export = list(
        scan(client=es,
             index='.kibana',
             doc_type='index-pattern',
             query={'query': {
                 'match': {
                     '_id': 'ipwhois'
                 }
             }}))

    # Dump exports to json file.
    with io.open(args.kexport[0], 'w') as data_file: