def search(self, args, es_client=client):
    """Search the event index using whichever search arguments are present.

    Every supported argument that has a truthy value adds one query clause
    and requests highlighting for the corresponding document field.

    :param args: mapping of search arguments. Recognised keys are ``name``,
        ``description``, ``location-name``, ``organizer-name`` and
        ``organizer-description``; missing or empty values are ignored.
    :param es_client: Elasticsearch client the search is executed with.
    :return: list holding ``to_dict(hit)`` for every hit in the response.
    """
    # (argument key, document field, query type), in the order the clauses
    # are added to the search.
    criteria = (
        ('name', 'name', 'fuzzy'),
        ('description', 'description', 'match'),
        ('location-name', 'location_name', 'fuzzy'),
        ('organizer-name', 'organizer_name', 'fuzzy'),
        ('organizer-description', 'organizer_description', 'fuzzy'),
    )

    search = Search(using=es_client, index=SearchableEvent.meta.index)
    for arg_key, field, query_type in criteria:
        # Read the value from the very key that is tested. Previously the
        # hyphenated key was tested but the underscored key was indexed,
        # which raised KeyError for the last three criteria.
        value = args.get(arg_key)
        if value:
            search = search.query(query_type, **{field: value})
            search = search.highlight(field)

    return [to_dict(hit) for hit in search.execute()]
def process(self, start_time: datetime, end_time: datetime, input: DataFrame):
    """Append the log documents found in a time window to a data frame.

    Searches the first configured index for documents whose ``@timestamp``
    lies within ``[start_time, end_time]`` and that match every configured
    field value, then adds one row per hit (all values stringified).

    :param start_time: inclusive lower bound of the ``@timestamp`` range.
    :param end_time: inclusive upper bound of the ``@timestamp`` range.
    :param input: data frame the rows are appended to.
    :return: ``input`` itself when there are no hits, otherwise a new data
        frame containing the rows of ``input`` followed by one row per hit.
    """
    # Local import: only ``DataFrame`` is visibly imported at file level.
    import pandas as pd

    logger.debug('Start: %s End: %s Log: index=%s fields=%s' %
                 (start_time.isoformat(), end_time.isoformat(),
                  str(self.indices), str(self.fields)))

    search = Search(using=self.client, index=self.indices[0])
    search = search.filter(Range(**{'@timestamp': {
        'gte': start_time.isoformat(),
        'lte': end_time.isoformat(),
    }}))

    # A field configured with a list adds one match clause per list entry.
    for field, wanted in self.fields.items():
        values = wanted if isinstance(wanted, list) else [wanted]
        for value in values:
            search = search.query("match", **{field: value})

    logger.debug('ES Query: %s' % str(search.to_dict()))
    response = search.execute()
    logger.debug('Results: success:%d failed:%d hits:%d' %
                 (response._shards.successful, response._shards.failed,
                  len(response.hits)))

    rows = []
    for hit in response:
        # filter out the meta key and flatten the values
        row = {k: str(hit[k]) for k in hit if k != 'meta'}
        logger.debug(row)
        rows.append(row)

    if not rows:
        return input
    # One concat instead of DataFrame.append per hit: append copied the
    # whole frame on every call and no longer exists in pandas 2.x.
    return pd.concat([input, pd.DataFrame(rows)], ignore_index=True)
def searchTweets(keyword, latlondist):
    """Return the tweets matching a keyword and/or location as a JSON string.

    :param keyword: phrase that must occur in the tweet text, or ``None``
        for no text restriction.
    :param latlondist: JSON string with the keys ``lat``, ``lon`` and
        ``dist`` describing a geo-distance filter, or ``None``.
    :return: JSON string of the form ``{"tweets": [...]}`` where every entry
        has the keys ``name``, ``text``, ``sentiment``, ``lat`` and ``lon``.
    :raises KeyError: if ``TwitterHelper.AWS_ACCESS_KEY`` or
        ``TwitterHelper.AWS_SECRET_KEY`` is ``None``.
    """
    # The AWS credentials must be configured before anything is queried.
    if TwitterHelper.AWS_ACCESS_KEY is None:
        raise KeyError("Please set the AWS_ACCESS_KEY env. variable")
    if TwitterHelper.AWS_SECRET_KEY is None:
        raise KeyError("Please set the AWS_SECRET_KEY env. variable")

    s = Search()

    if latlondist is not None:
        locJson = json.loads(latlondist)
        s = s.query({"filtered": {
            "query": {"match_all": {}},
            "filter": {"geo_distance": {
                "distance": locJson['dist'],
                "location": {"lat": locJson['lat'], "lon": locJson['lon']},
            }},
        }})

    if keyword is not None:
        s = s.query(Q("match_phrase", text=keyword))

    # Only the query body is built with the DSL; the scan helper streams all
    # matching documents of the "tweets" index.
    scanResp = helpers.scan(client=TwitterHelper.ES, query=s.to_dict(),
                            scroll="1m", index="tweets", timeout="1m")

    tweets = []
    for resp in scanResp:
        hit = resp['_source']
        tweets.append({
            'name': hit['name'],
            'text': hit['text'],
            'sentiment': hit['sentiment'],
            'lat': hit['location']['lat'],
            'lon': hit['location']['lon'],
        })

    return json.dumps({'tweets': tweets})
def get(self, request):
    """Render the product search page for a GET request.

    When the search form validates, the "daintree" index is queried by
    name, price range and category, and the hits plus one facet entry per
    category bucket are put into the template context. When a category is
    part of the request, a URL without that parameter is added as well.

    :param request: the incoming request; its ``GET`` data feeds the form.
    :return: the rendered ``home.html`` response.
    """
    form = SearchForm(request.GET)
    ctx = {"form": form}

    if form.is_valid():
        cleaned = form.cleaned_data

        s = Search(index="daintree")
        name_query = cleaned.get("name")
        if name_query:
            s = s.query("match", name=name_query)

        # Only the bounds that were actually supplied end up in the range.
        bounds = (("gte", cleaned.get("min_price")),
                  ("lte", cleaned.get("max_price")))
        price_query = {op: bound for op, bound in bounds if bound is not None}
        if price_query:
            s = s.query("range", price=price_query)

        s.aggs.bucket("categories", "terms", field="category")

        selected_category = request.GET.get("category")
        if selected_category:
            s = s.query("match", category=selected_category)

        result = s.execute()
        ctx["products"] = result.hits

        # One facet link per category bucket, keeping the other parameters.
        facets = []
        for bucket in result.aggregations.categories.buckets:
            params = request.GET.copy()
            params["category"] = bucket.key
            facets.append({
                "name": bucket.key,
                "doc_count": bucket.doc_count,
                "url": "{}?{}".format(reverse("home"), params.urlencode()),
            })
        ctx["category_aggs"] = facets

    # NOTE(review): placed at function level (runs even for an invalid form);
    # the source had lost its indentation - confirm.
    if "category" in request.GET:
        params = request.GET.copy()
        del params["category"]
        ctx["remove_category_url"] = "{}?{}".format(
            reverse("home"), params.urlencode())

    return render(request, "home.html", ctx)
def get_unique_terms(self, field_name, min_docs=5):
    """Return the distinct terms of a field together with their doc counts.

    :param field_name: name of the field the terms aggregation runs on.
    :param min_docs: minimum number of documents a term must occur in to be
        reported (passed as ``min_doc_count``).
    :return: dict mapping each bucket key to its ``doc_count``.
    """
    assert isinstance(self.search_obj, Search)

    # define a bucket aggregation and metrics inside:
    # (kept as-is: this mutates the shared ``self.search_obj``, whose
    # consumers are not visible from here)
    self.search_obj.aggs.bucket('tokens', 'terms', field=field_name, size=20)

    s = Search(self.es).index(self.index_name)
    # ``query`` returns a modified copy; the result used to be thrown away.
    s = s.query('match_all')
    s.aggs.bucket('myaggs', 'terms', field=field_name, size=0,
                  min_doc_count=min_docs)

    buckets = s.execute().aggregations.myaggs.buckets
    return {bucket['key']: bucket['doc_count'] for bucket in buckets}
def update_sentiments(self):
    """Run tone analysis on every article that has not been analysed yet.

    Articles without a ``watson_analyzed`` field are scanned; for each one
    the emotion tone scores are stored in ``tone`` and ``watson_success`` is
    set. Whether or not the analysis succeeds, the article is flagged as
    ``watson_analyzed`` and saved.

    :raises RealError: if no article was counted as processed.
    """
    from watson_developer_cloud import ToneAnalyzerV3Beta

    analyzer = ToneAnalyzerV3Beta(username='******',
                                  password='******',
                                  version='2016-02-11')

    pending = Search(using=connections.get_connection(),
                     index='articles', doc_type='article')
    pending = pending.query(
        Q('bool', must=[Q('missing', field='watson_analyzed')]))

    processed = 0
    for result in pending.scan():
        article = Article.get(result.meta.id)
        try:
            analysis = analyzer.tone(text=article.body)
            categories = analysis['document_tone']['tone_categories']
            emotion = [category for category in categories
                       if category['category_id'] == 'emotion_tone'][0]
            article.tone = {}
            for tone in emotion['tones']:
                article.tone[tone['tone_id']] = tone['score']
            article.watson_success = True
        except WatsonException:
            # The finally clause below still runs before the next article.
            continue
        finally:
            article.watson_analyzed = True
            article.save()
        # NOTE(review): counting is placed after the try statement (so failed
        # analyses are not counted) because the ``continue`` above would be
        # redundant otherwise; the source had lost its indentation - confirm.
        processed += 1
        print(processed)

    if processed == 0:
        raise RealError()
def query_articles(self, query, prefs):
    """Find analysed articles whose body matches a query.

    Only articles that have a ``watson_analyzed`` field and a true
    ``watson_success`` are considered. Of the first 100 hits, those whose
    URL contains ``#`` or ``?`` are dropped.

    :param query: text matched against the article body.
    :param prefs: passed through to ``select_k_and_sort``.
    :return: the collected documents when there are fewer than 10 of them,
        otherwise the result of ``select_k_and_sort(documents, prefs)``.
    """
    client = connections.get_connection()
    search = Search(using=client, index='articles')
    search = search.query(Q('bool', must=[
        Q('exists', field='watson_analyzed'),
        Q('match', watson_success=True),
        Q('match', body=query),
    ]))

    documents = []
    # Iterating the sliced search sends the request. The extra
    # ``search.execute()`` whose response was discarded has been removed;
    # it only cost a second round trip.
    for hit in search[:100]:
        if '#' in hit.url or '?' in hit.url:
            continue
        documents.append({
            'id': hit.meta.id,
            'title': hit.title,
            'body': hit.body,
            'url': hit.url,
            'score': hit.meta.score,
            'tone': dict(
                joy=hit.tone.joy,
                fear=hit.tone.fear,
                sadness=hit.tone.sadness,
                disgust=hit.tone.disgust,
                anger=hit.tone.anger
            ),
            'top_image': hit.top_image
        })

    if len(documents) < 10:
        return documents
    return select_k_and_sort(documents, prefs)
def search(self, **params):
    """Search the log index with optional paging, sorting and filtering.

    :Keyword Arguments:
        * *index* -- index to search; defaults to ``self.index``.
        * *page*, *per_page* -- 1-based page number and page size; paging is
          applied only when both are truthy.
        * *sort* -- ``created_at`` or ``level``, optionally prefixed with
          ``-``; any other value is ignored.
        * *group_by* -- when given, only documents whose ``level`` matches
          this value are returned.
        * *format* -- ``'dict'`` to get ``self._to_dict(hits)``; anything
          else (default ``'object'``) gives ``self._to_logs(hits)``.
        * further keys are read by ``self._filter_by_date_interval``.

    :return: the converted search response (see *format*).
    """
    index = params.get('index', self.index)
    search = Search(using=self.client, index=index)

    page = params.get('page', None)
    per_page = params.get('per_page', None)
    if page and per_page:
        # Elasticsearch's ``from`` is a document offset, not a page index:
        # the former ``from = page - 1`` made consecutive pages overlap.
        offset = (page - 1) * per_page
        search = search.extra(**{'from': offset, 'size': per_page})

    sort = params.get('sort', None)
    if sort:
        sort_field = sort[1:] if sort.startswith('-') else sort
        if sort_field in ('created_at', 'level'):
            search = search.sort(sort)

    date_filter = self._filter_by_date_interval(params)
    if date_filter:
        search = search.filter(date_filter)

    level = params.get('group_by', None)
    if level:
        search = search.query('match', level=level)

    hits = search.execute()

    if params.get('format', 'object') == 'dict':
        return self._to_dict(hits)
    return self._to_logs(hits)
def gracc_query_apel(year, month):
    """Query the summary index for one calendar month of usage records.

    Records are restricted to the given month (by ``EndTime``), to the VOs
    in ``vo_list`` and to either batch records or local-grid payload
    records. They are bucketed by Cores > VO > DN > Site > SiteName, with
    metrics added on the Site and SiteName levels.

    :param year: year of the month to query.
    :param month: month number (1-12).
    :return: the executed search response.
    """
    month_start = datetime.datetime(year, month, 1)
    month_end = month_start + dateutil.relativedelta.relativedelta(months=1)

    in_month = Q('range', EndTime={'gte': month_start, 'lt': month_end})
    wanted_vos = Q('terms', VOName=vo_list)
    batch = Q('term', ResourceType='Batch')
    local_payload = Q('term', ResourceType='Payload') & Q('term', Grid='Local')

    s = Search(using=es, index=osg_summary_index)
    s = s.query('bool', filter=[
        in_month & wanted_vos & (batch | local_payload)
    ])

    # NOTE: the Site level groups by OIM_ResourceGroup; grouping by SiteName
    # or WLCGAccountingName were earlier, disabled alternatives.
    site_bkt = (
        s.aggs
        .bucket('Cores', 'terms', size=MAXSZ, field='Processors')
        .bucket('VO', 'terms', size=MAXSZ, field='VOName')
        .bucket('DN', 'terms', size=MAXSZ, field='DN')
        .bucket('Site', 'terms', size=MAXSZ, missing=MISSING,
                field='OIM_ResourceGroup')
    )
    add_bkt_metrics(site_bkt)

    sitename_bkt = site_bkt.bucket('SiteName', 'terms', size=MAXSZ,
                                   field='SiteName')
    add_bkt_metrics(sitename_bkt)

    return s.execute()
def reverse():
    """Reverse geocode: return the indexed entry closest to a coordinate.

    Reads ``lon`` and ``lat`` (required) plus ``type`` and ``debug``
    (optional) from the request arguments, searches for the single nearest
    document and returns it as a GeoJSON response with CORS applied.
    Responds with HTTP 400 when ``lon`` or ``lat`` is missing or not a
    number.
    """
    try:
        lon = float(request.args.get('lon'))
        lat = float(request.args.get('lat'))
    except (TypeError, ValueError):
        lon = lat = None

    # Compare against None: 0.0 is a valid latitude/longitude and must not
    # be treated as "missing" (a plain truthiness test rejected it).
    if lat is None or lon is None:
        abort(400, "missing 'lon' or 'lat': /?lon=2.0984&lat=48.0938")

    s = Search(es).index(INDEX).query(MatchAll()).extra(size=1).sort({
        "_geo_distance": {
            "coordinate": {
                "lat": lat,
                "lon": lon
            },
            "order": "asc"
        }})

    _type = request.args.get('type', None)
    if _type:
        s = s.query({'match': {'type': _type}})

    results = s.execute()
    if len(results.hits) < 1:
        notfound.debug('reverse: lat: {}, lon: {}, type: {}'.format(
            lat, lon, _type))

    debug = 'debug' in request.args
    data = to_geo_json(results, debug=debug)
    data = json.dumps(data, indent=4 if debug else None)

    response = Response(data, mimetype='application/json')
    cors(response)
    return response
def GetAuditData(self, case, child_id, data_type, start=None, length=None, str_query=None, sort=None, order=None):
    """Fetch the audit documents of one data type for an endpoint and case.

    For a fixed set of data types the query is built by
    ``search_queries.GetGeneratorQuery``; for every other type the first
    1000 documents matching computer name, case name and generator are
    requested.

    :return: list of raw hit dicts from the response (empty when the
        response has no ``hits``), or ``{"connection_error": ...}`` when
        the request raised ``ConnectionError``.
    """
    generator_types = ['w32registryraw', 'filedownloadhistory', 'urlhistory',
                       'timeline', 'w32apifiles', 'w32rawfiles',
                       'w32eventlogs']

    if data_type in generator_types:
        query = search_queries.GetGeneratorQuery(
            data_type, str_query, case, child_id, start, length, sort, order)
    else:
        by_host = Q('query_string', default_field="ComputerName.raw",
                    query=child_id)
        by_case = Q('query_string', default_field="CaseInfo.case_name",
                    query=case)
        query = Search()[0:1000].query(by_host & by_case).filter(
            'term', AuditType__Generator=data_type)

    url = (self.es_host + ":" + self.es_port + self.index +
           self.type_audit_type + '/_search')
    try:
        r = requests.post(url,
                          data=json.dumps(query.to_dict()),
                          auth=(self.elastic_user, self.elastic_pass),
                          verify=False)
    except ConnectionError as e:
        return {"connection_error": e.args[0]}

    try:
        return list(r.json()['hits']['hits'])
    except KeyError:
        # The response carries no hits section.
        return []
def gracc_query_jobs(es, index, starttime, endtime, interval, offset=None):
    """Run a date histogram over batch job records in a time range.

    Records are limited to ``starttime <= EndTime < endtime`` and
    ``ResourceType == 'Batch'``; placeholder site names and VO names are
    excluded. Each histogram bucket carries the sums of ``CoreHours``
    (metric ``CoreHours``) and ``Count`` (metric ``Records``).

    :param es: Elasticsearch client.
    :param index: index (pattern) to search.
    :param interval: histogram interval.
    :param offset: optional number of seconds; sent as a negative histogram
        offset (``"-<offset>s"``).
    :return: the executed search response.
    """
    in_range = Q('range', EndTime={'gte': starttime, 'lt': endtime})
    is_batch = Q('term', ResourceType='Batch')
    bogus_site = Q('terms', SiteName=['NONE', 'Generic', 'Obsolete'])
    bogus_vo = Q('terms', VOName=['Unknown', 'unknown', 'other'])

    s = Search(using=es, index=index)
    s = s.query('bool', filter=[in_range & is_batch & ~bogus_site & ~bogus_vo])

    histogram_opts = {} if offset is None else {'offset': "-%ds" % offset}

    histogram = s.aggs.bucket('EndTime', 'date_histogram',
                              field='EndTime', interval=interval,
                              **histogram_opts)
    histogram = histogram.metric('CoreHours', 'sum', field='CoreHours')
    histogram.metric('Records', 'sum', field='Count')

    return s.execute()
def search():
    """Render the result list for a full-text query with facet filters.

    Reads ``q`` (query text) and any number of ``filter`` arguments of the
    form ``<category>:<value>`` from the request. Underscores in the
    category are turned into hyphens to obtain the field name. All filters
    are applied together as term filters.
    """
    q = request.args.get('q')

    s = Search(using=es, index='hoe', doc_type='record')
    s.aggs.bucket('library_place', 'terms', field='library-place')
    s.aggs.bucket('type', 'terms', field='type')
    s.aggs.bucket('genre', 'terms', field='genre')
    s.aggs.bucket('keywords', 'terms', field='keywords.label')
    s.aggs.bucket('author', 'terms', field='author.literal')
    s.query = Q('multi_match', query=q, fields=['_all'])

    filters = []
    if 'filter' in request.args:
        filters = request.args.getlist('filter')
        logging.info(filters)

    for raw_filter in filters:
        # Split on the first colon only, so values may contain colons.
        cat, sep, val = raw_filter.partition(':')
        if not sep:
            logging.warning('ignoring malformed filter %r', raw_filter)
            continue
        cat = cat.replace('_', '-')
        logging.info(cat)
        # Chain the filters. Assigning ``s.filter = F(...)`` in the loop
        # replaced the previous filter, so only the last one was applied.
        s = s.filter('term', **{cat: val})

    resp = s.execute()

    return render_template('resultlist.html',
                           records=resp.to_dict().get('hits'),
                           facets=resp.aggregations.to_dict(),
                           header=q, query=q, filters=filters)
def BuildAuditAggs(self, child_id, parent_id):
    """Build the tree nodes for the audit types present on one endpoint.

    Aggregates the ``AuditType.Generator`` values of the documents that
    belong to case ``parent_id`` and computer ``child_id`` and returns one
    node per generator, leaving out a fixed list of excluded types.

    :return: list of node dicts (``id``, ``parent``, ``text``, ``type``,
        ``a_attr``), or ``{"connection_error": ...}`` when the request
        raised ``ConnectionError``.
    """
    s = Search()[0]
    s.aggs.bucket('datatypes',
                  A('terms', field='AuditType.Generator', size=0))

    in_case = Q('query_string', default_field="CaseInfo.case_name",
                query=parent_id)
    on_host = Q('match', ComputerName=child_id)
    query = s.query(in_case & on_host)

    url = (self.es_host + ":" + self.es_port + self.index +
           self.type_audit_type + '/_search')
    try:
        r = requests.post(url,
                          data=json.dumps(query.to_dict()),
                          auth=(self.elastic_user, self.elastic_pass),
                          verify=False)
    except ConnectionError as e:
        return {"connection_error": e.args[0]}

    exclude = ['w32processes-memory', 'stateagentinspector', 'w32disks']

    return [
        {
            "id": bucket['key'],
            "parent": child_id,
            "text": bucket['key'],
            "type": "audit",
            "a_attr": {
                "href": "#" + bucket['key'] + '/' + parent_id + "/" + child_id
            },
        }
        for bucket in r.json()['aggregations']['datatypes']['buckets']
        if bucket['key'] not in exclude
    ]
def categories(self):
    """Group the articles of the docstore by category.

    Unless ``settings.MEDIAWIKI_SHOW_UNPUBLISHED`` is set, only published
    articles are considered. Categories listed in
    ``settings.MEDIAWIKI_HIDDEN_CATEGORIES`` are left out.

    :return: dict mapping each category name to a list of ``Page`` objects
        ordered by ``title_sort``.
    """
    s = Search(
        using=docstore._get_connection(settings.DOCSTORE_HOSTS),
        index=settings.DOCSTORE_INDEX,
        doc_type='articles'
    ).fields(['title', 'title_sort', 'categories'])[0:docstore.MAX_SIZE]
    if not settings.MEDIAWIKI_SHOW_UNPUBLISHED:
        s = s.query('match', published=True)

    pages = []
    for hit in s.execute():
        page = Page()
        # Field values are read from index 0 of each returned field.
        page.url_title = hit.title[0]
        page.title = hit.title[0]
        page.title_sort = hit.title_sort[0]
        page.categories = hit.get('categories', [])
        pages.append(page)
    pages = sorted(pages, key=lambda p: p.title_sort)

    grouped = {}
    for page in pages:
        for category in page.categories:
            # exclude internal editorial categories
            if category in settings.MEDIAWIKI_HIDDEN_CATEGORIES:
                continue
            members = grouped.setdefault(category, [])
            # pages are already sorted, so each category list stays sorted
            if page not in members:
                members.append(page)
    return grouped
def get(self, request, *args, **kwargs):
    """Search feed elements and return them serialized, grouped by type.

    The ``q`` request parameter is matched against slug, type and names,
    used as a prefix for the shelf carrier and as an exact term for the
    shelf region; a feed element qualifies when any of these match.

    :return: a 200 response with the keys ``apps``, ``brands``,
        ``collections`` and ``shelves``, or a 404 response carrying the
        same (empty) structure when nothing was found.
    """
    q = request.GET.get('q')

    # Any of the clauses may match.
    sq = query.Bool(should=[
        query.Q('match', slug=self._phrase(q)),          # Slug.
        query.Q('match', type=self._phrase(q)),          # Type.
        query.Q('match', search_names=self._phrase(q)),  # Name.
        query.Q('prefix', carrier=q),                    # Shelf carrier.
        query.Q('term', region=q),                       # Shelf region.
    ])

    res = {'apps': [], 'brands': [], 'collections': [], 'shelves': []}

    es = Search(using=FeedItemIndexer.get_es(),
                index=self.get_feed_element_index())
    feed_elements = es.query(sq).execute().hits
    if not feed_elements:
        return response.Response(res, status=status.HTTP_404_NOT_FOUND)

    # Deserialize each element with the serializer for its item type.
    app_ids = self.get_app_ids_all(feed_elements)
    ctx = {'app_map': self.get_apps(request, app_ids),
           'request': request}
    for element in feed_elements:
        kind = element.item_type
        serialized = self.SERIALIZERS[kind](element, context=ctx).data
        res[self.PLURAL_TYPES[kind]].append(serialized)

    return response.Response(res, status=status.HTTP_200_OK)
def search(self, doc_type, query=""):
    """
    Execute a search query and retrieve the results.

    The lower-cased query is matched against the ``title`` field. Nothing
    is searched (and an empty list is returned) unless ``query`` is a string
    and ``doc_type`` is a document class created by ``DocTypeMeta``.

    :param doc_type: Type in ElasticSearch
    :param query: search query
    :return: list with results
    """
    results = []
    # isinstance instead of exact type comparison, so subclasses of the
    # string types and of the metaclass are accepted as well.
    if isinstance(query, (str, unicode)) and isinstance(doc_type, DocTypeMeta):
        q = Q("multi_match", query=query.lower(), fields=["title"])

        s = Search()
        s = s.using(self.client)
        s = s.index(self.index_name)
        s = s.doc_type(doc_type)
        s = s.query(q)

        # Parenthesised single argument: valid as a Python 2 print statement
        # and as a Python 3 function call.
        print("search query: " + str(s.to_dict()))

        results.extend(s.execute())
    return results
def GetAuditDataMain(self, data):
    """Full-text search over audit documents, returning highlighted fields.

    Requests the first 1000 documents matching the query string ``data``
    (excluding the ``stateagentinspector`` and ``w32processes-tree``
    generators) with highlighting on all fields.

    :param data: query string to search for.
    :return: list with one dict per highlighted field (``doc_id``,
        ``endpoint``, ``audittype``, ``field``, ``response``), or
        ``{"connection_error": ...}`` when the request raised
        ``ConnectionError``.
    """
    s = Search()[0:1000].highlight('*').highlight_options(
        require_field_match=False)

    wanted = Q('query_string', query=data)
    inspector = Q('query_string', default_field="AuditType.Generator",
                  query="stateagentinspector")
    process_tree = Q('query_string', default_field="AuditType.Generator",
                     query="w32processes-tree")
    query = s.query(wanted & ~inspector & ~process_tree)

    url = (self.es_host + ":" + self.es_port + self.index +
           self.type_audit_type + '/_search')
    try:
        r = requests.post(url,
                          data=json.dumps(query.to_dict()),
                          auth=(self.elastic_user, self.elastic_pass),
                          verify=False)
    except ConnectionError as e:
        return {"connection_error": e.args[0]}

    matches = []
    try:
        for hit in r.json()['hits']['hits']:
            for field, fragments in hit['highlight'].iteritems():
                matches.append({
                    "doc_id": hit['_id'],
                    "endpoint": hit['_parent'],
                    "audittype": hit['_source']['AuditType']['Generator'],
                    "field": field,
                    "response": fragments
                })
    except KeyError:
        # A missing key ends the collection; what was gathered so far is
        # still returned.
        pass

    return matches
def BuildRootTree(self):
    """Build the "Stackable Data" tree from the audit types in the index.

    Aggregates ``AuditType.Generator`` over all documents that have a
    ``hostname`` parent and adds one child node for every generator that is
    in the list of stackable types.

    :return: list of node dicts starting with the ``stackable`` root, or
        ``{"connection_error": ...}`` when the request raised
        ``ConnectionError``.
    """
    s = Search()
    s.aggs.bucket('datatypes',
                  A('terms', field='AuditType.Generator', size=16))
    has_host = Q('has_parent', type='hostname',
                 query=Q('query_string', query="*"))
    query = s.query(has_host)

    url = self.es_host + self.es_port + self.index + self.type_audit_type + '/_search'
    try:
        r = requests.post(url, data=json.dumps(query.to_dict()))
    except ConnectionError as e:
        return {"connection_error": e.args[0]}

    nodes = [{
        "id": "stackable", "parent": "#", "text": "Stackable Data"
    }]

    stackable = ['w32services', 'w32tasks', 'w32scripting-persistence',
                 'w32prefetch', 'w32network-dns', 'urlhistory']

    nodes.extend(
        {
            "id": bucket['key'],
            "parent": "stackable",
            "text": bucket['key'],
            "children": True
        }
        for bucket in r.json()['aggregations']['datatypes']['buckets']
        if bucket['key'] in stackable
    )
    return nodes
def BuildRootTree(self):
    """Build the investigation tree: two root nodes plus one node per case.

    Aggregates ``CaseInfo.case_name`` over all documents; every case name
    becomes a child of the "Current Investigations" root.

    :return: list of node dicts, or ``{"connection_error": ...}`` when the
        request raised ``ConnectionError``.
    """
    s = Search()
    s.aggs.bucket('casenum',
                  A('terms', field="CaseInfo.case_name", size=0))
    query = s.query(Q('query_string', query="*"))

    url = (self.es_host + ":" + self.es_port + self.index +
           self.type_audit_type + '/_search')
    try:
        r = requests.post(url,
                          data=json.dumps(query.to_dict()),
                          auth=(self.elastic_user, self.elastic_pass),
                          verify=False)
    except ConnectionError as e:
        return {"connection_error": e.args[0]}

    nodes = [{
        "id": "current_inv", "parent": "#",
        "text": "Current Investigations", "type": "root"
    }, {
        "id": "comp_inv", "parent": "#",
        "text": "Completed Investigations", "type": "root"
    }]

    nodes.extend(
        {
            "id": bucket['key'],
            "parent": "current_inv",
            "text": bucket['key'],
            "children": True,
            "type": "case"
        }
        for bucket in r.json()['aggregations']['casenum']['buckets']
    )
    return nodes
def GetAuditGenerator(endpoints):
    """Build the stacking query for one audit generator across endpoints.

    :param endpoints: mapping of generator name to a collection of computer
        names. Only the first key and the first non-empty value (in
        iteration order) are used.
    :return: the query body as a dict: documents of the generator on any of
        the computers, aggregated by a generator-specific record field and,
        below that, by computer name.
    :raises IndexError: if ``endpoints`` has no non-empty value.
    """
    generators = []
    host_groups = []
    for name, hosts in endpoints.iteritems():
        generators.append(name)
        # Record each distinct, non-empty host collection once.
        for _ in hosts:
            if hosts not in host_groups:
                host_groups.append(hosts)

    joined = ' OR '.join(host_groups[0])
    generator = generators[0]

    # Field stacked on per generator; anything else stacks on the name.
    record_fields = {
        'w32scripting-persistence': 'Record.Path.raw',
        'w32prefetch': 'Record.ApplicationFileName.raw',
        'w32network-dns': 'Record.RecordName.raw',
    }
    stack_field = record_fields.get(generator, 'Record.Name.raw')

    s = Search()[0]
    aggs_gen = A('terms', field=stack_field, size=0)
    aggs_endpoint = A('terms', field="ComputerName.raw", size=0)
    s.aggs.bucket('generator', aggs_gen).bucket('endpoint', aggs_endpoint)

    on_hosts = Q('query_string', default_field="ComputerName.raw",
                 query=joined)
    query = s.query(on_hosts).filter('term', AuditType__Generator=generator)

    return query.to_dict()
def search(self, query: str, filters: dict=None, only_this_type: bool=True, **kwargs: dict) -> list:
    """Search elasticsearch, then load the matching model instances.

    :param query: query terms to search by
    :param filters: named (attribute, value) filters to limit the query
        results
    :param only_this_type: restrict the search to the indexer's doc type
    :param kwargs: additional search keyword arguments (currently unused)
    :return: a list of model instances ordered by descending score; each
        instance carries its own score in the ``es_score`` attribute
    """
    # build base search object
    s = Search(using=self.indexer.es).index(self.indexer.index_name)
    if only_this_type:
        s = s.doc_type(self.indexer.doc_type_name)

    # build query
    s = s.query('match', _all=query)

    # add filters
    if filters is not None:
        for attr, value in filters.items():
            s = s.filter(F({'term': {attr: value}}))

    res = s.execute()

    # model -> {primary key: score}
    scores = {}
    for hit in res:
        model = get_model(hit._meta.doc_type)
        pk = getattr(hit, model._meta.pk.name)
        scores.setdefault(model, {})[pk] = hit._meta.score

    # one database query per model
    instances = []
    for model, pk_score in scores.items():
        instances.extend(model.objects.filter(pk__in=pk_score.keys()))

    for instance in instances:
        score = scores[type(instance)][instance.pk]
        # Keep the score on the instance itself. ``_meta`` is the model's
        # options object shared by every instance of that model, so storing
        # the score only there gave all instances the last assigned score
        # and made the ordering below meaningless.
        instance.es_score = score
        # Legacy location, kept for callers that still read it.
        instance._meta.es_score = score

    # order by score, best match first
    return sorted(instances, key=lambda i: i.es_score, reverse=True)
def get_item(self, identifier):
    """Return the RDF record stored under a document id.

    :param identifier: value matched against ``_id``.
    :return: the first record produced by
        ``ElasticSearchRDFRecord.get_rdf_records_from_query`` or ``None``
        unless the search reports exactly one hit.
    """
    s = Search(using=self.client).query("match", _id=identifier)
    response = s.execute()

    if response.hits.total != 1:
        return None

    records = ElasticSearchRDFRecord.get_rdf_records_from_query(
        query=s, response=response)
    return records[0]
def query(self):
    """Return the search body built from the collected filters and queries.

    All entries of ``self.filters`` are applied first, then all entries of
    ``self.queries``.

    :return: the search as a dict (``Search.to_dict()``).
    """
    search = Search()
    for filter_clause in self.filters:
        search = search.filter(filter_clause)
    for query_clause in self.queries:
        search = search.query(query_clause)
    return search.to_dict()
def get_articles_by_iid(iid, page_from=0, page_size=1000):
    """Return one page of the articles that belong to an issue.

    :param iid: issue id matched against ``issue_iid``.
    :param page_from: offset of the first article to return.
    :param page_size: maximum number of articles to return.
    :return: the executed search response.
    """
    search = Search(index=INDEX).query("match", issue_iid=iid)
    search = search.query("match", _type="article")
    # The slice end is an absolute position, so it has to be offset + size.
    # The previous ``[page_from:page_size]`` returned fewer (or no)
    # documents for every page_from > 0.
    search = search[page_from:page_from + page_size]
    return search.execute()
def get_issue_by_iid(iid):
    """Return the first issue document with the given ``iid``.

    :return: the first hit, or ``None`` when the search was not successful
        or found nothing.
    """
    search = (Search(index=INDEX)
              .query("match", iid=iid)
              .query("match", _type="issue"))
    response = search.execute()

    if response.success() and response.hits.total > 0:
        return response[0]
    return None
def get_article_by_aid(aid):
    """Return the first article document with the given ``aid``.

    :return: the first hit, or ``None`` when the search was not successful
        or found nothing.
    """
    search = (Search(index=INDEX)
              .query("match", aid=aid)
              .query("match", _type="article"))
    response = search.execute()

    if response.success() and response.hits.total > 0:
        return response[0]
    return None
def GetGeneratorQuery(case, endpoint_id, start, length, str_query, sort, order):
    """Build the paged, sorted timeline query for one endpoint of a case.

    :param case: case name the documents must belong to.
    :param endpoint_id: computer name the documents must match.
    :param start: offset of the first document (converted with ``int``).
    :param length: number of documents to return (converted with ``int``).
    :param str_query: optional search text; when not empty it is applied as
        a prefix query (``<text>*``) on path, URL and generator fields.
    :param sort: column index; looked up in the sortable-column table
        (only ``0`` -> ``TlnTime`` is defined).
    :param order: sort direction passed to Elasticsearch.
    :return: the query body as a dict.
    :raises KeyError: if ``sort`` is not a known column index.
    """
    first = int(start)
    s = Search()[first:int(length) + int(start)]
    s = s.fields([
        "Record.Path",
        "Record.Url",
        "Record.SourceUrl",
        "Record.TlnTime",
        "Record.File.Accessed",
        "Record.File.Modified",
        "Record.File.Changed",
        "AuditType.Generator"
    ])

    order_dict = {
        "0": "TlnTime"
    }
    _sort = {
        "Record.{0}".format(order_dict[str(sort)]): {
            "order": order
        }
    }

    # Conditions shared by both variants of the query.
    t = (Q('query_string', default_field="Record.TlnTime", query="*")
         & Q('match', ComputerName=endpoint_id)
         & ~Q('match', AuditType__Generator="w32processes-memory")
         & ~Q('match', AuditType__Generator="w32useraccounts"))

    if str_query != "":
        t = t & Q('query_string',
                  fields=["Record.Path", "Record.Url", "Record.SourceUrl",
                          "AuditType.Generator"],
                  query="{0}*".format(str_query))

    query = s.query(t).filter('term', CaseInfo__case_name=case).sort(_sort)
    return query.to_dict()
def convert_filters_to_query(self, filters):
    """Translate harvesting filters into a sliced, sorted search.

    :param filters: mapping that may contain ``dataset__spec``,
        ``modified__gt`` and ``modified__lt``.
    :return: a ``Search`` sorted ascending by ``system.modified_at`` and
        sliced from ``self.cursor`` to ``self.get_next_cursor()``.

    Side effect: when no spec is set yet, ``self.spec`` is taken from the
    ``dataset__spec`` filter.
    """
    requested_spec = filters.get("dataset__spec", None)
    modified_from = filters.get('modified__gt', None)
    modified_until = filters.get('modified__lt', None)

    if requested_spec and not self.spec:
        self.spec = requested_spec

    s = Search(using=self.client)
    if self.spec:
        s = s.query("match", **{'system.spec.raw': self.spec})

    if self.query:
        # Both parts of the stored query are added as query clauses.
        for part in ('query', 'filter'):
            if part in self.query:
                s = s.query(self.query.get(part))

    if modified_from:
        s = s.filter("range", **{"system.modified_at": {"gte": modified_from}})
    if modified_until:
        s = s.filter("range", **{"system.modified_at": {"lte": modified_until}})

    s = s.sort({"system.modified_at": {"order": "asc"}})
    return s[self.cursor: self.get_next_cursor()]
def get_dataset_list(self):
    """List the datasets found in the index with their document counts.

    The search is narrowed by the stored query's ``filter`` part when a
    query is set, otherwise by ``self.spec`` when that is set.

    :return: list of ``self.ESDataSet`` objects, one per bucket of the
        terms aggregation on ``delving_spec.raw``, built from the bucket
        key and its ``doc_count``.
    """
    s = Search(using=self.client)
    if self.query:
        s = s.filter(self.query.get('filter'))
    elif self.spec:
        s = s.query("match", **{'system.spec.raw': self.spec})

    s.aggs.bucket("dataset-list", A("terms", field="delving_spec.raw"))

    buckets = s.execute().aggregations['dataset-list'].buckets
    return [
        self.ESDataSet(bucket.key, None, None, bucket.doc_count, None)
        for bucket in buckets
    ]
def save_forensic_report_to_elasticsearch(forensic_report,
                                          index_suffix=None,
                                          monthly_indexes=False,
                                          number_of_shards=1,
                                          number_of_replicas=1):
    """
    Saves a parsed DMARC forensic report to ElasticSearch

    Args:
        forensic_report (OrderedDict): A parsed forensic report
        index_suffix (str): The suffix of the name of the index to save to
        monthly_indexes (bool): Use monthly indexes instead of daily
                                indexes
        number_of_shards (int): The number of shards to use in the index
        number_of_replicas (int): The number of replicas to use in the
                                  index

    Raises:
        AlreadySaved: a sample with the same arrival date and the same
            from/to/subject headers is already indexed
        InvalidForensicReport: the report lacks a field needed to build
            the document
        ElasticsearchError: saving the document failed
    """
    logger.debug("Saving forensic report to Elasticsearch")
    forensic_report = forensic_report.copy()
    parsed_sample = forensic_report["parsed_sample"]

    sample_date = None
    if parsed_sample["date"] is not None:
        sample_date = human_timestamp_to_datetime(parsed_sample["date"])

    # Header names are compared in lower case.
    original_headers = parsed_sample["headers"]
    headers = OrderedDict(
        (name.lower(), original_headers[name]) for name in original_headers)

    arrival_date_human = forensic_report["arrival_date_utc"]
    arrival_date = human_timestamp_to_datetime(arrival_date_human)

    # Look for an already indexed copy of this sample.
    search = Search(index="dmarc_forensic*")
    q = Q({"match": {"arrival_date": arrival_date}})
    header_fields = (("from", "sample.headers.from"),
                     ("to", "sample.headers.to"),
                     ("subject", "sample.headers.subject"))
    seen = {"from": None, "to": None, "subject": None}
    for header, field in header_fields:
        if header in headers:
            seen[header] = headers[header]
            q = q & Q({"match_phrase": {field: seen[header]}})
    search.query = q

    existing = search.execute()
    if len(existing) > 0:
        raise AlreadySaved("A forensic sample to {0} from {1} "
                           "with a subject of {2} and arrival date of {3} "
                           "already exists in "
                           "Elasticsearch".format(seen["to"],
                                                  seen["from"],
                                                  seen["subject"],
                                                  arrival_date_human))

    sample = _ForensicSampleDoc(
        raw=forensic_report["sample"],
        headers=headers,
        headers_only=forensic_report["sample_headers_only"],
        date=sample_date,
        subject=parsed_sample["subject"],
        filename_safe_subject=parsed_sample["filename_safe_subject"],
        body=parsed_sample["body"])

    address_adders = (("to", sample.add_to),
                      ("reply_to", sample.add_reply_to),
                      ("cc", sample.add_cc),
                      ("bcc", sample.add_bcc))
    for key, add_address in address_adders:
        for address in parsed_sample[key]:
            add_address(display_name=address["display_name"],
                        address=address["address"])

    for attachment in parsed_sample["attachments"]:
        sample.add_attachment(filename=attachment["filename"],
                              content_type=attachment["mail_content_type"],
                              sha256=attachment["sha256"])

    try:
        source = forensic_report["source"]
        forensic_doc = _ForensicReportDoc(
            feedback_type=forensic_report["feedback_type"],
            user_agent=forensic_report["user_agent"],
            version=forensic_report["version"],
            original_mail_from=forensic_report["original_mail_from"],
            arrival_date=arrival_date,
            domain=forensic_report["reported_domain"],
            original_envelope_id=forensic_report["original_envelope_id"],
            authentication_results=forensic_report["authentication_results"],
            delivery_results=forensic_report["delivery_result"],
            source_ip_address=source["ip_address"],
            source_country=source["country"],
            source_reverse_dns=source["reverse_dns"],
            source_base_domain=source["base_domain"],
            authentication_mechanisms=forensic_report[
                "authentication_mechanisms"],
            auth_failure=forensic_report["auth_failure"],
            dkim_domain=forensic_report["dkim_domain"],
            original_rcpt_to=forensic_report["original_rcpt_to"],
            sample=sample)

        # Index name: dmarc_forensic[_<suffix>]-<arrival month or day>
        index = "dmarc_forensic"
        if index_suffix:
            index = "{0}_{1}".format(index, index_suffix)
        date_format = "%Y-%m" if monthly_indexes else "%Y-%m-%d"
        index = "{0}-{1}".format(index, arrival_date.strftime(date_format))

        index_settings = dict(number_of_shards=number_of_shards,
                              number_of_replicas=number_of_replicas)
        create_indexes([index], index_settings)
        forensic_doc.meta.index = index
        try:
            forensic_doc.save()
        except Exception as e:
            raise ElasticsearchError(
                "Elasticsearch error: {0}".format(e.__str__()))
    except KeyError as e:
        raise InvalidForensicReport(
            "Forensic report missing required field: {0}".format(e.__str__()))
def user_last_interaction(userid):
    """Build the search for a user's most recent interaction.

    The search is not executed here.

    :param userid: value matched against ``user_id``.
    :return: a ``Search`` limited to one hit, sorted by ``datetime``
        descending.
    """
    newest_only = Search().extra(size=1)
    by_user = newest_only.query("match", user_id=userid)
    return by_user.sort("-datetime")
def example10():
    """
    DSL objects for common entities instead of dict/json. All importable
    from elasticsearch_dsl
    """
    from elasticsearch_dsl import Q, Search

    # Straightforward mapping to json - kwargs are translated into keys of
    # the json. to_dict() shows the resulting json.
    q = Q("terms", tags=["python", "search"])
    q.to_dict()

    # All objects can also be constructed from the raw dict.
    q = Q({"terms": {"tags": ["python", "search"]}})
    q.to_dict()

    # Query objects support logical operators, which result in bool queries.
    q = q | Q("match", title="python")
    q.to_dict()

    # DSL objects also allow attribute access instead of ['key'].
    q.minimum_should_match = 2
    q.minimum_should_match
    q.to_dict()

    from datetime import date

    q = q & Q("range", **{"@timestamp": {"lt": date(2019, 1, 1)}})
    q.to_dict()

    # Configuration is global, so no client needs to be passed around.
    from elasticsearch_dsl import connections

    # Default connection, used where no other connection is specified. Any
    # configuration method just passes all parameters on to the underlying
    # elasticsearch-py client.
    connections.create_connection(hosts=["localhost"])

    # Optionally specify an alias for the connection in case of multiple
    # connections.
    connections.create_connection("prod", hosts=["localhost"])
    s = Search(using="prod")
    s.count()

    # You can always just pass in your own client instance.
    s = Search(using=Elasticsearch())
    s.count()

    # Any method on Search returns a clone, so always assign the result
    # back to the same variable.
    s = Search()
    s = s.params(q="fix")

    # Multiple queries are combined using the AND operator.
    s = Search()
    s = s.query("match", description="fix")
    s = s.query("match", author="Honza")

    # Filter shortcut to use {bool: {filter: []}}.
    s = s.filter("range", committed_date={"lt": date(2016, 1, 1)})
    s.to_dict()

    # Exclude is a wrapper around must_not; use __ instead of dots for
    # convenience.
    s = s.exclude("term", committer__name__keyword="Honza Král")

    # The search is executed when iterated on or when .execute() is called.
    for hit in s:
        # The Hit class offers direct access to fields and, via .meta, to any
        # other property of the returned hit (_id, _seq_no, ...).
        print(f"{hit.meta.id[:6]} ({hit.author.name}): {hit.description[:50]}")

    # Aggregations are implemented in place to allow for chaining.
    s = Search(index="git")
    s.aggs.bucket("tags", "terms", field="terms").metric(
        "lines", "sum", field="stats.lines"
    ).metric("authors", "cardinality", field="author.name.keyword")
    r = s.execute()

    # Or modify an aggregation in place.
    s.aggs["tags"].bucket(
        "months", "date_histogram", field="committed_date", interval="month"
    )

    # Analysis
    from elasticsearch_dsl import analyzer, token_filter

    a = analyzer(
        "file_analyzer",
        tokenizer="path_hierarchy",
        filter=[
            "lowercase",
            token_filter(
                "split_ext",
                "pattern_capture",
                preserve_original=True,
                patterns=[r"^([^\.]+)"],
            ),
        ],
    )
    a.simulate("test/integration/search.py")

    # Document classes
    from elasticsearch_dsl import Document, Text, Keyword, InnerDoc, Date, Nested

    class FileDiff(InnerDoc):
        filename = Text(analyzer=a)
        patch = Text()

    class Commit(Document):
        description = Text()
        committed_date = Date()
        author = Text(fields={"keyword": Keyword()})
        files = Nested(FileDiff)

        def subject(self):
            # First line of the description, capped at 80 characters.
            return self.description.split("\n", 1)[0][:80]

        class Index:
            name = "git*"
            settings = {"number_of_replicas": 0}

    # Create the index.
    Commit.init(index="git-v2")

    # Search now returns Commit objects.
    for c in Commit.search():
        print(f"{c.meta.id}: {c.subject()}")
'w32apifiles', 'w32rawfiles', 'w32eventlogs' ] if data_type in q: query = search_queries.GetGeneratorQuery(data_type, str_query, case, child_id, start, length, sort, order) else: s = Search() s = s[0:1000] t = Q('query_string', default_field="ComputerName.raw", query=child_id) & Q('query_string', default_field="CaseInfo.case_name", query=case) query = s.query(t).filter('term', AuditType__Generator=data_type) try: r = requests.post(self.es_host + ":" + self.es_port + self.index + self.type_audit_type + '/_search', data=json.dumps(query.to_dict()), auth=(self.elastic_user, self.elastic_pass), verify=False) except ConnectionError as e: ret = {"connection_error": e.args[0]} return ret data = [] try: for x in r.json()['hits']['hits']:
def search_day(self, qterm, score_metric='perc', **kwargs):
    """
    Search the elasticsearch index for irc messages, grouped by day and
    channel.

    Uses elasticsearch aggregations to build the following levels:

    - A: filter (day/channel)
      -> B: group by day (day-bucket)
      -> C: group by channel (channel-bucket)

    The channel-buckets are sorted by the 99-percentile of the scores of the
    documents they contain (1% of the documents in a channel-bucket score
    higher than that bucket's 99-percentile). Compared to sum or avg, the
    99-percentile values high-scoring documents more and mostly ignores many
    low-scoring ones.

    For each day the highest score of all its channel-buckets is kept as
    ``max_score_day``; the day-buckets are ordered by it.

    The above describes ``score_metric == 'perc'``. With another metric the
    behaviour is the same except that metric is used.

    Definition: a document is one log-message.

    :param qterm: ``str`` Query-string to find
    :param score_metric: ``str`` Metric used to score (and order) the
        channel-buckets:

        - 'perc' 99-percentile of the document scores in the channel-bucket
          -> high-matching documents are valued higher
        - 'sum' sum of all document scores in the channel-bucket
          -> all documents equal; many medium matches may "eat up" high ones
        - 'max' highest document score in the channel-bucket
          -> returns the day and channel with the best matching log-message
    :param kwargs: See below

    :Keyword Arguments:
        * *date_gte* (``datetime``) -- Filter, from: only messages at or
          after this date
        * *date_lte* (``datetime``) -- Filter, to: only messages at or before
          this date (default ``'now'``)
        * *date_sliding_value* (``str``) -- Size of the sliding window, i.e.
          the XX in "the past XX hours/days/years"
          -- See: https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#date-math
        * *date_sliding_type* (``str``) -- Date-math unit of the sliding
          window, e.g. y M d
        * *use_sliding_value* (``bool``) -- True (default): use the sliding
          window when both sliding values are set, otherwise fall back to
          the fixed range. False: only use date_gte / date_lte.
        * *number_results* (``int``) -- Result budget (default 50); a third
          of it is used as the number of channel-buckets returned
        * *sort_field* (``str``) -- ``'_score'`` (default),
          ``'channel.keyword'`` or ``'@timestamp'``. Any other value falls
          back to score order.
        * *sort_dir* (``str``) -- '-': descending (default), anything else:
          ascending
        * *filter_channel* (``str``) -- Only consider this channel

    :return: list of the top hits of the selected channel-buckets, each
        decorated with ``meta``, ``sent``, ``day_raw``, ``timestamp_raw``,
        ``username``, ``channel`` and ``msg``
    """
    number_top_hits = 5
    datetime_format = '{:' + dementor_constants.JSON_DATETIME_FORMAT + '}'

    # Arguments; each keyword overrides its default only when it is supplied.
    date_gte = None  # e.g. '2010-01-31T22:28:14+0300'
    date_lte = 'now'  # e.g. '2012-09-20T17:41:14+0900'
    if 'date_gte' in kwargs:
        date_gte = datetime_format.format(kwargs['date_gte'])
    if 'date_lte' in kwargs:
        date_lte = datetime_format.format(kwargs['date_lte'])
    use_sliding_value = kwargs.get('use_sliding_value', True)
    date_sliding_value = kwargs.get('date_sliding_value', '')
    date_sliding_type = kwargs.get('date_sliding_type', '')
    number_results = kwargs.get('number_results', 50)
    sort_field = kwargs.get('sort_field', '_score')
    sort_dir = kwargs.get('sort_dir', '-')
    filter_channel = kwargs.get('filter_channel', '')

    # Prepare query; size 0 because only the aggregations are of interest.
    s = DslSearch(using=self._es, index=self._index_prefix.format('*'))
    s = s[0:0]
    s = s.query(self._get_query(qterm))

    # Score metric and the matching order-field for buckets and documents.
    percentiles_percents = 99
    percentiles_percents_field = '99.0'
    percentiles_percents_field_order = elastic_constants.IRC_DAY_ORDER_FIELD[
        'perc']
    score_order_field = elastic_constants.IRC_DAY_ORDER_FIELD[score_metric]

    # A: date/channel filter
    filters = []
    # Logical (not bitwise) test, so any truthy/falsy flag value works.
    if (use_sliding_value and date_sliding_value != ''
            and date_sliding_type != ''):
        filters.append({
            'range': {
                '@timestamp': {
                    'gte': 'now-{0}{1}'.format(date_sliding_value,
                                               date_sliding_type),
                    'lte': 'now'
                }
            }
        })
    elif date_gte is not None:
        filters.append(
            {'range': {
                '@timestamp': {
                    'gte': date_gte,
                    'lte': date_lte
                }
            }})
    if filter_channel != '':
        filters.append({'term': {'channel.keyword': filter_channel}})
    a_log_filtered = A('filter', Q('bool', must=filters))

    # B: bucket days
    b_bucket_days = A('date_histogram',
                      field='@timestamp',
                      interval='day',
                      format='yyyy-MM-dd',
                      min_doc_count=1,
                      order={'max_score_day': 'desc'})

    # C: bucket channels, with the per-channel score metrics and top hits
    score_script = {'inline': '_score', 'lang': 'painless'}
    c_bucket_channels = A('terms',
                          field='channel.keyword',
                          min_doc_count=1,
                          order={score_order_field: 'desc'})
    c_bucket_channels = c_bucket_channels \
        .metric('max_date', 'max', field='@timestamp') \
        .metric('sum_score_channel', 'sum', script=dict(score_script)) \
        .metric('max_score_channel', 'max', script=dict(score_script)) \
        .metric('percentiles_score_channel', 'percentiles',
                percents=[percentiles_percents],
                script=dict(score_script)) \
        .metric('top_msg_hits', 'top_hits',
                size=number_top_hits,
                highlight={'fields': {'msg': {}, 'username': {},
                                      'channel': {}}},
                sort=[{'_score': {'order': 'desc'}}],
                _source={'includes': ['channel', 'username', '@timestamp',
                                      'msg']})

    # Stack aggregations Main -> A -> B -> C (reversed order)
    b_bucket_days.bucket('logs_per_channel', c_bucket_channels)
    b_bucket_days.metric('max_score_day', 'max', field=score_order_field)
    a_log_filtered.bucket('logs_per_day', b_bucket_days)
    s.aggs.bucket('logs_filtered', a_log_filtered)

    response = s.execute()

    # Flatten days -> channels into one list (one bucket = a channel per day)
    bucket_days = response.aggregations.logs_filtered.logs_per_day.buckets
    bucket_channel_flat = [
        item for sub in bucket_days for item in sub.logs_per_channel.buckets
    ]

    def bucket_score(bucket_channel):
        # The percentile metric nests its value under the percent key; the
        # other metrics expose a plain ``.value``.
        if score_order_field == percentiles_percents_field_order:
            return bucket_channel.percentiles_score_channel.values[
                percentiles_percents_field]
        return bucket_channel[score_order_field].value

    # Unknown sort fields fall back to score order instead of failing on an
    # unbound key function.
    sort_keys = {
        'channel.keyword': lambda bucket_channel: bucket_channel['key'],
        '@timestamp': lambda bucket_channel: bucket_channel['max_date'].value,
    }
    bucket_channel_flat_sorted = sorted(bucket_channel_flat,
                                        key=sort_keys.get(sort_field,
                                                          bucket_score),
                                        reverse=(sort_dir == '-'))

    # Limit result-size
    number_results_buckets = int(number_results / 3)
    bucket_channel_flat_sorted = bucket_channel_flat_sorted[
        0:number_results_buckets]

    # Decorate and collect the hits to display from the flattened buckets
    hit_list = []
    for channel_bucket in bucket_channel_flat_sorted:
        bucket_hits = channel_bucket.top_msg_hits.hits.hits
        if bucket_hits:
            score = bucket_score(channel_bucket)
        for hit in bucket_hits:
            hit.meta = {'score': score, 'highlight': {}}
            hit_src = hit['_source']
            hit.sent = dateutil.parser.parse(hit_src['@timestamp'])
            hit.day_raw = '{:%Y-%m-%d}'.format(hit.sent)
            hit.timestamp_raw = hit_src['@timestamp']
            hit.username = hit_src.username
            hit.channel = hit_src.channel
            hit.msg = hit_src.msg
            if hasattr(hit, 'highlight'):
                hit.meta.highlight = copy.deepcopy(hit.highlight)
            hit.meta.id = hit['_id']
        hit_list.extend(bucket_hits)
    return hit_list
def get_elastic_container_histogram_legacy(ident) -> List:
    """
    Build the legacy stacked {year, in_ia} histogram for one container.

    Backs the older style of coverage graph (SVG or JSON export). This
    function should be DEPRECATED and removed in the near future.

    Only the past 500 years (at most) are considered, i.e. about 1000 values.

    Returns a sorted list of tuples: (year, in_ia, count)
    """
    current_year = datetime.datetime.today().year
    year_window = Q("range", release_year={
        "gte": current_year - 499,
        "lte": current_year,
    })
    container_match = Q(
        "bool",
        minimum_should_match=1,
        should=[Q("match", container_id=ident)],
    )

    search = Search(
        using=app.es_client,
        index=app.config['ELASTICSEARCH_RELEASE_INDEX'],
    ).query('bool', must=[year_window], filter=[container_match])

    # One composite bucket per (year, in_ia) pair.
    year_source = {
        "year": {"histogram": {"field": "release_year", "interval": 1}},
    }
    in_ia_source = {
        "in_ia": {"terms": {"field": "in_ia"}},
    }
    search.aggs.bucket(
        'year_in_ia',
        'composite',
        size=1000,
        sources=[year_source, in_ia_source],
    )

    # Aggregations only: request zero hits and let ES cache the request.
    search = search[:0].params(request_cache='true')
    resp = wrap_es_execution(search)

    return sorted(
        (int(bucket['key']['year']), bucket['key']['in_ia'],
         bucket['doc_count'])
        for bucket in resp.aggregations.year_in_ia.buckets
    )
def get_elastic_entity_stats() -> dict:
    """
    Collect corpus-wide counts from the release and container indices.

    TODO: files, filesets, webcaptures (no schema yet)

    Returns dict:
        release: {total, refs_total}
        papers: {total, in_web, is_oa, in_kbart, in_web_not_kbart}
        container: {total}
    """
    stats = {}

    # release totals
    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
    search.aggs.bucket(
        'release_ref_count',
        'sum',
        field='ref_count',
    )
    search = search[:0]  # pylint: disable=unsubscriptable-object
    search = search.params(request_cache=True)
    resp = wrap_es_execution(search)

    stats['release'] = {
        "total": int(resp.hits.total),
        "refs_total": int(resp.aggregations.release_ref_count.value),
    }

    # paper counts: releases whose type is "paper like"
    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
    search = search.query(
        'terms',
        release_type=[
            "article-journal",
            "paper-conference",
            # "chapter",
            # "thesis",
        ],
    )
    search.aggs.bucket(
        'paper_like',
        'filters',
        filters={
            "in_web": {"term": {"in_web": "true"}},
            "is_oa": {"term": {"is_oa": "true"}},
            "in_kbart": {"term": {"in_kbart": "true"}},
            "in_web_not_kbart": {
                "bool": {
                    "filter": [
                        {"term": {"in_web": "true"}},
                        {"term": {"in_kbart": "false"}},
                    ]
                }
            },
        },
    )
    search = search[:0]
    search = search.params(request_cache=True)
    resp = wrap_es_execution(search)
    buckets = resp.aggregations.paper_like.buckets
    stats['papers'] = {
        'total': resp.hits.total,
        'in_web': buckets.in_web.doc_count,
        'is_oa': buckets.is_oa.doc_count,
        'in_kbart': buckets.in_kbart.doc_count,
        'in_web_not_kbart': buckets.in_web_not_kbart.doc_count,
    }

    # container counts: only the hit total is reported, so no aggregation is
    # attached to this request.
    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_CONTAINER_INDEX'])
    search = search[:0]  # pylint: disable=unsubscriptable-object
    search = search.params(request_cache=True)
    resp = wrap_es_execution(search)
    stats['container'] = {
        "total": resp.hits.total,
    }

    return stats
def search():
    """
    Handle a search request and return one page of results as a JSON string.

    Request arguments read: ``q`` (free text), ``judge``, ``category``,
    ``acts``, ``from`` / ``to`` (date range, ``yyyy/MM/dd``; applied only
    when both are given) and ``pagenum`` (zero-based page, 20 results per
    page; missing, invalid or negative values mean page 0).

    When ``q`` is given it is parsed with ``parse_query``; its legal and
    other tokens are fed to the external ``./distance vectors.bin`` helper
    and up to 10 of the words found in the helper's output are appended to
    the query text. Every supplied criterion must match.

    :return: JSON object mapping ``"0"``, ``"1"``, ... to the hit sources
        (with ``score`` added and ``content`` removed) plus ``count``, the
        total number of matches.
    """
    page_size = 20

    query = request.args.get('q')
    judge = request.args.get('judge')
    category = request.args.get('category')
    acts = request.args.get('acts')
    date_from = request.args.get('from')
    date_to = request.args.get('to')

    try:
        page = max(int(request.args.get('pagenum') or 0), 0)
    except ValueError:
        page = 0
    start = page * page_size

    judge_from_query = ""
    expanded_query = None
    if query is not None:
        (_duration, _verdict_tokens, legal_tokens, judge_name_tokens,
         other_tokens) = parse_query(query)
        judge_from_query = "".join(
            " " + token for token in judge_name_tokens if token is not None)

        legal_other = legal_tokens + other_tokens
        print(legal_other)
        for token in legal_other:
            print(token + '\n')

        # Drive the helper over pipes and wait for it to finish. Reading its
        # output before it has exited, or sharing fixed scratch files between
        # concurrent requests, yields truncated or foreign results.
        helper_input = "".join(token + '\n'
                               for token in legal_other) + 'EXIT\n'
        try:
            completed = subprocess.run(["./distance", "vectors.bin"],
                                       input=helper_input,
                                       stdout=subprocess.PIPE,
                                       universal_newlines=True)
            helper_output = completed.stdout or ''
        except OSError as exc:
            # Query expansion is best effort: search with the plain query.
            print('query expansion unavailable: {}'.format(exc))
            helper_output = ''

        words = re.sub('[^a-zA-Z0-9\n]', ',', helper_output)
        words = [word for word in set(words.split(',')) if word != query]
        print(words)
        expanded_query = query + ' ' + ' '.join(words[:10])
        print('this :: ' + expanded_query)

    if judge is None:
        judge = judge_from_query

    s = Search(using=client)
    all_fields = [
        'content', 'summary', 'judge', 'acts', 'title', 'verdict',
        'keywords', 'appeal', 'subject'
    ]

    clauses = []
    if query is not None:
        clauses.append(
            Q('multi_match',
              query=expanded_query,
              fuzziness="1",
              prefix_length=3,
              fields=all_fields))
    if judge:
        clauses.append(Q('multi_match', query=judge, fields=['judge']))
    if acts is not None:
        clauses.append(Q('multi_match', query=acts, fields=['acts']))
    if category is not None:
        clauses.append(Q('multi_match', query=category, fields=['subject']))
    if date_from is not None and date_to is not None:
        clauses.append(
            Q('range',
              date={
                  'gte': date_from,
                  'lte': date_to,
                  'format': "yyyy/MM/dd"
              }))

    # Every clause is required: minimum_should_match equals their number.
    q = Q('bool', should=clauses, minimum_should_match=len(clauses))
    print(q)
    s = s.query(q)

    count = s.count()
    # Clamp the window into [0, count] so the slice never has stop < start.
    first = min(start, count)
    last = min(start + page_size, count)
    response = s[first:last].execute().to_dict()

    result = {}
    for position, hit in enumerate(response['hits']['hits']):
        document = hit["_source"]
        document['score'] = hit["_score"]
        document.pop('content', None)
        result[str(position)] = document
    result['count'] = count
    return json.dumps(result)
def save_aggregate_report_to_elasticsearch(aggregate_report,
                                           index_suffix=None,
                                           monthly_indexes=False,
                                           number_of_shards=1,
                                           number_of_replicas=1):
    """
    Saves a parsed DMARC aggregate report to ElasticSearch

    One document is saved per record of the report. Before saving, the
    ``dmarc_aggregate*`` indexes are searched for a report with the same
    organisation, report ID, domain and date range.

    Args:
        aggregate_report (OrderedDict): A parsed aggregate report
        index_suffix (str): The suffix of the name of the index to save to
        monthly_indexes (bool): Use monthly indexes instead of daily indexes
        number_of_shards (int): The number of shards to use in the index
        number_of_replicas (int): The number of replicas to use in the index

    Raises:
            AlreadySaved: A matching report already exists
            ElasticsearchError: Saving a document failed
    """
    logger.debug("Saving aggregate report to Elasticsearch")
    aggregate_report = aggregate_report.copy()
    metadata = aggregate_report["report_metadata"]
    org_name = metadata["org_name"]
    report_id = metadata["report_id"]
    domain = aggregate_report["policy_published"]["domain"]
    begin_date = human_timestamp_to_datetime(metadata["begin_date"])
    end_date = human_timestamp_to_datetime(metadata["end_date"])
    begin_date_human = begin_date.strftime("%Y-%m-%d %H:%M:%S")
    end_date_human = end_date.strftime("%Y-%m-%d %H:%M:%S")
    if monthly_indexes:
        index_date = begin_date.strftime("%Y-%m")
    else:
        index_date = begin_date.strftime("%Y-%m-%d")
    aggregate_report["begin_date"] = begin_date
    aggregate_report["end_date"] = end_date
    date_range = [aggregate_report["begin_date"],
                  aggregate_report["end_date"]]

    # Duplicate check: same org, report ID, domain and date range.
    org_name_query = Q(dict(match_phrase=dict(org_name=org_name)))
    report_id_query = Q(dict(match_phrase=dict(report_id=report_id)))
    domain_query = Q(dict(match_phrase={"published_policy.domain": domain}))
    begin_date_query = Q(dict(match=dict(date_range=begin_date)))
    end_date_query = Q(dict(match=dict(date_range=end_date)))

    search = Search(index="dmarc_aggregate*")
    query = org_name_query & report_id_query & domain_query
    query = query & begin_date_query & end_date_query
    search.query = query

    existing = search.execute()
    if len(existing) > 0:
        raise AlreadySaved("An aggregate report ID {0} from {1} about {2} "
                           "with a date range of {3} UTC to {4} UTC already "
                           "exists in "
                           "Elasticsearch".format(report_id,
                                                  org_name,
                                                  domain,
                                                  begin_date_human,
                                                  end_date_human))

    policy = aggregate_report["policy_published"]
    published_policy = _PublishedPolicy(
        domain=policy["domain"],
        adkim=policy["adkim"],
        aspf=policy["aspf"],
        p=policy["p"],
        sp=policy["sp"],
        pct=policy["pct"],
        fo=policy["fo"])

    # The target index is the same for every record of the report, so it is
    # named and created once instead of once per record.
    index = "dmarc_aggregate"
    if index_suffix:
        index = "{0}_{1}".format(index, index_suffix)
    index = "{0}-{1}".format(index, index_date)
    index_settings = dict(number_of_shards=number_of_shards,
                          number_of_replicas=number_of_replicas)
    records = aggregate_report["records"]
    if records:
        create_indexes([index], index_settings)

    for record in records:
        evaluated = record["policy_evaluated"]
        agg_doc = _AggregateReportDoc(
            xml_schema=aggregate_report["xml_schema"],
            org_name=metadata["org_name"],
            org_email=metadata["org_email"],
            org_extra_contact_info=metadata["org_extra_contact_info"],
            report_id=metadata["report_id"],
            date_range=date_range,
            date_begin=aggregate_report["begin_date"],
            date_end=aggregate_report["end_date"],
            errors=metadata["errors"],
            published_policy=published_policy,
            source_ip_address=record["source"]["ip_address"],
            source_country=record["source"]["country"],
            source_reverse_dns=record["source"]["reverse_dns"],
            source_base_domain=record["source"]["base_domain"],
            message_count=record["count"],
            disposition=evaluated["disposition"],
            dkim_aligned=evaluated["dkim"] is not None and
            evaluated["dkim"].lower() == "pass",
            spf_aligned=evaluated["spf"] is not None and
            evaluated["spf"].lower() == "pass",
            header_from=record["identifiers"]["header_from"],
            envelope_from=record["identifiers"]["envelope_from"],
            envelope_to=record["identifiers"]["envelope_to"])

        for override in evaluated["policy_override_reasons"]:
            agg_doc.add_policy_override(type_=override["type"],
                                        comment=override["comment"])

        for dkim_result in record["auth_results"]["dkim"]:
            agg_doc.add_dkim_result(domain=dkim_result["domain"],
                                    selector=dkim_result["selector"],
                                    result=dkim_result["result"])

        for spf_result in record["auth_results"]["spf"]:
            agg_doc.add_spf_result(domain=spf_result["domain"],
                                   scope=spf_result["scope"],
                                   result=spf_result["result"])

        agg_doc.meta.index = index

        try:
            agg_doc.save()
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ElasticsearchError(
                "Elasticsearch error: {0}".format(e)) from e
nrounds = args.nrounds alpha = args.alpha beta = args.beta R = args.R try: client = Elasticsearch() s = Search(using=client, index=index) if query is not None: for i in range(0, nrounds): q = Q('query_string', query=query[0]) for i in range(1, len(query)): q &= Q('query_string', query=query[i]) s = s.query(q) response = s[0:nhits].execute() print("QUERY:") print(query) #Passem la query a un diccionary query_dict = queryToDict(query) sumDocs = {} # calcul dels documents #print( "------------------- CALULEM ELS DOCUMENTS -------------") for r in response: # only returns a specific number of results file_tw = toTFIDF(client, index, r.meta.id) # tf-idf sumDocs = {
def _delete(_by_filter):
    """Run a delete over the index for every document matching ``_by_filter``."""
    return (
        Search(using=self._es_client, index=self._index)
        .query(_by_filter)
        .delete()
    )
import sys import os #logging.basicConfig(level=logging.WARN) #es = elasticsearch.Elasticsearch( # ['https://gracc.opensciencegrid.org/q'], # timeout=300, use_ssl=True, verify_certs=False) es = elasticsearch.Elasticsearch(['localhost:9200'], timeout=300) osg_raw_index = 'gracc.osg.raw-*' s = Search(using=es, index=osg_raw_index) # Match the records by ProbeName and processors = 0. s = s.query("match", ProbeName="htcondor-ce:hosted-ce18.grid.uchicago.edu") s = s.query("match", Processors=0) s = s.filter('range', EndTime={'from': 'now-12M', 'to': 'now'}) response = s.execute() print "Query took %i milliseconds" % response.took print "Query got %i hits" % response.hits.total #update_id = "8c5816978fee6fc17718bcf81350d1f4" #print "About to update record with id: %s" % update_id #es.update(index="gracc.osg.raw3-2017.07", doc_type='JobUsageRecord', id=update_id, body={'doc': {'VOName': 'UserSchool2017'}}) update_buffer = [] for hit in s.scan(): # Calculate the new CoreHours (cores = 1): core_hours = hit.WallDuration / 3600.0
r = s.execute() # for b in r.aggregations.percents.values: # print(b, r.aggregations.percents.values[b]) s = Search(using=ES_CLIENT, index=f"{ES_INDEX_DOCUMENT_EVAL}_{tm}_{criterion_id}") q1 = Q("range", value={ "gt": r.aggregations.percents.values[f'{high_threshold}.0'], }) q2 = Q("range", value={ "lt": r.aggregations.percents.values[f'{low_threshold}.0'], }) s = s.query(q1 | q2) s = s.query({ "function_score": { "functions": [{ "random_score": { "seed": "iivtiicthelyon1488" } }], } }) s = s.source(('document_es_id', 'value'))[:2000] r = s.execute() document_eval_dict = dict((hit.document_es_id, hit.value) for hit in r) s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT)
def es_get_all(date_from, date_to, filters="", options=""):
    """
    Fetch Sysmon / PowerShell events from the ``logs-*`` indices.

    :param date_from: ``"YYYY-MM-DD HH:MM:SS"``-style string (date and time
        separated by one space); lower bound (inclusive) on
        ``log_ingest_timestamp``. The range is applied only when both bounds
        are given.
    :param date_to: same format; upper bound (exclusive).
    :param filters: mapping of arbitrary keys to
        ``{"operator": "=" | "!=", "element": <field>, "text": "a,b,..."}``.
        ``=`` keeps documents matching any of the comma separated values,
        ``!=`` drops documents matching any of them. Other operators are
        ignored.
    :param options: container tested with ``in`` for ``"powershell"`` and
        ``"sysmon"``; selects which log types are returned.
    :return: list of event dicts with ``event_id``, ``log_name``,
        ``computer_name`` and ``event_data``.
    """
    powershell_log = "Microsoft-Windows-PowerShell/Operational"

    client = Elasticsearch(['192.168.129.132'])
    s = Search(using=client, index='logs-*')

    if date_from and date_to:
        datef, timef = date_from.split(" ")
        datetimef = str(datef) + "T" + str(timef) + ".000Z"
        datet, timet = date_to.split(" ")
        datetimet = str(datet) + "T" + str(timet) + ".000Z"
        s = s.query('bool',
                    filter=[
                        Q('range',
                          log_ingest_timestamp={
                              'gte': datetimef,
                              'lt': datetimet
                          })
                    ])

    if filters:
        for name in filters:
            spec = filters[name]
            operator = spec["operator"]
            if operator not in ("=", "!="):
                continue
            # OR together one match per value. The values live in their own
            # local so the ``options`` argument is not clobbered.
            any_value = None
            for value in spec["text"].split(","):
                match = Q("match", **{spec["element"]: value})
                any_value = match if any_value is None else any_value | match
            if operator == "!=":
                # Negate the whole alternative, not just its first value.
                any_value = ~any_value
            s = s.query('bool', filter=[any_value])

    total = s.count()
    s = s[0:total]
    response = s.execute()

    want_powershell = "powershell" in options
    want_sysmon = "sysmon" in options

    events = []
    for hit in response:
        j = hit.to_dict()
        log_name = j.get("log_name", "")
        is_powershell = powershell_log in log_name
        is_sysmon = "Sysmon" in log_name
        if not (want_powershell and is_powershell
                or want_sysmon and is_sysmon):
            continue

        event = {
            "event_id": j["event_id"],
            "log_name": log_name,  # "Microsoft-Windows-Sysmon/Operational"
            "computer_name": j["host_name"],
            "event_data": {},
        }

        if is_sysmon:
            if "z_original_message" in j:
                # Each line is "key: value"; split once so values that
                # themselves contain ": " are kept whole.
                for line in str(j["z_original_message"]).splitlines():
                    elements = line.split(": ", 1)
                    value = elements[1] if len(elements) > 1 else ""
                    event["event_data"][elements[0]] = value
            events.append(event)
        elif is_powershell:
            try:
                event_data = event["event_data"]
                event_data["log_ingest_timestamp"] = j["log_ingest_timestamp"]
                if "powershell" in j:
                    powershell = j["powershell"]
                    if "host" in powershell:
                        if "application" in powershell["host"]:
                            application = powershell["host"]["application"]
                            decrypted = base64_in_application(application)
                            event_data["application"] = application
                            event_data["param"] = decrypted
                    elif "scriptblock" in powershell:
                        if "text" in powershell["scriptblock"]:
                            event_data["application"] = powershell[
                                "scriptblock"]["text"]
                            event_data["param"] = ""
                elif "param1" in j:
                    event_data["param"] = ""
                    event_data["application"] = j["param1"]
                elif "param2" in j:
                    event_data["param"] = ""
                    event_data["application"] = j["param2"]
                events.append(event)
            except Exception as e:
                print("Exception: {}, Event: {}".format(e, j))
    return events
def do_release_search(query: ReleaseQuery,
                      deep_page_limit: int = 2000) -> SearchHits:
    """
    Run a scored, paginated release search and wrap the page in SearchHits.

    The limit is capped at 100 (default 25) and the offset is clamped to the
    range 0..deep_page_limit to avoid the deep paging problem.
    """
    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_RELEASE_INDEX'])

    # availability filters
    if query.fulltext_only:
        search = search.filter("term", in_ia=True)

    # Several queries are combined to improve scoring. The base query uses
    # the fancy built-in query string parser.
    biblio_match = Q(
        "query_string",
        query=query.q,
        default_operator="AND",
        analyze_wildcard=True,
        allow_leading_wildcard=False,
        lenient=True,
        fields=["title^2", "biblio"],
    )
    # If these fields aren't set, metadata is poor. The more that do not
    # exist, the stronger the signal.
    metadata_fields = ("title", "release_year", "release_type",
                       "release_stage", "container_id")
    missing_metadata = Q(
        "bool",
        should=[
            Q("bool", must_not=Q("exists", field=name))
            for name in metadata_fields
        ],
    )
    search = search.query(
        "boosting",
        positive=Q("bool", must=biblio_match, should=[Q("term", in_ia=True)]),
        negative=missing_metadata,
        negative_boost=0.5,
    )

    # Sanity checks
    limit = min(int(query.limit or 25), 100)
    offset = min(max(int(query.offset or 0), 0), deep_page_limit)

    resp = wrap_es_execution(search[offset:(offset + limit)])
    results = results_to_dict(resp)

    for hit in results:
        # Ensure 'contrib_names' is a list, not a single string
        names = hit['contrib_names']
        if type(names) is not list:
            names = [names]
        hit['contrib_names'] = [
            name.encode('utf8', 'ignore').decode('utf8') for name in names
        ]

    return SearchHits(
        count_returned=len(results),
        count_found=int(resp.hits.total),
        offset=offset,
        limit=limit,
        deep_page_limit=deep_page_limit,
        query_time_ms=int(resp.took),
        results=results,
    )
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages and results.
    """
    params = search_params.data
    s = Search(index=index)

    # Add requested filters.
    if 'li' in params:
        s = _filter_licenses(s, params['li'])
    elif 'lt' in params:
        s = _filter_licenses(s, params['lt'])

    if 'provider' in params:
        provider_filters = [
            Q('term', provider=provider)
            for provider in params['provider'].split(',')
        ]
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)
    if 'extension' in params:
        extension_filter = Q('term', extension=params['extension'])
        s = s.filter('bool', should=extension_filter, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    # Only an absent entry is a cache miss; an empty list of hidden providers
    # is a valid cached answer and must not trigger a database query on every
    # request.
    if filtered_providers is None:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(
            key=filter_cache_key,
            timeout=CACHE_TIMEOUT,
            value=filtered_providers
        )
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in params:
        s = s.query(
            'query_string',
            query=_quote_escape(params['q']),
            fields=search_fields,
            type='most_fields'
        )
    else:
        if 'creator' in params:
            s = s.query(
                'query_string',
                query=_quote_escape(params['creator']),
                default_field='creator'
            )
        if 'title' in params:
            s = s.query(
                'query_string',
                query=_quote_escape(params['title']),
                default_field='title'
            )
        if 'tags' in params:
            s = s.query(
                'query_string',
                default_field='tags.name',
                query=_quote_escape(params['tags'])
            )

    # Use highlighting to determine which fields contribute to the selection
    # of top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    # Search methods return a modified copy; the result has to be kept or the
    # option is silently dropped.
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip))

    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    search_response = s.execute()
    results = _post_process_results(
        s,
        start,
        end,
        page_size,
        search_response,
        request,
        filter_dead
    )

    result_count, page_count = _get_result_and_page_count(
        search_response,
        results,
        page_size
    )

    return results, page_count, result_count
def get_elastic_container_stats(ident, issnl=None):
    """
    Gather release counts for a single container.

    Returns dict:
        ident
        issnl (optional)
        total
        in_web
        in_kbart
        is_preserved
        preservation
        release_type
    """
    flag_filters = {
        flag: {"term": {flag: True}}
        for flag in ("in_web", "in_kbart", "is_preserved")
    }

    search = Search(
        using=app.es_client,
        index=app.config['ELASTICSEARCH_RELEASE_INDEX'],
    ).query('term', container_id=ident)
    search.aggs.bucket('container_stats', 'filters', filters=flag_filters)
    for field_name in ('preservation', 'release_type'):
        search.aggs.bucket(
            field_name,
            'terms',
            field=field_name,
            missing='_unknown',
        )

    # Aggregations only: no hits requested, request cache enabled.
    search = search[:0].params(request_cache=True)
    resp = wrap_es_execution(search)

    flag_buckets = resp.aggregations.container_stats.buckets

    preservation = agg_to_dict(resp.aggregations.preservation)
    preservation['total'] = resp.hits.total
    # Every preservation level is reported, even when it has no releases.
    for level in ('bright', 'dark', 'shadows_only', 'none'):
        if level not in preservation:
            preservation[level] = 0
    if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']:
        preservation['none'] += preservation['shadows_only']
        preservation['shadows_only'] = 0

    release_types = agg_to_dict(resp.aggregations.release_type)

    return {
        'ident': ident,
        'issnl': issnl,
        'total': resp.hits.total,
        'in_web': flag_buckets['in_web']['doc_count'],
        'in_kbart': flag_buckets['in_kbart']['doc_count'],
        'is_preserved': flag_buckets['is_preserved']['doc_count'],
        'preservation': preservation,
        'release_type': release_types,
    }
def more_like_this(obj,
                   fields,
                   max_query_terms=25,
                   min_term_freq=2,
                   min_doc_freq=5,
                   max_doc_freq=0,
                   query=None):
    """More like this.

    Builds a search for documents similar to the indexed document of ``obj``.

    https://www.elastic.co/guide/en/elasticsearch/reference/current/
    query-dsl-mlt-query.html

    :param obj: Django model instance for which similar objects shall be
        found.
    :param fields: Fields to search in.
    :param max_query_terms: Passed to the query unless None.
    :param min_term_freq: Passed to the query unless None.
    :param min_doc_freq: Passed to the query unless None.
    :param max_doc_freq: Passed to the query unless None.
    :param query: Q query applied before the more-like-this query.
    :type obj: Instance of `django.db.models.Model` (sub-classed) model.
    :type fields: list
    :type max_query_terms: int
    :type min_term_freq: int
    :type min_doc_freq: int
    :type max_doc_freq: int
    :type query: elasticsearch_dsl.query.Q
    :return: Search for similar objects, or None when no index is found for
        the model of ``obj``.
    :rtype: elasticsearch_dsl.search.Search

    Example:

        >>> from django_elasticsearch_dsl_drf.helpers import more_like_this
        >>> from books.models import Book
        >>> book = Book.objects.first()
        >>> similar_books = more_like_this(
        >>>     book,
        >>>     ['title', 'description', 'summary']
        >>> )
    """
    _index, _mapping = get_index_and_mapping_for_model(obj._meta.model)
    if _index is None:
        return None

    _search = Search(using=connections.get_connection(), index=_index)
    if query is not None:
        _search = _search.query(query)

    # Only tuning options that were actually given are sent along.
    candidates = (
        ('max_query_terms', max_query_terms),
        ('min_term_freq', min_term_freq),
        ('min_doc_freq', min_doc_freq),
        ('max_doc_freq', max_doc_freq),
    )
    tuning = {name: value for name, value in candidates if value is not None}

    # Reference to the indexed document that results should resemble.
    like = {
        '_id': "{}".format(obj.pk),
        '_index': "{}".format(_index),
    }
    if not ELASTICSEARCH_GTE_7_0:
        like['_type'] = "{}".format(_mapping)

    return _search.query(MoreLikeThis(fields=fields, like=like, **tuning))
def get_elastic_preservation_by_year(query) -> List[dict]:
    """
    Fetch a stacked histogram of release counts per {year, preservation}.

    The search is limited to release years within the last 250 years
    (inclusive of the current year). With four possible preservation values
    that is about 1000 composite buckets, which fits in the requested
    bucket size of 1500.

    ``query.q`` (unless None or "*") is applied as a query string over the
    "biblio" field, and ``query.container_id`` (if truthy) as a term filter.

    Returns a list of dicts, sorted by year, with keys/values like:

        {year (int), bright (int), dark (int), shadows_only (int), none (int)}

    Every year between the smallest and largest year returned is present,
    with zero counts where there were no buckets.
    """
    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
    if query.q not in [None, "*"]:
        search = search.query(
            "query_string",
            query=query.q,
            default_operator="AND",
            analyze_wildcard=True,
            allow_leading_wildcard=False,
            lenient=True,
            fields=["biblio"],
        )
    if query.container_id:
        search = search.filter("term", container_id=query.container_id)
    search = search.filter(
        "range",
        release_year={
            "gte": datetime.datetime.today().year - 249,
            "lte": datetime.datetime.today().year,
        },
    )

    composite_sources = [
        {"year": {"histogram": {"field": "release_year", "interval": 1}}},
        {"preservation": {"terms": {"field": "preservation"}}},
    ]
    search.aggs.bucket(
        'year_preservation',
        'composite',
        size=1500,
        sources=composite_sources,
    )
    # Aggregation-only request: ask for zero hits.
    search = search[:0].params(request_cache='true')

    resp = wrap_es_execution(search)
    buckets = resp.aggregations.year_preservation.buckets

    seen_years = {int(bucket['key']['year']) for bucket in buckets}
    counts_by_year = {}
    if seen_years:
        # Pre-fill a zeroed row for each year in the observed span.
        counts_by_year = {
            year: dict(year=year, bright=0, dark=0, shadows_only=0, none=0)
            for year in range(min(seen_years), max(seen_years) + 1)
        }
    for bucket in buckets:
        key = bucket['key']
        counts_by_year[int(key['year'])][key['preservation']] = int(
            bucket['doc_count'])

    if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']:
        for counts in counts_by_year.values():
            counts['none'] += counts['shadows_only']
            counts['shadows_only'] = 0

    return sorted(counts_by_year.values(), key=lambda x: x['year'])
def search(self, query, field, client):
    """Run an AND-ed ``multi_match`` query for ``query`` on one field.

    :param query: Text to search for.
    :param field: Name of the single field to match against.
    :param client: Elasticsearch client used to execute the search.
    :return: The raw ``hits.hits`` list from the response dictionary.
    """
    multi_match = Q(
        "multi_match",
        query=query,
        fields=[field],
        operator="and",
        tie_breaker=1,
        type="most_fields",
    )
    response = Search(using=client).query(multi_match).execute()
    return response.to_dict()["hits"]["hits"]
def search(self, qterm, **kwargs):
    r"""Searches in the elasticsearch index for the mail

    :param qterm: Query-string
    :type qterm: ``str``
    :param \**kwargs: See below

    :Keyword Arguments:
        * *date_gte* (``datetime``) -- Filter, From: only emails greater than
        * *date_lte* (``datetime``) -- Filter, To: only emails less than
          (defaults to ``'now'``)
        * *date_sliding_value* (``str``) -- Size of the sliding window, i.e.
          only emails of the past XX hours/days/years. Combined with
          *date_sliding_type* into ``now-<value><type>`` -- See:
          https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#date-math
        * *date_sliding_type* (``str``) -- Valid date-type: e.g. y M d
        * *use_sliding_value* (``bool``) -- True (default): use the sliding
          window when both *date_sliding_value* and *date_sliding_type* are
          non-empty. Otherwise the fixed range *date_gte*/*date_lte* is used,
          but only when *date_gte* was supplied.
        * *number_results* (``int``) -- Number of total results to return
          (default 10)
        * *sort_field* (``str``) -- By which field should results be sorted
          e.g. date, _score, fromEmail.keyword (default ``_score``)
        * *sort_dir* (``str``) -- Prefix placed directly in front of
          *sort_field* ('+': ascending '-': descending, default '-')

    :return: The response after it has been passed through
        ``self.alter_response``
    """
    # Fixed dates are rendered with the project's JSON datetime format; the
    # format constant is only touched when a date was actually supplied.
    date_gte = None
    if 'date_gte' in kwargs:
        date_gte = ('{:' + dementor_constants.JSON_DATETIME_FORMAT +
                    '}').format(kwargs['date_gte'])
    date_lte = 'now'
    if 'date_lte' in kwargs:
        date_lte = ('{:' + dementor_constants.JSON_DATETIME_FORMAT +
                    '}').format(kwargs['date_lte'])

    use_sliding_value = kwargs.get('use_sliding_value', True)
    date_sliding_value = kwargs.get('date_sliding_value', '')
    date_sliding_type = kwargs.get('date_sliding_type', '')
    number_results = kwargs.get('number_results', 10)
    sort_field = kwargs.get('sort_field', '_score')
    sort_dir = kwargs.get('sort_dir', '-')

    # Prepare query
    s = DslSearch(using=self._es, index=self._index_prefix.format('*'))

    # Filter date
    date_field_name = self.get_date_field_name()
    # Logical `and` (not bitwise `&`): a truthy non-bool flag such as 2 or
    # 'yes' must enable the window instead of being masked to 0 or raising.
    if (use_sliding_value and date_sliding_value != ''
            and date_sliding_type != ''):
        window_start = 'now-{0}{1}'.format(date_sliding_value,
                                           date_sliding_type)
        s = s.query(
            'bool',
            filter=[Range(**{date_field_name: {'gte': window_start}})])
    elif date_gte is not None:
        s = s.query(
            'bool',
            filter=[
                Range(**{date_field_name: {
                    'lte': date_lte,
                    'gte': date_gte
                }})
            ])

    # Add query-specific fields
    s = self.add_query_fields(s, qterm, **kwargs)

    # Requested ordering first, relevance as the tie-breaker.
    s = s.sort(
        ''.join((sort_dir, sort_field)),
        '-_score',
    )

    # Number of results
    s = s[0:number_results]

    # Execute
    response = s.execute()
    return self.alter_response(response)
def find(self, query, client):
    """Look up documents whose ``_id`` matches ``query``.

    :param query: Value matched against the ``_id`` field.
    :param client: Elasticsearch client used to execute the search.
    :return: The raw ``hits.hits`` list from the response dictionary.
    """
    id_match = Q("match", _id=query)
    response = Search(using=client).query(id_match).execute()
    return response.to_dict()["hits"]["hits"]
def dataset_prepare(**kwargs):
    """Export documents from Elasticsearch and build BigARTM batches.

    Selects documents of the requested corpora (optionally restricted by
    source, datetime range and topic group), writes them to a text file via
    ``txt_writer`` and runs ``artm.BatchVectorizer`` on that file
    (``data_format="vowpal_wabbit"``).

    Required keyword arguments: ``name``, ``name_translit``, ``corpus``
    (a list or a single value), ``source``, ``datetime_from``,
    ``datetime_to``, ``group_id``, ``topic_weight_threshold``,
    ``topic_doc``, ``uniq_topic_doc``, ``temp_folder``, ``text_field``.
    Optional: ``corpus_datetime_ignore`` (list), ``is_dynamic``; the mere
    presence of the keys ``perform_actualize`` and ``fast`` switches those
    modes on, whatever their value.

    Returns ``1`` when the topic-modelling index cannot be initialised,
    ``"Group is empty"`` when the requested topic group has no topics, and
    otherwise a string reporting ``index.number_of_documents``.
    """
    import os
    import itertools
    import shutil
    import artm
    import datetime

    from elasticsearch_dsl import Search, Q

    from dags.bigartm.services.cleaners import txt_writer
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_TOPIC_MODELLING
    from mainapp.models_user import TopicGroup

    import logging
    es_logger = logging.getLogger('elasticsearch')
    es_logger.setLevel(logging.ERROR)

    # Recreate index object
    try:
        index = init_tm_index(**kwargs)
    except TMNotFoundException:
        return 1

    lc = artm.messages.ConfigureLoggingArgs()
    lib = artm.wrapper.LibArtm(logging_config=lc)
    lc.minloglevel = 3  # 0 = INFO, 1 = WARNING, 2 = ERROR, 3 = FATAL
    lib.ArtmConfigureLogging(lc)

    perform_actualize = 'perform_actualize' in kwargs
    fast = 'fast' in kwargs
    name = kwargs['name']
    name_translit = kwargs['name_translit']
    corpus = kwargs['corpus']
    # isinstance (rather than an exact type comparison) so that list
    # subclasses are not wrapped into a nested list.
    if not isinstance(corpus, list):
        corpus = [corpus]
    corpus_datetime_ignore = kwargs.get('corpus_datetime_ignore', [])
    source = kwargs['source']
    datetime_from = kwargs['datetime_from']
    datetime_to = kwargs['datetime_to']
    group_id = kwargs['group_id']
    topic_weight_threshold = kwargs['topic_weight_threshold']
    topic_doc = kwargs['topic_doc']
    uniq_topic_doc = kwargs['uniq_topic_doc']
    temp_folder = kwargs['temp_folder']
    text_field = kwargs['text_field']
    is_dynamic = 'is_dynamic' in kwargs and kwargs['is_dynamic']

    # Extract
    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT).filter("terms", corpus=corpus) \
        .filter('exists', field=text_field)
    q_from = Q()
    q_to = Q()
    if source:
        s = s.filter("term", **{"source": source})
    if datetime_from:
        q_from = Q("range", datetime={"gte": datetime_from})
    # When actualizing, the upper datetime bound is deliberately not applied.
    if datetime_to and not perform_actualize:
        q_to = Q("range", datetime={"lte": datetime_to})
    q = (q_from & q_to)
    # Documents of these corpora are also accepted when they have no
    # datetime at all.
    for corpus_to_ignore in corpus_datetime_ignore:
        q = q | (~Q('exists', field="datetime") & Q("term", corpus=corpus_to_ignore))
    s = s.query(q)
    s = s.source(["id", "text", text_field, "title", "source", "num_views", "num_comments", "datetime", "corpus"])[:50_000_000]

    group_document_es_ids = None
    print("!!! group_id", group_id)  # TODO Remove prints
    if group_id:
        group = TopicGroup.objects.get(id=group_id)
        topic_ids = [t.topic_id for t in group.topics.all()]
        if not topic_ids:
            return "Group is empty"
        topic_modelling_name = group.topic_modelling_name
        st = Search(using=ES_CLIENT, index=f"{topic_doc}_{topic_modelling_name}") \
            .filter("terms", **{"topic_id": topic_ids}) \
            .filter("range", topic_weight={"gte": topic_weight_threshold}) \
            .filter("range", datetime={"gte": datetime.date(2000, 1, 1)}) \
            .source(('document_es_id'))[:5000000]
        print("!!!", f"{topic_doc}_{topic_modelling_name}", topic_ids, topic_weight_threshold)
        group_document_es_ids = {doc.document_es_id for doc in st.scan()}
        print(len(group_document_es_ids))

    # Exclude document already in TM if actualizing
    ids_to_skip = None
    if perform_actualize:
        std = Search(using=ES_CLIENT, index=f"{uniq_topic_doc}_{name}").source(['document_es_id'])[:50_000_000]
        ids_to_skip = {doc.document_es_id for doc in std.scan()}
        print("!!!", "Skipping", len(ids_to_skip))

    print("!!!", "Potential docs", s.count())
    formated_data = document_scanner(s, text_field, corpus, ids_to_skip, group_document_es_ids)

    # Pull the first document eagerly so an empty result is noticed here.
    # NOTE(review): on failure `False` is still passed to txt_writer as the
    # first item below -- confirm txt_writer tolerates that.
    try:
        peek_doc = next(formated_data)
    except Exception as e:
        print("!!! No docs", e)
        peek_doc = False

    data_folder = os.path.join("/big_data/", temp_folder)
    # Best effort: the base folder usually exists already.
    try:
        os.mkdir(data_folder)
    except OSError:
        pass

    folder_prefix = (
        f"bigartm_formated_data_{name if not name_translit else name_translit}"
        f"{'_actualize' if perform_actualize else ''}"
        f"{'_fast' if fast else ''}"
    )
    if is_dynamic:
        # Dynamic runs label the folder with dates only.
        folder_name = f"{folder_prefix}_{datetime_from.date()}_{datetime_to.date()}"
    else:
        folder_name = f"{folder_prefix}_{datetime_from}_{datetime_to}"
    data_folder = os.path.join(data_folder, folder_name)

    # Start from an empty run folder; filesystem errors stay non-fatal.
    try:
        shutil.rmtree(data_folder, ignore_errors=True)
        os.mkdir(data_folder)
    except OSError:
        pass

    print("!!!", f"Writing documents")
    txt_writer(data=itertools.chain([peek_doc], formated_data), filename=os.path.join(data_folder, f"bigartm_formated_data.txt"))
    artm.BatchVectorizer(data_path=os.path.join(data_folder, f"bigartm_formated_data.txt"),
                         data_format="vowpal_wabbit",
                         target_folder=os.path.join(data_folder, "batches"))
    return f"index.number_of_document={index.number_of_documents}"
def find_match(queue, clf):
    """Worker: match CiteSeerX papers from ``queue`` against the reference index.

    For each paper id taken from ``queue`` the paper and its authors are
    loaded from the CiteSeerX database, candidate records are retrieved
    from the Elasticsearch reference index (by title; or, when the title is
    missing or shorter than 20 characters, by year + first author's last
    name, falling back to the abstract), and each candidate is scored with
    ``clf``. The first candidate predicted as label 1 is appended to
    ``results.txt`` as ``<csxID>\\t<hit id>`` under an exclusive file lock.

    The loop ends when the queue is empty or a ``None`` sentinel is read.
    Every id taken from the queue is acknowledged with ``task_done()``.
    Both database connections are closed when the worker exits.

    :param queue: joinable queue of CiteSeerX paper ids (``None`` = stop).
    :param clf: classifier exposing ``predict([features])``.
    """
    # connection to target and reference database
    client = Elasticsearch(timeout=200, port=ref_index_port)
    csxdb = mysql.connector.connect(user='******',
                                    password='******',
                                    host='csxstaging01',
                                    database='citeseerx2',
                                    charset='utf8',
                                    use_unicode=True)
    REFdb = None
    try:
        CSXcursor = csxdb.cursor(dictionary=True)
        CSXauthorCursor = csxdb.cursor(dictionary=True)
        REFdb = mysql.connector.connect(user='******',
                                        password='******',
                                        host='csxstaging01',
                                        database='wos2017_12',
                                        charset='utf8',
                                        use_unicode=True)
        REFcursor = REFdb.cursor(dictionary=True)

        while True:
            if queue.empty():
                break
            csxID = None  # keeps the error report below safe if get() fails
            try:
                csxID = queue.get()
                if csxID is None:
                    queue.task_done()
                    break

                # NOTE(review): the SQL text is built with %-formatting from
                # module-level templates; ids are assumed to be trusted.
                CSXcursor.execute(cmd_paper % (csxID))
                CSXPaper = CSXcursor.fetchone()
                if CSXPaper is None:
                    queue.task_done()
                    continue
                CSXauthorCursor.execute(cmd_author % (csxID))
                CSXauthors = CSXauthorCursor.fetchall()

                s = Search(using=client, index=ref_index)
                title = CSXPaper['title']
                if title is None or len(title) < 20:
                    # Title unusable: try year + first author, then abstract.
                    if (len(CSXauthors) > 0
                            and CSXauthors[0]['lname'] is not None
                            and CSXPaper['year'] is not None):
                        s.query = Q('bool',
                                    should=[
                                        Q('match', year=CSXPaper['year']),
                                        Q('match',
                                          authors=CSXauthors[0]['lname'])
                                    ])
                    elif CSXPaper['abstract'] is not None:
                        s = s.query("match", abstract=CSXPaper['abstract'])
                    else:
                        # Nothing to search by; skip this paper.
                        queue.task_done()
                        continue
                else:
                    s = s.query("match", title=title)

                response = s.execute()
                for hit in response:
                    REFcursor.execute(cmd_REFpaper % (hit['id']))
                    REFpaper = REFcursor.fetchone()
                    REFcursor.execute(cmd_REFauthor % (hit['id']))
                    REFauthors = REFcursor.fetchall()
                    features = SimilarityProfile.calcFeatureVector(
                        REFpaper, REFauthors, CSXPaper, CSXauthors)
                    label = clf.predict([features])
                    if label == 1:
                        # Several workers append to the same file, so hold
                        # an exclusive lock while writing.
                        with open("results.txt", "a") as g:
                            fcntl.flock(g, fcntl.LOCK_EX)
                            g.write(csxID + '\t' + hit['id'] + '\n')
                            fcntl.flock(g, fcntl.LOCK_UN)
                        break  # first positive match wins
                queue.task_done()
            except Exception:
                # Report and move on to the next id; KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                queue.task_done()
                print("-" * 60)
                print(csxID)
                print(traceback.format_exc())
                print(sys.exc_info()[0])
                print("-" * 60)
    finally:
        # Release the database connections however the loop ended.
        for connection in (REFdb, csxdb):
            if connection is not None:
                connection.close()
import random
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

# Small script: print the `doctors` field of one document whose
# `doctors.location` matches "Jayanagar" (a single hit, skipping the first 5).
es = Elasticsearch()
# Pass the client by keyword; `using` is the parameter that carries it.
s = Search(using=es)
dictmatch = {"doctors.location": "Jayanagar"}
hits = s.query("match", **dictmatch).extra(from_=5, size=1).execute()
hits = hits.to_dict()
for hit in hits['hits']['hits']:
    # Function-call form of print is valid on both Python 2 and Python 3.
    print(str(hit['_source']['doctors']))
def search(self, lucene, index="*", doctype="doc", fields=None, date_field="@timestamp", days=None, start_time=None, end_time=None):
    '''
    Search Elastic and yield each result as a dict.

    This is a generator: hits are streamed with scan() and converted with
    to_dict() one at a time.

    lucene: A string containing the Elastic search (e.g., 'item:5282 AND color:red')
    index: A string containing the index name to search, or an index name
           pattern if you want to search multiple indices (e.g., 'myindex'
           or 'myindex-*')
    doctype: The document type you are interested in.
    fields: A string containing a comma-separated list of field names to
            return. The default is to return all fields.
    date_field: The name of the field used for date/time comparison.
    days: Search the past X days. If provided, this supersedes both
          start_time and end_time.
    start_time: A datetime() object representing the start of the search
                window. If used without end_time, no upper bound is applied.
    end_time: A datetime() object representing the end of the search
              window. If used without start_time, no lower bound is applied.
    '''
    query = Search(using=self.es_conn, index=index, doc_type=doctype)
    query = query.query("query_string", query=lucene)

    if fields:
        query = query.source(fields.split(','))

    # Work out the window edges. `days` takes precedence over explicit
    # start/end times.
    if days:
        window_end = datetime.now()
        window_start = window_end - timedelta(days=days)
    else:
        window_start, window_end = start_time, end_time

    bounds = {}
    if window_start:
        bounds["gte"] = window_start
    if window_end:
        bounds["lte"] = window_end

    if bounds:
        # The field name is passed through an unpacked dict because names
        # such as "@timestamp" are not valid Python keyword identifiers.
        query = query.filter('range', **{date_field: bounds})

    # execute the search
    for hit in query.scan():
        yield hit.to_dict()
def results(page):
    """Render one page of film search results.

    On POST the query is read from the submitted form and remembered in
    module-level globals; on any other method (used for "next" pages) the
    remembered query is reused. The query is run against
    ``sample_film_index`` and ten hits per page are rendered with
    ``page_SERP.html``. When nothing matches, the same template receives a
    list of "cannot find ..." messages instead of results. A non-numeric
    ``mintime``/``maxtime`` form value renders ``error_page.html``.

    :param page: 1-based page number (int, or a string taken from the URL).
    :return: the rendered template.
    """
    global tmp_text
    global tmp_title
    global tmp_star
    global tmp_director
    global tmp_language
    global tmp_location
    global tmp_time
    global tmp_categories
    global tmp_country
    global tmp_min
    global tmp_max
    global gresults

    # convert the <page> parameter in url to integer.
    if not isinstance(page, int):
        page = int(page)

    # POST: initial query, store it in the globals.
    # Otherwise: "next" page, restore the query from the globals.
    if request.method == 'POST':
        text_query = request.form['query']
        star_query = request.form['starring']
        director_query = request.form['director']
        language_query = request.form['language']
        location_query = request.form['location']
        time_query = request.form['time']
        categories_query = request.form['categories']
        country_query = request.form['country']

        mintime_query = request.form['mintime']
        if len(mintime_query) == 0:
            mintime = 0
        elif mintime_query.replace('.', '', 1).isdigit():
            mintime = float(mintime_query)
        else:
            return render_template('error_page.html')

        maxtime_query = request.form['maxtime']
        if len(maxtime_query) == 0:
            maxtime = 99999
        elif maxtime_query.replace('.', '', 1).isdigit():
            maxtime = float(maxtime_query)
        else:
            return render_template('error_page.html')

        # update global variable template data
        tmp_text = text_query
        tmp_star = star_query
        tmp_director = director_query
        tmp_language = language_query
        tmp_location = location_query
        tmp_time = time_query
        tmp_categories = categories_query
        tmp_country = country_query
        tmp_min = mintime
        tmp_max = maxtime
    else:
        # use the current values stored in global variables.
        text_query = tmp_text
        star_query = tmp_star
        director_query = tmp_director
        language_query = tmp_language
        location_query = tmp_location
        time_query = tmp_time
        categories_query = tmp_categories
        country_query = tmp_country
        mintime = tmp_min
        # The sentinel bounds (0 / 99999) mean "not set": show an empty box.
        mintime_query = tmp_min if tmp_min > 0 else ""
        maxtime = tmp_max
        maxtime_query = tmp_max if tmp_max < 99999 else ""

    # store query values to display in search boxes in UI
    shows = {}
    shows['text'] = text_query
    shows['starring'] = star_query
    shows['director'] = director_query
    shows['language'] = language_query
    shows['location'] = location_query
    shows['time'] = time_query
    shows['categories'] = categories_query
    # 'country' joins the other query fields here, so it is also requested
    # for highlighting in the loop over `shows` below.
    shows['country'] = country_query
    shows['maxtime'] = maxtime_query
    shows['mintime'] = mintime_query

    # Create a search object to query our index
    search = Search(index='sample_film_index')

    # search for runtime using a range query
    s = search.query('range', runtime={'gte': mintime, 'lte': maxtime})

    # Conjunctive search over title and text; if that finds nothing, fall
    # back to a disjunctive search that boosts the title.
    if len(text_query) > 0:
        s = s.query('multi_match',
                    query=text_query,
                    type='cross_fields',
                    fields=['title', 'text'],
                    operator='and')
        response = s.execute()
        if len(response) == 0:
            s = search.query('range',
                             runtime={'gte': mintime, 'lte': maxtime})
            s = s.query('multi_match',
                        query=text_query,
                        type='cross_fields',
                        fields=['title^4', 'text'],
                        operator='or')
        # A double-quoted part of the query must appear as an exact phrase.
        phrase = re.findall(r'"(.*?)"', text_query)
        if len(phrase) != 0:
            s = s.query(Q('match_phrase', text=phrase[0]))

    # One `match` clause per non-empty field box. The country box is matched
    # against the `country` field.
    field_queries = (
        ('starring', star_query),
        ('director', director_query),
        ('language', language_query),
        ('location', location_query),
        ('time', time_query),
        ('categories', categories_query),
        ('country', country_query),
    )
    for field_name, field_value in field_queries:
        if len(field_value) > 0:
            s = s.query('match', **{field_name: field_value})

    # highlight
    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>')
    for key in shows:
        s = s.highlight(key, fragment_size=999999999, number_of_fragments=1)

    # determine the subset of results to display (based on current <page>)
    start = (page - 1) * 10
    end = start + 10

    # execute search and return results in specified range.
    response = s[start:end].execute()

    # Prefer the highlighted fragment of a field, else the stored value.
    result_fields = ('title', 'starring', 'runtime', 'director', 'location',
                     'time', 'language', 'categories', 'country', 'text')
    resultList = {}
    for hit in response.hits:
        result = {}
        result['score'] = hit.meta.score
        highlight = hit.meta.highlight if 'highlight' in hit.meta else {}
        for field_name in result_fields:
            if field_name in highlight:
                result[field_name] = highlight[field_name][0]
            else:
                result[field_name] = getattr(hit, field_name)
        resultList[hit.meta.id] = result

    # make the result list available globally
    gresults = resultList

    # get the total number of matching results
    result_num = response.hits.total

    if result_num > 0:
        return render_template('page_SERP.html',
                               results=resultList,
                               res_num=result_num,
                               page_num=page,
                               queries=shows)

    # Nothing found: explain which criteria could not be satisfied.
    message = []
    if len(text_query) > 0:
        message.append('Unknown search term: ' + text_query)
    if len(star_query) > 0:
        message.append('Cannot find star: ' + star_query)
    if len(time_query) > 0:
        message.append('Cannot find time: ' + time_query)
    if len(director_query) > 0:
        message.append('Cannot find director: ' + director_query)
    if len(location_query) > 0:
        message.append('Cannot find location: ' + location_query)
    if len(language_query) > 0:
        message.append('Cannot find language: ' + language_query)
    if len(categories_query) > 0:
        message.append('Cannot find categories: ' + categories_query)
    if len(country_query) > 0:
        message.append('Cannot find country: ' + country_query)
    # The runtime boxes hold strings on POST but numbers (or "") when
    # restored from the globals, so test against "" instead of using len().
    has_min = mintime_query != ""
    has_max = maxtime_query != ""
    if has_min and has_max:
        message.append(
            'Cannot find running time between {} mins and {} mins'.format(
                mintime_query, maxtime_query))
    elif has_min:
        message.append(
            'Cannot find running time greater than {} mins'.format(
                mintime_query))
    else:
        message.append('Cannot find running time less than {} mins'.format(
            maxtime_query))
    return render_template('page_SERP.html',
                           results=message,
                           res_num=result_num,
                           page_num=page,
                           queries=shows)
def get_bucket_indexes(self, corpusId, bucketsIds: List[str] = None, bucketNames: List[str] = None, docTypes=None):
    """
    Internal method to get indexes to search from

    Looks up the buckets bound to ``corpusId`` in ``self.bucketBindingIndex``
    (optionally restricted to the given bucket ids and/or names, where
    matching either is enough) and collects the index patterns of each
    bucket.

    :param corpusId: Corpus whose buckets are searched. Must be truthy.
    :param bucketsIds: Bucket ids to restrict to (a single string is also
        accepted). Ids containing ``*`` are rejected. Defaults to no
        restriction.
    :param bucketNames: Bucket names to restrict to (a single string is
        also accepted). Defaults to no restriction.
    :param docTypes: Get indexes for specific docTypes. If empty will use
        all docTypes. Defaults to ``["default"]``.
    :return: {searchIndices: "<all indexes separated by comma>",
        indexByBucketId : {index1 : bucketId1, index2 : bucketId1 ...}}
    :raises InvalidSearchParameterException: when ``corpusId`` is empty or a
        bucket id contains ``*``.
    """
    logger = logging.getLogger(__name__)

    # None stands in for the former mutable default arguments.
    if docTypes is None:
        docTypes = ["default"]
    # A bare string is treated as a one-element list.
    if isinstance(bucketsIds, str):
        bucketsIds = [bucketsIds]
    if isinstance(bucketNames, str):
        bucketNames = [bucketNames]

    es = get_es_conn()
    s = Search(using=es, index=self.bucketBindingIndex)

    if not corpusId:
        logger.info("Invalid search corpusId: '{0}'".format(corpusId))
        raise InvalidSearchParameterException(
            "Invalid search corpusId: '{0}'".format(corpusId))

    # Filter Ids: wildcards are not allowed in bucket ids.
    for bucket_id in bucketsIds or []:
        if "*" in bucket_id:
            logger.info("Invalid search bucketIds: {0}".format(bucket_id))
            raise InvalidSearchParameterException(
                "Invalid search bucketIds: {0}".format(bucket_id))

    # find all matching buckets.
    should = []
    if bucketsIds:
        # `terms` accepts a list of values (`term` takes a single value).
        should.append(Q("terms", bucketId=list(bucketsIds)))
    # One `match` clause per name (`match` takes a single value).
    should.extend(Q("match", name=bucket_name)
                  for bucket_name in bucketNames or [])

    must = [Q('term', corpusId=corpusId)]
    if should:
        s.query = Q('bool', must=must, should=should, minimum_should_match=1)
    else:
        s.query = Q('bool', must=must)

    bucketInfo = s.execute()

    indexByBucketId = {}
    searchIndices = []
    # TODO: exception for bucket not allowed.
    for info in bucketInfo:
        bucket = self.get_bucket(corpusId, info.meta.id)
        strIndexes = bucket.dd.get_indices(docTypes)
        searchIndices.append(strIndexes)
        # Map every concrete index prefix (wildcards stripped) to its bucket.
        for index in strIndexes.replace('*', "").split(','):
            indexByBucketId[index] = bucket.id

    return {
        # str.join is a method of the separator string, not of the list.
        "searchIndices": ",".join(searchIndices),
        "indexByBucketId": indexByBucketId
    }
def calc(result):
    """Compute the two 'pue' figures and store them in ``result``.

    For every index prefix and meter key in the module-level mapping ``i``
    the most recent reading of the last 30 minutes is fetched from
    Elasticsearch (client ``esdb``), scaled and accumulated into its
    variable. The variables are then combined with fixed constants into two
    ratios, written to ``result['level1']['pue']`` and
    ``result['level2']['pue']``.
    NOTE(review): 'pue' presumably means power usage effectiveness --
    confirm with the owners of the formulas.

    Each entry of ``i[prefix]`` maps a key of ``'|'``-separated
    ``field:value`` term filters to a string
    ``"valueField|scale|variable|source"``.

    Side effects on ``result`` (both ``level1`` and ``level2``): ``start``
    and ``end`` UTC timestamps, ``pue``, and -- for meters with no reading
    -- the lists ``missing`` (variables) and ``missing-meters`` (sources).

    :param result: dict with pre-existing ``level1`` and ``level2`` dicts.
    :raises KeyError: if a variable used by the formulas is neither a
        constant nor configured in ``i``.
    """
    pue = dict()

    # Constants
    pue['N1'] = 1000
    pue['N2'] = 1000
    pue['N3'] = 710
    pue['N4'] = 1700
    pue['N8'] = 500
    pue['N9'] = 1600

    result['level1']['start'] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%S")
    result['level2']['start'] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%S")

    for x in i:
        indx = x + '*'
        for eskey in i[x]:
            # valueField is part of the configured record but not used here.
            (valueField, scale, variable, source) = i[x][eskey].split('|')
            if variable not in pue:
                pue[variable] = 0

            s = Search(using=esdb, index=indx)
            for term in eskey.split('|'):
                (subkey, subvalue) = term.split(':')
                s = s.query("term", **{subkey: subvalue})
            s = s.query('range',
                        **{'@timestamp': {
                            'gte': 'now-30m',
                            'lt': 'now'
                        }})
            # Newest reading only.
            s = s.sort('-@timestamp')
            s = s[0:1]
            response = s.execute()

            if response.hits.total != 0:
                for commit in response:
                    pue[variable] += commit['data']['datum'] * float(scale)
            else:
                # No reading in the window: record the variable and meter.
                # `in` replaces dict.has_key(), which Python 3 removed.
                if 'missing' not in result['level1']:
                    result['level1']['missing'] = [variable]
                    result['level2']['missing'] = [variable]
                else:
                    result['level1']['missing'].append(variable)
                    result['level2']['missing'].append(variable)
                if 'missing-meters' not in result['level2']:
                    result['level1']['missing-meters'] = [source]
                    result['level2']['missing-meters'] = [source]
                else:
                    result['level1']['missing-meters'].append(source)
                    result['level2']['missing-meters'].append(source)

    # Derived quantities.
    pue['N7'] = pue['N7p'] - pue['N7pp']
    pue['N10pp'] = pue['N10p'] - pue['N10']
    pue['D'] = pue['D1'] + pue['D2']
    pue['E'] = pue['E1'] + pue['E2']
    pue['F'] = pue['F1'] + pue['F2']

    # Every division below is guarded: a zero denominator yields 0.
    feeder_total = (pue['B1'] + pue['B2'] + pue['C1'] + pue['C2'] +
                    pue['D1'] + pue['D2'] + pue['E1'] + pue['E2'] +
                    pue['F1'] + pue['F2'])
    if feeder_total == 0:
        lineLoss = 0
    else:
        lineLoss = (pue['A1'] + pue['A2']) / feeder_total

    if pue['Bp'] == 0:
        txLoss590 = 0
    else:
        txLoss590 = (pue['B1'] + pue['B2']) / pue['Bp']

    if pue['Cp'] == 0:
        txLoss596 = 0
    else:
        txLoss596 = (pue['C1'] + pue['C2']) / pue['Cp']

    numm1 = ((pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] +
              pue['N6'] + pue['N7'] + pue['N8'] + pue['N9'] - pue['N7p'] +
              pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] +
              pue['ND1-5'] + pue['ND1-6']) * txLoss590 +
             (pue['Cp'] - pue['N10pp'] - pue['N11pp']) * txLoss596 +
             pue['D'] + pue['E'] + pue['F']) * lineLoss
    demon1 = (pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] +
              pue['ND1-5'] + pue['ND1-6']
              ) - pue['N7p'] + pue['Dp'] + pue['Ep'] + pue['Fp']

    numm2 = ((pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] +
              pue['N7'] + pue['N6'] + pue['N8'] + pue['N9'] + pue['ND2-1'] +
              pue['ND2-2'] + pue['ND2-3'] + pue['ND2-4'] + pue['ND2-5'] +
              pue['ND2-6'] + pue['ND2-7'] + pue['ND2-8'] + pue['ND2-9'] +
              pue['ND2-10'] + pue['ND2-11'] + pue['ND2-12'] +
              pue['ND2-13'] + pue['ND2-14'] + pue['ND2-15'] +
              pue['ND2-16'] + pue['ND2-17'] + pue['ND2-18']) * txLoss590 +
             (pue['Cp'] - pue['N10pp'] - pue['N11pp']) * txLoss596 +
             pue['D'] + pue['E'] + pue['F']) * lineLoss
    demon2 = (pue['ND2-1'] + pue['ND2-2'] + pue['ND2-3'] + pue['ND2-4'] +
              pue['ND2-5'] + pue['ND2-6'] + pue['ND2-7'] + pue['ND2-8'] +
              pue['ND2-9'] + pue['ND2-10'] + pue['ND2-11'] + pue['ND2-12'] +
              pue['ND2-13'] + pue['ND2-14'] + pue['ND2-15'] +
              pue['ND2-16'] + pue['ND2-17'] + pue['ND2-18'] + pue['Dp'] +
              pue['Ep'] + pue['Fp'])

    if demon1 == 0:
        p1 = 0
    else:
        p1 = numm1 / demon1
    if demon2 == 0:
        p2 = 0
    else:
        p2 = numm2 / demon2

    result['level1']['pue'] = p1
    result['level2']['pue'] = p2
    result['level1']['end'] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%S")
    result['level2']['end'] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%S")
def search_elastic(term='', user=None, sort='id', order='desc',
                   category='0_0', quality_filter='0', page=1,
                   rss=False, admin=False, logged_in_user=None,
                   per_page=75, max_search_results=1000):
    """Build and execute the Elasticsearch torrent search query.

    Args:
        term: Search text; when non-empty it is applied as a
            ``simple_query_string`` query (AND operator,
            ``my_search_analyzer``).
        user: Optional user id. When given, the user is looked up with
            ``models.User.by_id`` and results are restricted to that
            uploader (user view).
        sort: Sort key, case-insensitive; one of ``id``, ``size``,
            ``seeders``, ``leechers``, ``downloads``.
        order: ``asc`` or ``desc``, case-insensitive.
        category: ``"<main>_<sub>"`` category ids, e.g. ``"0_0"`` for no
            category restriction.
        quality_filter: ``'0'`` show all, ``'1'`` no remakes, ``'2'`` only
            trusted, ``'3'`` only completed.
        page: 1-based page number; ignored when ``rss`` is true.
        rss: When true, results are always sorted by id descending, only
            the first ``per_page`` items are returned, and hidden torrents
            are never included.
        admin: When true, deleted torrents are not filtered out.
        logged_in_user: Optional object with an ``id`` attribute; lets a
            user see their own hidden (and, in user view, anonymous)
            torrents.
        per_page: Number of results per page.
        max_search_results: Upper bound on the result window that can be
            paged through.

    Returns:
        The response of ``Search.execute()`` for the assembled query.

    Aborts the request via ``flask.abort`` with 400 on an invalid sort,
    order, quality filter or category, and with 404 when ``user`` does not
    resolve to an existing user.
    """
    # This function can easily be memcached now
    es_client = Elasticsearch()

    es_sort_keys = {
        'id': 'id',
        'size': 'filesize',
        # 'name': 'display_name',  # This is slow and buggy
        'seeders': 'seed_count',
        'leechers': 'leech_count',
        'downloads': 'download_count',
    }

    sort_ = sort.lower()
    if sort_ not in es_sort_keys:
        flask.abort(400)
    # Look up with the normalised key: the raw value may differ in case
    # from the one that was just validated.
    es_sort = es_sort_keys[sort_]

    order_ = order.lower()
    if order_ not in ('desc', 'asc'):
        flask.abort(400)

    # Only allow ID, desc if RSS
    if rss:
        es_sort = es_sort_keys['id']
        order_ = 'desc'

    # funky, es sort is default asc, prefixed by '-' if desc
    if order_ == 'desc':
        es_sort = '-' + es_sort

    # Quality filter
    quality_keys = [
        '0',  # Show all
        '1',  # No remakes
        '2',  # Only trusted
        '3',  # Only completed
    ]
    if quality_filter.lower() not in quality_keys:
        flask.abort(400)
    quality_filter = int(quality_filter)

    # Category filter
    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(
                    main_cat_id, sub_cat_id)
                if not sub_category:
                    flask.abort(400)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)
                if not main_category:
                    flask.abort(400)

    # This might be useless since we validate users
    # before coming into this method, but just to be safe...
    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    same_user = False
    if logged_in_user:
        same_user = user == logged_in_user.id

    s = Search(using=es_client,
               index=app.config.get('ES_INDEX_NAME'))  # todo, sukebei prefix

    # Apply search term
    if term:
        s = s.query('simple_query_string',
                    analyzer='my_search_analyzer',
                    default_operator="AND",
                    query=term)

    # User view (/user/username)
    if user:
        s = s.filter('term', uploader_id=user)

        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in user is not the same as the user being viewed,
            # show only torrents that aren't hidden or anonymous.
            #
            # If logged in user is the same as the user being viewed,
            # show all torrents including hidden and anonymous ones.
            #
            # On RSS pages in user view, show only torrents that
            # aren't hidden or anonymous no matter what
            if not same_user or rss:
                s = s.filter('term', hidden=False)
                s = s.filter('term', anonymous=False)
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in, show all torrents that aren't hidden unless
            # they belong to you.
            # On RSS pages, show all public torrents and nothing more.
            if logged_in_user and not rss:
                not_hidden = Q('term', hidden=False)
                own_upload = Q('term', uploader_id=logged_in_user.id)
                s = s.filter('bool', filter=[not_hidden | own_upload])
            else:
                s = s.filter('term', hidden=False)

    if main_category:
        s = s.filter('term', main_category_id=main_cat_id)
    elif sub_category:
        s = s.filter('term', main_category_id=main_cat_id)
        s = s.filter('term', sub_category_id=sub_cat_id)

    # Quality level -> extra term filter (level 0 adds no filter)
    quality_terms = {
        1: {'remake': False},
        2: {'trusted': True},
        3: {'complete': True},
    }
    if quality_filter in quality_terms:
        s = s.filter('term', **quality_terms[quality_filter])

    # Apply sort
    s = s.sort(es_sort)

    # Only show first RESULTS_PER_PAGE items for RSS
    if rss:
        s = s[0:per_page]
    else:
        # Clamp the requested page to the last page that still lies inside
        # the max_search_results window, and cap the slice end at it too.
        last_page = int(math.ceil(max_search_results / float(per_page)))
        current_page = min(page, last_page)
        from_idx = (current_page - 1) * per_page
        to_idx = min(max_search_results, current_page * per_page)
        s = s[from_idx:to_idx]

    highlight = app.config.get('ENABLE_ELASTIC_SEARCH_HIGHLIGHT')
    if highlight:
        s = s.highlight_options(tags_schema='styled')
        s = s.highlight("display_name")

    # Return query, uncomment print line to debug query
    # from pprint import pprint
    # print(json.dumps(s.to_dict()))
    return s.execute()