def records_query_shoulds(state):
    shoulds = []
    if state.has_text:
        shoulds.append(text_query_string(state.text))
    for term in state.highlight_terms:
        shoulds.append(text_query_string(term))
    return shoulds
def scan_entity_mentions(entity):
    """Find mentions of a given entity in all records."""
    shoulds = []
    for term in entity.regex_terms:
        shoulds.append(text_query_string(term))
    query = {
        'query': {
            'bool': {
                'should': shoulds,
                'minimum_should_match': 1
            }
        },
        'sort': [{'document_id': 'desc'}],
        '_source': ['document_id', 'text']
    }
    for res in scan(get_es(), query=query, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        text = res.get('_source').get('text')
        texts = text if isinstance(text, list) else [text]
        for text in texts:
            yield (res.get('_source').get('document_id'), text)
def records_query(document_id, args, size=5, snippet_size=100):
    shoulds = []
    text = args.get('q', '').strip()
    if len(text):
        shoulds.append(text_query_string(text))
    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        for term in entity.terms:
            shoulds.append({
                'multi_match': {
                    'query': term,
                    'type': 'best_fields',
                    'fields': ['text^5', 'text_latin'],
                    'operator': 'AND'
                }
            })
    if not len(shoulds):
        return None
    q = {
        'bool': {
            'minimum_should_match': 1,
            'should': shoulds
        }
    }
    if document_id is not None:
        q['bool']['must'] = {
            'term': {'document_id': document_id}
        }
    try:
        snippet_size = int(args.get('snippet', snippet_size))
    except (TypeError, ValueError):
        pass
    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                },
                'text_latin': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                }
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
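# Hypothetical usage sketch (not part of this module): how a caller might run
# the body built by records_query() against Elasticsearch. It assumes the same
# get_es(), get_es_index() and TYPE_RECORD helpers used by scan_entity_mentions
# above; 'args' stands in for a Flask/Werkzeug MultiDict of request parameters.
def search_records(document_id, args):
    body = records_query(document_id, args, size=10)
    if body is None:
        return []
    res = get_es().search(index=get_es_index(), doc_type=TYPE_RECORD,
                          body=body)
    return res.get('hits', {}).get('hits', [])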
def text_query(text):
    """
    Construct the part of a query which is responsible for finding a
    piece of text in the selected documents.
    """
    if text is None or not len(text.strip()):
        return match_all()
    return {
        "bool": {
            "minimum_should_match": 1,
            "should": [
                meta_query_string(text),
                child_record({
                    "bool": {
                        "should": [text_query_string(text)]
                    }
                })
            ]
        }
    }
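# Illustrative sketch only: scoping the query built by text_query() to a single
# document with the add_filter() helper, following the same pattern that
# tabular_query() uses below. The helper name comes from this module; the
# wrapper function itself is an assumption.
def document_text_query(text, document_id):
    q = text_query(text)
    return add_filter(q, {'term': {'document_id': document_id}})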
def tabular_query(document_id, sheet, args):
    scored = False
    q = match_all()
    text = args.get('q', '').strip()
    if len(text):
        scored = True
        q = text_query_string(text)
    try:
        rows = [int(r) for r in args.getlist('row')]
    except Exception:
        rows = []
    if len(rows):
        scored = True
        q = {
            "bool": {
                "must": [q],
                "should": [{
                    "constant_score": {
                        "filter": {'terms': {'row_id': rows}},
                        "boost": 1000
                    }
                }]
            }
        }
    q = add_filter(q, {'term': {'document_id': document_id}})
    q = add_filter(q, {'term': {'sheet': sheet}})
    # pprint(q)
    sort = [{'row_id': 'asc'}]
    if scored:
        sort.insert(0, '_score')
    return {
        'from': 0,
        'size': 100,
        'query': q,
        'sort': sort,
        '_source': ['document_id', 'sheet', 'row_id', 'raw']
    }
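# Hypothetical usage: paging through one sheet of a tabular document with the
# body produced by tabular_query(). Assumes the get_es(), get_es_index() and
# TYPE_RECORD helpers referenced elsewhere in this module.
def fetch_sheet_rows(document_id, sheet, args):
    body = tabular_query(document_id, sheet, args)
    res = get_es().search(index=get_es_index(), doc_type=TYPE_RECORD,
                          body=body)
    return [hit.get('_source', {}).get('raw')
            for hit in res.get('hits', {}).get('hits', [])]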
def records_query_shoulds(args):
    shoulds = []
    query_text = args.get('q', '').strip()
    if len(query_text):
        shoulds.append(text_query_string(query_text))
    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        for term in entity.terms:
            shoulds.append({
                'multi_match': {
                    'query': term,
                    'type': 'best_fields',
                    'fields': ['text^5', 'text_latin'],
                    'operator': 'AND'
                }
            })
    return shoulds
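# Sketch: wrapping the clauses from records_query_shoulds() into a bool query,
# mirroring what records_query() does above. Purely illustrative; the wrapper
# name is an assumption.
def build_records_query(document_id, args):
    shoulds = records_query_shoulds(args)
    if not len(shoulds):
        return None
    q = {'bool': {'should': shoulds, 'minimum_should_match': 1}}
    if document_id is not None:
        q['bool']['must'] = {'term': {'document_id': document_id}}
    return q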
def scan_entity_mentions(entity):
    """Find mentions of a given entity in all records."""
    shoulds = []
    for term in entity.regex_terms:
        shoulds.append(text_query_string(term))
    query = {
        'query': {
            'bool': {
                'should': shoulds,
                'minimum_should_match': 1
            }
        },
        'sort': [{'document_id': 'desc'}],
        '_source': ['document_id', 'text']
    }
    for res in scan(es, query=query, index=es_index, doc_type=[TYPE_RECORD]):
        for text in ensure_list(res.get('_source').get('text')):
            yield (res.get('_source').get('document_id'), text)
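# Usage sketch (hypothetical helper): tallying how many record texts mention an
# entity in each document, using the generator above. 'entity' is assumed to
# expose the regex_terms attribute that scan_entity_mentions() reads.
def count_entity_mention_records(entity):
    counts = {}
    for document_id, text in scan_entity_mentions(entity):
        counts[document_id] = counts.get(document_id, 0) + 1
    return counts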
def tabular_query(document_id, sheet, args):
    scored = False
    q = match_all()
    text = args.get('q', '').strip()
    if len(text):
        scored = True
        q = text_query_string(text)
    try:
        rows = [int(r) for r in args.getlist('row')]
    except Exception:
        rows = []
    if len(rows):
        scored = True
        q = {
            "bool": {
                "must": q,
                "should": {
                    "constant_score": {
                        "filter": {'terms': {'row_id': rows}},
                        "boost": 1000
                    }
                }
            }
        }
    q = add_filter(q, {'term': {'document_id': document_id}})
    q = add_filter(q, {'term': {'sheet': sheet}})
    # pprint(q)
    sort = [{'row_id': 'asc'}]
    if scored:
        sort.insert(0, '_score')
    return {
        'from': 0,
        'size': 100,
        'query': q,
        'sort': sort,
        '_source': ['document_id', 'sheet', 'row_id', 'raw']
    }
def analyze_terms(terms, seen=None):
    if seen is None:
        seen = set()
    for term in terms:
        query = {
            "bool": {
                "minimum_should_match": 1,
                "should": [
                    meta_query_string(term, literal=True),
                    child_record({
                        "bool": {
                            "should": [text_query_string(term, literal=True)]
                        }
                    })
                ]
            }
        }
        for doc_id in query_doc_ids(query):
            if doc_id not in seen:
                analyze_document.delay(doc_id)
                seen.add(doc_id)
def analyze_terms(terms, seen=None):
    if seen is None:
        seen = set()
    for term in terms:
        term = normalize_strong(term)
        query = {
            "bool": {
                "minimum_should_match": 1,
                "should": [
                    meta_query_string(term),
                    child_record({
                        "bool": {
                            "should": [text_query_string(term)]
                        }
                    })
                ]
            }
        }
        for doc_id in query_doc_ids(query):
            if doc_id not in seen:
                analyze_document.delay(doc_id)
                seen.add(doc_id)
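# Hypothetical caller: queue re-analysis for every document that mentions any
# of the given entities, sharing one 'seen' set so each document is dispatched
# at most once. Assumes entity.regex_terms as used by scan_entity_mentions().
def reanalyze_entities(entities):
    seen = set()
    for entity in entities:
        analyze_terms(entity.regex_terms, seen=seen)
    return seen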