示例#1
0
文件: records.py 项目: wilbrodn/aleph
def records_query_shoulds(state):
    """Collect the ``should`` clauses used to match records for a search state.

    The clause for the free-text query comes first (only when
    ``state.has_text`` is truthy), followed by one clause for every entry of
    ``state.highlight_terms``. Returns a new list, possibly empty.
    """
    clauses = [text_query_string(state.text)] if state.has_text else []
    clauses.extend(text_query_string(term) for term in state.highlight_terms)
    return clauses
示例#2
0
def scan_entity_mentions(entity):
    """Yield ``(document_id, text)`` pairs for records mentioning an entity.

    One query clause is built per entry of ``entity.regex_terms``; a record
    qualifies when it matches at least one of them. When a record's ``text``
    field holds a list, one pair is yielded per list item.
    """
    clauses = [text_query_string(term) for term in entity.regex_terms]
    query = {
        'query': {'bool': {'should': clauses, 'minimum_should_match': 1}},
        'sort': [{'document_id': 'desc'}],
        '_source': ['document_id', 'text'],
    }
    hits = scan(get_es(), query=query, index=get_es_index(),
                doc_type=[TYPE_RECORD])
    for hit in hits:
        content = hit.get('_source').get('text')
        # Normalise a scalar text field to a one-element list.
        if not isinstance(content, list):
            content = [content]
        for item in content:
            yield (hit.get('_source').get('document_id'), item)
示例#3
0
文件: records.py 项目: 01-/aleph
def records_query(document_id, args, size=5, snippet_size=100):
    """Build the search request body used to find and highlight records.

    Args:
        document_id: When not ``None``, matches are restricted to records
            with this ``document_id``.
        args: Request arguments, read through ``get('q')``,
            ``getlist('entity')`` and ``get('snippet')``.
        size: Maximum number of hits requested.
        snippet_size: Highlight fragment size used when the ``snippet``
            argument is absent or cannot be parsed as an integer.

    Returns:
        The request body as a dict, or ``None`` when neither a text query
        nor any entity term was supplied.
    """
    shoulds = []
    text = args.get('q', '').strip()
    if text:
        shoulds.append(text_query_string(text))

    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        for term in entity.terms:
            shoulds.append({
                'multi_match': {
                    'query': term,
                    'type': "best_fields",
                    'fields': ['text^5', 'text_latin'],
                    'operator': 'AND'
                }
            })

    # Nothing to search for: signal this to the caller instead of matching all.
    if not shoulds:
        return None

    q = {
        'bool': {
            'minimum_should_match': 1,
            'should': shoulds
        }
    }
    if document_id is not None:
        q['bool']['must'] = {
            'term': {'document_id': document_id}
        }

    try:
        snippet_size = int(args.get('snippet', snippet_size))
    except (TypeError, ValueError):
        # A malformed ``snippet`` value falls back to the default size. Only
        # conversion errors are caught so that interrupts and unrelated
        # failures are not silently swallowed.
        pass

    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                },
                'text_latin': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                }
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
示例#4
0
def text_query(text):
    """Build the query fragment that searches for a piece of text.

    A missing or blank ``text`` yields ``match_all()``. Otherwise the result
    is a ``bool`` query that requires at least one of two clauses to match:
    the metadata query string, or the record-text query string wrapped by
    ``child_record``.
    """
    if text is None or len(text.strip()) == 0:
        return match_all()

    meta_clause = meta_query_string(text)
    record_clause = child_record({
        "bool": {"should": [text_query_string(text)]}
    })
    return {
        "bool": {
            "minimum_should_match": 1,
            "should": [meta_clause, record_clause],
        }
    }
示例#5
0
def scan_entity_mentions(entity):
    """Yield ``(document_id, text)`` pairs for records mentioning an entity.

    Builds one clause per entry of ``entity.regex_terms`` and scans for
    records matching at least one of them. A list-valued ``text`` field
    produces one pair per list item.
    """
    clauses = [text_query_string(term) for term in entity.regex_terms]
    query = {
        "query": {
            "bool": {
                "should": clauses,
                "minimum_should_match": 1,
            }
        },
        "sort": [{"document_id": "desc"}],
        "_source": ["document_id", "text"],
    }
    hits = scan(
        get_es(),
        query=query,
        index=get_es_index(),
        doc_type=[TYPE_RECORD],
    )
    for hit in hits:
        content = hit.get("_source").get("text")
        # Normalise a scalar text field to a one-element list.
        if not isinstance(content, list):
            content = [content]
        for item in content:
            yield (hit.get("_source").get("document_id"), item)
示例#6
0
def records_query(document_id, args, size=5, snippet_size=100):
    """Build the search request body used to find and highlight records.

    Args:
        document_id: When not ``None``, matches are restricted to records
            with this ``document_id``.
        args: Request arguments, read through ``get('q')``,
            ``getlist('entity')`` and ``get('snippet')``.
        size: Maximum number of hits requested.
        snippet_size: Highlight fragment size used when the ``snippet``
            argument is absent or cannot be parsed as an integer.

    Returns:
        The request body as a dict, or ``None`` when neither a text query
        nor any entity term was supplied.
    """
    shoulds = []
    text = args.get('q', '').strip()
    if text:
        shoulds.append(text_query_string(text))

    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        for term in entity.terms:
            shoulds.append({
                'multi_match': {
                    'query': term,
                    'type': "best_fields",
                    'fields': ['text^5', 'text_latin'],
                    'operator': 'AND'
                }
            })

    # Nothing to search for: signal this to the caller instead of matching all.
    if not shoulds:
        return None

    q = {'bool': {'minimum_should_match': 1, 'should': shoulds}}
    if document_id is not None:
        q['bool']['must'] = {'term': {'document_id': document_id}}

    try:
        snippet_size = int(args.get('snippet', snippet_size))
    except (TypeError, ValueError):
        # A malformed ``snippet`` value falls back to the default size. Only
        # conversion errors are caught so that interrupts and unrelated
        # failures are not silently swallowed.
        pass

    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                },
                'text_latin': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                }
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
示例#7
0
def text_query(text):
    """Build the query fragment that searches for a piece of text.

    A missing or blank ``text`` yields ``match_all()``. Otherwise the result
    is a ``bool`` query in which at least one of two clauses must match: the
    metadata query string, or the record-text query string wrapped by
    ``child_record``.
    """
    if text is None or len(text.strip()) == 0:
        return match_all()

    alternatives = [
        meta_query_string(text),
        child_record({"bool": {"should": [text_query_string(text)]}}),
    ]
    return {"bool": {"minimum_should_match": 1, "should": alternatives}}
示例#8
0
文件: tabular.py 项目: cdharris/aleph
def tabular_query(document_id, sheet, args):
    """Build the search request body for the rows of one sheet of a document.

    Args:
        document_id: Value the ``document_id`` term filter is set to.
        sheet: Value the ``sheet`` term filter is set to.
        args: Request arguments, read through ``get('q')`` and
            ``getlist('row')``.

    Returns:
        A request body dict asking for the first 100 rows. Results are
        ordered by ``row_id``; when a text query or a row selection is
        present, ``_score`` is used as the primary sort key.
    """
    scored = False
    q = match_all()
    text = args.get('q', '').strip()
    if text:
        scored = True
        q = text_query_string(text)

    try:
        rows = [int(r) for r in args.getlist('row')]
    except (TypeError, ValueError):
        # A single malformed row id discards the whole row selection. Only
        # conversion errors are caught so unrelated failures still surface.
        rows = []

    if rows:
        scored = True
        # Selected rows get a large constant boost on top of the base query.
        q = {
            "bool": {
                "must": [q],
                "should": [{
                    "constant_score": {
                        "filter": {
                            'terms': {
                                'row_id': rows
                            }
                        },
                        "boost": 1000
                    }
                }]
            }
        }

    q = add_filter(q, {'term': {'document_id': document_id}})
    q = add_filter(q, {'term': {'sheet': sheet}})

    sort = [{'row_id': 'asc'}]
    if scored:
        sort.insert(0, '_score')
    return {
        'from': 0,
        'size': 100,
        'query': q,
        'sort': sort,
        '_source': ['document_id', 'sheet', 'row_id', 'raw']
    }
示例#9
0
def text_query(text):
    """Construct the part of a query responsible for finding a piece of text.

    A missing or blank ``text`` yields ``match_all()``. Otherwise the result
    is a ``bool`` query in which at least one of two clauses must match: the
    metadata query string, or the record-text query string wrapped by
    ``child_record``.
    """
    if text is None or len(text.strip()) == 0:
        return match_all()

    meta_clause = meta_query_string(text)
    record_clause = child_record({
        "bool": {"should": [text_query_string(text)]}
    })
    bool_body = {
        "minimum_should_match": 1,
        "should": [meta_clause, record_clause],
    }
    return {"bool": bool_body}
示例#10
0
def records_query_shoulds(args):
    """Collect the ``should`` clauses for a record search from request args.

    The stripped ``q`` argument, when non-empty, contributes a query-string
    clause. Each term of every entity resolved from the ``entity`` argument
    list contributes a ``multi_match`` clause over ``text`` and
    ``text_latin``.
    """
    clauses = []
    needle = args.get('q', '').strip()
    if needle:
        clauses.append(text_query_string(needle))

    resolved = Entity.by_id_set(args.getlist('entity'))
    for entity in resolved.values():
        clauses.extend(
            {
                'multi_match': {
                    'query': term,
                    'type': "best_fields",
                    'fields': ['text^5', 'text_latin'],
                    'operator': 'AND',
                }
            }
            for term in entity.terms
        )
    return clauses
示例#11
0
def scan_entity_mentions(entity):
    """Yield ``(document_id, text)`` pairs for records mentioning an entity.

    Builds one clause per entry of ``entity.regex_terms`` and scans for
    records matching at least one of them. A list-valued ``text`` field
    produces one pair per list item.
    """
    clauses = [text_query_string(term) for term in entity.regex_terms]
    bool_body = {'should': clauses, "minimum_should_match": 1}
    query = {
        'query': {'bool': bool_body},
        'sort': [{'document_id': 'desc'}],
        '_source': ['document_id', 'text'],
    }
    hits = scan(get_es(), query=query, index=get_es_index(),
                doc_type=[TYPE_RECORD])
    for hit in hits:
        content = hit.get('_source').get('text')
        # Normalise a scalar text field to a one-element list.
        if not isinstance(content, list):
            content = [content]
        for item in content:
            yield (hit.get('_source').get('document_id'), item)
def scan_entity_mentions(entity):
    """Yield ``(document_id, text)`` pairs for records mentioning an entity.

    Builds one clause per entry of ``entity.regex_terms`` and scans for
    records matching at least one of them. The ``text`` field of each hit is
    passed through ``ensure_list`` and one pair is yielded per resulting
    item.
    """
    clauses = [text_query_string(term) for term in entity.regex_terms]
    bool_body = {'should': clauses, 'minimum_should_match': 1}
    query = {
        'query': {'bool': bool_body},
        'sort': [{'document_id': 'desc'}],
        '_source': ['document_id', 'text'],
    }
    hits = scan(es, query=query, index=es_index, doc_type=[TYPE_RECORD])
    for hit in hits:
        snippets = ensure_list(hit.get('_source').get('text'))
        for snippet in snippets:
            yield (hit.get('_source').get('document_id'), snippet)
示例#13
0
文件: tabular.py 项目: 01-/aleph
def tabular_query(document_id, sheet, args):
    """Build the search request body for the rows of one sheet of a document.

    Args:
        document_id: Value the ``document_id`` term filter is set to.
        sheet: Value the ``sheet`` term filter is set to.
        args: Request arguments, read through ``get('q')`` and
            ``getlist('row')``.

    Returns:
        A request body dict asking for the first 100 rows. Results are
        ordered by ``row_id``; when a text query or a row selection is
        present, ``_score`` is used as the primary sort key.
    """
    scored = False
    q = match_all()
    text = args.get('q', '').strip()
    if text:
        scored = True
        q = text_query_string(text)

    try:
        rows = [int(r) for r in args.getlist('row')]
    except (TypeError, ValueError):
        # A single malformed row id discards the whole row selection. Only
        # conversion errors are caught so unrelated failures still surface.
        rows = []

    if rows:
        scored = True
        # Selected rows get a large constant boost on top of the base query.
        q = {
            "bool": {
                "must": q,
                "should": {
                    "constant_score": {
                        "filter": {'terms': {'row_id': rows}},
                        "boost": 1000
                    }
                }
            }
        }

    q = add_filter(q, {'term': {'document_id': document_id}})
    q = add_filter(q, {'term': {'sheet': sheet}})

    sort = [{'row_id': 'asc'}]
    if scored:
        sort.insert(0, '_score')
    return {
        'from': 0,
        'size': 100,
        'query': q,
        'sort': sort,
        '_source': ['document_id', 'sheet', 'row_id', 'raw']
    }
示例#14
0
def records_query_shoulds(args):
    """Collect the ``should`` clauses for a record search from request args.

    The stripped ``q`` argument, when non-empty, contributes a query-string
    clause. Each term of every entity resolved from the ``entity`` argument
    list contributes a ``multi_match`` clause over ``text`` and
    ``text_latin``.
    """
    clauses = []
    needle = args.get("q", "").strip()
    if needle:
        clauses.append(text_query_string(needle))

    resolved = Entity.by_id_set(args.getlist("entity"))
    for entity in resolved.values():
        clauses.extend(
            {
                "multi_match": {
                    "query": term,
                    "type": "best_fields",
                    "fields": ["text^5", "text_latin"],
                    "operator": "AND",
                }
            }
            for term in entity.terms
        )
    return clauses
示例#15
0
文件: __init__.py 项目: 01-/aleph
def analyze_terms(terms, seen=None):
    """Request analysis of every document that matches any of the terms.

    For each term a literal query is run against document metadata and
    record text; ``analyze_document.delay`` is called once for every
    matching document id not already in ``seen``. Ids are added to ``seen``
    as they are handled, so a caller-supplied set is updated in place.
    """
    handled = set() if seen is None else seen
    for term in terms:
        meta_clause = meta_query_string(term, literal=True)
        record_clause = child_record({
            "bool": {"should": [text_query_string(term, literal=True)]}
        })
        query = {
            "bool": {
                "minimum_should_match": 1,
                "should": [meta_clause, record_clause],
            }
        }
        for doc_id in query_doc_ids(query):
            if doc_id in handled:
                continue
            analyze_document.delay(doc_id)
            handled.add(doc_id)
示例#16
0
def analyze_terms(terms, seen=None):
    """Request analysis of every document that matches any of the terms.

    Each term is passed through ``normalize_strong`` and then queried
    against document metadata and record text; ``analyze_document.delay`` is
    called once for every matching document id not already in ``seen``. Ids
    are added to ``seen`` as they are handled, so a caller-supplied set is
    updated in place.
    """
    handled = set() if seen is None else seen
    for raw_term in terms:
        term = normalize_strong(raw_term)
        meta_clause = meta_query_string(term)
        record_clause = child_record({
            "bool": {"should": [text_query_string(term)]}
        })
        query = {
            "bool": {
                "minimum_should_match": 1,
                "should": [meta_clause, record_clause],
            }
        }
        for doc_id in query_doc_ids(query):
            if doc_id in handled:
                continue
            analyze_document.delay(doc_id)
            handled.add(doc_id)