Пример #1
0
def _search_query_rank(query, **kwargs):
    """
    Rank query results, sorting by search rank for most content types and
    integrating the rating for subjects.
    """
    rank_func = None
    text = kwargs.get('text')
    language = kwargs.get('language')
    rank_cutoff = kwargs.get('rank_cutoff')

    if language is not None:
        rank_func = func.ts_rank_cd(SearchItem.terms, func.to_tsquery(SearchItem.getDictForLanguage(language), text))
    else:
        rank_func = func.ts_rank_cd(SearchItem.terms, func.to_tsquery(text))

    if rank_cutoff is not None:
        query = query.filter(rank_func >= rank_cutoff)

    if kwargs.get('use_rating'):
        query = query.order_by((SearchItem.rating * rank_func).desc())
    else:
        query = query.order_by(rank_func.desc())
    return query
Пример #2
0
def tag_search(text, count=5):
    """Search in the tag_search_items table (for location tags)."""
    QTag = aliased(LocationTag)
    QParent = aliased(LocationTag)
    text = text.lower().strip()
    query = meta.Session.query(TagSearchItem)\
        .join(QTag)\
        .outerjoin((QParent, QParent.id==QTag.parent_id))\
        .filter(TagSearchItem.terms.op('@@')(func.plainto_tsquery(text)))\
        .order_by(or_(func.lower(func.btrim(QParent.title)) == text, func.lower(func.btrim(QParent.title_short)) == text).desc())\
        .order_by(or_(func.lower(func.btrim(QTag.title)) == text, func.lower(func.btrim(QTag.title_short)) == text).desc())\
        .order_by(func.ts_rank_cd(TagSearchItem.terms, func.plainto_tsquery(text)))
    if count is not None:
        query = query.limit(count)
    return query.all()
Пример #3
0
def construct_fulltext_query_and_rank(searched):
    # Text version of the text search query directly constructed from
    # the searched string
    simple_ts_query_text = cast(func.plainto_tsquery('english', searched),
                                TEXT)
    # Text version of the search query using prefix search for the last
    # word in the base query
    # Case expression is necessary to capture the case of empty base query
    # (arising e.g. when the query consists entirely of special characters
    # and stop words)
    prefix_ts_query_text = case([(simple_ts_query_text == "", cast("", TEXT))],
                                else_=simple_ts_query_text.op('||')(cast(
                                    ":*", TEXT)))
    # Final text search query
    ts_query = func.to_tsquery('english', prefix_ts_query_text)
    # Rank for each search result
    ts_rank = func.ts_rank_cd(Post.__ts_vector__, ts_query).label("rank")
    return (ts_query, ts_rank)
Пример #4
0
def search_query(q, search):
    # works only for pg backend
    search = preprocess_search_query(search)
    return q.filter(model.Ebook.full_text.match(search))\
        .order_by(desc(func.ts_rank_cd(model.Ebook.full_text, func.to_tsquery(text("'custom'"), search))))
Пример #5
0
def search_query(q, search):
    # works only for pg backend
    search = preprocess_search_query(search)
    return q.filter(model.Ebook.full_text.match(search))\
        .order_by(desc(func.ts_rank_cd(model.Ebook.full_text, func.to_tsquery(text("'custom'"), search))))
Пример #6
0
def _gen_search_elements(query,
                         title_only,
                         next_rank,
                         page_size,
                         A,
                         B=[],
                         C=[],
                         D=[]):
    """
    Given a query and four sets of fields, (A, B, C, D), generates a bunch of postgres expressions for full text search.

    The four sets are in decreasing order of "importance" for ranking.

    A should be the "title", the others can be anything.

    If title_only=True, we only perform a trigram search against A only
    """
    if not title_only:
        # a postgres tsquery object that can be used to match against a tsvector
        tsq = func.websearch_to_tsquery(REGCONFIG, query)

        # the tsvector object that we want to search against with our tsquery
        tsv = _build_tsv(A, B, C, D)

        # document to generate snippet from
        doc = _build_doc(A, B, C, D)

        title = _build_doc(A)

        # trigram based text similarity between title and query string
        sim = _similarity(query, title)

        # ranking algo, weigh the similarity a lot, the text-based ranking less
        rank = (TRI_SIMILARITY_WEIGHT * sim +
                func.ts_rank_cd(tsv, tsq)).label("rank")

        # the snippet with results highlighted
        snippet = func.ts_headline(REGCONFIG, doc, tsq,
                                   "StartSel=**,StopSel=**").label("snippet")

        def do_search_query(orig_query):
            """
            Does the right search filtering, limiting, and ordering for the query
            """
            return (orig_query.filter(
                or_(tsv.op("@@")(tsq), sim > TRI_SIMILARITY_THRESHOLD)).filter(
                    rank <= next_rank if next_rank is not None else True).
                    order_by(rank.desc()).limit(page_size + 1).all())

    else:
        title = _build_doc(A)

        # trigram based text similarity between title and query string
        sim = _similarity(query, title)

        # ranking algo, weigh the similarity a lot, the text-based ranking less
        rank = sim.label("rank")

        # used only for headline
        tsq = func.websearch_to_tsquery(REGCONFIG, query)
        doc = _build_doc(A, B, C, D)

        # the snippet with results highlighted
        snippet = func.ts_headline(REGCONFIG, doc, tsq,
                                   "StartSel=**,StopSel=**").label("snippet")

        def do_search_query(orig_query):
            """
            Does the right search filtering, limiting, and ordering for the query
            """
            return (orig_query.filter(sim > TRI_SIMILARITY_THRESHOLD).filter(
                rank <= next_rank if next_rank is not None else True).order_by(
                    rank.desc()).limit(page_size + 1).all())

    return rank, snippet, do_search_query
Пример #7
0
def do_search(search_query, offset, result_count, new_domains, framework_slug):
    try:
        sort_dir = list(search_query['sort'][0].values())[0]['order']
    except (KeyError, IndexError):
        sort_dir = 'asc'

    try:
        sort_by = list(search_query['sort'][0].values())[0]['sort_by']
    except (KeyError, IndexError):
        sort_by = None

    try:
        terms = search_query['query']['filtered']['filter']['terms']
    except (KeyError, IndexError):
        terms = {}

    roles_list = None
    seller_types_list = None

    if terms:
        new_domains = 'prices.serviceRole.role' not in terms

        try:
            if new_domains:
                roles_list = terms['domains.assessed']
            else:
                roles = terms['prices.serviceRole.role']
                roles_list = set(_['role'][7:] for _ in roles)
        except KeyError:
            pass

        try:
            seller_types_list = terms['seller_types']
        except:  # noqa
            pass

    try:
        search_term = search_query['query']['match_phrase_prefix']['name']
    except KeyError:
        search_term = ''

    EXCLUDE_LEGACY_ROLES = not current_app.config['LEGACY_ROLE_MAPPING']

    if new_domains:
        q = db.session.query(Supplier).outerjoin(SupplierDomain).outerjoin(Domain) \
            .outerjoin(SupplierFramework).outerjoin(Framework)
    else:
        q = db.session.query(Supplier).outerjoin(PriceSchedule).outerjoin(ServiceRole) \
            .outerjoin(SupplierFramework).outerjoin(Framework)

    q = q.filter(
        Supplier.status != 'deleted', Supplier.abn != Supplier.DUMMY_ABN,
        or_(Framework.slug == framework_slug, ~Supplier.frameworks.any()))

    tsquery = None
    if search_term:
        if any(c in search_term for c in ['#', '-', '_', '/', '\\']):
            tsquery = func.phraseto_tsquery(search_term)
        elif ' ' in search_term:
            tsquery = func.plainto_tsquery(search_term)
        else:
            tsquery = func.to_tsquery(search_term + ":*")
        q = q.add_column(
            func.ts_headline(
                'english',
                func.concat(Supplier.summary, ' ',
                            Supplier.data['tools'].astext, ' ',
                            Supplier.data['methodologies'].astext, ' ',
                            Supplier.data['technologies'].astext, ''), tsquery,
                'MaxWords=25, MinWords=20, ShortWord=3, HighlightAll=FALSE, MaxFragments=1'
            ))

    q = q.group_by(Supplier.id)

    try:
        code = search_query['query']['term']['code']
        q = q.filter(Supplier.code == code)
    except KeyError:
        pass

    if roles_list is not None:
        if new_domains:
            if EXCLUDE_LEGACY_ROLES:
                d_agg = postgres.array_agg(cast(Domain.name, TEXT))
                q = q.filter(SupplierDomain.status == 'assessed')
                q = q.having(d_agg.contains(array(roles_list)))
        else:
            sr_agg = postgres.array_agg(
                cast(func.substring(ServiceRole.name, 8), TEXT))
            q = q.having(sr_agg.contains(array(roles_list)))

    if seller_types_list is not None and 'recruiter' in seller_types_list:
        q = q.filter(Supplier.is_recruiter == 'true')
        seller_types_list.remove('recruiter')
        if len(seller_types_list) == 0:
            seller_types_list = None

    if seller_types_list is not None:
        selected_seller_types = select(
            [postgres.array_agg(column('key'))],
            from_obj=func.json_each_text(Supplier.data[('seller_type', )]),
            whereclause=cast(column('value'), Boolean)).as_scalar()

        q = q.filter(selected_seller_types.contains(array(seller_types_list)))

    if sort_by:
        if sort_by == 'latest':
            ob = [desc(Supplier.last_update_time)]
        else:
            ob = [asc(Supplier.name)]
    else:
        if sort_dir == 'desc':
            ob = [desc(Supplier.name)]
        else:
            ob = [asc(Supplier.name)]

    if search_term:
        ob = [desc(func.ts_rank_cd(Supplier.text_vector, tsquery))] + ob

        q = q.filter(Supplier.text_vector.op('@@')(tsquery))

    q = q.order_by(*ob)

    raw_results = list(q)
    results = []

    for x in range(len(raw_results)):
        if type(raw_results[x]) is Supplier:
            result = raw_results[x]
        else:
            result = raw_results[x][0]
            if raw_results[x][1] is not None and raw_results[x][1] != '':
                result.summary = raw_results[x][1]
        results.append(result)

    sliced_results = results[offset:(offset + result_count)]

    q = db.session.query(Supplier.code, Supplier.name, Supplier.summary, Supplier.is_recruiter,
                         Supplier.data, Domain.name.label('domain_name'),
                         SupplierDomain.status.label('domain_status'))\
        .outerjoin(SupplierDomain, Domain)\
        .filter(Supplier.id.in_([sr.id for sr in sliced_results]))\
        .order_by(Supplier.name)

    suppliers = [r._asdict() for r in q]

    sliced_results = []
    for key, group in groupby(suppliers, key=itemgetter('code')):
        supplier = group.next()

        supplier['seller_type'] = supplier.get(
            'data') and supplier['data'].get('seller_type')

        supplier['domains'] = {'assessed': [], 'unassessed': []}
        for s in chain([supplier], group):
            domain, status = s['domain_name'], s['domain_status']
            if domain:
                if status == 'assessed':
                    supplier['domains']['assessed'].append(domain)
                else:
                    supplier['domains']['unassessed'].append(domain)

        for e in ['domain_name', 'domain_status', 'data']:
            supplier.pop(e, None)

        sliced_results.append(supplier)

    return sliced_results, len(results)
Пример #8
0
def casestudies_search():
    search_query = get_json_from_request()

    offset = get_nonnegative_int_or_400(request.args, 'from', 0)
    result_count = get_positive_int_or_400(
        request.args, 'size', current_app.config['DM_API_SUPPLIERS_PAGE_SIZE'])

    sort_dir = search_query.get('sort_dir', 'asc')
    sort_by = search_query.get('sort_by', None)
    domains = search_query.get('domains', None)
    seller_types = search_query.get('seller_types', None)
    search_term = search_query.get('search_term', None)
    framework_slug = request.args.get('framework', 'digital-marketplace')

    q = db.session.query(CaseStudy).join(Supplier).outerjoin(SupplierDomain).outerjoin(Domain) \
        .outerjoin(SupplierFramework).outerjoin(Framework)
    q = q.filter(
        Supplier.status != 'deleted',
        or_(Framework.slug == framework_slug, ~Supplier.frameworks.any()))
    tsquery = None
    if search_term:
        if ' ' in search_term:
            tsquery = func.plainto_tsquery(search_term)
        else:
            tsquery = func.to_tsquery(search_term + ":*")
        q = q.add_column(
            func.ts_headline(
                'english',
                func.concat(CaseStudy.data['approach'].astext, ' ',
                            CaseStudy.data['role'].astext), tsquery,
                'MaxWords=150, MinWords=75, ShortWord=3, HighlightAll=FALSE, FragmentDelimiter=" ... " '
            ))
    else:
        q = q.add_column("''")
    q = q.add_column(Supplier.name)
    q = q.add_column(postgres.array_agg(Supplier.data))
    q = q.group_by(CaseStudy.id, Supplier.name)

    if domains:
        d_agg = postgres.array_agg(cast(Domain.name, TEXT))
        q = q.having(d_agg.contains(array(domains)))

    if seller_types:
        selected_seller_types = select(
            [postgres.array_agg(column('key'))],
            from_obj=func.json_each_text(Supplier.data[('seller_type', )]),
            whereclause=cast(column('value'), Boolean)).as_scalar()

        q = q.filter(selected_seller_types.contains(array(seller_types)))

    if sort_dir in ('desc', 'z-a'):
        ob = [desc(CaseStudy.data['title'].astext)]
    else:
        ob = [asc(CaseStudy.data['title'].astext)]

    if search_term:
        ob = [
            desc(
                func.ts_rank_cd(
                    func.to_tsvector(
                        func.concat(
                            Supplier.name, CaseStudy.data['title'].astext,
                            CaseStudy.data['approach'].astext)), tsquery))
        ] + ob

        condition = func.to_tsvector(
            func.concat(Supplier.name, CaseStudy.data['title'].astext,
                        CaseStudy.data['approach'].astext)).op('@@')(tsquery)

        q = q.filter(condition)
    q = q.order_by(*ob)

    raw_results = list(q)
    results = []

    for x in range(len(raw_results)):
        result = raw_results[x][0].serialize()
        if raw_results[x][1] is not None and raw_results[x][1] != '':
            result['approach'] = raw_results[x][1]
        if raw_results[x][2] is not None:
            result['supplierName'] = raw_results[x][2]
        if raw_results[x][3] is not None and raw_results[x][3][0] is not None:
            result['seller_type'] = raw_results[x][3][0].get('seller_type')
        results.append(result)

    total_results = len(results)

    sliced_results = results[offset:(offset + result_count)]

    result = {
        'hits': {
            'total': total_results,
            'hits': [{
                '_source': r
            } for r in sliced_results]
        }
    }

    try:
        response = jsonify(result), 200
    except Exception as e:
        response = jsonify(message=str(e)), 500

    return response
def do_search(search_query, offset, result_count, new_domains, framework_slug):
    try:
        sort_dir = list(search_query['sort'][0].values())[0]['order']
    except (KeyError, IndexError):
        sort_dir = 'asc'

    try:
        sort_by = list(search_query['sort'][0].values())[0]['sort_by']
    except (KeyError, IndexError):
        sort_by = None

    try:
        terms = search_query['query']['filtered']['filter']['terms']
    except (KeyError, IndexError):
        terms = {}

    roles_list = None
    seller_types_list = None

    if terms:
        new_domains = 'prices.serviceRole.role' not in terms

        try:
            if new_domains:
                roles_list = terms['domains.assessed']
            else:
                roles = terms['prices.serviceRole.role']
                roles_list = set(_['role'][7:] for _ in roles)
        except KeyError:
            pass

        try:
            seller_types_list = terms['seller_types']
        except:  # noqa
            pass

    try:
        search_term = search_query['query']['match_phrase_prefix']['name']
    except KeyError:
        search_term = ''

    EXCLUDE_LEGACY_ROLES = not current_app.config['LEGACY_ROLE_MAPPING']

    if new_domains:
        q = db.session.query(Supplier).outerjoin(SupplierDomain).outerjoin(Domain) \
            .outerjoin(SupplierFramework).outerjoin(Framework)
    else:
        q = db.session.query(Supplier).outerjoin(PriceSchedule).outerjoin(ServiceRole) \
            .outerjoin(SupplierFramework).outerjoin(Framework)

    q = q.filter(Supplier.status != 'deleted', Supplier.abn != Supplier.DUMMY_ABN,
                 or_(Framework.slug == framework_slug, ~Supplier.frameworks.any()))

    tsquery = None
    if search_term:
        if any(c in search_term for c in ['#', '-', '_', '/', '\\']):
            tsquery = func.phraseto_tsquery(search_term)
        elif ' ' in search_term:
            tsquery = func.plainto_tsquery(search_term)
        else:
            tsquery = func.to_tsquery(search_term + ":*")
        q = q.add_column(func.ts_headline(
            'english',
            func.concat(Supplier.summary,
                         ' ',
                         Supplier.data['tools'].astext,
                         ' ',
                         Supplier.data['methodologies'].astext,
                         ' ',
                         Supplier.data['technologies'].astext, ''),
            tsquery,
            'MaxWords=25, MinWords=20, ShortWord=3, HighlightAll=FALSE, MaxFragments=1'
        ))

    q = q.group_by(Supplier.id)

    try:
        code = search_query['query']['term']['code']
        q = q.filter(Supplier.code == code)
    except KeyError:
        pass

    if roles_list is not None:
        if new_domains:
            if EXCLUDE_LEGACY_ROLES:
                d_agg = postgres.array_agg(cast(Domain.name, TEXT))
                q = q.having(d_agg.contains(array(roles_list)))
        else:
            sr_agg = postgres.array_agg(cast(func.substring(ServiceRole.name, 8), TEXT))
            q = q.having(sr_agg.contains(array(roles_list)))

    if seller_types_list is not None and 'recruiter' in seller_types_list:
        q = q.filter(Supplier.is_recruiter == 'true')
        seller_types_list.remove('recruiter')
        if len(seller_types_list) == 0:
            seller_types_list = None

    if seller_types_list is not None:
        selected_seller_types = select(
            [postgres.array_agg(column('key'))],
            from_obj=func.json_each_text(Supplier.data[('seller_type',)]),
            whereclause=cast(column('value'), Boolean)
        ).as_scalar()

        q = q.filter(selected_seller_types.contains(array(seller_types_list)))

    if sort_by:
        if sort_by == 'latest':
            ob = [desc(Supplier.last_update_time)]
        else:
            ob = [asc(Supplier.name)]
    else:
        if sort_dir == 'desc':
            ob = [desc(Supplier.name)]
        else:
            ob = [asc(Supplier.name)]

    if search_term:
        ob = [desc(func.ts_rank_cd(Supplier.text_vector, tsquery))] + ob

        q = q.filter(Supplier.text_vector.op('@@')(tsquery))

    q = q.order_by(*ob)

    raw_results = list(q)
    results = []

    for x in range(len(raw_results)):
        if type(raw_results[x]) is Supplier:
            result = raw_results[x]
        else:
            result = raw_results[x][0]
            if raw_results[x][1] is not None and raw_results[x][1] != '':
                result.summary = raw_results[x][1]
        results.append(result)

    sliced_results = results[offset:(offset + result_count)]

    q = db.session.query(Supplier.code, Supplier.name, Supplier.summary, Supplier.is_recruiter,
                         Supplier.data, Domain.name.label('domain_name'),
                         SupplierDomain.status.label('domain_status'))\
        .outerjoin(SupplierDomain, Domain)\
        .filter(Supplier.id.in_([sr.id for sr in sliced_results]))\
        .order_by(Supplier.name)

    suppliers = [r._asdict() for r in q]

    sliced_results = []
    for key, group in groupby(suppliers, key=itemgetter('code')):
        supplier = group.next()

        supplier['seller_type'] = supplier.get('data') and supplier['data'].get('seller_type')

        supplier['domains'] = {'assessed': [], 'unassessed': []}
        for s in chain([supplier], group):
            domain, status = s['domain_name'], s['domain_status']
            if domain:
                if status == 'assessed':
                    supplier['domains']['assessed'].append(domain)
                else:
                    supplier['domains']['unassessed'].append(domain)

        for e in ['domain_name', 'domain_status', 'data']:
            supplier.pop(e, None)

        sliced_results.append(supplier)

    return sliced_results, len(results)
Пример #10
0
def casestudies_search():
    search_query = get_json_from_request()

    offset = get_nonnegative_int_or_400(request.args, 'from', 0)
    result_count = get_positive_int_or_400(request.args, 'size', current_app.config['DM_API_SUPPLIERS_PAGE_SIZE'])

    sort_dir = search_query.get('sort_dir', 'asc')
    sort_by = search_query.get('sort_by', None)
    domains = search_query.get('domains', None)
    seller_types = search_query.get('seller_types', None)
    search_term = search_query.get('search_term', None)
    framework_slug = request.args.get('framework', 'digital-marketplace')

    q = db.session.query(CaseStudy).join(Supplier).outerjoin(SupplierDomain).outerjoin(Domain) \
        .outerjoin(SupplierFramework).outerjoin(Framework)
    q = q.filter(Supplier.status != 'deleted', or_(Framework.slug == framework_slug, ~Supplier.frameworks.any()))
    tsquery = None
    if search_term:
        if ' ' in search_term:
            tsquery = func.plainto_tsquery(search_term)
        else:
            tsquery = func.to_tsquery(search_term + ":*")
        q = q.add_column(func.ts_headline(
            'english',
            func.concat(
                CaseStudy.data['approach'].astext,
                ' ',
                CaseStudy.data['role'].astext),
            tsquery,
            'MaxWords=150, MinWords=75, ShortWord=3, HighlightAll=FALSE, FragmentDelimiter=" ... " '
        ))
    else:
        q = q.add_column("''")
    q = q.add_column(Supplier.name)
    q = q.add_column(postgres.array_agg(Supplier.data))
    q = q.group_by(CaseStudy.id, Supplier.name)

    if domains:
        d_agg = postgres.array_agg(cast(Domain.name, TEXT))
        q = q.having(d_agg.contains(array(domains)))

    if seller_types:
        selected_seller_types = select(
            [postgres.array_agg(column('key'))],
            from_obj=func.json_each_text(Supplier.data[('seller_type',)]),
            whereclause=cast(column('value'), Boolean)
        ).as_scalar()

        q = q.filter(selected_seller_types.contains(array(seller_types)))

    if sort_dir in ('desc', 'z-a'):
        ob = [desc(CaseStudy.data['title'].astext)]
    else:
        ob = [asc(CaseStudy.data['title'].astext)]

    if search_term:
        ob = [desc(func.ts_rank_cd(func.to_tsvector(
            func.concat(Supplier.name, CaseStudy.data['title'].astext,
                        CaseStudy.data['approach'].astext)), tsquery))] + ob

        condition = func.to_tsvector(func.concat(Supplier.name,
                                                 CaseStudy.data['title'].astext,
                                                 CaseStudy.data['approach'].astext)).op('@@')(tsquery)

        q = q.filter(condition)
    q = q.order_by(*ob)

    raw_results = list(q)
    results = []

    for x in range(len(raw_results)):
        result = raw_results[x][0].serialize()
        if raw_results[x][1] is not None and raw_results[x][1] != '':
            result['approach'] = raw_results[x][1]
        if raw_results[x][2] is not None:
            result['supplierName'] = raw_results[x][2]
        if raw_results[x][3] is not None and raw_results[x][3][0] is not None:
            result['seller_type'] = raw_results[x][3][0].get('seller_type')
        results.append(result)

    total_results = len(results)

    sliced_results = results[offset:(offset + result_count)]

    result = {
        'hits': {
            'total': total_results,
            'hits': [{'_source': r} for r in sliced_results]
        }
    }

    try:
        response = jsonify(result), 200
    except Exception as e:
        response = jsonify(message=str(e)), 500

    return response