def _search_query_rank(query, **kwargs): """ Rank query results, sorting by search rank for most content types and integrating the rating for subjects. """ rank_func = None text = kwargs.get('text') language = kwargs.get('language') rank_cutoff = kwargs.get('rank_cutoff') if language is not None: rank_func = func.ts_rank_cd(SearchItem.terms, func.to_tsquery(SearchItem.getDictForLanguage(language), text)) else: rank_func = func.ts_rank_cd(SearchItem.terms, func.to_tsquery(text)) if rank_cutoff is not None: query = query.filter(rank_func >= rank_cutoff) if kwargs.get('use_rating'): query = query.order_by((SearchItem.rating * rank_func).desc()) else: query = query.order_by(rank_func.desc()) return query
def tag_search(text, count=5): """Search in the tag_search_items table (for location tags).""" QTag = aliased(LocationTag) QParent = aliased(LocationTag) text = text.lower().strip() query = meta.Session.query(TagSearchItem)\ .join(QTag)\ .outerjoin((QParent, QParent.id==QTag.parent_id))\ .filter(TagSearchItem.terms.op('@@')(func.plainto_tsquery(text)))\ .order_by(or_(func.lower(func.btrim(QParent.title)) == text, func.lower(func.btrim(QParent.title_short)) == text).desc())\ .order_by(or_(func.lower(func.btrim(QTag.title)) == text, func.lower(func.btrim(QTag.title_short)) == text).desc())\ .order_by(func.ts_rank_cd(TagSearchItem.terms, func.plainto_tsquery(text))) if count is not None: query = query.limit(count) return query.all()
def construct_fulltext_query_and_rank(searched): # Text version of the text search query directly constructed from # the searched string simple_ts_query_text = cast(func.plainto_tsquery('english', searched), TEXT) # Text version of the search query using prefix search for the last # word in the base query # Case expression is necessary to capture the case of empty base query # (arising e.g. when the query consists entirely of special characters # and stop words) prefix_ts_query_text = case([(simple_ts_query_text == "", cast("", TEXT))], else_=simple_ts_query_text.op('||')(cast( ":*", TEXT))) # Final text search query ts_query = func.to_tsquery('english', prefix_ts_query_text) # Rank for each search result ts_rank = func.ts_rank_cd(Post.__ts_vector__, ts_query).label("rank") return (ts_query, ts_rank)
def search_query(q, search): # works only for pg backend search = preprocess_search_query(search) return q.filter(model.Ebook.full_text.match(search))\ .order_by(desc(func.ts_rank_cd(model.Ebook.full_text, func.to_tsquery(text("'custom'"), search))))
def _gen_search_elements(query, title_only, next_rank, page_size, A, B=[], C=[], D=[]): """ Given a query and four sets of fields, (A, B, C, D), generates a bunch of postgres expressions for full text search. The four sets are in decreasing order of "importance" for ranking. A should be the "title", the others can be anything. If title_only=True, we only perform a trigram search against A only """ if not title_only: # a postgres tsquery object that can be used to match against a tsvector tsq = func.websearch_to_tsquery(REGCONFIG, query) # the tsvector object that we want to search against with our tsquery tsv = _build_tsv(A, B, C, D) # document to generate snippet from doc = _build_doc(A, B, C, D) title = _build_doc(A) # trigram based text similarity between title and query string sim = _similarity(query, title) # ranking algo, weigh the similarity a lot, the text-based ranking less rank = (TRI_SIMILARITY_WEIGHT * sim + func.ts_rank_cd(tsv, tsq)).label("rank") # the snippet with results highlighted snippet = func.ts_headline(REGCONFIG, doc, tsq, "StartSel=**,StopSel=**").label("snippet") def do_search_query(orig_query): """ Does the right search filtering, limiting, and ordering for the query """ return (orig_query.filter( or_(tsv.op("@@")(tsq), sim > TRI_SIMILARITY_THRESHOLD)).filter( rank <= next_rank if next_rank is not None else True). order_by(rank.desc()).limit(page_size + 1).all()) else: title = _build_doc(A) # trigram based text similarity between title and query string sim = _similarity(query, title) # ranking algo, weigh the similarity a lot, the text-based ranking less rank = sim.label("rank") # used only for headline tsq = func.websearch_to_tsquery(REGCONFIG, query) doc = _build_doc(A, B, C, D) # the snippet with results highlighted snippet = func.ts_headline(REGCONFIG, doc, tsq, "StartSel=**,StopSel=**").label("snippet") def do_search_query(orig_query): """ Does the right search filtering, limiting, and ordering for the query """ return (orig_query.filter(sim > TRI_SIMILARITY_THRESHOLD).filter( rank <= next_rank if next_rank is not None else True).order_by( rank.desc()).limit(page_size + 1).all()) return rank, snippet, do_search_query
def do_search(search_query, offset, result_count, new_domains, framework_slug): try: sort_dir = list(search_query['sort'][0].values())[0]['order'] except (KeyError, IndexError): sort_dir = 'asc' try: sort_by = list(search_query['sort'][0].values())[0]['sort_by'] except (KeyError, IndexError): sort_by = None try: terms = search_query['query']['filtered']['filter']['terms'] except (KeyError, IndexError): terms = {} roles_list = None seller_types_list = None if terms: new_domains = 'prices.serviceRole.role' not in terms try: if new_domains: roles_list = terms['domains.assessed'] else: roles = terms['prices.serviceRole.role'] roles_list = set(_['role'][7:] for _ in roles) except KeyError: pass try: seller_types_list = terms['seller_types'] except: # noqa pass try: search_term = search_query['query']['match_phrase_prefix']['name'] except KeyError: search_term = '' EXCLUDE_LEGACY_ROLES = not current_app.config['LEGACY_ROLE_MAPPING'] if new_domains: q = db.session.query(Supplier).outerjoin(SupplierDomain).outerjoin(Domain) \ .outerjoin(SupplierFramework).outerjoin(Framework) else: q = db.session.query(Supplier).outerjoin(PriceSchedule).outerjoin(ServiceRole) \ .outerjoin(SupplierFramework).outerjoin(Framework) q = q.filter( Supplier.status != 'deleted', Supplier.abn != Supplier.DUMMY_ABN, or_(Framework.slug == framework_slug, ~Supplier.frameworks.any())) tsquery = None if search_term: if any(c in search_term for c in ['#', '-', '_', '/', '\\']): tsquery = func.phraseto_tsquery(search_term) elif ' ' in search_term: tsquery = func.plainto_tsquery(search_term) else: tsquery = func.to_tsquery(search_term + ":*") q = q.add_column( func.ts_headline( 'english', func.concat(Supplier.summary, ' ', Supplier.data['tools'].astext, ' ', Supplier.data['methodologies'].astext, ' ', Supplier.data['technologies'].astext, ''), tsquery, 'MaxWords=25, MinWords=20, ShortWord=3, HighlightAll=FALSE, MaxFragments=1' )) q = q.group_by(Supplier.id) try: code = search_query['query']['term']['code'] q = q.filter(Supplier.code == code) except KeyError: pass if roles_list is not None: if new_domains: if EXCLUDE_LEGACY_ROLES: d_agg = postgres.array_agg(cast(Domain.name, TEXT)) q = q.filter(SupplierDomain.status == 'assessed') q = q.having(d_agg.contains(array(roles_list))) else: sr_agg = postgres.array_agg( cast(func.substring(ServiceRole.name, 8), TEXT)) q = q.having(sr_agg.contains(array(roles_list))) if seller_types_list is not None and 'recruiter' in seller_types_list: q = q.filter(Supplier.is_recruiter == 'true') seller_types_list.remove('recruiter') if len(seller_types_list) == 0: seller_types_list = None if seller_types_list is not None: selected_seller_types = select( [postgres.array_agg(column('key'))], from_obj=func.json_each_text(Supplier.data[('seller_type', )]), whereclause=cast(column('value'), Boolean)).as_scalar() q = q.filter(selected_seller_types.contains(array(seller_types_list))) if sort_by: if sort_by == 'latest': ob = [desc(Supplier.last_update_time)] else: ob = [asc(Supplier.name)] else: if sort_dir == 'desc': ob = [desc(Supplier.name)] else: ob = [asc(Supplier.name)] if search_term: ob = [desc(func.ts_rank_cd(Supplier.text_vector, tsquery))] + ob q = q.filter(Supplier.text_vector.op('@@')(tsquery)) q = q.order_by(*ob) raw_results = list(q) results = [] for x in range(len(raw_results)): if type(raw_results[x]) is Supplier: result = raw_results[x] else: result = raw_results[x][0] if raw_results[x][1] is not None and raw_results[x][1] != '': result.summary = raw_results[x][1] results.append(result) sliced_results = results[offset:(offset + result_count)] q = db.session.query(Supplier.code, Supplier.name, Supplier.summary, Supplier.is_recruiter, Supplier.data, Domain.name.label('domain_name'), SupplierDomain.status.label('domain_status'))\ .outerjoin(SupplierDomain, Domain)\ .filter(Supplier.id.in_([sr.id for sr in sliced_results]))\ .order_by(Supplier.name) suppliers = [r._asdict() for r in q] sliced_results = [] for key, group in groupby(suppliers, key=itemgetter('code')): supplier = group.next() supplier['seller_type'] = supplier.get( 'data') and supplier['data'].get('seller_type') supplier['domains'] = {'assessed': [], 'unassessed': []} for s in chain([supplier], group): domain, status = s['domain_name'], s['domain_status'] if domain: if status == 'assessed': supplier['domains']['assessed'].append(domain) else: supplier['domains']['unassessed'].append(domain) for e in ['domain_name', 'domain_status', 'data']: supplier.pop(e, None) sliced_results.append(supplier) return sliced_results, len(results)
def casestudies_search(): search_query = get_json_from_request() offset = get_nonnegative_int_or_400(request.args, 'from', 0) result_count = get_positive_int_or_400( request.args, 'size', current_app.config['DM_API_SUPPLIERS_PAGE_SIZE']) sort_dir = search_query.get('sort_dir', 'asc') sort_by = search_query.get('sort_by', None) domains = search_query.get('domains', None) seller_types = search_query.get('seller_types', None) search_term = search_query.get('search_term', None) framework_slug = request.args.get('framework', 'digital-marketplace') q = db.session.query(CaseStudy).join(Supplier).outerjoin(SupplierDomain).outerjoin(Domain) \ .outerjoin(SupplierFramework).outerjoin(Framework) q = q.filter( Supplier.status != 'deleted', or_(Framework.slug == framework_slug, ~Supplier.frameworks.any())) tsquery = None if search_term: if ' ' in search_term: tsquery = func.plainto_tsquery(search_term) else: tsquery = func.to_tsquery(search_term + ":*") q = q.add_column( func.ts_headline( 'english', func.concat(CaseStudy.data['approach'].astext, ' ', CaseStudy.data['role'].astext), tsquery, 'MaxWords=150, MinWords=75, ShortWord=3, HighlightAll=FALSE, FragmentDelimiter=" ... " ' )) else: q = q.add_column("''") q = q.add_column(Supplier.name) q = q.add_column(postgres.array_agg(Supplier.data)) q = q.group_by(CaseStudy.id, Supplier.name) if domains: d_agg = postgres.array_agg(cast(Domain.name, TEXT)) q = q.having(d_agg.contains(array(domains))) if seller_types: selected_seller_types = select( [postgres.array_agg(column('key'))], from_obj=func.json_each_text(Supplier.data[('seller_type', )]), whereclause=cast(column('value'), Boolean)).as_scalar() q = q.filter(selected_seller_types.contains(array(seller_types))) if sort_dir in ('desc', 'z-a'): ob = [desc(CaseStudy.data['title'].astext)] else: ob = [asc(CaseStudy.data['title'].astext)] if search_term: ob = [ desc( func.ts_rank_cd( func.to_tsvector( func.concat( Supplier.name, CaseStudy.data['title'].astext, CaseStudy.data['approach'].astext)), tsquery)) ] + ob condition = func.to_tsvector( func.concat(Supplier.name, CaseStudy.data['title'].astext, CaseStudy.data['approach'].astext)).op('@@')(tsquery) q = q.filter(condition) q = q.order_by(*ob) raw_results = list(q) results = [] for x in range(len(raw_results)): result = raw_results[x][0].serialize() if raw_results[x][1] is not None and raw_results[x][1] != '': result['approach'] = raw_results[x][1] if raw_results[x][2] is not None: result['supplierName'] = raw_results[x][2] if raw_results[x][3] is not None and raw_results[x][3][0] is not None: result['seller_type'] = raw_results[x][3][0].get('seller_type') results.append(result) total_results = len(results) sliced_results = results[offset:(offset + result_count)] result = { 'hits': { 'total': total_results, 'hits': [{ '_source': r } for r in sliced_results] } } try: response = jsonify(result), 200 except Exception as e: response = jsonify(message=str(e)), 500 return response
def do_search(search_query, offset, result_count, new_domains, framework_slug): try: sort_dir = list(search_query['sort'][0].values())[0]['order'] except (KeyError, IndexError): sort_dir = 'asc' try: sort_by = list(search_query['sort'][0].values())[0]['sort_by'] except (KeyError, IndexError): sort_by = None try: terms = search_query['query']['filtered']['filter']['terms'] except (KeyError, IndexError): terms = {} roles_list = None seller_types_list = None if terms: new_domains = 'prices.serviceRole.role' not in terms try: if new_domains: roles_list = terms['domains.assessed'] else: roles = terms['prices.serviceRole.role'] roles_list = set(_['role'][7:] for _ in roles) except KeyError: pass try: seller_types_list = terms['seller_types'] except: # noqa pass try: search_term = search_query['query']['match_phrase_prefix']['name'] except KeyError: search_term = '' EXCLUDE_LEGACY_ROLES = not current_app.config['LEGACY_ROLE_MAPPING'] if new_domains: q = db.session.query(Supplier).outerjoin(SupplierDomain).outerjoin(Domain) \ .outerjoin(SupplierFramework).outerjoin(Framework) else: q = db.session.query(Supplier).outerjoin(PriceSchedule).outerjoin(ServiceRole) \ .outerjoin(SupplierFramework).outerjoin(Framework) q = q.filter(Supplier.status != 'deleted', Supplier.abn != Supplier.DUMMY_ABN, or_(Framework.slug == framework_slug, ~Supplier.frameworks.any())) tsquery = None if search_term: if any(c in search_term for c in ['#', '-', '_', '/', '\\']): tsquery = func.phraseto_tsquery(search_term) elif ' ' in search_term: tsquery = func.plainto_tsquery(search_term) else: tsquery = func.to_tsquery(search_term + ":*") q = q.add_column(func.ts_headline( 'english', func.concat(Supplier.summary, ' ', Supplier.data['tools'].astext, ' ', Supplier.data['methodologies'].astext, ' ', Supplier.data['technologies'].astext, ''), tsquery, 'MaxWords=25, MinWords=20, ShortWord=3, HighlightAll=FALSE, MaxFragments=1' )) q = q.group_by(Supplier.id) try: code = search_query['query']['term']['code'] q = q.filter(Supplier.code == code) except KeyError: pass if roles_list is not None: if new_domains: if EXCLUDE_LEGACY_ROLES: d_agg = postgres.array_agg(cast(Domain.name, TEXT)) q = q.having(d_agg.contains(array(roles_list))) else: sr_agg = postgres.array_agg(cast(func.substring(ServiceRole.name, 8), TEXT)) q = q.having(sr_agg.contains(array(roles_list))) if seller_types_list is not None and 'recruiter' in seller_types_list: q = q.filter(Supplier.is_recruiter == 'true') seller_types_list.remove('recruiter') if len(seller_types_list) == 0: seller_types_list = None if seller_types_list is not None: selected_seller_types = select( [postgres.array_agg(column('key'))], from_obj=func.json_each_text(Supplier.data[('seller_type',)]), whereclause=cast(column('value'), Boolean) ).as_scalar() q = q.filter(selected_seller_types.contains(array(seller_types_list))) if sort_by: if sort_by == 'latest': ob = [desc(Supplier.last_update_time)] else: ob = [asc(Supplier.name)] else: if sort_dir == 'desc': ob = [desc(Supplier.name)] else: ob = [asc(Supplier.name)] if search_term: ob = [desc(func.ts_rank_cd(Supplier.text_vector, tsquery))] + ob q = q.filter(Supplier.text_vector.op('@@')(tsquery)) q = q.order_by(*ob) raw_results = list(q) results = [] for x in range(len(raw_results)): if type(raw_results[x]) is Supplier: result = raw_results[x] else: result = raw_results[x][0] if raw_results[x][1] is not None and raw_results[x][1] != '': result.summary = raw_results[x][1] results.append(result) sliced_results = results[offset:(offset + result_count)] q = db.session.query(Supplier.code, Supplier.name, Supplier.summary, Supplier.is_recruiter, Supplier.data, Domain.name.label('domain_name'), SupplierDomain.status.label('domain_status'))\ .outerjoin(SupplierDomain, Domain)\ .filter(Supplier.id.in_([sr.id for sr in sliced_results]))\ .order_by(Supplier.name) suppliers = [r._asdict() for r in q] sliced_results = [] for key, group in groupby(suppliers, key=itemgetter('code')): supplier = group.next() supplier['seller_type'] = supplier.get('data') and supplier['data'].get('seller_type') supplier['domains'] = {'assessed': [], 'unassessed': []} for s in chain([supplier], group): domain, status = s['domain_name'], s['domain_status'] if domain: if status == 'assessed': supplier['domains']['assessed'].append(domain) else: supplier['domains']['unassessed'].append(domain) for e in ['domain_name', 'domain_status', 'data']: supplier.pop(e, None) sliced_results.append(supplier) return sliced_results, len(results)
def casestudies_search(): search_query = get_json_from_request() offset = get_nonnegative_int_or_400(request.args, 'from', 0) result_count = get_positive_int_or_400(request.args, 'size', current_app.config['DM_API_SUPPLIERS_PAGE_SIZE']) sort_dir = search_query.get('sort_dir', 'asc') sort_by = search_query.get('sort_by', None) domains = search_query.get('domains', None) seller_types = search_query.get('seller_types', None) search_term = search_query.get('search_term', None) framework_slug = request.args.get('framework', 'digital-marketplace') q = db.session.query(CaseStudy).join(Supplier).outerjoin(SupplierDomain).outerjoin(Domain) \ .outerjoin(SupplierFramework).outerjoin(Framework) q = q.filter(Supplier.status != 'deleted', or_(Framework.slug == framework_slug, ~Supplier.frameworks.any())) tsquery = None if search_term: if ' ' in search_term: tsquery = func.plainto_tsquery(search_term) else: tsquery = func.to_tsquery(search_term + ":*") q = q.add_column(func.ts_headline( 'english', func.concat( CaseStudy.data['approach'].astext, ' ', CaseStudy.data['role'].astext), tsquery, 'MaxWords=150, MinWords=75, ShortWord=3, HighlightAll=FALSE, FragmentDelimiter=" ... " ' )) else: q = q.add_column("''") q = q.add_column(Supplier.name) q = q.add_column(postgres.array_agg(Supplier.data)) q = q.group_by(CaseStudy.id, Supplier.name) if domains: d_agg = postgres.array_agg(cast(Domain.name, TEXT)) q = q.having(d_agg.contains(array(domains))) if seller_types: selected_seller_types = select( [postgres.array_agg(column('key'))], from_obj=func.json_each_text(Supplier.data[('seller_type',)]), whereclause=cast(column('value'), Boolean) ).as_scalar() q = q.filter(selected_seller_types.contains(array(seller_types))) if sort_dir in ('desc', 'z-a'): ob = [desc(CaseStudy.data['title'].astext)] else: ob = [asc(CaseStudy.data['title'].astext)] if search_term: ob = [desc(func.ts_rank_cd(func.to_tsvector( func.concat(Supplier.name, CaseStudy.data['title'].astext, CaseStudy.data['approach'].astext)), tsquery))] + ob condition = func.to_tsvector(func.concat(Supplier.name, CaseStudy.data['title'].astext, CaseStudy.data['approach'].astext)).op('@@')(tsquery) q = q.filter(condition) q = q.order_by(*ob) raw_results = list(q) results = [] for x in range(len(raw_results)): result = raw_results[x][0].serialize() if raw_results[x][1] is not None and raw_results[x][1] != '': result['approach'] = raw_results[x][1] if raw_results[x][2] is not None: result['supplierName'] = raw_results[x][2] if raw_results[x][3] is not None and raw_results[x][3][0] is not None: result['seller_type'] = raw_results[x][3][0].get('seller_type') results.append(result) total_results = len(results) sliced_results = results[offset:(offset + result_count)] result = { 'hits': { 'total': total_results, 'hits': [{'_source': r} for r in sliced_results] } } try: response = jsonify(result), 200 except Exception as e: response = jsonify(message=str(e)), 500 return response