def _build_tsv(A, B=(), C=(), D=()):
    """Given lists of text fragments for weights A-D, build a weighted tsvector."""
    def _weighted(bits, weight):
        return func.setweight(
            func.to_tsvector(
                REGCONFIG,
                _join_with_space([func.coalesce(bit, "") for bit in bits])),
            weight)

    # Weight A is mandatory; B-D are concatenated on only when supplied.
    # (Tuple defaults replace the original mutable-list defaults.)
    tsv = _weighted(A, "A")
    for bits, weight in ((B, "B"), (C, "C"), (D, "D")):
        if bits:
            tsv = tsv.concat(_weighted(bits, weight))
    return tsv
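# A minimal usage sketch for _build_tsv, under assumptions: a hypothetical
# Article model plus Base, session, and the REGCONFIG/_join_with_space helpers
# from the snippet above.
from sqlalchemy import Column, Integer, Text

class Article(Base):
    __tablename__ = "articles"
    id = Column(Integer, primary_key=True)
    title = Column(Text, nullable=False)
    summary = Column(Text)

# Title terms carry weight 'A', summary terms weight 'B', so ts_rank scores
# title hits higher than summary hits for the same query.
article_tsv = _build_tsv(A=[Article.title], B=[Article.summary])
matches = session.query(Article).filter(
    article_tsv.op("@@")(func.plainto_tsquery(REGCONFIG, "red bicycle")))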
# __table_args__ must be a declared_attr (or a plain class attribute) for the
# declarative mapper to pick it up; a bare instance method is never called.
@declared_attr
def __table_args__(cls):
    return (
        db.Index('idx_tsv_title',
                 func.to_tsvector("english", cls.title),
                 postgresql_using="gin"),
        db.Index('idx_tsv_title_description',
                 func.to_tsvector(
                     "english",
                     cls.title + ' ' + coalesce(cls.description, '')),
                 postgresql_using="gin"),
    )
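# Note on the indexes above: Postgres only uses an expression index when the
# query repeats the indexed expression verbatim. A hedged sketch (the model
# name Page is hypothetical; db is the Flask-SQLAlchemy handle):
hits = db.session.query(Page).filter(
    func.to_tsvector(
        "english", Page.title + ' ' + coalesce(Page.description, ''))
    .op("@@")(func.plainto_tsquery("english", "wind turbines"))).all()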
def build_query(params, full=False):
    columns = [users.c.id, users.c.name, users.c.email_address] if full \
        else [users.c.id]
    query = select(columns)
    query = query.order_by(users.c.id)
    if 'email' in params:
        query = query.where(users.c.email_address == params['email'])
    if 'name' in params:
        # Prefix-match every whitespace-separated word: "ali smi" -> "ali:* & smi:*"
        search_string = " & ".join(
            "{}:*".format(word) for word in params['name'].split(' ')
        )
        name_vector = func.to_tsvector(users.c.name)
        query = query.where(name_vector.match(search_string))
        # order_by() appends, and the unique id ordering above would make any
        # later rank ordering a no-op, so clear it and rank best matches first.
        query = query.order_by(None).order_by(
            desc(func.ts_rank(name_vector, func.to_tsquery(search_string))),
            users.c.id,
        )
    # if 'page_size' in params:
    #     query = query.limit(params['page_size'])
    return query
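# Hedged usage sketch, assuming the Core `users` table and an `engine` exist
# as in the snippet above. For params={'name': 'ali smi'} the search string
# becomes "ali:* & smi:*", i.e. a prefix match on every word.
stmt = build_query({"name": "ali smi"}, full=True)
with engine.connect() as conn:
    for row in conn.execute(stmt):
        print(row)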
class Goods(Base):
    __tablename__ = 'goods'

    id = Column(Integer, primary_key=True)
    height = Column(Integer, nullable=False)
    width = Column(Integer, nullable=False)
    depth = Column(Integer, nullable=False)
    weight = Column(Integer, nullable=False)
    description = Column(Text)
    invoice_num = Column(Integer,
                         ForeignKey('invoices.num',
                                    onupdate='restrict',
                                    ondelete='restrict'),
                         nullable=False)
    invoice = relationship("Invoice", backref="goods")

    __table_args__ = (
        Index('goods_descriptions_index',
              func.to_tsvector('english', description),
              postgresql_using='gin'),
        Index('height_index', height, postgresql_using='hash'),
        Index('invoice_num_index', invoice_num, postgresql_using='hash'),
    )

    def __str__(self):
        return f"Goods [id={self.id}, height={self.height}, width={self.width}, depth={self.depth}, " \
               f"weight={self.weight}, description={self.description}, invoice_num={self.invoice_num}]"
class FoodDetail(db.Model, ReturnHelper):
    """Daily Bites food_detail table - provides details for food recorded."""
    __tablename__ = FOOD_DETAIL_TABLE

    food_id = db.Column(db.String(36),
                        db.ForeignKey("nut_per_100_gram.food_id"),
                        primary_key=True)
    food_desc = db.Column(db.String(256))
    barcode = db.Column(db.String(50))
    brand = db.Column(db.String(256))

    __table_args__ = (
        db.Index("ix_food_detail_food_desc",
                 func.to_tsvector("english", food_desc),
                 postgresql_using="gin"),
    )

    min_fields = {"food_id"}
    search_result_fields = {*min_fields, "barcode", "brand"}
    all_fields = {*search_result_fields, "food_desc"}

    @classmethod
    def get_food_detail_by_id(cls, food_id):
        return cls.query.filter_by(food_id=food_id).first()
def search_foods(search_query):
    query = FoodDetail.query.filter(
        func.to_tsvector("english", FoodDetail.food_desc)
        .op("@@")(func.plainto_tsquery("english", search_query)))
    return [food.search_result_view() for food in query.all()]
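# The filter above compiles to roughly:
#   WHERE to_tsvector('english', food_detail.food_desc)
#         @@ plainto_tsquery('english', :search_query)
# plainto_tsquery AND-joins the words and strips tsquery operators, so raw
# user input can be passed through unescaped. Hedged call sketch:
matching = search_foods("organic peanut butter")  # all three terms must match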
def tokenize():
    recreate_database()
    s = Session()
    all_reviews = s.query(Reviews).all()
    for review in all_reviews:
        # Assigning the SQL expression stores it unevaluated; Postgres
        # computes the tsvector server-side when the UPDATE is flushed.
        review.tokens = func.to_tsvector('english', review.review)
    s.commit()
def add_simple_text_search(query, text_columns, keywords, include_rank=True):
    # `reduce` comes from functools on Python 3.
    rank = None
    keywords_j = ' & '.join(keywords)
    fts_config = 'simple'
    filters = [
        func.to_tsvector(fts_config, column).match(keywords_j)
        for column in text_columns
    ]
    # Renamed from `filter` to avoid shadowing the builtin.
    text_filter = or_(*filters) if len(filters) > 1 else filters[0]
    query = query.filter(text_filter)
    if include_rank:
        ranks = [
            func.ts_rank(func.to_tsvector(fts_config, column),
                         func.to_tsquery(fts_config, keywords_j))
            for column in text_columns
        ]
        rank = reduce(lambda a, b: a + b, ranks)
        query = query.add_column(rank.label('score'))
    return query, rank
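# Sketch of wiring add_simple_text_search into an ORM query; the Message
# model with subject/body Text columns is a hypothetical stand-in.
query = session.query(Message)
query, rank = add_simple_text_search(
    query, [Message.subject, Message.body], ["release", "notes"])
for message, score in query.order_by(rank.desc()).limit(10):
    print(score, message.subject)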
def upsertReponseContent(self, job, response):
    while True:
        try:
            # If we have already fetched the page, push what we have back
            # into the history table.
            last = None
            if job.content:
                last = self.pushBackHistory(job, response)

            job.title = response['title']
            job.content = response['contents']
            job.mimetype = response['mimeType']

            # Update the tsv_content column if we have data for it.
            if response['contents']:
                job.tsv_content = func.to_tsvector(
                    func.coalesce(response['contents']))

            job.is_text = "text" in job.mimetype
            job.state = 'complete'

            # Disabled for space-reasons.
            # if 'rawcontent' in response:
            #     job.raw_content = response['rawcontent']

            job.fetchtime = datetime.datetime.now()
            self.db.get_session().commit()
            self.log.info("Marked plain job with id %s, url %s as complete!",
                          job.id, job.url)
            break
        except (sqlalchemy.exc.OperationalError,
                sqlalchemy.exc.InvalidRequestError):
            self.db.get_session().rollback()
def score_result(self, searchterm=None):
    """
    Technical debt here: the identifiers in the indexed document are
    hard-coded into the scoring loop.
    https://github.com/Chemical-Curation/resolver/pull/16#discussion_r536216962
    """
    matches = self.get_matches(searchterm)
    max_score = 0
    if matches:
        max_key = max(matches, key=matches.get)
        max_score = matches[max_key]
    return max_score


ix_identifiers = db.Index(
    "ix_identifiers",
    Substance.identifiers,
    text("(identifiers->'values') jsonb_path_ops"),
    postgresql_using="gin",
)

ix_identifiers_tsv = db.Index(
    "ix_identifiers_tsv",
    Substance.identifiers,
    func.to_tsvector("english", Substance.identifiers),
    postgresql_using="gin",
)

if __name__ == "__main__":
    ix_identifiers.create(bind=db.engine)
    ix_identifiers_tsv.create(bind=db.engine)
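# Hedged query sketch for the ix_identifiers_tsv index above: a full-text
# match against the jsonb identifiers document (db.session assumed to be the
# Flask-SQLAlchemy session).
hits = db.session.query(Substance).filter(
    func.to_tsvector("english", Substance.identifiers)
    .op("@@")(func.plainto_tsquery("english", "aspirin"))).all()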
def search_mv_foods(search_query):
    query = TotalNutritionView.query.filter(
        func.to_tsvector("english", TotalNutritionView.food_desc)
        .op("@@")(func.plainto_tsquery("english", search_query)))
    return [food.search_result_view() for food in query.all()]
async def scrape_post(post_id, feed_id, loop, page_number):
    # Connect to database
    session = models.Session()

    # Get current UTC time in seconds
    now = int(datetime.utcnow().strftime('%s'))

    # Get HTML tree from post's webpage, specifying page number if given
    if page_number:
        post_html = requests.get('https://news.ycombinator.com/item?id=' +
                                 str(post_id) + '&p=' + str(page_number))
    else:
        post_html = requests.get('https://news.ycombinator.com/item?id=' +
                                 str(post_id))
    post_content = post_html.content
    post_soup = BeautifulSoup(post_content, 'html.parser')

    # If post page contains a "More" link to more comments, create an
    # asynchronous task to scrape that page for its comments
    if post_soup.find('a', 'morelink'):
        page_number = post_soup.find('a', 'morelink').get('href').split('&p=')[1]
        loop.create_task(scrape_post(post_id, feed_id, loop, page_number))

    # Get all comment rows from HTML tree
    comment_rows = post_soup.select('tr.athing.comtr')

    # Set starting comment feed rank to 0
    comment_feed_rank = 0

    for comment_row in comment_rows:
        # Get comment id
        comment_id = comment_row.get('id')

        # Check if comment exists in database
        comment_exists = session.query(
            models.Comment.id).filter_by(id=comment_id).scalar()

        # Get core comment data if it is not in database already
        if not comment_exists:
            # If comment has content span, get text from span
            if comment_row.find('div', 'comment').find_all('span'):
                comment_content = comment_row.find(
                    'div', 'comment').find_all('span')[0].get_text()

                # Remove the last word ('reply') from the comment content
                # and strip trailing whitespace
                comment_content = comment_content.rsplit(' ', 1)[0].strip()
                total_word_count = len(comment_content.split())

            # Otherwise, comment is flagged, so get flagged message as text
            # and strip trailing whitespace
            else:
                comment_content = comment_row.find(
                    'div', 'comment').get_text().strip()
                total_word_count = 0

            # Get UTC timestamp for comment's posting time by subtracting
            # the number of days/hours/minutes ago given on the webpage from
            # the current UTC timestamp
            comment_time_unit = comment_row.find(
                'span', 'age').a.get_text().split()[1]
            if 'day' in comment_time_unit:
                comment_created = now - 86400 * int(
                    comment_row.find('span', 'age').a.get_text().split()[0])
            elif 'hour' in comment_time_unit:
                comment_created = now - 3600 * int(
                    comment_row.find('span', 'age').a.get_text().split()[0])
            else:
                comment_created = now - 60 * int(
                    comment_row.find('span', 'age').a.get_text().split()[0])
            comment_created = time.strftime('%Y-%m-%d %H:%M',
                                            time.localtime(comment_created))

            # Get comment's level in tree by getting indentation width
            # value divided by value of one indent (40px)
            level = int(
                comment_row.find('td', 'ind').contents[0].get('width')) / 40

            # Set parent comment as blank if comment is the top-level comment
            if level == 0:
                parent_comment = None

            # Otherwise, get preceding comment in comment tree
            else:
                parent_comment = session.query(models.Comment).with_entities(
                    models.Comment.id).join(models.FeedComment).filter(
                        models.Comment.level == (level - 1)).filter(
                            models.FeedComment.feed_id == feed_id).filter(
                                models.Comment.post_id == post_id).order_by(
                                    models.FeedComment.feed_rank).limit(
                                        1).one()[0]

            # Get username of user who posted comment
            try:
                comment_username = comment_row.find('a', 'hnuser').get_text()
            except AttributeError:
                comment_username = ''

            # Add scraped comment data to database ('simple_english' is the
            # project's custom text search configuration)
            comment = models.Comment(
                content=comment_content,
                created=comment_created,
                id=comment_id,
                level=level,
                parent_comment=parent_comment,
                post_id=post_id,
                total_word_count=total_word_count,
                username=comment_username,
                word_counts=func.to_tsvector('simple_english',
                                             comment_content.lower()))
            session.add(comment)

        # Increment comment feed rank to get current comment's rank
        comment_feed_rank += 1

        # Add feed-based comment data to database
        feed_comment = models.FeedComment(comment_id=comment_id,
                                          feed_id=feed_id,
                                          feed_rank=comment_feed_rank)
        session.add(feed_comment)
        session.commit()

    # Print message if there are no more pages of comments to scrape
    if not post_soup.find('a', 'morelink'):
        print('Post ' + str(post_id) + ' and its comments scraped')

    return
def casestudies_search():
    search_query = get_json_from_request()
    offset = get_nonnegative_int_or_400(request.args, 'from', 0)
    result_count = get_positive_int_or_400(
        request.args, 'size', current_app.config['DM_API_SUPPLIERS_PAGE_SIZE'])
    sort_dir = search_query.get('sort_dir', 'asc')
    sort_by = search_query.get('sort_by', None)
    domains = search_query.get('domains', None)
    seller_types = search_query.get('seller_types', None)
    search_term = search_query.get('search_term', None)
    framework_slug = request.args.get('framework', 'digital-marketplace')

    q = db.session.query(CaseStudy).join(Supplier).outerjoin(SupplierDomain) \
        .outerjoin(Domain).outerjoin(SupplierFramework).outerjoin(Framework)
    q = q.filter(
        Supplier.status != 'deleted',
        or_(Framework.slug == framework_slug, ~Supplier.frameworks.any()))

    tsquery = None
    if search_term:
        if ' ' in search_term:
            tsquery = func.plainto_tsquery(search_term)
        else:
            tsquery = func.to_tsquery(search_term + ":*")
        q = q.add_column(
            func.ts_headline(
                'english',
                func.concat(CaseStudy.data['approach'].astext, ' ',
                            CaseStudy.data['role'].astext),
                tsquery,
                'MaxWords=150, MinWords=75, ShortWord=3, HighlightAll=FALSE, '
                'FragmentDelimiter=" ... " '))
    else:
        q = q.add_column("''")

    q = q.add_column(Supplier.name)
    q = q.add_column(postgres.array_agg(Supplier.data))
    q = q.group_by(CaseStudy.id, Supplier.name)

    if domains:
        d_agg = postgres.array_agg(cast(Domain.name, TEXT))
        q = q.having(d_agg.contains(array(domains)))

    if seller_types:
        selected_seller_types = select(
            [postgres.array_agg(column('key'))],
            from_obj=func.json_each_text(Supplier.data[('seller_type',)]),
            whereclause=cast(column('value'), Boolean)).as_scalar()
        q = q.filter(selected_seller_types.contains(array(seller_types)))

    if sort_dir in ('desc', 'z-a'):
        ob = [desc(CaseStudy.data['title'].astext)]
    else:
        ob = [asc(CaseStudy.data['title'].astext)]

    if search_term:
        ob = [
            desc(
                func.ts_rank_cd(
                    func.to_tsvector(
                        func.concat(Supplier.name,
                                    CaseStudy.data['title'].astext,
                                    CaseStudy.data['approach'].astext)),
                    tsquery))
        ] + ob
        condition = func.to_tsvector(
            func.concat(Supplier.name, CaseStudy.data['title'].astext,
                        CaseStudy.data['approach'].astext)).op('@@')(tsquery)
        q = q.filter(condition)

    q = q.order_by(*ob)
    raw_results = list(q)

    results = []
    for row in raw_results:
        result = row[0].serialize()
        if row[1] is not None and row[1] != '':
            result['approach'] = row[1]
        if row[2] is not None:
            result['supplierName'] = row[2]
        if row[3] is not None and row[3][0] is not None:
            result['seller_type'] = row[3][0].get('seller_type')
        results.append(result)

    total_results = len(results)
    sliced_results = results[offset:(offset + result_count)]
    result = {
        'hits': {
            'total': total_results,
            'hits': [{'_source': r} for r in sliced_results]
        }
    }

    try:
        response = jsonify(result), 200
    except Exception as e:
        response = jsonify(message=str(e)), 500
    return response
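# Hedged illustration of the two tsquery branches above:
#   plainto_tsquery('web design')  -> 'web' & 'design'  (multi-word input)
#   to_tsquery('web' + ':*')       -> 'web:*'           (single word, prefix
#                                      match: web, website, webinar, ...)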
def create_tsvector(*args):
    exp = args[0]
    for e in args[1:]:
        exp += ' ' + e
    return func.to_tsvector('english', exp)
def to_tsvector(*args):
    exp = args[0]
    for i in args[1:]:
        exp += ' ' + i
    return func.to_tsvector('english', exp)
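# Either helper above typically backs a GIN index plus a matching filter
# expression. Hedged sketch with a hypothetical Book model (Base, Column,
# Index, cast, TEXT, and func from sqlalchemy are assumed imported):
class Book(Base):
    __tablename__ = "books"
    id = Column(Integer, primary_key=True)
    title = Column(Text)
    blurb = Column(Text)

    # coalesce each column so a single NULL can't blank the whole vector
    __ts_vector__ = create_tsvector(
        cast(func.coalesce(title, ''), TEXT),
        cast(func.coalesce(blurb, ''), TEXT),
    )
    __table_args__ = (
        Index('ix_book_fts', __ts_vector__, postgresql_using='gin'),
    )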
def initialize_test_database(postgresql, mock_get):
    db_port = postgresql.dsn()['port']
    db_host = postgresql.dsn()['host']
    db_user = postgresql.dsn()['user']
    database = postgresql.dsn()['database']
    os.environ['DB_NAME'] = database
    os.environ['DB_CONNECTION'] = ('postgresql://' + db_user + '@' + db_host +
                                   ':' + str(db_port) + '/' + database)
    models.engine = create_engine(os.environ['DB_CONNECTION'])
    models.Session = sessionmaker(bind=models.engine)

    # Create custom text dictionary and database tables
    management.initialize_database()
    alembicArgs = ['--raiseerr', 'upgrade', 'head']
    alembic.config.main(argv=alembicArgs)

    # Run fake Hacker News scrape to get sample feed, post, and comment from
    # past hour
    hacker_news.scrape_loop()

    # Connect to database
    session = models.Session()

    # Add sample feed to database for remaining time periods ('day', 'week',
    # 'all')
    past_day_feed = models.Feed(id=2, created=(datetime.combine(
        date.today(), datetime.min.time())).isoformat())
    session.add(past_day_feed)

    past_week_feed = models.Feed(id=3, created=(datetime.combine(
        date.today() - timedelta(days=5), datetime.min.time())).isoformat())
    session.add(past_week_feed)

    all_feed = models.Feed(id=4, created=(datetime.combine(
        date.today() - timedelta(weeks=2), datetime.min.time())).isoformat())
    session.add(all_feed)

    # Add sample post to database for remaining time periods ('day', 'week',
    # 'all')
    posts_file = 'fixtures/test-post.json'
    with open(posts_file, 'r') as post_data:
        post = json.load(post_data)

    past_day_post = models.Post(
        created=(datetime.combine(date.today() - timedelta(hours=20),
                                  datetime.min.time())).isoformat(),
        id=4, link=post['link'], title=post['title'], type='job',
        username=post['username'], website=post['website'])
    session.add(past_day_post)

    past_week_post = models.Post(
        created=(datetime.combine(date.today() - timedelta(days=5),
                                  datetime.min.time())).isoformat(),
        id=5, link=post['link'], title=post['title'], type='ask',
        username=post['username'], website=post['website'])
    session.add(past_week_post)

    all_post = models.Post(
        created=(datetime.combine(date.today() - timedelta(weeks=2),
                                  datetime.min.time())).isoformat(),
        id=6, link=post['link'], title=post['title'], type='show',
        username=post['username'], website=post['website'])
    session.add(all_post)

    # Add sample feed_post data to database for each feed and post
    past_day_feed_post = models.FeedPost(feed_id=past_day_feed.id,
                                         feed_rank=3, point_count=3,
                                         post_id=past_day_post.id,
                                         comment_count=3)
    session.add(past_day_feed_post)

    past_week_feed_post = models.FeedPost(feed_id=past_week_feed.id,
                                          feed_rank=5, point_count=5,
                                          post_id=past_week_post.id,
                                          comment_count=5)
    session.add(past_week_feed_post)

    all_feed_post = models.FeedPost(feed_id=all_feed.id, feed_rank=7,
                                    point_count=7, post_id=all_post.id,
                                    comment_count=7)
    session.add(all_feed_post)

    # Add sample comment to database for remaining time periods ('day',
    # 'week', 'all')
    comment_file = 'fixtures/test-comment.json'
    with open(comment_file, 'r') as comment_data:
        comment = json.load(comment_data)

    past_day_comment = models.Comment(
        content=comment['content'],
        created=(datetime.combine(date.today() - timedelta(hours=20),
                                  datetime.min.time())).isoformat(),
        id=6, level=0, parent_comment=None, post_id=past_day_post.id,
        total_word_count=len(comment['content'].split()),
        username=comment['username'],
        word_counts=func.to_tsvector('simple_english',
                                     comment['content'].lower()))
    session.add(past_day_comment)

    past_week_comment = models.Comment(
        content=comment['content'],
        created=(datetime.combine(date.today() - timedelta(days=5),
                                  datetime.min.time())).isoformat(),
        id=7, level=0, parent_comment=None, post_id=past_week_post.id,
        total_word_count=len(comment['content'].split()),
        username=comment['username'],
        word_counts=func.to_tsvector('simple_english',
                                     comment['content'].lower()))
    session.add(past_week_comment)

    all_comment = models.Comment(
        content=comment['content'],
        created=(datetime.combine(date.today() - timedelta(weeks=2),
                                  datetime.min.time())).isoformat(),
        id=8, level=0, parent_comment=None, post_id=all_post.id,
        total_word_count=len(comment['content'].split()),
        username=comment['username'],
        word_counts=func.to_tsvector('simple_english',
                                     comment['content'].lower()))
    session.add(all_comment)

    # Add sample feed_comment data to database for each feed and comment
    past_day_feed_comment = models.FeedComment(comment_id=past_day_comment.id,
                                               feed_id=past_day_feed.id,
                                               feed_rank=3)
    session.add(past_day_feed_comment)

    past_week_feed_comment = models.FeedComment(
        comment_id=past_week_comment.id, feed_id=past_week_feed.id,
        feed_rank=5)
    session.add(past_week_feed_comment)

    all_feed_comment = models.FeedComment(comment_id=all_comment.id,
                                          feed_id=all_feed.id, feed_rank=7)
    session.add(all_feed_comment)

    session.commit()
    session.close()
class ItemModel(db.Model):
    __abstract__ = True

    id = db.Column('id', db.Text, nullable=False, primary_key=True)
    title = db.Column('title', db.Text, nullable=False, index=True)
    abstract = db.Column('abstract', db.Text, index=True)
    type = db.Column('type', db.Text, index=True)
    spatial_data_service_type = db.Column(
        'spatial_data_service_type', db.Text, index=True)
    spatial_data_service_version = db.Column(
        'spatial_data_service_version', db.Text, index=True)
    spatial_data_service_operations = db.Column(
        'spatial_data_service_operations', ARRAY(db.Text), index=True)
    spatial_data_service_queryables = db.Column(
        'spatial_data_service_queryables', ARRAY(db.Text), index=True)
    format = db.Column('format', db.Text, index=True)
    keywords = db.Column('keywords', ARRAY(JSONB))
    publisher_name = db.Column('publisher_name', db.Text, index=True)
    publisher_email = db.Column('publisher_email', db.Text, index=True)
    publisher_id = db.Column('publisher_id', db.Text, index=True)
    language = db.Column('language', db.Text, index=True)
    date_start = db.Column('date_start', db.Date, index=True)
    date_end = db.Column('date_end', db.Date, index=True)
    creation_date = db.Column('creation_date', db.Date, index=True)
    publication_date = db.Column('publication_date', db.Date, index=True)
    revision_date = db.Column('revision_date', db.Date, index=True)
    geographic_location = db.Column(
        'geographic_location', Geometry(geometry_type='POLYGON'), index=True)
    resource_locator = db.Column('resource_locator', db.Text, index=True)
    license = db.Column('license', db.Text, index=True)
    open_dataset = db.Column('open_dataset', db.Boolean, default=False)
    topic_category = db.Column('topic_category', ARRAY(db.Text), index=True)
    reference_system = db.Column('reference_system', db.Text, index=True)
    spatial_resolution = db.Column('spatial_resolution', db.Integer, index=True)
    scales = db.Column('scale', ARRAY(JSONB))
    version = db.Column('version', db.Text, index=True)
    conformity = db.Column('conformity', db.Text, index=True)
    additional_resources = db.Column('additional_resources', ARRAY(JSONB))
    public_access_limitations = db.Column(
        'public_access_limitations', db.Text, index=True)
    metadata_language = db.Column('metadata_language', db.Text, index=True)
    metadata_point_of_contact_name = db.Column(
        'metadata_point_of_contact_name', db.Text, index=True)
    metadata_point_of_contact_email = db.Column(
        'metadata_point_of_contact_email', db.Text, index=True)
    metadata_date = db.Column('metadata_date', db.Date, index=True)
    metadata_version = db.Column('metadata_version', db.Text, index=True)
    resources = db.Column('resources', ARRAY(JSONB))
    lineage = db.Column('lineage', db.Text, index=True)
    parent_id = db.Column('parent_id', db.Text, index=True)
    parent_data_source_id = db.Column('parent_data_source_id', db.Text,
                                      index=True)
    item_geojson = db.Column('item_geojson', JSONB)
    contract_template_id = db.Column('contract_template_id', db.Integer,
                                     index=True)
    contract_template_version = db.Column('contract_template_version',
                                          db.Text, index=True)
    pricing_models = db.Column('pricing_models', JSONB)
    statistics = db.Column('statistics', JSONB)
    delivery_method = db.Column('delivery_method', db.Text, index=True)
    responsible_party = db.Column('responsible_party', JSONB)
    automated_metadata = db.Column('automated_metadata', ARRAY(JSONB))
    harvested_from = db.Column('harvested_from', db.Text, index=True)
    harvest_json = db.Column('harvest_json', JSONB)
    use_only_for_vas = db.Column('use_only_for_vas', db.Boolean, default=False)
    vetting_required = db.Column('vetting_required', db.Boolean, default=False)
    ingestion_info = db.Column('ingestion_info', ARRAY(JSONB))
    created_at = db.Column('created_at', db.Date, index=True)
    submitted_at = db.Column('submitted_at', db.Date, index=True)
    accepted_at = db.Column('accepted_at', db.Date, index=True)
    visibility = db.Column('visibility', ARRAY(db.Text), index=True)
    suitable_for = db.Column('suitable_for', ARRAY(db.Text), index=True)

    # to_tsvector accepts a json(b) document directly (PostgreSQL 10+)
    ts_vector = func.to_tsvector('english', item_geojson)

    def update(self, id, data):
        properties = data.get('properties')
        if data.get('geometry') is not None:
            geom = data['geometry']
            self.geographic_location = shape(geom).wkt
        for key in properties:
            if properties[key] is not None:
                setattr(self, key, properties[key])
        self.id = id
        self.item_geojson = self.serialize()
        return

    # convert to geojson format
    def serialize(self):
        # build properties object
        p = {}
        geom = None
        for c in inspect(self).attrs.keys():
            attr = getattr(self, c)
            if c == 'geographic_location' and attr is not None:
                # Convert to a shapely Polygon object to get mapping
                if isinstance(attr, str):
                    g = shapely.wkt.loads(attr)
                else:
                    g = to_shape(attr)
                geom = mapping(g)
            elif isinstance(attr, datetime.date):
                p[c] = attr.isoformat()
            elif c == 'id' or c == 'item_geojson':
                continue
            else:
                p[c] = attr

        # build geojson
        item_geojson = {
            "id": self.id,
            "type": "Feature",
            "geometry": geom,
            "properties": p
        }
        return item_geojson
def to_tsvector_ix(*columns):
    # Column names are joined as raw SQL: "col1 || ' ' || col2 || ..."
    s = " || ' ' || ".join(columns)
    return func.to_tsvector('english', text(s))
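# Hedged usage sketch: because to_tsvector_ix interpolates raw column names
# via text(), it suits table-level DDL. Note that SQL `||` yields NULL if any
# operand is NULL, so wrap nullable columns in coalesce(...) in the name list.
Index(
    'ix_note_fts',
    to_tsvector_ix("coalesce(title, '')", "coalesce(body, '')"),
    postgresql_using='gin',
)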
def add_text_search(query, join_columns, keywords, locales, include_rank=True,
                    lse=None):
    from assembl.models.langstrings import LangStringEntry
    rank = None
    keywords_j = ' & '.join(keywords)
    lse = lse or aliased(LangStringEntry)
    join_conds = [
        lse.langstring_id == join_column for join_column in join_columns
    ]
    if len(join_conds) > 1:
        join_cond = or_(*join_conds)
    else:
        join_cond = join_conds[0]
    query = query.join(lse, join_cond)
    if locales:
        active_text_indices = get('active_text_indices', 'en')
        locales_by_config = defaultdict(list)
        any_locale = 'any' in locales
        for locale in locales:
            fts_config = postgres_language_configurations.get(locale, 'simple')
            if fts_config not in active_text_indices:
                fts_config = 'simple'
            locales_by_config[fts_config].append(locale)
        conds = {}
        # TODO: to_tsquery vs plainto_tsquery vs phraseto_tsquery
        for fts_config, locales in locales_by_config.items():
            conds[fts_config] = (
                or_(*[((lse.locale == locale) | lse.locale.like(locale + "_%"))
                      for locale in locales])
                if 'any' not in locales else None,
                func.to_tsvector(fts_config, lse.value))
        # Renamed from `filter` to avoid shadowing the builtin.
        filters = [
            cond & v.match(keywords_j, postgresql_regconfig=conf)
            for (conf, (cond, v)) in conds.items() if cond is not None
        ]
        if any_locale:
            (_, v) = conds['simple']
            filters.append(v.match(keywords_j, postgresql_regconfig='simple'))
        query = query.filter(or_(*filters))
        if include_rank:
            if len(conds) > 1:
                if any_locale:
                    (_, v) = conds['simple']
                    else_case = func.ts_rank(
                        v, func.to_tsquery('simple', keywords_j))
                else:
                    else_case = 0
                rank = case(
                    [(cond, func.ts_rank(v, func.to_tsquery(conf, keywords_j)))
                     for (conf, (cond, v)) in conds.items()
                     if cond is not None],
                    else_=else_case).label('score')
            else:
                (conf, (cond, v)) = next(iter(conds.items()))
                rank = func.ts_rank(
                    v, func.to_tsquery(conf, keywords_j)).label('score')
            query = query.add_column(rank)
    else:
        fts_config = 'simple'
        query = query.filter(
            func.to_tsvector(fts_config, lse.value).match(
                keywords_j, postgresql_regconfig=fts_config))
        if include_rank:
            rank = func.ts_rank(
                func.to_tsvector(fts_config, lse.value),
                func.to_tsquery(fts_config, keywords_j)).label('score')
            query = query.add_column(rank)
    return query, rank
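# Hedged call sketch against the assembl models referenced above; Idea and
# its title_id langstring foreign key are hypothetical stand-ins.
query = session.query(Idea)
query, rank = add_text_search(
    query, (Idea.title_id,), ["democracy"], ["en", "fr"])
top_hits = query.order_by(rank.desc()).limit(20).all()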