def search(): with ScopedSession() as session: q = flask.request.args.get('query') query = session.query(Comment, User, Post, func.ts_headline('russian', Comment.text, func.plainto_tsquery('russian', q), 'HighlightAll=true').label('highlighted')).filter(Comment.user_id == User.user_id).filter(Comment.post_id == Post.post_id) query = query.filter(func.to_tsvector('russian', Comment.text).op('@@')(func.plainto_tsquery('russian', q))) comments = [] for comment, user, post, highlighted in query.order_by(func.ts_rank_cd(func.to_tsvector('russian', Comment.text), func.plainto_tsquery('russian', q)).desc(), Comment.posted.desc()).limit(100).all(): comments.append({ "id": comment.comment_id, "parent_id": comment.parent_id, "post_id": comment.post_id, "text": normalize_text(highlighted), "posted": comment.posted.strftime(DATE_FORMAT), "user_id": user.user_id, "user_name": user.name, "user_avatar": user.avatar_hash, "comment_list_id": post.comment_list_id }) resp = app.make_response(json.dumps(comments, ensure_ascii=False)) resp.mimetype = 'application/json; charset=utf-8' resp.headers['Access-Control-Allow-Origin'] = '*' return resp
def fetch_content(query_text, column, text_column, page, sources=None): session = g.session tsq = build_tsquery(query_text) search = None if column == db.WebPages.title: query = session \ .query(db.WebPages, func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq))) \ .filter( \ func.to_tsvector("english", column).match(tsq, postgresql_regconfig='english') \ ) \ .order_by(func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq)).desc()) elif column == db.WebPages.tsv_content: query = session \ .query(db.WebPages, func.ts_rank_cd(column, func.to_tsquery(tsq))) \ .filter( column.match(tsq) ) if "'" in query_text or '"' in query_text: search = query_text.replace("!", " ").replace("?", " ").replace( "'", " ").replace('"', " ").replace(',', " ").replace('.', " ").strip() while " " in search: search = search.replace(" ", " ") search = search.strip() search = '%{}%'.format(search.lower()) query = query.filter(func.lower(text_column).like(search)) query = query.order_by( func.ts_rank_cd(column, func.to_tsquery(tsq)).desc()) if sources: query = query.filter(db.WebPages.netloc.in_(sources)) else: raise ValueError("Wat?") print(str(query.statement.compile(dialect=postgresql.dialect()))) print("param: '%s', '%s', '%s'" % (tsq, sources, search)) try: entries = paginate(query, page, per_page=50) except sqlalchemy.exc.ProgrammingError: traceback.print_exc() print("ProgrammingError - Rolling back!") g.session.rollback() raise except sqlalchemy.exc.InternalError: traceback.print_exc() print("InternalError - Rolling back!") g.session.rollback() raise except sqlalchemy.exc.OperationalError: traceback.print_exc() print("InternalError - Rolling back!") g.session.rollback() raise return entries
def fetch_content(query_text, column, text_column, page, sources=None): session = g.session tsq = build_tsquery(query_text) search = None if column == db.WebPages.title: query = session \ .query(db.WebPages, func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq))) \ .filter( \ func.to_tsvector("english", column).match(tsq, postgresql_regconfig='english') \ ) \ .order_by(func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq)).desc()) elif column == db.WebPages.tsv_content: query = session \ .query(db.WebPages, func.ts_rank_cd(column, func.to_tsquery(tsq))) \ .filter( column.match(tsq) ) if "'" in query_text or '"' in query_text: search = query_text.replace("!", " ").replace("?", " ").replace("'", " ").replace('"', " ").replace(',', " ").replace('.', " ").strip() while " " in search: search = search.replace(" ", " ") search = search.strip() search = '%{}%'.format(search.lower()) query = query.filter( func.lower(text_column).like(search) ) query = query.order_by(func.ts_rank_cd(column, func.to_tsquery(tsq)).desc()) if sources: query = query.filter(db.WebPages.netloc.in_(sources)) else: raise ValueError("Wat?") print(str(query.statement.compile(dialect=postgresql.dialect()))) print("param: '%s', '%s', '%s'" % (tsq, sources, search)) try: entries = paginate(query, page, per_page=50) except sqlalchemy.exc.ProgrammingError: traceback.print_exc() print("ProgrammingError - Rolling back!") g.session.rollback() raise except sqlalchemy.exc.InternalError: traceback.print_exc() print("InternalError - Rolling back!") g.session.rollback() raise except sqlalchemy.exc.OperationalError: traceback.print_exc() print("InternalError - Rolling back!") g.session.rollback() raise return entries
def fetch_content(query_text, column, page, sources=None): session = db.get_session() tsq = build_tsquery(query_text) if column == db.WebPages.title: query = session \ .query(db.WebPages, func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq))) \ .filter( \ func.to_tsvector("english", column).match(tsq, postgresql_regconfig='english') \ ) \ .order_by(func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq)).desc()) elif column == db.WebPages.tsv_content: query = session \ .query(db.WebPages, func.ts_rank_cd(column, func.to_tsquery(tsq))) \ .filter( column.match(tsq) ) \ .order_by(func.ts_rank_cd(column, func.to_tsquery(tsq)).desc()) if sources: query = query.filter(db.WebPages.netloc.in_(sources)) else: raise ValueError("Wat?") # print(str(query.statement.compile(dialect=postgresql.dialect()))) # print("param: '%s'" % tsq) try: entries = paginate(query, page, per_page=50) except sqlalchemy.exc.ProgrammingError: traceback.print_exc() print("ProgrammingError - Rolling back!") db.get_session().rollback() raise except sqlalchemy.exc.InternalError: traceback.print_exc() print("InternalError - Rolling back!") db.get_session().rollback() raise except sqlalchemy.exc.OperationalError: traceback.print_exc() print("InternalError - Rolling back!") db.get_session().rollback() raise return entries
def fetch_all_by_tsquery(self, tsquery, *expr, **kwargs): """ Returns all structures whose abstract matches the keywords given in the keyword string. Parameters ---------- tsquery : str A string containing the lexme that will be used for searching. The string will be automatically casted into a tsquery through the to_tsquery() function. weights : list, default=[0.1, 0.2, 0.4, 1.0] The weights specify how heavily to weigh each category of word, in the order [D-weight, C-weight, B-weight, A-weight]. normalization : int, default=32 Option that specifies whether and how a document's length should impact its rank. The integer option controls several behaviors, so it is a bit mask: you can specify one or more behaviors using | (for example, 2|4). - 0 ignores the document length - 1 divides the rank by 1 + the logarithm of the document length - 2 divides the rank by the document length - 4 divides the rank by the mean harmonic distance between extents (this is implemented only by ts_rank_cd) - 8 divides the rank by the number of unique words in document - 16 divides the rank by 1 + the logarithm of the number of unique words in document - 32 divides the rank by itself + 1 min_words : int, default=10 max_words : int, default=25 Returns ------- resultset : list List containing tuples in the form (Structure, snippet, rank). The snippet is the part of the abstract that matches the query. Ordered by rank. Notes ----- - http://www.postgresql.org/docs/current/interactive/datatype-textsearch.html Examples -------- >>> StructureAdaptor().fetch_all_by_tsquery('imatinib') [(<Structure(3FW1)>, u'[Imatinib] represents the first in a class', 0.54545500000000002), (<Structure(2GQG)>, u'[imatinib], an inhibitor of BCR-ABL. Although', 0.375), (<Structure(3CS9)>, u'[imatinib] against [imatinib]-resistant Bcr-Abl. Consistent', 0.33333299999999999), (<Structure(2HYY)>, u'[imatinib] as first-line therapy', 0.230769), (<Structure(3G0F)>, u'[imatinib] mesylate and sunitinib malate', 0.230769), (<Structure(3G0E)>, u'[imatinib] mesylate and sunitinib malate', 0.230769), (<Structure(1T46)>, u'[Imatinib] or Gleevec) demonstrates that', 0.090909100000000007)] """ weights = kwargs.get('weights', [0.1, 0.2, 0.4, 1.0]) normalization = kwargs.get('normalization', 32) min_words, max_words = kwargs.get('min_words', 10), kwargs.get('max_words', 25) if kwargs.get('plain'): tsquery = func.plainto_tsquery('english', tsquery) else: tsquery = func.to_tsquery('english', tsquery) # basic query construct query = self.query.join('XRefs').filter(and_(*expr)) query = query.join( citations, and_(citations.c.pubmed_id == cast(XRef.xref, Integer), XRef.source == 'PubMed')) # GIST index that is used for searching index = func.to_tsvector('english', citations.c.abstract).op('@@')(tsquery) query = query.filter(index).order_by("rank DESC") # calculated rank of the hit rank = func.ts_rank_cd( weights, func.to_tsvector('english', citations.c.abstract), tsquery, normalization).label('rank') query = query.add_column(rank) if kwargs.get('snippet'): # configuration of the preview snippet headline_conf = 'StartSel=[, StopSel=], MaxWords={0}, MinWords={1}'.format( max_words, min_words) # function to create a preview snippet of the matched area snippet = func.ts_headline('english', citations.c.abstract, tsquery, headline_conf).label('snippet') # add snippet column to query query = query.add_column(snippet) return query
def filter_field(self, field, op, value): """ Applies a filter on a field. Notes on 'ne' op: Example data: [None, 'john', 'roger'] ne:john would return only roger (i.e. nulls excluded) ne: would return john and roger Notes on 'search' op: For some reason, SQLAlchemy uses to_tsquery rather than plainto_tsquery for the match operator to_tsquery uses operators (&, |, ! etc.) while plainto_tsquery tokenises the input string and uses AND between tokens, hence plainto_tsquery is what we want here For other database back ends, the behaviour of the match operator is completely different - see: http://docs.sqlalchemy.org/en/rel_1_0/core/sqlelement.html :param field: field name :param op: 'eq', 'ne', 'gt', 'lt', 'ge', 'le' or 'search' :param value: comparison value, string or list/tuple :return: """ app = self.app odm = app.odm() query = self.sql_query if field.model: field_model = self.app.models.get(field.model) if not field_model: return db_model = field_model.db_model() field = getattr(db_model, field.name, None) if field is None: field = getattr(db_model, field_model.id_field, None) if not field: return if field_model.identifier not in self.joins: self.joins.add(field_model.identifier) query = query.join(db_model) elif isinstance(field.field, str): field = getattr(self.model.db_model(), field.name, None) else: field = None if not field: return multiple = isinstance(value, (list, tuple)) if value == '': value = None if multiple and op in ('eq', 'ne'): if op == 'eq': query = query.filter(field.in_(value)) elif op == 'ne': query = query.filter(~field.in_(value)) else: if multiple: assert len(value) > 0 value = value[0] if op == 'eq': query = query.filter(field == value) elif op == 'ne': query = query.filter(field != value) elif op == 'search': dialect_name = odm.binds[odm[self.name].__table__].dialect.name if dialect_name == 'postgresql': ts_config = field.info.get( 'text_search_config', app.config['DEFAULT_TEXT_SEARCH_CONFIG']) query = query.filter( func.to_tsvector(ts_config, cast(field, String)).op('@@')( func.plainto_tsquery(value))) else: query = query.filter(field.match(value)) elif op == 'gt': query = query.filter(field > value) elif op == 'ge': query = query.filter(field >= value) elif op == 'lt': query = query.filter(field < value) elif op == 'le': query = query.filter(field <= value) self.sql_query = query return self
def filter_field(self, field, op, value): """ Applies a filter on a field. Notes on 'ne' op: Example data: [None, 'john', 'roger'] ne:john would return only roger (i.e. nulls excluded) ne: would return john and roger Notes on 'search' op: For some reason, SQLAlchemy uses to_tsquery rather than plainto_tsquery for the match operator to_tsquery uses operators (&, |, ! etc.) while plainto_tsquery tokenises the input string and uses AND between tokens, hence plainto_tsquery is what we want here For other database back ends, the behaviour of the match operator is completely different - see: http://docs.sqlalchemy.org/en/rel_1_0/core/sqlelement.html :param field: field name :param op: 'eq', 'ne', 'gt', 'lt', 'ge', 'le' or 'search' :param value: comparison value, string or list/tuple :return: """ app = self.app odm = app.odm() query = self.sql_query if field.model: field_model = self.app.models.get(field.model) if not field_model: return db_model = field_model.db_model() field = getattr(db_model, field.name, None) if field is None: field = getattr(db_model, field_model.id_field, None) if not field: return if field_model.identifier not in self.joins: self.joins.add(field_model.identifier) query = query.join(db_model) elif isinstance(field.field, str): field = getattr(self.model.db_model(), field.name, None) else: field = None if not field: return multiple = isinstance(value, (list, tuple)) if value == '': value = None if multiple and op in ('eq', 'ne'): if op == 'eq': query = query.filter(field.in_(value)) elif op == 'ne': query = query.filter(~field.in_(value)) else: if multiple: assert len(value) > 0 value = value[0] if op == 'eq': query = query.filter(field == value) elif op == 'ne': query = query.filter(field != value) elif op == 'search': dialect_name = odm.binds[odm[self.name].__table__].dialect.name if dialect_name == 'postgresql': ts_config = field.info.get( 'text_search_config', app.config['DEFAULT_TEXT_SEARCH_CONFIG'] ) query = query.filter( func.to_tsvector(ts_config, cast(field, String)).op( '@@')(func.plainto_tsquery(value)) ) else: query = query.filter(field.match(value)) elif op == 'gt': query = query.filter(field > value) elif op == 'ge': query = query.filter(field >= value) elif op == 'lt': query = query.filter(field < value) elif op == 'le': query = query.filter(field <= value) self.sql_query = query return self