def _CheckMatch(self, node, document):
    """Decide whether a document satisfies a parsed query tree.

    Boolean nodes (sequence, conjunction, disjunction, negation) recurse into
    their children. Comparison nodes are dispatched to the global, function
    or per-field matchers after their operands have been sanity-checked.

    Args:
      node: the query node to match
      document: the document to match

    Returns:
      True iff the query node matches the document. Node types that are not
      handled here yield False.

    Raises:
      ExpressionTreeException: when the != operator is used; when a
        non-inequality comparison names a field for which
        _IsValidDateValue or _IsValidNumericValue is true; or when
        _CheckValidDateComparison rejects the operand of an inequality
        whose text is not a number.
    """
    node_type = node.getType()

    if node_type == QueryParser.SEQUENCE:
        # A sequence matches either term-by-term or as one global phrase.
        if all(self._CheckMatch(kid, document) for kid in node.children):
            return True
        return self._MatchGlobalPhrase(node, document)

    if node_type == QueryParser.CONJUNCTION:
        return all(self._CheckMatch(kid, document) for kid in node.children)

    if node_type == QueryParser.DISJUNCTION:
        return any(self._CheckMatch(kid, document) for kid in node.children)

    if node_type == QueryParser.NEGATION:
        return not self._CheckMatch(node.children[0], document)

    if node_type == QueryParser.NE:
        raise ExpressionTreeException(
            '!= comparison operator is not available')

    if node_type not in query_parser.COMPARISON_TYPES:
        return False

    lhs, match = node.children
    lhs_type = lhs.getType()
    if lhs_type == QueryParser.GLOBAL:
        return self._MatchGlobal(match, document)
    if lhs_type == QueryParser.FUNCTION:
        return self._MatchFunction(lhs, match, node_type, document)

    field_name = self._GetFieldName(lhs)
    if node_type in INEQUALITY_COMPARISON_TYPES:
        # Inequalities accept numbers as-is; anything else must pass the
        # date-comparison check.
        try:
            float(query_parser.GetPhraseQueryNodeText(match))
        except ValueError:
            self._CheckValidDateComparison(field_name, match)
    elif (self._IsValidDateValue(field_name) or
          self._IsValidNumericValue(field_name)):
        raise ExpressionTreeException('Invalid field name "%s"' % field_name)

    return self._MatchAnyField(lhs, match, node_type, document)
def _CheckValidDateComparison(self, field_name, match):
    """Reject comparison operands that cannot be compared as a date.

    Args:
      field_name: name of the field being compared; only used to build the
        error message.
      match: the query node on the right-hand side of the comparison.

    Raises:
      ExpressionTreeException: if match is a FUNCTION node, or a VALUE node
        whose text is rejected by _IsValidDateValue. Nodes of any other type
        are accepted without inspection.
    """
    operand_type = match.getType()

    if operand_type == QueryParser.FUNCTION:
        # Function nodes carry (name, arguments); only the name is reported.
        func_name, _ = match.children
        raise ExpressionTreeException(
            'Unable to compare "%s" with "%s()"' % (field_name, func_name))

    if operand_type != QueryParser.VALUE:
        return

    operand_text = query_parser.GetPhraseQueryNodeText(match)
    if self._IsValidDateValue(operand_text):
        return
    raise ExpressionTreeException(
        'Unable to compare "%s" with "%s"' % (field_name, operand_text))
def _CheckValidDateComparison(self, field_name, match):
    """Verify that a VALUE operand is a date written as YYYY-MM-DD.

    Nodes of any type other than VALUE are accepted without inspection.

    Args:
      field_name: name of the field being compared; only used to build the
        error message.
      match: the query node on the right-hand side of the comparison.

    Raises:
      ExpressionTreeException: if match is a VALUE node whose text cannot be
        parsed with the '%Y-%m-%d' format.
    """
    if match.getType() != QueryParser.VALUE:
        return

    # Extract the text outside the try block: the handler below formats it
    # into the message, so it must be bound before anything can raise
    # ValueError.
    match_val = query_parser.GetPhraseQueryNodeText(match)
    try:
        datetime.datetime.strptime(match_val, '%Y-%m-%d')
    except ValueError:
        raise ExpressionTreeException(
            'Unable to compare "%s" with "%s"' % (field_name, match_val))
def _MatchPhrase(self, field, match, document):
    """Match a textual field with a phrase query node.

    The strategy depends on the field's value type:
      * ATOM: raw field text and raw phrase are handed to
        _MatchRawPhraseWithRawAtom.
      * UNTOKENIZED_PREFIX: both texts are normalized and the field must
        start with the phrase.
      * anything else: both texts are tokenized and the phrase tokens must
        appear consecutively in the field, starting at one of the positions
        recorded in this document's posting for the first phrase token. For
        TOKENIZED_PREFIX fields a field token also matches when it merely
        starts with the corresponding phrase token.

    Args:
      field: The document_pb.Field to test
      match: The phrase query node to match against
      document: The document that the field is in

    Returns:
      True iff the field matches the phrase. An empty raw phrase never
      matches a non-ATOM field; a non-empty phrase that tokenizes to nothing
      always matches.
    """
    field_value = field.value()
    raw_field_text = field_value.string_value()
    raw_phrase_text = query_parser.GetPhraseQueryNodeText(match)
    value_type = field_value.type()

    if value_type == document_pb.FieldValue.ATOM:
        return self._MatchRawPhraseWithRawAtom(raw_field_text, raw_phrase_text)

    if not raw_phrase_text:
        return False

    if value_type == document_pb.FieldValue.UNTOKENIZED_PREFIX:
        phrase = self._parser.Normalize(raw_phrase_text, value_type)
        field_text = self._parser.Normalize(raw_field_text, value_type)
        return field_text.startswith(phrase)

    phrase = self._parser.TokenizeText(raw_phrase_text)
    field_text = self._parser.TokenizeText(raw_field_text)
    if not phrase:
        return True

    # Find this document's posting for the first token of the phrase; its
    # positions are the only places where the phrase can start.
    posting = None
    for post in self._PostingsForFieldToken(field.name(), phrase[0].chars):
        if post.doc_id == document.id():
            posting = post
            break
    if not posting:
        return False

    allow_prefix = value_type == document_pb.FieldValue.TOKENIZED_PREFIX

    def ExtractWords(token_list):
        return (token.chars for token in token_list)

    def WordsMatch(doc_word, phrase_word):
        if allow_prefix and doc_word.startswith(phrase_word):
            return True
        return doc_word == phrase_word

    for position in posting.positions:
        # zip() returns an iterator on Python 3, so materialize it before
        # taking its length; on Python 2 this is a harmless copy.
        word_pairs = list(zip(ExtractWords(field_text[position:]),
                              ExtractWords(phrase)))
        # Fewer pairs than phrase tokens means the field ends too early.
        if len(word_pairs) != len(phrase):
            continue
        if all(WordsMatch(doc_word, phrase_word)
               for doc_word, phrase_word in word_pairs):
            return True
    return False
def _MatchComparableField(self, field, match, cast_to_type, op, document):
    """Compare a field value against a query value of a comparable type.

    Comparable types are anything supporting <, >, <=, >= and ==; the
    original description names numbers and dates as the intended uses.

    Args:
      field: The document_pb.Field to test
      match: The query node to match against
      cast_to_type: Callable converting the string values to the type being
        compared
      op: The query node type representing the comparison to perform
      document: The document that the field is in (not read by this method)

    Returns:
      True iff the field matches the query. False when match is not a VALUE
      node or when its text cannot be converted by cast_to_type (ValueError).

    Raises:
      ExpressionTreeException: Raised when the != operator is used.
      UnsupportedOnDevError: Raised when op is none of the supported
        comparison operators.
    """
    # The field side is converted first and unguarded: a ValueError here
    # propagates to the caller.
    field_val = cast_to_type(field.value().string_value())

    if match.getType() != QueryParser.VALUE:
        return False
    try:
        match_val = cast_to_type(query_parser.GetPhraseQueryNodeText(match))
    except ValueError:
        return False

    if op in (QueryParser.EQ, QueryParser.HAS):
        return field_val == match_val
    if op == QueryParser.NE:
        raise ExpressionTreeException(
            '!= comparison operator is not available')
    if op == QueryParser.GT:
        return field_val > match_val
    if op == QueryParser.GE:
        return field_val >= match_val
    if op == QueryParser.LESSTHAN:
        return field_val < match_val
    if op == QueryParser.LE:
        return field_val <= match_val

    # NOTE: the message interpolates the text of the match node.
    message = (
        'Operator %s not supported for numerical fields on development server.'
        % match.getText())
    raise search_util.UnsupportedOnDevError(message)
def _MatchPhrase(self, field, match, document):
    """Match a textual field with a phrase query node.

    ATOM fields are compared raw via _MatchRawPhraseWithRawAtom. For every
    other field type, accents are stripped from both texts, both are
    tokenized, and the phrase tokens must appear consecutively in the field,
    starting at one of the positions recorded in this document's posting for
    the first phrase token.

    Args:
      field: The document_pb.Field to test
      match: The phrase query node to match against
      document: The document that the field is in

    Returns:
      True iff the field matches the phrase. An empty raw phrase never
      matches a non-ATOM field; a non-empty phrase that tokenizes to nothing
      always matches.
    """
    raw_field = field.value().string_value()
    raw_phrase = query_parser.GetPhraseQueryNodeText(match)

    if field.value().type() == document_pb.FieldValue.ATOM:
        return self._MatchRawPhraseWithRawAtom(raw_field, raw_phrase)
    if not raw_phrase:
        return False

    phrase_tokens = self._parser.TokenizeText(
        search_util.RemoveAccentsNfkd(raw_phrase))
    field_tokens = self._parser.TokenizeText(
        search_util.RemoveAccentsNfkd(raw_field))
    if not phrase_tokens:
        return True

    # Only this document's posting for the first phrase token is relevant;
    # its positions are the candidate starting points.
    posting = next(
        (candidate
         for candidate in self._PostingsForFieldToken(
             field.name(), phrase_tokens[0].chars)
         if candidate.doc_id == document.id()),
        None)
    if not posting:
        return False

    phrase_words = [token.chars for token in phrase_tokens]
    for start in posting.positions:
        tail_words = (token.chars for token in field_tokens[start:])
        pairs = list(zip(tail_words, phrase_words))
        # Fewer pairs than phrase tokens means the field ends too early.
        if len(pairs) != len(phrase_words):
            continue
        if all(doc_word == phrase_word for doc_word, phrase_word in pairs):
            return True
    return False