def _MatchPhrase(self, field, match, document):
    """Match a textual field with a phrase query node."""
    field_text = field.value().string_value()
    phrase_text = query_parser.GetPhraseQueryNodeText(match)

    # Atom fields are compared raw, without any tokenization.
    if field.value().type() == document_pb.FieldValue.ATOM:
        return self._MatchRawPhraseWithRawAtom(field_text, phrase_text)

    if not phrase_text:
        return False

    phrase = self._parser.TokenizeText(
        search_util.RemoveAccentsNfkd(phrase_text))
    field_text = self._parser.TokenizeText(
        search_util.RemoveAccentsNfkd(field_text))
    if not phrase:
        return True

    # Locate this document's posting entry for the phrase's first token;
    # without one the phrase cannot occur in the field.
    posting = next(
        (post for post in self._PostingsForFieldToken(
            field.name(), phrase[0].chars)
         if post.doc_id == document.id()),
        None)
    if posting is None:
        return False

    phrase_words = [token.chars for token in phrase]
    doc_words = [token.chars for token in field_text]

    # At each recorded occurrence of the first token, check whether the
    # whole phrase appears starting from that position.
    for position in posting.positions:
        window = doc_words[position:position + len(phrase_words)]
        if len(window) != len(phrase_words):
            # Not enough words left in the field to hold the phrase.
            continue
        if window == phrase_words:
            return True
    return False
def _MatchTextField(self, field, match, document):
    """Check if a textual field matches a query tree node."""
    node_type = match.getType()

    # Fuzzy nodes delegate to their single child.
    if node_type == QueryParser.FUZZY:
        return self._MatchTextField(field, match.getChild(0), document)

    if node_type == QueryParser.VALUE:
        if query_parser.IsPhrase(match):
            return self._MatchPhrase(field, match, document)

        # Atom fields must equal the query text exactly.
        if field.value().type() == document_pb.FieldValue.ATOM:
            return (field.value().string_value() ==
                    query_parser.GetQueryNodeText(match))

        query_tokens = self._parser.TokenizeText(
            query_parser.GetQueryNodeText(match))
        if not query_tokens:
            return True

        # A multi-token value behaves as an implicit conjunction of
        # single-token TEXT nodes.
        if len(query_tokens) > 1:
            def MakeTextNode(token):
                return query_parser.CreateQueryNode(
                    search_util.RemoveAccentsNfkd(token.chars),
                    QueryParser.TEXT)
            return all(
                self._MatchTextField(field, MakeTextNode(token), document)
                for token in query_tokens)

        # Single token: the document matches if it appears in the
        # posting list for this field/token pair.
        token_text = search_util.RemoveAccentsNfkd(query_tokens[0].chars)
        return any(
            post.doc_id == document.id()
            for post in self._PostingsForFieldToken(field.name(), token_text))

    def ExtractGlobalEq(node):
        # Unwrap "GLOBAL = value" nodes down to the value child.
        op = node.getType()
        if (op in (QueryParser.EQ, QueryParser.HAS)
                and len(node.children) >= 2
                and node.children[0].getType() == QueryParser.GLOBAL):
            return node.children[1]
        return node

    if node_type == QueryParser.CONJUNCTION:
        return all(
            self._MatchTextField(field, ExtractGlobalEq(child), document)
            for child in match.children)
    if node_type == QueryParser.DISJUNCTION:
        return any(
            self._MatchTextField(field, ExtractGlobalEq(child), document)
            for child in match.children)

    if node_type == QueryParser.NEGATION:
        raise ExpressionTreeException(
            'Unable to compare "' + field.name() + '" with negation')

    return False
def TokenizeValue(self, field_value, token_position=0):
    """Tokenizes a document_pb.FieldValue into a sequence of Tokens."""
    field_type = field_value.type()
    if field_type == document_pb.FieldValue.GEO:
        # Geo fields carry a lat/lng point rather than text.
        value = field_value.geo()
    else:
        value = search_util.RemoveAccentsNfkd(field_value.string_value())
    return self._TokenizeForType(field_type=field_type,
                                 value=value,
                                 token_position=token_position)
def _TokenizeForType(self, field_type, value, token_position=0):
    """Tokenizes value into a sequence of Tokens."""
    # Numbers and geo points become a single token each.
    if field_type == document_pb.FieldValue.NUMBER:
        return [tokens.Token(chars=value, position=token_position)]
    if field_type == document_pb.FieldValue.GEO:
        return [tokens.GeoPoint(latitude=value.lat(),
                                longitude=value.lng(),
                                position=token_position)]

    if self._split_restricts:
        token_strings = self._TokenizeString(value, field_type)
    else:
        token_strings = self.SetCase(
            search_util.RemoveAccentsNfkd(value)).split()

    result = []
    for raw_token in token_strings:
        # Strip enclosing single quotes via the shared regex.
        raw_token = _SINGLE_QUOTE_RE.search(raw_token).group(1)
        if ':' in raw_token and self._split_restricts:
            # Break "field:value" restricts into separate tokens.
            for part in raw_token.split(':'):
                result.append(
                    tokens.Token(chars=part, position=token_position))
                token_position += 1
        elif '"' in raw_token:
            # Emit explicit Quote markers so phrase boundaries survive;
            # empty split parts correspond to the quote characters.
            for part in raw_token.split('"'):
                if part:
                    result.append(
                        tokens.Token(chars=part, position=token_position))
                else:
                    result.append(
                        tokens.Quote(chars='"', position=token_position))
                token_position += 1
        else:
            result.append(
                tokens.Token(chars=raw_token, position=token_position))
            token_position += 1
    return result
def QueryNode(token):
    """Create a TEXT query node from the token's normalized characters."""
    normalized_chars = search_util.RemoveAccentsNfkd(token.chars)
    return query_parser.CreateQueryNode(normalized_chars, QueryParser.TEXT)