Example #1
  def _TokenizeForType(self, field_type, value, token_position=0):
    """Tokenizes value into a sequence of Tokens."""
    # Numbers and geo points are atomic: emit a single token for the value.
    if field_type == document_pb.FieldValue.NUMBER:
      return [tokens.Token(chars=value, position=token_position)]

    if field_type == document_pb.FieldValue.GEO:
      return [tokens.GeoPoint(latitude=value.lat(), longitude=value.lng(),
                              position=token_position)]

    tokens_found = []

    # Text values are first split into words, either by plain whitespace
    # splitting (with case folding) or by the full tokenizer when restricts
    # are being split out.
    if not self._split_restricts:
      token_strings = self.SetCase(value).split()
    else:
      token_strings = self._TokenizeString(value, field_type)

    for token in token_strings:
      if ':' in token and self._split_restricts:
        # 'field:value' restricts become separate tokens.
        for subtoken in token.split(':'):
          tokens_found.append(
              tokens.Token(chars=subtoken, position=token_position))
          token_position += 1
      elif '"' in token:
        # Quote characters are surfaced as Quote tokens so phrase
        # boundaries survive tokenization.
        for subtoken in token.split('"'):
          if not subtoken:
            tokens_found.append(
                tokens.Quote(chars='"', position=token_position))
          else:
            tokens_found.append(
                tokens.Token(chars=subtoken, position=token_position))
          token_position += 1
      else:
        tokens_found.append(tokens.Token(chars=token, position=token_position))
        token_position += 1
    return tokens_found
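
The TEXT branch above does the interesting work: the value is split on whitespace, 'field:value' restricts are split apart on ':', and quote characters are surfaced as separate Quote tokens so phrase boundaries survive tokenization. A minimal, self-contained sketch of that branch (Token and Quote are namedtuple stand-ins for the real tokens classes, split_restricts stands in for self._split_restricts, and value.lower() stands in for self.SetCase):

from collections import namedtuple

Token = namedtuple('Token', ['chars', 'position'])
Quote = namedtuple('Quote', ['chars', 'position'])

def tokenize_text(value, split_restricts=True, position=0):
    found = []
    for word in value.lower().split():
        if ':' in word and split_restricts:
            # 'field:value' restricts become separate tokens.
            for sub in word.split(':'):
                found.append(Token(sub, position))
                position += 1
        elif '"' in word:
            # Empty splits mark where a quote character was.
            for sub in word.split('"'):
                found.append(Token(sub, position) if sub
                             else Quote('"', position))
                position += 1
        else:
            found.append(Token(word, position))
            position += 1
    return found

print(tokenize_text(u'"wizard of" oz'))
# [Quote(chars='"', position=0), Token(chars='wizard', position=1),
#  Token(chars='of', position=2), Quote(chars='"', position=3),
#  Token(chars='oz', position=4)]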
Example #2
    def _Snippet(self, query, field, *args):
        """Creates a snippet given a query and the field to query on.

        Args:
          query: A query string containing only a bare term (no operators).
          field: The field name to query on.
          *args: Unused optional arguments. These are not used on
            dev_appserver.

        Returns:
          A snippet for the field with the query term bolded.

        Raises:
          ExpressionEvaluationError: if this is a sort expression.
        """
        field = query_parser.GetQueryNodeText(field)

        if self._is_sort_expression:
            raise ExpressionEvaluationError(
                'Failed to parse sort expression \'snippet(' +
                query_parser.GetQueryNodeText(query) + ', ' + field +
                ')\': snippet() is not supported in sort expressions')

        schema = self._inverted_index.GetSchema()
        if schema.IsType(field, document_pb.FieldValue.NUMBER):
            raise ExpressionEvaluationError(
                'Failed to parse field expression \'snippet(' +
                query_parser.GetQueryNodeText(query) + ', ' + field +
                ')\': snippet() argument 2 must be text')

        terms = self._tokenizer.TokenizeText(
            query_parser.GetQueryNodeText(query).strip('"'))
        for term in terms:
            search_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
            postings = self._inverted_index.GetPostingsForToken(search_token)
            for posting in postings:
                if (posting.doc_id != self._doc_pb.id() or
                        not posting.positions):
                    continue

                field_val = self._GetFieldValue(
                    search_util.GetFieldInDocument(self._doc_pb, field))
                if not field_val:
                    continue
                doc_words = [
                    token.chars for token in
                    self._case_preserving_tokenizer.TokenizeText(field_val)
                ]

                # Snippet around the first occurrence of the term.
                position = posting.positions[0]
                return self._GenerateSnippet(
                    doc_words, position,
                    search_util.DEFAULT_MAX_SNIPPET_LENGTH)

        # No term matched a posting in this document: fall back to a plain
        # truncated field value.
        field_val = self._GetFieldValue(
            search_util.GetFieldInDocument(self._doc_pb, field))
        if not field_val:
            return ''
        return '%s...' % field_val[:search_util.DEFAULT_MAX_SNIPPET_LENGTH]
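
_GenerateSnippet itself is not shown in these examples. As a purely hypothetical sketch of such a helper (the window growth, length accounting, and bolding markup here are all assumptions, not the dev_appserver's actual implementation), it would expand a window of words around the hit position and bold the matched word:

def generate_snippet(doc_words, position, max_length):
    # Start from the hit and grow the window outward until the character
    # budget is spent or the document is exhausted.
    snippet = [u'<b>%s</b>' % doc_words[position]]
    length = len(doc_words[position])
    left, right = position - 1, position + 1
    while length < max_length and (left >= 0 or right < len(doc_words)):
        if left >= 0:
            snippet.insert(0, doc_words[left])
            length += len(doc_words[left]) + 1
            left -= 1
        if right < len(doc_words):
            snippet.append(doc_words[right])
            length += len(doc_words[right]) + 1
        right += 1
    return u'...%s...' % u' '.join(snippet)

print(generate_snippet(['the', 'cat', 'sat', 'down'], 1, 10))
# ...the <b>cat</b> sat...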
Example #3
    def _ExtractPrefixTokens(self, token):
        """Extracts all prefixes of a term as separate tokens."""
        term = token.chars.strip()
        prefix_tokens = []
        for i in range(len(term)):
            # Each prefix keeps the position of the token it came from.
            prefix_tokens.append(
                tokens.Token(chars=term[:i + 1], position=token.position))
        return prefix_tokens
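
Every prefix of the stripped term is emitted as its own token at the original position, presumably so the index can serve prefix (autocomplete-style) matches through ordinary token lookups. A quick demonstration, using a namedtuple stand-in for tokens.Token:

from collections import namedtuple

Token = namedtuple('Token', ['chars', 'position'])

def extract_prefix_tokens(token):
    # Same logic as above: one token per prefix, position preserved.
    term = token.chars.strip()
    return [Token(term[:i + 1], token.position) for i in range(len(term))]

print(extract_prefix_tokens(Token('cat', 7)))
# [Token(chars='c', position=7), Token(chars='ca', position=7),
#  Token(chars='cat', position=7)]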
Example #4
    def _Snippet(self, query, field, *args):
        """Creates a snippet given a query and the field to query on.

        Args:
          query: A query string containing only a bare term (no operators).
          field: The field name to query on.
          *args: Unused optional arguments. These are not used on
            dev_appserver.

        Returns:
          A snippet for the field with the query term bolded.
        """
        field = query_parser.GetQueryNodeText(field)
        terms = self._tokenizer.TokenizeText(
            query_parser.GetQueryNodeText(query).strip('"'))
        for term in terms:
            search_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
            postings = self._inverted_index.GetPostingsForToken(search_token)
            for posting in postings:
                if (posting.doc_id != self._doc_pb.id() or
                        not posting.positions):
                    continue

                field_val = search_util.GetFieldValue(
                    search_util.GetFieldInDocument(self._doc_pb, field))
                if not field_val:
                    continue
                doc_words = [
                    token.chars for token in
                    self._case_preserving_tokenizer.TokenizeText(field_val)
                ]

                position = posting.positions[0]
                return self._GenerateSnippet(
                    doc_words, position,
                    search_util.DEFAULT_MAX_SNIPPET_LENGTH)

        # No term matched a posting in this document: fall back to a plain
        # truncated field value.
        field_val = search_util.GetFieldValue(
            search_util.GetFieldInDocument(self._doc_pb, field))
        if not field_val:
            return None
        return '%s...' % field_val[:search_util.DEFAULT_MAX_SNIPPET_LENGTH]
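
Compared with Example #2, this variant skips the sort-expression and NUMBER-type validation, resolves the field value through the module-level search_util.GetFieldValue rather than the instance helper _GetFieldValue, and signals a missing field with None instead of an empty string.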
Example #5
    def _Snippet(self, query, field, *args):
        """Creates a snippet given a query and the field to query on."""
        field = query_parser.GetQueryNodeText(field)
        terms = self._tokenizer.TokenizeText(
            query_parser.GetQueryNodeText(query).strip('"'))
        for term in terms:
            search_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
            postings = self._inverted_index.GetPostingsForToken(search_token)
            for posting in postings:
                if (posting.doc_id != self._doc_pb.id() or
                        not posting.positions):
                    continue

                field_val = search_util.GetFieldValue(
                    search_util.GetFieldInDocument(self._doc_pb, field))
                if not field_val:
                    # Guard against missing fields before tokenizing.
                    continue
                doc_words = [
                    token.chars for token in
                    self._case_preserving_tokenizer.TokenizeText(field_val)
                ]

                position = posting.positions[0]
                return self._GenerateSnippet(
                    doc_words, position,
                    search_util.DEFAULT_MAX_SNIPPET_LENGTH)
Example #6
  def _DocumentCountForTerm(self, term):
    """Returns the number of documents that contain the given term."""
    return len(self._PostingsForToken(tokens.Token(chars=term)))
Example #7
  def _PostingsForFieldToken(self, field, value):
    """Returns postings for the value occurring in the given field."""
    # Normalize the lookup value before building the field-scoped token.
    value = simple_tokenizer.NormalizeString(value)
    return self._PostingsForToken(
        tokens.Token(chars=value, field_name=field))
Example #8
  def _PostingsForFieldToken(self, field, value):
    """Returns postings for the value occurring in the given field."""
    return self._PostingsForToken(
        tokens.Token(chars=value.lower(), field_name=field))
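
Examples #7 and #8 build the same field-scoped token key and differ only in how the value is case-folded: #7 routes it through simple_tokenizer.NormalizeString, while #8 inlines a bare value.lower(). The real NormalizeString is not shown in these examples; the hypothetical sketch below only illustrates the kind of normalization such a lookup has to mirror from the indexing side:

import re

def normalize_string(value):
    # Hypothetical stand-in for simple_tokenizer.NormalizeString: collapse
    # whitespace runs and lowercase. Whatever the real helper does, the
    # lookup side must normalize exactly as the indexing side did, or
    # postings are missed.
    return re.sub(r'\s+', ' ', value).strip().lower()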