Example #1
0
def extract_searchable_text(entry):
    """Build the searchable text lines for one bibliography entry.

    Args:
      entry: a dict-like bibliography entry; must contain the key "pid"
          (the cite-key).  The fields named in FIELDS_TO_GREP are optional.

    Returns:
      A tuple (cite_key, text_lines), where text_lines is a list of
      ASCII-transliterated, lowercased strings (the cite-key first).
    """
    cite_key = entry["pid"]

    # Include the cite-key in the text that can be searched.
    text_lines = [cite_key]
    for field, funcs in FIELDS_TO_GREP:
        # `in` replaces the deprecated dict.has_key() (removed in Python 3).
        if field in entry:
            value = entry[field]

            # Apply field-specific processing functions.  A function may
            # return either a single value or a list of values.
            for func in funcs:
                if isinstance(value, list):
                    result_values = []
                    for v in value:
                        res = func(v)
                        if isinstance(res, list):
                            result_values.extend(res)
                        else:
                            result_values.append(res)
                    value = result_values
                else:
                    value = func(value)

            # Transliterate to ASCII and convert to lowercase.
            if isinstance(value, list):
                value = [
                    unicode_string_utils.transliterate_to_ascii(v).lower()
                    for v in value
                ]
            else:
                value = unicode_string_utils.transliterate_to_ascii(
                    value).lower()

            # Add to the searchable text lines for this entry.
            if isinstance(value, list):
                # Remove duplicates.
                value = list(set(value))
                text_lines.extend(value)
            else:
                text_lines.append(value)

    return (cite_key, text_lines)
Example #2
0
def extract_searchable_text(entry):
  """Build the searchable text lines for one bibliography entry.

  Args:
    entry: a dict-like bibliography entry; must contain the key "pid"
        (the cite-key).  The fields named in FIELDS_TO_GREP are optional.

  Returns:
    A tuple (cite_key, text_lines), where text_lines is a list of
    ASCII-transliterated, lowercased strings (the cite-key first).
  """
  cite_key = entry["pid"]

  # Include the cite-key in the text that can be searched.
  text_lines = [cite_key]
  for field, funcs in FIELDS_TO_GREP:
    # `in` replaces the deprecated dict.has_key() (removed in Python 3).
    if field in entry:
      value = entry[field]

      # Apply field-specific processing functions.  A function may return
      # either a single value or a list of values.
      for func in funcs:
        if isinstance(value, list):
          result_values = []
          for v in value:
            res = func(v)
            if isinstance(res, list):
              result_values.extend(res)
            else:
              result_values.append(res)
          value = result_values
        else:
          value = func(value)

      # Transliterate to ASCII and convert to lowercase.
      if isinstance(value, list):
        value = [unicode_string_utils.transliterate_to_ascii(v).lower() for v in value]
      else:
        value = unicode_string_utils.transliterate_to_ascii(value).lower()

      # Add to the searchable text lines for this entry.
      if isinstance(value, list):
        # Remove duplicates.
        value = list(set(value))
        text_lines.extend(value)
      else:
        text_lines.append(value)

  return (cite_key, text_lines)
Example #3
0
def normalise_hyphens_strip_punctuation(w):
    """Normalise a single word to lowercase ASCII with punctuation removed,
    retaining (but collapsing) any internal hyphens.

    We assume that word 'w' contains no whitespace.  It may contain hyphens,
    which we should not remove (but we should reduce any sequences of
    multiple hyphens to a single hyphen).
    """
    # First, invoke 'transliterate_to_ascii', in case there are any
    # hyphen-like characters that we want to convert into real ASCII hyphens.
    # While we're at it, convert all the now-ASCII characters to lowercase.
    w = unicode_string_utils.transliterate_to_ascii(w).lower()

    # We want to retain any hyphens in the word, so split at the hyphens
    # before removing punctuation and re-join the pieces with hyphens later.
    # Dropping empty pieces (the `if piece` filter) collapses runs of
    # adjacent hyphens and, as a bonus, ensures the re-assembled result
    # neither begins nor ends with a hyphen.
    word_pieces = [
        strip_punctuation_and_whitespace(piece)
        for piece in w.split("-")
        if piece
    ]
    return '-'.join(word_pieces)
Example #4
0
def normalise_hyphens_strip_punctuation(w):
  """Normalise a single word to lowercase ASCII with punctuation removed,
  retaining (but collapsing) any internal hyphens.

  We assume that word 'w' contains no whitespace.  It may contain hyphens,
  which we should not remove (but we should reduce any sequences of multiple
  hyphens to a single hyphen).
  """
  # First, invoke 'transliterate_to_ascii', in case there are any hyphen-like
  # characters that we want to convert into real ASCII hyphens.  While we're
  # at it, convert all the now-ASCII characters to lowercase.
  w = unicode_string_utils.transliterate_to_ascii(w).lower()

  # We want to retain any hyphens in the word, so split at the hyphens before
  # removing punctuation and re-join the pieces with hyphens later.  Dropping
  # empty pieces (the `if piece` filter) collapses runs of adjacent hyphens
  # and, as a bonus, ensures the re-assembled result neither begins nor ends
  # with a hyphen.
  word_pieces = [
      strip_punctuation_and_whitespace(piece)
      for piece in w.split("-")
      if piece
  ]
  return '-'.join(word_pieces)
Example #5
0
def build_query(expr):
  """Turn a search expression into a list of normalised query tokens.

  Colons are treated as token separators; each token is transliterated to
  ASCII and lowercased.
  """
  normalised_tokens = []
  for token in expr.replace(":", " ").split():
    normalised_tokens.append(
        unicode_string_utils.transliterate_to_ascii(token).lower())
  return normalised_tokens
Example #6
0
def normalise_to_ascii_lower(s):
    """Return 's' transliterated to ASCII, with punctuation and whitespace
    stripped, then lowercased.

    The transliteration step runs first because it may itself introduce
    punctuation or whitespace, which the stripping step must then remove.
    """
    ascii_text = unicode_string_utils.transliterate_to_ascii(s)
    stripped = strip_punctuation_and_whitespace(ascii_text)
    return stripped.lower()
Example #7
0
def build_query(expr):
    """Turn a search expression into a list of normalised query tokens.

    Colons are treated as token separators; each token is transliterated
    to ASCII and lowercased.
    """
    normalised_tokens = []
    for token in expr.replace(":", " ").split():
        ascii_token = unicode_string_utils.transliterate_to_ascii(token)
        normalised_tokens.append(ascii_token.lower())
    return normalised_tokens
Example #8
0
def normalise_to_ascii_lower(s):
  """Return 's' transliterated to ASCII, with punctuation and whitespace
  stripped, then lowercased.

  The transliteration step runs first because it may itself introduce
  punctuation or whitespace, which the stripping step must then remove.
  """
  ascii_text = unicode_string_utils.transliterate_to_ascii(s)
  stripped = strip_punctuation_and_whitespace(ascii_text)
  return stripped.lower()