Example No. 1
def ChineseAnalyzer(expression=default_pattern,
                    stoplist=None,
                    minsize=2,
                    maxsize=None,
                    gaps=False,
                    stemfn=stem,
                    ignore=None,
                    cachesize=50000):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.
    用小写过滤器、可选的停止停用词过滤器和词干过滤器组成生成器。
    >>> ana = ChineseAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]
    :param expression: 用于提取 token 令牌的正则表达式
    :param stoplist: 一个停用词列表。 设置为 None 标识禁用停用词过滤功能。
    :param minsize: 单词最小长度,小于它的单词将被从流中删除。
    :param maxsize: 单词最大长度,大于它的单词将被从流中删除。
    :param gaps: 如果为 True, tokenizer 令牌解析器将会分割正则表达式,而非匹配正则表达式
    :param ignore: 一组忽略的单词。
    :param cachesize: 缓存词干词的最大数目。 这个数字越大,词干生成的速度就越快,但占用的内存就越多。
                      使用 None 表示无缓存,使用 -1 表示无限缓存。
    """
    ret = ChineseTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(
            stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain | StemFilter(
        stemfn=stemfn, ignore=ignore, cachesize=cachesize)
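A minimal usage sketch, assuming ChineseTokenizer is defined in the same module (for example a jieba-based tokenizer) and that an "indexdir" directory already exists; the field names are illustrative:

from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

schema = Schema(path=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer()))
ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(path=u"/docs/1", content=u"我们正在测试中文分词")  # "we are testing Chinese word segmentation"
writer.commit()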
Example No. 2
def FancyAnalyzer(expression=r"\s+",
                  stoplist=STOP_WORDS,
                  minsize=2,
                  maxsize=None,
                  gaps=True,
                  splitwords=True,
                  splitnums=True,
                  mergewords=False,
                  mergenums=False):
    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return (RegexTokenizer(expression=expression, gaps=gaps)
            | IntraWordFilter(splitwords=splitwords,
                              splitnums=splitnums,
                              mergewords=mergewords,
                              mergenums=mergenums)
            | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize))
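An illustrative comparison (not a doctest): the IntraWordFilter in FancyAnalyzer splits camelCase and underscore boundaries that StandardAnalyzer (Example No. 3) leaves as single tokens.

text = "Should I call getInt or get_real?"
print([t.text for t in FancyAnalyzer()(text)])     # includes "get", "int", "real", ...
print([t.text for t in StandardAnalyzer()(text)])  # keeps "getint" and "get_real" whole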
Example No. 3
def StandardAnalyzer(expression=default_pattern,
                     stoplist=STOP_WORDS,
                     minsize=2,
                     maxsize=None,
                     gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter and optional
    StopFilter.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(
            stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain
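Passing stoplist=None skips the StopFilter entirely, so stop words such as "is" and "and" stay in the token stream:

ana = StandardAnalyzer(stoplist=None)
[t.text for t in ana("Testing is testing and testing")]
# ["testing", "is", "testing", "and", "testing"]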
Example No. 4
def StemmingAnalyzer(expression=default_pattern,
                     stoplist=STOP_WORDS,
                     minsize=2,
                     maxsize=None,
                     gaps=False,
                     stemfn=stem,
                     ignore=None,
                     cachesize=50000):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(
            stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain | StemFilter(
        stemfn=stemfn, ignore=ignore, cachesize=cachesize)
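A common pattern is to attach the stemming analyzer to a searchable text field in a schema; the field names here are illustrative:

from whoosh.fields import Schema, TEXT, ID

schema = Schema(
    path=ID(stored=True, unique=True),
    content=TEXT(analyzer=StemmingAnalyzer()),
)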
Example No. 5
    def index(self, index_name='unified'):
        types = self.get_requested_content_types()

        from whoosh.fields import TEXT, ID, NGRAM, NUMERIC
        from whoosh.analysis import StemmingAnalyzer, SimpleAnalyzer, IDAnalyzer
        from whoosh.analysis.filters import LowercaseFilter
        simp_ana = SimpleAnalyzer()
        print 'Building %s index...' % index_name

        # build a single schema from the fields exposed by the different search
        # types
        print '\tSchema:'
        fields = {}
        for type in types:
            for info in type.get_fields_info().values():
                if info['whoosh']['name'] not in fields and not info[
                        'whoosh'].get('ignore', False):
                    print '\t\t%s' % info
                    field_type = info['whoosh']['type']

                    if index_name == 'autocomplete':
                        # break the long text fields into terms, leave the
                        # others as single expression
                        if not (field_type.__class__ == NUMERIC):
                            if info.get('long_text', False):
                                field_type = TEXT(analyzer=simp_ana)
                            else:
                                field_type = ID(stored=True,
                                                analyzer=IDAnalyzer()
                                                | LowercaseFilter())
                    print '\t\t%s' % field_type
                    fields[info['whoosh']['name']] = field_type

                    # JIRA 508 - Add an ID counterpart to allow exact phrase search
#                     if info.get('long_text', False):
#                         fields[info['whoosh']['name']+'_iexact'] = ID(analyzer=IDAnalyzer(lowercase=True))

        from whoosh.fields import Schema
        schema = Schema(**fields)

        # Create the index schema
        index = self.recreate_index(index_name, schema)

        # Add documents to the index
        print '\tWrite indexes:'
        writer = index.writer()
        aci = {}
        for type in types:
            count = type.write_index(writer, self.is_verbose(), aci)
            print '\t\t%s %s records indexed' % (count,
                                                 type.get_model().__name__)

        # autocomplete
        if index_name == 'unified':
            f = open(types[0].get_autocomplete_path(True), 'w')
            f.write((ur'|'.join(aci.keys())).encode('utf8'))
            f.close()

        writer.commit()
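A hedged sketch of how the 'autocomplete' index built above might be queried, prefix-matching against one of the lowercased ID fields; the field name and index directory are hypothetical:

from whoosh.index import open_dir
from whoosh.query import Prefix

ix = open_dir('indexes/autocomplete')
with ix.searcher() as searcher:
    results = searcher.search(Prefix('name', u'abc'), limit=10)
    suggestions = [hit['name'] for hit in results]  # works because the ID fields are stored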
Example No. 6
def IDAnalyzer(lowercase=False):
    """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if
    desired.
    """

    tokenizer = IDTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer
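The equivalent, non-deprecated composition suggested by the docstring:

from whoosh.analysis import IDTokenizer, LowercaseFilter

ana = IDTokenizer() | LowercaseFilter()
[t.text for t in ana(u"Alpha-Beta 7")]  # whole input emitted as one lowercased token: "alpha-beta 7"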
Example No. 7
def NgramAnalyzer(minsize, maxsize=None):
    """Composes an NgramTokenizer and a LowercaseFilter.

    >>> ana = NgramAnalyzer(4)
    >>> [token.text for token in ana("hi there")]
    ["hi t", "i th", " the", "ther", "here"]
    """

    return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter()
Example No. 8
def SplitAnalyzer(separator=None):
    """Parses whitespace- or comma-separated tokens.

    >>> ana = KeywordAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["Hello", "there,", "this", "is", "a", "TEST"]

    :param lowercase: whether to lowercase the tokens.
    :param commas: if True, items are separated by commas rather than
        whitespace.
    """
    return SplitTokenizer(separator) | LowercaseFilter()
Example No. 9
def SimpleAnalyzer(expression=default_pattern, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter.

    >>> ana = SimpleAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["hello", "there", "this", "is", "a", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()
Example No. 10
def KeywordAnalyzer(lowercase=False, commas=False):
    """Parses whitespace- or comma-separated tokens.

    >>> ana = KeywordAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["Hello", "there,", "this", "is", "a", "TEST"]

    :param lowercase: whether to lowercase the tokens.
    :param commas: if True, items are separated by commas rather than
        whitespace.
    """

    if commas:
        tokenizer = CommaSeparatedTokenizer()
    else:
        tokenizer = SpaceSeparatedTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer
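A small sketch of the commas=True variant, which splits on commas instead of whitespace (exact whitespace stripping depends on CommaSeparatedTokenizer):

ana = KeywordAnalyzer(lowercase=True, commas=True)
print([t.text for t in ana("Red, Green,BLUE")])  # roughly ["red", "green", "blue"]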
Example No. 11
def LanguageAnalyzer(lang,
                     expression=default_pattern,
                     gaps=False,
                     cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` and
    :func:`whoosh.lang.has_stopwords` to check if a given language has a
    stemming function and/or stop word list available.

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    """

    from whoosh.lang import NoStemmer, NoStopWords

    # Make the start of the chain
    chain = (RegexTokenizer(expression=expression, gaps=gaps)
             | LowercaseFilter())

    # Add a stop word filter
    try:
        chain = chain | StopFilter(lang=lang)
    except NoStopWords:
        pass

    # Add a stemming filter
    try:
        chain = chain | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return chain
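An optional pre-check before building a language analyzer, using the helpers named in the docstring, falling back to a plain StandardAnalyzer when no stemmer or stop word list is available:

from whoosh.lang import has_stemmer, has_stopwords

lang = "es"
if has_stemmer(lang) and has_stopwords(lang):
    ana = LanguageAnalyzer(lang)
else:
    ana = StandardAnalyzer()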
Example No. 12
def StanfordAnalyzer(lowercase=False):
    tokenizer = StanfordTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer
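A usage sketch, assuming StanfordTokenizer is a whoosh-style tokenizer defined alongside this function (the actual tokenization depends on that class):

ana = StanfordAnalyzer(lowercase=True)
tokens = [t.text for t in ana(u"Stanford NLP tokenizes this sentence.")]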
Example No. 13
"""Text utilities."""

# Instead of adding more dependencies, let's take advantage of Whoosh's text
# processing utilities.
from whoosh.analysis.filters import CharsetFilter, LowercaseFilter
from whoosh.analysis.tokenizers import IDTokenizer, RegexTokenizer
from whoosh.support.charset import accent_map

ID_NORMALIZATION_CHAIN = (IDTokenizer()
                          | CharsetFilter(accent_map)
                          | LowercaseFilter())
SORT_NORMALIZATION_CHAIN = (RegexTokenizer()
                            | CharsetFilter(accent_map)
                            | LowercaseFilter())
SLUGIFY_CHAIN = (RegexTokenizer(r"[^_\W]+")
                 | CharsetFilter(accent_map)
                 | LowercaseFilter())


def id_normalize(text):
    return ' '.join([t.text for t in ID_NORMALIZATION_CHAIN(text)])


def sort_normalize(text):
    return ' '.join([t.text for t in SORT_NORMALIZATION_CHAIN(text)])


def slugify(text, delimiter='-'):
    return delimiter.join([t.text for t in SLUGIFY_CHAIN(text)])
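Illustrative calls, assuming accent_map folds accented characters to their ASCII equivalents:

print(slugify(u"Crème Brûlée!"))       # expected: "creme-brulee"
print(sort_normalize(u"Árvore Azul"))  # expected: "arvore azul"
print(id_normalize(u"Crème Brûlée"))   # whole string kept as a single folded, lowercased token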
Example No. 14
class ExamplesIndex(ExamplesStore):
    """An ExampleStore that also provides interfaces such as querying for nearest
    examples of a certain type.
    It effectively provides a higher level interface around an IndexBackendABC specifically
    related to the domain of AInix examples."""

    # TODO (DNGros): this tokenizer shouldn't really be here. Should not depend on whoosh
    x_tokenizer = RegexTokenizer() | LowercaseFilter()

    def __init__(self, type_context: TypeContext, backend: IndexBackendABC = None):
        super().__init__(type_context)
        scheme = self.get_scheme()
        self.parser = StringParser(type_context)
        self.backend = backend or WhooshIndexBackend(scheme)
        self.example_count = 0

    @staticmethod
    def get_scheme() -> 'IndexBackendScheme':
        return IndexBackendScheme(
            example_id=IndexBackendFields.NUMBER,
            xquery=IndexBackendFields.TEXT,
            ytext=IndexBackendFields.TEXT,
            xtype=IndexBackendFields.ID,
            ytype=IndexBackendFields.ID,
            yindexable=IndexBackendFields.SPACE_STORED_TEXT,
            y_set_id=IndexBackendFields.ID,
            weight=IndexBackendFields.ONE_INTERVAL_NUM,
            split=IndexBackendFields.ID
        )

    @staticmethod
    def get_default_ram_backend() -> 'IndexBackendABC':
        """Gets a default backend that does not
        if node is None:
        return Falsetouch any files and just
        keeps data in RAM"""
        return ainix_kernel.indexing.whooshbackend.WhooshIndexBackend(
            ExamplesIndex.get_scheme(), ram_only=True)

    def get_num_x_values(self) -> int:
        return self.backend.get_doc_count()

    def _get_yparsed_rep(self, y_string: str, y_type: str) -> str:
        ast = self.parser.create_parse_tree(y_string, y_type)
        return ast.indexable_repr()

    def add_example(self, example: XValue) -> None:
        self.backend.add_documents([attr.asdict(example)])
        self.example_count += 1

    def add_yset(
        self,
        x_values: List[str],
        y_values: List[str],
        x_type: str,
        y_type: str,
        y_preferences: List[float],
        splitter: DataSplits = DEFAULT_SPLITTER
    ) -> None:
        y_group = id_generator(size=10)
        for x in x_values:
            split = splitter.get_split_from_example(x, y_type)
            for y, weight in zip(y_values, y_preferences):
                new_example = XValue(self.example_count, x, y, x_type, y_type, weight, y_group,
                                     split=split.value,
                                     yindexable=self._get_yparsed_rep(y, y_type))
                self.add_example(new_example)

    def _dict_to_example(self, doc: Dict) -> XValue:
        """Takes the dictionary form of an object and returns an example object"""
        # make a copy of the dict so we can alter its keys without
        # mutating the input dict (this might be overkill....)
        doc_copy = copy.deepcopy(doc)
        doc_copy['weight'] = float(doc_copy['weight'])
        doc_copy['example_id'] = int(doc_copy['example_id'])
        doc_copy['split'] = int(doc_copy['split'])
        return XValue(**doc_copy)

    def get_example_by_id(self, id: int) -> XValue:
        query = Term("example_id", id)
        hits = list(self.backend.query(query))
        assert len(hits) == 1
        return self._dict_to_example(hits[0].doc)

    # This code is not very relevant anymore.
    def get_nearest_examples(
        self,
        x_value: str,
        choose_type_name: str = None,
        filter_splits=None,
        max_results=10
    ) -> Generator[XValue, None, None]:
        """
        Args:
            filter_splits:
            x_value: a string to look for the most similar example to
            choose_type_name: By optionally specifying this value you may require
                that a specific type choice appears in the example. You could for example
                only look for the nearest examples where the example features a choice
                between a Program type.
            filter_splits: A tuple of DataSplits. If provided, only examples in one
                of the splits in the provided tuple will be returned
            max_results: The max number of examples to return

        Returns:
            A list of all examples that are potentially near the example, sorted
            in order where the 0th item is predicted to be nearest.
        """
        tokenized_x_value = (tok.text for tok in self.x_tokenizer(x_value))
        query = Or([Term("xquery", term,) for term in tokenized_x_value])
        if choose_type_name:
            y_type_indexable_rep = ast_components.indexable_repr_classify_type(choose_type_name)
            query &= Term("yindexable", y_type_indexable_rep)
        if filter_splits:
            query &= Or([Term("split", str(split.value)) for split in filter_splits])
        query_result = self.backend.query(query, max_results)
        yield from (self._dict_to_example(hit.doc) for hit in query_result)

    def get_all_x_values(
        self,
        filter_splits: Tuple[DataSplits, ...] = None
    ) -> Generator[XValue, None, None]:
        """Yields all examples in the index"""
        if filter_splits is None or len(filter_splits) == 0:
            query = Every()
        else:
            query = Or([Term("split", str(split.value)) for split in filter_splits])
        yield from (self._dict_to_example(hit.doc)
                    for hit in self.backend.query(query, max_results=None, score=False))

    def get_y_values_for_y_set(self, y_set_id: str) -> List[XValue]:
        query = Term("y_set_id", y_set_id)
        return [self._dict_to_example(hit.doc)
                for hit in self.backend.query(query, None, False)]
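A heavily hedged usage sketch; it assumes a configured TypeContext instance (construction omitted, it depends on the surrounding ainix_kernel code), and the type names and attribute access are hypothetical, following the fields declared in get_scheme():

# `type_context` is assumed to be an already-configured TypeContext (not shown).
index = ExamplesIndex(type_context, backend=ExamplesIndex.get_default_ram_backend())
index.add_yset(
    x_values=["list the files here", "show files"],
    y_values=["ls"],
    x_type="CommandSequence",  # hypothetical type name
    y_type="CommandSequence",  # hypothetical type name
    y_preferences=[1.0],
)
for example in index.get_nearest_examples("list all my files", max_results=5):
    print(example.xquery, "->", example.ytext)  # attribute names assumed from get_scheme()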
Example No. 15
def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, **kwargs):
    if not tokenizer:
        tokenizer = RegexTokenizer()
    return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, **kwargs)
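The typical use case is substring matching: index word-internal n-grams on a text field, with phrase search disabled because n-gram positions do not correspond to word positions:

from whoosh.fields import Schema, TEXT

ngram_ana = NgramWordAnalyzer(minsize=3, maxsize=5)
schema = Schema(title=TEXT(analyzer=ngram_ana, phrase=False, stored=True))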