Example #1
    def __init__(self, *patterns):
        self.patterns = patterns
        self.exprs = [rcompile(pat, re.IGNORECASE) for pat in self.patterns]

        self.pattern = ("(?P<month>" +
                        "|".join("(%s)" % pat for pat in self.patterns) + ")")
        self.expr = rcompile(self.pattern, re.IGNORECASE)
Example #2
    def __init__(self, *patterns):
        self.patterns = patterns
        self.exprs = [rcompile(pat, re.IGNORECASE) for pat in self.patterns]

        self.pattern = ("(?P<month>"
                        + "|".join("(%s)" % pat for pat in self.patterns)
                        + ")")
        self.expr = rcompile(self.pattern, re.IGNORECASE)
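
A quick sketch of how the combined alternation behaves; the month patterns here are hypothetical, and plain re.compile stands in for rcompile:

import re

# Hypothetical month patterns; each becomes one branch of the alternation
patterns = ("jan(uary)?", "feb(ruary)?", "mar(ch)?")
pattern = "(?P<month>" + "|".join("(%s)" % pat for pat in patterns) + ")"
expr = re.compile(pattern, re.IGNORECASE)

print(expr.match("February 12").group("month"))  # February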
Example #3
 def __init__(self, next, last, daynames):
     self.next_pattern = next
     self.last_pattern = last
     self._dayname_exprs = tuple(rcompile(pat, re.IGNORECASE)
                                 for pat in daynames)
     dn_pattern = "|".join(daynames)
     self.pattern = ("(?P<dir>%s|%s) +(?P<day>%s)(?=(\\W|$))"
                     % (next, last, dn_pattern))
     self.expr = rcompile(self.pattern, re.IGNORECASE)
Example #4
 def __init__(self, next, last, daynames):
     self.next_pattern = next
     self.last_pattern = last
     self._dayname_exprs = tuple(
         rcompile(pat, re.IGNORECASE) for pat in daynames)
     dn_pattern = "|".join(daynames)
     self.pattern = ("(?P<dir>%s|%s) +(?P<day>%s)(?=(\\W|$))" %
                     (next, last, dn_pattern))
     self.expr = rcompile(self.pattern, re.IGNORECASE)
Example #5
    def __init__(self,
                 elements,
                 sep="(\\s+|\\s*,\\s*)",
                 onceper=True,
                 requireall=False,
                 allof=None,
                 anyof=None,
                 name=None):
        """
        :param elements: the sub-elements to parse.
        :param sep: a separator regular expression to match between elements,
            or None to not have separators.
        :param onceper: only allow each element to match once.
        :param requireall: if True, the sub-elements can match in any order,
            but they must all match.
        :param allof: a list of indexes into the list of elements. When this
            argument is not None, this element matches only if all the
            indicated sub-elements match.
        :param anyof: a list of indexes into the list of elements. When this
            argument is not None, this element matches only if any of the
            indicated sub-elements match.
        :param name: a name for this element (for debugging purposes only).
        """

        super(Bag, self).__init__(elements, name)
        self.sep_expr = rcompile(sep, re.IGNORECASE)
        self.onceper = onceper
        self.requireall = requireall
        self.allof = allof
        self.anyof = anyof
Example #6
class RangePlugin(Plugin):
    """Adds the ability to specify term ranges.
    """

    expr = rcompile(r"""
    (?P<open>\{|\[)               # Open bracket
    (?P<start>
        ('[^']*?'\s+)             # single-quoted
        |                         # or
        ([^\]}]+?(?=[Tt][Oo]))    # everything until "to"
    )?
    [Tt][Oo]                      # "to"
    (?P<end>
        (\s+'[^']*?')             # single-quoted
        |                         # or
        ([^\]}]+?)                # everything until "]" or "}"
    )?
    (?P<close>}|])                # Close bracket
    """,
                    verbose=True)

    class RangeTagger(RegexTagger):
        def __init__(self, expr, excl_start, excl_end):
            self.expr = expr
            self.excl_start = excl_start
            self.excl_end = excl_end

        def create(self, parser, match):
            start = match.group("start")
            end = match.group("end")
            if start:
                # Strip the space before the "to"
                start = start.rstrip()
                # Strip single quotes
                if start.startswith("'") and start.endswith("'"):
                    start = start[1:-1]
            if end:
                # Strip the space after the "to"
                end = end.lstrip()
                # Strip single quotes
                if end.startswith("'") and end.endswith("'"):
                    end = end[1:-1]
            # What kind of open and close brackets were used?
            startexcl = match.group("open") == self.excl_start
            endexcl = match.group("close") == self.excl_end

            rn = syntax.RangeNode(start, end, startexcl, endexcl)
            return rn

    def __init__(self, expr=None, excl_start="{", excl_end="}"):
        self.expr = expr or self.expr
        self.excl_start = excl_start
        self.excl_end = excl_end

    def taggers(self, parser):
        tagger = self.RangeTagger(self.expr, self.excl_start, self.excl_end)
        return [(tagger, 1)]
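
A hedged usage sketch: RangePlugin is among QueryParser's default plugins, and a curly bracket makes that end of the range exclusive:

from whoosh.fields import Schema, ID
from whoosh.qparser import QueryParser

parser = QueryParser("name", Schema(name=ID))  # RangePlugin installed by default
q = parser.parse(u"name:{apple to bear]")
print(q)  # name:{apple TO bear]  (exclusive start, inclusive end)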
Example #7
    def __init__(self, pattern, replacement):
        """
        :param pattern: a pattern string or compiled regular expression object
            describing the text to replace.
        :param replacement: the substitution text.
        """

        self.pattern = rcompile(pattern)
        self.replacement = replacement
Example #9
 def __init__(self, expression=default_pattern, gaps=False):
     """
     :param expression: A regular expression object or string; the default is
         rcompile(r"\w+(\.?\w+)*"). Each match of the expression equals a
         token. Group 0 (the entire matched text) is used as the text of the
         token. If you require more complicated handling of the expression
         match, simply write your own tokenizer.
     :param gaps: If True, the tokenizer *splits* on the expression rather
         than matching on it.
     self.expression = rcompile(expression)
     self.gaps = gaps
Example #10
    def __init__(self, expression=default_pattern, gaps=False):
        """
        :param expression: A regular expression object or string. Each match
            of the expression equals a token. Group 0 (the entire matched text)
            is used as the text of the token. If you require more complicated
            handling of the expression match, simply write your own tokenizer.
        :param gaps: If True, the tokenizer *splits* on the expression, rather
            than matching on the expression.
        """

        self.expression = rcompile(expression)
        self.gaps = gaps
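
To illustrate the two modes, a small sketch using whoosh.analysis.RegexTokenizer: matching yields the tokens themselves, while gaps=True splits on the expression instead:

from whoosh.analysis import RegexTokenizer

matcher = RegexTokenizer(r"\w+")
print([t.text for t in matcher(u"hi, there")])    # ['hi', 'there']

splitter = RegexTokenizer(r",\s*", gaps=True)
print([t.text for t in splitter(u"hi, there")])   # ['hi', 'there']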
Example #12
    def __init__(self, years, months, weeks, days, hours, minutes, seconds):
        rel_years = "((?P<years>[0-9]+) *(%s))?" % years
        rel_months = "((?P<months>[0-9]+) *(%s))?" % months
        rel_weeks = "((?P<weeks>[0-9]+) *(%s))?" % weeks
        rel_days = "((?P<days>[0-9]+) *(%s))?" % days
        rel_hours = "((?P<hours>[0-9]+) *(%s))?" % hours
        rel_mins = "((?P<mins>[0-9]+) *(%s))?" % minutes
        rel_secs = "((?P<secs>[0-9]+) *(%s))?" % seconds

        self.pattern = ("(?P<dir>[+-]) *%s *%s *%s *%s *%s *%s *%s(?=(\\W|$))"
                        % (rel_years, rel_months, rel_weeks, rel_days,
                           rel_hours, rel_mins, rel_secs))
        self.expr = rcompile(self.pattern, re.IGNORECASE)
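
A sketch of the assembled pattern in action; the unit patterns below are hypothetical stand-ins for the English ones passed in by the caller:

import re

def rel(name, pat):
    return "((?P<%s>[0-9]+) *(%s))?" % (name, pat)

pattern = ("(?P<dir>[+-]) *%s *%s *%s *%s *%s *%s *%s(?=(\\W|$))"
           % (rel("years", "years?|yrs?"), rel("months", "months?"),
              rel("weeks", "weeks?|wks?"), rel("days", "days?"),
              rel("hours", "hours?|hrs?"), rel("mins", "minutes?|mins?"),
              rel("secs", "seconds?|secs?")))
m = re.compile(pattern, re.IGNORECASE).match("+2 weeks 3 days")
print(m.group("dir"), m.group("weeks"), m.group("days"))  # + 2 3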
Example #14
    def create(self, in_memory=False):
        tokenizer_pattern = rcompile(r"(\w|·)+(\.?(\w|·)+)*") # Includes l·l
        analyzer = StandardAnalyzer(minsize=1, stoplist=None, expression=tokenizer_pattern)
        schema = Schema(verb_form=TEXT(stored=True, sortable=True, analyzer=analyzer),
                        index_letter=TEXT(stored=True, analyzer=analyzer),
                        file_path=TEXT(stored=True, sortable=True))

        if os.path.exists(self.dir_name):
            shutil.rmtree(self.dir_name)

        os.makedirs(self.dir_name)

        ix = create_in(self.dir_name, schema)

        self.writer = ix.writer()
        return ix
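
The custom expression keeps the Catalan ela geminada ("l·l") inside a single token, which the default \w-based pattern would split; a minimal check:

from whoosh.analysis import StandardAnalyzer
from whoosh.util.text import rcompile

pattern = rcompile(r"(\w|·)+(\.?(\w|·)+)*")
analyzer = StandardAnalyzer(minsize=1, stoplist=None, expression=pattern)
print([t.text for t in analyzer(u"col·legi obert")])  # ['col·legi', 'obert']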
Example #15
    def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", name=None,
                 progressive=False):
        """
        :param elements: the sequence of sub-elements to parse.
        :param sep: a separator regular expression to match between elements,
            or None to not have separators.
        :param name: a name for this element (for debugging purposes only).
        :param progressive: if True, elements after the first do not need to
            match. That is, for elements (a, b, c) and progressive=True, the
            sequence matches like ``a[b[c]]``.
        """

        super(Sequence, self).__init__(elements, name)
        self.sep_pattern = sep
        if sep:
            self.sep_expr = rcompile(sep, re.IGNORECASE)
        else:
            self.sep_expr = None
        self.progressive = progressive
Example #17
    def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", onceper=True,
                 requireall=False, allof=None, anyof=None, name=None):
        """
        :param elements: the sub-elements to parse.
        :param sep: a separator regular expression to match between elements,
            or None to not have separators.
        :param onceper: only allow each element to match once.
        :param requireall: if True, the sub-elements can match in any order,
            but they must all match.
        :param allof: a list of indexes into the list of elements. When this
            argument is not None, this element matches only if all the
            indicated sub-elements match.
        :param anyof: a list of indexes into the list of elements. When this
            argument is not None, this element matches only if any of the
            indicated sub-elements match.
        :param name: a name for this element (for debugging purposes only).
        """

        super(Bag, self).__init__(elements, name)
        self.sep_expr = rcompile(sep, re.IGNORECASE)
        self.onceper = onceper
        self.requireall = requireall
        self.allof = allof
        self.anyof = anyof
Example #18
 def __init__(self, expression="[^/]+"):
     self.expr = rcompile(expression)
Example #19
# an optional list to override this one.

STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
                        'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
                        'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
                        'to', 'us', 'we', 'when', 'will', 'with', 'yet',
                        'you', 'your'))


# Simple pattern for filtering URLs, may be useful

url_pattern = rcompile("""
(
    [A-Za-z+]+://          # URL protocol
    \\S+?                  # URL body
    (?=\\s|[.]\\s|$|[.]$)  # Stop at space/end, or a dot followed by space/end
) | (                      # or...
    \\w+([:.]?\\w+)*       # word characters, with opt. internal colons/dots
)
""", verbose=True)

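For instance, the lookahead keeps a sentence-final period out of the URL branch's match:

m = url_pattern.match("https://example.com/docs.")
print(m.group(0))  # https://example.com/docs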

# Filters

class Filter(Composable):
    """Base class for Filter objects. A Filter subclass must implement a
    filter() method that takes a single argument, which is an iterator of Token
    objects, and yields a series of Token objects in return.

    Filters that do morphological transformation of tokens (e.g. stemming)
    should set their ``is_morph`` attribute to True.
Example #20
File: index.py Project: AXAz0r/Soph
                self.seen.add(candidate[self.field])
            return (candidate["user"], candidate[self.field])


def deduper(it, dedupe=True, field="content"):
    seen = set()
    for item in it:
        if dedupe and item[field] in seen:
            continue
        if dedupe:
            seen.add(item[field])
        yield (item["user"], item[field])
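
A quick check of the generator with made-up rows:

rows = [{"user": "a", "content": "hi"}, {"user": "b", "content": "hi"}]
print(list(deduper(rows)))  # [('a', 'hi')] -- the repeated text is dropped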


tok_pat = rcompile(r"[+£€]?\w+(\.?\w+)*")
STOP_WORDS = frozenset(
    ('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'for', 'from',
     'have', 'if', 'in', 'is', 'it', 'may', 'not', 'of', 'on', 'or', 'tbd',
     'that', 'the', 'this', 'to', 'when', 'will', 'with', 'yet'))


def Analyzer(expression=tok_pat,
             stoplist=None,
             minsize=1,
             maxsize=None,
             gaps=False):
    if stoplist is None:
        stoplist = STOP_WORDS
    return whoosh.analysis.StandardAnalyzer(expression=expression,
                                            stoplist=stoplist,
                                            minsize=minsize,
                                            maxsize=maxsize,
                                            gaps=gaps)
Example #21
 def __init__(self, expr):
     self.expr = rcompile(expr)
Example #22
 def __init__(self):
     self.tokenizer_pattern = rcompile(r"(\w|·)+(\.?(\w|·)+)*") # Includes l·l
     self.analyzer = StandardAnalyzer(minsize=1, stoplist=None, expression=self.tokenizer_pattern)
Example #23
 def __init__(self):
     self.pattern = ("(?P<hour>[1-9]|10|11|12)(:(?P<mins>[0-5][0-9])"
                     "(:(?P<secs>[0-5][0-9])(\\.(?P<usecs>[0-9]{1,5}))?)?)?"
                     "\\s*(?P<ampm>am|pm)(?=(\\W|$))")
     self.expr = rcompile(self.pattern, re.IGNORECASE)
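
The pattern in action, with plain re.compile standing in for rcompile:

import re

pattern = ("(?P<hour>[1-9]|10|11|12)(:(?P<mins>[0-5][0-9])"
           "(:(?P<secs>[0-5][0-9])(\\.(?P<usecs>[0-9]{1,5}))?)?)?"
           "\\s*(?P<ampm>am|pm)(?=(\\W|$))")
m = re.compile(pattern, re.IGNORECASE).match("3:45pm")
print(m.group("hour"), m.group("mins"), m.group("ampm"))  # 3 45 pm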
Example #24
class SimpleVersion(BaseVersion):
    """An object that parses version numbers such as::

        12.2.5b

    The filter supports a limited subset of PEP 386 versions including::

        1
        1.2
        1.2c
        1.2c3
        1.2.3
        1.2.3a
        1.2.3b4
        10.7.5rc1
        999.999.999c999
    """

    _version_exp = rcompile(r"""
    ^
    (?P<major>\d{1,4})
    (
        [.](?P<minor>\d{1,4})
        (
            [.](?P<release>\d{1,4})
        )?
        (
            (?P<ex>[abc]|rc)
            (?P<exnum>\d{1,4})?
        )?
    )?
    $
    """,
                            verbose=True)

    # (groupid, conversion method)
    _parts = [
        ("major", int),
        ("minor", int),
        ("release", int),
        ("ex", str),
        ("exnum", int),
    ]

    _ex_bits = {"a": 0, "b": 1, "c": 2, "rc": 10, "z": 15}
    _bits_ex = dict((v, k) for k, v in _ex_bits.items())

    __slots__ = ("major", "minor", "release", "ex", "exnum")

    def __init__(self, major=1, minor=0, release=0, ex="z", exnum=0):
        self.major = major
        self.minor = minor
        self.release = release
        self.ex = ex
        self.exnum = exnum

    def to_int(self):
        assert self.major < 1024
        n = self.major << 34

        assert self.minor < 1024
        n |= self.minor << 24

        assert self.release < 1024
        n |= self.release << 14

        exbits = self._ex_bits.get(self.ex, 15)
        n |= exbits << 10

        assert self.exnum < 1024
        n |= self.exnum

        return n

    @classmethod
    def from_int(cls, n):
        major = (n & (1023 << 34)) >> 34
        minor = (n & (1023 << 24)) >> 24
        release = (n & (1023 << 14)) >> 14
        exbits = (n & (15 << 10)) >> 10  # 4 bits: "rc" (10) and "z" (15) set the high bit
        ex = cls._bits_ex.get(exbits, "z")
        exnum = n & 1023

        return cls(major, minor, release, ex, exnum)
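
With the 4-bit mask in from_int, packing is reversible and order-preserving; a quick sanity check, assuming the class above is in scope:

v = SimpleVersion(10, 7, 5, "rc", 1)  # i.e. "10.7.5rc1"
n = v.to_int()
assert SimpleVersion.from_int(n).to_int() == n

# A final release (ex "z") packs higher than any prerelease of the same version
assert SimpleVersion(1, 2, 3).to_int() > SimpleVersion(1, 2, 3, "a", 1).to_int()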
Example #25
class FunctionPlugin(TaggingPlugin):
    """Adds an arbitrary "function call" syntax to the query parser to allow
    advanced and extensible query functionality.

    This is unfinished and experimental.
    """

    expr = rcompile("""
    [#](?P<name>[A-Za-z_][A-Za-z0-9._]*)  # function name
    (                                     # optional args
        \\[                               # inside square brackets
        (?P<args>.*?)
        \\]
    )?
    """,
                    verbose=True)

    class FunctionNode(syntax.SyntaxNode):
        has_fieldname = False
        has_boost = True
        merging = False

        def __init__(self, name, fn, args, kwargs):
            self.name = name
            self.fn = fn
            self.args = args
            self.kwargs = kwargs
            self.nodes = []
            self.boost = None

        def __repr__(self):
            return "#%s<%r>(%r)" % (self.name, self.args, self.nodes)

        def query(self, parser):
            qs = [n.query(parser) for n in self.nodes]
            kwargs = self.kwargs
            if "boost" not in kwargs and self.boost is not None:
                kwargs["boost"] = self.boost
            # TODO: If this call raises an exception, return an error query
            return self.fn(qs, *self.args, **kwargs)

    def __init__(self, fns):
        """
        :param fns: a dictionary mapping names to functions that return a
            query.
        """

        self.fns = fns

    def create(self, parser, match):
        name = match.group("name")
        if name in self.fns:
            fn = self.fns[name]
            argstring = match.group("args")
            if argstring:
                args, kwargs = self._parse_args(argstring)
            else:
                args = ()
                kwargs = {}
            return self.FunctionNode(name, fn, args, kwargs)

    def _parse_args(self, argstring):
        args = []
        kwargs = {}

        parts = argstring.split(",")
        for part in parts:
            if "=" in part:
                name, value = part.split("=", 1)
                # Wrap with str() because Python 2.5 can't handle unicode kws
                name = str(name.strip())
            else:
                name = None
                value = part

            value = value.strip()
            if value.startswith("'") and value.endswith("'"):
                value = value[1:-1]

            if name:
                kwargs[name] = value
            else:
                args.append(value)

        return args, kwargs

    def filters(self, parser):
        return [(self.do_functions, 600)]

    def do_functions(self, parser, group):
        newgroup = group.empty_copy()
        i = 0
        while i < len(group):
            node = group[i]
            if (isinstance(node, self.FunctionNode) and i < len(group) - 1
                    and isinstance(group[i + 1], syntax.GroupNode)):
                nextnode = group[i + 1]
                node.nodes = list(self.do_functions(parser, nextnode))

                if nextnode.boost != 1:
                    node.set_boost(nextnode.boost)

                i += 1
            elif isinstance(node, syntax.GroupNode):
                node = self.do_functions(parser, node)

            newgroup.append(node)
            i += 1
        return newgroup
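
A sketch of wiring the plugin up; the #term function here is hypothetical and simply builds a Term query on another field from its argument:

from whoosh import query
from whoosh.qparser import QueryParser

def make_term(qs, text):
    # qs holds queries from an optional following group; unused here
    return query.Term("tag", text)

parser = QueryParser("content", schema=None)
parser.add_plugin(FunctionPlugin({"term": make_term}))
print(parser.parse(u"#term[hello] world"))  # roughly (tag:hello AND content:world)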
Example #26
 def __init__(self, pattern, fn=None, modify=None):
     self.pattern = pattern
     self.expr = rcompile(pattern, re.IGNORECASE)
     self.fn = fn
     self.modify = modify
Example #29
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.compat import u, text_type
from whoosh.analysis.acore import Composable, Token
from whoosh.util.text import rcompile

default_pattern = rcompile(r"\w+(\.?\w+)*")

# Tokenizers


class Tokenizer(Composable):
    """Base class for Tokenizers.
    """
    def __eq__(self, other):
        return other and self.__class__ is other.__class__


class IDTokenizer(Tokenizer):
    """Yields the entire input string as a single token. For use in indexed but
    untokenized fields, such as a document's path.
Example #30
 def __init__(self, plugin, expr):
     self.plugin = plugin
     self.expr = rcompile(expr, re.IGNORECASE)
Example #31
File: plugins.py Project: JunjieHu/dl
 def __init__(self, expr=None):
     self.expr = rcompile(expr or self.expr)
Example #33
class PhrasePlugin(Plugin):
    """Adds the ability to specify phrase queries inside double quotes.
    """

    # Didn't use TaggingPlugin because I need to add slop parsing at some
    # point

    # Expression used to find words if a schema isn't available
    wordexpr = rcompile(r'\S+')

    class PhraseNode(syntax.TextNode):
        def __init__(self, text, textstartchar, slop=1):
            syntax.TextNode.__init__(self, text)
            self.textstartchar = textstartchar
            self.slop = slop

        def r(self):
            return "%s %r~%s" % (self.__class__.__name__, self.text, self.slop)

        def apply(self, fn):
            # A phrase node is a leaf; there are no child nodes to apply
            # fn to, so return the node unchanged
            return self

        def query(self, parser):
            text = self.text
            fieldname = self.fieldname or parser.fieldname

            # We want to process the text of the phrase into "words" (tokens),
            # and also record the startchar and endchar of each word

            sc = self.textstartchar
            if parser.schema and fieldname in parser.schema:
                field = parser.schema[fieldname]
                if field.analyzer:
                    # We have a field with an analyzer, so use it to parse
                    # the phrase into tokens
                    tokens = field.tokenize(text, mode="query", chars=True)
                    words = []
                    char_ranges = []
                    for t in tokens:
                        words.append(t.text)
                        char_ranges.append((sc + t.startchar, sc + t.endchar))
                else:
                    # We have a field but it doesn't have a format object,
                    # for some reason (it's self-parsing?), so use process_text
                    # to get the texts (we won't know the start/end chars)
                    words = list(field.process_text(text, mode="query"))
                    char_ranges = [(None, None)] * len(words)
            else:
                # We're parsing without a schema, so just use the default
                # regular expression to break the text into words
                words = []
                char_ranges = []
                for match in PhrasePlugin.wordexpr.finditer(text):
                    words.append(match.group(0))
                    char_ranges.append((sc + match.start(), sc + match.end()))

            qclass = parser.phraseclass
            q = qclass(fieldname,
                       words,
                       slop=self.slop,
                       boost=self.boost,
                       char_ranges=char_ranges)
            return attach(q, self)

    class PhraseTagger(RegexTagger):
        def create(self, parser, match):
            text = match.group("text")
            textstartchar = match.start("text")
            slopstr = match.group("slop")
            slop = int(slopstr) if slopstr else 1
            return PhrasePlugin.PhraseNode(text, textstartchar, slop)

    def __init__(self, expr='"(?P<text>.*?)"(~(?P<slop>[1-9][0-9]*))?'):
        self.expr = expr

    def taggers(self, parser):
        return [(self.PhraseTagger(self.expr), 0)]
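
PhrasePlugin is part of QueryParser's default plugin set, so the slop syntax works out of the box:

from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser

parser = QueryParser("content", Schema(content=TEXT))
q = parser.parse(u'"hello there"~2')
print(q.__class__.__name__, q.words, q.slop)  # Phrase ['hello', 'there'] 2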
Example #34
from tempfile import TemporaryDirectory

from whoosh import qparser
from whoosh.analysis import RegexTokenizer
from whoosh.util.text import rcompile

tokenizer = RegexTokenizer(expression=rcompile(r"[\w/.]+"))
for token in tokenizer(u"Hello there templates/app1/test.html!"):
    print(repr(token.text))

from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

tmp_dir = TemporaryDirectory()

schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))))
ix = create_in(tmp_dir.name, schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a",
                    content=u"this/is/a/test.html")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"this/is/a/hello.html   hello a yup")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"this is a hello.html   hello a yup")
writer.commit()
from whoosh.qparser import QueryParser

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema)
Example #37
# -*- coding: utf-8 -*-

from whoosh.fields import *
from whoosh.formats import Positions, Characters
from whoosh.analysis import StandardAnalyzer, Tokenizer, RegexTokenizer, NgramFilter
from whoosh.analysis.filters import Filter, PassFilter
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.acore import Token
from whoosh.util.text import rcompile
from whoosh.query import And, Or, Term, FuzzyTerm


INDEXDIR = 'var/indexdir'

ENGLISH = rcompile(r"[a-zA-Z0-9_]+(\.?[a-zA-Z0-9_]+)*")

tokenizer = RegexTokenizer(r"[a-zA-Z0-9_]+(\.?[a-zA-Z0-9_]+)*|\w+")
ngram = NgramFilter(minsize=2, maxsize=2)
lower = LowercaseFilter()


class StreamSplitter(object):

    def __init__(self, tokens):
        self.tokens = tokens

        try:
            self.current = next(self.tokens)
            self.end = False
        except StopIteration:
Example #38
    "when",
    "will",
    "with",
    "yet",
    "you",
    "your",
))

# Simple pattern for filtering URLs, may be useful

url_pattern = rcompile(
    """
(
    [A-Za-z+]+://          # URL protocol
    \\S+?                  # URL body
    (?=\\s|[.]\\s|$|[.]$)  # Stop at space/end, or a dot followed by space/end
) | (                      # or...
    \\w+([:.]?\\w+)*       # word characters, with opt. internal colons/dots
)
""",
    verbose=True,
)

# Filters


class Filter(Composable):
    """Base class for Filter objects. A Filter subclass must implement a
    filter() method that takes a single argument, which is an iterator of Token
    objects, and yields a series of Token objects in return.

    Filters that do morphological transformation of tokens (e.g. stemming)
Example #39
import logging

from dataprocessing.utils import PoorDoc
import numpy as np
from whoosh.index import open_dir
from whoosh.query import DateRange
from config import indexdir, vectordir, dociddir
from dateutil import rrule
from datetime import datetime
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from whoosh.analysis import StandardAnalyzer
from whoosh.util.text import rcompile

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logger = logging.getLogger("generate month-vectors")

default_pattern = rcompile(r"\w+(\.?\w+)*")
def RegexTokenizer(text):
    for match in default_pattern.finditer(text):
        term = match.group(0)
        yield term.lower().encode("utf-8")

def getdocvector(date, didentifier):
    doc = PoorDoc(docidentifier=didentifier, date=date)
    tokens = RegexTokenizer(doc.getcontent())
    return Counter(tokens)

def get_months(batchnumber, n_batches):
    '''
    Returns a list of month ranges as part of the total set of month ranges.
    '''
    month_range = []
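
For reference, the RegexTokenizer generator defined earlier in this file lower-cases and UTF-8-encodes each match:

print(list(RegexTokenizer("Hello there, World")))
# [b'hello', b'there', b'world']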
Example #41
try:
    from find_stuff.chmfile import SimpleChmFile
except ImportError:
    logger.warning("failed to import chm packages")

try:
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
except ImportError:
    logger.warning("failed to import rtf packages")

stem_ana = StemmingAnalyzer() | CJKFilter()
stem_ana.cachesize = -1

pattern2 = rcompile(r"[A-Za-z0-9]+(\.?[A-Za-z0-9]+)*")
stem_ana2 = StemmingAnalyzer(expression=pattern2) | CJKFilter()
stem_ana2.cachesize = -1

schema = Schema(title=TEXT(analyzer=stem_ana2, stored=True),
                content=TEXT(analyzer=stem_ana),
                time=STORED,
                path=ID(stored=True),
                real_path=STORED,
                filetype=ID)

handlers = {}


class TxtHandler(object):
    def extract_content(self, filepath):
Example #42
 def __init__(self):
     self.expression = rcompile(r'(\W|_)')
Example #43
try:
    from find_stuff.chmfile import SimpleChmFile
except ImportError:
    logger.warning("failed to import chm packages")

try:
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
except ImportError:
    logger.warning("failed to import rtf packages")


stem_ana = StemmingAnalyzer() | CJKFilter()
stem_ana.cachesize = -1

pattern2 = rcompile(r"[A-Za-z0-9]+(\.?[A-Za-z0-9]+)*")
stem_ana2 = StemmingAnalyzer(expression=pattern2) | CJKFilter()
stem_ana2.cachesize = -1


schema = Schema(title=TEXT(analyzer=stem_ana2, stored=True),
                content=TEXT(analyzer=stem_ana),
                time=STORED,
                path=ID(stored=True),
                real_path=STORED,
                filetype=ID)

handlers = {}

class TxtHandler(object):
    
    def extract_content(self, filepath):
        with codecs.open(filepath, encoding='utf-8') as fh:
            return fh.read()

Example #44
class FuzzyTermPlugin(TaggingPlugin):
    """Adds syntax to the query parser to create "fuzzy" term queries, which
    match any term within a certain "edit distance" (number of inserted,
    deleted, or transposed characters) by appending a tilde (``~``) and an
    optional maximum edit distance to a term. If you don't specify an explicit
    maximum edit distance, the default is 1.

    >>> qp = qparser.QueryParser("content", myschema)
    >>> qp.add_plugin(qparser.FuzzyTermPlugin())
    >>> q = qp.parse("Stephen~2 Colbert")

    For example, the following query creates a :class:`whoosh.query.FuzzyTerm`
    query with a maximum edit distance of 1::

        bob~

    The following creates a fuzzy term query with a maximum edit distance of
    2::

        bob~2

    The maximum edit distance can only be a single digit. Note that edit
    distances greater than 2 can take an extremely long time and are generally
    not useful.

    You can specify a prefix length using ``~n/m``. For example, to allow a
    maximum edit distance of 2 and require a prefix match of 3 characters::

        johannson~2/3

    To specify a prefix with the default edit distance::

        johannson~/3
    """

    expr = rcompile("""
    (?<=\\S)                          # Only match right after non-space
    ~                                 # Initial tilde
    (?P<maxdist>[0-9])?               # Optional maxdist
    (/                                # Optional prefix slash
        (?P<prefix>[1-9][0-9]*)       # prefix
    )?                                # (end prefix group)
    """,
                    verbose=True)

    class FuzzinessNode(syntax.SyntaxNode):
        def __init__(self, maxdist, prefixlength, original):
            self.maxdist = maxdist
            self.prefixlength = prefixlength
            self.original = original

        def __repr__(self):
            return "<~%d/%d>" % (self.maxdist, self.prefixlength)

    class FuzzyTermNode(syntax.TextNode):
        qclass = query.FuzzyTerm

        def __init__(self, wordnode, maxdist, prefixlength):
            self.fieldname = wordnode.fieldname
            self.text = wordnode.text
            self.boost = wordnode.boost
            self.startchar = wordnode.startchar
            self.endchar = wordnode.endchar
            self.maxdist = maxdist
            self.prefixlength = prefixlength

        def r(self):
            return "%r ~%d/%d" % (self.text, self.maxdist, self.prefixlength)

        def query(self, parser):
            # Use the superclass's query() method to create a FuzzyTerm query
            # (it looks at self.qclass), just because it takes care of some
            # extra checks and attributes
            q = syntax.TextNode.query(self, parser)
            # Set FuzzyTerm-specific attributes
            q.maxdist = self.maxdist
            q.prefixlength = self.prefixlength
            return q

    def create(self, parser, match):
        mdstr = match.group("maxdist")
        maxdist = int(mdstr) if mdstr else 1

        pstr = match.group("prefix")
        prefixlength = int(pstr) if pstr else 0

        return self.FuzzinessNode(maxdist, prefixlength, match.group(0))

    def filters(self, parser):
        return [(self.do_fuzzyterms, 0)]

    def do_fuzzyterms(self, parser, group):
        newgroup = group.empty_copy()
        i = 0
        while i < len(group):
            node = group[i]
            if i < len(group) - 1 and isinstance(node, syntax.WordNode):
                nextnode = group[i + 1]
                if isinstance(nextnode, self.FuzzinessNode):
                    node = self.FuzzyTermNode(node, nextnode.maxdist,
                                              nextnode.prefixlength)
                    i += 1
            if isinstance(node, self.FuzzinessNode):
                node = syntax.to_word(node)
            if isinstance(node, syntax.GroupNode):
                node = self.do_fuzzyterms(parser, node)

            newgroup.append(node)
            i += 1
        return newgroup
Example #45
def find_unused_templates():
    start = time.perf_counter()
    print('Finding all unused templates...')
    print('  Getting global templates...')
    global_templates_files, global_templates = find_global_templates()
    print('   Done.\n  Getting app templates...')
    app_templates_files, app_templates = find_app_templates()
    print('   Done.')
    templates = global_templates + app_templates
    template_files = global_templates_files + app_templates_files
    # templates.sort()
    template_files.sort()

    print('  Getting python files...')
    py_files, pys = find_py_files()
    print('   Done.')
    all_files = py_files + template_files

    tl_count = [0 for t in templates]
    unused_templates = []

    print('  Creating Index', end='')
    tmp_dir = TemporaryDirectory()

    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))))
    ix = create_in(tmp_dir.name, schema)
    writer = ix.writer()

    for filename in all_files:
        print('.', end='')  # , flush=True)
        with open(filename, 'r') as f:
            # print('WHOOSH', filename, filename, f)
            # content = '\n'.join(f.readlines())
            # if content:
            #     print('HAS CONTENT')
            #     print(content)
            u_filename = filename
            try:  # Python2
                u_filename = unicode(filename)
            except NameError:
                pass
            writer.add_document(title=u_filename, path=u_filename,
                                content=six.u('\n'.join(f.readlines())))
                                # content=content)
    print('')  # , flush=True)
    writer.commit()
    print('   Done.')

    print('  Searching through templates for references', end='')  # , flush=True)
    with ix.searcher() as searcher:
        for count, template in enumerate(templates):
            print('.', end="")  # , flush=True)
            query = QueryParser("content", ix.schema).parse(template)
            results = searcher.search(query)
            if len(results) < 1:
                unused_templates.append(template)
    print('')  # , flush=True)
    print('   Done.')

    if not unused_templates:
        print('No unused templates found.')
    else:
        print('\nUnused templates:')
        for template in unused_templates:
            print(template)
    end = time.perf_counter()
    print('Finished in ' + str(end - start) + ' seconds.')
    return unused_templates