Example #1
File: wordnet.py  Project: archatas/whoosh
def parse_file(f):
    """Parses the WordNet wn_s.pl prolog file and returns two dictionaries:
    word2nums and num2words.
    """
    
    word2nums = defaultdict(list)
    num2words = defaultdict(list)
    
    for line in f:
        if not line.startswith("s("):
            continue
        
        line = line[2:]
        num = int(line[:line.find(",")])
        qt = line.find("'")
        line = line[qt+1:]
        qt = line.find("'")
        word = line[:qt].lower()

        if not word.isalpha():
            continue
        
        word2nums[word].append(num)
        num2words[num].append(word)
    
    return word2nums, num2words
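A quick way to exercise parse_file is to hand it a couple of fabricated wn_s.pl-style lines; any iterable of strings works because the function only loops over its argument. The sample lines below are made up for illustration, and the excerpt above also needs "from collections import defaultdict" to run.

sample = ["s(100001740,1,'entity',n,1,11).\n",
          "s(100002137,1,'thing',n,1,0).\n"]   # fabricated input lines
word2nums, num2words = parse_file(sample)
print(dict(word2nums))         # {'entity': [100001740], 'thing': [100002137]}
print(num2words[100002137])    # ['thing']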
Example #2
File: wordnet.py  Project: ra2003/whoosh
def parse_file(f):
    """Parses the WordNet wn_s.pl prolog file and returns two dictionaries:
    word2nums and num2words.
    """

    word2nums = defaultdict(list)
    num2words = defaultdict(list)

    for line in f:
        if not line.startswith("s("):
            continue

        line = line[2:]
        num = int(line[:line.find(",")])
        qt = line.find("'")
        line = line[qt + 1:]
        qt = line.find("'")
        word = line[:qt].lower()

        if not word.isalpha():
            continue

        word2nums[word].append(num)
        num2words[num].append(word)

    return word2nums, num2words
Example #3
File: paicehusk.py  Project: ra2003/whoosh
    def __init__(self, ruletable):
        """
        :param ruletable: a string containing the rule data, separated
            by newlines.
        """
        self.rules = defaultdict(list)
        self.read_rules(ruletable)
Example #4
File: classify.py  Project: archatas/whoosh
    def __init__(self, ixreader, fieldname, model=Bo1Model):
        """
        :param ixreader: A :class:`whoosh.reading.IndexReader` object.
        :param fieldname: The name of the field in which to search.
        :param model: (classify.ExpansionModel) The model to use for expanding
            the query terms. If you omit this parameter, the expander uses
            scoring.Bo1Model by default.
        """

        self.fieldname = fieldname

        if type(model) is type:
            model = model(ixreader, fieldname)
        self.model = model

        # Cache the collection frequency of every term in this field. This
        # turns out to be much faster than reading each individual weight from
        # the term index as we add words.
        self.collection_freq = dict((word, freq) for word, _, freq
                                    in ixreader.iter_field(fieldname))

        # Maps words to their weight in the top N documents.
        self.topN_weight = defaultdict(float)

        # Total weight of all terms in the top N documents.
        self.top_total = 0
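The collection_freq cache above is just a dict() built from a generator over the triples yielded by ixreader.iter_field(); judging from the unpacking, each triple is (term, doc frequency, collection frequency). A toy stand-in with fabricated triples shows the resulting shape:

fake_field_terms = [("apple", 2, 5), ("pear", 1, 1)]   # fabricated (term, docfreq, collfreq)
collection_freq = dict((word, freq) for word, _, freq in fake_field_terms)
print(collection_freq)   # {'apple': 5, 'pear': 1}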
Example #5
File: classify.py  Project: ra2003/whoosh
    def __init__(self, ixreader, fieldname, model=Bo1Model):
        """
        :param ixreader: A :class:`whoosh.reading.IndexReader` object.
        :param fieldname: The name of the field in which to search.
        :param model: (classify.ExpansionModel) The model to use for expanding
            the query terms. If you omit this parameter, the expander uses
            scoring.Bo1Model by default.
        """

        self.fieldname = fieldname

        if type(model) is type:
            model = model(ixreader, fieldname)
        self.model = model

        # Cache the collection frequency of every term in this field. This
        # turns out to be much faster than reading each individual weight from
        # the term index as we add words.
        self.collection_freq = dict(
            (word, freq) for word, _, freq in ixreader.iter_field(fieldname))

        # Maps words to their weight in the top N documents.
        self.topN_weight = defaultdict(float)

        # Total weight of all terms in the top N documents.
        self.top_total = 0
Example #6
    def __init__(self, ruletable):
        """
        :param ruletable: a string containing the rule data, separated
            by newlines.
        """
        self.rules = defaultdict(list)
        self.read_rules(ruletable)
Example #7
    def word_values(self, value, doc_boost=1.0, **kwargs):
        seen = defaultdict(int)
        for t in unstopped(self.analyzer(value, **kwargs)):
            seen[t.text] += 1

        encode = self.encode
        return ((w, freq, encode((freq, doc_boost)))
                for w, freq in seen.iteritems())
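Stripped of the analyzer, the counting idiom above is defaultdict(int) used as a tally. A plain token list stands in for the analyzer output in this minimal sketch:

from collections import defaultdict

tokens = ["whoosh", "index", "whoosh", "search"]   # stand-in for analyzer tokens
seen = defaultdict(int)
for text in tokens:
    seen[text] += 1
print(seen["whoosh"])   # 2
print(seen["search"])   # 1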
Example #8
File: formats.py  Project: archatas/whoosh
    def word_values(self, value, doc_boost=1.0, **kwargs):
        seen = defaultdict(int)
        for t in unstopped(self.analyzer(value, **kwargs)):
            seen[t.text] += 1

        encode = self.encode
        return ((w, freq, encode((freq, doc_boost)))
                for w, freq in seen.iteritems())
Example #9
File: formats.py  Project: archatas/whoosh
    def word_values(self, value, start_pos=0, **kwargs):
        seen = defaultdict(list)
        for t in unstopped(self.analyzer(value, positions=True,
                                         start_pos=start_pos, **kwargs)):
            seen[t.text].append(start_pos + t.pos)

        encode = self.encode
        return ((w, len(poslist), encode(poslist))
                for w, poslist in seen.iteritems())
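The positions variant swaps the counter for defaultdict(list) and appends each term's position; enumerate() stands in for the analyzer's position tracking in this sketch:

from collections import defaultdict

seen = defaultdict(list)
for pos, text in enumerate(["to", "be", "or", "not", "to", "be"]):
    seen[text].append(pos)
print(seen["to"])   # [0, 4]
print(seen["be"])   # [1, 5]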
Example #10
File: ramindex.py  Project: archatas/whoosh
    def __init__(self, schema):
        self.schema = schema
        self.maxdoc = 0
        self._sync_lock = Lock()

        self.termlists = defaultdict(list)
        self.invertedindex = {}
        for fieldnum in xrange(len(schema)):
            self.invertedindex[fieldnum] = defaultdict(list)
        self.indexfreqs = defaultdict(int)

        self.storedfields = {}
        self.fieldlengths = defaultdict(int)
        self.fieldlength_totals = defaultdict(int)
        self.vectors = {}
        self.deleted = set()

        self._stored_to_pos = dict((fnum, i) for i, fnum
                                   in enumerate(self.schema.stored_fields()))
Example #11
File: ramindex.py  Project: ra2003/whoosh
    def __init__(self, schema):
        self.schema = schema
        self.maxdoc = 0
        self._sync_lock = Lock()

        self.termlists = defaultdict(list)
        self.invertedindex = {}
        for fieldnum in xrange(len(schema)):
            self.invertedindex[fieldnum] = defaultdict(list)
        self.indexfreqs = defaultdict(int)

        self.storedfields = {}
        self.fieldlengths = defaultdict(int)
        self.fieldlength_totals = defaultdict(int)
        self.vectors = {}
        self.deleted = set()

        self._stored_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.stored_fields()))
Example #12
File: formats.py  Project: archatas/whoosh
    def word_values(self, value, **kwargs):
        seen = defaultdict(int)
        if self.boost_as_freq:
            for t in unstopped(self.analyzer(value, boosts=True, **kwargs)):
                seen[t.text] += int(t.boost)
        else:
            for t in unstopped(self.analyzer(value, **kwargs)):
                seen[t.text] += 1

        encode = self.encode
        return ((w, freq, encode(freq)) for w, freq in seen.iteritems())
Example #13
File: formats.py  Project: archatas/whoosh
    def word_values(self, value, start_pos=0, start_char=0, **kwargs):
        seen = defaultdict(list)

        for t in unstopped(self.analyzer(value, positions=True, chars=True,
                                         start_pos=start_pos,
                                         start_char=start_char, **kwargs)):
            seen[t.text].append((t.pos, start_char + t.startchar,
                                 start_char + t.endchar))

        encode = self.encode
        return ((w, len(ls), encode(ls)) for w, ls in seen.iteritems())
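The characters variant stores (position, start char, end char) triples per term. A hand-rolled tokenizer over a short string, standing in for the analyzer, shows the shape of the stored values:

from collections import defaultdict

text = "big big fish"
seen = defaultdict(list)
char = 0
for pos, token in enumerate(text.split()):
    start = text.index(token, char)   # character offset of this token
    end = start + len(token)
    seen[token].append((pos, start, end))
    char = end
print(seen["big"])    # [(0, 0, 3), (1, 4, 7)]
print(seen["fish"])   # [(2, 8, 12)]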
Example #14
    def word_values(self, value, **kwargs):
        seen = defaultdict(int)
        if self.boost_as_freq:
            for t in unstopped(self.analyzer(value, boosts=True, **kwargs)):
                seen[t.text] += int(t.boost)
        else:
            for t in unstopped(self.analyzer(value, **kwargs)):
                seen[t.text] += 1

        encode = self.encode
        return ((w, freq, encode(freq)) for w, freq in seen.iteritems())
Example #15
    def word_values(self, value, start_pos=0, **kwargs):
        seen = defaultdict(list)
        for t in unstopped(
                self.analyzer(value,
                              positions=True,
                              start_pos=start_pos,
                              **kwargs)):
            seen[t.text].append(start_pos + t.pos)

        encode = self.encode
        return ((w, len(poslist), encode(poslist))
                for w, poslist in seen.iteritems())
Example #16
File: spelling.py  Project: ra2003/whoosh
    def suggestions_and_scores(self, text, weighting=None):
        """Returns a list of possible alternative spellings of 'text', as
        ('word', score, weight) triples, where 'word' is the suggested
        word, 'score' is the score that was assigned to the word using
        :meth:`SpellChecker.add_field` or :meth:`SpellChecker.add_scored_words`,
        and 'weight' is the score the word received in the search for the
        original word's ngrams.
        
        You must add words to the dictionary (using add_field, add_words,
        and/or add_scored_words) before you can use this.
        
        This is a lower-level method, in case an expert user needs access to
        the raw scores, for example to implement a custom suggestion ranking
        algorithm. Most people will want to call :meth:`~SpellChecker.suggest`
        instead, which simply returns the top N valued words.
        
        :param text: The word to check.
        :rtype: list
        """

        if weighting is None:
            weighting = TF_IDF()

        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):
                grams[key].append(t.text)

        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            queries.append(
                query.Term("start%s" % size,
                           gramlist[0],
                           boost=self.booststart))
            queries.append(
                query.Term("end%s" % size, gramlist[-1], boost=self.boostend))
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        q = query.Or(queries)
        ix = self.index()
        s = ix.searcher(weighting=weighting)
        try:
            result = s.search(q)
            return [(fs["word"], fs["score"], result.score(i))
                    for i, fs in enumerate(result) if fs["word"] != text]
        finally:
            s.close()
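The grams buckets above are keyed by n-gram size ("gram2", "gram3", ...). Hand-rolled character n-grams stand in for analysis.NgramAnalyzer in this sketch, and mingram=2, maxgram=3 are assumed values:

from collections import defaultdict

text = "spelling"
grams = defaultdict(list)
for size in range(2, 4):                      # mingram=2, maxgram=3 assumed
    key = "gram%s" % size
    for i in range(len(text) - size + 1):
        grams[key].append(text[i:i + size])
print(grams["gram2"][:3])   # ['sp', 'pe', 'el']
print(grams["gram3"][-1])   # ing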
Example #17
    def __init__(self, ix, postlimit, blocklimit, name=None):
        """
        :param ix: the Index object in which to write the new segment.
        :param postlimit: the maximum size for a run in the posting pool.
        :param blocklimit: the maximum number of postings in a posting block.
        :param name: the name of the segment.
        """

        self.index = ix
        self.schema = ix.schema
        self.storage = storage = ix.storage
        self.name = name or ix._next_segment_name()

        self.max_doc = 0

        self.pool = postpool.PostingPool(postlimit)

        # Create mappings of field numbers to the position of that field in the
        # lists of scorable and stored fields. For example, consider a schema
        # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
        # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
        # would then map B -> 0, D -> 1, and E -> 2.
        self._scorable_to_pos = dict((fnum, i)
                                     for i, fnum
                                     in enumerate(self.schema.scorable_fields()))
        self._stored_to_pos = dict((fnum, i)
                                   for i, fnum
                                   in enumerate(self.schema.stored_fields()))

        # Create a temporary segment object just so we can access its
        # *_filename attributes (so if we want to change the naming convention,
        # we only have to do it in one place).
        tempseg = Segment(self.name, 0, 0, None)
        self.termtable = create_terms(storage, tempseg)
        self.docslist = create_storedfields(storage, tempseg)
        self.doclengths = None
        if self.schema.scorable_fields():
            self.doclengths = create_doclengths(storage, tempseg, len(self._scorable_to_pos))

        postfile = storage.create_file(tempseg.posts_filename)
        self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

        self.vectortable = None
        if self.schema.has_vectored_fields():
            # Table associating document fields with (postoffset, postcount)
            self.vectortable = create_vectors(storage, tempseg)
            vpostfile = storage.create_file(tempseg.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

        # Keep track of the total number of tokens (across all docs)
        # in each field
        self.field_length_totals = defaultdict(int)
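To make the comment about fields (A, B, C, D, E, F) concrete, here is the same mapping built for that example; single letters stand in for the field numbers the real code uses:

scorable_fields = ["B", "D", "E"]   # scorable subset of fields A..F
scorable_to_pos = dict((fname, i) for i, fname in enumerate(scorable_fields))
print(scorable_to_pos)   # {'B': 0, 'D': 1, 'E': 2}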
Example #18
    def __init__(self, ix, postlimit, blocklimit, name=None):
        """
        :param ix: the Index object in which to write the new segment.
        :param postlimit: the maximum size for a run in the posting pool.
        :param blocklimit: the maximum number of postings in a posting block.
        :param name: the name of the segment.
        """

        self.index = ix
        self.schema = ix.schema
        self.storage = storage = ix.storage
        self.name = name or ix._next_segment_name()

        self.max_doc = 0

        self.pool = postpool.PostingPool(postlimit)

        # Create mappings of field numbers to the position of that field in the
        # lists of scorable and stored fields. For example, consider a schema
        # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
        # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
        # would then map B -> 0, D -> 1, and E -> 2.
        self._scorable_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.scorable_fields()))
        self._stored_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.stored_fields()))

        # Create a temporary segment object just so we can access its
        # *_filename attributes (so if we want to change the naming convention,
        # we only have to do it in one place).
        tempseg = Segment(self.name, 0, 0, None)
        self.termtable = create_terms(storage, tempseg)
        self.docslist = create_storedfields(storage, tempseg)
        self.doclengths = None
        if self.schema.scorable_fields():
            self.doclengths = create_doclengths(storage, tempseg,
                                                len(self._scorable_to_pos))

        postfile = storage.create_file(tempseg.posts_filename)
        self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

        self.vectortable = None
        if self.schema.has_vectored_fields():
            # Table associating document fields with (postoffset, postcount)
            self.vectortable = create_vectors(storage, tempseg)
            vpostfile = storage.create_file(tempseg.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

        # Keep track of the total number of tokens (across all docs)
        # in each field
        self.field_length_totals = defaultdict(int)
Example #19
File: spelling.py  Project: archatas/whoosh
    def suggestions_and_scores(self, text, weighting=None):
        """Returns a list of possible alternative spellings of 'text', as
        ('word', score, weight) triples, where 'word' is the suggested
        word, 'score' is the score that was assigned to the word using
        :meth:`SpellChecker.add_field` or :meth:`SpellChecker.add_scored_words`,
        and 'weight' is the score the word received in the search for the
        original word's ngrams.
        
        You must add words to the dictionary (using add_field, add_words,
        and/or add_scored_words) before you can use this.
        
        This is a lower-level method, in case an expert user needs access to
        the raw scores, for example to implement a custom suggestion ranking
        algorithm. Most people will want to call :meth:`~SpellChecker.suggest`
        instead, which simply returns the top N valued words.
        
        :param text: The word to check.
        :rtype: list
        """

        if weighting is None:
            weighting = TF_IDF()

        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):
                grams[key].append(t.text)

        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            queries.append(query.Term("start%s" % size, gramlist[0],
                                      boost=self.booststart))
            queries.append(query.Term("end%s" % size, gramlist[-1],
                                      boost=self.boostend))
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        q = query.Or(queries)
        ix = self.index()
        s = ix.searcher(weighting=weighting)
        try:
            result = s.search(q)
            return [(fs["word"], fs["score"], result.score(i))
                    for i, fs in enumerate(result)
                    if fs["word"] != text]
        finally:
            s.close()
Example #20
    def word_values(self, value, start_pos=0, start_char=0, **kwargs):
        seen = defaultdict(list)

        for t in unstopped(
                self.analyzer(value,
                              positions=True,
                              chars=True,
                              start_pos=start_pos,
                              start_char=start_char,
                              **kwargs)):
            seen[t.text].append(
                (t.pos, start_char + t.startchar, start_char + t.endchar))

        encode = self.encode
        return ((w, len(ls), encode(ls)) for w, ls in seen.iteritems())
Example #21
File: __init__.py  Project: ra2003/whoosh
        def wrapper(self, *args):
            if not hasattr(self, prefix + "cache"):
                cache = {}
                queue = deque()
                refcount = defaultdict(int)
                setattr(self, prefix + "cache", cache)
                setattr(self, prefix + "queue", queue)
                setattr(self, prefix + "refcount", refcount)
            else:
                cache = getattr(self, prefix + "cache")
                queue = getattr(self, prefix + "queue")
                refcount = getattr(self, prefix + "refcount")
            qpend = queue.append
            qpop = queue.popleft

            # Get cache entry or compute if not found
            try:
                result = cache[args]
            except KeyError:
                result = cache[args] = func(self, *args)

            # Record that this key was recently accessed
            qpend(args)
            refcount[args] += 1

            # Purge least recently accessed cache contents
            while len(cache) > size:
                k = qpop()
                refcount[k] -= 1
                if not refcount[k]:
                    del cache[k]
                    del refcount[k]

            # Periodically compact the queue by removing duplicate keys
            if len(queue) > size * 4:
                for _ in xrange(len(queue)):
                    k = qpop()
                    if refcount[k] == 1:
                        qpend(k)
                    else:
                        refcount[k] -= 1
                #assert len(queue) == len(cache) == len(refcount) == sum(refcount.itervalues())

            return result
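The compaction step at the end keeps exactly one queue entry per cached key, dropping duplicates while the refcounts shrink back to one. A standalone run of just that loop on a made-up access history shows the effect:

from collections import defaultdict, deque

queue = deque(["a", "b", "a", "c", "a"])   # made-up access history
refcount = defaultdict(int)
for k in queue:
    refcount[k] += 1
for _ in range(len(queue)):
    k = queue.popleft()
    if refcount[k] == 1:
        queue.append(k)      # last remaining reference: keep it
    else:
        refcount[k] -= 1     # duplicate: drop it
print(list(queue))      # ['b', 'c', 'a']
print(dict(refcount))   # {'a': 1, 'b': 1, 'c': 1}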
Example #22
File: __init__.py  Project: archatas/whoosh
        def wrapper(self, *args):
            if not hasattr(self, prefix + "cache"):
                cache = {}
                queue = deque()
                refcount = defaultdict(int)
                setattr(self, prefix + "cache", cache)
                setattr(self, prefix + "queue", queue)
                setattr(self, prefix + "refcount", refcount)
            else:
                cache = getattr(self, prefix + "cache")
                queue = getattr(self, prefix + "queue")
                refcount = getattr(self, prefix + "refcount")
            qpend = queue.append
            qpop = queue.popleft

            # Get cache entry or compute if not found
            try:
                result = cache[args]
            except KeyError:
                result = cache[args] = func(self, *args)

            # Record that this key was recently accessed
            qpend(args)
            refcount[args] += 1

            # Purge least recently accessed cache contents
            while len(cache) > size:
                k = qpop()
                refcount[k] -= 1
                if not refcount[k]:
                    del cache[k]
                    del refcount[k]

            # Periodically compact the queue by removing duplicate keys
            if len(queue) > size * 4:
                for _ in xrange(len(queue)):
                    k = qpop()
                    if refcount[k] == 1:
                        qpend(k)
                    else:
                        refcount[k] -= 1
                #assert len(queue) == len(cache) == len(refcount) == sum(refcount.itervalues())

            return result
Example #23
    def __init__(self, dbfile):
        self.dbfile = dbfile
        dbfile.seek(2048)
        self.hashes = defaultdict(list)
Example #24
File: lovins.py  Project: ra2003/whoosh
            ("erid", "eris"),
            ("pand", "pans"),
            ("end", "ens", "s"),
            ("ond", "ons"),
            ("lud", "lus"),
            ("rud", "rus"),
            ("her", "hes", "pt"),
            ("mit", "mis"),
            ("ent", "ens", "m"),
            ("ert", "ers"),
            ("et", "es", "n"),
            ("yt", "ys"),
            ("yz", "ys"))

# Hash the ending rules by the last letter of the target ending
_endingrules = defaultdict(list)
for rule in _endings:
    _endingrules[rule[0][-1]].append(rule)

_doubles = frozenset(("dd", "gg", "ll", "mm", "nn", "pp", "rr", "ss", "tt"))


def fix_ending(word):
    if word[-2:] in _doubles:
        word = word[:-1]
    
    for endingrule in _endingrules[word[-1]]:
        target, newend = endingrule[:2]
        if word.endswith(target):
            if len(endingrule) > 2:
                exceptafter = endingrule[2]
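The same bucketing, run on a three-rule subset taken from the tuples above, shows why fix_ending only has to scan rules whose target ends with the word's last letter:

from collections import defaultdict

endings = (("end", "ens", "s"), ("ond", "ons"), ("mit", "mis"))
endingrules = defaultdict(list)
for rule in endings:
    endingrules[rule[0][-1]].append(rule)
print(endingrules["d"])   # [('end', 'ens', 's'), ('ond', 'ons')]
print(endingrules["t"])   # [('mit', 'mis')]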
Example #25
    def __init__(self, dbfile):
        self.dbfile = dbfile
        dbfile.seek(2048)
        self.hashes = defaultdict(list)