Example #1
def parse_file(f):
    """Parses the WordNet wn_s.pl prolog file and returns two dictionaries:
    word2nums and num2words.
    """
    
    word2nums = defaultdict(list)
    num2words = defaultdict(list)
    
    for line in f:
        if not line.startswith("s("):
            continue
        
        line = line[2:]
        num = int(line[:line.find(",")])
        qt = line.find("'")
        line = line[qt+1:]
        qt = line.find("'")
        word = line[:qt].lower()

        if not word.isalpha():
            continue
        
        word2nums[word].append(num)
        num2words[num].append(word)
    
    return word2nums, num2words
Example #2
def parse_file(f):
    """Parses the WordNet wn_s.pl prolog file and returns two dictionaries:
    word2nums and num2words.
    """

    word2nums = defaultdict(list)
    num2words = defaultdict(list)

    for line in f:
        if not line.startswith("s("):
            continue

        line = line[2:]
        num = int(line[:line.find(",")])
        qt = line.find("'")
        line = line[qt + 1:]
        qt = line.find("'")
        word = line[:qt].lower()

        if not word.isalpha():
            continue

        word2nums[word].append(num)
        num2words[num].append(word)

    return word2nums, num2words
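
parse_file only looks at the s(...) fact lines and keys both mappings off the synset number and the quoted word. A quick usage sketch (the sample lines imitate the s(synset_id,w_num,'word',ss_type,sense_number,tag_count) layout of wn_s.pl; the IDs are illustrative, and any iterable of lines works in place of a real file object):

# Assumes parse_file from the example above is in scope, along with the
# "from collections import defaultdict" import it relies on.
sample_lines = [
    "s(100001740,1,'entity',n,1,11).",          # illustrative synset IDs
    "s(100002137,1,'abstraction',n,6,0).",
    "s(100002137,2,'abstract entity',n,1,0).",  # skipped: not word.isalpha()
    "x(100001740,'something else').",           # skipped: no "s(" prefix
]

word2nums, num2words = parse_file(sample_lines)
assert word2nums["entity"] == [100001740]
assert num2words[100002137] == ["abstraction"]
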
Example #3
    def __init__(self, ruletable):
        """
        :param ruletable: a string containing the rule data, separated
            by newlines.
        """
        self.rules = defaultdict(list)
        self.read_rules(ruletable)
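
Only the constructor is shown here; read_rules is defined elsewhere in the class. A self-contained sketch of the general shape such a class could take (the RuleSet name, the first-token grouping key and the toy rule strings are assumptions for illustration, not taken from the source):

from collections import defaultdict

class RuleSet(object):  # hypothetical stand-in for the class in the excerpt
    def __init__(self, ruletable):
        self.rules = defaultdict(list)
        self.read_rules(ruletable)

    def read_rules(self, ruletable):
        # Group each non-empty rule line under its first whitespace-separated
        # token; the grouping key is purely illustrative.
        for line in ruletable.splitlines():
            line = line.strip()
            if not line:
                continue
            self.rules[line.split()[0]].append(line)

rs = RuleSet("a -> b\na -> c\nb -> d")
assert rs.rules["a"] == ["a -> b", "a -> c"]
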
Example #4
    def __init__(self, ixreader, fieldname, model=Bo1Model):
        """
        :param ixreader: A :class:`whoosh.reading.IndexReader` object.
        :param fieldname: The name of the field in which to search.
        :param model: (classify.ExpansionModel) The model to use for expanding
            the query terms. If you omit this parameter, the expander uses
            scoring.Bo1Model by default.
        """

        self.fieldname = fieldname

        if type(model) is type:
            model = model(ixreader, fieldname)
        self.model = model

        # Cache the collection frequency of every term in this field. This
        # turns out to be much faster than reading each individual weight from
        # the term index as we add words.
        self.collection_freq = dict((word, freq) for word, _, freq
                                    in ixreader.iter_field(fieldname))

        # Maps words to their weight in the top N documents.
        self.topN_weight = defaultdict(float)

        # Total weight of all terms in the top N documents.
        self.top_total = 0
Example #5
    def __init__(self, ixreader, fieldname, model=Bo1Model):
        """
        :param ixreader: A :class:`whoosh.reading.IndexReader` object.
        :param fieldname: The name of the field in which to search.
        :param model: (classify.ExpansionModel) The model to use for expanding
            the query terms. If you omit this parameter, the expander uses
            scoring.Bo1Model by default.
        """

        self.fieldname = fieldname

        if type(model) is type:
            model = model(ixreader, fieldname)
        self.model = model

        # Cache the collection frequency of every term in this field. This
        # turns out to be much faster than reading each individual weight from
        # the term index as we add words.
        self.collection_freq = dict(
            (word, freq) for word, _, freq in ixreader.iter_field(fieldname))

        # Maps words to their weight in the top N documents.
        self.topN_weight = defaultdict(float)

        # Total weight of all terms in the top N documents.
        self.top_total = 0
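
The constructor only prepares the accumulators; topN_weight and top_total are filled in later from the top-scoring documents. A minimal, self-contained sketch of that accumulation pattern (the WeightAccumulator class and its add() method are illustrative, not the actual expansion API):

from collections import defaultdict

class WeightAccumulator(object):  # illustrative helper, not library code
    def __init__(self):
        # Maps words to their weight in the top N documents.
        self.topN_weight = defaultdict(float)
        # Total weight of all terms in the top N documents.
        self.top_total = 0.0

    def add(self, term_weights):
        # Fold one document's word -> weight map into the accumulators.
        for word, weight in term_weights.items():
            self.topN_weight[word] += weight
            self.top_total += weight

acc = WeightAccumulator()
acc.add({"whoosh": 2.0, "index": 1.0})
acc.add({"whoosh": 1.5, "search": 0.5})
assert acc.topN_weight["whoosh"] == 3.5
assert acc.top_total == 5.0
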
Example #6
    def word_values(self, value, doc_boost=1.0, **kwargs):
        seen = defaultdict(int)
        for t in unstopped(self.analyzer(value, **kwargs)):
            seen[t.text] += 1

        encode = self.encode
        return ((w, freq, encode((freq, doc_boost)))
                for w, freq in seen.iteritems())
Example #7
    def word_values(self, value, doc_boost=1.0, **kwargs):
        seen = defaultdict(int)
        for t in unstopped(self.analyzer(value, **kwargs)):
            seen[t.text] += 1

        encode = self.encode
        return ((w, freq, encode((freq, doc_boost)))
                for w, freq in seen.iteritems())
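
Stripped of the analyzer and posting-format machinery, the heart of this method is a frequency count in a defaultdict(int). A standalone sketch, with a whitespace tokenizer standing in for self.analyzer and repr() standing in for self.encode (both stand-ins are assumptions):

from collections import defaultdict

def word_values(value, analyzer=lambda v: v.lower().split(), encode=repr):
    # Count how many times each token occurs in the field value...
    seen = defaultdict(int)
    for token in analyzer(value):
        seen[token] += 1
    # ...then emit (word, frequency, encoded-posting-value) triples.
    return [(w, freq, encode(freq)) for w, freq in sorted(seen.items())]

print(word_values("to be or not to be"))
# [('be', 2, '2'), ('not', 1, '1'), ('or', 1, '1'), ('to', 2, '2')]
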
Example #8
    def word_values(self, value, start_pos=0, **kwargs):
        seen = defaultdict(list)
        for t in unstopped(self.analyzer(value, positions=True,
                                         start_pos=start_pos, **kwargs)):
            seen[t.text].append(start_pos + t.pos)

        encode = self.encode
        return ((w, len(poslist), encode(poslist))
                for w, poslist in seen.iteritems())
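
The positions variant records every position at which a token occurs, so the term frequency falls out as len(poslist). A minimal sketch with enumerate() standing in for the position-aware analyzer (an assumption; the real analyzer also applies its own filtering):

from collections import defaultdict

def position_values(tokens, start_pos=0):
    # Collect every absolute position at which each token appears.
    seen = defaultdict(list)
    for pos, text in enumerate(tokens):
        seen[text].append(start_pos + pos)
    # The frequency is simply the number of recorded positions.
    return dict((w, (len(poslist), poslist)) for w, poslist in seen.items())

assert position_values(["a", "b", "a"], start_pos=10) == {
    "a": (2, [10, 12]),
    "b": (1, [11]),
}
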
Example #9
    def __init__(self, schema):
        self.schema = schema
        self.maxdoc = 0
        self._sync_lock = Lock()

        self.termlists = defaultdict(list)
        self.invertedindex = {}
        for fieldnum in xrange(len(schema)):
            self.invertedindex[fieldnum] = defaultdict(list)
        self.indexfreqs = defaultdict(int)

        self.storedfields = {}
        self.fieldlengths = defaultdict(int)
        self.fieldlength_totals = defaultdict(int)
        self.vectors = {}
        self.deleted = set()

        self._stored_to_pos = dict((fnum, i) for i, fnum
                                   in enumerate(self.schema.stored_fields()))
Example #10
    def __init__(self, schema):
        self.schema = schema
        self.maxdoc = 0
        self._sync_lock = Lock()

        self.termlists = defaultdict(list)
        self.invertedindex = {}
        for fieldnum in xrange(len(schema)):
            self.invertedindex[fieldnum] = defaultdict(list)
        self.indexfreqs = defaultdict(int)

        self.storedfields = {}
        self.fieldlengths = defaultdict(int)
        self.fieldlength_totals = defaultdict(int)
        self.vectors = {}
        self.deleted = set()

        self._stored_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.stored_fields()))
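
The constructor above only builds the empty containers; documents are posted into them elsewhere. A rough, self-contained sketch of that other half (the TinyRamIndex class, its add_document() method and the (fieldnum, text, weight) input format are assumptions for illustration):

from collections import defaultdict

class TinyRamIndex(object):  # illustrative, not the library's RamIndex
    def __init__(self, numfields):
        self.maxdoc = 0
        # One postings map per field: term text -> list of (docnum, weight).
        self.invertedindex = dict(
            (fieldnum, defaultdict(list)) for fieldnum in range(numfields))
        # Number of postings recorded for each (fieldnum, text) term.
        self.indexfreqs = defaultdict(int)
        # Total number of tokens indexed per field.
        self.fieldlength_totals = defaultdict(int)

    def add_document(self, terms):
        # terms: iterable of (fieldnum, text, weight) triples for one document.
        docnum = self.maxdoc
        for fieldnum, text, weight in terms:
            self.invertedindex[fieldnum][text].append((docnum, weight))
            self.indexfreqs[(fieldnum, text)] += 1
            self.fieldlength_totals[fieldnum] += 1
        self.maxdoc += 1
        return docnum

ix = TinyRamIndex(numfields=2)
ix.add_document([(0, "hello", 1.0), (0, "world", 1.0)])
ix.add_document([(0, "hello", 2.0), (1, "greeting", 1.0)])
assert ix.invertedindex[0]["hello"] == [(0, 1.0), (1, 2.0)]
assert ix.indexfreqs[(0, "hello")] == 2
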
Example #11
    def word_values(self, value, **kwargs):
        seen = defaultdict(int)
        if self.boost_as_freq:
            for t in unstopped(self.analyzer(value, boosts=True, **kwargs)):
                seen[t.text] += int(t.boost)
        else:
            for t in unstopped(self.analyzer(value, **kwargs)):
                seen[t.text] += 1

        encode = self.encode
        return ((w, freq, encode(freq)) for w, freq in seen.iteritems())
Example #12
    def word_values(self, value, start_pos=0, start_char=0, **kwargs):
        seen = defaultdict(list)

        for t in unstopped(self.analyzer(value, positions=True, chars=True,
                                         start_pos=start_pos,
                                         start_char=start_char, **kwargs)):
            seen[t.text].append((t.pos, start_char + t.startchar,
                                 start_char + t.endchar))

        encode = self.encode
        return ((w, len(ls), encode(ls)) for w, ls in seen.iteritems())
Example #13
    def word_values(self, value, **kwargs):
        seen = defaultdict(int)
        if self.boost_as_freq:
            for t in unstopped(self.analyzer(value, boosts=True, **kwargs)):
                seen[t.text] += int(t.boost)
        else:
            for t in unstopped(self.analyzer(value, **kwargs)):
                seen[t.text] += 1

        encode = self.encode
        return ((w, freq, encode(freq)) for w, freq in seen.iteritems())
Example #14
    def word_values(self, value, start_pos=0, **kwargs):
        seen = defaultdict(list)
        for t in unstopped(
                self.analyzer(value,
                              positions=True,
                              start_pos=start_pos,
                              **kwargs)):
            seen[t.text].append(start_pos + t.pos)

        encode = self.encode
        return ((w, len(poslist), encode(poslist))
                for w, poslist in seen.iteritems())
Example #15
    def suggestions_and_scores(self, text, weighting=None):
        """Returns a list of possible alternative spellings of 'text', as
        ('word', score, weight) triples, where 'word' is the suggested
        word, 'score' is the score that was assigned to the word using
        :meth:`SpellChecker.add_field` or :meth:`SpellChecker.add_scored_words`,
        and 'weight' is the score the word received in the search for the
        original word's ngrams.
        
        You must add words to the dictionary (using add_field, add_words,
        and/or add_scored_words) before you can use this.
        
        This is a lower-level method, in case an expert user needs access to
        the raw scores, for example to implement a custom suggestion ranking
        algorithm. Most people will want to call :meth:`~SpellChecker.suggest`
        instead, which simply returns the top N valued words.
        
        :param text: The word to check.
        :rtype: list
        """

        if weighting is None:
            weighting = TF_IDF()

        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):
                grams[key].append(t.text)

        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            queries.append(
                query.Term("start%s" % size,
                           gramlist[0],
                           boost=self.booststart))
            queries.append(
                query.Term("end%s" % size, gramlist[-1], boost=self.boostend))
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        q = query.Or(queries)
        ix = self.index()
        s = ix.searcher(weighting=weighting)
        try:
            result = s.search(q)
            return [(fs["word"], fs["score"], result.score(i))
                    for i, fs in enumerate(result) if fs["word"] != text]
        finally:
            s.close()
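
The method relies on analysis.NgramAnalyzer to cut the word into character n-grams before building the boosted startN/endN terms plus the plain gramN terms. The bucketing step can be sketched on its own (the char_ngrams slicer below stands in for NgramAnalyzer, and the mingram/maxgram values are illustrative):

from collections import defaultdict

def char_ngrams(text, size):
    # Plain character n-grams; stands in for analysis.NgramAnalyzer(size).
    for i in range(len(text) - size + 1):
        yield text[i:i + size]

def gram_buckets(text, mingram=2, maxgram=3):
    grams = defaultdict(list)
    for size in range(mingram, maxgram + 1):
        key = "gram%s" % size
        for gram in char_ngrams(text, size):
            grams[key].append(gram)
    return grams

buckets = gram_buckets("spell")
assert buckets["gram2"] == ["sp", "pe", "el", "ll"]
assert buckets["gram3"] == ["spe", "pel", "ell"]
# In the method above, "sp" would feed a boosted "start2" term and "ll"
# a boosted "end2" term, with every gram also added as a plain "gram2" term.
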
Example #16
    def __init__(self, ix, postlimit, blocklimit, name=None):
        """
        :param ix: the Index object in which to write the new segment.
        :param postlimit: the maximum size for a run in the posting pool.
        :param blocklimit: the maximum number of postings in a posting block.
        :param name: the name of the segment.
        """

        self.index = ix
        self.schema = ix.schema
        self.storage = storage = ix.storage
        self.name = name or ix._next_segment_name()

        self.max_doc = 0

        self.pool = postpool.PostingPool(postlimit)

        # Create mappings of field numbers to the position of that field in the
        # lists of scorable and stored fields. For example, consider a schema
        # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
        # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
        # would then map B -> 0, D -> 1, and E -> 2.
        self._scorable_to_pos = dict((fnum, i)
                                     for i, fnum
                                     in enumerate(self.schema.scorable_fields()))
        self._stored_to_pos = dict((fnum, i)
                                   for i, fnum
                                   in enumerate(self.schema.stored_fields()))

        # Create a temporary segment object just so we can access its
        # *_filename attributes (so if we want to change the naming convention,
        # we only have to do it in one place).
        tempseg = Segment(self.name, 0, 0, None)
        self.termtable = create_terms(storage, tempseg)
        self.docslist = create_storedfields(storage, tempseg)
        self.doclengths = None
        if self.schema.scorable_fields():
            self.doclengths = create_doclengths(storage, tempseg, len(self._scorable_to_pos))

        postfile = storage.create_file(tempseg.posts_filename)
        self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

        self.vectortable = None
        if self.schema.has_vectored_fields():
            # Table associating document fields with (postoffset, postcount)
            self.vectortable = create_vectors(storage, tempseg)
            vpostfile = storage.create_file(tempseg.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

        # Keep track of the total number of tokens (across all docs)
        # in each field
        self.field_length_totals = defaultdict(int)
Example #17
    def __init__(self, ix, postlimit, blocklimit, name=None):
        """
        :param ix: the Index object in which to write the new segment.
        :param postlimit: the maximum size for a run in the posting pool.
        :param blocklimit: the maximum number of postings in a posting block.
        :param name: the name of the segment.
        """

        self.index = ix
        self.schema = ix.schema
        self.storage = storage = ix.storage
        self.name = name or ix._next_segment_name()

        self.max_doc = 0

        self.pool = postpool.PostingPool(postlimit)

        # Create mappings of field numbers to the position of that field in the
        # lists of scorable and stored fields. For example, consider a schema
        # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
        # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
        # would then map B -> 0, D -> 1, and E -> 2.
        self._scorable_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.scorable_fields()))
        self._stored_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.stored_fields()))

        # Create a temporary segment object just so we can access its
        # *_filename attributes (so if we want to change the naming convention,
        # we only have to do it in one place).
        tempseg = Segment(self.name, 0, 0, None)
        self.termtable = create_terms(storage, tempseg)
        self.docslist = create_storedfields(storage, tempseg)
        self.doclengths = None
        if self.schema.scorable_fields():
            self.doclengths = create_doclengths(storage, tempseg,
                                                len(self._scorable_to_pos))

        postfile = storage.create_file(tempseg.posts_filename)
        self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

        self.vectortable = None
        if self.schema.has_vectored_fields():
            # Table associating document fields with (postoffset, postcount)
            self.vectortable = create_vectors(storage, tempseg)
            vpostfile = storage.create_file(tempseg.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

        # Keep track of the total number of tokens (across all docs)
        # in each field
        self.field_length_totals = defaultdict(int)
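
The comment's (A, B, C, D, E, F) example translates directly into a couple of lines of enumerate(); here is that mapping in isolation, using the letters from the comment in place of real field numbers:

# Schema fields A..F, of which only B, D and E are scorable.
scorable_fields = ["B", "D", "E"]

# Map each scorable field to its position in the scorable-field list,
# exactly as _scorable_to_pos does with field numbers.
scorable_to_pos = dict((fname, i) for i, fname in enumerate(scorable_fields))

assert scorable_to_pos == {"B": 0, "D": 1, "E": 2}
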
Example #18
    def suggestions_and_scores(self, text, weighting=None):
        """Returns a list of possible alternative spellings of 'text', as
        ('word', score, weight) triples, where 'word' is the suggested
        word, 'score' is the score that was assigned to the word using
        :meth:`SpellChecker.add_field` or :meth:`SpellChecker.add_scored_words`,
        and 'weight' is the score the word received in the search for the
        original word's ngrams.
        
        You must add words to the dictionary (using add_field, add_words,
        and/or add_scored_words) before you can use this.
        
        This is a lower-level method, in case an expert user needs access to
        the raw scores, for example to implement a custom suggestion ranking
        algorithm. Most people will want to call :meth:`~SpellChecker.suggest`
        instead, which simply returns the top N valued words.
        
        :param text: The word to check.
        :rtype: list
        """

        if weighting is None:
            weighting = TF_IDF()

        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):
                grams[key].append(t.text)

        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            queries.append(query.Term("start%s" % size, gramlist[0],
                                      boost=self.booststart))
            queries.append(query.Term("end%s" % size, gramlist[-1],
                                      boost=self.boostend))
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        q = query.Or(queries)
        ix = self.index()
        s = ix.searcher(weighting=weighting)
        try:
            result = s.search(q)
            return [(fs["word"], fs["score"], result.score(i))
                    for i, fs in enumerate(result)
                    if fs["word"] != text]
        finally:
            s.close()
Example #19
    def word_values(self, value, start_pos=0, start_char=0, **kwargs):
        seen = defaultdict(list)

        for t in unstopped(
                self.analyzer(value,
                              positions=True,
                              chars=True,
                              start_pos=start_pos,
                              start_char=start_char,
                              **kwargs)):
            seen[t.text].append(
                (t.pos, start_char + t.startchar, start_char + t.endchar))

        encode = self.encode
        return ((w, len(ls), encode(ls)) for w, ls in seen.iteritems())
Example #20
        def wrapper(self, *args):
            if not hasattr(self, prefix + "cache"):
                cache = {}
                queue = deque()
                refcount = defaultdict(int)
                setattr(self, prefix + "cache", cache)
                setattr(self, prefix + "queue", queue)
                setattr(self, prefix + "refcount", refcount)
            else:
                cache = getattr(self, prefix + "cache")
                queue = getattr(self, prefix + "queue")
                refcount = getattr(self, prefix + "refcount")
            qpend = queue.append
            qpop = queue.popleft

            # Get cache entry or compute if not found
            try:
                result = cache[args]
            except KeyError:
                result = cache[args] = func(self, *args)

            # Record that this key was recently accessed
            qpend(args)
            refcount[args] += 1

            # Purge least recently accessed cache contents
            while len(cache) > size:
                k = qpop()
                refcount[k] -= 1
                if not refcount[k]:
                    del cache[k]
                    del refcount[k]

            # Periodically compact the queue by removing duplicate keys
            if len(queue) > size * 4:
                for _ in xrange(len(queue)):
                    k = qpop()
                    if refcount[k] == 1:
                        qpend(k)
                    else:
                        refcount[k] -= 1
                #assert len(queue) == len(cache) == len(refcount) == sum(refcount.itervalues())

            return result
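
The wrapper closes over three names the excerpt does not show: func (the decorated method), size (the cache limit) and prefix (the attribute-name prefix). A hedged reconstruction of the enclosing decorator factory (the method_lru_cache name and its defaults are assumptions, and the periodic queue-compaction step is omitted for brevity):

from collections import defaultdict, deque
from functools import wraps

def method_lru_cache(size=100, prefix="_cache_"):  # name and defaults are illustrative
    """Per-instance LRU-style cache built around the wrapper pattern above."""
    def decorator(func):
        @wraps(func)
        def wrapper(self, *args):
            if not hasattr(self, prefix + "cache"):
                cache = {}
                queue = deque()
                refcount = defaultdict(int)
                setattr(self, prefix + "cache", cache)
                setattr(self, prefix + "queue", queue)
                setattr(self, prefix + "refcount", refcount)
            else:
                cache = getattr(self, prefix + "cache")
                queue = getattr(self, prefix + "queue")
                refcount = getattr(self, prefix + "refcount")

            # Get the cache entry, or compute it if it is missing.
            try:
                result = cache[args]
            except KeyError:
                result = cache[args] = func(self, *args)

            # Recency bookkeeping and purge, as in the excerpt.
            queue.append(args)
            refcount[args] += 1
            while len(cache) > size:
                k = queue.popleft()
                refcount[k] -= 1
                if not refcount[k]:
                    del cache[k]
                    del refcount[k]
            return result
        return wrapper
    return decorator

class Demo(object):
    calls = 0

    @method_lru_cache(size=2)
    def double(self, x):
        Demo.calls += 1
        return x * 2

d = Demo()
assert d.double(3) == 6 and d.double(3) == 6
assert Demo.calls == 1   # the second call is served from the per-instance cache
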
Example #21
    def __init__(self, dbfile):
        self.dbfile = dbfile
        dbfile.seek(2048)
        self.hashes = defaultdict(list)
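
Only the setup is shown: the file position is moved past the first 2048 bytes (presumably a fixed-size header region) and an empty hash-to-entries map is created. The bucketing pattern such a map supports can be sketched on its own (the 256-bucket split and the bucket_entries helper are assumptions used purely for illustration):

from collections import defaultdict

def bucket_entries(items, nbuckets=256):
    # Group (key, value) pairs into hash buckets; each bucket preserves the
    # insertion order of its entries, courtesy of defaultdict(list).
    hashes = defaultdict(list)
    for key, value in items:
        hashes[hash(key) % nbuckets].append((key, value))
    return hashes

buckets = bucket_entries([("alpha", 1), ("beta", 2), ("alpha", 3)])
assert sum(len(v) for v in buckets.values()) == 3
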
Example #22
            ("erid", "eris"),
            ("pand", "pans"),
            ("end", "ens", "s"),
            ("ond", "ons"),
            ("lud", "lus"),
            ("rud", "rus"),
            ("her", "hes", "pt"),
            ("mit", "mis"),
            ("ent", "ens", "m"),
            ("ert", "ers"),
            ("et", "es", "n"),
            ("yt", "ys"),
            ("yz", "ys"))

# Hash the ending rules by the last letter of the target ending
_endingrules = defaultdict(list)
for rule in _endings:
    _endingrules[rule[0][-1]].append(rule)

_doubles = frozenset(("dd", "gg", "ll", "mm", "nn", "pp", "rr", "ss", "tt"))


def fix_ending(word):
    if word[-2:] in _doubles:
        word = word[:-1]
    
    for endingrule in _endingrules[word[-1]]:
        target, newend = endingrule[:2]
        if word.endswith(target):
            if len(endingrule) > 2:
                exceptafter = endingrule[2]
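
Bucketing the rules by the last letter of their target ending means fix_ending only has to test the handful of rules that could possibly match the word's final character instead of scanning the whole table. Here is the lookup step in isolation, using a few of the two-element rules from the table above (the candidate_rules helper is illustrative; the excerpt's fix_ending goes on to apply the replacement and the exception checks):

from collections import defaultdict

# A few of the (target_ending, replacement) rules from the table above.
endings = (("ond", "ons"),
           ("lud", "lus"),
           ("ert", "ers"),
           ("yz", "ys"))

# Hash the ending rules by the last letter of the target ending, exactly as
# the module-level loop does for the full _endings table.
endingrules = defaultdict(list)
for rule in endings:
    endingrules[rule[0][-1]].append(rule)

def candidate_rules(word):
    # Only rules whose target shares the word's final letter need checking.
    return [rule for rule in endingrules[word[-1]] if word.endswith(rule[0])]

assert candidate_rules("respond") == [("ond", "ons")]
assert candidate_rules("convert") == [("ert", "ers")]
assert candidate_rules("random") == []
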