예제 #1
0
 def stems(self, words):
     """Group ``words`` by their stem.

     Each word is stemmed with a ``SnowballStemmer`` built for the language
     in ``self.config['snowball_language']`` (presumably NLTK's stemmer --
     the import is not visible in this block).

     Args:
         words: A sized iterable of words; ``len(words)`` is used for
             progress reporting.

     Returns:
         A plain ``dict`` mapping each stem to ``{'inflections': [...]}``,
         where the list holds every input word that produced that stem, in
         input order.
     """
     language = self.config['snowball_language']
     stems = defaultdict(lambda: {'inflections': []})
     stemmer = SnowballStemmer(language)
     logger.info("Finding stems")
     total = len(words)
     for idx, word in enumerate(words, start=1):
         print_progress(idx, total, "{} stems found".format(len(stems)))
         stems[stemmer.stem(word)]['inflections'].append(word)
     # Build the summary after the loop: it is then defined even when
     # ``words`` is empty and reflects the final count, not the count seen
     # before the last word was stemmed.
     logger.info("%d stems found", len(stems))
     # Return a regular dict so later lookups of unknown stems raise
     # KeyError instead of silently creating entries.
     return dict(stems)
예제 #2
0
    def definitions(self, words):
        """Fetch a dictd definition for every inflection of every stem.

        Connects over TCP to the server given by
        ``self.config['dictd_hostname']`` and ``self.config['dictd_port']``
        and sends one ``d <database> <word>`` command per inflection, using
        ``self.config['dictd_database']`` as the database.

        Args:
            words: Mapping of stem -> dict holding an ``'inflections'``
                iterable of words.  It is updated in place: each entry gets
                a ``'definitions'`` list (HTML-escaped definition texts) and
                a parallel ``'words'`` list (the inflections that had a
                definition).

        Returns:
            The same ``words`` mapping that was passed in.

        Raises:
            AssertionError: If the server greeting is not status 220.
            ConnectionError: If the server closes the connection before a
                complete response has been received.
            Exception: If the server answers 550 (invalid database).
        """
        logger.info("Finding definitions")
        # Matched against raw bytes so that a multi-byte UTF-8 character
        # split across two recv() chunks cannot break decoding; the text is
        # decoded only once a complete response has arrived.
        status_line = re.compile(br'(.*)^(220|552|250|550)[^\r\n]*\r\n',
                                 re.MULTILINE | re.DOTALL)

        def read(sock):
            """Read until a status line arrives; return (code, preceding text)."""
            buffer = b''
            while True:
                chunk = sock.recv(1000)
                if not chunk:
                    # recv() returns b'' once the peer has closed; without
                    # this check the loop would spin forever.
                    raise ConnectionError(
                        "dictd server closed the connection unexpectedly")
                buffer += chunk
                match = status_line.match(buffer)
                if match:
                    return int(match.group(2)), match.group(1).decode()

        def parse_definition(response):
            """Drop the 150/151 status lines and HTML-escape the rest."""
            kept = [line for line in response.split('\r\n')
                    if not line.startswith(('150', '151'))]
            return html.escape('\n'.join(kept))

        address = (self.config['dictd_hostname'], self.config['dictd_port'])
        database = self.config['dictd_database']
        count = 0
        # The context manager closes the socket on every exit path,
        # including the exceptions raised below.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.connect(address)
            code, _ = read(sock)  # consume the server greeting
            if code != 220:
                # Explicit raise instead of ``assert`` so the check survives
                # ``python -O``; the exception type is unchanged.
                raise AssertionError(
                    "Unexpected dictd greeting code {}".format(code))
            total = len(words)
            for idx, stem in enumerate(sorted(words), start=1):
                print_progress(idx, total,
                               "{} definitions found".format(count))
                entry = words[stem]
                entry['definitions'] = []
                entry['words'] = []
                for word in sorted(entry['inflections']):
                    cmd = 'd {database} {word}\n'.format(
                        word=word, database=database)
                    # sendall() retries until the whole command is written;
                    # send() may transmit only part of it.
                    sock.sendall(cmd.encode())
                    code, response = read(sock)
                    if code == 552:
                        logger.debug("Definition not found %s", word)
                    elif code == 250:
                        count += 1
                        entry['definitions'].append(
                            parse_definition(response))
                        entry['words'].append(word)
                    elif code == 550:
                        raise Exception("Invalid dictd database")
        # Logged from the final count so the summary is defined for empty
        # input and includes the last stem's results.
        logger.info("%d definitions found", count)
        return words