def stems(self, words):
    """Group *words* by their Snowball stem.

    The stemmer language is read from ``self.config['snowball_language']``.

    Args:
        words: A sized iterable of words (``len()`` is used for progress
            reporting).

    Returns:
        A plain ``dict`` mapping each stem to ``{'inflections': [...]}``,
        where the list holds every input word that reduced to that stem,
        in input order. An empty input yields an empty dict.
    """
    language = self.config['snowball_language']
    stemmer = SnowballStemmer(language)
    by_stem = defaultdict(lambda: {'inflections': []})
    total = len(words)

    logger.info("Finding stems")
    for idx, word in enumerate(words, start=1):
        # The progress text shows the stems found *before* this word.
        print_progress(idx, total, "{} stems found".format(len(by_stem)))
        by_stem[stemmer.stem(word)]['inflections'].append(word)

    # Build the summary from the final state: this reports the true count
    # and also works when ``words`` is empty (the loop never runs).
    logger.info("%s stems found", len(by_stem))
    return dict(by_stem)
def definitions(self, words):
    """Fetch a definition for every inflection in *words* from a dictd server.

    Connects to ``self.config['dictd_hostname']``:``self.config['dictd_port']``
    and, for each inflection of each stem, sends a ``d <database> <word>``
    command using ``self.config['dictd_database']``.

    Args:
        words: Mapping of stem -> dict with an ``'inflections'`` list.
            It is modified in place: each entry gains a ``'definitions'``
            list (HTML-escaped definition texts) and a parallel ``'words'``
            list (the inflections those definitions belong to).

    Returns:
        The same *words* mapping that was passed in.

    Raises:
        AssertionError: If the server greeting does not carry code 220.
        ConnectionError: If the server closes the connection before a
            complete response was received.
        Exception: If the server answers 550 (invalid database).
    """
    logger.info("Finding definitions")

    # Everything up to the LAST line starting with one of the final status
    # codes is the payload (group 1); group 2 is the code itself. Matching
    # on bytes avoids decoding a chunk that ends mid-character.
    final_status = re.compile(
        rb'(.*)^(220|552|250|550)[^\r\n]*\r\n', re.MULTILINE | re.DOTALL)

    def read(sock):
        """Receive until a final status line arrives; return (code, text)."""
        received = b''
        while True:
            chunk = sock.recv(1000)
            if not chunk:
                # An empty read means the peer closed the connection;
                # without this check the loop would spin forever.
                raise ConnectionError(
                    "dictd server closed the connection unexpectedly")
            received += chunk
            match = final_status.match(received)
            if match:
                return int(match.group(2)), match.group(1).decode()

    def parse_definition(response):
        """Drop the 150/151 header lines and HTML-escape the remainder."""
        kept = (line for line in response.split('\r\n')
                if not line.startswith(('150', '151')))
        return html.escape('\n'.join(kept))

    count = 0
    # The context manager closes the socket even when a lookup raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.connect((self.config['dictd_hostname'],
                      self.config['dictd_port']))

        code, _ = read(sock)  # Consume the server greeting.
        if code != 220:
            # Raised explicitly so the check survives ``python -O``;
            # AssertionError is kept for backward compatibility.
            raise AssertionError(
                "Unexpected dictd greeting code {}".format(code))

        total = len(words)
        for idx, stem in enumerate(sorted(words), start=1):
            print_progress(idx, total, "{} definitions found".format(count))
            entry = words[stem]
            entry['definitions'] = []
            entry['words'] = []
            for word in sorted(entry['inflections']):
                cmd = 'd {database} {word}\n'.format(
                    word=word, database=self.config['dictd_database'])
                # sendall: a plain send() may transmit only part of cmd.
                sock.sendall(cmd.encode())
                code, response = read(sock)
                if code == 552:
                    logger.debug("Definition not found %s", word)
                elif code == 250:
                    count += 1
                    entry['definitions'].append(parse_definition(response))
                    entry['words'].append(word)
                elif code == 550:
                    raise Exception("Invalid dictd database")

    # Report the final tally; this also works when ``words`` is empty.
    logger.info("%s definitions found", count)
    return words