def get_stop_words(self, language, fail_safe=False):
    """
    Build the StopWord object matching ``language``.

    ``language`` is first translated through ``self.language_codes``; a
    value with no entry there is used unchanged. The collection is taken
    from ``self.LOADED_LANGUAGES_CACHE`` when present, otherwise it is
    loaded with ``self._get_stop_words`` and stored in that cache.

    A StopWordError raised while loading is propagated, unless
    ``fail_safe`` is true: in that case a StopWord object holding an
    empty collection is returned (and nothing is cached).
    """
    try:
        resolved = self.language_codes[language]
    except KeyError:
        # Not a known code: keep the name exactly as it was given.
        resolved = language

    words = self.LOADED_LANGUAGES_CACHE.get(resolved)
    if words is not None:
        return StopWord(resolved, words)

    try:
        words = self._get_stop_words(resolved)
        self.LOADED_LANGUAGES_CACHE[resolved] = words
    except StopWordError as exc:
        if not fail_safe:
            raise exc
        # Fail-safe mode: fall back to an empty collection.
        words = []
    return StopWord(resolved, words)
def test_sub(self):
    """
    Subtraction on StopWord, through ``-`` and ``-=``, with another
    StopWord, a list, a single string and an unsupported operand.
    """
    def fresh():
        # New object and new word list each time, so an in-place
        # operator cannot affect the following scenario.
        return StopWord('bar', ['baz', 'qux', 'norf'])

    # ``-`` with another StopWord keeps the left operand's language.
    nsw = fresh() - self.sw
    self.assertEqual(sorted(nsw), ['norf', 'qux'])
    self.assertEqual(nsw.language, 'bar')

    # ``-=`` with another StopWord.
    nsw = fresh()
    nsw -= self.sw
    self.assertEqual(sorted(nsw), ['norf', 'qux'])
    self.assertEqual(nsw.language, 'bar')

    # ``-=`` with a plain list; words absent from the collection are
    # simply ignored.
    nsw = fresh()
    nsw -= ['tic', 'tac', 'toc', 'qux']
    self.assertEqual(sorted(nsw), ['baz', 'norf'])

    # ``-=`` with a single word.
    nsw = fresh()
    nsw -= 'baz'
    self.assertEqual(sorted(nsw), ['norf', 'qux'])

    if sys.version_info[0] == 2:
        # Python 2 only: the same with a unicode string.
        nsw = fresh()
        nsw -= 'baz'.decode('utf-8')
        self.assertEqual(sorted(nsw), ['norf', 'qux'])

    # An unsupported operand raises and leaves the collection intact.
    nsw = fresh()
    self.assertRaises(TypeError, nsw.__sub__, object())
    self.assertEqual(sorted(nsw), ['baz', 'norf', 'qux'])
def cmdline(argv=None):
    """
    Script for merging different collections of stop words.

    The stop words obtained from the factory for the requested language
    (requested in fail-safe mode) are extended with the collection read
    from every source file, then handed to the factory to be written
    under the file name it associates with that language.

    ``argv`` is the list of command line arguments, without the program
    name. When omitted, ``sys.argv[1:]`` is read at call time.
    """
    if argv is None:
        # Resolved here rather than in the signature: a default value of
        # ``sys.argv[1:]`` is evaluated only once, when the function is
        # defined, and would ignore any later change to ``sys.argv``.
        argv = sys.argv[1:]

    parser = ArgumentParser(
        description='Create and merge collections of stop words')
    parser.add_argument('language',
                        help='The language used in the collection')
    parser.add_argument('sources', metavar='FILE', nargs='+',
                        help='Source files to parse')
    options = parser.parse_args(argv)

    factory = StopWordFactory()
    language = options.language
    # fail_safe=True: start from what the factory has, without raising
    # when the language is not available yet.
    stop_words = factory.get_stop_words(language, fail_safe=True)

    for source in options.sources:
        stop_words += StopWord(language, factory.read_collection(source))

    filename = factory.get_collection_filename(stop_words.language)
    factory.write_collection(filename, stop_words.collection)
def check_stop_word_rebase(self, inpout, outpout, sept, char=None):
    """
    Assert that rebasing ``inpout`` on a StopWord built from ``sept``
    gives ``outpout``.

    ``char`` is forwarded to ``rebase`` as second positional argument
    only when it is not None.
    """
    rebase_args = (inpout,) if char is None else (inpout, char)
    stop_word = StopWord('test', sept)
    self.assertEqual(stop_word.rebase(*rebase_args), outpout)
def setUp(self):
    """
    Create the three-word 'foo' StopWord stored as ``self.sw``.
    """
    words = ['foo', 'bar', 'baz']
    self.sw = StopWord('foo', words)
class StopWordTestCase(TestCase):
    """
    Exercise the StopWord container: size, membership, iteration,
    addition, subtraction and its string representations.
    """

    @staticmethod
    def _new_bar_stop_word():
        # New object and new word list on every call, so an in-place
        # operator cannot affect the following scenario.
        return StopWord('bar', ['baz', 'qux', 'norf'])

    def setUp(self):
        """
        Create the three-word 'foo' StopWord stored as ``self.sw``.
        """
        words = ['foo', 'bar', 'baz']
        self.sw = StopWord('foo', words)

    def test_len(self):
        size = len(self.sw)
        self.assertEqual(size, 3)

    def test_contains(self):
        found = 'foo' in self.sw
        missing = 'qux' in self.sw
        self.assertTrue(found)
        self.assertFalse(missing)

    def test_iter(self):
        words = list(self.sw)
        self.assertEqual(len(words), 3)

    def test_add(self):
        """
        Addition through ``+`` and ``+=`` with another StopWord, a list,
        a single string and an unsupported operand.
        """
        merged = ['bar', 'baz', 'foo', 'norf', 'qux']

        # ``+`` with another StopWord keeps the left operand's language.
        nsw = self._new_bar_stop_word() + self.sw
        self.assertEqual(sorted(nsw), merged)
        self.assertEqual(nsw.language, 'bar')

        # ``+=`` with another StopWord.
        nsw = self._new_bar_stop_word()
        nsw += self.sw
        self.assertEqual(sorted(nsw), merged)
        self.assertEqual(nsw.language, 'bar')

        # ``+=`` with a plain list.
        nsw = self._new_bar_stop_word()
        nsw += ['tic', 'tac', 'toc']
        self.assertEqual(sorted(nsw),
                         ['baz', 'norf', 'qux', 'tac', 'tic', 'toc'])

        # ``+=`` with a single word.
        nsw = self._new_bar_stop_word()
        nsw += 'tic'
        self.assertEqual(sorted(nsw), ['baz', 'norf', 'qux', 'tic'])

        if sys.version_info[0] == 2:
            # Python 2 only: the same with a unicode string.
            nsw = self._new_bar_stop_word()
            nsw += 'tic'.decode('utf-8')
            self.assertEqual(sorted(nsw), ['baz', 'norf', 'qux', 'tic'])

        # An unsupported operand raises and leaves the collection intact.
        nsw = self._new_bar_stop_word()
        self.assertRaises(TypeError, nsw.__add__, object())
        self.assertEqual(sorted(nsw), ['baz', 'norf', 'qux'])

    def test_sub(self):
        """
        Subtraction through ``-`` and ``-=`` with another StopWord, a
        list, a single string and an unsupported operand.
        """
        remaining = ['norf', 'qux']

        # ``-`` with another StopWord keeps the left operand's language.
        nsw = self._new_bar_stop_word() - self.sw
        self.assertEqual(sorted(nsw), remaining)
        self.assertEqual(nsw.language, 'bar')

        # ``-=`` with another StopWord.
        nsw = self._new_bar_stop_word()
        nsw -= self.sw
        self.assertEqual(sorted(nsw), remaining)
        self.assertEqual(nsw.language, 'bar')

        # ``-=`` with a plain list; unknown words are ignored.
        nsw = self._new_bar_stop_word()
        nsw -= ['tic', 'tac', 'toc', 'qux']
        self.assertEqual(sorted(nsw), ['baz', 'norf'])

        # ``-=`` with a single word.
        nsw = self._new_bar_stop_word()
        nsw -= 'baz'
        self.assertEqual(sorted(nsw), remaining)

        if sys.version_info[0] == 2:
            # Python 2 only: the same with a unicode string.
            nsw = self._new_bar_stop_word()
            nsw -= 'baz'.decode('utf-8')
            self.assertEqual(sorted(nsw), remaining)

        # An unsupported operand raises and leaves the collection intact.
        nsw = self._new_bar_stop_word()
        self.assertRaises(TypeError, nsw.__sub__, object())
        self.assertEqual(sorted(nsw), ['baz', 'norf', 'qux'])

    def test_str(self):
        text = self.sw.__str__()
        self.assertEqual(text, 'Foo stop words: 3 words')

    def test_repr(self):
        text = self.sw.__repr__()
        self.assertEqual(text, "Foo stop words: ['bar', 'baz', 'foo']")
def check_stop_word_rebase(self, inpout, outpout, sept, char=None):
    """
    Assert that rebasing ``inpout`` on a StopWord built from ``sept``
    gives ``outpout``.

    ``char`` is forwarded to ``rebase`` as second positional argument
    only when it is not None.
    """
    rebase_args = (inpout,) if char is None else (inpout, char)
    stop_word = StopWord("test", sept)
    self.assertEqual(stop_word.rebase(*rebase_args), outpout)
def setUp(self):
    """
    Create the three-word "foo" StopWord stored as ``self.sw``.
    """
    words = ["foo", "bar", "baz"]
    self.sw = StopWord("foo", words)
class StopWordTestCase(TestCase):
    """
    Exercise the StopWord container: size, membership, iteration,
    addition, subtraction and its string representations.
    """

    @staticmethod
    def _new_bar_stop_word():
        # New object and new word list on every call, so an in-place
        # operator cannot affect the following scenario.
        return StopWord("bar", ["baz", "qux", "norf"])

    def setUp(self):
        """
        Create the three-word "foo" StopWord stored as ``self.sw``.
        """
        words = ["foo", "bar", "baz"]
        self.sw = StopWord("foo", words)

    def test_len(self):
        size = len(self.sw)
        self.assertEqual(size, 3)

    def test_contains(self):
        found = "foo" in self.sw
        missing = "qux" in self.sw
        self.assertTrue(found)
        self.assertFalse(missing)

    def test_iter(self):
        words = list(self.sw)
        self.assertEqual(len(words), 3)

    def test_add(self):
        """
        Addition through ``+`` and ``+=`` with another StopWord, a list,
        a single string and an unsupported operand.
        """
        merged = ["bar", "baz", "foo", "norf", "qux"]

        # ``+`` with another StopWord keeps the left operand's language.
        nsw = self._new_bar_stop_word() + self.sw
        self.assertEqual(sorted(nsw), merged)
        self.assertEqual(nsw.language, "bar")

        # ``+=`` with another StopWord.
        nsw = self._new_bar_stop_word()
        nsw += self.sw
        self.assertEqual(sorted(nsw), merged)
        self.assertEqual(nsw.language, "bar")

        # ``+=`` with a plain list.
        nsw = self._new_bar_stop_word()
        nsw += ["tic", "tac", "toc"]
        self.assertEqual(sorted(nsw),
                         ["baz", "norf", "qux", "tac", "tic", "toc"])

        # ``+=`` with a single word.
        nsw = self._new_bar_stop_word()
        nsw += "tic"
        self.assertEqual(sorted(nsw), ["baz", "norf", "qux", "tic"])

        if sys.version_info[0] == 2:
            # Python 2 only: the same with a unicode string.
            nsw = self._new_bar_stop_word()
            nsw += "tic".decode("utf-8")
            self.assertEqual(sorted(nsw), ["baz", "norf", "qux", "tic"])

        # An unsupported operand raises and leaves the collection intact.
        nsw = self._new_bar_stop_word()
        self.assertRaises(TypeError, nsw.__add__, object())
        self.assertEqual(sorted(nsw), ["baz", "norf", "qux"])

    def test_sub(self):
        """
        Subtraction through ``-`` and ``-=`` with another StopWord, a
        list, a single string and an unsupported operand.
        """
        remaining = ["norf", "qux"]

        # ``-`` with another StopWord keeps the left operand's language.
        nsw = self._new_bar_stop_word() - self.sw
        self.assertEqual(sorted(nsw), remaining)
        self.assertEqual(nsw.language, "bar")

        # ``-=`` with another StopWord.
        nsw = self._new_bar_stop_word()
        nsw -= self.sw
        self.assertEqual(sorted(nsw), remaining)
        self.assertEqual(nsw.language, "bar")

        # ``-=`` with a plain list; unknown words are ignored.
        nsw = self._new_bar_stop_word()
        nsw -= ["tic", "tac", "toc", "qux"]
        self.assertEqual(sorted(nsw), ["baz", "norf"])

        # ``-=`` with a single word.
        nsw = self._new_bar_stop_word()
        nsw -= "baz"
        self.assertEqual(sorted(nsw), remaining)

        if sys.version_info[0] == 2:
            # Python 2 only: the same with a unicode string.
            nsw = self._new_bar_stop_word()
            nsw -= "baz".decode("utf-8")
            self.assertEqual(sorted(nsw), remaining)

        # An unsupported operand raises and leaves the collection intact.
        nsw = self._new_bar_stop_word()
        self.assertRaises(TypeError, nsw.__sub__, object())
        self.assertEqual(sorted(nsw), ["baz", "norf", "qux"])

    def test_str(self):
        text = self.sw.__str__()
        self.assertEqual(text, "Foo stop words: 3 words")

    def test_repr(self):
        text = self.sw.__repr__()
        self.assertEqual(text, "Foo stop words: ['bar', 'baz', 'foo']")