예제 #1
0
    def supported_languages():
        """Return the capitalized names of languages for which NLTK
        provides a stopword list.

        Returns an empty list when no NLTK data has been downloaded.
        """
        stopwords_listdir = []
        try:
            # NLTK stores one lowercase-named file per language.
            stopwords_listdir = [file for file in
                                 os.listdir(stopwords._get_root())
                                 if file.islower()]
        except LookupError:  # when no NLTK data is available
            pass

        return [file.capitalize() for file in stopwords_listdir]
예제 #2
0
    def supported_languages():
        """List the languages for which NLTK ships a stopword file.

        Yields an empty result when the NLTK corpus data is missing.
        """
        languages = []
        try:
            root = stopwords._get_root()
            languages = [entry for entry in os.listdir(root)
                         if entry.islower()]
        except LookupError:  # when no NLTK data is available
            pass

        return [entry.capitalize() for entry in languages]
예제 #3
0
    def supported_languages():
        """Return '中文' followed by the sorted, capitalized names of
        NLTK's stopword languages (empty apart from '中文' when no
        NLTK data is available).
        """
        found = []
        try:
            found = [name for name in os.listdir(stopwords._get_root())
                     if name.islower()]
        except LookupError:  # when no NLTK data is available
            pass

        return ['中文'] + sorted(name.capitalize() for name in found)
예제 #4
0
    def from_file(self, path):
        """Load the word list from *path*; a falsy path clears the list."""
        self.file_path = path
        if not path:
            self.word_list = []
            return
        encoding = detect_encoding(path)
        with open(path, encoding=encoding) as handle:
            # One word per line; strip surrounding whitespace/newlines.
            self.word_list = {line.strip() for line in handle}


# Discover NLTK's per-language stopword files at import time; NLTK names
# each file after its language in lowercase.
stopwords_listdir = []
try:
    corpus_root = stopwords._get_root()
    stopwords_listdir = [entry for entry in os.listdir(corpus_root)
                         if entry.islower()]
except LookupError:  # when no NLTK data is available
    pass


class StopwordsFilter(BaseTokenFilter, WordListMixin):
    """ Remove tokens present in NLTK's language specific lists or a file. """
    # Display name of this filter.
    name = 'Stopwords'

    # Capitalized language names derived from the NLTK stopword files
    # discovered at import time (empty when no NLTK data is available).
    supported_languages = [file.capitalize() for file in stopwords_listdir]

    # NOTE(review): decorator presumably defers until NLTK data is
    # downloaded — confirm against wait_nltk_data's definition.
    @wait_nltk_data
    def __init__(self, language='English', word_list=None):
        # Initialize the word-list mixin before the base filter.
        WordListMixin.__init__(self, word_list)
        super().__init__()
예제 #5
0
        self.file_path = None
        self.word_list = word_list or []

    def from_file(self, path):
        """Replace the word list with the contents of *path*.

        A falsy *path* resets the list to empty.
        """
        self.file_path = path
        if path:
            with open(path, encoding=detect_encoding(path)) as src:
                # Each line holds one word; drop surrounding whitespace.
                self.word_list = set(line.strip() for line in src)
        else:
            self.word_list = []

# Collect the lowercase-named stopword files NLTK ships, one per language.
stopwords_listdir = []
try:
    root = stopwords._get_root()
    stopwords_listdir = [name for name in os.listdir(root) if name.islower()]
except LookupError:     # when no NLTK data is available
    pass


class StopwordsFilter(BaseTokenFilter, WordListMixin):
    """ Remove tokens present in NLTK's language specific lists or a file. """
    # Display name of this filter.
    name = 'Stopwords'

    # Capitalized language names built from the NLTK stopword files found
    # at import time (empty when no NLTK data is available).
    supported_languages = [file.capitalize() for file in stopwords_listdir]

    def __init__(self, language='English', word_list=None):
        """Create the filter for *language*, optionally seeded with a
        user-supplied *word_list*.
        """
        # Initialize the word-list mixin before the base filter.
        WordListMixin.__init__(self, word_list)
        super().__init__()
        self.language = language