def create_features(_words: list) -> list:
    """Count occurrences of the non-stopword entries of ``_words``.

    Returns a list with one integer slot per entry of the module-level
    ``ids`` mapping; the slot at ``ids[word]`` is incremented once for
    every non-stopword ``word`` in ``_words``.

    Raises whatever ``ids[word]`` raises for a non-stopword that is not
    present in ``ids``.
    """
    counts = [0] * len(ids)
    for token in _words:
        if is_stopword(token):
            continue
        # NOTE(review): the stem is computed but its result is never used;
        # the count below is indexed by the raw word. Confirm whether
        # ``ids`` is keyed by stems (then this should index by the stem).
        stemmer.stem(token)
        counts[ids[token]] += 1
    return counts
def preprocessor(sentence: str) -> str:
    """Return ``sentence`` with stopwords and ``--`` dropped and the rest stemmed.

    The sentence is split on whitespace, every surviving token is passed
    through ``stem`` and the results are re-joined with single spaces.
    """
    # NOTE(review): ``str.split()`` without arguments never yields empty
    # strings, so ``len(token) < 1`` cannot be true here. Possibly
    # ``len(token) <= 1`` was intended -- confirm before changing.
    kept = [
        stem(token)
        for token in sentence.split()
        if not (is_stopword(token) or len(token) < 1 or token == "--")
    ]
    return " ".join(kept)
def get_feature(sentence):
    """Return the sentence with stopwords removed and the remaining words stemmed.

    Tokens are obtained by whitespace splitting. Stopwords, single-character
    tokens and the literal ``--`` are discarded; every other token is passed
    through ``stem`` and the results are joined with single spaces.
    """
    def _is_dropped(token):
        # Same three rejection rules, evaluated in the same order.
        return is_stopword(token) or len(token) == 1 or token == '--'

    return ' '.join(
        stem(token) for token in sentence.split() if not _is_dropped(token)
    )
def get_feature_base(sentence):
    """Return ``sentence`` with stopwords removed and the other words stemmed.

    A new English Snowball stemmer is created on every call. The sentence is
    split on whitespace, stopwords are skipped, and the stemmed remainder is
    joined with single spaces.
    """
    stem_word = snowballstemmer.stemmer('english').stemWord
    return ' '.join(
        stem_word(token) for token in sentence.split() if not is_stopword(token)
    )
def feature_extraction(text: str):
    """Return ``text`` with stopwords and one-character tokens removed, stemmed.

    The text is split on single space characters; each kept token is passed
    through ``stemmer.stemWord`` and the results are joined with single
    spaces.
    """
    kept = []
    # Splitting on ' ' (rather than on any whitespace) produces empty tokens
    # for consecutive, leading or trailing spaces. Those have length 0, so
    # they are only dropped if ``is_stopword`` rejects them.
    for token in text.split(' '):
        if not is_stopword(token) and len(token) != 1:
            kept.append(stemmer.stemWord(token))
    return ' '.join(kept)
def clean_sentence(line: str) -> str:
    """Return ``line`` as space-joined stems with stopwords removed.

    Each whitespace-separated token has trailing newline/punctuation
    characters (``\\n , . ; : ? !``) stripped and is then stemmed. The
    stopword test is applied to the stem, not to the raw token; stems that
    are stopwords or empty are dropped.
    """
    # Lazy generator: each token is stemmed immediately before it is tested.
    stems = (stem(token.rstrip('\n,.;:?!')) for token in line.split())
    return ' '.join(
        stemmed for stemmed in stems
        if not (is_stopword(stemmed) or stemmed == '')
    )
def stopword_removal(word_features="word_features.txt", no_stopword_features="no_stopword_features.txt"):
    """Copy a feature file, dropping every stopword from each line.

    Each input line is expected to be ``label word word ...`` separated by
    single spaces. The label (first field) is always kept; the remaining
    fields are written back, space separated, unless ``is_stopword`` reports
    them as stopwords. Every output line is terminated with ``"\\n"``.

    Args:
        word_features: Path of the input file (read as latin-1).
        no_stopword_features: Path of the output file (written as latin-1).
    """
    # NOTE(review): 10662 is presumably the number of lines in the input
    # file -- confirm. It only affects the progress-bar display.
    pbar = tqdm(total=10662)
    try:
        with open(word_features, "r", encoding="latin-1") as f, \
                open(no_stopword_features, "w", encoding="latin-1") as fw:
            for line in f:
                # split(" ") always yields at least one field, so the
                # label unpacking cannot fail (an empty line gives label "").
                label, *words = line.rstrip().split(" ")
                kept = [word for word in words if not is_stopword(word)]
                # One write per line instead of one per word.
                fw.write(" ".join([label, *kept]) + "\n")
                pbar.update(1)
    finally:
        # Close the progress bar even if opening or processing fails.
        pbar.close()
def test_check_nlp(self):
    # 'nlp' is expected not to be reported as a stopword.
    verdict = is_stopword('nlp')
    self.assertFalse(verdict)
def test_check_the(self):
    # 'the' is expected to be reported as a stopword.
    verdict = is_stopword('the')
    self.assertTrue(verdict)
def test_check_a(self):
    # The single-letter word 'a' is expected to be reported as a stopword.
    verdict = is_stopword('a')
    self.assertTrue(verdict)
def test_check_no_args(self):
    # Calling is_stopword with no arguments must raise TypeError.
    self.assertRaises(TypeError, is_stopword)
def test_check_zerolength(self):
    # A zero-length string is expected to be rejected with TypeError.
    self.assertRaises(TypeError, is_stopword, '')
def test_check_physics(self):
    # 'physics' is expected not to be reported as a stopword.
    verdict = is_stopword('physics')
    self.assertFalse(verdict)
def check(stem):
    """Return True when ``stem`` should be kept, False otherwise.

    A stem is rejected when ``is_stopword`` reports it as a stopword or
    when it is exactly one character long.
    """
    # The stopword test runs first; the length is only inspected for
    # non-stopwords.
    return not is_stopword(stem) and len(stem) != 1
def test_check_you(self):
    # 'you' is expected to be reported as a stopword.
    verdict = is_stopword('you')
    self.assertTrue(verdict)
def test_check_i(self):
    # Lower-case 'i' is expected to be reported as a stopword.
    verdict = is_stopword('i')
    self.assertTrue(verdict)
def test_is_stopword(self):
    # Expected result per word; note that 'me' is expected NOT to be
    # reported as a stopword by the implementation under test.
    expectations = (('is', True), ('me', False))
    for word, expected in expectations:
        # assertEqual (not assertTrue/assertFalse) so the result must
        # compare equal to the bool, not merely be truthy/falsy.
        self.assertEqual(is_stopword(word), expected)