def get_vocab(corpus):
    """Count token occurrences across every sentence in *corpus*.

    Each sentence is passed through ``clean_str`` (defined elsewhere in this
    module) and the result is split with ``.split()``; every resulting token
    increments its entry in the returned mapping. The number of distinct
    tokens is printed to stdout before returning.

    Args:
        corpus: Iterable of sentences accepted by ``clean_str``.

    Returns:
        A ``defaultdict(float)`` mapping token -> occurrence count. Counts
        are floats because the mapping's default factory is ``float``.
    """
    counts = defaultdict(float)
    for sentence in corpus:
        tokens = clean_str(sentence).split()
        for token in tokens:
            counts[token] = counts[token] + 1
    # Report the vocabulary size once, after the whole corpus is consumed.
    print(len(counts))
    return counts
def process(corpus):
    """Apply ``clean_str`` to every sentence in *corpus*.

    Args:
        corpus: Iterable of sentences accepted by ``clean_str`` (defined
            elsewhere in this module).

    Returns:
        A new list holding ``clean_str(sentence)`` for each sentence, in
        iteration order.
    """
    cleaned = []
    for sentence in corpus:
        cleaned.append(clean_str(sentence))
    return cleaned