class Preprocessing:
    """Text preprocessing helpers backed by a ``DataCollector``."""

    def __init__(self):
        """Create the ``DataCollector`` this instance pulls its data from."""
        self.di = DataCollector()

    def to_upper(self):
        """Return the collected ``'data'`` entry converted to upper case.

        Calls ``collect_data()`` on the collector, looks up the ``'data'``
        key of the result and returns ``.upper()`` of that value
        (presumably a ``str`` -- confirm against ``DataCollector``).
        """
        collected = self.di.collect_data()
        return collected['data'].upper()
text = PUNCTUATION_PATTERN.sub('', text) print(text) print('INFO: aaaaand lowercase...') text = text.lower() print(text) print('INFO: removing whitespaces...') text = text.lstrip() text = ' '.join(text.split()) print(text)''' #collect data/clean it print('INFO: starting data collection...') collector = DataCollector() collector.collect_data() print('INFO: data collection complete...') print('INFO: starting data cleaning...') collector.clean_corpus() print('INFO: data cleaning complete...') #print(collector.corpus) bigram_model = BigramModel(collector) word_lengths = [] f = open('word_length.txt', r) for line in f: word_lengths.append(line)