"""Sentence- and word-tokenize preprocessed raw text using NLTK's Punkt model.

Loads raw text via LoadData, preprocesses it via PreProcessor, splits it into
sentences with the pretrained English Punkt tokenizer, then splits each
sentence into whitespace-delimited word tokens and prints the total count.
"""

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import nltk  # BUG FIX: nltk.data.load() is called below but nltk was never imported (NameError)

import PreProcessor
import LoadData


def sentence_to_wordlist(raw):
    """Split a raw sentence string into a list of whitespace-delimited tokens.

    Args:
        raw: The sentence text to split.

    Returns:
        list[str]: The whitespace-separated tokens of *raw* (empty list for
        an empty/whitespace-only string).
    """
    return raw.split()


# Get raw data from previous task: filtered, organised, preprocessed text.
print('GetttingRawData')
rawText = LoadData._getRawDataFromText()
rawPrepProcessedText = PreProcessor.preProcessData(rawText)
print('Got RawData -ProcessingData')

# Sentence-tokenise the preprocessed text with NLTK's pretrained English
# Punkt model (requires the 'punkt' data package to be downloaded).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(rawPrepProcessedText)

# Build the per-sentence word-token lists, skipping empty sentences.
sentences = [sentence_to_wordlist(raw_sentence)
             for raw_sentence in raw_sentences
             if len(raw_sentence) > 0]

# Diagnostic only: total token count across all sentences (not used elsewhere).
tokenCount = sum(len(sentence) for sentence in sentences)
print('Token Count----', tokenCount)