def pennTreeBank(self, text):
    """
    Tokenization using the Penn Tree Bank Tokenizer

    Parameters
    ----------
    arg1 : list
        A list of strings where each string is a single sentence

    Returns
    -------
    list
        A list of lists where each sub-list is a sequence of tokens
    """
    # TreebankWordTokenizer is imported from nltk.tokenize at module level;
    # punctuations is assumed to be a collection of punctuation symbols defined elsewhere in the module.
    tokenizedText = []
    if isinstance(text, list):
        for sentence in text:
            if isinstance(sentence, str):
                tokenizedText_ = TreebankWordTokenizer().tokenize(sentence)
                # Remove any unwanted punctuation symbols which have been classified as tokens.
                # Filtering into a new list avoids the bug of removing items from a list
                # while iterating over it, which silently skips elements.
                # Unwanted spaces did not occur with punkt, so that case is ignored.
                tokenizedText_ = [word for word in tokenizedText_ if word not in punctuations]
                tokenizedText.append(tokenizedText_)
            else:
                print("Warning")
                print("Sentences are not in the form of strings")
                return 0
    else:
        print("Warning")
        print("Argument not in the form of a list.")
        return 0

    return tokenizedText
def pennTreeBank(self, text):
    """
    Tokenization using the Penn Tree Bank Tokenizer

    Parameters
    ----------
    arg1 : list
        A list of strings where each string is a single sentence

    Returns
    -------
    list
        A list of lists where each sub-list is a sequence of tokens
    """
    tokenizedText = []
    for sent in text:
        # Tokenize the sentence using the Penn Treebank algorithm
        tokens = TreebankWordTokenizer().tokenize(sent)
        # The tokenizer does not produce empty tokens or stray spaces here,
        # so only punctuation tokens need to be filtered out. Filtering into
        # a new list avoids removing items from the list while iterating over it.
        tokens = [w for w in tokens if w not in ['?', ':', '!', '.', ',', ';']]
        tokenizedText.append(tokens)
    return tokenizedText
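# A minimal standalone sketch of the same flow, for reference. The enclosing class and the
# module-level punctuations list are not shown above, so both are recreated here purely for
# illustration; it assumes NLTK is installed.
from nltk.tokenize import TreebankWordTokenizer

sentences = ["The quick brown fox jumps over the lazy dog.", "Does it handle questions?"]
punctuations = ['?', ':', '!', '.', ',', ';']  # example punctuation list, assumed for this sketch

tokenized = []
for sent in sentences:
    tokens = TreebankWordTokenizer().tokenize(sent)
    tokens = [w for w in tokens if w not in punctuations]
    tokenized.append(tokens)

print(tokenized)
# [['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
#  ['Does', 'it', 'handle', 'questions']]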