def extract_tokens_from_file(self, responses, input_filename, n, token_dict):
    """Tokenize each response listed in *input_filename* into n-grams and
    record them in *token_dict*, keyed by response id.

    Responses missing from *responses* get an empty token list. Tokens are
    prefixed via ``self.get_prefix()``; depending on ``self.params`` they may
    also be tagged with the source name and/or the dataset basename.
    """
    Y = fh.read_csv(input_filename)
    dataset = fh.get_basename(input_filename)
    for rid in Y.index:
        if rid not in responses:
            token_dict[rid] = []
            continue
        text = responses[rid].lower().strip()
        tokens = []
        for sentence in tokenizer.split_sentences(text):
            tokens.extend(tokenizer.make_ngrams(sentence, n))
        prefix = self.get_prefix()
        tokens = [prefix + t for t in tokens]
        source = self.params['source']
        if source != 'normalized':
            # Tag tokens from non-normalized sources so they stay distinct.
            tokens = [t + '_<' + source + '>' for t in tokens]
        if self.params['append_dataset']:
            tokens = [t + '_' + dataset for t in tokens]
        token_dict[rid] = tokens
def extract_tokens_from_text(self, data):
    """Tokenize each text in *data* into character n-grams of length ``self.n``.

    Parameters
    ----------
    data : dict
        Maps keys (presumably response ids — confirm against caller) to raw
        text strings.

    Returns
    -------
    dict
        Maps each key to a list of prefixed character n-grams.
    """
    token_dict = {}
    for key, text in data.items():
        text = text.lower().strip()
        tokens = []
        for s in tokenizer.split_sentences(text):
            # A string of length L has L - n + 1 character n-grams; the
            # original range(len(s) - self.n) dropped the final n-gram.
            sent_tokens = [s[i:i + self.n] for i in range(len(s) - self.n + 1)]
            tokens.extend(sent_tokens)
        token_dict[key] = [self.get_prefix() + t for t in tokens]
    return token_dict
def extract_tokens_from_text(self, data):
    """Convert each text in *data* into prefixed character n-grams.

    Each value in *data* is lowercased, stripped, split into sentences,
    and sliced into overlapping substrings of length ``self.n``.

    Returns a dict mapping each input key to its list of prefixed n-grams.
    """
    token_dict = {}
    for key, text in data.items():
        text = text.lower().strip()
        tokens = []
        for s in tokenizer.split_sentences(text):
            # Fix off-by-one: range must extend to len(s) - self.n + 1 so
            # the n-gram ending at the last character is included.
            tokens.extend(s[i:i + self.n] for i in range(len(s) - self.n + 1))
        token_dict[key] = [self.get_prefix() + t for t in tokens]
    return token_dict
def extract_tokens_from_file(self, data, n):
    """Build a dict of prefixed n-gram tokens for every text in *data*.

    Each text is lowercased and stripped, split into sentences, and each
    sentence converted to n-grams via ``tokenizer.make_ngrams``.
    """
    token_dict = {}
    prefix = self.get_prefix()
    for key, raw_text in data.items():
        normalized = raw_text.lower().strip()
        tokens = []
        for sentence in tokenizer.split_sentences(normalized):
            tokens += tokenizer.make_ngrams(sentence, n)
        token_dict[key] = [prefix + token for token in tokens]
    return token_dict
def extract_tokens_from_file(self, responses, input_filename, n, cluster_dict, token_dict):
    """Tokenize responses into n-grams, map them through *cluster_dict*,
    and store the prefixed cluster labels in *token_dict* by response id.

    Quote characters are stripped from both ends of word-like tokens, each
    sentence is terminated with the sentinel ``__ENDS__``, and only tokens
    present in *cluster_dict* survive into the output.
    NOTE(review): every id in the CSV index is assumed present in
    *responses* (no membership check) — confirm against caller.
    """
    Y = fh.read_csv(input_filename)
    prefix = self.get_prefix()
    quote_chars = '`"\''
    for rid in Y.index:
        text = responses[rid].lower().strip()
        tokens = []
        for sentence in tokenizer.split_sentences(text):
            for token in tokenizer.make_ngrams(sentence, n):
                # Strip surrounding quote marks only from word-like tokens;
                # stripping quotes cannot remove a letter, so doing both
                # sides in one pass matches the original two-pass logic.
                if re.search('[a-z]', token):
                    token = token.rstrip(quote_chars).lstrip(quote_chars)
                tokens.append(token)
            tokens.append('__ENDS__')
        token_dict[rid] = [prefix + cluster_dict[t] for t in tokens if t in cluster_dict]