    def extract_tokens_from_file(self, responses, input_filename, n, token_dict):
        # Load the label file to get the response ids and dataset name.
        Y = fh.read_csv(input_filename)
        rids = Y.index
        dataset = fh.get_basename(input_filename)

        for rid in rids:
            if rid in responses:
                # Lowercase and trim surrounding whitespace before tokenizing.
                text = responses[rid].lower().strip()
                tokens = []

                # Split into sentences, then collect word n-grams from each sentence.
                sentences = tokenizer.split_sentences(text)
                for s in sentences:
                    sent_tokens = tokenizer.make_ngrams(s, n)
                    tokens = tokens + sent_tokens

                # Prefix every token, then optionally tag it with its source and dataset.
                tokens = [self.get_prefix() + t for t in tokens]
                if self.params['source'] != 'normalized':
                    tokens = [t + '_<' + self.params['source'] + '>' for t in tokens]
                if self.params['append_dataset']:
                    tokens = [t + '_' + dataset for t in tokens]
                token_dict[rid] = tokens
            else:
                # No response text for this id: store an empty token list.
                token_dict[rid] = []
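These snippets rely on two helper modules that are not shown on this page: fh (file handling) and tokenizer. A minimal sketch of the assumed interface follows, written only to make the examples readable; the names and behaviour are inferred from how the helpers are called above, not taken from the original project.

# Illustrative stand-ins for the helpers used above; signatures and behaviour
# are assumptions inferred from the call sites, not the project's real code.
import os
import re

import pandas as pd

class fh:
    @staticmethod
    def read_csv(input_filename):
        # Assumed to return a DataFrame indexed by response id.
        return pd.read_csv(input_filename, header=0, index_col=0)

    @staticmethod
    def get_basename(input_filename):
        # Filename without directory or extension, e.g. 'data/train.csv' -> 'train'.
        return os.path.splitext(os.path.basename(input_filename))[0]

class tokenizer:
    @staticmethod
    def split_sentences(text):
        # Crude split on sentence-final punctuation; a real implementation would
        # likely use a proper sentence tokenizer (e.g. NLTK's sent_tokenize).
        return [s for s in re.split(r'(?<=[.!?])\s+', text) if s]

    @staticmethod
    def make_ngrams(sentence, n):
        # Word n-grams joined with underscores.
        words = sentence.split()
        return ['_'.join(words[i:i + n]) for i in range(len(words) - n + 1)]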
Example #2
    def extract_tokens_from_text(self, data):
        token_dict = {}
        for key, text in data.items():
            # Lowercase and trim surrounding whitespace before tokenizing.
            text = text.lower().strip()
            tokens = []
            sentences = tokenizer.split_sentences(text)
            for s in sentences:
                # Collect character n-grams from each sentence
                # (+ 1 so the final window is included).
                sent_tokens = [s[i:i + self.n] for i in range(len(s) - self.n + 1)]
                tokens = tokens + sent_tokens
            tokens = [self.get_prefix() + t for t in tokens]
            token_dict[key] = tokens
        return token_dict
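To see what this produces, here is a small, self-contained illustration of the inner comprehension; the window size and prefix value are made up for the example.

# With a window size of 3 (hypothetical value for self.n), the comprehension
# yields every 3-character window of the sentence, which is then prefixed.
s = 'cats sit.'
n = 3
windows = [s[i:i + n] for i in range(len(s) - n + 1)]
print(windows)                       # ['cat', 'ats', 'ts ', 's s', ' si', 'sit', 'it.']
print(['c3_' + t for t in windows])  # prefixed tokens, e.g. 'c3_cat', 'c3_ats', ...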
Example #4
    def extract_tokens_from_file(self, data, n):
        token_dict = {}
        for key, text in data.items():
            # Lowercase and trim surrounding whitespace before tokenizing.
            text = text.lower().strip()
            tokens = []

            # Split into sentences, then collect word n-grams from each sentence.
            sentences = tokenizer.split_sentences(text)
            for s in sentences:
                sent_tokens = tokenizer.make_ngrams(s, n)
                tokens = tokens + sent_tokens

            # Prefix every token with this feature extractor's prefix.
            tokens = [self.get_prefix() + t for t in tokens]
            token_dict[key] = tokens
        return token_dict
    def extract_tokens_from_file(self, responses, input_filename, n, cluster_dict, token_dict):
        # Load the label file to get the response ids for this dataset.
        Y = fh.read_csv(input_filename)
        rids = Y.index

        for rid in rids:
            # Lowercase and trim surrounding whitespace before tokenizing.
            text = responses[rid].lower().strip()
            tokens = []

            sentences = tokenizer.split_sentences(text)
            for s in sentences:
                sent_tokens = tokenizer.make_ngrams(s, n)
                # Strip surrounding backticks and quotes from word-like tokens only.
                sent_tokens = [t.rstrip('`"\'') if re.search('[a-z]', t) else t for t in sent_tokens]
                sent_tokens = [t.lstrip('`"\'') if re.search('[a-z]', t) else t for t in sent_tokens]
                # Mark the end of each sentence with a special token.
                sent_tokens = sent_tokens + ['__ENDS__']
                tokens = tokens + sent_tokens

            # Replace each token with its prefixed cluster id, silently dropping
            # tokens that have no entry in cluster_dict.
            tokens = [self.get_prefix() + cluster_dict[t] for t in tokens if t in cluster_dict]
            token_dict[rid] = tokens
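This last variant maps surface tokens to cluster labels before prefixing them. A hedged usage sketch follows; the instance name, file name, prefix, and cluster ids are all hypothetical, and cluster_dict would in practice come from something like Brown clustering over the corpus.

# Hypothetical cluster assignments from unigram tokens (and the sentence-end
# marker) to cluster ids.
cluster_dict = {
    'cats': 'C017',
    'sit': 'C203',
    '__ENDS__': 'C999',
}

# Assuming `extractor` is an instance of the class that defines the method
# above, `responses` maps response ids to raw text, and 'labels.csv' is a
# label file indexed by those ids, a call would look like:
#
#     token_dict = {}
#     extractor.extract_tokens_from_file(responses, 'labels.csv', 1,
#                                        cluster_dict, token_dict)
#
# token_dict[rid] then holds only prefixed cluster ids, e.g. 'cl_C017', because
# tokens missing from cluster_dict are filtered out.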