    def extract_tokens_from_file(self, responses, input_filename, n, token_dict):
        # Read the CSV only to get the set of response IDs and the dataset name
        Y = fh.read_csv(input_filename)
        rids = Y.index
        dataset = fh.get_basename(input_filename)

        for rid in rids:
            if rid in responses:
                text = responses[rid].lower().strip()
                tokens = []

                # Split the response into sentences and collect n-grams from each
                sentences = tokenizer.split_sentences(text)
                for s in sentences:
                    sent_tokens = tokenizer.make_ngrams(s, n)
                    #sent_tokens = [t.rstrip('`"\'') if re.search('[a-z]', t) else t for t in sent_tokens]
                    #sent_tokens = [t.lstrip('`"\'') if re.search('[a-z]', t) else t for t in sent_tokens]
                    tokens.extend(sent_tokens)

                # Tag each token with the feature prefix, the text source, and (optionally) the dataset name
                tokens = [self.get_prefix() + t for t in tokens]
                if self.params['source'] != 'normalized':
                    tokens = [t + '_<' + self.params['source'] + '>' for t in tokens]
                if self.params['append_dataset']:
                    tokens = [t + '_' + dataset for t in tokens]
                token_dict[rid] = tokens
            else:
                # No response text for this ID: store an empty token list
                token_dict[rid] = []
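
A quick illustration of the tagging at the end of this example, with made-up values for the prefix, params, and dataset name (in the real code they come from the extractor object):

prefix = 'ngrams_'                                     # stand-in for self.get_prefix()
params = {'source': 'ocr', 'append_dataset': True}     # hypothetical params
dataset = 'train'

tokens = ['hello', 'world']
tokens = [prefix + t for t in tokens]
if params['source'] != 'normalized':
    tokens = [t + '_<' + params['source'] + '>' for t in tokens]
if params['append_dataset']:
    tokens = [t + '_' + dataset for t in tokens]
print(tokens)  # ['ngrams_hello_<ocr>_train', 'ngrams_world_<ocr>_train']
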
Example #2
    def extract_tokens_from_text(self, data, items_to_load, doc_index=None):
        token_dict = {}

        for key in items_to_load:
            # If a document index is given, it maps each key to a source filename
            if doc_index is not None:
                doc_key = doc_index[key]['filename']
            else:
                doc_key = key
            text = data[doc_key]
            if self.lower:
                text = text.lower()
            text = text.strip()
            if self.replace_num is not None:
                text = re.sub(r'\d', self.replace_num, text)
            tokens = []

            # Documents are newline-delimited; each line is treated as one sentence
            sentences = text.split('\n')

            if doc_index is None or 'sentences' not in doc_index[key]:
                # Tokenize every sentence
                for s in sentences:
                    sent_tokens = tokenizer.make_ngrams(s, self.n, replace_numbers=False)
                    tokens.extend(sent_tokens)
            else:
                # Tokenize only the sentences selected by the document index
                for i in doc_index[key]['sentences']:
                    s = sentences[int(i)]
                    sent_tokens = tokenizer.make_ngrams(s, self.n, replace_numbers=False)
                    tokens.extend(sent_tokens)

            tokens = [self.get_prefix() + t for t in tokens]

            token_dict[key] = tokens
        return token_dict
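
The doc_index convention this example assumes, shown with hypothetical data: each entry maps an item key to a source filename and, optionally, the indices of the newline-delimited sentences to keep.

doc_index = {'item_1': {'filename': 'doc_a.txt', 'sentences': [0, 2]}}
data = {'doc_a.txt': 'first line\nsecond line\nthird line'}

key = 'item_1'
lines = data[doc_index[key]['filename']].split('\n')
selected = [lines[int(i)] for i in doc_index[key]['sentences']]
print(selected)  # ['first line', 'third line']
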
Example #3
    def extract_tokens_from_text(self, data):
        token_dict = {}
        for key, text in data.items():
            text = text.lower().strip()
            tokens = []

            # Split the text into sentences and collect n-grams from each
            sentences = tokenizer.split_sentences(text)
            for s in sentences:
                sent_tokens = tokenizer.make_ngrams(s, self.n)
                tokens.extend(sent_tokens)

            # Tag each token with the feature prefix
            tokens = [self.get_prefix() + t for t in tokens]
            token_dict[key] = tokens
        return token_dict
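
None of these examples show tokenizer.make_ngrams itself; the rough stand-in below (whitespace tokenization, adjacent words joined with underscores) is only a guess at the shape of its output, not the project's actual tokenizer:

def make_ngrams_sketch(sentence, n):
    words = sentence.split()
    return ['_'.join(words[i:i + n]) for i in range(len(words) - n + 1)]

print(make_ngrams_sketch('the cat sat on the mat', 1))
# ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(make_ngrams_sketch('the cat sat on the mat', 2))
# ['the_cat', 'cat_sat', 'sat_on', 'on_the', 'the_mat']
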
Example #4
    def extract_tokens_from_file(self, data, n):
        token_dict = {}
        for key, text in data.items():
            text = text.lower().strip()
            tokens = []

            # Split the text into sentences and collect n-grams from each
            sentences = tokenizer.split_sentences(text)
            for s in sentences:
                sent_tokens = tokenizer.make_ngrams(s, n)
                tokens.extend(sent_tokens)

            tokens = [self.get_prefix() + t for t in tokens]
            token_dict[key] = tokens
        return token_dict
Example #5
    def extract_tokens_from_file(self, responses, input_filename, n, cluster_dict, token_dict):
        # Read the CSV only to get the set of response IDs
        Y = fh.read_csv(input_filename)
        rids = Y.index

        for rid in rids:
            text = responses[rid].lower().strip()
            tokens = []

            sentences = tokenizer.split_sentences(text)
            for s in sentences:
                sent_tokens = tokenizer.make_ngrams(s, n)
                # Strip stray backticks and quotes from word-like tokens only
                sent_tokens = [t.rstrip('`"\'') if re.search('[a-z]', t) else t for t in sent_tokens]
                sent_tokens = [t.lstrip('`"\'') if re.search('[a-z]', t) else t for t in sent_tokens]
                # Mark the end of each sentence with a special token
                sent_tokens = sent_tokens + ['__ENDS__']
                tokens.extend(sent_tokens)

            # Replace each token with its cluster ID, dropping tokens not in the cluster vocabulary
            tokens = [self.get_prefix() + cluster_dict[t] for t in tokens if t in cluster_dict]
            token_dict[rid] = tokens
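
The cluster lookup at the end of this last example, shown with a hypothetical cluster_dict: tokens are replaced by their cluster IDs, and anything outside the cluster vocabulary (here 'the') is silently dropped.

cluster_dict = {'cat': 'C17', 'sat': 'C04', '__ENDS__': 'C00'}
tokens = ['the', 'cat', 'sat', '__ENDS__']
prefix = 'cl_'    # stand-in for self.get_prefix()
print([prefix + cluster_dict[t] for t in tokens if t in cluster_dict])
# ['cl_C17', 'cl_C04', 'cl_C00']
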