Example #1
def make_identifier():

    # Consume characters while the next one is alphabetic or a digit
    while (is_alpha(peek()) or is_digit(peek())):
        advance()

    # Make the token and return it
    return make_token(identifier_type())
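Example #1 (and Example #7 below) relies on scanner helpers — peek, advance, is_alpha, is_digit — that this page never shows. A minimal sketch of what they could look like, assuming a module-level scanner object with source, start, and current fields; this is illustrative, not the original implementation:

class Scanner:
    def __init__(self, source=''):
        self.source = source
        self.start = 0
        self.current = 0

scanner = Scanner()

def peek():
    # Look at the current character without consuming it; '\0' marks the end
    if scanner.current >= len(scanner.source):
        return '\0'
    return scanner.source[scanner.current]

def advance():
    # Consume and return the current character
    scanner.current += 1
    return scanner.source[scanner.current - 1]

def is_alpha(s):
    # True for letters and underscore; str.isalpha also accepts whole tokens,
    # which is how Examples #2-#6 use it
    return s == '_' or s.isalpha()

def is_digit(s):
    return s.isdigit()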
Example #2
    def _transform(self, token):
        if token in self.transform_dict:
            return random.choice(self.transform_dict[token])
        elif is_alpha(token):
            for pattern in self.pattern_transform_dict:
                if re.match(pattern, token) is not None:
                    return random.choice(self.pattern_transform_dict[pattern])
        return token
Example #3
    def __call__(self, tokens, idx):
        target_token = tokens[idx]
        if idx > 0:
            left_token = tokens[idx - 1]
            if is_alpha(left_token):
                return None
        if idx + 1 < len(tokens):
            right_token = tokens[idx + 1]
            if is_alpha(right_token):
                return None

        new_token = self._transform(target_token)
        if new_token == target_token:
            return None

        new_tokens = tokens[:]
        new_tokens[idx] = new_token
        if self.debug:
            self.transformed_tokens.append(new_tokens)
        return new_tokens
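The two methods above belong to a token-level substitution transform: __call__ refuses to touch a token whose immediate neighbors are alphabetic (to avoid splitting words apart), delegates to _transform, and returns None when nothing changed. A self-contained miniature of the same pattern, with hypothetical names and sample data:

import random

class WordSubstitute:
    # Miniature of the pattern above; the class name and data are invented.
    def __init__(self, transform_dict):
        self.transform_dict = transform_dict

    def __call__(self, tokens, idx):
        # Skip tokens glued to alphabetic neighbors, as in the original __call__
        if idx > 0 and tokens[idx - 1].isalpha():
            return None
        if idx + 1 < len(tokens) and tokens[idx + 1].isalpha():
            return None
        choices = self.transform_dict.get(tokens[idx])
        if not choices:
            return None
        new_tokens = tokens[:]
        new_tokens[idx] = random.choice(choices)
        return new_tokens

sub = WordSubstitute({'bad': ['b@d', 'b.a.d']})
print(sub(['so', ' ', 'bad'], 2))  # e.g. ['so', ' ', 'b@d']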
Example #4
    def attack(self, raw_texts, rounds=5, topK=5):
        print('Round:', rounds, 'TopK:', topK)
        local_scores = []
        transformed_texts = []
        for raw_text in tqdm(raw_texts):
            best_score = 0.0
            raw_tokens = self.tokenizer(raw_text)
            best_transformed_text = raw_text
            best_transformed_tokens = raw_tokens

            preprocessed_raw_text = preprocess_text(''.join(raw_tokens))
            historical_taa_set = {preprocessed_raw_text}
            candidate_taas = {preprocessed_raw_text: raw_tokens}

            ##############################################################
            ### Global transform: replace the entire sentence, then use
            ### those samples as seeds
            ##############################################################
            ## 1. Brute-force whole-sentence replacement
            for _ in range(3):  # 3 vs. 5 makes no real difference, only marginally slower
                self._append_transformed_tokens(
                    historical_taa_set, candidate_taas,
                    self.homonymic_transform.global_transform(
                        raw_tokens))  # replace every offensive keyword

            ## 2. Random whole-sentence replacement
            indices_probs = [
                self.transform_dict[token]['scores']
                if token in self.transform_dict else 0.0
                for token in raw_tokens
            ]
            indices_probs_sum = 0
            valid_cnt = 0
            for prob in indices_probs:
                indices_probs_sum += prob
                valid_cnt += int(prob > 0)
            if indices_probs_sum > 0:
                indices_probs = [
                    prob / indices_probs_sum for prob in indices_probs
                ]
                for round in range(1):  # more rounds here give no real benefit
                    for i in range(1, valid_cnt + 1):
                        indices = np.random.choice(len(raw_tokens),
                                                   i,
                                                   replace=False,
                                                   p=indices_probs)
                        new_tokens = raw_tokens[:]
                        for idx in indices:
                            target_token = new_tokens[idx]
                            tsf_tokens = self.transform_dict[target_token][
                                'transform_tokens']
                            tsf_token_probs = self.transform_dict[
                                target_token]['transform_probs']
                            tsf_idx = np.random.choice(len(tsf_token_probs),
                                                       1,
                                                       p=tsf_token_probs)[0]
                            new_tokens[idx] = tsf_tokens[tsf_idx]
                        self._append_transformed_tokens(
                            historical_taa_set, candidate_taas, new_tokens)

                # # Pick the K strongest-attacking samples for the next round of iteration
                # cur_transformed_texts = []
                # cur_transformed_tokens = []
                # for text in candidate_taas:
                #   cur_transformed_texts.append(text)
                #   cur_transformed_tokens.append(candidate_taas[text])
                # ref_texts = [raw_text] * len(cur_transformed_texts)
                # soft_scores, hard_scores = self.performance_evaluator.calc_final_score(ref_texts, cur_transformed_texts,
                #                                                                        show_details=False)
                # sorted_eval_scores = sorted(enumerate(soft_scores), key=lambda d: d[1], reverse=True)[:topK]
                # if sorted_eval_scores[0][1] > best_score:
                #   best_score = sorted_eval_scores[0][1]
                #   best_transformed_text = cur_transformed_texts[sorted_eval_scores[0][0]]
                #   best_transformed_tokens = cur_transformed_tokens[sorted_eval_scores[0][0]]
                #   candidate_taas = {}
                # else:
                #   candidate_taas = {best_transformed_text: best_transformed_tokens}
                # for idx, score in sorted_eval_scores:
                #   candidate_taas[cur_transformed_texts[idx]] = cur_transformed_tokens[idx]

            for round in range(rounds):
                cur_tokens_list = [
                    candidate_taas[text] for text in candidate_taas
                ]
                for tokens_idx, tokens in enumerate(cur_tokens_list):
                    if len(tokens) == 0:
                        continue
                    ## Genetic attack
                    for other_tokens_idx, other_tokens in enumerate(
                            cur_tokens_list):
                        if other_tokens_idx == tokens_idx or len(
                                tokens) != len(other_tokens):
                            continue

                        new_tokens = tokens[:]
                        target_token_indices = np.random.choice(
                            len(other_tokens),
                            len(other_tokens) // 2,
                            replace=False)
                        for idx in target_token_indices:
                            if idx < len(new_tokens):
                                new_tokens[idx] = other_tokens[idx]
                        self._append_transformed_tokens(
                            historical_taa_set, candidate_taas, new_tokens)
                    idx = random.randint(0, len(tokens) - 1)  # Fixme: replace this random attack

                    if is_alpha(tokens[idx]) and len(tokens[idx]) >= 4:
                        self._append_transformed_tokens(
                            historical_taa_set, candidate_taas,
                            self.char_swap_transform(tokens, idx))

                    self._append_transformed_tokens(
                        historical_taa_set, candidate_taas,
                        self.add_transform(tokens, idx))
                    # self._append_transformed_tokens(historical_taa_set, candidate_taas, self.token_drop_transform(tokens, idx))
                    self._append_transformed_tokens(
                        historical_taa_set, candidate_taas,
                        self.radical_transform(tokens,
                                               idx))  # beware characters that are not left-right structured, e.g. 死, 司

                    self._append_transformed_tokens(
                        historical_taa_set, candidate_taas,
                        self.hxw_transform(tokens, idx))
                    self._append_transformed_tokens(
                        historical_taa_set, candidate_taas,
                        self.hxw_radical_transform(tokens, idx))
                    self._append_transformed_tokens(
                        historical_taa_set, candidate_taas,
                        self.radical_chardrop_transform(tokens, idx))
                    self._append_transformed_tokens(
                        historical_taa_set, candidate_taas,
                        self.hxw_radical_chardroptransform(tokens, idx))

                    # self._append_transformed_tokens(historical_taa_set, candidate_taas,
                    #                                 self.token_swap_transform(tokens, idx))  # word-level swap performs poorly
                    self._append_transformed_tokens(
                        historical_taa_set, candidate_taas,
                        self.phonetic_char_swap_transform(tokens, idx))

                    # # ## fixme: the block below is a minor step in the workflow; a special case
                    # candidates_list = self.pronunciation_transform(tokens, idx, N=5)
                    # transformed_tokens = tokens[:idx]
                    # new_token_chars = []
                    # for raw_char, candidates in zip(tokens[idx], candidates_list):
                    #   for candidate in candidates:
                    #     if candidate != raw_char:
                    #       new_token_chars.append(candidate)
                    #       break
                    # if len(new_token_chars) > 0:
                    #   new_token = ''.join(new_token_chars)
                    # else:
                    #   new_token = ''
                    # transformed_tokens.append(new_token)
                    # transformed_tokens += tokens[idx + 1:]
                    # self._append_transformed_tokens(historical_taa_set, candidate_taas, transformed_tokens)
                    self._append_transformed_tokens(
                        historical_taa_set, candidate_taas,
                        rule_based_transform(tokens, self.transform_dict))

                # Pick the K strongest-attacking samples for the next round of iteration
                cur_transformed_texts = []
                cur_transformed_tokens = []
                for text in candidate_taas:
                    cur_transformed_texts.append(text)
                    cur_transformed_tokens.append(candidate_taas[text])
                ref_texts = [raw_text] * len(cur_transformed_texts)
                soft_scores, hard_scores = self.performance_evaluator.calc_final_score(
                    ref_texts, cur_transformed_texts, show_details=False)
                sorted_eval_scores = sorted(enumerate(soft_scores),
                                            key=lambda d: d[1],
                                            reverse=True)[:topK]
                if sorted_eval_scores[0][1] > best_score:
                    best_score = sorted_eval_scores[0][1]
                    best_transformed_text = cur_transformed_texts[
                        sorted_eval_scores[0][0]]
                    best_transformed_tokens = cur_transformed_tokens[
                        sorted_eval_scores[0][0]]
                    candidate_taas = {}
                else:
                    candidate_taas = {
                        best_transformed_text: best_transformed_tokens
                    }
                for idx, score in sorted_eval_scores:
                    candidate_taas[cur_transformed_texts[
                        idx]] = cur_transformed_tokens[idx]

            transformed_texts.append(best_transformed_text)
            local_scores.append(best_score)

        return transformed_texts, local_scores
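Every example on this page funnels candidates through self._append_transformed_tokens, which is never shown. Judging by the call sites it must ignore failed transforms, deduplicate against historical_taa_set, and register anything new in candidate_taas; a sketch under those assumptions (not the original code):

    def _append_transformed_tokens(self, historical_taa_set, candidate_taas,
                                   new_tokens):
        # Inferred from the call sites: drop failed transforms, normalize the
        # tokens into a text key, and keep only candidates not seen before.
        if not new_tokens:
            return
        text = preprocess_text(''.join(new_tokens))
        if text in historical_taa_set:
            return
        historical_taa_set.add(text)
        candidate_taas[text] = new_tokens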
Example #5
def generate_taa_samples(self, raw_texts, group_ids, rounds=5, topK=5):
    transformed_texts = []
    new_group_ids = []
    for raw_text, group_id in tqdm(zip(raw_texts, group_ids),
                                   total=len(raw_texts)):
        if isinstance(group_id, int):
            is_obs = (group_id == 1)
        else:
            is_obs = group_id.startswith('obs')

        texts_to_add = set()
        raw_tokens = self.tokenizer(raw_text)

        preprocessed_raw_text = preprocess_text(''.join(raw_tokens))
        historical_taa_set = {preprocessed_raw_text}
        candidate_taas = {preprocessed_raw_text: raw_tokens}
        for round in range(rounds):
            cur_tokens_list = [candidate_taas[text] for text in candidate_taas]
            for tokens_idx, tokens in enumerate(cur_tokens_list):
                if len(tokens) == 0:
                    continue
                ## Genetic attack
                for other_tokens_idx, other_tokens in enumerate(
                        cur_tokens_list):
                    if other_tokens_idx == tokens_idx or len(tokens) != len(
                            other_tokens):
                        continue

                    new_tokens = tokens[:]
                    target_token_indices = np.random.choice(
                        len(other_tokens),
                        len(other_tokens) // 2,
                        replace=False)
                    for idx in target_token_indices:
                        if idx < len(new_tokens):
                            new_tokens[idx] = other_tokens[idx]
                    self._append_transformed_tokens(historical_taa_set,
                                                    candidate_taas, new_tokens)

                idx = random.randint(0, len(tokens) - 1)  # Fixme: replace this random attack

                if is_alpha(tokens[idx]) and len(tokens[idx]) >= 4:
                    self._append_transformed_tokens(
                        historical_taa_set, candidate_taas,
                        self.char_swap_transform(tokens, idx))

                self._append_transformed_tokens(
                    historical_taa_set, candidate_taas,
                    self.add_transform(tokens, idx))
                self._append_transformed_tokens(
                    historical_taa_set, candidate_taas,
                    self.token_drop_transform(tokens, idx))
                self._append_transformed_tokens(
                    historical_taa_set, candidate_taas,
                    self.token_swap_transform(tokens, idx))  # word-level swap performs poorly

                self._append_transformed_tokens(
                    historical_taa_set, candidate_taas,
                    self.radical_transform(tokens,
                                           idx))  # beware characters that are not left-right structured, e.g. 死, 司

                self._append_transformed_tokens(
                    historical_taa_set, candidate_taas,
                    self.phonetic_char_swap_transform(tokens, idx))

                self._append_transformed_tokens(
                    historical_taa_set, candidate_taas,
                    self.hxw_transform(tokens, idx))

                # ## fixme: the block below is a minor step in the workflow; a special case
                candidates_list = self.pronunciation_transform(tokens,
                                                               idx,
                                                               N=None)
                transformed_tokens = tokens[:idx]
                new_token_chars = []
                for raw_char, candidates in zip(tokens[idx], candidates_list):
                    for candidate in candidates:
                        if candidate != raw_char:
                            new_token_chars.append(candidate)
                            break
                if len(new_token_chars) > 0:
                    new_token = ''.join(new_token_chars)
                else:
                    new_token = ''
                transformed_tokens.append(new_token)
                transformed_tokens += tokens[idx + 1:]
                self._append_transformed_tokens(historical_taa_set,
                                                candidate_taas,
                                                transformed_tokens)

            # 挑选出K个攻击力最强的样本,进行下一轮迭代
            cur_transformed_texts = []
            cur_transformed_tokens = []
            for text in candidate_taas:
                cur_transformed_texts.append(text)
                cur_transformed_tokens.append(candidate_taas[text])
            ref_texts = [raw_text] * len(cur_transformed_texts)
            soft_scores, hard_scores = self.performance_evaluator.calc_final_score(
                ref_texts,
                cur_transformed_texts,
                show_details=False,
                is_obs=is_obs)
            sorted_eval_scores = sorted(enumerate(soft_scores),
                                        key=lambda d: d[1],
                                        reverse=True)[:topK]
            candidate_taas = {}
            for idx, score in sorted_eval_scores:
                candidate_taas[
                    cur_transformed_texts[idx]] = cur_transformed_tokens[idx]
            texts_to_add.add(cur_transformed_texts[sorted_eval_scores[0]
                                                   [0]])  # add the top scorer each round; after the final round, add them all
        texts_to_add |= set(cur_transformed_texts)
        transformed_texts.extend(list(texts_to_add))
        new_group_ids.extend([group_id] * len(texts_to_add))

    return transformed_texts, new_group_ids
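The "genetic attack" inner loop shared by Examples #4 and #5 mates two equal-length candidates by copying roughly half of the positions from one into a copy of the other. A self-contained miniature of that crossover step (the function name is mine, not the source's):

import numpy as np

def crossover(tokens, other_tokens):
    # Copy about half of other_tokens' positions into a copy of tokens
    new_tokens = tokens[:]
    indices = np.random.choice(len(other_tokens),
                               len(other_tokens) // 2,
                               replace=False)
    for idx in indices:
        if idx < len(new_tokens):
            new_tokens[idx] = other_tokens[idx]
    return new_tokens

print(crossover(list('abcdef'), list('ABCDEF')))  # e.g. ['A', 'b', 'c', 'D', 'E', 'f']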
Example #6
    def attack(self,
               raw_texts,
               rounds=5,
               topK=5,
               debug=False,
               kw_freq_thres=20.0):
        print('Round:', rounds, 'TopK:', topK)
        local_scores = []
        transformed_texts = []
        for i_text, raw_text in tqdm(enumerate(raw_texts),
                                     total=len(raw_texts)):
            best_score = 0.0
            raw_tokens = self.tokenizer(raw_text)
            kw_freqs = []
            for token in raw_tokens:
                if token not in self.kw_freq_dict:
                    self.kw_freq_dict[token] = 0
                self.kw_freq_dict[token] += 5
                kw_freqs.append(self.kw_freq_dict[token])
            self.local_kw_freq_dict = self.kw_freq_dict.copy(
            )  # copy of the global dict; using a local copy for high-frequency queries avoids the huge cost of cross-process synchronization
            mean_freq = np.mean(kw_freqs)
            best_transformed_text = raw_text
            best_transformed_tokens = raw_tokens

            ## todo: if too few of the tokens have been seen before (mean frequency below the threshold), switch to the kw-idf mode
            # if i_text <= kw_idf_cnt:
            if mean_freq < kw_freq_thres:
                kw_scores = self.kw_identification(raw_tokens, len(raw_tokens))
                kw_scores = [score for _, score in kw_scores]

            preprocessed_raw_text = preprocess_text(''.join(raw_tokens))
            historical_taas = {preprocessed_raw_text}
            candidate_taas = {}
            ##############################################################
            ### Global transform: replace the entire sentence, then use
            ### those samples as seeds
            ##############################################################
            # Replace every offensive keyword
            for transform in self.global_transforms:
                for i in range(topK):  # increasing this helps a little
                    self._append_transformed_tokens(
                        historical_taas, candidate_taas,
                        transform.global_transform(raw_tokens))
            if len(candidate_taas) == 0:
                candidate_taas = {preprocessed_raw_text: raw_tokens}

            cur_rounds = rounds  # rounds to run for this text, adjusted by its length
            if len(raw_tokens) < 50:  # fine at 30; uncertain at 50
                cur_rounds = int(cur_rounds *
                                 (1.5 - 0.1 * len(raw_tokens) // 10))
            for round in range(1, cur_rounds + 1):
                cur_tokens_list = [
                    candidate_taas[text] for text in candidate_taas
                ]
                for tokens_idx, tokens in enumerate(cur_tokens_list):
                    if len(tokens) == 0:
                        continue
                    # # Brute-force multi-point crossover genetic attack; looks worse to the eye, but stronger online
                    # for other_tokens_idx, other_tokens in enumerate(cur_tokens_list):
                    #   if other_tokens_idx == tokens_idx or len(tokens) != len(other_tokens):
                    #     continue
                    #
                    #   for ratio in [2]:
                    #     if len(tokens) < ratio:
                    #       continue
                    #     new_tokens1 = tokens[:]
                    #     new_tokens2 = other_tokens[:]  # the loop already visits both (i, j) and (j, i), but an extra pass adds diversity
                    #     target_token_indices = np.random.choice(len(other_tokens), len(other_tokens) // ratio, replace=False)
                    #     for idx in target_token_indices:
                    #       if idx < len(new_tokens1):
                    #         new_tokens1[idx] = other_tokens[idx]
                    #       if idx > len(new_tokens2):
                    #         new_tokens2[idx] = tokens[idx]
                    #     self._append_transformed_tokens(historical_taas, candidate_taas, new_tokens1)
                    #     self._append_transformed_tokens(historical_taas, candidate_taas, new_tokens2)

                    # ## Crossover genetic attack; strong offline and to the eye, but very weak online
                    # for other_tokens_idx, other_tokens in enumerate(cur_tokens_list):
                    #   if other_tokens_idx == tokens_idx:
                    #     continue
                    #
                    #   try:
                    #     tgt_idx = random.randint(3, min(len(tokens), len(other_tokens)) - 3)  # keep the first/last few positions intact
                    #     new_tokens1 = tokens[:tgt_idx] + other_tokens[tgt_idx:]
                    #     new_tokens2 = other_tokens[:tgt_idx] + tokens[tgt_idx:]
                    #     self._append_transformed_tokens(historical_taas, candidate_taas, new_tokens1)
                    #     self._append_transformed_tokens(historical_taas, candidate_taas, new_tokens2)
                    #   except:
                    #     pass
                    idx_probs = None
                    if round % 2:
                        try:
                            if mean_freq < kw_freq_thres:
                                freqs = kw_scores  # add/drop may shift the indices; ignore that for now
                                freqs = freqs[:len(tokens)]
                                freqs += [0] * (len(tokens) - len(freqs))
                                freqs = np.array(freqs)
                                freqs = freqs - freqs.min() + 0.01
                            else:
                                # fixme: could switch to local_kw here for speed, if needed
                                freqs = np.array([
                                    self.kw_freq_dict[token]
                                    if token in self.kw_freq_dict else 1
                                    for token in tokens
                                ])
                            idx_probs = freqs / freqs.sum()
                        except:
                            pass
                    idx = np.random.choice(list(range(len(tokens))),
                                           1,
                                           p=idx_probs)[0]  # targeted attack on keywords
                    indices = np.random.choice(list(range(len(tokens))),
                                               min(3, len(tokens)),
                                               p=idx_probs)  # batch replacement

                    ## Single-point replacement
                    if is_alpha(tokens[idx]) and len(tokens[idx]) >= 4:
                        for transform in self.alpha_transforms:
                            self._append_transformed_tokens(
                                historical_taas, candidate_taas,
                                transform(tokens, idx))

                    # if len(tokens[idx]) > 1:
                    #   ## For non-English, already-transformed tokens, continue right away to keep readability.
                    #   # (They may have been split into radicals that then mutate further, or things like te -> t恶.)
                    #   # Barely affects speed, which means such samples are not very common.
                    #   continue

                    for transform in self.multi_rounds_transforms:
                        for _ in range(3):
                            self._append_transformed_tokens(
                                historical_taas, candidate_taas,
                                transform(tokens, idx))

                    for transform in self.random_transforms:
                        self._append_transformed_tokens(
                            historical_taas, candidate_taas,
                            transform(tokens, idx))

                    for transform in self.fixed_transforms:
                        self._append_transformed_tokens(
                            historical_taas, candidate_taas,
                            transform(tokens, idx))

                    ## Batch replacement, mainly serving methods such as pinyin/add that do not badly hurt
                    ## readability, to offset their disadvantage on the Jaccard metric
                    indices = sorted(indices, reverse=True)  # descending order, for the add transform
                    for transform in self.multi_ptr_transforms:
                        self._append_transformed_tokens(
                            historical_taas, candidate_taas,
                            transform.multi_ptr_trans(tokens, indices))

                # Pick the K strongest-attacking samples for the next round of iteration
                cur_transformed_texts = []
                cur_transformed_tokens = []
                for text in candidate_taas:
                    cur_transformed_texts.append(text)
                    cur_transformed_tokens.append(candidate_taas[text])
                ref_texts = [raw_text] * len(cur_transformed_texts)
                soft_scores, hard_scores = self.performance_evaluator.calc_final_score(
                    ref_texts, cur_transformed_texts, show_details=False)

                ## Frequency-weighted final score; this strategy counters the online automatic defense mechanism
                freqs = np.array([
                    sum([
                        self.local_kw_freq_dict[token]
                        if token in self.local_kw_freq_dict else 1
                        for token in tokens
                    ]) for tokens in cur_transformed_tokens
                ])
                freq_weights = (freqs - freqs.min()) / max(
                    freqs.max() - freqs.min(), 1e-8)  # guard against all-equal freqs
                freq_weights = 1.0 - 0.2 * freq_weights
                soft_scores *= freq_weights
                sorted_eval_scores = sorted(enumerate(soft_scores),
                                            key=lambda d: d[1],
                                            reverse=True)
                if sorted_eval_scores[0][1] > best_score:
                    best_score = sorted_eval_scores[0][1]
                    best_transformed_text = cur_transformed_texts[
                        sorted_eval_scores[0][0]]
                    best_transformed_tokens = cur_transformed_tokens[
                        sorted_eval_scores[0][0]]
                    # best_transformed_tokens = self.tokenizer(best_transformed_text)  # re-tokenizing seems to make no difference, nor does it affect speed
                    candidate_taas = {}
                else:
                    candidate_taas = {
                        best_transformed_text: best_transformed_tokens
                    }
                for idx, score in sorted_eval_scores[:topK]:
                    candidate_taas[cur_transformed_texts[
                        idx]] = cur_transformed_tokens[idx]
                    # candidate_taas[cur_transformed_texts[idx]] = self.tokenizer(cur_transformed_texts[idx])

                # Additionally pick 2 random weaker samples for the next round to keep the
                # candidates diverse; completely useless online
                # try:
                #   extra_cnt = 2
                #   probs = np.array([score for idx, score in sorted_eval_scores[topK:]])  # sample from outside the top-K
                #   probs = probs / probs.sum()
                #   rnd_sample_indices = np.random.choice(list(range(topK, len(sorted_eval_scores))), extra_cnt, replace=False,
                #                                         p=probs)
                #   for idx in rnd_sample_indices:
                #     idx = sorted_eval_scores[idx][0]
                #     candidate_taas[cur_transformed_texts[idx]] = cur_transformed_tokens[idx]
                #     # candidate_taas[cur_transformed_texts[idx]] = self.tokenizer(cur_transformed_texts[idx])
                # except:
                #   pass
            for token in best_transformed_tokens:
                if token not in self.kw_freq_dict:
                    self.kw_freq_dict[token] = 0
                self.kw_freq_dict[token] += 2

            transformed_texts.append(best_transformed_text)
            local_scores.append(best_score)

            if debug:
                ## Measure each transform's contribution
                for transform in self.transforms:
                    tokens_list = transform.transformed_tokens
                    if not tokens_list:
                        continue
                    cur_transformed_texts = list(
                        set([
                            preprocess_text(''.join(tokens))
                            for tokens in tokens_list
                        ]))
                    ref_texts = [raw_text] * len(cur_transformed_texts)
                    soft_scores, hard_scores = self.performance_evaluator.calc_final_score(
                        ref_texts, cur_transformed_texts, show_details=False)
                    transform.mean_scores.append(np.mean(soft_scores))
                    transform.max_scores.append(np.max(soft_scores))
                    transform.clear()
        if debug:
            print('-' * 80)
            print('Mean of Mean scores:')
            print('-' * 80)
            score_records = []
            for transform in self.transforms:
                scores = transform.mean_scores
                score = 0
                if scores:
                    score = np.mean(scores)
                score_records.append((transform, score))
            score_records = sorted(score_records,
                                   key=lambda d: d[1],
                                   reverse=True)
            for k, v in score_records:
                print(k, v)

            print('-' * 80)
            print('Mean of Max scores:')
            print('-' * 80)
            score_records = []
            for transform in self.transforms:
                scores = transform.max_scores
                score = 0
                if scores:
                    score = np.mean(scores)
                score_records.append((transform, score))
            score_records = sorted(score_records,
                                   key=lambda d: d[1],
                                   reverse=True)
            for k, v in score_records:
                print(k, v)

            print('-' * 80)
            print('Max of Max scores:')
            print('-' * 80)
            score_records = []
            for transform in self.transforms:
                scores = transform.max_scores
                score = 0
                if scores:
                    score = np.max(scores)
                score_records.append((transform, score))
            score_records = sorted(score_records,
                                   key=lambda d: d[1],
                                   reverse=True)
            for k, v in score_records:
                print(k, v)

        # print('-' * 80)
        # for token, freq in sorted(self.kw_freq_dict.items(), key=lambda d: d[1], reverse=True)[:50]:
        #   print(token, freq)
        # print('Len freq dict:', len(self.kw_freq_dict))
        # print('-' * 80)
        return transformed_texts, local_scores
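Examples #4-#6 all end each round with the same beam step: score every candidate, remember the best text seen so far, and reseed the pool with the top-K. A miniature of that selection logic, with score_fn standing in for performance_evaluator.calc_final_score (names otherwise mine):

def beam_step(candidate_taas, score_fn, best, topK=5):
    # candidate_taas: {text: tokens}; best: (text, tokens, score)
    texts = list(candidate_taas)
    ranked = sorted(((t, score_fn(t)) for t in texts),
                    key=lambda d: d[1], reverse=True)[:topK]
    best_text, best_tokens, best_score = best
    if ranked[0][1] > best_score:
        best_text, best_score = ranked[0]
        best_tokens = candidate_taas[best_text]
        pool = {}  # a new best resets the pool
    else:
        pool = {best_text: best_tokens}  # otherwise keep the best-ever as a seed
    for text, _ in ranked:
        pool[text] = candidate_taas[text]
    return pool, (best_text, best_tokens, best_score)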
Example #7
def scan_token():

    # Skip Whitespace
    skip_whitespace()
    scanner.start = scanner.current

    # Is End?
    if (is_end()):
        scanner.start = scanner.current + 1
        return make_token(TOKEN_END)

    # Advance
    c = advance()

    # Check character(s)
    if (is_alpha(c)): return make_identifier()  # Identifier
    if (is_digit(c)): return make_number()  # Number

    if (c == '('): return make_token(TOKEN_LEFT_PAREN)  # (
    elif (c == ')'): return make_token(TOKEN_RIGHT_PAREN)  # )
    elif (c == '{'): return make_token(TOKEN_LEFT_BRACE)  # {
    elif (c == '}'): return make_token(TOKEN_RIGHT_BRACE)  # }
    elif (c == '['): return make_token(TOKEN_LEFT_BRACKET)  # [
    elif (c == ']'): return make_token(TOKEN_RIGHT_BRACKET)  # ]
    elif (c == ';'): return make_token(TOKEN_SEMICOLON)  # ;
    elif (c == ','): return make_token(TOKEN_COMMA)  # ,
    elif (c == '.'): return make_token(TOKEN_DOT)  # .
    elif (c == '?'): return make_token(TOKEN_QUESTION)  # ?
    elif (c == ':'): return make_token(TOKEN_COLON)  # :
    elif (c == '-'):
        if (match('-')): return make_token(TOKEN_MINUS_MINUS)  # --
        if (match('=')): return make_token(TOKEN_MINUS_EQUAL)  # -=
        return make_token(TOKEN_MINUS)  # -
    elif (c == '+'):
        if (match('+')): return make_token(TOKEN_PLUS_PLUS)  # ++
        if (match('=')): return make_token(TOKEN_PLUS_EQUAL)  # +=
        return make_token(TOKEN_PLUS)  # +
    elif (c == '/'):
        if (match('=')): return make_token(TOKEN_SLASH_EQUAL)  # /=
        return make_token(TOKEN_SLASH)  # /
    elif (c == '*'):
        if (match('=')): return make_token(TOKEN_STAR_EQUAL)  # *=
        return make_token(TOKEN_STAR)  # *
    elif (c == '%'):
        if (match('=')): return make_token(TOKEN_PERCENT_EQUAL)  # %=
        return make_token(TOKEN_PERCENT)  # %
    elif (c == '!'):
        if (match('=')): return make_token(TOKEN_BANG_EQUAL)  # !=
        return make_token(TOKEN_BANG)  # !
    elif (c == '='):
        if (match('=')): return make_token(TOKEN_EQUAL_EQUAL)  # ==
        return make_token(TOKEN_EQUAL)  # =
    elif (c == '<'):
        if (match('=')): return make_token(TOKEN_LESS_EQUAL)  # <=
        return make_token(TOKEN_LESS)  # <
    elif (c == '>'):
        if (match('=')): return make_token(TOKEN_GREATER_EQUAL)  # >=
        return make_token(TOKEN_GREATER)  # >
    elif (c == '&'):
        if (match('&')): return make_token(TOKEN_AND)  # &&
    elif (c == '|'):
        if (match('|')): return make_token(TOKEN_OR)  # ||
    elif (c == '"'):
        return make_string()  # String
    elif (c in " \r\t#\n"):
        # Found spacing: skip it, scan again, and return that token
        skip_whitespace()
        return scan_token()

    # Unexpected character
    return make_error_token("Unexpected character '{0}'.".format(c))
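A hedged driver for scan_token, assuming the Scanner sketch after Example #1 and that make_token returns objects carrying a type attribute (an assumption; only TOKEN_END's role is visible above):

def scan_all(source):
    # Tokenize a whole string; relies on the hypothetical Scanner sketched earlier
    scanner.source = source
    scanner.start = scanner.current = 0
    tokens = []
    while True:
        token = scan_token()
        tokens.append(token)
        if token.type == TOKEN_END:  # assumed token attribute
            break
    return tokens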