def test_dna_symbols(self):
    """Exercise the ``dna_symbols`` binary feature on two sample words.

    ``'A'`` is expected to store a value equal to ``True`` under
    ``'dna_symbols[0]'``; for ``'asd'`` the lookup is expected to give
    ``None``.
    """
    matcher = self.feature.reg_dna_symbols.match
    # (input word, value expected from features.get('dna_symbols[0]'))
    cases = (
        ('A', True),
        ('asd', None),
    )
    for word, expected in cases:
        # A fresh dictionary per case so the checks cannot influence each other.
        features = FeatureDictionary()
        eval_binary_feature(features, 'dna_symbols', matcher, word)
        self.assertEqual(features.get('dna_symbols[0]'), expected)
def test_type2(self):
    """Exercise the ``type2`` binary feature (predicate: word equals ``'p'``).

    For the word ``'p'`` the value stored under ``'type2[0]'`` is expected
    to equal ``1``; for ``'as'`` the lookup of that same key is expected to
    give ``None``.
    """
    feature_dic = FeatureDictionary()
    eval_binary_feature(feature_dic, 'type2', lambda x: x == 'p', 'p')
    self.assertEqual(feature_dic.get('type2[0]'), 1)

    feature_dic = FeatureDictionary()
    eval_binary_feature(feature_dic, 'type2', lambda x: x == 'p', 'as')
    # The negative check has to query exactly the key used in the positive
    # case. A misspelled key (e.g. a missing closing bracket) is never
    # present, so the assertion would pass without testing anything.
    self.assertEqual(feature_dic.get('type2[0]'), None)
def test_mutation_word(self):
    """Exercise the ``mutat_word`` binary feature on two sample words.

    ``'repeats'`` is expected to store a value equal to ``True`` under
    ``'mutat_word[0]'``; for ``'repssts'`` the lookup is expected to give
    ``None``.
    """
    matcher = self.feature.reg_mutat_word.match
    # (input word, value expected from features.get('mutat_word[0]'))
    cases = (
        ('repeats', True),
        ('repssts', None),
    )
    for word, expected in cases:
        # A fresh dictionary per case so the checks cannot influence each other.
        features = FeatureDictionary()
        eval_binary_feature(features, 'mutat_word', matcher, word)
        self.assertEqual(features.get('mutat_word[0]'), expected)
def test_regex_evaluator(self):
    """Check ``eval_binary_feature`` when the evaluator is a regex ``search``.

    With the pattern ``^[A-Z]+$``, the input ``'ABC'`` is expected to yield
    a single entry ``'name[0]'`` equal to ``1``; the input ``'abc'`` is
    expected to leave the dictionary empty.
    """
    upper_only = re.compile('^[A-Z]+$')
    # (input text, expected value of 'name[0]', expected dictionary size)
    cases = (
        ('ABC', 1, 1),
        ('abc', None, 0),
    )
    for text, expected_value, expected_size in cases:
        features = FeatureDictionary()
        eval_binary_feature(features, 'name', upper_only.search, text)
        self.assertEqual(features.get('name[0]'), expected_value)
        self.assertEqual(len(features), expected_size)
def generate(self, dataset):
    """
    Populate ``token.features`` for every token yielded by ``dataset.tokens()``.

    Counting, shape, prefix and suffix features are written for every token.
    When ``self.get_mutation_features`` is truthy, mutation-related features
    are written as well.

    :type dataset: nalaf.structures.data.Dataset
    """
    # Word of the token handled in the previous iteration ("" for the first
    # token); it is passed to has_protein_symbols below.
    last_token_str = ""
    for token in dataset.tokens():
        # Features derived from the helper methods of the same names.
        token.features['num_nr'] = self.num_digits(token.word)
        token.features['num_up'] = self.num_capital_chars(token.word)
        token.features['num_lo'] = self.num_lower_chars(token.word)
        token.features['num_alpha'] = self.num_alpha(token.word)
        token.features['num_spec_chars'] = self.num_spec_chars(token.word)
        token.features['shape1'] = self.word_shape_1(token.word)
        token.features['shape2'] = self.word_shape_2(token.word)
        token.features['shape3'] = self.word_shape_3(token.word)
        token.features['shape4'] = self.word_shape_4(token.word)

        # prefix patterns (keys are numbered from 1: prefix1, prefix2, ...)
        for index, value in enumerate(self.prefix_pattern(token.word)):
            token.features['prefix{}'.format(index+1)] = value

        # suffix patterns (keys are numbered from 1: suffix1, suffix2, ...)
        for index, value in enumerate(self.suffix_pattern(token.word)):
            token.features['suffix{}'.format(index+1)] = value

        # Mutation related
        if self.get_mutation_features:
            token.features['mutat_article_bp'] = self.mutation_article_bp(token.word)
            token.features['type1'] = self.is_special_type_1(token.word)
            token.features['mutat_type'] = self.mutation_type(token.word)
            token.features['protein_symbols'] = self.has_protein_symbols(token.word, last_token_str)
            token.features['rs_code'] = self.has_rscode(token.word)

            # binary features
            # NOTE(review): the original block layout was not recoverable;
            # these calls are assumed to belong to the mutation-related
            # branch -- confirm against the intended nesting.
            # mutat_word is evaluated on the lower-cased word, the others on
            # the word as-is.
            eval_binary_feature(token.features, 'mutat_word', self.reg_mutat_word.match, token.word.lower())
            eval_binary_feature(token.features, 'num_has_chr_key', self.reg_chr_keys.search, token.word)
            eval_binary_feature(token.features, 'type2', lambda x: x == 'p', token.word)
            eval_binary_feature(token.features, 'dna_symbols', self.reg_dna_symbols.match, token.word)

        # last token
        last_token_str = token.word
def test_lambda_evaluator(self):
    """Check ``eval_binary_feature`` when the evaluator is a lambda.

    Covers a one-argument and a two-argument predicate, each with one input
    expected to yield a single ``'name[0]'`` entry equal to ``1`` and one
    input expected to leave the dictionary empty.
    """
    # (predicate, positional arguments forwarded to it,
    #  expected value of 'name[0]', expected dictionary size)
    cases = (
        (lambda x: x == 'ABC', ('ABC',), 1, 1),
        (lambda x: x == 'ABC', ('abc',), None, 0),
        (lambda x, y: x == y, ('xx', 'xx'), 1, 1),
        (lambda x, y: x == y, ('xx', 'yy'), None, 0),
    )
    for predicate, arguments, expected_value, expected_size in cases:
        features = FeatureDictionary()
        eval_binary_feature(features, 'name', predicate, *arguments)
        self.assertEqual(features.get('name[0]'), expected_value)
        self.assertEqual(len(features), expected_size)