Пример #1
0
    def __init__(self, data_list, tokenizer, args, logger):
        """Store the inputs, resolve marker-token ids, and load keyword data.

        Args:
            data_list: Stored unchanged on ``self.data_list``.
            tokenizer: Object exposing ``convert_tokens_to_ids``; used here to
                look up the ids of ``[CLS]``, ``[SEP]`` and ``[unused1]``.
            args: Configuration object. ``max_len`` and ``device`` are copied
                onto the instance and the object itself is kept as
                ``self.args``.
            logger: Stored on ``self.logger`` and handed to ``load_keywords``.
        """
        # Raw inputs and the configuration values copied off ``args``.
        self.data_list, self.tokenizer = data_list, tokenizer
        self.max_len, self.device = args.max_len, args.device
        self.logger, self.args = logger, args

        # Resolve the ids of the marker tokens once, up front.
        # A lookup of '[unused2]' (``self.ot_cls``) used to follow here and is
        # currently disabled.
        to_id = self.tokenizer.convert_tokens_to_ids
        self.cls, self.sep, self.keyword_cls = map(
            to_id, ('[CLS]', '[SEP]', '[unused1]'))

        # ``init()`` runs before the keyword data is loaded; keep this order.
        self.init()

        keyword_data = load_keywords(self.logger)
        self.d, self.key_words = keyword_data
Пример #2
0
    def __init__(self, data_list, tokenizer, args, logger, name_to_code):
        """Store the inputs, resolve marker-token ids, and load lookup data.

        Args:
            data_list: Stored unchanged on ``self.data_list``.
            tokenizer: Object exposing ``convert_tokens_to_ids``; used here to
                look up the ids of ``[CLS]``, ``[SEP]`` and ``[unused1]``.
            args: Configuration object. ``max_len`` and ``device`` are copied
                onto the instance and the object itself is kept as
                ``self.args``.
            logger: Stored on ``self.logger`` and handed to ``load_keywords``.
            name_to_code: Stored unchanged on ``self.name_to_code``.
        """
        # Raw inputs and the configuration values copied off ``args``.
        self.data_list, self.tokenizer = data_list, tokenizer
        self.max_len, self.device = args.max_len, args.device
        self.logger, self.args = logger, args
        self.name_to_code = name_to_code

        # Resolve the ids of the marker tokens once, up front.
        to_id = self.tokenizer.convert_tokens_to_ids
        self.cls, self.sep, self.body_cls = map(
            to_id, ('[CLS]', '[SEP]', '[unused1]'))

        # ``init()`` runs before the keyword and code data are loaded; keep
        # this order.
        self.init()

        keyword_data = load_keywords(self.logger)
        self.d, self.key_words = keyword_data

        self.code_dict = get_coding_type()
Пример #3
0
    def __init__(self):
        """Build the classifier's logger, model, keyword indexes and text settings.

        Everything is loaded eagerly: the model from ``self.load_model()``,
        the keyword collections from ``utils.load_keywords()``, the language
        detector parameters from ``./common/model.latin`` and the NLTK English
        stop-word list.
        """
        # NOTE(review): 'Colsole.log' looks like a typo of 'Console.log'; it is
        # kept as-is because changing it would change where logs are written.
        log = utils.initlog('Console', 'Colsole.log')
        self.logger = log
        log.info('initializing classifier...')

        self.count = 0
        self.w2v_model = self.load_model()

        # The multigram keywords are only needed to build the tries, so they
        # are not stored on the instance.
        profile, required, multigram = utils.load_keywords()
        self.profile_keywords = profile
        self.required_keywords = required
        self.inverted_index = utils.build_inverted_index(required)
        self.ac_tries = utils.build_actries(multigram)

        lang_detector = ldig.LangDetector('./common/model.latin')
        detector_params = lang_detector.load_params()
        self.param, self.labels, self.trie = detector_params

        # Text-processing settings.
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tags = ['JJ', 'NN', 'VB']
        self.tokenizer_pattern = r'''([a-z]\.)+[a-z]?|\w+(-\w+)*'''
        self.url_pattern = re.compile(r'(https?:/*)[^ ]+|#|@')

        self.logger.info('classifier initialized!')
Пример #4
0
    def __init__(self):
        """Build the classifier's logger, model, keyword indexes and text settings.

        Everything is loaded eagerly: the model from ``self.load_model()``,
        the keyword collections from ``utils.load_keywords()``, the language
        detector parameters from ``./common/model.latin`` and the NLTK English
        stop-word list. Two 225-entry per-slot tables (``keyword_map`` and
        ``max_similarity``) are also allocated.
        """
        # NOTE(review): 'Colsole.log' looks like a typo of 'Console.log'; it is
        # kept as-is because changing it would change where logs are written.
        log = utils.initlog('Console', 'Colsole.log')
        self.logger = log
        log.info('initializing classifier...')

        self.w2v_model = self.load_model()

        # The multigram keywords are only needed to build the tries, so they
        # are not stored on the instance.
        profile, required, multigram = utils.load_keywords()
        self.profile_keywords = profile
        self.required_keywords = required
        self.inverted_index = utils.build_inverted_index(required)
        self.ac_tries = utils.build_actries(multigram)

        # Both tables share the same length. Each keyword_map slot needs its
        # own defaultdict, so that list is built element by element.
        slot_count = 225
        self.keyword_map = [
            collections.defaultdict(float) for _ in range(slot_count)
        ]
        self.max_similarity = [0.0] * slot_count

        lang_detector = ldig.LangDetector('./common/model.latin')
        detector_params = lang_detector.load_params()
        self.param, self.labels, self.trie = detector_params

        # Text-processing settings.
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tags = ['JJ', 'NN', 'VB']
        self.tokenizer_pattern = r'''([a-z]\.)+[a-z]?|\w+(-\w+)*'''
        self.url_pattern = re.compile(r'(https?:/*)[^ ]+|#|@')

        self.logger.info('classifier initialized!')
Пример #5
0
    def __init__(self,
                 data_list,
                 standard_name_list,
                 code_to_name,
                 name_to_code,
                 tokenizer,
                 args,
                 logger,
                 neg_list=None):
        """Store the inputs and prepare the selected negative-sampling data.

        Args:
            data_list: Samples to serve (original note: "train val answer").
            standard_name_list: The standard-name vocabulary list.
            code_to_name: Stored unchanged on ``self.code_to_name``.
            name_to_code: Stored unchanged on ``self.name_to_code``.
            tokenizer: Stored unchanged on ``self.tokenizer``.
            args: Configuration object; ``max_len``, ``neg_num``,
                ``neg_sample``, ``train_batch_size``, ``device`` and
                ``add_keywords`` are copied onto the instance.
            logger: Stored on ``self.logger`` and used for progress messages.
            neg_list: Optional precomputed negatives. Per the original note it
                has ``len(data_list)`` items; each item is a list with one
                entry per standard name of that raw name, and each entry holds
                ``args.neg_num`` negative examples.
        """
        # Inputs passed straight through.
        self.data_list, self.standard_name_list = data_list, standard_name_list
        self.code_to_name, self.name_to_code = code_to_name, name_to_code
        self.tokenizer, self.logger = tokenizer, logger
        self.neg_list = neg_list

        # Settings copied off ``args``.
        self.max_len, self.device = args.max_len, args.device
        self.neg_num, self.neg_sample = args.neg_num, args.neg_sample
        self.batch_size = args.train_batch_size
        self.add_keywords = args.add_keywords

        # For the tf-idf based strategies the negatives are built ahead of time.
        if self.neg_sample in ('tf_idf', 'online'):
            self.logger.info('generate tf idf model......')
            self.tf_idf_neg_list = self.get_tf_idf_neg_list()

        if self.neg_sample == 'tree_index':
            self.logger.info('generate tree index......')
            self.tree_index_dict = self.get_tree_index()

        # Keyword-replacement sampling needs the dictionary and keyword data.
        # NOTE(review): the original comment says this is also needed when
        # keyword information is used, yet ``self.add_keywords`` does not
        # trigger the load -- confirm whether it should.
        if self.neg_sample == 'keyword_replace':
            keyword_data = load_keywords(logger)
            self.dict, self.key_words = keyword_data

        self.init()