Example #1
#!/usr/bin/env python
# coding:utf-8
import pandas as pd
from tqdm import tqdm
from fasttext import train_supervised
import fasttext
import os
from __init__ import *
from src.utils import config
from src.utils.config import root_path
from src.utils.tools import create_logger, clean_symbols, query_cut, rm_stop_word

logger = create_logger(root_path + '/logs/Fasttext.log')
tqdm.pandas()


class Fasttext(object):
    """
    使用fasttext 训练文本分类的模型
    """
    def __init__(self,
                 train_raw_path=config.train_path,
                 test_raw_path=config.test_path,
                 valid_raw_path=config.valid_path,
                 model_train_file=root_path + '/data/fast_train.txt',
                 model_test_file=root_path + '/data/fast_test.txt',
                 model_valid_file=root_path + '/data/fast_valid.txt',
                 model_path=None):
        """
        初始化参数
        :param train_raw_path: 原始训练文件路径
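The class above prepares files in fastText's expected format and trains on them. A minimal sketch of that flow, with illustrative paths and hyperparameters (not the project's actual settings):

import fasttext

# training file format: one sample per line, "__label__<tag> <segmented text>"
model = fasttext.train_supervised(input='fast_train.txt',
                                  lr=0.5, epoch=10, wordNgrams=2)
print(model.predict('some segmented text'))   # (labels, probabilities)
model.save_model('fasttext_model.bin')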
Example #2
                                               predict_all,
                                               target_names=config.label_list,
                                               digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)


if __name__ == '__main__':
    # model_name = args.model
    # x = import_module('models.' + model_name)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make results reproducible across runs
    logger = create_logger(config.root_path + '/logs/train.log')

    logger.info('Building tokenizer')
    print('config.bert_path is ', config.bert_path)
    tokenizer = BertTokenizer.from_pretrained(config.bert_path)

    logger.info('Loading dataset')
    # dataset definitions
    train_dataset = BertDataset(config.train_path,
                                tokenizer=tokenizer,
                                word=args.word)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  collate_fn=collate_fn,
                                  shuffle=True)
    dev_dataset = BertDataset(config.valid_path,
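The `collate_fn` passed to `DataLoader` is not shown in this snippet; a sketch of what such a function typically does for variable-length token batches (the pad id 0 and the (ids, label) sample layout are assumptions):

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # batch: list of (token_ids, label) pairs from the Dataset (assumed layout)
    token_ids = [torch.as_tensor(ids) for ids, _ in batch]
    labels = torch.as_tensor([label for _, label in batch])
    # pad every sequence to the batch maximum; 0 is assumed to be [PAD]
    padded = pad_sequence(token_ids, batch_first=True, padding_value=0)
    return padded, labels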
Example #3
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from transformers import BertModel, BertTokenizer

from __init__ import *
from src.data.mlData import MLData
from src.utils import config
from src.utils.config import root_path
from src.utils.tools import (Grid_Train_model, bayes_parameter_opt_lgb,
                             query_cut, create_logger, formate_data, get_score)
from src.utils.feature import (get_embedding_feature, get_img_embedding,
                               get_lda_features, get_pretrain_embedding,
                               get_autoencoder_feature, get_basic_feature)

logger = create_logger(config.log_dir + 'model.log')


class Models(object):
    def __init__(self,
                 model_path=None,
                 feature_engineer=False,
                 train_mode=True):
        '''
        @description: initialize class, e.g. Models
        @param {type}:
        feature_engineer: whether to use feature engineering; if `False`, compare common ML models
        res_model: ResNet model
        resnext_model: ResNeXt model
        wide_model: wide ResNet model
        bert: BERT model
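A toy sketch of the "compare common ML models" path the docstring describes (the corpus and model list here are illustrative; the project's real features come from TF-IDF, LDA, and embeddings):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

texts = ['history of rome', 'python machine learning',
         'ancient greek wars', 'deep learning with python']
labels = [0, 1, 0, 1]
X = TfidfVectorizer().fit_transform(texts)
for name, clf in [('nb', MultinomialNB()), ('svm', SVC()),
                  ('tree', DecisionTreeClassifier())]:
    print(name, cross_val_score(clf, X, labels, cv=2).mean())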
Example #4
@Description: train embedding & tfidf & autoencoder
@FilePath: /bookClassification/src/word2vec/embedding.py
'''
import pandas as pd
from gensim import models
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from gensim.models import LdaMulticore
from gensim.models.ldamodel import LdaModel
import gensim

from __init__ import *
from src.utils.config import root_path
from src.utils.tools import create_logger, query_cut
from src.word2vec.autoencoder import AutoEncoder
logger = create_logger(root_path + '/logs/embedding.log')


class SingletonMetaclass(type):
    '''
    @description: singleton
    '''
    def __init__(self, *args, **kwargs):
        self.__instance = None
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        if self.__instance is None:
            self.__instance = super().__call__(*args, **kwargs)
        return self.__instance
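A quick demonstration of the guarantee the metaclass provides (`Demo` is a hypothetical class, not part of the project):

class Demo(metaclass=SingletonMetaclass):
    def __init__(self, value=0):
        self.value = value

a = Demo(1)
b = Demo(2)            # ignored: the cached instance is returned instead
assert a is b and b.value == 1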
Example #5
@Date: 2020-04-08 17:21:28
@LastEditTime: 2020-07-17 16:43:02
@LastEditors: xiaoyao jiang
@Description: Process data, then extract features
@FilePath: /bookClassification/src/data/mlData.py
'''

import numpy as np
import pandas as pd
import json
import os
from __init__ import *
from src.utils import config
from src.utils.tools import create_logger, wam, query_cut
from src.word2vec.embedding import Embedding
logger = create_logger(config.log_dir + 'data.log')


class MLData(object):
    def __init__(self, debug_mode=False, train_mode=True):
        '''
        @description: initialize the ML dataset class
        @param {type}
        debug_mode: if True, only use 10,000 rows of data
        em: embedding class instance
        @return: None
        '''
        # load the embedding; if not training, skip data processing
        self.debug_mode = debug_mode
        self.em = Embedding()
        self.em.load()
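The implementation is cut off here; a sketch of what `debug_mode` implies per the docstring (the path, separator, and sampling call are assumptions about code not shown):

import pandas as pd

df = pd.read_csv('data/train.csv', sep='\t')       # hypothetical file
debug_mode = True
if debug_mode:
    df = df.sample(n=min(10000, len(df)), random_state=42)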
Example #6
File: models.py  Project: MiniBee/dl
import json
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids

from sklearn.ensemble import RandomForestClassifier
import joblib
from transformers import BertModel, BertTokenizer

from __init__ import *
from src.utils.tools import create_logger, bayes_parameter_opt_lgb
from src.data.mlData import MLData
from src.utils import config
from src.utils.feature import get_embedding_feature, get_autoencoder_feature, get_score

logger = create_logger(config.log_path + 'model.log')


class Model(object):
    def __init__(self,
                 model_path=None,
                 feature_engineer=False,
                 train_mode=True):
        # self.res_model = torchvision.models.resnet152(pretrained=True)
        # self.res_model = self.res_model.to(config.device)

        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)
        if train_mode:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            num_class=33,
                                            seed=11)
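The SMOTE/ClusterCentroids imports suggest resampling for class imbalance; a toy sketch of the oversampling step (data below is synthetic):

import numpy as np
from imblearn.over_sampling import SMOTE

rng = np.random.default_rng(11)
X = rng.random((100, 8))
y = np.array([0] * 90 + [1] * 10)                  # imbalanced labels
X_res, y_res = SMOTE(random_state=11).fit_resample(X, y)
print(np.bincount(y_res))                          # [90 90] after oversampling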
Example #7
# @author: hongyue.pei
# @file: fasttext.py
# @time: 2020/9/23 下午3:09
# @desc:

import pandas as pd
from tqdm import tqdm
import fasttext
import jieba

from __init__ import *
from src.utils import config
from src.utils.tools import create_logger

logger = create_logger(config.root_path + '/logs/Fasttext.log')


class Fasttext(object):
    def __init__(self,
                 train_raw_path=config.root_path + '/data/train.csv',
                 test_raw_path=config.root_path + '/data/test.csv',
                 model_train_file=config.root_path + '/data/fast_train.csv',
                 model_test_file=config.root_path + '/data/fast_test.csv',
                 model_path=None):
        stopWords = open(config.root_path +
                         '/data/stopWords_cn.txt', encoding='utf-8').readlines()
        jieba.load_userdict(config.root_path + '/data/ai100_words.txt')
        if model_path is None:
            self.train_raw_data = pd.read_csv(train_raw_path,
                                              sep=',',
Example #8
        precision, recall, F1 = calculate_f1(result)
        print("-" * 20 + "intent" + "-" * 20)
        print("\t Precision: %.2f" % (100 * precision))
        print("\t Recall: %.2f" % (100 * recall))
        print("\t F1: %.2f" % (100 * F1))
        return F1


if __name__ == '__main__':
    debug = False
    config = model.Config()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make results reproducible across runs
    logger = create_logger('../logs/train.log')

    logger.info('Building tokenizer')
    print('config.bert_path is ', config.bert_path)
    tokenizer = BertTokenizer.from_pretrained(config.bert_path)

    logger.info('Loading dataset')
    train_dataset = BertDataset(config.train_path,
                                config.label_path,
                                tokenizer=tokenizer,
                                debug=debug,
                                need_label_weight=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  collate_fn=collate_fn,
                                  shuffle=True)
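The precision/recall/F1 printed above follow the standard definitions; a worked example with illustrative counts:

tp, fp, fn = 80, 10, 20                            # illustrative counts
precision = tp / (tp + fp)                         # 0.889
recall = tp / (tp + fn)                            # 0.800
f1 = 2 * precision * recall / (precision + recall) # 0.842
print('\t Precision: %.2f' % (100 * precision))
print('\t Recall: %.2f' % (100 * recall))
print('\t F1: %.2f' % (100 * f1))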
Example #9
# coding:utf-8
import pandas as pd
from src.utils.tools import create_logger, clean_symbols, query_cut, rm_stop_word
from src.utils import config
from tqdm import tqdm
import gensim
from gensim import models
from src.utils.tools import timethis
logger = create_logger(config.root_path + '/logs/embedding.log')
tqdm.pandas()


class SingletonMetaclass(type):
    '''
    singleton pattern
    '''
    def __init__(self, *args, **kwargs):
        self.__instance = None
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        if self.__instance is None:
            self.__instance = super().__call__(*args, **kwargs)
        return self.__instance


class Embedding(metaclass=SingletonMetaclass):
    def __init__(self):
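The snippet ends before the training code; a minimal sketch of the word2vec training these gensim imports point to (toy corpus; in gensim >= 4.0 the dimension argument is `vector_size`, older releases used `size`):

from gensim import models

corpus = [['machine', 'learning'], ['deep', 'learning', 'for', 'text']]
w2v = models.Word2Vec(sentences=corpus, vector_size=50,
                      min_count=1, window=3, workers=1)
print(w2v.wv['learning'][:5])                      # first 5 dims of one vector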