예제 #1
0
def load_my_dataset(filepath, datafolderpath, labelfilepath, test_ratio=0.9):
    """Load labelled images and split them into two chainer TupleDatasets.

    Every non-empty line of ``filepath`` is an image path relative to
    ``datafolderpath``.  The name of the directory that directly contains an
    image is used as its label, and the label is converted to its position
    in the list returned by ``load_label(labelfilepath)``.  The samples are
    shuffled with ``random.shuffle`` before being split.

    Args:
        filepath: Text file listing one relative image path per line.
        datafolderpath: Directory the listed paths are joined onto.
        labelfilepath: Path handed to ``load_label``.
        test_ratio: Fraction of the shuffled samples that goes into the
            FIRST returned dataset (the training set); the remainder forms
            the second one.  Despite the name, the default of 0.9 therefore
            gives a 90 % train / 10 % test split.

    Returns:
        A ``(train, test)`` pair of ``TupleDataset`` objects whose samples
        are ``(image, label_index)``.

    Raises:
        ValueError: If an image's directory name is not in the label list.
    """
    labellist = load_label(labelfilepath)

    with open(filepath, "r") as readfile:
        # Skip blank lines: joining an empty name would yield the data folder
        # itself and make load_data fail on a directory.
        filepathlist = [
            os.path.join(datafolderpath, line.strip())
            for line in readfile
            if line.strip()
        ]

    datasets = []
    for imagefilepath in filepathlist:
        print(imagefilepath)
        img = load_data(imagefilepath)
        # The label is the name of the directory holding the image.  Using
        # os.path helpers instead of splitting on os.path.sep also copes
        # with doubled or mixed separators coming from the list file.
        label = os.path.basename(os.path.dirname(imagefilepath))
        datasets.append([img, labellist.index(label)])
    random.shuffle(datasets)

    split_at = int(len(datasets) * test_ratio)
    traindatasets = datasets[:split_at]
    testdatasets = datasets[split_at:]

    train_imgs = [sample[0] for sample in traindatasets]
    train_labels = [sample[1] for sample in traindatasets]

    test_imgs = [sample[0] for sample in testdatasets]
    test_labels = [sample[1] for sample in testdatasets]

    return chainer.datasets.tuple_dataset.TupleDataset(train_imgs, train_labels), \
            chainer.datasets.tuple_dataset.TupleDataset(test_imgs, test_labels)
예제 #2
0
    def data_load(self, label):
        """Load the train and test splits of one data set into uint8 arrays.

        The data directory is ``<opt.dataroot>/<opt.cap_scheme>/<label>``.
        Images are read from its ``train`` / ``test`` sub-directories as
        ``0.png``, ``1.png``, ... and the label texts from
        ``<label>_train.txt`` / ``<label>_test.txt`` via ``util.load_label``.

        The number of samples is the number of directory entries, capped at
        ``opt.train_size`` for training and at 2000 for testing.  Both counts
        are stored on the instance as ``num_train_samples`` and
        ``num_test_sample``.

        If converting a label text with ``self.text2vec`` fails, the sample
        index is printed and that label row is filled with zeros (loading is
        best-effort and continues).

        Args:
            label: Name of the data set sub-directory, also used as the
                prefix of the label files.

        Returns:
            ``((x_train, y_train), (x_test, y_test))`` where ``x_*`` has
            shape ``(n, opt.loadHeight, opt.loadWidth, 1)`` and ``y_*`` has
            shape ``(n, opt.cap_len * opt.char_set_len)``.
        """
        data_path = os.path.join(self.opt.dataroot, self.opt.cap_scheme,
                                 label)  # ex. ../data/min24/synthetic

        def count_entries(split):
            # Number of entries in the split's image directory.
            return len(os.listdir(os.path.join(data_path, split)))

        def load_split(split, num_samples):
            # Load `num_samples` images and label vectors of one split.
            images = np.empty((num_samples, self.opt.loadHeight,
                               self.opt.loadWidth, 1), dtype='uint8')
            targets = np.empty(
                (num_samples, self.opt.cap_len * self.opt.char_set_len),
                dtype='uint8')
            texts = util.load_label(
                os.path.join(data_path, label + '_' + split + '.txt'))
            for i in range(num_samples):
                img_name = os.path.join(data_path, split, str(i) + '.png')
                images[i, :, :, :] = util.load_image(img_name)
                try:
                    targets[i, :] = self.text2vec(texts[i])
                except Exception:
                    # Keep going, but do not leave uninitialised memory from
                    # np.empty in the row of a sample that failed.
                    targets[i, :] = 0
                    print(i)
            return images, targets

        self.num_train_samples = min(self.opt.train_size,
                                     count_entries('train'))
        self.num_test_sample = min(2000, count_entries('test'))

        train_set = load_split('train', self.num_train_samples)
        test_set = load_split('test', self.num_test_sample)
        return train_set, test_set
예제 #3
0
def main():
    """
    Classify one image with a saved model and print the prediction.

    All settings (layer sizes, number of outputs, model / label / image
    paths) come from ``importingargs()``.
    """
    (hidden_units_1, hidden_units_2, num_outputs,
     model_file, label_file, image_file) = importingargs()

    # Rebuild the network, then load the saved parameters into it.
    network = LinearNet(hidden_units_1, hidden_units_2, num_outputs)
    classifier = L.Classifier(network)
    chainer.serializers.load_npz(model_file, classifier)

    image = load_data(image_file)
    labels = load_label(label_file)

    prediction = predict(classifier, image, labels)
    index, prob, label = prediction
    print("index: %d, prob: %f, label: %s" % (index, prob, label))
예제 #4
0
def train_classifier():
    """Train a random-forest classifier, save it and print its test accuracy.

    The train/test data comes from ``load_label(split=True)`` and is
    vectorized with the vectorizer stored in
    ``data_path / "tfidf_vectorizer.pkl"``.  The fitted classifier is
    written to ``model_path / "classifier.pkl"``.
    """
    X_train, y_train, X_test, y_test = load_label(split=True)
    vectorizer = joblib.load(
        data_path / "tfidf_vectorizer.pkl"
    )  # vectorizer dumped on cleaning.py

    classifier = RandomForestClassifier(max_depth=10, random_state=42)
    classifier.fit(vectorizer.transform(X_train), y_train)
    joblib.dump(classifier, model_path / "classifier.pkl")

    # Vectorize and predict the whole test set in one call instead of
    # transforming, densifying and predicting one sample at a time.
    predictions = classifier.predict(vectorizer.transform(X_test))
    # accuracy_score expects (y_true, y_pred).
    print(accuracy_score(y_test, predictions))
예제 #5
0
 def load_model(self, checkpoint):
     """Build the model selected by ``self.config`` and load ``checkpoint``.

     Side effects: the labels read with ``load_label(opt.label_path)`` are
     stored in both ``config['labels']`` and ``self.labels``.

     Args:
         checkpoint: State dict handed to ``model.load_state_dict``.

     Returns:
         The model with the checkpoint loaded, moved to ``opt.device``.

     Raises:
         ValueError: If ``emb_class`` is ``'glove'`` and ``enc_class`` is
             not one of ``'gnb'``, ``'cnn'``, ``'densenet-cnn'`` or
             ``'densenet-dsa'``.
     """
     config = self.config
     opt = config['opt']
     labels = load_label(opt.label_path)
     label_size = len(labels)
     config['labels'] = labels
     self.labels = labels
     if config['emb_class'] == 'glove':
         enc_class = config['enc_class']
         if enc_class == 'gnb':
             model = TextGloveGNB(config, opt.embedding_path, label_size)
         elif enc_class == 'cnn':
             model = TextGloveCNN(config,
                                  opt.embedding_path,
                                  label_size,
                                  emb_non_trainable=True)
         elif enc_class == 'densenet-cnn':
             model = TextGloveDensenetCNN(config,
                                          opt.embedding_path,
                                          label_size,
                                          emb_non_trainable=True)
         elif enc_class == 'densenet-dsa':
             model = TextGloveDensenetDSA(config,
                                          opt.embedding_path,
                                          label_size,
                                          emb_non_trainable=True)
         else:
             # Fail here with a clear message instead of leaving `model`
             # unbound and crashing below with an UnboundLocalError.
             raise ValueError(
                 "unsupported enc_class for glove embedding: {!r}".format(
                     enc_class))
     else:
         from transformers import AutoTokenizer, AutoConfig, AutoModel
         bert_config = AutoConfig.from_pretrained(opt.bert_output_dir)
         bert_tokenizer = AutoTokenizer.from_pretrained(opt.bert_output_dir)
         # Built from the config only; the trained weights are supplied by
         # `checkpoint` below.
         bert_model = AutoModel.from_config(bert_config)
         # Any enc_class other than 'cls' uses the CNN head.
         ModelClass = TextBertCNN
         if config['enc_class'] == 'cls':
             ModelClass = TextBertCLS
         model = ModelClass(config, bert_config, bert_model, bert_tokenizer,
                            label_size)
     model.load_state_dict(checkpoint)
     model = model.to(opt.device)
     logger.info("[Model loaded]")
     return model
예제 #6
0
def load_model(config, checkpoint):
    """Rebuild the model described by ``config`` and load ``checkpoint``.

    When ``opt.enable_qat`` or ``opt.enable_qat_fx`` is set, the model is
    first prepared for quantization-aware training and converted to a
    quantized model, so that its structure matches a quantized checkpoint
    before the state dict is loaded.

    Args:
        config: Dict providing ``'opt'`` (the options object),
            ``'emb_class'`` and ``'enc_class'``.  ``config['labels']`` is
            set as a side effect.
        checkpoint: State dict handed to ``model.load_state_dict``.

    Returns:
        The model with the checkpoint loaded, moved to ``opt.device``.

    Raises:
        ValueError: If ``emb_class`` is ``'glove'`` and ``enc_class`` is not
            one of ``'gnb'``, ``'cnn'``, ``'densenet-cnn'`` or
            ``'densenet-dsa'``.
        AssertionError: If ``opt.enable_qat`` is set and ``opt.device`` is
            not ``'cpu'``.
    """
    opt = config['opt']
    labels = load_label(opt.label_path)
    label_size = len(labels)
    config['labels'] = labels
    if config['emb_class'] == 'glove':
        enc_class = config['enc_class']
        if enc_class == 'gnb':
            model = TextGloveGNB(config, opt.embedding_path, label_size)
        elif enc_class == 'cnn':
            model = TextGloveCNN(config,
                                 opt.embedding_path,
                                 label_size,
                                 emb_non_trainable=True)
        elif enc_class == 'densenet-cnn':
            model = TextGloveDensenetCNN(config,
                                         opt.embedding_path,
                                         label_size,
                                         emb_non_trainable=True)
        elif enc_class == 'densenet-dsa':
            model = TextGloveDensenetDSA(config,
                                         opt.embedding_path,
                                         label_size,
                                         emb_non_trainable=True)
        else:
            # Fail here with a clear message instead of leaving `model`
            # unbound and crashing below with an UnboundLocalError.
            raise ValueError(
                "unsupported enc_class for glove embedding: {!r}".format(
                    enc_class))
    else:
        from transformers import AutoTokenizer, AutoConfig, AutoModel
        bert_config = AutoConfig.from_pretrained(opt.bert_output_dir)
        bert_tokenizer = AutoTokenizer.from_pretrained(opt.bert_output_dir)
        # Built from the config only; the trained weights are supplied by
        # `checkpoint` below.
        bert_model = AutoModel.from_config(bert_config)
        # Any enc_class other than 'cls' uses the CNN head.
        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        model = ModelClass(config, bert_config, bert_model, bert_tokenizer,
                           label_size)
    if opt.enable_qat:
        assert opt.device == 'cpu', "enable_qat requires device == 'cpu'"
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # fuse if applicable
        # model = torch.quantization.fuse_modules(model, [['']])
        model = torch.quantization.prepare_qat(model)
        model.eval()
        model.to('cpu')
        logger.info("[Convert to quantized model with device=cpu]")
        model = torch.quantization.convert(model)
    if opt.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)
        logger.info("[Convert to quantized model]")
        model = quantize_fx.convert_fx(model)

    model.load_state_dict(checkpoint)
    model = model.to(opt.device)
    # Debug aid:
    # for name, param in model.named_parameters():
    #     print(name, param.data, param.device, param.requires_grad)
    logger.info("[model] :\n{}".format(model))
    logger.info("[Model loaded]")
    return model
예제 #7
0
    parser.add_argument("--label-num", "-l", type=int, default=5)
    parser.add_argument("--modelpath", "-mf", help="model path")
    parser.add_argument("--labelfilepath", "-lf", help="labelfile")
    parser.add_argument("--imagefilepath", "-if",
                        help="imagefilepath you want to predict")
    args = parser.parse_args()
    return args.first_hidden_layer_units,\
            args.second_hidden_layer_units, args.label_num,\
            args.modelpath, args.labelfilepath, args.imagefilepath

if __name__ == "__main__":
    # Script entry point: rebuild the classifier from the command-line
    # arguments, restore its saved parameters and load the label list.
    n_units_h1, n_units_h2, n_out,\
            modelpath, labelfilepath, imagefilepath = importingargs()
    model = L.Classifier(LinearNet(n_units_h1, n_units_h2, n_out))
    chainer.serializers.load_npz(modelpath, model)
    labellist = load_label(labelfilepath)

    # Frontal-face Haar cascade file; the path is relative, so it is
    # resolved against the current working directory.
    cascade_path = 'haarcascade_frontalface_alt.xml'
    face_cascade = cv2.CascadeClassifier(cascade_path) 

    # Open capture device 0.
    cap = cv2.VideoCapture(0)

    frame_count = 0
    face_count = 0

    while True:
        frame_count += 1
        # Reset at the start of every frame.
        face_count = 0

        # Get the captured frame data read from the built-in camera
        ret, frame = cap.read()
예제 #8
0
def load_model(config, checkpoint):
    """Rebuild the model described by ``config`` and load ``checkpoint``.

    For non-glove embeddings the tokenizer/config/encoder are chosen by
    ``emb_class`` (KoBART, GPT, T5 or the generic ``Auto*`` classes) and the
    classification head by ``enc_class``.  When ``args.enable_qat`` or
    ``args.enable_qat_fx`` is set, the model is prepared for
    quantization-aware training and converted to a quantized model before
    the checkpoint is loaded.  With ``args.enable_diffq`` the checkpoint is
    restored through a ``DiffQuantizer``, which is also stored in
    ``config['quantizer']``.

    Args:
        config: Dict providing ``'args'`` (the options object),
            ``'emb_class'`` and ``'enc_class'`` (and ``'use_kobart'`` when
            ``emb_class`` is ``'bart'``).  ``config['labels']`` is set as a
            side effect.
        checkpoint: State passed to ``model.load_state_dict`` or to
            ``DiffQuantizer.restore_quantized_state``.

    Returns:
        The model with the checkpoint loaded, moved to ``args.device``.

    Raises:
        ValueError: If ``emb_class`` is ``'glove'`` and ``enc_class`` is not
            one of ``'gnb'``, ``'cnn'``, ``'densenet-cnn'`` or
            ``'densenet-dsa'``.
        AssertionError: If ``args.enable_qat`` is set and ``args.device`` is
            not ``'cpu'``.
    """
    args = config['args']
    labels = load_label(args.label_path)
    label_size = len(labels)
    config['labels'] = labels
    if config['emb_class'] == 'glove':
        enc_class = config['enc_class']
        if enc_class == 'gnb':
            model = TextGloveGNB(config, args.embedding_path, label_size)
        elif enc_class == 'cnn':
            model = TextGloveCNN(config,
                                 args.embedding_path,
                                 label_size,
                                 emb_non_trainable=True)
        elif enc_class == 'densenet-cnn':
            model = TextGloveDensenetCNN(config,
                                         args.embedding_path,
                                         label_size,
                                         emb_non_trainable=True)
        elif enc_class == 'densenet-dsa':
            model = TextGloveDensenetDSA(config,
                                         args.embedding_path,
                                         label_size,
                                         emb_non_trainable=True)
        else:
            # Fail here with a clear message instead of leaving `model`
            # unbound and crashing below with an UnboundLocalError.
            raise ValueError(
                "unsupported enc_class for glove embedding: {!r}".format(
                    enc_class))
    else:
        if config['emb_class'] == 'bart' and config['use_kobart']:
            from transformers import BartModel
            from kobart import get_kobart_tokenizer, get_pytorch_kobart_model
            bert_tokenizer = get_kobart_tokenizer()
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = BartModel.from_pretrained(get_pytorch_kobart_model())
            bert_config = bert_model.config
        elif config['emb_class'] in ['gpt']:
            bert_tokenizer = AutoTokenizer.from_pretrained(
                args.bert_output_dir)
            bert_tokenizer.bos_token = '<|startoftext|>'
            bert_tokenizer.eos_token = '<|endoftext|>'
            bert_tokenizer.cls_token = '<|startoftext|>'
            bert_tokenizer.sep_token = '<|endoftext|>'
            bert_tokenizer.pad_token = '<|pad|>'
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = AutoModel.from_pretrained(args.bert_output_dir)
        elif config['emb_class'] in ['t5']:
            from transformers import T5EncoderModel
            bert_tokenizer = AutoTokenizer.from_pretrained(
                args.bert_output_dir)
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            # Built from the config only; weights come from `checkpoint`.
            bert_model = T5EncoderModel(bert_config)
        else:
            bert_tokenizer = AutoTokenizer.from_pretrained(
                args.bert_output_dir)
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            # Built from the config only; weights come from `checkpoint`.
            bert_model = AutoModel.from_config(bert_config)

        # Any enc_class other than 'cls' / 'densenet-cnn' uses the CNN head.
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        elif config['enc_class'] == 'densenet-cnn':
            ModelClass = TextBertDensenetCNN
        else:
            ModelClass = TextBertCNN

        model = ModelClass(config, bert_config, bert_model, bert_tokenizer,
                           label_size)

    if args.enable_qat:
        assert args.device == 'cpu', "enable_qat requires device == 'cpu'"
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # fuse if applicable
        # model = torch.quantization.fuse_modules(model, [['']])
        model = torch.quantization.prepare_qat(model)
        model.eval()
        model.to('cpu')
        logger.info("[Convert to quantized model with device=cpu]")
        model = torch.quantization.convert(model)
    if args.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)
        logger.info("[Convert to quantized model]")
        model = quantize_fx.convert_fx(model)

    if args.enable_diffq:
        quantizer = DiffQuantizer(model)
        config['quantizer'] = quantizer
        quantizer.restore_quantized_state(checkpoint)
    else:
        model.load_state_dict(checkpoint)

    model = model.to(args.device)
    # Debug aid:
    # for name, param in model.named_parameters():
    #     print(name, param.data, param.device, param.requires_grad)
    logger.info("[model] :\n{}".format(model))
    logger.info("[Model loaded]")
    return model
예제 #9
0
def prepare_model(config, bert_model_name_or_path=None):
    """Build the model described by ``config`` for training.

    For glove embeddings the encoder is chosen by ``enc_class``.  Otherwise
    a pretrained tokenizer and encoder are loaded (KoBART, GPT, T5 or the
    generic ``Auto*`` classes, depending on ``emb_class``), passed through
    ``reduce_bert_model`` and wrapped in the head selected by ``enc_class``.
    If ``args.restore_path`` is set, its checkpoint is loaded into the
    model.  With ``args.enable_qat`` / ``args.enable_qat_fx`` the model is
    then prepared for quantization-aware training.

    Args:
        config: Dict providing ``'args'`` (the options object),
            ``'emb_class'`` and ``'enc_class'`` (and ``'use_kobart'`` when
            ``emb_class`` is ``'bart'``).  ``config['labels']`` is set as a
            side effect.
        bert_model_name_or_path: When truthy, overrides
            ``args.bert_model_name_or_path`` as the pretrained model source.

    Returns:
        The prepared model.

    Raises:
        ValueError: If ``emb_class`` is ``'glove'`` and ``enc_class`` is not
            one of ``'gnb'``, ``'cnn'``, ``'densenet-cnn'`` or
            ``'densenet-dsa'``.
    """
    args = config['args']
    emb_non_trainable = not args.embedding_trainable
    labels = load_label(args.label_path)
    label_size = len(labels)
    config['labels'] = labels
    # prepare model
    if config['emb_class'] == 'glove':
        enc_class = config['enc_class']
        if enc_class == 'gnb':
            model = TextGloveGNB(config, args.embedding_path, label_size)
        elif enc_class == 'cnn':
            model = TextGloveCNN(config,
                                 args.embedding_path,
                                 label_size,
                                 emb_non_trainable=emb_non_trainable)
        elif enc_class == 'densenet-cnn':
            model = TextGloveDensenetCNN(config,
                                         args.embedding_path,
                                         label_size,
                                         emb_non_trainable=emb_non_trainable)
        elif enc_class == 'densenet-dsa':
            model = TextGloveDensenetDSA(config,
                                         args.embedding_path,
                                         label_size,
                                         emb_non_trainable=emb_non_trainable)
        else:
            # Fail here with a clear message instead of leaving `model`
            # unbound and crashing below with an UnboundLocalError.
            raise ValueError(
                "unsupported enc_class for glove embedding: {!r}".format(
                    enc_class))
    else:
        model_name_or_path = args.bert_model_name_or_path
        if bert_model_name_or_path:
            model_name_or_path = bert_model_name_or_path

        if config['emb_class'] == 'bart' and config['use_kobart']:
            from transformers import BartModel
            from kobart import get_kobart_tokenizer, get_pytorch_kobart_model
            bert_tokenizer = get_kobart_tokenizer()
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = BartModel.from_pretrained(get_pytorch_kobart_model())
        elif config['emb_class'] in ['gpt']:
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_tokenizer.bos_token = '<|startoftext|>'
            bert_tokenizer.eos_token = '<|endoftext|>'
            bert_tokenizer.cls_token = '<|startoftext|>'
            bert_tokenizer.sep_token = '<|endoftext|>'
            bert_tokenizer.pad_token = '<|pad|>'
            bert_model = AutoModel.from_pretrained(
                model_name_or_path,
                from_tf=".ckpt" in model_name_or_path)
            # 3 new tokens added
            bert_model.resize_token_embeddings(len(bert_tokenizer))
        elif config['emb_class'] in ['t5']:
            from transformers import T5EncoderModel
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = T5EncoderModel.from_pretrained(
                model_name_or_path,
                from_tf=".ckpt" in model_name_or_path)
        else:
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_model = AutoModel.from_pretrained(
                model_name_or_path,
                from_tf=".ckpt" in model_name_or_path)

        bert_config = bert_model.config
        # bert model reduction
        reduce_bert_model(config, bert_model, bert_config)

        # Any enc_class other than 'cls' / 'densenet-cnn' uses the CNN head.
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        elif config['enc_class'] == 'densenet-cnn':
            ModelClass = TextBertDensenetCNN
        else:
            ModelClass = TextBertCNN

        model = ModelClass(config,
                           bert_config,
                           bert_model,
                           bert_tokenizer,
                           label_size,
                           feature_based=args.bert_use_feature_based,
                           finetune_last=args.bert_use_finetune_last)
    if args.restore_path:
        checkpoint = load_checkpoint(args.restore_path)
        model.load_state_dict(checkpoint)
    if args.enable_qat:
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # fuse if applicable
        # model = torch.quantization.fuse_modules(model, [['']])
        model = torch.quantization.prepare_qat(model)
    if args.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        model.train()
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)

    logger.info("[model] :\n{}".format(model))
    logger.info("[model prepared]")
    return model
예제 #10
0
def prepare_model(config, bert_model_name_or_path=None):
    """Build the model described by ``config`` for training.

    For glove embeddings the encoder is chosen by ``enc_class``.  Otherwise
    a pretrained tokenizer and encoder are loaded with the ``Auto*``
    classes, passed through ``reduce_bert_model`` and wrapped in the head
    selected by ``enc_class``.  If ``opt.restore_path`` is set, its
    checkpoint is loaded into the model.  With ``opt.enable_qat`` /
    ``opt.enable_qat_fx`` the model is then prepared for quantization-aware
    training.  Finally the model is moved to ``opt.device``.

    Args:
        config: Dict providing ``'opt'`` (the options object),
            ``'emb_class'`` and ``'enc_class'``.  ``config['labels']`` is
            set as a side effect.
        bert_model_name_or_path: When truthy, overrides
            ``opt.bert_model_name_or_path`` as the pretrained model source.

    Returns:
        The prepared model.

    Raises:
        ValueError: If ``emb_class`` is ``'glove'`` and ``enc_class`` is not
            one of ``'gnb'``, ``'cnn'``, ``'densenet-cnn'`` or
            ``'densenet-dsa'``.
    """
    opt = config['opt']
    emb_non_trainable = not opt.embedding_trainable
    labels = load_label(opt.label_path)
    label_size = len(labels)
    config['labels'] = labels
    # prepare model
    if config['emb_class'] == 'glove':
        enc_class = config['enc_class']
        if enc_class == 'gnb':
            model = TextGloveGNB(config, opt.embedding_path, label_size)
        elif enc_class == 'cnn':
            model = TextGloveCNN(config,
                                 opt.embedding_path,
                                 label_size,
                                 emb_non_trainable=emb_non_trainable)
        elif enc_class == 'densenet-cnn':
            model = TextGloveDensenetCNN(config,
                                         opt.embedding_path,
                                         label_size,
                                         emb_non_trainable=emb_non_trainable)
        elif enc_class == 'densenet-dsa':
            model = TextGloveDensenetDSA(config,
                                         opt.embedding_path,
                                         label_size,
                                         emb_non_trainable=emb_non_trainable)
        else:
            # Fail here with a clear message instead of leaving `model`
            # unbound and crashing below with an UnboundLocalError.
            raise ValueError(
                "unsupported enc_class for glove embedding: {!r}".format(
                    enc_class))
    else:
        model_name_or_path = opt.bert_model_name_or_path
        if bert_model_name_or_path:
            model_name_or_path = bert_model_name_or_path
        from transformers import AutoTokenizer, AutoConfig, AutoModel
        bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        bert_model = AutoModel.from_pretrained(
            model_name_or_path, from_tf=".ckpt" in model_name_or_path)
        bert_config = bert_model.config
        # bert model reduction
        reduce_bert_model(config, bert_model, bert_config)
        # Any enc_class other than 'cls' uses the CNN head.
        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        model = ModelClass(config,
                           bert_config,
                           bert_model,
                           bert_tokenizer,
                           label_size,
                           feature_based=opt.bert_use_feature_based)
    if opt.restore_path:
        checkpoint = load_checkpoint(opt.restore_path, device=opt.device)
        model.load_state_dict(checkpoint)
    if opt.enable_qat:
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # fuse if applicable
        # model = torch.quantization.fuse_modules(model, [['']])
        model = torch.quantization.prepare_qat(model)
    if opt.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        model.train()
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)

    model.to(opt.device)
    logger.info("[model] :\n{}".format(model))
    logger.info("[model prepared]")
    return model