import tensorflow as _tf
import dataset.csv as csv
import os as _os
from utils import logutil as _logutil
from . import augmentation as _augmentation

# Shorthand for tf.data.experimental, module-level logger, and CLI flags.
_tde = _tf.data.experimental
_logging = _logutil.get_logger()
_FLAGS = _tf.app.flags.FLAGS


def load(machine=None):
    """Construct a Loader from the CSV manifests named by the TF flags.

    If *machine* is given, the new Loader is attached to it as
    ``machine.dataset_loader`` and nothing is returned; otherwise the
    Loader instance is returned to the caller.
    """
    # Inspect/validate the CSV manifests before building the loader.
    csv.inspect()
    if machine is not None:
        machine.dataset_loader = Loader()
    else:
        return Loader()


class Loader():
    """Builds dataset streams, one per command-line flag whose name ends in 'csv'."""

    def __init__(self):
        # Stream registry; populated as a side effect of _generate_stream().
        self._streams = {}
        self._generate_stream()
        pass

    def _generate_stream(self):
        # One stream per flag ending in 'csv' (e.g. a 'train_csv' flag
        # is expected to yield a stream keyed 'train').
        for key in [key for key in _FLAGS if str(key).endswith('csv')]:
            # NOTE(review): reads the private ._value of the flag object;
            # the public accessor would be _FLAGS[key].value — confirm the
            # TF version this targets.
            csv_file = _FLAGS[key]._value
            # Manifest lives under <dataset_dir>/<type>/<csv_file>.
            csv_path = _os.path.join(_FLAGS.dataset_dir, _FLAGS.type, csv_file)
            # Rebinding the loop variable is intentional: 'train_csv' -> 'train'.
            key = str(key).split('_')[0]
            # NOTE(review): method appears truncated in this view — csv_path and
            # key are presumably used to populate self._streams below; verify
            # against the full file.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import unittest
from unittest import TestCase

from exporter import *
from exporter.book import BookExport
from utils.logutil import get_logger

log = get_logger(__name__)


class TestBookExport(TestCase):
    """Smoke tests for BookExport: pull one item per listing and sanity-check it."""

    def setUp(self):
        # Exporter bound to the Douban account "einverne".
        self.exporter = BookExport("einverne")

    def _check_first_book(self, books):
        # Only the first fetched item is inspected, keeping the test cheap;
        # an empty listing simply passes (matches the original loop+break).
        for item in books:
            log.debug(item)
            self.assertIsNotNone(item, "book object fetch failed")
            self.assertNotEqual(item.title, '', 'book title fetch failed')
            break

    def test_get_books(self):
        self._check_first_book(self.exporter.get_books(COLLECT))

    def test_get_read(self):
        self._check_first_book(self.exporter.get_read())
def main():
    """Train (or evaluate/resume) a channel-distillation ResNet on an
    ImageFolder dataset, driven entirely by the module-level ``Config`` class.

    Requires at least one CUDA GPU (raises otherwise).  Writes ``latest.pth``
    every epoch and copies it to ``best.pth`` whenever top-1 accuracy improves.
    """
    if not torch.cuda.is_available():
        raise Exception("need gpu to train network!")
    # Fixed seeds for reproducibility; benchmark=True lets cudnn autotune kernels
    # (non-deterministic, but faster for fixed-size inputs).
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    cudnn.benchmark = True
    cudnn.enabled = True
    logger = get_logger(__name__, Config.log)
    Config.gpus = torch.cuda.device_count()
    logger.info("use {} gpus".format(Config.gpus))
    # Snapshot every non-dunder class attribute of Config into the log so the
    # run can be reproduced from the log alone.
    config = {
        key: value
        for key, value in Config.__dict__.items() if not key.startswith("__")
    }
    logger.info(f"args: {config}")
    start_time = time.time()

    # dataset and dataloader
    logger.info("start loading data")
    # Standard ImageNet-style augmentation; Normalize uses the conventional
    # ImageNet per-channel mean/std.
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    train_dataset = ImageFolder(Config.train_dataset_path, train_transform)
    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
    )
    # Deterministic center-crop pipeline for validation.
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    val_dataset = ImageFolder(Config.val_dataset_path, val_transform)
    val_loader = DataLoader(
        val_dataset,
        batch_size=Config.batch_size,
        num_workers=Config.num_workers,
        pin_memory=True,
    )
    logger.info("finish loading data")

    # network
    net = ChannelDistillResNet1834(Config.num_classes, Config.dataset_type)
    net = nn.DataParallel(net).cuda()

    # loss and optimizer
    # Config.loss_list drives which loss objects are built; KD-style losses
    # additionally take a temperature T.
    criterion = []
    for loss_item in Config.loss_list:
        loss_name = loss_item["loss_name"]
        loss_type = loss_item["loss_type"]
        if "kd" in loss_type:
            criterion.append(losses.__dict__[loss_name](loss_item["T"]).cuda())
        else:
            criterion.append(losses.__dict__[loss_name]().cuda())
    optimizer = SGD(net.parameters(), lr=Config.lr, momentum=0.9, weight_decay=1e-4)
    scheduler = MultiStepLR(optimizer, milestones=[30, 60, 90], gamma=0.1)

    # only evaluate
    if Config.evaluate:
        # load best model
        if not os.path.isfile(Config.evaluate):
            raise Exception(
                f"{Config.evaluate} is not a file, please check it again")
        logger.info("start evaluating")
        logger.info(f"start resuming model from {Config.evaluate}")
        # map_location="cpu" so the checkpoint loads regardless of the GPU it
        # was saved on; the DataParallel wrapper then moves weights to CUDA.
        checkpoint = torch.load(Config.evaluate, map_location=torch.device("cpu"))
        net.load_state_dict(checkpoint["model_state_dict"])
        prec1, prec5 = validate(val_loader, net)
        logger.info(
            f"epoch {checkpoint['epoch']:0>3d}, top1 acc: {prec1:.2f}%, top5 acc: {prec5:.2f}%"
        )
        return

    start_epoch = 1
    # resume training
    if os.path.exists(Config.resume):
        logger.info(f"start resuming model from {Config.resume}")
        checkpoint = torch.load(Config.resume, map_location=torch.device("cpu"))
        # Continue from the epoch after the one stored in the checkpoint.
        start_epoch += checkpoint["epoch"]
        net.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        # NOTE(review): ':3f' is width-3 fixed notation (probably ':.3f' was
        # intended), and 'loss' is logged twice, the second time with a '%'
        # sign — confirm the intended message.
        logger.info(
            f"finish resuming model from {Config.resume}, epoch {checkpoint['epoch']}, "
            f"loss: {checkpoint['loss']:3f}, lr: {checkpoint['lr']:.6f}, "
            f"top1_acc: {checkpoint['acc']}%, loss {checkpoint['loss']}%")

    if not os.path.exists(Config.checkpoints):
        os.makedirs(Config.checkpoints)

    logger.info("start training")
    best_acc = 0.
    for epoch in range(start_epoch, Config.epochs + 1):
        prec1, prec5, loss = train(train_loader, net, criterion, optimizer,
                                   scheduler, epoch, logger)
        logger.info(
            f"train: epoch {epoch:0>3d}, top1 acc: {prec1:.2f}%, top5 acc: {prec5:.2f}%"
        )
        prec1, prec5 = validate(val_loader, net)
        logger.info(
            f"val: epoch {epoch:0>3d}, top1 acc: {prec1:.2f}%, top5 acc: {prec5:.2f}%"
        )
        # remember best prec@1 and save checkpoint
        # 'latest.pth' is overwritten every epoch; 'best.pth' tracks the best
        # top-1 validation accuracy seen so far.
        # NOTE(review): scheduler.get_lr() is deprecated in newer torch in
        # favor of get_last_lr() — confirm the torch version in use.
        torch.save(
            {
                "epoch": epoch,
                "acc": prec1,
                "loss": loss,
                "lr": scheduler.get_lr()[0],
                "model_state_dict": net.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
            }, os.path.join(Config.checkpoints, "latest.pth"))
        if prec1 > best_acc:
            shutil.copyfile(os.path.join(Config.checkpoints, "latest.pth"),
                            os.path.join(Config.checkpoints, "best.pth"))
            best_acc = prec1
    training_time = (time.time() - start_time) / 3600
    logger.info(
        f"finish training, best acc: {best_acc:.2f}%, total training time: {training_time:.2f} hours"
    )
def main():
    """Train a student/teacher WRN-16-28 pair on CIFAR-100.

    Two modes, selected by ``Config.baseline``:
      * baseline — both networks are trained independently with plain
        cross-entropy (``train_baseline``);
      * distillation — adversarial channel distillation with per-network
        logit optimizers plus generator/discriminator feature-map optimizers
        (``train``).
    Requires at least one CUDA GPU; saves per-network latest/best checkpoints.
    """
    if not torch.cuda.is_available():
        raise Exception("need gpu to train network!")
    setup_seed(2020)
    logger = get_logger(__name__, Config.log)
    Config.gpus = torch.cuda.device_count()
    logger.info("use {} gpus".format(Config.gpus))
    config = {
        # Accessing __dict__ on the class itself yields a dict of all of the
        # class's attributes; dunder entries are filtered out below.
        key: value
        for key, value in Config.__dict__.items() if not key.startswith("__")
    }
    logger.info(f"args: {config}")
    start_time = time.time()

    # dataset and dataloader
    logger.info("start loading data")
    # CIFAR-100 augmentation: reflect-pad then random 32x32 crop + flip.
    # Normalize constants are the CIFAR per-channel mean/std given in 0-255
    # units and scaled to [0, 1].
    train_transform = transforms.Compose([
        transforms.Pad(4, padding_mode='reflect'),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32),
        transforms.ToTensor(),
        transforms.Normalize(
            np.array([125.3, 123.0, 113.9]) / 255.0,
            np.array([63.0, 62.1, 66.7]) / 255.0),
    ])
    train_dataset = CIFAR100(
        Config.train_dataset_path,
        train=True,
        transform=train_transform,
        download=True,
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
    )
    val_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            np.array([125.3, 123.0, 113.9]) / 255.0,
            np.array([63.0, 62.1, 66.7]) / 255.0),
    ])
    val_dataset = CIFAR100(
        Config.val_dataset_path,
        train=False,
        transform=val_transform,
        download=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=Config.batch_size,
        num_workers=Config.num_workers,
        pin_memory=True,
    )
    logger.info("finish loading data")

    if Config.baseline:
        net = ChannelDistillWRN1628(
            Config.num_classes)  # returns (ss, ts): student net and pretrained teacher net
        net = nn.DataParallel(net).cuda(
        )  # ChannelDistillResNet50152( (student): ResNet() (teacher): ResNet())
        # Independent SGD optimizers + MultiStep schedulers, one per network.
        optimizer_s = torch.optim.SGD(net.module.student.parameters(),
                                      lr=Config.lr_logit,
                                      momentum=0.9,
                                      weight_decay=1e-4)
        optimizer_t = torch.optim.SGD(net.module.teacher.parameters(),
                                      lr=Config.lr_logit,
                                      momentum=0.9,
                                      weight_decay=1e-4)
        scheduler_s = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_s, milestones=[150, 225], gamma=0.1)
        scheduler_t = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_t, milestones=[150, 225], gamma=0.1)
        # Index 0 = student, index 1 = teacher, throughout this branch.
        optimizer = [optimizer_s, optimizer_t]
        scheduler = [scheduler_s, scheduler_t]

        # loss and optimizer
        criterion = losses.__dict__["CELoss"]().cuda()

        start_epoch = 1
        # resume training
        # NOTE(review): resume is a stub in this branch — nothing is loaded.
        if os.path.exists(Config.resume):
            pass

        if not os.path.exists(Config.checkpoints):
            os.makedirs(Config.checkpoints)

        logger.info('start training')
        best_stu_acc = 0.
        best_tea_acc = 0.
        for epoch in range(start_epoch, Config.epochs + 1):
            logger.info(f"train:\n")
            prec1_s, prec1_t, prec5_s, prec5_t, loss_s, loss_t = train_baseline(
                train_loader, net, criterion, optimizer, scheduler, epoch, logger)
            logger.info(
                f"Student ---> train: epoch {epoch:0>3d}, top1 acc: {prec1_s:.2f}%, top5 acc: {prec5_s:.2f}%\n"
            )
            logger.info(
                f"Teacher ---> train: epoch {epoch:0>3d}, top1 acc: {prec1_t:.2f}%, top5 acc: {prec5_t:.2f}%\n"
            )
            logger.info(f"val:\n")
            prec1_s, prec5_s, prec1_t, prec5_t = validate(val_loader, net)
            logger.info(
                f"Student ---> val: epoch {epoch:0>3d}, top1 acc: {prec1_s:.2f}%, top5 acc: {prec5_s:.2f}%\n"
            )
            logger.info(
                f"Teacher ---> val: epoch {epoch:0>3d}, top1 acc: {prec1_t:.2f}%, top5 acc: {prec5_t:.2f}%\n"
            )
            # remember best prec@1 and save checkpoint
            # Per-network latest/best checkpoints; note both save the full
            # DataParallel state_dict (student + teacher weights).
            torch.save(
                {
                    "epoch": epoch,
                    "acc": prec1_s,
                    "loss": loss_s,
                    "lr": scheduler[0].get_lr()[0],
                    "model_state_dict": net.state_dict(),
                    "optimizer_state_dict": optimizer[0].state_dict(),
                    "scheduler_state_dict": scheduler[0].state_dict(),
                }, os.path.join(Config.checkpoints, "stu_base_latest.pth"))
            if prec1_s > best_stu_acc:
                shutil.copyfile(
                    os.path.join(Config.checkpoints, "stu_base_latest.pth"),
                    os.path.join(Config.checkpoints, "stu_base_best.pth"))
                best_stu_acc = prec1_s
            torch.save(
                {
                    "epoch": epoch,
                    "acc": prec1_t,
                    "loss": loss_t,
                    "lr": scheduler[1].get_lr()[0],
                    "model_state_dict": net.state_dict(),
                    "optimizer_state_dict": optimizer[1].state_dict(),
                    "scheduler_state_dict": scheduler[1].state_dict(),
                }, os.path.join(Config.checkpoints, "tea_base_latest.pth"))
            if prec1_t > best_tea_acc:
                shutil.copyfile(
                    os.path.join(Config.checkpoints, "tea_base_latest.pth"),
                    os.path.join(Config.checkpoints, "tea_base_best.pth"))
                best_tea_acc = prec1_t
        training_time = (time.time() - start_time) / 3600
        logger.info(f"finish training\n")
        logger.info(
            f"Stu -> best acc: {best_stu_acc:.2f}%, Tea -> best acc: {best_tea_acc:.2f}%, total training time: {training_time:.2f} hours"
        )
    else:
        # network
        net = ChannelDistillWRN1628(
            Config.num_classes)  # returns (ss, ts): student net and pretrained teacher net
        # net = ChannelDistillResNet50152(Config.num_classes, Config.dataset_type)  # returns (ss, ts): student net and pretrained teacher net
        net = nn.DataParallel(net).cuda(
        )  # ChannelDistillResNet50152( (student): ResNet() (teacher): ResNet())
        discriminator = DiscriminatorStudentTeacher(
            128, Config.model_type).cuda()  # WRN's final feature map has 128 channels
        # discriminator = DiscriminatorStudentTeacher(2048, Config.model_type).cuda()

        # loss and optimizer
        # CE for hard labels, KD (temperature Config.T) for soft targets,
        # MSE for feature-map matching.
        criterion = [
            losses.__dict__["CELoss"]().cuda(),
            losses.__dict__["KDLoss"](Config.T).cuda(),
            torch.nn.MSELoss().cuda()
        ]
        # optimize student and teacher -> feature extractor
        optimizer_logit = [
            torch.optim.SGD(net.module.student.parameters(),
                            lr=Config.lr_logit,
                            momentum=0.9,
                            weight_decay=1e-4),
            torch.optim.SGD(net.module.teacher.parameters(),
                            lr=Config.lr_logit,
                            momentum=0.9,
                            weight_decay=1e-4)
        ]  # g1, g2
        scheduler_logit = [
            torch.optim.lr_scheduler.MultiStepLR(optimizer_logit[0],
                                                 milestones=[150, 225],
                                                 gamma=0.1),
            torch.optim.lr_scheduler.MultiStepLR(optimizer_logit[1],
                                                 milestones=[150, 225],
                                                 gamma=0.1)
        ]
        # optimize student & teacher plus their discriminators -> feature extractor and D1 and D2
        optimizer_g1_fmap = torch.optim.Adam(net.module.student.parameters(),
                                             lr=Config.lr_fmap,
                                             weight_decay=1e-1)
        optimizer_d1_fmap = torch.optim.Adam(
            discriminator.discri_s.parameters(),
            lr=Config.lr_fmap,
            weight_decay=1e-1)
        scheduler_g1_fmap = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_g1_fmap, milestones=[75, 150], gamma=0.1)
        scheduler_d1_fmap = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_d1_fmap, milestones=[75, 150], gamma=0.1)
        optimizer_s_fmap = [optimizer_g1_fmap, optimizer_d1_fmap]  # g1, d1
        scheduler_s_fmap = [scheduler_g1_fmap, scheduler_d1_fmap]
        optimizer_g2_fmap = torch.optim.Adam(net.module.teacher.parameters(),
                                             lr=Config.lr_fmap,
                                             weight_decay=1e-1)
        optimizer_d2_fmap = torch.optim.Adam(
            discriminator.discri_t.parameters(),
            lr=Config.lr_fmap,
            weight_decay=1e-1)
        scheduler_g2_fmap = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_g2_fmap, milestones=[75, 150], gamma=0.1)
        scheduler_d2_fmap = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_d2_fmap, milestones=[75, 150], gamma=0.1)
        optimizer_t_fmap = [optimizer_g2_fmap, optimizer_d2_fmap]  # g2, d2
        scheduler_t_fmap = [scheduler_g2_fmap, scheduler_d2_fmap]

        # only evaluate
        # NOTE(review): evaluate and resume are stubs in this branch.
        if Config.evaluate:
            pass

        start_epoch = 1
        # resume training
        if os.path.exists(Config.resume):
            pass

        if not os.path.exists(Config.checkpoints):
            os.makedirs(Config.checkpoints)

        logger.info('start training')
        best_stu_acc = 0.
        best_tea_acc = 0.
        for epoch in range(start_epoch, Config.epochs + 1):
            logger.info(f"train:\n")
            prec1_s, prec1_t, prec5_s, prec5_t, loss_s, loss_t = train(
                train_loader, net, discriminator, criterion, optimizer_logit,
                scheduler_logit, optimizer_s_fmap, scheduler_s_fmap,
                optimizer_t_fmap, scheduler_t_fmap, epoch, logger)
            logger.info(
                f"Student ---> train: epoch {epoch:0>3d}, top1 acc: {prec1_s:.2f}%, top5 acc: {prec5_s:.2f}%\n"
            )
            logger.info(
                f"Teacher ---> train: epoch {epoch:0>3d}, top1 acc: {prec1_t:.2f}%, top5 acc: {prec5_t:.2f}%\n"
            )
            logger.info(f"val:\n")
            prec1_s, prec5_s, prec1_t, prec5_t = validate(val_loader, net)
            logger.info(
                f"Student ---> val: epoch {epoch:0>3d}, top1 acc: {prec1_s:.2f}%, top5 acc: {prec5_s:.2f}%\n"
            )
            logger.info(
                f"Teacher ---> val: epoch {epoch:0>3d}, top1 acc: {prec1_t:.2f}%, top5 acc: {prec5_t:.2f}%\n"
            )
            # remember best prec@1 and save checkpoint
            # Student checkpoint carries all three of its optimizers/schedulers
            # (logit SGD, feature-map generator Adam, discriminator Adam).
            torch.save(
                {
                    "epoch": epoch,
                    "acc": prec1_s,
                    "loss": loss_s,
                    "lr_logit": scheduler_logit[0].get_lr()[0],
                    "lr_g": scheduler_s_fmap[0].get_lr()[0],
                    "lr_d": scheduler_s_fmap[1].get_lr()[0],
                    "model_state_dict": net.state_dict(),
                    "optimizer_logit_state_dict": optimizer_logit[0].state_dict(),
                    "optimizer_fmap_g_state_dict": optimizer_s_fmap[0].state_dict(),
                    "optimizer_fmap_d_state_dict": optimizer_s_fmap[1].state_dict(),
                    "scheduler_logit_state_dict": scheduler_logit[0].state_dict(),
                    "scheduler_g_state_dict": scheduler_s_fmap[0].state_dict(),
                    "scheduler_d_state_dict": scheduler_s_fmap[1].state_dict(),
                }, os.path.join(Config.checkpoints, "stu_latest.pth"))
            if prec1_s > best_stu_acc:
                shutil.copyfile(
                    os.path.join(Config.checkpoints, "stu_latest.pth"),
                    os.path.join(Config.checkpoints, "stu_best.pth"))
                best_stu_acc = prec1_s
            torch.save(
                {
                    "epoch": epoch,
                    "acc": prec1_t,
                    "loss": loss_t,
                    "lr_logit": scheduler_logit[1].get_lr()[0],
                    "lr_g": scheduler_t_fmap[0].get_lr()[0],
                    "lr_d": scheduler_t_fmap[1].get_lr()[0],
                    "model_state_dict": net.state_dict(),
                    "optimizer_logit_state_dict": optimizer_logit[1].state_dict(),
                    "optimizer_fmap_g_state_dict": optimizer_t_fmap[0].state_dict(),
                    "optimizer_fmap_d_state_dict": optimizer_t_fmap[1].state_dict(),
                    "scheduler_logit_state_dict": scheduler_logit[1].state_dict(),
                    "scheduler_g_state_dict": scheduler_t_fmap[0].state_dict(),
                    "scheduler_d_state_dict": scheduler_t_fmap[1].state_dict(),
                }, os.path.join(Config.checkpoints, "tea_latest.pth"))
            if prec1_t > best_tea_acc:
                shutil.copyfile(
                    os.path.join(Config.checkpoints, "tea_latest.pth"),
                    os.path.join(Config.checkpoints, "tea_best.pth"))
                best_tea_acc = prec1_t
        training_time = (time.time() - start_time) / 3600
        logger.info(f"finish training\n")
        logger.info(
            f"Stu -> best acc: {best_stu_acc:.2f}%, Tea -> best acc: {best_tea_acc:.2f}%, total training time: {training_time:.2f} hours"
        )
#!/usr/bin/env python
# coding=utf-8
import time
import re

from utils import base62
from utils import pageutil
from utils import logutil

logger = logutil.get_logger()


# Parse one page of a user's followees list.
def follow_page_parse(user_data_dict, num):
    """Return the profile URLs on page *num* of the user's followee list.

    :param user_data_dict: dict holding at least 'user_id' (string).
    :param num: 1-based page number of the follow listing.
    :return: list of href strings, one per followee on the page.
    """
    follow_url_list = []
    # Build the mobile-site URL for this page of the user's followees.
    download_url = 'http://weibo.cn/' + user_data_dict['user_id'] + '/follow?page=' + str(num)
    logger.info("Processing follow URL:" + download_url)
    soup = pageutil.get_soup_from_page(download_url)
    follow_block = soup.find_all('td', attrs={'valign': 'top'})
    # Each followee occupies two <td> cells (avatar cell + info cell); the
    # link lives in the second cell of each pair.  Floor division keeps the
    # range bound an int — plain '/' returns a float under Python 3 and
    # range() would raise TypeError.
    for i in range(0, len(follow_block) // 2):
        follow = follow_block[i * 2 + 1].find_all('a')[0]
        follow_url_list.append(follow.get('href'))
    return follow_url_list


# Parse one page of a user's fans list.
def fans_page_parse(user_data_dict, num):
    """Parse page *num* of the user's fans list.

    NOTE(review): this function is truncated in the visible chunk; only the
    URL construction is shown here.
    """
    fans_url_list = []
    # Build the mobile-site URL for this page of the user's fans.
    download_url = 'http://weibo.cn/' + user_data_dict['user_id'] + '/fans?page=' + str(num)