def main():
    # Load data
    target_property = sys.argv[1]
    target_collection = sys.argv[2]
    dir_path = '../analysis/bin_distribution/'
    results_path = f'{dir_path}{target_property}.txt'
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    pairs_original = read_pairs(target_collection, source='original')
    pairs_resampled = read_pairs(target_collection, source='resampled')
    concepts_original = pairs_original[target_property]
    concepts_resampled = pairs_resampled[target_property]

    # Sort data into bins
    set_info_dict = get_concepts_set(target_property, target_collection)
    general_bin_dict = load_general_bins()
    bin_dict_cosine = load_cosine_bins_prop(set_info_dict)
    general_bin_dict.update(bin_dict_cosine)

    # Assign bin data to concepts in the dataset
    set_bin_features = get_bin_feature_dict(general_bin_dict, set_info_dict.values())
    distribution_original = get_bin_distributions(general_bin_dict, set_bin_features,
                                                  concepts_original)
    distribution_resampled = get_bin_distributions(general_bin_dict, set_bin_features,
                                                   concepts_resampled)

    with open(results_path, 'w') as outfile:
        for name, d_original in distribution_original.items():
            outfile.write(f'\n{name}\n')
            outfile.write('bin\toriginal (percent)\toriginal (absolute)\t'
                          'resampled (percent)\tresampled (absolute)\n')
            d_resampled = distribution_resampled[name]
            # each value is a (percent, absolute-count) tuple
            for b, percent_original in d_original.items():
                if b in d_resampled:
                    percent_resampled = d_resampled[b]
                else:
                    percent_resampled = (0, 0)
                outfile.write(f'{b}\t{percent_original[0]}\t{percent_original[1]}\t'
                              f'{percent_resampled[0]}\t{percent_resampled[1]}\n')
    print('Results written to:', results_path)
def load_lfw():
    file_ext = 'jpg'  # note: no '.' before jpg
    dataset_path = './data/lfw'
    pairs_path = './data/pairs.txt'
    pairs = utils.read_pairs(pairs_path)
    # bug fix: use the local dataset_path rather than args.dataset_path,
    # which is undefined inside this function
    path_list, issame_list = utils.get_paths(dataset_path, pairs, file_ext)
    print('==> Preparing data..')

    # Define data transforms
    RGB_MEAN = [0.485, 0.456, 0.406]
    RGB_STD = [0.229, 0.224, 0.225]
    test_transform = transforms.Compose([
        transforms.Scale((250, 250)),  # make 250x250 (Scale is renamed Resize in newer torchvision)
        transforms.CenterCrop(150),    # then take 150x150 center crop
        transforms.Scale((224, 224)),  # resize to the network's required input size
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD),
    ])

    # Create data loader (args.batch_size is assumed to be a module-level namespace)
    test_loader = torch.utils.data.DataLoader(
        data_loader.LFWDataset(path_list, issame_list, test_transform),
        batch_size=args.batch_size, shuffle=False)
    return test_loader
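# The loaders above depend on utils.read_pairs / utils.get_paths, whose source is
# not shown here. A minimal sketch for the standard LFW pairs.txt format (a header
# line "n_folds n_pairs", then 3-token matched and 4-token mismatched lines, with
# images named Name_0001.jpg); the real helpers may differ.
import os


def read_pairs_sketch(pairs_path):
    """Return each non-header line of pairs.txt as a token list."""
    with open(pairs_path) as f:
        return [line.strip().split() for line in f.readlines()[1:]]


def get_paths_sketch(dataset_path, pairs, file_ext='jpg'):
    """Resolve LFW pair entries to image paths plus a same/different label."""
    path_list, issame_list = [], []
    for pair in pairs:
        if len(pair) == 3:  # matched pair: name idx1 idx2
            name1 = name2 = pair[0]
            idx1, idx2 = pair[1], pair[2]
            issame = True
        else:               # mismatched pair: name1 idx1 name2 idx2
            name1, idx1, name2, idx2 = pair
            issame = False
        p1 = os.path.join(dataset_path, name1, f'{name1}_{int(idx1):04d}.{file_ext}')
        p2 = os.path.join(dataset_path, name2, f'{name2}_{int(idx2):04d}.{file_ext}')
        path_list += [p1, p2]
        issame_list.append(issame)
    return path_list, issame_list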
def start_requests(self):
    cities = utils.read_pairs(self.cities).keys()
    if ',' in self.start_time:
        # comma-separated list of single days, e.g. '2020-01-01,2020-01-02'
        for d in self.start_time.split(','):
            start_time = datetime.datetime.strptime(d, '%Y-%m-%d')
            end_time = start_time + datetime.timedelta(days=1)
            for city_id in cities:
                yield self._request(city_id=city_id,
                                    start_time=start_time,
                                    end_time=end_time)
    else:
        # a single date, optionally with an end date closing the range
        self.start_time = datetime.datetime.strptime(self.start_time, '%Y-%m-%d')
        if self.end_time:
            self.end_time = datetime.datetime.strptime(self.end_time, '%Y-%m-%d')
        else:
            self.end_time = self.start_time + datetime.timedelta(days=1)
        delta = (self.end_time - self.start_time).days
        for offset in range(delta):
            start_time = self.start_time + datetime.timedelta(days=offset)
            end_time = start_time + datetime.timedelta(days=1)
            for city_id in cities:
                yield self._request(city_id=city_id,
                                    start_time=start_time,
                                    end_time=end_time)
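# Both branches above emit one request per (city, day). A hedged sketch of a helper
# that unifies the two cases; the name iter_days is illustrative, not part of the spider.
import datetime


def iter_days(start_str, end_str=None):
    """Yield (start, end) datetime pairs, one per day.

    start_str may be a single date, a comma-separated list of dates, or the
    start of a range closed by end_str.
    """
    one_day = datetime.timedelta(days=1)
    if ',' in start_str:
        for d in start_str.split(','):
            start = datetime.datetime.strptime(d, '%Y-%m-%d')
            yield start, start + one_day
        return
    start = datetime.datetime.strptime(start_str, '%Y-%m-%d')
    end = (datetime.datetime.strptime(end_str, '%Y-%m-%d')
           if end_str else start + one_day)
    for offset in range((end - start).days):
        day = start + offset * one_day
        yield day, day + one_day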
def get_data_dicts(self):
    data_dicts = []
    for coll in self.collections:
        # `run` is not defined in this snippet; it presumably comes from the
        # enclosing module or should be an attribute such as self.run
        prop_data_dicts = utils.read_pairs(coll, run, source='test')
        for prop, dicts in prop_data_dicts.items():
            for d in dicts:
                d['concept'] = d['lemma']
                d.pop('lemma')
                d['collection'] = coll  # bug fix: was the no-op d['collection'] = d['collection']
                d['sources'] = 'test'
            data_dicts.extend(dicts)
    return data_dicts
def get_data_dicts(self):
    data_dicts = []
    for coll in self.collections:
        prop_data_dicts = read_pairs(coll)
        for prop, dicts in prop_data_dicts.items():
            for d in dicts:
                # rename keys to the schema expected downstream
                d['concept'] = d['lemma']
                d.pop('lemma')
                d['collection'] = coll
                d['sources'] = d['sources_str']
                d.pop('sources_str')
            data_dicts.extend(dicts)
    return data_dicts
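# A hypothetical round-trip illustrating the key renaming done by get_data_dicts;
# the input shape is inferred from the code above, and the values are made up.
prop_data_dicts = {
    'dangerous': [
        {'lemma': 'shark', 'sources_str': 'quasimodo'},
    ],
}
for prop, dicts in prop_data_dicts.items():
    for d in dicts:
        d['concept'] = d.pop('lemma')
        d['collection'] = 'example_collection'
        d['sources'] = d.pop('sources_str')
print(prop_data_dicts)
# {'dangerous': [{'concept': 'shark', 'collection': 'example_collection', 'sources': 'quasimodo'}]}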
def __init__(self, data_path, c):
    import shutil
    self.data_path = data_path
    self.c = c
    self.output_path = _p.join('result', self.data_path)
    utils.mkdir_p(self.output_path)
    self.cgmdir = _p.join(self.data_path, 'cgm')
    self.spiketime_dir = _p.join(self.data_path, 'spiketime')
    # load per-unit statistics and keep a copy next to the results
    self.single_data = utils.read_table(
        _p.join(self.data_path, 'statid_correlation_mu_sigma2_urate.dat'))
    shutil.copyfile(
        _p.join(self.data_path, 'statid_correlation_mu_sigma2_urate.dat'),
        _p.join(self.output_path, 'statid_correlation_mu_sigma2_urate.dat'))
    # units whose firing rate is numerically zero
    self.zero_firing = [i for i in self.single_data
                        if self.single_data[i]['urate'] < 1e-8]
    self.pairs = utils.read_pairs(self.cgmdir)
    self.pair_data = [{'id': p} for p in self.pairs]
parser.add_argument('-sw', '--stopwords', dest='stop_words_file',
                    help='Stop words file', metavar='<file>')
parser.add_argument('-idf', '--idffile', dest='idf_file',
                    help='IDF file', metavar='<file>')
args = parser.parse_args()

input_file = args.input_file
output_file = args.output_file

stop_words_file = args.stop_words_file
stop_words = []
if stop_words_file:
    stop_words = utils.read_lines(stop_words_file)

idf_file = args.idf_file
idfs = {}
if idf_file:
    idfs = utils.read_pairs(idf_file, float)

scored_rows = []
field_names = []
with open(input_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=',')
    field_names = reader.fieldnames
    for row in reader:
        content = row['content']
        if 'after_user_comment' in row and len(row['after_user_comment']) > 0:
            content = content + '。' + row['after_user_comment']
        if 'answer_content' in row and len(row['answer_content']) > 0:
            content = content + '。' + row['answer_content']
        words = jieba.cut(content)
        words_set = set()
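# The snippet above breaks off right after tokenization. A minimal sketch of how
# the IDF weights loaded via utils.read_pairs(idf_file, float) might be applied
# to score a row; the function name and the fallback weight are assumptions, not
# part of the original script.
import jieba

DEFAULT_IDF = 10.0  # assumed fallback for out-of-vocabulary words


def score_content(content, idfs, stop_words):
    """Sum IDF weights over the distinct non-stopword tokens of content."""
    words_set = {w for w in jieba.cut(content) if w not in stop_words}
    return sum(idfs.get(w, DEFAULT_IDF) for w in words_set)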
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--exp_name', default='lfw_eval')
    parser.add_argument('-g', '--gpu', type=int, default=0)
    parser.add_argument('-d', '--dataset_path',
                        default='/srv/data1/arunirc/datasets/lfw-deepfunneled')
    parser.add_argument('--fold', type=int, default=0, choices=[0, 10])
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('-m', '--model_path', default=None, required=True,
                        help='Path to pre-trained model')
    parser.add_argument('--model_type', default='resnet50',
                        choices=['resnet50', 'resnet101', 'resnet101-512d'])
    args = parser.parse_args()

    # CUDA setup
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    cuda = torch.cuda.is_available()
    torch.manual_seed(1337)
    if cuda:
        torch.cuda.manual_seed(1337)
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True  # enable if all images are the same size

    if args.fold == 0:
        pairs_path = './lfw/data/pairsDevTest.txt'
    else:
        pairs_path = './lfw/data/pairs.txt'

    # -------------------------------------------------------------------------
    # 1. Dataset
    # -------------------------------------------------------------------------
    file_ext = 'jpg'  # note: no '.' before jpg
    num_class = 8631
    pairs = utils.read_pairs(pairs_path)
    path_list, issame_list = utils.get_paths(args.dataset_path, pairs, file_ext)

    # Define data transforms
    RGB_MEAN = [0.485, 0.456, 0.406]
    RGB_STD = [0.229, 0.224, 0.225]
    test_transform = transforms.Compose([
        transforms.Scale((250, 250)),  # make 250x250 (Scale is renamed Resize in newer torchvision)
        transforms.CenterCrop(150),    # then take 150x150 center crop
        transforms.Scale((224, 224)),  # resize to the network's required input size
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD),
    ])

    # Create data loader
    test_loader = torch.utils.data.DataLoader(
        data_loader.LFWDataset(path_list, issame_list, test_transform),
        batch_size=args.batch_size, shuffle=False)

    # -------------------------------------------------------------------------
    # 2. Model
    # -------------------------------------------------------------------------
    if args.model_type == 'resnet50':
        model = torchvision.models.resnet50(pretrained=False)
        model.fc = torch.nn.Linear(2048, num_class)
    elif args.model_type == 'resnet101':
        model = torchvision.models.resnet101(pretrained=False)
        model.fc = torch.nn.Linear(2048, num_class)
    elif args.model_type == 'resnet101-512d':
        model = torchvision.models.resnet101(pretrained=False)
        layers = []
        layers.append(torch.nn.Linear(2048, 512))
        layers.append(torch.nn.Linear(512, num_class))
        model.fc = torch.nn.Sequential(*layers)
    else:
        raise NotImplementedError

    checkpoint = torch.load(args.model_path)
    if checkpoint['arch'] == 'DataParallel':
        # the model was trained and saved using DataParallel
        model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4])
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.module  # get the network module from inside its DataParallel wrapper
    else:
        model.load_state_dict(checkpoint['model_state_dict'])
    if cuda:
        model = model.cuda()

    # Convert the trained network into a "feature extractor"
    feature_map = list(model.children())
    if args.model_type == 'resnet101-512d':
        model.eval()
        extractor = model
        extractor.fc = nn.Sequential(extractor.fc[0])
    else:
        feature_map.pop()
        extractor = nn.Sequential(*feature_map)
    extractor.eval()  # set to evaluation mode (fixes BatchNorm, dropout, etc.)

    # -------------------------------------------------------------------------
    # 3. Feature extraction
    # -------------------------------------------------------------------------
    features = []
    for batch_idx, images in tqdm.tqdm(enumerate(test_loader),
                                       total=len(test_loader),
                                       desc='Extracting features'):
        x = Variable(images, volatile=True)  # test-time memory conservation (pre-0.4 PyTorch API)
        if cuda:
            x = x.cuda()
        feat = extractor(x)
        if cuda:
            feat = feat.data.cpu()
        else:
            feat = feat.data
        features.append(feat)

    features = torch.stack(features)
    sz = features.size()
    features = features.view(sz[0] * sz[1], sz[2])
    features = F.normalize(features, p=2, dim=1)  # L2-normalize
    # TODO - cache features

    # -------------------------------------------------------------------------
    # 4. Verification
    # -------------------------------------------------------------------------
    num_feat = features.size()[0]
    feat_pair1 = features[np.arange(0, num_feat, 2), :]
    feat_pair2 = features[np.arange(1, num_feat, 2), :]
    feat_dist = (feat_pair1 - feat_pair2).norm(p=2, dim=1)
    feat_dist = feat_dist.numpy()

    # Eval metrics: negate the distance so that a larger score means more similar
    scores = -feat_dist
    gt = np.asarray(issame_list)

    if args.fold == 0:
        fig_path = osp.join(here, args.exp_name + '_' + args.model_type +
                            '_lfw_roc_devTest.png')
        roc_auc = sklearn.metrics.roc_auc_score(gt, scores)
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(gt, scores)
        print('ROC-AUC: %.04f' % roc_auc)

        # Plot and save ROC curve
        fig = plt.figure()
        plt.title('ROC - lfw dev-test')
        plt.plot(fpr, tpr, lw=2, label='ROC (auc = %0.4f)' % roc_auc)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.grid()
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.tight_layout()
    else:
        # 10-fold protocol: 600 pairs in each fold
        fold_size = 600
        roc_auc = np.zeros(10)
        roc_eer = np.zeros(10)
        fig = plt.figure()
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.grid()
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        for i in tqdm.tqdm(range(10)):
            start = i * fold_size
            end = (i + 1) * fold_size
            scores_fold = scores[start:end]
            gt_fold = gt[start:end]
            roc_auc[i] = sklearn.metrics.roc_auc_score(gt_fold, scores_fold)
            fpr, tpr, _ = sklearn.metrics.roc_curve(gt_fold, scores_fold)
            # EER calc: https://yangcha.github.io/EER-ROC/
            roc_eer[i] = brentq(
                lambda x: 1. - x - interpolate.interp1d(fpr, tpr)(x), 0., 1.)
            plt.plot(fpr, tpr, alpha=0.4, lw=2, color='darkgreen',
                     label='ROC(auc=%0.4f, eer=%0.4f)' % (roc_auc[i], roc_eer[i]))
        plt.title('AUC: %0.4f +/- %0.4f, EER: %0.4f +/- %0.4f' %
                  (np.mean(roc_auc), np.std(roc_auc),
                   np.mean(roc_eer), np.std(roc_eer)))
        plt.tight_layout()
        fig_path = osp.join(here, args.exp_name + '_' + args.model_type +
                            '_lfw_roc_10fold.png')

    plt.savefig(fig_path, bbox_inches='tight')
    print('ROC curve saved at: ' + fig_path)
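# The script above leaves "TODO - cache features". A minimal sketch of one way to
# fill it in with torch.save/torch.load; the cache path and function name are
# assumptions, not part of the original script.
import os
import torch

FEATURE_CACHE = 'lfw_features.pth'  # hypothetical cache location


def extract_or_load(extract_fn, cache_path=FEATURE_CACHE):
    """Run extract_fn() once and reuse its tensor output on later runs."""
    if os.path.exists(cache_path):
        return torch.load(cache_path)
    features = extract_fn()
    torch.save(features, cache_path)
    return features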
dataset_train = datasets.ImageFolder(TRAIN_PATH, train_transform)

# For an unbalanced dataset we create a weighted sampler
# * Balanced class sampling: https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/3
weights = utils.make_weights_for_balanced_classes(dataset_train.imgs,
                                                  len(dataset_train.classes))
weights = torch.DoubleTensor(weights)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))
train_loader = torch.utils.data.DataLoader(dataset_train,
                                           batch_size=TRAIN_BATCH_SIZE,
                                           sampler=sampler, drop_last=True)
num_class = len(train_loader.dataset.classes)
print('Number of Training Classes: %d' % num_class)

pairs = utils.read_pairs(PAIR_TEXT_PATH)
path_list, issame_list = utils.get_paths(VAL_PATH, pairs, FILE_EXT)
val_loader = torch.utils.data.DataLoader(
    data_loader.LFWDataset(path_list, issame_list, val_transform),
    batch_size=VAL_BATCH_SIZE, shuffle=False)

#======= Model & Optimizer =======#
if MODEL_NAME.lower() == 'resnet18':
    model = torchvision.models.resnet18(pretrained=True)
elif MODEL_NAME.lower() == 'resnet34':
    model = torchvision.models.resnet34(pretrained=True)
elif MODEL_NAME.lower() == 'resnet50':
    model = torchvision.models.resnet50(pretrained=True)
elif MODEL_NAME.lower() == 'resnet101':
    model = torchvision.models.resnet101(pretrained=True)
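# utils.make_weights_for_balanced_classes is not shown above. A sketch of the
# standard recipe from the linked discuss.pytorch.org thread: each sample is
# weighted by N / count(its class), so every class is drawn equally often on
# average by the WeightedRandomSampler.
def make_weights_for_balanced_classes_sketch(images, nclasses):
    """images is ImageFolder.imgs: a list of (path, class_index) tuples."""
    count = [0] * nclasses
    for _, label in images:
        count[label] += 1
    weight_per_class = [float(len(images)) / count[c] for c in range(nclasses)]
    return [weight_per_class[label] for _, label in images]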