def prepare(self, dim, sd):
    """ Make torch Tensors from g2-`dim`-`sd` and infer labels.

    Args:
        dim: dimensionality of the g2 set.
        sd: standard deviation of the g2 set.

    Returns:
        (data, target) tensors; the first 1024 samples get label 0, the rest label 1.
    """
    filename = 'g2-{}-{}.txt'.format(dim, sd)
    data = []
    target = []
    with open(os.path.join(self.root, filename)) as in_f:
        for i, line in enumerate(in_f):
            a, b = list(map(int, line.split())), 0 if i < 1024 else 1
            data.append(a)
            target.append(b)
    data = torch.Tensor(data)
    target = torch.Tensor(target)
    if self.stardardize:
        data = (data - 550) / 50
    return data, target
def prepare(self, *select):
    """ Make torch Tensors from the selected data and label files.

    Args:
        *select: selectors forwarded to self.files() to pick the data/label file pair.

    Returns:
        (data, target) tensors.
    """
    datafile, labelfile = self.files(*select)
    data_filepath = os.path.join(self.root, datafile)
    label_filepath = os.path.join(self.root, labelfile)
    data = []
    target = []
    with open(data_filepath) as data_f, open(label_filepath) as label_f:
        for x, y in zip(data_f, it.islice(label_f, self.sync_files, None)):
            data.append(list(map(int, x.split())))
            target.append(int(y))
    data = torch.Tensor(data)
    target = torch.Tensor(target)
    if self.stardardize:
        data_mean = data.mean(dim=0, keepdim=True)
        data_std = data.std(dim=0, keepdim=True)
        data = (data - data_mean) / data_std
    return data, target
def cbow(sentences, window_size):
    """
    Create data based on the CBOW approach (predict the target word from its context words).
    """
    data = []
    for pair in sentences:
        # Extract data
        doc_id, sentence = pair[0], pair[1]
        # For each index
        for i in range(window_size, len(sentence) - window_size):
            # Collect contexts
            context = [sentence[i - size]
                       for size in range(window_size, -window_size - 1, -1)
                       if size != 0]
            # Target
            target = sentence[i]
            # Add to data
            data.append((doc_id, context, target))
    return data
def skip_gram(sentences, window_size):
    """
    Create data based on the skip-gram approach (predict context words from the target word).
    """
    data = []
    for sentence in sentences:
        # For each index
        for i, index in enumerate(sentence):
            # Collect valid context indexes
            contexts = []
            for window in range(window_size):
                # left side
                if i - (window + 1) >= 0:
                    contexts.append(sentence[i - (window + 1)])
                # right side
                if i + (window + 1) < len(sentence):
                    contexts.append(sentence[i + (window + 1)])
            # Add to data
            for context in contexts:
                data.append((index, context))
    return data
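# A minimal usage sketch for the two builders above. It assumes skip_gram()
# receives plain token-id lists and cbow() receives (doc_id, token-id list)
# pairs, as implied by how each function indexes its input; the toy corpus
# below is made up purely for illustration.
toy_sentences = [[0, 1, 2, 3, 4], [2, 3, 4, 5, 6]]

sg_pairs = skip_gram(toy_sentences, window_size=2)
# e.g. (2, 0), (2, 1), (2, 3), (2, 4) appear among the (center, context) pairs
print(len(sg_pairs), sg_pairs[:4])

cbow_examples = cbow([(0, s) for s in toy_sentences], window_size=2)
# each entry is (doc_id, [left..right context], target), e.g. (0, [0, 1, 3, 4], 2)
print(cbow_examples[0])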
def get_paths(self):
    print('Identifying %s dataset.' % self.split)
    data = []
    labels = []
    # Get the corresponding label for each image.
    for line in self.lines:
        imgpath = line
        img_filename = ntpath.basename(imgpath)
        anno_filename = img_filename.replace('jpg', 'png')
        labpath = imgpath.replace('imgs', 'annos').replace(img_filename, anno_filename)
        if not os.path.exists(labpath):
            print('Could not find label for %s.' % imgpath)
            continue
        data.append(imgpath)
        labels.append(labpath)
    if self.split in ['train', 'val']:
        self.train_data = data
        self.train_labels = labels
        self.val_data = self.train_data[-self.val_samples:]
        self.val_labels = self.train_labels[-self.val_samples:]
        self.train_data = self.train_data[:-self.val_samples]
        self.train_labels = self.train_labels[:-self.val_samples]
    else:
        self.test_data = data
        self.test_labels = labels
def prepare(self):
    """ Make torch Tensors from data and label files.

    Returns:
        (data, target) tensors; labels are shifted to start from 0.
    """
    datafile = self.urls[0].rpartition('/')[2]
    data_filepath = os.path.join(self.root, datafile)
    data = []
    target = []
    with open(data_filepath) as data_f:
        for sample in data_f:
            x, y, label = tuple(map(float, sample.split()))
            data.append([x, y])
            target.append(int(label) - 1)
    data = torch.Tensor(data)
    target = torch.Tensor(target)
    if self.stardardize:
        data_mean = data.mean(dim=0, keepdim=True)
        data_std = data.std(dim=0, keepdim=True)
        data = (data - data_mean) / data_std
    return data, target
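# A minimal usage sketch for the prepare() methods above, assuming a dataset
# object `ds` that exposes prepare() plus the `root`/`stardardize` attributes
# those methods rely on (the object name and batch size here are hypothetical).
import torch
from torch.utils.data import DataLoader, TensorDataset

data, target = ds.prepare()                   # e.g. the 2-D point set above
wrapped = TensorDataset(data, target.long())  # targets as integer class indices
loader = DataLoader(wrapped, batch_size=32, shuffle=True)
for xb, yb in loader:
    pass  # feed (xb, yb) to a model / training step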
parser = argparse.ArgumentParser()
parser.add_argument("--fst_dset", type=str, help="first directory in shared dir")
parser.add_argument("--snd_dset", type=str, help="second directory in shared dir")
opt = parser.parse_args()
print(opt)


def load_image(infilename):
    img = Image.open(infilename)
    img.load()
    data = np.asarray(img, dtype="float64")
    return data


path = '/share/se3/export/data/' + opt.fst_dset + '/' + opt.snd_dset + '/'
data = []
counter = 0
for filename in os.listdir(path):
    if counter > 15000:
        break
    if counter % 1000 == 0:
        print('files read', counter)
    img = load_image(path + filename)
    img *= 255.0 / img.max()
    data.append(np.transpose(img))
    counter += 1

data = torch.from_numpy(np.stack(data))
torch.save(data, opt.fst_dset + '_' + opt.snd_dset + '.pt')

print("Calculating Inception Score...")
print(inception_score(data, cuda=False, batch_size=32, resize=True, splits=10))
def __init__(self, root, image_set, year, img_size, shots=1, phase=1, shuffle=False): self.shuffle = shuffle self.img_size = img_size self.phase = phase subset = 'shots' self.shot_path = os.path.join(root, 'annotations', 'instances_{}2014.json'.format(subset)) self.shots = shots # if phase == 2: # self.shots = shots * 3 # name, paths self._year = year self._image_set = image_set self._data_path = root # load COCO API, classes, class <-> id mappings self._COCO = COCO(self._get_ann_file()) self.json_data = self._COCO.dataset.copy() cats = self._COCO.loadCats(self._COCO.getCatIds()) self._classes = tuple( ['__background__'] + [c['name'] for c in cats if c['name'] not in cfg.VOC_CLASSES] + [c['name'] for c in cats if c['name'] in cfg.VOC_CLASSES]) self._class_to_coco_cat_id = dict( list(zip([c['name'] for c in cats], self._COCO.getCatIds()))) self._image_index = self._load_image_set_index() # Some image sets are "views" (i.e. subsets) into others. # For example, minival2014 is a random 5000 image subset of val2014. # This mapping tells us where the view's images and proposals come from. self._view_map = { 'minival2014': 'val2014', # 5k val2014 subset 'valminusminival2014': 'val2014', # val2014 \setminus minival2014 'valminuscapval2014': 'val2014', 'capval2014': 'val2014', 'captest2014': 'val2014', 'shots2014': 'train2014' } coco_name = image_set + year # e.g., "val2014" self._data_name = (self._view_map[coco_name] if coco_name in self._view_map else coco_name) if phase == 1: self.metaclass = tuple( [c['name'] for c in cats if c['name'] not in cfg.VOC_CLASSES]) else: self.metaclass = tuple( [c['name'] for c in cats if c['name'] not in cfg.VOC_CLASSES] + [c['name'] for c in cats if c['name'] in cfg.VOC_CLASSES]) class_to_idx = dict(zip(self.metaclass, range(len( self.metaclass)))) # class to index mapping self.prndata = [] self.prncls = [] prn_image_pth = os.path.join(root, 'annotations', 'prn_image_{}shots.pt'.format(shots)) prn_mask_pth = os.path.join(root, 'annotations', 'prn_mask_{}shots.pt'.format(shots)) if os.path.exists(prn_image_pth) and os.path.exists( prn_mask_pth) and self.phase == 1: prn_image = torch.load(prn_image_pth) prn_mask = torch.load(prn_mask_pth) else: prn_image, prn_mask = self.get_prndata() torch.save(prn_image, prn_image_pth) torch.save(prn_mask, prn_mask_pth) for i in range(shots): cls = [] data = [] for n, key in enumerate(list(prn_image.keys())): img = torch.from_numpy( np.array(prn_image[key][i % len(prn_image[key])])) img = img.unsqueeze(0) mask = torch.from_numpy( np.array(prn_mask[key][i % len(prn_mask[key])])) mask = mask.unsqueeze(0) mask = mask.unsqueeze(3) imgmask = torch.cat([img, mask], dim=3) cls.append(class_to_idx[key]) data.append(imgmask.permute(0, 3, 1, 2).contiguous()) self.prncls.append(cls) self.prndata.append(torch.cat(data, dim=0))
def read_langs(file_name, entity, can, ind2cand, max_line=None):
    logging.info(("Reading lines from {}".format(file_name)))
    # Read the file and split into lines
    data = []
    context = ""
    u = None
    r = None
    with open(file_name) as fin:
        cnt_ptr = 0
        cnt_voc = 0
        max_r_len = 0
        cnt_lin = 1
        for line in fin:
            line = line.strip()
            if line:
                nid, line = line.split(' ', 1)
                if '\t' in line:
                    u, r = line.split('\t')
                    context += str(u) + " "
                    contex_arr = context.split(' ')[LIMIT:]
                    r_index = []
                    gate = []
                    for key in r.split(' '):
                        if (key in entity):
                            index = [
                                loc for loc, val in enumerate(contex_arr)
                                if val == key
                            ]
                            if (index):
                                index = max(index)
                                gate.append(1)
                                cnt_ptr += 1
                            else:
                                index = len(contex_arr)
                                gate.append(0)
                                cnt_voc += 1
                            r_index.append(index)
                    if (len(r_index) == 0):
                        r_index = [
                            len(contex_arr), len(contex_arr),
                            len(contex_arr), len(contex_arr)
                        ]
                    if (len(r_index) == 1):
                        r_index.append(len(contex_arr))
                        r_index.append(len(contex_arr))
                        r_index.append(len(contex_arr))
                    if len(r_index) > max_r_len:
                        max_r_len = len(r_index)
                    data.append(
                        [" ".join(contex_arr) + " $$$$", can[r], r_index, r])
                    context += str(r) + " "
                else:
                    r = line
                    context += str(r) + " "
            else:
                cnt_lin += 1
                if (max_line and cnt_lin >= max_line):
                    break
                context = ""
    max_len = max([len(d[0].split(' ')) for d in data])
    logging.info("Pointer percentage= {} ".format(cnt_ptr / (cnt_ptr + cnt_voc)))
    logging.info("Max response Len: {}".format(max_r_len))
    logging.info("Max Input Len: {}".format(max_len))
    return data, max_len, max_r_len
def read_langs(file_name, max_line=None): print(("Reading lines from {}".format(file_name))) data, context_arr, conv_arr, kb_arr = [], [], [], [] max_resp_len = 0 with open('data/MULTIWOZ2.1/global_entities.json') as f: global_entity = json.load(f) with open(file_name) as fin: cnt_lin, sample_counter = 1, 1 for line in fin: line = line.strip() if line: if line[-1] == line[0] == "#": line = line.replace("#", "") task_type = line continue nid, line = line.split(' ', 1) if '\t' in line: u, r, gold_ent = line.split('\t') gen_u = generate_memory(u, "$u", str(nid)) context_arr += gen_u conv_arr += gen_u # Get gold entity for each domain gold_ent = ast.literal_eval(gold_ent) ent_idx_restaurant, ent_idx_attraction, ent_idx_hotel = [], [], [] if task_type == "restaurant": ent_idx_restaurant = gold_ent elif task_type == "attraction": ent_idx_attraction = gold_ent elif task_type == "hotel": ent_idx_hotel = gold_ent ent_index = list(set(ent_idx_restaurant + ent_idx_attraction + ent_idx_hotel)) # Get local pointer position for each word in system response ptr_index = [] for key in r.split(): index = [loc for loc, val in enumerate(context_arr) if (val[0] == key and key in ent_index)] if (index): index = max(index) else: index = len(context_arr) ptr_index.append(index) # Get global pointer labels for words in system response, the 1 in the end is for the NULL token selector_index = [1 if (word_arr[0] in ent_index or word_arr[0] in r.split()) else 0 for word_arr in context_arr] + [1] sketch_response, gold_sketch = generate_template(global_entity, r, gold_ent, kb_arr, task_type) data_detail = { 'context_arr': list(context_arr + [['$$$$'] * MEM_TOKEN_SIZE]), # $$$$ is NULL token 'response': r, 'sketch_response': sketch_response, 'gold_sketch': gold_sketch, 'ptr_index': ptr_index + [len(context_arr)], 'selector_index': selector_index, 'ent_index': ent_index, 'ent_idx_restaurant': list(set(ent_idx_restaurant)), 'ent_idx_attraction': list(set(ent_idx_attraction)), 'ent_idx_hotel': list(set(ent_idx_hotel)), 'conv_arr': list(conv_arr), 'kb_arr': list(kb_arr), 'id': int(sample_counter), 'ID': int(cnt_lin), 'domain': task_type} data.append(data_detail) gen_r = generate_memory(r, "$s", str(nid)) context_arr += gen_r conv_arr += gen_r if max_resp_len < len(r.split()): max_resp_len = len(r.split()) sample_counter += 1 else: r = line kb_info = generate_memory(r, "", str(nid)) context_arr = kb_info + context_arr kb_arr += kb_info else: cnt_lin += 1 context_arr, conv_arr, kb_arr = [], [], [] if (max_line and cnt_lin >= max_line): break return data, max_resp_len
def read_langs(file_name, global_entity, type_dict, max_line=None): # print(("Reading lines from {}".format(file_name))) data, context_arr, conv_arr, kb_arr = [], [], [], [] max_resp_len, sample_counter = 0, 0 with open(file_name) as fin: cnt_lin = 1 for line in fin: line = line.strip() if line: nid, line = line.split(' ', 1) # print("line", line) if '\t' in line: u, r = line.split('\t') gen_u = generate_memory(u, "$u", str(nid)) context_arr += gen_u conv_arr += gen_u ptr_index, ent_words = [], [] # Get local pointer position for each word in system response for key in r.split(): if key in global_entity and key not in ent_words: ent_words.append(key) index = [ loc for loc, val in enumerate(context_arr) if (val[0] == key and key in global_entity) ] index = max(index) if (index) else len(context_arr) ptr_index.append(index) # Get global pointer labels for words in system response, the 1 in the end is for the NULL token selector_index = [ 1 if (word_arr[0] in ent_words or word_arr[0] in r.split()) else 0 for word_arr in context_arr ] + [1] sketch_response = generate_template( global_entity, r, type_dict) data_detail = { 'context_arr': list(context_arr + [['$$$$'] * MEM_TOKEN_SIZE]), # $$$$ is NULL token 'response': r, 'sketch_response': sketch_response, 'ptr_index': ptr_index + [len(context_arr)], 'selector_index': selector_index, 'ent_index': ent_words, 'ent_idx_cal': [], 'ent_idx_nav': [], 'ent_idx_wet': [], 'conv_arr': list(conv_arr), 'kb_arr': list(kb_arr), 'id': int(sample_counter), 'ID': int(cnt_lin), 'domain': "" } data.append(data_detail) gen_r = generate_memory(r, "$s", str(nid)) context_arr += gen_r conv_arr += gen_r if max_resp_len < len(r.split()): max_resp_len = len(r.split()) sample_counter += 1 else: r = line kb_info = generate_memory(r, "", str(nid)) context_arr = kb_info + context_arr kb_arr += kb_info else: cnt_lin += 1 context_arr, conv_arr, kb_arr = [], [], [] if (max_line and cnt_lin >= max_line): break return data, max_resp_len
def get_video_frames(self, video_index):
    front_vid_path, tac_path, pos_path, label_path, vid_frames, tac_frames, label = self.video_data[video_index]
    data = []
    for vid, tac, lab in zip(vid_frames, tac_frames, label):
        data.append(self.custom_getitem(front_vid_path, vid, lab, tac_path, pos_path, tac))
    return data, label_path
def read_langs(file_name, gating_dict, SLOTS, dataset, lang, mem_lang, sequicity, training, max_line=None): print(("Reading from {}".format(file_name))) data = [] max_resp_len, max_value_len = 0, 0 domain_counter = {} with open(file_name) as f: dials = json.load(f) # create vocab first for dial_dict in dials: if (args["all_vocab"] or dataset == "train") and training: for ti, turn in enumerate(dial_dict["dialogue"]): lang.index_words(turn["system_transcript"], 'utter') lang.index_words(turn["transcript"], 'utter') # determine training data ratio, default is 100% if training and dataset == "train" and args["data_ratio"] != 100: random.Random(10).shuffle(dials) dials = dials[:int(len(dials) * 0.01 * args["data_ratio"])] cnt_lin = 1 for dial_dict in dials: dialog_history = "" last_belief_dict = {} # Filtering and counting domains for domain in dial_dict["domains"]: if domain not in EXPERIMENT_DOMAINS: continue if domain not in domain_counter.keys(): domain_counter[domain] = 0 domain_counter[domain] += 1 # Unseen domain setting if args["only_domain"] != "" and args[ "only_domain"] not in dial_dict["domains"]: continue if (args["except_domain"] != "" and dataset == "test" and args["except_domain"] not in dial_dict["domains"]) or \ (args["except_domain"] != "" and dataset != "test" and [args["except_domain"]] == dial_dict["domains"]): continue # Reading data for ti, turn in enumerate(dial_dict["dialogue"]): turn_domain = turn["domain"] turn_id = turn["turn_idx"] turn_uttr = turn["system_transcript"] + " ; " + turn[ "transcript"] turn_uttr_strip = turn_uttr.strip() dialog_history += (turn["system_transcript"] + " ; " + turn["transcript"] + " ; ") source_text = dialog_history.strip() turn_belief_dict = fix_general_label_error( turn["belief_state"], False, SLOTS) # Generate domain-dependent slot list slot_temp = SLOTS if dataset == "train" or dataset == "dev": if args["except_domain"] != "": slot_temp = [ k for k in SLOTS if args["except_domain"] not in k ] turn_belief_dict = OrderedDict([ (k, v) for k, v in turn_belief_dict.items() if args["except_domain"] not in k ]) elif args["only_domain"] != "": slot_temp = [ k for k in SLOTS if args["only_domain"] in k ] turn_belief_dict = OrderedDict([ (k, v) for k, v in turn_belief_dict.items() if args["only_domain"] in k ]) else: if args["except_domain"] != "": slot_temp = [ k for k in SLOTS if args["except_domain"] in k ] turn_belief_dict = OrderedDict([ (k, v) for k, v in turn_belief_dict.items() if args["except_domain"] in k ]) elif args["only_domain"] != "": slot_temp = [ k for k in SLOTS if args["only_domain"] in k ] turn_belief_dict = OrderedDict([ (k, v) for k, v in turn_belief_dict.items() if args["only_domain"] in k ]) turn_belief_list = [ str(k) + '-' + str(v) for k, v in turn_belief_dict.items() ] if (args["all_vocab"] or dataset == "train") and training: mem_lang.index_words(turn_belief_dict, 'belief') class_label, generate_y, slot_mask, gating_label = [], [], [], [] start_ptr_label, end_ptr_label = [], [] for slot in slot_temp: if slot in turn_belief_dict.keys(): generate_y.append(turn_belief_dict[slot]) if turn_belief_dict[slot] == "dontcare": gating_label.append(gating_dict["dontcare"]) elif turn_belief_dict[slot] == "none": gating_label.append(gating_dict["none"]) else: gating_label.append(gating_dict["ptr"]) if max_value_len < len(turn_belief_dict[slot]): max_value_len = len(turn_belief_dict[slot]) else: generate_y.append("none") gating_label.append(gating_dict["none"]) # 可以根据ID和turn_idx将内容复原 data_detail = { "ID": dial_dict["dialogue_idx"], 
"domains": dial_dict["domains"], "turn_domain": turn_domain, "turn_id": turn_id, "dialog_history": source_text, "turn_belief": turn_belief_list, "gating_label": gating_label, "turn_uttr": turn_uttr_strip, 'generate_y': generate_y } data.append(data_detail) if max_resp_len < len(source_text.split()): max_resp_len = len(source_text.split()) cnt_lin += 1 if (max_line and cnt_lin >= max_line): break # add t{} to the lang file if "t{}".format(max_value_len - 1) not in mem_lang.word2index.keys() and training: for time_i in range(max_value_len): mem_lang.index_words("t{}".format(time_i), 'utter') print("domain_counter", domain_counter) return data, max_resp_len, slot_temp
def main(): torch.manual_seed(1) torch.cuda.manual_seed_all(1) global args, best_prec1 best_prec1 = 0 args = parser.parse_args() time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') if args.evaluate: args.results_dir = '/tmp' if args.save is '': args.save = time_stamp save_path = os.path.join(args.results_dir, args.save) if not os.path.exists(save_path): os.makedirs(save_path) args.noise = not args.no_noise args.quant = not args.no_quantization args.act_quant = not args.no_act_quantization args.quant_edges = not args.no_quant_edges logging.info("saving to %s", save_path) logging.debug("run arguments: %s", args) if args.gpus is not None: args.gpus = [int(i) for i in args.gpus.split(',')] device = 'cuda:' + str(args.gpus[0]) cudnn.benchmark = True else: device = 'cpu' dtype = torch.float32 args.step_setup = None model = models.__dict__[args.model] model_config = { 'scale': args.scale, 'input_size': args.input_size, 'dataset': args.dataset, 'bitwidth': args.bitwidth, 'quantize': args.quant, 'noise': args.noise, 'step': args.step, 'depth': args.depth, 'act_bitwidth': args.act_bitwidth, 'act_quant': args.act_quant, 'quant_edges': args.quant_edges, 'step_setup': args.step_setup, 'quant_epoch_step': args.quant_epoch_step, 'quant_start_stage': args.quant_start_stage, 'normalize': args.no_pre_process_normalize, 'noise_mask': args.noise_mask } if args.model_config is not '': model_config = dict(model_config, **literal_eval(args.model_config)) # create model model = model(**model_config) logging.info("creating model %s", args.model) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print("number of parameters: ", params) logging.info("created model with configuration: %s", model_config) print(model) data = None checkpoint_epoch = 0 # optionally resume from a checkpoint if args.evaluate: if not os.path.isfile(args.evaluate): parser.error('invalid checkpoint: {}'.format(args.evaluate)) checkpoint = torch.load(args.evaluate, map_location=device) load_model(model, checkpoint) logging.info("loaded checkpoint '%s' (epoch %s)", args.evaluate, checkpoint['epoch']) print("loaded checkpoint {0} (epoch {1})".format( args.evaluate, checkpoint['epoch'])) elif args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume, map_location=device) if not args.start_from_zero: args.start_epoch = checkpoint['epoch'] - 1 best_test = checkpoint['best_prec1'] checkpoint_epoch = checkpoint['epoch'] load_model(model, checkpoint) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) elif os.path.isdir(args.resume): checkpoint_path = os.path.join(args.resume, 'checkpoint.pth.tar') csv_path = os.path.join(args.resume, 'results.csv') print("=> loading checkpoint '{}'".format(checkpoint_path)) checkpoint = torch.load(checkpoint_path, map_location=device) best_test = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) print("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_path, checkpoint['epoch'])) data = [] with open(csv_path) as csvfile: reader = csv.DictReader(csvfile) for row in reader: data.append(row) else: print("=> no checkpoint found at '{}'".format(args.resume)) if args.gpus is not None: model = torch.nn.DataParallel( model, [args.gpus[0]] ) # Statistics need to be calculated on single GPU to be consistant with data among multiplr GPUs # Data loading code default_transform = { 'train': 
get_transform(args.dataset, input_size=args.input_size, augment=True, integer_values=args.quant_dataloader, norm=not args.no_pre_process_normalize), 'eval': get_transform(args.dataset, input_size=args.input_size, augment=False, integer_values=args.quant_dataloader, norm=not args.no_pre_process_normalize) } transform = getattr(model.module, 'input_transform', default_transform) val_data = get_dataset(args.dataset, 'val', transform['eval'], datasets_path=args.datapath) val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.val_batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) train_data = get_dataset(args.dataset, 'train', transform['train'], datasets_path=args.datapath) train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) statistics_train_loader = torch.utils.data.DataLoader( train_data, batch_size=args.act_stats_batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=args.momentum, weight_decay=args.decay, nesterov=True) model, criterion = model.to(device, dtype), criterion.to(device, dtype) if args.clr: scheduler = CyclicLR(optimizer, base_lr=args.min_lr, max_lr=args.max_lr, step_size=args.epochs_per_step * len(train_loader), mode=args.mode) else: scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=args.gamma) csv_logger = CsvLogger(filepath=save_path, data=data) csv_logger.save_params(sys.argv, args) csv_logger_training_stats = os.path.join(save_path, 'training_stats.csv') # pre-training activation and parameters statistics calculation #### if check_if_need_to_collect_statistics(model): for layer in model.modules(): if isinstance(layer, actquant.ActQuantBuffers): layer.pre_training_statistics = True # Turn on pre-training activation statistics calculation model.module.statistics_phase = True validate( statistics_train_loader, model, criterion, device, epoch=0, num_of_batches=80, stats_phase=True) # Run validation on training set for statistics model.module.quantize.get_act_max_value_from_pre_calc_stats( list(model.modules())) _ = model.module.quantize.set_weight_basis(list(model.modules()), None) for layer in model.modules(): if isinstance(layer, actquant.ActQuantBuffers): layer.pre_training_statistics = False # Turn off pre-training activation statistics calculation model.module.statistics_phase = False else: # Maximal activation values still need to be derived from loaded stats model.module.quantize.assign_act_clamp_during_val(list( model.modules()), print_clamp_val=True) model.module.quantize.assign_weight_clamp_during_val( list(model.modules()), print_clamp_val=True) # model.module.quantize.get_act_max_value_from_pre_calc_stats(list(model.modules())) if args.gpus is not None: # Return to Multi-GPU after statistics calculations model = torch.nn.DataParallel(model.module, args.gpus) model, criterion = model.to(device, dtype), criterion.to(device, dtype) # pre-training activation statistics calculation #### if args.evaluate: val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion, device, epoch=0) print("val_prec1: ", val_prec1) return # fast forward to curr stage for i in range(args.quant_start_stage): model.module.switch_stage(0) for epoch in trange(args.start_epoch, args.epochs + 1): if not isinstance(scheduler, CyclicLR): scheduler.step() # scheduler.optimizer = 
optimizer train_loss, train_prec1, train_prec5 = train( train_loader, model, criterion, device, epoch, optimizer, scheduler, training_stats_logger=csv_logger_training_stats) for layer in model.modules(): if isinstance(layer, actquant.ActQuantBuffers): layer.print_clamp() # evaluate on validation set val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion, device, epoch) # remember best prec@1 and save checkpoint is_best = val_prec1 > best_prec1 best_prec1 = max(val_prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'model': args.model, 'config': args.model_config, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'layers_b_dict': model.module. layers_b_dict #TODO this doesn't work for multi gpu - need to del }, is_best, path=save_path) # New type of logging csv_logger.write({ 'epoch': epoch + 1, 'val_error1': 1 - val_prec1, 'val_error5': 1 - val_prec5, 'val_loss': val_loss, 'train_error1': 1 - train_prec1, 'train_error5': 1 - train_prec5, 'train_loss': train_loss }) csv_logger.plot_progress(title=args.model + str(args.depth)) csv_logger.write_text( 'Epoch {}: Best accuracy is {:.2f}% top-1'.format( epoch + 1, best_prec1 * 100.))
target_dir = 'data/modelnet40_2048_category'
train_files = getDataFiles(os.path.join(base_dir, 'test_files.txt'))
# print(train_files)
# TEST_FILES = getDataFiles(os.path.join(BASE_DIR, 'data/modelnet40_ply_hdf5_2048/test_files.txt'))
shape_names = []
with open(os.path.join(base_dir, 'shape_names.txt'), 'r') as f:
    shape_names = [line.replace('\n', '') for line in f.readlines()]
print(shape_names)

data = []
label = []
for fn in range(len(train_files)):
    print('----' + str(fn) + '-----')
    current_data, current_label = loadDataFile(train_files[fn])
    data.append(current_data)
    label.append(current_label)

data = np.concatenate(data, axis=0)
label = np.concatenate(label, axis=0)
print(data.shape)
print(label.shape)

phase = 'test'
for i, shape in enumerate(shape_names):
    indices = np.asarray([ind for ind, l in enumerate(label) if l == i])
    shape_data = data[indices]
    dest_dir = os.path.join(target_dir, shape)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    with h5py.File(os.path.join(dest_dir, '%s_%s.h5' % (shape, phase)),
def train_discriminator(dataset, dataset_fp=None, pretrained_model="gpt2-medium", epochs=10, learning_rate=0.0001, batch_size=64, log_interval=10, save_model=False, cached=False, no_cuda=False, output_fp='.'): device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu" add_eos_token = pretrained_model.startswith("gpt2") if save_model: if not os.path.exists(output_fp): os.makedirs(output_fp) classifier_head_meta_fp = os.path.join( output_fp, "{}_classifier_head_meta.json".format(dataset)) classifier_head_fp_pattern = os.path.join( output_fp, "{}_classifier_head_epoch".format(dataset) + "_{}.pt") print("Preprocessing {} dataset...".format(dataset)) start = time.time() if dataset == "SST": idx2class = [ "positive", "negative", "very positive", "very negative", "neutral" ] class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) text = torchtext_data.Field() label = torchtext_data.Field(sequential=False) train_data, val_data, test_data = datasets.SST.splits( text, label, fine_grained=True, train_subtrees=True, ) x = [] y = [] #preprocess dataset for i in trange(len(train_data), ascii=True): seq = TreebankWordDetokenizer().detokenize( vars(train_data[i])["text"]) seq = discriminator.tokenizer.encode(seq) if add_eos_token: seq = [50256] + seq seq = torch.tensor(seq, device=device, dtype=torch.long) x.append(seq) y.append(class2idx[vars(train_data[i])["label"]]) train_dataset = Dataset(x, y) test_x = [] test_y = [] for i in trange(len(test_data), ascii=True): seq = TreebankWordDetokenizer().detokenize( vars(test_data[i])["text"]) seq = discriminator.tokenizer.encode(seq) if add_eos_token: seq = [50256] + seq seq = torch.tensor(seq, device=device, dtype=torch.long) test_x.append(seq) test_y.append(class2idx[vars(test_data[i])["label"]]) test_dataset = Dataset(test_x, test_y) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": class2idx, "default_class": 2, } elif dataset == "clickbait": idx2class = ["non_clickbait", "clickbait"] class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) with open("datasets/clickbait/clickbait.txt") as f: data = [] for i, line in enumerate(f): try: data.append(eval(line)) except: print("Error evaluating line {}: {}".format(i, line)) continue x = [] y = [] with open("datasets/clickbait/clickbait.txt") as f: for i, line in enumerate(tqdm(f, ascii=True)): try: d = eval(line) seq = discriminator.tokenizer.encode(d["text"]) if len(seq) < max_length_seq: if add_eos_token: seq = [50256] + seq seq = torch.tensor(seq, device=device, dtype=torch.long) else: print( "Line {} is longer than maximum length {}".format( i, max_length_seq)) continue x.append(seq) y.append(d["label"]) except: print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size train_dataset, test_dataset = torch.utils.data.random_split( full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": class2idx, "default_class": 1, } elif dataset == "toxic": idx2class = ["non_toxic", "toxic"] 
class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) x = [] y = [] with open("datasets/toxic/toxic_train.txt") as f: for i, line in enumerate(tqdm(f, ascii=True)): try: d = eval(line) seq = discriminator.tokenizer.encode(d["text"]) if len(seq) < max_length_seq: if add_eos_token: seq = [50256] + seq seq = torch.tensor(seq, device=device, dtype=torch.long) else: print( "Line {} is longer than maximum length {}".format( i, max_length_seq)) continue x.append(seq) y.append(int(np.sum(d["label"]) > 0)) except: print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size train_dataset, test_dataset = torch.utils.data.random_split( full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": class2idx, "default_class": 0, } else: # if dataset == "generic": # This assumes the input dataset is a TSV with the following structure: # class \t text if dataset_fp is None: raise ValueError("When generic dataset is selected, " "dataset_fp needs to be specified aswell.") idx2class = get_idx2class(dataset_fp) discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) full_dataset = get_generic_dataset(dataset_fp, discriminator.tokenizer, device, idx2class=idx2class, add_eos_token=add_eos_token) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size train_dataset, test_dataset = torch.utils.data.random_split( full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": {c: i for i, c in enumerate(idx2class)}, "default_class": 0, } end = time.time() print("Preprocessed {} data points".format( len(train_dataset) + len(test_dataset))) print("Data preprocessing took: {:.3f}s".format(end - start)) if cached: print("Building representation cache...") start = time.time() train_loader = get_cached_data_loader(train_dataset, batch_size, discriminator, shuffle=True, device=device) test_loader = get_cached_data_loader(test_dataset, batch_size, discriminator, device=device) end = time.time() print("Building representation cache took: {:.3f}s".format(end - start)) else: train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=collate_fn) if save_model: with open(classifier_head_meta_fp, "w") as meta_file: json.dump(discriminator_meta, meta_file) optimizer = optim.Adam(discriminator.parameters(), lr=learning_rate) test_losses = [] test_accuracies = [] for epoch in range(epochs): start = time.time() print("\nEpoch", epoch + 1) train_epoch(discriminator=discriminator, data_loader=train_loader, optimizer=optimizer, epoch=epoch, log_interval=log_interval, device=device) test_loss, test_accuracy = evaluate_performance( data_loader=test_loader, discriminator=discriminator, device=device) end = time.time() print("Epoch took: {:.3f}s".format(end - start)) test_losses.append(test_loss) test_accuracies.append(test_accuracy) print("\nExample prediction") 
predict(example_sentence, discriminator, idx2class, cached=cached, device=device) if save_model: # torch.save(discriminator.state_dict(), # "{}_discriminator_{}.pt".format( # args.dataset, epoch + 1 # )) torch.save(discriminator.get_classifier().state_dict(), classifier_head_fp_pattern.format(epoch + 1)) min_loss = float("inf") min_loss_epoch = 0 max_acc = 0.0 max_acc_epoch = 0 print("Test performance per epoch") print("epoch\tloss\tacc") for e, (loss, acc) in enumerate(zip(test_losses, test_accuracies)): print("{}\t{}\t{}".format(e + 1, loss, acc)) if loss < min_loss: min_loss = loss min_loss_epoch = e + 1 if acc > max_acc: max_acc = acc max_acc_epoch = e + 1 print("Min loss: {} - Epoch: {}".format(min_loss, min_loss_epoch)) print("Max acc: {} - Epoch: {}".format(max_acc, max_acc_epoch)) return discriminator, discriminator_meta
def load_train_data_test(self):
    """
    TODO: load the training data, build the list of column names that go into
    training, and the list of all column names (including offset).
    :return:
    """
    first_layer = ['user', 'product']
    second_layer = ['dense', 'sparse']
    third_layer = ['single', 'seq']
    for f_l in first_layer:
        file_member1 = OrderedDict()
        root_layer_col_name = OrderedDict()
        whole_root_layer_col_name = OrderedDict()
        for s_l in second_layer:
            file_member2 = OrderedDict()
            second_layer_col_name = OrderedDict()
            whole_second_layer_col_name = OrderedDict()
            for t_l in third_layer:
                path = f"parquet_file/partitioned_data/train/{f_l}/{s_l}/{t_l}"
                file_list = os.listdir(path)
                file_list_py = [
                    file for file in file_list if file.endswith(".dms")
                ]
                data = list()
                if file_list_py:
                    for file in file_list_py:  # if not empty
                        with open(f'{path}/{file}', 'rb') as f:
                            data.append(
                                pd.read_parquet(f, engine='pyarrow'))
                    train_data_df = pd.concat(data, ignore_index=True)
                    train_data_df = train_data_df.set_index("idx")
                    # if f_l == 'user' and s_l =='sparse' and t_l =='single':
                    #     print(train_data_df.tail)
                    #     sys.exit()
                    train_data_df.to_csv(f"{f_l}{s_l}{t_l}.csv", mode='w')
                    if self.len == 0:
                        try:
                            self.len = train_data_df.shape[0]
                            if self.len < 1:
                                raise Exception('empty train data')
                        except Exception as e:
                            print(e)
                            sys.exit(1)
                else:
                    break
                file_member2[t_l] = train_data_df.to_numpy()
                # Keep only the columns that will be used by the model.
                bad_list = ["offset"]
                data = np.asarray(train_data_df.columns)
                new_list = np.asarray(
                    [x for x in data if x not in bad_list])
                second_layer_col_name[t_l] = new_list
                whole_second_layer_col_name[t_l] = data
            file_member1[s_l] = file_member2
            root_layer_col_name[s_l] = second_layer_col_name
            whole_root_layer_col_name[s_l] = whole_second_layer_col_name
        self._data[f_l] = file_member1
        self.col_name[f_l] = root_layer_col_name
        # Full list of column names
        self.whole_col_name[f_l] = whole_root_layer_col_name

    # label
    path = "parquet_file/partitioned_data/train/label"
    file_list = os.listdir(path)
    file_list_py = [file for file in file_list if file.endswith(".dms")]
    for file in file_list_py:  # if not empty
        if file:
            with open(f'{path}/{file}', 'rb') as f:
                data.append(pd.read_parquet(f, engine='pyarrow'))
    label_df = pd.concat(data, ignore_index=True)
    label_df = label_df.set_index("idx")
    label_df = label_df.to_numpy()
    self._label = label_df
def read_langs(file_name, max_line=None):
    logging.info(("Reading lines from {}".format(file_name)))
    data = []
    contex_arr = []
    conversation_arr = []
    entity = {}
    u = None
    r = None
    with open(file_name) as fin:
        cnt_ptr = 0
        cnt_voc = 0
        max_r_len = 0
        cnt_lin = 1
        user_counter = 0
        system_counter = 0
        system_res_counter = 0
        KB_counter = 0
        dialog_counter = 0
        for line in fin:
            line = line.strip()
            if line:
                if '#' in line:
                    line = line.replace("#", "")
                    task_type = line
                    continue
                nid, line = line.split(' ', 1)
                if '\t' in line:
                    u, r, gold = line.split('\t')
                    user_counter += 1
                    system_counter += 1
                    gen_u = generate_memory(u, "$u", str(nid))
                    contex_arr += gen_u
                    conversation_arr += gen_u
                    r_index = []
                    gate = []
                    for key in r.split(' '):
                        index = [
                            loc for loc, val in enumerate(contex_arr)
                            if (val[0] == key)
                        ]
                        if (index):
                            index = max(index)
                            gate.append(1)
                            cnt_ptr += 1
                        else:
                            index = len(contex_arr)
                            gate.append(0)
                            cnt_voc += 1
                        r_index.append(index)
                        system_res_counter += 1
                    if len(r_index) > max_r_len:
                        max_r_len = len(r_index)
                    contex_arr_temp = contex_arr + [['$$$$'] * MEM_TOKEN_SIZE]
                    ent_index_calendar = []
                    ent_index_navigation = []
                    ent_index_weather = []
                    gold = ast.literal_eval(gold)
                    if task_type == "weather":
                        ent_index_weather = gold
                    elif task_type == "schedule":
                        ent_index_calendar = gold
                    elif task_type == "navigate":
                        ent_index_navigation = gold
                    ent_index = list(
                        set(ent_index_calendar + ent_index_navigation +
                            ent_index_weather))
                    data.append([
                        contex_arr_temp, r, r_index, gate, ent_index,
                        list(set(ent_index_calendar)),
                        list(set(ent_index_navigation)),
                        list(set(ent_index_weather)),
                        list(conversation_arr)
                    ])
                    gen_r = generate_memory(r, "$s", str(nid))
                    contex_arr += gen_r
                    conversation_arr += gen_r
                else:
                    KB_counter += 1
                    r = line
                    for e in line.split(' '):
                        entity[e] = 0
                    contex_arr += generate_memory(r, "", str(nid))
            else:
                cnt_lin += 1
                entity = {}
                if (max_line and cnt_lin >= max_line):
                    break
                contex_arr = []
                conversation_arr = []
                dialog_counter += 1
    max_len = max([len(d[0]) for d in data])
    logging.info("Pointer percentage= {} ".format(cnt_ptr / (cnt_ptr + cnt_voc)))
    logging.info("Max response Len: {}".format(max_r_len))
    logging.info("Max Input Len: {}".format(max_len))
    logging.info("Avg. User Utterances: {}".format(user_counter * 1.0 / dialog_counter))
    logging.info("Avg. Bot Utterances: {}".format(system_counter * 1.0 / dialog_counter))
    logging.info("Avg. KB results: {}".format(KB_counter * 1.0 / dialog_counter))
    logging.info("Avg. response Len: {}".format(system_res_counter * 1.0 / system_counter))
    print('Sample: ', data[1][0], data[1][1], data[1][2], data[1][3], data[1][4])
    return data, max_len, max_r_len
vgg = VGG()
vgg.load_weights("vgg16-00b39a1b.pth")
vgg.cuda()
vgg.eval()

print("load data")
imsize = 32
data = []
for i in range(100):
    l = os.listdir("data/" + str(i))
    l.sort()
    for f in l:
        data.append((
            np.asarray(
                PIL.Image.open("data/" + str(i) + "/" + f).convert("RGB").copy()),
            i,
            "data/" + str(i) + "/" + f,
        ))

print("extract features")
batchsize = 100
featurefile = open("featurefile.txt", "w")


def forwarddata():
    for i in range(0, len(data) + 1 - batchsize, batchsize):
        batchlabel = np.zeros(batchsize, dtype=int)
        batchimage = np.zeros((batchsize, 3, imsize, imsize), dtype=float)
        for j in range(batchsize):
            image, label, name = data[i + j]
def next_batch(self, train=True): data = [] label = [] if train: remaining = self.source_size - self.source_id start = self.source_id if remaining <= self.source_batch_size: for i in self.source_list[start:]: data.append(self.source_text[i, :]) label.append(self.label_source[i, :]) self.source_id += 1 self.source_list = random.sample(range(self.source_size), self.source_size) self.source_id = 0 for i in self.source_list[0:(self.source_batch_size - remaining)]: data.append(self.source_text[i, :]) label.append(self.label_source[i, :]) self.source_id += 1 else: for i in self.source_list[start:start + self.source_batch_size]: data.append(self.source_text[i, :]) label.append(self.label_source[i, :]) self.source_id += 1 remaining = self.target_size - self.target_id start = self.target_id if remaining <= self.target_batch_size: for i in self.target_list[start:]: data.append(self.target_text[i, :]) # no target label #label.append(self.label_target[i, :]) self.target_id += 1 self.target_list = random.sample(range(self.target_size), self.target_size) self.target_id = 0 for i in self.target_list[0:self.target_batch_size - remaining]: data.append(self.target_text[i, :]) #label.append(self.label_target[i, :]) self.target_id += 1 else: for i in self.target_list[start:start + self.target_batch_size]: data.append(self.target_text[i, :]) #label.append(self.label_target[i, :]) self.target_id += 1 else: remaining = self.val_size - self.val_id start = self.val_id if remaining <= self.val_batch_size: for i in self.val_list[start:]: data.append(self.val_text[i, :]) label.append(self.label_val[i, :]) self.val_id += 1 self.val_list = random.sample(range(self.val_size), self.val_size) self.val_id = 0 for i in self.val_list[0:self.val_batch_size - remaining]: data.append(self.val_text[i, :]) label.append(self.label_val[i, :]) self.val_id += 1 else: for i in self.val_list[start:start + self.val_batch_size]: data.append(self.val_text[i, :]) label.append(self.label_val[i, :]) self.val_id += 1 data = self.scaler.transform(np.vstack(data)) label = np.vstack(label) return torch.from_numpy(data).float(), torch.from_numpy(label).float()
def __init__(self, dir_path, transforms=None):
    self.dir_path = dir_path
    imgs = []
    paths = []
    anger_path = os.path.join(dir_path, '0')
    disgust_path = os.path.join(dir_path, '1')
    fear_path = os.path.join(dir_path, '2')
    happy_path = os.path.join(dir_path, '3')
    sad_path = os.path.join(dir_path, '4')
    surprise_path = os.path.join(dir_path, '5')
    neutral_path = os.path.join(dir_path, '6')
    paths.append(anger_path)
    paths.append(disgust_path)
    paths.append(fear_path)
    paths.append(happy_path)
    paths.append(sad_path)
    paths.append(surprise_path)
    paths.append(neutral_path)
    image_num = 0
    num0 = 0
    num1 = 0
    num2 = 0
    num3 = 0
    num4 = 0
    num5 = 0
    num6 = 0
    for i in range(7):
        gap = 5
        sequences = os.listdir(paths[i])
        sequences.sort()
        for sequence in sequences:
            txt_path = os.path.join(paths[i], sequence)
            data = []
            img_paths = []
            for line in open(txt_path, "r"):  # open the list file and read it line by line
                data.append(line[:-1])  # append each line to the list
            for k in range(len(data)):
                if k == 0:
                    img_paths.append(data[k][2:])
                else:
                    img_paths.append(data[k][1:])
                temp = img_paths[k]
                temp = temp.replace('\\', '/')  # normalize the slash direction
                img_paths[k] = temp
            for id in range(len(img_paths)):
                if id % gap == 0:
                    # img_p = os.path.join('/home/ubuntu/Code/data/AffWild2/', img_paths[id])
                    img_p = os.path.join('/home/ubuntu/Code/data/', img_paths[id])
                    img = Image.open(os.path.join(img_p)).convert('RGB')
                    image_num += 1
                    imgs.append((img, i))  # imgs holds (image, label) samples
                    if image_num % 1000 == 0:
                        print(image_num)
    print('********************** total images:', image_num)
    self.imgs = imgs
    self.transform = transforms
def read_langs2(source_text, utterance, gating_dict, SLOTS, dataset, lang, mem_lang, sequicity, training, max_line=None): data = [] max_resp_len, max_value_len = 0, 0 domain_counter = {} if 1 == 1: # dials = json.load(f) dials = [] # create vocab first for dial_dict in dials: if (args["all_vocab"] or dataset == "train") and training: assert True == False for ti, turn in enumerate(dial_dict["dialogue"]): lang.index_words(turn["system_transcript"], 'utter') lang.index_words(turn["transcript"], 'utter') # determine training data ratio, default is 100% if training and dataset == "train" and args["data_ratio"] != 100: random.Random(10).shuffle(dials) dials = dials[:int(len(dials) * 0.01 * args["data_ratio"])] cnt_lin = 1 for dial_dict in ['placeholder']: dialog_history = source_text last_belief_dict = {} # Filtering and counting domains # for domain in dial_dict["domains"]: # if domain not in EXPERIMENT_DOMAINS: # continue # if domain not in domain_counter.keys(): # domain_counter[domain] = 0 # domain_counter[domain] += 1 # Unseen domain setting # if args["only_domain"] != "" and args["only_domain"] not in dial_dict["domains"]: # continue # if (args["except_domain"] != "" and dataset == "test" and args["except_domain"] not in dial_dict[ # "domains"]) or \ # (args["except_domain"] != "" and dataset != "test" and [args["except_domain"]] == dial_dict[ # "domains"]): # continue # Reading data for ti, turn in enumerate(['placeholder']): turn_domain = '' turn_id = '0' turn_uttr = utterance turn_uttr_strip = turn_uttr.strip() dialog_history += source_text source_text = dialog_history.strip() turn_belief_dict = {} # Generate domain-dependent slot list slot_temp = SLOTS # if dataset == "test": # if args["except_domain"] != "": # slot_temp = [k for k in SLOTS if args["except_domain"] in k] # turn_belief_dict = OrderedDict( # [(k, v) for k, v in turn_belief_dict.items() if args["except_domain"] in k]) # elif args["only_domain"] != "": # slot_temp = [k for k in SLOTS if args["only_domain"] in k] # turn_belief_dict = OrderedDict( # [(k, v) for k, v in turn_belief_dict.items() if args["only_domain"] in k]) # turn_belief_list = [str(k) + '-' + str(v) for k, v in turn_belief_dict.items()] turn_belief_list = [] # if (args["all_vocab"] or dataset == "train") and training: # mem_lang.index_words(turn_belief_dict, 'belief') class_label, generate_y, slot_mask, gating_label = [], [], [], [] start_ptr_label, end_ptr_label = [], [] # for slot in slot_temp: # if slot in turn_belief_dict.keys(): # generate_y.append(turn_belief_dict[slot]) # # if turn_belief_dict[slot] == "dontcare": # gating_label.append(gating_dict["dontcare"]) # elif turn_belief_dict[slot] == "none": # gating_label.append(gating_dict["none"]) # else: # gating_label.append(gating_dict["ptr"]) # # if max_value_len < len(turn_belief_dict[slot]): # max_value_len = len(turn_belief_dict[slot]) # # else: # generate_y.append("none") # gating_label.append(gating_dict["none"]) gating_label = [2] * 80 generate_y = ['none'] * 80 # 可以根据ID和turn_idx将内容复原 data_detail = { "ID": "0", "domains": [], "turn_domain": "", "turn_id": 0, "dialog_history": source_text, "turn_belief": [], "gating_label": gating_label, "turn_uttr": turn_uttr_strip, 'generate_y': generate_y } data.append(data_detail) if max_resp_len < len(source_text.split()): max_resp_len = len(source_text.split()) cnt_lin += 1 if (max_line and cnt_lin >= max_line): break # add t{} to the lang file # if "t{}".format(max_value_len - 1) not in mem_lang.word2index.keys() and training: # for time_i in 
range(max_value_len): # mem_lang.index_words("t{}".format(time_i), 'utter') # print("domain_counter", domain_counter) return data, max_resp_len, slot_temp
def calc_gradients_wrt_output_whole_network_all_tasks( self, loader, out_path, if_pretrained_imagenet=False, layers=layers_bn_afterrelu, neuron_nums=[ 64, 64, 64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512 ], if_rename_layers=True): print( "Warning! Assume that loader returns in i-th batch only instances of i-th class" ) # for model in self.model.values(): # model.zero_grad() # model.eval() # target_layer_names = [layer.replace('_', '.') for layer in layers]#layers_bn_afterrelu] #+ ['feature_extractor'] target_layer_names = [ layer.replace('_', '.') if if_rename_layers else layer for layer in layers ] # neuron_nums = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] def save_activation(activations, name, mod, inp, out): if name in target_layer_names: if_out_is_list = type(out) == list #backbone output if if_out_is_list: out = out[0] #single-head cifar # print(out.shape) out.requires_grad_(True) # if 'bn1' in name: # out = F.relu(out) out.retain_grad() activations[name] = out if if_out_is_list: out = [out] return out activations = {} hooks = [] for name, m in self.feature_extractor.named_modules(): if name in target_layer_names: hooks.append( m.register_forward_hook( partial(save_activation, activations, name))) hooks.append( self.feature_extractor.register_forward_hook( partial(save_activation, activations, 'feature_extractor'))) layer_names_for_pd = [] neuron_indices_for_pd = [] mean_grads_for_pd = defaultdict(list) if_already_saved_layer_names_and_neuron_indices = False n_classes = 10 if if_pretrained_imagenet: n_classes = 1000 iter_loader = iter(loader) for cond_idx in range(n_classes): print(cond_idx) batch = next(iter_loader) cur_grads = defaultdict( lambda: defaultdict(list) ) # layer -> neuron -> grads from every batch (i.e. 1 scalar per batch) ims, labels = batch if False: mask = ( labels == cond_idx ) #(labels != cond_idx)#np.array([True] * len(labels))# print(labels) ims_masked = ims[mask, ...] 
ims_masked = ims_masked.cuda() else: ims_masked = ims.cuda() print(labels) out = self.feature_extractor(ims_masked) if not if_pretrained_imagenet: #single-headed y = out[0] out_cond = self.model['all'].linear(y) out_cond[:, cond_idx].sum().backward() else: out[:, cond_idx].sum().backward() for layer_name in target_layer_names: print(layer_name) layer_grad = activations[layer_name].grad.detach().cpu() n_neurons = neuron_nums[target_layer_names.index(layer_name)] # print(layer_grad.shape[1], n_neurons) for target_neuron in range(n_neurons): cur_grad = layer_grad[:, target_neuron] try: cur_grad = cur_grad.mean(axis=(-1, -2)) except: pass # cur_grad = np.sign(cur_grad) # cur_grad[cur_grad < 0] = 0 cur_grad = cur_grad.mean().item() cur_grads[layer_name][target_neuron].append(cur_grad) if not if_already_saved_layer_names_and_neuron_indices: layer_names_for_pd.append(layer_name) neuron_indices_for_pd.append(target_neuron) activations[layer_name].grad.zero_() if_already_saved_layer_names_and_neuron_indices = True # is set after the first batch of the first cond_idx for layer_name in target_layer_names: n_neurons = neuron_nums[target_layer_names.index(layer_name)] for target_neuron in range(n_neurons): grad_meaned = np.mean(cur_grads[layer_name][target_neuron]) mean_grads_for_pd[cond_idx].append(grad_meaned) for hook in hooks: hook.remove() data = [] for i in range(len(neuron_indices_for_pd)): data.append([layer_names_for_pd[i], neuron_indices_for_pd[i]] + [mg[i] for mg in mean_grads_for_pd.values()]) df = pd.DataFrame(data, columns=['layer_name', 'neuron_idx'] + list(range(n_classes))) df.to_pickle(out_path) return df
def init(root, num_query, num_train):
    data_list = [
        'data_batch_1',
        'data_batch_2',
        'data_batch_3',
        'data_batch_4',
        'data_batch_5',
        'test_batch',
    ]
    base_folder = 'cifar-10-batches-py'

    data = []
    targets = []
    for file_name in data_list:
        file_path = os.path.join(root, base_folder, file_name)
        with open(file_path, 'rb') as f:
            if sys.version_info[0] == 2:
                entry = pickle.load(f)
            else:
                entry = pickle.load(f, encoding='latin1')
            data.append(entry['data'])
            if 'labels' in entry:
                targets.extend(entry['labels'])
            else:
                targets.extend(entry['fine_labels'])

    data = np.vstack(data).reshape(-1, 3, 32, 32)
    data = data.transpose((0, 2, 3, 1))  # convert to HWC
    targets = np.array(targets)

    CIFAR10.ALL_IMG = data
    CIFAR10.ALL_TARGETS = targets

    # sort by class
    sort_index = CIFAR10.ALL_TARGETS.argsort()
    CIFAR10.ALL_IMG = CIFAR10.ALL_IMG[sort_index, :]
    CIFAR10.ALL_TARGETS = CIFAR10.ALL_TARGETS[sort_index]

    # (num_query / number of class) query images per class
    # (num_train / number of class) train images per class
    query_per_class = num_query // 10
    train_per_class = num_train // 10

    # permutate index (range 0 - 6000 per class)
    perm_index = np.random.permutation(CIFAR10.ALL_IMG.shape[0] // 10)
    query_index = perm_index[:query_per_class]
    train_index = perm_index[query_per_class:query_per_class + train_per_class]

    query_index = np.tile(query_index, 10)
    train_index = np.tile(train_index, 10)

    inc_index = np.array(
        [i * (CIFAR10.ALL_IMG.shape[0] // 10) for i in range(10)])
    query_index = query_index + inc_index.repeat(query_per_class)
    train_index = train_index + inc_index.repeat(train_per_class)

    # split data, tags
    CIFAR10.QUERY_IMG = CIFAR10.ALL_IMG[query_index, :]
    CIFAR10.QUERY_TARGETS = CIFAR10.ALL_TARGETS[query_index]
    CIFAR10.TRAIN_IMG = CIFAR10.ALL_IMG[train_index, :]
    CIFAR10.TRAIN_TARGETS = CIFAR10.ALL_TARGETS[train_index]
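# A minimal usage sketch for init() above, assuming the extracted
# "cifar-10-batches-py" folder lives under `root` (the path and split sizes
# here are illustrative only).
root = './data'
init(root, num_query=1000, num_train=5000)

# init() populates class-level arrays on CIFAR10:
#   QUERY_IMG -> (1000, 32, 32, 3) HWC images, 100 per class
#   TRAIN_IMG -> (5000, 32, 32, 3), 500 per class
print(CIFAR10.QUERY_IMG.shape, CIFAR10.TRAIN_IMG.shape)
print(CIFAR10.QUERY_TARGETS[:5], CIFAR10.TRAIN_TARGETS[:5])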
def read_langs(file_name, max_line = None): print(("Reading lines from {}".format(file_name))) data, context_arr, kb_arr, kb_id = [], [], [], [] max_resp_len = 0 with open('data/KVR/kvret_entities.json') as f: global_entity = json.load(f) global_entity_list = {} for key in global_entity.keys(): if key != 'poi': if key not in global_entity_list: global_entity_list[key] = [] global_entity_list[key] += [item.lower().replace(' ', '_') for item in global_entity[key]] else: #global_entity_list['poi'] = [d['poi'].lower().replace(' ', '_') for d in global_entity['poi']] for item in global_entity['poi']: for k in item.keys(): if k == "type": continue if k not in global_entity_list: global_entity_list[k] = [] global_entity_list[k] += [item[k].lower().replace(' ', '_')] #global_entity_list['poi'] = [item[k].lower().replace(' ', '_') for k in item.keys()] with open(file_name) as fin: cnt_lin, sample_counter = 1, 1 for line in fin: line = line.strip() if line: if '#' in line: line = line.replace("#","") task_type = line continue nid, line = line.split(' ', 1) if '\t' in line: u, r, gold_ent = line.split('\t') context_arr.append(u.split(' ')) # Get gold entity for each domain gold_ent = ast.literal_eval(gold_ent) ent_idx_cal, ent_idx_nav, ent_idx_wet = [], [], [] if task_type == "weather": ent_idx_wet = gold_ent elif task_type == "schedule": ent_idx_cal = gold_ent elif task_type == "navigate": ent_idx_nav = gold_ent ent_index = list(set(ent_idx_cal + ent_idx_nav + ent_idx_wet)) # Get entity set entity_set, entity_type_set = generate_entity_set(kb_arr) entity_set, entity_type_set = generate_entity_from_context(context_arr, global_entity_list, entity_set, entity_type_set) # Get local pointer position for each word in system response ptr_index = [] for key in r.split(): if key in entity_set: index = entity_set.index(key) else: index = len(entity_set) ptr_index.append(index) sketch_response = generate_template(global_entity_list, r, gold_ent, entity_set, entity_type_set, task_type) #add empty token if len(entity_set) == 0: entity_set.append("$$$$") entity_type_set.append("empty_token") entity_set.append("$$$$") entity_type_set.append("empty_token") #generate indicator indicator = generate_indicator(context_arr, entity_set) #generate graph graph = generate_graph(entity_set, relation_set, kb_arr) data_detail = { 'context_arr':list(context_arr), 'kb_arr':list(entity_set), 'response':r.split(' '), 'sketch_response':sketch_response.split(' '), 'ptr_index':ptr_index+[len(entity_set) - 1], 'indicator':indicator, 'ent_index':ent_index, 'ent_idx_cal':list(set(ent_idx_cal)), 'ent_idx_nav':list(set(ent_idx_nav)), 'ent_idx_wet':list(set(ent_idx_wet)), 'id':int(sample_counter), 'ID':int(cnt_lin), 'domain':task_type, 'graph':graph} data.append(data_detail) context_arr.append(r.split(' ')) if max_resp_len < len(r.split()): max_resp_len = len(r.split()) sample_counter += 1 else: kb_id.append(nid) kb_info = line.split(' ') kb_arr.append(kb_info) if len(kb_info) != 5: print(kb_info) else: cnt_lin += 1 context_arr, kb_arr, kb_id = [], [], [] if(max_line and cnt_lin >= max_line): break return data, max_resp_len
def __getitem__(self, index): ## random select starting frame index t between [0, N - #sample_frames] N = self.num_frames[index] T = random.randint(0, N - self.opts.sample_frames) video = self.task_videos[index][0] ## load input and processed frames input_dir = os.path.join(self.opts.data_haze_dir, self.mode, "Rain_Haze", video) haze_dir = os.path.join(self.opts.data_haze_dir, self.mode, "Haze", video) gt_dir = os.path.join(self.opts.data_haze_dir, self.mode, "GT", video) alpha_dir = os.path.join(self.opts.data_haze_dir, self.mode, "Alpha", video) trans_dir = os.path.join(self.opts.data_haze_dir, self.mode, "Trans", video) ## sample from T to T + #sample_frames - 1 frame_i = [] frame_h = [] frame_a = [] frame_t = [] frame_g = [] for t in range(T + 1, T + self.opts.sample_frames + 1): frame_i.append( utils.read_img(os.path.join(input_dir, "%d.jpg" % t))) frame_h.append(utils.read_img(os.path.join(haze_dir, "%d.jpg" % t))) frame_a.append( utils.read_img(os.path.join(alpha_dir, "%d.jpg" % t))) frame_t.append( utils.read_img(os.path.join(trans_dir, "%d.jpg" % t))) frame_g.append(utils.read_img(os.path.join(gt_dir, "%d.jpg" % t))) ## data augmentation if self.mode == 'train': if self.opts.geometry_aug: ## random scale H_in = frame_i[0].shape[0] W_in = frame_i[0].shape[1] sc = np.random.uniform(self.opts.scale_min, self.opts.scale_max) H_out = int(math.floor(H_in * sc)) W_out = int(math.floor(W_in * sc)) ## scaled size should be greater than opts.crop_size if H_out < W_out: if H_out < self.opts.crop_size: H_out = self.opts.crop_size W_out = int( math.floor(W_in * float(H_out) / float(H_in))) else: ## W_out < H_out if W_out < self.opts.crop_size: W_out = self.opts.crop_size H_out = int( math.floor(H_in * float(W_out) / float(W_in))) for t in range(self.opts.sample_frames): frame_i[t] = cv2.resize(frame_i[t], (W_out, H_out)) frame_h[t] = cv2.resize(frame_h[t], (W_out, H_out)) frame_a[t] = cv2.resize(frame_a[t], (W_out, H_out)) frame_t[t] = cv2.resize(frame_t[t], (W_out, H_out)) frame_g[t] = cv2.resize(frame_g[t], (W_out, H_out)) ## random crop cropper = RandomCrop(frame_i[0].shape[:2], (self.opts.crop_size, self.opts.crop_size)) for t in range(self.opts.sample_frames): frame_i[t] = cropper(frame_i[t]) frame_h[t] = cropper(frame_h[t]) frame_a[t] = cropper(frame_a[t]) frame_t[t] = cropper(frame_t[t]) frame_g[t] = cropper(frame_g[t]) if self.opts.geometry_aug: ### random rotate #rotate = random.randint(0, 3) #if rotate != 0: # for t in range(self.opts.sample_frames): # frame_i[t] = np.rot90(frame_i[t], rotate) # frame_p[t] = np.rot90(frame_p[t], rotate) ## horizontal flip if np.random.random() >= 0.5: for t in range(self.opts.sample_frames): frame_i[t] = cv2.flip(frame_i[t], flipCode=0) frame_h[t] = cv2.flip(frame_h[t], flipCode=0) frame_t[t] = cv2.flip(frame_t[t], flipCode=0) frame_a[t] = cv2.flip(frame_a[t], flipCode=0) frame_g[t] = cv2.flip(frame_g[t], flipCode=0) if self.opts.order_aug: ## reverse temporal order if np.random.random() >= 0.5: frame_i.reverse() frame_h.reverse() frame_a.reverse() frame_t.reverse() frame_g.reverse() elif self.mode == "test": ## resize image to avoid size mismatch after downsampline and upsampling H_i = frame_i[0].shape[0] W_i = frame_i[0].shape[1] H_o = int( math.ceil(float(H_i) / self.opts.size_multiplier) * self.opts.size_multiplier) W_o = int( math.ceil(float(W_i) / self.opts.size_multiplier) * self.opts.size_multiplier) for t in range(self.opts.sample_frames): frame_i[t] = cv2.resize(frame_i[t], (W_o, H_o)) frame_h[t] = cv2.resize(frame_h[t], (W_o, H_o)) 
frame_a[t] = cv2.resize(frame_a[t], (W_o, H_o)) frame_t[t] = cv2.resize(frame_t[t], (W_o, H_o)) frame_g[t] = cv2.resize(frame_g[t], (W_o, H_o)) else: raise Exception("Unknown mode (%s)" % self.mode) ### convert (H, W, C) array to (C, H, W) tensor data = [] for t in range(self.opts.sample_frames): data.append( torch.from_numpy(frame_i[t].transpose(2, 0, 1).astype( np.float32)).contiguous()) data.append( torch.from_numpy(frame_h[t].transpose(2, 0, 1).astype( np.float32)).contiguous()) data.append( torch.from_numpy(frame_a[t].transpose(2, 0, 1).astype( np.float32)).contiguous()) data.append( torch.from_numpy(frame_t[t].transpose(2, 0, 1).astype( np.float32)).contiguous()) data.append( torch.from_numpy(frame_g[t].transpose(2, 0, 1).astype( np.float32)).contiguous()) return data
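# Hedged usage sketch: regroup the flat list returned by __getitem__ above back into the five
# per-frame streams. The stream order (input, haze, alpha, trans, gt) mirrors the append order
# in the code; `sample_frames` is the same option used there. The helper name is illustrative.
def split_streams(data, sample_frames):
    names = ["input", "haze", "alpha", "trans", "gt"]
    streams = {name: [] for name in names}
    for t in range(sample_frames):
        for k, name in enumerate(names):
            streams[name].append(data[t * len(names) + k])
    return streams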
def __getitem__(self, index): if self.training: index_ratio = int(self.ratio_index[index]) else: index_ratio = index # get the anchor index for current sample index # here we set the anchor index to the last one # sample in this group #index = 32014 # temp hack for testing case where crop excluded gt boxes minibatch_db = self._roidb[index_ratio] blobs = [] data = [] padding_data = [] im_info = [] data_heights = [] data_widths = [] gt_boxes = [] gt_boxes_padding = [] num_boxes = [] # check for duplicate tracks within same frame assert len(minibatch_db[0]['track_id']) == len(np.unique(minibatch_db[0]['track_id'])), \ 'Cannot have >1 track with same id in same frame.' assert len(minibatch_db[1]['track_id']) == len(np.unique(minibatch_db[1]['track_id'])), \ 'Cannot have >1 track with same id in same frame.' # Iterate through each entry in the sample tuple for ientry, entry in enumerate(minibatch_db): blobs.append(get_minibatch([entry], self._num_classes)) data.append(torch.from_numpy(blobs[ientry]['data'])) im_info.append(torch.from_numpy(blobs[ientry]['im_info'])) data_heights.append(data[ientry].size(1)) data_widths.append(data[ientry].size(2)) # random shuffle the bounding boxes #np.random.shuffle(blobs[ientry]['gt_boxes']) if not self.training and blobs[ientry]['gt_boxes'].shape[0] == 0: blobs[ientry]['gt_boxes'] = np.ones((1, 6), dtype=np.float32) gt_boxes.append(torch.from_numpy(blobs[ientry]['gt_boxes'])) if self.training: ######################################################## # padding the input image to fixed size for each group # ######################################################## # if the image needs to be cropped, crop to the target size ratio = self.ratio_list_batch[index] if self._roidb[index_ratio][0]['need_crop']: if ratio < 1.: # this means that data_width << data_height and we crop the height min_y = int(torch.min(gt_boxes[ientry][:, 1])) max_y = int(torch.max(gt_boxes[ientry][:, 3])) trim_size = int(np.floor(data_widths[ientry] / ratio)) if trim_size > data_heights[ientry]: trim_size = data_heights[ientry] box_region = max_y - min_y + 1 if min_y == 0: y_s = 0 else: if (box_region - trim_size) < 0: y_s_min = max(max_y - trim_size, 0) y_s_max = min(min_y, data_heights[ientry] - trim_size) if y_s_min == y_s_max: y_s = y_s_min else: y_s = np.random.choice( range(y_s_min, y_s_max)) else: y_s_add = int((box_region - trim_size) / 2) if y_s_add == 0: y_s = min_y else: y_s = np.random.choice( range(min_y, min_y + y_s_add)) # crop the image data[ientry] = data[ientry][:, y_s:(y_s + trim_size), :, :] # shift y coordiante of gt_boxes gt_boxes[ientry][:, 1] = gt_boxes[ientry][:, 1] - float(y_s) gt_boxes[ientry][:, 3] = gt_boxes[ientry][:, 3] - float(y_s) # update gt bounding box according to trim gt_boxes[ientry][:, 1].clamp_(0, trim_size - 1) gt_boxes[ientry][:, 3].clamp_(0, trim_size - 1) else: # data_width >> data_height so crop width min_x = int(torch.min(gt_boxes[ientry][:, 0])) max_x = int(torch.max(gt_boxes[ientry][:, 2])) trim_size = int(np.ceil(data_heights[ientry] * ratio)) if trim_size > data_widths[ientry]: trim_size = data_widths[ientry] box_region = max_x - min_x + 1 if min_x == 0: x_s = 0 else: if (box_region - trim_size) < 0: x_s_min = max(max_x - trim_size, 0) x_s_max = min(min_x, data_widths[ientry] - trim_size) if x_s_min == x_s_max: x_s = x_s_min else: x_s = np.random.choice( range(x_s_min, x_s_max)) else: x_s_add = int((box_region - trim_size) / 2) if x_s_add == 0: x_s = min_x else: x_s = np.random.choice( range(min_x, min_x + x_s_add)) # crop the image 
data[ientry] = data[ientry][:, :, x_s:(x_s + trim_size), :] # shift x coordiante of gt_boxes[ientry] gt_boxes[ientry][:, 0] = gt_boxes[ientry][:, 0] - float(x_s) gt_boxes[ientry][:, 2] = gt_boxes[ientry][:, 2] - float(x_s) # update gt bounding box according the trip gt_boxes[ientry][:, 0].clamp_(0, trim_size - 1) gt_boxes[ientry][:, 2].clamp_(0, trim_size - 1) # based on the ratio, pad the image. if ratio < 1: # data_width < data_height trim_size = int(np.floor(data_widths[ientry] / ratio)) padding_data.append(torch.FloatTensor(int(np.ceil(data_widths[ientry] / ratio)),\ data_widths[ientry], 3).zero_()) padding_data[ientry][:data_heights[ientry], :, :] = data[ ientry][0] im_info[ientry][0, 0] = padding_data[ientry].size(0) elif ratio > 1: # data_width > data_height padding_data.append(torch.FloatTensor(data_heights[ientry],\ int(np.ceil(data_heights[ientry] * ratio)), 3).zero_()) padding_data[ientry][:, :data_widths[ientry], :] = data[ ientry][0] im_info[ientry][0, 1] = padding_data[ientry].size(1) else: trim_size = min(data_heights[ientry], data_widths[ientry]) padding_data.append( torch.FloatTensor(trim_size, trim_size, 3).zero_()) padding_data[ientry] = data[ientry][ 0][:trim_size, :trim_size, :] # gt_boxes[ientry].clamp_(0, trim_size) gt_boxes[ientry][:, :4].clamp_(0, trim_size) im_info[ientry][0, 0] = trim_size im_info[ientry][0, 1] = trim_size # check the bounding box: not_keep = (gt_boxes[ientry][:,0] \ == gt_boxes[ientry][:,2]) | (gt_boxes[ientry][:,1] == gt_boxes[ientry][:,3]) keep = torch.nonzero(not_keep == 0).view(-1) gt_boxes_padding.append( torch.FloatTensor(self.max_num_box, gt_boxes[ientry].size(1)).zero_()) if keep.numel() != 0: gt_boxes[ientry] = gt_boxes[ientry][keep] num_boxes.append( torch.LongTensor( [min(gt_boxes[ientry].size(0), self.max_num_box)]).cuda()) curr_num_boxes = int(num_boxes[ientry][0]) gt_boxes_padding[ientry][:curr_num_boxes, :] = gt_boxes[ ientry][:curr_num_boxes] else: num_boxes.append(torch.LongTensor(1).cuda().zero_()) # permute trim_data to adapt to downstream processing padding_data[ientry] = padding_data[ientry].squeeze(0).permute( 2, 0, 1).contiguous() padding_data[ientry] = padding_data[ientry].unsqueeze(0) #im_info[ientry] = im_info[ientry].view(3) gt_boxes_padding[ientry] = gt_boxes_padding[ientry].unsqueeze( 0) num_boxes[ientry] = num_boxes[ientry].unsqueeze(0) #return padding_data, im_info, gt_boxes_padding, num_boxes else: data[ientry] = data[ientry].permute(0, 3, 1, 2).contiguous().\ view(3, data_heights[ientry], data_widths[ientry]) data[ientry] = data[ientry].unsqueeze(0) #im_info[ientry] = im_info[ientry].view(3) #gt_boxes.append(torch.FloatTensor([1,1,1,1,1])) gt_boxes_padding.append( torch.FloatTensor(self.max_num_box, gt_boxes[ientry].size(1)).zero_()) #gt_boxes[ientry] = gt_boxes[ientry].unsqueeze(0) num_boxes.append( torch.LongTensor( [min(gt_boxes[ientry].size(0), self.max_num_box)]).cuda()) #num_boxes.append(torch.LongTensor(1).cuda().zero_()) num_boxes[ientry] = num_boxes[ientry].unsqueeze(0) curr_num_boxes = int(num_boxes[ientry][0]) gt_boxes_padding[ientry][:curr_num_boxes, :] = gt_boxes[ ientry][:curr_num_boxes] gt_boxes_padding[ientry] = gt_boxes_padding[ientry].unsqueeze( 0) #return data, im_info, gt_boxes, num_boxes if _DEBUG: if self.training: print(gt_boxes_padding[ientry]) print(padding_data[ientry].size()) self._plot_image(padding_data[ientry].permute(0, 2, 3, 1), gt_boxes_padding[ientry], num_boxes[ientry]) else: print(gt_boxes[ientry]) print(data[ientry].size()) self._plot_image(data[ientry].permute(0, 2, 3, 1), 
gt_boxes[ientry], num_boxes[ientry]) im_info_pair = torch.cat(im_info, dim=0) num_boxes = torch.cat(num_boxes, dim=0) if self.training: data_pair = torch.cat(padding_data, dim=0) gt_boxes_padding_pair = torch.cat(gt_boxes_padding, dim=0) return data_pair, im_info_pair, gt_boxes_padding_pair, num_boxes else: data_pair = torch.cat(data, dim=0) gt_boxes_padding_pair = torch.cat(gt_boxes_padding, dim=0) #gt_boxes = torch.cat(gt_boxes, dim=0) return data_pair, im_info_pair, gt_boxes_padding_pair, num_boxes
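# Hedged usage sketch: iterating the detection dataset whose __getitem__ is shown above.
# `dataset` is assumed to be an instance of that class; batch_size=1 is used here because the
# padded tensor sizes vary with the sampled aspect-ratio group.
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=1, shuffle=True)
for data_pair, im_info_pair, gt_boxes_padding_pair, num_boxes in loader:
    pass  # forward/backward pass would go here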
def read_langs(file_name, entity, cand2DLidx, idx2candDL, max_line=None): logging.info(("Reading lines from {}".format(file_name))) data = [] content_arr = [] #conversation_arr = [] u = None r = None user_counter = 0 system_counter = 0 system_res_counter = 0 KB_counter = 0 dialog_counter = 0 with open(file_name) as fin: #cnt_ptr = 0 #cnt_voc = 0 max_r_len = 0 cnt_lin = 1 time_counter = 1 for line in fin: line = line.strip() if line: nid, line = line.split(' ', 1) if '\t' in line: u, r = line.split('\t') if u != '<SILENCE>': user_counter += 1 system_counter += 1 bot_action_idx = cand2DLidx[r] bot_action = idx2candDL[bot_action_idx] gen_u = generate_memory(u, "$u", str(time_counter)) content_arr += gen_u #conversation_arr += gen_u ent_query = {} ent_query_idx = {} for idx, key in enumerate(r.split(' ')): if (key in entity): index = [ loc for loc, val in enumerate(content_arr) if (val[0] == key) ] if (index): index = max(index) #cnt_ptr += 1 ent_query_idx[bot_action.split(' ') [idx]] = index ent_query[bot_action.split(' ')[idx]] = key else: print('[Wrong] Cannot find the entity') exit(1) system_res_counter += 1 if ent_query == {}: ent_query = {'UNK': '$$$$'} ent_query_idx = {'UNK': len(content_arr)} content_arr_temp = content_arr + [['$$$$'] * MEM_TOKEN_SIZE] else: content_arr_temp = content_arr # ent = [] # for key in r.split(' '): # if(key in entity): # ent.append(key) for ent in ent_query.keys(): data_item = { 'dialID': dialog_counter, 'turnID': system_counter, 'content_arr': content_arr_temp, 'bot_action': bot_action, 'bot_action_idx': bot_action_idx, 'ent_query': [ent, ent_query[ent]], 'ent_query_idx': [ent, ent_query_idx[ent]], 'gold_response': r } data.append(data_item) #data.append([content_arr_temp,r,r_index,conversation_arr,ent]) gen_r = generate_memory(r, "$s", str(time_counter)) content_arr += gen_r #conversation_arr += gen_r time_counter += 1 else: KB_counter += 1 r = line content_arr += generate_memory(r, "", "") else: cnt_lin += 1 if (max_line and cnt_lin >= max_line): break content_arr = [] content_arr_temp = [] #conversation_arr = [] time_counter = 1 dialog_counter += 1 max_len = max([len(d['content_arr']) for d in data]) logging.info("Nb of dialogs = {} ".format(dialog_counter)) #logging.info("Pointer percentace= {} ".format(cnt_ptr/(cnt_ptr+cnt_voc))) logging.info("Max responce Len: {}".format(max_r_len)) logging.info("Max Input Len: {}".format(max_len)) logging.info("Avg. User Utterances: {}".format(user_counter * 1.0 / dialog_counter)) logging.info("Avg. Bot Utterances: {}".format(system_counter * 1.0 / dialog_counter)) logging.info("Avg. KB results: {}".format(KB_counter * 1.0 / dialog_counter)) logging.info("Avg. responce Len: {}".format(system_res_counter * 1.0 / system_counter)) print('Sample: ', data[5]) return data, max_len
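# Minimal sketch of a generate_memory-style helper, inferred only from how content_arr is consumed
# above (each element is a fixed-width list whose first slot is the word). The real helper, its
# handling of KB lines, the padding token and the width default are all assumptions.
def generate_memory_sketch(sentence, speaker, time, mem_token_size=4):
    memory = []
    for word in sentence.split(' '):
        triple = [word, speaker, 't' + time]
        memory.append(triple + ['PAD'] * (mem_token_size - len(triple)))
    return memory

# e.g. generate_memory_sketch("what time is it", "$u", "1")[0] -> ['what', '$u', 't1', 'PAD']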
def read_langs(file_name, gating_dict, SLOTS, dataset, lang, mem_lang, sequicity, training, max_line=None): """ Better name it construct_vocab? In fact, this function is the front line towards original data files. The 1st step to process data files. Convert them into python data-type. Params: SLOTS: contain slots from train, dev and test max_line: set the max number of dialogs that model deals with Returns: data: list of dicts, each element (one dict) is an abstract of each turn of all the dialogs. So the content is very redundant. See line 322. max_resp_len: the maximum length of dialog history slot_temp: The same as SLOTS in most conditions. slot_temp is different from SLOTS ONLY when we do experiments on specific domains """ print(("Reading from {}".format(file_name))) data = [] max_resp_len, max_value_len = 0, 0 domain_counter = {} # distribution of domain in the datafiles with open(file_name) as f: dials = json.load(f) # create vocab first for dial_dict in dials: if (args["all_vocab"] or dataset == "train") and training: for ti, turn in enumerate(dial_dict["dialogue"]): lang.index_words(turn["system_transcript"], 'utter') lang.index_words(turn["transcript"], 'utter') # determine training data ratio, default is 100% if training and dataset == "train" and args["data_ratio"] != 100: random.Random(10).shuffle(dials) dials = dials[:int(len(dials) * 0.01 * args["data_ratio"])] cnt_lin = 1 # count the number of dialogs that have been processed for dial_dict in dials: dialog_history = "" last_belief_dict = {} # Filtering and counting domains for domain in dial_dict["domains"]: if domain not in EXPERIMENT_DOMAINS: continue if domain not in domain_counter.keys(): domain_counter[domain] = 0 domain_counter[domain] += 1 ###### # Unseen domain setting for zero-shot learning if args["only_domain"] != "" and args[ "only_domain"] not in dial_dict["domains"]: continue if (args["except_domain"] != "" and dataset == "test" and args["except_domain"] not in dial_dict["domains"]) or \ (args["except_domain"] != "" and dataset != "test" and [args["except_domain"]] == dial_dict["domains"]): continue ###### # Reading data for ti, turn in enumerate(dial_dict["dialogue"]): turn_domain = turn["domain"] turn_id = turn["turn_idx"] turn_uttr = turn["system_transcript"] + " ; " + turn[ "transcript"] turn_uttr_strip = turn_uttr.strip() dialog_history += (turn["system_transcript"] + " ; " + turn["transcript"] + " ; ") source_text = dialog_history.strip() '''Func below is very tricky. 
0_0''' turn_belief_dict = fix_general_label_error( turn["belief_state"], False, SLOTS) # Generate domain-dependent slot list slot_temp = SLOTS if dataset == "train" or dataset == "dev": if args["except_domain"] != "": slot_temp = [ k for k in SLOTS if args["except_domain"] not in k ] turn_belief_dict = OrderedDict([ (k, v) for k, v in turn_belief_dict.items() if args["except_domain"] not in k ]) elif args["only_domain"] != "": slot_temp = [ k for k in SLOTS if args["only_domain"] in k ] turn_belief_dict = OrderedDict([ (k, v) for k, v in turn_belief_dict.items() if args["only_domain"] in k ]) else: if args["except_domain"] != "": slot_temp = [ k for k in SLOTS if args["except_domain"] in k ] turn_belief_dict = OrderedDict([ (k, v) for k, v in turn_belief_dict.items() if args["except_domain"] in k ]) elif args["only_domain"] != "": slot_temp = [ k for k in SLOTS if args["only_domain"] in k ] turn_belief_dict = OrderedDict([ (k, v) for k, v in turn_belief_dict.items() if args["only_domain"] in k ]) turn_belief_list = [ str(k) + '-' + str(v) for k, v in turn_belief_dict.items() ] if (args["all_vocab"] or dataset == "train") and training: mem_lang.index_words(turn_belief_dict, 'belief') class_label, generate_y, slot_mask, gating_label = [], [], [], [] start_ptr_label, end_ptr_label = [], [] for slot in slot_temp: if slot in turn_belief_dict.keys(): generate_y.append( turn_belief_dict[slot] ) # generate_y stores the true label of values for domain-slot! # It also includes "none", so the length is fixed to len(SLOTS) '''Below is similar to the category in ProPara''' if turn_belief_dict[slot] == "dontcare": gating_label.append(gating_dict["dontcare"]) elif turn_belief_dict[slot] == "none": gating_label.append(gating_dict["none"]) else: gating_label.append(gating_dict["ptr"]) if max_value_len < len( turn_belief_dict[slot] ): # max_value_len: the maximum of number of turn_belief items across all dialogs max_value_len = len(turn_belief_dict[slot]) else: generate_y.append("none") gating_label.append(gating_dict["none"]) data_detail = { "ID": dial_dict["dialogue_idx"], "domains": dial_dict["domains"], "turn_domain": turn_domain, "turn_id": turn_id, "dialog_history": source_text, "turn_belief": turn_belief_list, "gating_label": gating_label, "turn_uttr": turn_uttr_strip, 'generate_y': generate_y } data.append( data_detail ) # data_detail is appended per turn in each dialogue. len(data)=(#average turns * #dialogs) # Each data_detail represents an primitive raw instance for training. if max_resp_len < len(source_text.split( )): # max_resp_len: the maximum length of dialog history max_resp_len = len(source_text.split()) cnt_lin += 1 # count how many dialogs there are in the datafile if (max_line and cnt_lin >= max_line): break # add t{} to the mem_lang file # todo point of this operation? if "t{}".format(max_value_len - 1) not in mem_lang.word2index.keys() and training: for time_i in range(max_value_len): mem_lang.index_words("t{}".format(time_i), 'utter') print("domain_counter", domain_counter) return data, max_resp_len, slot_temp # slot_temp is different from SLOTS if we only do experiments on specific domains
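# Hedged sketch of the gating dictionary the function above indexes into. Only the three keys
# ("ptr", "dontcare", "none") are implied by the code; the integer ids are an assumption.
gating_dict = {"ptr": 0, "dontcare": 1, "none": 2}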
def train_discriminator(dataset, train_dataset_fp=None, valid_dataset_fp=None, pretrained_model="gpt2-medium", epochs=10, batch_size=64, log_interval=10, save_model=False, cached=False, no_cuda=False, reg_type=1): device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu" print("Preprocessing {} dataset...".format(dataset)) start = time.time() if dataset == "SST": idx2class = [ "positive", "negative", "very positive", "very negative", "neutral" ] class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device, reg_type=reg_type).to(device) text = torchtext_data.Field() label = torchtext_data.Field(sequential=False) train_data, val_data, test_data = datasets.SST.splits( text, label, fine_grained=True, train_subtrees=True, ) x = [] y = [] for i in trange(len(train_data), ascii=True): seq = TreebankWordDetokenizer().detokenize( vars(train_data[i])["text"]) seq = discriminator.tokenizer.encode(seq) seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) x.append(seq) y.append(class2idx[vars(train_data[i])["label"]]) train_dataset = Dataset(x, y) test_x = [] test_y = [] for i in trange(len(test_data), ascii=True): seq = TreebankWordDetokenizer().detokenize( vars(test_data[i])["text"]) seq = discriminator.tokenizer.encode(seq) seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) test_x.append(seq) test_y.append(class2idx[vars(test_data[i])["label"]]) test_dataset = Dataset(test_x, test_y) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": class2idx, "default_class": 2, } elif dataset == "clickbait": idx2class = ["non_clickbait", "clickbait"] class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) with open("datasets/clickbait/clickbait_train_prefix.txt") as f: data = [] for i, line in enumerate(f): try: data.append(eval(line)) except: print("Error evaluating line {}: {}".format(i, line)) continue x = [] y = [] with open("datasets/clickbait/clickbait_train_prefix.txt") as f: for i, line in enumerate(tqdm(f, ascii=True)): try: d = eval(line) seq = discriminator.tokenizer.encode(d["text"]) if len(seq) < max_length_seq: seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: print( "Line {} is longer than maximum length {}".format( i, max_length_seq)) continue x.append(seq) y.append(d["label"]) except: print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size train_dataset, test_dataset = torch.utils.data.random_split( full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": class2idx, "default_class": 1, } elif dataset == "toxic": idx2class = ["non_toxic", "toxic"] class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) x = [] y = [] with open("datasets/toxic/toxic_train.txt") as f: for i, line in enumerate(tqdm(f, ascii=True)): try: d = eval(line) seq = discriminator.tokenizer.encode(d["text"]) if len(seq) < 
max_length_seq: seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: print( "Line {} is longer than maximum length {}".format( i, max_length_seq)) continue x.append(seq) y.append(int(np.sum(d["label"]) > 0)) except: print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size train_dataset, test_dataset = torch.utils.data.random_split( full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": class2idx, "default_class": 0, } else: # if dataset == "generic": # This assumes the input dataset is a TSV with the following structure: # class \t text if train_dataset_fp is None: raise ValueError("When generic dataset is selected, " "train_dataset_fp needs to be specified aswell.") if valid_dataset_fp is None: raise ValueError("When generic dataset is selected, " "valid_dataset_fp needs to be specified aswell.") discriminator = Discriminator(pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) x = [] y = [] with open(train_dataset_fp) as f: csv_reader = csv.reader(f, delimiter="\t") for i, row in enumerate(tqdm(csv_reader, ascii=True)): if row: label = float(row[0]) text = row[1] try: seq = discriminator.tokenizer.encode(text) if (len(seq) < max_length_seq): seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: print("Line {} is longer than maximum length {}". format(i, max_length_seq)) continue x.append(seq) y.append(label) except: print( "Error tokenizing line {}, skipping it".format(i)) pass train_dataset = Dataset(x, y) x = [] y = [] with open(valid_dataset_fp) as f: csv_reader = csv.reader(f, delimiter="\t") for i, row in enumerate(tqdm(csv_reader, ascii=True)): if row: label = float(row[0]) text = row[1] try: seq = discriminator.tokenizer.encode(text) if (len(seq) < max_length_seq): seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: print("Line {} is longer than maximum length {}". 
format(i, max_length_seq)) continue x.append(seq) y.append(label) except: print( "Error tokenizing line {}, skipping it".format(i)) pass test_dataset = Dataset(x, y) discriminator_meta = { "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, } end = time.time() print("Preprocessed {} data points".format( len(train_dataset) + len(test_dataset))) print("Data preprocessing took: {:.3f}s".format(end - start)) if cached: print("Building representation cache...") start = time.time() train_loader = get_cached_data_loader(train_dataset, batch_size, discriminator, shuffle=True, device=device) test_loader = get_cached_data_loader(test_dataset, batch_size, discriminator, device=device) end = time.time() print("Building representation cache took: {:.3f}s".format(end - start)) else: train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=collate_fn) if save_model: with open("{}_classifier_head_meta.json".format(dataset), "w") as meta_file: json.dump(discriminator_meta, meta_file) optimizer = optim.Adam(discriminator.parameters(), lr=0.0001) for epoch in range(epochs): start = time.time() print("\nEpoch", epoch + 1) train_epoch(discriminator=discriminator, data_loader=train_loader, optimizer=optimizer, epoch=epoch, log_interval=log_interval, device=device) evaluate_performance(data_loader=test_loader, discriminator=discriminator, device=device) end = time.time() print("Epoch took: {:.3f}s".format(end - start)) print("\nExample prediction") predict(example_sentence, discriminator, cached=cached, device=device) if save_model: # torch.save(discriminator.state_dict(), # "{}_discriminator_{}.pt".format( # args.dataset, epoch + 1 # )) torch.save( discriminator.get_classifier().state_dict(), "{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1))
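# Hedged sketch of the "generic" dataset layout the branch above expects: a TSV with one
# "class<TAB>text" row per line. The file name and rows are illustrative only.
import csv

rows = [(0, "the service was slow and the food arrived cold"),
        (1, "absolutely loved the pasta, will definitely come back")]
with open("generic_train.tsv", "w", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerows(rows)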
import numpy as np
import torch
from sklearn.manifold import TSNE
from tqdm import tqdm

# MNISTAE, trainfolder and trainloader are assumed to be defined elsewhere in the project.
ae = MNISTAE().cuda()
ae.load_state_dict(torch.load("mnist_conv_autoencoder_weights.pth"))

data = []
targets = []
n_samples = int(len(trainfolder) * 0.25)
counter = 0
for batch_x, batch_y in tqdm(trainloader):
    batch_x = batch_x.cuda().float()
    batch_x_preds = ae.encode(batch_x).detach().cpu().numpy()
    batch_y = batch_y.detach().cpu().numpy()
    for x, y in zip(batch_x_preds, batch_y):
        data.append(x.reshape(6400))  # flatten the encoder output to a 6400-d vector
        targets.append(y)
    # note: counter increments once per batch, so the loop stops after n_samples batches,
    # not after n_samples individual images
    counter += 1
    if counter >= n_samples:
        break

data = np.array(data)
targets = np.array(targets)
# keep only the first quarter of the collected embeddings before running t-SNE
data = data[:int(len(data) * 0.25)]
targets = targets[:int(len(targets) * 0.25)]
data = TSNE(n_components=2, perplexity=15, learning_rate=10, verbose=2).fit_transform(data)
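# Hedged follow-up sketch: scatter-plot the 2-D t-SNE embedding computed above, colored by label.
# Assumes `data` is the (N, 2) array returned by fit_transform and `targets` the matching labels;
# the output file name is illustrative.
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 8))
scatter = plt.scatter(data[:, 0], data[:, 1], c=targets, cmap='tab10', s=5)
plt.legend(*scatter.legend_elements(), title='digit')
plt.title('t-SNE of MNIST autoencoder codes')
plt.savefig('mnist_ae_tsne.png', dpi=150)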
def main():
    args = parser.parse_args()
    if args.seed is None:
        args.seed = random.randint(1, 10000)
    print("Random Seed: ", args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpus:
        torch.cuda.manual_seed_all(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if args.gpus is not None:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        device = 'cuda:' + str(args.gpus[0])
        cudnn.benchmark = True
    else:
        device = 'cpu'
    if args.type == 'float64':
        dtype = torch.float64
    elif args.type == 'float32':
        dtype = torch.float32
    elif args.type == 'float16':
        dtype = torch.float16
    else:
        raise ValueError('Wrong type!')  # TODO int8
    model = MobileNet2(input_size=args.input_size, scale=args.scaling)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    print(model)
    print('number of parameters: {}'.format(num_parameters))
    # TODO(3/30): have not looked at how the FLOP count is computed yet; the loader still has to be written
    print('FLOPs: {}'.format(
        flops_benchmark.count_flops(
            MobileNet2,
            args.batch_size // len(args.gpus) if args.gpus is not None else args.batch_size,
            device, dtype, args.input_size, 3, args.scaling)))
    train_loader, val_loader = get_loaders(args.dataroot, args.batch_size, args.batch_size,
                                           args.input_size, args.workers)
    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    if args.gpus is not None:
        model = torch.nn.DataParallel(model, args.gpus)
    model.to(device=device, dtype=dtype)
    criterion.to(device=device, dtype=dtype)
    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate,
                                momentum=args.momentum, weight_decay=args.decay, nesterov=True)
    # TODO(3/30): cyclical learning-rate range search, not studied yet
    if args.find_clr:
        find_bounds_clr(model, train_loader, optimizer, criterion, device, dtype,
                        min_lr=args.min_lr, max_lr=args.max_lr,
                        step_size=args.epochs_per_step * len(train_loader),
                        mode=args.mode, save_path=save_path)
        return
    if args.clr:
        scheduler = CyclicLR(optimizer, base_lr=args.min_lr, max_lr=args.max_lr,
                             step_size=args.epochs_per_step * len(train_loader), mode=args.mode)
    else:
        scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=args.gamma)
    best_test = 0
    # optionally resume from a checkpoint
    data = None
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=device)
            args.start_epoch = checkpoint['epoch'] - 1
            best_test = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        elif os.path.isdir(args.resume):
            checkpoint_path = os.path.join(args.resume, 'checkpoint.pth.tar')
            csv_path = os.path.join(args.resume, 'results.csv')
            print("=> loading checkpoint '{}'".format(checkpoint_path))
            checkpoint = torch.load(checkpoint_path, map_location=device)
            args.start_epoch = checkpoint['epoch'] - 1
            best_test = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(checkpoint_path, checkpoint['epoch']))
            data = []
            with open(csv_path) as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    data.append(row)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    if args.evaluate:
        loss, top1, top5 = test(model, val_loader, criterion, device, dtype)  # TODO
        return
    # TODO(3/30): what is this used for?
    csv_logger = CsvLogger(filepath=save_path, data=data)
    csv_logger.save_params(sys.argv, args)
    # TODO(3/30): this looks redundant; figure out what it is doing
    claimed_acc1 = None
    claimed_acc5 = None
    if args.input_size in claimed_acc_top1:
        if args.scaling in claimed_acc_top1[args.input_size]:
            claimed_acc1 = claimed_acc_top1[args.input_size][args.scaling]
            claimed_acc5 = claimed_acc_top5[args.input_size][args.scaling]
            csv_logger.write_text(
                'Claimed accuracies are: {:.2f}% top-1, {:.2f}% top-5'.format(
                    claimed_acc1 * 100., claimed_acc5 * 100.))
    train_network(args.start_epoch, args.epochs, scheduler, model, train_loader, val_loader,
                  optimizer, criterion, device, dtype, args.batch_size, args.log_interval,
                  csv_logger, save_path, claimed_acc1, claimed_acc5, best_test)
def __init__(self, path, mode, args):
    data = []
    with open(os.path.join(path, mode)) as f:
        all_lines = f.readlines()
        for line in all_lines:
            ins = json.loads(line)
            data.append(ins)
    entityMarker = EntityMarker(args)
    tot_instance = len(data)
    # load rel2id and type2id
    if os.path.exists(os.path.join(path, "rel2id.json")):
        rel2id = json.load(open(os.path.join(path, "rel2id.json")))
    else:
        raise Exception("Error: There is no `rel2id.json` in " + path + ".")
    if os.path.exists(os.path.join(path, "type2id.json")):
        type2id = json.load(open(os.path.join(path, "type2id.json")))
    else:
        print("Warning: There is no `type2id.json` in " + path + ". If you want to train the model "
              "with the `OT` or `CT` settings, please run `utils.py` first to generate `type2id.json`.")
    print("preprocess " + mode)
    # preprocess data
    self.input_ids = np.zeros((tot_instance, args.max_length), dtype=int)
    self.mask = np.zeros((tot_instance, args.max_length), dtype=int)
    self.h_pos = np.zeros((tot_instance), dtype=int)
    self.t_pos = np.zeros((tot_instance), dtype=int)
    self.h_pos_l = np.zeros((tot_instance), dtype=int)
    self.t_pos_l = np.zeros((tot_instance), dtype=int)
    self.label = np.zeros((tot_instance), dtype=int)
    for i, ins in enumerate(data):
        self.label[i] = rel2id[ins["relation"]]
        # only the "CM" tokenization below returns the end positions of the entity spans;
        # default them to 0 so the h_pos_l / t_pos_l arrays stay defined in the other modes
        ph_l, pt_l = 0, 0
        # tokenize
        if args.mode == "CM":
            ids, ph, pt, ph_l, pt_l = entityMarker.tokenize(data[i]["token"], data[i]['h']['pos'],
                                                            data[i]['t']['pos'])
        elif args.mode == "OC":
            ids, ph, pt = entityMarker.tokenize(data[i]["token"], data[i]['h']['pos'],
                                                data[i]['t']['pos'], None, None, True, True)
        elif args.mode == "CT":
            h_type = "[unused%d]" % (type2id['subj_' + ins['h']['type']] + 10)
            t_type = "[unused%d]" % (type2id['obj_' + ins['t']['type']] + 10)
            ids, ph, pt = entityMarker.tokenize(data[i]["token"], data[i]['h']['pos'],
                                                data[i]['t']['pos'], h_type, t_type)
        elif args.mode == "OM":
            head = entityMarker.tokenizer.tokenize(ins['h']['name'])
            tail = entityMarker.tokenizer.tokenize(ins['t']['name'])
            h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
            ids, ph, pt = entityMarker.tokenize_OMOT(head, tail, h_first)
        elif args.mode == "OT":
            h_type = "[unused%d]" % (type2id['subj_' + ins['h']['type']] + 10)
            t_type = "[unused%d]" % (type2id['obj_' + ins['t']['type']] + 10)
            h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
            ids, ph, pt = entityMarker.tokenize_OMOT([h_type, ], [t_type, ], h_first)
        else:
            raise Exception("No such mode! Please make sure that `mode` takes a value in {CM, OC, CT, OM, OT}.")
        length = min(len(ids), args.max_length)
        self.input_ids[i][0:length] = ids[0:length]
        self.mask[i][0:length] = 1
        self.h_pos[i] = min(ph, args.max_length - 1)
        self.t_pos[i] = min(pt, args.max_length - 1)
        self.h_pos_l[i] = min(ph_l, args.max_length)
        self.t_pos_l[i] = min(pt_l, args.max_length)
    print("The number of sentences in which the tokenizer could not find the head/tail entity is %d"
          % entityMarker.err)
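# Hedged sketch of one input line the loader above expects. The field names ("token", "h", "t",
# "pos", "type", "name", "relation") are taken from the code; the sentence and the relation label
# are made up for illustration.
import json

example = {
    "token": ["Bill", "Gates", "founded", "Microsoft", "."],
    "h": {"name": "Bill Gates", "pos": [0, 2], "type": "PERSON"},
    "t": {"name": "Microsoft", "pos": [3, 4], "type": "ORGANIZATION"},
    "relation": "org:founded_by",
}
print(json.dumps(example))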
def read_langs(file_name, max_line=None):
    print(("Reading lines from {}".format(file_name)))
    data, context_arr, conv_arr, kb_arr, domain_dict = [], [], [], [], {}
    max_resp_len = 0
    one_domain_cnt = 0
    node_list, list_object_node = [], []
    with open(file_name, encoding='utf-8') as fin:
        cnt_lin, sample_counter, node_idx = 1, 1, 0
        for line in fin:
            line = line.strip()
            if line:
                # handle the domain header line
                if line.startswith("#"):
                    flag = 0  # marks whether the next token is a domain name
                    line = line.split()
                    for a in line:
                        if a == "#":  # skip the '#' token itself
                            continue
                        if a.startswith("0"):  # this token is the domain index
                            domain_idx = int(a)
                            assert 5 >= domain_idx >= 0
                            # domain_l.append(domain_idx)
                            flag = 1
                            continue
                        if flag == 1:  # this token is the domain name
                            domain_dict[domain_idx] = a
                            assert 5 >= domains[a] >= 0
                            # domain_l.append(domains[a])
                            flag = 0
                            node_list.append([a, domain_idx, node_idx])
                            node_idx += 1
                            continue
                        dialog_id = a  # read the dialogue ID
                    domain_l = "全部"  # "全部" means "all domains"
                    continue
                # every remaining line is either a KB (entity, attribute, value) triple or a query-answer pair
                nid, line = line.split(' ', 1)
                # handle a query-answer pair
                if '\t' in line:
                    # split into user utterance / response / gold entities
                    u_seged, r_seged, gold_ent = line.split('\t')
                    # build a memory entry for every word of the user utterance
                    gen_u = generate_memory(u_seged, "$u", str(nid))
                    context_arr += gen_u
                    conv_arr += gen_u
                    for tri in gen_u:
                        node_list.append([tri, node_idx])
                        node_idx += 1
                    # Get gold entity for each domain
                    # ast.literal_eval safely converts the string back to its original
                    # list/tuple/dict form and raises if the string is not a valid literal
                    gold_ent = ast.literal_eval(gold_ent)
                    # ent_idx_restaurant, ent_idx_attraction, ent_idx_hotel = [], [], []
                    # if task_type == "restaurant":
                    #     ent_idx_restaurant = gold_ent
                    # elif task_type == "attraction":
                    #     ent_idx_attraction = gold_ent
                    # elif task_type == "hotel":
                    #     ent_idx_hotel = gold_ent
                    ent_index = list(set(gold_ent))
                    # Get local pointer position for each word in system response
                    ptr_index = []
                    for key in r_seged.split():
                        # local pointer: if the word comes from the KB/context memory, record its position
                        index = [
                            loc for loc, val in enumerate(context_arr)
                            if (val[0] == key and key in ent_index)
                        ]
                        # take the largest index; if there is none, point to the end of the context
                        if index:
                            index = max(index)
                        else:
                            index = len(context_arr)
                        ptr_index.append(index)
                    # Get global pointer labels for words in system response, the 1 in the end is for the NULL token
                    # a memory word is labelled 1 if it appears in the system response or is a gold entity,
                    # otherwise 0; the trailing 1 corresponds to the NULL token at the end of the memory
                    selector_index = [
                        1 if (word_arr[0] in ent_index or word_arr[0] in r_seged.split()) else 0
                        for word_arr in context_arr
                    ] + [1]
                    # generate the sketch response
                    sketch_response, gold_sketch = generate_template(
                        r_seged, gold_ent, kb_arr, domain_dict, node_list)
                    # if len(domain_label) < 3:
                    #     domain_label.append(RiSA_PAD_token)
                    # assert len(domain_label) == 3
                    # collect everything for this turn into one dict and append it to the data
                    data_detail = {
                        'context_arr': list(context_arr + [['$$$$'] * MEM_TOKEN_SIZE]),  # $$$$ is NULL token
                        'response': r_seged,
                        'sketch_response': sketch_response,
                        'gold_sketch': gold_sketch,
                        'ptr_index': ptr_index + [len(context_arr)],
                        'selector_index': selector_index,
                        'ent_index': ent_index,
                        'conv_arr': list(conv_arr),
                        'kb_arr': list(kb_arr),
                        'id': int(sample_counter),
                        'ID': int(cnt_lin),
                        'domain': domain_l
                    }
                    data.append(data_detail)
                    # note: one sample is generated per turn, and the gold response is appended to the context history
                    gen_r = generate_memory(r_seged, "$s", str(nid))
                    context_arr += gen_r
                    conv_arr += gen_r
                    for tri in gen_r:
                        node_list.append([tri, node_idx])
                        node_idx += 1
                    # track the longest response length
                    if max_resp_len < len(r_seged.split()):
                        max_resp_len = len(r_seged.split())
                    sample_counter += 1
                # handle a KB (entity, attribute, value) line
                else:
                    r = line
                    kb_info = generate_memory(r, "", str(nid))
                    context_arr = kb_info + context_arr
                    kb_arr += kb_info
                    node_list.extend(kb_info)
                    node_idx += 1
            else:
                cnt_lin += 1
                context_arr, conv_arr, kb_arr, node_list, domain_dict = [], [], [], [], {}
                node_idx = 0
                if max_line and cnt_lin >= max_line:
                    break
    return data, max_resp_len
def read_langs(file_name, SLOTS, dataset, lang, mem_lang, training, args): print(("Reading from {}".format(file_name))) data = [] max_len_val_per_slot = 0 max_len_slot_val = {} domain_counter = {} #count_noise = 0 sorted_domainslots = sorted(SLOTS) sorted_in_domains = [ i.split('-')[0] + "_DOMAIN" for i in sorted_domainslots ] sorted_in_slots = [i.split('-')[1] + "_SLOT" for i in sorted_domainslots] for ds in sorted_domainslots: max_len_slot_val[ds] = (1, "none") # counting none/dontcare multival_count = 0 with open(file_name) as f: dials = json.load(f) # create vocab first for dial_dict in dials: if (dataset == 'train' and training) or (args['pointer_decoder']): for ti, turn in enumerate(dial_dict["dialogue"]): lang.index_words(turn["system_transcript"], 'utter') lang.index_words(turn["transcript"], 'utter') for dial_dict in dials: last_belief_dict = {} # Filtering and counting domains for domain in dial_dict["domains"]: if domain not in domain_counter.keys(): domain_counter[domain] = 0 domain_counter[domain] += 1 # Reading data dialog_history = '' delex_dialog_history = '' prev_turn_belief_dict = {} for ti, turn in enumerate(dial_dict["dialogue"]): turn_id = turn["turn_idx"] if ti == 0: user_sent = ' SOS ' + turn["transcript"] + ' EOS ' sys_sent = '' dlx_user_sent = ' SOS ' + turn["delex_transcript"] + ' EOS ' dlx_sys_sent = '' else: sys_sent = ' SOS ' + turn["system_transcript"] + ' EOS ' user_sent = 'SOS ' + turn["transcript"] + ' EOS ' dlx_sys_sent = ' SOS ' + turn[ "delex_system_transcript"] + ' EOS ' dlx_user_sent = 'SOS ' + turn["delex_transcript"] + ' EOS ' turn_uttr = sys_sent + user_sent dialog_history += sys_sent delex_dialog_history += dlx_sys_sent dialog_history += user_sent delex_dialog_history += dlx_user_sent turn_belief_dict = fix_general_label_error( turn["belief_state"], False, SLOTS) turn_belief_dict = fix_book_slot_name(turn_belief_dict, SLOTS) turn_belief_dict, multival_count = fix_multival( turn_belief_dict, multival_count) turn_belief_dict = remove_none_value(turn_belief_dict) sorted_lenval, sorted_gates = get_sorted_lenval( sorted_domainslots, turn_belief_dict, args['slot_gating']) sorted_in_domains2, sorted_in_slots2, sorted_generate_y, sorted_in_domainslots2_index = get_sorted_generate_y( sorted_domainslots, sorted_lenval, turn_belief_dict) if args['auto_regressive']: atrg_generate_y, sorted_in_domainslots2_index = get_atrg_generate_y( sorted_domainslots, sorted_lenval, turn_belief_dict) else: atrg_generate_y = None if args['delex_his']: temp = dialog_history.split() delex_temp = delex_dialog_history.split() start_idx = [ i for i, t in enumerate(temp) if t == 'SOS' ][-1] #delex all except the last user utterance in_delex_dialog_history = ' '.join(delex_temp[:start_idx] + temp[start_idx:]) if len(in_delex_dialog_history.split()) != len( dialog_history.split()): pdb.set_trace() if (dataset == 'train' and training) or (args['pointer_decoder']): lang.index_words(in_delex_dialog_history, 'utter') turn_belief_list = [ str(k) + '-' + str(v) for k, v in turn_belief_dict.items() ] for k, v in turn_belief_dict.items(): if len(v.split()) > max_len_slot_val[k][0]: max_len_slot_val[k] = (len(v.split()), v) if dataset == 'train' and training: mem_lang.index_words(turn_belief_dict, 'belief') data_detail = { "ID": dial_dict["dialogue_idx"], "turn_id": turn_id, "dialog_history": dialog_history.strip(), "delex_dialog_history": in_delex_dialog_history.strip(), "turn_belief": turn_belief_list, "sorted_domainslots": sorted_domainslots, "turn_belief_dict": turn_belief_dict, 
"turn_uttr": turn_uttr.strip(), 'sorted_in_domains': sorted_in_domains, 'sorted_in_slots': sorted_in_slots, 'sorted_in_domains2': sorted_in_domains2, 'sorted_in_slots2': sorted_in_slots2, 'sorted_in_domainslots2_idx': sorted_in_domainslots2_index, 'sorted_lenval': sorted_lenval, 'sorted_gates': sorted_gates, 'sorted_generate_y': sorted_generate_y, 'atrg_generate_y': atrg_generate_y } data.append(data_detail) if len(sorted_lenval) > 0 and max( sorted_lenval) > max_len_val_per_slot: max_len_val_per_slot = max(sorted_lenval) prev_turn_belief_dict = turn_belief_dict print("domain_counter", domain_counter) print("multival_count", multival_count) return data, SLOTS, max_len_val_per_slot, max_len_slot_val
def read_langs(file_name, max_line=None):
    logging.info(("Reading lines from {}".format(file_name)))
    # Read the file and split into lines
    data = []
    context = ""
    u = None  # u for user; r for response
    r = None
    with open(file_name) as fin:
        cnt_ptr = 0  # how many response words can be copied with a pointer
        cnt_voc = 0
        max_r_len = 0
        cnt_lin = 1  # number of dialogue samples processed
        for line in fin:
            line = line.strip()
            if line:  # an empty line marks the end of a dialogue sample
                nid, line = line.split(' ', 1)
                if '\t' in line:
                    u, r = line.split('\t')
                    context += str(u) + " "
                    # dialogue history for the current response; the response itself is
                    # appended to the history used by the next turn
                    contex_arr = context.split(' ')[LIMIT:]
                    r_index = []
                    gate = []
                    for key in r.split(' '):
                        index = [
                            loc for loc, val in enumerate(contex_arr) if val == key
                        ]
                        if (index):
                            index = max(index)
                            gate.append(1)
                            cnt_ptr += 1
                        else:
                            index = len(contex_arr) - 1
                            gate.append(0)
                            cnt_voc += 1
                        r_index.append(index)
                    if len(r_index) > max_r_len:
                        max_r_len = len(r_index)
                    # TODO: why this way ???
                    data.append(
                        [" ".join(contex_arr) + "$$$$", r, r_index, gate])
                    context += str(r) + " "
                else:
                    r = line
                    if USEKB:
                        context += str(r) + " "
            else:
                cnt_lin += 1
                if (max_line and cnt_lin >= max_line):
                    break
                context = ""
    max_len = max([len(d[0].split(' ')) for d in data])
    avg_len = sum([len(d[0].split(' ')) for d in data]) / float(
        len([len(d[0].split(' ')) for d in data]))
    logging.info("Pointer percentage = {} ".format(cnt_ptr / (cnt_ptr + cnt_voc)))
    logging.info("Max response Len: {}".format(max_r_len))
    logging.info("Max Input Len: {}".format(max_len))
    logging.info("AVG Input Len: {}".format(avg_len))
    return data, max_len, max_r_len
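# Hedged toy illustration of the pointer/gate scheme above: each response token points to its
# last occurrence in the (truncated) context, otherwise to the final context position with gate 0.
contex_arr = ["what", "time", "is", "the", "meeting"]
r = "meeting at seven"
r_index, gate = [], []
for key in r.split(' '):
    hits = [loc for loc, val in enumerate(contex_arr) if val == key]
    if hits:
        r_index.append(max(hits)); gate.append(1)
    else:
        r_index.append(len(contex_arr) - 1); gate.append(0)
print(r_index, gate)  # -> [4, 4, 4] [1, 0, 0]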