def main(params):
    """Print sampled inputs next to the model's author-conditioned outputs.

    Loads the checkpoint at ``params['model']``, builds a ``CharLstm`` when
    ``params['m_type'] == 'generative'`` and a ``CharTranslator`` otherwise,
    then for ``params['num_samples']`` iterations draws one batch from the
    data provider, runs ``adv_forward_pass`` on it and prints the coloured
    input, the first element of the pass's result and, when
    ``params['show_rev']`` is set, its last element.

    The statements below are valid on both Python 2 and Python 3
    (single-argument ``print(...)``, ``range``, ``list(dict.values())``).

    Args:
        params: dict read for the keys 'model', 'softmax_scale', 'm_type',
            'num_samples', 'seed_length', 'split', 'flip' and 'show_rev'.
    """
    # Create vocabulary and author index. The lookup tables live either
    # under 'misc' or directly at the top level of the checkpoint.
    saved_model = torch.load(params['model'])
    vocab_src = saved_model['misc'] if 'misc' in saved_model else saved_model
    char_to_ix = vocab_src['char_to_ix']
    auth_to_ix = vocab_src['auth_to_ix']
    ix_to_char = vocab_src['ix_to_char']
    ix_to_auth = vocab_src['ix_to_auth']

    cp_params = saved_model['arch']
    if params['softmax_scale']:
        cp_params['softmax_scale'] = params['softmax_scale']

    dp = DataProvider(cp_params)

    is_generative = params['m_type'] == 'generative'
    if is_generative:
        model = CharLstm(cp_params)
    else:
        model = CharTranslator(cp_params)
    # Switch to evaluation mode (for torch modules this disables dropout).
    model.eval()

    auth_colors = ['red', 'blue']
    startc = dp.data['configs']['start']
    endc = dp.data['configs']['end']

    # 1x1 tensor holding the start symbol; np.int64 replaces the removed
    # ``np.int`` alias.
    append_tensor = np.zeros((1, 1), dtype=np.int64)
    append_tensor[0, 0] = char_to_ix[startc]
    append_tensor = torch.LongTensor(append_tensor).cuda()

    # Restore saved checkpoint
    model.load_state_dict(saved_model['state_dict'])
    # NOTE(review): the return value is unused; the call is kept in case
    # init_hidden has side effects on the model -- confirm and drop.
    hidden = model.init_hidden(1)
    jc = '' if cp_params.get('atoms', 'char') == 'char' else ' '

    for _ in range(params['num_samples']):
        # Drawn on every iteration (even in generative mode) so the random
        # stream is consumed exactly as before.
        c_aid = np.random.choice(list(auth_to_ix.values()))
        if is_generative:
            batch = dp.get_random_string(slen=params['seed_length'],
                                         split=params['split'])
        else:
            batch = dp.get_sentence_batch(1, split=params['split'],
                                          atoms=cp_params.get('atoms', 'char'),
                                          aid=ix_to_auth[c_aid])

        inps, targs, auths, lens = dp.prepare_data(
            batch, char_to_ix, auth_to_ix, maxlen=cp_params['max_seq_len'])
        auths_inp = 1 - auths if params['flip'] else auths
        outs = adv_forward_pass(model, inps, lens,
                                end_c=char_to_ix[endc],
                                maxlen=cp_params['max_seq_len'],
                                auths=auths_inp,
                                cycle_compute=params['show_rev'],
                                append_symb=append_tensor)

        print('--------------------------------------------')

        inp_text = jc.join([ix_to_char[c[0]] for c in inps[1:]])
        print(colored('Inp %6s: ' % (ix_to_auth[auths[0]]), 'green')
              + colored('%s' % (inp_text), auth_colors[auths[0]]))

        # Symbols without a vocabulary entry are skipped.
        out_text = jc.join([ix_to_char[c.data.cpu()[0]] for c in outs[0]
                            if c.data.cpu()[0] in ix_to_char])
        print(colored('Out %6s: ' % (ix_to_auth[auths_inp[0]]), 'grey')
              + colored('%s' % (out_text), auth_colors[auths_inp[0]]))

        if params['show_rev']:
            rev_text = jc.join([ix_to_char[c.data.cpu()[0]] for c in outs[-1]
                                if c.data.cpu()[0] in ix_to_char])
            print(colored('Rev %6s: ' % (ix_to_auth[auths[0]]), 'green')
                  + colored('%s' % (rev_text), auth_colors[auths[0]]))
def get_reconstruction_objective_values(dp: DataProvider, image_type: ImageType, t):
    """
    Collect the solver's log objective for every image of ``image_type``.

    Each image is shredded into a ``t`` x ``t`` grid without shuffling, so
    the correct permutation is ``np.arange(t**2)``.

    :return: a tuple ``(correct, incorrect)``. ``correct`` lists the log
        objective of every sample whose predicted permutation equals the
        identity; ``incorrect`` lists, for the other samples, pairs
        ``(predicted log objective, log objective of the identity)``.
    """
    solver = image_type_to_solver_with_comparator[image_type]
    comparator = solver._t_to_comparator[t]

    if image_type == ImageType.IMAGES:
        images = dp.get_fish_images()
    else:
        images = dp.get_docs_images()
    shredded = [Shredder.shred(image, t, shuffle_shreds=False) for image in images]

    solved = []
    failed = []
    for sample_no, stacked_shreds in enumerate(shredded):
        print('#{}-{}-{}'.format(sample_no, image_type, t))

        adjacency = AdjacencyMatrixBuilder.build_adjacency_matrices(
            comparator, stacked_shreds)
        ltr_adj_probas, ttb_adj_probas = adjacency

        # Objective value of the ground-truth (identity) ordering.
        identity_result = ObjectiveFunction.compute(
            np.arange(t**2), ltr_adj_probas, ttb_adj_probas)
        correct_log_objective = identity_result[1]

        predicted_permutation, log_objective = solver.predict(
            stacked_shreds, return_log_objective=True)

        if not np.array_equal(predicted_permutation, np.arange(t**2)):
            failed.append((log_objective, correct_log_objective))
            continue
        solved.append(log_objective)

    return solved, failed
def main():
    """Visualise comparator activation maps on the training split.

    Command-line words select the run: 'large' lifts the 20-sample cap,
    '2'/'4'/'5' pick the grid sizes ('5' replaces any earlier choice) and
    'image'/'document' pick the image types. Anything not selected falls
    back to all grid sizes / all image types.
    """
    argv = sys.argv

    number_of_samples = sys.maxsize if 'large' in argv else 20

    ts = []
    for flag, size in (('2', 2), ('4', 4)):
        if flag in argv:
            ts.append(size)
    if '5' in argv:
        # '5' overrides rather than extends the selection.
        ts = [5]
    if not ts:
        ts = (2, 4, 5)

    image_types = []
    if 'image' in argv:
        image_types.append(ImageType.IMAGES)
    if 'document' in argv:
        image_types.append(ImageType.DOCUMENTS)
    if not image_types:
        image_types = ImageType

    np.random.seed(42)

    for t in ts:
        for image_type in image_types:
            print('t={}. image type is {}'.format(t, image_type.value))

            provider = DataProvider()
            if image_type == ImageType.IMAGES:
                images = provider.get_fish_images(num_samples=number_of_samples)
            else:
                images = provider.get_docs_images(num_samples=number_of_samples)

            # Only the training part of the split is visualised.
            images_train, _ = train_test_split(images, random_state=42)

            weights = IMAGE_TYPE_TO_T_TO_COMPARATOR_CNN_WEIGHT_FILE_ID_AND_FILE_PATH[image_type][t]
            clf = ComparatorCNN(t, weights.width, weights.height, image_type)
            clf = clf.load_weights(weights.model_path)

            cam = ComparatorActivationMap(clf)
            for image in images_train:
                crops = shred_and_resize_to([image], t, (clf.width, clf.height))
                cam.visualize_activations(crops)
def prepare_data(self, num_samples, resize=None):
    """Load fish and document images and expand each set over several grids.

    Every loaded set is passed through ``shred_shuffle_and_reconstruct`` for
    the grid sizes 1, 2, 4 and 5 and the results are concatenated in that
    order. When ``resize`` is given, both lists are additionally converted
    with ``list_of_images_to_numpy``.

    Returns:
        The pair ``(fish, docs)``.
    """
    # NOTE(review): the caller-supplied num_samples is discarded here and
    # 2000 is always used -- confirm whether this cap is intentional.
    num_samples = 2000

    provider = DataProvider()
    grid_sizes = (1, 2, 4, 5)

    fish_images = provider.get_fish_images(num_samples=num_samples, resize=resize)
    doc_images = provider.get_docs_images(num_samples=num_samples, resize=resize)

    fish_parts = [shred_shuffle_and_reconstruct(fish_images, t) for t in grid_sizes]
    fish = list(itertools.chain.from_iterable(fish_parts))

    doc_parts = [shred_shuffle_and_reconstruct(doc_images, t) for t in grid_sizes]
    docs = list(itertools.chain.from_iterable(doc_parts))

    if resize is None:
        return fish, docs
    return list_of_images_to_numpy(fish), list_of_images_to_numpy(docs)
def main():
    """Print the configs, build the segmentation model and run training.

    Creates the checkpoint and summary directories, a data provider, a
    TensorFlow session, a logger, the model (optionally loading pretrained
    weights when ``model_cfg.train_from_pretrained`` is set) and the
    trainer, then calls ``trainer.train()``.

    The session is closed in a ``finally`` clause so it is released even
    when model construction or training raises.
    """
    print('---------------------- data config ------------------------')
    pprint(data_cfg)

    print('---------------------- model config -------------------')
    pprint(model_cfg)

    print('creating dirs for saving model weights, logs ...')
    checkpoint_dir = os.path.join(
        model_cfg.checkpoint_dir, model_cfg.exp_name)
    create_dirs([checkpoint_dir, train_cfg.summary_dir])

    print('initializing train data provider....')
    det_data_provider = DataProvider(data_cfg)

    sess = tf.Session()
    try:
        print('creating tensorflow log for summaries...')
        tf_logger = TfLogger(sess, train_cfg)

        print('creating seg models ...')
        train_model = SegModel(model_cfg)
        if model_cfg.train_from_pretrained:
            train_model.load(sess)

        print('creating seg trainer...')
        trainer = SegTrainer(sess, train_model, det_data_provider,
                             train_cfg, tf_logger)

        print('start trainning...')
        trainer.train()
    finally:
        # Previously only reached on success; a failure leaked the session.
        sess.close()
def main(params):
    """Evaluate a saved model on one data split.

    The checkpoint at ``params['model']`` supplies the vocabularies, the
    architecture parameters and the weights. A ``CharTranslator`` is built
    for ``params['m_type'] == 'translator'``, otherwise the model comes from
    ``get_classifier``. The evaluation routine is ``eval_translator`` for
    translators, ``eval_model`` when the checkpoint's 'mode' is
    'generative', and ``eval_classify`` in every other case.
    """
    # Create vocabulary and author index
    checkpoint = torch.load(params['model'])
    char_to_ix = checkpoint['char_to_ix']
    auth_to_ix = checkpoint['auth_to_ix']
    ix_to_char = checkpoint['ix_to_char']
    cp_params = checkpoint['arch']

    dp = DataProvider(cp_params)

    is_translator = params['m_type'] == 'translator'
    model = CharTranslator(cp_params) if is_translator else get_classifier(cp_params)

    # NOTE(review): model.eval() is not called in this function, so the model
    # keeps whatever mode its constructor left it in -- confirm intended.

    # Restore saved checkpoint
    model.load_state_dict(checkpoint['state_dict'])

    if is_translator:
        eval_function = eval_translator
    elif cp_params['mode'] == 'generative':
        eval_function = eval_model
    else:
        eval_function = eval_classify

    score = eval_function(dp, model, cp_params, char_to_ix, auth_to_ix,
                          split=params['split'],
                          max_docs=params['num_eval'],
                          dump_scores=params['dump_scores'])
def main(params):
    """Score every translated sample with the classifier and store the result.

    Loads the classifier checkpoint ``params['evalmodel']``, reads the
    results file ``params['inpfile']`` ('.json' or '.p' pickle), runs
    ``forward_classify`` over the whitespace-split 'trans' text of every
    sample in batches of 100, writes each output row into the sample under
    the key ``params['store_in']`` and finally overwrites the input file.

    Args:
        params: dict read for the keys 'evalmodel', 'inpfile' and 'store_in'.

    Raises:
        ValueError: if the input file extension is neither 'json' nor 'p'.
    """
    eval_model = torch.load(params['evalmodel'])
    eval_params = eval_model['arch']
    eval_state = eval_model['state_dict']
    modelEval = get_classifier(eval_params)

    char_to_ix = eval_model['char_to_ix']
    auth_to_ix = eval_model['auth_to_ix']
    ix_to_char = eval_model['ix_to_char']

    dp = DataProvider(eval_params)

    # Start from the freshly built state and overlay the saved entries.
    state = modelEval.state_dict()
    state.update(eval_state)
    modelEval.load_state_dict(state)

    inpfile = params['inpfile']
    extension = inpfile.split('.')[-1]
    if extension == 'json':
        with open(inpfile, 'r') as inp:
            inps = json.load(inp)
    elif extension == 'p':
        # Pickle data is binary: read with 'rb' to match the 'wb' write below.
        # NOTE: unpickling executes arbitrary code; only use trusted files.
        with open(inpfile, 'rb') as inp:
            inps = pkl.load(inp)
    else:
        raise ValueError(
            "unsupported input file extension %r (expected 'json' or 'p')" % extension)

    bsz = 100

    def process_batch(batch, featstr='sent_enc'):
        """Classify one batch and store each output row under ``featstr``."""
        _, targs, _, lens = dp.prepare_data(batch, char_to_ix, auth_to_ix,
                                            maxlen=eval_params['max_seq_len'])
        if not all(lens):
            # NOTE(review): debugging breakpoint kept from the original code;
            # it halts unattended runs when a zero length shows up.
            import ipdb; ipdb.set_trace()
        eval_out = modelEval.forward_classify(targs, lens=lens, compute_softmax=True)
        eval_out = eval_out[0].data.cpu().numpy()
        for row, item in enumerate(batch):
            sample = inps['docs'][item['id']]['sents'][item['sid']][item['sampid']]
            sample[featstr] = eval_out[row, :].tolist()

    batch = []
    for i, doc in tqdm(enumerate(inps['docs'])):
        for j, samples in enumerate(doc['sents']):
            for k, sample in enumerate(samples):
                tokens = sample['trans'].split()
                if not tokens:
                    continue
                batch.append({'in': tokens, 'targ': tokens,
                              'author': doc['author'],
                              'id': i, 'sid': j, 'sampid': k})
                if len(batch) == bsz:
                    process_batch(batch, featstr=params['store_in'])
                    batch = []
    if batch:
        # Flush the final, partially filled batch.
        process_batch(batch, featstr=params['store_in'])
        batch = []

    # The input file is overwritten in place with the enriched structure.
    if extension == 'json':
        with open(inpfile, 'w') as out:
            json.dump(inps, out)
    else:
        with open(inpfile, 'wb') as out:
            pkl.dump(inps, out)
def get_adjacent_crops_probabilities(dp: DataProvider, image_type: ImageType, t):
    """
    Run the comparator on every horizontally adjacent pair of crops.

    For each image, every crop that is not the last one of its row is
    compared with the crop directly following it.

    :return: list with element ``[0][1]`` of each comparator output
    """
    comparator = image_type_to_t_to_comparator[image_type][t]

    if image_type == ImageType.IMAGES:
        images = dp.get_fish_images()
    else:
        images = dp.get_docs_images()
    shredded = shred_and_resize_to(images, t, (comparator.width, comparator.height))

    # Crops in the last column have no right-hand neighbour in their row.
    left_indices = [idx for idx in range(t**2) if idx % t != t - 1]

    adj_probabilities = []
    for stacked_shreds in shredded:
        for left_idx in left_indices:
            left_crop = stacked_shreds[left_idx]
            right_crop = stacked_shreds[left_idx + 1]
            softmax = comparator.predict_is_left_probability([left_crop], [right_crop])
            adj_probabilities.append(softmax[0][1])
    return adj_probabilities
def get_non_adjacent_crops_probabilities(dp: DataProvider, image_type: ImageType, t):
    """
    Run the comparator on one randomly drawn non-consecutive pair per image.

    Two distinct crop indices are drawn with ``np.random.choice`` until the
    second one is not the direct successor of the first.

    :return: list with element ``[0][1]`` of each comparator output
    """
    comparator = image_type_to_t_to_comparator[image_type][t]

    if image_type == ImageType.IMAGES:
        images = dp.get_fish_images()
    else:
        images = dp.get_docs_images()
    shredded = shred_and_resize_to(images, t, (comparator.width, comparator.height))

    non_adj_probabilities = []
    for stacked_shreds in shredded:
        # Always draw at least once; redraw while the pair is consecutive.
        while True:
            left_idx, right_idx = tuple(
                np.random.choice(t**2, 2, replace=False))
            if left_idx + 1 != right_idx:
                break
        softmax = comparator.predict_is_left_probability(
            [stacked_shreds[left_idx]], [stacked_shreds[right_idx]])
        non_adj_probabilities.append(softmax[0][1])
    return non_adj_probabilities
def get_number_of_images_with_same_patches_and_number_of_same_patches(
        data_provider: DataProvider, image_type: ImageType, t: int, width_height=None):
    """Count images, and patches, that contain an exact duplicate patch.

    Images are shredded into ``t**2`` patches (and resized to
    ``width_height`` when it is given). A patch counts as duplicated when
    another patch of the same image is element-wise equal to it.

    Returns:
        A 4-tuple: number of images with at least one duplicated patch,
        number of duplicated patches, number of images, and number of
        images times ``t ** 2``.
    """
    if image_type == ImageType.IMAGES:
        images = data_provider.get_fish_images()
    else:
        images = data_provider.get_docs_images()

    if width_height is None:
        inputs = [Shredder.shred(image, t) for image in images]
    else:
        inputs = shred_and_resize_to(images, t, width_height)

    patch_count = t**2
    pictures_with_twins = 0
    patches_with_twins = 0

    for stacked_shreds in inputs:
        twins_in_picture = 0
        for left_shred in range(patch_count):
            has_twin = any(
                left_shred != right_shred
                and np.all(stacked_shreds[left_shred] == stacked_shreds[right_shred])
                for right_shred in range(patch_count))
            if has_twin:
                twins_in_picture += 1
        patches_with_twins += twins_in_picture
        if twins_in_picture:
            pictures_with_twins += 1

    total_pictures = len(inputs)
    return (pictures_with_twins,
            patches_with_twins,
            total_pictures,
            total_pictures * (t ** 2))
def dump_reconstruction_objective_values():
    """Compute the objective statistics for every image type and grid size
    in ``TS`` and pickle them to ``dict_file_names['log_obj']``.

    The dumped dict maps image type -> t -> {'correct': ..., 'incorrect': ...}.
    """
    d = {}
    for image_type in ImageType:
        d[image_type] = {}
        for t in TS:
            d[image_type][t] = {'correct': None, 'incorrect': None}

    dp = DataProvider()
    for image_type in ImageType:
        for t in TS:
            print("Getting stats for {}-{}...".format(image_type, t))
            correct, incorrect = get_reconstruction_objective_values(dp, image_type, t)
            entry = d[image_type][t]
            entry['correct'] = correct
            entry['incorrect'] = incorrect

    os.makedirs(root_path, exist_ok=True)
    PickleHelper.dump(d, os.path.join(dict_file_names['log_obj']))
def evaluate_image_for_permutation(self,
                                   shred_index_to_image,
                                   permutation,
                                   sample_index=None,
                                   visualize=True):
    """Shuffle an image's shreds by ``permutation`` and score the prediction.

    Args:
        shred_index_to_image: a path string (read with
            ``DataProvider.read_image``), a whole image (shredded into a
            ``t`` x ``t`` grid when its first dimension differs from
            ``len(permutation)``), or already stacked shreds.
        permutation: the order applied to the shreds; its length fixes
            ``t = round(sqrt(len(permutation)))``.
        sample_index: only used in the failure message.
        visualize: when True (the default, matching the previously
            hard-coded behaviour) a failed reconstruction is saved as two
            images under ``problems/<t>/<image type>``.

    Returns:
        The fraction of positions where ``self.predict`` agrees with
        ``permutation``.
    """
    t = int(round(math.sqrt(len(permutation))))

    if isinstance(shred_index_to_image, str):
        shred_index_to_image = DataProvider.read_image(shred_index_to_image)
    if np.shape(shred_index_to_image)[0] != len(permutation):
        shred_index_to_image = Shredder.shred(shred_index_to_image, t)

    shreds_permuted = shred_index_to_image[permutation]
    permutation_predicted = self.predict(shreds_permuted)
    current_accuracy = np.average(permutation_predicted == permutation)

    # Perfect reconstruction: nothing to report or visualise.
    if np.isclose(current_accuracy, 1.0):
        return current_accuracy

    print('On #{} 0-1 is {}: {}!={}'.format(sample_index, current_accuracy,
                                            permutation,
                                            permutation_predicted))
    if visualize:
        directory_path = 'problems/{}/{}'.format(t, self._image_type)
        os.makedirs(directory_path, exist_ok=True)
        # The same timestamp pairs the two images of one failure.
        time_stamp = int(time.time())
        Visualizer.visualize_crops(
            shreds_permuted[np.argsort(permutation)],
            show=False,
            save_path=os.path.join(directory_path,
                                   '{}-original.png'.format(time_stamp)))
        Visualizer.visualize_crops(
            shreds_permuted[np.argsort(permutation_predicted)],
            show=False,
            save_path=os.path.join(directory_path,
                                   '{}-restored.png'.format(time_stamp)))
        print('visualized')

    return current_accuracy
def dump_adjacent_and_non_adjacent_probabilities():
    """Compute comparator outputs on non-adjacent and adjacent crop pairs for
    every image type and grid size in ``TS`` and pickle them to
    ``dict_file_names['adj']``.

    The dumped dict maps image type -> t -> {'adj': ..., 'non_adj': ...}.
    """
    d = {}
    for image_type in ImageType:
        d[image_type] = {}
        for t in TS:
            d[image_type][t] = {'adj': None, 'non_adj': None}

    dp = DataProvider()
    for image_type in ImageType:
        for t in TS:
            print("Getting stats for {}-{}...".format(image_type, t))
            entry = d[image_type][t]
            # Non-adjacent pairs are sampled first, then the adjacent ones.
            entry['non_adj'] = get_non_adjacent_crops_probabilities(dp, image_type, t)
            entry['adj'] = get_adjacent_crops_probabilities(dp, image_type, t)

    os.makedirs(root_path, exist_ok=True)
    PickleHelper.dump(d, dict_file_names['adj'])
def debug():
    """Re-run one fixed reconstruction case with the greedy solver.

    Reads fifteen named images from '../images', fits the comparator's
    standardisation on them, loads the stored weights for a 4x4 grid and
    evaluates a fixed permutation on the image at position 4.
    """
    names_train = [
        'n01440764_11593.JPEG',
        'n01440764_11602.JPEG',
        'n01440764_4562.JPEG',
        'n01440764_5148.JPEG',
        'n01440764_11897.JPEG',
        'n01440764_29057.JPEG',
        'n01440764_22135.JPEG',
        'n01440764_8003.JPEG',
        'n01440764_3566.JPEG',
        'n01440764_44.JPEG',
        'n01440764_10910.JPEG',
        'n01440764_10382.JPEG',
        'n01440764_6508.JPEG',
        'n01440764_10290.JPEG',
        'n01440764_910.JPEG',
    ]
    images_train, names_train = DataProvider.read_images('../images', names_train)

    images_validation = [images_train[position] for position in (4, 5, 13)]

    sample_index = 4
    t = 4
    permutation = [2, 8, 4, 12, 0, 10, 6, 5, 7, 3, 13, 15, 11, 9, 1, 14]
    width, height = 224, 224
    image_type = ImageType.IMAGES

    weights = IMAGE_TYPE_TO_T_TO_COMPARATOR_CNN_WEIGHT_FILE_ID_AND_FILE_PATH[image_type][t]
    comparator = ComparatorCNN(t, width, height, image_type)
    comparator = comparator._fit_standardisation(images_train)
    comparator = comparator.load_weights(weights.model_path)

    solver = SolverGreedy({t: comparator})
    score = solver.evaluate_image_for_permutation(
        images_validation[0], permutation, sample_index=sample_index)
    print('done with ', score)
def main(params):
    """Print sampled sentences next to their cleaned-up translations.

    Loads the checkpoint at ``params['model']``, builds a ``CharLstm`` when
    ``params['m_type'] == 'generative'`` and a ``CharTranslator`` otherwise,
    then for ``params['num_samples']`` iterations draws one batch, runs
    ``adv_forward_pass`` and prints the input, the forward result and (with
    ``params['show_rev']``) the backward result, each passed through a small
    text clean-up.

    Args:
        params: dict read for the keys 'model', 'softmax_scale', 'm_type',
            'num_samples', 'seed_length', 'split', 'flip' and 'show_rev'.
    """
    # General helper functions. They are defined once here instead of being
    # re-created on every loop iteration.

    def strip_match(match):
        """Return the matched text without surrounding whitespace."""
        return match.group(0).strip()

    def fix_decimals(match):
        """Return the matched text with all whitespace removed."""
        return re.sub(r'\s', '', match.group(0))

    def clean_text(text):
        """Undo bracket/quote placeholders and tighten token spacing."""
        text = re.sub('-lrb-', '(', text)
        text = re.sub('-rrb-', ')', text)
        text = re.sub('-lsb-', '[', text)
        text = re.sub('-rsb-', ']', text)
        text = re.sub('-lcb-', '{', text)
        text = re.sub('-rcb-', '}', text)
        text = re.sub("''", '"', text)
        text = re.sub(r'\si\s', ' I ', text)
        text = re.sub(r'^i\s', 'I ', text)
        text = re.sub(r'\sna\s', 'na ', text)
        text = re.sub(r'\$\s', strip_match, text)
        text = re.sub(r"[-#]\s|\s([-.!,':;?]|n't)", strip_match, text)
        # NOTE(review): the '.' is unescaped and matches any character, so
        # e.g. "12 34" is joined as well -- behaviour kept; confirm intent.
        text = re.sub(r'\d+. \d+', fix_decimals, text)
        return text

    # Create vocabulary and author index. The lookup tables live either
    # under 'misc' or directly at the top level of the checkpoint.
    saved_model = torch.load(params['model'])
    vocab_src = saved_model['misc'] if 'misc' in saved_model else saved_model
    char_to_ix = vocab_src['char_to_ix']
    auth_to_ix = vocab_src['auth_to_ix']
    ix_to_char = vocab_src['ix_to_char']
    ix_to_auth = vocab_src['ix_to_auth']

    cp_params = saved_model['arch']
    if params['softmax_scale']:
        cp_params['softmax_scale'] = params['softmax_scale']

    dp = DataProvider(cp_params)

    is_generative = params['m_type'] == 'generative'
    if is_generative:
        model = CharLstm(cp_params)
    else:
        model = CharTranslator(cp_params)
    # Switch to evaluation mode (for torch modules this disables dropout).
    model.eval()

    auth_colors = ['red', 'blue']
    startc = dp.data['configs']['start']
    endc = dp.data['configs']['end']

    # 1x1 tensor holding the start symbol; np.int64 replaces the removed
    # ``np.int`` alias.
    append_tensor = np.zeros((1, 1), dtype=np.int64)
    append_tensor[0, 0] = char_to_ix[startc]
    append_tensor = torch.LongTensor(append_tensor).cuda()

    # Restore saved checkpoint
    model.load_state_dict(saved_model['state_dict'])
    # NOTE(review): the return value is unused; the call is kept in case
    # init_hidden has side effects on the model -- confirm and drop.
    hidden = model.init_hidden(1)
    jc = '' if cp_params.get('atoms', 'char') == 'char' else ' '

    for _ in range(params['num_samples']):
        # Drawn on every iteration (even in generative mode) so the random
        # stream is consumed exactly as before.
        c_aid = np.random.choice(list(auth_to_ix.values()))
        if is_generative:
            batch = dp.get_random_string(slen=params['seed_length'],
                                         split=params['split'])
        else:
            batch = dp.get_sentence_batch(1,
                                          split=params['split'],
                                          atoms=cp_params.get('atoms', 'char'),
                                          aid=ix_to_auth[c_aid])

        inps, targs, auths, lens = dp.prepare_data(
            batch, char_to_ix, auth_to_ix, maxlen=cp_params['max_seq_len'])
        auths_inp = 1 - auths if params['flip'] else auths
        forward, backward = adv_forward_pass(model,
                                             inps,
                                             lens,
                                             end_c=char_to_ix[endc],
                                             maxlen=cp_params['max_seq_len'],
                                             auths=auths_inp,
                                             cycle_compute=params['show_rev'],
                                             append_symb=append_tensor)

        print('--------------------------------------------')
        print('Translate from %s to %s' %
              (batch[0]['author'], ix_to_auth[auths_inp.item()]))

        # Get original sentence and clean it up a bit
        input_list = [ix_to_char[c.item()] for c in inps[1:]]
        input_string = clean_text(jc.join(input_list))

        # Get translated sentence and clean it up a bit; symbols without a
        # vocabulary entry are skipped.
        output_list = [
            ix_to_char[c.item()] for c in forward if c.item() in ix_to_char
        ]
        # Guard against an empty output before looking at its last token.
        if output_list and output_list[-1] == 'END':
            output_list = output_list[:-1]
        output_string = clean_text(jc.join(output_list))

        print(
            colored('Inp %6s: ' % (ix_to_auth[auths.item()]), 'green') +
            colored('%s' % input_string, auth_colors[auths.item()]))
        print(
            colored('Out %6s: ' % (ix_to_auth[auths_inp.item()]), 'grey') +
            colored('%s' % output_string, auth_colors[auths_inp.item()]))

        if params['show_rev']:
            reverse_list = [
                ix_to_char[c.item()] for c in backward
                if c.item() in ix_to_char and ix_to_char[c.item()] != 'END'
            ]
            print(
                colored('Rev %6s: ' % (ix_to_auth[auths.item()]), 'green') +
                colored('%s' % (jc.join(reverse_list)),
                        auth_colors[auths.item()]))
def main():
    """Evaluate the LP solver on a train/validation split per image type.

    Command-line words select the run: 'debug' limits loading to 20 samples,
    '2'/'4'/'5' pick the grid sizes ('5' replaces any earlier choice) and
    'image'/'document' pick the image types. Anything not selected falls
    back to all grid sizes / all image types.
    """
    argv = sys.argv

    debug_mode = 'debug' in argv
    print('Debug' if debug_mode else 'Release')
    number_of_samples = 20 if debug_mode else sys.maxsize
    epochs = 1

    ts = []
    for flag, size in (('2', 2), ('4', 4)):
        if flag in argv:
            ts.append(size)
    if '5' in argv:
        # '5' overrides rather than extends the selection.
        ts = [5]
    if not ts:
        ts = (2, 4, 5)

    image_types = []
    if 'image' in argv:
        image_types.append(ImageType.IMAGES)
    if 'document' in argv:
        image_types.append(ImageType.DOCUMENTS)
    if not image_types:
        image_types = ImageType

    np.random.seed(42)

    for image_type in image_types:
        print(image_type.value)

        provider = DataProvider()
        if image_type == ImageType.IMAGES:
            get_images = provider.get_fish_images
            mean, std = 100.52933494138787, 65.69793156777682
        else:
            get_images = provider.get_docs_images
            mean, std = 241.46115784237548, 49.512839464023564

        images, names = get_images(num_samples=number_of_samples,
                                   return_names=True)
        split = train_test_split(images, names, random_state=42)
        images_train, images_validation, names_train, names_validation = split

        # One comparator with preloaded weights per requested grid size.
        t_to_comparator = {}
        for t in ts:
            weights = IMAGE_TYPE_TO_T_TO_COMPARATOR_CNN_WEIGHT_FILE_ID_AND_FILE_PATH[image_type][t]
            comparator = ComparatorCNN(t,
                                       weights.width,
                                       weights.height,
                                       image_type,
                                       mean=mean,
                                       std=std)
            t_to_comparator[t] = comparator.load_weights(weights.model_path)

        clf = SolverLP(t_to_comparator, image_type=image_type)

        print('Train: ', names_train)
        train_accuracy = clf.evaluate(images_train, epochs=epochs, ts=ts)
        print('Train 0-1 accuracy on {}: {}'.format(image_type.value,
                                                    train_accuracy))

        print('Validation: ', names_validation)
        validation_accuracy = clf.evaluate(images_validation,
                                           epochs=epochs,
                                           ts=ts)
        print('Validation 0-1 accuracy on {}: {}'.format(
            image_type.value, validation_accuracy))
def main():
    """Train or load a ``TopLeftCNN`` per grid size and image type, then
    print its train and validation 0-1 scores.

    Command-line words select the run: 'debug' uses 20 samples and 5 epochs
    (otherwise everything and 50 epochs), '2'/'4'/'5' pick the grid sizes
    ('5' replaces any earlier choice), 'image'/'document' pick the image
    types, and 'train' fits the model instead of loading stored weights.
    """
    argv = sys.argv

    debug_mode = 'debug' in argv
    print('Debug' if debug_mode else 'Release')
    number_of_samples = 20 if debug_mode else sys.maxsize
    epochs = 5 if debug_mode else 50

    ts = []
    for flag, size in (('2', 2), ('4', 4)):
        if flag in argv:
            ts.append(size)
    if '5' in argv:
        # '5' overrides rather than extends the selection.
        ts = [5]
    if not ts:
        ts = (2, 4, 5)

    image_types = []
    if 'image' in argv:
        image_types.append(ImageType.IMAGES)
    if 'document' in argv:
        image_types.append(ImageType.DOCUMENTS)
    if not image_types:
        image_types = ImageType

    # Only 'train' turns fitting on; 'evaluate' and no keyword both load.
    force = 'train' in argv

    np.random.seed(42)

    width, height = 224, 224
    batch_size = 32

    for t in ts:
        for image_type in image_types:
            print('t={}. image type is {}'.format(t, image_type.value))

            provider = DataProvider()
            if image_type == ImageType.IMAGES:
                images = provider.get_fish_images(num_samples=number_of_samples)
            else:
                images = provider.get_docs_images(num_samples=number_of_samples)

            images_train, images_validation = train_test_split(images,
                                                               random_state=42)

            clf = TopLeftCNN(t, width, height, image_type)
            if not force:
                clf.load_weights()
                clf._fit_standardisation(images_train)
            else:
                clf.fit_generator(
                    images_train,
                    batch_size,
                    epochs,
                    images_validation,
                )

            print('Train 0-1:', clf.evaluate(images_train))
            print('Validation 0-1:', clf.evaluate(images_validation))
def main(params):
    """Encode original and translated sentences and save the embeddings.

    Reads the results JSON at ``params['resfile']``, encodes the
    whitespace-split 'sent' text and then the 'trans' text of every
    sentence with the encoder, stores each sentence's row index under
    'sent_enc' / 'trans_enc' in the JSON structure, rewrites the JSON file
    and saves the float16 feature matrix next to it.

    The encoder is a ``BLSTMEncoder`` loaded from
    ``params['use_semantic_encoder']`` when that entry is set; otherwise an
    encoder-only ``CharTranslator`` initialised from the checkpoint's
    'state_dict'.

    Args:
        params: dict read for the keys 'checkpoint', 'resfile',
            'batch_size', 'use_semantic_encoder' and 'glove_path'.
    """
    saved_model = torch.load(params['checkpoint'])
    cp_params = saved_model['arch']
    dp = DataProvider(cp_params)

    # The lookup tables live either under 'misc' or at the top level.
    vocab_src = saved_model['misc'] if 'misc' in saved_model else saved_model
    char_to_ix = vocab_src['char_to_ix']
    auth_to_ix = vocab_src['auth_to_ix']
    ix_to_char = vocab_src['ix_to_char']

    use_semantic_encoder = params['use_semantic_encoder']
    # Keep the generator weights before the checkpoint is dropped; the
    # original code referenced an undefined ``model_gen_state`` here.
    model_gen_state = None if use_semantic_encoder else saved_model['state_dict']
    del saved_model

    resf = params['resfile']
    with open(resf, 'r') as res_file:
        res = json.load(res_file)
    bsz = params['batch_size']

    # Integer count: it is used as an array dimension below.
    total_sents = sum(len(doc['sents']) for doc in res['docs'])
    # One row per original sentence plus one per translation.
    all_feats = np.zeros((2 * total_sents, 4096), dtype='float16')

    if use_semantic_encoder:
        modelGenEncoder = BLSTMEncoder(char_to_ix, ix_to_char,
                                       params['glove_path'])
        encoderState = torch.load(use_semantic_encoder)
    else:
        modelGenEncoder = CharTranslator(cp_params, encoder_only=True)
        encoderState = model_gen_state

    # Copy over only the entries the encoder actually has.
    state = modelGenEncoder.state_dict()
    for k in encoderState:
        if k in state:
            state[k] = encoderState[k]
    modelGenEncoder.load_state_dict(state)
    modelGenEncoder.eval()
    del encoderState, model_gen_state

    def process_batch(batch, c_idx, featstr='sent_enc'):
        """Encode one batch into ``all_feats`` starting at row ``c_idx``.

        Returns the next free row index.
        """
        inps, _, _, lens = dp.prepare_data(batch, char_to_ix, auth_to_ix,
                                           maxlen=cp_params['max_seq_len'])
        enc_out = modelGenEncoder.forward_encode(inps, lens)
        enc_out = enc_out.data.cpu().numpy().astype('float16')
        all_feats[c_idx:c_idx + enc_out.shape[0]] = enc_out
        for row, item in enumerate(batch):
            res['docs'][item['id']]['sents'][item['sid']][featstr] = c_idx + row
        return c_idx + enc_out.shape[0]

    def encode_field(text_key, featstr, c_idx):
        """Encode ``sentence[text_key]`` of every non-empty sentence."""
        batch = []
        for i, doc in enumerate(tqdm(res['docs'])):
            for j, sentence in enumerate(doc['sents']):
                tokens = sentence[text_key].split()
                if not tokens:
                    continue
                batch.append({
                    'in': tokens,
                    'targ': tokens,
                    'author': doc['author'],
                    'id': i,
                    'sid': j
                })
                if len(batch) == bsz:
                    c_idx = process_batch(batch, c_idx, featstr=featstr)
                    batch = []
        if batch:
            # Flush the final, partially filled batch.
            c_idx = process_batch(batch, c_idx, featstr=featstr)
        return c_idx

    print(' Processing original text')
    c_idx = encode_field('sent', 'sent_enc', 0)

    print('Processing translated text')
    c_idx = encode_field('trans', 'trans_enc', c_idx)

    with open(resf, 'w') as res_file:
        json.dump(res, res_file)
    np.save('.'.join(resf.split('.')[:-1]) + 'sememb.npy', all_feats)
# NOTE(review): the statements up to the `return` are the tail of a definition
# whose beginning is not visible in this view; they convert the test shreds
# and check that every x/y array has t**2 rows per input shred stack.
test_x, test_y = shreds_to_x_y(test_shreds)
assert train_x.shape == (self._t**2 * train_shreds.shape[0], self._height, self._width, 1)
assert train_y.shape == (self._t**2 * train_shreds.shape[0], )
assert validation_x.shape == (self._t**2 * validation_shreds.shape[0], self._height, self._width, 1)
assert validation_y.shape == (self._t**2 * validation_shreds.shape[0], )
assert test_x.shape == (self._t**2 * test_shreds.shape[0], self._height, self._width, 1)
assert test_y.shape == (self._t**2 * test_shreds.shape[0], )
return (train_x, train_y), (validation_x, validation_y), (test_x, test_y)

def _get_model_checkpoint_file_path(self):
    """Return the 'best' weights path, parameterised by t and the image type value."""
    return 'saved_weights/one-picture-classify-best-{}-{}-model.h5'.format(
        self._t, self._image_type.value)

def _get_model_final_file_path(self):
    """Return the 'final' weights path, parameterised by t and the image type value."""
    return 'saved_weights/one-picture-classify-final-{}-{}-model.h5'.format(
        self._t, self._image_type.value)

# Script entry point: fit one classifier per grid size and image type.
if "__main__" == __name__:
    for t in (2, 4, 5):
        for image_type in ImageType:
            clf = OnePictureClassify(t, 220, 220, image_type, DataProvider())
            clf.fit(epochs=50)
def main(params):
    """Evaluate a generator against an author classifier on one split.

    Loads the generator checkpoint ``params['genmodel']`` and the evaluator
    checkpoint ``params['evalmodel']``, runs ``adv_forward_pass`` over the
    sentence batches of ``params['split']``, accumulates per-author
    statistics, prints sentence-level and document-level summaries and,
    when ``params['dumpjson']`` is set, writes the collected results as JSON.

    Returns early (after printing "FIX THIS") when no evaluator checkpoint
    is given.
    """
    # Create vocabulary and author index
    saved_model = torch.load(params['genmodel'])
    cp_params = saved_model['arch']
    if params['evalmodel']:
        eval_model = torch.load(params['evalmodel'])
        eval_params = eval_model['arch']
        eval_state = eval_model['state_dict']
    else:
        print "FIX THIS"
        return

    # The lookup tables live either under 'misc' or at the top level of the
    # checkpoint; top-level checkpoints may lack 'ix_to_auth', in which case
    # it is derived by inverting auth_to_ix.
    if 'misc' in saved_model:
        misc = saved_model['misc']
        char_to_ix = misc['char_to_ix']
        auth_to_ix = misc['auth_to_ix']
        ix_to_char = misc['ix_to_char']
        ix_to_auth = misc['ix_to_auth']
    else:
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_char = saved_model['ix_to_char']
        if 'ix_to_auth' in saved_model:
            ix_to_auth = saved_model['ix_to_auth']
        else:
            ix_to_auth = {auth_to_ix[a]:a for a in auth_to_ix}

    dp = DataProvider(cp_params)
    if params['softmax_scale']:
        cp_params['softmax_scale'] = params['softmax_scale']

    modelGen = CharTranslator(cp_params)
    modelEval = CharLstm(eval_params)

    startc = dp.data['configs']['start']
    endc = dp.data['configs']['end']

    modelGen.eval()
    modelEval.eval()

    # Restore saved checkpoint
    modelGen.load_state_dict(saved_model['state_dict'])
    # Overlay the saved evaluator entries on the freshly built state.
    state = modelEval.state_dict()
    state.update(eval_state)
    modelEval.load_state_dict(state)

    # 1x1 tensor holding the start symbol.
    # NOTE(review): np.int is an alias that newer NumPy releases removed.
    append_tensor = np.zeros((1, 1), dtype=np.int)
    append_tensor[0, 0] = char_to_ix[startc]
    append_tensor = torch.LongTensor(append_tensor).cuda()

    # Per-author accumulators, indexed by author id. accum_diff_eval has
    # exactly two slots, and the summaries below read indices 0 and 1 only.
    accum_diff_eval = [[],[]]
    accum_err_eval = np.zeros(len(auth_to_ix))
    accum_err_real = np.zeros(len(auth_to_ix))
    accum_count_gen = np.zeros(len(auth_to_ix))
    accum_recall_forward = np.zeros(len(auth_to_ix))
    accum_prec_forward = np.zeros(len(auth_to_ix))
    accum_recall_rev = np.zeros(len(auth_to_ix))
    accum_prec_rev = np.zeros(len(auth_to_ix))

    jc = '' if cp_params.get('atoms','char') == 'char' else ' '
    result = {'docs':[], 'misc':{'auth_to_ix':auth_to_ix, 'ix_to_auth':ix_to_auth}, 'cp_params':cp_params, 'params': params}
    # Maps a document id of the split to its position in result['docs'].
    id_to_ix = {}
    for i,iid in enumerate(dp.splits[params['split']]):
        result['docs'].append({'sents':[], 'author':dp.data['docs'][iid][dp.athstr], 'id':iid})
        if 'attrib' in dp.data['docs'][iid]:
            result['docs'][-1]['attrib'] = dp.data['docs'][iid]['attrib']
        id_to_ix[iid] = i

    n_samp = params['n_samples']
    for i, b_data in tqdm(enumerate(dp.iter_sentences_bylen(split=params['split'], atoms=cp_params.get('atoms','word'), batch_size = params['batch_size'], auths = auth_to_ix.keys()))):
        # A positive num_batches limits the run; the test is `i > num_batches`.
        if i > params['num_batches'] and params['num_batches']>0:
            break;
        #for i in xrange(params['num_batches']):
        #c_aid = np.random.choice(auth_to_ix.values())
        #batch = dp.get_sentence_batch(1,split=params['split'], atoms=cp_params.get('atoms','char'), aid=ix_to_auth[c_aid])
        c_bsz = len(b_data[0])
        done = b_data[1]
        inps, targs, auths, lens = dp.prepare_data(b_data[0], char_to_ix, auth_to_ix, maxlen=cp_params['max_seq_len'])
        # outs are organized as
        auths_inp = 1 - auths if params['flip'] else auths
        outs = adv_forward_pass(modelGen, modelEval, inps, lens, end_c=char_to_ix[endc], maxlen=cp_params['max_seq_len'], auths=auths_inp, cycle_compute=params['show_rev'], append_symb=append_tensor, n_samples=params['n_samples'])
        # Evaluator output on the unmodified target text.
        eval_out_gt = modelEval.forward_classify(targs, lens=lens, compute_softmax=True)
        auths_inp = auths_inp.numpy()
        i_bsz = np.arange(c_bsz)
        # Evaluator score for the author index in auths_inp, per batch item.
        real_aid_out = eval_out_gt[0].data.cpu().numpy()[i_bsz, auths_inp]

        # outs[0], outs[1] and outs[2] are reshaped to (n_samp, c_bsz, ...).
        gen_scores = outs[0].view(n_samp,c_bsz,-1)
        gen_aid_out = gen_scores.cpu().numpy()[:,i_bsz, auths_inp]
        gen_char = [v.view(n_samp,c_bsz) for v in outs[1]]
        gen_lens = outs[2].view(n_samp,c_bsz)
        # Counts use a 0.5 threshold; only sample 0 of the generated scores
        # enters the error count.
        np.add.at(accum_err_eval, auths_inp, gen_aid_out[0,:] >=0.5)
        np.add.at(accum_err_real, auths_inp, real_aid_out >=0.5)
        np.add.at(accum_count_gen,auths_inp,1)
        for b in xrange(inps.size()[1]):
            inpset = set(inps[:,b].tolist()[:lens[b]]) ; samples = []
            accum_diff_eval[auths_inp[b]].append(gen_aid_out[0,b] - real_aid_out[b])
            for si in xrange(n_samp):
                # Set overlap between generated and input symbols.
                genset = set([c[si, b] for c in gen_char[:gen_lens[si,b]]]);
                accum_recall_forward[auths_inp[b]] += (float(len(genset & inpset)) / float(len(inpset)))
                accum_prec_forward[auths_inp[b]] += (float(len(genset & inpset)) / float(len(genset)))

                if params['show_rev']:
                    revgenset = set([c[b] for c in outs[-2][:outs[-1][b]] ])
                    accum_recall_rev[auths_inp[b]] += (float(len(revgenset & inpset)) / float(len(inpset)))
                    accum_prec_rev[auths_inp[b]] += (float(len(revgenset & inpset)) / float(len(revgenset)))

                inp_text = jc.join([ix_to_char[c] for c in targs[:,b] if c in ix_to_char])
                trans_text = jc.join([ix_to_char[c.cpu()[si,b]] for c in gen_char[:gen_lens[si,b]] if c.cpu()[si,b] in ix_to_char])
                samples.append({'sent':inp_text,'score':eval_out_gt[0][b].data.cpu().tolist(), 'trans': trans_text, 'trans_score':gen_scores[si,b].cpu().tolist(),'sid':b_data[0][b]['sid']})
            # One list of n_samp samples is stored per sentence.
            result['docs'][id_to_ix[b_data[0][b]['id']]]['sents'].append(samples)

        if params['print']:
            # Only the first item of the batch is printed.
            print '--------------------------------------------'
            print 'Author: %s'%(b_data[0][0]['author'])
            print 'Inp text %s: %s (%.2f)'%(ix_to_auth[auths[0]], jc.join([ix_to_char[c[0]] for c in inps[1:]]), real_aid_out[0])
            print 'Out text %s: %s (%.2f)'%(ix_to_auth[auths_inp[0]],jc.join([ix_to_char[c.cpu()[0]] for c in outs[1] if c.cpu()[0] in ix_to_char]), gen_aid_out[0])
            if params['show_rev']:
                print 'Rev text %s: '%(ix_to_auth[auths[0]])+ '%s'%(jc.join([ix_to_char[c.cpu()[0]] for c in outs[-2] if c.cpu()[0] in ix_to_char]))
        #else:
        #    print '%d/%d\r'%(i, params['num_batches']),

    # The 1e-5 term avoids division by zero for authors without samples.
    err_a1, err_a2 = accum_err_eval[0]/(1e-5+accum_count_gen[0]), accum_err_eval[1]/(1e-5+accum_count_gen[1])
    err_real_a1, err_real_a2 = accum_err_real[0]/(1e-5+accum_count_gen[0]), accum_err_real[1]/(1e-5+accum_count_gen[1])
    print '--------------------------------------------'
    print 'Efficiency in fooling discriminator'
    print '--------------------------------------------'
    print(' erra1 {:3.2f} - erra2 {:3.2f}'.format(100.*err_a1, 100.*err_a2))
    print(' err_real_a1 {:3.2f} - err_real_a2 {:3.2f}'.format(100.*err_real_a1, 100.*err_real_a2))
    print(' count %d - %d'%(accum_count_gen[0], accum_count_gen[1]))

    diff_arr0, diff_arr1 = np.array(accum_diff_eval[0]), np.array(accum_diff_eval[1])
    print 'Mean difference : translation to %s = %.2f , translation to %s = %.2f '%(ix_to_auth[0], diff_arr0.mean(), ix_to_auth[1], diff_arr1.mean())
    print 'Difference > 0 : translation to %s = %.2f%%, translation to %s = %.2f%% '%(ix_to_auth[0], 100.*(diff_arr0>0).sum()/(1e-5+diff_arr0.shape[0]), ix_to_auth[1], 100.*(diff_arr1>0).sum()/(1e-5+diff_arr1.shape[0]))
    print 'Difference < 0 : translation to %s = %.2f%%, translation to %s = %.2f%% '%(ix_to_auth[0], 100.*(diff_arr0<0).sum()/(1e-5+diff_arr0.shape[0]), ix_to_auth[1], 100.*(diff_arr1<0).sum()/(1e-5+diff_arr1.shape[0]))

    print '\n--------------------------------------------'
    print 'Consistencey with the input text'
    print '--------------------------------------------'
    print 'Generated text A0- Precision = %.2f, Recall = %.2f'%(accum_prec_forward[0]/accum_count_gen[0], accum_recall_forward[0]/accum_count_gen[0] )
    print 'Generated text A1- Precision = %.2f, Recall = %.2f'%(accum_prec_forward[1]/accum_count_gen[1], accum_recall_forward[1]/accum_count_gen[1] )
    if params['show_rev']:
        print '\n'
        print 'Reconstr text A0- Precision = %.2f, Recall = %.2f'%(accum_prec_rev[0]/accum_count_gen[0], accum_recall_rev[0]/accum_count_gen[0] )
        print 'Reconstr text A1- Precision = %.2f, Recall = %.2f'%(accum_prec_rev[1]/accum_count_gen[1], accum_recall_rev[1]/accum_count_gen[1] )

    print '\n--------------------------------------------'
    print 'Document Level Scores'
    print '--------------------------------------------'
    doc_accuracy = np.zeros(len(auth_to_ix))
    doc_accuracy_trans = np.zeros(len(auth_to_ix))
    doc_count = np.zeros(len(auth_to_ix))
    for doc in result['docs']:
        # Sum the log scores of the first sample of every sentence; the
        # document is assigned to the author with the larger total.
        doc_score_orig = np.array([0.,0.])
        doc_score_trans = np.array([0.,0.])
        for st in doc['sents']:
            doc_score_orig += np.log(st[0]['score'])
            doc_score_trans += np.log(st[0]['trans_score'])
        doc_accuracy[auth_to_ix[doc['author']]] += float(doc_score_orig.argmax() == auth_to_ix[doc['author']])
        doc_accuracy_trans[auth_to_ix[doc['author']]] += float(doc_score_trans.argmax() == auth_to_ix[doc['author']])
        doc_count[auth_to_ix[doc['author']]] += 1.

    print 'Original data'
    print '-------------'
    print 'Doc accuracy is %s : %.2f , %s : %.2f'%(ix_to_auth[0], (doc_accuracy[0]/doc_count[0]),ix_to_auth[1], (doc_accuracy[1]/doc_count[1]) )
    # Precision/recall are computed with author index 0 as the positive class.
    fp = doc_count[1]- doc_accuracy[1]
    recall = doc_accuracy[0]/doc_count[0]
    precision = doc_accuracy[0]/(doc_accuracy[0]+fp)
    f1score = 2.*(precision*recall)/(precision+recall)
    print 'Precision is %.2f : Recall is %.2f , F1-score is %.2f'%(precision, recall, f1score)
    print '\nTranslated data'
    print '-----------------'
    print 'Doc accuracy is %s : %.2f , %s : %.2f'%(ix_to_auth[0], (doc_accuracy_trans[0]/doc_count[0]),ix_to_auth[1], (doc_accuracy_trans[1]/doc_count[1]) )
    fp = doc_count[1]- doc_accuracy_trans[1]
    recall = doc_accuracy_trans[0]/doc_count[0]
    precision = doc_accuracy_trans[0]/(doc_accuracy_trans[0]+fp)
    f1score = 2.*(precision*recall)/(precision+recall)
    print 'Precision is %.2f : Recall is %.2f , F1-score is %.2f'%(precision, recall, f1score)

    if params['dumpjson']:
        json.dump(result, open(params['dumpjson'],'w'))
def _plot_percentages(ts, type_to_percents, title, ylabel, path):
    """Scatter one series per image type against ``ts``, then save and show it.

    ``type_to_percents`` maps each ``ImageType`` member to a list with one
    percentage per entry of ``ts``.
    """
    # Use an explicit figure so consecutive plots never share axes, even on a
    # non-interactive backend where plt.show() does not discard the figure.
    figure = plt.figure()
    handles = []
    for image_type in ImageType:
        handle, = plt.plot(ts, type_to_percents[image_type], 'o', label=image_type.value)
        handles.append(handle)
    plt.title(title)
    # A single positional argument to legend() is interpreted as *labels*;
    # the artists have to be passed through the ``handles`` keyword.
    plt.legend(handles=handles)
    plt.xlabel('t')
    plt.ylabel(ylabel)
    plt.savefig(path)
    plt.show()
    plt.close(figure)


def main():
    """Measure and plot how often shredded images contain look-alike patches.

    For every resize setting, image type and shred factor ``t`` this calls
    ``get_number_of_images_with_same_patches_and_number_of_same_patches``,
    prints the raw counts, and plots the resulting percentages into the
    ``plots`` directory.
    """
    data_provider = DataProvider()
    directory = 'plots'
    os.makedirs(directory, exist_ok=True)
    ts = 2, 4, 5

    for width_height in (None, (224, 224), (2200 // 5, 2200 // 5)):
        print(width_height)
        type_to_bad_pictures_percent = defaultdict(list)
        type_to_bad_patches_pairs_percent = defaultdict(list)

        for image_type in ImageType:
            print(image_type)
            for t in ts:
                print(t)
                number_of_pictures_with_same_patches, number_of_patches_with_similar_in_same_picture, \
                    total_number_of_pictures, total_number_of_patches = \
                    get_number_of_images_with_same_patches_and_number_of_same_patches(
                        data_provider, image_type, t, width_height
                    )
                print('{} shredded to {} patches and resized to {} has {}/{} bad pictures and {}/{} patches'.format(
                    image_type, t, width_height,
                    number_of_pictures_with_same_patches, total_number_of_pictures,
                    number_of_patches_with_similar_in_same_picture, total_number_of_patches
                ))
                type_to_bad_pictures_percent[image_type].append(
                    number_of_pictures_with_same_patches * 100.0 / total_number_of_pictures)
                type_to_bad_patches_pairs_percent[image_type].append(
                    number_of_patches_with_similar_in_same_picture * 100.0 / total_number_of_patches)

        # NOTE(review): the file names do not include ``width_height``, so each
        # resize setting overwrites the plots written by the previous one.
        _plot_percentages(ts, type_to_bad_pictures_percent,
                          'Percent of bad images as function of t', '% of images',
                          os.path.join(directory, 'bad_images.png'))
        _plot_percentages(ts, type_to_bad_patches_pairs_percent,
                          'Percent of bad crops as function of t', '% of pairs',
                          os.path.join(directory, 'bad_crops.png'))
labels = np.concatenate((np.ones(len(fish)), np.zeros(len(docs)))).astype(int).tolist() x_train, x_test, y_train, y_test = \ train_test_split(images, labels, train_size=0.8, random_state=42, stratify=labels) return (x_train, np.stack(y_train, axis=0)), (x_test, np.stack(y_test, axis=0)) def is_fish(self, x): x = resize_to(x, self._input_shape) x = x / 255 res = self._model.predict(x) > 0.5 return res def is_doc(self, image): return not self.is_fish(image) if __name__ == '__main__': weights = os.path.join(os.path.dirname(__file__), 'saved_weights') epochs = 3 if 'debug' in sys.argv else 300 if 'fit' in sys.argv: # This will fit the classifier clf = FishOrDocClassifier(DataProvider()) print('Fitting {}'.format(clf.__class__.__name__)) os.makedirs(weights, exist_ok=True) model, history = clf.fit(weights, epochs=epochs) PickleHelper.dump(history.history, os.path.join(visualization_root, 'history.pkl')) if 'visualize' in sys.argv: print('Visualizing history') history = PickleHelper.load(os.path.join(visualization_root, 'history.pkl')) HistoryVisualizer.visualize(history['acc'], history['val_acc'], 'accuracy', visualization_root) HistoryVisualizer.visualize(history['loss'], history['val_loss'], 'loss', visualization_root)
def main(params):
    """Train the model returned by ``get_classifier(params)``.

    ``params`` is the run configuration dict. It is mutated:
    ``vocabulary_size`` and ``num_output_layers`` are filled in from the
    vocabulary and author index before the model is built. When
    ``params['resume']`` is set, the vocabulary, author index and model
    weights are restored from that checkpoint.

    ``params['mode'] == 'generative'`` trains on packed next-token targets;
    any other mode trains the author classifier (optionally adding an L1
    penalty on ``model.compression_W`` when ``compression_layer`` is set).
    A checkpoint is written at a logging step whenever the latest validation
    rank is at least as high as the best one seen so far.
    """
    dp = DataProvider(params)

    # Create vocabulary and author index
    if params['resume'] is None:
        if params['atoms'] == 'char':
            char_to_ix, ix_to_char = dp.create_char_vocab(params['vocab_threshold'])
        else:
            char_to_ix, ix_to_char = dp.create_word_vocab(params['vocab_threshold'])
        auth_to_ix, ix_to_auth = dp.create_author_idx()
    else:
        saved_model = torch.load(params['resume'])
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_char = saved_model['ix_to_char']

    params['vocabulary_size'] = len(char_to_ix)
    params['num_output_layers'] = len(auth_to_ix)
    print('%s %s' % (params['vocabulary_size'], params['num_output_layers']))

    model = get_classifier(params)
    # set to train mode, this activates dropout
    model.train()

    # Initialize the optimizer: SGD with momentum, or RMSprop with a separate
    # parameter group (no weight decay) for decoder_W.
    if params['use_sgd']:
        optim = torch.optim.SGD(model.parameters(), lr=params['learning_rate'],
                                momentum=params['decay_rate'])
    else:
        optim = torch.optim.RMSprop([{
            'params': [p[1] for p in model.named_parameters() if p[0] != 'decoder_W']
        }, {
            'params': model.decoder_W,
            'weight_decay': 0.000
        }], lr=params['learning_rate'], alpha=params['decay_rate'], eps=params['smooth_eps'])

    # Loss function, optionally with per-class weights
    if len(params['balance_loss']) == 0:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.CrossEntropyLoss(torch.FloatTensor(params['balance_loss']).cuda())

    # Restore saved checkpoint
    if params['resume'] is not None:
        model.load_state_dict(saved_model['state_dict'])
        # optim.load_state_dict(saved_model['optimizer'])

    total_loss = 0.
    class_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(params['batch_size'])
    hidden_zeros = model.init_hidden(params['batch_size'])

    # Initialize the cache
    if params['randomize_batches']:
        dp.set_hid_cache(range(len(dp.data['docs'])), hidden_zeros)

    # Compute the iteration parameters
    epochs = params['max_epochs']
    total_seqs = dp.get_num_sents(split='train')
    iter_per_epoch = total_seqs // params['batch_size']
    total_iters = iter_per_epoch * epochs
    best_loss = 0.
    # Clamp to 1 so that a small eval_interval cannot make the modulo below
    # divide by zero.
    eval_every = max(1, int(iter_per_epoch * params['eval_interval']))
    val_score = 0.
    val_rank = 0

    eval_function = eval_model if params['mode'] == 'generative' else eval_classify
    leakage = params['leakage']

    for i in xrange(total_iters):
        # TODO
        if params['randomize_batches']:
            batch, reset_next = dp.get_rand_doc_batch(params['batch_size'], split='train')
            b_ids = [b['id'] for b in batch]
            hidden = dp.get_hid_cache(b_ids, hidden)
        elif params['use_sentences']:
            c_aid = None  # ix_to_auth[np.random.choice(auth_to_ix.values())]
            batch = dp.get_sentence_batch(
                params['batch_size'], split='train', aid=c_aid,
                atoms=params['atoms'], sample_by_len=params['sample_by_len'])
            hidden = hidden_zeros
        else:
            batch, reset_h = dp.get_doc_batch(split='train')
            # Reset the hidden states for which new docs have been sampled
            if len(reset_h) > 0:
                hidden[0].data.index_fill_(1, torch.LongTensor(reset_h).cuda(), 0.)
                hidden[1].data.index_fill_(1, torch.LongTensor(reset_h).cuda(), 0.)

        inps, targs, auths, lens = dp.prepare_data(batch, char_to_ix, auth_to_ix,
                                                   leakage=leakage)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()

        # TODO
        if params['mode'] == 'generative':
            output, hidden = model.forward(inps, lens, hidden, auths)
            targets = pack_padded_sequence(Variable(targs).cuda(), lens)
            loss = criterion(pack_padded_sequence(output, lens)[0], targets[0])
            # There is no separate regulariser here, so the logged
            # "class" loss is the loss itself.
            lossClass = loss
        else:
            # for classifier auths is the target
            output, _ = model.forward_classify(targs, hidden, compute_softmax=False,
                                               lens=lens)
            targets = Variable(auths).cuda()
            lossClass = criterion(output, targets)
            if params['compression_layer']:
                loss = lossClass + (model.compression_W.weight.norm(p=1, dim=1)).mean()
            else:
                loss = lossClass
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), params['grad_clip'])

        # Take an optimization step
        optim.step()

        total_loss += loss.data.cpu().numpy()[0]
        class_loss += lossClass.data.cpu().numpy()[0]

        # Save the hidden states in cache for later use
        if params['randomize_batches']:
            if len(reset_next) > 0:
                hidden[0].data.index_fill_(1, torch.LongTensor(reset_next).cuda(), 0.)
                hidden[1].data.index_fill_(1, torch.LongTensor(reset_next).cuda(), 0.)
            dp.set_hid_cache(b_ids, hidden)

        if i % eval_every == 0 and i > 0:
            val_rank, val_score = eval_function(dp, model, params, char_to_ix, auth_to_ix,
                                                split='val', max_docs=params['num_eval'])

        # Decay the leakage once per epoch until it reaches leakage_min
        if i % iter_per_epoch == 0 and i > 0 and leakage > params['leakage_min']:
            leakage = leakage * params['leakage_decay']

        if i % params['log_interval'] == 0 and i > 0:
            cur_loss = total_loss / params['log_interval']
            class_loss = class_loss / params['log_interval']
            elapsed = time.time() - start_time
            # Read the interval from params; the function has no `args` local.
            print(
                '| epoch {:3.2f} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    float(i) / iter_per_epoch, i, total_iters, params['learning_rate'],
                    elapsed * 1000 / params['log_interval'], cur_loss, math.exp(class_loss)))

            if val_rank >= best_loss:
                best_loss = val_rank
                save_checkpoint(
                    {
                        'iter': i,
                        'arch': params,
                        'val_mean_rank': val_rank,
                        'val_auc': val_score,
                        'char_to_ix': char_to_ix,
                        'ix_to_char': ix_to_char,
                        'auth_to_ix': auth_to_ix,
                        'state_dict': model.state_dict(),
                        'loss': cur_loss,
                        'optimizer': optim.state_dict(),
                    },
                    fappend=params['fappend'],
                    outdir=params['checkpoint_output_directory'])

            start_time = time.time()
            total_loss = 0.
            class_loss = 0.
def main(params):
    """Train a ``CharTranslator`` on sentence batches.

    ``params`` is the run configuration dict. It is mutated:
    ``vocabulary_size`` and ``num_output_layers`` are filled in before the
    model is built. When ``params['resume']`` is set, the vocabularies, model
    weights and optimizer state are restored from that checkpoint.

    ``params['mode'] == 'generative'`` trains with ``forward_mltrain`` and a
    cross-entropy loss over packed targets; any other mode trains
    ``forward_classify`` with an NLL loss. A checkpoint is written at a
    logging step whenever the latest validation value is no larger than the
    best one saved so far.
    """
    dp = DataProvider(params)

    # Create vocabulary and author index
    if params['resume'] is None:
        if params['atoms'] == 'char':
            char_to_ix, ix_to_char = dp.createCharVocab(params['vocab_threshold'])
        else:
            char_to_ix, ix_to_char = dp.createWordVocab(params['vocab_threshold'])
        auth_to_ix, ix_to_auth = dp.createAuthorIdx()
    else:
        saved_model = torch.load(params['resume'])
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_auth = saved_model['ix_to_auth']
        ix_to_char = saved_model['ix_to_char']

    params['vocabulary_size'] = len(char_to_ix)
    params['num_output_layers'] = len(auth_to_ix)

    model = CharTranslator(params)
    # set to train mode, this activates dropout
    model.train()

    # Initialize the optimizer (SGD with momentum, or RMSprop)
    if params['use_sgd']:
        optim = torch.optim.SGD(model.parameters(), lr=params['learning_rate'],
                                momentum=params['decay_rate'])
    else:
        optim = torch.optim.RMSprop(model.parameters(), lr=params['learning_rate'],
                                    alpha=params['decay_rate'], eps=params['smooth_eps'])

    # Loss function
    if params['mode'] == 'generative':
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.NLLLoss()

    # Restore saved checkpoint
    if params['resume'] is not None:
        model.load_state_dict(saved_model['state_dict'])
        optim.load_state_dict(saved_model['optimizer'])

    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(params['batch_size'])
    hidden_zeros = model.init_hidden(params['batch_size'])

    # Initialize the cache
    if params['randomize_batches']:
        dp.set_hid_cache(range(len(dp.data['docs'])), hidden_zeros)

    # Compute the iteration parameters
    epochs = params['max_epochs']
    total_seqs = dp.get_num_sents(split='train')
    iter_per_epoch = total_seqs // params['batch_size']
    total_iters = iter_per_epoch * epochs
    best_val = 1000.
    # Clamp to 1 so that a small eval_interval cannot make the modulo below
    # divide by zero.
    eval_every = max(1, int(iter_per_epoch * params['eval_interval']))
    val_score = 0.
    val_rank = 1000

    eval_function = eval_translator if params['mode'] == 'generative' else eval_classify

    print(total_iters)
    for i in xrange(total_iters):
        # TODO
        # Optionally restrict the batch to one randomly chosen author
        if params['split_generators']:
            c_aid = ix_to_auth[np.random.choice(auth_to_ix.values())]
        else:
            c_aid = None

        batch = dp.get_sentence_batch(params['batch_size'], split='train',
                                      atoms=params['atoms'], aid=c_aid,
                                      sample_by_len=params['sample_by_len'])
        inps, targs, auths, lens = dp.prepare_data(batch, char_to_ix, auth_to_ix,
                                                   maxlen=params['max_seq_len'])

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()

        # TODO
        if params['mode'] == 'generative':
            output, _ = model.forward_mltrain(inps, lens, inps, lens, hidden_zeros,
                                              auths=auths)
            targets = pack_padded_sequence(Variable(targs).cuda(), lens)
            loss = criterion(pack_padded_sequence(output, lens)[0], targets[0])
        else:
            # for classifier auths is the target
            output, hidden = model.forward_classify(inps, hidden, compute_softmax=True)
            targets = Variable(auths).cuda()
            loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), params['grad_clip'])

        # Take an optimization step
        optim.step()

        total_loss += loss.data.cpu().numpy()[0]

        if i % eval_every == 0 and i > 0:
            val_rank, val_score = eval_function(dp, model, params, char_to_ix, auth_to_ix,
                                                split='val')

        if i % params['log_interval'] == 0 and i > 0:
            cur_loss = total_loss / params['log_interval']
            elapsed = time.time() - start_time
            # Read the interval from params; the function has no `args` local.
            print(
                '| epoch {:2.2f} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    float(i) / iter_per_epoch, i, total_iters, params['learning_rate'],
                    elapsed * 1000 / params['log_interval'], cur_loss, math.exp(cur_loss)))
            total_loss = 0.

            if val_rank <= best_val:
                save_checkpoint(
                    {
                        'iter': i,
                        'arch': params,
                        'val_loss': val_rank,
                        'val_pplx': val_score,
                        'char_to_ix': char_to_ix,
                        'ix_to_char': ix_to_char,
                        'auth_to_ix': auth_to_ix,
                        'ix_to_auth': ix_to_auth,
                        'state_dict': model.state_dict(),
                        'loss': cur_loss,
                        'optimizer': optim.state_dict(),
                    },
                    fappend=params['fappend'],
                    outdir=params['checkpoint_output_directory'])
                best_val = val_rank
            start_time = time.time()
def _true_label_ranks(conf, target):
    """Return the 0-based rank of each row's true class, best score first."""
    return np.where(conf.argsort(axis=1)[:, ::-1] == target[:, None])[1]


def _binary_eval(conf, target, n_auths):
    """Score each sample against its true author and one randomly drawn author.

    Returns ``(adjusted_scores, auc)``. The first half of ``adjusted_scores``
    belongs to the true authors and the second half to the random ones; each
    entry is the sample's position in the column-wise ordering of ``conf``,
    scaled to (0, 1].
    """
    n = conf.shape[0]
    neg_auths = np.random.randint(0, n_auths, n)
    columns = np.concatenate([target.astype(int), neg_auths])
    sample_ids = np.concatenate([np.arange(n), np.arange(n)])
    adjusted = ((np.argsort(conf[:, columns], axis=0) == sample_ids).argmax(axis=0) + 1) / float(n)
    labels = np.concatenate([np.ones(int(n), dtype=int), np.zeros(int(n), dtype=int)])
    return adjusted, roc_auc_score(labels, adjusted)


def _split_metrics(conf, target, n_auths, topk):
    """Collect accuracy, rank, top-k and binary-evaluation numbers for one split."""
    ranks = _true_label_ranks(conf, target)
    n = conf.shape[0]
    adjusted, auc = _binary_eval(conf, target, n_auths)
    return {
        'accuracy': 100. * float((conf.argmax(axis=1) == target).sum()) / len(target),
        'mean_rank': ranks.mean(),
        # Ranks are 0-based, so "within the top k" is rank < k.
        'topk': (ranks < topk).sum() * 100. / len(target),
        'adjusted_accuracy': 100. * ((adjusted[:n] >= 0.5).sum() + (adjusted[n:] < 0.5).sum()) / (2. * n),
        'auc': auc,
    }


def _print_metrics(header, metrics, n_auths, topk):
    """Print one split's metrics under ``header``."""
    print(header)
    print('Accuracy is %.2f, Mean rank is %.2f / %d' % (
        metrics['accuracy'], metrics['mean_rank'], n_auths))
    print('Top-%d Accuracy is %.2f' % (topk, metrics['topk']))
    print('Accuracy per adjusted scores %.3f' % (metrics['adjusted_accuracy']))
    print('AUC is %.2f' % (metrics['auc']))


def main(params):
    """Train and evaluate a bag-of-words author classifier.

    Documents from ``DataProvider(params)`` are lower-cased, stripped of
    digits, tokenised and filtered; the vocabulary is built from the train
    split only. Features are optionally reduced with PCA and normalised using
    train statistics. The classifier is a ``LinearSVC`` (``params['linearsvm']``)
    or an ``MLP_classifier`` (``params['mlp']``). Metrics for the train and
    val splits are printed.

    ``params`` is mutated: ``pca`` is set to the final feature dimension, and
    for the MLP ``num_output_layers`` and ``inp_size`` are filled in.

    Raises:
        ValueError: if neither ``params['mlp']`` nor ``params['linearsvm']``
            selects a classifier.
    """
    dp = DataProvider(params)
    # NOTE(review): elsewhere createAuthorIdx() is unpacked as
    # (auth_to_ix, ix_to_auth); confirm this DataProvider returns the mapping only.
    auth_to_ix = dp.createAuthorIdx()

    # Preprocess the training data.
    # Tokens to drop: digits, plus optionally stop words and punctuation.
    # Digits are kept as strings so they are comparable with tokens; the
    # regex below already strips them from the text.
    bad_hombres = [str(d) for d in range(10)]
    if params['nostop']:
        bad_hombres = bad_hombres + stopwords.words('english')
    if params['nopunct']:
        bad_hombres = bad_hombres + list(string.punctuation)
    bad_hombres = set(bad_hombres)

    all_words = Counter()
    for i, doc in enumerate(dp.data['docs']):
        no_num = re.sub(r'\d+', '', doc['text'].lower())
        curr_text = [w for w in wordpunct_tokenize(no_num) if w not in bad_hombres]
        dp.data['docs'][i]['tokenized'] = curr_text
        # Only the train split contributes to the vocabulary counts
        if doc['split'] == 'train':
            all_words.update(curr_text)

    short_vocab = {
        w: i
        for i, w in enumerate(
            [wrd for wrd in all_words if all_words[wrd] > params['vocab_threshold']])
    }

    docCounts_train, target_train = count(dp, short_vocab, auth_to_ix, split='train')
    bow_features_train, idf_train = bow_features(docCounts_train, params['tfidf'])

    docCounts_val, target_val = count(dp, short_vocab, auth_to_ix, split='val')
    bow_features_val, _ = bow_features(docCounts_val, params['tfidf'], idf=idf_train)

    # Do PCA?
    if params['pca'] > 0:
        pca_model = PCA(n_components=params['pca'])
        bow_features_train = pca_model.fit_transform(bow_features_train)
        print('Explained variance is %.2f' % (sum(pca_model.explained_variance_ratio_)))
        bow_features_val = pca_model.transform(bow_features_val)
    # Record the feature dimension actually in use (it becomes the MLP input size)
    params['pca'] = bow_features_train.shape[-1]

    # Normalize the data with statistics from the train split
    bow_features_train, mean_tr, std_tr = normalize(bow_features_train)
    bow_features_val, _, _ = normalize(bow_features_val, mean_tr, std_tr)

    if params['mlp'] == False:
        if not params['linearsvm']:
            raise ValueError(
                "No classifier selected: set params['mlp'] or params['linearsvm']")
        # Linear SVC already implements one-vs-rest
        svm_model = LinearSVC()
        svm_model.fit(bow_features_train, target_train)
        # Time to evaluate now.
        confTr = svm_model.decision_function(bow_features_train)
        confVal = svm_model.decision_function(bow_features_val)
    else:
        params['num_output_layers'] = len(auth_to_ix)
        params['inp_size'] = params['pca']
        model = MLP_classifier(params)
        model.fit(bow_features_train, target_train, bow_features_val, target_val,
                  params['epochs'], params['lr'], params['l2'])
        confTr = model.decision_function(bow_features_train)
        confVal = model.decision_function(bow_features_val)

    # Do the binary evaluation similar to Bagnall, alongside the ranking metrics.
    # Train is evaluated before val so the random negative authors are drawn
    # in a fixed order.
    n_auths = len(auth_to_ix)
    train_metrics = _split_metrics(confTr, target_train, n_auths, params['topk'])
    val_metrics = _split_metrics(confVal, target_val, n_auths, params['topk'])

    _print_metrics('------------- Training set-------------------',
                   train_metrics, n_auths, params['topk'])
    _print_metrics('------------- Val set-------------------',
                   val_metrics, n_auths, params['topk'])

    print('--------------------------------------------------------------------------')
    print('--------------------------------------------------------------------------\n\n')
def main():
    """Evaluate ``SolverPairwiseMerge`` on train and validation images.

    Command-line words select the run: ``debug`` (20 samples, 1 epoch instead
    of all samples, 5 epochs), ``2``/``4``/``5`` (shred factors) and
    ``image``/``document`` (image types). Unselected options fall back to all
    shred factors and all image types.
    """
    argv = sys.argv

    debug_run = 'debug' in argv
    print('Debug' if debug_run else 'Release')
    number_of_samples = 20 if debug_run else sys.maxsize
    epochs = 1 if debug_run else 5

    # NOTE(review): '5' replaces any other requested factor instead of being
    # added to it; kept as in the original.
    if '5' in argv:
        ts = [5, ]
    else:
        ts = [t for t in (2, 4) if str(t) in argv]
    if not ts:
        ts = (2, 4, 5)

    flag_to_type = (('image', ImageType.IMAGES), ('document', ImageType.DOCUMENTS))
    image_types = [kind for flag, kind in flag_to_type if flag in argv]
    if not image_types:
        image_types = ImageType

    np.random.seed(42)

    for image_type in image_types:
        print(image_type.value)

        provider = DataProvider()
        if image_type == ImageType.IMAGES:
            get_images = provider.get_fish_images
        else:
            get_images = provider.get_docs_images
        mean = IMAGE_TYPE_TO_MEAN[image_type]
        std = IMAGE_TYPE_TO_STD[image_type]

        images, names = get_images(num_samples=number_of_samples, return_names=True)
        images_train, images_validation, names_train, names_validation = train_test_split(
            images, names, random_state=42)

        # One comparator per shred factor, loaded from its registered weights
        t_to_comparator = {}
        for t in ts:
            weight_entry = IMAGE_TYPE_TO_T_TO_COMPARATOR_CNN_WEIGHT_FILE_ID_AND_FILE_PATH[image_type][t]
            comparator = ComparatorCNN(t, weight_entry.width, weight_entry.height,
                                       image_type, mean=mean, std=std)
            t_to_comparator[t] = comparator.load_weights(weight_entry.model_path)

        # Greedy solvers are handed to the pairwise-merge solver as backups
        t_to_backup_solver = {}
        for t in ts:
            t_to_backup_solver[t] = SolverGreedy(t_to_comparator, image_type=image_type)

        clf = SolverPairwiseMerge(t_to_comparator,
                                  t_to_backup_solver=t_to_backup_solver,
                                  image_type=image_type)

        for label, split_names, split_images in (
                ('Train', names_train, images_train),
                ('Validation', names_validation, images_validation)):
            print('{}: '.format(label), split_names)
            accuracy = clf.evaluate(split_images, epochs=epochs, ts=ts)
            print('{} 0-1 accuracy on {}: {}'.format(label, image_type.value, accuracy))
def main():
    """Evaluate ``SolverGreedy`` on train and validation images.

    Command-line words select the run: ``debug`` (20 samples instead of all),
    ``2``/``4``/``5`` (shred factors), ``image``/``document`` (image types)
    and ``version N`` (solver preset, default 2). Unselected options fall
    back to all shred factors and all image types.
    """
    argv = sys.argv

    debug_run = 'debug' in argv
    print('Debug' if debug_run else 'Release')
    number_of_samples = 20 if debug_run else sys.maxsize
    epochs = 1  # identical for debug and release runs

    # NOTE(review): '5' replaces any other requested factor instead of being
    # added to it; kept as in the original.
    if '5' in argv:
        ts = [5, ]
    else:
        ts = [t for t in (2, 4) if str(t) in argv]
    if not ts:
        ts = (2, 4, 5)

    flag_to_type = (('image', ImageType.IMAGES), ('document', ImageType.DOCUMENTS))
    image_types = [kind for flag, kind in flag_to_type if flag in argv]

    version = int(argv[argv.index('version') + 1]) if 'version' in argv else 2

    # Preset per version:
    # (sweep values, iterate_first_shred, try row permutation, input side)
    only_false = (False, )
    both = (False, True)
    presets = {
        0: (only_false, False, False, 224),
        1: (only_false, True, False, 224),
        2: (both, True, False, 224),
    }
    # Any version other than 0, 1 or 2 uses the last preset
    sweep, iterate_first_shred, try_to_improve_with_row_permutation, side = presets.get(
        version, (both, True, True, 2200 // 5))
    iterate_on_bottom_values = sweep
    iterate_on_right_values = sweep
    column_then_row_values = sweep
    width = side
    height = side

    if not image_types:
        image_types = ImageType

    np.random.seed(42)

    for image_type in image_types:
        print(image_type.value)

        provider = DataProvider()
        if image_type == ImageType.IMAGES:
            get_images = provider.get_fish_images
            mean, std = 100.52933494138787, 65.69793156777682
        else:
            get_images = provider.get_docs_images
            mean, std = 241.46115784237548, 49.512839464023564

        images, names = get_images(num_samples=number_of_samples, return_names=True)
        images_train, images_validation, names_train, names_validation = train_test_split(
            images, names, random_state=42)

        # One comparator per shred factor, with its default weights loaded
        t_to_comparator = {}
        for t in ts:
            comparator = ComparatorCNN(t, width, height, image_type, mean=mean, std=std)
            t_to_comparator[t] = comparator.load_weights()

        clf = SolverGreedy(
            t_to_comparator,
            image_type=image_type,
            iterate_on_bottom_values=iterate_on_bottom_values,
            iterate_on_right_values=iterate_on_right_values,
            column_then_row_values=column_then_row_values,
            iterate_first_shred=iterate_first_shred,
            try_to_improve_with_row_permutation=try_to_improve_with_row_permutation)

        for label, split_names, split_images in (
                ('Train', names_train, images_train),
                ('Validation', names_validation, images_validation)):
            print('{}: '.format(label), split_names)
            accuracy = clf.evaluate(split_images, epochs=epochs, ts=ts)
            print('{} 0-1 accuracy on {}: {}'.format(label, image_type.value, accuracy))