def main():
    """Visualise comparator-CNN activations for the selected ``t`` values and image types.

    Tokens looked up in ``sys.argv``:
        large            -- use ``sys.maxsize`` samples instead of 20.
        2 / 4            -- add that value of ``t`` to the run.
        5                -- run with ``t == 5`` only (replaces any 2/4 choice).
        image / document -- restrict the run to that image type.

    With no ``t`` token all of (2, 4, 5) are used; with no image-type token
    every member of ``ImageType`` is used.
    """
    argv = sys.argv
    number_of_samples = sys.maxsize if 'large' in argv else 20

    # '5' overrides whatever was selected through '2' / '4'.
    if '5' in argv:
        ts = [5]
    else:
        ts = [candidate for candidate in (2, 4) if str(candidate) in argv]
    if not ts:
        ts = (2, 4, 5)

    image_types = []
    if 'image' in argv:
        image_types += [ImageType.IMAGES]
    if 'document' in argv:
        image_types += [ImageType.DOCUMENTS]
    if not image_types:
        image_types = ImageType

    np.random.seed(42)

    for t in ts:
        for image_type in image_types:
            print('t={}. image type is {}'.format(t, image_type.value))

            provider = DataProvider()
            get_images = (provider.get_fish_images
                          if image_type == ImageType.IMAGES
                          else provider.get_docs_images)

            images = get_images(num_samples=number_of_samples)
            # Only the training part of the split is visualised.
            images_train, _ = train_test_split(images, random_state=42)

            weights = IMAGE_TYPE_TO_T_TO_COMPARATOR_CNN_WEIGHT_FILE_ID_AND_FILE_PATH[image_type][t]
            clf = ComparatorCNN(t, weights.width, weights.height, image_type)
            clf = clf.load_weights(weights.model_path)

            cam = ComparatorActivationMap(clf)
            target_size = (clf.width, clf.height)
            for image in images_train:
                shreds = shred_and_resize_to([image], t, target_size)
                cam.visualize_activations(shreds)
def main():
    """Build the segmentation training pipeline and run it.

    Prints the data and model configs, creates the checkpoint and summary
    directories, then wires together the data provider, TensorFlow session,
    logger, model and trainer before calling ``trainer.train()``.

    The session is closed in a ``finally`` clause so that it is released even
    when loading the pretrained weights or training raises.
    """
    print('---------------------- data config ------------------------')
    pprint(data_cfg)

    print('---------------------- model config -------------------')
    pprint(model_cfg)

    print('creating dirs for saving model weights, logs ...')
    checkpoint_dir = os.path.join(
        model_cfg.checkpoint_dir, model_cfg.exp_name)
    create_dirs([checkpoint_dir, train_cfg.summary_dir])

    print('initializing train data provider....')
    det_data_provider = DataProvider(data_cfg)

    sess = tf.Session()
    try:
        print('creating tensorflow log for summaries...')
        tf_logger = TfLogger(sess, train_cfg)
        print('creating seg models ...')
        train_model = SegModel(model_cfg)
        if model_cfg.train_from_pretrained:
            train_model.load(sess)

        print('creating seg trainer...')
        trainer = SegTrainer(sess, train_model,
                             det_data_provider, train_cfg, tf_logger)

        print('start trainning...')
        trainer.train()
    finally:
        # Previously the session leaked whenever anything above raised.
        sess.close()
def main(params):
    """Load a saved checkpoint and run the matching evaluation routine on it.

    Args:
        params: mapping of options. Keys read here: 'model' (checkpoint
            path), 'm_type', 'split', 'num_eval' and 'dump_scores'.

    The evaluation routine is ``eval_translator`` for translator models,
    ``eval_model`` when the checkpoint's 'mode' is 'generative', and
    ``eval_classify`` otherwise. The score it returns is not returned from
    this function.
    """
    checkpoint = torch.load(params['model'])

    # Vocabulary / author lookup tables stored next to the weights.
    char_to_ix = checkpoint['char_to_ix']
    auth_to_ix = checkpoint['auth_to_ix']
    ix_to_char = checkpoint['ix_to_char']
    cp_params = checkpoint['arch']

    dp = DataProvider(cp_params)

    is_translator = params['m_type'] == 'translator'
    if is_translator:
        model = CharTranslator(cp_params)
    else:
        model = get_classifier(cp_params)
    # NOTE: the call to model.eval() is deliberately left disabled here, as in
    # the code this was written from.

    # Restore the trained weights.
    model.load_state_dict(checkpoint['state_dict'])

    if is_translator:
        eval_function = eval_translator
    elif cp_params['mode'] == 'generative':
        eval_function = eval_model
    else:
        eval_function = eval_classify

    eval_function(
        dp,
        model,
        cp_params,
        char_to_ix,
        auth_to_ix,
        split=params['split'],
        max_docs=params['num_eval'],
        dump_scores=params['dump_scores'],
    )
def main(params):
    """Draw random inputs, run them through the saved model and print the results.

    NOTE: this block is Python 2 code (print statements, ``xrange``).

    Args:
        params: mapping of options. Keys read here: 'model', 'softmax_scale',
            'm_type', 'num_samples', 'seed_length' (generative models only),
            'split', 'flip' and 'show_rev'.

    For every sample the input and the model output are printed in colour;
    when ``params['show_rev']`` is set, the last element of the forward-pass
    result is printed as well.
    """

    # Create vocabulary and author index
    saved_model = torch.load(params['model'])
    # The lookup tables live under 'misc' when that key exists, otherwise at
    # the top level of the checkpoint.
    if 'misc' in saved_model:
        misc = saved_model['misc']
        char_to_ix = misc['char_to_ix']
        auth_to_ix = misc['auth_to_ix']
        ix_to_char = misc['ix_to_char']
        ix_to_auth = misc['ix_to_auth']
    else:
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_char = saved_model['ix_to_char']
        ix_to_auth = saved_model['ix_to_auth']
    cp_params = saved_model['arch']
    # A truthy command-line softmax scale overrides the one saved in the checkpoint.
    if params['softmax_scale']:
        cp_params['softmax_scale'] = params['softmax_scale']

    dp = DataProvider(cp_params)

    if params['m_type'] == 'generative':
        model = CharLstm(cp_params)
    else:
        model = CharTranslator(cp_params)
    # Put the model in evaluation mode (an earlier comment here said "train
    # mode", which contradicts the call below).
    model.eval()
    # Indexed by author id below — assumes exactly two authors with ids 0 and 1 (TODO confirm).
    auth_colors = ['red', 'blue']

    startc = dp.data['configs']['start']
    endc = dp.data['configs']['end']

    # 1x1 tensor holding the index of the start symbol, moved to the GPU.
    append_tensor = np.zeros((1, 1), dtype=np.int)
    append_tensor[0, 0] = char_to_ix[startc]
    append_tensor = torch.LongTensor(append_tensor).cuda()

    # Restore saved checkpoint
    model.load_state_dict(saved_model['state_dict'])
    # NOTE(review): `hidden` is only referenced by the commented-out call below.
    hidden = model.init_hidden(1)
    # Join symbol: nothing between character atoms, a space between any other atoms.
    jc = '' if cp_params.get('atoms','char') == 'char' else ' '

    for i in xrange(params['num_samples']):
        # Pick a random author id; only the non-generative branch uses it.
        c_aid = np.random.choice(auth_to_ix.values())
        if params['m_type'] == 'generative':
            batch = dp.get_random_string(slen = params['seed_length'], split=params['split'])
        else:
            batch = dp.get_sentence_batch(1,split=params['split'], atoms=cp_params.get('atoms','char'), aid=ix_to_auth[c_aid])

        inps, targs, auths, lens = dp.prepare_data(batch, char_to_ix, auth_to_ix, maxlen=cp_params['max_seq_len'])
        # With 'flip' set the author id is inverted (0 <-> 1) before the forward pass.
        auths_inp = 1 - auths if params['flip'] else auths
        outs = adv_forward_pass(model, inps, lens, end_c=char_to_ix[endc], maxlen=cp_params['max_seq_len'], auths=auths_inp,
                     cycle_compute=params['show_rev'], append_symb=append_tensor)
        #char_outs = model.forward_gen(inps, hidden, auths_inp, n_max = cp_params['max_len'],end_c=char_to_ix['.'])
        print '--------------------------------------------'
        #print 'Translate from %s to %s'%(batch[0]['author'], ix_to_auth[auths_inp[0]])
        # The first input position is skipped (inps[1:]); output symbols whose
        # index is not in ix_to_char are dropped.
        print colored('Inp %6s: '%(ix_to_auth[auths[0]]),'green') + colored('%s'%(jc.join([ix_to_char[c[0]] for c in inps[1:]])),auth_colors[auths[0]])
        print colored('Out %6s: '%(ix_to_auth[auths_inp[0]]),'grey')+ colored('%s'%(jc.join([ix_to_char[c.data.cpu()[0]] for c in outs[0] if c.data.cpu()[0] in ix_to_char])),auth_colors[auths_inp[0]])

        if params['show_rev']:
            # outs[-1] is printed under the original author's label — presumably the reverse (cycle) pass; verify against adv_forward_pass.
            print colored('Rev %6s: '%(ix_to_auth[auths[0]]),'green')+ colored('%s'%(jc.join([ix_to_char[c.data.cpu()[0]] for c in outs[-1] if c.data.cpu()[0] in ix_to_char])),auth_colors[auths[0]])
# Example #5
# 0
def main(params):
    """Score every translated sample with a saved classifier and store the scores.

    The input file (JSON or pickle, chosen by its extension) is loaded, every
    non-empty ``'trans'`` text found under ``inps['docs'][i]['sents'][j][k]``
    is run through ``forward_classify`` in batches of 100, and the resulting
    score vector is stored on the same sample under ``params['store_in']``.
    The updated structure is then written back over the input file.

    Args:
        params: mapping of options. Keys read here: 'evalmodel' (checkpoint
            path), 'inpfile' (path ending in ``.json`` or ``.p``) and
            'store_in' (key under which each score vector is stored).

    Raises:
        ValueError: if 'inpfile' has neither a ``json`` nor a ``p`` extension.
            Previously this surfaced later as a NameError on ``inps``.
    """
    inp_path = params['inpfile']
    ext = inp_path.split('.')[-1]
    if ext not in ('json', 'p'):
        raise ValueError(
            "unsupported input file extension %r (expected 'json' or 'p')" % ext)

    eval_model = torch.load(params['evalmodel'])
    eval_params = eval_model['arch']
    eval_state = eval_model['state_dict']
    modelEval = get_classifier(eval_params)

    char_to_ix = eval_model['char_to_ix']
    auth_to_ix = eval_model['auth_to_ix']

    dp = DataProvider(eval_params)
    # Start from the freshly built model's state and overlay the saved weights.
    state = modelEval.state_dict()
    state.update(eval_state)
    modelEval.load_state_dict(state)

    # Files are opened with `with` so the handles are always closed; pickle
    # data is read in binary mode.
    if ext == 'json':
        with open(inp_path, 'r') as inp_file:
            inps = json.load(inp_file)
    else:
        # NOTE(security): pickle must only be used on trusted files.
        with open(inp_path, 'rb') as inp_file:
            inps = pkl.load(inp_file)
    bsz = 100

    def process_batch(batch, featstr='sent_enc'):
        """Classify one batch and store each score vector on its sample."""
        _, targs, _, lens = dp.prepare_data(
            batch, char_to_ix, auth_to_ix, maxlen=eval_params['max_seq_len'])
        if not all(lens):
            # NOTE(review): debugging breakpoint kept from the original; it
            # fires when prepare_data reports a zero length.
            import ipdb; ipdb.set_trace()
        eval_out = modelEval.forward_classify(
            targs, lens=lens, compute_softmax=True)
        eval_out = eval_out[0].data.cpu().numpy()
        for row, item in enumerate(batch):
            sample = inps['docs'][item['id']]['sents'][item['sid']][item['sampid']]
            sample[featstr] = eval_out[row, :].tolist()

    batch = []
    for i, doc in tqdm(enumerate(inps['docs'])):
        for j, samples in enumerate(doc['sents']):
            for k, sample in enumerate(samples):
                tokens = sample['trans'].split()
                if tokens:
                    batch.append({'in': tokens, 'targ': tokens,
                                  'author': doc['author'],
                                  'id': i, 'sid': j, 'sampid': k})
                if len(batch) == bsz:
                    process_batch(batch, featstr=params['store_in'])
                    batch = []
    # Flush whatever is left over after the last full batch.
    if batch:
        process_batch(batch, featstr=params['store_in'])

    # The input file is overwritten in place with the augmented structure.
    if ext == 'json':
        with open(inp_path, 'w') as out_file:
            json.dump(inps, out_file)
    else:
        with open(inp_path, 'wb') as out_file:
            pkl.dump(inps, out_file)
# Example #6
# 0
 def prepare_data(self, num_samples, resize=None):
     """Load fish and document images and shred/shuffle/reconstruct them for several ``t``.

     Args:
         num_samples: forwarded to the data provider as ``num_samples``.
             Earlier this argument was silently overwritten with 2000; it is
             now honoured, so pass 2000 explicitly to get the old behaviour.
         resize: forwarded to the data provider as ``resize``. When it is not
             None the two result lists are additionally converted with
             ``list_of_images_to_numpy``.

     Returns:
         A ``(fish, docs)`` pair holding the output of
         ``shred_shuffle_and_reconstruct`` for every ``t`` in (1, 2, 4, 5),
         concatenated in that order.
     """
     dp = DataProvider()
     ts = (1, 2, 4, 5)
     fish = dp.get_fish_images(num_samples=num_samples, resize=resize)
     docs = dp.get_docs_images(num_samples=num_samples, resize=resize)
     fish = list(itertools.chain.from_iterable(
         shred_shuffle_and_reconstruct(fish, t) for t in ts))
     docs = list(itertools.chain.from_iterable(
         shred_shuffle_and_reconstruct(docs, t) for t in ts))
     if resize is not None:
         fish = list_of_images_to_numpy(fish)
         docs = list_of_images_to_numpy(docs)
     return fish, docs
def dump_reconstruction_objective_values():
    """Collect reconstruction-objective values per image type and ``t`` and pickle them.

    The dumped object is a nested dict ``{image_type: {t: {'correct': ...,
    'incorrect': ...}}}`` filled from ``get_reconstruction_objective_values``.
    ``root_path`` is created if needed and the dict is written to the path
    stored under ``dict_file_names['log_obj']``.
    """
    provider = DataProvider()
    stats = {}
    for image_type in ImageType:
        stats[image_type] = {}
        for t in TS:
            print("Getting stats for {}-{}...".format(image_type, t))
            correct, incorrect = get_reconstruction_objective_values(
                provider, image_type, t)
            stats[image_type][t] = {
                'correct': correct,
                'incorrect': incorrect,
            }

    os.makedirs(root_path, exist_ok=True)
    PickleHelper.dump(stats, os.path.join(dict_file_names['log_obj']))
def dump_adjacent_and_non_adjacent_probabilities():
    """Collect adjacent / non-adjacent crop probabilities per image type and ``t`` and pickle them.

    The dumped object is a nested dict ``{image_type: {t: {'adj': ...,
    'non_adj': ...}}}``. ``root_path`` is created if needed and the dict is
    written to ``dict_file_names['adj']``.
    """
    provider = DataProvider()
    stats = {}
    for image_type in ImageType:
        stats[image_type] = {}
        for t in TS:
            print("Getting stats for {}-{}...".format(image_type, t))
            # Non-adjacent probabilities are computed first, then adjacent ones.
            non_adjacent = get_non_adjacent_crops_probabilities(
                provider, image_type, t)
            adjacent = get_adjacent_crops_probabilities(
                provider, image_type, t)
            stats[image_type][t] = {'adj': adjacent, 'non_adj': non_adjacent}

    os.makedirs(root_path, exist_ok=True)
    PickleHelper.dump(stats, dict_file_names['adj'])
# Example #9
# 0
def _strip_match(match):
    """Return the matched text with leading/trailing whitespace removed."""
    return match.group(0).strip()


def _fix_decimals(match):
    """Return the matched text with every whitespace character removed."""
    return re.sub(r'\s', '', match.group(0))


def _clean_text(text):
    """Undo tokenisation artefacts in ``text`` for display.

    Bracket placeholders (``-lrb-`` etc.) are turned back into brackets,
    two single quotes become one double quote, a standalone ``i`` is
    capitalised, and whitespace around ``$``, ``-``, ``#``, common
    punctuation and ``n't`` is removed.
    """
    for placeholder, symbol in (('-lrb-', '('), ('-rrb-', ')'),
                                ('-lsb-', '['), ('-rsb-', ']'),
                                ('-lcb-', '{'), ('-rcb-', '}')):
        text = re.sub(placeholder, symbol, text)
    text = re.sub("''", '"', text)
    text = re.sub(r'\si\s', ' I ', text)
    text = re.sub(r'^i\s', 'I ', text)
    text = re.sub(r'\sna\s', 'na ', text)
    text = re.sub(r'\$\s', _strip_match, text)
    text = re.sub(r"[-#]\s|\s([-.!,':;?]|n't)", _strip_match, text)
    # NOTE(review): the '.' here is unescaped, so it matches any single
    # character between the digits, not only a decimal point. Kept as is
    # because the wider match may be relied upon.
    text = re.sub(r'\d+. \d+', _fix_decimals, text)
    return text


def main(params):
    """Draw random inputs, run them through the saved model and print cleaned results.

    Args:
        params: mapping of options. Keys read here: 'model', 'softmax_scale',
            'm_type', 'num_samples', 'seed_length' (generative models only),
            'split', 'flip' and 'show_rev'.

    For every sample the cleaned input and output text are printed in colour;
    when ``params['show_rev']`` is set the backward result is printed too.
    """
    # The lookup tables live under 'misc' when that key exists, otherwise at
    # the top level of the checkpoint.
    saved_model = torch.load(params['model'])
    tables = saved_model['misc'] if 'misc' in saved_model else saved_model
    char_to_ix = tables['char_to_ix']
    auth_to_ix = tables['auth_to_ix']
    ix_to_char = tables['ix_to_char']
    ix_to_auth = tables['ix_to_auth']
    cp_params = saved_model['arch']
    if params['softmax_scale']:
        cp_params['softmax_scale'] = params['softmax_scale']

    dp = DataProvider(cp_params)

    if params['m_type'] == 'generative':
        model = CharLstm(cp_params)
    else:
        model = CharTranslator(cp_params)
    # Put the model in evaluation mode (the earlier comment claimed "train mode").
    model.eval()
    # Indexed by author id — assumes two authors with ids 0 and 1 (TODO confirm).
    auth_colors = ['red', 'blue']

    startc = dp.data['configs']['start']
    endc = dp.data['configs']['end']

    # 1x1 tensor with the start-symbol index. np.int was removed from NumPy,
    # so the explicit 64-bit integer type is used instead.
    append_tensor = np.zeros((1, 1), dtype=np.int64)
    append_tensor[0, 0] = char_to_ix[startc]
    append_tensor = torch.LongTensor(append_tensor).cuda()

    # Restore saved checkpoint
    model.load_state_dict(saved_model['state_dict'])
    # NOTE(review): the result was never used; the call is kept in case
    # init_hidden has side effects on the model.
    model.init_hidden(1)
    # Join symbol: nothing between character atoms, a space otherwise.
    jc = '' if cp_params.get('atoms', 'char') == 'char' else ' '

    for _ in range(params['num_samples']):
        c_aid = np.random.choice(list(auth_to_ix.values()))
        if params['m_type'] == 'generative':
            batch = dp.get_random_string(slen=params['seed_length'],
                                         split=params['split'])
        else:
            batch = dp.get_sentence_batch(1,
                                          split=params['split'],
                                          atoms=cp_params.get('atoms', 'char'),
                                          aid=ix_to_auth[c_aid])

        inps, _, auths, lens = dp.prepare_data(
            batch, char_to_ix, auth_to_ix, maxlen=cp_params['max_seq_len'])
        # With 'flip' set the author id is inverted (0 <-> 1).
        auths_inp = 1 - auths if params['flip'] else auths
        forward, backward = adv_forward_pass(model,
                                             inps,
                                             lens,
                                             end_c=char_to_ix[endc],
                                             maxlen=cp_params['max_seq_len'],
                                             auths=auths_inp,
                                             cycle_compute=params['show_rev'],
                                             append_symb=append_tensor)
        print('--------------------------------------------')
        print('Translate from %s to %s' %
              (batch[0]['author'], ix_to_auth[auths_inp.item()]))

        # Original sentence; the first input position is skipped.
        input_string = _clean_text(
            jc.join(ix_to_char[c.item()] for c in inps[1:]))

        # Translated sentence; symbols missing from ix_to_char are dropped.
        output_list = [
            ix_to_char[c.item()] for c in forward if c.item() in ix_to_char
        ]
        # Guard against an empty output before peeking at the last symbol.
        if output_list and output_list[-1] == 'END':
            output_list = output_list[:-1]
        output_string = _clean_text(jc.join(output_list))

        print(
            colored('Inp %6s: ' % (ix_to_auth[auths.item()]), 'green') +
            colored('%s' % input_string, auth_colors[auths.item()]))
        print(
            colored('Out %6s: ' % (ix_to_auth[auths_inp.item()]), 'grey') +
            colored('%s' % output_string, auth_colors[auths_inp.item()]))

        if params['show_rev']:
            reverse_string = jc.join([
                ix_to_char[c.item()]
                for c in backward if c.item() in ix_to_char
                and ix_to_char[c.item()] != 'END'
            ])
            print(
                colored('Rev %6s: ' % (ix_to_auth[auths.item()]), 'green') +
                colored('%s' % reverse_string, auth_colors[auths.item()]))
# Example #10
# 0
def main():
    """Evaluate ``SolverLP`` on the train and validation splits of each image type.

    Tokens looked up in ``sys.argv``:
        debug            -- use 20 samples instead of ``sys.maxsize``.
        2 / 4            -- add that value of ``t`` to the run.
        5                -- run with ``t == 5`` only (replaces any 2/4 choice).
        image / document -- restrict the run to that image type.

    One ``ComparatorCNN`` per selected ``t`` is loaded from its weight file
    and handed to the solver; the 0-1 accuracy of both splits is printed.
    """
    argv = sys.argv

    # Both modes currently run a single epoch; only the sample count differs.
    epochs = 1
    if 'debug' in argv:
        print('Debug')
        number_of_samples = 20
    else:
        print('Release')
        number_of_samples = sys.maxsize

    # '5' overrides whatever was selected through '2' / '4'.
    if '5' in argv:
        ts = [5]
    else:
        ts = [candidate for candidate in (2, 4) if str(candidate) in argv]
    if not ts:
        ts = (2, 4, 5)

    image_types = []
    if 'image' in argv:
        image_types += [ImageType.IMAGES]
    if 'document' in argv:
        image_types += [ImageType.DOCUMENTS]
    if not image_types:
        image_types = ImageType

    np.random.seed(42)

    for image_type in image_types:
        print(image_type.value)

        provider = DataProvider()
        # Per-type loader plus the mean/std handed to the comparators.
        if image_type == ImageType.IMAGES:
            get_images = provider.get_fish_images
            mean, std = 100.52933494138787, 65.69793156777682
        else:
            get_images = provider.get_docs_images
            mean, std = 241.46115784237548, 49.512839464023564

        images, names = get_images(num_samples=number_of_samples,
                                   return_names=True)
        (images_train, images_validation,
         names_train, names_validation) = train_test_split(
             images, names, random_state=42)

        t_to_comparator = {}
        for t in ts:
            weights = IMAGE_TYPE_TO_T_TO_COMPARATOR_CNN_WEIGHT_FILE_ID_AND_FILE_PATH[image_type][t]
            comparator = ComparatorCNN(t,
                                       weights.width,
                                       weights.height,
                                       image_type,
                                       mean=mean,
                                       std=std)
            t_to_comparator[t] = comparator.load_weights(weights.model_path)

        clf = SolverLP(t_to_comparator, image_type=image_type)

        splits = (
            ('Train: ', 'Train 0-1 accuracy on {}: {}',
             images_train, names_train),
            ('Validation: ', 'Validation 0-1 accuracy on {}: {}',
             images_validation, names_validation),
        )
        for header, report, split_images, split_names in splits:
            print(header, split_names)
            accuracy = clf.evaluate(split_images, epochs=epochs, ts=ts)
            print(report.format(image_type.value, accuracy))
# Example #11
# 0
def main():
    """Train or load a ``TopLeftCNN`` for each selected ``t`` / image type and print its accuracy.

    Tokens looked up in ``sys.argv``:
        debug            -- 20 samples and 5 epochs instead of
                            ``sys.maxsize`` samples and 50 epochs.
        2 / 4            -- add that value of ``t`` to the run.
        5                -- run with ``t == 5`` only (replaces any 2/4 choice).
        image / document -- restrict the run to that image type.
        train            -- fit the model; without it the saved weights are
                            loaded and only the standardisation is fitted.
    """
    argv = sys.argv

    if 'debug' in argv:
        print('Debug')
        number_of_samples, epochs = 20, 5
    else:
        print('Release')
        number_of_samples, epochs = sys.maxsize, 50

    # '5' overrides whatever was selected through '2' / '4'.
    if '5' in argv:
        ts = [5]
    else:
        ts = [candidate for candidate in (2, 4) if str(candidate) in argv]
    if not ts:
        ts = (2, 4, 5)

    image_types = []
    if 'image' in argv:
        image_types += [ImageType.IMAGES]
    if 'document' in argv:
        image_types += [ImageType.DOCUMENTS]
    if not image_types:
        image_types = ImageType

    # 'evaluate' and "no mode given" both mean: do not retrain.
    force = 'train' in argv

    np.random.seed(42)

    width = height = 224
    batch_size = 32

    for t in ts:
        for image_type in image_types:
            print('t={}. image type is {}'.format(t, image_type.value))

            provider = DataProvider()
            get_images = (provider.get_fish_images
                          if image_type == ImageType.IMAGES
                          else provider.get_docs_images)

            images = get_images(num_samples=number_of_samples)
            images_train, images_validation = train_test_split(
                images, random_state=42)

            clf = TopLeftCNN(t, width, height, image_type)

            if not force:
                clf.load_weights()
                clf._fit_standardisation(images_train)
            else:
                clf.fit_generator(images_train, batch_size, epochs,
                                  images_validation)

            print('Train 0-1:', clf.evaluate(images_train))
            print('Validation 0-1:', clf.evaluate(images_validation))
# Example #12
# 0
def main(params):
    """Encode every original and translated sentence and save the embeddings.

    The results JSON is loaded, each non-empty ``'sent'`` and ``'trans'``
    text is encoded in batches, and the encodings are stored as float16 rows
    of one array. Each sentence record receives the row index of its
    encoding under ``'sent_enc'`` / ``'trans_enc'``. The updated JSON is
    written back over the results file and the array is saved next to it
    (file name: results path without its extension, followed by
    ``'sememb.npy'``).

    Args:
        params: mapping of options. Keys read here: 'checkpoint', 'resfile',
            'batch_size', 'use_semantic_encoder' (falsy, or a path to saved
            encoder weights) and 'glove_path' (semantic encoder only).
    """
    saved_model = torch.load(params['checkpoint'])
    cp_params = saved_model['arch']
    dp = DataProvider(cp_params)

    # The lookup tables live under 'misc' when that key exists, otherwise at
    # the top level of the checkpoint.
    tables = saved_model['misc'] if 'misc' in saved_model else saved_model
    char_to_ix = tables['char_to_ix']
    auth_to_ix = tables['auth_to_ix']
    ix_to_char = tables['ix_to_char']

    # Fix: the non-semantic branch below read an undefined `model_gen_state`.
    # Keep the checkpoint weights only when that branch will need them.
    if params['use_semantic_encoder']:
        model_gen_state = None
    else:
        model_gen_state = saved_model['state_dict']
    del tables
    del saved_model

    resf = params['resfile']
    with open(resf, 'r') as res_file:
        res = json.load(res_file)
    bsz = params['batch_size']

    # Fix: this used to be a float counter, which is not a valid array dimension.
    total_sents = sum(len(doc['sents']) for doc in res['docs'])

    # One row per original sentence plus one per translated sentence.
    all_feats = np.zeros((2 * total_sents, 4096), dtype='float16')

    def process_batch(batch, c_idx, featstr='sent_enc'):
        """Encode one batch into ``all_feats`` starting at row ``c_idx``.

        Returns the index of the next free row.
        """
        inps, _, _, lens = dp.prepare_data(batch,
                                           char_to_ix,
                                           auth_to_ix,
                                           maxlen=cp_params['max_seq_len'])
        enc_out = modelGenEncoder.forward_encode(inps, lens)
        enc_out = enc_out.data.cpu().numpy().astype('float16')
        all_feats[c_idx:c_idx + enc_out.shape[0]] = enc_out
        for offset, item in enumerate(batch):
            res['docs'][item['id']]['sents'][item['sid']][featstr] = c_idx + offset
        return c_idx + enc_out.shape[0]

    def encode_field(text_key, featstr, c_idx):
        """Encode ``text_key`` of every sentence; returns the next free row."""
        batch = []
        for i, doc in enumerate(tqdm(res['docs'])):
            for j, sent in enumerate(doc['sents']):
                tokens = sent[text_key].split()
                if tokens:
                    batch.append({
                        'in': tokens,
                        'targ': tokens,
                        'author': doc['author'],
                        'id': i,
                        'sid': j
                    })
                if len(batch) == bsz:
                    c_idx = process_batch(batch, c_idx, featstr=featstr)
                    batch = []
        # Flush whatever is left over after the last full batch.
        if batch:
            c_idx = process_batch(batch, c_idx, featstr=featstr)
        return c_idx

    if params['use_semantic_encoder']:
        modelGenEncoder = BLSTMEncoder(char_to_ix, ix_to_char,
                                       params['glove_path'])
        encoderState = torch.load(params['use_semantic_encoder'])
    else:
        modelGenEncoder = CharTranslator(cp_params, encoder_only=True)
        encoderState = model_gen_state

    # Copy over only the saved weights that the encoder actually has.
    state = modelGenEncoder.state_dict()
    for k in encoderState:
        if k in state:
            state[k] = encoderState[k]
    modelGenEncoder.load_state_dict(state)
    modelGenEncoder.eval()
    del encoderState
    del model_gen_state

    print(' Processing original text')
    c_idx = encode_field('sent', 'sent_enc', 0)

    print('Processing translated text')
    encode_field('trans', 'trans_enc', c_idx)

    with open(resf, 'w') as res_file:
        json.dump(res, res_file)
    np.save('.'.join(resf.split('.')[:-1]) + 'sememb.npy', all_feats)
# Example #13
# 0
        test_x, test_y = shreds_to_x_y(test_shreds)

        assert train_x.shape == (self._t**2 * train_shreds.shape[0],
                                 self._height, self._width, 1)
        assert train_y.shape == (self._t**2 * train_shreds.shape[0], )
        assert validation_x.shape == (self._t**2 * validation_shreds.shape[0],
                                      self._height, self._width, 1)
        assert validation_y.shape == (self._t**2 *
                                      validation_shreds.shape[0], )
        assert test_x.shape == (self._t**2 * test_shreds.shape[0],
                                self._height, self._width, 1)
        assert test_y.shape == (self._t**2 * test_shreds.shape[0], )

        return (train_x, train_y), (validation_x, validation_y), (test_x,
                                                                  test_y)

    def _get_model_checkpoint_file_path(self):
        """Return the 'best' weights file path for this instance's ``t`` and image type."""
        path_template = 'saved_weights/one-picture-classify-best-{}-{}-model.h5'
        type_name = self._image_type.value
        return path_template.format(self._t, type_name)

    def _get_model_final_file_path(self):
        """Return the 'final' weights file path for this instance's ``t`` and image type."""
        path_template = 'saved_weights/one-picture-classify-final-{}-{}-model.h5'
        type_name = self._image_type.value
        return path_template.format(self._t, type_name)


if __name__ == "__main__":
    # Script entry point: fit one classifier for every combination of
    # t in (2, 4, 5) and image type, each with a fresh DataProvider and
    # 220x220 inputs, for 50 epochs.
    for t in (2, 4, 5):
        for image_type in ImageType:
            clf = OnePictureClassify(
                t,
                220,
                220,
                image_type,
                DataProvider(),
            )
            clf.fit(epochs=50)
def main(params):

    # Create vocabulary and author index
    saved_model = torch.load(params['genmodel'])
    cp_params = saved_model['arch']
    if params['evalmodel']:
        eval_model = torch.load(params['evalmodel'])
        eval_params = eval_model['arch']
        eval_state = eval_model['state_dict']
    else:
        print "FIX THIS"
        return

    if 'misc' in saved_model:
        misc = saved_model['misc']
        char_to_ix = misc['char_to_ix']
        auth_to_ix = misc['auth_to_ix']
        ix_to_char = misc['ix_to_char']
        ix_to_auth = misc['ix_to_auth']
    else:
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_char = saved_model['ix_to_char']
        if 'ix_to_auth' in saved_model:
            ix_to_auth = saved_model['ix_to_auth']
        else:
            ix_to_auth = {auth_to_ix[a]:a for a in auth_to_ix}

    dp = DataProvider(cp_params)
    if params['softmax_scale']:
        cp_params['softmax_scale'] = params['softmax_scale']

    modelGen = CharTranslator(cp_params)
    modelEval = CharLstm(eval_params)

    startc = dp.data['configs']['start']
    endc = dp.data['configs']['end']

    modelGen.eval()
    modelEval.eval()

    # Restore saved checkpoint
    modelGen.load_state_dict(saved_model['state_dict'])
    state = modelEval.state_dict()
    state.update(eval_state)
    modelEval.load_state_dict(state)

    append_tensor = np.zeros((1, 1), dtype=np.int)
    append_tensor[0, 0] = char_to_ix[startc]
    append_tensor = torch.LongTensor(append_tensor).cuda()

    accum_diff_eval = [[],[]]
    accum_err_eval = np.zeros(len(auth_to_ix))
    accum_err_real = np.zeros(len(auth_to_ix))
    accum_count_gen = np.zeros(len(auth_to_ix))


    accum_recall_forward = np.zeros(len(auth_to_ix))
    accum_prec_forward = np.zeros(len(auth_to_ix))
    accum_recall_rev = np.zeros(len(auth_to_ix))
    accum_prec_rev = np.zeros(len(auth_to_ix))

    jc = '' if cp_params.get('atoms','char') == 'char' else ' '
    result = {'docs':[], 'misc':{'auth_to_ix':auth_to_ix, 'ix_to_auth':ix_to_auth}, 'cp_params':cp_params, 'params': params}
    id_to_ix = {}
    for i,iid in enumerate(dp.splits[params['split']]):
        result['docs'].append({'sents':[], 'author':dp.data['docs'][iid][dp.athstr], 'id':iid})
        if 'attrib' in dp.data['docs'][iid]:
            result['docs'][-1]['attrib'] = dp.data['docs'][iid]['attrib']
        id_to_ix[iid] = i


    n_samp = params['n_samples']
    for i, b_data in tqdm(enumerate(dp.iter_sentences_bylen(split=params['split'], atoms=cp_params.get('atoms','word'), batch_size = params['batch_size'], auths = auth_to_ix.keys()))):
        if i > params['num_batches'] and params['num_batches']>0:
            break;
    #for i in xrange(params['num_batches']):
        #c_aid = np.random.choice(auth_to_ix.values())
        #batch = dp.get_sentence_batch(1,split=params['split'], atoms=cp_params.get('atoms','char'), aid=ix_to_auth[c_aid])
        c_bsz = len(b_data[0])
        done = b_data[1]
        inps, targs, auths, lens = dp.prepare_data(b_data[0], char_to_ix, auth_to_ix, maxlen=cp_params['max_seq_len'])
        # outs are organized as
        auths_inp = 1 - auths if params['flip'] else auths
        outs = adv_forward_pass(modelGen, modelEval, inps, lens,
                end_c=char_to_ix[endc], maxlen=cp_params['max_seq_len'],
                auths=auths_inp, cycle_compute=params['show_rev'],
                append_symb=append_tensor, n_samples=params['n_samples'])
        eval_out_gt = modelEval.forward_classify(targs, lens=lens, compute_softmax=True)
        auths_inp = auths_inp.numpy()
        i_bsz = np.arange(c_bsz)
        real_aid_out = eval_out_gt[0].data.cpu().numpy()[i_bsz, auths_inp]

        gen_scores = outs[0].view(n_samp,c_bsz,-1)
        gen_aid_out = gen_scores.cpu().numpy()[:,i_bsz, auths_inp]
        gen_char = [v.view(n_samp,c_bsz) for v in outs[1]]
        gen_lens = outs[2].view(n_samp,c_bsz)
        np.add.at(accum_err_eval, auths_inp, gen_aid_out[0,:] >=0.5)
        np.add.at(accum_err_real, auths_inp, real_aid_out >=0.5)
        np.add.at(accum_count_gen,auths_inp,1)
        for b in xrange(inps.size()[1]):
            inpset =  set(inps[:,b].tolist()[:lens[b]]) ;
            samples = []
            accum_diff_eval[auths_inp[b]].append(gen_aid_out[0,b] - real_aid_out[b])
            for si in xrange(n_samp):
                genset = set([c[si, b] for c in gen_char[:gen_lens[si,b]]]);
                accum_recall_forward[auths_inp[b]] += (float(len(genset & inpset)) / float(len(inpset)))
                accum_prec_forward[auths_inp[b]] += (float(len(genset & inpset)) / float(len(genset)))

                if params['show_rev']:
                    revgenset = set([c[b] for c in outs[-2][:outs[-1][b]] ])
                    accum_recall_rev[auths_inp[b]]  += (float(len(revgenset & inpset)) / float(len(inpset)))
                    accum_prec_rev[auths_inp[b]]    += (float(len(revgenset & inpset)) / float(len(revgenset)))

                inp_text = jc.join([ix_to_char[c] for c in targs[:,b] if c in ix_to_char])
                trans_text = jc.join([ix_to_char[c.cpu()[si,b]] for c in gen_char[:gen_lens[si,b]] if c.cpu()[si,b] in ix_to_char])
                samples.append({'sent':inp_text,'score':eval_out_gt[0][b].data.cpu().tolist(), 'trans': trans_text, 'trans_score':gen_scores[si,b].cpu().tolist(),'sid':b_data[0][b]['sid']})
            result['docs'][id_to_ix[b_data[0][b]['id']]]['sents'].append(samples)

        if params['print']:
            print '--------------------------------------------'
            print 'Author: %s'%(b_data[0][0]['author'])
            print 'Inp text %s: %s (%.2f)'%(ix_to_auth[auths[0]], jc.join([ix_to_char[c[0]] for c in inps[1:]]), real_aid_out[0])
            print 'Out text %s: %s (%.2f)'%(ix_to_auth[auths_inp[0]],jc.join([ix_to_char[c.cpu()[0]] for c in outs[1] if c.cpu()[0] in ix_to_char]), gen_aid_out[0])
            if params['show_rev']:
                print 'Rev text %s: '%(ix_to_auth[auths[0]])+ '%s'%(jc.join([ix_to_char[c.cpu()[0]] for c in outs[-2] if c.cpu()[0] in ix_to_char]))
        #else:
        #    print '%d/%d\r'%(i, params['num_batches']),

    err_a1, err_a2 = accum_err_eval[0]/(1e-5+accum_count_gen[0]), accum_err_eval[1]/(1e-5+accum_count_gen[1])
    err_real_a1, err_real_a2 = accum_err_real[0]/(1e-5+accum_count_gen[0]), accum_err_real[1]/(1e-5+accum_count_gen[1])
    print '--------------------------------------------'
    print 'Efficiency in fooling discriminator'
    print '--------------------------------------------'
    print(' erra1 {:3.2f} - erra2 {:3.2f}'.format(100.*err_a1, 100.*err_a2))
    print(' err_real_a1 {:3.2f} - err_real_a2 {:3.2f}'.format(100.*err_real_a1, 100.*err_real_a2))
    print(' count %d - %d'%(accum_count_gen[0], accum_count_gen[1]))

    diff_arr0, diff_arr1 =  np.array(accum_diff_eval[0]), np.array(accum_diff_eval[1])
    print 'Mean difference : translation to %s = %.2f , translation to %s = %.2f '%(ix_to_auth[0], diff_arr0.mean(), ix_to_auth[1], diff_arr1.mean())
    print 'Difference > 0  : translation to %s = %.2f%%, translation to %s = %.2f%% '%(ix_to_auth[0], 100.*(diff_arr0>0).sum()/(1e-5+diff_arr0.shape[0]), ix_to_auth[1], 100.*(diff_arr1>0).sum()/(1e-5+diff_arr1.shape[0]))
    print 'Difference < 0  : translation to %s = %.2f%%, translation to %s = %.2f%% '%(ix_to_auth[0], 100.*(diff_arr0<0).sum()/(1e-5+diff_arr0.shape[0]), ix_to_auth[1], 100.*(diff_arr1<0).sum()/(1e-5+diff_arr1.shape[0]))

    print '\n--------------------------------------------'
    print 'Consistencey with the input text'
    print '--------------------------------------------'
    print 'Generated text A0- Precision = %.2f, Recall = %.2f'%(accum_prec_forward[0]/accum_count_gen[0], accum_recall_forward[0]/accum_count_gen[0] )
    print 'Generated text A1- Precision = %.2f, Recall = %.2f'%(accum_prec_forward[1]/accum_count_gen[1], accum_recall_forward[1]/accum_count_gen[1] )
    if params['show_rev']:
        print '\n'
        print 'Reconstr  text A0- Precision = %.2f, Recall = %.2f'%(accum_prec_rev[0]/accum_count_gen[0], accum_recall_rev[0]/accum_count_gen[0] )
        print 'Reconstr  text A1- Precision = %.2f, Recall = %.2f'%(accum_prec_rev[1]/accum_count_gen[1], accum_recall_rev[1]/accum_count_gen[1] )

    print '\n--------------------------------------------'
    print 'Document Level Scores'
    print '--------------------------------------------'
    doc_accuracy = np.zeros(len(auth_to_ix))
    doc_accuracy_trans = np.zeros(len(auth_to_ix))
    doc_count = np.zeros(len(auth_to_ix))
    for doc in result['docs']:
        doc_score_orig = np.array([0.,0.])
        doc_score_trans = np.array([0.,0.])
        for st in doc['sents']:
            doc_score_orig  += np.log(st[0]['score'])
            doc_score_trans += np.log(st[0]['trans_score'])
        doc_accuracy[auth_to_ix[doc['author']]] += float(doc_score_orig.argmax() == auth_to_ix[doc['author']])
        doc_accuracy_trans[auth_to_ix[doc['author']]] += float(doc_score_trans.argmax() == auth_to_ix[doc['author']])
        doc_count[auth_to_ix[doc['author']]] += 1.

    print 'Original data'
    print '-------------'
    print 'Doc accuracy is %s : %.2f , %s : %.2f'%(ix_to_auth[0], (doc_accuracy[0]/doc_count[0]),ix_to_auth[1], (doc_accuracy[1]/doc_count[1]) )
    fp = doc_count[1]- doc_accuracy[1]
    recall = doc_accuracy[0]/doc_count[0]
    precision = doc_accuracy[0]/(doc_accuracy[0]+fp)
    f1score = 2.*(precision*recall)/(precision+recall)
    print 'Precision is %.2f : Recall is %.2f , F1-score is %.2f'%(precision, recall, f1score)
    print '\nTranslated data'
    print '-----------------'
    print 'Doc accuracy is %s : %.2f , %s : %.2f'%(ix_to_auth[0], (doc_accuracy_trans[0]/doc_count[0]),ix_to_auth[1], (doc_accuracy_trans[1]/doc_count[1]) )
    fp = doc_count[1]- doc_accuracy_trans[1]
    recall = doc_accuracy_trans[0]/doc_count[0]
    precision = doc_accuracy_trans[0]/(doc_accuracy_trans[0]+fp)
    f1score = 2.*(precision*recall)/(precision+recall)
    print 'Precision is %.2f : Recall is %.2f , F1-score is %.2f'%(precision, recall, f1score)


    if params['dumpjson']:
       json.dump(result, open(params['dumpjson'],'w'))
Пример #15
0
def _plot_percent_per_t(ts, type_to_percents, title, ylabel, file_path):
    """Plot one marker series per image type against ``ts``, save and show it.

    :param ts: x values (the ``t`` shredding parameters).
    :param type_to_percents: mapping from image type to one percentage per
        entry of ``ts``.
    :param title: figure title.
    :param ylabel: label of the y axis.
    :param file_path: where the figure is saved.
    """
    # Start from a fresh figure: when ``plt.show()`` does not block and
    # close the window (e.g. a non-interactive backend) the previous plot
    # would otherwise still be on the canvas and get drawn over.
    plt.figure()

    handles = [
        plt.plot(ts, type_to_percents[image_type], 'o', label=image_type.value)[0]
        for image_type in ImageType
    ]

    plt.title(title)
    # ``handles`` has to be passed by keyword: a single positional argument
    # to ``legend`` is interpreted as the list of *labels*.
    plt.legend(handles=handles)
    plt.xlabel('t')
    plt.ylabel(ylabel)
    plt.savefig(file_path)
    plt.show()
    plt.close()


def main():
    """Report and plot how often shredding yields duplicate patches.

    For every resize target, image type and ``t`` this calls
    ``get_number_of_images_with_same_patches_and_number_of_same_patches``,
    prints the returned counts, and plots the resulting percentages as a
    function of ``t`` into the ``plots`` directory.
    """
    data_provider = DataProvider()
    directory = 'plots'
    os.makedirs(directory, exist_ok=True)
    ts = 2, 4, 5

    # NOTE(review): the PNG names below do not depend on ``width_height``,
    # so every pass of this loop overwrites the files of the previous one.
    for width_height in (None, (224, 224), (2200 // 5, 2200 // 5)):
        print(width_height)
        type_to_bad_picutres_percent = defaultdict(list)
        type_to_bad_patches_pairs_percent = defaultdict(list)

        for image_type in ImageType:
            print(image_type)

            for t in ts:
                print(t)

                number_of_pictures_with_same_patches, number_of_patches_with_similar_in_same_picture,\
                    total_number_of_pictures, total_number_of_patches = \
                    get_number_of_images_with_same_patches_and_number_of_same_patches(
                        data_provider,
                        image_type,
                        t,
                        width_height
                    )

                print('{} shredded to {} patches and resized to {} has {}/{} bad pictures and {}/{} patches'.format(
                    image_type,
                    t,
                    width_height,
                    number_of_pictures_with_same_patches, total_number_of_pictures,
                    number_of_patches_with_similar_in_same_picture, total_number_of_patches
                ))

                type_to_bad_picutres_percent[image_type].append(
                    number_of_pictures_with_same_patches * 100.0 / total_number_of_pictures)
                type_to_bad_patches_pairs_percent[image_type].append(
                    number_of_patches_with_similar_in_same_picture * 100.0 / total_number_of_patches)

        _plot_percent_per_t(ts,
                            type_to_bad_picutres_percent,
                            'Percent of bad images as function of t',
                            '% of images',
                            os.path.join(directory, 'bad_images.png'))

        _plot_percent_per_t(ts,
                            type_to_bad_patches_pairs_percent,
                            'Percent of bad crops as function of t',
                            '% of pairs',
                            os.path.join(directory, 'bad_crops.png'))
Пример #16
0
        labels = np.concatenate((np.ones(len(fish)), np.zeros(len(docs)))).astype(int).tolist()
        x_train, x_test, y_train, y_test = \
            train_test_split(images, labels, train_size=0.8, random_state=42, stratify=labels)
        return (x_train, np.stack(y_train, axis=0)), (x_test, np.stack(y_test, axis=0))

    def is_fish(self, x):
        """Return whether the model's prediction for ``x`` exceeds 0.5.

        ``x`` is resized to ``self._input_shape`` with ``resize_to`` and
        divided by 255 before it is handed to ``self._model.predict``.
        The result is the element-wise comparison of that prediction
        against the 0.5 threshold.
        """
        scaled = resize_to(x, self._input_shape) / 255
        return self._model.predict(scaled) > 0.5

    def is_doc(self, image):
        """Return the negation of :meth:`is_fish` for ``image``.

        A single prediction still yields a plain ``bool``, as before.
        When ``is_fish`` returns several predictions the negation is done
        element-wise; applying ``not`` to such an array raises
        ``ValueError`` because its truth value is ambiguous.
        """
        fish = np.asarray(self.is_fish(image))
        if fish.size == 1:
            # Single prediction: keep the historical plain-bool result.
            return not fish
        return np.logical_not(fish)


if __name__ == '__main__':
    # Command-line driven entry point. Recognised words in ``sys.argv``:
    #   'debug'     - train for 3 epochs instead of 300
    #   'fit'       - fit the classifier and pickle its training history
    #   'visualize' - load the pickled history and plot accuracy and loss
    weights = os.path.join(os.path.dirname(__file__), 'saved_weights')
    if 'debug' in sys.argv:
        epochs = 3
    else:
        epochs = 300

    if 'fit' in sys.argv:
        # This will fit the classifier
        clf = FishOrDocClassifier(DataProvider())
        print('Fitting {}'.format(clf.__class__.__name__))
        os.makedirs(weights, exist_ok=True)
        model, history = clf.fit(weights, epochs=epochs)
        history_dump_path = os.path.join(visualization_root, 'history.pkl')
        PickleHelper.dump(history.history, history_dump_path)

    if 'visualize' in sys.argv:
        print('Visualizing history')
        history = PickleHelper.load(os.path.join(visualization_root, 'history.pkl'))
        # (training key, validation key, plot name) for every curve to draw.
        curves = (('acc', 'val_acc', 'accuracy'), ('loss', 'val_loss', 'loss'))
        for train_key, validation_key, plot_name in curves:
            HistoryVisualizer.visualize(history[train_key], history[validation_key],
                                        plot_name, visualization_root)
Пример #17
0
def main(params):
    """Train a ``CharTranslator`` as a generator or as a classifier.

    ``params`` is the run configuration dictionary. Keys read here:
    ``resume``, ``atoms``, ``vocab_threshold``, ``use_sgd``,
    ``learning_rate``, ``decay_rate``, ``smooth_eps``, ``mode``,
    ``batch_size``, ``randomize_batches``, ``max_epochs``,
    ``eval_interval``, ``split_generators``, ``sample_by_len``,
    ``max_seq_len``, ``grad_clip``, ``log_interval``, ``fappend`` and
    ``checkpoint_output_directory``. ``vocabulary_size`` and
    ``num_output_layers`` are written into ``params`` before the model is
    built.

    Every ``log_interval`` iterations the running loss is printed and, if
    the last validation rank is not worse than the best one seen so far, a
    checkpoint is written with ``save_checkpoint``.
    """
    dp = DataProvider(params)

    # Create vocabulary and author index
    if params['resume'] is None:
        if params['atoms'] == 'char':
            char_to_ix, ix_to_char = dp.createCharVocab(
                params['vocab_threshold'])
        else:
            char_to_ix, ix_to_char = dp.createWordVocab(
                params['vocab_threshold'])
        auth_to_ix, ix_to_auth = dp.createAuthorIdx()
    else:
        # Reuse the vocabularies stored in the checkpoint so that indices
        # stay consistent with the restored weights.
        saved_model = torch.load(params['resume'])
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_auth = saved_model['ix_to_auth']
        ix_to_char = saved_model['ix_to_char']

    params['vocabulary_size'] = len(char_to_ix)
    params['num_output_layers'] = len(auth_to_ix)

    model = CharTranslator(params)
    # set to train mode, this activates dropout
    model.train()

    # Initialize the optimizer: SGD with momentum, or RMSprop otherwise
    if params['use_sgd']:
        optim = torch.optim.SGD(model.parameters(),
                                lr=params['learning_rate'],
                                momentum=params['decay_rate'])
    else:
        optim = torch.optim.RMSprop(model.parameters(),
                                    lr=params['learning_rate'],
                                    alpha=params['decay_rate'],
                                    eps=params['smooth_eps'])
    # Loss function
    if params['mode'] == 'generative':
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.NLLLoss()

    # Restore saved checkpoint
    if params['resume'] is not None:
        model.load_state_dict(saved_model['state_dict'])
        optim.load_state_dict(saved_model['optimizer'])

    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(params['batch_size'])
    hidden_zeros = model.init_hidden(params['batch_size'])
    # Initialize the cache
    if params['randomize_batches']:
        dp.set_hid_cache(range(len(dp.data['docs'])), hidden_zeros)

    # Compute the iteration parameters
    epochs = params['max_epochs']
    total_seqs = dp.get_num_sents(split='train')
    iter_per_epoch = total_seqs // params['batch_size']
    total_iters = iter_per_epoch * epochs
    best_val = 1000.
    # Never let the interval drop to 0: ``i % 0`` below would raise
    # ZeroDivisionError when an epoch is shorter than 1 / eval_interval.
    eval_every = max(1, int(iter_per_epoch * params['eval_interval']))

    # Placeholders used until the first validation pass has run.
    val_score = 0.
    val_rank = 1000

    eval_function = eval_translator if params[
        'mode'] == 'generative' else eval_classify

    print(total_iters)
    for i in xrange(total_iters):
        #TODO
        if params['split_generators']:
            # Restrict this batch to one randomly chosen author.
            c_aid = ix_to_auth[np.random.choice(list(auth_to_ix.values()))]
        else:
            c_aid = None

        batch = dp.get_sentence_batch(params['batch_size'],
                                      split='train',
                                      atoms=params['atoms'],
                                      aid=c_aid,
                                      sample_by_len=params['sample_by_len'])
        inps, targs, auths, lens = dp.prepare_data(
            batch, char_to_ix, auth_to_ix, maxlen=params['max_seq_len'])

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()
        #TODO
        if params['mode'] == 'generative':
            output, _ = model.forward_mltrain(inps,
                                              lens,
                                              inps,
                                              lens,
                                              hidden_zeros,
                                              auths=auths)
            targets = pack_padded_sequence(Variable(targs).cuda(), lens)
            loss = criterion(pack_padded_sequence(output, lens)[0], targets[0])
        else:
            # for classifier auths is the target
            output, hidden = model.forward_classify(inps,
                                                    hidden,
                                                    compute_softmax=True)
            targets = Variable(auths).cuda()
            loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), params['grad_clip'])

        # Take an optimization step
        optim.step()

        total_loss += loss.data.cpu().numpy()[0]

        # Periodically score the model on the validation split
        if i % eval_every == 0 and i > 0:
            val_rank, val_score = eval_function(dp,
                                                model,
                                                params,
                                                char_to_ix,
                                                auth_to_ix,
                                                split='val')

        if i % params['log_interval'] == 0 and i > 0:
            cur_loss = total_loss / params['log_interval']
            elapsed = time.time() - start_time
            # The interval is read from ``params`` like everywhere else in
            # this function rather than from a module-level ``args``.
            print(
                '| epoch {:2.2f} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    float(i) / iter_per_epoch, i, total_iters,
                    params['learning_rate'],
                    elapsed * 1000 / params['log_interval'], cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0.

            if val_rank <= best_val:
                save_checkpoint(
                    {
                        'iter': i,
                        'arch': params,
                        'val_loss': val_rank,
                        'val_pplx': val_score,
                        'char_to_ix': char_to_ix,
                        'ix_to_char': ix_to_char,
                        'auth_to_ix': auth_to_ix,
                        'ix_to_auth': ix_to_auth,
                        'state_dict': model.state_dict(),
                        'loss': cur_loss,
                        'optimizer': optim.state_dict(),
                    },
                    fappend=params['fappend'],
                    outdir=params['checkpoint_output_directory'])
                best_val = val_rank
            start_time = time.time()
Пример #18
0
def main(params):
    """Train a bag-of-words author classifier and print its metrics.

    The documents of ``DataProvider(params)`` are lower-cased, stripped of
    digit runs and tokenized; a vocabulary is built from the training
    split; bag-of-words features are computed, optionally reduced with PCA,
    normalized with the training statistics and fed to either a
    ``LinearSVC`` or an ``MLP_classifier``. Accuracy, mean rank, top-k
    accuracy and an AUC on rank-adjusted scores are printed for the train
    and validation splits.

    Keys read from ``params``: ``nostop``, ``nopunct``, ``vocab_threshold``,
    ``tfidf``, ``pca``, ``mlp``, ``linearsvm``, ``topk`` and, for the MLP,
    ``epochs``, ``lr`` and ``l2``. ``params['pca']`` is overwritten with the
    number of retained components when PCA is applied.

    :raises ValueError: if neither the MLP nor the linear SVM is selected.
    """
    dp = DataProvider(params)
    auth_to_ix = dp.createAuthorIdx()

    # Tokens to drop before counting.
    # NOTE(review): these are the integers 0-9 while tokens are strings, and
    # digit runs are already removed by the regex below, so these entries
    # presumably never match anything - confirm before relying on them.
    bad_hombres = list(range(10))
    if params['nostop']:
        bad_hombres = bad_hombres + stopwords.words('english')
    if params['nopunct']:
        bad_hombres = bad_hombres + list(string.punctuation)

    bad_hombres = set(bad_hombres)

    all_words = Counter()

    for i, doc in enumerate(dp.data['docs']):
        no_num = re.sub(r'\d+', '', doc['text'].lower())
        curr_text = [
            w for w in wordpunct_tokenize(no_num) if w not in bad_hombres
        ]
        dp.data['docs'][i]['tokenized'] = curr_text
        # Only the training split contributes to the vocabulary counts.
        if doc['split'] == 'train':
            all_words.update(curr_text)

    # Index every word seen more than ``vocab_threshold`` times in training.
    short_vocab = {
        w: i
        for i, w in enumerate([
            wrd for wrd in all_words
            if all_words[wrd] > params['vocab_threshold']
        ])
    }

    docCounts_train, target_train = count(dp,
                                          short_vocab,
                                          auth_to_ix,
                                          split='train')
    bow_features_train, idf_train = bow_features(docCounts_train,
                                                 params['tfidf'])

    # Validation features reuse the idf computed on the training split.
    docCounts_val, target_val = count(dp, short_vocab, auth_to_ix, split='val')
    bow_features_val, _ = bow_features(docCounts_val,
                                       params['tfidf'],
                                       idf=idf_train)

    # Do PCA?
    if params['pca'] > 0:
        pca_model = PCA(n_components=params['pca'])
        bow_features_train = pca_model.fit_transform(bow_features_train)
        print('Explained variance is %.2f' % (sum(
            pca_model.explained_variance_ratio_)))

        bow_features_val = pca_model.transform(bow_features_val)
        params['pca'] = bow_features_train.shape[-1]

    # Normalize the data
    bow_features_train, mean_tr, std_tr = normalize(bow_features_train)
    bow_features_val, _, _ = normalize(bow_features_val, mean_tr, std_tr)

    if params['mlp'] == False:
        if not params['linearsvm']:
            # Without this guard the code below failed with a NameError on
            # the never-created ``svm_model``.
            raise ValueError(
                "No classifier selected: enable either params['mlp'] or "
                "params['linearsvm']")
        # Linear SVC already implements one-vs-rest
        svm_model = LinearSVC()
        svm_model.fit(bow_features_train, target_train)

        #Time to evaluate now.
        confTr = svm_model.decision_function(bow_features_train)
        confVal = svm_model.decision_function(bow_features_val)
    else:
        params['num_output_layers'] = len(auth_to_ix)
        params['inp_size'] = params['pca']
        model = MLP_classifier(params)
        model.fit(bow_features_train, target_train, bow_features_val,
                  target_val, params['epochs'], params['lr'], params['l2'])
        confTr = model.decision_function(bow_features_train)
        confVal = model.decision_function(bow_features_val)

    # 0-based position of the true author when the scores of each document
    # are sorted in descending order.
    ranks_train = np.where(
        confTr.argsort(axis=1)[:, ::-1] == target_train[:, None])[1]
    mean_rank_train = ranks_train.mean()
    # Ranks are 0-based, so "within the top k" means rank < k; the previous
    # ``<=`` counted the top k + 1 entries.
    topk_train = (ranks_train <
                  params['topk']).sum() * 100. / len(target_train)
    train_accuracy = 100. * float(
        (confTr.argmax(axis=1) == target_train).sum()) / len(target_train)

    ranks_val = np.where(
        confVal.argsort(axis=1)[:, ::-1] == target_val[:, None])[1]
    mean_rank_val = ranks_val.mean()
    topk_val = (ranks_val < params['topk']).sum() * 100. / len(target_val)
    val_accuracy = 100. * float(
        (confVal.argmax(axis=1) == target_val).sum()) / len(target_val)

    # DO the binary evaluation similar to the Bagnall
    # For every document one column of the true author (positive) and one
    # column of a randomly drawn author (negative) is selected; the score
    # is the document's own rank inside that column, scaled to (0, 1].
    n_auths = len(auth_to_ix)

    n_train = confTr.shape[0]
    neg_auths_tr = np.random.randint(0, n_auths, n_train)
    adjusted_scores_tr = ((np.argsort(
        confTr[:, np.concatenate([target_train.astype(int), neg_auths_tr])],
        axis=0) == np.concatenate([np.arange(n_train),
                                   np.arange(n_train)])).argmax(axis=0) +
                          1) / float(n_train)
    auc_tr = roc_auc_score(
        np.concatenate([
            np.ones(int(n_train), dtype=int),
            np.zeros(int(n_train), dtype=int)
        ]), adjusted_scores_tr)

    n_val = confVal.shape[0]
    neg_auths_val = np.random.randint(0, n_auths, n_val)
    adjusted_scores_val = ((np.argsort(
        confVal[:, np.concatenate([target_val.astype(int), neg_auths_val])],
        axis=0) == np.concatenate([np.arange(n_val),
                                   np.arange(n_val)])).argmax(axis=0) +
                           1) / float(n_val)
    auc_val = roc_auc_score(
        np.concatenate(
            [np.ones(int(n_val), dtype=int),
             np.zeros(int(n_val), dtype=int)]), adjusted_scores_val)

    print('------------- Training set-------------------')
    print('Accuracy is %.2f, Mean rank is %.2f / %d' % (
        train_accuracy, mean_rank_train, len(auth_to_ix)))
    print('Top-%d Accuracy is %.2f' % (params['topk'], topk_train))
    print('Accuracy per adjusted scores %.3f' % (100. * (
        (adjusted_scores_tr[:n_train] >= 0.5).sum() +
        (adjusted_scores_tr[n_train:] < 0.5).sum()) / (2. * n_train)))
    print('AUC is  %.2f' % (auc_tr))

    print('------------- Val set-------------------')
    print('Accuracy is %.2f, Mean rank is %.2f / %d' % (
        val_accuracy, mean_rank_val, len(auth_to_ix)))
    print('Top-%d Accuracy is %.2f' % (params['topk'], topk_val))
    print('Accuracy per adjusted scores %.3f' % (100. * (
        (adjusted_scores_val[:n_val] >= 0.5).sum() +
        (adjusted_scores_val[n_val:] < 0.5).sum()) / (2. * n_val)))
    print('AUC is  %.2f' % (auc_val))

    print('--------------------------------------------------------------------------')
    print('--------------------------------------------------------------------------\n\n')
Пример #19
0
def main(params):
    """Train the model returned by ``get_classifier(params)``.

    ``params`` is the run configuration dictionary. Depending on
    ``params['mode']`` the model is trained generatively (next-token cross
    entropy on packed sequences) or as an author classifier (cross entropy
    on author ids, plus an L1 term on ``model.compression_W`` when
    ``params['compression_layer']`` is set). ``vocabulary_size`` and
    ``num_output_layers`` are written into ``params`` before the model is
    built.

    Batches come from one of three sources: random document batches with a
    per-document hidden-state cache (``randomize_batches``), sentence
    batches (``use_sentences``), or sequential document batches.

    Every ``log_interval`` iterations the running losses are printed and,
    if the last validation rank is at least the best one seen so far, a
    checkpoint is written with ``save_checkpoint``.
    """
    dp = DataProvider(params)

    # Create vocabulary and author index
    if params['resume'] is None:
        if params['atoms'] == 'char':
            char_to_ix, ix_to_char = dp.create_char_vocab(
                params['vocab_threshold'])
        else:
            char_to_ix, ix_to_char = dp.create_word_vocab(
                params['vocab_threshold'])
        auth_to_ix, ix_to_auth = dp.create_author_idx()
    else:
        # Reuse the vocabularies stored in the checkpoint so that indices
        # stay consistent with the restored weights.
        saved_model = torch.load(params['resume'])
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_char = saved_model['ix_to_char']

    params['vocabulary_size'] = len(char_to_ix)
    params['num_output_layers'] = len(auth_to_ix)
    # This used to be a bare ``print`` followed by an orphaned expression,
    # which never showed the two sizes.
    print('%s %s' % (params['vocabulary_size'], params['num_output_layers']))

    model = get_classifier(params)
    # set to train mode, this activates dropout
    model.train()

    # Initialize the optimizer: SGD with momentum, or RMSprop otherwise
    if params['use_sgd']:
        optim = torch.optim.SGD(model.parameters(),
                                lr=params['learning_rate'],
                                momentum=params['decay_rate'])
    else:
        # ``decoder_W`` gets its own parameter group with zero weight decay.
        optim = torch.optim.RMSprop([{
            'params':
            [p[1] for p in model.named_parameters() if p[0] != 'decoder_W']
        }, {
            'params': model.decoder_W,
            'weight_decay': 0.000
        }],
                                    lr=params['learning_rate'],
                                    alpha=params['decay_rate'],
                                    eps=params['smooth_eps'])
    # Loss function, optionally with per-class weights
    if len(params['balance_loss']) == 0:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.CrossEntropyLoss(
            torch.FloatTensor(params['balance_loss']).cuda())

    # Restore saved checkpoint (model weights only)
    if params['resume'] is not None:
        model.load_state_dict(saved_model['state_dict'])
        # optim.load_state_dict(saved_model['optimizer'])

    total_loss = 0.
    class_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(params['batch_size'])
    hidden_zeros = model.init_hidden(params['batch_size'])
    # Initialize the cache
    if params['randomize_batches']:
        dp.set_hid_cache(range(len(dp.data['docs'])), hidden_zeros)

    # Compute the iteration parameters
    epochs = params['max_epochs']
    total_seqs = dp.get_num_sents(split='train')
    iter_per_epoch = total_seqs // params['batch_size']
    total_iters = iter_per_epoch * epochs
    # Best validation rank seen so far; higher counts as better below.
    best_loss = 0.
    # Never let the interval drop to 0: ``i % 0`` below would raise
    # ZeroDivisionError when an epoch is shorter than 1 / eval_interval.
    eval_every = max(1, int(iter_per_epoch * params['eval_interval']))

    # Placeholders used until the first validation pass has run.
    val_score = 0.
    val_rank = 0

    eval_function = eval_model if params[
        'mode'] == 'generative' else eval_classify

    leakage = params['leakage']
    for i in xrange(total_iters):
        # TODO
        if params['randomize_batches']:
            batch, reset_next = dp.get_rand_doc_batch(params['batch_size'],
                                                      split='train')
            b_ids = [b['id'] for b in batch]
            # Resume each document from its cached hidden state.
            hidden = dp.get_hid_cache(b_ids, hidden)
        elif params['use_sentences']:
            c_aid = None  # ix_to_auth[np.random.choice(auth_to_ix.values())]
            batch = dp.get_sentence_batch(
                params['batch_size'],
                split='train',
                aid=c_aid,
                atoms=params['atoms'],
                sample_by_len=params['sample_by_len'])
            hidden = hidden_zeros
        else:
            batch, reset_h = dp.get_doc_batch(split='train')
            # Zero the hidden state of the batch slots listed in ``reset_h``.
            if len(reset_h) > 0:
                hidden[0].data.index_fill_(1,
                                           torch.LongTensor(reset_h).cuda(),
                                           0.)
                hidden[1].data.index_fill_(1,
                                           torch.LongTensor(reset_h).cuda(),
                                           0.)

        inps, targs, auths, lens = dp.prepare_data(batch,
                                                   char_to_ix,
                                                   auth_to_ix,
                                                   leakage=leakage)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()

        # TODO
        if params['mode'] == 'generative':
            output, hidden = model.forward(inps, lens, hidden, auths)
            targets = pack_padded_sequence(Variable(targs).cuda(), lens)
            loss = criterion(pack_padded_sequence(output, lens)[0], targets[0])
            # There is no separate classification term in this mode; alias
            # it so the ``class_loss`` bookkeeping below does not hit an
            # undefined name.
            lossClass = loss
        else:
            # for classifier auths is the target
            output, _ = model.forward_classify(targs,
                                               hidden,
                                               compute_softmax=False,
                                               lens=lens)
            targets = Variable(auths).cuda()
            lossClass = criterion(output, targets)
            if params['compression_layer']:
                # L1 penalty on the compression layer weights
                loss = lossClass + (model.compression_W.weight.norm(
                    p=1, dim=1)).mean()
            else:
                loss = lossClass
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), params['grad_clip'])

        # Take an optimization step
        optim.step()

        total_loss += loss.data.cpu().numpy()[0]
        class_loss += lossClass.data.cpu().numpy()[0]

        # Save the hidden states in cache for later use
        if params['randomize_batches']:
            if len(reset_next) > 0:
                hidden[0].data.index_fill_(1,
                                           torch.LongTensor(reset_next).cuda(),
                                           0.)
                hidden[1].data.index_fill_(1,
                                           torch.LongTensor(reset_next).cuda(),
                                           0.)
            dp.set_hid_cache(b_ids, hidden)

        # Periodically score the model on the validation split
        if i % eval_every == 0 and i > 0:
            val_rank, val_score = eval_function(dp,
                                                model,
                                                params,
                                                char_to_ix,
                                                auth_to_ix,
                                                split='val',
                                                max_docs=params['num_eval'])

        # Decay the leakage once per epoch until it reaches its minimum
        if i % iter_per_epoch == 0 and i > 0 and leakage > params[
                'leakage_min']:
            leakage = leakage * params['leakage_decay']

        if i % params['log_interval'] == 0 and i > 0:
            cur_loss = total_loss / params['log_interval']
            class_loss = class_loss / params['log_interval']
            elapsed = time.time() - start_time
            # The interval is read from ``params`` like everywhere else in
            # this function rather than from a module-level ``args``.
            print(
                '| epoch {:3.2f} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    float(i) / iter_per_epoch, i, total_iters,
                    params['learning_rate'],
                    elapsed * 1000 / params['log_interval'], cur_loss,
                    math.exp(class_loss)))

            if val_rank >= best_loss:
                best_loss = val_rank
                save_checkpoint(
                    {
                        'iter': i,
                        'arch': params,
                        'val_mean_rank': val_rank,
                        'val_auc': val_score,
                        'char_to_ix': char_to_ix,
                        'ix_to_char': ix_to_char,
                        'auth_to_ix': auth_to_ix,
                        'state_dict': model.state_dict(),
                        'loss': cur_loss,
                        'optimizer': optim.state_dict(),
                    },
                    fappend=params['fappend'],
                    outdir=params['checkpoint_output_directory'])
            start_time = time.time()
            total_loss = 0.
            class_loss = 0.
Пример #20
0
def _split_version_argument(argv):
    """Return ``(version, remaining_args)`` extracted from ``argv``.

    The solver version is given as the word ``version`` followed by an
    integer and defaults to 2. Both tokens are removed from the returned
    argument list so the number cannot be mistaken for one of the ``2`` /
    ``4`` / ``5`` shred-count flags.

    :raises ValueError: if ``version`` is the last argument or its value is
        not an integer.
    """
    args = list(argv)
    if 'version' not in args:
        return 2, args

    position = args.index('version')
    if position + 1 >= len(args):
        raise ValueError("'version' must be followed by an integer")

    version = int(args[position + 1])
    del args[position:position + 2]
    return version, args


def _solver_settings(version):
    """Return ``(solver_kwargs, width, height)`` for a solver ``version``.

    Versions 0 and 1 only try ``False`` for the three search flags; every
    other version tries both values. Version 0 is the only one that does
    not iterate on the first shred. Any version other than 0, 1 and 2 also
    enables the row-permutation improvement and uses 440x440 inputs
    instead of 224x224.
    """
    flag_values = (False, ) if version in (0, 1) else (False, True)
    newest = version not in (0, 1, 2)
    side = 2200 // 5 if newest else 224
    solver_kwargs = {
        'iterate_on_bottom_values': flag_values,
        'iterate_on_right_values': flag_values,
        'column_then_row_values': flag_values,
        'iterate_first_shred': version != 0,
        'try_to_improve_with_row_permutation': newest,
    }
    return solver_kwargs, side, side


def main():
    """Evaluate ``SolverGreedy`` on the train and validation image splits.

    Words recognised in ``sys.argv``: ``debug`` (use 20 samples),
    ``2`` / ``4`` / ``5`` (shred counts to evaluate; ``5`` replaces any
    other choice, and all three are used when none is given), ``image`` /
    ``document`` (image types; all types when none is given) and
    ``version N`` (solver configuration, default 2).
    """
    # NOTE(review): both modes run a single epoch; only the sample count
    # differs - confirm this is intended.
    epochs = 1
    if 'debug' in sys.argv:
        print('Debug')
        number_of_samples = 20
    else:
        print('Release')
        number_of_samples = sys.maxsize

    # Strip ``version N`` first so N is not read as a shred-count flag.
    version, args = _split_version_argument(sys.argv)
    solver_kwargs, width, height = _solver_settings(version)

    ts = list()

    if '2' in args:
        ts.append(2)

    if '4' in args:
        ts.append(4)

    if '5' in args:
        ts = [5, ]

    if 0 == len(ts):
        ts = (2, 4, 5)

    image_types = list()

    if 'image' in args:
        image_types.append(ImageType.IMAGES)

    if 'document' in args:
        image_types.append(ImageType.DOCUMENTS)

    if 0 == len(image_types):
        image_types = ImageType

    np.random.seed(42)

    for image_type in image_types:
        print(image_type.value)

        # Normalisation constants handed to the comparator for each type.
        if image_type == ImageType.IMAGES:
            get_images = DataProvider().get_fish_images
            mean = 100.52933494138787
            std = 65.69793156777682
        else:
            get_images = DataProvider().get_docs_images
            mean = 241.46115784237548
            std = 49.512839464023564

        images, names = get_images(num_samples=number_of_samples, return_names=True)

        images_train, images_validation, names_train, names_validation = train_test_split(images, names,
                                                                                          random_state=42)
        # One comparator with loaded weights per requested shred count.
        t_to_comparator = {
            t: ComparatorCNN(t, width, height, image_type, mean=mean, std=std)
                .load_weights()
            for t in ts
        }

        clf = SolverGreedy(t_to_comparator,
                           image_type=image_type,
                           **solver_kwargs)

        print('Train: ', names_train)
        accuracy = clf.evaluate(images_train, epochs=epochs, ts=ts)
        print('Train 0-1 accuracy on {}: {}'.format(image_type.value, accuracy))
        print('Validation: ', names_validation)
        accuracy = clf.evaluate(images_validation, epochs=epochs, ts=ts)
        print('Validation 0-1 accuracy on {}: {}'.format(image_type.value, accuracy))
Пример #21
0
def main():
    """Evaluate ``SolverPairwiseMerge`` on train and validation image splits.

    Behaviour is selected by tokens looked up in ``sys.argv``:

    * ``debug`` -- use 20 samples and 1 epoch; otherwise ``sys.maxsize``
      samples and 5 epochs.
    * ``2`` / ``4`` -- add that value of ``t`` to the list to evaluate.
    * ``5`` -- evaluate ``t = 5`` only; this replaces any ``2``/``4``
      selection made by the other tokens.
    * ``image`` / ``document`` -- restrict the run to ``ImageType.IMAGES``
      and/or ``ImageType.DOCUMENTS``.

    With no ``t`` token the run covers ``(2, 4, 5)``; with no image-type
    token the loop iterates over ``ImageType`` itself.

    For each image type the images are loaded together with their names,
    split with ``train_test_split(random_state=42)``, and the 0-1 accuracy
    returned by ``clf.evaluate`` is printed for both splits.
    """
    if 'debug' in sys.argv:
        print('Debug')
        number_of_samples = 20
        epochs = 1
    else:
        print('Release')
        number_of_samples = sys.maxsize
        epochs = 5

    # '2' and '4' accumulate, in that order.
    ts = [t for t in (2, 4) if str(t) in sys.argv]

    # '5' deliberately overrides whatever '2'/'4' selected.
    if '5' in sys.argv:
        ts = [5]

    if not ts:
        ts = (2, 4, 5)

    image_types = []

    if 'image' in sys.argv:
        image_types.append(ImageType.IMAGES)

    if 'document' in sys.argv:
        image_types.append(ImageType.DOCUMENTS)

    if not image_types:
        image_types = ImageType

    np.random.seed(42)

    for image_type in image_types:
        print(image_type.value)

        provider = DataProvider()
        if image_type == ImageType.IMAGES:
            get_images = provider.get_fish_images
        else:
            get_images = provider.get_docs_images

        # Normalisation constants are keyed by image type, so the lookup is
        # the same whichever loader was chosen above.
        mean = IMAGE_TYPE_TO_MEAN[image_type]
        std = IMAGE_TYPE_TO_STD[image_type]

        images, names = get_images(num_samples=number_of_samples,
                                   return_names=True)

        (images_train, images_validation,
         names_train, names_validation) = train_test_split(
            images, names, random_state=42)

        weight_info_by_t = \
            IMAGE_TYPE_TO_T_TO_COMPARATOR_CNN_WEIGHT_FILE_ID_AND_FILE_PATH[
                image_type]

        t_to_comparator = {}
        for t in ts:
            weight_info = weight_info_by_t[t]
            # The value stored is whatever load_weights() returns
            # (presumably the comparator itself -- confirm in ComparatorCNN).
            t_to_comparator[t] = ComparatorCNN(
                t,
                weight_info.width,
                weight_info.height,
                image_type,
                mean=mean,
                std=std).load_weights(weight_info.model_path)

        # One separate greedy solver per t, passed to the pairwise-merge
        # solver as `t_to_backup_solver`.
        t_to_backup_solver = {
            t: SolverGreedy(t_to_comparator, image_type=image_type)
            for t in ts
        }

        clf = SolverPairwiseMerge(t_to_comparator,
                                  t_to_backup_solver=t_to_backup_solver,
                                  image_type=image_type)

        print('Train: ', names_train)
        accuracy = clf.evaluate(images_train, epochs=epochs, ts=ts)
        print('Train 0-1 accuracy on {}: {}'.format(image_type.value,
                                                    accuracy))

        print('Validation: ', names_validation)
        accuracy = clf.evaluate(images_validation, epochs=epochs, ts=ts)
        print('Validation 0-1 accuracy on {}: {}'.format(
            image_type.value, accuracy))