Example #1
    def __init__(self):
        self.pre = Preprocess()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.net = ResnetGenerator(ngf=32, img_size=256, light=True).to(self.device)

        params = torch.load('./models/photo2cartoon_10000.pt', map_location=self.device)
        self.net.load_state_dict(params['genA2B'])
Example #2
def varyDataset(ds, save_path):
    classed_feature_preffix = [[
        '^als_d7_id_', '^als_d15_id_', '^als_m1_id_', '^als_m3_id_',
        '^als_m6_id_', '^als_m12_id_', '^als_fst_id_', '^als_lst_id_'
    ],
                               [
                                   '^als_d7_cell_', '^als_d15_cell_',
                                   '^als_m1_cell_', '^als_m3_cell_',
                                   '^als_m6_cell_', '^als_m12_cell_',
                                   '^als_fst_cell_', '^als_lst_cell_'
                               ]]
    printlog('class 5 - value padding: larger/smaller')
    ds_t = pd.read_csv(ds, encoding='gb18030', header=0, index_col=0)
    for i, (id_fc, cell_fc) in enumerate(
            zip(
                Preprocess.pattern_to_feature(ds_t,
                                              classed_feature_preffix[0],
                                              encoding='gb18030'),
                Preprocess.pattern_to_feature(ds_t,
                                              classed_feature_preffix[1],
                                              encoding='gb18030'))):
        for id_f, cell_f in zip(id_fc, cell_fc):
            ds_t.insert(loc=ds_t.columns.get_loc(id_f),
                        column=id_f.replace('id', 'large'),
                        value=ds_t[[id_f, cell_f]].apply(np.max, axis=1))
            ds_t.insert(loc=ds_t.columns.get_loc(id_f),
                        column=id_f.replace('id', 'small'),
                        value=ds_t[[id_f, cell_f]].apply(np.min, axis=1))
        printlog('class 5 - value padding finished {} and {}'.format(
            classed_feature_preffix[0][i], classed_feature_preffix[1][i]))
    ds_t.to_csv(save_path, encoding='gb18030')
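A minimal usage sketch for varyDataset on a toy frame with a single id/cell feature pair (the file names and column values here are made up, and the Preprocess/printlog helpers used above are assumed importable):

import pandas as pd

pd.DataFrame({
    'als_d7_id_nbank_orgnum': [1, 4],
    'als_d7_cell_nbank_orgnum': [3, 2],
    'label': [0, 1],
}).to_csv('ds_toy.csv', encoding='gb18030')

varyDataset(ds='ds_toy.csv', save_path='ds_toy_varied.csv')
# the saved frame gains 'als_d7_large_nbank_orgnum' (row-wise max: 3, 4)
# and 'als_d7_small_nbank_orgnum' (row-wise min: 1, 2)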
Example #3
    def __init__(self):
        self.pre = Preprocess()
        assert os.path.exists(
            './models/photo2cartoon_weights.onnx'
        ), "[Step1: load weights] Cannot find 'photo2cartoon_weights.onnx' in folder 'models'!!!"
        self.session = onnxruntime.InferenceSession(
            './models/photo2cartoon_weights.onnx')
        print('[Step1: load weights] success!')
Example #4
    def __init__(self):
        self.pre = Preprocess()
        self.net = ResnetGenerator(ngf=32, img_size=256, light=True)

        assert os.path.exists(
            './models/photo2cartoon_weights.pdparams'
        ), "[Step1: load weights] Cannot find 'photo2cartoon_weights.pdparams' in folder 'models'!!!"
        params = paddle.load('./models/photo2cartoon_weights.pdparams')
        self.net.set_state_dict(params['genA2B'])
        print('[Step1: load weights] success!')
Example #5
    def __init__(self, weight_path):

        self.pre = Preprocess()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.net = ResnetGenerator(ngf=32, img_size=256, light=True).to(self.device)

        assert os.path.exists(weight_path), "[Step1: load weights] Cannot find weights file: {}!!!".format(weight_path)
        params = torch.load(weight_path, map_location=self.device)
        self.net.load_state_dict(params['genA2B'])          # load the generator that maps a human face to a cartoon portrait
        print('[Step1: load weights] success!')
Example #6
def preprocess_data(args, train_df, test_df):
    data_dir = args['data_dir']

    preprocessor = Preprocess(data_dir + args['pickle_data'])

    train_data, test_data, word_tokenizer = preprocessor.preprocess_data(
        train_df, test_df)
    pos_tags, dep_tags = preprocessor.get_tags()

    return train_data, test_data, word_tokenizer, pos_tags, dep_tags
Example #7
    def __init__(self):
        self.pre = Preprocess()
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.net = ResnetGenerator(ngf=32, img_size=256,
                                   light=True).to(self.device)

        assert os.path.exists(
            './models/photo2blackjack_weights.pt'
        ), "[Step1: load weights] Can not find 'photo2blackjack_weights.pt' in folder 'models!!!'"
        params = torch.load('./models/photo2blackjack_weights.pt',
                            map_location=self.device)
        self.net.load_state_dict(params['genA2B'])
        print('[Step1: load weights] success!')
Example #8
def feature_padding(ds,
                    features,
                    preffix_patterns,
                    encoding='utf-8',
                    header=0,
                    index_col=0):
    ## get the suffixes of the features in each given class
    classed_class_features = Preprocess.pattern_to_feature(ds,
                                                           preffix_patterns,
                                                           encoding=encoding)
    tmp = [
        list(map(lambda fc, pf=preffix: fc[len(pf) - 1:],
                 feature_class)) for preffix, feature_class in zip(
                     preffix_patterns, classed_class_features)
    ]
    class_suffix = []
    for t in tmp:
        class_suffix.extend(t)
    class_suffix = list(set(class_suffix))
    # print('feature_padding: preffix_patterns = {}'.format(preffix_patterns))
    ## get features with mutually exclusive suffixes
    mut_exc_feature = []
    for suffix in class_suffix:
        for i, t in enumerate(tmp):
            if suffix in t:
                mut_exc_feature.append(preffix_patterns[i][1:] + suffix)
                break
        # if suffix in tmp[0]:
        #     mut_exc_feature.append(preffix_patterns[0][1:] + suffix)
        # elif suffix not in tmp[0]:
        #     mut_exc_feature.append(preffix_patterns[1][1:] + suffix if suffix in tmp[1] else preffix_patterns[2][1:] + suffix)
    return mut_exc_feature
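The suffix bookkeeping above is easier to see on a toy input. A self-contained sketch of the same selection logic, with made-up feature names (the first prefix class that contains a given suffix wins, so each suffix yields exactly one feature):

preffix_patterns = ['^als_d7_id_', '^als_d7_cell_']
tmp = [['nbank_orgnum', 'bank_allnum'], ['nbank_orgnum', 'cooff_allnum']]

mut_exc_feature = []
for suffix in set(tmp[0]) | set(tmp[1]):
    for i, t in enumerate(tmp):
        if suffix in t:
            # pattern[1:] drops the leading '^' of the regex
            mut_exc_feature.append(preffix_patterns[i][1:] + suffix)
            break
# -> ['als_d7_id_nbank_orgnum', 'als_d7_id_bank_allnum',
#     'als_d7_cell_cooff_allnum'] (in some order)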
Example #9
class Photo2Cartoon:
    def __init__(self):
        self.pre = Preprocess()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.net = ResnetGenerator(ngf=32, img_size=256, light=True).to(self.device)

        params = torch.load('./models/photo2cartoon_10000.pt', map_location=self.device)
        self.net.load_state_dict(params['genA2B'])

    def inference(self, img):
        # face alignment and segmentation
        face_rgba = self.pre.process(img)
        if face_rgba is None:
            print('cannot detect face!!!')
            return None

        face_rgba = cv2.resize(face_rgba, (256, 256), interpolation=cv2.INTER_AREA)
        face = face_rgba[:, :, :3].copy()
        mask = face_rgba[:, :, 3][:, :, np.newaxis].copy() / 255.
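        # blend the face onto a white background, then rescale pixels from [0, 255] to [-1, 1]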
        face = (face * mask + (1 - mask) * 255) / 127.5 - 1

        face = np.transpose(face[np.newaxis, :, :, :], (0, 3, 1, 2)).astype(np.float32)
        face = torch.from_numpy(face).to(self.device)

        # inference
        with torch.no_grad():
            cartoon = self.net(face)[0][0]

        # post-process
        cartoon = np.transpose(cartoon.cpu().numpy(), (1, 2, 0))
        cartoon = (cartoon + 1) * 127.5
        cartoon = (cartoon * mask + 255 * (1 - mask)).astype(np.uint8)
        cartoon = cv2.cvtColor(cartoon, cv2.COLOR_RGB2BGR)
        return cartoon
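A short usage sketch for the class above ('photo.jpg' is a placeholder path; inference expects an RGB array, so the BGR image returned by cv2.imread is converted first, as the later examples do):

import cv2

c2p = Photo2Cartoon()
img = cv2.cvtColor(cv2.imread('photo.jpg'), cv2.COLOR_BGR2RGB)
cartoon = c2p.inference(img)  # BGR result, or None if no face was found
if cartoon is not None:
    cv2.imwrite('cartoon.png', cartoon)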
Example #10
def refreshModelFeature(ds, listed_feature_pattern):
    fe_temp = Preprocess.pattern_to_feature(ds,
                                            listed_feature_pattern,
                                            encoding='gb18030')
    fe_model = []
    for fe_class in fe_temp:
        fe_model.extend(fe_class)
    return fe_model
Example #11
def run(train_sample, train_label, test_sample, test_label, k):
    train_sample, train_sample_size = Load.loadSample(train_sample)
    train_label, train_label_size = Load.loadLabel(train_label)
    assert train_sample_size == train_label_size, 'train_sample_size does not match train_label_size'

    test_sample, test_sample_size = Load.loadSample(test_sample)
    test_label, test_label_size = Load.loadLabel(test_label)
    assert test_sample_size == test_label_size, 'test_sample_size does not match test_label_size'

    train_sample = Preprocess.normalize(train_sample).values.tolist()  # list
    test_sample = Preprocess.normalize(test_sample).values.tolist()  # list

    label_to_index = {
        label: index
        for index, label in enumerate(set(train_label['x'].tolist()))
    }
    train_index = Preprocess.labelMap(train_label, label_to_index)  # list
    test_index = Preprocess.labelMap(test_label, label_to_index)  # list

    correct_count = 0

    for i, one in enumerate(test_sample):
        euclid_dist = np.linalg.norm(np.array(one) - np.array(train_sample),
                                     axis=1)
        nn_idx = euclid_dist.argsort()[:k]

        nn_vote = []
        nn_decision = 0
        for idx in nn_idx:
            nn_vote.append(train_index[idx])  # labels are only 0 or 1
        if sum(nn_vote) > k / 2:
            # print(list(label_to_index.keys())[1])
            nn_decision = 1
        else:
            # print(list(label_to_index.keys())[0])
            nn_decision = 0
        # print(test_label.values.tolist()[i][0])
        if test_label.values.tolist()[i][0] == list(
                label_to_index.keys())[nn_decision]:
            # right
            correct_count += 1
    test_correct = correct_count / test_sample_size
    Log.log(filename, 'k: {}; correct rate: {}\n'.format(k, test_correct))
    return test_correct
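The vote above assumes binary labels mapped to 0/1, so comparing the sum of the neighbours' labels against k/2 is a plain majority count. A toy illustration with made-up values:

k = 5
nn_vote = [1, 0, 1, 1, 0]                       # labels of the k nearest neighbours
nn_decision = 1 if sum(nn_vote) > k / 2 else 0  # 3 > 2.5 -> class 1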
Example #12
def score(query, document):
    # -- by fast vec -- #
    """Построим представление
     векторное (как было показано в материалах к работе, 
     будем строить нечно средне взвешанное всех представлений 
     слов входящих в каждый документ с весами, полученными из 
     tfidf) отдельно взятого документа и запроса и по 
     cos(vq, vd) = (vq, vd)/(|vq| * |vd|)"""
    # -- строим общий текст title + text -- #
    words_text = (document.title + document.text).split(" ")
    # -- Уберём все лишнее если есть и приведем к нормализованному стостоянию -- #
    query = Preprocess().cleantext(query)
    words_query = query.split(" ")
    vd = sum(list(map(lambda w: dictionary_fastWV.get(w, default),
                      words_text))) / len(words_text)
    vq = sum(
        list(map(lambda w: dictionary_fastWV.get(w, default),
                 words_query))) / len(words_query)
    # -- the cosine-similarity value itself -- #
    return np.dot(vd, vq) / (np.linalg.norm(vd) * np.linalg.norm(vq))
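The snippet assumes two globals it never defines: dictionary_fastWV, a word-to-vector lookup (per the docstring, tf-idf-weighted fastText vectors), and default, the fallback for out-of-vocabulary words. A minimal sketch of what they might look like, with illustrative values only:

import numpy as np

dim = 300                                  # fastText vector size
default = np.zeros(dim, dtype=np.float32)  # OOV words contribute nothing
dictionary_fastWV = {
    # word -> tfidf_weight * fasttext_vector, for example:
    'love': 0.42 * np.ones(dim, dtype=np.float32),
}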
Example #13
def gen_cartoon(img):
    pre = Preprocess()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net = ResnetGenerator(ngf=32, img_size=256, light=True).to(device)

    assert os.path.exists(
        './models/photo2cartoon_weights.pt'
    ), "[Step1: load weights] Can not find 'photo2cartoon_weights.pt' in folder 'models!!!'"
    params = torch.load('./models/photo2cartoon_weights.pt',
                        map_location=device)
    net.load_state_dict(params['genA2B'])

    # face alignment and segmentation
    face_rgba = pre.process(img)
    if face_rgba is None:
        return None

    face_rgba = cv2.resize(face_rgba, (256, 256), interpolation=cv2.INTER_AREA)
    face = face_rgba[:, :, :3].copy()
    mask = face_rgba[:, :, 3][:, :, np.newaxis].copy() / 255.
    face = (face * mask + (1 - mask) * 255) / 127.5 - 1

    face = np.transpose(face[np.newaxis, :, :, :],
                        (0, 3, 1, 2)).astype(np.float32)
    face = torch.from_numpy(face).to(device)

    # inference
    with torch.no_grad():
        cartoon = net(face)[0][0]

    # post-process
    cartoon = np.transpose(cartoon.cpu().numpy(), (1, 2, 0))
    cartoon = (cartoon + 1) * 127.5
    cartoon = (cartoon * mask + 255 * (1 - mask)).astype(np.uint8)
    cartoon = cv2.cvtColor(cartoon, cv2.COLOR_RGB2BGR)
    out_path = Path(tempfile.mkdtemp()) / "out.png"
    cv2.imwrite(str(out_path), cartoon)
    return out_path
Example #14
class Photo2Cartoon:
    def __init__(self):
        self.pre = Preprocess()
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.net = ResnetGenerator(ngf=32, img_size=256,
                                   light=True).to(self.device)

        assert os.path.exists(
            './models/photo2blackjack_weights.pt'
        ), "[Step1: load weights] Can not find 'photo2blackjack_weights.pt' in folder 'models!!!'"
        params = torch.load('./models/photo2blackjack_weights.pt',
                            map_location=self.device)
        self.net.load_state_dict(params['genA2B'])
        print('[Step1: load weights] success!')

    def inference(self, img):
        # face alignment and segmentation
        face_rgba = self.pre.process(img)
        if face_rgba is None:
            print('[Step2: face detect] cannot detect face!!!')
            return None

        print('[Step2: face detect] success!')
        face_rgba = cv2.resize(face_rgba, (256, 256),
                               interpolation=cv2.INTER_AREA)
        face = face_rgba[:, :, :3].copy()
        mask = face_rgba[:, :, 3][:, :, np.newaxis].copy() / 255.

        face = (face * mask + (1 - mask) * 255)
        img = Image.fromarray(np.uint8(face))
        img = img.convert("L")
        img = img.convert("RGB")
        face = np.asarray(img)

        face = np.transpose(face[np.newaxis, :, :, :],
                            (0, 3, 1, 2)).astype(np.float32)
        face = torch.from_numpy(face).to(self.device)

        # inference
        with torch.no_grad():
            cartoon = self.net(face)[0][0]

        # post-process
        cartoon = np.transpose(cartoon.cpu().numpy(), (1, 2, 0))
        cartoon = (cartoon + 1) * 127.5
        cartoon = (cartoon * mask + 255 * (1 - mask)).astype(np.uint8)
        cartoon = cv2.cvtColor(cartoon, cv2.COLOR_RGB2BGR)
        print('[Step3: photo to cartoon] success!')
        return cartoon
Example #15
def feature_padding_on_hit_rate(ds,
                                features,
                                preffix_patterns,
                                encoding='utf-8',
                                header=0,
                                index_col=0):
    ## get the suffixes of the features in each given class
    classed_class_features = Preprocess.pattern_to_feature(ds,
                                                           preffix_patterns,
                                                           encoding=encoding)
    ds = pd.read_csv(
        ds, encoding=encoding, header=header,
        index_col=index_col) if isinstance(ds, str) else ds
    ## tmp: per-class suffixes, not yet flattened
    tmp = [
        list(map(lambda fc, pf=preffix: fc[len(pf) - 1:],
                 feature_class)) for preffix, feature_class in zip(
                     preffix_patterns, classed_class_features)
    ]
    class_suffix = []
    for t in tmp:
        class_suffix.extend(t)
    ## class_suffix: flattened, de-duplicated suffixes
    class_suffix = list(set(class_suffix))
    # print('feature_padding: preffix_patterns = {}'.format(preffix_patterns))
    ## get features with mutually exclusive suffixes
    mut_exc_feature = []
    for suffix in class_suffix:
        tmp_hit_rate = 0
        tmp_output_feature = ''
        for i, t in enumerate(tmp):
            if suffix in t:
                tmp_feature = preffix_patterns[i][1:] + suffix
                tmp_feature_hit_rate = ds[tmp_feature].notna().sum(
                ) / ds.shape[0]
                if tmp_feature_hit_rate > tmp_hit_rate:
                    tmp_hit_rate = tmp_feature_hit_rate
                    tmp_output_feature = tmp_feature
        if tmp_output_feature != '':
            mut_exc_feature.append(tmp_output_feature)
    printlog('feature_padding_on_hit_rate: mut_exc_feature: {}'.format(
        mut_exc_feature),
             printable=False)
    # if suffix in tmp[0]:
    #     mut_exc_feature.append(preffix_patterns[0][1:] + suffix)
    # elif suffix not in tmp[0]:
    #     mut_exc_feature.append(preffix_patterns[1][1:] + suffix if suffix in tmp[1] else preffix_patterns[2][1:] + suffix)
    return mut_exc_feature
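When the same suffix occurs under several prefixes, the variant with the highest hit rate (share of non-null rows) wins. A toy check of that criterion, with made-up column names:

import pandas as pd

ds = pd.DataFrame({
    'als_d7_id_nbank_orgnum': [1, None, None],  # hit rate 1/3
    'als_d7_cell_nbank_orgnum': [1, 2, None],   # hit rate 2/3 -> selected
})
for col in ds.columns:
    print(col, ds[col].notna().sum() / ds.shape[0])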
Example #16
def create_dataset(root,
                   dataset,
                   num_bits,
                   pad,
                   valid_size,
                   valid_indices=None,
                   split='train'):
    assert split in ['train', 'valid', 'test']

    preprocess = Preprocess(num_bits)
    c, h, w = (1, 28 + 2 * pad, 28 + 2 * pad)

    if dataset == 'fashion-mnist':
        transforms = []

        if split == 'train':
            transforms += [tvt.RandomHorizontalFlip()]

        transforms += [tvt.Pad((pad, pad)), tvt.ToTensor(), preprocess]

        dataset = datasets.FashionMNIST(root=root,
                                        train=(split in ['train', 'valid']),
                                        transform=tvt.Compose(transforms),
                                        download=True)
    else:
        raise RuntimeError('Unknown dataset')

    if split == 'train':
        num_train = len(dataset)
        indices = torch.randperm(num_train).tolist()
        # valid_size = int(np.floor(valid_frac * num_train))
        train_indices, valid_indices = indices[
            valid_size:], indices[:valid_size]
        dataset = Subset(dataset, train_indices)
    elif split == 'valid':
        dataset = Subset(dataset, valid_indices)

    print(f'Using {split} data split of size {len(dataset)}')

    return ImageDataset(dataset=dataset,
                        img_shape=(c, h, w),
                        preprocess_fn=preprocess,
                        valid_indices=valid_indices)
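A hypothetical pair of calls mirroring the signature above: build the train split first, then reuse its validation indices for the valid split (this assumes ImageDataset exposes the valid_indices it is constructed with):

train_set = create_dataset('./data', 'fashion-mnist', num_bits=5, pad=2,
                           valid_size=5000, split='train')
valid_set = create_dataset('./data', 'fashion-mnist', num_bits=5, pad=2,
                           valid_size=5000,
                           valid_indices=train_set.valid_indices,
                           split='valid')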
Example #17
class Photo2Cartoon:
    def __init__(self):
        self.pre = Preprocess()

        assert os.path.exists(
            './models/photo2cartoon_weights.onnx'
        ), "[Step1: load weights] Can not find 'photo2cartoon_weights.onnx' in folder 'models!!!'"
        self.session = onnxruntime.InferenceSession(
            './models/photo2cartoon_weights.onnx')
        print('[Step1: load weights] success!')

    def inference(self, img):
        # face alignment and segmentation
        face_rgba = self.pre.process(img)
        if face_rgba is None:
            print('[Step2: face detect] cannot detect face!!!')
            return None

        print('[Step2: face detect] success!')
Example #18
class Photo2Cartoon:
    def __init__(self):
        self.pre = Preprocess()
        self.net = ResnetGenerator(ngf=32, img_size=256, light=True)

        assert os.path.exists(
            './models/photo2cartoon_weights.pdparams'
        ), "[Step1: load weights] Cannot find 'photo2cartoon_weights.pdparams' in folder 'models'!!!"
        params = paddle.load('./models/photo2cartoon_weights.pdparams')
        self.net.set_state_dict(params['genA2B'])
        print('[Step1: load weights] success!')

    def inference(self, img):
        # face alignment and segmentation
        face_rgba = self.pre.process(img)
        if face_rgba is None:
            print('[Step2: face detect] cannot detect face!!!')
            return None

        print('[Step2: face detect] success!')
        face_rgba = cv2.resize(face_rgba, (256, 256),
                               interpolation=cv2.INTER_AREA)
        face = face_rgba[:, :, :3].copy()
        mask = face_rgba[:, :, 3][:, :, np.newaxis].copy() / 255.
        face = (face * mask + (1 - mask) * 255) / 127.5 - 1

        face = np.transpose(face[np.newaxis, :, :, :],
                            (0, 3, 1, 2)).astype(np.float32)
        face = paddle.to_tensor(face)

        # inference
        with paddle.no_grad():
            cartoon = self.net(face)[0][0]

        # post-process
        cartoon = np.transpose(cartoon.numpy(), (1, 2, 0))
        cartoon = (cartoon + 1) * 127.5
        cartoon = (cartoon * mask + 255 * (1 - mask)).astype(np.uint8)
        cartoon = cv2.cvtColor(cartoon, cv2.COLOR_RGB2BGR)
        print('[Step3: photo to cartoon] success!')
        return cartoon
Example #19
class Photo2Cartoon:
    def __init__(self):
        self.pre = Preprocess()

        assert os.path.exists(
            './models/photo2cartoon_weights.onnx'
        ), "[Step1: load weights] Can not find 'photo2cartoon_weights.onnx' in folder 'models!!!'"
        self.session = onnxruntime.InferenceSession(
            './models/photo2cartoon_weights.onnx')
        print('[Step1: load weights] success!')

    def inference(self, img):
        # face alignment and segmentation
        face_rgba = self.pre.process(img)
        if face_rgba is None:
            print('[Step2: face detect] cannot detect face!!!')
            return None

        print('[Step2: face detect] success!')
        face_rgba = cv2.resize(face_rgba, (256, 256),
                               interpolation=cv2.INTER_AREA)
        face = face_rgba[:, :, :3].copy()
        mask = face_rgba[:, :, 3][:, :, np.newaxis].copy() / 255.
        face = (face * mask + (1 - mask) * 255) / 127.5 - 1

        face = np.transpose(face[np.newaxis, :, :, :],
                            (0, 3, 1, 2)).astype(np.float32)

        # inference
        cartoon = self.session.run(['output'], input_feed={'input': face})

        # post-process
        cartoon = np.transpose(cartoon[0][0], (1, 2, 0))
        cartoon = (cartoon + 1) * 127.5
        cartoon = (cartoon * mask + 255 * (1 - mask)).astype(np.uint8)
        cartoon = cv2.cvtColor(cartoon, cv2.COLOR_RGB2BGR)
        print('[Step3: photo to cartoon] success!')
        return cartoon
Example #20
def run():
    printlog(
        '-----------------------------------start presetting-----------------------------------'
    )
    ## hyperparams
    ## feature selection
    drop_sparse_threshold = 10
    hit_pos_rate_upper = 0.5
    hit_pos_rate_lower = 0.2
    tree_max_depth = None
    iv_upper_thresh = 999
    iv_lower_thresh = 0.2
    lasso_alpha = 1.0
    lasso_coef = 1e-05
    ## model
    xgb_FP_grad_mul = 0.3
    xgb_FN_grad_mul = 1.2
    xgb_zero_proba_cutoff = 0.5
    ## settings
    matplotlib.use('Agg')
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['font.family'] = 'SimHei'
    Log.clear_log(creative=True)
    ##
    ds_path = 'data/data.csv'  # raw dataset
    ds_merged = 'data/ds_merged.csv'  # raw dataset merged with population dataset
    ds_ns = 'tmp/ds_ns.csv'  # merged dataset clear of sparse columns
    ds_na = 'tmp/ds_na.csv'  # merged dataset clear of na data
    ds_cat = 'tmp/ds_cat.csv'  # merged dataset clear of categorical feature
    ds_cut = 'tmp/ds_cut.csv'  # merged dataset cut for IV feature selection
    ds_varied = 'tmp/ds_varied.csv'  # merged dataset varied
    ds_train = 'tmp/ds_train.csv'  # split train dataset
    ds_valid = 'tmp/ds_valid.csv'  # split validation dataset
    ds_test = 'tmp/ds_test.csv'  # split test dataset
    iv_detail = 'iv/iv_detail.csv'  # dataset with feature IVs
    lasso_detail = 'lasso/lasso_detail.csv'  # dataset with feature lasso coefficients
    xgb_detail = 'xgb/xgb_detail.csv'  # dataset with feature xgb importances
    fe_iv = 'features/fe_iv.csv'  # selected feature by IV
    fe_lasso = 'features/fe_lasso.csv'  # selected feature by lasso coefficients
    fe_xgb = 'features/fe_xgb.csv'  # selected feature by xgb importances
    tree_gate = 'tmp/tree_gate.joblib'  # trained tree model
    model_xgb = 'xgb/model_xgb.joblib'  # trained xgb model
    model_xgb_optim = 'xgb/model_xgb_optim.joblib'  # trained xgb model optimized
    model_stacking = 'tmp/model_stacking.joblib'  # trained stacking model
    plot_gate_tree = 'tmp/gate_tree.dot'  # plot of tree model
    fe_gate_hit = 'features/fe_gate_hit.csv'  # selected gate feature
    fe_gate_tree = 'features/fe_gate_tree.csv'  # selected tree feature
    cutoff_xgb = 'tmp/cutoff.txt'
    cutoff_xgb_optim = 'tmp/cutoff_optim.txt'
    ## class 1, 2, 4 variables
    fe_gate_pattern = ['^sl_', '^fr_', '^alu_']
    ## class 3, 5, 6, 7, 8 variables
    fe_model_pattern = ['^ir_', '^als_', '^cf_', '^cons_', '^pd_']

    # printlog('-----------------------------------feature preprocess-----------------------------------')
    # printlog('-----------------------------------prepare dataset-----------------------------------')
    # Preprocess.drop_sparse(ds_merged, 'all', threshold=drop_sparse_threshold, save_path=ds_ns, encoding='gb18030')
    # Preprocess.fill_na(ds_ns, 'all', replacement=-1, save_path=ds_na, encoding='gb18030')
    # Preprocess.fill_cat(ds_na, 'all', save_path=ds_cat, encoding='gb18030')
    # varyDataset(ds=ds_cat, save_path=ds_varied)
    # generateExperienceFeature(ds_varied)
    # train_fe, valid_fe, test_fe, train_lb, valid_lb, test_lb = Preprocess.train_validation_test_split(ds_varied, -1, 0.8, 0.05, 0.15, encoding='gb18030')
    # printlog('train label proportion:      {}; '.format(train_lb.sum() / train_lb.count()))
    # printlog('validation label proportion: {}; '.format(valid_lb.sum() / valid_lb.count()))
    # printlog('test label proportion:       {}; '.format(test_lb.sum() / test_lb.count()))
    # printlog('train feature shape:         {}; '.format(train_fe.shape))
    # printlog('validation feature shape:    {}; '.format(valid_fe.shape))
    # printlog('test feature shape:          {}; '.format(test_fe.shape))
    # pd.concat([train_fe, train_lb], axis=1, sort=True).to_csv(ds_train, encoding='gb18030')
    # pd.concat([valid_fe, valid_lb], axis=1, sort=True).to_csv(ds_valid, encoding='gb18030')
    # pd.concat([test_fe,  test_lb],  axis=1, sort=True).to_csv(ds_test,  encoding='gb18030')

    # printlog('-----------------------------------feature selection-----------------------------------')
    # printlog('-----------------------------------feature selection on gate feature and tree classifier-----------------------------------')
    # fe_gate       = refreshModelFeature(ds_train, fe_gate_pattern)
    # ## gate feature
    # fe_gate_upper = Feature_selection.hit_positive_rate(ds_train, fe_gate, -1, hit_pos_rate_upper, na_replacement=-1, encoding='gb18030')
    # fe_gate_lower = Feature_selection.hit_positive_rate(ds_train, fe_gate, -1, hit_pos_rate_lower, na_replacement=-1, encoding='gb18030')
    # Log.itersave(fe_gate_hit, fe_gate_upper)
    # Log.itersave(fe_gate_tree, [fe for fe in fe_gate_lower if fe not in fe_gate_upper])
    # ## tree model
    # tcl = Model.tree_classifier(
    #     ds=ds_train, features=Log.iterread(fe_gate_tree), label_column=-1,
    #     max_depth=tree_max_depth, encoding='gb18030', export_path=plot_gate_tree) ## only if fill_cat apply method='label_binarizer' should tree features be refreshed.
    # dump(tcl, tree_gate)

    # printlog('-----------------------------------feature selection on IV-----------------------------------')
    # fe_model = refreshModelFeature(ds_train, fe_model_pattern)
    # ## redo below 1 line only if change threshold and bin or totally rebuild
    # Temp_support.cut(ds_train, fe_model, threshold=10, bin=10, method='equal-frequency', save_path=ds_cut, encoding='gb18030')
    # Temp_support.select_feature_iv(ds_cut, fe_model, -1, iv_upper_thresh, iv_lower_thresh, to_file=iv_detail, encoding='gb18030')
    # ds_temp = pd.read_csv(iv_detail, encoding='gb18030', header=0, index_col=0)
    # ds_temp.sort_values('iv', ascending=False).head(5).to_csv(fe_iv)
    # # ds_temp = pd.read_csv(iv_detail, encoding='gb18030', header=0, index_col=0)['iv']
    # # ds_temp[ds_temp.between(iv_lower_thresh, iv_upper_thresh)].to_csv(fe_iv, header='iv')

    from utils.Simplify import method_iteration, results_archive

    # def func_whot_return(going):
    #     print('func: go {} with bebe'.format(going))
    # def func_with_return(going, being):
    #     print('func: go {} with {}'.format(going, being))
    #     return going, being
    # value_non     = None
    # value_str     = 'bebe'
    # value_lst_sin = [['bebe']]
    # value_lst_mul = ['bebe', 'gogo']

    # param_str     = {'going': value_str,     'being': value_str}
    # param_lst_sin = {'going': value_lst_sin, 'being': value_lst_sin}
    # param_lst_mul = {'going': value_lst_mul, 'being': value_lst_mul}
    # param_lst_mix = {'going': value_lst_sin, 'being': value_lst_mul}
    # param_str_non = {'going': value_str,     'being': value_non}
    # param_sin_non = {'going': value_lst_sin, 'being': value_non}
    # param_mul_non = {'going': value_lst_mul, 'being': value_non}

    # keys = [
    #     ['going', 'bebe'],
    #     ['going', 'bebe'],
    #     None,
    #     'x'
    # ]

    # func_res1, func_res2, func_res3, func_res4 = results_archive(
    #     results=method_iteration(
    #         methods=[func_with_return, func_with_return, func_whot_return, lambda x: x+1],
    #         params=[param_lst_mix, param_lst_mul, value_lst_sin, {'x': [1,2,3]}]),
    #     keys=keys, listed=False)
    # printlog('func 1 res: {}'.format(func_res1))
    # printlog('func 2 res: {}'.format(func_res2))
    # printlog('func 3 res: {}'.format(func_res3))
    # printlog('func 4 res: {}'.format(func_res4))
    # printlog('-----------------------------------feature selection on lasso/xgb-----------------------------------')
    # classed_fe_model = Preprocess.pattern_to_feature(ds_train, fe_model_pattern, encoding='gb18030')
    # ds_t = pd.read_csv(ds_train, encoding='gb18030', header=0, index_col=0)
    # listed_all_lasso_coef = []
    # listed_best_lasso_coef = []
    # listed_all_xgb_imprt = []
    # listed_best_xgb_imprt = []
    # for fe_model in tqdm(classed_fe_model):
    #     best_feaures, all_features = Feature_selection.select_on_lasso(
    #         X=ds_t.loc[:, fe_model], y=ds_t.iloc[:, -1],
    #         lasso_params={'alpha': lasso_alpha}, sort_index=2, sorted=True,
    #         encoding='gb18030')
    #     listed_best_lasso_coef.append(best_feaures)
    #     listed_all_lasso_coef.append(all_features)
    #     best_feaures, all_features = Feature_selection.select_on_xgb(
    #         X=ds_t.loc[:, fe_model], y=ds_t.iloc[:, -1],
    #         xgb_params={'alpha': lasso_alpha}, sort_index=2, sorted=True,
    #         encoding='gb18030')
    #     listed_best_xgb_imprt.append(best_feaures)
    #     listed_all_xgb_imprt.append(all_features)
    # pd.concat(listed_all_lasso_coef, axis=0).to_csv(lasso_detail, encoding='gb18030', header='lasso_coef')
    # pd.concat(listed_best_lasso_coef, axis=0).to_csv(fe_lasso, encoding='gb18030', header='lasso_coef')
    # pd.concat(listed_all_xgb_imprt, axis=0).to_csv(xgb_detail, encoding='gb18030', header='feature_importances')
    # pd.concat(listed_best_xgb_imprt, axis=0).to_csv(fe_xgb, encoding='gb18030', header='feature_importances')

    # printlog('-----------------------------------feature selection on lasso/xgb-----------------------------------')
    classed_fe_model = Preprocess.pattern_to_feature(ds_train,
                                                     fe_model_pattern,
                                                     encoding='gb18030')
    ds_t = pd.read_csv(ds_train, encoding='gb18030', header=0, index_col=0)
    lasso_select_params = {
        'X': [ds_t.loc[:, fe_model] for fe_model in classed_fe_model],
        'y': [ds_t.iloc[:, -1]],
        'lasso_params': [{
            'alpha': lasso_alpha
        }],
        'sort_index': [2],
        'sorted': [True],
        'encoding': ['gb18030']
    }
    xgb_select_params = {
        'X': [ds_t.loc[:, fe_model] for fe_model in classed_fe_model],
        'y': [ds_t.iloc[:, -1]],
        'xgb_params': [{
            'alpha': lasso_alpha
        }],
        'sort_index': [2],
        'sorted': [True],
        'encoding': ['gb18030']
    }
    keys = [['best_lasso_features', 'all_lasso_features'],
            ['best_xgb_features', 'all_xgb_features']]
    lasso_res, xgb_res = results_archive(
        results=method_iteration(
            methods=[
                Feature_selection.select_on_lasso,
                Feature_selection.select_on_xgb
            ],
            params=[lasso_select_params, xgb_select_params]),
        keys=keys,
        listed=False)
    print('lasso best features: {}'.format(lasso_res['best_lasso_features']))
    print('xgb   best features: {}'.format(xgb_res['best_xgb_features']))

    # printlog('-----------------------------------features-----------------------------------')
    # hitrate_features  = Log.iterread(fe_gate_hit)
    # tree_features     = Log.iterread(fe_gate_tree)
    # # selected_features = [
    # #     'als_m12_id_nbank_orgnum', 'als_m3_id_cooff_allnum',
    # #     'ir_id_x_cell_cnt', 'als_m6_id_rel_allnum',
    # #     'als_fst_id_nbank_inteday', 'cons_tot_m12_visits','pd_gender_age']
    # selected_features = []
    # selected_features.extend(pd.read_csv(fe_iv, encoding='gb18030', header=0, index_col=0).index.tolist())
    # selected_features.extend(pd.read_csv(fe_xgb, encoding='gb18030', header=0, index_col=0).index.tolist())
    # selected_features.extend(pd.read_csv(fe_lasso, encoding='gb18030', header=0, index_col=0).index.tolist())
    # selected_features = list(set(selected_features))
    # printlog('Selected features: {}'.format(selected_features), printable=False)

    # printlog('-----------------------------------prepare train dataset-----------------------------------')
    # train_dataset = pd.read_csv(ds_train, encoding='gb18030', header=0, index_col=0)
    # valid_dataset = pd.read_csv(ds_valid, encoding='gb18030', header=0, index_col=0)
    # X_train = train_dataset.loc[:, selected_features].values
    # y_train = train_dataset.iloc[:,-1]
    # X_valid = valid_dataset.loc[:, selected_features].values
    # y_valid = valid_dataset.iloc[:,-1]

    # printlog('-----------------------------------train on xgb-----------------------------------')
    # def objective(y_true, y_pred):
    #     multiplier = pd.Series(y_true).mask(y_true == 1, xgb_FN_grad_mul).mask(y_true == 0, xgb_FP_grad_mul)
    #     grad = multiplier * (y_pred - y_true)
    #     hess = multiplier * np.ones(y_pred.shape)
    #     return grad, hess
    # xgb_params          = {'max_depth': range(1, 11), 'n_estimators': range(270, 280, 1), 'objective': [objective], 'random_state': [1], 'seed': [1]}
    # xgb_grid_plot       = 'tmp/grid_XGB_optim'
    # best_model, best_score, _, _ = Assess.gridTrainValidSelection(
    #     XGBClassifier(), xgb_params, X_train, y_train, X_valid, y_valid, # nfolds=5 [optional, instead of validation set]
    #     metric=roc_auc_score, greater_is_better=True,
    #     scoreLabel='ROC AUC', showPlot=False, to_file=None)
    # printlog(best_model, best_score)
    # dump(XGBClassifier(), model_xgb)
    # dump(best_model, model_xgb_optim)

    # printlog('-----------------------------------calculate cutoff-----------------------------------')
    # for model, cutoff_model in zip([load(model_xgb), load(model_xgb_optim)], [cutoff_xgb, cutoff_xgb_optim]):
    #     model.fit(X_train, y_train)
    #     cutoff = optimalCutoff(model, X_valid, y_valid.to_numpy())
    #     Log.itersave(cutoff_model, [cutoff])

    # ########################################### stacking (grid-search variant) ###############################
    # estimators = [
    #     ('RF',   RandomForestClassifier()),
    #     ('ET',   ExtraTreesClassifier()),
    #     ('AB',   AdaBoostClassifier()),
    #     ('GBDT', GradientBoostingClassifier()),
    #     ('XGB',  XGBClassifier())
    # ]
    # grids = [
    #     {
    #         'n_estimators': range(10, 101, 10),
    #         'min_samples_leaf': [1, 5, 10, 15, 20, 25],
    #         'max_features': ['sqrt', 'log2', 0.5, 0.6, 0.7],
    #         'n_jobs': [-1], 'random_state': [1]},
    #     {
    #         'n_estimators': range(10, 101, 10),
    #         'min_samples_leaf': [1, 5, 10, 15, 20, 25],
    #         'max_features': ['sqrt', 'log2', 0.5, 0.6, 0.7],
    #         'n_jobs': [-1], 'random_state': [1]},
    #     {
    #         'n_estimators': range(10, 101, 10),
    #         'random_state': [1]},
    #     {
    #         'n_estimators': range(10, 101, 10),
    #         'min_samples_leaf': [1, 5, 10, 15, 20, 25],
    #         'max_features': ['sqrt', 'log2', 0.5, 0.6, 0.7],
    #         'random_state': [1]},
    #     {
    #         'n_estimators': range(10, 101, 10),
    #         'max_depth': range(1, 11),
    #         'n_jobs': [-1], 'random_state': [1]}]
    # grid_plots = [
    #     'tmp/grid_RF.png', 'tmp/grid_ET.png', 'tmp/grid_AB.png',
    #     'tmp/grid_GBDT.png', 'tmp/grid_XGB.png']
    # best_models = []
    # for i in range(5):
    #     best_model, best_score, all_models, all_scores = Assess.gridTrainValidSelection(
    #         estimators[i][1], grids[i], X_train, y_train, X_valid, y_valid, # nfolds=5 [optional, instead of validation set]
    #         metric=roc_auc_score, greater_is_better=True,
    #         scoreLabel='ROC AUC', to_file=grid_plots[i])
    #     printlog(best_model)
    #     printlog(best_score)
    #     best_models.append((estimators[i][0], best_model))
    # stackingClassifier = StackingClassifier(estimators=best_models)
    # dump(stackingClassifier, model_stacking)
    # printlog('-----------------------------------train on stacking-----------------------------------')
    # estimators = [
    #     ('RF',   RandomForestClassifier()),
    #     ('ET',   ExtraTreesClassifier()),
    #     ('AB',   AdaBoostClassifier()),
    #     # ('GBDT', GradientBoostingClassifier()),
    #     ('XGB',  XGBClassifier())
    # ]
    # estimator_params = [
    #     {'max_depth': range(10, 101, 1), 'n_estimators': range(30, 121, 1)},
    #     {'max_depth': range(10, 101, 1), 'n_estimators': range(30, 121, 1)},
    #     {'n_estimators': range(30, 121, 1)},
    #     # {'max_depth': range(10, 121, 5), 'n_estimators': range(10, 121, 5)},
    #     {'max_depth': range(2,  10,  1), 'n_estimators': range(10, 121, 1)}
    # ]
    # for i, (estimator, params) in enumerate(zip(estimators, estimator_params)):
    #     estimators[i][1].set_params(**Assess.gridCVSelection(
    #             estimator=estimator[1], estimator_name=estimator[0], save_folder='stacking',
    #             train_features=X_train, train_label=y_train, valid_features=X_valid, valid_label=y_valid,
    #             grid_params=params, grid_scorers=['neg_mean_squared_error', 'roc_auc'], refit_scorer='roc_auc'))
    # stackingClassifier = StackingClassifier(estimators=estimators)
    # stackingClassifier.fit(X_train, y_train)
    # dump(stackingClassifier, model_stacking)

    # printlog('-----------------------------------prepare test dataset-----------------------------------')
    # test_dataset = pd.read_csv(ds_test, encoding='gb18030', header=0, index_col=0)
    # X_test = test_dataset.loc[:, selected_features].values
    # y_test = test_dataset.iloc[:, -1]

    # printlog('-----------------------------------test on gate and tree-----------------------------------')
    # pred_hit     = (test_dataset[hitrate_features] != -1).any(axis=1).astype(int)
    # pred_tree    = pd.Series(load(tree_gate).predict(test_dataset[tree_features]), index=test_dataset.index)
    # printlog('gate test: {} labelled 1 by hit positive rate.'.format(pred_hit.sum()))
    # printlog('gate test: {} labelled 1 by tree classifier.'.format(pred_tree.sum()))

    # printlog('-----------------------------------test on xgb-----------------------------------')
    # prediction = recoverEstimator(model_xgb, X_train, y_train).predict(X_test)
    # print((prediction == 1).sum())
    # prediction_optim    = recoverEstimator(model_xgb_optim, X_train, y_train).predict(X_test)
    # # prediction = y_test.copy()
    # # labeled_index = prediction[prediction == 1].index.tolist()
    # # unlabeled_index = prediction[prediction == 0].index.tolist()
    # # prediction.loc[labeled_index[:89]] = 0
    # # prediction.loc[unlabeled_index[:46]] = 1
    # # Assess.modelAssess(y_test, prediction, '/', 'Stacking')
    # # Assess.confusionMatrixFromPrediction(
    # #     y_test, prediction,       [0, 1], 'Normalized matrics on Stacking',
    # #     'true', plt.cm.Blues, 'confusion_Stacking.png')
    # Assess.confusionMatrixFromPrediction(
    #     y_test, prediction_optim, [0, 1], 'Normalized matrics on XGB_optim without cutoff',
    #     'true', plt.cm.Blues, 'tmp/confusion_XGB_optim_raw.png')
    # prediction          = recoverEstimator(model_xgb, X_train, y_train).predict_proba(X_test)
    # prediction_optim    = recoverEstimator(model_xgb_optim, X_train, y_train).predict_proba(X_test)
    # ## assess model
    # Assess.modelAssess(y_test.to_numpy(), prediction,       'misc', 'XGB_before_gate')
    # Assess.modelAssess(y_test.to_numpy(), prediction_optim, 'misc', 'XGB_optim_before_gate')
    # ## apply gate prediction to xgb prediction
    # prediction          = applyGate(prediction,       pred_hit, pred_tree)
    # prediction_optim    = applyGate(prediction_optim, pred_hit, pred_tree)
    # ## assess model
    # Assess.modelAssess(y_test.to_numpy(), prediction,       'misc', 'XGB')
    # Assess.modelAssess(y_test.to_numpy(), prediction_optim, 'misc', 'XGB_optim')
    # ## apply cutoff formula
    # cutoff=0.9
    # cutoff_optim=0.7
    # prediction          = applyCutoff(prediction, cutoff)
    # prediction_optim    = applyCutoff(prediction, cutoff_optim)
    # Assess.confusionMatrixFromPrediction(
    #     y_test, prediction[:, 1],       [0, 1], 'Normalized matrics on XGB with cutoff',
    #     'true', plt.cm.Blues, 'tmp/confusion_XGB.png')
    # Assess.confusionMatrixFromPrediction(
    #     y_test, prediction_optim[:, 1], [0, 1], 'Normalized matrics on XGB_optim with cutoff',
    #     'true', plt.cm.Blues, 'tmp/confusion_XGB_optim.png')

    # printlog('-----------------------------------test on stacking-----------------------------------')
    # prediction  = recoverEstimator(model_stacking, X_train, y_train).predict(X_test)
    # Assess.confusionMatrixFromPrediction(
    #     y_test, prediction,       [0, 1], 'Normalized matrics on stacking without cutoff',
    #     'true', plt.cm.Blues, 'tmp/confusion_stacking_raw.png')
    # ## assess model
    # prediction  = recoverEstimator(model_stacking, X_train, y_train).predict_proba(X_test)
    # Assess.modelAssess(y_test.to_numpy(), prediction, 'misc', 'ENSSEMBLE_before_gate')
    # ## apply gate prediction to xgb prediction
    # prediction = applyGate(prediction, pred_hit, pred_tree)
    # ## assess model
    # Assess.modelAssess(y_test.to_numpy(), prediction, 'misc', 'ENSSEMBLE')
    # ## apply cutoff formula
    # prediction = applyCutoff(prediction, cutoff=0.7)
    # Assess.confusionMatrixFromPrediction(
    #     y_test, prediction[:, 1],       [0, 1], 'Normalized matrics on stacking with cutoff',
    #     'true', plt.cm.Blues, 'tmp/confusion_stacking.png')

    printlog(
        '-----------------------------------finished-----------------------------------'
    )
Example #21
File: train.py Project: zghzdxs/chainer
print('GPU: {}'.format(device))
print('# Minibatch-size: {}'.format(args.batchsize))
print('# epoch: {}'.format(args.epoch))
print('')

if device.xp is chainer.backends.cuda.cupy:
    chainer.global_config.autotune = True

# Datasets
if not os.path.isdir(args.dataset):
    raise RuntimeError('Dataset directory not found: {}'.format(args.dataset))
paths = sorted([
    str(path) for path in pathlib.Path(args.dataset).glob('wav48/*/*.wav')])
preprocess = Preprocess(
    sr=16000, n_fft=1024, hop_length=256, n_mels=128, top_db=20,
    length=args.length, quantize=args.a_channels)
dataset = chainer.datasets.TransformDataset(paths, preprocess)
train, valid = chainer.datasets.split_dataset_random(
    dataset, int(len(dataset) * 0.9), args.seed)

# Networks
encoder = UpsampleNet(args.n_loop * args.n_layer, args.r_channels)
decoder = WaveNet(
    args.n_loop, args.n_layer,
    args.a_channels, args.r_channels, args.s_channels,
    args.use_embed_tanh)
model = chainer.links.Classifier(EncoderDecoderModel(encoder, decoder))

# Optimizer
optimizer = chainer.optimizers.Adam(1e-4)
Example #22
import os
import cv2
import numpy as np
from tqdm import tqdm
import argparse

from utils import Preprocess

parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str, help='photo folder path')
parser.add_argument('--save_path', type=str, help='save folder path')

args = parser.parse_args()
os.makedirs(args.save_path, exist_ok=True)

pre = Preprocess()

for idx, img_name in enumerate(tqdm(os.listdir(args.data_path))):
    img = cv2.cvtColor(cv2.imread(os.path.join(args.data_path, img_name)),
                       cv2.COLOR_BGR2RGB)

    # face alignment and segmentation
    face_rgba = pre.process(img)
    if face_rgba is not None:
        # change background to white
        face = face_rgba[:, :, :3].copy()
        mask = face_rgba[:, :, 3].copy()[:, :, np.newaxis] / 255.
        face_white_bg = (face * mask + (1 - mask) * 255).astype(np.uint8)

        cv2.imwrite(os.path.join(args.save_path,
                                 str(idx).zfill(4) + '.png'),
                    cv2.cvtColor(face_white_bg, cv2.COLOR_RGB2BGR))
Example #23
    parser.add_argument("--path",
                        type=str,
                        default='../data/screens_phone/screen_5.jpeg',
                        help="Path of the image")
    parser.add_argument("--save_path",
                        type=str,
                        default='',
                        help="Path to save the image")

    args = parser.parse_args()
    path = args.path
    save = args.save_path
    img = cv2.imread(path)  # read the image

    ##### Preprocessing
    preprocessor = Preprocess()
    img = preprocessor.remove_shadows(img.copy())
    angle_desk, img = preprocessor.deskew(img.copy())
    edges = preprocessor.edge_detection(img.copy())
    lines = preprocessor.line_detection(edges)
    lines = preprocessor.filter_hough_lines(lines, edges)

    ##### Computing lines and angles
    angles_deg = preprocessor.compute_angles(lines)
    rot_angles = preprocessor.adjust_angles(angles_deg)
    lines_candidates = preprocessor.filter_lines_direction(
        lines, rot_angles, angle_desk)

    ##### adjusting blind image
    blind_image_adjustor = Blind_image_adjustment(img, lines_candidates,
                                                  img.shape[0])
Example #24
                    help='Number of prefetch samples')
parser.add_argument('--resume',
                    '-r',
                    default='',
                    help='Resume the training from snapshot')
args = parser.parse_args()
if args.gpus != [-1]:
    chainer.cuda.set_max_workspace_size(2 * 512 * 1024 * 1024)
    chainer.global_config.autotune = True

# get paths
files, _ = get_LJSpeech_paths(params.root)
# files, _ = get_VCTK_paths(params.root)

preprocess = Preprocess(params.sr, params.n_fft, params.hop_length,
                        params.n_mels, params.top_db, params.length,
                        params.categorical_output_dim)

dataset = chainer.datasets.TransformDataset(files, preprocess)
train, valid = chainer.datasets.split_dataset_random(dataset,
                                                     int(len(dataset) * 0.9),
                                                     params.split_seed)

# make directory of results
result = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
os.mkdir(result)
shutil.copy(__file__, os.path.join(result, __file__))
shutil.copy('utils.py', os.path.join(result, 'utils.py'))
shutil.copy('params.py', os.path.join(result, 'params.py'))
shutil.copy('generate.py', os.path.join(result, 'generate.py'))
shutil.copy('net.py', os.path.join(result, 'net.py'))
Example #25
        from TextCNNen import Config, Model
        dataset = '../data/aclImdb'

    # fix the random seeds so runs are reproducible at the data level (this also brings some hidden drawbacks)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make every run produce the same result

    # set the data parameters, model parameters and other settings for this experiment
    config = Config(dataset)

    start_time = time.time()
    print("Loading data...")
    # set up the pretrained word-vector embedding, load the class list, labels and samples, and map the text to character-level or word-level indices
    preprocess = Preprocess(config)
    # get iterators over the processed datasets; each iteration yields one batch of data and labels
    train_iter, dev_iter, test_iter = preprocess.get_iters()

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    # note: the expected runtime environment is `cuda`
    model = Model(config).to(config.device)
    # initialize the network weights
    init_network(model)
    # print the model structure
    print(model.parameters)
    # iterate over the data and feed it to the model: train, evaluate on dev during training, and finally test
    train(config, model, train_iter, dev_iter, test_iter)
Example #26
def run(filename, train_sample, train_label, test_sample, test_label, title, M,
        thresh, CART_step):
    train_sample, train_sample_size = Load.loadSample(train_sample)
    train_label, train_label_size = Load.loadLabel(train_label)
    assert train_sample_size == train_label_size, 'train_sample_size does not match train_label_size'

    test_sample, test_sample_size = Load.loadSample(test_sample)
    test_label, test_label_size = Load.loadLabel(test_label)
    assert test_sample_size == test_label_size, 'test_sample_size does not match test_label_size'

    train_sample = Preprocess.normalize(train_sample,
                                        True).values.tolist()  # list
    test_sample = Preprocess.normalize(test_sample,
                                       True).values.tolist()  # list

    label_to_index = {
        label: index
        for index, label in enumerate(set(train_label['x'].tolist()))
    }
    train_index = Preprocess.labelMap(train_label, label_to_index)  # list
    test_index = Preprocess.labelMap(test_label, label_to_index)  # list

    input_size = len(train_sample[0])
    sample_size = len(train_sample)
    sample_weights = [1 / sample_size for _ in range(sample_size)]
    classifier_weights = []
    classifier_thresholds = []
    threshold_positions = []
    test_corrs = []
    test_times = [i + 1 for i in range(M)]

    for i in range(M):
        threshold, position, errors = Calc.CART(train_sample, train_index,
                                                sample_weights, thresh,
                                                CART_step)
        total_error = Calc.gentleError(np.array(sample_weights),
                                       np.array(errors))
        classifier_weights.append(round(Calc.classifierError(total_error), 3))
        classifier_thresholds.append(threshold)
        threshold_positions.append(position)
        sample_weights = Calc.updateVariableWeights(np.array(sample_weights),
                                                    total_error, errors)
        # print('errors: {}'.format(errors))
        # print('sample_weights: {}'.format(sample_weights))
        # print('classifier_threshold: {} in {}'.format(threshold, position))
        print('total_error: {}'.format(total_error))
        print('threshold_positions:   {}'.format(threshold_positions))
        print('classifier_thresholds: {}'.format(classifier_thresholds))
        print('classifier_weights:    {}'.format(classifier_weights))

        test_corr = 0
        test_size = len(test_sample)
        for sample, index in zip(test_sample, test_index):
            vote = 0
            for threshold, position, weight in zip(classifier_thresholds,
                                                   threshold_positions,
                                                   classifier_weights):
                if sample[position] >= threshold:
                    vote += weight
                elif sample[position] < threshold:
                    vote -= weight
            if vote >= 0 and index == 1:
                test_corr += 1
            elif vote < 0 and index == 0:
                test_corr += 1
        test_corrs.append(round(test_corr / test_size, 3))
        Log.log(filename, 'M: {}; correction: {}\n'.format(M, test_corrs[-1]))
        print(
            '-----------------thresh: {}; CART_step: {}; iter: {}-----------------'
            .format(thresh, CART_step, i + 1))

    Graph.draw(filename, test_times, test_corrs, test_times[-1], 1.0, title)
    return test_corrs
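The test loop above is a weighted-majority vote over decision stumps: each stump votes +weight or -weight depending on which side of its threshold the sample falls, and the sign of the sum is the ensemble decision. A toy illustration with made-up numbers:

sample = [0.2, 0.7]
# (threshold, feature position, classifier weight) per boosting round
stumps = [(0.5, 0, 0.8), (0.4, 1, 0.3)]
vote = sum(w if sample[p] >= t else -w for t, p, w in stumps)
decision = 1 if vote >= 0 else 0  # -0.8 + 0.3 = -0.5 -> class 0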
Example #27
                    '-r',
                    default='',
                    help='Resume the training from snapshot')
args = parser.parse_args()
if args.gpus != [-1]:
    chainer.cuda.set_max_workspace_size(2 * 512 * 1024 * 1024)
    chainer.global_config.autotune = True

# get paths
if params.dataset_type == 'LJSpeech':
    files, _ = get_LJSpeech_paths(params.root)
else:
    files, _ = get_VCTK_paths(params.root)

preprocess = Preprocess(params.sr, params.n_fft, params.hop_length,
                        params.n_mels, params.fmin, params.fmax, params.top_db,
                        params.length)

dataset = chainer.datasets.TransformDataset(files, preprocess)
if params.split_seed is None:
    train, valid = chainer.datasets.split_dataset_random(
        dataset, int(len(dataset) * 0.9))
else:
    train, valid = chainer.datasets.split_dataset_random(
        dataset, int(len(dataset) * 0.9), params.split_seed)

# make directory of results
result = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
os.mkdir(result)
shutil.copy(__file__, os.path.join(result, __file__))
shutil.copy('utils.py', os.path.join(result, 'utils.py'))
Example #28
parser.add_argument('--gpu',
                    '-g',
                    type=int,
                    default=-1,
                    help='GPU ID (negative value indicates CPU)')
args = parser.parse_args()
if args.gpu != -1:
    chainer.cuda.set_max_workspace_size(2 * 512 * 1024 * 1024)
    chainer.global_config.autotune = True

# set data
path = args.input

# preprocess
n = 1  # batchsize; currently only 1 is supported
inputs = Preprocess(params.sr, params.n_fft, params.hop_length, params.n_mels,
                    params.top_db, None, params.categorical_output_dim)(path)

_, condition, _ = inputs
if params.categorical_output_dim is False or params.categorical_output_dim is None:
    input_dim = 1
else:
    input_dim = params.categorical_output_dim
x = numpy.zeros([n, input_dim, 1, 1], dtype=numpy.float32)
condition = numpy.expand_dims(condition, axis=0)

# make model
encoder = UpsampleNet(params.upsample_factors)
decoder = WaveNet(params.n_loop, params.n_layer, params.filter_size,
                  params.residual_channels, params.dilated_channels,
                  params.skip_channels, params.output_dim, params.quantize,
                  params.log_scale_min, params.condition_dim,
Example #29
parser.add_argument('--gpu',
                    '-g',
                    type=int,
                    default=-1,
                    help='GPU ID (negative value indicates CPU)')
args = parser.parse_args()
if args.gpu != -1:
    chainer.cuda.set_max_workspace_size(2 * 512 * 1024 * 1024)
    chainer.global_config.autotune = True

# set data
path = args.input

# preprocess
n = 1  # batchsize; currently only 1 is supported
inputs = Preprocess(params.sr, params.n_fft, params.hop_length, params.n_mels,
                    params.fmin, params.fmax, params.top_db, None)(path)

_, condition = inputs
condition = numpy.expand_dims(condition, axis=0)

# make model
glow = Glow(params.hop_length, params.n_mels, 1, params.squeeze_factor,
            params.n_flows, params.n_layers, params.wn_channel,
            params.early_every, params.early_size, params.var)

# load trained parameter
chainer.serializers.load_npz(args.model, glow, 'updater/model:main/')

if args.gpu >= 0:
    use_gpu = True
    chainer.cuda.get_device_from_id(args.gpu).use()
Example #30
def retrieve(query):
    """
    Получаем запорос, если не пустой, то с помощью инвертированного индекса 
    Записываем потенциальных претендетнов на вывод. Это должно быть достаточно быстро.
    Сперва смотрим на title, так как чаще всего люди ищут песню по её названию. Если в top 100 or 200 
    Не все по названию попали, то смотрим уже по содержанию в тексте песни. Хотя это моежт быть неправильно, ибо 
    общие слова есть в текстах всех песен, поэтому стоит попробовать строить для каждого текста его векторное представление
    Аналогично тому, как мы это делали с текстами и embeddings. 
    """
    # -- the user may enter an artist -- #
    # -- or a genre -- #
    # -- or try to recall the song title (exactly or approximately) -- #
    # -- or artist + song + genre -- #
    # -- or try to recall a fragment of a line from the song -- #
    N = 100
    # -- if the query is empty, return the first N (or perhaps rank by popularity) -- #
    if len(query) == 0:
        return index[:N]

    query_words = [w.lower() for w in query.split(" ")]
    query_lowercase = " ".join(query_words)
    # -- search among the artists -- #
    # -- [query1, query2, ...], [name1, name2, ...] -- #
    # -- we simply scan the artist names and look for the best match with the query -- #
    name_artist = findKeyInInvertIndexDict(invertIndexArtistName,
                                           query_lowercase)
    # -- we now have the found name -- #
    # -- we can also search among the genres (there are few of them) -- #
    # -- a user may simply enter a genre, and then it is reasonable to rank the output by popularity -- #
    name_genre = findKeyInInvertIndexDict(invertIndexGenres, query_lowercase)
    # -- search by song title -- #
    # -- the query has to be transformed the same way we transformed the titles -- #
    prep = Preprocess(lemmatize=True)
    query_prep_words = prep.cleantext(query_lowercase,
                                      stopwords=True).split(" ")
    # -- query_prep_words is the preprocessed list of query words -- #
    indeces_of_documents = []
    indeces_title = []
    for w in query_prep_words:
        if (invertindextitles.get(w, -1) != -1):
            indeces_title += invertindextitles[w]['index']

    indeces_text = []
    for w in query_prep_words:
        if (invertindextext.get(w, -1) != -1):
            indeces_text += invertindextext[w]['index']

    # -- in fact we could already return a list if indeces_title collected enough candidates -- #
    indeces_of_documents += indeces_title
    if (name_artist != str()):
        # -- we have some match -- #
        # -- need intersection with indeces_title -- #
        # -- indeces_title is a list, and the artist's inverted-index entry is a list too -- #
        # -- though the intersection may not even be necessary -- #
        indeces_of_documents += list(
            set(indeces_of_documents).intersection(
                invertIndexArtistName[name_artist]))

    # -- if nothing was found above, fall back to the song lyrics -- #
    if (len(indeces_of_documents) == 0 and len(indeces_text) != 0):
        # -- nothing was selected so far -- #
        indeces_of_documents += indeces_text

    # -- if there are more than, say, 100 candidates, look at song popularity and sort by it -- #
    # -- always sort by popularity, whatever the candidate set is -- #
    if (len(indeces_of_documents) != 0):
        return np.array(index)[sorted(
            indeces_of_documents,
            key=lambda x: index_popularity[x])][:N].tolist()
    else:
        # -- nothing was found at all, so we still have to return something -- #
        # -- for example, ranked by popularity -- #
        return np.array(index)[[
            ind for ind, _ in sorted(index_popularity, key=lambda x: x[1])
        ]][:N].tolist()