Example #1
def main():
    data_provider = DataProvider(data_directory=Path('./data'))
    item_users = transform_to_item_user_csr_matrix(
        data_provider.get_purchases_train())

    # baseline model
    model = get_model()
    np.random.seed(42)
    model.fit(item_users=item_users)

    test_user_ids, test_purchases = get_purchases_by_customer(
        data_provider.get_purchases_test())
    recommendations = get_recommendations(model, test_user_ids, item_users)
    score = mapk(test_purchases, recommendations, k=10)
    return score
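For reference, mapk above is presumably the standard mean average precision at k (as in the ml_metrics package); a minimal sketch of that computation, assuming test_purchases and recommendations are per-user lists of item ids:

def apk(actual, predicted, k=10):
    # average precision at k for a single user
    predicted = predicted[:k]
    hits, score = 0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            hits += 1
            score += hits / (i + 1)
    return score / min(len(actual), k) if actual else 0.0


def mapk(actual, predicted, k=10):
    # mean of apk over all users
    return sum(apk(a, p, k) for a, p in zip(actual, predicted)) / len(actual)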
Example #2
def run_simulate(user_document_params, click_exposure_params):
    user_document_generator = UserDocumentDataGenerator(**user_document_params)
    click_exposure_generator = ClickExposureDataGenerator(
        user_document=user_document_generator, **click_exposure_params)

    user_document_data = user_document_generator.generate_data()
    relevance, exposure, click, exposure_labels, implicit_feedback = click_exposure_generator.generate_data(
    )

    # model setup
    model = get_model(implicit_feedback, user_document_data, exposure_labels)

    # train
    trainer = Trainer(model)
    trainer.train(relevance[0].reshape(-1),
                  user_document_data.reshape(-1, 1100))

    return trainer
Example #3
def examinate(algorithm_name):
    table_name = 'score_' + algorithm_name  # name of the table storing results
    for mall_id in malls:
        print(mall_id, ' with ', algorithm_name, ' starts...')
        sql = "SELECT train_time FROM {s} WHERE mall_id='{m}'".format(
            m=mall_id, s=table_name)  # 检测这个商场有没有建过模型,建过模型会有记录
        try:  # check whether the score table exists; create it if it does not
            cur.execute(sql)
        except pymysql.err.ProgrammingError:
            sql2 = '''CREATE TABLE `{n}` (
                                `mall_id`  varchar(255) NOT NULL ,
                                `result`  varchar(255) NULL ,
                                `param`  varchar(255) NULL ,
                                `train_time`  int NULL ,
                                PRIMARY KEY (`mall_id`)
                                );'''.format(n=table_name)
            cur.execute(sql2)
            cur.execute(sql)
        if cur.rowcount != 0:  # a model has already been built for this mall
            print(mall_id, ' has already been fitted with ', algorithm_name)
            continue
        metrix, tar = get_data(mall_id)
        x_train, x_test, y_train, y_test = train_test_split(
            metrix, tar, test_size=0.1, random_state=random_state)  # split into training and test sets
        save_dir = root_path + "model/" + algorithm_name + "_" + mall_id + "_model.m"  # path where the model is saved
        clf = get_model(algorithm_name)  # get a fresh model by name
        train_time = time.time()
        clf.fit(x_train, y_train)
        train_time = time.time() - train_time
        print('time : ', train_time)
        score = clf.score(x_test, y_test)  # evaluate on the held-out set to get the accuracy
        train_time = int(train_time)
        sql = "INSERT INTO {tn} SET result='{s}', train_time={tt},mall_id='{m}' " \
              "ON DUPLICATE KEY UPDATE result='{s}', train_time={tt}".format(
            s=score, m=mall_id, tt=train_time, tn=table_name)
        cur.execute(sql)
        joblib.dump(clf, save_dir)
        print(get_time(), ' saved a model for ', mall_id, ' with ',
              algorithm_name, ' .  score ', score)
        conn.commit()
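The SQL above is assembled with str.format; a safer variant of the upsert (a sketch, assuming the same cur, conn and score_<algorithm> table as in the example) lets pymysql escape the values and only formats the table name, which cannot be parameterized:

upsert = ("INSERT INTO {tn} (mall_id, result, train_time) VALUES (%s, %s, %s) "
          "ON DUPLICATE KEY UPDATE result=VALUES(result), train_time=VALUES(train_time)").format(tn=table_name)
cur.execute(upsert, (mall_id, str(score), int(train_time)))
conn.commit()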
Example #4
def get_best_model(dataset_dir, return_model=False, return_params=False):
    if len(os.listdir(dataset_dir)) == 0:
        return None
    dataset = os.path.split(os.path.split(dataset_dir)[0])[-1]
    data, _ = get_data(dataset)
    num_data_points = np.sum(data)

    models = []
    BIC_scores = []
    sigs = []
    clusters = []
    for model in os.listdir(dataset_dir):
        experiment_dir = os.path.join(dataset_dir, model)
        best_run = get_best_run(experiment_dir)
        if len(best_run) > 0:
            best_score = load_json(best_run)['log-likelihood']
            num_sigs = int(model.split('_')[2][:3])
            num_clusters = int(model.split('_')[1][:3])
            num_params = (num_clusters - 1) + (num_sigs - 1) * num_clusters + (96 - 1) * num_sigs
            models.append(best_run)
            clusters.append(num_clusters)
            sigs.append(num_sigs)
            BIC_scores.append(np.log(num_data_points) * num_params - 2 * best_score)

    models = np.array(models)
    BIC_scores = np.array(BIC_scores)
    sigs = np.array(sigs, dtype='int')
    clusters = np.array(clusters, dtype='int')
    best_model = models[np.argmin(BIC_scores)]

    if return_model:
        return get_model(load_json(best_model)['parameters'])

    if return_params:
        return {'BIC_scores': BIC_scores, 'num_clusters': clusters, 'model_paths': models, 'num_signatures': sigs}

    return best_model
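The selection criterion in this example is the Bayesian information criterion, BIC = k * ln(n) - 2 * ln(L), with k the number of free parameters and n the number of observations (here the total mutation count). A small helper capturing the same computation as the loop above (a sketch; the parameter count reflects the sum-to-one constraint on each probability vector of the mixture model):

import numpy as np

def bic_score(log_likelihood, num_clusters, num_sigs, num_data_points, num_categories=96):
    # free parameters: cluster weights, per-cluster signature mixtures, and the signatures themselves,
    # each probability vector losing one degree of freedom to its normalization
    num_params = (num_clusters - 1) + (num_sigs - 1) * num_clusters + (num_categories - 1) * num_sigs
    return np.log(num_data_points) * num_params - 2 * log_likelihood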
Example #5
def get_model_rf(mall_id):
    data, tar = u.get_data(mall_id)
    clf = u.get_model('RF_1000')
    clf.fit(data, tar)
    return clf
Example #6
def get_model_knn(mall_id):
    data, tar = u.get_data(mall_id)
    clf = u.get_model('knn_5')
    clf.fit(data, tar)
    return clf
Example #7
 # x_train,x_test, y_train, y_test = train_test_split(data, tar, test_size=test_size, random_state=random_state)
 x_train, x_test, y_train, y_test = data, data, tar, tar
 labels = sorted(set(y_train))
 print('start ', 'xgb')
 model = u.get_model_xgb(mall)
 score = model.score(x_test, y_test)
 print(mall, ' score : ', score, '  ', 'xgb')
 print(mall, ' predicting  ', 'xgb')
 result = []
 result_proba = []
 result.append(model.predict(x_test))
 result_proba.append(model.predict_proba(x_test))
 for al in algs:
     gc.collect()
     print('start ', al)
     model = u.get_model(al)
     print(mall, ' training model ', al)
     model.fit(x_train, y_train)
     score = model.score(x_test, y_test)
     print(mall, ' score : ', score, '  ', al)
     print(mall, ' predicting  ', al)
     result.append(model.predict(x_test))
     result_proba.append(model.predict_proba(x_test))
 wrong = 0
 # print(result)
 result = [[result[0][i], result[1][i], result[2][i]]
           for i in range(0, len(result[0]))]
 for i in range(0, len(y_test)):
     if y_test[i] not in result[i] or len(set(result[i])) > 1:
         print(y_test[i], '-------', result[i])
         if y_test[i] not in result[i]:
             wrong += 1  # the source is truncated here; counting samples missed by every model is the likely intent
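The snippet only checks whether the three models agree and whether the true label appears among their predictions; if a single combined prediction were wanted, a plain majority vote over the per-model labels is one option (a sketch, not part of the original code):

from collections import Counter

def majority_vote(per_model_labels):
    # per_model_labels is one row of result, i.e. one predicted label per model
    return Counter(per_model_labels).most_common(1)[0][0]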
Example #8
def main():
    # Argparse custom actions
    class SetModes(argparse.Action):
        """Set the modes of operations."""
        def __call__(self, parser, args, values, option_string=None):
            for value in values:
                setattr(args, value, True)

    # yapf: disable
    parser = argparse.ArgumentParser(description='Fake News Classifier')
    # Initialization
    parser.add_argument('--init', action='store_true', default=False,
                        help='perform initialization')
    # Modes
    parser.add_argument('-m', '--mode', action=SetModes, nargs='+', choices=['train', 'test', 'demo', 'plot'],
                        help='specify the mode of operation: train, test, demo, plot')
    parser.add_argument('--train', action='store_true', default=False,
                        help='train the model')
    parser.add_argument('--test', action='store_true', default=False,
                        help='test the model (must either train or load a model)')
    parser.add_argument('--demo', action='store_true', default=False,
                        help='demo the model on linewise samples from a file (must either train or load a model)')
    parser.add_argument('--plot', action='store_true', default=False,
                        help='plot training data (must either train or have existing training data)')
    # Options
    parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('-c', '--config', type=str,
                        help='path to configuration json file (overrides args)')
    parser.add_argument('--data-loader', type=str, default='BatchLoader',
                        help='data loader to use (default: "BatchLoader")')
    parser.add_argument('--dataset', type=str, default='FakeRealNews',
                        help='dataset to use (default: "FakeRealNews")')
    parser.add_argument('-e', '--epochs', type=int, default=10,
                        help='number of epochs to train (default: 10)')
    parser.add_argument('-f', '--file', type=str,
                        help='specify a file for another argument')
    parser.add_argument('--lr', '--learning-rate', dest='learning_rate', type=float, default=1e-4,
                        help='learning rate (default: 1e-4)')
    parser.add_argument('-l', '--load', type=int, metavar='EPOCH',
                        help='load a model and its training data')
    parser.add_argument('--loss', type=str, default='BCEWithLogitsLoss',
                        help='loss function (default: "BCEWithLogitsLoss")')
    parser.add_argument('--model', type=str, default='FakeNewsNet',
                        help='model architecture to use (default: "FakeNewsNet")')
    parser.add_argument('-s', '--sample-size', type=int, metavar='N',
                        help='limit sample size for training')
    parser.add_argument('--seed', type=int, default=0,
                        help='random seed (default: 0)')
    parser.add_argument('--save', action='store_true', default=True,
                        help='save model checkpoints and training data (default: True)')
    parser.add_argument('--no-save', dest='save', action='store_false')
    args = parser.parse_args()
    # yapf: enable

    # Print help if no args
    if len(sys.argv) == 1:
        parser.print_help()
        parser.exit()

    # Configure logger
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    # Load configuration file if specified
    if args.config is not None:
        utils.load_config(args)

    # Exit if no mode is specified
    if not args.init and not args.train and not args.test and not args.demo and not args.plot:
        logging.error(
            'No mode specified. Please specify with: --init or --mode {train,test,demo,plot}'
        )
        exit(1)
    # Exit on `--load` if run directory not found
    if (args.load is not None or
        (args.plot
         and not args.train)) and not os.path.isdir(utils.get_path(args)):
        logging.error(
            'Could not find directory for current configuration {}'.format(
                utils.get_path(args)))
        exit(1)
    # Exit on `test` or `demo` without `train` or `--load EPOCH`
    if (args.test or args.demo) and not (args.train or args.load is not None):
        logging.error(
            'Cannot run `test` or `demo` without a model. Try again with either `train` or `--load EPOCH`.'
        )
        exit(1)
    # Exit on `demo` without a string file
    if args.demo and not args.file:
        logging.error(
            'Cannot run `demo` without a file. Try again with `--file FILE`.')
        exit(1)

    # Setup run directory
    if args.save and not args.init and (args.train or args.test
                                        or args.demo or args.plot):
        utils.save_config(args)
        path = utils.get_path(args) + '/output.log'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        logging.getLogger().addHandler(logging.FileHandler(path))

    # Set random seeds
    random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Variable declarations
    training_data = None

    # Load GloVe vocabulary
    if args.init or args.train or args.test or args.demo:
        glove = torchtext.vocab.GloVe(name='6B', dim=50)

    # Perform initialization
    if args.init or args.train or args.test:
        # Determine which dataset to use
        dataset = utils.get_dataset(args)
        # Preload the dataset
        dataset.load()
        # Get preprocessed samples
        samples = preprocessing.get_samples(dataset, glove, args.init)
        random.shuffle(samples)

    # DataLoader setup for `train`, `test`
    if args.train or args.test:
        # Select data loader to use
        DataLoader = utils.get_data_loader(args)

        # Split samples
        split_ratio = [.6, .2, .2]
        trainset, validset, testset = list(
            DataLoader.splits(samples, split_ratio))
        if args.sample_size is not None:  # limit samples used in training
            trainset = trainset[:args.sample_size]
            validset = validset[:int(args.sample_size * split_ratio[1] /
                                     split_ratio[0])]

        # Get data loaders
        train_loader, valid_loader, test_loader = [
            DataLoader(split, batch_size=args.batch_size)
            for split in [trainset, validset, testset]
        ]

    # Load samples for demo
    if args.demo:
        if os.path.isfile(args.file):
            # Read samples from the input file
            with open(args.file, 'r') as f:
                samples = [line for line in f if line.strip()]
            data = pd.DataFrame({
                'text': samples,
                'label': [0.5] * len(samples)
            })
            # Preprocess samples
            preprocessing.clean(data)
            samples = preprocessing.encode(data, glove)
            samples = [(torch.tensor(text).long(), label)
                       for text, label in samples]

            # Select data loader to use
            DataLoader = utils.get_data_loader(args)

            # Get data loader
            data_loader = DataLoader(samples, batch_size=1, shuffle=False)
        else:
            logging.error('Could not find file for demo at {}'.format(
                args.file))
            exit(1)

    # Model setup for `train`, `test`, `demo`
    if args.train or args.test or args.demo:
        # Create the model
        model = utils.get_model(glove, args)

        # Load a model
        if args.load is not None:
            utils.load_model(args.load, model, args)

    # Run `train`
    if args.train:
        training_data = training.train(model, train_loader, valid_loader, args)

    # Run `test`
    if args.test:
        if args.train or args.load is not None:
            criterion = utils.get_criterion(args.loss)
            acc, loss = training.evaluate(model, test_loader, criterion)
            logging.info('Testing accuracy: {:.4%}, loss: {:.6f}'.format(
                acc, loss))
        else:
            logging.error('No model loaded for testing')
            exit(1)

    # Run `demo`
    if args.demo:
        if args.train or args.load is not None:
            model.eval()  # set model to evaluate mode
            logging.info('-- Results --')
            for i, (text, _) in enumerate(data_loader):
                preview = data['text'][i][:32] + '...'
                out = model(text).flatten()
                prob = torch.sigmoid(out)  # apply sigmoid to get probability
                pred = (prob >
                        0.5).long()  # predict `true` if greater than 0.5
                label = ['fake', 'true'][pred.item()]
                label = '{}{}{}'.format(
                    '\033[92m' if pred.item() else '\033[93m', label,
                    '\033[0m')
                confidence = (prob if pred.item() else 1 - prob).item()
                logging.info(
                    'Report {}: {} with {:.2%} confidence - "{}"'.format(
                        i, label, confidence, preview))
        else:
            logging.error('No model loaded for demo')
            exit(1)

    # Run `plot`
    if args.plot:
        if training_data is None:
            training_data = utils.load_training_data(args, allow_missing=False)
        if args.load is not None and not args.train:
            for k, v in training_data.items():
                training_data[k] = v[:args.load + 1]

        logging.info('Plotting training data')
        training.plot(training_data)
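Typical invocations, assuming the script above is saved as main.py and that samples.txt holds one headline per line (neither name is given in the example):

python main.py --mode train test --dataset FakeRealNews --epochs 10 --batch-size 64
python main.py --load 9 --mode demo --file samples.txt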
Example #9
def compare_panel_clusters():
    np.random.seed(1359)
    # full_dataset, panel_dataset = 'BRCA-panel-full', 'BRCA-panel'
    full_dataset, panel_dataset = 'nature2019-full', 'nature2019-panel'
    full_data, active_signatures = get_data(full_dataset)
    panel_data, _ = get_data(panel_dataset)
    signatures = get_cosmic_signatures()[active_signatures]
    num_samples = len(full_data)

    full_data_exposures = stack_nnls(full_data, signatures)

    full_data_exposures_dists = cosine_similarity(full_data_exposures)
    corrs = []
    models = []
    relations = []

    for model in ['MIX', 'SigMA']:
        if model == 'MIX':
            d = os.path.join(ROOT_DIR, 'experiments/trained_models/{}/refit'.format('BRCA-panel'))

            mix = get_model(load_json(get_best_model(d))['parameters'])
            clusters = np.argmax(mix.soft_cluster(full_data), 1)

        elif model == 'SigMA':
            # d = os.path.join(ROOT_DIR, 'data/ICGC-BRCA/out-sigma-brca-panel.tsv')
            d = os.path.join(ROOT_DIR, 'data/nature2019/SigMA_output.tsv')
            all_df = pd.read_csv(d, sep='\t')
            # In case this is comma separated
            if len(all_df.columns) == 1:
                all_df = pd.read_csv(d, sep=',')
            clusters = all_df['categ'].values
            unique_clusters = np.unique(clusters)
            cluster_to_num = {}
            for i, c in enumerate(unique_clusters):
                cluster_to_num[c] = i

            clusters = np.array([cluster_to_num[c] for c in clusters])

        else:
            raise ValueError('error')

        dists_in_clusters = []
        dists_out_clusters = []
        for i in range(num_samples):
            for j in range(i + 1, num_samples):
                if clusters[i] == clusters[j]:
                    dists_in_clusters.append(full_data_exposures_dists[i, j])
                else:
                    dists_out_clusters.append(full_data_exposures_dists[i, j])

        dists_in_clusters = np.array(dists_in_clusters)
        dists_out_clusters = np.array(dists_out_clusters)

        dists_in_clusters = np.random.choice(dists_in_clusters, 200, replace=False)
        dists_out_clusters = np.random.choice(dists_out_clusters, 200, replace=False)
        corrs.extend(dists_in_clusters)
        corrs.extend(dists_out_clusters)
        models.extend([model] * len(dists_out_clusters) * 2)
        relations.extend(['Intra-cluster pairs'] * len(dists_out_clusters))
        relations.extend(['Inter-cluster pairs'] * len(dists_out_clusters))

        print(model, len(np.unique(clusters)))
        print(ranksums(dists_in_clusters, dists_out_clusters), np.mean(dists_in_clusters), np.mean(dists_out_clusters))

    df = {'Cosine similarity': corrs, 'model': models, 'relation': relations}
    df = pd.DataFrame(df)
    sns.violinplot(x='relation', y='Cosine similarity', hue='model', data=df, split=True, inner='stick')

    plt.xlabel('')
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'clusters_quality', 'clusters_quality.pdf'))
Example #10
def simulated_data_analysis(dataset, trained_models_dir=def_trained_models_dir):
    if 'simulated' not in dataset:
        raise ValueError('dataset is not synthetic')

    data, _ = get_data(dataset)
    num_data_points = data.sum()

    dataset_dir = os.path.join(trained_models_dir, dataset)
    dataset_params = dataset[10:]
    original_model = get_model(load_json(os.path.join(ROOT_DIR, 'data/simulated-data/{}/model.json'.format(dataset_params))))
    original_num_clusters, original_num_sigs = original_model.num_clusters, original_model.num_topics

    original_model_ll = original_model.log_likelihood(data)
    original_num_params = (original_num_clusters - 1) + (original_num_sigs - 1) * original_num_clusters + (96 - 1) * original_num_sigs
    original_bic = np.log(num_data_points) * original_num_params - 2 * original_model_ll
    # Plot BIC
    scores_dict = get_best_model(os.path.join(dataset_dir, 'denovo'), return_params=True)
    BIC_scores = scores_dict['BIC_scores']
    num_clusters = scores_dict['num_clusters']
    num_signatures = scores_dict['num_signatures']
    # print(dataset, signature_learning, model_paths[np.argmin(BIC_scores)])
    unique_clusters = np.unique(num_clusters)
    unique_signaturs = np.unique(num_signatures)
    from mpl_toolkits.mplot3d import axes3d

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    for c in unique_clusters:
        tmp = num_clusters == c
        curr_sigs = num_signatures[tmp]
        curr_clusters = num_clusters[tmp]
        curr_BIC_scores = BIC_scores[tmp]
        arg_sort_curr_sigs = np.argsort(curr_sigs)
        curr_clusters = np.array([curr_clusters[arg_sort_curr_sigs]])
        curr_sigs = np.array([curr_sigs[arg_sort_curr_sigs]])
        curr_BIC_scores = np.array([curr_BIC_scores[arg_sort_curr_sigs]])
        ax.plot_wireframe(curr_clusters, curr_sigs, curr_BIC_scores, rstride=1, cstride=1)
    for s in unique_signaturs:
        tmp = num_signatures == s
        curr_sigs = num_signatures[tmp]
        curr_clusters = num_clusters[tmp]
        curr_BIC_scores = BIC_scores[tmp]
        arg_sort_curr_clusters = np.argsort(curr_clusters)
        curr_clusters = np.array([curr_clusters[arg_sort_curr_clusters]])
        curr_sigs = np.array([curr_sigs[arg_sort_curr_clusters]])
        curr_BIC_scores = np.array([curr_BIC_scores[arg_sort_curr_clusters]])
        ax.plot_wireframe(curr_clusters, curr_sigs, curr_BIC_scores, rstride=1, cstride=1)

    ax.set_xlabel('clusters')
    ax.set_ylabel('signatures')
    ax.set_zlabel('BIC score')
    plt.xticks(unique_clusters)
    plt.yticks(unique_signaturs)

    # if plot_title:
    plt.title(dataset)
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'synthetic', dataset, 'BIC.pdf'))
    # plt.show()

    ### Test sig/cluster/weight correlations
    results = [['Model', 'Average clusters similarity', '# Unique clusters', 'Average signatures similarity', '# Unique signatures']]
    best_model = get_best_model(os.path.join(dataset_dir, 'denovo'), return_model=True)
    best_num_clusters, best_num_sigs = best_model.num_clusters, best_model.num_topics

    original_sigs, original_clusters, original_weights = original_model.e.copy(), original_model.pi.copy(), original_model.w.copy()
    best_model_sigs, best_model_clusters, best_model_weights = best_model.e, best_model.pi, best_model.w
    sig, sig_corr = get_signatures_correlations(best_model_sigs, original_sigs)
    # print(sig, sig_corr)

    # rearrange clusters
    reagranged_clusters = original_clusters[:, sig]
    reagranged_clusters /= reagranged_clusters.sum(1, keepdims=True)
    cluster, cluster_corr = get_signatures_correlations(best_model_clusters, reagranged_clusters)
    # print(cluster, cluster_corr)

    # rearrange weights
    reagranged_weights = original_weights[cluster]
    reagranged_weights /= reagranged_weights.sum()
    weight_corr = cosine_similarity(best_model_weights, reagranged_weights)[0, 1]
    # print(weight_corr)

    best_model_ll = best_model.log_likelihood(data)
    best_num_params = (best_num_clusters - 1) + (best_num_sigs - 1) * best_num_clusters + (96 - 1) * best_num_sigs
    best_bic = np.log(num_data_points) * best_num_params - 2 * best_model_ll
    # print(best_bic)

    # print(best_num_clusters, best_num_sigs, best_bic, best_model_ll, weight_corr, np.min(cluster_corr), np.max(cluster_corr), np.min(sig_corr), np.max(sig_corr), len(np.unique(cluster[cluster_corr > 0.8])), len(np.unique(sig[sig_corr > 0.8])))
    results.append(['Mix ({}, {})'.format(best_num_clusters, best_num_sigs), str(np.mean(cluster_corr)), str(len(np.unique(cluster[cluster_corr > 0.8]))), str(np.mean(sig_corr)), str(len(np.unique(sig[sig_corr > 0.8])))])

    ### Test the same with the best model with the same parameters
    same_params_model = get_best_run(os.path.join(dataset_dir, 'denovo', 'mix_{}clusters_{}signatures'.format(str(original_num_clusters).zfill(3), str(original_num_sigs).zfill(3))))
    same_params_model = get_model(load_json(same_params_model)['parameters'])

    original_sigs, original_clusters, original_weights = original_model.e.copy(), original_model.pi.copy(), original_model.w.copy()
    same_params_model_sigs, same_params_model_clusters, same_params_model_weights = same_params_model.e, same_params_model.pi, same_params_model.w
    sig, sig_corr = get_signatures_correlations(same_params_model_sigs, original_sigs)
    # print(sig, sig_corr)

    # rearrange clusters
    reagranged_clusters = original_clusters[:, sig]
    reagranged_clusters /= reagranged_clusters.sum(1, keepdims=True)
    cluster, cluster_corr = get_signatures_correlations(same_params_model_clusters, reagranged_clusters)
    # print(cluster, cluster_corr)

    # rearrange weights
    reagranged_weights = original_weights[cluster]
    reagranged_weights /= reagranged_weights.sum()
    weight_corr = cosine_similarity(same_params_model_weights, reagranged_weights)[0, 1]
    # print(weight_corr)

    same_params_ll = same_params_model.log_likelihood(data)
    same_params_bic = np.log(num_data_points) * original_num_params - 2 * same_params_ll
    # print(best_bic, same_params_bic, original_bic)
    # print(original_num_clusters, original_num_sigs, same_params_bic, same_params_ll, weight_corr, np.min(cluster_corr), np.max(cluster_corr), np.min(sig_corr), np.max(sig_corr), len(np.unique(cluster[cluster_corr > 0.8])), len(np.unique(sig[sig_corr > 0.8])))
    # print('-', '-', original_bic, original_model_ll, '-', '-', '-', '-', '-', '-', '-')
    results.append(['Mix ({}, {})'.format(original_num_clusters, original_num_sigs), str(np.mean(cluster_corr)), str(len(np.unique(cluster[cluster_corr > 0.8]))), str(np.mean(sig_corr)), str(len(np.unique(sig[sig_corr > 0.8])))])
    np.savetxt(os.path.join(ROOT_DIR, 'results', 'synthetic', dataset, 'summary.tsv'), results, fmt='%s', delimiter='\t', header='{} clusters | {} signatures'.format(original_num_clusters, original_num_sigs))
Example #11
def plot_cluster_AMI(range_clusters, computation='AMI'):
    if computation == 'AMI':
        score_func = AMI_score
    elif computation == 'MI':
        score_func = MI_score
    elif computation == 'jaccard':
        score_func = Jaccard_score
    else:
        raise ValueError('{} is not a valid computation'.format(computation))

    rich_sample_threshold = 10
    data, active_signatures = get_data('MSK-ALL')
    signatures = get_cosmic_signatures()[active_signatures]
    num_data_points = data.sum()

    nnls_exposures = np.zeros((len(data), len(signatures)))
    for i in range(len(data)):
        nnls_exposures[i] = nnls(signatures.T, data[i])[0]

    num_mutations_per_sample = data.sum(1)
    rich_samples = num_mutations_per_sample >= rich_sample_threshold

    all_df = pd.read_csv(os.path.join(ROOT_DIR, 'data/MSK-processed/oncotype_counts.txt'), sep='\t')
    all_df['Counts'] = all_df['Counts'].astype(int)
    all_df = all_df[all_df['Counts'] > 100]
    cancer_types = np.array(all_df['Oncotree'])

    sample_cancer_assignments = []
    sample_cancer_id_assignments = []
    for i, oc in enumerate(cancer_types):

        # dat_f = "data/processed/%s_counts.npy" % oc
        dat_f = os.path.join(ROOT_DIR, 'data/MSK-processed/{}_counts.npy'.format(oc))
        tmp_data = np.array(np.load(dat_f, allow_pickle=True), dtype=np.float64)
        sample_cancer_assignments.extend([oc] * len(tmp_data))
        sample_cancer_id_assignments.extend([i] * len(tmp_data))
    sample_cancer_assignments = np.array(sample_cancer_assignments)
    sample_cancer_id_assignments = np.array(sample_cancer_id_assignments)
    shuffled_indices = np.arange(len(sample_cancer_assignments))

    # Finding best_models
    d = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/denovo')
    BIC_summary = get_best_model(d, return_params=True)
    BIC_scores, BIC_clusters, BIC_paths = BIC_summary['BIC_scores'], BIC_summary['num_clusters'], BIC_summary['model_paths']

    MIX_scores = np.zeros((2, len(range_clusters)))
    MIX_soft_scores = np.zeros((2, len(range_clusters)))
    MIX_refit_scores = np.zeros((2, len(range_clusters)))
    MIX_soft_refit_scores = np.zeros((2, len(range_clusters)))
    KMeans_scores = np.zeros((2, len(range_clusters)))
    NNLS_KMeans_scores = np.zeros((2, len(range_clusters)))
    for idx, num_clusters in enumerate(range_clusters):
        best_model_path = BIC_paths[BIC_clusters == num_clusters][np.argmin(BIC_scores[BIC_clusters == num_clusters])]

        model = get_model(load_json(best_model_path)['parameters'])
        MIX_soft_clustering = model.soft_cluster(data)
        sample_cluster_assignment_MIX = np.argmax(MIX_soft_clustering, 1)
        MIX_scores[0, idx] = score_func(sample_cancer_id_assignments, sample_cluster_assignment_MIX)
        MIX_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                            sample_cluster_assignment_MIX[rich_samples])
        if computation == 'MI':
            MIX_soft_scores[0, idx] = MI_score_soft_clustering(sample_cancer_id_assignments, MIX_soft_clustering)
            MIX_soft_scores[1, idx] = MI_score_soft_clustering(sample_cancer_id_assignments[rich_samples],
                                                                   MIX_soft_clustering[rich_samples])

        # MIX refit
        d = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/refit/mix_{}clusters_017signatures'.format(str(num_clusters).zfill(3)))
        model = get_model(load_json(get_best_run(d))['parameters'])
        MIX_refit_soft_clustering = model.soft_cluster(data)
        sample_cluster_assignment_MIX_refit = np.argmax(MIX_refit_soft_clustering, 1)
        MIX_refit_scores[0, idx] = score_func(sample_cancer_id_assignments, sample_cluster_assignment_MIX_refit)
        MIX_refit_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                                  sample_cluster_assignment_MIX_refit[rich_samples])
        if computation == 'MI':
            MIX_soft_refit_scores[0, idx] = MI_score_soft_clustering(sample_cancer_id_assignments, MIX_refit_soft_clustering)
            MIX_soft_refit_scores[1, idx] = MI_score_soft_clustering(sample_cancer_id_assignments[rich_samples],
                                                                         MIX_refit_soft_clustering[rich_samples])

        # KMeans clustering
        cluster_model = KMeans(num_clusters, n_init=100, random_state=140296)
        np.random.shuffle(shuffled_indices)
        shuffled_data = data[shuffled_indices]
        cluster_model.fit(shuffled_data)
        kmeans_clusters = cluster_model.predict(data)
        KMeans_scores[0, idx] = score_func(sample_cancer_id_assignments, kmeans_clusters)
        KMeans_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                               kmeans_clusters[rich_samples])

        # NNLS + KMeans clustering
        cluster_model = KMeans(num_clusters, n_init=100, random_state=140296)
        np.random.shuffle(shuffled_indices)
        shuffled_data = nnls_exposures[shuffled_indices]
        cluster_model.fit(shuffled_data)
        nnls_kmeans_clusters = cluster_model.predict(nnls_exposures)
        NNLS_KMeans_scores[0, idx] = score_func(sample_cancer_id_assignments, nnls_kmeans_clusters)
        NNLS_KMeans_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                                    nnls_kmeans_clusters[rich_samples])

        print('finished {}'.format(num_clusters))

    plt.plot(range_clusters, MIX_scores[0], label='MIX-denovo')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_scores[0], label='MIX-denovo-soft')
    plt.plot(range_clusters, MIX_refit_scores[0], label='MIX-refit')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_refit_scores[0], label='MIX-refit-soft')
    plt.plot(range_clusters, KMeans_scores[0], label='KMeans')
    plt.plot(range_clusters, NNLS_KMeans_scores[0], label='NNLS+KMeans')
    plt.title('All samples {} score'.format(computation))
    plt.xlabel('clusters')
    plt.ylabel(computation)
    plt.legend(loc='lower right')
    plt.xticks(np.arange(min(range_clusters), max(range_clusters) + 1, 2))
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'AMI', 'cluster_score_all.pdf'))
    # plt.show()
    plt.figure()  # start a fresh figure so the filtered curves do not overlay the plot above

    plt.plot(range_clusters, MIX_scores[1], label='MIX-denovo')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_scores[1], label='MIX-denovo-soft')
    plt.plot(range_clusters, MIX_refit_scores[1], label='MIX-refit')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_refit_scores[1], label='MIX-refit-soft')
    plt.plot(range_clusters, KMeans_scores[1], label='KMeans')
    plt.plot(range_clusters, NNLS_KMeans_scores[1], label='NNLS+KMeans')
    plt.title('Filtered {} score'.format(computation))
    plt.xlabel('clusters')
    plt.ylabel(computation)
    plt.legend(loc='lower right')
    plt.xticks(np.arange(min(range_clusters), max(range_clusters) + 1, 2))
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'AMI', 'cluster_score_filtered.pdf'))
    # plt.show()
    return
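AMI_score is presumably a thin wrapper around scikit-learn's adjusted mutual information; a minimal version consistent with how it is called above (an assumption, since the helper itself is not shown):

from sklearn.metrics import adjusted_mutual_info_score

def AMI_score(true_labels, predicted_clusters):
    # chance-adjusted, symmetric mutual information between the two labelings
    return adjusted_mutual_info_score(true_labels, predicted_clusters)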
Example #12
def simulate(num_clusters, num_signatures, num_samples, random_seed):
    np.random.seed(random_seed)
    base_model = get_model(
        load_json(
            os.path.join(ROOT_DIR, 'data', 'simulated-data',
                         'base_model.json'))['parameters'])
    if num_clusters > base_model.num_clusters:
        raise ValueError(
            'num_clusters cannot be larger than base_model.num_clusters ({})'.
            format(base_model.num_clusters))
    if num_signatures > base_model.num_topics:
        raise ValueError(
            'num_signatures cannot be larger than base_model.num_topics ({})'.
            format(base_model.num_topics))

    msk_data, _ = get_data('MSK-ALL')
    msk_sizes = np.sum(msk_data, 1).astype('int')

    clusters = np.random.choice(base_model.num_clusters,
                                size=num_clusters,
                                replace=False,
                                p=base_model.w)
    pi = base_model.pi[clusters]
    w = base_model.w[clusters]
    w /= w.sum()
    prob_sig = np.dot(w, pi)
    signatures = np.random.choice(base_model.num_topics,
                                  size=num_signatures,
                                  replace=False,
                                  p=prob_sig)

    pi = pi[:, signatures]
    pi /= pi.sum(1, keepdims=True)
    e = base_model.e[signatures]
    model = Mix(num_clusters,
                num_signatures,
                init_params={
                    'w': w,
                    'pi': pi,
                    'e': e
                })
    sample_sizes = np.random.choice(msk_sizes, num_samples)
    clusters, signatures, mutations = model.sample(sample_sizes)

    curr_dir = os.path.join(
        ROOT_DIR, 'data', 'simulated-data',
        '{}_{}_{}_{}'.format(num_clusters, num_signatures, num_samples,
                             random_seed))
    try:
        os.makedirs(curr_dir)
    except OSError:
        pass

    # Save model, base data
    save_json(os.path.join(curr_dir, 'full_simulated'), {
        'clusters': clusters,
        'signatures': signatures,
        'mutations': mutations
    })
    parameters = model.get_params()

    parameters['w'] = parameters['w'].tolist()
    parameters['pi'] = parameters['pi'].tolist()
    parameters['e'] = parameters['e'].tolist()

    save_json(os.path.join(curr_dir, 'model'), parameters)

    # Transform the basic data into mutation matrix
    mutation_mat = np.zeros((num_samples, 96), dtype='int')
    for i in range(num_samples):
        a, b = np.unique(mutations[i], return_counts=True)
        mutation_mat[i, a] = b

    np.save(os.path.join(curr_dir, 'mutations'), mutation_mat)
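The final loop builds the 96-bin count matrix via np.unique; assuming mutations[i] holds integer category indices in [0, 96), np.bincount produces the same row more directly:

for i in range(num_samples):
    mutation_mat[i] = np.bincount(mutations[i], minlength=96)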