Example #1
def train_from_scratch(config, state, channel):    
    # Model options
    save_model_dir = config[config.model].save_model_dir
    if save_model_dir == 'current':
        config[config.model].save_model_dir = './'
        save_model_dir = './'
        # to facilitate the use of cluster for multiple jobs
        save_path = './model_config.pkl'
    else:
        # run locally, save locally
        save_path = save_model_dir + 'model_config.pkl'
    print('current save dir', save_model_dir)
    utils.create_dir_if_not_exist(save_model_dir)

    reload_ = config[config.model].reload_
    if reload_:
        print('preparing reload')
        save_dir_backup = config[config.model].save_model_dir
        from_dir_backup = config[config.model].from_dir
        # never start retrain in the same folder
        assert save_dir_backup != from_dir_backup
        print('save dir', save_dir_backup)
        print('from_dir', from_dir_backup)
        print('setting current model config with the old one')
        model_config_old = utils.load_pkl(from_dir_backup + '/model_config.pkl')
        set_config(config, model_config_old)
        config[config.model].save_model_dir = save_dir_backup
        config[config.model].from_dir = from_dir_backup
        config[config.model].reload_ = True
    if config.erase_history:
        print('erasing everything in', save_model_dir)
        os.system('rm %s/*' % save_model_dir)
    # for stdout file logging
    #sys.stdout = Unbuffered(sys.stdout, state.save_model_path + 'stdout.log')
    print('saving model config into %s' % save_path)
    utils.dump_pkl(config, save_path)
    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])
    model_type = config.model
    print('Model Type: %s' % model_type)
    print('Dataset: %s' % config[config.model].dataset)
    print('Command: %s' % ' '.join(sys.argv))
    if config.model == 'attention':
        model_deepRNN.train_from_scratch(state, channel)
    else:
        raise NotImplementedError()
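
A note on the shared helper: every example on this page calls a create_dir_if_not_exist utility that the snippets themselves never define. A minimal sketch of what such a helper might look like; the exact implementation varies between the repositories these examples come from (some pass a single path, some a list, compare Examples #1 and #9):

import os

def create_dir_if_not_exist(paths):
    # Accept either a single path or a list of paths, since the
    # examples call the helper both ways.
    if isinstance(paths, str):
        paths = [paths]
    for path in paths:
        # exist_ok avoids a race between checking and creating
        os.makedirs(path, exist_ok=True)
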
Example #2
def train_util(params):
    save_dir = params['save_dir']
    print('current save dir : ' + save_dir)
    utils.create_dir_if_not_exist(save_dir)

    reload_model = params['reload_model']
    if reload_model:
        print('preparing reload')
        save_dir_backup = params['save_dir']
        from_dir_backup = params['from_dir']
        # never start retrain in the same folder
        assert save_dir_backup != from_dir_backup
        print('save dir', save_dir_backup)
        print('from_dir', from_dir_backup)
        print('setting current model config with the old one')
        model_config_old = utils.read_from_json(from_dir_backup +
                                                'model_config.json')
        model_config_old['reload_model'] = True
        model_config_old['save_dir'] = params['save_dir']
        model_config_old['from_dir'] = params['from_dir']
        model_config_old['max_epochs'] = params['max_epochs']
        model_config_old['dispFreq'] = params['dispFreq']
        model_config_old['sampleFreq'] = params['sampleFreq']
        model_config_old['validFreq'] = params['validFreq']
        model_config_old['debug'] = params['debug']
        params = model_config_old
        feats_dir = params['feats_dir']
    elif params['cnn_name'] != "MURALI":
        feats_dir = params['feats_dir'] + params['cnn_name'] + "_kmeans3/"
    else:
        feats_dir = params['feats_dir']
    print('feats dir : ' + feats_dir)
    params['feats_dir'] = feats_dir

    config_save_path = save_dir + "model_config.json"
    print('saving model config into %s' % config_save_path)
    utils.write_to_json(params, config_save_path)

    t0 = time.time()
    print('training an attention model')
    train(params, **params)
    print('training time in total %.4f sec' % (time.time() - t0))
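
Example #2 round-trips its configuration through utils.read_from_json and utils.write_to_json, which are not shown. A plausible minimal version, assuming they are thin wrappers around the standard json module:

import json

def read_from_json(path):
    # Load a configuration dict from a JSON file.
    with open(path, 'r') as f:
        return json.load(f)

def write_to_json(data, path):
    # Persist a configuration dict, indented for readability.
    with open(path, 'w') as f:
        json.dump(data, f, indent=2)
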
Example #3
def frames_to_feat(cnn, vid_ids_path, num_vids):
    if cnn == "ResNet50":
        model, height, width, preprocess_input = get_ResNet50_model()
        FEAT_DIM = config.RESNET_FEAT_DIM
    elif cnn == "ResNet152":
        model, height, width, preprocess_input = get_ResNet152_model()
        FEAT_DIM = config.RESNET_FEAT_DIM
    elif cnn == "InceptionV3":
        model, height, width, preprocess_input = get_InceptionV3_model()
        FEAT_DIM = config.INCEPTION_FEAT_DIM
    elif cnn == "VGG19":
        model, height, width, preprocess_input = get_VGG19_model()
        FEAT_DIM = config.VGG_FEAT_DIM
    else:
        raise NotImplementedError()

    feat_save_path = config.MSVD_FEATS_DIR + cnn + "/"
    print "saving feats to :", feat_save_path
    utils.create_dir_if_not_exist(feat_save_path)

    vid_ids = utils.read_file_to_list(vid_ids_path)
    vid_clips_list = [vid[:-4] for vid in vid_ids]
    assert len(vid_ids) == num_vids

    for vid in vid_clips_list:
        print("extracting features from : " + vid)
        vid_frames_dir = config.MSVD_FRAMES_DIR + "/" + vid
        frames_list = utils.read_dir(vid_frames_dir)
        n_frames = len(frames_list)
        if n_frames > config.MAX_FRAMES:
            n_frames = config.MAX_FRAMES
        selected_frames = extract_frames_equally_spaced(
            n_frames, config.FRAME_SPACING)
        vid_feats = np.empty((0, FEAT_DIM), dtype=np.float32)
        for fid in selected_frames:
            img_path = vid_frames_dir + "/frame" + str(fid) + ".jpg"
            # print("extracting features from : "+img_path)
            img_feat = img_to_feat(img_path, height, width, preprocess_input,
                                   model)
            vid_feats = np.vstack((vid_feats, img_feat))
        print(vid_feats.shape)
        np.save(feat_save_path + vid + ".npy", vid_feats)
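
Example #3 relies on an extract_frames_equally_spaced helper whose body is not shown. One plausible reading of the call extract_frames_equally_spaced(n_frames, config.FRAME_SPACING) is that it returns evenly spread frame indices, keeping roughly one frame every FRAME_SPACING frames; a sketch under that assumption:

import numpy as np

def extract_frames_equally_spaced(n_frames, spacing):
    # Keep roughly one frame every `spacing` frames,
    # spread evenly over [0, n_frames - 1].
    n_selected = max(1, n_frames // spacing)
    return [int(i) for i in np.linspace(0, n_frames - 1, n_selected)]
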
Example #4
def feats_kmeans(cnn, vid_ids_path, num_vids, org_dim, k):

    feat_save_path = config.MSVD_FEATS_DIR + cnn + "_kmeans" + str(k) + "/"
    print "saving feats to :", feat_save_path
    utils.create_dir_if_not_exist(feat_save_path)

    vid_ids = utils.read_file_to_list(vid_ids_path)
    vid_clips_list = [vid[:-4] for vid in vid_ids]
    assert len(vid_ids) == num_vids

    for vid in vid_clips_list:
        # print("loading features from : "+vid)
        vid_feats_path = config.MSVD_FEATS_DIR + cnn + "/" + vid + ".npy"
        vid_feats = np.load(vid_feats_path)
        # print(vid_feats.shape)
        kmeans = KMeans(n_clusters=k, init='k-means++',
                        random_state=0).fit(vid_feats)
        vid_feat_kmeans = kmeans.cluster_centers_
        # print(vid_feat_kmeans.shape)
        np.save(feat_save_path + vid + ".npy", vid_feat_kmeans)
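
The clustering step in Example #4 compresses a variable number of per-frame feature vectors into a fixed set of k centroids, so every video ends up with a (k, FEAT_DIM) descriptor regardless of its length. A toy demonstration of that shape reduction (the 2048-dimensional random features stand in for something like ResNet pooling output):

import numpy as np
from sklearn.cluster import KMeans

frame_feats = np.random.rand(60, 2048).astype(np.float32)  # 60 fake frame features
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=0).fit(frame_feats)
print(kmeans.cluster_centers_.shape)  # (3, 2048): one row per centroid
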
Example #5
def save_feats(whichdata):
    feat_save_path = config.MURALI_MSVD_FEATS_DIR
    print "saving feats to :", feat_save_path
    utils.create_dir_if_not_exist(feat_save_path)
    if whichdata == "train":
        encoded_feats_path = config.MURALI_MSVD_ENCODED_FEATS_TRAIN
        dictsize = config.MURALI_TRAIN_VIDS
    elif whichdata == "test":
        encoded_feats_path = config.MURALI_MSVD_ENCODED_FEATS_TEST
        dictsize = config.MURALI_TEST_VIDS
    else:
        raise NotImplementedError()
    encoded_video = np.loadtxt(encoded_feats_path, delimiter=',')
    print(encoded_video.shape)
    num, dim = encoded_video.shape
    assert num == dictsize
    for vid_id in range(num):
        vid_feats = encoded_video[vid_id].reshape(32, 1024)
        # print(vid_feats.shape)
        np.save(feat_save_path + whichdata + "_" + str(vid_id) + ".npy",
                vid_feats)
Example #6
def transform_lfw(input_dir_path, output_dir_path, size, mode, split=True, ratio=0.9):
    """
    :param ratio: how much of train set we use if split is True
    :param split: should we split into train set and test set
    :param input_dir_path: input path to lfw images (in default lfw format)
    :param output_dir_path: output dir for processed images
    :param size: should be tuple (x, y) opencv style
    :param mode: grayscale supported, for others nothing happens
    :return:
    """
    images_paths = []
    create_dir_if_not_exist(output_dir_path)

    for dir_name in os.listdir(input_dir_path):
        dir_path = os.path.join(input_dir_path, dir_name)
        if os.path.isdir(dir_path):
            images_names = [img_name for img_name in os.listdir(dir_path) if img_name.endswith('.jpg')]
            for img_name in images_names:
                img_path = os.path.join(dir_path, img_name)
                images_paths.append(img_path)

    shuffle(images_paths)

    if split:
        train_dir = os.path.join(output_dir_path, 'train')
        test_dir = os.path.join(output_dir_path, 'test')
        create_dir_if_not_exist(train_dir)
        create_dir_if_not_exist(test_dir)
        train_end = int(len(images_paths) * ratio)
        train_images_paths = images_paths[:train_end]
        test_images_paths = images_paths[train_end:]
        transform(train_images_paths, size, train_dir, mode)
        transform(test_images_paths, size, test_dir, mode)
    else:
        transform(images_paths, size, output_dir_path, mode)
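
transform_lfw delegates the actual image processing to a transform helper that is not part of the snippet. A minimal sketch consistent with the docstring (OpenCV-style resize, grayscale as the only supported mode), assuming the output keeps each image's base name:

import os
import cv2

def transform(images_paths, size, output_dir, mode):
    # Resize each image, optionally convert to grayscale, and write
    # it to output_dir under its original file name.
    for img_path in images_paths:
        img = cv2.imread(img_path)
        img = cv2.resize(img, size)  # size is an (x, y) tuple, OpenCV style
        if mode == 'grayscale':
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        cv2.imwrite(os.path.join(output_dir, os.path.basename(img_path)), img)
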
Example #7
def train_from_scratch(config, state, channel):
    model_type = config.model
    # set up automatically some fields in config
    if config.dataset.signature == 'MNIST_binary_russ':
        config[model_type].n_in = 784
        config[model_type].n_out = 784

    # manipulate the state
    # save the config file
    save_model_path = config.save_model_path

    if save_model_path == 'current':
        config.save_model_path = './'
        # to facilitate the use of cluster for multiple jobs
        save_path = './model_config.pkl'
    else:
        # run locally, save locally
        save_path = save_model_path + 'model_config.pkl'

    utils.create_dir_if_not_exist(config.save_model_path)
    # for stdout file logging
    #sys.stdout = Unbuffered(sys.stdout, state.save_model_path + 'stdout.log')
    print('saving model config into %s' % save_path)
    utils.dump_pkl(config, save_path)

    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])

    print('Model Type: %s' % model_type)
    print('Host:    %s' % socket.gethostname())
    print('Command: %s' % ' '.join(sys.argv))

    print('initializing data engine')
    input_dtype = 'float32'
    target_dtype = 'int32'
    data_engine = None
    deep_orderless_bernoulli_nade.train_from_scratch(state, data_engine,
                                                     channel)
Example #8
def train_from_scratch(config, state, channel):
    model_type = config.model
    # set up automatically some fields in config
    if config.dataset.signature == 'MNIST_binary_russ':
        config[model_type].n_in = 784
        config[model_type].n_out = 784
        
    # manipulate the state
    # save the config file
    save_model_path = config.save_model_path

    if save_model_path == 'current':
        config.save_model_path = './'
        # to facilitate the use of cluster for multiple jobs
        save_path = './model_config.pkl'
    else:
        # run locally, save locally
        save_path = save_model_path + 'model_config.pkl'

    utils.create_dir_if_not_exist(config.save_model_path)
    # for stdout file logging
    #sys.stdout = Unbuffered(sys.stdout, state.save_model_path + 'stdout.log')
    print('saving model config into %s' % save_path)
    utils.dump_pkl(config, save_path)

    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])
    
    print('Model Type: %s' % model_type)
    print('Host:    %s' % socket.gethostname())
    print('Command: %s' % ' '.join(sys.argv))

    print('initializing data engine')
    input_dtype = 'float32'
    target_dtype = 'int32'
    data_engine = None
    deep_orderless_bernoulli_nade.train_from_scratch(state, data_engine, channel)
Example #9
def random_comparison(arguments, rep_num=0):  # performs the random comparisons
    matrices_extractors, genes, gene_list = arguments
    matrix_01_pair = []
    for matrices_extractor in matrices_extractors:
        bed_file_name = matrices_extractor["bed_file_name"]
        me = matrices_extractor["me"]
        # extract the matrices
        # print("random comparison num: " + str(rep_num) + " extract 01 and coverage from: " + str(bed_file_name))
        # start = time.time()
        matrix_01 = me.extract_matrices(areReadsRandomized=True,
                                        add_small_random_value=add_small_random_value_to_random_comparison,
                                        rep_num=rep_num)
        matrix_01_pair.append({'matrix': matrix_01, 'file_name': bed_file_name})
        # print ("end in " + str(time.time() - start) + " sec(s)")
        if save_matrix_01:
            create_dir_if_not_exist([random_comparisons_folder_matrix_01])
            save_dir = os.path.join(random_comparisons_folder_matrix_01, bed_file_name)
            create_dir_if_not_exist([save_dir])
            matrix_01.to_csv(os.path.join(save_dir, str(rep_num) + ".csv"), index=True, header=True, decimal='.', sep=',', float_format='%.6f')
    # print(" start compare pairs ")
    # start = time.time()
    match_score, pair_names = compare_pair(matrix_01_pair, genes.set_index('GeneID'), gene_list)  # compare pairs of bed files (of 0-1 matrices)

    pair_names = Path(pair_names[0]).stem + ":" + Path(pair_names[1]).stem
    # print("end_comparison in " + str(time.time() - start) + "seconds")
    return {'pair_name': pair_names, 'match_score': match_score}
Example #10
def compare_pair_n_times_serial(bed_files_pair, genes, gene_list, n):  # performs n comparisons for each pair of bed files
    # extract a pair of bed files
    match_scores = []

    matrices_extractors = []
    for bed_files_dict in bed_files_pair:
        bed_file = bed_files_dict["bed_file"]
        bed_file_name = bed_files_dict["bed_file_name"]
        me = MatricesExtractor(bed_file, genes, bed_file_name)
        matrices_extractors.append({"me": me, "bed_file_name": bed_file_name})

    arguments = matrices_extractors, genes, gene_list

    for i in range(n):
        match_scores.append(random_comparison(arguments, i))  # perform n random comparisons

    if save_random_match_scores:
        create_dir_if_not_exist([random_comparisons_folder_match_scores])
        for i in range(len(match_scores)):
            save_dir = os.path.join(random_comparisons_folder_match_scores, match_scores[i]["pair_name"])
            create_dir_if_not_exist([save_dir])
            match_scores[i]["match_score"].to_csv(os.path.join(save_dir, str(i) + ".csv"), index=True, header=True, decimal='.', sep=',', float_format='%.6f')

    return match_scores
Example #11
def compare_pair_n_times(bed_files_pair, genes, gene_list, n):
    # extract a pair of bed files
    match_scores = []

    matrices_extractors = []
    for bed_files_dict in bed_files_pair:  # FIXME: the extractor classes are created upstream; verify correctness
        bed_file = bed_files_dict["bed_file"]
        bed_file_name = bed_files_dict["bed_file_name"]
        me = MatricesExtractor(bed_file, genes)
        matrices_extractors.append({"me": me, "bed_file_name": bed_file_name})

    arguments = matrices_extractors, genes, gene_list
    pool = multiprocessing.Pool(processes=num_task)
    res = []
    for i in range(n):
        res.append(pool.apply_async(random_comparison, [arguments]))
    pool.close()
    pool.join()
    for i in res:
        match_scores.append(i.get())

    if save_random_match_scores:
        create_dir_if_not_exist([random_comparisons_folder_match_scores])
        for i in range(len(match_scores)):
            save_dir = os.path.join(random_comparisons_folder_match_scores,
                                    match_scores[i]["pair_name"])
            create_dir_if_not_exist([save_dir])
            match_scores[i]["match_score"].to_csv(os.path.join(
                save_dir,
                str(i) + ".csv"),
                                                  index=True,
                                                  header=True,
                                                  decimal='.',
                                                  sep=',',
                                                  float_format='%.6f')
    return match_scores
Example #12
    input_data = {
        'raw_spectrograms': np.array(raw_spectrograms),
        'std_spectrograms': np.array(std_spectrograms),
        'file_name': np.array(mat_file_names),
        'segment': pdata['segment']
    }

    if pdata['dtype'] == 'train':
        input_data['target'] = pdata['target']

    np.save(os.path.join(output_dir, 'data.npy'), input_data)


def collect_and_write_data(p_idx):
    pid = int(p_idx / 2)

    is_test = p_idx % 2 != 0

    # write training data
    pdata = utils.load_data(pid, is_test=is_test)

    write_patient_data(pdata)

    return 1


if __name__ == "__main__":
    create_dir_if_not_exist(utils.output_directory)
    result = futures.map(collect_and_write_data, range(6))
    print(list(result))
Example #13
                            in_features=n_features,
                            out_features=1,
                            regression=True)
    net = net.to(device)
    optimizer = optim.Adam(net.parameters(), lr=opt.lr, betas=(0.9, 0.999))
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)

    now = datetime.datetime.now()
    folder_name = os.path.join('modelnet_regression_log', opt.model,
                               'width' + str(width))

    start_run_time = str(now.day) + str(now.month) + str(now.hour) + str(
        now.minute)
    filename = 'optimize=' + str(optimizer)[:3] + '.csv'

    utils.create_dir_if_not_exist(os.path.join(opt.outf, folder_name))

    logpath = os.path.join(opt.outf, folder_name, filename)
    cols = ['epoch', 'train_loss', 'test_loss']
    logger = utils.Logger(logpath, cols)

    loss_func = nn.MSELoss()
    nepoch = 50

    for epoch in range(nepoch):
        epoch_loss = 0.0
        # scheduler.step()
        for step, data in enumerate(dataloader):  # for each training step
            if opt.model == "GraphNet":
                batch_x, A, batch_y = data
                A = A.float()
Example #14
def calc_reproducible_sequences(match_scores_list, gene_list, pair_names_list, match_scores_real, matrix_01_list):
    # compute the match score histograms for the random comparisons
    match_scores_hist = {}
    for fake_match_scores in match_scores_list:
        for fake_match_score in fake_match_scores:
            # fake_match_score contains the scores of one pair
            pair_name = fake_match_score['pair_name']
            match_scores_fake = fake_match_score['match_score']
            gene_hist = {}
            for gene, match_score in match_scores_fake.items():
                gene_hist[gene] = [match_score]

            if pair_name in match_scores_hist:
                for gene, match_score in match_scores_fake.items():
                    match_scores_hist[pair_name][gene].append(match_score)  # = [match_score]
            else:
                match_scores_hist[pair_name] = gene_hist

    p_value_matrix = pd.DataFrame(index=gene_list, columns=pair_names_list)

    plot_num = 0

    # extract pvalues for each gene and dataset pair
    for pair_name in match_scores_hist:
        for gene in match_scores_hist[pair_name]:
            gene_hist = pd.Series(match_scores_hist[pair_name][gene])
            hist_mean = np.mean(gene_hist)
            hist_std = np.std(gene_hist)

            if plot_data:
                match_scores_hist_pair_plot_folder = os.path.join(match_scores_hist_plot_folder, pair_name)
                create_dir_if_not_exist([match_scores_hist_pair_plot_folder])
                sns.set_style('darkgrid')
                plot = sns.distplot(gene_hist, bins=num_bins).set_title("hist_mean: " + str('%.5f' % hist_mean) + "   hist_std: " + str('%.5f' % hist_std))
                plot.get_figure().savefig(os.path.join(match_scores_hist_pair_plot_folder, "gene:" + gene))
                plot.get_figure().clf()

            for match_score_real in match_scores_real:
                pair_name_real = match_score_real["pair_name"]
                if pair_name_real == pair_name:
                    real_score = match_score_real["match_score"][gene]
                    z_score = (real_score - hist_mean) / hist_std
                    pvalue = st.norm.sf(abs(z_score))
                    p_value_matrix[pair_name][gene] = pvalue
                    # p_value_matrix[gene][pair_name] = pvalue
            # if (plot_num < 3):
            #     print(gene)
            #     print(pair_name)
            #     plt.figure()
            #     gene_hist.plot.hist(grid=True, bins=10, rwidth=0.9, color='#607c8e')
            #     plt.show()

            plot_num += 1

    reproducible_genes = []
    for gene, pvalue_row in p_value_matrix.iterrows():
        pvalue_row = pvalue_row.to_numpy()

        y = multipletests(pvals=pvalue_row, alpha=FDR, method="fdr_bh")
        number_of_significative_values_python = len(y[1][np.where(y[1] < FDR)])

        #
        # print("gene")
        # print(gene)
        # print("pvalue row")
        # print(pvalue_row)
        # print("Benjamini-Hockberg thresholds")
        # print(y[1])
        # print("number of significative values")
        # print(number_of_significative_values)
        #
        # # if all the pvalues are below the threshold for each dataset then the gene can be considered reproducible
        # if number_of_significative_values == len(pair_names_list):
        #     reproducible_genes.append(gene)

        pvalue_row = np.sort(pvalue_row)
        critical_values = ((np.nonzero(pvalue_row >= 0)[0] + 1) / len(pair_names_list)) * FDR
        bh_candidates = pvalue_row[pvalue_row <= critical_values]
        # print ("funzione multipletests:" + str(number_of_significative_values_python)+"   funzione di davide:"+str(len(bh_candidates)))

        if len(bh_candidates) > 0:
            idx_of_max_value = np.argwhere(bh_candidates == np.amax(bh_candidates)).flatten().tolist()[-1] + 1
            bh_selected = pvalue_row[np.array(range(0, idx_of_max_value))]
            if len(bh_selected) == len(pair_names_list):
                reproducible_genes.append(gene)

    reproducible_sequence_mask, first_matrix_01_with_only_reproducible_genes = extract_reproducible_sequences(reproducible_genes, matrix_01_list)
    # take the first matrix 01 with only reproducible genes and put to zero the non reproducible parts
    first_matrix_01_with_only_reproducible_genes[~reproducible_sequence_mask] = 0
    reproducible_sequence = pd.DataFrame(first_matrix_01_with_only_reproducible_genes, index=reproducible_genes)
    reproducible_sequence.to_csv(os.path.join(reproducible_sequence_output_dir, "reproducible_sequence.csv"), index=True, header=True, decimal='.', sep=',', float_format='%.6f')
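
The selection loop above implements the Benjamini-Hochberg step-up procedure by hand: sort the p-values, compare the i-th smallest against its critical value i / m * FDR, and keep every gene up to the largest passing index. A small self-contained check that the manual computation agrees with multipletests from statsmodels (which the commented-out block references); the toy p-values are made up for illustration:

import numpy as np
from statsmodels.stats.multitest import multipletests

FDR = 0.01
pvalues = np.array([0.0002, 0.004, 0.006, 0.8])
m = len(pvalues)

# manual BH: critical value for the i-th smallest p-value is i / m * FDR
pvalues_sorted = np.sort(pvalues)
critical_values = (np.arange(1, m + 1) / m) * FDR
passing = pvalues_sorted <= critical_values
n_selected = (np.max(np.where(passing)[0]) + 1) if passing.any() else 0

# the library version rejects the same hypotheses
reject, _, _, _ = multipletests(pvals=pvalues, alpha=FDR, method="fdr_bh")
print(n_selected, reject.sum())  # 3 3
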
Example #15
num_task = num_cores - 1
plot_data = True
num_bins = 15

# I/O directories
input_dir = os.path.join(os.getcwd(), "check_reduced/")  # path to the data input directory
match_scores_output_dir = os.path.join(os.getcwd(), "matrix_python/match_scores/")  # directory where the saved match scores are stored
reproducible_sequence_output_dir = os.path.join(os.getcwd(), "matrix_python/reproducible_sequence/")  # directory where the reproducible sequence outputs are stored
genes_lengths_path = os.path.join(os.getcwd(), "gene_lengths.csv")  # file containing each gene's ID and the corresponding gene length
histogram_plot_path = os.path.join(os.getcwd(), "genes_histograms/")  # directory where per-gene histogram plots are saved
intermediate_results = os.path.join(os.getcwd(), "intermediate_results/")
plots_folder = os.path.join(os.getcwd(), "plots/")
match_scores_hist_plot_folder = os.path.join(plots_folder, "match_scores_hist/")
match_coverage_hist_plot_folder = os.path.join(plots_folder, "coverage_hist/")
path_match_score_csv = os.path.join(os.getcwd(), "path_match_score_csv/")
create_dir_if_not_exist([input_dir, match_scores_output_dir, histogram_plot_path, reproducible_sequence_output_dir, intermediate_results, plots_folder, match_scores_hist_plot_folder, path_match_score_csv])

FDR = 0.01


def signal_digitalisation(genes, bed_files_dicts, areReadsRandomized, add_small_random_value):
    matrix_01_list = []
    for bed_files_dict in bed_files_dicts:
        bed_file = bed_files_dict["bed_file"]
        bed_file_name = bed_files_dict["bed_file_name"]
        me = MatricesExtractor(bed_file, genes)
        # extract the matrices
        pd_matrix_coverage, matrix_01 = me.extract_matrices(areReadsRandomized=areReadsRandomized, add_small_random_value=add_small_random_value)
        if plot_data:
            for gene, coverage in pd_matrix_coverage.iterrows():
                coverage = coverage[~np.isnan(coverage)]
Example #16
)  #matrix_python_CONTROL_all/reproducible_sequence/")#"matrix_python_472/reproducible_sequence/")  # Sets the directory where all the saved outputs will be stored
genes_lengths_path = os.path.join(
    os.getcwd(), "../../CDShumanGenesLengths.txt"
)  # file containing each gene's ID and the corresponding gene length
histogram_plot_path = os.path.join(
    os.getcwd(), "genes_histograms_LC509_nosoglia_27/"
)  # directory where per-gene histogram plots are saved
intermediate_results = os.path.join(os.getcwd(),
                                    "intermediate_results_LC509_nosoglia_27/")
random_comparisons_folder_match_scores = os.path.join(
    os.getcwd(), "random_comparisons_folder_match_scores_LC509_nosoglia/")
random_comparisons_folder_matrix_01 = os.path.join(
    os.getcwd(), "random_comparisons_folder_matrix_01_LC509_nosoglia/")

create_dir_if_not_exist([
    input_dir, match_scores_output_dir, histogram_plot_path,
    reproducible_sequence_output_dir, intermediate_results
])

num_comparison = 200  # 200 and 50 to make 10000  # NOTE: number of random comparisons to run for each pair of bed files

save_random_match_scores = False
save_matrix_01 = False


def extract_gene_list(genes, bed_files_dicts):
    gene_lists = []
    for bed_files_dict in bed_files_dicts:
        bed_file = bed_files_dict["bed_file"]
        table_FP = bed_file["Chromosome"].value_counts().sort_index(
        ).rename_axis('GeneID').reset_index(name='ReadsCounts')
        table_FP_Geneslengths = pd.merge(table_FP, genes, on="GeneID")
Example #17
# BoXHED 2.0 (https://arxiv.org/pdf/2103.12591.pdf) is a software package
# for estimating hazard functions nonparametrically via gradient boosting.
# It is orders of magnitude faster than BoXHED 1.0 (http://proceedings.mlr.press/v119/wang20o/wang20o.pdf).
# BoXHED 2.0 also allows for more general forms of survival data including recurrent events.

# This tutorial demonstrates how to apply BoXHED 2.0 to a synthetic dataset.

DATA_ADDRESS = "./data/"  # train/test data directory
RSLT_ADDRESS = "./results/"  # results directory

nthread_prep = 20  # number of CPU threads used for preprocessing
nthread_train = 20  # number of CPU threads used for training

# Create the results directory if it does not exist
for addr in [RSLT_ADDRESS]:
    create_dir_if_not_exist(addr)


# Function: read_train_data
# Reads the synthetic training data.
# Input:
#      None
# Return:
#      @ A pandas dataframe containing training data with the following columns:
#            * ID:      subject ID
#            * t_start: the start time of an epoch for the subject
#            * t_end:   the end time of the epoch
#            * X_i:     values of covariates between t_start and t_end
# Sample Output:
# ID    t_start         t_end           X_0             delta
# 1     0.010000        0.064333        0.152407        0.0
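
The comment block fully specifies the expected dataframe. A minimal sketch of read_train_data, assuming the synthetic training data ships as a CSV inside DATA_ADDRESS (the file name below is a hypothetical placeholder):

import pandas as pd

def read_train_data():
    # Columns: ID, t_start, t_end, covariates X_i, and the event
    # indicator delta, as described above. "training.csv" is a
    # placeholder name for the synthetic data file.
    return pd.read_csv(DATA_ADDRESS + "training.csv")
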
Example #18
def signal_digitalisation(genes, bed_files_dicts, areReadsRandomized,
                          add_small_random_value):
    matrix_01_list = []
    for bed_files_dict in bed_files_dicts:
        bed_file = bed_files_dict["bed_file"]
        bed_file_name = bed_files_dict["bed_file_name"]
        # extract the matrix coverage and the matrix 01 for each bed file
        me = MatricesExtractor(bed_file, genes, bed_file_name)
        pd_matrix_01, pd_matrix_coverage = me.extract_matrices(
            areReadsRandomized=areReadsRandomized,
            add_small_random_value=add_small_random_value)
        pd_matrix_coverage.to_csv(os.path.join(matrix_coverage_real_dir,
                                               bed_file_name + ".csv"),
                                  index=True,
                                  header=True,
                                  decimal='.',
                                  sep=',',
                                  float_format='%.6f')

        if plot_data:
            # plot the matrix coverage for some specific genes
            for gene, coverage in pd_matrix_coverage.iterrows():
                coverage = coverage[~np.isnan(coverage)]

                match_scores_hist_pair_plot_folder = os.path.join(
                    match_coverage_hist_plot_folder, bed_file_name)
                create_dir_if_not_exist([match_scores_hist_pair_plot_folder])

                x = range(0, len(coverage))
                fig, ax = plt.subplots()
                # if gene == "ENST00000367755.9" or gene == "ENST00000200639.9" or gene == "ENST00000301522.3" or gene == "ENST00000371634.7"or gene == "ENST00000361789.2" or gene == "ENST00000239938.5" or gene == "ENST00000220763.10" or gene == "ENST00000273258.4" :
                plot = sns.lineplot(x, coverage, color='black')
                plot.fill_between(x, coverage, color='black')
                plot = sns.lineplot(x,
                                    coverage.median(),
                                    color='orange',
                                    hue=coverage.median(),
                                    palette=["C0"])
                ax.legend(title="Median")

                plot.set(xticks=((x[0::int(len(coverage) * 0.08)])))
                plot.get_figure().savefig(
                    os.path.join(match_scores_hist_pair_plot_folder,
                                 "gene:" + gene + ".pdf"))
                plot.get_figure().clf()
            # plot the matrix 01 for some specific genes
            for gene, matrix_01 in pd_matrix_01.iterrows():
                matrix_01 = matrix_01[~np.isnan(matrix_01)]

                hist_01_hist_pair_plot_folder = os.path.join(
                    hist_01_plot_folder, bed_file_name)
                create_dir_if_not_exist([hist_01_hist_pair_plot_folder])
                #         # print_full(coverage)
                x = range(0, len(matrix_01))
                # if gene == "ENST00000367755.9" or gene == "ENST00000200639.9" or gene == "ENST00000301522.3" or gene == "ENST00000371634.7"or gene =="ENST00000361789.2" or gene == "ENST00000239938.5" :
                plot = sns.lineplot(x, matrix_01, color='black')
                plot.fill_between(x, matrix_01, color='black')
                #
                plot.set(xticks=((x[0::int(len(matrix_01) * 0.08)])))
                #
                plot.get_figure().savefig(
                    os.path.join(hist_01_hist_pair_plot_folder,
                                 "gene:" + gene + ".pdf"))
                plot.get_figure().clf()
                matrix_01.to_csv(os.path.join(hist_01_hist_pair_plot_folder,
                                              "gene" + gene + ".csv"),
                                 index=True,
                                 header=True,
                                 decimal='.',
                                 sep=',',
                                 float_format='%.6f')

        matrix_01_list.append({
            'matrix': pd_matrix_01,
            'file_name': bed_file_name
        })

    return matrix_01_list
Example #19
    if exp_num == 2:
        return beta.ppf(-math.log(u) / beta.pdf(x, 4, 4) + beta.cdf(t0, 4, 4),
                        4, 4)

    if exp_num == 3:
        return np.exp(x - norm.ppf(u * norm.cdf(x - math.log(t0))))

    if exp_num == 4:
        return np.power(
            -math.log(u) / (np.exp(-0.5 * math.cos(2 * math.pi * x) - 1.5)) +
            np.power(t0, 1.5), 2 / 3)


file_addr = "./synth_files/"
create_dir_if_not_exist(file_addr)

num_irr = 40
t_min = 0.01
t_max = {1: 1, 2: 1, 3: 5, 4: 5}
num_pcs = 10
max_size = int(14e6)

n_sub = {'train': 1000000, 'test': 5000}

seed = {'train': 0, 'test': 1}


def set_subj_1_to_N(data):
    subject_converter = dict(
        zip(sorted(data['subject'].unique()),
Example #20
def calc_reproducible_sequences(match_scores_list, gene_list, pair_names_list, match_scores_real, matrix_01_list):
    # compute the match score histograms for the random comparisons
    # match_score_list contains all the match scores computed during the random comparisons
    match_scores_hist = {}
    for fake_match_scores in match_scores_list:
        for fake_match_score in fake_match_scores:
            pair_name = fake_match_score['pair_name']
            match_scores_fake = fake_match_score['match_score']
            # match_scores_hist aggregates the match scores indexing by pair name (name of the pair of bed files) and gene
            gene_hist = {}
            for gene, match_score in match_scores_fake.items():
                gene_hist[gene] = [match_score]

            if pair_name in match_scores_hist:
                for gene, match_score in match_scores_fake.items():
                    match_scores_hist[pair_name][gene].append(match_score)  # = [match_score]
            else:
                match_scores_hist[pair_name] = gene_hist

    # the p-value matrix contains the p-values indexed by the pair of bed files and the gene name
    p_value_matrix = pd.DataFrame(index=gene_list, columns=pair_names_list)

    # the matrix summary is a pandas dataframe that summarizes the mean, standard deviation, z-score and p-value of each gene
    matrix_summary_columns = [[c + "_mean", c + "_std", c + "_zscore", c + "_pvalue"] for c in pair_names_list]
    matrix_summary_columns = np.reshape(matrix_summary_columns, -1)
    matrix_summary = pd.DataFrame(index=gene_list, columns=matrix_summary_columns)
    plot_num = 0
    # extract pvalues for each gene and dataset pair
    for pair_name in match_scores_hist:
        for gene in match_scores_hist[pair_name]:
            # compute mean and standard deviation for each gene
            gene_hist = pd.Series(match_scores_hist[pair_name][gene])
            hist_mean = np.mean(gene_hist)
            hist_std = np.std(gene_hist)

            if plot_data:
                # plot the histogram of match scores for each gene
                match_scores_hist_pair_plot_folder = os.path.join(match_scores_hist_plot_folder, pair_name)
                create_dir_if_not_exist([match_scores_hist_pair_plot_folder])
                sns.set_style('darkgrid')
                plot = sns.distplot(gene_hist, bins=num_bins).set_title("hist_mean: " + str('%.5f' % hist_mean) + "   hist_std: " + str('%.5f' % hist_std))
                plot.get_figure().savefig(os.path.join(match_scores_hist_pair_plot_folder, "gene:" + gene))
                plot.get_figure().clf()

            # compute the pvalues of each gene for each pair of bed files
            for match_score_real in match_scores_real:
                pair_name_real = match_score_real["pair_name"]
                if pair_name_real == pair_name:
                    real_score = match_score_real["match_score"][gene]
                    if hist_std != 0:
                        # compute the zscore and corresponding pvalue
                        z_score = (real_score - hist_mean) / hist_std
                        pvalue = 1 - st.norm.cdf(z_score)
                    else:
                        z_score = np.nan
                        pvalue = np.nan

                    # insert the results in the matrix summary
                    matrix_summary[pair_name + "_mean"][gene] = hist_mean
                    matrix_summary[pair_name + "_std"][gene] = hist_std
                    matrix_summary[pair_name + "_zscore"][gene] = z_score
                    matrix_summary[pair_name + "_pvalue"][gene] = pvalue
                    p_value_matrix[pair_name][gene] = pvalue

            plot_num += 1

    reproducible_genes = []
    matrix_summary.to_csv(os.path.join(reproducible_sequence_output_dir, "matrix_summary_005.csv"), index=True, header=True, decimal='.', sep=',', float_format='%.6f')
    for gene, pvalue_row in p_value_matrix.iterrows():
        pvalue_row = pvalue_row.to_numpy()

        # test the pvalues with Benjamini/Hochberg

        # python version
        # uses multipletests from the statsmodels library; results match the R script: number_of_significative_values_python == len(bh_candidates)

        # y = multipletests(pvals=pvalue_row, alpha=FDR, method="fdr_bh")
        # number_of_significative_values_python = len(y[1][np.where(y[1] < FDR)])

        # # if all the pvalues are below the threshold for each dataset then the gene can be considered reproducible
        # if number_of_significative_values == len(pair_names_list):
        #     reproducible_genes.append(gene)

        # R version
        pvalue_row = np.sort(pvalue_row)
        critical_values = ((np.nonzero(pvalue_row >= 0)[0] + 1) / len(pair_names_list)) * FDR

        # select the gene only if the number of selected elements is above the threshold
        reproducibility_threshold = int(round(len(pair_names_list) * reproducibility_min_fraction))
        # print(reproducibility_threshold)
        if len(critical_values) > 0:
            # remove nan from pvalue row
            pvalue_row = pd.to_numeric(pvalue_row, errors='coerce')
            pvalue_row = pvalue_row[np.logical_not(np.isnan(pvalue_row))]
            bh_candidates = pvalue_row[pvalue_row <= critical_values]
            if len(bh_candidates) > 0:
                idx_of_max_value = np.argwhere(bh_candidates == np.amax(bh_candidates)).flatten().tolist()[-1] + 1
                bh_selected = pvalue_row[np.array(range(0, idx_of_max_value))]
                if len(bh_selected) >= reproducibility_threshold:
                    reproducible_genes.append(gene)

    # extract the reproducible sequences
    reproducible_sequence_mask, first_matrix_01_with_only_reproducible_genes = extract_reproducible_sequences(reproducible_genes, matrix_01_list)
    # take the first matrix 01 with only reproducible genes and put to zero the non reproducible parts
    first_matrix_01_with_only_reproducible_genes[~reproducible_sequence_mask] = 0
    reproducible_sequence = pd.DataFrame(first_matrix_01_with_only_reproducible_genes, index=reproducible_genes)
    reproducible_sequence.to_csv(os.path.join(reproducible_sequence_output_dir, "reproducible_sequence_005.csv"), index=True, header=True, decimal='.', sep=',', float_format='%.6f')
    p_value_matrix.to_csv(os.path.join(reproducible_sequence_output_dir, "global_matrix_005.csv"), index=True, header=True, decimal='.', sep=',', float_format='%.6f')