Example #1
def split_bam(donor_id, analysis_id, use_cntl, specimen_map, data_path, outpath, clean_dir, tmp_dir, num_processes=8):
    """Processes a sample level BAM by RG to create unaligned BAMs"""

    output_dir = utils.make_new_dir(os.path.join(outpath, donor_id, analysis_id))
    metadata = header_utils.parse_cghub_metadata(analysis_id)
    metadata['use_cntl'] = use_cntl
    bam_filename = bam_utils.get_bam_file(os.path.join(data_path, analysis_id))
    logger.info('Starting gen_unaligned_bam: %s' % analysis_id)
    bam_utils.gen_unaligned_bam(bam_filename, analysis_id, metadata, specimen_map, output_dir, clean_dir, tmp_dir, num_processes=num_processes)
    logger.info('Completed gen_unaligned_bam: %s' % analysis_id)
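Every example on this page goes through a project-local `utils.make_new_dir` helper whose source is not shown. As a rough sketch only, assuming the helper joins its path parts, creates the directory tree if missing, and returns the resulting path:

import os

def make_new_dir(*path_parts):
    # Hypothetical sketch of the helper these examples assume: join the
    # parts, create the directory (parents included) if it does not
    # already exist, and return the full path.
    new_dir = os.path.join(*path_parts)
    os.makedirs(new_dir, exist_ok=True)
    return new_dir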
Example #2
def classif_experiment(features_file_name, labels_file_name,
                       train_percent_list = [1, 5, 10, 20], n_rep = 5,
                       results_files_prefix = '',
                       results_dir = svm_results_dir()):

    (features, labels, n_samples) = extract_training_and_test(
        features_file_name, labels_file_name)

    results_dir = utils.make_new_dir(results_dir, 'svm_experiment')
    with open(path.join(results_dir, 'description.txt'), 'w') as desc_file:
        desc_file.write('features file: %s\n' % features_file_name)
        desc_file.write('labels file: %s\n' % labels_file_name)
        desc_file.write('n samples: %d\n' % n_samples)
        desc_file.write('n positives: %d\n' % np.sum(labels))
        
    for train_percent in train_percent_list:
        file_name_prefix = '%s_train_%d_percent' % (results_files_prefix,
                                                    train_percent)
        
        file_name_prefix = utils.find_good_name(results_dir, file_name_prefix)
        
        fp_file_name = utils.find_good_name(
            results_dir, '%s_false_positives.txt' % file_name_prefix)
        fn_file_name = utils.find_good_name(
            results_dir, '%s_false_negatives.txt' % file_name_prefix)
        tp_file_name = utils.find_good_name(
            results_dir, '%s_true_positives.txt' % file_name_prefix)
        tn_file_name = utils.find_good_name(
            results_dir, '%s_true_negatives.txt' % file_name_prefix)
        with open(fp_file_name, 'a') as fp_file:
            with open(fn_file_name, 'a') as fn_file:
                with open(tp_file_name, 'a') as tp_file:
                    with open(tn_file_name, 'a') as tn_file:
                        
                            for rep in range(n_rep):
                            print ('%d percent of train, repetition %d'
                                   % (train_percent, rep))
                
                            train_indices, test_indices = choose_train_indices(
                                n_samples, train_percent)
            
                            classif = train_classif(
                                features[train_indices,:],
                                labels[train_indices])
            
                            perf = test_classif(
                                classif, features[test_indices,:],
                                labels[test_indices])
                
                            fp_file.write('%d\n' % perf['false_positives'])
                            fn_file.write('%d\n' % perf['false_negatives'])
                            tp_file.write('%d\n' % perf['true_positives'])
                            tn_file.write('%d\n' % perf['true_negatives'])
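This snippet leans on helpers that are not shown (choose_train_indices, train_classif, test_classif). A minimal sketch of what choose_train_indices plausibly does, assuming a uniform random train/test split:

import numpy as np

def choose_train_indices(n_samples, train_percent):
    # Hypothetical sketch: draw a random permutation and take the first
    # train_percent percent of indices for training, the rest for testing.
    n_train = max(1, int(round(n_samples * train_percent / 100.0)))
    perm = np.random.permutation(n_samples)
    return perm[:n_train], perm[n_train:]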
Example #3
def create_dl_dir(leaf_name_root, extradir=None):
    # structure of dirs created, shown by example:
    # 1) no extradir:
    #    2013
    #     +-10
    #        +-leaf_name_root05_120101_55abbbc
    #            ...
    #
    # 2) with extradir:
    #    2013
    #     +-10
    #        +-extradir
    #            +-leaf_name_root05_120102_44deeef
    #                ...

    root_dl_dir = DownloadManagerController.Instance().get_download_dir()
    st_time = time.gmtime()
    yr_name  = str(st_time.tm_year)
    mon_name = str(st_time.tm_mon)
    leaf_dir_name = mkFname(leaf_name_root)

    if extradir:
        path = (yr_name, mon_name, extradir, leaf_dir_name)
    else:
        path = (yr_name, mon_name, leaf_dir_name)
    
    rel_path  = os.path.join(*path)

    full_path = root_dl_dir
    for p in path[:-1]:
        full_path = os.path.join(full_path, p)
        check_or_make_dir(full_path, logger)

    full_path = os.path.join (full_path, leaf_dir_name)
    make_new_dir(full_path, logger)

    return full_path, rel_path
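A call in the download-manager context this function comes from would look something like the following; the timestamp-and-hash suffix on the leaf name is produced by mkFname, so the exact name varies per call:

full_path, rel_path = create_dl_dir('products_', extradir='pending')
# full_path -> <download_root>/2013/10/pending/products_05_120102_44deeef
# rel_path  -> 2013/10/pending/products_05_120102_44deeef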
Example #5
def gosta_experiment(nb_repetitions = 1, n_iter = 10000, trace_period = 10,
                     data_files = ['wine.csv'],
                     graph_type_list = ['watts', 'complete', 'grid'],
                     f_to_avg = pf.within_clust_scatter,
                     averaging_function = gosta.neighbourhood_avg,
                     root_results_folder = path_to_results_folder()):
    
    for data_src in data_files:
        data_name = re.match(r'([^\.]+)\..*', data_src).group(1)
        print('\n%s' % data_name)

        data = gc.parse(data_src)
        print(data.shape)
        true_mean = compute_truth_all_pairs(
            data, f_to_avg)
        
        for graph_type in graph_type_list:
            print('\n%s\n' % graph_type)

            results_folder_name = '%s_%s_%s' % (
                function_names[averaging_function], data_name, graph_type)

            results_folder = utils.make_new_dir(
                root_results_folder, results_folder_name)

            graph = gc.build_graph(data, graph_type)
            traces = []
            error_traces = []
            
            for rep in range(nb_repetitions):
                print('%s, %s, repetition %d' % (data_name, graph_type, rep))

                trace = gosta.gosta_sync(
                    graph, f_to_avg, n_iter = n_iter,
                    trace_period = trace_period,
                    averaging_function = averaging_function,
                    log_filename = path.join(results_folder, 'log.npy'),
                    log_period = 1000)

                traces.append(trace)
                error_traces.append(np.abs((np.array(trace)
                                            - true_mean)
                                           / true_mean))

            traces = np.array(traces)
            error_traces = np.atleast_2d(np.array(error_traces))
            err_mean = np.mean(error_traces, axis = 0)

            traces_file_name = path.join(
                results_folder,
                '%s_%s_traces.npy' % (data_name, graph_type))
            
            mean_file_name = path.join(
                results_folder,
                '%s_%s_mean.npy' % (data_name, graph_type))

            np.save(traces_file_name, traces)
            np.save(mean_file_name, err_mean)
Example #6
    parser.add_argument('--tumor_id',
                        type=str,
                        help='Comma separated list of tumor analysis UUIDs (analysis_id(s))',
                        default=None)
    parser.add_argument('--work_dir',
                        type=str,
                        help='path/to/output/directory',
                        default=None)
    parser.add_argument('--output_dir',
                        type=str,
                        help='path/to/output/directory',
                        required=True)

    args = parser.parse_args()

    if args.work_dir is None:
        args.work_dir = args.output_dir
    exit_code = 0
    output_dir = utils.make_new_dir(args.output_dir)
    work_dir = utils.make_new_dir(args.work_dir)
    try:
        if args.tumor_id is None and args.normal_id is not None:
            metadata = header_utils.parse_cghub_metadata(args.normal_id)
            metadata['use_cntl'] = 'N/A'
            exit_code = bam_utils.gen_unaligned_bam(args.bam_path,
                                                    args.normal_id, metadata,
                                                    specimen_dict, work_dir,
                                                    output_dir)
        elif args.tumor_id is not None and args.normal_id is not None:
            metadata = header_utils.parse_cghub_metadata(args.tumor_id)
            metadata['use_cntl'] = args.normal_id
            exit_code = bam_utils.gen_unaligned_bam(args.bam_path,
                                                    args.tumor_id, metadata,
                                                    specimen_dict, work_dir,
                                                    output_dir)
        else:
            print("Please define --normal_id or (--normal_id and --tumor_id)")
            sys.exit(1)
    except Exception:
        print("PCAP SPLIT Failure!!!")
Example #7
def main():
    tweets = _read_tweets_to_dataframe("data/tweet_data/", True, 2000)
    make_new_dir("data/datasets")
    save_to_csv(tweets, "data/datasets/individual_tweets.csv", "tweet_id")
Example #8
import pickle

import numpy as np
import pandas as pd

import utils

segmented_dir_template = "../outputs/check_segmentation_fovea/{}/"
ori_img_dir = "../data/merged_training_set/"

# load features, labels
csv_file = "../outputs/dme_features.csv"
df = pd.read_csv(csv_file)
df_mat = df.to_numpy()
n_total = len(df)
ratio_val = 0.1
n_val = int(n_total * ratio_val)

# set outdir
out_dir = "../dme_xgb_models"
utils.make_new_dir(out_dir)

# run xgboost
min_child_weight = 1
subsample = 0.2
colsample_by_tree = 0.2
colsample_bylevel = 0.2
lambda_val = 3
alpha = 5
depth = 8
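# The names above mirror xgboost's parameters ('colsample_by_tree' here
# presumably corresponds to xgboost's 'colsample_bytree'). A hypothetical
# sketch of how they would be collected for a training call, which falls
# outside this excerpt:
params = {
    'min_child_weight': min_child_weight,
    'subsample': subsample,
    'colsample_bytree': colsample_by_tree,
    'colsample_bylevel': colsample_bylevel,
    'lambda': lambda_val,
    'alpha': alpha,
    'max_depth': depth,
}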

train_accs, val_accs = [], []

for i in range(10):
    # set training and validation dataset
    train_X = np.concatenate([
Example #9
    parser = argparse.ArgumentParser(prog='pcap_split.py', description='Create unaligned BAM files')
    parser.add_argument('--bam_path', type=str, help='path/to/tcga/data/labeled_by_analysis_id', required=True)
    parser.add_argument('--normal_id', type=str, help='UUID for normal analysis (analysis_id)', default=None)
    parser.add_argument('--tumor_id', type=str, help='Comma separated list of tumor analysis UUIDs (analysis_id(s))', default=None)
    parser.add_argument('--work_dir', type=str, help='path/to/output/directory', default=None)    
    parser.add_argument('--output_dir', type=str, help='path/to/output/directory', required=True)
    parser.add_argument('--specimen_map', type=str, default=os.path.join(basedir,'tcga_dcc_specimen_type.txt'), help='path/to/tcga/icgc/sample_code_specimen_mapping')

    args = parser.parse_args()
    
    specimen_dict = parse_specimen_dict(args.specimen_map)
    
    if args.work_dir is None:
        args.work_dir = args.output_dir
    exit_code = 0
    output_dir = utils.make_new_dir(args.output_dir)
    work_dir = utils.make_new_dir(args.work_dir)
    try:
        if args.tumor_id is None and args.normal_id is not None:
            metadata = header_utils.parse_cghub_metadata(args.normal_id)
            metadata['use_cntl'] = 'N/A'
            exit_code = bam_utils.gen_unaligned_bam(args.bam_path, args.normal_id, metadata, specimen_dict, work_dir, output_dir)
        elif args.tumor_id is not None and args.normal_id is not None:
            metadata = header_utils.parse_cghub_metadata(args.tumor_id)
            metadata['use_cntl'] = args.normal_id
            exit_code = bam_utils.gen_unaligned_bam(args.bam_path, args.tumor_id, metadata, specimen_dict, work_dir, output_dir)
        else:
            print("Please define --normal_id or (--normal_id and --tumor_id)")
            sys.exit(1)
    except Exception:
        print("PCAP SPLIT Failure!!!")
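Assuming this script is saved as pcap_split.py (the name registered with argparse above), a typical invocation for a tumor/normal pair would look like:

python pcap_split.py --bam_path /path/to/tcga/data --normal_id NORMAL_UUID --tumor_id TUMOR_UUID --output_dir /path/to/out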