def tool_newref(args):
    """Create a new reference from a set of converted sample files.

    Reads ``args.outfile``, ``args.infiles``, ``args.binsize``, ``args.nipt``
    and ``args.cpus``. Sets ``args.basepath``, ``args.prepfile``,
    ``args.partfile`` and ``args.tmpoutfile`` as a side effect, because the
    ``tool_newref_prep`` / ``tool_newref_main`` / ``tool_newref_merge``
    helpers receive ``args`` and are presumably driven by those attributes.

    An autosomal ('A') reference is always built. A female ('F') gonosomal
    reference is built when more than 4 samples are labelled 'F', and a male
    ('M') one when more than 4 are labelled 'M' and ``args.nipt`` is falsy.

    Raises:
        SystemExit: with status 1 when fewer than 10 samples are provided.
    """
    logging.info('Creating new reference')

    # Strip an optional '.npz' suffix from the output file name so the
    # derived intermediate file names share one base path.
    out_dir, out_name = os.path.split(args.outfile)
    if out_name.endswith('.npz'):
        out_name = out_name[:-4]
    base_path = os.path.join(out_dir, out_name)

    args.basepath = base_path
    args.prepfile = '{}_prep.npz'.format(base_path)
    args.partfile = '{}_part'.format(base_path)

    samples = []
    logging.info('Importing data ...')
    for infile in args.infiles:
        logging.info('Loading: {}'.format(infile))
        # 'sample' is stored as a pickled object array, so NumPy >= 1.16.3
        # refuses to load it unless allow_pickle=True is passed explicitly.
        # SECURITY: unpickling can execute arbitrary code; only feed this
        # tool .npz files from a trusted source.
        # The context manager closes the underlying archive handle.
        with np.load(infile, encoding='latin1', allow_pickle=True) as npzdata:
            sample = npzdata['sample'].item()
            binsize = int(npzdata['binsize'])
        logging.info('Binsize: {}'.format(int(binsize)))
        samples.append(scale_sample(sample, binsize, args.binsize))

    # Fail fast: there is one gender label per sample (the labels are used
    # below as a boolean index into `samples`), so checking here is the same
    # test as the sample-count check that guards the autosomal reference,
    # without first training the gender model and computing masks.
    if len(samples) < 10:
        logging.critical(
            'Provide at least 10 samples to enable the generation of a reference.'
        )
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    samples = np.array(samples)
    genders, trained_cutoff = train_gender_model(samples)

    if not args.nipt:
        for i, sample in enumerate(samples):
            samples[i] = gender_correct(sample, genders[i])

    gender_labels = np.array(genders)
    is_female = gender_labels == 'F'
    is_male = gender_labels == 'M'
    enough_females = genders.count('F') > 4
    enough_males = genders.count('M') > 4

    # Only keep bins that pass the mask in every (sub)group that will get
    # its own reference.
    total_mask, bins_per_chr = get_mask(samples)
    if enough_females:
        mask_F, _ = get_mask(samples[is_female])
        total_mask = total_mask & mask_F
    if enough_males and not args.nipt:
        mask_M, _ = get_mask(samples[is_male])
        total_mask = total_mask & mask_M

    outfiles = []

    logging.info('Starting autosomal reference creation ...')
    args.tmpoutfile = '{}.tmp.A.npz'.format(args.basepath)
    outfiles.append(args.tmpoutfile)
    tool_newref_prep(args, samples, 'A', total_mask, bins_per_chr)
    logging.info('This might take a while ...')
    tool_newref_main(args, args.cpus)

    if enough_females:
        logging.info('Starting female gonosomal reference creation ...')
        args.tmpoutfile = '{}.tmp.F.npz'.format(args.basepath)
        outfiles.append(args.tmpoutfile)
        tool_newref_prep(args, samples[is_female], 'F', total_mask,
                         bins_per_chr)
        logging.info('This might take a while ...')
        tool_newref_main(args, 1)
    else:
        logging.warning(
            'Provide at least 5 female samples to enable normalization of female gonosomes.'
        )

    if not args.nipt:
        if enough_males:
            logging.info('Starting male gonosomal reference creation ...')
            args.tmpoutfile = '{}.tmp.M.npz'.format(args.basepath)
            outfiles.append(args.tmpoutfile)
            tool_newref_prep(args, samples[is_male], 'M', total_mask,
                             bins_per_chr)
            tool_newref_main(args, 1)
        else:
            logging.warning(
                'Provide at least 5 male samples to enable normalization of male gonosomes.'
            )

    tool_newref_merge(args, outfiles, trained_cutoff)

    logging.info('Finished creating reference')
def tool_test(args):
    """Predict copy number aberrations for one sample against a reference.

    Reads ``args.bed``, ``args.plot``, ``args.zscore``, ``args.beta``,
    ``args.alpha``, ``args.reference``, ``args.infile``, ``args.gender`` and
    ``args.blacklist``. The sample is rescaled to the reference bin size,
    normalized for autosomes and gonosomes, post-processed, optionally
    blacklisted, segmented, and finally written as tables (``--bed``) and/or
    plots (``--plot``).

    Raises:
        SystemExit: with status 1 when no output format is selected or when
            ``--zscore``, ``--beta`` or ``--alpha`` is out of range.
    """
    logging.info('Starting CNA prediction')

    # --- Argument validation (exit non-zero so callers see the failure) ---
    if not args.bed and not args.plot:
        logging.critical(
            'No output format selected. '
            'Select at least one of the supported output formats (--bed, --plot)'
        )
        sys.exit(1)

    if args.zscore <= 0:
        logging.critical(
            'Parameter --zscore should be a strictly positive number')
        sys.exit(1)

    if args.beta is not None:
        # The accepted range is (0, 1]; the message states that explicitly.
        if args.beta <= 0 or args.beta > 1:
            logging.critical(
                'Parameter --beta should be a strictly positive number lower than or equal to 1'
            )
            sys.exit(1)

    if args.alpha <= 0 or args.alpha > 1:
        logging.critical(
            'Parameter --alpha should be a strictly positive number lower than or equal to 1'
        )
        sys.exit(1)

    # --- Data import ---
    logging.info('Importing data ...')
    # 'sample' is a pickled object array; NumPy >= 1.16.3 needs
    # allow_pickle=True to read it.
    # SECURITY: unpickling can execute arbitrary code; only use reference
    # and sample files from a trusted source.
    ref_file = np.load(args.reference, encoding='latin1', allow_pickle=True)
    with np.load(args.infile, encoding='latin1',
                 allow_pickle=True) as sample_file:
        sample = sample_file['sample'].item()
        sample_binsize = int(sample_file['binsize'].item())

    # Total read count is taken before rescaling/correcting the sample.
    n_reads = sum(sum(bins) for bins in sample.values())

    sample = scale_sample(sample, sample_binsize, int(ref_file['binsize']))

    # --- Gender handling: --gender overrides the prediction/default ---
    if not ref_file['is_nipt']:
        actual_gender = predict_gender(sample, ref_file['trained_cutoff'])
        if args.gender:
            actual_gender = args.gender
        sample = gender_correct(sample, actual_gender)
    else:
        actual_gender = 'F'
        if args.gender:
            actual_gender = args.gender

    ref_gender = actual_gender

    logging.info('Normalizing autosomes ...')
    results_r, results_z, results_w, ref_sizes, m_lr, m_z = normalize(
        args, sample, ref_file, 'A')

    # Fall back to the other gonosomal reference when the matching one is
    # not available in the reference file.
    if not ref_file['has_male'] and actual_gender == 'M':
        logging.warning(
            'This sample is male, whilst the reference is created with fewer than 5 males. '
            'The female gonosomal reference will be used for X predictions. Note that these might '
            'not be accurate. If the latter is desired, create a new reference and include more '
            'male samples.')
        ref_gender = 'F'
    elif not ref_file['has_female'] and actual_gender == 'F':
        logging.warning(
            'This sample is female, whilst the reference is created with fewer than 5 females. '
            'The male gonosomal reference will be used for XY predictions. Note that these might '
            'not be accurate. If the latter is desired, create a new reference and include more '
            'female samples.')
        ref_gender = 'M'

    logging.info('Normalizing gonosomes ...')

    null_ratios_aut_per_bin = ref_file['null_ratios']
    n_aut_bins = len(null_ratios_aut_per_bin)
    # The gender-specific null ratios start with the autosomal bins; keep
    # only the part after them.
    null_ratios_gon_per_bin = ref_file['null_ratios.{}'.format(
        ref_gender)][n_aut_bins:]

    results_r_2, results_z_2, results_w_2, ref_sizes_2, _, _ = normalize(
        args, sample, ref_file, ref_gender)

    rem_input = {
        'args': args,
        'wd': str(os.path.dirname(os.path.realpath(__file__))),
        'binsize': int(ref_file['binsize']),
        'n_reads': n_reads,
        'ref_gender': ref_gender,
        'actual_gender': actual_gender,
        'mask': ref_file['mask.{}'.format(ref_gender)],
        'bins_per_chr': ref_file['bins_per_chr.{}'.format(ref_gender)],
        'masked_bins_per_chr':
        ref_file['masked_bins_per_chr.{}'.format(ref_gender)],
        'masked_bins_per_chr_cum':
        ref_file['masked_bins_per_chr_cum.{}'.format(ref_gender)]
    }

    # Everything needed from the reference has been read into memory above;
    # close the archive handle explicitly instead of relying on `del` alone.
    ref_file.close()
    del ref_file

    # --- Merge autosomal and gonosomal results ---
    results_r = np.append(results_r, results_r_2)
    results_z = np.append(results_z, results_z_2) - m_z
    # Cross-scale both weight sets by the other's median, then normalize
    # the combined weights to a median of 1.
    results_w = np.append(results_w * np.nanmedian(results_w_2),
                          results_w_2 * np.nanmedian(results_w))
    results_w = results_w / np.nanmedian(results_w)
    ref_sizes = np.append(ref_sizes, ref_sizes_2)

    # Center each autosomal null sample on its median over the bins that
    # produced a (non-NaN) ratio for this sample.
    null_ratios_aut_per_sample = np.transpose(null_ratios_aut_per_bin)
    part_mask = ~np.isnan(results_r)
    aut_mask = part_mask[:n_aut_bins]
    null_m_lr_aut = np.array([
        np.nanmedian(x[aut_mask]) for x in null_ratios_aut_per_sample
    ])

    null_ratios_aut_per_bin = null_ratios_aut_per_bin - null_m_lr_aut

    null_ratios = np.array([x.tolist() for x in null_ratios_aut_per_bin] +
                           [x.tolist() for x in null_ratios_gon_per_bin])

    results = {
        'results_r': results_r,
        'results_z': results_z,
        'results_w': results_w,
        'results_nr': null_ratios
    }

    for result in results:
        results[result] = get_post_processed_result(args, results[result],
                                                    ref_sizes, rem_input)

    log_trans(results, m_lr)

    if args.blacklist:
        logging.info('Applying blacklist ...')
        apply_blacklist(rem_input, results)

    logging.info('Executing circular binary segmentation ...')
    results['results_c'] = exec_cbs(rem_input, results)

    # --- Output ---
    if args.bed:
        logging.info('Writing tables ...')
        generate_output_tables(rem_input, results)

    if args.plot:
        logging.info('Writing plots ...')
        exec_write_plots(rem_input, results)

    logging.info('Finished prediction')