def outputPennCnv(gtc_file, manifest_file, outFile):
    manifest = BeadPoolManifest(manifest_file)
    gtc = GenotypeCalls(gtc_file)
    genotypes = gtc.get_genotypes()
    LRR = gtc.get_logr_ratios()
    BAF = gtc.get_ballele_freqs()
    with open(outFile, 'w') as output:
        output.write('Name\tChr\tPosition\tGtype\tLog R Ratio\tB Allele Freq\n')
        for (name, chrom, map_info, genotype, lrr, baf) in zip(
                manifest.names, manifest.chroms, manifest.map_infos,
                genotypes, LRR, BAF):
            if genotype == 1:
                geno = 'AA'
            elif genotype == 2:
                geno = 'AB'
            elif genotype == 3:
                geno = 'BB'
            else:
                geno = '--'
            output.write('\t'.join([
                name, chrom, str(map_info), geno, str(lrr), str(baf)
            ]) + '\n')
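# Minimal usage sketch for outputPennCnv (not part of the original script): the
# .gtc/.bpm paths below are hypothetical placeholders, and it assumes
# GenotypeCalls and BeadPoolManifest are imported from IlluminaBeadArrayFiles.
if __name__ == "__main__":
    outputPennCnv(
        gtc_file="sample.gtc",           # hypothetical GTC call file
        manifest_file="array.bpm",       # hypothetical BPM manifest
        outFile="sample_penncnv.txt")    # tab-delimited PennCNV-style output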
def get_manifest(manifest_path, extraction_path, sep=','):
    """
    Extract manifest data about SNPs and write it to the ../data/extracted folder

    :param manifest_path: str - path to file
    :param extraction_path: str - path to directory where extracted files will be stored
    :param sep: str - separator which is used for writing, comma by default
    :return:
    """
    # Add {} to use it later in formatting names
    path_to_save = extraction_path + '/{}'

    # Get data
    manifest = BeadPoolManifest(manifest_path)

    # Check whether the name of the manifest file matches the name coded in the file
    assert manifest.manifest_name == manifest_path.split('/')[-1], \
        "Name of manifest file doesn't match manifest name from file"

    # List of fields which should be extracted from the manifest
    manifest_extract = ['names', 'chroms', 'map_infos', 'ref_strands',
                        'source_strands', 'snps']
    content = []

    # Iterate over attributes of the manifest object
    # (list() keeps the values indexable below, also under Python 3)
    for attr in manifest_extract:
        content.append((attr, list(map(str, getattr(manifest, attr)))))

    # Initialize variables
    name = manifest.manifest_name.split('.')[0] + '_old.csv'
    length = len(content[0][1])
    rows = []

    # Make header
    header = sep.join([content[i][0] for i in range(len(content))])

    # Create normal df structure
    for i in range(length):
        row = sep.join([content[j][1][i] for j in range(len(content))])
        rows.append(row)

    # Write to a file
    with open(path_to_save.format(name), 'w') as dest:
        dest.write(header + '\n' + '\n'.join(rows))
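# Minimal usage sketch for get_manifest (not part of the original module): the
# manifest path and output directory are hypothetical placeholders, and it
# assumes BeadPoolManifest is imported from IlluminaBeadArrayFiles.
if __name__ == "__main__":
    get_manifest(
        manifest_path="data/BovineSNP50_v3_A1.bpm",   # hypothetical .bpm manifest
        extraction_path="data/extracted",             # directory must already exist
        sep=',')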
def __init__(self, bpm_file, logger):
    """
    Initialize a BPM reader with a file path

    Args:
        bpm_file (string): Path to the BPM manifest
        logger (Logger): A logger

    Returns:
        BeadPoolReader
    """
    self.source_file = bpm_file
    self._bpm = BeadPoolManifest(bpm_file)
    self._logger = logger
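# Minimal usage sketch (not from the original file): it assumes the enclosing
# class is named BeadPoolReader, as suggested by the docstring above, and uses a
# hypothetical .bpm path.
import logging

if __name__ == "__main__":
    logger = logging.getLogger("bpm_reader")
    reader = BeadPoolReader("array.bpm", logger)  # wraps BeadPoolManifest internally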
def driver(gtc_dir, manifest_filename, output_filename, project_name, delim, logger):
    logger.info("Reading manifest file")
    bpm = BeadPoolManifest(manifest_filename)

    logger.info("Initializing genotype data")
    gtc_files = []
    for gtc_file in os.listdir(gtc_dir):
        if gtc_file.endswith(".gtc"):
            gtc_files.append(os.path.join(gtc_dir, gtc_file))

    logger.info("Generating report")
    loci = range(len(bpm.normalization_lookups))
    with open(output_filename, "w") as output_handle:
        output_handle.write("DNA Report on " + os.path.abspath(output_filename) + "\n")

        header = [""]
        header.append("# LOCI = {}".format(len(loci)))
        header.append("# DNAs = {}".format(len(gtc_files)))  # one DNA per GTC file
        header.append("ProjectName = {}".format(project_name))
        header.append("GenCall Version = NaN")
        header.append("Low GenCall Score Cutoff = NaN")
        output_handle.write(delim.join(header) + "\n")

        output_handle.write(delim.join(
            "Row,DNA_ID,#No_Calls,#Calls,Call_Rate,A/A_Freq,A/B_Freq,B/B_Freq,Minor_Freq,50%_GC_Score,10%_GC_Score"
            .split(",")) + "\n")

        row = 0
        for gtc_file in gtc_files:
            row += 1
            gtc = GenotypeCalls(gtc_file)
            genotypes = gtc.get_genotypes()
            scores = gtc.get_genotype_scores()
            assert len(genotypes) == len(bpm.names)

            row_data = []
            row_data.append(row)
            row_data.append(gtc.get_sample_name())
            row_data += compute_genotypes(genotypes)
            row_data.append(ScoreStatistics(scores, 50))
            row_data.append(ScoreStatistics(scores, 10))
            output_handle.write(delim.join(map(str, row_data)) + "\n")

    logger.info("Report generation complete")
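# Minimal usage sketch for this DNA-report driver (not part of the original
# script): the directory, manifest and output paths are hypothetical
# placeholders, and a compute_genotypes helper plus a ScoreStatistics class are
# assumed to be defined elsewhere in this script.
import logging

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    driver(
        gtc_dir="gtc_directory",          # hypothetical directory of .gtc files
        manifest_filename="array.bpm",    # hypothetical BPM manifest
        output_filename="dna_report.txt",
        project_name="MyProject",
        delim="\t",
        logger=logging.getLogger("dna_report"))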
delim = "\t" parser = argparse.ArgumentParser( "Generate a final report from a directory of GTC files") parser.add_argument("manifest", help="BPM manifest file") parser.add_argument("gtc_directory", help="Directory containing GTC files") parser.add_argument("output_file", help="Location to write report") args = parser.parse_args() if os.path.isfile(args.output_file): sys.stderr.write("Output file already exists, please delete and re-run\n") sys.exit(-1) try: manifest = BeadPoolManifest(args.manifest) except: sys.stderr.write("Failed to read data from manifest\n") sys.exit(-1) with open(args.output_file, "w") as output_handle: output_handle.write("[Header]\n") output_handle.write( delim.join( ["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")]) + "\n") output_handle.write( delim.join(["Content", os.path.basename(args.manifest)]) + "\n") output_handle.write( delim.join(["Num SNPs", str(len(manifest.names))]) + "\n") output_handle.write(
from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype
import sys
import os
from datetime import datetime

delim = "\t"

if len(sys.argv) < 4:
    sys.stderr.write("Generate a final report from a directory of GTC files\n")
    sys.stderr.write(
        "usage: python gtc_final_report.py <BPM manifest file> <GTC directory> <output file>\n")
    sys.exit(-1)

try:
    names = BeadPoolManifest(sys.argv[1]).names
except Exception:
    sys.stderr.write("Failed to read loci names from manifest\n")
    sys.exit(-1)

output_file = sys.argv[3]
if os.path.isfile(output_file):
    sys.stderr.write("Output file already exists, please delete and re-run\n")
    sys.exit(-1)

with open(output_file, "w") as output_handle:
    output_handle.write("[Header]\n")
    output_handle.write(delim.join(
        ["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")]) + "\n")
def driver(gtc_dir, manifest_filename, cluster_filename, output_filename, project_name, delim, logger):
    logger.info("Reading cluster file")
    with open(cluster_filename, "rb") as cluster_handle:
        egt = ClusterFile.read_cluster_file(cluster_handle)

    logger.info("Reading manifest file")
    bpm = BeadPoolManifest(manifest_filename)

    logger.info("Initializing genotype data")
    gtc_files = []
    for gtc_file in os.listdir(gtc_dir):
        if gtc_file.endswith(".gtc"):
            gtc_files.append(os.path.join(gtc_dir, gtc_file))
    samples = [GenotypeCalls(gtc_file) for gtc_file in gtc_files]

    ls_genotypes = []
    ls_genotype_scores = []
    ls_sample_names = []
    ls_snps = bpm.names
    for sample in samples:
        genotypes = sample.get_genotypes()
        assert len(genotypes) == len(bpm.names)
        ls_genotypes.append(genotypes)
        ls_genotype_scores.append(sample.get_genotype_scores())
        ls_sample_names.append(sample.get_sample_name())

    logger.info("Generating report")
    loci = range(len(bpm.normalization_lookups))
    row = 0
    with open(output_filename, "w") as output_handle:
        output_handle.write("Locus Summary on " + os.path.abspath(output_filename) + "\n")

        header = [""]
        header.append("# LOCI = {}".format(len(loci)))
        header.append("# DNAs = {}".format(len(gtc_files)))
        header.append("ProjectName = {}".format(project_name))
        header.append("GenCall Version = {}".format(egt.gencall_version))
        header.append("Low GenCall Score Cutoff = NaN")
        output_handle.write(delim.join(header) + "\n")

        output_handle.write(delim.join(
            "Row,Locus_Name,Illumicode_Name,#No_Calls,#Calls,Call_Freq,A/A_Freq,A/B_Freq,B/B_Freq,Minor_Freq,Gentrain_Score,50%_GC_Score,10%_GC_Score,Het_Excess_Freq,ChiTest_P100,Cluster_Sep,AA_T_Mean,AA_T_Std,AB_T_Mean,AB_T_Std,BB_T_Mean,BB_T_Std,AA_R_Mean,AA_R_Std,AB_R_Mean,AB_R_Std,BB_R_Mean,BB_R_Std,Plus/Minus Strand"
            .split(",")) + "\n")

        for i in range(len(ls_snps)):
            row += 1
            snp_wise_genotypes = [item[i] for item in ls_genotypes]
            snp_wise_scores = [item[i] for item in ls_genotype_scores]
            locus_summary = summarize_locus(snp_wise_genotypes, snp_wise_scores)
            cluster_record = egt.get_record(ls_snps[i])

            row_data = []
            row_data.append(row)
            row_data.append(ls_snps[i])
            row_data.append(cluster_record.address)
            row_data.append(locus_summary.genotype_counts.no_calls)
            row_data.append(locus_summary.genotype_counts.get_num_calls())
            row_data.append(locus_summary.genotype_counts.get_call_frequency())
            row_data.append(locus_summary.genotype_counts.get_aa_frequency())
            row_data.append(locus_summary.genotype_counts.get_ab_frequency())
            row_data.append(locus_summary.genotype_counts.get_bb_frequency())
            row_data.append(locus_summary.genotype_counts.get_minor_frequency())
            row_data.append(cluster_record.cluster_score.total_score)
            row_data.append(locus_summary.score_stats.gc_50)
            row_data.append(locus_summary.score_stats.gc_10)

            (hw_equilibrium, het_excess) = locus_summary.genotype_counts.compute_hardy_weinberg()
            row_data.append(het_excess)
            row_data.append(hw_equilibrium)

            row_data.append(cluster_record.cluster_score.cluster_separation)

            for cluster_stats in (cluster_record.aa_cluster_stats,
                                  cluster_record.ab_cluster_stats,
                                  cluster_record.bb_cluster_stats):
                row_data.append(cluster_stats.theta_mean)
                row_data.append(cluster_stats.theta_dev)

            for cluster_stats in (cluster_record.aa_cluster_stats,
                                  cluster_record.ab_cluster_stats,
                                  cluster_record.bb_cluster_stats):
                row_data.append(cluster_stats.r_mean)
                row_data.append(cluster_stats.r_dev)

            if len(bpm.ref_strands) > 0:
                row_data.append(RefStrand.to_string(bpm.ref_strands[i]))
            else:
                row_data.append("U")

            output_handle.write(delim.join(map(str, row_data)) + "\n")
logger.info("Report generation complete")
def extract(gtc_path, extraction_path,
            manifest_path="/home/ailin/repo_new/data/BovineSNP50_v3_A1.bpm"):
    """
    Extract genotyping data - ballele_freqs, base_calls, genotypes, genotype_scores,
    logr_ratios, raw_x_intensities, raw_y_intensities, normalized_intensities, names,
    chroms, map_infos, ref_strands, source_strands and snps - and write it to a file

    Also extract general sample information - call_rate, cluster_file, gender,
    imaging_date, autocall_date, scanner_data, snp_manifest, is_write_complete,
    sample_name, sample_plate, sample_well - and write it to a .sinfo file

    :param gtc_path: str - path to gtc with genotyping data
    :param extraction_path: str - path to directory where extracted files will be stored
    :param manifest_path: str - path to manifest file used for creation of this gtc
    :return:
    """
    # Add {} to use it later in formatting names
    path_to_save = extraction_path + '/{}'

    # Get gtc and manifest objects
    gtc = GenotypeCalls(gtc_path)
    manifest = BeadPoolManifest(manifest_path)

    # Structure for ordered names and methods of gtc fields
    field = namedtuple('field', ['name', 'method'])

    # List of fields which should be extracted from gtc
    gtc_extract = [
        field('ballele_freqs', GenotypeCalls.get_ballele_freqs),
        field('genotypes', GenotypeCalls.get_genotypes),
        field('genotype_scores', GenotypeCalls.get_genotype_scores),
        field('logr_ratios', GenotypeCalls.get_logr_ratios),
        field('raw_x_intensities', GenotypeCalls.get_raw_x_intensities),
        field('raw_y_intensities', GenotypeCalls.get_raw_y_intensities),
        field('normalized_intensities',
              lambda x: GenotypeCalls.get_normalized_intensities(
                  x, manifest.normalization_lookups))
    ]

    # I don't see a place in the db where we use this data
    # List of fields which correspond to a whole sample and are extracted from gtc
    sample_info = [
        field('call_rate', GenotypeCalls.get_call_rate),
        field('cluster_file', GenotypeCalls.get_cluster_file),
        field('gender', GenotypeCalls.get_gender),
        field('imaging_date', GenotypeCalls.get_imaging_date),
        field('autocall_date', GenotypeCalls.get_autocall_date),
        field('scanner_data', GenotypeCalls.get_scanner_data),
        field('snp_manifest', GenotypeCalls.get_snp_manifest),
        field('is_write_complete', GenotypeCalls.is_write_complete),
        field('sample_name', GenotypeCalls.get_sample_name),
        field('sample_plate', GenotypeCalls.get_sample_plate),
        field('sample_well', GenotypeCalls.get_sample_well)
    ]

    # Containers for data
    content = []
    general_info = []

    # Get content from gtc
    # Iterate over fields which should be extracted from gtc, transform them to str
    for name, method in gtc_extract:
        res = method(gtc)
        # For normalized_intensities divide the list of (x, y) intensities into lists of x and y
        if name != 'normalized_intensities':
            if not isinstance(res, str):
                try:
                    # list() keeps the values indexable below (and raises TypeError
                    # eagerly for non-iterable results, also under Python 3)
                    res = list(map(str, res))
                except TypeError:
                    res = str(res)
            content.append((name, res))
        else:
            # Compute r and theta values
            polar = list(map(NormalizationTransform.rect_to_polar, res))
            content.append(('normalized_x_intensities', [str(x) for x, y in res]))
            content.append(('normalized_y_intensities', [str(y) for x, y in res]))
            content.append(('r', [str(r) for r, theta in polar]))
            content.append(('theta', [str(theta) for r, theta in polar]))

    # Get base calls and their forward encoding
    base_calls = GenotypeCalls.get_base_calls(gtc)
    genotype_forward = GenotypeCalls.get_base_calls_forward_strand(
        gtc, base_calls, [SourceStrand.Forward for i in range(len(base_calls))])

    # Write them to collection
    content.append(('base_calls', base_calls))
    content.append(('genotype_forward', genotype_forward))

    # Iterate over sample information attributes of the gtc object
    for name, method in sample_info:
        res = str(method(gtc))
        general_info.append((name, res))

    # Initialize variables
    length = len(content[0][1])
    sep = ','
    rows = []

    # Make header
    header = sep.join([content[i][0] for i in range(len(content))])

    try:
        # Create normal df structure
        for i in range(length):
            row = sep.join([content[j][1][i] for j in range(len(content))])
            rows.append(row)
    except Exception:
        print(gtc_path)
        return

    # File names
    name = gtc_path.split('/')[-1].split('.')[0]
    sinfo_name = name + '_old.sinfo'
    data_name = name + '_old.csv'

    # Write data to a file
    with open(path_to_save.format(data_name), 'w') as dest:
        dest.write(header + '\n' + '\n'.join(rows))

    # Write sample information to a file
    with open(path_to_save.format(sinfo_name), 'w') as dest:
        dest.write('\n'.join([sep.join(record) for record in general_info]))
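# Minimal usage sketch for extract (not part of the original module): paths are
# hypothetical placeholders, and the required imports (GenotypeCalls,
# BeadPoolManifest, NormalizationTransform, SourceStrand from
# IlluminaBeadArrayFiles, plus namedtuple from collections) are assumed to be
# present at the top of this file.
if __name__ == "__main__":
    extract(
        gtc_path="data/sample_001.gtc",        # hypothetical GTC file
        extraction_path="data/extracted",      # directory must already exist
        manifest_path="data/BovineSNP50_v3_A1.bpm")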