Example #1
from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest


def outputPennCnv(gtc_file, manifest_file, outFile):
    """Write a PennCNV-style input table (Name, Chr, Position, Gtype, LRR, BAF) for one GTC file."""
    manifest = BeadPoolManifest(manifest_file)
    gtc = GenotypeCalls(gtc_file)
    genotypes = gtc.get_genotypes()
    LRR = gtc.get_logr_ratios()
    BAF = gtc.get_ballele_freqs()
    with open(outFile, 'w') as output:
        output.write(
            'Name\tChr\tPosition\tGtype\tLog R Ratio\tB Allele Freq\n')
        for (name, chrom, map_info, genotype, lrr,
             baf) in zip(manifest.names, manifest.chroms, manifest.map_infos,
                         genotypes, LRR, BAF):
            if genotype == 1:
                geno = 'AA'
            elif genotype == 2:
                geno = 'AB'
            elif genotype == 3:
                geno = 'BB'
            else:
                geno = '--'
            output.write('\t'.join([
                name, chrom,
                str(map_info), geno,
                str(lrr), str(baf)
            ]) + '\n')
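
A minimal usage sketch for the function above; the GTC, manifest, and output file names are hypothetical placeholders:

# Hypothetical paths: substitute a real GTC file and its matching BPM manifest.
outputPennCnv('sample_001.gtc', 'BovineSNP50_v3_A1.bpm', 'sample_001_penncnv.txt')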
Example #2
from IlluminaBeadArrayFiles import BeadPoolManifest


def get_manifest(manifest_path, extraction_path, sep=','):
    """
    Extract manifest data about SNPs and write it as a CSV file into extraction_path
    :param manifest_path: str - path to file
    :param extraction_path: str - path to directory where extracted files will be stored
    :param sep: str - separator which is used for writing, comma by default
    :return:
    """
    # Add {} to use it later in formatting names
    path_to_save = extraction_path + '/{}'
    # Get data
    manifest = BeadPoolManifest(manifest_path)

    # Check whether name of manifest is the same as coded in the file
    assert manifest.manifest_name == manifest_path.split('/')[-1], \
        "Manifest file name doesn't match the manifest name recorded in the file"

    # List of fields which should be extracted from the manifest
    manifest_extract = ['names', 'chroms', 'map_infos', 'ref_strands', 'source_strands', 'snps']

    content = []
    # Iterate over attributes of manifest object
    for attr in manifest_extract:
        content.append((attr, list(map(str, getattr(manifest, attr)))))

    # Initialize variables
    name = manifest.manifest_name.split('.')[0] + '_old.csv'
    length = len(content[0][1])
    rows = []

    # Make header
    header = sep.join([content[i][0] for i in range(len(content))])

    # Create normal df structure
    for i in range(length):
        row = sep.join([content[j][1][i] for j in range(len(content))])
        rows.append(row)

    # Write to a file
    with open(path_to_save.format(name), 'w') as dest:
        dest.write(header + '\n' + '\n'.join(rows))
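
A short usage sketch with hypothetical paths; note that the assert above requires the file's basename to equal the manifest name recorded inside the manifest:

# Hypothetical paths; output lands in <extraction_path>/<manifest name>_old.csv.
get_manifest('manifests/BovineSNP50_v3_A1.bpm', 'data/extracted')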
Example #3
    def __init__(self, bpm_file, logger):
        """
        Initialize a BPM reader with a file path

        Args:
            bpm_file (string): Path to the BPM manifest
            logger (Logger) : A logger

        Returns:
            BeadPoolReader
        """
        self.source_file = bpm_file
        self._bpm = BeadPoolManifest(bpm_file)
        self._logger = logger
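
A usage sketch; the class name BeadPoolReader is taken from the docstring's stated return type and, like the manifest path, is an assumption beyond this excerpt:

import logging

# Hypothetical wiring: any logging.Logger works; the BPM path is a placeholder.
logger = logging.getLogger("bpm_reader")
reader = BeadPoolReader("BovineSNP50_v3_A1.bpm", logger)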
Example #4
def driver(gtc_dir, manifest_filename, output_filename, project_name, delim, logger):
    logger.info("Reading manifest file")
    bpm = BeadPoolManifest(manifest_filename)

    logger.info("Initializing genotype data")
    gtc_files = []
    for gtc_file in os.listdir(gtc_dir):
        if gtc_file.endswith(".gtc"):
            gtc_files.append(os.path.join(gtc_dir, gtc_file))

    logger.info("Generating report")
    loci = range(len(bpm.normalization_lookups))
    with open(output_filename, "w") as output_handle:
        output_handle.write("DNA Report on " + os.path.abspath(output_filename) + "\n")
        header = [""]
        header.append("# LOCI = {}".format(len(loci)))
        header.append("# DNAs = {}".format(len(gtc_files)))
        header.append("ProjectName = {}".format(project_name))
        header.append("GenCall Version = NaN")
        header.append("Low GenCall Score Cutoff = NaN")

        output_handle.write(delim.join(header) + "\n")
        output_handle.write(delim.join("Row,DNA_ID,#No_Calls,#Calls,Call_Rate,A/A_Freq,A/B_Freq,B/B_Freq,Minor_Freq,50%_GC_Score,10%_GC_Score".split(",")) + "\n")
        row = 0
        for gtc_file in gtc_files:
            row += 1
            gtc = GenotypeCalls(gtc_file)
            genotypes = gtc.get_genotypes()
            scores = gtc.get_genotype_scores()
            assert len(genotypes) == len(bpm.names)
            row_data = []
            row_data.append(row)
            row_data.append(gtc.get_sample_name())
            row_data += compute_genotypes(genotypes)
            row_data.append(ScoreStatistics(scores, 50))
            row_data.append(ScoreStatistics(scores, 10))
            output_handle.write(delim.join(map(str, row_data)) + "\n")
        logger.info("Report generation complete")
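
compute_genotypes and ScoreStatistics are defined elsewhere in the script this excerpt comes from. A hedged reconstruction of what compute_genotypes could return, based on the genotype coding seen in Example #1 (0 = no-call, 1 = AA, 2 = AB, 3 = BB) and the report columns #No_Calls through Minor_Freq; the original implementation may differ:

def compute_genotypes(genotypes):
    # Hypothetical sketch: per-sample genotype counts and frequencies.
    no_calls = sum(1 for g in genotypes if g == 0)
    aa = sum(1 for g in genotypes if g == 1)
    ab = sum(1 for g in genotypes if g == 2)
    bb = sum(1 for g in genotypes if g == 3)
    calls = aa + ab + bb
    call_rate = calls / float(len(genotypes)) if len(genotypes) else 0.0
    aa_freq, ab_freq, bb_freq = [x / float(calls) if calls else 0.0
                                 for x in (aa, ab, bb)]
    # Minor allele frequency among called genotypes
    minor_freq = (min(2 * aa + ab, 2 * bb + ab) / float(2 * calls)
                  if calls else 0.0)
    return [no_calls, calls, call_rate, aa_freq, ab_freq, bb_freq, minor_freq]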
Example #5
import argparse
import os
import sys
from datetime import datetime

from IlluminaBeadArrayFiles import BeadPoolManifest

delim = "\t"

parser = argparse.ArgumentParser(
    "Generate a final report from a directory of GTC files")
parser.add_argument("manifest", help="BPM manifest file")
parser.add_argument("gtc_directory", help="Directory containing GTC files")
parser.add_argument("output_file", help="Location to write report")

args = parser.parse_args()

if os.path.isfile(args.output_file):
    sys.stderr.write("Output file already exists, please delete and re-run\n")
    sys.exit(-1)

try:
    manifest = BeadPoolManifest(args.manifest)
except Exception:
    sys.stderr.write("Failed to read data from manifest\n")
    sys.exit(-1)

with open(args.output_file, "w") as output_handle:
    output_handle.write("[Header]\n")
    output_handle.write(
        delim.join(
            ["Processing Date",
             datetime.now().strftime("%m/%d/%Y %I:%M %p")]) + "\n")
    output_handle.write(
        delim.join(["Content", os.path.basename(args.manifest)]) + "\n")
    output_handle.write(
        delim.join(["Num SNPs", str(len(manifest.names))]) + "\n")
    output_handle.write(
Example #6
from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype
import sys
import os
from datetime import datetime

delim = "\t"

if len(sys.argv) < 4:
    sys.stderr.write("Generate a final report from a directory of GTC files\n")
    sys.stderr.write(
        "usage: python gtc_final_report.py <BPM manifest file> <GTC directory> <output file>\n"
    )
    sys.exit(-1)

try:
    names = BeadPoolManifest(sys.argv[1]).names
except Exception:
    sys.stderr.write("Failed to read loci names from manifest\n")
    sys.exit(-1)

output_file = sys.argv[3]

if os.path.isfile(output_file):
    sys.stderr.write("Output file already exists, please delete and re-run\n")
    sys.exit(-1)

with open(output_file, "w") as output_handle:
    output_handle.write("[Header]\n")
    output_handle.write(
        delim.join(
            ["Processing Date",
Example #7
def driver(gtc_dir, manifest_filename, cluster_filename, output_filename,
           project_name, delim, logger):
    logger.info("Reading cluster file")
    with open(cluster_filename, "rb") as cluster_handle:
        egt = ClusterFile.read_cluster_file(cluster_handle)

    logger.info("Reading manifest file")
    bpm = BeadPoolManifest(manifest_filename)

    logger.info("Initializing genotype data")
    gtc_files = []
    for gtc_file in os.listdir(gtc_dir):
        if gtc_file.endswith(".gtc"):
            gtc_files.append(os.path.join(gtc_dir, gtc_file))

    samples = [GenotypeCalls(gtc_file) for gtc_file in gtc_files]

    ls_genotypes = []
    ls_genotype_scores = []
    ls_sample_names = []
    ls_snps = bpm.names
    for sample in samples:
        genotypes = sample.get_genotypes()
        assert len(genotypes) == len(bpm.names)
        ls_genotypes.append(genotypes)
        ls_genotype_scores.append(sample.get_genotype_scores())
        ls_sample_names.append(sample.get_sample_name())

    logger.info("Generating report")
    loci = range(len(bpm.normalization_lookups))
    row = 0
    with open(output_filename, "w") as output_handle:
        output_handle.write("Locus Summary on " +
                            os.path.abspath(output_filename) + "\n")
        header = [""]
        header.append("# LOCI = {}".format(len(loci)))
        header.append("# DNAs = {}".format(len(gtc_files)))
        header.append("ProjectName = {}".format(project_name))
        header.append("GenCall Version = {}".format(egt.gencall_version))
        header.append("Low GenCall Score Cutoff = NaN")

        output_handle.write(delim.join(header) + "\n")

        output_handle.write(
            delim.join(
                "Row,Locus_Name,Illumicode_Name,#No_Calls,#Calls,Call_Freq,A/A_Freq,A/B_Freq,B/B_Freq,Minor_Freq,Gentrain_Score,50%_GC_Score,10%_GC_Score,Het_Excess_Freq,ChiTest_P100,Cluster_Sep,AA_T_Mean,AA_T_Std,AB_T_Mean,AB_T_Std,BB_T_Mean,BB_T_Std,AA_R_Mean,AA_R_Std,AB_R_Mean,AB_R_Std,BB_R_Mean,BB_R_Std,Plus/Minus Strand"
                .split(",")) + "\n")
        for i in range(0, len(ls_snps)):
            row += 1
            snp_wise_genotypes = [item[i] for item in ls_genotypes]
            snp_wise_scores = [item[i] for item in ls_genotype_scores]
            locus_summary = summarize_locus(snp_wise_genotypes,
                                            snp_wise_scores)
            cluster_record = egt.get_record(ls_snps[i])
            row_data = []
            row_data.append(row)
            row_data.append(ls_snps[i])
            row_data.append(cluster_record.address)
            row_data.append(locus_summary.genotype_counts.no_calls)
            row_data.append(locus_summary.genotype_counts.get_num_calls())
            row_data.append(locus_summary.genotype_counts.get_call_frequency())
            row_data.append(locus_summary.genotype_counts.get_aa_frequency())
            row_data.append(locus_summary.genotype_counts.get_ab_frequency())
            row_data.append(locus_summary.genotype_counts.get_bb_frequency())
            row_data.append(
                locus_summary.genotype_counts.get_minor_frequency())
            row_data.append(cluster_record.cluster_score.total_score)
            row_data.append(locus_summary.score_stats.gc_50)
            row_data.append(locus_summary.score_stats.gc_10)

            (hw_equilibrium, het_excess
             ) = locus_summary.genotype_counts.compute_hardy_weinberg()
            row_data.append(het_excess)
            row_data.append(hw_equilibrium)

            row_data.append(cluster_record.cluster_score.cluster_separation)

            for cluster_stats in (cluster_record.aa_cluster_stats,
                                  cluster_record.ab_cluster_stats,
                                  cluster_record.bb_cluster_stats):
                row_data.append(cluster_stats.theta_mean)
                row_data.append(cluster_stats.theta_dev)

            for cluster_stats in (cluster_record.aa_cluster_stats,
                                  cluster_record.ab_cluster_stats,
                                  cluster_record.bb_cluster_stats):
                row_data.append(cluster_stats.r_mean)
                row_data.append(cluster_stats.r_dev)

            if len(bpm.ref_strands) > 0:
                row_data.append(RefStrand.to_string(bpm.ref_strands[i]))
            else:
                row_data.append("U")
            output_handle.write(delim.join(map(str, row_data)) + "\n")
        logger.info("Report generation complete")
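
summarize_locus is a helper defined elsewhere in the original script. A usage sketch for this driver, with a standard-library logger and placeholder paths:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("locus_summary")
# All paths and the project name are placeholders.
driver("gtc_dir", "BovineSNP50_v3_A1.bpm", "BovineSNP50_v3_A1.egt",
       "locus_summary.txt", "MyProject", "\t", logger)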
Example #8
from collections import namedtuple

from IlluminaBeadArrayFiles import (GenotypeCalls, BeadPoolManifest,
                                    NormalizationTransform, SourceStrand)


def extract(gtc_path,
            extraction_path,
            manifest_path="/home/ailin/repo_new/data/BovineSNP50_v3_A1.bpm"):
    """
    Extract genotyping data -
    ballele_freqs, base_calls, genotypes, genotype_scores, logr_ratios, raw_x_intensities, raw_y_intensities,
    normalized_intensities, names, chroms, map_infos, ref_strands, source_strands and snps -
    and write them to a .csv file
    Also extract general sample information -
    call_rate, cluster_file, gender, imaging_date, autocall_date, scanner_data, snp_manifest, is_write_complete,
    sample_name, sample_plate, sample_well -
    and write it to a .sinfo file
    :param gtc_path: str - path to gtc with genotyping data
    :param extraction_path: str - path to directory where extracted files will be stored
    :param manifest_path: str - path to manifest file used for creation of this gtc
    :return:
    """
    # Add {} to use it later in formatting names
    path_to_save = extraction_path + '/{}'
    # Get gtc and manifest objects
    gtc = GenotypeCalls(gtc_path)
    manifest = BeadPoolManifest(manifest_path)

    # Structure for ordered names and methods of gtc fields
    field = namedtuple('field', ['name', 'method'])

    # List of fields which should be extracted from gtc
    gtc_extract = [
        field('ballele_freqs', GenotypeCalls.get_ballele_freqs),
        field('genotypes', GenotypeCalls.get_genotypes),
        field('genotype_scores', GenotypeCalls.get_genotype_scores),
        field('logr_ratios', GenotypeCalls.get_logr_ratios),
        field('raw_x_intensities', GenotypeCalls.get_raw_x_intensities),
        field('raw_y_intensities', GenotypeCalls.get_raw_y_intensities),
        field(
            'normalized_intensities',
            lambda x: GenotypeCalls.get_normalized_intensities(
                x, manifest.normalization_lookups))
    ]

    # Note: this sample-level data is not currently consumed by the database; it is extracted for completeness
    # List of fields which correspond to a whole sample and extracted from gtc
    sample_info = [
        field('call_rate', GenotypeCalls.get_call_rate),
        field('cluster_file', GenotypeCalls.get_cluster_file),
        field('gender', GenotypeCalls.get_gender),
        field('imaging_date', GenotypeCalls.get_imaging_date),
        field('autocall_date', GenotypeCalls.get_autocall_date),
        field('scanner_data', GenotypeCalls.get_scanner_data),
        field('snp_manifest', GenotypeCalls.get_snp_manifest),
        field('is_write_complete', GenotypeCalls.is_write_complete),
        field('sample_name', GenotypeCalls.get_sample_name),
        field('sample_plate', GenotypeCalls.get_sample_plate),
        field('sample_well', GenotypeCalls.get_sample_well)
    ]

    # Containers for data
    content = []
    general_info = []

    # Get content from gtc
    # Iterate over fields which should be extracted in gtc, transform them to str
    for name, method in gtc_extract:
        res = method(gtc)
        # For normalized_intensities divide the list of (x, y) intensities into lists of x and y
        if name != 'normalized_intensities':
            if not isinstance(res, str):
                try:
                    # Materialize the map so len() and indexing work in Python 3
                    res = list(map(str, res))
                except TypeError:
                    res = str(res)
            content.append((name, res))
        else:
            # Compute r and theta values
            # Materialize the polar coordinates; this sequence is iterated twice below
            polar = [NormalizationTransform.rect_to_polar(xy) for xy in res]

            content.append(
                ('normalized_x_intensities', [str(x) for x, y in res]))
            content.append(
                ('normalized_y_intensities', [str(y) for x, y in res]))
            content.append(('r', [str(r) for r, theta in polar]))
            content.append(('theta', [str(theta) for r, theta in polar]))

    # Get base calls and their forward encoding
    base_calls = GenotypeCalls.get_base_calls(gtc)
    genotype_forward = GenotypeCalls.get_base_calls_forward_strand(
        gtc, base_calls,
        [SourceStrand.Forward for i in range(len(base_calls))])
    # Write them to collection
    content.append(('base_calls', base_calls))
    content.append(('genotype_forward', genotype_forward))

    # Iterate over sample information attributes of gtc object
    for name, method in sample_info:
        res = str(method(gtc))
        general_info.append((name, res))

    # Initialize variables
    length = len(content[0][1])
    sep = ','
    rows = []

    # Make header
    header = sep.join([content[i][0] for i in range(len(content))])

    try:
        # Create normal df structure
        for i in range(length):
            row = sep.join([content[j][1][i] for j in range(len(content))])
            rows.append(row)
    except Exception:
        print(gtc_path)
        return

    # File names
    name = gtc_path.split('/')[-1].split('.')[0]
    sinfo_name = name + '_old.sinfo'
    data_name = name + '_old.csv'

    # Write data to a file
    with open(path_to_save.format(data_name), 'w') as dest:
        dest.write(header + '\n' + '\n'.join(rows))

    # Write sample information to a file
    with open(path_to_save.format(sinfo_name), 'w') as dest:
        dest.write('\n'.join([sep.join(record) for record in general_info]))
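
A usage sketch; the default manifest_path above is specific to the original author's machine, so a caller would normally override it with placeholders like these:

# Hypothetical paths: a real GTC file, an output directory, and the matching BPM manifest.
extract('gtc/sample_001.gtc', 'data/extracted',
        manifest_path='manifests/BovineSNP50_v3_A1.bpm')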