示例#1
0
def outputPennCnv(gtc_file, manifest_file, outFile):
    """Write a tab-separated per-locus signal table for one GTC sample.

    The manifest supplies the locus name, chromosome and position; the GTC
    file supplies the genotype code, Log R Ratio and B Allele Frequency.
    The columns are zipped together, so the table is as long as the
    shortest of them.

    :param gtc_file: path handed to ``GenotypeCalls``
    :param manifest_file: path handed to ``BeadPoolManifest``
    :param outFile: path of the text file to create (overwritten if present)
    """
    # Genotype code -> label; every other code is written as '--'.
    call_labels = {1: 'AA', 2: 'AB', 3: 'BB'}

    bpm = BeadPoolManifest(manifest_file)
    sample = GenotypeCalls(gtc_file)
    calls = sample.get_genotypes()
    log_r_ratios = sample.get_logr_ratios()
    b_allele_freqs = sample.get_ballele_freqs()

    with open(outFile, 'w') as output:
        output.write(
            'Name\tChr\tPosition\tGtype\tLog R Ratio\tB Allele Freq\n')
        loci = zip(bpm.names, bpm.chroms, bpm.map_infos, calls,
                   log_r_ratios, b_allele_freqs)
        for locus_name, chromosome, position, call, lrr, baf in loci:
            fields = [
                locus_name,
                chromosome,
                str(position),
                call_labels.get(call, '--'),
                str(lrr),
                str(baf),
            ]
            output.write('\t'.join(fields) + '\n')
示例#2
0
def driver(gtc_files, manifest_reader, genome_reader, output_vcf_files,
           expand_identifiers, unsquash_duplicates, auxiliary_records,
           attrs_to_include, logger):
    """
    Write one VCF file per (gtc_file, output_vcf_file) pair.

    Locus entries are built once from ``manifest_reader`` and reused for
    every pair. For a truthy ``gtc_file`` the GTC is loaded; if the manifest
    name recorded in the GTC does not match ``manifest_reader.source_file``
    (base names without extension, compared case-insensitively) a warning is
    logged and the pair is skipped. Otherwise sample calls are added to every
    locus entry that has a VCF record. For a falsy ``gtc_file`` a VCF without
    samples is written.

    Each VCF is first written to a temporary file in the destination
    directory and then renamed onto ``output_vcf_file``; the temporary file
    is removed if writing or renaming fails.

    Args:
        gtc_files: sequence of GTC paths (falsy entries mean "no GTC");
            ``gtc_files[0] is None`` is passed to ``FormatFactory``
        manifest_reader: manifest object passed to
            ``LocusEntryFactory.create_locus_entries``; must expose
            ``source_file``
        genome_reader: reference object passed to the factories; must expose
            ``get_contig_order()``
        output_vcf_files: output paths, paired positionally with gtc_files
        expand_identifiers: forwarded to ``VcfRecordFactory``
        unsquash_duplicates: forwarded to ``LocusEntryFactory``
        auxiliary_records: forwarded to ``VcfRecordFactory``
        attrs_to_include: forwarded to ``FormatFactory``
        logger: logger exposing ``info`` and ``warning``

    Returns:
        None
    """
    format_factory = FormatFactory(gtc_files[0] is None, attrs_to_include,
                                   logger)
    reader_template_factory = ReaderTemplateFactory(
        genome_reader, format_factory, "4.1", "gtc_to_vcf " + VERSION,
        genome_reader.get_contig_order(), logger)
    vcf_record_factory = VcfRecordFactory(format_factory, genome_reader,
                                          expand_identifiers,
                                          auxiliary_records, logger)
    locus_entries = LocusEntryFactory(
        vcf_record_factory, genome_reader.get_contig_order(),
        unsquash_duplicates, logger).create_locus_entries(manifest_reader)

    for (gtc_file, output_vcf_file) in zip(gtc_files, output_vcf_files):
        if gtc_file:
            logger.info("Handling GTC file " + gtc_file)
            gtc = GenotypeCalls(gtc_file)
            # Read the manifest name recorded in the GTC once and reuse it
            # for the comparison and for the log messages.
            gtc_manifest = gtc.get_snp_manifest()
            gtc_manifest_stem = os.path.splitext(
                os.path.basename(gtc_manifest))[0].lower()
            provided_manifest_stem = os.path.splitext(
                os.path.basename(manifest_reader.source_file))[0].lower()
            if gtc_manifest_stem != provided_manifest_stem:
                logger.warning("Provided manifest name: " +
                               manifest_reader.source_file +
                               " and manifest file used to generate GTC file: " +
                               gtc_manifest + " do not match, skipping")
                continue
            logger.info(
                "Manifest file used for GTC conversion identified as: " +
                gtc_manifest)
            sample_name = get_sample_name(gtc, gtc_file)
            reader_template = reader_template_factory.create_reader_template(
                [sample_name])

            call_factory = CallFactory(format_factory.create_formats(gtc),
                                       sample_name, logger)
            for entry in locus_entries:
                if entry.vcf_record:
                    entry.add_sample(call_factory, False)
        else:
            logger.info("GTC file not provided")
            reader_template = reader_template_factory.create_reader_template(
                [])

        output_vcf_file = os.path.abspath(output_vcf_file)
        # NOTE(review): tempfile.mktemp is deprecated because the name can be
        # claimed by another process before open(); it is kept here because
        # mkstemp/NamedTemporaryFile would create the file with 0600
        # permissions and so change the mode of the final VCF.
        output_vcf_file_temp = tempfile.mktemp(
            dir=os.path.dirname(output_vcf_file), suffix=".vcf")
        renamed = False
        try:
            with open(output_vcf_file_temp, "w") as output_handle:
                vcf_writer = Writer(output_handle, reader_template)
                for entry in locus_entries:
                    if not entry.vcf_record:
                        logger.warning("Could not create record for: " +
                                       entry.bpm_records[0].name)
                        continue
                    vcf_writer.write_record(entry.vcf_record)
            os.rename(output_vcf_file_temp, output_vcf_file)
            renamed = True
        finally:
            # Do not leave a partially written temp file next to the output
            # when writing or renaming failed.
            if not renamed and os.path.exists(output_vcf_file_temp):
                try:
                    os.remove(output_vcf_file_temp)
                except OSError:
                    pass
示例#3
0
def driver(gtc_dir, manifest_filename, output_filename, project_name, delim, logger):
    """
    Write a delimited "DNA Report" with one row per GTC file in a directory.

    The report starts with a title line and a header line (locus count,
    sample count, project name, two fixed "NaN" fields), followed by a
    column-name line and one data row per ``*.gtc`` file found directly in
    ``gtc_dir``. Rows are numbered from 1 in ``os.listdir`` order.

    Args:
        gtc_dir: directory scanned (non-recursively) for files ending ".gtc"
        manifest_filename: path passed to ``BeadPoolManifest``
        output_filename: path of the report to create (overwritten)
        project_name: value written into the "ProjectName" header field
        delim: field delimiter used for the header and every row
        logger: logger exposing ``info``

    Raises:
        AssertionError: if a GTC file's genotype count differs from the
            number of names in the manifest.
        SystemExit: always, once the report has been written (see the note
            at the end of the function).
    """
    logger.info("Reading manifest file")
    bpm = BeadPoolManifest(manifest_filename)

    logger.info("Initializing genotype data")
    gtc_files = [
        os.path.join(gtc_dir, gtc_file)
        for gtc_file in os.listdir(gtc_dir)
        if gtc_file.endswith(".gtc")
    ]

    logger.info("Generating report")
    num_loci = len(bpm.normalization_lookups)
    with open(output_filename, "w") as output_handle:
        output_handle.write("DNA Report on " + os.path.abspath(output_filename) + "\n")
        header = [
            "",
            "# LOCI = {}".format(num_loci),
            # Count the GTC files that will actually be reported; the
            # previous code counted a list that was never filled, so this
            # field was always 0.
            "# DNAs = {}".format(len(gtc_files)),
            "ProjectName = {}".format(project_name),
            "GenCall Version = NaN",
            "Low GenCall Score Cutoff = NaN",
        ]

        output_handle.write(delim.join(header) + "\n")
        output_handle.write(delim.join("Row,DNA_ID,#No_Calls,#Calls,Call_Rate,A/A_Freq,A/B_Freq,B/B_Freq,Minor_Freq,50%_GC_Score,10%_GC_Score".split(",")) + "\n")
        for row, gtc_file in enumerate(gtc_files, 1):
            gtc = GenotypeCalls(gtc_file)
            genotypes = gtc.get_genotypes()
            scores = gtc.get_genotype_scores()
            assert len(genotypes) == len(bpm.names)
            row_data = [row, gtc.get_sample_name()]
            row_data += compute_genotypes(genotypes)
            row_data.append(ScoreStatistics(scores, 50))
            row_data.append(ScoreStatistics(scores, 10))
            output_handle.write(delim.join(map(str, row_data)) + "\n")
        logger.info("Report generation complete")
    # NOTE(review): this terminates the process with a failure status even
    # though the report was written successfully; it is preserved because
    # callers may rely on it, but it looks unintended - confirm and remove.
    sys.exit(-1)

with open(args.output_file, "w") as output_handle:
    output_handle.write("[Header]\n")
    output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n")
    output_handle.write(delim.join(["Content", os.path.basename(args.manifest)]) + "\n")
    output_handle.write(delim.join(["Num SNPs", str(len(manifest.names))]) + "\n")
    output_handle.write(delim.join(["Total SNPs", str(len(manifest.names))]) + "\n")

    samples = []
    for gtc_file in os.listdir(args.gtc_directory):
        if gtc_file.lower().endswith(".gtc"):
            samples.append(gtc_file)

    output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n")
    output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n")

    output_handle.write("[Data]\n")
    output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward"]) + "\n")
    for gtc_file in samples:
        sys.stderr.write("Processing " + gtc_file + "\n")
        gtc_file = os.path.join(args.gtc_directory, gtc_file)
        gtc = GenotypeCalls(gtc_file)
        genotypes = gtc.get_genotypes()
        plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands)
        forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands)

        assert len(genotypes) == len(manifest.names)
        for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes):
            output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype]) + "\n")
            samples.append(gtc_file)

    output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n")
    output_handle.write(
        delim.join(["Total Samples", str(len(samples))]) + "\n")

    output_handle.write("[Data]\n")
    output_handle.write(
        delim.join([
            "SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB",
            "Alleles - Plus", "Alleles - Forward", "X", "Y"
        ]) + "\n")
    for gtc_file in samples:
        sys.stderr.write("Processing " + gtc_file + "\n")
        gtc_file = os.path.join(args.gtc_directory, gtc_file)
        gtc = GenotypeCalls(gtc_file)
        genotypes = gtc.get_genotypes()
        plus_strand_genotypes = gtc.get_base_calls_plus_strand(
            manifest.snps, manifest.ref_strands)
        forward_strand_genotypes = gtc.get_base_calls_forward_strand(
            manifest.snps, manifest.source_strands)
        normalized_intensities = gtc.get_normalized_intensities(
            manifest.normalization_lookups)

        assert len(genotypes) == len(manifest.names)
        for (name, chrom, map_info, genotype, ref_strand_genotype,
             source_strand_genotype, (x_norm, y_norm)) in zip(
                 manifest.names, manifest.chroms, manifest.map_infos,
                 genotypes, plus_strand_genotypes, forward_strand_genotypes,
                 normalized_intensities):
            output_handle.write(
示例#6
0
             datetime.now().strftime("%m/%d/%Y %I:%M %p")]) + "\n")
    output_handle.write(
        delim.join(["Content", os.path.basename(sys.argv[1])]) + "\n")
    output_handle.write(delim.join(["Num SNPs", str(len(names))]) + "\n")
    output_handle.write(delim.join(["Total SNPs", str(len(names))]) + "\n")

    samples = []
    for file in os.listdir(sys.argv[2]):
        if file.lower().endswith(".gtc"):
            samples.append(file)

    output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n")
    output_handle.write(
        delim.join(["Total Samples", str(len(samples))]) + "\n")

    output_handle.write("[Data]\n")
    output_handle.write(
        delim.join(["SNP Name", "Sample ID", "Alleles - AB"]) + "\n")
    for file in samples:
        sys.stderr.write("Processing " + file + "\n")
        gtc_file = os.path.join(sys.argv[2], file)
        genotypes = GenotypeCalls(gtc_file).get_genotypes()
        assert len(genotypes) == len(names)
        for (name, genotype) in zip(names, genotypes):
            output_handle.write(
                delim.join([
                    name,
                    file[:-4],
                    code2genotype[genotype],
                ]) + "\n")
            samples.append(gtc_file)

    output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n")
    output_handle.write(
        delim.join(["Total Samples", str(len(samples))]) + "\n")

    output_handle.write("[Data]\n")
    output_handle.write(
        delim.join([
            "SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB",
            "Alleles - Plus", "Alleles - Forward"
        ]) + "\n")
    for gtc_file in samples:
        sys.stderr.write("Processing " + gtc_file + "\n")
        gtc_file = os.path.join(args.gtc_directory, gtc_file)
        gtc = GenotypeCalls(gtc_file)
        genotypes = gtc.get_genotypes()
        plus_strand_genotypes = gtc.get_base_calls_plus_strand(
            manifest.snps, manifest.ref_strands)
        forward_strand_genotypes = gtc.get_base_calls_forward_strand(
            manifest.snps, manifest.source_strands)

        assert len(genotypes) == len(manifest.names)
        for (name, chrom, map_info, genotype, ref_strand_genotype,
             source_strand_genotype) in zip(manifest.names, manifest.chroms,
                                            manifest.map_infos, genotypes,
                                            plus_strand_genotypes,
                                            forward_strand_genotypes):
            output_handle.write(
                delim.join([
                    name,
    if gtc_file.lower().endswith(".gtc"):
        samples.append(gtc_file)

# Export one tab-separated text file per GTC file found in args.gtc_directory.
# Each line holds: sample id, SNP name, chromosome, position, genotype label,
# Log R Ratio and B Allele Frequency.
#
# The manifest is the same for every sample, so it is parsed only once, on
# first use (nothing is parsed when the directory holds no GTC files).
manifest = None
for gtc_file in glob.glob(os.path.join(args.gtc_directory, '*.gtc')):
    # Output path: <output_directory>/<gtc file name>.txt
    gtc_output = gtc_file + ".txt"
    gtc_output2 = os.path.basename(gtc_output)
    save_path = args.output_directory
    gtc_output3 = os.path.join(save_path, gtc_output2)
    print(gtc_output3)

    # The context manager closes the file even if parsing the GTC fails.
    with open(gtc_output3, "w") as output:
        print("output is" + str(output))
        if gtc_file.lower().endswith(".gtc"):
            if manifest is None:
                manifest = BeadPoolManifest(args.manifest)
            sample_id = os.path.basename(gtc_file)[:-4]
            print(sample_id)
            # Parse the GTC once and read all three per-locus arrays from it.
            gtc = GenotypeCalls(gtc_file)
            per_locus = zip(manifest.names, manifest.chroms,
                            manifest.map_infos, gtc.get_genotypes(),
                            gtc.get_logr_ratios(), gtc.get_ballele_freqs())
            # Distinct loop names: the source lists are no longer overwritten
            # by their own elements.
            for (snp_name, snp_chrom, snp_position, genotype, logratio,
                 baf) in per_locus:
                output.write(sample_id + "\t" + snp_name + "\t" + snp_chrom +
                             "\t" + str(snp_position) + "\t" +
                             code2genotype[genotype] + "\t" + str(logratio) +
                             "\t" + str(baf) + "\n")
示例#9
0
def extract(gtc_path,
            extraction_path,
            manifest_path="/home/ailin/repo_new/data/BovineSNP50_v3_A1.bpm"):
    """
    Dump the contents of one GTC file into a CSV file and a .sinfo file.

    The CSV (``<name>_old.csv``) has one column per extracted field, in this
    order: ballele_freqs, genotypes, genotype_scores, logr_ratios,
    raw_x_intensities, raw_y_intensities, normalized_x_intensities,
    normalized_y_intensities, r, theta, base_calls, genotype_forward.
    The number of rows is the length of the first column.

    The .sinfo file (``<name>_old.sinfo``) has one ``key,value`` line for
    each of: call_rate, cluster_file, gender, imaging_date, autocall_date,
    scanner_data, snp_manifest, is_write_complete, sample_name,
    sample_plate, sample_well.

    ``<name>`` is the last '/'-separated component of ``gtc_path`` up to its
    first '.'.

    If the rows cannot be assembled (a column is shorter than the first one,
    or holds values that are not strings), ``gtc_path`` is printed and
    nothing is written.

    :param gtc_path: str - path to gtc with genotyping data
    :param extraction_path: str - path to directory where extracted files will be stored
    :param manifest_path: str - path to manifest file used for creation of this gtc
    :return: None
    """
    # Get gtc and manifest objects
    gtc = GenotypeCalls(gtc_path)
    manifest = BeadPoolManifest(manifest_path)

    # Structure for ordered names and methods of gtc fields
    field = namedtuple('field', ['name', 'method'])

    # List of fields which should be extracted from gtc
    gtc_extract = [
        field('ballele_freqs', GenotypeCalls.get_ballele_freqs),
        field('genotypes', GenotypeCalls.get_genotypes),
        field('genotype_scores', GenotypeCalls.get_genotype_scores),
        field('logr_ratios', GenotypeCalls.get_logr_ratios),
        field('raw_x_intensities', GenotypeCalls.get_raw_x_intensities),
        field('raw_y_intensities', GenotypeCalls.get_raw_y_intensities),
        field(
            'normalized_intensities',
            lambda x: GenotypeCalls.get_normalized_intensities(
                x, manifest.normalization_lookups))
    ]

    # List of fields which correspond to a whole sample and extracted from gtc
    sample_info = [
        field('call_rate', GenotypeCalls.get_call_rate),
        field('cluster_file', GenotypeCalls.get_cluster_file),
        field('gender', GenotypeCalls.get_gender),
        field('imaging_date', GenotypeCalls.get_imaging_date),
        field('autocall_date', GenotypeCalls.get_autocall_date),
        field('scanner_data', GenotypeCalls.get_scanner_data),
        field('snp_manifest', GenotypeCalls.get_snp_manifest),
        field('is_write_complete', GenotypeCalls.is_write_complete),
        field('sample_name', GenotypeCalls.get_sample_name),
        field('sample_plate', GenotypeCalls.get_sample_plate),
        field('sample_well', GenotypeCalls.get_sample_well)
    ]

    # Per-locus columns as (column name, values) pairs, in output order
    content = []
    for name, method in gtc_extract:
        res = method(gtc)
        if name != 'normalized_intensities':
            if not isinstance(res, str):
                try:
                    # Build a real list (not a lazy map object) so the column
                    # supports len() and indexing on Python 3 as well.
                    res = [str(value) for value in res]
                except TypeError:
                    # Not iterable: keep the scalar's string form
                    res = str(res)
            content.append((name, res))
        else:
            # Split the (x, y) pairs into separate columns and add the polar
            # form. Both sequences are materialised because each is read
            # twice; a one-shot iterator would leave the second column empty.
            points = list(res)
            polar = [
                NormalizationTransform.rect_to_polar(point)
                for point in points
            ]

            content.append(
                ('normalized_x_intensities', [str(x) for x, y in points]))
            content.append(
                ('normalized_y_intensities', [str(y) for x, y in points]))
            content.append(('r', [str(r) for r, theta in polar]))
            content.append(('theta', [str(theta) for r, theta in polar]))

    # Get base calls and their forward encoding
    base_calls = GenotypeCalls.get_base_calls(gtc)
    genotype_forward = GenotypeCalls.get_base_calls_forward_strand(
        gtc, base_calls, [SourceStrand.Forward] * len(base_calls))
    content.append(('base_calls', base_calls))
    content.append(('genotype_forward', genotype_forward))

    # Whole-sample attributes as (key, value) string pairs
    general_info = [(name, str(method(gtc))) for name, method in sample_info]

    length = len(content[0][1])
    sep = ','

    # Make header
    header = sep.join([column_name for column_name, _ in content])

    try:
        # Transpose the columns into CSV rows
        rows = [
            sep.join([column[i] for _, column in content])
            for i in range(length)
        ]
    except (IndexError, TypeError):
        # Best effort: report the offending file and skip it. Only the
        # failures row assembly can produce are caught, so Ctrl-C and
        # unrelated errors are no longer swallowed.
        print(gtc_path)
        return

    # File names
    name = gtc_path.split('/')[-1].split('.')[0]
    sinfo_name = name + '_old.sinfo'
    data_name = name + '_old.csv'

    # Plain concatenation instead of str.format, so a directory name that
    # contains '{' or '}' cannot break the path construction.
    data_path = extraction_path + '/' + data_name
    sinfo_path = extraction_path + '/' + sinfo_name

    # Write data to a file
    with open(data_path, 'w') as dest:
        dest.write(header + '\n' + '\n'.join(rows))

    # Write sample information to a file
    with open(sinfo_path, 'w') as dest:
        dest.write('\n'.join([sep.join(record) for record in general_info]))
示例#10
0
        if gtc_file.lower().endswith(".gtc"):
            if gtc_file in excludeIDArray:
                print(gtc_file + ' is excluded.')
            else:
                samples.append(gtc_file)


    output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n")
    output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n")
    output_handle.write("[Data]\n")
    output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo" ,"SNP" ,"REFSTRAND","SOURCESTRAND" ,"GType" ,"Allele1 - Top","Allele2 - Top","X_raw","Y_raw","X","Y","B Allele Freq","Log R Ratio","GT Score","Alleles - Plus", "Alleles - Forward"]) + "\n")

    for gtc_file in samples:
        sys.stderr.write("Processing " + gtc_file + "\n")
        gtc_file = os.path.join(args.gtc_directory, gtc_file)
        gtc = GenotypeCalls(gtc_file)
        genotypes = gtc.get_genotypes()
        raw_Xs = gtc.get_raw_x_intensities()
        raw_Ys = gtc.get_raw_y_intensities()
        normalized_intensities = GenotypeCalls( gtc_file ).get_normalized_intensities(manifest.normalization_lookups)
        base_calls = GenotypeCalls( gtc_file ).get_base_calls()
        logratios = GenotypeCalls( gtc_file ).get_logr_ratios()
        BAFs = GenotypeCalls( gtc_file ).get_ballele_freqs()
        genotype_scores = GenotypeCalls( gtc_file ).get_genotype_scores()
        plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands)
        forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands)

        assert len(genotypes) == len(manifest.names)

        for (name, chrom, map_info, snp,ref_strand,source_strands, genotype, norm, Allele, raw_x, raw_y, BAF, logratio , genotype_score , ref_strand_genotype, source_strand_genotype) in zip(manifest.names, manifest.chroms, manifest.map_infos, manifest.snps ,manifest.ref_strands,manifest.source_strands,genotypes , normalized_intensities, base_calls, raw_Xs, raw_Ys, BAFs , logratios , genotype_scores , plus_strand_genotypes, forward_strand_genotypes):
            x_norm = norm[0]
    output.write(
        delim.join(["Content", os.path.basename(args.manifest)]) + "\n")
    output.write(delim.join(["Num SNPs", str(len(names))]) + "\n")
    output.write(delim.join(["Total SNPs", str(len(names))]) + "\n")
    output.write("[Data]\n")

    output.write(
        delim.join([
            "SNP Name", "Sample Index", "Allele1 - Forward",
            "Allele2 - Forward", "GC Score", "Chr", "Position"
        ]) + "\n")

    sys.stdout.write("Processing " + gtc_file + "\n")
    gtc_file = os.path.join(args.gtc_directory, gtc_file)

    gtc = GenotypeCalls(gtc_file)
    genotypes = GenotypeCalls(gtc_file).get_genotypes()
    chrom = manifest.chroms
    map_info = manifest.map_infos
    forward_strand_genotypes = gtc.get_base_calls_forward_strand(
        manifest.snps, manifest.source_strands)
    gen_score = GenotypeCalls(gtc_file).get_genotype_scores()

    for (names, forward_strand_genotypes, chrom, map_info,
         gen_score) in zip(names, forward_strand_genotypes, chrom, map_info,
                           gen_score):

        if forward_strand_genotypes[0] == "-":
            forward_strand_genotypes += "-"

        output.write(
示例#12
0
    output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n")
    output_handle.write(
        delim.join(["Total Samples", str(len(samples))]) + "\n")
    output_handle.write("[Data]\n")
    output_handle.write(
        delim.join([
            "SNP Name", "Sample ID", "Chr", "MapInfo", "SNP", "REFSTRAND",
            "SOURCESTRAND", "GType", "Allele1 - Top", "Allele2 - Top", "X_raw",
            "Y_raw", "X", "Y", "B Allele Freq", "Log R Ratio", "GT Score",
            "Alleles - Plus", "Alleles - Forward"
        ]) + "\n")

    for gtc_file in samples:
        sys.stderr.write("Processing " + gtc_file + "\n")
        gtc_file = os.path.join(args.gtc_directory, gtc_file)
        gtc = GenotypeCalls(gtc_file)
        genotypes = gtc.get_genotypes()
        raw_Xs = gtc.get_raw_x_intensities()
        raw_Ys = gtc.get_raw_y_intensities()
        normalized_intensities = GenotypeCalls(
            gtc_file).get_normalized_intensities(
                manifest.normalization_lookups)
        base_calls = GenotypeCalls(gtc_file).get_base_calls()
        logratios = GenotypeCalls(gtc_file).get_logr_ratios()
        BAFs = GenotypeCalls(gtc_file).get_ballele_freqs()
        genotype_scores = GenotypeCalls(gtc_file).get_genotype_scores()
        plus_strand_genotypes = gtc.get_base_calls_plus_strand(
            manifest.snps, manifest.ref_strands)
        forward_strand_genotypes = gtc.get_base_calls_forward_strand(
            manifest.snps, manifest.source_strands)
示例#13
0
import argparse

# Call-rate report: writes one "<sample id>\t<call rate>\t<gender>" line per
# GTC file found in the given directory.

# The text is passed as description=; the first positional parameter of
# ArgumentParser is prog, which would put the sentence into the usage line.
parser = argparse.ArgumentParser(
    description="Generate a Callrate report from a directory of GTC files")
parser.add_argument("manifest", help="BPM manifest file")
parser.add_argument("gtc_directory", help="Directory containing GTC files")
# The value is opened as a file below, so the help text says "file".
parser.add_argument("output_file", help="File where the report is written")

args = parser.parse_args()

output_file2 = args.output_file
# The manifest is the same for every sample: parse it once, on first use
# (it is not parsed at all when the directory holds no GTC files).
manifest = None

# The context manager closes the report even if a GTC file fails to parse.
with open(output_file2, "w") as output_file3:
    print("output is" + str(output_file3))

    for gtc_file in glob.glob(os.path.join(args.gtc_directory, '*.gtc')):
        print("processing" + gtc_file)

        if manifest is None:
            manifest = BeadPoolManifest(args.manifest)

        sample_id = os.path.basename(gtc_file)[:-4]

        # Parse the GTC once and read both values from it.
        gtc = GenotypeCalls(gtc_file)
        call_rate = str(gtc.get_call_rate())
        gender = gtc.get_gender()

        output_file3.write(sample_id + "\t" + call_rate + "\t" + gender + "\n")