Exemplo n.º 1
0
    def read_scanner_data(handle):
        """
        Build a ScannerData object from the next record in a file handle.

        The record is consumed as five consecutive fields, in this order:
        name (string), pmt_green (int), pmt_red (int),
        scanner_version (string), imaging_user (string).

        Args:
            handle (file): File handle positioned at the start of the record

        Returns:
            ScannerData
        """
        # One reader per field, listed in on-disk order; the comprehension
        # evaluates them left to right so the read sequence is preserved.
        field_readers = (read_string, read_int, read_int, read_string,
                         read_string)
        fields = [reader(handle) for reader in field_readers]
        return ScannerData(*fields)
Exemplo n.º 2
0
    def get_base_calls(self):
        """
        Read the genotype base calls from the GTC file.

        When the ploidy type is 1 the two stored characters of every entry
        are returned unchanged. For any other ploidy type the stored
        characters are combined with the AB genotype of the same locus:
        no-call/null genotypes become "-", otherwise each allele of the AB
        genotype selects the first ("A") or second (any other allele)
        stored character.

        Returns:
            list(string): The genotype basecalls
            The characters are A, C, G, T, or - for a no-call/null.
            The calls are relative to the top strand.
        """
        try:
            ploidy_type = self.get_ploidy_type()
        except Exception:
            # Best effort: if the ploidy type cannot be read, fall back to 1.
            # Only Exception is caught so that KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            ploidy_type = 1

        use_stored_calls = ploidy_type == 1
        # Genotypes are only needed (and only read) when remapping is required.
        genotypes = None if use_stored_calls else self.get_genotypes()

        with open(self.filename, "rb") as gtc_handle:
            gtc_handle.seek(self.toc_table[GenotypeCalls.__ID_BASE_CALLS])
            num_entries = read_int(gtc_handle)
            result = []
            for idx in range(num_entries):
                # Every entry occupies exactly two bytes.
                byte_string = gtc_handle.read(2).decode()
                if use_stored_calls:
                    result.append(byte_string)
                    continue

                ab_genotype = code2genotype[genotypes[idx]]
                if ab_genotype in ("NC", "NULL"):
                    result.append("-")
                else:
                    result.append("".join(
                        byte_string[0] if allele == "A" else byte_string[1]
                        for allele in ab_genotype))
            return result
Exemplo n.º 3
0
 def __get_generic_array(self, toc_entry, parse_function, item_size, offset, count):
     """
     Read a slice of an array stored in the file, one element at a time.

     The array starts with a 4-byte entry count followed by the elements.

     Args:
         toc_entry (int): Identifier for entry in table of contents
         parse_function (function): Called with the file handle once per
                                      element; must consume one element
         item_size (int): Size (in bytes) of individual entry
         offset (int): Offset (in element counts) to start reading
         count (int): Number of entries to read (None reads all remaining entries)
     Returns:
         list(type): The parsed elements (type dependent on parse_function)
     """
     with open(self.filename, "rb") as gtc_handle:
         array_start = self.toc_table[toc_entry]
         gtc_handle.seek(array_start)
         remaining = read_int(gtc_handle) - offset
         if count is not None:
             remaining = min(remaining, count)
         if offset > 0:
             # Skip the 4-byte count header plus the elements before offset.
             gtc_handle.seek(array_start + 4 + offset * item_size)
         return [parse_function(gtc_handle) for _ in range(remaining)]
Exemplo n.º 4
0
    def __init__(self, filename, ignore_version=False, check_write_complete=True):
        """
        Constructor

        Validates the file header and loads the table of contents, which
        maps each entry identifier to its offset in the file.

        Args:
            filename (string): GTC filename
            ignore_version (bool): boolean to ignore automated checks on
                            file version, not recommended (default: False)
            check_write_complete (bool): when True, raise if
                            is_write_complete() reports False (default: True)
        Returns:
            GenotypeCalls
        Raises:
            Exception: bad format identifier, unsupported file version, or
                       incomplete file
        """
        self.filename = filename
        with open(self.filename, "rb") as gtc_handle:
            # Compare the raw bytes: decoding first would raise
            # UnicodeDecodeError on arbitrary binary input instead of the
            # intended format error.
            if gtc_handle.read(3) != b"gtc":
                raise Exception("GTC format error: bad format identifier")
            self.version = read_byte(gtc_handle)
            if self.version not in GenotypeCalls.supported_version and not ignore_version:
                raise Exception("Unsupported GTC File version (" + str(self.version) + ")")
            number_toc_entries = read_int(gtc_handle)

            #
            # Parse the table of contents and map the toc entry
            # to the lookup
            #
            self.toc_table = {}
            for _ in range(number_toc_entries):
                # Each entry is 6 bytes: little-endian short identifier
                # followed by an unsigned int offset.
                (entry_id, offset) = struct.unpack("<hI", gtc_handle.read(6))
                self.toc_table[entry_id] = offset
        if check_write_complete and not self.is_write_complete():
            raise Exception("GTC file is incomplete")
Exemplo n.º 5
0
 def get_num_intensity_only(self):
     """
     Read the intensity-only SNP count from the file.

     Returns:
         int: The number of intensity only SNPs
     """
     with open(self.filename, "rb") as gtc_handle:
         # The value is the integer located 12 bytes past the GC50 entry.
         gc50_start = self.toc_table[GenotypeCalls.__ID_GC50]
         gtc_handle.seek(gc50_start + 12)
         num_intensity_only = read_int(gtc_handle)
     return num_intensity_only
Exemplo n.º 6
0
 def get_num_no_calls(self):
     """
     Read the no-call count from the file.

     Returns:
         int: The number of no calls
     """
     with open(self.filename, "rb") as gtc_handle:
         # The value is the integer located 8 bytes past the GC50 entry.
         gc50_start = self.toc_table[GenotypeCalls.__ID_GC50]
         gtc_handle.seek(gc50_start + 8)
         num_no_calls = read_int(gtc_handle)
     return num_no_calls
Exemplo n.º 7
0
 def __parse_locus_version_6(self, handle):
     """
     Populate this locus entry from a version 6 record.

     Fields are consumed strictly in file order; fields that are not kept
     are read and discarded so the handle stays aligned.

     Args:
         handle (file): File handle at start of locus entry record
     Returns:
         None
     Raises:
         Exception: Manifest format error
     """
     def skip_strings(how_many):
         # Read and discard consecutive string fields.
         for _ in range(how_many):
             read_string(handle)

     self.ilmn_id = read_string(handle)
     # The source strand is the second-to-last "_"-separated token of the id.
     id_tokens = self.ilmn_id.split("_")
     self.source_strand = SourceStrand.from_string(id_tokens[-2])
     self.name = read_string(handle)
     skip_strings(3)
     handle.read(4)
     skip_strings(2)
     self.snp = read_string(handle)
     self.chrom = read_string(handle)
     skip_strings(2)
     map_info_text = read_string(handle)
     self.map_info = int(map_info_text)
     skip_strings(2)
     self.address_a = read_int(handle)
     self.address_b = read_int(handle)
     skip_strings(7)
     handle.read(3)
     self.assay_type = read_byte(handle)
     if self.assay_type not in [0, 1, 2]:
         raise Exception(
             "Format error in reading assay type from locus entry")

     # Assay type 0 must coincide exactly with address B being zero.
     if (self.address_b == 0) != (self.assay_type == 0):
         raise Exception(
             "Manifest format error: Assay type is inconsistent with address B"
         )
Exemplo n.º 8
0
 def __parse_file(self, handle):
     """
     Initialize this object from a locus entry record.

     Reads the record's version number and delegates to the parser for
     that version (6, 7 or 8).

     Args:
         handle (file handle): File handle at start of locus entry record
     Returns:
         None
     Raises:
         Exception: Manifest format error (unknown locus entry version)
     """
     version = read_int(handle)

     # Reject unknown versions up front.
     if version not in (6, 7, 8):
         raise Exception(
             "Manifest format error: unknown version for locus entry (" +
             str(version) + ")")

     if version == 6:
         self.__parse_locus_version_6(handle)
         return
     if version == 7:
         self.__parse_locus_version_7(handle)
         return
     self.__parse_locus_version_8(handle)
Exemplo n.º 9
0
 def __get_generic_array_numpy(self, toc_entry, numpy_type, offset=0, count=None):
     """
     Internal helper function to read a slice of a data array directly
     into a numpy array.

     The array starts with a 4-byte entry count followed by the elements.

     Args:
         toc_entry (int): Identifier for entry in table of contents
         numpy_type (numpy.dtype): Data type to read into array
         offset (int): Offset (in element counts) to start reading
         count (int): Number of entries to read (None will read remaining entries)
     Returns:
         numpy.ndarray: The elements read from the file, with dtype
                        numpy_type (empty when nothing remains to read)
     """
     numpy_type = dtype(numpy_type)
     with open(self.filename, "rb") as gtc_handle:
         gtc_handle.seek(self.toc_table[toc_entry])
         num_entries = read_int(gtc_handle) - offset
         if count is not None:
             num_entries = min(num_entries, count)
         # Clamp to zero: a negative size passed to read() means "read to
         # end of file", which would return unrelated data when offset is
         # past the end of the array (or count is negative).
         num_entries = max(num_entries, 0)
         if offset > 0:
             # Skip the 4-byte count header plus the elements before offset.
             gtc_handle.seek(
                 self.toc_table[toc_entry] + 4 + offset * numpy_type.itemsize)
         return frombuffer(gtc_handle.read(num_entries * numpy_type.itemsize), dtype=numpy_type)
Exemplo n.º 10
0
    def read_cluster_file(handle):
        """
        Read a cluster file

        The file is parsed sequentially: header strings, a data block
        version, then parallel arrays (records, scores, genotypes, locus
        names, addresses, per-genotype cluster counts) that all have
        num_records entries.

        Args:
            handle (file): EGT cluster file handle
        Returns:
            ClusterFile
        Raises:
            Exception: Incompatible cluster file format
            AssertionError: Stored cluster counts disagree with the
                            cluster statistics of a record
        """
        version = read_int(handle)
        if version != 3:
            raise Exception("Cluster file version " + str(version) +
                            " not supported")

        gencall_version = read_string(handle)
        cluster_version = read_string(handle)
        call_version = read_string(handle)
        normalization_version = read_string(handle)
        date_created = read_string(handle)

        is_wgt = read_byte(handle) == 1
        if not is_wgt:
            raise Exception("Only WGT cluster file version supported")

        manifest_name = read_string(handle)

        result = ClusterFile(gencall_version, cluster_version, call_version,
                             normalization_version, date_created,
                             manifest_name)
        data_block_version = read_int(handle)
        if data_block_version not in [8, 9]:
            raise Exception("Data block version in cluster file " +
                            str(data_block_version) + " not  supported")
        # opa (read and discarded)
        _ = read_string(handle)

        num_records = read_int(handle)
        cluster_records = ClusterFile.read_array(
            handle, num_records, lambda handle: ClusterRecord.read_record(
                handle, data_block_version))
        cluster_scores = ClusterFile.read_array(handle, num_records,
                                                ClusterScore.read_record)

        # genotypes (read and discarded)
        _ = ClusterFile.read_array(handle, num_records, read_string)

        loci_names = ClusterFile.read_array(handle, num_records, read_string)
        addresses = ClusterFile.read_array(handle, num_records, read_int)

        # cluster counts
        # 3 corresponds to number genotypes (AA, AB, BB)
        cluster_counts = [
            ClusterFile.read_array(handle, 3, read_int)
            for _ in range(num_records)
        ]

        # Validate with an explicit raise rather than `assert`, which is
        # stripped when Python runs with -O. AssertionError is kept so the
        # exception type seen by callers does not change.
        for (cluster_record, count_record) in zip(cluster_records,
                                                  cluster_counts):
            counts_match = (
                cluster_record.aa_cluster_stats.N == count_record[0]
                and cluster_record.ab_cluster_stats.N == count_record[1]
                and cluster_record.bb_cluster_stats.N == count_record[2])
            if not counts_match:
                raise AssertionError(
                    "Cluster file format error: cluster counts are "
                    "inconsistent with cluster statistics")

        for (locus_name, address, cluster_record,
             cluster_score) in zip(loci_names, addresses, cluster_records,
                                   cluster_scores):
            cluster_record.address = address
            cluster_record.cluster_score = cluster_score
            result.add_record(locus_name, cluster_record)

        return result
Exemplo n.º 11
0
    def __parse_file(self, manifest_file):
        """
        Helper function to initialize this object from a file.

        Reads the header, the locus names, the normalization IDs and the
        locus entries, then derives the normalization lookups. Per-locus
        lists are stored in the order of the name table; locus entries are
        matched to that order by name.

        Args:
            manifest_file (string): Location of BPM (bead pool manifest) file
        Returns:
            None
        Raises:
            Exception: Unsupported or unknown BPM version
            Exception: Manifest format error
        """
        with open(manifest_file, "rb") as manifest_handle:
            # Compare raw bytes: decoding first would raise
            # UnicodeDecodeError on arbitrary binary input instead of the
            # intended format error. A short read also fails this check.
            if manifest_handle.read(3) != b"BPM":
                raise Exception("Invalid BPM format")
            version = read_byte(manifest_handle)
            if version != 1:
                # version is compared to an int above, so format it
                # directly; calling ord() on it would raise TypeError.
                raise Exception("Unknown BPM version (" + str(version) +
                                ")")

            version = read_int(manifest_handle)
            # Clear the 0x1000 flag bit, if set, to get the plain version.
            version_flag = 0x1000
            if version & version_flag == version_flag:
                version = version ^ version_flag
            if version > 5 or version < 3:
                raise Exception("Unsupported BPM version (" + str(version) +
                                ")")
            self.manifest_name = read_string(manifest_handle)

            # version is within 3..5 here, so the control config string
            # is always read.
            self.control_config = read_string(manifest_handle)

            self.num_loci = read_int(manifest_handle)
            # Skip 4 bytes per locus (whence=1: relative to current position).
            manifest_handle.seek(4 * self.num_loci, 1)
            name_lookup = {}
            for idx in range(self.num_loci):
                locus_name = read_string(manifest_handle)
                self.names.append(locus_name)
                name_lookup[locus_name] = idx

            for _ in range(self.num_loci):
                normalization_id = read_byte(manifest_handle)
                if normalization_id >= 100:
                    raise Exception(
                        "Manifest format error: read invalid normalization ID")
                self.normalization_ids.append(normalization_id)

            self.assay_types = [0] * self.num_loci
            self.addresses = [0] * self.num_loci
            self.snps = [""] * self.num_loci
            self.chroms = [""] * self.num_loci
            self.map_infos = [0] * self.num_loci
            self.ref_strands = [RefStrand.Unknown] * self.num_loci
            self.source_strands = [SourceStrand.Unknown] * self.num_loci
            for _ in range(self.num_loci):
                locus_entry = LocusEntry(manifest_handle)
                # Entries are placed at the position of their name in the
                # name table, not in the order they are read.
                locus_idx = name_lookup[locus_entry.name]
                self.assay_types[locus_idx] = locus_entry.assay_type
                self.addresses[locus_idx] = locus_entry.address_a
                self.snps[locus_idx] = locus_entry.snp
                self.chroms[locus_idx] = locus_entry.chrom
                self.map_infos[locus_idx] = locus_entry.map_info
                self.ref_strands[locus_idx] = locus_entry.ref_strand
                self.source_strands[locus_idx] = locus_entry.source_strand

            if len(self.normalization_ids) != len(self.assay_types):
                raise Exception(
                    "Manifest format error: read invalid number of assay entries"
                )

            # Raw normalization IDs are < 100 (checked above), so adding
            # 100 * assay_type keeps IDs of different assay types distinct.
            for locus_idx, assay_type in enumerate(self.assay_types):
                self.normalization_ids[locus_idx] += 100 * assay_type

            # Map each distinct combined ID to its rank in sorted order.
            sorted_norm_ids = sorted(set(self.normalization_ids))
            lookup_dictionary = {
                norm_id: rank for rank, norm_id in enumerate(sorted_norm_ids)
            }
            self.normalization_lookups = [
                lookup_dictionary[normalization_id]
                for normalization_id in self.normalization_ids
            ]