Example #1
File: parser.py Project: maximz/pyaffy
    def parse_intensities(self):

        # check version
        version = None
        with misc.smart_open_read(self.path, mode = 'rb', try_gzip = True) \
                as fh:
            v = ord(fh.read(1)) # look at first byte
            if v == 59:
                # command console generic file format
                version = 'CCG'
            elif v == 64:
                # binary cel format, version 4
                version = 4

        assert version in (4, 'CCG')
        logger.info('Detected CEL file format version "%s".', str(version))

        y = None
        if version == 'CCG':
            cel = CEL.read_cel(self.path)
            ds = cel.data_groups[0].get_data_set_by_name('Intensity')
            y = np.float64(ds.data)[:,0]
        elif version == 4:
            y = self._parse_cel_v4_intensities()

        return y
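All of the examples on this page funnel file access through misc.smart_open_read, which transparently handles gzip-compressed input. The helper itself is internal to these projects; a minimal sketch of what such a context manager might look like (names and signature are assumptions, not the actual implementation):

import gzip
from contextlib import contextmanager

@contextmanager
def smart_open_read_sketch(path, mode='rb', try_gzip=True):
    # Hypothetical stand-in for misc.smart_open_read: sniff the gzip
    # magic number and decompress on the fly if it matches.
    if try_gzip:
        with open(path, 'rb') as probe:
            magic = probe.read(2)
        if magic == b'\x1f\x8b':  # gzip files start with these two bytes
            with gzip.open(path, mode) as fh:
                yield fh
            return
    with open(path, mode) as fh:
        yield fh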
Example #3
def main(args=None):
    """Script body."""

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    fasta_file = args.fasta_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    output_file = args.output_file

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream,
                             log_file=log_file,
                             quiet=quiet,
                             verbose=verbose)

    # generate regular expression object from the chromosome pattern
    if chrom_pat is None:
        chrom_pat = ensembl.SPECIES_CHROMPAT[species]
    chrom_re = re.compile(chrom_pat)

    # filter the FASTA file
    # note: each chromosome sequence is temporarily read into memory,
    # so this script has a large memory footprint
    with \
        misc.smart_open_read(
            fasta_file, mode='r', encoding='ascii', try_gzip=True
        ) as fh, \
        misc.smart_open_write(
            output_file, mode='w', encoding='ascii'
        ) as ofh:

        # inside = False
        reader = FastaReader(fh)
        for seq in reader:
            chrom = seq.name.split(' ', 1)[0]
            if chrom_re.match(chrom) is None:
                logger.info('Ignoring chromosome "%s"...', chrom)
                continue
            seq.name = chrom
            seq.append_fasta(ofh)

    return 0
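The script treats output_file == '-' as "write to stdout" and reroutes log messages to stderr accordingly, which implies misc.smart_open_write special-cases '-'. A minimal sketch under that assumption (hypothetical, not the project's actual code):

import sys
from contextlib import contextmanager

@contextmanager
def smart_open_write_sketch(path, mode='w', encoding=None):
    # Hypothetical stand-in for misc.smart_open_write: '-' (or None)
    # means stdout, anything else is opened as a regular file.
    if path is None or path == '-':
        yield sys.stdout
    else:
        with open(path, mode, encoding=encoding) as fh:
            yield fh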
Example #4
def main(args=None):
    """Script body."""

    if args is None:
        # parse command-line arguments 
        parser = get_argument_parser()
        args = parser.parse_args()

    fasta_file = args.fasta_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    output_file = args.output_file
    
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    # generate regular expression object from the chromosome pattern
    if chrom_pat is None:
        chrom_pat = ensembl.SPECIES_CHROMPAT[species]
    chrom_re = re.compile(chrom_pat)

    # filter the FASTA file
    # note: each chromosome sequence is temporarily read into memory,
    # so this script has a large memory footprint
    with \
        misc.smart_open_read(
            fasta_file, mode='r', encoding='ascii', try_gzip=True
        ) as fh, \
        misc.smart_open_write(
            output_file, mode='w', encoding='ascii'
        ) as ofh:

        # inside = False
        reader = FastaReader(fh)
        for seq in reader:
            chrom = seq.name.split(' ', 1)[0]
            if chrom_re.match(chrom) is None:
                logger.info('Ignoring chromosome "%s"...', chrom)
                continue
            seq.name = chrom
            seq.append_fasta(ofh)

    return 0
Example #5
def read_gene2acc(file_path, logger):
    """Extracts Entrez ID -> gene symbol mapping from gene2accession.gz file.

    Parameters
    ----------
    file_path: str
        The path of the gene2accession.gz file (or a filtered version
        thereof). The file may be gzip'ed.
    logger: logging.Logger
        Logger used to report summary statistics.

    Returns
    -------
    dict
        A mapping of Entrez IDs to gene symbols.
    """
    gene2acc = {}
    with misc.smart_open_read(file_path, mode='rb', try_gzip=True) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        next(reader)  # skip header
        for i, l in enumerate(reader):
            id_ = int(l[1])
            symbol = l[15]

            try:
                gene2acc[id_].append(symbol)
            except KeyError:
                gene2acc[id_] = [symbol]

            # print (l[0],l[15])

    # make sure all EntrezIDs map to a unique gene symbol
    n = len(gene2acc)
    for k, v in gene2acc.items():
        symbols = sorted(set(v))
        assert len(symbols) == 1
        gene2acc[k] = symbols[0]

    all_symbols = sorted(set(gene2acc.values()))
    m = len(all_symbols)

    logger.info('Found %d Entrez Gene IDs associated with %d gene symbols.',
                n, m)
    return gene2acc
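A hypothetical invocation, reusing the examples' own misc module and assuming the remaining get_logger arguments have defaults:

import sys

logger = misc.get_logger(log_stream=sys.stderr)
gene2acc = read_gene2acc('gene2accession.gz', logger)
symbol = gene2acc[672]  # Entrez Gene ID 672 maps to the symbol 'BRCA1'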
Example #6
def get_gaf_gene_ontology_file(path):
    """Extract the gene ontology file associated with a GO annotation file.

    Parameters
    ----------
    path: str
        The path name of the GO annotation file.

    Returns
    -------
    str
        The URL of the associated gene ontology file.
    """
    assert isinstance(path, str)

    version = None
    with misc.smart_open_read(path, encoding='UTF-8', try_gzip=True) as fh:
        for l in fh:
            if l[0] != '!':
                break
            if l.startswith('!GO-version:'):
                version = l.split(' ')[1].strip()
                break
    return version
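For context, the line being scanned for sits in the comment header of the GAF file; an illustrative (made-up) header might look like this, with the value after "!GO-version:" being the returned URL:

!gaf-version: 2.0
!GO-version: http://purl.obolibrary.org/obo/go/releases/2015-02-20/go.owl
!Generated: 2015-02-20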
Example #7
File: cel.py Project: numpde/pyaffy
    def read_cel(cls, path):
        """Parser for CEL files in Command Console generic data file format.

        This is a binary format.
        """

        read = [0]

        decode_unicode = lambda s: codecs.decode(s, 'UTF-16-BE')
        decode_ascii = lambda s: codecs.decode(s, 'ascii')
        decode_float = lambda s: struct.unpack('>f', s)[0]
        decode_int32 = lambda s: struct.unpack('>i', s)[0]
        decode_uint32 = lambda s: struct.unpack('>I', s)[0]
        decode_int8 = lambda s: struct.unpack('>b', s)[0]
        decode_uint8 = lambda s: struct.unpack('>B', s)[0]
        decode_int16 = lambda s: struct.unpack('>h', s)[0]
        decode_uint16 = lambda s: struct.unpack('>H', s)[0]

        def read_float(fh):
            read[0] += 4
            return decode_float(fh.read(4))

        def read_byte(fh):
            read[0] += 1
            return decode_int8(fh.read(1))

        def read_ubyte(fh):
            read[0] += 1
            return decode_uint8(fh.read(1))

        def read_short(fh):
            read[0] += 2
            return decode_int16(fh.read(2))

        def read_ushort(fh):
            read[0] += 2
            return decode_uint16(fh.read(2))

        def read_int(fh):
            read[0] += 4
            return decode_int32(fh.read(4))

        def read_uint(fh):
            read[0] += 4
            return decode_uint32(fh.read(4))

        def read_raw(fh):
            # reads an int x and then x raw bytes
            bytes = read_int(fh)
            read[0] += bytes
            return fh.read(bytes)

        def read_string(fh):
            strlen = read_int(fh)
            read[0] += strlen
            return fh.read(strlen)

        def read_wstring(fh):
            strlen = read_int(fh)
            s = fh.read(2 * strlen)
            read[0] += (2 * strlen)
            return decode_unicode(s)

        def read_guid(fh):
            return read_string(fh)

        def read_datetime(fh):
            s = read_wstring(fh)
            logger.debug('DateTime string: %s|||', s)
            dt = None
            if s:
                dt = dateutil.parser.parse(s).replace(tzinfo=None)
            #s = u'2015-02-20T13:52:11Z'
            #print s
            return dt

        def read_locale(fh):
            loc = read_wstring(fh)
            return loc[:2], loc[3:]

        def read_value(fh):
            raw = read_raw(fh)
            return raw

        def read_type(fh):
            return read_wstring(fh)

        def read_file_header(fh):
            magic_number = read_ubyte(fh)
            logger.debug('Magic number: %d', magic_number)
            assert isinstance(magic_number, int) and magic_number == 59
            version_number = read_ubyte(fh)
            assert version_number == 1
            num_data_groups = read_int(fh)
            assert isinstance(num_data_groups, int)
            logger.debug('# data groups: %d', num_data_groups)
            first_data_group_pos = read_uint(fh)
            return num_data_groups, first_data_group_pos

        def read_header_param(fh):
            v1 = read_wstring(fh)
            v2 = read_value(fh)
            v3 = read_type(fh)

            if v3 == 'text/plain':
                v2 = decode_unicode(v2.rstrip('\x00'))
            elif v3 == 'text/ascii':
                v2 = decode_ascii(v2.rstrip('\x00'))
            elif v3 == 'text/x-calvin-float':
                v2 = decode_float(v2[:4])
            elif v3 == 'text/x-calvin-integer-32':
                v2 = decode_int32(v2[:4])
            elif v3 == 'text/x-calvin-unsigned-integer-32':
                v2 = decode_uint32(v2[:4])
            elif v3 == 'text/x-calvin-unsigned-integer-8':
                v2 = decode_uint8(v2[:1])
            elif v3 == 'text/x-calvin-unsigned-integer-16':
                v2 = decode_uint16(v2[:2])
            elif v3 == 'text/x-calvin-integer-8':
                v2 = decode_int8(v2[:1])
            elif v3 == 'text/x-calvin-integer-16':
                v2 = decode_int16(v2[:2])

            return (v1, v2)

        def read_data_header(fh):
            data_type_id = read_guid(fh)
            logger.debug('Data type identifier: %s', data_type_id)
            file_id = read_guid(fh)
            logger.debug('File identifier: %s', file_id)
            creation_time = read_datetime(fh)
            #print creation_time
            iso639, iso3166 = read_locale(fh)
            locale = '-'.join([iso639, iso3166])
            n_params = read_int(fh)
            logger.debug('Number of parameters (name/value/type triplets): %d',
                         n_params)
            params = OrderedDict()
            for i in range(n_params):
                params.update([read_header_param(fh)])

            num_parents = read_int(fh)
            logger.debug('Number of parent file headers: %d', num_parents)

            parent_headers = []
            for i in range(num_parents):
                logger.debug('')
                logger.debug('-------------------------------------------')
                parent_headers.append(read_data_header(fh))

            header = CELHeader(data_type_id, file_id, creation_time, locale,
                               params, parent_headers)
            return header

        def read_col(fh):
            name = read_wstring(fh)
            valtype = read_byte(fh)
            size = read_int(fh)
            return (name, valtype, size)

        def read_data_set(fh):

            VALUE_TYPES = [
                read_byte, read_ubyte, read_short, read_ushort, read_int,
                read_uint, read_float, read_string, read_wstring
            ]

            data_pos = read_uint(fh)
            next_pos = read_uint(fh)
            data_size = next_pos - data_pos
            name = read_wstring(fh)
            n_params = read_int(fh)
            logger.debug('DataSet / data position: %d', data_pos)
            logger.debug('DataSet / next position: %d', next_pos)
            logger.debug('DataSet / data size: %d', data_size)
            logger.debug('DataSet / name: %s', name)
            logger.debug('DataSet / # parameters: %d', n_params)
            params = OrderedDict()
            for i in range(n_params):
                params.update([read_header_param(fh)])
            n_cols = read_uint(fh)
            logger.debug('DataSet / # cols: %d', n_cols)
            cols = []
            for i in range(n_cols):
                cols.append(read_col(fh))
            col_names = [c[0] for c in cols]
            n_rows = read_uint(fh)
            logger.debug('DataSet / # rows: %d', n_rows)
            data = []
            for i in range(n_rows):
                d = []
                for c in cols:
                    d.append(VALUE_TYPES[c[1]](fh))
                data.append(tuple(d))
            ds = CELDataSet(name, params, col_names, data)
            return ds, next_pos

        def read_data_group(fh):
            next_pos = read_uint(fh)
            dataset_pos = read_uint(fh)
            n_datasets = read_int(fh)
            name = read_wstring(fh)
            logger.debug('# data sets within the group: %d', n_datasets)
            logger.debug('Position of first data set within the group: %d',
                         dataset_pos)
            logger.debug('Position of next data group: %d', next_pos)
            logger.debug('Bytes read up until this point: %d', read[0])
            assert dataset_pos == read[0]
            datasets = []
            for i in range(n_datasets):
                ds, next_pos = read_data_set(fh)
                logger.debug('Position of next data set: %d', next_pos)
                logger.debug('Bytes read up until this point: %d', read[0])
                assert (next_pos - read[0]) in [0, 1]
                if next_pos - read[0] == 1:
                    logger.warning('Skipping one byte (%d) between data sets.',
                                   ord(fh.read(1)))
                    read[0] += 1
                assert next_pos == read[0]
                datasets.append(ds)
            group = CELDataGroup(name, datasets)
            return group

        header = None
        data_groups = []
        with misc.smart_open_read(path, 'rb', try_gzip=True) as fh:
            num_data_groups, data_pos = read_file_header(fh)
            #assert n_data_groups == 1 # for expression CEL file
            header = read_data_header(fh)
            logger.info('# data groups: %d', num_data_groups)
            assert data_pos == read[0]  # position of the first data group
            data_groups = []
            for i in range(num_data_groups):
                data_groups.append(read_data_group(fh))
        return cls(header, data_groups)
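The decode_* helpers above are thin wrappers around struct.unpack with big-endian ('>') format codes; a self-contained illustration with made-up values:

import struct

buf = struct.pack('>IhB', 3, -7, 59)          # uint32, int16, uint8, big-endian
assert struct.unpack('>I', buf[0:4])[0] == 3
assert struct.unpack('>h', buf[4:6])[0] == -7
assert struct.unpack('>B', buf[6:7])[0] == 59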
Example #8
File: parser.py Project: maximz/pyaffy
    def _parse_cel_v4_intensities(self):
        # Version 4 format, http://media.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/cel.html#V4
        # data encoding is little endian

        read = [0]

        def decode_unicode(s):
            return codecs.decode(s, 'UTF-16-BE')

        def decode_ascii(s):
            return codecs.decode(s, 'ascii')

        def decode_float(bytes):
            # this is an actual float, not a double!
            return struct.unpack('<f', bytes)[0]

        def decode_int32(bytes):
            return struct.unpack('<i', bytes)[0]

        def decode_uint32(bytes):
            return struct.unpack('<I', bytes)[0]

        def decode_int8(bytes):
            return struct.unpack('<b', bytes)[0]

        def decode_uint8(bytes):
            return struct.unpack('<B', bytes)[0]

        def decode_int16(bytes):
            return struct.unpack('<h', bytes)[0]

        def decode_uint16(bytes):
            return struct.unpack('<H', bytes)[0]

        def read_float(fh):
            read[0] += 4
            return decode_float(fh.read(4))

        def read_short(fh):
            read[0] += 2
            return decode_int16(fh.read(2))

        def read_integer(fh):
            read[0] += 4
            return decode_int32(fh.read(4))

        def read_DWORD(fh):
            read[0] += 4
            return decode_uint32(fh.read(4))

        def read_raw(fh):
            # reads an int x and then x raw bytes
            bytes = read_integer(fh)
            read[0] += bytes
            return fh.read(bytes)

        def read_tag_val(fh):
            """Returns an OrderedDict containing tag-value entries."""
            raw = codecs.decode(read_raw(fh), encoding='iso-8859-1')
            logger.debug('Tag/Value string:\n%s', raw)
            try:
                C = ConfigParser(interpolation=None, delimiters=('=',),
                                 empty_lines_in_values=False)
                C.optionxform = lambda x: x
                C.read_string('[Section]\n' + raw)
            except ParsingError:
                C = ConfigParser(interpolation=None, delimiters=(':',),
                                 empty_lines_in_values=False)
                C.optionxform = lambda x: x
                C.read_string('[Section]\n' + '\n'.join(raw.split(';')))

            return C['Section']

        def read_cell(fh):
            intensity = read_float(fh)
            intensity_std = read_float(fh)
            pixel_count = read_short(fh)
            return (intensity, intensity_std, pixel_count)

        def read_coords(fh):
            x = read_short(fh)
            y = read_short(fh)
            return (x, y)

        def read_subgrid(fh):
            num_rows = read_integer(fh)
            num_cols = read_integer(fh)
            upper_left_x = read_float(fh)
            upper_left_y = read_float(fh)
            upper_right_x = read_float(fh)
            upper_right_y = read_float(fh)
            lower_left_x = read_float(fh)
            lower_left_y = read_float(fh)
            lower_right_x = read_float(fh)
            lower_right_y = read_float(fh)
            left = read_integer(fh)
            top = read_integer(fh)
            right = read_integer(fh)
            bottom = read_integer(fh)
            return (num_rows, num_cols, upper_left_x, upper_left_y, upper_right_x, upper_right_y,
                    lower_left_x, lower_left_y, lower_right_x, lower_right_y, left, top, right, bottom)

        with misc.smart_open_read(self.path, mode = 'rb', try_gzip = True) as fh:
            #read_file_header(fh)
            
            magic_number = read_integer(fh)
            assert isinstance(magic_number, int) and magic_number == 64
            version_number = read_integer(fh)
            assert version_number == 4
            num_cols = read_integer(fh)
            num_rows = read_integer(fh)
            num_cells = read_integer(fh)
            logger.debug('Number of rows: %d', num_rows)
            logger.debug('Number of cols: %d', num_cols)
            header = read_tag_val(fh)
            logger.debug('; '.join(['%s = %s' % (k, v) for k, v in header.items()]))
            algo_name = read_raw(fh)
            logger.debug('Algorithm name: %s', algo_name)
            algo_params = read_tag_val(fh)
            logger.debug('; '.join(['%s = %s' % (k, v) for k, v in algo_params.items()]))
            cell_margin = read_integer(fh)
            num_outlier_cells = read_DWORD(fh)
            num_masked_cells = read_DWORD(fh)
            num_subgrids = read_integer(fh)
            logger.debug('Cell margin: %d', cell_margin)
            logger.debug('# outlier cells: %d', num_outlier_cells)
            logger.debug('# masked cells: %d', num_masked_cells)
            logger.debug('# sub-grids: %d', num_subgrids)
            
            cells = []
            for j in range(num_cols):
                for i in range(num_rows):
                    cells.append(read_cell(fh))
            logger.debug('# cells: %d', len(cells))
            
            masked = []
            for i in range(num_masked_cells):
                masked.append(read_coords(fh))
                
            outliers = []
            for i in range(num_outlier_cells):
                outliers.append(read_coords(fh))
                
            subgrids = []
            for i in range(num_subgrids):
                subgrids.append(read_subgrid(fh))

            return np.float64([c[0] for c in cells])
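The read_tag_val trick of prefixing a dummy "[Section]" header lets ConfigParser parse the CEL header's flat key=value block; a standalone illustration with invented values:

from configparser import ConfigParser

raw = 'Cols=1050\nRows=1050\nAlgorithm=Percentile'
C = ConfigParser(interpolation=None, delimiters=('=',),
                 empty_lines_in_values=False)
C.optionxform = lambda x: x            # keep tag names case-sensitive
C.read_string('[Section]\n' + raw)
assert C['Section']['Cols'] == '1050'  # values come back as strings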
Example #9
    def parse_annotations(
            self, annotation_file, genes, db_sel='UniProtKB',
            select_evidence=None, exclude_evidence=None,
            exclude_ref=None, strip_species=False, ignore_case=False):
        """Parse a GO annotation file (in GAF 2.0 format).

        GO annotation files can be downloaded from the
        `UniProt-GOA download site`__ or from their `FTP server`__.

        __ goa_download_
        __ goa_ftp_

        .. _goa_download: http://www.ebi.ac.uk/GOA/downloads
        .. _goa_ftp: ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/

        Parameters
        ----------
        annotation_file: str
            Path of the annotation file (in GAF 2.0 format).
        genes: list or tuple of str
            List of valid gene names.
        db_sel: str, optional
            Select only annotations with this ``DB`` (column 1) value.
            If empty, disable filtering based on the ``DB`` value.
        select_evidence: list of str, optional
            Only include annotations with the given evidence codes.
            If not specified, allow all evidence codes, except for those listed
            in ``exclude_evidence``.
        exclude_evidence: list of str, optional
            Exclude all annotations with any of the given evidence codes.
            If ``select_evidence`` is specified, this parameter is ignored.
            If not specified, allow all evidence codes.
        exclude_ref: list of str, optional
            Exclude all annotations with the given DB:reference (column 6).
            Example: ``["PMID:2676709"]``. Note: This filter is currently
            ignored if an annotation has more than one reference.
        strip_species: bool, optional
            Undocumented.
        ignore_case: bool, optional
            Undocumented.

        Returns
        -------
        None
        """

        assert isinstance(annotation_file, str)
        assert isinstance(genes, (list, tuple))

        if not self.terms:
            raise ValueError('You need to first parse an OBO file!')

        if select_evidence is None:
            select_evidence = []

        if exclude_evidence is None:
            exclude_evidence = []

        if exclude_ref is None:
            exclude_ref = []

        # always overwrite all previously parsed annotations
        self.clear_annotation_data()

        # store genes
        self.genes = set(genes)  # store the list of genes for later use
        genes_upper = dict((g.upper(), g) for g in genes)
        logger.info('Read %d genes.', len(genes))

        # read annotations
        self.term_annotations = dict((id_, []) for id_ in self.terms)
        self.gene_annotations = dict((g, []) for g in self.genes)
        # gene_terms is used for statistics
        gene_terms = dict((g, set()) for g in self.genes)

        # isoform_pattern = re.compile(r"UniProtKB:([A-Z][0-9A-Z]{5}-\d+)")
        # gene_pattern = re.compile(r"[a-zA-Z0-9]+\.\d+$")
        # pmid_pattern = re.compile(r"(?:PMID:\d+|DOI:[^\s]+)")
        # uniprot_pattern = re.compile(r"UniProtKB:([A-Z][0-9A-Z]{5}(?:-\d+)?)")

        unknown_gene_names = Counter()
        unknown_gene_annotations = 0

        unknown_term_ids = Counter()
        unknown_term_annotations = 0

        # Parsing!
        logger.info('Parsing annotations...')
        n = 0
        excluded_evidence_annotations = 0
        excluded_reference_annotations = 0
        valid_annotations = 0
        with misc.smart_open_read(annotation_file, mode='rb',
                                  try_gzip=True) as fh:
            reader = csv.reader(fh, dialect='excel-tab', encoding='UTF-8')
            for i, l in enumerate(reader):
                # gene = None

                if not l:
                    continue
                if ((not db_sel) or l[0] == db_sel) and l[3] != 'NOT':
                    n += 1

                    # test if evidence code is excluded
                    if (select_evidence and l[6] not in select_evidence) \
                            or l[6] in exclude_evidence:
                        excluded_evidence_annotations += 1
                        continue

                    # test if reference is excluded
                    db_ref = []
                    if l[5]:
                        db_ref = l[5].split('|')
                        if len(db_ref) == 1 and db_ref[0] in exclude_ref:
                            excluded_reference_annotations += 1
                            continue
                            
                    # determine target gene
                    if not l[2]:
                        raise Exception('Missing target gene in line %d:\n%s'
                                        % (i+1, '\t'.join(l)))

                    gene = l[2]
                    # db = l[0]
                    db_id = l[1]
                    if strip_species:
                        try:
                            gene = gene[:gene.rindex('_')]
                        except ValueError:
                            pass

                    term_id = l[4]
                    evidence = l[6]

                    invalid = False

                    if (ignore_case and gene.upper() not in genes_upper) \
                            or ((not ignore_case) and gene not in self.genes):
                        unknown_gene_annotations += 1
                        unknown_gene_names[l[2]] += 1
                        invalid = True

                    if term_id not in self.terms:
                        unknown_term_annotations += 1
                        unknown_term_ids[term_id] += 1
                        invalid = True

                    if not invalid:
                
                        valid_annotations += 1

                        # if ignore_case, convert gene to "original" name
                        if ignore_case:
                            gene = genes_upper[gene.upper()]

                        term = self.terms[term_id]

                        # parse secondary information
                        # (associated UniProt and PubMed entries)
                        # pmid = pmid_pattern.search(l[5])
                        # if pmid is not None: pmid = pmid.group(0)
                        # uniprot = uniprot_pattern.search(l[7])
                        # if uniprot is not None: uniprot = uniprot.group(1)
                        with_ = []
                        if l[7]:
                            with_ = l[7].split('|')

                        # generate annotation
                        ann = GOAnnotation(
                            gene=gene, term=term,
                            evidence=evidence, db_id=db_id,
                            db_ref=db_ref, with_=with_)

                        # add annotation to global list
                        self.annotations.append(ann)

                        # add annotation under term ID
                        self.term_annotations[term_id].append(ann)

                        # add annotation under gene
                        self.gene_annotations[gene].append(ann)
                        gene_terms[gene].add(term_id)

        # output some statistics
        if n > 0:
            logger.info('Parsed %d positive GO annotations '
                        '(%d = %.1f%% excluded based on evidence type).',
                        n, excluded_evidence_annotations,
                        100*(excluded_evidence_annotations/float(n)))

        if unknown_gene_annotations > 0:
            logger.warning('Warning: %d annotations with %d unknown gene '
                           'names.',
                           unknown_gene_annotations, len(unknown_gene_names))

        if unknown_term_annotations > 0:
            logger.warning('Warning: %d annotations with %d unknown term IDs.',
                           unknown_term_annotations, len(unknown_term_ids))

        logger.info('Found a total of %d valid annotations.',
                    valid_annotations)

        logger.info('%d unique Gene-Term associations.',
                    sum(len(gene_terms[g]) for g in genes))
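For reference, the column indices used above follow the GAF 2.0 layout:

# GAF 2.0 columns as indexed by the parser above (0-based):
#   l[0] DB              l[4] GO ID
#   l[1] DB object ID    l[5] DB:Reference(s)
#   l[2] object symbol   l[6] evidence code
#   l[3] qualifier       l[7] with (or) from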
Example #10
File: cel.py Project: maximz/pyaffy
    def read_cel(cls, path):
        """Parser for CEL files in Command Console generic data file format.

        This is a binary format.
        """
        
        read = [0]
        
        decode_unicode = lambda s: codecs.decode(s, 'UTF-16-BE')
        decode_ascii = lambda s: codecs.decode(s, 'ascii')
        decode_float = lambda s: struct.unpack('>f', s)[0]
        decode_int32 = lambda s: struct.unpack('>i', s)[0]
        decode_uint32 = lambda s: struct.unpack('>I', s)[0]
        decode_int8 = lambda s: struct.unpack('>b', s)[0]
        decode_uint8 = lambda s: struct.unpack('>B', s)[0]
        decode_int16 = lambda s: struct.unpack('>h', s)[0]
        decode_uint16 = lambda s: struct.unpack('>H', s)[0]

        def read_float(fh):
            read[0] += 4
            return decode_float(fh.read(4))

        def read_byte(fh):
            read[0] += 1
            return decode_int8(fh.read(1))

        def read_ubyte(fh):
            read[0] += 1
            return decode_uint8(fh.read(1))

        def read_short(fh):
            read[0] += 2
            return decode_int16(fh.read(2))

        def read_ushort(fh):
            read[0] += 2
            return decode_uint16(fh.read(2))

        def read_int(fh):
            read[0] += 4
            return decode_int32(fh.read(4))

        def read_uint(fh):
            read[0] += 4
            return decode_uint32(fh.read(4))

        def read_raw(fh):
            # reads an int x and then x raw bytes
            bytes = read_int(fh)
            read[0] += bytes
            return fh.read(bytes)

        def read_string(fh):
            strlen = read_int(fh)
            read[0] += strlen
            return fh.read(strlen)

        def read_wstring(fh):
            strlen = read_int(fh)
            s = fh.read(2 * strlen)
            read[0] += (2 * strlen)
            return decode_unicode(s)

        def read_guid(fh):
            return read_string(fh)

        def read_datetime(fh):
            s = read_wstring(fh)
            logger.debug('DateTime string: %s|||', s)
            dt = None
            if s:
                dt = dateutil.parser.parse(s).replace(tzinfo = None)
            #s = u'2015-02-20T13:52:11Z'
            #print s
            return dt

        def read_locale(fh):
            loc = read_wstring(fh)
            return loc[:2], loc[3:]

        def read_value(fh):
            raw = read_raw(fh)
            return raw

        def read_type(fh):
            return read_wstring(fh)

        def read_file_header(fh):
            magic_number = read_ubyte(fh)
            logger.debug('Magic number: %d', magic_number)
            assert isinstance(magic_number, int) and magic_number == 59
            version_number = read_ubyte(fh)
            assert version_number == 1
            num_data_groups = read_int(fh)
            assert isinstance(num_data_groups, int)
            logger.debug('# data groups: %d', num_data_groups)
            first_data_group_pos = read_uint(fh)
            return num_data_groups, first_data_group_pos

        def read_header_param(fh):
            v1 = read_wstring(fh)
            v2 = read_value(fh)
            v3 = read_type(fh)

            if v3 == 'text/plain':
                v2 = decode_unicode(v2.rstrip('\x00'))
            elif v3 == 'text/ascii':
                v2 = decode_ascii(v2.rstrip('\x00'))
            elif v3 == 'text/x-calvin-float':
                v2 = decode_float(v2[:4])
            elif v3 == 'text/x-calvin-integer-32':
                v2 = decode_int32(v2[:4])
            elif v3 == 'text/x-calvin-unsigned-integer-32':
                v2 = decode_uint32(v2[:4])
            elif v3 == 'text/x-calvin-unsigned-integer-8':
                v2 = decode_uint8(v2[:1])
            elif v3 == 'text/x-calvin-unsigned-integer-16':
                v2 = decode_uint16(v2[:2])
            elif v3 == 'text/x-calvin-integer-8':
                v2 = decode_int8(v2[:1])
            elif v3 == 'text/x-calvin-integer-16':
                v2 = decode_int16(v2[:2])

            return (v1, v2)

        def read_data_header(fh):
            data_type_id = read_guid(fh)
            logger.debug('Data type identifier: %s', data_type_id)
            file_id = read_guid(fh)
            logger.debug('File identifier: %s', file_id)
            creation_time = read_datetime(fh)
            #print creation_time
            iso639, iso3166 = read_locale(fh)
            locale = '-'.join([iso639, iso3166])
            n_params = read_int(fh)
            logger.debug('Number of parameters (name/value/type triplets): %d', n_params)
            params = OrderedDict()
            for i in range(n_params):
                params.update([read_header_param(fh)])

            num_parents = read_int(fh)
            logger.debug('Number of parent file headers: %d', num_parents)
            
            parent_headers = []
            for i in range(num_parents):
                logger.debug('')
                logger.debug('-------------------------------------------')
                parent_headers.append(read_data_header(fh))

            header = CELHeader(data_type_id, file_id, creation_time, locale, params, parent_headers)
            return header

        def read_col(fh):
            name = read_wstring(fh)
            valtype = read_byte(fh)
            size = read_int(fh)
            return (name, valtype, size)

        def read_data_set(fh):
            
            VALUE_TYPES = [read_byte, read_ubyte, read_short, read_ushort, read_int, read_uint,
                    read_float, read_string, read_wstring]
            
            data_pos = read_uint(fh)
            next_pos = read_uint(fh)
            data_size = next_pos - data_pos
            name = read_wstring(fh)
            n_params = read_int(fh)
            logger.debug('DataSet / data position: %d', data_pos)
            logger.debug('DataSet / next position: %d', next_pos)
            logger.debug('DataSet / data size: %d', data_size)
            logger.debug('DataSet / name: %s', name)
            logger.debug('DataSet / # parameters: %d', n_params)
            params = OrderedDict()
            for i in range(n_params):
                params.update([read_header_param(fh)])
            n_cols = read_uint(fh)
            logger.debug('DataSet / # cols: %d', n_cols)
            cols = []
            for i in range(n_cols):
                cols.append(read_col(fh))
            col_names = [c[0] for c in cols]
            n_rows = read_uint(fh)
            logger.debug('DataSet / # rows: %d', n_rows)
            data = []
            for i in range(n_rows):
                d = []
                for c in cols:
                    d.append(VALUE_TYPES[c[1]](fh))
                data.append(tuple(d))
            ds = CELDataSet(name, params, col_names, data)
            return ds, next_pos

        def read_data_group(fh):
            next_pos = read_uint(fh)
            dataset_pos = read_uint(fh)
            n_datasets = read_int(fh)
            name = read_wstring(fh)
            logger.debug('# data sets within the group: %d', n_datasets)
            logger.debug('Position of first data set within the group: %d', dataset_pos)
            logger.debug('Position of next data group: %d', next_pos)
            logger.debug('Bytes read up until this point: %d', read[0])
            assert dataset_pos == read[0]
            datasets = []
            for i in range(n_datasets):
                ds, next_pos = read_data_set(fh)
                logger.debug('Position of next data set: %d', next_pos)
                logger.debug('Bytes read up until this point: %d', read[0])
                assert (next_pos - read[0]) in [0, 1]
                if next_pos - read[0] == 1:
                    logger.warning('Skipping one byte (%d) between data sets.', ord(fh.read(1)))
                    read[0] += 1
                assert next_pos == read[0]
                datasets.append(ds)
            group = CELDataGroup(name, datasets)
            return group

        header = None
        data_groups = []
        with misc.smart_open_read(path, 'rb', try_gzip = True) as fh:
            num_data_groups, data_pos = read_file_header(fh)
            #assert n_data_groups == 1 # for expression CEL file
            header = read_data_header(fh)
            logger.info('# data groups: %d', num_data_groups)
            assert data_pos == read[0] # position of the first data group
            data_groups = []
            for i in range(num_data_groups):
                data_groups.append(read_data_group(fh))
        return cls(header, data_groups)
Example #11
def main(args=None):
    """Extract all exon annotations of protein-coding genes."""

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream,
                             log_file=log_file,
                             quiet=quiet,
                             verbose=verbose)

    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
                chrom_pat.pattern)

    chromosomes = set()
    excluded_chromosomes = set()
    i = 0
    exons = 0
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode = 'rb', try_gzip = True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        #if i >= 500000: break
        reader = csv.reader(fh, dialect='excel-tab')
        writer = csv.writer(ofh,
                            dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE,
                            quotechar='|')
        for l in reader:
            i += 1
            #if i % int(1e5) == 0:
            #   print '\r%d...' %(i), ; sys.stdout.flush() # report progress
            if len(l) > 1 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']
                if type_ in ['protein_coding', 'polymorphic_pseudogene']:

                    # test whether chromosome is valid
                    chrom = l[0]
                    m = chrom_pat.match(chrom)
                    if m is None:
                        excluded_chromosomes.add(chrom)
                        continue

                    chromosomes.add(chrom)
                    writer.writerow(l)
                    exons += 1

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t' + ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t' + ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', exons)

    return 0
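parse_attributes (imported by these scripts) is assumed to turn the GTF attribute column (column 9, e.g. gene_id "ENSG..."; gene_biotype "protein_coding";) into a dict; a minimal sketch of such a helper:

import re

_GTF_ATTR = re.compile(r'(\w+) "([^"]*)"')

def parse_attributes_sketch(s):
    # Collect every `name "value"` pair from a GTF attribute string.
    return dict(_GTF_ATTR.findall(s))

attr = parse_attributes_sketch(
    'gene_id "ENSG00000139618"; gene_biotype "protein_coding";')
assert attr['gene_biotype'] == 'protein_coding'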
Example #12
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream = log_stream, log_file = log_file,
            quiet = quiet, verbose = verbose)

    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
            chrom_pat.pattern)

    # for statistics
    types = Counter()
    sources = Counter()

    # primary information
    genes = Counter()
    gene_chroms = dict()
    gene_ids = dict()

    # secondary information
    genes2 = Counter()
    polymorphic = set()

    # list of chromosomes
    chromosomes = set()
    excluded_chromosomes = set()

    transcripts = {}
    gene_id = None
    gene_name = None

    i = 0
    missing = 0
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode = 'rb', try_gzip = True) as fh:
        #if i >= 500000: break
        reader = csv.reader(fh, dialect = 'excel-tab')
        for l in reader:

            i += 1
            #if i % int(1e5) == 0:
            #   print '\r%d...' %(i), ; sys.stdout.flush() # report progress

            if len(l) > 1 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']

                if type_ not in ['protein_coding', 'polymorphic_pseudogene']:
                    continue

                chrom = l[0]

                # test whether chromosome is valid
                m = chrom_pat.match(chrom)
                if m is None:
                    excluded_chromosomes.add(chrom)
                    continue

                chromosomes.add(m.group())

                source = l[1]
                gene_id = attr['gene_id']
                try:
                    gene_name = attr['gene_name']
                except KeyError as e:
                    missing += 1
                    continue

                if gene_id in genes:
                    if genes[gene_id] != gene_name:
                        raise ValueError('Ensembl ID "%s" ' %(gene_id) +
                                'associated with multiple gene symbols.')
                else:
                    genes[gene_id] = gene_name

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    n = len(genes)
    m = len(set(genes.values()))

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of gene IDs: %d', n)
    logger.info('No. of gene names: %d', m)

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect = 'excel-tab',
                lineterminator = os.linesep, quoting = csv.QUOTE_NONE)
        for g in sorted(genes.keys()):
            writer.writerow([g, genes[g]])

    return 0
Example #13
    def _parse_cel_v4_intensities(self):
        # Version 4 format, http://media.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/cel.html#V4
        # data encoding is little endian

        read = [0]

        def decode_unicode(s):
            return codecs.decode(s, 'UTF-16-BE')

        def decode_ascii(s):
            return codecs.decode(s, 'ascii')

        def decode_float(bytes):
            # this is an actual float, not a double!
            return struct.unpack('<f', bytes)[0]

        def decode_int32(bytes):
            return struct.unpack('<i', bytes)[0]

        def decode_uint32(bytes):
            return struct.unpack('<I', bytes)[0]

        def decode_int8(bytes):
            return struct.unpack('<b', bytes)[0]

        def decode_uint8(bytes):
            return struct.unpack('<B', bytes)[0]

        def decode_int16(bytes):
            return struct.unpack('<h', bytes)[0]

        def decode_uint16(bytes):
            return struct.unpack('<H', bytes)[0]

        def read_float(fh):
            read[0] += 4
            return decode_float(fh.read(4))

        def read_short(fh):
            read[0] += 2
            return decode_int16(fh.read(2))

        def read_integer(fh):
            read[0] += 4
            return decode_int32(fh.read(4))

        def read_DWORD(fh):
            read[0] += 4
            return decode_uint32(fh.read(4))

        def read_raw(fh):
            # reads an int x and then x raw bytes
            bytes = read_integer(fh)
            read[0] += bytes
            return fh.read(bytes)

        def read_tag_val(fh):
            """Returns an OrderedDict containing tag-value entries."""
            raw = codecs.decode(read_raw(fh), encoding='iso-8859-1')
            logger.debug('Tag/Value string:\n%s', raw)
            try:
                C = ConfigParser(interpolation = None, delimiters = ('=',), empty_lines_in_values = False)
                C.optionxform = lambda x: x
                C.read_string('[Section]\n' + raw)
            except ParsingError:
                C = ConfigParser(interpolation = None, delimiters = (':',), empty_lines_in_values = False)
                C.optionxform = lambda x: x
                C.read_string('[Section]\n' + '\n'.join(raw.split(';')))
                
            return C['Section']

        def read_cell(fh):
            intensity = read_float(fh)
            intensity_std = read_float(fh)
            pixel_count = read_short(fh)
            return (intensity, intensity_std, pixel_count)

        def read_coords(fh):
            x = read_short(fh)
            y = read_short(fh)
            return (x, y)

        def read_subgrid(fh):
            num_rows = read_integer(fh)
            num_cols = read_integer(fh)
            upper_left_x = read_float(fh)
            upper_left_y = read_float(fh)
            upper_right_x = read_float(fh)
            upper_right_y = read_float(fh)
            lower_left_x = read_float(fh)
            lower_left_y = read_float(fh)
            lower_right_x = read_float(fh)
            lower_right_y = read_float(fh)
            left = read_integer(fh)
            top = read_integer(fh)
            right = read_integer(fh)
            bottom = read_integer(fh)
            return (num_rows, num_cols, upper_left_x, upper_left_y, upper_right_x, upper_right_y,
                    lower_left_x, lower_left_y, lower_right_x, lower_right_y, left, top, right, bottom)

        with misc.smart_open_read(self.path, mode = 'rb', try_gzip = True) as fh:
            #read_file_header(fh)
            
            magic_number = read_integer(fh)
            assert isinstance(magic_number, int) and magic_number == 64
            version_number = read_integer(fh)
            assert version_number == 4
            num_cols = read_integer(fh)
            num_rows = read_integer(fh)
            num_cells = read_integer(fh)
            logger.debug('Number of rows: %d', num_rows)
            logger.debug('Number of cols: %d', num_cols)
            header = read_tag_val(fh)
            logger.debug('; '.join(['%s = %s' %(k,v) for k,v in header.items()]))
            algo_name = read_raw(fh)
            logger.debug('Algorithm name: %s', algo_name)
            algo_params = read_tag_val(fh)
            logger.debug('; '.join(['%s = %s' %(k,v) for k,v in algo_params.items()]))
            cell_margin = read_integer(fh)
            num_outlier_cells = read_DWORD(fh)
            num_masked_cells = read_DWORD(fh)
            num_subgrids = read_integer(fh)
            logger.debug('Cell margin: %d', cell_margin)
            logger.debug('# outlier cells: %d', num_outlier_cells)
            logger.debug('# masked cells: %d', num_masked_cells)
            logger.debug('# sub-grids: %d', num_subgrids)
            
            cells = []
            for j in range(num_cols):
                for i in range(num_rows):
                    cells.append(read_cell(fh))
            logger.debug('# cells: %d', len(cells))
            
            masked = []
            for i in range(num_masked_cells):
                masked.append(read_coords(fh))
                
            outliers = []
            for i in range(num_outlier_cells):
                outliers.append(read_coords(fh))
                
            subgrids = []
            for i in range(num_subgrids):
                subgrids.append(read_subgrid(fh))

            return np.float64([c[0] for c in cells])
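A side note on the read = [0] pattern used by all the CEL parsers above: mutating a one-element list is the Python 2 idiom for a byte counter shared with nested reader functions; in Python 3 the same bookkeeping can use nonlocal, as in this sketch:

def make_byte_counter():
    count = 0
    def advance(n):
        nonlocal count  # Python 3 replacement for the read = [0] trick
        count += n
        return count
    return advance

advance = make_byte_counter()
advance(4)  # counter is now 4
advance(2)  # counter is now 6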
Example #14
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """

    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream = log_stream, log_file = log_file,
            quiet = quiet, verbose = verbose)

    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
            chrom_pat.pattern)

    # for statistics
    types = Counter()
    sources = Counter()

    # primary information
    genes = Counter()
    gene_chroms = dict()
    gene_ids = dict()

    # secondary information
    genes2 = Counter()
    polymorphic = set()

    # list of chromosomes
    chromosomes = set()
    excluded_chromosomes = set()

    transcripts = {}
    gene_id = None
    gene_name = None

    i = 0
    missing = 0
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode = 'rb', try_gzip = True) as fh:
        #if i >= 500000: break
        reader = csv.reader(fh, dialect = 'excel-tab')
        for l in reader:

            i += 1
            #if i % int(1e5) == 0:
            #   print '\r%d...' %(i), ; sys.stdout.flush() # report progress

            if len(l) > 1 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']

                if type_ not in ['protein_coding', 'polymorphic_pseudogene']:
                    continue

                chrom = l[0]

                # test whether chromosome is valid
                m = chrom_pat.match(chrom)
                if m is None:
                    excluded_chromosomes.add(chrom)
                    continue

                chromosomes.add(m.group())

                source = l[1]
                gene_id = attr['gene_id']
                try:
                    gene_name = attr['gene_name']
                except KeyError as e:
                    missing += 1
                    continue

                if gene_id in genes:
                    if genes[gene_id] != gene_name:
                        raise ValueError('Ensembl ID "%s" ' %(gene_id) +
                                'associated with multiple gene symbols.')
                else:
                    genes[gene_id] = gene_name

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    n = len(genes)
    m = len(set(genes.values()))

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of gene IDs: %d', n)
    logger.info('No. of gene names: %d', m)

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect = 'excel-tab',
                lineterminator = os.linesep, quoting = csv.QUOTE_NONE)
        for g in sorted(genes.keys()):
            writer.writerow([g, genes[g]])

    return 0
Example #15
def main(args=None):
    """Extract all exon annotations of protein-coding genes."""

    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name

    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream = log_stream, log_file = log_file,
            quiet = quiet, verbose = verbose)

    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: "%s"',
            chrom_pat.pattern)

    chromosomes = set()
    excluded_chromosomes = set()
    i = 0
    exons = 0
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode = 'rb', try_gzip = True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        #if i >= 500000: break
        reader = csv.reader(fh, dialect = 'excel-tab')
        writer = csv.writer(ofh, dialect = 'excel-tab', lineterminator = os.linesep,
                quoting = csv.QUOTE_NONE , quotechar = '|')
        for l in reader:
            i += 1
            #if i % int(1e5) == 0:
            #   print '\r%d...' %(i), ; sys.stdout.flush() # report progress
            if len(l) > 1 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']
                if type_ in ['protein_coding','polymorphic_pseudogene']:

                    # test whether chromosome is valid
                    chrom = l[0]
                    m = chrom_pat.match(chrom)
                    if m is None:
                        excluded_chromosomes.add(chrom)
                        continue

                    chromosomes.add(chrom)
                    writer.writerow(l)
                    exons += 1

    logger.info('Done! (Parsed %d lines.)', i)

    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t' + ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t' + ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', exons)

    return 0