def parse_intensities(self):
    """Determine the CEL file format version and parse the intensities."""
    # check version
    version = None
    with misc.smart_open_read(self.path, mode='rb', try_gzip=True) as fh:
        v = ord(fh.read(1))  # look at first byte
        if v == 59:
            # command console generic (CCG) file format (magic byte 59)
            version = 'CCG'
        elif v == 64:
            # binary CEL format, version 4
            # (first byte of the little-endian int32 magic number 64)
            version = 4
    assert version in (4, 'CCG')
    logger.info('Detected CEL file format version "%s".', str(version))

    y = None
    if version == 'CCG':
        cel = CEL.read_cel(self.path)
        ds = cel.data_groups[0].get_data_set_by_name('Intensity')
        y = np.float64(ds.data)[:, 0]
    elif version == 4:
        y = self._parse_cel_v4_intensities()
    return y
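# Minimal usage sketch (hedged): assumes a CEL-sample class exposing
# parse_intensities() as defined above; the 'CELSample' name and the file
# path are illustrative, not part of this module.
#
#   sample = CELSample('sample.CEL.gz')
#   y = sample.parse_intensities()   # 1-D np.float64 array of intensities
#   print(y.size, y.mean())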
def main(args=None):
    """Script body."""
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    fasta_file = args.fasta_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    output_file = args.output_file
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    # generate regular expression object from the chromosome pattern
    if chrom_pat is None:
        chrom_pat = ensembl.SPECIES_CHROMPAT[species]
    chrom_re = re.compile(chrom_pat)

    # filter the FASTA file
    # note: each chromosome sequence is temporarily read into memory,
    #       so this script has a large memory footprint
    with misc.smart_open_read(fasta_file, mode='r', encoding='ascii',
                              try_gzip=True) as fh, \
            misc.smart_open_write(output_file, mode='w',
                                  encoding='ascii') as ofh:
        reader = FastaReader(fh)
        for seq in reader:
            chrom = seq.name.split(' ', 1)[0]
            if chrom_re.match(chrom) is None:
                logger.info('Ignoring chromosome "%s"...', chrom)
                continue
            seq.name = chrom
            seq.append_fasta(ofh)

    return 0
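# Minimal programmatic usage sketch: the Namespace attributes mirror exactly
# the args.* fields read above; constructing the Namespace by hand, the file
# names, and the 'human' species key are assumptions for illustration.
#
#   from argparse import Namespace
#   args = Namespace(fasta_file='genome.fa.gz', species='human',
#                    chromosome_pattern=None, output_file='filtered.fa',
#                    log_file=None, quiet=False, verbose=False)
#   sys.exit(main(args))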
def read_gene2acc(file_path, logger):
    """Extract an Entrez ID -> gene symbol mapping from a gene2accession file.

    Parameters
    ----------
    file_path: str
        The path of the gene2accession.gz file (or a filtered version
        thereof). The file may be gzip'ed.
    logger: logging.Logger
        The logger to use for reporting statistics.

    Returns
    -------
    dict
        A mapping of Entrez IDs to gene symbols.
    """
    gene2acc = {}
    with misc.smart_open_read(file_path, mode='rb', try_gzip=True) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        next(reader)  # skip header
        for i, l in enumerate(reader):
            id_ = int(l[1])
            symbol = l[15]
            try:
                gene2acc[id_].append(symbol)
            except KeyError:
                gene2acc[id_] = [symbol]

    # make sure each Entrez ID maps to a unique gene symbol
    n = len(gene2acc)
    for k, v in gene2acc.items():
        symbols = sorted(set(v))
        assert len(symbols) == 1
        gene2acc[k] = symbols[0]

    all_symbols = sorted(set(gene2acc.values()))
    m = len(all_symbols)
    logger.info('Found %d Entrez Gene IDs associated with %d gene symbols.',
                n, m)
    return gene2acc
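# Minimal usage sketch (the file name is illustrative; gene2accession.gz can
# be obtained from the NCBI FTP site):
#
#   import logging
#   logging.basicConfig(level=logging.INFO)
#   gene2acc = read_gene2acc('gene2accession.gz',
#                            logging.getLogger(__name__))
#   print(gene2acc[7157])  # Entrez ID 7157 corresponds to TP53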
def get_gaf_gene_ontology_file(path):
    """Extract the gene ontology file associated with a GO annotation file.

    Parameters
    ----------
    path: str
        The path name of the GO annotation file.

    Returns
    -------
    str
        The URL of the associated gene ontology file.
    """
    assert isinstance(path, str)

    version = None
    with misc.smart_open_read(path, encoding='UTF-8', try_gzip=True) as fh:
        for l in fh:
            if l[0] != '!':
                # the header (lines starting with "!") has ended
                break
            if l.startswith('!GO-version:'):
                version = l.split(' ')[1].rstrip()
                break
    return version
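# Sketch of the header line this function looks for (the release date and
# URL below are illustrative):
#
#   !GO-version: http://purl.obolibrary.org/obo/go/releases/2015-02-06/go.owl
#
#   url = get_gaf_gene_ontology_file('goa_human.gaf.gz')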
def read_cel(cls, path):
    """Parser for CEL files in Command Console generic data file format.

    This is a binary format.
    """
    read = [0]  # number of bytes read so far (mutable closure state)

    decode_unicode = lambda s: codecs.decode(s, 'UTF-16-BE')
    decode_ascii = lambda s: codecs.decode(s, 'ascii')
    decode_float = lambda s: struct.unpack('>f', s)[0]
    decode_int32 = lambda s: struct.unpack('>i', s)[0]
    decode_uint32 = lambda s: struct.unpack('>I', s)[0]
    decode_int8 = lambda s: struct.unpack('>b', s)[0]
    decode_uint8 = lambda s: struct.unpack('>B', s)[0]
    decode_int16 = lambda s: struct.unpack('>h', s)[0]
    decode_uint16 = lambda s: struct.unpack('>H', s)[0]

    def read_float(fh):
        read[0] += 4
        return decode_float(fh.read(4))

    def read_byte(fh):
        read[0] += 1
        return decode_int8(fh.read(1))

    def read_ubyte(fh):
        read[0] += 1
        return decode_uint8(fh.read(1))

    def read_short(fh):
        read[0] += 2
        return decode_int16(fh.read(2))

    def read_ushort(fh):
        read[0] += 2
        return decode_uint16(fh.read(2))

    def read_int(fh):
        read[0] += 4
        return decode_int32(fh.read(4))

    def read_uint(fh):
        read[0] += 4
        return decode_uint32(fh.read(4))

    def read_raw(fh):
        # reads an int x and then x raw bytes
        num_bytes = read_int(fh)
        read[0] += num_bytes
        return fh.read(num_bytes)

    def read_string(fh):
        strlen = read_int(fh)
        read[0] += strlen
        return fh.read(strlen)

    def read_wstring(fh):
        strlen = read_int(fh)
        s = fh.read(2 * strlen)
        read[0] += (2 * strlen)
        return decode_unicode(s)

    def read_guid(fh):
        return read_string(fh)

    def read_datetime(fh):
        s = read_wstring(fh)
        logger.debug('DateTime string: %s', s)
        dt = None
        if s:
            # e.g., u'2015-02-20T13:52:11Z'
            dt = dateutil.parser.parse(s).replace(tzinfo=None)
        return dt

    def read_locale(fh):
        loc = read_wstring(fh)
        return loc[:2], loc[3:]

    def read_value(fh):
        raw = read_raw(fh)
        return raw

    def read_type(fh):
        return read_wstring(fh)

    def read_file_header(fh):
        magic_number = read_ubyte(fh)
        logger.debug('Magic number: %d', magic_number)
        assert isinstance(magic_number, int) and magic_number == 59
        version_number = read_ubyte(fh)
        assert version_number == 1
        num_data_groups = read_int(fh)
        assert isinstance(num_data_groups, int)
        logger.debug('# data groups: %d', num_data_groups)
        first_data_group_pos = read_uint(fh)
        return num_data_groups, first_data_group_pos

    def read_header_param(fh):
        name = read_wstring(fh)
        value = read_value(fh)
        type_ = read_type(fh)
        if type_ == 'text/plain':
            value = decode_unicode(value.rstrip(b'\x00'))
        elif type_ == 'text/ascii':
            value = decode_ascii(value.rstrip(b'\x00'))
        elif type_ == 'text/x-calvin-float':
            value = decode_float(value[:4])
        elif type_ == 'text/x-calvin-integer-32':
            value = decode_int32(value[:4])
        elif type_ == 'text/x-calvin-unsigned-integer-32':
            value = decode_uint32(value[:4])
        elif type_ == 'text/x-calvin-unsigned-integer-8':
            value = decode_uint8(value[:1])
        elif type_ == 'text/x-calvin-unsigned-integer-16':
            value = decode_uint16(value[:2])
        elif type_ == 'text/x-calvin-integer-8':
            value = decode_int8(value[:1])
        elif type_ == 'text/x-calvin-integer-16':
            value = decode_int16(value[:2])
        return (name, value)

    def read_data_header(fh):
        data_type_id = read_guid(fh)
        logger.debug('Data type identifier: %s', data_type_id)
        file_id = read_guid(fh)
        logger.debug('File identifier: %s', file_id)
        creation_time = read_datetime(fh)
        iso639, iso3166 = read_locale(fh)
        locale = '-'.join([iso639, iso3166])
        n_params = read_int(fh)
        logger.debug('Number of parameters (name/value/type triplets): %d',
                     n_params)
        params = OrderedDict()
        for i in range(n_params):
            params.update([read_header_param(fh)])
        num_parents = read_int(fh)
        logger.debug('Number of parent file headers: %d', num_parents)
        parent_headers = []
        for i in range(num_parents):
            logger.debug('-------------------------------------------')
            parent_headers.append(read_data_header(fh))
        header = CELHeader(data_type_id, file_id, creation_time, locale,
                           params, parent_headers)
        return header

    def read_col(fh):
        name = read_wstring(fh)
        valtype = read_byte(fh)
        size = read_int(fh)
        return (name, valtype, size)

    def read_data_set(fh):
        # value type codes index into this list of reader functions
        VALUE_TYPES = [read_byte, read_ubyte, read_short, read_ushort,
                       read_int, read_uint, read_float, read_string,
                       read_wstring]
        data_pos = read_uint(fh)
        next_pos = read_uint(fh)
        data_size = next_pos - data_pos
        name = read_wstring(fh)
        n_params = read_int(fh)
        logger.debug('DataSet / data position: %d', data_pos)
        logger.debug('DataSet / next position: %d', next_pos)
        logger.debug('DataSet / data size: %d', data_size)
        logger.debug('DataSet / name: %s', name)
        logger.debug('DataSet / # parameters: %d', n_params)
        params = OrderedDict()
        for i in range(n_params):
            params.update([read_header_param(fh)])
        n_cols = read_uint(fh)
        logger.debug('DataSet / # cols: %d', n_cols)
        cols = []
        for i in range(n_cols):
            cols.append(read_col(fh))
        col_names = [c[0] for c in cols]
        n_rows = read_uint(fh)
        logger.debug('DataSet / # rows: %d', n_rows)
        data = []
        for i in range(n_rows):
            d = []
            for c in cols:
                d.append(VALUE_TYPES[c[1]](fh))
            data.append(tuple(d))
        ds = CELDataSet(name, params, col_names, data)
        return ds, next_pos

    def read_data_group(fh):
        next_pos = read_uint(fh)
        dataset_pos = read_uint(fh)
        n_datasets = read_int(fh)
        name = read_wstring(fh)
        logger.debug('# data sets within the group: %d', n_datasets)
        logger.debug('Position of first data set within the group: %d',
                     dataset_pos)
        logger.debug('Position of next data group: %d', next_pos)
        logger.debug('Bytes read up until this point: %d', read[0])
        assert dataset_pos == read[0]
        datasets = []
        for i in range(n_datasets):
            ds, next_pos = read_data_set(fh)
            logger.debug('Position of next data set: %d', next_pos)
            logger.debug('Bytes read up until this point: %d', read[0])
            assert (next_pos - read[0]) in (0, 1)
            if next_pos - read[0] == 1:
                # some files contain a single padding byte between data sets
                logger.warning('Skipping one byte (%d) between data sets.',
                               ord(fh.read(1)))
                read[0] += 1
            assert next_pos == read[0]
            datasets.append(ds)
        group = CELDataGroup(name, datasets)
        return group

    header = None
    data_groups = []
    with misc.smart_open_read(path, 'rb', try_gzip=True) as fh:
        num_data_groups, data_pos = read_file_header(fh)
        # assert num_data_groups == 1  # would hold for expression CEL files
        header = read_data_header(fh)
        logger.info('# data groups: %d', num_data_groups)
        # we should now be at the position of the first data group
        assert data_pos == read[0]
        data_groups = []
        for i in range(num_data_groups):
            data_groups.append(read_data_group(fh))

    return cls(header, data_groups)
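# Minimal usage sketch (the CEL class name and the 'Intensity' data set
# follow the usage in parse_intensities() above; the file name is
# illustrative):
#
#   cel = CEL.read_cel('sample.CEL')
#   ds = cel.data_groups[0].get_data_set_by_name('Intensity')
#   intensities = np.float64(ds.data)[:, 0]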
def _parse_cel_v4_intensities(self):
    # Version 4 format, see:
    # http://media.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/cel.html#V4
    # data encoding is little-endian
    read = [0]  # number of bytes read so far (mutable closure state)

    def decode_unicode(s):
        return codecs.decode(s, 'UTF-16-BE')

    def decode_ascii(s):
        return codecs.decode(s, 'ascii')

    def decode_float(b):
        # this is an actual float, not a double!
        return struct.unpack('<f', b)[0]

    def decode_int32(b):
        return struct.unpack('<i', b)[0]

    def decode_uint32(b):
        return struct.unpack('<I', b)[0]

    def decode_int8(b):
        return struct.unpack('<b', b)[0]

    def decode_uint8(b):
        return struct.unpack('<B', b)[0]

    def decode_int16(b):
        return struct.unpack('<h', b)[0]

    def decode_uint16(b):
        return struct.unpack('<H', b)[0]

    def read_float(fh):
        read[0] += 4
        return decode_float(fh.read(4))

    def read_short(fh):
        read[0] += 2
        return decode_int16(fh.read(2))

    def read_integer(fh):
        read[0] += 4
        return decode_int32(fh.read(4))

    def read_DWORD(fh):
        read[0] += 4
        return decode_uint32(fh.read(4))

    def read_raw(fh):
        # reads an int x and then x raw bytes
        num_bytes = read_integer(fh)
        read[0] += num_bytes
        return fh.read(num_bytes)

    def read_tag_val(fh):
        """Returns a mapping (config section) of tag-value entries."""
        raw = codecs.decode(read_raw(fh), encoding='iso-8859-1')
        logger.debug('Tag/Value string:\n%s', raw)
        try:
            # entries are usually "tag=value" lines
            C = ConfigParser(interpolation=None, delimiters=('=',),
                             empty_lines_in_values=False)
            C.optionxform = lambda x: x  # preserve case
            C.read_string('[Section]\n' + raw)
        except ParsingError:
            # fall back to ";"-separated "tag:value" entries
            C = ConfigParser(interpolation=None, delimiters=(':',),
                             empty_lines_in_values=False)
            C.optionxform = lambda x: x
            C.read_string('[Section]\n' + '\n'.join(raw.split(';')))
        return C['Section']

    def read_cell(fh):
        intensity = read_float(fh)
        intensity_std = read_float(fh)
        pixel_count = read_short(fh)
        return (intensity, intensity_std, pixel_count)

    def read_coords(fh):
        x = read_short(fh)
        y = read_short(fh)
        return (x, y)

    def read_subgrid(fh):
        num_rows = read_integer(fh)
        num_cols = read_integer(fh)
        upper_left_x = read_float(fh)
        upper_left_y = read_float(fh)
        upper_right_x = read_float(fh)
        upper_right_y = read_float(fh)
        lower_left_x = read_float(fh)
        lower_left_y = read_float(fh)
        lower_right_x = read_float(fh)
        lower_right_y = read_float(fh)
        left = read_integer(fh)
        top = read_integer(fh)
        right = read_integer(fh)
        bottom = read_integer(fh)
        return (num_rows, num_cols,
                upper_left_x, upper_left_y, upper_right_x, upper_right_y,
                lower_left_x, lower_left_y, lower_right_x, lower_right_y,
                left, top, right, bottom)

    with misc.smart_open_read(self.path, mode='rb', try_gzip=True) as fh:
        magic_number = read_integer(fh)
        assert isinstance(magic_number, int) and magic_number == 64
        version_number = read_integer(fh)
        assert version_number == 4

        num_cols = read_integer(fh)
        num_rows = read_integer(fh)
        num_cells = read_integer(fh)
        logger.debug('Number of rows: %d', num_rows)
        logger.debug('Number of cols: %d', num_cols)

        header = read_tag_val(fh)
        logger.debug('; '.join(['%s = %s' % (k, v)
                                for k, v in header.items()]))

        algo_name = read_raw(fh)
        logger.debug('Algorithm name: %s', algo_name)
        algo_params = read_tag_val(fh)
        logger.debug('; '.join(['%s = %s' % (k, v)
                                for k, v in algo_params.items()]))

        cell_margin = read_integer(fh)
        num_outlier_cells = read_DWORD(fh)
        num_masked_cells = read_DWORD(fh)
        num_subgrids = read_integer(fh)
        logger.debug('Cell margin: %d', cell_margin)
        logger.debug('# outlier cells: %d', num_outlier_cells)
        logger.debug('# masked cells: %d', num_masked_cells)
        logger.debug('# sub-grids: %d', num_subgrids)

        cells = []
        for j in range(num_cols):
            for i in range(num_rows):
                cells.append(read_cell(fh))
        logger.debug('# cells: %d', len(cells))

        masked = []
        for i in range(num_masked_cells):
            masked.append(read_coords(fh))

        outliers = []
        for i in range(num_outlier_cells):
            outliers.append(read_coords(fh))

        subgrids = []
        for i in range(num_subgrids):
            subgrids.append(read_subgrid(fh))

    return np.float64([c[0] for c in cells])
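# Usage sketch (hedged): parse_intensities() above dispatches here for v4
# files, so this method is normally not called directly. Per the read loop
# above, cells are read column-by-column, so the returned array holds all
# rows of column 0 first, then column 1, and so on.
#
#   y = sample._parse_cel_v4_intensities()  # 'sample' as in the sketch above
#   # y is a 1-D np.float64 array with num_rows * num_cols entries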
def parse_annotations(self, annotation_file, genes, db_sel='UniProtKB',
                      select_evidence=None, exclude_evidence=None,
                      exclude_ref=None, strip_species=False,
                      ignore_case=False):
    """Parse a GO annotation file (in GAF 2.0 format).

    GO annotation files can be downloaded from the
    `UniProt-GOA download site`__ or from their `FTP server`__.

    __ goa_download_
    __ goa_ftp_

    .. _goa_download: http://www.ebi.ac.uk/GOA/downloads
    .. _goa_ftp: ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/

    Parameters
    ----------
    annotation_file: str
        Path of the annotation file (in GAF 2.0 format).
    genes: list or tuple of str
        List of valid gene names.
    db_sel: str, optional
        Select only annotations with this ``DB`` (column 1) value.
        If empty, disable filtering based on the ``DB`` value.
    select_evidence: list of str, optional
        Only include annotations with the given evidence codes.
        If not specified, allow all evidence codes, except for those
        listed in ``exclude_evidence``.
    exclude_evidence: list of str, optional
        Exclude all annotations with any of the given evidence codes.
        If ``select_evidence`` is specified, this parameter is ignored.
        If not specified, allow all evidence codes.
    exclude_ref: list of str, optional
        Exclude all annotations with the given DB:Reference (column 6).
        Example: ``["PMID:2676709"]``. Note: This filter is currently
        ignored if an annotation has more than one reference.
    strip_species: bool, optional
        If set, strip a trailing species suffix (everything after the
        last underscore) from each gene name.
    ignore_case: bool, optional
        If set, match gene names case-insensitively.

    Returns
    -------
    None
    """
    assert isinstance(annotation_file, str)
    assert isinstance(genes, (list, tuple))

    if not self.terms:
        raise ValueError('You need to first parse an OBO file!')

    if select_evidence is None:
        select_evidence = []
    if exclude_evidence is None:
        exclude_evidence = []
    if exclude_ref is None:
        exclude_ref = []

    # always overwrite all previously parsed annotations
    self.clear_annotation_data()

    # store the list of genes for later use
    self.genes = set(genes)
    genes_upper = dict((g.upper(), g) for g in genes)
    logger.info('Read %d genes.', len(genes))

    # read annotations
    self.term_annotations = dict((id_, []) for id_ in self.terms)
    self.gene_annotations = dict((g, []) for g in self.genes)

    # gene_terms is used for statistics
    gene_terms = dict((g, set()) for g in self.genes)

    unknown_gene_names = Counter()
    unknown_gene_annotations = 0
    unknown_term_ids = Counter()
    unknown_term_annotations = 0

    # parsing
    logger.info('Parsing annotations...')
    n = 0
    excluded_evidence_annotations = 0
    excluded_reference_annotations = 0
    valid_annotations = 0
    with misc.smart_open_read(annotation_file, mode='rb',
                              try_gzip=True) as fh:
        reader = csv.reader(fh, dialect='excel-tab', encoding='UTF-8')
        for i, l in enumerate(reader):
            if not l:
                continue
            if ((not db_sel) or l[0] == db_sel) and l[3] != 'NOT':
                n += 1
                # test if evidence code is excluded
                if (select_evidence and l[6] not in select_evidence) \
                        or l[6] in exclude_evidence:
                    excluded_evidence_annotations += 1
                    continue
                # test if reference is excluded
                db_ref = []
                if l[5]:
                    db_ref = l[5].split('|')
                if len(db_ref) == 1 and db_ref[0] in exclude_ref:
                    excluded_reference_annotations += 1
                    continue
                # determine target gene
                if not l[2]:
                    raise Exception('Missing target gene in line %d:\n%s'
                                    % (i + 1, '\t'.join(l)))
                gene = l[2]
                db_id = l[1]
                if strip_species:
                    try:
                        gene = gene[:gene.rindex('_')]
                    except ValueError:
                        pass
                term_id = l[4]
                evidence = l[6]

                invalid = False
                if (ignore_case and gene.upper() not in genes_upper) or \
                        ((not ignore_case) and gene not in self.genes):
                    unknown_gene_annotations += 1
                    unknown_gene_names[l[2]] += 1
                    invalid = True
                if term_id not in self.terms:
                    unknown_term_annotations += 1
                    unknown_term_ids[term_id] += 1
                    invalid = True
                if not invalid:
                    valid_annotations += 1
                    # if ignore_case, convert gene to its "original" name
                    if ignore_case:
                        gene = genes_upper[gene.upper()]
                    term = self.terms[term_id]
                    # parse secondary information
                    # ("with" entries, column 8)
                    with_ = []
                    if l[7]:
                        with_ = l[7].split('|')
                    # generate annotation
                    ann = GOAnnotation(gene=gene, term=term,
                                       evidence=evidence, db_id=db_id,
                                       db_ref=db_ref, with_=with_)
                    # add annotation to global list
                    self.annotations.append(ann)
                    # add annotation under term ID
                    self.term_annotations[term_id].append(ann)
                    # add annotation under gene
                    self.gene_annotations[gene].append(ann)
                    gene_terms[gene].add(term_id)

    # output some statistics
    if n > 0:
        logger.info('Parsed %d positive GO annotations '
                    '(%d = %.1f%% excluded based on evidence type).',
                    n, excluded_evidence_annotations,
                    100 * (excluded_evidence_annotations / float(n)))
    if unknown_gene_annotations > 0:
        logger.warning('Warning: %d annotations with %d unknown gene '
                       'names.', unknown_gene_annotations,
                       len(unknown_gene_names))
    if unknown_term_annotations > 0:
        logger.warning('Warning: %d annotations with %d unknown term IDs.',
                       unknown_term_annotations, len(unknown_term_ids))
    logger.info('Found a total of %d valid annotations.', valid_annotations)
    logger.info('%d unique Gene-Term associations.',
                sum(len(gene_terms[g]) for g in genes))
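# Minimal usage sketch (hedged): assumes this method lives on a GO-annotation
# parser class that has already parsed an OBO file; the 'GOParser' class name
# and the 'parse_ontology' method name are assumptions, and 'IEA' is the GO
# evidence code for electronic annotations.
#
#   go = GOParser()
#   go.parse_ontology('go-basic.obo')
#   go.parse_annotations('goa_human.gaf.gz', genes=['TP53', 'BRCA1'],
#                        exclude_evidence=['IEA'])
#   print(len(go.annotations))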
def main(args=None):
    """Extract all exon annotations of protein-coding genes."""
    if args is None:
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: '
                '"%s"', chrom_pat.pattern)

    chromosomes = set()
    excluded_chromosomes = set()
    i = 0
    exons = 0
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh, \
            misc.smart_open_write(output_file) as ofh:
        reader = csv.reader(fh, dialect='excel-tab')
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE, quotechar='|')
        for l in reader:
            i += 1
            if len(l) > 1 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']
                if type_ in ['protein_coding', 'polymorphic_pseudogene']:
                    # test whether chromosome is valid
                    chrom = l[0]
                    m = chrom_pat.match(chrom)
                    if m is None:
                        excluded_chromosomes.add(chrom)
                        continue
                    chromosomes.add(chrom)
                    writer.writerow(l)
                    exons += 1

    logger.info('Done! (Parsed %d lines.)', i)
    logger.info('')
    logger.info('Gene chromosomes (%d):', len(chromosomes))
    logger.info('\t' + ', '.join(sorted(chromosomes)))
    logger.info('')
    logger.info('Excluded chromosomes (%d):', len(excluded_chromosomes))
    logger.info('\t' + ', '.join(sorted(excluded_chromosomes)))
    logger.info('')
    logger.info('Total no. of exons: %d', exons)

    return 0
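# Minimal programmatic usage sketch: the Namespace attributes mirror the
# args.* fields read above; the GTF file name and the 'human' species key are
# assumptions. 'exon' matches the GTF feature column compared against
# field_name above.
#
#   from argparse import Namespace
#   args = Namespace(annotation_file='Homo_sapiens.GRCh38.gtf.gz',
#                    output_file='exons.gtf', species='human',
#                    chromosome_pattern=None, field_name='exon',
#                    log_file=None, quiet=False, verbose=False)
#   sys.exit(main(args))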
def main(args=None):
    """Extract Ensembl IDs and store in tab-delimited text file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained
        by parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred).
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    input_file = args.annotation_file
    output_file = args.output_file
    species = args.species
    chrom_pat = args.chromosome_pattern
    field_name = args.field_name
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure logger
    log_stream = sys.stdout
    if output_file == '-':
        # if we print output to stdout, redirect log messages to stderr
        log_stream = sys.stderr

    logger = misc.get_logger(log_stream=log_stream, log_file=log_file,
                             quiet=quiet, verbose=verbose)

    if chrom_pat is None:
        chrom_pat = re.compile(ensembl.SPECIES_CHROMPAT[species])
    else:
        chrom_pat = re.compile(chrom_pat)

    logger.info('Regular expression used for filtering chromosome names: '
                '"%s"', chrom_pat.pattern)

    # mapping of Ensembl gene IDs to gene symbols
    genes = Counter()

    # sets of encountered/excluded chromosomes
    chromosomes = set()
    excluded_chromosomes = set()

    i = 0
    missing = 0
    logger.info('Parsing data...')
    if input_file == '-':
        input_file = None
    with misc.smart_open_read(input_file, mode='rb', try_gzip=True) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for l in reader:
            i += 1
            if len(l) > 1 and l[2] == field_name:
                attr = parse_attributes(l[8])
                type_ = attr['gene_biotype']
                if type_ not in ['protein_coding', 'polymorphic_pseudogene']:
                    continue
                # test whether chromosome is valid
                chrom = l[0]
                m = chrom_pat.match(chrom)
                if m is None:
                    excluded_chromosomes.add(chrom)
                    continue
                chromosomes.add(m.group())
                gene_id = attr['gene_id']
                try:
                    gene_name = attr['gene_name']
                except KeyError:
                    missing += 1
                    continue
                if gene_id in genes:
                    if genes[gene_id] != gene_name:
                        raise ValueError('Ensembl ID "%s" associated with '
                                         'multiple gene symbols.' % gene_id)
                else:
                    genes[gene_id] = gene_name

    logger.info('Done! (Parsed %d lines.)', i)
    logger.info('Excluded %d chromosomes:', len(excluded_chromosomes))
    logger.info(', '.join(sorted(excluded_chromosomes)))

    n = len(genes)
    m = len(set(genes.values()))

    logger.info('No. of chromosomes: %d', len(chromosomes))
    logger.info('No. of gene IDs: %d', n)
    logger.info('No. of gene names: %d', m)

    with misc.smart_open_write(output_file) as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for g in sorted(genes.keys()):
            writer.writerow([g, genes[g]])

    return 0
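# The output is a two-column, tab-delimited file mapping Ensembl gene IDs to
# gene symbols, e.g. (rows illustrative):
#
#   ENSG00000012048    BRCA1
#   ENSG00000141510    TP53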