def _convert_tool_output(
    input_file, file_type=SUPPORTED_TOOL.MAVIS, stranded=False, log=DEVNULL, assume_no_untemplated=True
):
    """
    Read an SV caller's output file and convert its rows into the MAVIS
    standard breakpoint-pair representation.

    Args:
        input_file (str): path to the tool output file
        file_type: member of SUPPORTED_TOOL identifying the caller format
        stranded (bool): default added to MAVIS-format rows
        log: logging callable
        assume_no_untemplated (bool): forwarded to the per-row converter

    Returns:
        list: breakpoint pairs produced from the input rows
    """
    log('reading:', input_file)
    pairs = []
    raw_rows = None

    vcf_tools = (
        SUPPORTED_TOOL.DELLY,
        SUPPORTED_TOOL.MANTA,
        SUPPORTED_TOOL.PINDEL,
        SUPPORTED_TOOL.VCF,
        SUPPORTED_TOOL.BREAKSEQ,
        SUPPORTED_TOOL.STRELKA,
    )

    if file_type == SUPPORTED_TOOL.MAVIS:
        # already in the standard format; no per-row conversion required
        pairs = read_bpp_from_input_file(
            input_file, expand_orient=True, expand_svtype=True, add_default={'stranded': stranded}
        )
    elif file_type == SUPPORTED_TOOL.CNVNATOR:
        # CNVnator output carries no header line, so column names are supplied here
        _, raw_rows = tab.read_file(
            input_file,
            header=[
                'event_type',
                'coordinates',
                'size',
                'normalized_RD',
                'e-val1',
                'e-val2',
                'e-val3',
                'e-val4',
                'q0',
            ],
        )
    elif file_type in vcf_tools:
        raw_rows = read_vcf(input_file, file_type, log)
    elif file_type == SUPPORTED_TOOL.BREAKDANCER:
        raw_rows = _convert_breakdancer_file(input_file)
    else:
        _, raw_rows = tab.read_file(input_file)

    if raw_rows:
        log('found', len(raw_rows), 'rows')
        for raw_row in raw_rows:
            try:
                converted = _convert_tool_row(
                    raw_row, file_type, stranded, assume_no_untemplated=assume_no_untemplated
                )
            except Exception as err:
                log('Error in converting row', raw_row)
                raise err
            else:
                pairs.extend(converted)
    log('generated', len(pairs), 'breakpoint pairs')
    return pairs
def _convert_tool_output(
    input_file, file_type=SUPPORTED_TOOL.MAVIS, stranded=False, log=devnull, assume_no_untemplated=True
):
    """
    Read an SV caller's output file and convert its rows into the MAVIS
    standard breakpoint-pair representation.

    Args:
        input_file (str): path to the tool output file
        file_type: member of SUPPORTED_TOOL identifying the caller format
        stranded (bool): default added to MAVIS-format rows
        log: logging callable
        assume_no_untemplated (bool): forwarded to the per-row converter

    Returns:
        list: breakpoint pairs produced from the input rows
    """
    log('reading:', input_file)
    pairs = []
    raw_rows = None

    if file_type == SUPPORTED_TOOL.MAVIS:
        # already in the standard format; no per-row conversion required
        pairs = read_bpp_from_input_file(
            input_file, expand_orient=True, expand_svtype=True, add_default={'stranded': stranded}
        )
    elif file_type == SUPPORTED_TOOL.CNVNATOR:
        # CNVnator output carries no header line, so column names are supplied here
        _, raw_rows = tab.read_file(
            input_file,
            header=[
                'event_type',
                'coordinates',
                'size',
                'normalized_RD',
                'e-val1',
                'e-val2',
                'e-val3',
                'e-val4',
                'q0',
            ],
        )
    elif file_type in (
        SUPPORTED_TOOL.DELLY,
        SUPPORTED_TOOL.MANTA,
        SUPPORTED_TOOL.PINDEL,
        SUPPORTED_TOOL.VCF,
        SUPPORTED_TOOL.BREAKSEQ,
    ):
        variant_file = VariantFile(input_file)
        try:
            # some callers omit END from the VCF header; register it so records parse
            variant_file.header.info.add('END', number=1, type='Integer', description='End of the interval')
        except ValueError:
            pass  # header already defines END
        raw_rows = []
        for record in variant_file.fetch():
            raw_rows.extend(_parse_vcf_record(record, log=log))
    elif file_type == SUPPORTED_TOOL.BREAKDANCER:
        raw_rows = _convert_breakdancer_file(input_file)
    else:
        _, raw_rows = tab.read_file(input_file)

    if raw_rows:
        log('found', len(raw_rows), 'rows')
        for raw_row in raw_rows:
            try:
                converted = _convert_tool_row(
                    raw_row, file_type, stranded, assume_no_untemplated=assume_no_untemplated
                )
            except Exception as err:
                log('Error in converting row', raw_row)
                raise err
            else:
                pairs.extend(converted)
    log('generated', len(pairs), 'breakpoint pairs')
    return pairs
def load_templates(*filepaths):
    """
    Load chromosome (cytoband) templates; primarily useful if template
    drawings are required and is not necessary otherwise.

    Assumes the input files are 0-indexed with [start, end) style
    coordinates. Columns are expected in the following order,
    tab-delimited. A header should not be given

    1. name
    2. start
    3. end
    4. band_name
    5. giemsa_stain

    for example

    .. code-block:: text

        chr1    0       2300000 p36.33  gneg
        chr1    2300000 5400000 p36.32  gpos25

    Args:
        filepaths (str): paths to the files with the cytoband template information

    Returns:
        Dict[str,Template]: the loaded templates, keyed by template (chromosome) name
    """
    header = ['name', 'start', 'end', 'band_name', 'giemsa_stain']
    templates = {}
    for filename in filepaths:
        # discard the returned header: feeding it back in on the next
        # iteration (as the original code did) risks carrying over any
        # normalization read_file applies
        _, rows = tab.read_file(
            filename,
            header=header,
            cast={'start': int, 'end': int},
            in_={'giemsa_stain': GIEMSA_STAIN.values()},
        )
        # group the band intervals by their template/chromosome name
        bands_by_template = {}
        for row in rows:
            # convert 0-based [start, end) input to 1-based inclusive coordinates
            band = BioInterval(None, row['start'] + 1, row['end'], name=row['band_name'], data=row)
            bands_by_template.setdefault(row['name'], []).append(band)
        for tname, bands in bands_by_template.items():
            start = min(b.start for b in bands)
            end = max(b.end for b in bands)
            # use a dedicated name instead of reusing/shadowing ``end``
            template = Template(tname, start, end, bands=bands)
            templates[template.name] = template
    return templates
def load_masking_regions(*filepaths):
    """
    Read one or more files of masking regions.

    The expected input format for each file is tab-delimited and the header
    should contain the following columns

    - chr: the chromosome
    - start: start of the region, 1-based inclusive
    - end: end of the region, 1-based inclusive
    - name: the name/label of the region

    For example:

    .. code-block:: text

        #chr    start   end     name
        chr20   25600000        27500000        centromere

    Args:
        filepath (str): path to the input tab-delimited file

    Returns:
        Dict[str,List[BioInterval]]: a dictionary keyed by chromosome name
        with values of lists of regions on the chromosome

    Example:
        >>> m = load_masking_regions('filename')
        >>> m['1']
        [BioInterval(), BioInterval(), ...]
    """
    regions_by_chr = {}
    for path in filepaths:
        _, rows = tab.read_file(
            path,
            require=['chr', 'start', 'end', 'name'],
            cast={'start': int, 'end': int, 'chr': ReferenceName},
        )
        for row in rows:
            region = BioInterval(
                reference_object=row['chr'], start=row['start'], end=row['end'], name=row['name']
            )
            regions_by_chr.setdefault(region.reference_object, []).append(region)
    return regions_by_chr
def convert_file(input_file):
    """
    Read a BreakDancer output file and return its data rows, with BAM file
    names in the num_Reads_lib column replaced by their library names
    (collected from the ``#`` metadata lines).

    Args:
        input_file (str): path to the BreakDancer output file

    Returns:
        list of dict: the parsed rows
    """
    bam_to_lib = {}
    with open(input_file, 'r') as fh:
        # comments in breakdancer are marked with a single # so they need to be discarded before reading
        lines = fh.readlines()
    header = 0
    while header < len(lines) and lines[header].startswith('#'):
        # metadata lines map a BAM path to its library name
        metadata_match = re.match(r'^#(\S+)\t.*\tlibrary:(\S+)\t.*', lines[header])
        if metadata_match:
            bam_to_lib[metadata_match.group(1)] = metadata_match.group(2)
        header += 1
    # keep the last comment line: it is the '#'-prefixed column-name header.
    # Guard against header == 0 (no comment lines at all), where the original
    # ``lines[header - 1:]`` would wrap to ``lines[-1:]`` and silently drop
    # every line but the last.
    lines = lines[max(header - 1, 0):]
    input_file = Namespace(readlines=lambda: lines)
    header, rows = tab.read_file(input_file, allow_short=True, require=['num_Reads_lib'])
    for row in rows:
        for bam, lib in bam_to_lib.items():
            row['num_Reads_lib'] = row['num_Reads_lib'].replace(bam, lib)
    return rows
def convert_tab_to_json(filepath, warn=DEVNULL):
    """
    given a file in the std input format (see below) reads and return a list of genes (and sub-objects)

    +-----------------------+---------------------------+-----------------------------------------------------------+
    | column name           | example                   | description                                               |
    +=======================+===========================+===========================================================+
    | ensembl_transcript_id | ENST000001                |                                                           |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | ensembl_gene_id       | ENSG000001                |                                                           |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | strand                | -1                        | positive or negative 1                                    |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | cdna_coding_start     | 44                        | where translation begins relative to the start of the cdna|
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | cdna_coding_end       | 150                       | where translation terminates                              |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | genomic_exon_ranges   | 100-201;334-412;779-830   | semi-colon delimited exon start/ends                      |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | AA_domain_ranges      | DBD:220-251,260-271       | semi-colon delimited list of domains                      |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | hugo_names            | KRAS                      | hugo gene name                                            |
    +-----------------------+---------------------------+-----------------------------------------------------------+

    Args:
        filepath (str): path to the input tab-delimited file
        warn: callable used to report row-level parsing errors

    Returns:
        dict: a dictionary with a single key ``genes`` holding the list of
        gene dictionaries (each with its nested ``transcripts``)
    """

    def parse_exon_list(row):
        # '100-201;334-412' -> [{'start': 100, 'end': 201}, ...]; bad entries are warned and skipped
        if not row:
            return []
        exons = []
        for temp in re.split('[; ]', row):
            try:
                start, end = temp.split('-')
                exons.append({'start': int(start), 'end': int(end)})
            except Exception as err:
                warn('exon error:', repr(temp), repr(err))
        return exons

    def parse_domain_list(row):
        # 'DBD:220-251,260-271' -> [{'name': 'DBD', 'regions': [...]}]; bad entries are warned and skipped
        if not row:
            return []
        domains = []
        for domain in row.split(';'):
            try:
                # split only on the last ':' so that domain names may themselves contain ':'
                name, temp = domain.rsplit(':', 1)
                temp = temp.split(',')
                temp = [x.split('-') for x in temp]
                regions = [{'start': int(x), 'end': int(y)} for x, y in temp]
                domains.append({'name': name, 'regions': regions})
            except Exception as err:
                warn('error in domain:', domain, row, repr(err))
        return domains

    def nullable_int(row):
        # cast to int where possible, otherwise fall back to the tab module's null handling
        try:
            row = int(row)
        except ValueError:
            row = tab.cast_null(row)
        return row

    _, rows = tab.read_file(
        filepath,
        require=['ensembl_gene_id', 'chr', 'ensembl_transcript_id'],
        add_default={
            'cdna_coding_start': 'null',
            'cdna_coding_end': 'null',
            'AA_domain_ranges': '',
            'genomic_exon_ranges': '',
            'hugo_names': '',
            'transcript_genomic_start': 'null',
            'transcript_genomic_end': 'null',
            'best_ensembl_transcript_id': 'null'
        },
        cast={
            'genomic_exon_ranges': parse_exon_list,
            'AA_domain_ranges': parse_domain_list,
            'cdna_coding_end': nullable_int,
            'cdna_coding_start': nullable_int,
            'transcript_genomic_end': nullable_int,
            'transcript_genomic_start': nullable_int,
            'gene_start': int,
            'gene_end': int
        })
    genes = {}
    for row in rows:
        gene = {
            'chr': row['chr'],
            'start': row['gene_start'],
            'end': row['gene_end'],
            'name': row['ensembl_gene_id'],
            'strand': row['strand'],
            'aliases': row['hugo_names'].split(';') if row['hugo_names'] else [],
            'transcripts': []
        }
        # rows share the gene record: reuse the first gene dict seen for this id
        if gene['name'] not in genes:
            genes[gene['name']] = gene
        else:
            gene = genes[gene['name']]
        transcript = {
            'is_best_transcript': row['best_ensembl_transcript_id'] == row['ensembl_transcript_id'],
            'name': row['ensembl_transcript_id'],
            'exons': row['genomic_exon_ranges'],
            'domains': row['AA_domain_ranges'],
            'start': row['transcript_genomic_start'],
            'end': row['transcript_genomic_end'],
            'cdna_coding_start': row['cdna_coding_start'],
            'cdna_coding_end': row['cdna_coding_end'],
            'aliases': []
        }
        gene['transcripts'].append(transcript)
    # materialize as a list: dict_values is not JSON-serializable
    return {'genes': list(genes.values())}
def read_pslx(filename, seqid_to_sequence_mapping, is_protein=False, verbose=True):
    """
    Read a blat pslx output file, computing score and percent identity for
    each alignment row.

    Args:
        filename (str): path to the pslx file
        seqid_to_sequence_mapping (dict): maps query sequence ids to their full sequences
        is_protein (bool): forwarded to the Blat scoring helpers
        verbose (bool): when True, warn about rows rejected for negative counts

    Returns:
        tuple: the file header and the list of valid, annotated rows

    Note:
        rows containing negative counts are always excluded from the result;
        ``verbose`` only controls whether a warning is emitted for them
    """
    pslx_header = [
        'match', 'mismatch', 'repmatch', 'ncount', 'qgap_count', 'qgap_bases',
        'tgap_count', 'tgap_bases', 'strand', 'qname', 'qsize', 'qstart',
        'qend', 'tname', 'tsize', 'tstart', 'tend', 'block_count',
        'block_sizes', 'qstarts', 'tstarts', 'qseqs', 'tseqs'
    ]

    def split_csv_trailing_seq(x):
        # pslx sequence lists end with a trailing comma; strip it and uppercase
        return [s.upper() for s in re.sub(',$', '', x).split(',')]

    def split_csv_trailing_ints(x):
        # pslx integer lists end with a trailing comma; strip it and cast
        return [int(s) for s in re.sub(',$', '', x).split(',')]

    header, rows = tab.read_file(
        filename,
        header=pslx_header,
        cast={
            'match': int, 'mismatch': int, 'repmatch': int, 'ncount': int,
            'qgap_count': int, 'qgap_bases': int,
            'tgap_count': int, 'tgap_bases': int,
            'qsize': int, 'qstart': int, 'qend': int,
            'tsize': int, 'tstart': int, 'tend': int,
            'block_count': int,
            'tname': lambda x: re.sub('^chr', '', x),
            'block_sizes': split_csv_trailing_ints,
            'qstarts': split_csv_trailing_ints,
            'tstarts': split_csv_trailing_ints,
            'qseqs': split_csv_trailing_seq,
            'tseqs': split_csv_trailing_seq
        },
        validate={'strand': r'^[\+-]$'})
    final_rows = []
    for row in rows:
        try:
            row['score'] = Blat.score(row, is_protein=is_protein)
            row['percent_ident'] = Blat.percent_identity(row, is_protein=is_protein)
            qseq = seqid_to_sequence_mapping[row['qname']]
            row['qseq_full'] = qseq
            for col in [
                'qgap_count', 'qgap_bases', 'tgap_count', 'tgap_bases',
                'qsize', 'tsize', 'ncount', 'match', 'mismatch', 'repmatch'
            ]:
                # BUGFIX: this check was previously gated on ``verbose``
                # (``row[x] < 0 and verbose``), so with verbose=False invalid
                # rows were silently kept. Validity must not depend on the
                # verbosity flag; verbose only controls the warning below.
                if row[col] < 0:
                    raise AssertionError(
                        'Blat error: blat returned a negative number, which are not allowed: {}={}'
                        .format(col, row[col]))
            final_rows.append(row)
        except AssertionError as err:
            if verbose:
                warnings.warn(repr(err))
    return header, final_rows