Пример #1
0
def _convert_tool_output(
    input_file,
    file_type=SUPPORTED_TOOL.MAVIS,
    stranded=False,
    log=DEVNULL,
    assume_no_untemplated=True,
):
    """
    Read a structural-variant tool's output file and convert each of its rows
    into standard breakpoint pairs.

    Args:
        input_file (str): path to the tool output file
        file_type: which supported tool produced the file (dispatches the parser)
        stranded (bool): whether the input was from a strand-specific protocol
        log (callable): logging callable taking print-style positional args
        assume_no_untemplated (bool): forwarded to the per-row converter

    Returns:
        list: the converted breakpoint pairs
    """
    log('reading:', input_file)
    result = []
    rows = None
    # tools whose output is VCF-flavoured and shares a single reader
    vcf_like_tools = (
        SUPPORTED_TOOL.DELLY,
        SUPPORTED_TOOL.MANTA,
        SUPPORTED_TOOL.PINDEL,
        SUPPORTED_TOOL.VCF,
        SUPPORTED_TOOL.BREAKSEQ,
        SUPPORTED_TOOL.STRELKA,
    )
    if file_type == SUPPORTED_TOOL.MAVIS:
        # already in the standard format; read directly, no row conversion needed
        result = read_bpp_from_input_file(
            input_file,
            expand_orient=True,
            expand_svtype=True,
            add_default={'stranded': stranded},
        )
    elif file_type == SUPPORTED_TOOL.CNVNATOR:
        # CNVnator output is header-less; supply the expected column names
        cnvnator_columns = [
            'event_type',
            'coordinates',
            'size',
            'normalized_RD',
            'e-val1',
            'e-val2',
            'e-val3',
            'e-val4',
            'q0',
        ]
        _, rows = tab.read_file(input_file, header=cnvnator_columns)
    elif file_type in vcf_like_tools:
        rows = read_vcf(input_file, file_type, log)
    elif file_type == SUPPORTED_TOOL.BREAKDANCER:
        rows = _convert_breakdancer_file(input_file)
    else:
        # any remaining tool is assumed to produce a plain tab file with a header
        _, rows = tab.read_file(input_file)
    if rows:
        log('found', len(rows), 'rows')
        for row in rows:
            try:
                converted = _convert_tool_row(
                    row,
                    file_type,
                    stranded,
                    assume_no_untemplated=assume_no_untemplated,
                )
            except Exception as err:
                # report the offending row before propagating the failure
                log('Error in converting row', row)
                raise err
            result.extend(converted)
    log('generated', len(result), 'breakpoint pairs')
    return result
Пример #2
0
def _convert_tool_output(input_file,
                         file_type=SUPPORTED_TOOL.MAVIS,
                         stranded=False,
                         log=devnull,
                         assume_no_untemplated=True):
    """
    Read a structural-variant tool's output file and convert each of its rows
    into standard breakpoint pairs.

    Args:
        input_file (str): path to the tool output file
        file_type: which supported tool produced the file (dispatches the parser)
        stranded (bool): whether the input was from a strand-specific protocol
        log (callable): logging callable taking print-style positional args
        assume_no_untemplated (bool): forwarded to the per-row converter

    Returns:
        list: the converted breakpoint pairs
    """
    log('reading:', input_file)
    result = []
    rows = None
    if file_type == SUPPORTED_TOOL.MAVIS:
        # already in the standard format; read directly, no row conversion needed
        result = read_bpp_from_input_file(
            input_file,
            expand_orient=True,
            expand_svtype=True,
            add_default={'stranded': stranded},
        )
    elif file_type == SUPPORTED_TOOL.CNVNATOR:
        # CNVnator output is header-less; supply the expected column names
        _, rows = tab.read_file(
            input_file,
            header=[
                'event_type',
                'coordinates',
                'size',
                'normalized_RD',
                'e-val1',
                'e-val2',
                'e-val3',
                'e-val4',
                'q0',
            ],
        )
    elif file_type in (SUPPORTED_TOOL.DELLY, SUPPORTED_TOOL.MANTA,
                       SUPPORTED_TOOL.PINDEL, SUPPORTED_TOOL.VCF,
                       SUPPORTED_TOOL.BREAKSEQ):
        variant_file = VariantFile(input_file)
        try:
            # ensure the END info field exists so records using it can be parsed
            variant_file.header.info.add('END',
                                         number=1,
                                         type='Integer',
                                         description='End of the interval')
        except ValueError:
            # END was already declared in the header; nothing to do
            pass
        rows = []
        for record in variant_file.fetch():
            rows.extend(_parse_vcf_record(record, log=log))
    elif file_type == SUPPORTED_TOOL.BREAKDANCER:
        rows = _convert_breakdancer_file(input_file)
    else:
        # any remaining tool is assumed to produce a plain tab file with a header
        _, rows = tab.read_file(input_file)
    if rows:
        log('found', len(rows), 'rows')
        for row in rows:
            try:
                converted = _convert_tool_row(
                    row,
                    file_type,
                    stranded,
                    assume_no_untemplated=assume_no_untemplated)
            except Exception as err:
                # report the offending row before propagating the failure
                log('Error in converting row', row)
                raise err
            result.extend(converted)
    log('generated', len(result), 'breakpoint pairs')
    return result
Пример #3
0
def load_templates(*filepaths):
    """
    load cytoband/template definitions from one or more tab-delimited files.

    primarily useful if template drawings are required and is not necessary otherwise
    assumes the input file is 0-indexed with [start,end) style. Columns are expected in
    the following order, tab-delimited. A header should not be given

    1. name
    2. start
    3. end
    4. band_name
    5. giemsa_stain

    for example

    .. code-block:: text

        chr1    0   2300000 p36.33  gneg
        chr1    2300000 5400000 p36.32  gpos25

    Args:
        filepaths (str): one or more paths to files with the cytoband template information

    Returns:
        Dict[str,Template]: the loaded templates, keyed by template name

    """
    header = ['name', 'start', 'end', 'band_name', 'giemsa_stain']
    templates = {}

    for filename in filepaths:
        header, rows = tab.read_file(
            filename,
            header=header,
            cast={
                'start': int,
                'end': int
            },
            in_={'giemsa_stain': GIEMSA_STAIN.values()},
        )

        # group the band intervals by the template (chromosome) they belong to
        bands_by_template = {}
        for row in rows:
            # +1 converts the 0-based [start, end) input to 1-based inclusive
            band = BioInterval(None,
                               row['start'] + 1,
                               row['end'],
                               name=row['band_name'],
                               data=row)
            bands_by_template.setdefault(row['name'], []).append(band)

        # each template spans from its first band start to its last band end
        for template_name, bands in bands_by_template.items():
            start = min(band.start for band in bands)
            end = max(band.end for band in bands)
            template = Template(template_name, start, end, bands=bands)
            templates[template.name] = template
    return templates
Пример #4
0
def load_masking_regions(*filepaths):
    """
    reads one or more files of regions. The expected input format for each file is
    tab-delimited and the header should contain the following columns

    - chr: the chromosome
    - start: start of the region, 1-based inclusive
    - end: end of the region, 1-based inclusive
    - name: the name/label of the region

    For example:

    .. code-block:: text

        #chr    start       end         name
        chr20   25600000    27500000    centromere

    Args:
        filepaths (str): one or more paths to input tab-delimited files
    Returns:
        Dict[str,List[BioInterval]]: a dictionary keyed by chromosome name with values of lists of regions on the chromosome

    Example:
        >>> m = load_masking_regions('filename')
        >>> m['1']
        [BioInterval(), BioInterval(), ...]
    """
    regions = {}
    for filepath in filepaths:
        _, rows = tab.read_file(
            filepath,
            require=['chr', 'start', 'end', 'name'],
            cast={
                'start': int,
                'end': int,
                'chr': ReferenceName  # normalizes chromosome naming (e.g. chr prefix)
            },
        )
        for row in rows:
            mask_region = BioInterval(reference_object=row['chr'],
                                      start=row['start'],
                                      end=row['end'],
                                      name=row['name'])
            # group regions by their chromosome/reference object
            regions.setdefault(mask_region.reference_object,
                               []).append(mask_region)
    return regions
Пример #5
0
def convert_file(input_file):
    """
    read a breakdancer output file, substituting bam file paths with their
    library names in the ``num_Reads_lib`` column.

    Args:
        input_file (str): path to the breakdancer output file

    Returns:
        List[Dict]: the parsed rows with library names substituted
    """
    bam_to_lib = {}
    with open(input_file, 'r') as fh:
        # comments in breakdancer are marked with a single # so they need to be discarded before reading
        lines = fh.readlines()
        header = 0
        while header < len(lines) and lines[header].startswith('#'):
            # metadata comment lines associate a bam file path with its library name
            metadata_match = re.match(r'^#(\S+)\t.*\tlibrary:(\S+)\t.*',
                                      lines[header])
            if metadata_match:
                bam_to_lib[metadata_match.group(1)] = metadata_match.group(2)
            header += 1
        # keep the last comment line since it is the column-header row needed by
        # tab.read_file; the max() guard prevents a file with no comment lines
        # (header == 0) from being truncated to only its last line
        lines = lines[max(header - 1, 0):]
        input_file = Namespace(readlines=lambda: lines)
    header, rows = tab.read_file(input_file,
                                 allow_short=True,
                                 require=['num_Reads_lib'])
    for row in rows:
        for bam, lib in bam_to_lib.items():
            row['num_Reads_lib'] = row['num_Reads_lib'].replace(bam, lib)
    return rows
Пример #6
0
def convert_tab_to_json(filepath, warn=DEVNULL):
    """
    given a file in the std input format (see below) reads and return a list of genes (and sub-objects)

    +-----------------------+---------------------------+-----------------------------------------------------------+
    | column name           | example                   | description                                               |
    +=======================+===========================+===========================================================+
    | ensembl_transcript_id | ENST000001                |                                                           |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | ensembl_gene_id       | ENSG000001                |                                                           |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | strand                | -1                        | positive or negative 1                                    |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | cdna_coding_start     | 44                        | where translation begins relative to the start of the cdna|
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | cdna_coding_end       | 150                       | where translation terminates                              |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | genomic_exon_ranges   | 100-201;334-412;779-830   | semi-colon delimited exon start/ends                      |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | AA_domain_ranges      | DBD:220-251,260-271       | semi-colon delimited list of domains                      |
    +-----------------------+---------------------------+-----------------------------------------------------------+
    | hugo_names            | KRAS                      | hugo gene name                                            |
    +-----------------------+---------------------------+-----------------------------------------------------------+

    Args:
        filepath (str): path to the input tab-delimited file
        warn (callable): callable used to report rows/fields that fail to parse

    Returns:
        :class:`dict` of :class:`list` of :any:`Gene` by :class:`str`: a dictionary keyed by chromosome name with values of list of genes on the chromosome

    Example:
        >>> ref = load_reference_genes('filename')
        >>> ref['1']
        [Gene(), Gene(), ....]

    Warning:
        does not load translations unless they start with 'M', end with '*' and have a length of multiple 3
    """
    def parse_exon_list(row):
        # parse 'start-end[; start-end]...' into a list of exon dicts; bad
        # entries are reported via warn and skipped rather than raising
        if not row:
            return []
        exons = []
        for temp in re.split('[; ]', row):
            try:
                start, end = temp.split('-')
                exons.append({'start': int(start), 'end': int(end)})
            except Exception as err:
                warn('exon error:', repr(temp), repr(err))
        return exons

    def parse_domain_list(row):
        # parse 'name:start-end[,start-end]...' entries separated by ';'
        if not row:
            return []
        domains = []
        for domain in row.split(';'):
            try:
                # maxsplit=1 so domain names containing ':' are still parsed
                name, temp = domain.rsplit(':', 1)
                temp = temp.split(',')
                temp = [x.split('-') for x in temp]
                regions = [{'start': int(x), 'end': int(y)} for x, y in temp]
                domains.append({'name': name, 'regions': regions})
            except Exception as err:
                warn('error in domain:', domain, row, repr(err))
        return domains

    def nullable_int(row):
        # cast to int where possible, otherwise fall back to the null cast
        try:
            row = int(row)
        except ValueError:
            row = tab.cast_null(row)
        return row

    _, rows = tab.read_file(
        filepath,
        require=['ensembl_gene_id', 'chr', 'ensembl_transcript_id'],
        add_default={
            'cdna_coding_start': 'null',
            'cdna_coding_end': 'null',
            'AA_domain_ranges': '',
            'genomic_exon_ranges': '',
            'hugo_names': '',
            'transcript_genomic_start': 'null',
            'transcript_genomic_end': 'null',
            'best_ensembl_transcript_id': 'null'
        },
        cast={
            'genomic_exon_ranges': parse_exon_list,
            'AA_domain_ranges': parse_domain_list,
            'cdna_coding_end': nullable_int,
            'cdna_coding_start': nullable_int,
            'transcript_genomic_end': nullable_int,
            'transcript_genomic_start': nullable_int,
            'gene_start': int,
            'gene_end': int
        })
    genes = {}
    for row in rows:
        gene = {
            'chr': row['chr'],
            'start': row['gene_start'],
            'end': row['gene_end'],
            'name': row['ensembl_gene_id'],
            'strand': row['strand'],
            'aliases':
            row['hugo_names'].split(';') if row['hugo_names'] else [],
            'transcripts': []
        }
        # multiple rows may describe the same gene (one per transcript);
        # keep the first gene record and append subsequent transcripts to it
        if gene['name'] not in genes:
            genes[gene['name']] = gene
        else:
            gene = genes[gene['name']]

        transcript = {
            'is_best_transcript':
            row['best_ensembl_transcript_id'] == row['ensembl_transcript_id'],
            'name':
            row['ensembl_transcript_id'],
            'exons':
            row['genomic_exon_ranges'],
            'domains':
            row['AA_domain_ranges'],
            'start':
            row['transcript_genomic_start'],
            'end':
            row['transcript_genomic_end'],
            'cdna_coding_start':
            row['cdna_coding_start'],
            'cdna_coding_end':
            row['cdna_coding_end'],
            'aliases': []
        }
        gene['transcripts'].append(transcript)

    # list() so the result is JSON-serializable (dict views are not)
    return {'genes': list(genes.values())}
Пример #7
0
    def read_pslx(filename,
                  seqid_to_sequence_mapping,
                  is_protein=False,
                  verbose=True):
        """
        Read a BLAT pslx output file, casting and validating its columns, and
        annotate each row with its score, percent identity, and full query sequence.

        Args:
            filename (str): path to the pslx file
            seqid_to_sequence_mapping (dict): maps query sequence id (qname) to its full sequence
            is_protein (bool): passed through to the scoring/identity computations
            verbose (bool): when True, validation failures are raised/warned instead of ignored

        Returns:
            tuple: the file header and the list of validated, annotated rows
        """
        # column order of the standard (headerless) pslx format
        pslx_header = [
            'match', 'mismatch', 'repmatch', 'ncount', 'qgap_count',
            'qgap_bases', 'tgap_count', 'tgap_bases', 'strand', 'qname',
            'qsize', 'qstart', 'qend', 'tname', 'tsize', 'tstart', 'tend',
            'block_count', 'block_sizes', 'qstarts', 'tstarts', 'qseqs',
            'tseqs'
        ]

        def split_csv_trailing_seq(x):
            # pslx list columns end with a trailing comma; strip it, then uppercase each sequence
            return [s.upper() for s in re.sub(',$', '', x).split(',')]

        def split_csv_trailing_ints(x):
            # pslx list columns end with a trailing comma; strip it, then cast each entry to int
            return [int(s) for s in re.sub(',$', '', x).split(',')]

        header, rows = tab.read_file(filename,
                                     header=pslx_header,
                                     cast={
                                         'match': int,
                                         'mismatch': int,
                                         'repmatch': int,
                                         'ncount': int,
                                         'qgap_count': int,
                                         'qgap_bases': int,
                                         'tgap_count': int,
                                         'tgap_bases': int,
                                         'qsize': int,
                                         'qstart': int,
                                         'qend': int,
                                         'tsize': int,
                                         'tstart': int,
                                         'tend': int,
                                         'block_count': int,
                                         # normalize reference names by dropping any 'chr' prefix
                                         'tname':
                                         lambda x: re.sub('^chr', '', x),
                                         'block_sizes':
                                         split_csv_trailing_ints,
                                         'qstarts': split_csv_trailing_ints,
                                         'tstarts': split_csv_trailing_ints,
                                         'qseqs': split_csv_trailing_seq,
                                         'tseqs': split_csv_trailing_seq
                                     },
                                     validate={'strand': r'^[\+-]$'})

        final_rows = []
        for row in rows:
            try:
                # annotate with derived alignment metrics and the full query sequence
                row['score'] = Blat.score(row, is_protein=is_protein)
                row['percent_ident'] = Blat.percent_identity(
                    row, is_protein=is_protein)
                qseq = seqid_to_sequence_mapping[row['qname']]
                row['qseq_full'] = qseq

                # sanity-check that blat produced no negative counts
                for x in [
                        'qgap_count', 'qgap_bases', 'tgap_count', 'tgap_bases',
                        'qsize', 'tsize', 'ncount', 'match', 'mismatch',
                        'repmatch'
                ]:
                    # NOTE(review): the negative-value check is skipped entirely when
                    # verbose is False, so such rows pass through unvalidated — confirm
                    # this gating (rather than always raising) is intentional
                    if row[x] < 0 and verbose:
                        raise AssertionError(
                            'Blat error: blat returned a negative number, which are not allowed: {}={}'
                            .format(x, row[x]))
                final_rows.append(row)
            except AssertionError as err:
                # validation failure: drop the row; only surface the warning in verbose mode
                if verbose:
                    warnings.warn(repr(err))
        return header, final_rows