Exemplo n.º 1
0
    def test_from_transposon_fusions(self, tr_fusion):
        """Checks from_transposon_fusions against the example fusion."""

        result = list(Insertion.from_transposon_fusions([tr_fusion]))

        # Metadata the resulting insertion is expected to carry.
        expected_metadata = {
            'transposon_anchor': 1541,
            'orientation': 'antisense',
            'gene_name': 'Cblb',
            'feature_type': 'SA',
            'feature_strand': -1,
            'feature_name': 'En2SA',
            'gene_strand': 1
        }

        expected_insertion = Insertion(
            id=None,
            seqname='16',
            position=52141095,
            strand=-1,
            support_junction=380,
            support_spanning=118,
            support=498,
            metadata=expected_metadata)

        # A single fusion should give exactly this one insertion.
        assert [expected_insertion] == result
Exemplo n.º 2
0
def insertions():
    """Returns two example insertions in ENSMUSG00000051951 (5 exons).

    Both insertions lie on seqname '1' and have a support of 2; they
    differ in position, strand and sample.
    """
    gene_id = 'ENSMUSG00000051951'

    # After third exon.
    after_third_exon = Insertion(
        id='1',
        seqname='1',
        position=3207327,
        strand=1,
        support_junction=1,
        support_spanning=1,
        support=2,
        metadata=frozendict({'gene_id': gene_id, 'sample': 'S1'}))

    # After first exon.
    after_first_exon = Insertion(
        id='2',
        seqname='1',
        position=3214491,
        strand=-1,
        support_junction=1,
        support_spanning=1,
        support=2,
        metadata=frozendict({'gene_id': gene_id, 'sample': 'S2'}))

    return [after_third_exon, after_first_exon]
Exemplo n.º 3
0
def main():
    """Entry point of imfusion-ctg.

    Reads the insertions, tests genes for enrichment of insertions
    (CTGs), optionally adds differential expression results and writes
    the resulting table as tab-separated text to ``args.output``.
    """

    log = logging.getLogger()
    args = parse_args()

    # Load insertions, optionally dropping those with too little support.
    loaded = list(Insertion.from_csv(args.insertions, sep='\t'))

    if args.min_depth is not None:
        loaded = [item for item in loaded if item.support >= args.min_depth]

    # Run the CTG test.
    log.info('Testing for CTGs')

    if args.window is not None:
        log.info('- Using window (%d, %d)', *args.window)

    ctg_options = dict(
        reference=Reference(args.reference),
        gene_ids=args.gene_ids,
        chromosomes=args.chromosomes,
        pattern=args.pattern,
        window=args.window)

    ctgs = test_ctgs(loaded, **ctg_options)

    # Keep only genes passing the q-value threshold (if given).
    if args.threshold is not None:
        ctgs = ctgs.query('q_value <= {}'.format(args.threshold))

    # Differential expression is only tested when counts were supplied.
    if args.expression is not None:
        log.info('Testing for differential expression')

        counts = read_exon_counts(args.expression)
        de_frame = test_de(loaded, counts, gene_ids=ctgs['gene_id'])

        # Prefix the DE columns so they do not clash with the CTG columns.
        renamed = {
            'direction': 'de_direction',
            'p_value': 'de_pvalue',
            'test_type': 'de_test'
        }
        de_frame = de_frame.rename(columns=renamed)

        # CTG columns first, followed by the DE columns in a fixed order.
        final_columns = list(ctgs.columns)
        final_columns.extend(['de_test', 'de_direction', 'de_pvalue'])

        merged = pd.merge(ctgs, de_frame, on='gene_id', how='left')
        ctgs = merged[final_columns]

        if args.de_threshold is not None:
            # Negating '>' (rather than using '<=') keeps rows with nan.
            above_threshold = ctgs['de_pvalue'] > args.de_threshold
            ctgs = ctgs.loc[~above_threshold]

    # Write the final table.
    log.info('Writing outputs')
    ctgs.to_csv(str(args.output), sep='\t', index=False)
Exemplo n.º 4
0
    def test_from_transposon_fusions(self, tr_fusion):
        """Tests from_transposon_fusions using the example fusion."""

        converted = Insertion.from_transposon_fusions([tr_fusion])
        actual = list(converted)

        # Expected metadata of the single resulting insertion.
        metadata = {
            'transposon_anchor': 1541,
            'orientation': 'antisense',
            'gene_name': 'Cblb',
            'feature_type': 'SA',
            'feature_strand': -1,
            'feature_name': 'En2SA',
            'gene_strand': 1
        }

        expected = [
            Insertion(id=None, seqname='16', position=52141095, strand=-1,
                      support_junction=380, support_spanning=118,
                      support=498, metadata=metadata)
        ]

        assert expected == actual
Exemplo n.º 5
0
    def test_from_transposon_fusion(self, tr_fusion):
        """Checks from_transposon_fusion against the example fusion."""

        result = Insertion.from_transposon_fusion(tr_fusion)

        # Passed as a plain dict; the result is checked to hold a frozendict.
        expected_metadata = {
            'transposon_anchor': 1541,
            'orientation': 'antisense',
            'gene_name': 'Cblb',
            'feature_type': 'SA',
            'feature_strand': -1,
            'feature_name': 'En2SA',
            'gene_strand': 1
        }

        reference_insertion = Insertion(
            id=None, seqname='16', position=52141095, strand=-1,
            support_junction=380, support_spanning=118, support=498,
            metadata=expected_metadata)

        assert result == reference_insertion
        assert isinstance(result.metadata, frozendict)
Exemplo n.º 6
0
def main():
    """Runs the imfusion-ctg command.

    Loads insertions, identifies commonly targeted genes, optionally
    combines them with differential expression results and writes the
    table to ``args.output`` as tab-separated text.
    """

    logger = logging.getLogger()
    args = parse_args()

    # Read all insertions up front; filter on support when requested.
    all_insertions = list(Insertion.from_csv(args.insertions, sep='\t'))

    if args.min_depth is not None:
        all_insertions = [
            candidate for candidate in all_insertions
            if candidate.support >= args.min_depth
        ]

    # CTG identification.
    logger.info('Testing for CTGs')

    if args.window is not None:
        logger.info('- Using window (%d, %d)', *args.window)

    ref = Reference(args.reference)
    table = test_ctgs(all_insertions, reference=ref, gene_ids=args.gene_ids,
                      chromosomes=args.chromosomes, pattern=args.pattern,
                      window=args.window)

    # Apply the q-value cutoff.
    if args.threshold is not None:
        table = table.query('q_value <= {}'.format(args.threshold))

    # Optional differential expression analysis.
    if args.expression is not None:
        logger.info('Testing for differential expression')

        exon_counts = read_exon_counts(args.expression)
        de_table = test_de(
            all_insertions, exon_counts, gene_ids=table['gene_id'])

        # Rename DE columns before merging them into the CTG table.
        de_table = de_table.rename(columns={
            'direction': 'de_direction',
            'p_value': 'de_pvalue',
            'test_type': 'de_test'
        })

        ordered = list(table.columns) + ['de_test', 'de_direction',
                                         'de_pvalue']
        combined = pd.merge(table, de_table, on='gene_id', how='left')
        table = combined[ordered]

        if args.de_threshold is not None:
            # Drop genes above the threshold; the negated comparison
            # retains genes whose DE p-value is nan.
            not_significant = table['de_pvalue'] > args.de_threshold
            table = table.loc[~not_significant]

    # Output.
    logger.info('Writing outputs')
    table.to_csv(str(args.output), sep='\t', index=False)
Exemplo n.º 7
0
    def test_from_transposon_fusions_id(self, tr_fusion):
        """Checks that from_transposon_fusions applies the id format."""

        generated = Insertion.from_transposon_fusions(
            [tr_fusion], id_fmt_str='INS_{}')
        first = list(generated)[0]

        assert first.id == 'INS_1'
Exemplo n.º 8
0
    def test_from_transposon_fusions_id(self, tr_fusion):
        """Tests from_transposon_fusions for the example, with an id format."""

        fusions = [tr_fusion]
        results = list(
            Insertion.from_transposon_fusions(fusions, id_fmt_str='INS_{}'))

        # The first insertion is expected to get the id 'INS_1'.
        first_id = results[0].id
        assert first_id == 'INS_1'
Exemplo n.º 9
0
def main():
    """Entry point of imfusion-merge.

    Merges the given sample directories into a single set of insertions
    (and, when an expression output path is given, expression counts) and
    writes the results as tab-separated text.
    """

    args = _parse_args()
    want_expression = args.output_expression is not None

    # Fall back to the directory names when no sample names were supplied.
    sample_names = args.names
    if not sample_names:
        sample_names = [sample_dir.name for sample_dir in args.sample_dirs]

    # Combine all samples into one dataset.
    insertions, expression = merge_samples(
        args.sample_dirs,
        sample_names=sample_names,
        with_expression=want_expression)

    # Insertions are always written; expression only on request.
    Insertion.to_csv(str(args.output), insertions, sep='\t', index=False)

    if want_expression:
        expression.to_csv(str(args.output_expression), sep='\t', index=True)
Exemplo n.º 10
0
def main():
    """Runs imfusion-merge: merges samples and writes the merged outputs."""

    args = _parse_args()

    # Sample names default to the names of the sample directories.
    if args.names:
        names = args.names
    else:
        names = [directory.name for directory in args.sample_dirs]

    # Expression is only merged when an output path was given for it.
    expression_path = args.output_expression
    merged = merge_samples(
        args.sample_dirs,
        sample_names=names,
        with_expression=expression_path is not None)
    merged_ins, merged_expr = merged

    # Write output(s).
    Insertion.to_csv(str(args.output), merged_ins, sep='\t', index=False)

    if expression_path is not None:
        merged_expr.to_csv(str(expression_path), sep='\t', index=True)
Exemplo n.º 11
0
    def test_pos_example_trp53bp2_df(self, test_insertions, test_exon_counts):
        """Checks the positive Trp53bp2 DE example with DataFrame input."""

        # Same example, but with the insertions passed as a frame.
        frame = Insertion.to_frame(test_insertions)

        de_result = test.test_de_exon(
            frame, test_exon_counts, gene_id='ENSMUSG00000026510')

        assert de_result.p_value < 0.01
        assert de_result.direction == 1
Exemplo n.º 12
0
    def test_proper_example_df(self, insertions, exon_counts):
        """Checks split_counts when insertions are given as a DataFrame."""

        frame = Insertion.to_frame(insertions)

        split = test.split_counts(
            exon_counts, frame, gene_id='ENSMUSG00000051951')
        before, after, dropped = split

        assert len(before) == 1
        assert len(after) == 2
        assert dropped == set()
Exemplo n.º 13
0
    def test_pos_example_trp53bp2_df(self, test_insertions, test_exon_counts):
        """Tests the positive DE example in Trp53bp2 using a frame as input."""

        insertion_frame = Insertion.to_frame(test_insertions)
        gene_id = 'ENSMUSG00000026510'

        outcome = test.test_de_exon(
            insertion_frame, test_exon_counts, gene_id=gene_id)

        # Expect a significant result with direction 1.
        assert outcome.p_value < 0.01
        assert outcome.direction == 1
Exemplo n.º 14
0
    def test_proper_example_df(self, insertions, exon_counts):
        """Tests split_counts on the example, with a DataFrame of insertions."""

        as_frame = Insertion.to_frame(insertions)
        gene_id = 'ENSMUSG00000051951'

        before, after, dropped = test.split_counts(
            exon_counts, as_frame, gene_id=gene_id)

        # One entry before, two after, nothing dropped.
        assert len(before) == 1
        assert len(after) == 2
        assert dropped == set()
Exemplo n.º 15
0
def _preprocess_insertions(insertions, gene_id):
    """Returns the insertions of the given gene as a list of objects.

    Accepts either a DataFrame (filtered on its 'gene_id' column and
    converted with Insertion.from_frame) or an iterable of objects
    (filtered on their metadata['gene_id'] value).
    """

    if not isinstance(insertions, pd.DataFrame):
        # Already objects: only the subsetting remains.
        return [item for item in insertions
                if item.metadata['gene_id'] == gene_id]

    gene_rows = insertions.loc[insertions['gene_id'] == gene_id]
    return list(Insertion.from_frame(gene_rows))
Exemplo n.º 16
0
def _preprocess_insertions(insertions, gene_id):
    """Subsets insertions to a gene and converts them to a list of objects.

    DataFrame input is filtered on its 'gene_id' column and converted via
    Insertion.from_frame; any other iterable is filtered on the
    metadata['gene_id'] value of its elements.
    """

    if isinstance(insertions, pd.DataFrame):
        in_gene = insertions['gene_id'] == gene_id
        subset = insertions.loc[in_gene]
        return list(Insertion.from_frame(subset))

    selected = []
    for candidate in insertions:
        if candidate.metadata['gene_id'] == gene_id:
            selected.append(candidate)

    return selected
Exemplo n.º 17
0
    def test_from_transposon_fusion(self, tr_fusion):
        """Tests from_transposon_fusion using the example fusion."""

        actual = Insertion.from_transposon_fusion(tr_fusion)

        # Expected field values, gathered as keyword arguments.
        fields = dict(
            id=None,
            seqname='16',
            position=52141095,
            strand=-1,
            support_junction=380,
            support_spanning=118,
            support=498,
            metadata={
                'transposon_anchor': 1541,
                'orientation': 'antisense',
                'gene_name': 'Cblb',
                'feature_type': 'SA',
                'feature_strand': -1,
                'feature_name': 'En2SA',
                'gene_strand': 1
            })
        expected = Insertion(**fields)

        assert actual == expected
        assert isinstance(actual.metadata, frozendict)
Exemplo n.º 18
0
def merge_samples(
        dir_paths,  # type: Iterable[pathlib.Path]
        sample_names=None,  # type: List[str]
        with_expression=True  # type: bool
):  # type: (...) -> Tuple[Iterable[Insertion], Optional[pd.DataFrame]]
    """Merges samples in dir_paths to a single set of insertions and counts.

    Parameters
    ----------
    dir_paths : Iterable[pathlib.Path]
        Paths to the sample directories. Each directory is expected to
        contain an insertions.txt file (and an expression.txt file if
        with_expression is True).
    sample_names : List[str]
        Names to use for the merged samples. Defaults to the names of
        the sample directories.
    with_expression : bool
        Whether to also merge expression counts (assumes that expression.txt
        files have been generated using imfusion-expression).

    Returns
    -------
    Tuple[Iterable[Insertion], Optional[pandas.DataFrame]]
        The merged insertions (as returned by merge_insertions) and the
        merged exon counts (as returned by merge_exon_counts). If
        with_expression is False, the merged counts are returned as None.

    Raises
    ------
    ValueError
        If the sample names contain duplicates.

    """

    # The paths are traversed several times below (names, insertion paths,
    # count paths), so materialize them to also support one-shot iterables.
    dir_paths = list(dir_paths)

    # Use directory names as sample names if none given.
    if sample_names is None:
        sample_names = [dp.name for dp in dir_paths]

    # Check for duplicate names.
    if len(sample_names) != len(set(sample_names)):
        raise ValueError('Sample names contains duplicates')

    # Merge insertions (files are read lazily, one sample at a time).
    ins_paths = [dp / 'insertions.txt' for dp in dir_paths]
    ins_groups = (Insertion.from_csv(fp, sep='\t') for fp in ins_paths)
    merged_insertions = merge_insertions(ins_groups, sample_names)

    # Merge counts.
    if with_expression:
        count_paths = [dp / 'expression.txt' for dp in dir_paths]
        count_frames = (read_exon_counts(cp) for cp in count_paths)
        merged_counts = merge_exon_counts(count_frames, sample_names)
    else:
        merged_counts = None

    return merged_insertions, merged_counts
Exemplo n.º 19
0
def merge_samples(
    dir_paths,  # type: Iterable[pathlib.Path]
    sample_names=None,  # type: List[str]
    with_expression=True  # type: bool
):  # type: (...) -> Tuple[Iterable[Insertion], Optional[pd.DataFrame]]
    """Merges samples in dir_paths to a single set of insertions and counts.

    Parameters
    ----------
    dir_paths : Iterable[pathlib.Path]
        Paths to the sample directories. Each directory is expected to
        contain an insertions.txt file (and an expression.txt file if
        with_expression is True).
    sample_names : List[str]
        Names to use for the merged samples. Defaults to the names of
        the sample directories.
    with_expression : bool
        Whether to also merge expression counts (assumes that expression.txt
        files have been generated using imfusion-expression).

    Returns
    -------
    Tuple[Iterable[Insertion], Optional[pandas.DataFrame]]
        The merged insertions (as returned by merge_insertions) and the
        merged exon counts (as returned by merge_exon_counts). If
        with_expression is False, the merged counts are returned as None.

    Raises
    ------
    ValueError
        If the sample names contain duplicates.

    """

    # dir_paths is iterated more than once below; a list makes this safe
    # for generators and other one-shot iterables as well.
    dir_paths = list(dir_paths)

    # Use directory names as sample names if none given.
    if sample_names is None:
        sample_names = [dp.name for dp in dir_paths]

    # Check for duplicate names.
    if len(sample_names) != len(set(sample_names)):
        raise ValueError('Sample names contains duplicates')

    # Merge insertions (files are read lazily, one sample at a time).
    ins_paths = [dp / 'insertions.txt' for dp in dir_paths]
    ins_groups = (Insertion.from_csv(fp, sep='\t') for fp in ins_paths)
    merged_insertions = merge_insertions(ins_groups, sample_names)

    # Merge counts.
    if with_expression:
        count_paths = [dp / 'expression.txt' for dp in dir_paths]
        count_frames = (read_exon_counts(cp) for cp in count_paths)
        merged_counts = merge_exon_counts(count_frames, sample_names)
    else:
        merged_counts = None

    return merged_insertions, merged_counts
Exemplo n.º 20
0
def _insertion(id,
               seqname,
               position,
               strand,
               support_junction=1,
               support_spanning=1,
               metadata=None):
    """Builds an Insertion, deriving its total support and freezing metadata.

    The total support is the sum of the junction and spanning support;
    missing (or empty) metadata becomes an empty frozendict.
    """
    total_support = support_junction + support_spanning
    frozen_metadata = frozendict(metadata or {})

    return Insertion(
        id=id,
        seqname=seqname,
        position=position,
        strand=strand,
        support_junction=support_junction,
        support_spanning=support_spanning,
        support=total_support,
        metadata=frozen_metadata)
Exemplo n.º 21
0
def insertion():
    """Returns an example insertion (INS_1) in Cblb on seqname 16."""
    metadata = frozendict({
        'feature_type': 'SA',
        'feature_name': 'En2SA',
        'gene_strand': 1,
        'orientation': 'antisense',
        'feature_strand': -1,
        'gene_name': 'Cblb',
        'transposon_anchor': 1541
    })

    return Insertion(id='INS_1', seqname='16', position=52141095, strand=-1,
                     support_junction=380, support_spanning=118, support=498,
                     metadata=metadata)
Exemplo n.º 22
0
def main():
    """Entry point of imfusion-insertions.

    Builds the selected aligner from the command line arguments,
    identifies insertions and writes them (sorted by decreasing support)
    to insertions.txt in the output directory.
    """

    args = parse_args()

    # Build the aligner and verify its dependencies before running.
    aligner = args.aligner.from_args(args)
    aligner.check_dependencies()

    found = aligner.identify_insertions(
        fastq_path=args.fastq,
        output_dir=args.output_dir,
        fastq2_path=args.fastq2)

    # Tabulate, with the best supported insertions first.
    frame = Insertion.to_frame(found).sort_values('support', ascending=False)

    # Write the table as tab-separated text.
    target = args.output_dir / 'insertions.txt'
    frame.to_csv(str(target), sep='\t', index=False)
Exemplo n.º 23
0
def extract_insertions(
    fusions,  # type: Iterable[Fusion]
    gtf_path,  # type: pathlib.Path
    features_path,  # type: pathlib.Path
    chromosomes=None,  # type: List[str]
    assembled_gtf_path=None,  # type: pathlib.Path
    ffpm_fastq_path=None  # type: pathlib.Path
):  # type: (...) -> Iterable[Insertion]
    """Yields insertions derived from gene-transposon fusions.

    The fusions are annotated for genes, optionally for an assembly,
    and for transposon features; fusions without a 'feature_name' in
    their metadata are dropped. FFPM annotation is applied only when
    ffpm_fastq_path is given. The remaining fusions are converted with
    Insertion.from_transposon_fusions using ids of the form 'INS_{}'.
    """

    # Gene annotation.
    gene_reference = TranscriptReference.from_gtf(
        gtf_path, chromosomes=chromosomes)
    candidates = annotate_fusions_for_genes(fusions, gene_reference)

    # Assembly annotation (optional).
    if assembled_gtf_path is not None:
        assembly_reference = TranscriptReference.from_gtf(
            assembled_gtf_path, chromosomes=chromosomes)
        candidates = annotate_fusions_for_assembly(
            candidates, gene_reference, assembly_reference)

    # Transposon annotation; keep only fusions with a transposon feature.
    candidates = annotate_fusions_for_transposon(candidates, features_path)
    candidates = (candidate for candidate in candidates
                  if 'feature_name' in candidate.metadata)

    # FFPM scores (optional).
    if ffpm_fastq_path is not None:
        candidates = annotate_ffpm(candidates, fastq_path=ffpm_fastq_path)

    # Conversion to insertions.
    for converted in Insertion.from_transposon_fusions(
            candidates, id_fmt_str='INS_{}'):
        yield converted
Exemplo n.º 24
0
def extract_insertions(
        fusions,  # type: Iterable[Fusion]
        gtf_path,  # type: pathlib.Path
        features_path,  # type: pathlib.Path
        chromosomes=None,  # type: List[str]
        assembled_gtf_path=None,  # type: pathlib.Path
        ffpm_fastq_path=None  # type: pathlib.Path
):  # type: (...) -> Iterable[Insertion]
    """Extracts insertions from gene-transposon fusions (generator).

    Fusions are annotated step by step (genes, optional assembly,
    transposon features, optional FFPM); fusions lacking a
    'feature_name' metadata entry are discarded before the conversion
    to insertions with ids formatted as 'INS_{}'.
    """

    use_assembly = assembled_gtf_path is not None
    use_ffpm = ffpm_fastq_path is not None

    # Annotate for genes.
    reference = TranscriptReference.from_gtf(gtf_path,
                                             chromosomes=chromosomes)
    stream = annotate_fusions_for_genes(fusions, reference)

    # Annotate for assembly (if given).
    if use_assembly:
        assembled = TranscriptReference.from_gtf(assembled_gtf_path,
                                                 chromosomes=chromosomes)
        stream = annotate_fusions_for_assembly(stream, reference, assembled)

    # Annotate for transposon and drop fusions without a feature.
    stream = annotate_fusions_for_transposon(stream, features_path)
    stream = (item for item in stream if 'feature_name' in item.metadata)

    # Calculate FFPM scores.
    if use_ffpm:
        stream = annotate_ffpm(stream, fastq_path=ffpm_fastq_path)

    # Convert to insertions and hand them out one by one.
    results = Insertion.from_transposon_fusions(stream, id_fmt_str='INS_{}')

    for result in results:
        yield result
Exemplo n.º 25
0
def main():
    """Runs imfusion-insertions and writes the identified insertions."""

    args = parse_args()

    # Set up the aligner; dependencies are checked before any work is done.
    aligner = args.aligner.from_args(args)
    aligner.check_dependencies()

    identify_kwargs = {
        'fastq_path': args.fastq,
        'output_dir': args.output_dir,
        'fastq2_path': args.fastq2,
    }
    identified = aligner.identify_insertions(**identify_kwargs)

    # Convert to a frame ordered by decreasing support.
    table = Insertion.to_frame(identified)
    table = table.sort_values('support', ascending=False)

    # Write insertions.txt into the output directory.
    destination = args.output_dir / 'insertions.txt'
    table.to_csv(str(destination), sep='\t', index=False)
Exemplo n.º 26
0
def test_ctgs(
        insertions,  # type: List[Insertion]
        reference,  # type: Reference
        gene_ids=None,  # type: Set[str]
        chromosomes=None,  # type: Set[str]
        pattern=None,  # type: str
        per_sample=True,  # type: bool
        window=None  #type: Tuple[int, int]
):
    """Identifies genes that are significantly enriched for insertions (CTGs).

    This function takes a list of insertions, coming from multiple samples,
    and identifies if any genes are more frequently affected by an
    insertion than would be expected by chance. These genes are called
    Commonly Targeted Genes (CTGs). CTGs are selected by comparing the
    number of insertions within the gene to the number of insertions
    that would be expected from the background insertion rate, which is
    modeled using a Poisson distribution.

    Parameters
    ----------
    insertions : List[Insertion]
        Insertions to test.
    reference : Reference
        Reference index used by the aligner to identify insertions.
    gene_ids : Set[str]
        Ids of the genes to test (defaults to all genes with an insertion).
    chromosomes : Set[str]
        Chromosomes to include, defaults to all chromosomes
        shared between the reference sequence and the reference gtf.
    pattern : str
        Specificity pattern of the used transposon.
    per_sample : bool
        Whether to perform the per sample test (recommended), which
        effectively collapses insertions per sample/gene combination.
        This avoids issues in which insertions that are detected
        multiple times or that may have hopped inside the gene locus
        are counted multiple times.
    window : Tuple[int, int]
        Window to include around gene (in bp). Specified as (upstream_dist,
        downstream_dist). For example: (2000, 2000) specifies in a 2KB
        window around each gene.

    Returns
    -------
    pandas.DataFrame
        Results of the CTG test with one row per tested gene, sorted by
        q_value and p_value. Contains the columns gene_id, p_value and
        q_value (the p-value after correcting for multiple testing using
        bonferroni correction), together with the annotation columns
        gene_name and n_samples (nan where not available).

    Raises
    ------
    ValueError
        If no chromosomes are shared between the reference sequence and
        the reference gtf, or if an empty collection of chromosomes is
        given.

    """

    # Default to shared chromosome sequences (typically drops some
    # of the more esoteric extra scaffold/patch sequences).
    if chromosomes is None:
        reference_seq = pyfaidx.Fasta(str(reference.fasta_path))
        reference_gtf = GtfIterator(reference.indexed_gtf_path)

        chromosomes = list(
            set(reference_seq.keys()) & set(reference_gtf.contigs))

        if len(chromosomes) == 0:
            # Raise here so that the user sees the actual cause, rather
            # than the generic message of the check below.
            raise ValueError('No chromosomes are shared between the '
                             'reference sequence and reference gtf files')

    if len(chromosomes) == 0:
        raise ValueError('At least one chromosome must be given')

    # Determine gene windows using GTF.
    logging.info('Generating gene windows')
    gene_windows = _build_gene_windows(reference.indexed_gtf_path,
                                       window=window,
                                       chromosomes=chromosomes)

    # Subset insertions to gene intervals.
    insertions = _subset_to_windows(insertions, gene_windows)

    if gene_ids is None:
        gene_ids = set(ins.metadata['gene_id'] for ins in insertions)

    # Collapse insertions per gene/sample (recommended).
    # Corrects for hopping/multiple detection issues.
    if per_sample:
        logging.info('Collapsing insertions')
        insertions = list(_collapse_per_sample(insertions))

    # Calculate total number of pattern occurrences within intervals.
    logging.info('Counting pattern occurrences')
    reference_seq = pyfaidx.Fasta(str(reference.fasta_path))

    total = count_total(reference_seq,
                        pattern=pattern,
                        intervals=gene_windows.values())

    # Calculate p-values for each gene.
    logging.info('Calculating significance for genes')
    insertion_trees = GenomicIntervalTree.from_objects_position(
        insertions, chrom_attr='seqname')

    # The gene id is bound as a default argument of the lambda, so that
    # each filter keeps its own gene id.
    p_values = {
        gene_id: test_region(
            insertions=insertions,
            reference_seq=reference_seq,
            region=gene_windows[gene_id],
            total=total,
            pattern=pattern,
            filters=[lambda ins, gid=gene_id: ins.metadata['gene_id'] == gid],
            insertion_trees=insertion_trees)
        for gene_id in gene_ids
    }

    # Build result frame.
    result = pd.DataFrame.from_records(iter(p_values.items()),
                                       columns=['gene_id', 'p_value'])

    # Calculate corrected p-value using bonferroni correction. Uses
    # clip(upper=...) as Series.clip_upper was removed in pandas 1.0.
    result['q_value'] = (result['p_value'] * len(result)).clip(upper=1.0)

    # Sort by q-value and p-value.
    result.sort_values(by=['q_value', 'p_value'], inplace=True)

    if len(insertions) > 0:
        # Annotate with gene_name if possible.
        if 'gene_name' in insertions[0].metadata:
            name_map = {
                ins.metadata['gene_id']: ins.metadata['gene_name']
                for ins in insertions
            }
            result.insert(1, 'gene_name', result['gene_id'].map(name_map))
        else:
            result['gene_name'] = np.nan

        # Annotate with frequency (number of distinct samples per gene).
        frequency = (Insertion.to_frame(insertions).groupby('gene_id')
                     ['sample'].nunique().reset_index(name='n_samples'))
        result = pd.merge(result, frequency, on='gene_id', how='left')
    else:
        result['gene_name'] = np.nan
        result['n_samples'] = np.nan

    return result
Exemplo n.º 27
0
    def test_from_transposon_fusion_id(self, tr_fusion):
        """Checks that from_transposon_fusion uses the id passed as id_."""

        result = Insertion.from_transposon_fusion(tr_fusion, id_='a')
        observed_id = result.id

        assert observed_id == 'a'
Exemplo n.º 28
0
    def test_from_transposon_fusion_id(self, tr_fusion):
        """Tests from_transposon_fusion for the example, with an explicit id."""

        # The id given via id_ should end up on the insertion.
        converted = Insertion.from_transposon_fusion(tr_fusion, id_='a')

        assert converted.id == 'a'
Exemplo n.º 29
0
def test_insertions():
    """Loads the example insertions for Trp53bp2 and Nf1.

    The insertions are read from the tab-separated insertions.txt data
    file located relative to this file.
    """

    path = pytest.helpers.data_path('insertions.txt', relative_to=__file__)
    loaded = Insertion.from_csv(path, sep='\t')

    return list(loaded)
Exemplo n.º 30
0
def test_insertions():
    """Returns the example insertions for Trp53bp2 and Nf1 as a list."""

    # Data file is resolved relative to this file.
    data_file = pytest.helpers.data_path(
        'insertions.txt', relative_to=__file__)

    return [ins for ins in Insertion.from_csv(data_file, sep='\t')]
Exemplo n.º 31
0
def test_ctgs(
        insertions,  # type: List[Insertion]
        reference,  # type: Reference
        gene_ids=None,  # type: Set[str]
        chromosomes=None,  # type: Set[str]
        pattern=None,  # type: str
        per_sample=True,  # type: bool
        window=None  #type: Tuple[int, int]
):
    """Identifies genes that are significantly enriched for insertions (CTGs).

    This function takes a list of insertions, coming from multiple samples,
    and identifies if any genes are more frequently affected by an
    insertion than would be expected by chance. These genes are called
    Commonly Targeted Genes (CTGs). CTGs are selected by comparing the
    number of insertions within the gene to the number of insertions
    that would be expected from the background insertion rate, which is
    modeled using a Poisson distribution.

    Parameters
    ----------
    insertions : List[Insertion]
        Insertions to test.
    reference : Reference
        Reference index used by the aligner to identify insertions.
    gene_ids : Set[str]
        Ids of the genes to test (defaults to all genes with an insertion).
    chromosomes : Set[str]
        Chromosomes to include, defaults to all chromosomes
        shared between the reference sequence and the reference gtf.
    pattern : str
        Specificity pattern of the used transposon.
    per_sample : bool
        Whether to perform the per sample test (recommended), which
        effectively collapses insertions per sample/gene combination.
        This avoids issues in which insertions that are detected
        multiple times or that may have hopped inside the gene locus
        are counted multiple times.
    window : Tuple[int, int]
        Window to include around gene (in bp). Specified as (upstream_dist,
        downstream_dist). For example: (2000, 2000) specifies in a 2KB
        window around each gene.

    Returns
    -------
    pandas.DataFrame
        Results of the CTG test with one row per tested gene, sorted by
        q_value and p_value. Contains the columns gene_id, p_value and
        q_value (the p-value after correcting for multiple testing using
        bonferroni correction), together with the annotation columns
        gene_name and n_samples (nan where not available).

    Raises
    ------
    ValueError
        If no chromosomes are shared between the reference sequence and
        the reference gtf, or if an empty collection of chromosomes is
        given.

    """

    # Default to shared chromosome sequences (typically drops some
    # of the more esoteric extra scaffold/patch sequences).
    if chromosomes is None:
        reference_seq = pyfaidx.Fasta(str(reference.fasta_path))
        reference_gtf = GtfIterator(reference.indexed_gtf_path)

        chromosomes = list(
            set(reference_seq.keys()) & set(reference_gtf.contigs))

        if len(chromosomes) == 0:
            # Raise here so that the user sees the actual cause, rather
            # than the generic message of the check below.
            raise ValueError('No chromosomes are shared between the '
                             'reference sequence and reference gtf files')

    if len(chromosomes) == 0:
        raise ValueError('At least one chromosome must be given')

    # Determine gene windows using GTF.
    logging.info('Generating gene windows')
    gene_windows = _build_gene_windows(
        reference.indexed_gtf_path, window=window, chromosomes=chromosomes)

    # Subset insertions to gene intervals.
    insertions = _subset_to_windows(insertions, gene_windows)

    if gene_ids is None:
        gene_ids = set(ins.metadata['gene_id'] for ins in insertions)

    # Collapse insertions per gene/sample (recommended).
    # Corrects for hopping/multiple detection issues.
    if per_sample:
        logging.info('Collapsing insertions')
        insertions = list(_collapse_per_sample(insertions))

    # Calculate total number of pattern occurrences within intervals.
    logging.info('Counting pattern occurrences')
    reference_seq = pyfaidx.Fasta(str(reference.fasta_path))

    total = count_total(
        reference_seq, pattern=pattern, intervals=gene_windows.values())

    # Calculate p-values for each gene.
    logging.info('Calculating significance for genes')
    insertion_trees = GenomicIntervalTree.from_objects_position(
        insertions, chrom_attr='seqname')

    # The gene id is bound as a default argument of the lambda, so that
    # each filter keeps its own gene id.
    p_values = {
        gene_id: test_region(
            insertions=insertions,
            reference_seq=reference_seq,
            region=gene_windows[gene_id],
            total=total,
            pattern=pattern,
            filters=[lambda ins, gid=gene_id: ins.metadata['gene_id'] == gid],
            insertion_trees=insertion_trees)
        for gene_id in gene_ids
    }

    # Build result frame.
    result = pd.DataFrame.from_records(
        iter(p_values.items()), columns=['gene_id', 'p_value'])

    # Calculate corrected p-value using bonferroni correction. Uses
    # clip(upper=...) as Series.clip_upper was removed in pandas 1.0.
    result['q_value'] = (result['p_value'] * len(result)).clip(upper=1.0)

    # Sort by q-value and p-value.
    result.sort_values(by=['q_value', 'p_value'], inplace=True)

    if len(insertions) > 0:
        # Annotate with gene_name if possible.
        if 'gene_name' in insertions[0].metadata:
            name_map = {
                ins.metadata['gene_id']: ins.metadata['gene_name']
                for ins in insertions
            }
            result.insert(1, 'gene_name', result['gene_id'].map(name_map))
        else:
            result['gene_name'] = np.nan

        # Annotate with frequency (number of distinct samples per gene).
        frequency = (Insertion.to_frame(insertions)
                     .groupby('gene_id')['sample'].nunique()
                     .reset_index(name='n_samples'))
        result = pd.merge(result, frequency, on='gene_id', how='left')
    else:
        result['gene_name'] = np.nan
        result['n_samples'] = np.nan

    return result