def test_from_transposon_fusions(self, tr_fusion):
    """Checks from_transposon_fusions on the example fusion."""

    # Convert first, then build the insertion we expect to get back.
    converted = list(Insertion.from_transposon_fusions([tr_fusion]))

    expected_metadata = {
        'transposon_anchor': 1541,
        'orientation': 'antisense',
        'gene_name': 'Cblb',
        'feature_type': 'SA',
        'feature_strand': -1,
        'feature_name': 'En2SA',
        'gene_strand': 1
    }
    expected_insertion = Insertion(
        id=None,
        seqname='16',
        position=52141095,
        strand=-1,
        support_junction=380,
        support_spanning=118,
        support=498,
        metadata=expected_metadata)

    assert [expected_insertion] == converted
def insertions():
    """Two example insertions in ENSMUSG00000051951 (5 exons)."""

    gene_id = 'ENSMUSG00000051951'

    # Located after the third exon.
    after_third_exon = Insertion(
        id='1',
        seqname='1',
        position=3207327,
        strand=1,
        support_junction=1,
        support_spanning=1,
        support=2,
        metadata=frozendict({'gene_id': gene_id, 'sample': 'S1'}))

    # Located after the first exon.
    after_first_exon = Insertion(
        id='2',
        seqname='1',
        position=3214491,
        strand=-1,
        support_junction=1,
        support_spanning=1,
        support=2,
        metadata=frozendict({'gene_id': gene_id, 'sample': 'S2'}))

    return [after_third_exon, after_first_exon]  # yapf: disable
def main():
    """Entry point of imfusion-ctg.

    Reads insertions, tests them for commonly targeted genes (CTGs),
    optionally adds differential expression results for those genes and
    writes the resulting table as a tab-separated file.
    """

    log = logging.getLogger()
    args = parse_args()

    # Load the insertions, dropping those below the requested support.
    insertions = list(Insertion.from_csv(args.insertions, sep='\t'))

    min_depth = args.min_depth
    if min_depth is not None:
        insertions = [ins for ins in insertions if ins.support >= min_depth]

    # Run the CTG test.
    log.info('Testing for CTGs')

    window = args.window
    if window is not None:
        log.info('- Using window (%d, %d)', *window)

    ctgs = test_ctgs(
        insertions,
        reference=Reference(args.reference),
        gene_ids=args.gene_ids,
        chromosomes=args.chromosomes,
        pattern=args.pattern,
        window=window)

    # Keep only genes passing the q-value threshold (if any).
    if args.threshold is not None:
        threshold_expr = 'q_value <= {}'.format(args.threshold)
        ctgs = ctgs.query(threshold_expr)

    # Differential expression is only tested when counts are supplied.
    if args.expression is not None:
        log.info('Testing for differential expression')

        counts = read_exon_counts(args.expression)
        de_table = test_de(insertions, counts, gene_ids=ctgs['gene_id'])

        # Prefix the DE columns so they do not clash with the CTG columns.
        renamed = {
            'direction': 'de_direction',
            'p_value': 'de_pvalue',
            'test_type': 'de_test'
        }
        de_table = de_table.rename(columns=renamed)

        columns = list(ctgs.columns)
        columns.extend(['de_test', 'de_direction', 'de_pvalue'])

        merged = pd.merge(ctgs, de_table, on='gene_id', how='left')
        ctgs = merged[columns]

        if args.de_threshold is not None:
            # Negating the "above threshold" mask keeps rows whose p-value
            # is NaN, as NaN never compares greater than the threshold.
            above_threshold = ctgs['de_pvalue'] > args.de_threshold
            ctgs = ctgs.loc[~above_threshold]

    log.info('Writing outputs')
    ctgs.to_csv(str(args.output), sep='\t', index=False)
def test_from_transposon_fusions(self, tr_fusion):
    """Checks from_transposon_fusions on the example fusion."""

    # Convert first, then build the insertion we expect to get back.
    converted = list(Insertion.from_transposon_fusions([tr_fusion]))

    expected_metadata = {
        'transposon_anchor': 1541,
        'orientation': 'antisense',
        'gene_name': 'Cblb',
        'feature_type': 'SA',
        'feature_strand': -1,
        'feature_name': 'En2SA',
        'gene_strand': 1
    }
    expected_insertion = Insertion(
        id=None,
        seqname='16',
        position=52141095,
        strand=-1,
        support_junction=380,
        support_spanning=118,
        support=498,
        metadata=expected_metadata)

    assert [expected_insertion] == converted
def test_from_transposon_fusion(self, tr_fusion):
    """Checks from_transposon_fusion on the example fusion."""

    converted = Insertion.from_transposon_fusion(tr_fusion)

    expected_metadata = {
        'transposon_anchor': 1541,
        'orientation': 'antisense',
        'gene_name': 'Cblb',
        'feature_type': 'SA',
        'feature_strand': -1,
        'feature_name': 'En2SA',
        'gene_strand': 1
    }
    expected_insertion = Insertion(
        id=None,
        seqname='16',
        position=52141095,
        strand=-1,
        support_junction=380,
        support_spanning=118,
        support=498,
        metadata=expected_metadata)

    assert converted == expected_insertion

    # The metadata should have been stored as a frozendict.
    assert isinstance(converted.metadata, frozendict)
def main():
    """Entry point of imfusion-ctg.

    Reads insertions, tests them for commonly targeted genes (CTGs),
    optionally adds differential expression results for those genes and
    writes the resulting table as a tab-separated file.
    """

    log = logging.getLogger()
    args = parse_args()

    # Load the insertions, dropping those below the requested support.
    insertions = list(Insertion.from_csv(args.insertions, sep='\t'))

    min_depth = args.min_depth
    if min_depth is not None:
        insertions = [ins for ins in insertions if ins.support >= min_depth]

    # Run the CTG test.
    log.info('Testing for CTGs')

    window = args.window
    if window is not None:
        log.info('- Using window (%d, %d)', *window)

    ctgs = test_ctgs(
        insertions,
        reference=Reference(args.reference),
        gene_ids=args.gene_ids,
        chromosomes=args.chromosomes,
        pattern=args.pattern,
        window=window)

    # Keep only genes passing the q-value threshold (if any).
    if args.threshold is not None:
        threshold_expr = 'q_value <= {}'.format(args.threshold)
        ctgs = ctgs.query(threshold_expr)

    # Differential expression is only tested when counts are supplied.
    if args.expression is not None:
        log.info('Testing for differential expression')

        counts = read_exon_counts(args.expression)
        de_table = test_de(insertions, counts, gene_ids=ctgs['gene_id'])

        # Prefix the DE columns so they do not clash with the CTG columns.
        renamed = {
            'direction': 'de_direction',
            'p_value': 'de_pvalue',
            'test_type': 'de_test'
        }
        de_table = de_table.rename(columns=renamed)

        columns = list(ctgs.columns)
        columns.extend(['de_test', 'de_direction', 'de_pvalue'])

        merged = pd.merge(ctgs, de_table, on='gene_id', how='left')
        ctgs = merged[columns]

        if args.de_threshold is not None:
            # Negating the "above threshold" mask keeps rows whose p-value
            # is NaN, as NaN never compares greater than the threshold.
            above_threshold = ctgs['de_pvalue'] > args.de_threshold
            ctgs = ctgs.loc[~above_threshold]

    log.info('Writing outputs')
    ctgs.to_csv(str(args.output), sep='\t', index=False)
def test_from_transposon_fusions_id(self, tr_fusion):
    """Checks that from_transposon_fusions applies the id format string."""

    converted = Insertion.from_transposon_fusions(
        [tr_fusion], id_fmt_str='INS_{}')
    first = list(converted)[0]

    assert first.id == 'INS_1'
def test_from_transposon_fusions_id(self, tr_fusion):
    """Checks that from_transposon_fusions applies the id format string."""

    converted = Insertion.from_transposon_fusions(
        [tr_fusion], id_fmt_str='INS_{}')
    first = list(converted)[0]

    assert first.id == 'INS_1'
def main():
    """Entry point of imfusion-merge.

    Merges the given sample directories into one set of insertions (and,
    if an expression output path was given, one expression table) and
    writes the result(s) as tab-separated files.
    """

    args = _parse_args()

    # Fall back to the directory names when no sample names were supplied.
    names = args.names
    if not names:
        names = [fp.name for fp in args.sample_dirs]

    # Expression is only merged when an output path for it was requested.
    want_expression = args.output_expression is not None

    merged_ins, merged_expr = merge_samples(
        args.sample_dirs,
        sample_names=names,
        with_expression=want_expression)

    # Write output(s).
    Insertion.to_csv(str(args.output), merged_ins, sep='\t', index=False)

    if want_expression:
        expression_path = str(args.output_expression)
        merged_expr.to_csv(expression_path, sep='\t', index=True)
def test_pos_example_trp53bp2_df(self, test_insertions, test_exon_counts):
    """Checks the positive Trp53bp2 DE example using a dataframe input."""

    gene_id = 'ENSMUSG00000026510'

    # Pass the insertions as a frame instead of a list of objects.
    frame = Insertion.to_frame(test_insertions)
    outcome = test.test_de_exon(frame, test_exon_counts, gene_id=gene_id)

    assert outcome.p_value < 0.01
    assert outcome.direction == 1
def test_proper_example_df(self, insertions, exon_counts):
    """Checks split_counts on the same example with a dataframe input."""

    gene_id = 'ENSMUSG00000051951'

    # Pass the insertions as a frame instead of a list of objects.
    frame = Insertion.to_frame(insertions)
    split = test.split_counts(exon_counts, frame, gene_id=gene_id)
    counts_before, counts_after, dropped_samples = split

    assert len(counts_before) == 1
    assert len(counts_after) == 2
    assert dropped_samples == set()
def test_pos_example_trp53bp2_df(self, test_insertions, test_exon_counts):
    """Checks the positive Trp53bp2 DE example using a dataframe input."""

    gene_id = 'ENSMUSG00000026510'

    # Pass the insertions as a frame instead of a list of objects.
    frame = Insertion.to_frame(test_insertions)
    outcome = test.test_de_exon(frame, test_exon_counts, gene_id=gene_id)

    assert outcome.p_value < 0.01
    assert outcome.direction == 1
def _preprocess_insertions(insertions, gene_id):
    """Converts insertions into common object format and subsets for gene.

    Parameters
    ----------
    insertions : pandas.DataFrame or Iterable[Insertion]
        Insertions, either as a frame with a ``gene_id`` column or as
        objects carrying a ``metadata`` mapping.
    gene_id : str
        Identifier of the gene to select insertions for.

    Returns
    -------
    List[Insertion]
        Insertions whose gene_id equals the given gene_id.

    """

    if isinstance(insertions, pd.DataFrame):
        subset = insertions.loc[insertions['gene_id'] == gene_id]
        return list(Insertion.from_frame(subset))

    # Insertions without a gene_id are skipped rather than raising a
    # KeyError; the frame branch above likewise just filters on equality.
    return [
        ins for ins in insertions
        if 'gene_id' in ins.metadata and ins.metadata['gene_id'] == gene_id
    ]
def test_from_transposon_fusion(self, tr_fusion):
    """Checks from_transposon_fusion on the example fusion."""

    converted = Insertion.from_transposon_fusion(tr_fusion)

    expected_metadata = {
        'transposon_anchor': 1541,
        'orientation': 'antisense',
        'gene_name': 'Cblb',
        'feature_type': 'SA',
        'feature_strand': -1,
        'feature_name': 'En2SA',
        'gene_strand': 1
    }
    expected_insertion = Insertion(
        id=None,
        seqname='16',
        position=52141095,
        strand=-1,
        support_junction=380,
        support_spanning=118,
        support=498,
        metadata=expected_metadata)

    assert converted == expected_insertion

    # The metadata should have been stored as a frozendict.
    assert isinstance(converted.metadata, frozendict)
def merge_samples(
        dir_paths,  # type: Iterable[pathlib.Path]
        sample_names=None,  # type: List[str]
        with_expression=True  # type: bool
):  # type: (...) -> Tuple[Iterable[Insertion], Optional[pd.DataFrame]]
    """Merges samples in dir_paths to a single set of insertions and counts.

    Parameters
    ----------
    dir_paths : Iterable[pathlib.Path]
        Paths to the sample directories. Each directory is expected to
        contain an ``insertions.txt`` file (and an ``expression.txt`` file
        if with_expression is True).
    sample_names : List[str]
        Names to use for the merged samples, in the same order as
        dir_paths. Defaults to the names of the sample directories.
    with_expression : bool
        Whether to also merge expression counts (assumes that
        expression.txt files have been generated using
        imfusion-expression).

    Returns
    -------
    Tuple[Iterable[Insertion], Optional[pandas.DataFrame]]
        The merged insertions (as returned by merge_insertions) and the
        merged exon counts. If with_expression is False, the merged
        counts are returned as None.

    Raises
    ------
    ValueError
        If the sample names contain duplicates, or if the number of sample
        names does not match the number of sample directories.

    """

    # dir_paths is iterated several times below, so materialize it once.
    # Otherwise a generator would be exhausted after the first pass.
    dir_paths = list(dir_paths)

    # Use directory names as sample names if none given.
    if sample_names is None:
        sample_names = [dp.name for dp in dir_paths]

    # Check for duplicate names.
    if len(sample_names) != len(set(sample_names)):
        raise ValueError('Sample names contains duplicates')

    # Names are paired with directories by position, so counts must agree.
    if len(sample_names) != len(dir_paths):
        raise ValueError('Number of sample names ({}) does not match the '
                         'number of sample directories ({})'
                         .format(len(sample_names), len(dir_paths)))

    # Merge insertions (files are read lazily, one sample at a time).
    ins_paths = [dp / 'insertions.txt' for dp in dir_paths]
    ins_groups = (Insertion.from_csv(fp, sep='\t') for fp in ins_paths)
    merged_insertions = merge_insertions(ins_groups, sample_names)

    # Merge counts.
    if with_expression:
        count_paths = [dp / 'expression.txt' for dp in dir_paths]
        count_frames = (read_exon_counts(cp) for cp in count_paths)
        merged_counts = merge_exon_counts(count_frames, sample_names)
    else:
        merged_counts = None

    return merged_insertions, merged_counts
def _insertion(id,
               seqname,
               position,
               strand,
               support_junction=1,
               support_spanning=1,
               metadata=None):
    """Builds an Insertion, deriving its total support and freezing metadata."""

    # Total support is the sum of the junction and spanning support.
    total_support = support_junction + support_spanning

    # A missing (or empty) metadata argument becomes an empty frozendict.
    frozen_metadata = frozendict(metadata or {})

    return Insertion(
        id=id,
        seqname=seqname,
        position=position,
        strand=strand,
        support_junction=support_junction,
        support_spanning=support_spanning,
        support=total_support,
        metadata=frozen_metadata)
def insertion():
    """Example insertion (INS_1) with En2SA/Cblb transposon metadata."""

    metadata = frozendict({
        'feature_type': 'SA',
        'feature_name': 'En2SA',
        'gene_strand': 1,
        'orientation': 'antisense',
        'feature_strand': -1,
        'gene_name': 'Cblb',
        'transposon_anchor': 1541
    })

    example = Insertion(
        id='INS_1',
        seqname='16',
        position=52141095,
        strand=-1,
        support_junction=380,
        support_spanning=118,
        support=498,
        metadata=metadata)

    return example
def main():
    """Entry point of imfusion-insertions.

    Builds the selected aligner from the parsed arguments, identifies
    insertions and writes them (sorted by decreasing support) to
    ``insertions.txt`` in the output directory.
    """

    args = parse_args()

    # Build the aligner and make sure its dependencies are available.
    aligner = args.aligner.from_args(args)
    aligner.check_dependencies()

    found = aligner.identify_insertions(
        fastq_path=args.fastq,
        output_dir=args.output_dir,
        fastq2_path=args.fastq2)

    # Convert to a frame, with the best supported insertions first.
    frame = Insertion.to_frame(found).sort_values('support', ascending=False)

    # Write output.
    destination = args.output_dir / 'insertions.txt'
    frame.to_csv(str(destination), sep='\t', index=False)
def extract_insertions(
        fusions,  # type: Iterable[Fusion]
        gtf_path,  # type: pathlib.Path
        features_path,  # type: pathlib.Path
        chromosomes=None,  # type: List[str]
        assembled_gtf_path=None,  # type: pathlib.Path
        ffpm_fastq_path=None  # type: pathlib.Path
):  # type: (...) -> Iterable[Insertion]
    """Yields insertions derived from gene-transposon fusions.

    The fusions are annotated for genes, optionally for an assembled GTF,
    and for transposon features. Fusions without a ``feature_name`` in
    their metadata are dropped, and FFPM annotation is applied if a fastq
    path is given. This is a generator: nothing runs until it is iterated.
    """

    # Gene annotation against the reference GTF.
    gene_reference = TranscriptReference.from_gtf(
        gtf_path, chromosomes=chromosomes)
    candidates = annotate_fusions_for_genes(fusions, gene_reference)

    # Assembly annotation is optional.
    if assembled_gtf_path is not None:
        assembly_reference = TranscriptReference.from_gtf(
            assembled_gtf_path, chromosomes=chromosomes)
        candidates = annotate_fusions_for_assembly(
            candidates, gene_reference, assembly_reference)

    # Transposon feature annotation.
    candidates = annotate_fusions_for_transposon(candidates, features_path)

    # Only fusions that received a transposon feature are kept.
    candidates = (candidate for candidate in candidates
                  if 'feature_name' in candidate.metadata)

    # FFPM scores are optional.
    if ffpm_fastq_path is not None:
        candidates = annotate_ffpm(candidates, fastq_path=ffpm_fastq_path)

    # Convert to insertions, with ids following the 'INS_{}' format.
    converted = Insertion.from_transposon_fusions(
        candidates, id_fmt_str='INS_{}')

    for ins in converted:
        yield ins
def extract_insertions(
        fusions,  # type: Iterable[Fusion]
        gtf_path,  # type: pathlib.Path
        features_path,  # type: pathlib.Path
        chromosomes=None,  # type: List[str]
        assembled_gtf_path=None,  # type: pathlib.Path
        ffpm_fastq_path=None  # type: pathlib.Path
):  # type: (...) -> Iterable[Insertion]
    """Yields insertions derived from gene-transposon fusions.

    The fusions are annotated for genes, optionally for an assembled GTF,
    and for transposon features. Fusions without a ``feature_name`` in
    their metadata are dropped, and FFPM annotation is applied if a fastq
    path is given. This is a generator: nothing runs until it is iterated.
    """

    # Gene annotation against the reference GTF.
    gene_reference = TranscriptReference.from_gtf(
        gtf_path, chromosomes=chromosomes)
    candidates = annotate_fusions_for_genes(fusions, gene_reference)

    # Assembly annotation is optional.
    if assembled_gtf_path is not None:
        assembly_reference = TranscriptReference.from_gtf(
            assembled_gtf_path, chromosomes=chromosomes)
        candidates = annotate_fusions_for_assembly(
            candidates, gene_reference, assembly_reference)

    # Transposon feature annotation.
    candidates = annotate_fusions_for_transposon(candidates, features_path)

    # Only fusions that received a transposon feature are kept.
    candidates = (candidate for candidate in candidates
                  if 'feature_name' in candidate.metadata)

    # FFPM scores are optional.
    if ffpm_fastq_path is not None:
        candidates = annotate_ffpm(candidates, fastq_path=ffpm_fastq_path)

    # Convert to insertions, with ids following the 'INS_{}' format.
    converted = Insertion.from_transposon_fusions(
        candidates, id_fmt_str='INS_{}')

    for ins in converted:
        yield ins
def main():
    """Entry point of imfusion-insertions.

    Builds the selected aligner from the parsed arguments, identifies
    insertions and writes them (sorted by decreasing support) to
    ``insertions.txt`` in the output directory.
    """

    args = parse_args()

    # Build the aligner and make sure its dependencies are available.
    aligner = args.aligner.from_args(args)
    aligner.check_dependencies()

    found = aligner.identify_insertions(
        fastq_path=args.fastq,
        output_dir=args.output_dir,
        fastq2_path=args.fastq2)

    # Convert to a frame, with the best supported insertions first.
    frame = Insertion.to_frame(found).sort_values('support', ascending=False)

    # Write output.
    destination = args.output_dir / 'insertions.txt'
    frame.to_csv(str(destination), sep='\t', index=False)
def test_ctgs(
        insertions,  # type: List[Insertion]
        reference,  # type: Reference
        gene_ids=None,  # type: Set[str]
        chromosomes=None,  # type: Set[str]
        pattern=None,  # type: str
        per_sample=True,  # type: bool
        window=None  # type: Tuple[int, int]
):
    """Identifies genes that are significantly enriched for insertions (CTGs).

    This function takes a list of insertions, coming from multiple samples,
    and identifies if any genes are more frequently affected by an
    insertion than would be expected by chance. These genes are called
    Commonly Targeted Genes (CTGs). CTGs are selected by comparing the
    number of insertions within the gene to the number of insertions
    that would be expected from the background insertion rate, which is
    modeled using a Poisson distribution.

    Parameters
    ----------
    insertions : List[Insertion]
        Insertions to test.
    reference : Reference
        Reference index used by the aligner to identify insertions.
    gene_ids : Set[str]
        Ids of the genes to test (defaults to all genes with an insertion).
    chromosomes : Set[str]
        Chromosomes to include, defaults to all chromosomes shared
        between the reference sequence and the reference gtf.
    pattern : str
        Specificity pattern of the used transposon.
    per_sample : bool
        Whether to perform the per sample test (recommended), which
        effectively collapses insertions per sample/gene combination.
        This avoids issues in which insertions that are detected
        multiple times or that may have hopped inside the gene locus
        are counted multiple times.
    window : Tuple[int, int]
        Window to include around gene (in bp). Specified as (upstream_dist,
        downstream_dist). For example: (2000, 2000) specifies in a 2KB
        window around each gene.

    Returns
    -------
    pandas.DataFrame
        Results of CTG test for tested genes, sorted by q_value and
        p_value. Contains the columns gene_id, p_value and q_value, plus
        gene_name and n_samples annotations. The q_value column represents
        the p-value of the gene after correcting for multiple testing
        using bonferroni correction.

    Raises
    ------
    ValueError
        If no chromosomes are shared between the reference sequence and
        the reference gtf, or if an empty set of chromosomes is given.

    """

    # Default to shared chromosome sequences (typically drops some
    # of the more esoteric extra scaffold/patch sequences).
    if chromosomes is None:
        reference_seq = pyfaidx.Fasta(str(reference.fasta_path))
        reference_gtf = GtfIterator(reference.indexed_gtf_path)

        chromosomes = list(
            set(reference_seq.keys()) & set(reference_gtf.contigs))

        if len(chromosomes) == 0:
            # This error was previously constructed but never raised.
            raise ValueError('No chromosomes are shared between the '
                             'reference sequence and reference gtf files')

    if len(chromosomes) == 0:
        raise ValueError('At least one chromosome must be given')

    # Determine gene windows using GTF.
    logging.info('Generating gene windows')
    gene_windows = _build_gene_windows(
        reference.indexed_gtf_path, window=window, chromosomes=chromosomes)

    # Subset insertions to gene intervals.
    insertions = _subset_to_windows(insertions, gene_windows)

    if gene_ids is None:
        gene_ids = set(ins.metadata['gene_id'] for ins in insertions)

    # Collapse insertions per gene/sample (recommended).
    # Corrects for hopping/multiple detection issues.
    if per_sample:
        logging.info('Collapsing insertions')
        insertions = list(_collapse_per_sample(insertions))

    # Calculate total number of pattern occurrences within intervals.
    logging.info('Counting pattern occurrences')
    reference_seq = pyfaidx.Fasta(str(reference.fasta_path))

    total = count_total(
        reference_seq, pattern=pattern, intervals=gene_windows.values())

    # Calculate p-values for each gene.
    logging.info('Calculating significance for genes')
    insertion_trees = GenomicIntervalTree.from_objects_position(
        insertions, chrom_attr='seqname')

    # The gene id is bound as a lambda default (gid=gene_id) so that each
    # filter keeps its own gene instead of the last value of the loop.
    p_values = {
        gene_id: test_region(
            insertions=insertions,
            reference_seq=reference_seq,
            region=gene_windows[gene_id],
            total=total,
            pattern=pattern,
            filters=[lambda ins, gid=gene_id: ins.metadata['gene_id'] == gid],
            insertion_trees=insertion_trees)
        for gene_id in gene_ids
    }

    # Build result frame.
    result = pd.DataFrame.from_records(
        iter(p_values.items()), columns=['gene_id', 'p_value'])

    # Calculate corrected p-value using bonferroni correction. Uses
    # clip(upper=...) as Series.clip_upper was removed in pandas 1.0.
    result['q_value'] = (result['p_value'] * len(result)).clip(upper=1.0)

    # Sort by q-value and p-value.
    result.sort_values(by=['q_value', 'p_value'], inplace=True)

    if len(insertions) > 0:
        # Annotate with gene_name if possible.
        if 'gene_name' in insertions[0].metadata:
            name_map = {
                ins.metadata['gene_id']: ins.metadata['gene_name']
                for ins in insertions
            }
            result.insert(1, 'gene_name', result['gene_id'].map(name_map))
        else:
            result['gene_name'] = np.nan

        # Annotate with frequency (number of distinct samples per gene).
        frequency = (Insertion.to_frame(insertions)
                     .groupby('gene_id')['sample'].nunique()
                     .reset_index(name='n_samples'))
        result = pd.merge(result, frequency, on='gene_id', how='left')
    else:
        result['gene_name'] = np.nan
        result['n_samples'] = np.nan

    return result
def test_from_transposon_fusion_id(self, tr_fusion):
    """Checks that from_transposon_fusion uses the id that is passed in."""

    expected_id = 'a'
    converted = Insertion.from_transposon_fusion(tr_fusion, id_=expected_id)

    assert converted.id == expected_id
def test_insertions():
    """Example insertions for Trp53bp2 and Nf1, read from insertions.txt."""

    # The data file is resolved relative to this file.
    path = pytest.helpers.data_path('insertions.txt', relative_to=__file__)

    parsed = Insertion.from_csv(path, sep='\t')
    return list(parsed)
def test_insertions():
    """Example insertions for Trp53bp2 and Nf1, read from insertions.txt."""

    # The data file is resolved relative to this file.
    path = pytest.helpers.data_path('insertions.txt', relative_to=__file__)

    parsed = Insertion.from_csv(path, sep='\t')
    return list(parsed)
def test_ctgs(
        insertions,  # type: List[Insertion]
        reference,  # type: Reference
        gene_ids=None,  # type: Set[str]
        chromosomes=None,  # type: Set[str]
        pattern=None,  # type: str
        per_sample=True,  # type: bool
        window=None  # type: Tuple[int, int]
):
    """Identifies genes that are significantly enriched for insertions (CTGs).

    This function takes a list of insertions, coming from multiple samples,
    and identifies if any genes are more frequently affected by an
    insertion than would be expected by chance. These genes are called
    Commonly Targeted Genes (CTGs). CTGs are selected by comparing the
    number of insertions within the gene to the number of insertions
    that would be expected from the background insertion rate, which is
    modeled using a Poisson distribution.

    Parameters
    ----------
    insertions : List[Insertion]
        Insertions to test.
    reference : Reference
        Reference index used by the aligner to identify insertions.
    gene_ids : Set[str]
        Ids of the genes to test (defaults to all genes with an insertion).
    chromosomes : Set[str]
        Chromosomes to include, defaults to all chromosomes shared
        between the reference sequence and the reference gtf.
    pattern : str
        Specificity pattern of the used transposon.
    per_sample : bool
        Whether to perform the per sample test (recommended), which
        effectively collapses insertions per sample/gene combination.
        This avoids issues in which insertions that are detected
        multiple times or that may have hopped inside the gene locus
        are counted multiple times.
    window : Tuple[int, int]
        Window to include around gene (in bp). Specified as (upstream_dist,
        downstream_dist). For example: (2000, 2000) specifies in a 2KB
        window around each gene.

    Returns
    -------
    pandas.DataFrame
        Results of CTG test for tested genes, sorted by q_value and
        p_value. Contains the columns gene_id, p_value and q_value, plus
        gene_name and n_samples annotations. The q_value column represents
        the p-value of the gene after correcting for multiple testing
        using bonferroni correction.

    Raises
    ------
    ValueError
        If no chromosomes are shared between the reference sequence and
        the reference gtf, or if an empty set of chromosomes is given.

    """

    # Default to shared chromosome sequences (typically drops some
    # of the more esoteric extra scaffold/patch sequences).
    if chromosomes is None:
        reference_seq = pyfaidx.Fasta(str(reference.fasta_path))
        reference_gtf = GtfIterator(reference.indexed_gtf_path)

        chromosomes = list(
            set(reference_seq.keys()) & set(reference_gtf.contigs))

        if len(chromosomes) == 0:
            # This error was previously constructed but never raised.
            raise ValueError('No chromosomes are shared between the '
                             'reference sequence and reference gtf files')

    if len(chromosomes) == 0:
        raise ValueError('At least one chromosome must be given')

    # Determine gene windows using GTF.
    logging.info('Generating gene windows')
    gene_windows = _build_gene_windows(
        reference.indexed_gtf_path, window=window, chromosomes=chromosomes)

    # Subset insertions to gene intervals.
    insertions = _subset_to_windows(insertions, gene_windows)

    if gene_ids is None:
        gene_ids = set(ins.metadata['gene_id'] for ins in insertions)

    # Collapse insertions per gene/sample (recommended).
    # Corrects for hopping/multiple detection issues.
    if per_sample:
        logging.info('Collapsing insertions')
        insertions = list(_collapse_per_sample(insertions))

    # Calculate total number of pattern occurrences within intervals.
    logging.info('Counting pattern occurrences')
    reference_seq = pyfaidx.Fasta(str(reference.fasta_path))

    total = count_total(
        reference_seq, pattern=pattern, intervals=gene_windows.values())

    # Calculate p-values for each gene.
    logging.info('Calculating significance for genes')
    insertion_trees = GenomicIntervalTree.from_objects_position(
        insertions, chrom_attr='seqname')

    # The gene id is bound as a lambda default (gid=gene_id) so that each
    # filter keeps its own gene instead of the last value of the loop.
    p_values = {
        gene_id: test_region(
            insertions=insertions,
            reference_seq=reference_seq,
            region=gene_windows[gene_id],
            total=total,
            pattern=pattern,
            filters=[lambda ins, gid=gene_id: ins.metadata['gene_id'] == gid],
            insertion_trees=insertion_trees)
        for gene_id in gene_ids
    }

    # Build result frame.
    result = pd.DataFrame.from_records(
        iter(p_values.items()), columns=['gene_id', 'p_value'])

    # Calculate corrected p-value using bonferroni correction. Uses
    # clip(upper=...) as Series.clip_upper was removed in pandas 1.0.
    result['q_value'] = (result['p_value'] * len(result)).clip(upper=1.0)

    # Sort by q-value and p-value.
    result.sort_values(by=['q_value', 'p_value'], inplace=True)

    if len(insertions) > 0:
        # Annotate with gene_name if possible.
        if 'gene_name' in insertions[0].metadata:
            name_map = {
                ins.metadata['gene_id']: ins.metadata['gene_name']
                for ins in insertions
            }
            result.insert(1, 'gene_name', result['gene_id'].map(name_map))
        else:
            result['gene_name'] = np.nan

        # Annotate with frequency (number of distinct samples per gene).
        frequency = (Insertion.to_frame(insertions)
                     .groupby('gene_id')['sample'].nunique()
                     .reset_index(name='n_samples'))
        result = pd.merge(result, frequency, on='gene_id', how='left')
    else:
        result['gene_name'] = np.nan
        result['n_samples'] = np.nan

    return result