def get_nominal_annotations(self):
    """Get nominal essentials and non-essentials in cerevisiae.

    Genes are read from the viable and inviable annotation tables, keeping
    only rows whose "Mutant Information" column equals "null". Each gene is
    looked up in ``self.feature_db`` and kept only if the resulting feature
    is an ORF whose qualifier is not "Dubious".

    Returns
    -------
    (set, set)
        A pair of sets, denoting the essential and non-essential genes,
        using their standard names.
    """
    viable_filepath = Shared.get_dependency(
        "cerevisiae", "cerevisiae_viable_annotations.txt")
    inviable_filepath = Shared.get_dependency(
        "cerevisiae", "cerevisiae_inviable_annotations.txt")

    def read_annotations(filepath):
        # "null" is one of the tokens pandas converts to NaN by default,
        # which would make the == "null" filter below match nothing. Only
        # blank cells are treated as missing here, so "null" stays a string.
        return pd.read_csv(filepath, skiprows=8, delimiter="\t",
                           keep_default_na=False, na_values=[""])

    def null_mutant_features(table):
        # Gene names of the rows annotated with a "null" mutant.
        genes = set(table[table["Mutant Information"] == "null"]["Gene"])
        features = set(
            self.feature_db.get_feature_by_name(gene) for gene in genes)
        # Failed lookups (None results) are dropped.
        features.discard(None)
        return features

    def consensus_orf_names(features):
        # TODO: the dubious genes shouldn't be filtered here.
        return set(f.standard_name for f in features
                   if f.is_orf and f.feature_qualifier != "Dubious")

    viable_table = read_annotations(viable_filepath)
    inviable_table = read_annotations(inviable_filepath)

    annotated_as_viable = null_mutant_features(viable_table)
    annotated_as_inviable = null_mutant_features(inviable_table)

    # Essential (inviable) genes come first, non-essential (viable) second.
    return (consensus_orf_names(annotated_as_inviable),
            consensus_orf_names(annotated_as_viable))
def get_calb_orths_in_sp():
    """Map albicans features to their pombe counterparts.

    The ortholog table and the best-hit table are concatenated (orthologs
    first). For every feature of the default albicans database, the first
    row matching its standard name supplies a pombe name, which is then
    looked up in the default pombe database.

    Returns
    -------
    dict
        Albicans standard name -> ``name`` of the matching pombe feature.
        Features without a table row, or whose pombe lookup yields a falsy
        result, are left out.
    """
    alb_column = 'albicans standard name'
    pom_column = 'pombe standard name'
    all_columns = ['albicans standard name', 'albicans common name',
                   'albicans alb_db id', 'pombe standard name',
                   'pombe common name', 'pombe alb_db id']

    def read_table(file_name):
        # Both tables share one layout; only the two name columns are kept.
        return pd.read_csv(Shared.get_dependency("albicans", file_name),
                           skiprows=8, delimiter='\t', header=None,
                           usecols=[alb_column, pom_column],
                           names=all_columns)

    pom_db = GenomicFeatures.default_pom_db()
    ortholog_table = read_table("C_albicans_SC5314_S_pombe_orthologs.txt")
    # TODO: we probably don't want to use the hit table, though the InParanoid
    # table is very stringent.
    best_hit_table = read_table("C_albicans_SC5314_S_pombe_best_hits.txt")
    joined_table = pd.concat([ortholog_table, best_hit_table])

    # Build the lookup once instead of re-scanning the whole table for every
    # feature. setdefault keeps the first row per albicans name, so ortholog
    # rows still win over best-hit rows. Rows without an albicans name could
    # never match a feature and are skipped.
    named_rows = joined_table.dropna(subset=[alb_column])
    first_pombe_name = {}
    for alb_name, pom_name in zip(named_rows[alb_column],
                                  named_rows[pom_column]):
        first_pombe_name.setdefault(alb_name, pom_name)

    result = {}
    for alb_feature in GenomicFeatures.default_alb_db().get_all_features():
        alb_name = alb_feature.standard_name
        if alb_name not in first_pombe_name:
            continue

        pom_feature = pom_db.get_feature_by_name(first_pombe_name[alb_name])
        if pom_feature:
            result[alb_name] = pom_feature.name

    return result
def _get_spom_essentials(self):
    """Read the pombe viability table and split its entries by essentiality.

    Returns
    -------
    (set, set)
        The "pombe standard name" values of the rows whose essentiality is
        "inviable", followed by those whose essentiality is "viable".
    """
    viability_table = pd.read_csv(
        Shared.get_dependency("pombe/FYPOviability.tsv"),
        header=None,
        delimiter='\t',
        names=["pombe standard name", "essentiality"])

    # Select by column label with boolean masks. This avoids iterating the
    # rows twice and avoids integer keys on a string-labelled row Series.
    names = viability_table["pombe standard name"]
    essentiality = viability_table["essentiality"]

    return set(names[essentiality == "inviable"]), \
        set(names[essentiality == "viable"])
def get_calb_orths_in_sp():
    """Map albicans features to their pombe counterparts.

    The ortholog table and the best-hit table are concatenated (orthologs
    first). For every feature of the default albicans database, the first
    row matching its standard name supplies a pombe name, which is then
    looked up in the default pombe database.

    Returns
    -------
    dict
        Albicans standard name -> ``name`` of the matching pombe feature.
        Features without a table row, or whose pombe lookup yields a falsy
        result, are left out.
    """
    alb_column = 'albicans standard name'
    pom_column = 'pombe standard name'
    all_columns = ['albicans standard name', 'albicans common name',
                   'albicans alb_db id', 'pombe standard name',
                   'pombe common name', 'pombe alb_db id']

    def read_table(file_name):
        # Both tables share one layout; only the two name columns are kept.
        return pd.read_csv(Shared.get_dependency("albicans", file_name),
                           skiprows=8, delimiter='\t', header=None,
                           usecols=[alb_column, pom_column],
                           names=all_columns)

    pom_db = GenomicFeatures.default_pom_db()
    ortholog_table = read_table("C_albicans_SC5314_S_pombe_orthologs.txt")
    # TODO: we probably don't want to use the hit table, though the InParanoid
    # table is very stringent.
    best_hit_table = read_table("C_albicans_SC5314_S_pombe_best_hits.txt")
    joined_table = pd.concat([ortholog_table, best_hit_table])

    # Build the lookup once instead of re-scanning the whole table for every
    # feature. setdefault keeps the first row per albicans name, so ortholog
    # rows still win over best-hit rows. Rows without an albicans name could
    # never match a feature and are skipped.
    named_rows = joined_table.dropna(subset=[alb_column])
    first_pombe_name = {}
    for alb_name, pom_name in zip(named_rows[alb_column],
                                  named_rows[pom_column]):
        first_pombe_name.setdefault(alb_name, pom_name)

    result = {}
    for alb_feature in GenomicFeatures.default_alb_db().get_all_features():
        alb_name = alb_feature.standard_name
        if alb_name not in first_pombe_name:
            continue

        pom_feature = pom_db.get_feature_by_name(first_pombe_name[alb_name])
        if pom_feature:
            result[alb_name] = pom_feature.name

    return result
def analyze_deletions(bam_file, threshold=50): bam_reader = pysam.AlignmentFile(bam_file, "rb") fasta_file = Shared.get_dependency( os.path.join( "albicans", "reference genome", "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA.fasta") ) chrom_names = [] chrom_lens = {} for record in SeqIO.parse(fasta_file, "fasta"): chrom_names.append(record.id) chrom_lens[record.id] = RangeSet([(1, len(record))]) seen = {chrom: [] for chrom in chrom_names} for read in bam_reader.fetch(): chrom_name = bam_reader.getrname(read.reference_id) if "chrM" in chrom_name: continue seen[chrom_name].append( (read.reference_start + 1, read.reference_end - 1 + 1)) unseen = { chrom: chrom_lens[chrom] - RangeSet(seen[chrom]) for chrom in chrom_names } write_ranges( unseen, "/Users/bermanlab/dev/transposon-pipeline/dependencies/albicans/deleted_regions.csv" ) ranges = { chrom: [r for r in unseen[chrom] if r[1] - r[0] >= threshold] for chrom in chrom_names } pprint(ranges) print "Total unseen:", sum(r.coverage for r in unseen.values()) print "Total filtered unseen:", sum( sum(r[1] - r[0] + 1 for r in rs) for rs in ranges.values()) for chrom in chrom_names: print chrom print "Total subranges:", len(unseen[chrom]) print "Total length:", unseen[chrom].coverage print "Ignored long subranges:", len(ranges[chrom]) print "Total length:", sum(r[1] - r[0] + 1 for r in ranges[chrom]) print "\n" import GenomicFeatures alb_db = GenomicFeatures.default_alb_db() for r in unseen[chrom]: fs = alb_db.get_features_at_range(chrom, r) if fs: print chrom, r, ", ".join(f.standard_name for f in fs) print "\n"
def _get_spom_essentials(self):
    """Read the pombe viability table and split its entries by essentiality.

    Returns
    -------
    (set, set)
        The "pombe standard name" values of the rows whose essentiality is
        "inviable", followed by those whose essentiality is "viable".
    """
    viability_table = pd.read_csv(
        Shared.get_dependency("pombe/FYPOviability.tsv"),
        header=None,
        delimiter='\t',
        names=["pombe standard name", "essentiality"])

    # Select by column label with boolean masks. This avoids iterating the
    # rows twice and avoids integer keys on a string-labelled row Series.
    names = viability_table["pombe standard name"]
    essentiality = viability_table["essentiality"]

    return set(names[essentiality == "inviable"]), \
        set(names[essentiality == "viable"])
def _get_homologous_regions(self):
    """Load the albicans homologous regions, dropping the short ones.

    The range data is read from ``albicans/homologous_regions.csv`` (resolved
    through ``Shared.get_dependency``) with ``self._read_range_data``.

    Returns
    -------
    dict
        Chromosome -> ``RangeSet`` of the ranges ``r`` for which
        ``r[1] - r[0] + 1`` is at least ``self._ignore_region_threshold``.
    """
    csv_path = Shared.get_dependency(
        os.path.join("albicans", "homologous_regions.csv"))
    regions_by_chrom = self._read_range_data(csv_path)

    long_regions = {}
    for chrom, chrom_regions in regions_by_chrom.iteritems():
        long_regions[chrom] = RangeSet(
            region for region in chrom_regions
            if region[1] - region[0] + 1 >= self._ignore_region_threshold)
    return long_regions
def get_nominal_annotations(self):
    """Get nominal essentials and non-essentials in cerevisiae.

    Genes are read from the viable and inviable annotation tables, keeping
    only rows whose "Mutant Information" column equals "null". Each gene is
    looked up in ``self.feature_db`` and kept only if the resulting feature
    is an ORF whose qualifier is not "Dubious".

    Returns
    -------
    (set, set)
        A pair of sets, denoting the essential and non-essential genes,
        using their standard names.
    """
    viable_filepath = Shared.get_dependency(
        "cerevisiae", "cerevisiae_viable_annotations.txt")
    inviable_filepath = Shared.get_dependency(
        "cerevisiae", "cerevisiae_inviable_annotations.txt")

    def read_annotations(filepath):
        # "null" is one of the tokens pandas converts to NaN by default,
        # which would make the == "null" filter below match nothing. Only
        # blank cells are treated as missing here, so "null" stays a string.
        return pd.read_csv(filepath, skiprows=8, delimiter="\t",
                           keep_default_na=False, na_values=[""])

    def null_mutant_features(table):
        # Gene names of the rows annotated with a "null" mutant.
        genes = set(table[table["Mutant Information"] == "null"]["Gene"])
        features = set(
            self.feature_db.get_feature_by_name(gene) for gene in genes)
        # Failed lookups (None results) are dropped.
        features.discard(None)
        return features

    def consensus_orf_names(features):
        # TODO: the dubious genes shouldn't be filtered here.
        return set(f.standard_name for f in features
                   if f.is_orf and f.feature_qualifier != "Dubious")

    viable_table = read_annotations(viable_filepath)
    inviable_table = read_annotations(inviable_filepath)

    annotated_as_viable = null_mutant_features(viable_table)
    annotated_as_inviable = null_mutant_features(inviable_table)

    # Essential (inviable) genes come first, non-essential (viable) second.
    return (consensus_orf_names(annotated_as_inviable),
            consensus_orf_names(annotated_as_viable))
def analyze_deletions(bam_file, threshold=50): bam_reader = pysam.AlignmentFile(bam_file, "rb") fasta_file = Shared.get_dependency(os.path.join("albicans", "reference genome", "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA.fasta")) chrom_names = [] chrom_lens = {} for record in SeqIO.parse(fasta_file, "fasta"): chrom_names.append(record.id) chrom_lens[record.id] = RangeSet([(1, len(record))]) seen = {chrom: [] for chrom in chrom_names} for read in bam_reader.fetch(): chrom_name = bam_reader.getrname(read.reference_id) if "chrM" in chrom_name: continue seen[chrom_name].append((read.reference_start+1, read.reference_end-1+1)) unseen = {chrom: chrom_lens[chrom] - RangeSet(seen[chrom]) for chrom in chrom_names} write_ranges(unseen, "/Users/bermanlab/dev/transposon-pipeline/dependencies/albicans/deleted_regions.csv") ranges = {chrom: [r for r in unseen[chrom] if r[1] - r[0] >= threshold] for chrom in chrom_names} pprint(ranges) print "Total unseen:", sum(r.coverage for r in unseen.values()) print "Total filtered unseen:", sum(sum(r[1]-r[0]+1 for r in rs) for rs in ranges.values()) for chrom in chrom_names: print chrom print "Total subranges:", len(unseen[chrom]) print "Total length:", unseen[chrom].coverage print "Ignored long subranges:", len(ranges[chrom]) print "Total length:", sum(r[1]-r[0]+1 for r in ranges[chrom]) print "\n" import GenomicFeatures alb_db = GenomicFeatures.default_alb_db() for r in unseen[chrom]: fs = alb_db.get_features_at_range(chrom, r) if fs: print chrom, r, ", ".join(f.standard_name for f in fs) print "\n"
(Adaptor cleaning works the same as with R1.) -d --delete-originals Delete input FASTQ files. -k --keep-clean-fqs Keep the cleaned FASTQ files. -p --primer-check Check primer specificity if percent transposon in reads is low. -h --help Show this help message and exit ''' TnPrimerAndTail = 'GTATTTTACCGACCGTTACCGACCGTTTTCATCCCTA' TnRev = 'TAGGGATGAAAACGGTCGGTAACGGTCGGTAAAATAC' PrimerOnly = 'GTATTTTACCGACCGTTACCGACC' PrimerRev = 'GGTCGGTAACGGTCGGTAAAATAC' AdapterSeq = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC' # NB: bowtie2 requires spaces to be escapes with a backslash for the -x parameter. CInd = Shared.get_dependency( "albicans", "reference genome", "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA").replace( ' ', '\ ') CORES = 4 # Cores on the machine = how many threads should the external tools utilize def GetCmdPath(Program): """Gets the path to a desired program file on given computer. Parameters ---------- Program : string Name of program wish to have path for. Returns ------- CmdPath : string
import Shared usage = '''CreateHitFile.py -i --in-dir [str] Input directory with .bam files to parse. Defaults to current directory if left unspecified. -o --out-dir [str] Output directory to which the hit file will be writen. Defaults to current directory if left unspecified. -q --min-mapq [int] Map Quality - hits to parse from the bam file (default is 20) -m --merge-dist [int] Hits to merge with at most x nt distance between two hits. Default is 2 Example: Hits in positions 1 and 3 (3-1=2) will be united into a single hit -h --help Show this help message and exit ''' # TODO: move to config file. ChrFile = Shared.get_dependency('albicans', 'reference genome', 'C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA.fasta') FeatureFName = Shared.get_dependency('albicans', 'reference genome', 'C_albicans_SC5314_version_A22-s07-m01-r08_chromosomal_feature.tab') ChrFeatCols = ['FeatureName', 'GeneName','Aliases','FeatureType','Chromosome','StartCoord','StopCoord','Strand','PrimaryCGDID','SecondaryCGDID',\ 'Description','DateCreated','SeqCoordVerDate','Blank1','Blank2','GeneNameReserDate','ReservedIsstandardName','SC_ortholog'] def FindHitsPerSample(SamAlign, ChrFeatMap, Sep0N = 2,MapQ=10): """Goes through Sam file, checks for high confidence alignment, unites unique positions if they can be aligned with adjunct positions. Parameters ---------- SamAlign : x ChrFeatMap : x SepON : integer MapQ : integer Setting for map quality. Default of 10 is equal to 1% chance of occurring in another position.
def _get_deleted_regions(self):
    """Load the albicans deleted regions, dropping the short ones.

    The range data is read from ``albicans/deleted_regions.csv`` (resolved
    through ``Shared.get_dependency``) with ``self._read_range_data``.

    Returns
    -------
    dict
        Chromosome -> ``RangeSet`` of the ranges ``r`` for which
        ``r[1] - r[0] + 1`` is at least ``self._ignore_region_threshold``.
    """
    csv_path = Shared.get_dependency(
        os.path.join("albicans", "deleted_regions.csv"))
    regions_by_chrom = self._read_range_data(csv_path)

    long_regions = {}
    for chrom, chrom_regions in regions_by_chrom.iteritems():
        long_regions[chrom] = RangeSet(
            region for region in chrom_regions
            if region[1] - region[0] + 1 >= self._ignore_region_threshold)
    return long_regions
import pysam import itertools import Shared usage = '''CreateHitFile.py -i --in-dir [str] Input directory with .bam files to parse. Defaults to current directory if left unspecified. -o --out-dir [str] Output directory to which the hit file will be writen. Defaults to current directory if left unspecified. -q --min-mapq [int] Map Quality - hits to parse from the bam file (default is 20) -m --merge-dist [int] Hits to merge with at most x nt distance between two hits. Default is 2 Example: Hits in positions 1 and 3 (3-1=2) will be united into a single hit -h --help Show this help message and exit ''' # TODO: move to config file. ChrFile = Shared.get_dependency( 'albicans', 'reference genome', 'C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA.fasta') FeatureFName = Shared.get_dependency( 'albicans', 'reference genome', 'C_albicans_SC5314_version_A22-s07-m01-r08_chromosomal_feature.tab') ChrFeatCols = ['FeatureName', 'GeneName','Aliases','FeatureType','Chromosome','StartCoord','StopCoord','Strand','PrimaryCGDID','SecondaryCGDID',\ 'Description','DateCreated','SeqCoordVerDate','Blank1','Blank2','GeneNameReserDate','ReservedIsstandardName','SC_ortholog'] def FindHitsPerSample(SamAlign, ChrFeatMap, Sep0N=2, MapQ=10): """Goes through Sam file, checks for high confidence alignment, unites unique positions if they can be aligned with adjunct positions. Parameters ---------- SamAlign : x
def _get_genes_with_paralogs(self):
    """Delegate to ``Organism._get_genes_with_paralogs`` for pombe.

    The path of ``pombe/hasParalogs_sp.txt`` is resolved through
    ``Shared.get_dependency`` and passed on together with ``self``; the
    result of the base implementation is returned unchanged.
    """
    paralogs_relpath = os.path.join("pombe", "hasParalogs_sp.txt")
    paralogs_file = Shared.get_dependency(paralogs_relpath)
    return Organism._get_genes_with_paralogs(self, paralogs_file)
def _get_genes_with_paralogs(self):
    """Delegate to ``Organism._get_genes_with_paralogs`` for albicans.

    The path of ``albicans/hasParalogs_ca.txt`` is resolved through
    ``Shared.get_dependency`` and passed on together with ``self``; the
    result of the base implementation is returned unchanged.
    """
    paralogs_relpath = os.path.join("albicans", "hasParalogs_ca.txt")
    paralogs_file = Shared.get_dependency(paralogs_relpath)
    return Organism._get_genes_with_paralogs(self, paralogs_file)
def _get_homologous_regions(self):
    """Load the cerevisiae homologous regions, dropping the short ones.

    The range data is read from ``cerevisiae/homologous_regions.csv``
    (resolved through ``Shared.get_dependency``) with
    ``self._read_range_data``.

    Returns
    -------
    dict
        Chromosome -> ``RangeSet`` of the ranges ``r`` for which
        ``r[1] - r[0] + 1`` is at least ``self._ignore_region_threshold``.
    """
    csv_path = Shared.get_dependency(
        os.path.join("cerevisiae", "homologous_regions.csv"))
    regions_by_chrom = self._read_range_data(csv_path)

    long_regions = {}
    for chrom, chrom_regions in regions_by_chrom.iteritems():
        long_regions[chrom] = RangeSet(
            region for region in chrom_regions
            if region[1] - region[0] + 1 >= self._ignore_region_threshold)
    return long_regions
def _get_genes_with_paralogs(self):
    """Delegate to ``Organism._get_genes_with_paralogs`` for albicans.

    The path of ``albicans/hasParalogs_ca.txt`` is resolved through
    ``Shared.get_dependency`` and passed on together with ``self``; the
    result of the base implementation is returned unchanged.
    """
    paralogs_relpath = os.path.join("albicans", "hasParalogs_ca.txt")
    paralogs_file = Shared.get_dependency(paralogs_relpath)
    return Organism._get_genes_with_paralogs(self, paralogs_file)
-r --reverse-strand Search with reverse complement sequence for R2 files. (Adaptor cleaning works the same as with R1.) -d --delete-originals Delete input FASTQ files. -k --keep-clean-fqs Keep the cleaned FASTQ files. -p --primer-check Check primer specificity if percent transposon in reads is low. -h --help Show this help message and exit ''' TnPrimerAndTail = 'GTATTTTACCGACCGTTACCGACCGTTTTCATCCCTA' TnRev = 'TAGGGATGAAAACGGTCGGTAACGGTCGGTAAAATAC' PrimerOnly = 'GTATTTTACCGACCGTTACCGACC' PrimerRev = 'GGTCGGTAACGGTCGGTAAAATAC' AdapterSeq = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC' # NB: bowtie2 requires spaces to be escapes with a backslash for the -x parameter. CInd = Shared.get_dependency("albicans", "reference genome", "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA").replace(' ', '\ ') CORES = 4 # Cores on the machine = how many threads should the external tools utilize def GetCmdPath(Program): """Gets the path to a desired program file on given computer. Parameters ---------- Program : string Name of program wish to have path for. Returns ------- CmdPath : string Path for calling program as a command in the POSIX terminal. """
def _get_genes_with_paralogs(self):
    """Delegate to ``Organism._get_genes_with_paralogs`` for pombe.

    The path of ``pombe/hasParalogs_sp.txt`` is resolved through
    ``Shared.get_dependency`` and passed on together with ``self``; the
    result of the base implementation is returned unchanged.
    """
    paralogs_relpath = os.path.join("pombe", "hasParalogs_sp.txt")
    paralogs_file = Shared.get_dependency(paralogs_relpath)
    return Organism._get_genes_with_paralogs(self, paralogs_file)