def get_cluster_motifs(cluster_id, path_to_saved_clusters='/Users/ichaudr/Documents/UMBC/Lab-Erill/Isaac/Vibrio_SOA/Saved Clusters/complete_clusters/'):
    '''
    Collect the consensus sequences of the motifs saved for a specific cluster.

    Every file in the directory whose name, up to the first '.', equals
    `cluster_id` is loaded as JSON and each entry of its 'motifs' list is
    turned into a Motif whose consensus is recorded.

    Parameters
    ----------
    cluster_id: str
        The cluster of interest.

    path_to_saved_clusters: str
        The path to the directory that is holding the saved JSONs for the
        saved clusters. Defaults to the location that used to be hard-coded.

    Returns
    -------
    motifs: list of str
        The consensus sequence of each motif in the cluster, in file order.
        Empty when no file matches `cluster_id`.
    '''
    motifs = []
    for file_name in os.listdir(path_to_saved_clusters):
        # Only files named "<cluster_id>.<anything>" belong to this cluster.
        if file_name.split('.')[0] != cluster_id:
            continue

        # Context manager so the handle is closed even if parsing fails.
        with open(os.path.join(path_to_saved_clusters, file_name), 'r') as handle:
            file_reader = json.load(handle)

        for m in file_reader['motifs']:
            temp_motif = Motif(instances=Instances(m))
            motifs.append(str(temp_motif.consensus))
    return motifs
def get_cluster_motifs(cluster_id, path_to_saved_clusters=cluster_jsons_path):
    '''
    Build a printable summary of the motifs saved for a specific cluster.

    Every file in the directory whose name, up to the first '.', equals
    `cluster_id` is loaded as JSON and each entry of its 'motifs' list is
    turned into a Motif whose consensus is recorded.

    Parameters
    ----------
    cluster_id: str
        The cluster of interest.

    path_to_saved_clusters: str
        The path to the directory that is holding the saved JSONs for the
        saved clusters. A trailing path separator is not required.

    Returns
    -------
    motifs: str
        The consensus sequences of the motifs that are part of the cluster,
        joined with ' | '. Empty string when no file matches `cluster_id`.
    '''
    motifs = []
    for file_name in os.listdir(path_to_saved_clusters):
        # Only files named "<cluster_id>.<anything>" belong to this cluster.
        if file_name.split('.')[0] != cluster_id:
            continue

        # Context manager so the handle is closed even if parsing fails;
        # os.path.join tolerates a directory given without a trailing '/'.
        with open(os.path.join(path_to_saved_clusters, file_name), 'r') as handle:
            file_reader = json.load(handle)

        motifs_from_file = file_reader['motifs']
        for m in motifs_from_file:
            temp_motif = Motif(instances=Instances(m))
            #temp_motif.weblogo(fname=logo_path+cluster_id+"_"+str(motifs_from_file.index(m))+'.png')
            motifs.append(str(temp_motif.consensus))
    return ' | '.join(motifs)
def read(handle, format):
    """Read a single motif from a JASPAR-style handle.

    Parameters
    ----------
    handle:
        Iterable of text lines (e.g. an open file).
    format: str
        "pfm"   - one row of counts per letter, in the order A, C, G, T; a
                  leading letter label on a row is ignored.
        "sites" - alternating ">" header lines and sequence lines; only the
                  characters of a sequence line that equal their own
                  upper-case form are kept as the site.

    Returns
    -------
    A Motif over the unambiguous DNA alphabet whose mask is all "*".

    Raises
    ------
    ValueError
        If `format` is neither "pfm" nor "sites".
    """
    alphabet = IUPAC.unambiguous_dna
    counts = {}
    if format == "pfm":
        # reads the motif from Jaspar .pfm file
        letters = "ACGT"
        for letter, line in zip(letters, handle):
            words = line.split()
            # if there is a letter in the beginning, ignore it
            if words[0] == letter:
                words = words[1:]
            # Materialize the row: on Python 3 a bare map() is a one-shot
            # iterator, not a sequence of counts.
            counts[letter] = list(map(float, words))
        motif = Motif(alphabet, counts=counts)
    elif format == "sites":
        # reads the motif from Jaspar .sites file
        instances = []
        for line in handle:
            if not line.startswith(">"):
                break
            # line contains the header ">...."
            # now read the actual sequence; next() works on Python 2 and 3,
            # whereas handle.next() exists only on Python 2.
            line = next(handle)
            instance = "".join(c for c in line.strip() if c == c.upper())
            instances.append(Seq(instance, alphabet))
        instances = Instances(instances, alphabet)
        motif = Motif(alphabet, instances=instances)
    else:
        raise ValueError("Unknown format %s" % format)
    motif.mask = "*" * motif.length
    return motif
def main():
    """Build a PSSM from the compiled sigma-70 FASTA and scan the AmrZ FASTA.

    Reads `input_file_sig70_compiled`, takes the slice [28:34] of every
    record, builds a motif from those slices and prints its PSSM maximum,
    consensus and matrix. Every record of `input_file_AmrZ` is then passed to
    `evaluate_prib10` together with that PSSM; non-empty results are
    collected and printed.
    """
    seq_list = []
    count = 0
    # Keep the file open only while the lazy SeqIO parser is being consumed.
    with open(input_file_sig70_compiled) as sig70_handle:
        for seq in SeqIO.parse(sig70_handle, 'fasta'):
            sequence = seq.seq[28:34]
            # NOTE(review): the original layout was collapsed onto one line.
            # The G check is assumed to gate only the counter and the debug
            # prints, with every slice added to the motif - confirm.
            if sequence[4] == "G":
                count += 1
                print(count)
                print(sequence)
            seq_list.append(sequence)

    motif = Motif(instances=Instances(instances=seq_list))
    PSSM = motif.pssm
    print(PSSM.max)
    print(PSSM.consensus)
    print(PSSM)

    positions = []
    with open(input_file_AmrZ) as amrz_handle:
        for a_seq in SeqIO.parse(amrz_handle, 'fasta'):
            print(a_seq.seq)
            prib_10_pos = evaluate_prib10(a_seq.seq, PSSM)
            # Explicit comparison kept: the return type of evaluate_prib10
            # is not visible here.
            if prib_10_pos != []:
                positions.append(prib_10_pos)
    print(positions)
def ic_at(motif, other, offset):
    '''
    Calculates the information content (IC) for a specific alignment.

    A temporary motif is built from the overlapping stretch of the sites of
    both motifs, and the mean of its PSSM is taken.

    Parameters
    ----------
    motif, other: Motif objects
        The motifs of interest.

    offset: int
        The offset into `motif` at which `other` starts in the alignment of
        interest.

    Returns
    -------
    The mean of the PSSM of the temporary motif.
    '''
    # Width of the aligned region, then the end of that region within `motif`.
    overlap = min(len(motif) - offset, len(other))
    stop = offset + overlap

    # Sites of `motif` are cut at [offset:stop]; sites of `other` at [:overlap].
    aligned_sites = [site[offset:stop] for site in motif.instances]
    aligned_sites.extend(site[:overlap] for site in other.instances)

    # Temporary motif over all aligned sites, with uniform pseudocounts.
    merged = Motif(instances=Instances(aligned_sites))
    merged.pseudocounts = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
    return merged.pssm.mean()
def load_from_json(self, file_path):
    '''
    Sets up all member variables based on the stored data in the JSON file.

    The loaded operons and motifs are appended to `self.operons` and
    `self.motifs`; `self.cluster_id` and `self.filtered_promoters` are
    overwritten.

    Parameters
    ----------
    file_path: str
        Path to the JSON file that will be loaded in.

    Returns
    -------
    None
    '''
    # Context manager so the handle is closed even if parsing fails.
    with open(file_path, 'r') as handle:
        file_reader = json.load(handle)

    self.cluster_id = file_reader['cluster_id']

    for op in file_reader['operons']:
        # Placeholder values are passed for the fields the JSON does not
        # store; features and promoter are then restored from the file.
        temp_op = Operon(operon_id=op['operon_id'],
                         genome_fragment_name='imported_cluster',
                         genome_accession=op['genome_accession'],
                         genome_features='imported',
                         strand='/')
        temp_op.features = op['features']
        temp_op.promoter = op['promoter']
        self.operons.append(temp_op)

    self.filtered_promoters = file_reader['filtered_promoters']

    for m in file_reader['motifs']:
        self.motifs.append(Motif(instances=Instances(m)))
def build_pssm_35(sequences):
    """Return the PSSM of the motif built from slice [28:34] of each sequence.

    The window is presumably the -35 element, going by the function name;
    the code itself only fixes the slice bounds.
    """
    windows = [seq[28:34] for seq in sequences]
    return Motif(instances=Instances(instances=windows)).pssm
def build_pssm_10(sequences):
    """Return the PSSM of the motif built from slice [52:58] of each sequence.

    The window is presumably the -10 element, going by the function name;
    the code itself only fixes the slice bounds.
    """
    windows = [seq[52:58] for seq in sequences]
    return Motif(instances=Instances(instances=windows)).pssm
def read(handle):
    """Parse an AlignACE format handle as a Record object.

    The first two lines are stored as the record's version and command. Each
    following non-blank line is dispatched on its content, first match wins:
    a parameter section header, a "#" sequence entry, a "name = value"
    parameter, an input section header, a motif header, a "MAP" line that
    closes the current motif, a four-column tab-separated site line, or a
    mask line containing "*". Any other line raises ValueError.
    """
    record = Record()
    record.version = next(handle).strip()
    record.command = next(handle).strip()

    mask = None
    number = None
    for raw_line in handle:
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith("Para"):
            record.parameters = {}
        elif line.startswith("#"):
            # The sequence name is the second tab-separated field.
            record.sequences.append(line.split("\t")[1])
        elif "=" in line:
            key, value = line.split("=")
            record.parameters[key.strip()] = value.strip()
        elif line.startswith("Input"):
            record.sequences = []
        elif line.startswith("Motif"):
            words = line.split()
            assert words[0] == "Motif"
            number = int(words[1])
            instances = []
        elif line.startswith("MAP"):
            # The sites gathered since the last motif header become a motif;
            # its score is the last whitespace-separated token of this line.
            alphabet = "ACGT"
            instances = Instances(instances, alphabet)
            motif = Motif(alphabet, instances)
            motif.score = float(line.split()[-1])
            motif.number = number
            motif.mask = mask
            record.append(motif)
        elif line.count("\t") == 3:
            # Four tab-separated columns: the first one is the site sequence.
            instances.append(Seq(line.split("\t")[0]))
        elif "*" in line:
            mask = line.strip("\r\n")
        else:
            raise ValueError(line)
    return record
def read(handle):
    """Parse an AlignACE output handle into a Record.

    The first two lines are stored as the record's version and command. Each
    following non-blank line is classified, first match wins, as a parameter
    section header, a "#" sequence entry, a "name = value" parameter, an
    input section header, a motif header, a "MAP" line that finishes the
    current motif, a four-column tab-separated site line, or a mask line
    containing "*". A line matching none of these raises ValueError.
    """
    record = Record()
    version_line = next(handle)
    record.version = version_line.strip()
    command_line = next(handle)
    record.command = command_line.strip()

    mask = None
    number = None
    for raw_line in handle:
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith("Para"):
            record.parameters = {}
        elif line.startswith("#"):
            # The sequence name is the second tab-separated field.
            fields = line.split("\t")
            record.sequences.append(fields[1])
        elif "=" in line:
            key, value = line.split("=")
            record.parameters[key.strip()] = value.strip()
        elif line.startswith("Input"):
            record.sequences = []
        elif line.startswith("Motif"):
            words = line.split()
            assert words[0] == "Motif"
            number = int(words[1])
            instances = []
        elif line.startswith("MAP"):
            # The sites gathered since the last motif header become a motif;
            # its score is the last whitespace-separated token of this line.
            alphabet = IUPAC.unambiguous_dna
            instances = Instances(instances, alphabet)
            motif = Motif(alphabet, instances)
            motif.score = float(line.split()[-1])
            motif.number = number
            motif.mask = mask
            record.append(motif)
        elif line.count("\t") == 3:
            # Four tab-separated columns: the first one is the site sequence.
            site = line.split("\t")[0]
            instances.append(Seq(site, IUPAC.unambiguous_dna))
        elif "*" in line:
            mask = line.strip("\r\n")
        else:
            raise ValueError(line)
    return record
def build_pssm(sequences):
    """Build a PositionSpecificScoringMatrix from the given sequences.

    The sequences are used whole as motif instances. The motif's counts are
    normalized with a pseudocount of 0.01 and the result is wrapped in a
    PositionSpecificScoringMatrix over the unambiguous DNA alphabet.
    """
    sites = list(sequences)
    # NOTE(review): the matrix is filled with the normalized counts as-is,
    # not with log-odds values - confirm this is intended.
    normalized = Motif(instances=Instances(instances=sites)).counts.normalize(0.01)
    return PositionSpecificScoringMatrix(alphabet=IUPAC.unambiguous_dna,
                                         values=normalized)
def printCons10(sequences):
    """Print the consensus and the PSSM built from slice [52:58] of each sequence.

    The window is presumably the -10 element, going by the function name.
    """
    windows = [seq[52:58] for seq in sequences]
    pssm = Motif(instances=Instances(instances=windows)).pssm
    print(pssm.consensus)
    print(pssm)
def printCons35(sequences):
    """Print the consensus and the PSSM built from slice [28:34] of each sequence.

    The window is presumably the -35 element, going by the function name.
    """
    windows = [seq[28:34] for seq in sequences]
    pssm = Motif(instances=Instances(instances=windows)).pssm
    print(pssm.consensus)
    print(pssm)
#Get the counts to fill the above dictionaries for file in os.listdir(path_to_clusters): if file[-4:] != 'json': continue cluster_id = file[:-5] cluster_reader = json.load(open(path_to_clusters + file, 'r')) #Add info about the number of motifs num_motifs_per_cluster[cluster_id] = len(cluster_reader['motifs']) #Add info about the IC per motif ic_per_motif_per_cluster[cluster_id] = {} for m in range(len(cluster_reader['motifs'])): temp_motif = Motif(instances=Instances(cluster_reader['motifs'][m])) temp_motif_seq = str(temp_motif.consensus) temp_motif_ic = temp_motif.pssm.mean() ic_per_motif_per_cluster[cluster_id][temp_motif_seq] = temp_motif_ic #All stats to be calculated: avg_num_motifs_per_cluster = statistics.mean( list(num_motifs_per_cluster.values())) stdv_num_motifs_per_cluster = statistics.stdev( list(num_motifs_per_cluster.values())) max_num_motifs_per_cluster = max(list(num_motifs_per_cluster.values())) min_num_motifs_per_cluster = min(list(num_motifs_per_cluster.values())) median_num_motifs_per_cluster = statistics.median( list(num_motifs_per_cluster.values())) mode_num_motifs_per_cluster = statistics.mode(
all_motifs = [] #Holds motifs per cluster motifs_per_cluster = {} #Import the motifs for file in tqdm(list(os.listdir(operon_clusters_path)), desc='Reading in motifs from saved clusters'): if file.split('.')[-1] != 'json': continue cluster_info = dict(json.load(open(operon_clusters_path + file, 'r'))) motifs_per_cluster[cluster_info['cluster_id']] = [] for motif in cluster_info['motifs']: motif_obj = Motif(instances=Instances(motif)) all_motifs.append(motif_obj) motifs_per_cluster[cluster_info['cluster_id']].append(motif_obj) print('-' * 5, ' Total motifs: ', len(all_motifs)) ### STEP 2: Bin the motifs based off some threshold weight. #Holds all motif bins print('Binning motifs...') sim_threshold = 0.26 motif_bins = {} motif_assignments = {} for m in all_motifs:
'''
Copied from
https://github.com/ErillLab/Transfer_method_analysis/blob/71a042cc8ca0ce03a1d44c84d87f9bb509b6ec4c/src/motif.py#L81

Used for testing purposes.
'''
from Bio.motifs import Motif, Instances
import math

# Build a small motif from three 8-character sites and print its consensus.
test_motif = Motif(instances=Instances(["ATCAGTCA", "ATCAGTAA", "ATCTGTCA"]))
print(test_motif.consensus)