Пример #1
0
def ic_at(motif, other, offset):
    '''
    Calculate the information content (IC) for one specific alignment of two motifs.

    The overlapping columns of both motifs are pooled into a temporary Motif
    object, and the mean of that motif's PSSM is returned.

    Parameters
    ----------
    motif, other: Motif objects
        The motifs of interest.
    offset: int
        The offset that produces the alignment of interest: column `offset`
        of `motif` is lined up with column 0 of `other`.

    Returns
    -------
    The value of `pssm.mean()` for the temporary motif built from the
    aligned region.
    '''

    # Width of the overlap: what remains of `motif` after the offset,
    # capped by the length of `other`.
    overlap = min(len(motif) - offset, len(other))
    stop = offset + overlap

    # Pool the aligned slices of every site, `motif` sites first.
    aligned_sites = [site[offset:stop] for site in motif.instances]
    aligned_sites += [site[:overlap] for site in other.instances]

    # Build the temporary motif with uniform pseudocounts and score it.
    pooled = Motif(instances=Instances(aligned_sites))
    pooled.pseudocounts = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
    return pooled.pssm.mean()
Пример #2
0
def read(handle, format):
    """Read one motif from a JASPAR ``pfm`` or ``sites`` handle.

    Parameters
    ----------
    handle: iterator of str
        Line iterator over the file contents. For the ``sites`` format it
        must be a true iterator, because the sequence line following each
        header is pulled with ``next(handle)``.
    format: str
        Either ``"pfm"`` (one row of counts per nucleotide, in A, C, G, T
        order) or ``"sites"`` (FASTA-like header/sequence pairs).

    Returns
    -------
    Motif
        The parsed motif, with ``mask`` set to ``"*"`` repeated
        ``motif.length`` times.

    Raises
    ------
    ValueError
        If ``format`` is neither ``"pfm"`` nor ``"sites"``.
    """
    # Fail fast on an unsupported format, before doing any parsing work.
    if format not in ("pfm", "sites"):
        raise ValueError("Unknown format %s" % format)

    alphabet = IUPAC.unambiguous_dna
    if format == "pfm":
        # reads the motif from Jaspar .pfm file
        counts = {}
        for letter, line in zip("ACGT", handle):
            words = line.split()
            # if there is a letter in the beginning, ignore it
            if words and words[0] == letter:
                words = words[1:]
            # Build a real list: on Python 3 a bare map() is a one-shot
            # iterator, not a sequence of counts.
            counts[letter] = [float(word) for word in words]
        motif = Motif(alphabet, counts=counts)
    else:
        # reads the motif from Jaspar .sites file
        instances = []
        for line in handle:
            if not line.startswith(">"):
                break
            # line contains the header ">...."; the next line holds the
            # actual sequence. next() works on Python 2.6+ and Python 3,
            # unlike the Python-2-only handle.next().
            line = next(handle)
            # Keep only the characters that are unchanged by upper().
            instance = "".join(c for c in line.strip() if c == c.upper())
            instances.append(Seq(instance, alphabet))
        instances = Instances(instances, alphabet)
        motif = Motif(alphabet, instances=instances)
    motif.mask = "*" * motif.length
    return motif
def main():
    """Build a PSSM from one FASTA file and scan a second FASTA file with it.

    Reads every record of ``input_file_sig70_compiled``, takes the six
    residues at positions 28..33 of each sequence and builds a motif and
    its PSSM from them. Every record of ``input_file_AmrZ`` is then passed
    to ``evaluate_prib10`` together with that PSSM, and the non-empty
    results are collected and printed.

    Progress and results are reported with ``print``; nothing is returned.
    """
    seq_list = []
    count = 0
    # Keep the file open only while records are being consumed:
    # SeqIO.parse is lazy, so the loop must run inside the ``with`` block.
    with open(input_file_sig70_compiled) as pos_handle:
        for seq in SeqIO.parse(pos_handle, 'fasta'):
            sequence = seq.seq[28:34]
            # Running tally of windows whose fifth residue is a G.
            if sequence[4] == "G":
                count += 1
                print(count)
            print(sequence)
            seq_list.append(sequence)

    motif = Motif(instances=Instances(instances=seq_list))
    PSSM = motif.pssm
    print(PSSM.max)
    print(PSSM.consensus)
    print(PSSM)

    positions = []
    with open(input_file_AmrZ) as amrz_handle:
        for a_seq in SeqIO.parse(amrz_handle, 'fasta'):
            print(a_seq.seq)
            prib_10_pos = evaluate_prib10(a_seq.seq, PSSM)
            # Only keep sequences for which at least one hit was reported.
            if prib_10_pos != []:
                positions.append(prib_10_pos)

    print(positions)
Пример #4
0
def get_cluster_motifs(
        cluster_id,
        path_to_saved_clusters='/Users/ichaudr/Documents/UMBC/Lab-Erill/Isaac/Vibrio_SOA/Saved Clusters/complete_clusters/'):
    '''
    Parse out the motifs for a specific cluster.

    Parameters
    ----------
    cluster_id: str
        The cluster of interest. A file matches when the part of its name
        before the first '.' equals this id.
    path_to_saved_clusters: str
        The path to the directory that is holding the saved JSONs for the
        saved clusters. Defaults to the original hard-coded location.

    Returns
    -------
    motifs: list of str
        The consensus sequence of every motif stored for the cluster, in
        file order. Empty when no file matches the cluster id.
    '''
    motifs = []

    for file in os.listdir(path_to_saved_clusters):
        if file.split('.')[0] != cluster_id:
            continue

        # os.path.join works with or without a trailing separator on the
        # directory; the context manager closes the handle after parsing.
        with open(os.path.join(path_to_saved_clusters, file), 'r') as json_file:
            file_reader = json.load(json_file)

        for m in file_reader['motifs']:
            temp_motif = Motif(instances=Instances(m))
            motifs.append(str(temp_motif.consensus))
    return motifs
Пример #5
0
def get_cluster_motifs(cluster_id, path_to_saved_clusters=cluster_jsons_path):
    '''
    Parse out the motifs for a specific cluster.

    Parameters
    ----------
    cluster_id: str
        The cluster of interest. A file matches when the part of its name
        before the first '.' equals this id.
    path_to_saved_clusters: str
        The path to the directory that is holding the saved JSONs for the
        saved clusters.

    Returns
    -------
    motifs: str
        The consensus sequences of the motifs that are part of the cluster,
        joined with ' | '. Empty string when no file matches.
    '''

    motifs = []

    for file in os.listdir(path_to_saved_clusters):
        if file.split('.')[0] != cluster_id:
            continue

        # os.path.join works with or without a trailing separator on the
        # directory; the context manager closes the handle after parsing.
        with open(os.path.join(path_to_saved_clusters, file), 'r') as json_file:
            file_reader = json.load(json_file)

        for m in file_reader['motifs']:
            temp_motif = Motif(instances=Instances(m))
            motifs.append(str(temp_motif.consensus))
    return ' | '.join(motifs)
Пример #6
0
    def load_from_json(self, file_path):
        '''
        Sets up all member variables based on the stored data in the JSON file.

        The operons and motifs read from the file are appended to
        `self.operons` and `self.motifs`; those lists are not cleared first.

        Parameters
        ----------
        file_path: str
            Path to the JSON file that will be loaded in.

        Returns
        -------
        None
        '''

        # Context manager so the handle is closed as soon as parsing ends,
        # even if the file is not valid JSON.
        with open(file_path, 'r') as json_file:
            file_reader = json.load(json_file)

        self.cluster_id = file_reader['cluster_id']

        for op in file_reader['operons']:
            # Placeholder values are passed for the constructor arguments
            # that are not stored in the JSON.
            temp_op = Operon(operon_id=op['operon_id'],
                             genome_fragment_name='imported_cluster',
                             genome_accession=op['genome_accession'],
                             genome_features='imported',
                             strand='/')

            # Restore the stored features and promoter on the new operon.
            temp_op.features = op['features']
            temp_op.promoter = op['promoter']

            self.operons.append(temp_op)

        self.filtered_promoters = file_reader['filtered_promoters']

        for m in file_reader['motifs']:
            self.motifs.append(Motif(instances=Instances(m)))
def build_pssm_35(sequences):
    """Build a PSSM from the window at positions 28..33 of each sequence.

    NOTE(review): the name suggests this window is the -35 promoter
    element; confirm against the caller's sequence layout.

    Parameters
    ----------
    sequences: iterable of sliceable sequences.

    Returns
    -------
    The ``pssm`` of the motif built from the extracted windows.
    """
    windows = [entry[28:34] for entry in sequences]
    return Motif(instances=Instances(instances=windows)).pssm
def build_pssm_10(sequences):
    """Build a PSSM from the window at positions 52..57 of each sequence.

    NOTE(review): the name suggests this window is the -10 promoter
    element; confirm against the caller's sequence layout.

    Parameters
    ----------
    sequences: iterable of sliceable sequences.

    Returns
    -------
    The ``pssm`` of the motif built from the extracted windows.
    """
    windows = [entry[52:58] for entry in sequences]
    return Motif(instances=Instances(instances=windows)).pssm
Пример #9
0
def read(handle):
    """Parse an AlignACE format handle as a Record object.

    The first two lines of the handle are stored (stripped) as the
    record's ``version`` and ``command``. Each remaining non-blank line is
    classified by the first matching rule below, in this order:

    * starts with ``Para``  -> reset ``record.parameters`` to ``{}``
    * starts with ``#``     -> append the second tab field to ``record.sequences``
    * contains ``=``        -> store a ``name = value`` parameter
    * starts with ``Input`` -> reset ``record.sequences`` to ``[]``
    * starts with ``Motif`` -> remember the motif number, start a new site list
    * starts with ``MAP``   -> finish the current motif and append it to the record
    * has four tab fields   -> a site line; its first field is the sequence
    * contains ``*``        -> the mask applied to motifs completed afterwards

    Raises
    ------
    ValueError
        For a line matching none of the rules (the line is the message).
    """
    record = Record()
    record.version = next(handle).strip()
    record.command = next(handle).strip()
    mask = None
    number = None
    for raw_line in handle:
        text = raw_line.strip()
        if not text:
            continue
        if text.startswith("Para"):
            record.parameters = {}
        elif text.startswith("#"):
            seq_name = text.split("\t")[1]
            record.sequences.append(seq_name)
        elif "=" in text:
            key, value = text.split("=")
            record.parameters[key.strip()] = value.strip()
        elif text.startswith("Input"):
            record.sequences = []
        elif text.startswith("Motif"):
            words = text.split()
            assert words[0] == "Motif"
            number = int(words[1])
            instances = []
        elif text.startswith("MAP"):
            alphabet = "ACGT"
            instances = Instances(instances, alphabet)
            motif = Motif(alphabet, instances)
            # The score is the last whitespace-separated token of the line.
            motif.score = float(text.split()[-1])
            motif.number = number
            motif.mask = mask
            record.append(motif)
        elif len(text.split("\t")) == 4:
            instances.append(Seq(text.split("\t")[0]))
        elif "*" in text:
            mask = text.strip("\r\n")
        else:
            raise ValueError(text)
    return record
Пример #10
0
def read(handle):
    """Parse an AlignACE output handle into a Record.

    The first two lines are taken as the program version and the command
    line. Every following non-blank line is dispatched on its content, with
    the first matching test winning: parameter section header, ``#``
    sequence entry, ``name = value`` parameter, input section header,
    ``Motif <n>`` header, ``MAP`` score line (which completes a motif), a
    four-field tab-separated site line, or a ``*`` mask line.

    Motif instances are created with the ``IUPAC.unambiguous_dna``
    alphabet.

    Raises
    ------
    ValueError
        When a line matches none of the known shapes; the offending
        (stripped) line is the exception argument.
    """
    record = Record()
    record.version = next(handle).strip()
    record.command = next(handle).strip()
    mask = None
    number = None
    for entry in handle:
        entry = entry.strip()
        if not entry:
            # Blank lines carry no information.
            continue
        if entry.startswith("Para"):
            record.parameters = {}
        elif entry.startswith("#"):
            seq_name = entry.split("\t")[1]
            record.sequences.append(seq_name)
        elif "=" in entry:
            name, value = entry.split("=")
            name = name.strip()
            record.parameters[name] = value.strip()
        elif entry.startswith("Input"):
            record.sequences = []
        elif entry.startswith("Motif"):
            tokens = entry.split()
            assert tokens[0] == "Motif"
            number = int(tokens[1])
            instances = []
        elif entry.startswith("MAP"):
            # A MAP line closes the motif whose sites were just collected.
            alphabet = IUPAC.unambiguous_dna
            instances = Instances(instances, alphabet)
            motif = Motif(alphabet, instances)
            motif.score = float(entry.split()[-1])
            motif.number = number
            motif.mask = mask
            record.append(motif)
        elif len(entry.split("\t")) == 4:
            site = Seq(entry.split("\t")[0], IUPAC.unambiguous_dna)
            instances.append(site)
        elif "*" in entry:
            mask = entry.strip("\r\n")
        else:
            raise ValueError(entry)
    return record
Пример #11
0
def build_pssm(sequences):
    """Build a PositionSpecificScoringMatrix from a collection of sequences.

    The sequences are used as motif instances; the motif's counts are
    normalized with ``normalize(0.01)`` and the result is wrapped in a
    ``PositionSpecificScoringMatrix`` over ``IUPAC.unambiguous_dna``.

    NOTE(review): 0.01 is presumably the pseudocount added during
    normalization; confirm against the Biopython version in use.
    """
    sites = list(sequences)
    normalized_counts = Motif(
        instances=Instances(instances=sites)).counts.normalize(0.01)
    return PositionSpecificScoringMatrix(alphabet=IUPAC.unambiguous_dna,
                                         values=normalized_counts)
Пример #12
0
def printCons10(sequences):
    """Print the consensus and the PSSM for positions 52..57 of the sequences.

    The six-residue window ``[52:58]`` of every sequence is used as a motif
    instance. The PSSM's consensus is printed first, then the PSSM itself.
    Nothing is returned.
    """
    windows = [entry[52:58] for entry in sequences]
    pssm = Motif(instances=Instances(instances=windows)).pssm
    print(pssm.consensus)
    print(pssm)
Пример #13
0
def printCons35(sequences):
    """Print the consensus and the PSSM for positions 28..33 of the sequences.

    The six-residue window ``[28:34]`` of every sequence is used as a motif
    instance. The PSSM's consensus is printed first, then the PSSM itself.
    Nothing is returned.
    """
    windows = [entry[28:34] for entry in sequences]
    pssm = Motif(instances=Instances(instances=windows)).pssm
    print(pssm.consensus)
    print(pssm)
Пример #14
0
 def transcription_factor_matrices(self):
     """Return the transcription factor matrices, using an on-disk JSON cache.

     Three cases, checked in order:

     * the cache file is missing: the matrices are obtained from
       ``taxonid_to_jaspar_matrix_ids(self.taxonid)``, their ``counts`` are
       written to the cache file, and the matrices are kept on the instance;
     * the cache file exists but nothing is kept on the instance yet: the
       counts are read back and each one is wrapped in a ``Motif``;
     * otherwise the value already kept on the instance is reused.
     """
     cache_file = self.cache_location / 'transcription_factor_matrices.json'
     if not os.path.exists(cache_file):
         matrices = taxonid_to_jaspar_matrix_ids(self.taxonid)
         # Only the counts of each matrix are written to the cache.
         stored_counts = {name: matrix.counts
                          for name, matrix in matrices.items()}
         with open(cache_file, 'w+') as sink:
             json.dump(stored_counts, sink)
         self._transcription_factor_matrices = matrices
     elif not hasattr(self, '_transcription_factor_matrices'):
         with open(cache_file) as source:
             stored_counts = json.load(source)
         matrices = {name: Motif(counts=counts)
                     for name, counts in stored_counts.items()}
         self._transcription_factor_matrices = matrices
     return self._transcription_factor_matrices
Пример #15
0
from Bio import motifs
from Bio.Seq import Seq

# Seven aligned 5-mers used as the instances of the example motif.
instances = [
    Seq("TACAA"),
    Seq("TACGC"),
    Seq("TACAC"),
    Seq("TACCC"),
    Seq("AACCC"),
    Seq("AATGC"),
    Seq("AATGC"),
]
m = motifs.create(instances)

print(m.counts)
# Call weblogo on the motif itself: no import of the `Motif` class is visible
# in this snippet, so the unbound form `Motif.weblogo(m, ...)` would raise
# NameError. The bound call is equivalent and needs no extra import.
m.weblogo('test.png')

from Bio.Alphabet import IUPAC
# Read the multiple sequence alignment file (Clustal format).
# The path is assembled from adjacent string literals, one per directory level.
# NOTE(review): no import of `AlignIO` is visible in this snippet - confirm
# that `from Bio import AlignIO` exists before this line runs.
alignment = AlignIO.read(
    "C:/"
    "Users/"
    "SD NOH/"
    "PycharmProjects/"
    "First/"
    "Bioinformatics_Biopython-master/"
    "Bioinformatics_Biopython-master/"
    "Section1/"
    "Chap7/"
    "HBA.aln", "clustal")
Пример #16
0
# Get the counts to fill the above dictionaries: for every saved cluster JSON,
# record how many motifs it holds and the IC of each of those motifs.
for file in os.listdir(path_to_clusters):

    # Only files with a ".json" extension are cluster files.
    if not file.endswith('.json'):
        continue

    # Strip the ".json" suffix to obtain the cluster id.
    cluster_id = file[:-5]
    # os.path.join works with or without a trailing separator on the
    # directory; the context manager closes the handle after parsing.
    with open(os.path.join(path_to_clusters, file), 'r') as cluster_file:
        cluster_reader = json.load(cluster_file)

    # Add info about the number of motifs
    num_motifs_per_cluster[cluster_id] = len(cluster_reader['motifs'])

    # Add info about the IC per motif.
    # NOTE: entries are keyed by consensus string, so two motifs of one
    # cluster sharing a consensus end up in a single entry (last one wins).
    ic_per_motif_per_cluster[cluster_id] = {}
    for motif_instances in cluster_reader['motifs']:
        temp_motif = Motif(instances=Instances(motif_instances))
        temp_motif_seq = str(temp_motif.consensus)
        temp_motif_ic = temp_motif.pssm.mean()

        ic_per_motif_per_cluster[cluster_id][temp_motif_seq] = temp_motif_ic

# Summary statistics over the number of motifs per cluster. The dict's value
# view is passed directly; every function below accepts any iterable.
avg_num_motifs_per_cluster = statistics.mean(num_motifs_per_cluster.values())
stdv_num_motifs_per_cluster = statistics.stdev(num_motifs_per_cluster.values())
max_num_motifs_per_cluster = max(num_motifs_per_cluster.values())
min_num_motifs_per_cluster = min(num_motifs_per_cluster.values())
median_num_motifs_per_cluster = statistics.median(
    num_motifs_per_cluster.values())
mode_num_motifs_per_cluster = statistics.mode(
Пример #17
0
# Every motif read from every cluster file, in reading order.
all_motifs = []

# Holds motifs per cluster, keyed by the cluster id stored in each file.
motifs_per_cluster = {}

# Import the motifs
for file in tqdm(os.listdir(operon_clusters_path),
                 desc='Reading in motifs from saved clusters'):
    if file.split('.')[-1] != 'json':
        continue

    # os.path.join works with or without a trailing separator on the
    # directory; the context manager closes the handle after parsing.
    with open(os.path.join(operon_clusters_path, file), 'r') as cluster_file:
        cluster_info = json.load(cluster_file)
    motifs_per_cluster[cluster_info['cluster_id']] = []

    for motif in cluster_info['motifs']:
        motif_obj = Motif(instances=Instances(motif))
        all_motifs.append(motif_obj)
        motifs_per_cluster[cluster_info['cluster_id']].append(motif_obj)

print('-' * 5, ' Total motifs: ', len(all_motifs))

### STEP 2: Bin the motifs based on a similarity threshold.

# Progress message for the binning stage.
print('Binning motifs...')
# Threshold used for binning.
# NOTE(review): the comparison that consumes it is not visible in this snippet - confirm how it is applied.
sim_threshold = 0.26
# Holds all motif bins.
motif_bins = {}

# NOTE(review): presumably records which bin each motif was assigned to - confirm against the binning loop.
motif_assignments = {}

for m in all_motifs:
Пример #18
0
#7.5.2.WebLogo_example_1.py

from Bio.motifs import Motif
from Bio import motifs
from Bio.Seq import Seq

# Seven aligned 5-mers used as the instances of the example motif.
instances = [
    Seq(site)
    for site in ("TACAA", "TACGC", "TACAC", "TACCC", "AACCC", "AATGC", "AATGC")
]

m = motifs.create(instances)

# Show the per-position nucleotide counts, then render the logo to a PNG file.
print(m.counts)
m.weblogo('test.png')
Пример #19
0
'''
A copy from https://github.com/ErillLab/Transfer_method_analysis/blob/71a042cc8ca0ce03a1d44c84d87f9bb509b6ec4c/src/motif.py#L81
Being used for testing purposes.
'''

from Bio.motifs import Motif, Instances
import math


# Build a motif from three aligned 8-mers and print its consensus sequence.
test_motif = Motif(instances=Instances(["ATCAGTCA", "ATCAGTAA", "ATCTGTCA"]))
print(test_motif.consensus)
Пример #20
0
# SeqIO.read accepts a file name and then opens and closes the file itself,
# so no handle is left open.
sample_2 = SeqIO.read("/mnt/hgfs/shared_folder/AJ011405.gb", "genbank")
sample_3 = SeqIO.read("/mnt/hgfs/shared_folder/AJ011408.gb", "genbank")

# `sample_1` is expected to be defined before this point.
# Named `samples` rather than `list` so the builtin is not shadowed.
samples = [sample_1, sample_2, sample_3]

# One FASTA entry per sample: a ">name description" header line followed by
# the sequence. str() makes the str/Seq concatenation explicit.
aln_fasta = "".join(
    ">" + sample.name + " " + sample.description + "\n" + str(sample.seq) + "\n"
    for sample in samples
)
with open("aln_fasta", "w") as fr:
    fr.write(aln_fasta)

from Bio.Align.Applications import MuscleCommandline
# Location of the MUSCLE executable.
# NOTE(review): the path relies on "~" being expanded when the command runs - confirm on the target platform.
muscle_exe = "~/muscle64"
# Command that reads the "aln_fasta" file written above and writes "7.8.aln".
# NOTE(review): `clw` presumably requests Clustal-format output, matching the
# "clustal" format used to read "7.8.aln" later - confirm.
cmd_line = MuscleCommandline(muscle_exe, input="aln_fasta", out="7.8.aln", clw=" ")
# Show the assembled command line before running it.
print (cmd_line)
# Calling the command object runs it; the call yields two values, stored as stdout and stderr.
stdout, stderr = cmd_line()

from Bio import AlignIO
from Bio.motifs import Motif
from Bio import motifs
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
# Load the Clustal alignment from "7.8.aln".
alignment = AlignIO.read("7.8.aln", "clustal")
# Re-wrap every aligned row as a DNA Seq so it can serve as a motif instance.
instance = [
    Seq(str(record.seq), IUPAC.unambiguous_dna) for record in alignment
]
m = motifs.create(instance)
# Render the motif's sequence logo to a PNG file.
m.weblogo('7.8_weblogo.png')
Пример #21
0
def jaspar_matrix_id_to_motif(matrix_id, timeout=30):
    """Fetch one matrix from the JASPAR REST API and wrap it in a Motif.

    Parameters
    ----------
    matrix_id:
        Identifier interpolated into the API URL.
    timeout: float or tuple, optional
        Passed to ``requests.get``. Without a timeout the request can block
        forever if the server never answers. Defaults to 30 seconds.

    Returns
    -------
    Motif
        Built with ``counts`` set to the ``'pfm'`` entry of the JSON reply.

    Raises
    ------
    KeyError
        If the JSON reply has no ``'pfm'`` entry.
    requests.exceptions.RequestException
        On connection problems, including a timeout.
    """
    url = f'http://jaspar.genereg.net/api/v1/matrix/{matrix_id}/'
    response = requests.get(url, timeout=timeout)
    return Motif(counts=response.json()['pfm'])
Пример #22
0
#13.5.WebLogo.py

from Bio.motifs import Motif
from Bio import motifs
from Bio.Seq import Seq

# Eight aligned 7-mers used as the instances of the example motif.
instances = [
    Seq(site)
    for site in (
        "AATTAAA",
        "AAAAAGA",
        "AAATAGC",
        "AATCAAC",
        "AATTTAA",
        "TATCAGA",
        "ATATAGC",
        "ATATTAA",
    )
]

m = motifs.create(instances)

# Show the per-position nucleotide counts, then render the logo to a PNG file.
print(m.counts)
m.weblogo('13.5.png')