Пример #1
0
def read(handle, format):
    """Read a single motif from a JASPAR ``.pfm`` or ``.sites`` handle.

    Parameters
    ----------
    handle : iterable of str
        The lines of the file (an open file object or any other iterable
        of lines).
    format : str
        ``"pfm"`` to read a count matrix (one row per letter, in the order
        A, C, G, T) or ``"sites"`` to read FASTA-like header/sequence pairs.

    Returns
    -------
    Motif
        The parsed motif, with ``mask`` set to ``"*"`` repeated
        ``motif.length`` times.

    Raises
    ------
    ValueError
        If ``format`` is neither ``"pfm"`` nor ``"sites"``, or if a
        ``.sites`` header line is not followed by a sequence line.
    """
    # Reject unsupported formats before doing any other work.
    if format not in ("pfm", "sites"):
        raise ValueError("Unknown format %s" % format)

    alphabet = IUPAC.unambiguous_dna
    if format == "pfm":
        # reads the motif from Jaspar .pfm file
        counts = {}
        for letter, line in zip("ACGT", handle):
            words = line.split()
            # if there is a letter in the beginning, ignore it
            if words[0] == letter:
                words = words[1:]
            # A real list (not a lazy ``map`` object) so the counts can be
            # indexed and measured on Python 3 as well as Python 2.
            counts[letter] = [float(word) for word in words]
        motif = Motif(alphabet, counts=counts)
    else:
        # reads the motif from Jaspar .sites file
        # One shared iterator lets the loop and the explicit ``next`` call
        # below consume the same stream, even if ``handle`` is a list.
        lines = iter(handle)
        instances = []
        for line in lines:
            if not line.startswith(">"):
                break
            # line contains the header ">...."
            # now read the actual sequence
            line = next(lines, None)
            if line is None:
                # Do not let StopIteration leak out of the parser.
                raise ValueError("Header line without a sequence line")
            # Keep only the characters that equal their own upper-case form
            # (lower-case letters are dropped).
            instance = "".join(c for c in line.strip() if c == c.upper())
            instances.append(Seq(instance, alphabet))
        instances = Instances(instances, alphabet)
        motif = Motif(alphabet, instances=instances)
    motif.mask = "*" * motif.length
    return motif
Пример #2
0
    def load_from_json(self, file_path):
        '''
        Sets up all member variables based on the stored data in the JSON file.

        The file is expected to contain the keys 'cluster_id', 'operons',
        'filtered_promoters' and 'motifs'. Each entry of 'operons' must
        provide 'operon_id', 'genome_accession', 'features' and 'promoter'.
        Loaded operons and motifs are appended to the existing
        ``self.operons`` and ``self.motifs`` lists.

        Parameters
        ----------
        file_path: str
            Path to the JSON file that will be loaded in.

        Returns
        -------
        None
        '''

        # Use a context manager so the file handle is closed right after parsing.
        with open(file_path, 'r') as json_file:
            file_reader = json.load(json_file)

        self.cluster_id = file_reader['cluster_id']

        for op in file_reader['operons']:
            # Fields that are not stored in the JSON are filled with placeholders.
            temp_op = Operon(operon_id=op['operon_id'],
                             genome_fragment_name='imported_cluster',
                             genome_accession=op['genome_accession'],
                             genome_features='imported',
                             strand='/')

            temp_op.features = op['features']
            temp_op.promoter = op['promoter']

            self.operons.append(temp_op)

        self.filtered_promoters = file_reader['filtered_promoters']

        # Each stored motif is passed to Instances and wrapped in a Motif.
        for m in file_reader['motifs']:
            self.motifs.append(Motif(instances=Instances(m)))
Пример #3
0
def get_cluster_motifs(cluster_id, path_to_saved_clusters=cluster_jsons_path):
    '''
    Parse out the motifs for a specific cluster.

    Every file in the directory whose name, up to the first '.', equals
    cluster_id is loaded as JSON and its 'motifs' entry is read.

    Parameters
    ----------
    cluster_id: str
        The cluster of interest.
    path_to_saved_clusters: str
        The path to the directory that is holding the saved JSONs for the saved clusters.
        A trailing path separator is optional.

    Returns
    -------
    motifs: str
        The consensus sequences of the motifs that are part of the cluster,
        joined with ' | '. Empty string when no motif is found.
    '''

    motifs = []

    for file_name in os.listdir(path_to_saved_clusters):
        if file_name.split('.')[0] != cluster_id:
            continue

        # os.path.join also works when the directory has no trailing separator;
        # the context manager closes the handle after parsing.
        with open(os.path.join(path_to_saved_clusters, file_name), 'r') as json_file:
            file_reader = json.load(json_file)

        motifs.extend(
            str(Motif(instances=Instances(m)).consensus)
            for m in file_reader['motifs'])

    return ' | '.join(motifs)
def main():
    """Build a PSSM from a FASTA file and evaluate a second FASTA file with it.

    Reads the records of ``input_file_sig70_compiled``, takes the slice
    ``[28:34]`` of every sequence, prints each slice (plus a running count of
    the slices whose fifth character is "G") and builds a motif from them.
    The motif's PSSM is printed and then passed, together with every
    sequence of ``input_file_AmrZ``, to ``evaluate_prib10``. All results
    that are not an empty list are collected and printed.
    """
    seq_list = []
    count = 0
    # SeqIO.parse is consumed lazily, so iterate while the file is still open.
    with open(input_file_sig70_compiled) as pos_handle:
        for seq in SeqIO.parse(pos_handle, 'fasta'):
            sequence = seq.seq[28:34]
            if sequence[4] == "G":
                count += 1
                print(count)
            print(sequence)
            seq_list.append(sequence)

    motif = Motif(instances=Instances(instances=seq_list))
    PSSM = motif.pssm
    print(PSSM.max)
    print(PSSM.consensus)
    print(PSSM)

    positions = []
    with open(input_file_AmrZ) as amrz_handle:
        for a_seq in SeqIO.parse(amrz_handle, 'fasta'):
            print(a_seq.seq)
            prib_10_pos = evaluate_prib10(a_seq.seq, PSSM)
            # Only keep sequences for which something was reported.
            if prib_10_pos != []:
                positions.append(prib_10_pos)

    print(positions)
Пример #5
0
def get_cluster_motifs(
        cluster_id,
        path_to_saved_clusters='/Users/ichaudr/Documents/UMBC/Lab-Erill/Isaac/Vibrio_SOA/Saved Clusters/complete_clusters/'):
    '''
    Parse out the motifs for a specific cluster.

    Every file in the directory whose name, up to the first '.', equals
    cluster_id is loaded as JSON and its 'motifs' entry is read.

    Parameters
    ----------
    cluster_id: str
        The cluster of interest.
    path_to_saved_clusters: str
        The path to the directory that is holding the saved JSONs for the saved clusters.
        Defaults to the location that was previously hard-coded. A trailing
        path separator is optional.

    Returns
    -------
    motifs: list of str
        The consensus sequences of the motifs that are part of the cluster.
        Empty list when no motif is found.
    '''
    motifs = []

    for file_name in os.listdir(path_to_saved_clusters):
        if file_name.split('.')[0] != cluster_id:
            continue

        # os.path.join also works when the directory has no trailing separator;
        # the context manager closes the handle after parsing.
        with open(os.path.join(path_to_saved_clusters, file_name), 'r') as json_file:
            file_reader = json.load(json_file)

        motifs.extend(
            str(Motif(instances=Instances(m)).consensus)
            for m in file_reader['motifs'])

    return motifs
Пример #6
0
def ic_at(motif, other, offset):
    '''
    Calculates the information content (IC) for one specific alignment of two motifs.

    A temporary motif is assembled from the parts of the sites that overlap
    in the alignment, and the mean of its PSSM is returned.

    Parameters
    ----------
    motif, other: Motif objects
        The motifs of interest.
    offset: int
        The offset into `motif` at which `other` starts in the alignment of interest.

    Returns
    -------
    The value of `pssm.mean()` for the temporary motif.
    '''

    # Width of the region shared by both motifs.
    overlap = min(len(motif) - offset, len(other))
    window_end = offset + overlap

    # Sites of `motif` are cut at the offset, sites of `other` from their start.
    aligned_sites = [site[offset:window_end] for site in motif.instances]
    aligned_sites += [site[:overlap] for site in other.instances]

    # Temporary motif over the pooled sites, with uniform pseudocounts.
    pooled = Motif(instances=Instances(aligned_sites))
    pooled.pseudocounts = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

    return pooled.pssm.mean()
def build_pssm_35(sequences):
    """Return the PSSM of a motif built from slice ``[28:34]`` of each sequence."""
    windows = [entry[28:34] for entry in sequences]
    return Motif(instances=Instances(instances=windows)).pssm
def build_pssm_10(sequences):
    """Return the PSSM of a motif built from slice ``[52:58]`` of each sequence."""
    windows = [entry[52:58] for entry in sequences]
    return Motif(instances=Instances(instances=windows)).pssm
Пример #9
0
def build_pssm(sequences):
    """Wrap the normalized counts of a motif in a PositionSpecificScoringMatrix.

    The motif is built from all of ``sequences``. Its counts are normalized
    with ``normalize(0.01)`` and the result is used as the matrix values.
    NOTE(review): 0.01 is presumably the pseudocount; confirm against the
    installed Biopython version.
    """
    sites = list(sequences)
    normalized = Motif(instances=Instances(instances=sites)).counts.normalize(0.01)
    return PositionSpecificScoringMatrix(
        alphabet=IUPAC.unambiguous_dna,
        values=normalized,
    )
Пример #10
0
def printCons10(sequences):
    """Print the PSSM consensus, then the PSSM, for slice ``[52:58]`` of each sequence."""
    windows = [entry[52:58] for entry in sequences]
    scoring_matrix = Motif(instances=Instances(instances=windows)).pssm
    print(scoring_matrix.consensus)
    print(scoring_matrix)
Пример #11
0
def printCons35(sequences):
    """Print the PSSM consensus, then the PSSM, for slice ``[28:34]`` of each sequence."""
    windows = [entry[28:34] for entry in sequences]
    scoring_matrix = Motif(instances=Instances(instances=windows)).pssm
    print(scoring_matrix.consensus)
    print(scoring_matrix)
Пример #12
0
def read(handle):
    """Parse an AlignACE format handle as a Record object.

    The first two lines are stored as ``record.version`` and
    ``record.command``. Every following line is stripped and dispatched on
    its content; the order of the checks below matters because several
    conditions can match the same line.

    Parameters
    ----------
    handle : iterator of str
        The lines of the AlignACE output. It must support ``next()``.

    Returns
    -------
    Record
        The record with its parameters, sequence names and motifs filled in.

    Raises
    ------
    ValueError
        If a line matches none of the known line types, or a motif header
        does not start with the word "Motif".
    """
    record = Record()
    line = next(handle)
    record.version = line.strip()
    line = next(handle)
    record.command = line.strip()
    mask = None
    number = None
    for line in handle:
        line = line.strip()
        if line == "":
            pass
        elif line[:4] == "Para":
            # Start of the parameter section.
            record.parameters = {}
        elif line[0] == "#":
            # Sequence name is the second tab-separated field.
            seq_name = line.split("\t")[1]
            record.sequences.append(seq_name)
        elif "=" in line:
            # Split on the first "=" only, so values that themselves contain
            # "=" no longer break the tuple unpacking.
            par_name, par_value = line.split("=", 1)
            par_name = par_name.strip()
            par_value = par_value.strip()
            record.parameters[par_name] = par_value
        elif line[:5] == "Input":
            # Start of the input sequence section.
            record.sequences = []
        elif line[:5] == "Motif":
            words = line.split()
            # Explicit check instead of ``assert``, which is removed under -O.
            if words[0] != "Motif":
                raise ValueError(line)
            number = int(words[1])
            instances = []
        elif line[:3] == "MAP":
            # The MAP line closes a motif: build it from the sites collected
            # since the last "Motif" header.
            alphabet = "ACGT"
            instances = Instances(instances, alphabet)
            motif = Motif(alphabet, instances)
            motif.score = float(line.split()[-1])
            motif.number = number
            motif.mask = mask
            record.append(motif)
        elif len(line.split("\t")) == 4:
            # Site line: the first of four tab-separated fields is the sequence.
            seq = Seq(line.split("\t")[0])
            instances.append(seq)
        elif "*" in line:
            mask = line.strip("\r\n")
        else:
            raise ValueError(line)
    return record
Пример #13
0
 def transcription_factor_matrices(self):
     """Return the transcription factor matrices, using a JSON file as a cache.

     If the cache file does not exist in ``self.cache_location``, the
     matrices are obtained from ``taxonid_to_jaspar_matrix_ids`` for
     ``self.taxonid``, their counts are written to the cache file and the
     matrices are kept on the instance. If the file exists and nothing is
     stored on the instance yet, the motifs are rebuilt from the cached
     counts. Otherwise the stored value is returned as is.
     """
     cache_file = self.cache_location / 'transcription_factor_matrices.json'

     if os.path.exists(cache_file):
         # File cache present: only load it if nothing is held in memory yet.
         if not hasattr(self, '_transcription_factor_matrices'):
             with open(cache_file) as cache_handle:
                 cached_counts = json.load(cache_handle)
             self._transcription_factor_matrices = {
                 name: Motif(counts=counts)
                 for name, counts in cached_counts.items()
             }
     else:
         # No file cache: fetch the matrices and persist their counts.
         matrices = taxonid_to_jaspar_matrix_ids(self.taxonid)
         counts_by_name = {
             name: matrix.counts for name, matrix in matrices.items()
         }
         with open(cache_file, 'w+') as cache_handle:
             json.dump(counts_by_name, cache_handle)
         self._transcription_factor_matrices = matrices

     return self._transcription_factor_matrices
Пример #14
0
#Get the counts to fill the above dictionaries
for file in os.listdir(path_to_clusters):

    #Only the saved-cluster JSON files are relevant. Checking the full
    #'.json' suffix keeps this consistent with the five characters that are
    #removed below to get the cluster id.
    if not file.endswith('.json'):
        continue

    cluster_id = file[:-5]

    #Context manager so the handle is closed as soon as the file is parsed
    with open(path_to_clusters + file, 'r') as cluster_file:
        cluster_reader = json.load(cluster_file)

    #Add info about the number of motifs
    num_motifs_per_cluster[cluster_id] = len(cluster_reader['motifs'])

    #Add info about the IC per motif
    #NOTE: the entries are keyed by consensus sequence, so two motifs of one
    #cluster that share a consensus overwrite each other.
    ic_per_motif_per_cluster[cluster_id] = {}
    for motif_sites in cluster_reader['motifs']:
        temp_motif = Motif(instances=Instances(motif_sites))
        temp_motif_seq = str(temp_motif.consensus)
        temp_motif_ic = temp_motif.pssm.mean()

        ic_per_motif_per_cluster[cluster_id][temp_motif_seq] = temp_motif_ic

#All stats to be calculated:
#(statistics.stdev needs at least two clusters, otherwise it raises StatisticsError)
avg_num_motifs_per_cluster = statistics.mean(num_motifs_per_cluster.values())
stdv_num_motifs_per_cluster = statistics.stdev(num_motifs_per_cluster.values())
max_num_motifs_per_cluster = max(num_motifs_per_cluster.values())
min_num_motifs_per_cluster = min(num_motifs_per_cluster.values())
median_num_motifs_per_cluster = statistics.median(
    num_motifs_per_cluster.values())
mode_num_motifs_per_cluster = statistics.mode(
Пример #15
0
#Holds every motif that is read in, across all clusters
all_motifs = []

#Holds motifs per cluster
motifs_per_cluster = {}

#Import the motifs
#(os.listdir already returns a list, so tqdm can show the total without a copy)
for file in tqdm(os.listdir(operon_clusters_path),
                 desc='Reading in motifs from saved clusters'):
    if file.split('.')[-1] != 'json':
        continue

    #Context manager so the handle is closed as soon as the file is parsed
    with open(operon_clusters_path + file, 'r') as cluster_file:
        cluster_info = dict(json.load(cluster_file))

    #The same list object is filled below and stored under the cluster id
    cluster_motifs = []
    motifs_per_cluster[cluster_info['cluster_id']] = cluster_motifs

    for motif in cluster_info['motifs']:
        motif_obj = Motif(instances=Instances(motif))
        all_motifs.append(motif_obj)
        cluster_motifs.append(motif_obj)

print('-' * 5, ' Total motifs: ', len(all_motifs))

### STEP 2: Bin the motifs based off some threshold weight.

#Holds all motif bins
print('Binning motifs...')
sim_threshold = 0.26
motif_bins = {}

motif_assignments = {}

for m in all_motifs:
Пример #16
0
def jaspar_matrix_id_to_motif(matrix_id, timeout=30):
    """Fetch a JASPAR matrix over HTTP and build a Motif from its 'pfm' entry.

    Parameters
    ----------
    matrix_id : str
        Matrix identifier that is inserted into the JASPAR API URL.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot block the caller forever.

    Returns
    -------
    Motif
        Motif created with the 'pfm' field of the JSON response as counts.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out or the server answers with an error status.
    """
    response = requests.get(
        f'http://jaspar.genereg.net/api/v1/matrix/{matrix_id}/',
        timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    return Motif(counts=response.json()['pfm'])
Пример #17
0
'''
A copy from https://github.com/ErillLab/Transfer_method_analysis/blob/71a042cc8ca0ce03a1d44c84d87f9bb509b6ec4c/src/motif.py#L81
Being used for testing purposes.
'''

from Bio.motifs import Motif, Instances
import math


# Build a small motif from three aligned sites and print its consensus.
test_motif = Motif(
    instances=Instances([
        "ATCAGTCA",
        "ATCAGTAA",
        "ATCTGTCA",
    ])
)
print(test_motif.consensus)