def _read_motif_seq(block):
    """Build a WeightedMotif from one block of MotifFile lines.

    ``block[0][0]`` is the header line, split on single spaces: its first
    field is used as the motif name and its last field as the motif length.
    The fifth- and third-from-last header fields are converted to float and
    passed on to ``WeightedMotif``.

    ``block[1]`` is an iterable of tab-separated instance lines. For each
    line the first field is the sequence name, the second the start position
    and the last the site sequence.

    Returns the constructed motif with ``name`` and ``length`` set
    (``length`` is kept as the raw header string).
    """
    gapped = Gapped(ExtendedIUPACDNA(), '-')
    head = block[0][0].rstrip("\n").split(" ")
    motif_name = head[0]
    motif_length = head[-1]

    instances = []
    for line in block[1]:
        fields = line.rstrip("\n").split("\t")
        instance = meme.Instance(fields[-1], gapped)
        instance.motif_name = motif_name
        # str.replace strips the padding spaces on both Python 2 and 3;
        # the two-argument str.translate(None, " ") only exists on Python 2.
        instance.sequence_name = fields[0].replace(" ", "")
        instance.start = int(fields[1].replace(" ", ""))
        instance.length = int(motif_length)
        instances.append(instance)

    instances = motifs.Instances(instances, alphabet=gapped)
    motif = WeightedMotif(gapped, instances, float(head[-5]), float(head[-3]))
    motif.length = motif_length
    motif.name = motif_name
    return motif
def _read_sites(handle):
    """Read the motif from JASPAR .sites file (PRIVATE).

    The handle is consumed as pairs of lines: a header starting with ">"
    followed by a sequence line. Reading stops at the first line in header
    position that does not start with ">". From each sequence line only the
    characters that equal their own upper-case form are kept.

    Returns a Record containing a single Motif built from all collected
    sites, with its mask set to one "*" per motif position.
    """
    alphabet = dna
    sites = []
    for header in handle:
        if not header.startswith(">"):
            break
        # The sequence itself sits on the line right after the header.
        raw = next(handle)
        kept = "".join(c for c in raw.strip() if c == c.upper())
        sites.append(Seq(kept, alphabet))

    motif = Motif(
        matrix_id=None,
        name=None,
        alphabet=alphabet,
        instances=motifs.Instances(sites, alphabet),
    )
    motif.mask = "*" * motif.length

    record = Record()
    record.append(motif)
    return record
def convert(motifs_list, alphabet):
    """Return a new list of motifs rebuilt with the given alphabet.

    Every instance of every input motif is recreated as a ``meme.Instance``
    over ``alphabet``, copying its motif name, sequence name, start and
    length (the latter two coerced to int). Each new motif keeps the name
    and evalue of the motif it was built from. The input list is not
    modified.
    """
    converted = []
    for source in motifs_list:
        rebuilt_sites = []
        for site in source.instances:
            rebuilt = meme.Instance(site.tostring(), alphabet)
            rebuilt.motif_name = site.motif_name
            rebuilt.sequence_name = site.sequence_name
            rebuilt.start = int(site.start)
            rebuilt.length = int(site.length)
            rebuilt_sites.append(rebuilt)

        target = meme.Motif(
            alphabet=alphabet,
            instances=motifs.Instances(rebuilt_sites, alphabet),
        )
        target.name = source.name
        target.evalue = source.evalue
        converted.append(target)
    return converted
def __read_motifs(record, xml_tree, sequence_id_name_map):
    """Append one Motif to ``record`` for each <motif> element under <motifs>.

    For every <contributing_site> of a motif an Instance is built from the
    ``letter_id`` attributes of its <letter_ref> elements. The site's
    ``position`` attribute is stored plus one as ``start``, and the sequence
    name is looked up in ``sequence_id_name_map`` by the site's
    ``sequence_id``. Motif-level attributes (id, name, alt, width, sites,
    e_value) are copied from the <motif> element.
    """
    for motif_node in xml_tree.find("motifs").findall("motif"):
        sites = []
        site_nodes = motif_node.find("contributing_sites").findall("contributing_site")
        for site_node in site_nodes:
            sequence = "".join(
                ref.get("letter_id")
                for ref in site_node.find("site").findall("letter_ref")
            )
            site = Instance(sequence)
            site.motif_name = motif_node.get("name")
            site.sequence_id = site_node.get("sequence_id")
            site.sequence_name = sequence_id_name_map[site.sequence_id]
            # TODO - left flank, right flank
            site.start = int(site_node.get("position")) + 1
            site.pvalue = float(site_node.get("pvalue"))
            site.strand = __convert_strand(site_node.get("strand"))
            site.length = len(sequence)
            sites.append(site)

        motif = Motif(record.alphabet, motifs.Instances(sites, record.alphabet))
        motif.id = motif_node.get("id")
        motif.name = motif_node.get("name")
        motif.alt_id = motif_node.get("alt")
        motif.length = int(motif_node.get("width"))
        motif.num_occurrences = int(motif_node.get("sites"))
        motif.evalue = float(motif_node.get("e_value"))
        # TODO - ic, re, llr, pvalue, bayes_threshold, elapsed_time
        record.append(motif)
def get_random_instances(records, motiflength):
    """Draw one random gapped motif instance from every sequence record.

    A single gap size is drawn first and stored in the module-level
    ``gapSize``. For each record a random start position and a random gap
    offset are drawn. The instance consists of the letters before the gap
    followed by the letters after it, ``motiflength`` letters in total. The
    gap offset chosen for each record is stored in the module-level
    ``gapList`` under the record's index.

    :param records: iterable of records exposing a ``seq`` attribute
    :param motiflength: number of letters in each instance
    :return: a ``motifs.Instances`` collection with one instance per record
    """
    # get a random gapsize to start, the gap will be refined after multiple
    # iterations of the algorithm
    global gapSize
    # NOTE(review): random.randint includes both end points, so this can
    # yield Config.max_gapsize + 1 -- confirm whether that is intended.
    gapSize = random.randint(0, Config.max_gapsize + 1)

    instances = motifs.Instances()
    for idx, record in enumerate(records):
        pos = random.randint(0, len(record.seq) - (motiflength + gapSize))
        gappos = random.randint(0, motiflength)
        gap_start = pos + gappos
        gap_end = gap_start + gapSize
        # When gappos is 0 or motiflength one of the two slices is simply
        # empty, so the concatenation already covers the cases the former
        # (effectively unreachable) else-branch was written for.
        instances.append(
            record.seq[pos:gap_start]
            + record.seq[gap_end:pos + motiflength + gapSize]
        )
        gapList[idx] = gappos
    return instances
def new_motif(sites):
    """Given sites, return motif object.

    The sites are first filtered through ``listutils.nub_by`` using
    ``sequence.overlap_test``. The ``seq`` of each remaining site feeds a
    ``motifs.Motif`` whose pseudocounts are set to 0.25 for each of A, C, G
    and T. The result wraps the filtered sites together with that motif.
    """
    kept_sites = listutils.nub_by(sequence.overlap_test, sites)
    site_seqs = motifs.Instances([s.seq for s in kept_sites])
    inner = motifs.Motif(instances=site_seqs)
    inner.pseudocounts = dict.fromkeys("ACGT", 0.25)
    return Motif(kept_sites, inner)
def permute(motif):
    """Permute the given motif by shuffling its columns.

    One random column order is drawn and applied to every site sequence of
    ``motif``, so all sites are rearranged identically. The returned Motif
    wraps a new ``motifs.Motif`` built from the shuffled sequences, carries
    the pseudocounts of the input motif, and has ``None`` in place of the
    sites.
    """
    # random.shuffle works in place and needs a mutable sequence; a bare
    # range object is immutable on Python 3, so materialize it as a list.
    cols = list(range(length(motif)))
    random.shuffle(cols)
    shuffled = ["".join(site[i] for i in cols) for site in seqs(motif)]
    _motif = motifs.Motif(instances=motifs.Instances(shuffled))
    _motif.pseudocounts = pseudocounts(motif)
    return Motif(None, _motif)
def ic_at(motif, other, offset):
    """Return the total IC of two aligned motifs.

    ``other`` is aligned against ``motif`` starting at column ``offset`` of
    ``motif``. Only the overlapping columns are used: the site sequences of
    both motifs are trimmed to the overlap, pooled into a single motif with
    pseudocounts of 0.25 per nucleotide, and the ``mean()`` of that motif's
    PSSM is returned.
    """
    overlap = min(length(motif) - offset, length(other))
    stop = overlap + offset
    pooled = [site[offset:stop] for site in seqs(motif)]
    pooled += [site[:overlap] for site in seqs(other)]

    # Create the motif and compute the IC
    combined = motifs.Motif(instances=motifs.Instances(pooled))
    combined.pseudocounts = {"A": 0.25, "C": 0.25, "G": 0.25, "T": 0.25}
    return combined.pssm.mean()
def __read_motif_sequences(handle, motif_name, alphabet, length, revcomp):
    """Read the site table of one motif and return its instances.

    The next three lines of ``handle`` must start with '---',
    'Sequence name' and '---' respectively. Every following line up to the
    next line starting with '---' describes one site, split on whitespace:
    sequence name, (strand, only when ``revcomp`` is true,) start, p-value
    and, at index 4 once the strand is removed, the site sequence. Without
    ``revcomp`` the strand is '+'.

    Raises ValueError if the stream ends early or an expected preamble line
    is missing. Each site sequence is asserted to have ``length`` letters.

    Returns a ``motifs.Instances`` over ``alphabet``.
    """
    # (required prefix, message on end of stream, message template on mismatch)
    preamble = (
        (
            '---',
            'Unexpected end of stream: Failed to find motif sequences',
            "Line does not start with '---':\n%s",
        ),
        (
            'Sequence name',
            "Unexpected end of stream: Expected to find line starting with 'Sequence name'",
            "Line does not start with 'Sequence name':\n%s",
        ),
        (
            '---',
            'Unexpected end of stream: Failed to find motif sequences',
            "Line does not start with '---':\n%s",
        ),
    )
    for prefix, eof_message, mismatch_template in preamble:
        try:
            line = next(handle)
        except StopIteration:
            raise ValueError(eof_message)
        if not line.startswith(prefix):
            raise ValueError(mismatch_template % line)

    collected = []
    for line in handle:
        if line.startswith('---'):
            break
        words = line.strip().split()
        # With revcomp the strand is an extra column right after the name.
        strand = words.pop(1) if revcomp else '+'
        sequence = words[4]
        assert len(sequence) == length
        site = Instance(sequence, alphabet)
        site.motif_name = motif_name
        site.sequence_name = words[0]
        site.start = int(words[1])
        site.pvalue = float(words[2])
        site.strand = strand
        site.length = length
        collected.append(site)
    else:
        # The table must be closed by a '---' line.
        raise ValueError('Unexpected end of stream')
    return motifs.Instances(collected, alphabet)