def MOODS_search(seq, motif, thresholds=0):
    """Scan ``seq`` with ``motif`` via MOODS; an equivalent to Motif.search_pwm().

    Parameters
    ----------
    seq
        Sequence handed unchanged to ``MOODS.search``.
    motif
        Object whose ``log_odds()`` yields one mapping per motif column; the
        values of each mapping are taken in sorted-key order.
    thresholds
        Forwarded to ``MOODS.search`` (called with ``absolute_threshold=True``).

    Returns
    -------
    list
        The hits reported for the single matrix, ordered by the absolute
        value of each hit's first element.

    Raises
    ------
    RuntimeError
        If the module-level flag ``USE_MOODS`` is false.
    """
    if not USE_MOODS:
        raise RuntimeError("MOODS could not be imported")

    # Build real lists (not ``map`` objects) so the rows stay indexable on
    # Python 3 as well as Python 2.
    rows = [
        [score for _, score in sorted(column.items())]
        for column in motif.log_odds()
    ]
    matrix_ = MOODS.transpose(rows)

    # Note: algorithm = 'lf' fails due to segmentation fault
    results_per_matrix = MOODS.search(
        seq,
        [matrix_],
        thresholds,
        bg=None,
        algorithm="pla",
        q=7,
        absolute_threshold=True,
        both_strands=True,
        combine=True,
    )

    # format as Motif.search_pwm results
    search_results = results_per_matrix[0]

    # figure out direction of reverse results
    # do we need to reverse it?
    # ``key=`` replaces the Python-2-only ``cmp=`` argument; the sort is
    # stable, so ties keep their original relative order exactly as before.
    return sorted(search_results, key=lambda hit: abs(hit[0]))
def getMOODSscore_old(seq, mat):
    """Return a ``(1, len(seq))`` float32 array of MOODS scores for one matrix.

    Every position starts at the sentinel -100 and is overwritten with the
    score of each ``(position, score)`` hit reported by ``MOODS.search``.
    """
    results = MOODS.search(seq, [mat], thresholds=1, absolute_threshold=False)
    # ``np.float32`` replaces the upper-case string alias 'Float32', which
    # current NumPy releases no longer understand.
    resarray = np.full((1, len(seq)), -100, dtype=np.float32)
    for position, score in results[0]:
        resarray[0, position] = score
    return resarray
def getMOODSscore(seqfile, pwmfiles, both_strands=False):
    """Score the first FASTA record of ``seqfile`` against every PWM file.

    Parameters
    ----------
    seqfile : str
        Path to a FASTA file; only the first record is scanned.
    pwmfiles : sequence of str
        Paths loaded with ``MOODS.load_matrix``.
    both_strands : bool
        When true, the reverse complement of each matrix is scanned as an
        extra matrix and its parsed result is appended (``axis=0``) to the
        forward result of the same PWM.

    Returns
    -------
    list
        One entry per PWM file, each produced by ``vegardparseMOODSres``.
    """
    # The context manager closes the handle even if parsing raises.
    with open(seqfile, "r") as handle:
        records = list(Bio.SeqIO.parse(handle, "fasta"))
    seq = records[0].seq
    # Single pre-formatted strings print identically on Python 2 and 3.
    print('len(seq)= %s' % (len(seq),))

    matrixlist = []
    for f in pwmfiles:
        matrix = MOODS.load_matrix(f)
        print('pwm  %s windowlength= %s' % (f, len(matrix[0])))
        matrixlist.append(matrix)
        if both_strands:
            # The both_strands option in MOODS returned an awkward result,
            # so the reverse complement is scanned as a separate matrix.
            matrixlist.append(MOODS.reverse_complement(matrix))

    print('starting MOODS.search %s' % (datetime.now(),))
    results = MOODS.search(seq, matrixlist, thresholds=1, absolute_threshold=False)
    print('done MOODS.search %s' % (datetime.now(),))

    # Each PWM owns one result slot, or two (forward, reverse) when both
    # strands are scanned.
    stride = 2 if both_strands else 1
    reslist = []
    for n in range(len(pwmfiles)):
        first = n * stride
        scores = vegardparseMOODSres(results[first], len(seq))
        if both_strands:
            reverse_scores = vegardparseMOODSres(results[first + 1], len(seq))
            scores = np.append(scores, reverse_scores, axis=0)
        reslist.append(scores)
    return reslist
def getPWMscores(regions, PWM, fasta, regionsFile):
    """
    Score every basepair of a set of regions with a match to a PWM.

    :param regions: `bedtools.BedTool` with regions to look at.
    :type regions: bedtools.BedTool
    :param PWM: Matrix passed to ``MOODS.search`` as the only matrix.
    :type PWM:
    :param fasta: Fasta file with genome sequence.
    :type fasta: str
    :param regionsFile: Tab-separated file; the fourth column of each line is
        used as the row label of the corresponding sequence.
    :type regionsFile: str
    :returns: Data frame with one row per region and columns labelled
        -990..989, so every region must yield exactly 1980 scores.
    :rtype: pandas.DataFrame
    """
    import MOODS
    import numpy as np
    import pandas as pd

    # Get nucleotides
    seq = regions.sequence(s=True, fi=fasta)
    with open(seq.seqfn) as handle:
        # Every second line, i.e. this assumes two-line FASTA records.
        seqs = handle.read().split("\n")[1::2]  # get fasta sequences

    # Match strings with PWM
    scores = []
    for sequence in seqs:
        result = MOODS.search(sequence, [PWM], 30)
        scores.append(np.array([score for _, score in result[0]]))

    # Close the file deterministically and skip blank lines instead of
    # blindly popping the last element, which dropped a real region whenever
    # the file lacked a trailing newline.
    with open(regionsFile) as handle:
        names = [
            line.split("\t")[3]
            for line in handle.read().split("\n")
            if line
        ]

    return pd.DataFrame(scores, index=names, columns=range(-990, 990))
def getMOODSscore_old(seq, mat):
    """Return a ``(1, len(seq))`` float32 array of MOODS scores for one matrix.

    Every position starts at the sentinel -100 and is overwritten with the
    score of each ``(position, score)`` hit reported by ``MOODS.search``.
    """
    results = MOODS.search(seq, [mat], thresholds=1, absolute_threshold=False)
    # ``np.float32`` replaces the upper-case string alias 'Float32', which
    # current NumPy releases no longer understand.
    resarray = np.full((1, len(seq)), -100, dtype=np.float32)
    for position, score in results[0]:
        resarray[0, position] = score
    return resarray
def search(consensus_list, TF, master, search_region, p):
    """Scan FASTA-style lines with MOODS and print every new motif hit.

    Parameters
    ----------
    consensus_list : sequence of str
        Consensus string per matrix; its length is used as the match width.
    TF : sequence of str
        Transcription-factor label per matrix (same order as ``master``).
    master : sequence
        Matrices passed to ``MOODS.search``.
    search_region : iterable of str
        Lines; one containing '>' becomes the current header, one containing
        'strand' is skipped, anything else is scanned as sequence.
    p : threshold forwarded to ``MOODS.search``.

    If the third command-line argument (``sys.argv[3]``) is the string
    "True", only the first hit per (TF, header) pair is reported and no
    alignment is printed; otherwise each (TF, header, position) is reported
    once and shown through ``align``.
    """
    interaction_only = len(sys.argv) > 3 and sys.argv[3] == "True"

    # Report how many matrices will be scanned.
    sys.stderr.write('%d\n' % len(master))

    seen = set()  # hashed de-duplication instead of a linear list scan
    header = ''
    for region in search_region:
        if 'strand' in region:
            continue
        if '>' in region:
            header = region
            continue
        if not master:
            # Nothing to scan with; the original performed no search either.
            continue

        # One search covers every matrix; the original repeated this same
        # call once per matrix and relied on de-duplication to hide it.
        result = MOODS.search(region, master, p)
        for check, hits in enumerate(result):
            # Label with the TF of the matrix that actually matched (the
            # interaction-only branch used to print the first TF each time).
            tf_name = TF[check]
            tf_length = len(consensus_list[check])
            for hit in hits:
                if len(hit) == 0:
                    continue
                position = hit[0]
                if interaction_only:
                    key = (tf_name, header)
                    if key in seen:
                        continue
                    seen.add(key)
                    print('%s %s' % (tf_name.strip(), header.strip()[1:]))
                    print('')
                else:
                    key = (tf_name, header, position)
                    if key in seen:
                        continue
                    seen.add(key)
                    print('%s %s' % (tf_name.strip(), header.strip()[1:]))
                    print('position: %s' % (position,))
                    align(consensus_list[check],
                          region[position:position + tf_length])
                    print('')
def ProcessSeqs(SEQ_HANDLE, PWMS, THRESHOLD, WANT_REV=False, bg=None):
    """Yields matches on sequences in an 'interval' formatted dictionary

    Parameters
    ----------
    SEQ_HANDLE : passed to ``ReadInterval`` to obtain interval dictionaries.
    PWMS : iterable of ``(name, matrix, ...)`` entries.
    THRESHOLD : p-value given to ``MOODS.threshold_from_p`` for each matrix.
    WANT_REV : forwarded to ``MOODS.search`` as ``both_strands``.
    bg : background forwarded to MOODS.

    Yields
    ------
    dict
        One dictionary per hit whose score exceeds the matrix threshold, with
        keys NAME, START, END, STRAND, PWM, SCORE, CHROM and SEQ.
    """
    # Materialise everything once: ``map`` objects are single-use on
    # Python 3 and ``PWMS`` itself may be a generator.
    pwm_pairs = list(PWMS)
    pwm_names = [pair[0] for pair in pwm_pairs]
    pwm_mats = [pair[1] for pair in pwm_pairs]
    thresh = [MOODS.threshold_from_p(mat, bg, THRESHOLD) for mat in pwm_mats]

    for interval in ReadInterval(SEQ_HANDLE):
        print(interval['NAME'])
        results = MOODS.search(interval['SEQ'].upper(), pwm_mats, thresh,
                               both_strands=WANT_REV, algorithm='lf',
                               absolute_threshold=True, bg=bg)
        interval_start = int(interval['START'])
        for res, pwm_name, pwm_mat, th in zip(results, pwm_names, pwm_mats, thresh):
            width = len(pwm_mat[0])
            # NOTE(review): if MOODS reports reverse-strand hits as negative
            # positions when WANT_REV is set, START/END/SEQ below are computed
            # from that negative offset - confirm against the MOODS version.
            for position, score in res:
                if score <= th:
                    print('got bad result')
                    continue
                yield {
                    'NAME': interval['NAME'],
                    'START': interval_start + position,
                    'END': interval_start + width + position,
                    'STRAND': interval['STRAND'],
                    'PWM': pwm_name,
                    'SCORE': score,
                    'CHROM': interval['CHROM'],
                    'SEQ': interval['SEQ'][position:(position + width)].upper()
                }
def getMOODSscore(seqfile, pwmfiles, both_strands=False):
    """Score the first FASTA record of ``seqfile`` against every PWM file.

    Parameters
    ----------
    seqfile : str
        Path to a FASTA file; only the first record is scanned.
    pwmfiles : sequence of str
        Paths loaded with ``MOODS.load_matrix``.
    both_strands : bool
        When true, the reverse complement of each matrix is scanned as an
        extra matrix and its parsed result is appended (``axis=0``) to the
        forward result of the same PWM.

    Returns
    -------
    list
        One entry per PWM file, each produced by ``vegardparseMOODSres``.
    """
    # The context manager closes the handle even if parsing raises.
    with open(seqfile, "r") as handle:
        records = list(Bio.SeqIO.parse(handle, "fasta"))
    seq = records[0].seq
    # Single pre-formatted strings print identically on Python 2 and 3.
    print('len(seq)= %s' % (len(seq),))

    matrixlist = []
    for f in pwmfiles:
        matrix = MOODS.load_matrix(f)
        print('pwm  %s windowlength= %s' % (f, len(matrix[0])))
        matrixlist.append(matrix)
        if both_strands:
            # The both_strands option in MOODS returned an awkward result,
            # so the reverse complement is scanned as a separate matrix.
            matrixlist.append(MOODS.reverse_complement(matrix))

    print('starting MOODS.search %s' % (datetime.now(),))
    results = MOODS.search(seq, matrixlist, thresholds=1, absolute_threshold=False)
    print('done MOODS.search %s' % (datetime.now(),))

    # Each PWM owns one result slot, or two (forward, reverse) when both
    # strands are scanned.
    stride = 2 if both_strands else 1
    reslist = []
    for n in range(len(pwmfiles)):
        first = n * stride
        scores = vegardparseMOODSres(results[first], len(seq))
        if both_strands:
            reverse_scores = vegardparseMOODSres(results[first + 1], len(seq))
            scores = np.append(scores, reverse_scores, axis=0)
        reslist.append(scores)
    return reslist
# Example: scan the first FASTA record with two matrices, first through the
# one-shot MOODS.search call and then through a reusable MOODSSearch object.
records = fasta.parseFasta(fasta_filepath)
seq = records[0][1]

matrix1 = [
    [0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
]
# NOTE(review): the first row of matrix2 has 6 entries while the other rows
# have 7 - looks like a missing value; confirm the intended matrix.
matrix2 = [
    [10, 0, 10, 3, 5, 5],
    [0, 5, 0, 3, 5, 0, 5],
    [0, 1, 0, 3, 0, 5, 0],
    [0, 4, 0, 1, 0, 0, 5],
]
matrices = [matrix1, matrix2]

# One-shot search with a single threshold shared by both matrices.
results = MOODS.search(seq, matrices, 0.011)
print("Matrix 1 results: %d" % len(results[0]))
print("Matrix 2 results: %d" % len(results[1]))

# Same scan through a pre-built search object with explicit settings.
thresholds = [0.011, 0.011]
bg = MOODS.bg_from_sequence(seq, 0.1)
q = 7
absolute_threshold = False
both_strands = False
ms = MOODS.MOODSSearch(matrices, thresholds, bg, q, absolute_threshold, both_strands)
results = ms.search(seq)
print("New Matrix 1 results: %d" % len(results[0]))
def main():
    """Count MOODS motif hits per sequence and optionally normalise them.

    Two modes, selected on the command line:

    * ``--to-norm PATH``: load a previous run with ``load_to_normalize``,
      write the normalised table to ``--norm`` and exit with status 0.
    * otherwise: scan every sequence from ``--seqs`` with all motifs from
      ``--motifs``, write one tab-separated row of hit counts per sequence to
      ``--out`` and, when ``--norm`` is given, also write a normalised table.
    """
    desc = """... ask me later! I'm on a deadline! ..."""

    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument(
        '--seqs', type=str,
        help="Path to a fasta file containing the 'promoter' regions of a "
             "single species you wish to scan with motifs.")
    parser.add_argument(
        '--species', type=str,
        help="A quoted string of the species name: 'Anophles gambiae'.")
    parser.add_argument(
        '--motifs', type=str,
        help="Path to a file containing the motifs you wish to use. The "
             "file must be in JASPAR's 'matrix_only.txt' format.")
    parser.add_argument(
        '--thresh', type=float, required=False, default=0.001,
        help="A p-val cut-off above which hits will be ignored. "
             "(default = %(default)s)")
    parser.add_argument(
        '--out', type=str, required=False, default='compare_motifs.out',
        help="Path to outfile. (default = %(default)s)")
    parser.add_argument(
        '--norm', type=str, required=False, default=False,
        help="Optional path to outfile for data normalized by upper "
             "quartiles w.r.t. each motif. (default = %(default)s)")
    parser.add_argument(
        '--to-norm', type=str, required=False, default=False,
        help="Optional path to outfile of previous run that needs to be "
             "normalized. (default = %(default)s)")

    args = parser.parse_args()

    if args.to_norm:
        # Normalise an existing table; no scanning is needed in this mode.
        norm_dict, headers = load_to_normalize(args.to_norm)
        write_normalized_table(headers, args.norm, norm_dict)
        # Same effect as exit(0) without depending on the site-provided name.
        raise SystemExit(0)

    # create parsers
    motifs = ParseJasparMatrixOnly(args.motifs)
    seqs = ParseFastA(args.seqs)

    # Load all motifs at once
    # We will be loading one seq at a time.
    motifs = motifs.to_dict()
    # Materialise the matrices once instead of rebuilding the values view
    # for every sequence; keys and values come from the same dict, so the
    # header columns and the count columns stay aligned.
    matrices = list(motifs.values())

    # set up output and headers
    headers = 'seq_name\tspecies\t%s\n' % ('\t'.join(motifs.keys()))
    norm_dict = OrderedDict() if args.norm else None

    # lets start the major looping
    # The context manager guarantees the output is flushed and closed even
    # if a search fails part-way through.
    with open(args.out, 'w') as out_file:
        out_file.write(headers)
        for name, seq in seqs:
            hits = MOODS.search(seq, matrices, args.thresh)
            counts = [len(x) for x in hits]
            out_file.write('%s\t%s\t%s\n' % (
                name, args.species, '\t'.join([str(x) for x in counts])))
            if args.norm:
                norm_dict['%s\t%s' % (name, args.species)] = np.array(counts)

    if args.norm:
        write_normalized_table(headers, args.norm, norm_dict)
[0,0,0,1]] teststring = 'acgtacgt' ''' handle = open('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.fasta', "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() seq = records[0] teststring=seq.seq print('both strands') results = MOODS.search(teststring, [matrix], thresholds=1, absolute_threshold=False, both_strands=True) for i in results: for (position, score) in i: print("Position: " + str(position) + " Score: "+ str(score)) print('one way') results = MOODS.search(teststring, [matrix], thresholds=1, absolute_threshold=False, both_strands=False) for i in results: for (position, score) in i: print("Position: " + str(position) + " Score: "+ str(score))
import MOODS

# Toy score matrix with four rows of three columns
# (presumably one row per nucleotide A, C, G, T - confirm).
matrix = [
    [10, 0, 0],
    [0, 10, 0],
    [0, 0, 10],
    [10, 10, 10],
]

# NOTE(review): absolute_threshold reads like a flag; 30 is passed here and
# presumably only its truthiness matters - confirm against the MOODS API.
results = MOODS.search(
    'actgtggcgtcaacgtaggccaacgtggacccgtacgtaaacgaagaggggtagtc',
    [matrix],
    30,
    absolute_threshold=30,
)

# One list of (position, score) hits per matrix.
for hits in results:
    for position, score in hits:
        print("Position: %s Score: %s" % (position, score))
def find_motif_disruptions(
    position,
    ref,
    alt,
    genome_fasta,
    matrices,
):
    """
    Determine whether there is a difference between the ref and alt
    alleles for TF binding. Requires samtools in your path.

    Parameters
    ----------
    position : str
        Zero based genomic coordinates of the reference allele of the form
        chrom:start-end (chr5:100-101 for a SNV for instance). The value end -
        start should equal the length of the ref allele.

    ref : str
        Reference allele. This should match the reference sequence at "position"
        in genome_fasta.

    alt : str
        Alternate allele.

    genome_fasta : str
        Path to genome fasta file. This file should be indexed.

    matrices : dict
        Dict whose keys are motif names and whose values are pandas data frames
        or numpy arrays containing PWMs with columns ACGT.

    Returns
    -------
    out : pandas.DataFrame
        Pandas data frame with motifs whose best matches that overlapped the
        variant differed between the reference and alternate sequences. A score
        of zero and a strand of '' indicates that there was not a match for the
        motif on the given allele.

    """
    import subprocess
    import MOODS
    # import pybedtools as pbt
    max_motif_length = max([x.shape[0] for x in matrices.values()])
    flank = max_motif_length - 1
    chrom, coords = position.split(':')
    start, end = [int(x) for x in coords.split('-')]

    # Never reach before the first base of the chromosome; the variant
    # offset inside the fetched window shrinks accordingly.
    left_flank = min(flank, start)
    # samtools regions are 1-based and inclusive while ``position`` is
    # documented as zero-based half-open, hence ``+ 1`` on the start only.
    # (The previous window began one base too early, which shifted the
    # substituted allele by one position.)
    region = '{}:{}-{}'.format(chrom, start - left_flank + 1, end + flank)

    # Argument list instead of a shell string: no quoting or injection
    # issues. universal_newlines gives text on both Python 2 and 3.
    faidx_out = subprocess.check_output(
        ['samtools', 'faidx', genome_fasta, region],
        universal_newlines=True)
    seq_lines = faidx_out.strip().split()
    # faidx wraps long sequences over several lines; join all of them
    # rather than keeping only the first.
    ref_seq = ''.join(seq_lines[1:])
    alt_seq = ref_seq[:left_flank] + alt + ref_seq[left_flank + len(ref):]

    ref_variant_start = left_flank
    ref_variant_end = left_flank + len(ref)
    alt_variant_start = left_flank
    alt_variant_end = left_flank + len(alt)

    motif_names = list(matrices.keys())
    # Accept both DataFrames (via ``.values``) and plain numpy arrays, as
    # the docstring promises.
    ms = [getattr(matrices[x], 'values', matrices[x]).T.tolist()
          for x in motif_names]
    ref_res = MOODS.search(ref_seq, ms, 0.001, both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    ref_res = dict(zip(motif_names, ref_res))
    alt_res = MOODS.search(alt_seq, ms, 0.001, both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    alt_res = dict(zip(motif_names, alt_res))

    def best_hit(hits):
        """Return (score, strand) of the top-scoring hit, or (0, '')."""
        if len(hits) == 0:
            return 0, ''
        pos, score = max(hits, key=lambda x: x[1])
        # Position 0 is a valid forward-strand hit, so test ``>= 0``.
        # NOTE(review): assumes reverse-strand hits keep negative positions
        # after _filter_variant_motif_res - confirm.
        return score, ('+' if pos >= 0 else '-')

    # First we'll remove any motif matches that don't overlap the variant of
    # interest (and thus can't be affected by the variant and will be the same
    # for ref and alt). Then we'll get the best match for each motif for ref
    # and alt.
    rows = []
    for motif in motif_names:
        motif_length = matrices[motif].shape[0]
        ref_hits = _filter_variant_motif_res(
            ref_res[motif], ref_variant_start, ref_variant_end,
            motif_length, ref_seq)
        alt_hits = _filter_variant_motif_res(
            alt_res[motif], alt_variant_start, alt_variant_end,
            motif_length, alt_seq)
        ref_score, ref_strand = best_hit(ref_hits)
        alt_score, alt_strand = best_hit(alt_hits)
        if ref_score > 0 or alt_score > 0:
            diff = ref_score - alt_score
            rows.append([motif, ref_score, ref_strand, alt_score, alt_strand,
                         diff])

    out = pd.DataFrame(rows, columns=['motif', 'ref_score', 'ref_strand',
                                      'alt_score', 'alt_strand', 'score_diff'])
    out.index = out.motif
    out = out.drop('motif', axis=1)
    out = out[out.score_diff != 0]
    return out
def _append_strand_matches(path, hits, seq, width):
    """Append to ``path`` the ``width``-long slice of ``seq`` at every hit.

    A negative position is shifted by ``len(seq)`` and the extracted text is
    passed through ``complement`` before being written.
    """
    with open(path, 'a') as out:
        for match in hits:
            pos = match[0]
            reverse = pos < 0
            if reverse:
                pos += len(seq)
            st = seq[pos:pos + width]
            if reverse:
                st = complement(st)
            out.write('%s\n' % st)


def count(enhlist, c, p_val):
    """Find dinucleotide-repeat motif matches in the leading sequence lines.

    Parameters
    ----------
    enhlist : sequence of str
        Lines of text; lines are concatenated (newlines removed) up to, but
        not including, the first line that starts with '>'.
    c : str
        Search mode. "default" calls ``MOODS.search`` on the raw count
        matrices with ``p_val``; "human" and "equal" first convert the
        matrices to log-odds against the respective background; any other
        value uses regular expressions (case-insensitive, three or more
        repeats of GA/CT, CA/GT, GC/CG and TA/AT).
    p_val : float
        P-value threshold for the MOODS modes.

    Returns
    -------
    list
        Four per-motif result lists (GA, CA, GC, TA order) followed by the
        length of the scanned sequence.
    """
    # Debug switch: set to True to dump the matched substrings found by
    # MOODS to dated text files (never used for the regex mode).
    write_match = False

    # Join once instead of growing and re-cleaning a string per line.
    leading = []
    for line in enhlist:
        if line.startswith('>'):
            break
        leading.append(line)
    seq = ''.join(leading).replace('\n', '')

    results = []
    pseudocount = 0.001
    # starrmot15
    # caca = [[49,0,100,0,91,0,100,0],[1,83,0,88,0,49,0,94],[49,1,0,12,0,9,0,6],[0,16,0,0,9,41,0,0,]]
    caca = [[0, 1, 0, 1, 0, 1], [1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
    me137 = [[0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0]]
    gcgc = [[0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 1], [1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0]]
    tata = [[0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 0]]
    matrices = [me137, caca, gcgc, tata]

    if c == "default":
        results = MOODS.search(seq, matrices, p_val, both_strands=True)
    elif c == "human" or c == "equal":
        if c == "human":
            background = [0.29508855202553025, 0.20466109233964447,
                          0.20478482916547036, 0.2954655264693549]
        else:
            background = [0.25, 0.25, 0.25, 0.25]
        matrices = [MOODS.count_log_odds(matrix, background, pseudocount)
                    for matrix in matrices]
        thresholds = [MOODS.threshold_from_p(matrix, background, p_val)
                      for matrix in matrices]
        results = MOODS.search(seq, matrices, thresholds,
                               convert_log_odds=False, threshold_from_p=False,
                               both_strands=True)
    else:
        # Regex results are plain strings, not (position, score) hits, so
        # they must never reach the match-writing code below.
        write_match = False
        ga = re.compile(r'GAGA(?:GA)+', re.IGNORECASE)
        ct = re.compile(r'TCTC(?:TC)+', re.IGNORECASE)
        ca = re.compile(r'CACA(?:CA)+', re.IGNORECASE)
        gt = re.compile(r'TGTG(?:TG)+', re.IGNORECASE)
        gc = re.compile(r'GCGC(?:GC)+', re.IGNORECASE)
        cg = re.compile(r'CGCG(?:CG)+', re.IGNORECASE)
        ta = re.compile(r'TATA(?:TA)+', re.IGNORECASE)
        at = re.compile(r'ATAT(?:AT)+', re.IGNORECASE)
        # Each motif is counted together with its reverse-complement repeat.
        ga_occur = ga.findall(seq) + ct.findall(seq)
        ca_occur = ca.findall(seq) + gt.findall(seq)
        gc_occur = gc.findall(seq) + cg.findall(seq)
        ta_occur = ta.findall(seq) + at.findall(seq)
        results = [ga_occur, ca_occur, gc_occur, ta_occur]

    # write files containing the matches for each motif as found by MOODS
    # only if boolean set at top of count() is True
    if write_match:
        stamp = time.strftime("%Y-%m-%d")
        targets = [
            ('%sga_matches_%d.txt', me137),
            ('%sca_matches_%d.txt', caca),
            ('%sgc_matches_%d.txt', gcgc),
            ('%sta_matches_%d.txt', tata),
        ]
        for hits, (pattern, matrix) in zip(results, targets):
            _append_strand_matches(pattern % (stamp, 1 / p_val), hits, seq,
                                   len(matrix[1]))

    results.append(len(seq))
    return results
def match_single(motif, sequence, genomic_region, unique_threshold=None, normalize_bitscore=True, sort=False):
    """
    Performs motif matching given sequence and the motif.pssm passed as parameter.
    The genomic_region is needed to evaluate the correct binding position.

    Keyword arguments:
    motif -- Object providing pssm_list, threshold, max, len, name and is_palindrome.
    sequence -- A DNA sequence (string).
    genomic_region -- A GenomicRegion; its chrom and initial attributes anchor every hit.
    unique_threshold -- If this argument is provided, the motif search will be made using a threshold of 0 and then
                        accepting only the motif matches with bitscore/motif_length >= unique_threshold.
                        Note that a falsy value (None or 0) disables it.
    normalize_bitscore -- If True, scores are rescaled to an integer between 0 and 1000; otherwise the bitscore
                          (divided by motif.len when unique_threshold is used) is kept.
    sort -- If True, the resulting set is sorted before being returned.

    Return:
    A GenomicRegionSet named "mpbs" with one GenomicRegion per accepted match.
    """

    # Establishing threshold
    if unique_threshold:
        current_threshold = 0.0
        eval_threshold = unique_threshold
        motif_max = motif.max / motif.len
    else:
        current_threshold = motif.threshold
        eval_threshold = motif.threshold
        motif_max = motif.max

    # Performing motif matching
    # Pick the API by feature detection instead of a bare ``except`` so that
    # genuine search errors (and KeyboardInterrupt) are no longer swallowed.
    if hasattr(MOODS, "search"):
        # old MOODS version
        results = MOODS.search(sequence, [motif.pssm_list], current_threshold, absolute_threshold=True,
                               both_strands=True)
    else:
        # TODO: we can expand this to use bg from sequence, for example,
        # or from organism.
        bg = MOODS.tools.flat_bg(4)
        results = MOODS.scan.scan_dna(sequence, [motif.pssm_list], bg, [current_threshold], 7)

    grs = GenomicRegionSet("mpbs")

    for search_result in results:
        for r in search_result:
            # Hits are objects with pos/score attributes, or plain
            # (position, score) pairs when those attributes are absent.
            try:
                position, score = r.pos, r.score
            except AttributeError:
                (position, score) = r

            # Verifying unique threshold acceptance
            if unique_threshold and score / motif.len < unique_threshold:
                continue

            # If match forward strand
            if position >= 0:
                p1 = genomic_region.initial + position
                strand = "+"
            # If match reverse strand
            elif not motif.is_palindrome:
                p1 = genomic_region.initial - position
                strand = "-"
            else:
                # Reverse-strand hit of a palindromic motif: skipped.
                continue

            # Evaluating p2
            p2 = p1 + motif.len

            # Evaluating score (integer between 0 and 1000 -- needed for bigbed transformation)
            if normalize_bitscore:
                # Normalized bitscore = standardize to integer between 0 and 1000 (needed for bigbed transformation)
                if motif_max > eval_threshold:
                    norm_score = int(((score - eval_threshold) * 1000.0) / (motif_max - eval_threshold))
                else:
                    norm_score = 1000
            elif unique_threshold:
                # Keep the original bitscore, scaled by motif length
                norm_score = score / motif.len
            else:
                # Keep the original bitscore
                norm_score = score

            grs.add(GenomicRegion(genomic_region.chrom, int(p1), int(p2), name=motif.name, orientation=strand,
                                  data=str(norm_score)))

    if sort:
        grs.sort()

    return grs
import MOODS

# Toy score matrix with four rows of three columns
# (presumably one row per nucleotide A, C, G, T - confirm).
matrix = [
    [10, 0, 0],
    [0, 10, 0],
    [0, 0, 10],
    [10, 10, 10],
]

# NOTE(review): absolute_threshold reads like a flag; 30 is passed here and
# presumably only its truthiness matters - confirm against the MOODS API.
results = MOODS.search(
    'actgtggcgtcaacgtaggccaacgtggacccgtacgtaaacgaagaggggtagtc',
    [matrix],
    30,
    absolute_threshold=30,
)

# One list of (position, score) hits per matrix.
for hits in results:
    for position, score in hits:
        print("Position: %s Score: %s" % (position, score))
def main():
    """Scan every FASTA record with every PWM and write one BED file per PWM.

    Positional arguments: the PWM file, then the FASTA file. Matrices of
    width <= 20 are searched with the MOODS 'lf' algorithm, wider ones with
    'supera'. Each hit becomes one tab-separated line (chrom, start, end,
    PWM name, int(score * 100), strand) in ``<name-prefix><pwm>.bed``.
    """
    import sys

    p = optparse.OptionParser(__doc__)
    p.add_option('-t', '--thresh', action='store', dest='threshold',
                 default=0.0, help='determines threshold')
    p.add_option('-a', '--append', action='store', dest='name',
                 default='resultsfor',
                 help='appends pwm name to this when creating files')
    p.add_option('-A', '--absolute', action='store_true', dest='A',
                 default=False, help='absolute threshold')
    p.add_option('-s', '--standard_background', action='store_true',
                 dest='stdbg')
    p.add_option('-M', '--specific_Matrix', action='store', dest='specific')
    options, args = p.parse_args()
    if len(args) < 2:
        p.error('expected a PWM file and a FASTA file')

    # 'U' mode is rejected by Python 3.11+, where universal newlines are
    # already the default for text files; keep it on Python 2 only.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(args[0], read_mode) as pwm:
        index, matricies, sizes = pySeq.parsing.PWMparser.parse(pwm)
    with open(args[1], read_mode) as fa:
        pfa = list(Bio.SeqIO.parse(fa, 'fasta'))

    underorequal20 = []
    over20 = []
    under20names = []
    over20names = []
    fileout = {}
    bgt = False
    if options.stdbg:
        bgt = [0.25, 0.25, 0.25, 0.25]

    def write_hits(chrom, matrices, names, algorithm):
        """Search ``chrom`` with ``matrices`` and append hits to the BED files."""
        if not matrices:
            # No matrix in this size class, so there is nothing to write.
            return
        res = MOODS.search(chrom.seq, matrices, float(options.threshold),
                           absolute_threshold=options.A, both_strands=True,
                           bg=bgt, algorithm=algorithm)
        for name, hits in zip(names, res):
            out = fileout[name]
            for position, score in hits:
                start, end, strand = strand_adjust(position, sizes[name])
                # Add option to round the score values. Defaulting to int atm
                # since bedToBigBed only accepts integer values....
                # The '\n' is joined as a field, so each line keeps the
                # trailing tab that existing outputs already have.
                out.write('\t'.join([chrom.name, str(start), str(end), name,
                                     str(int(score * 100)), strand, '\n']))

    try:
        # Construct Matrices to search and files to write to.
        for k in index.keys():
            if options.specific and k != options.specific:
                continue
            fileout[k] = open(options.name + k + '.bed', 'w')
            if sizes[k] <= 20:
                underorequal20.append(matricies[k])
                under20names.append(k)
            else:
                over20.append(matricies[k])
                over20names.append(k)

        for chrom in pfa:
            print(chrom.name)
            # Should we sort the results as all downstream applications
            # require a sort first
            # Run under 20s
            write_hits(chrom, underorequal20, under20names, 'lf')
            # Run over 20s
            write_hits(chrom, over20, over20names, 'supera')
    finally:
        # Flush and close every BED file, also when a search fails.
        for handle in fileout.values():
            handle.close()
import fasta

# Locate the example sequence relative to the distribution directory.
DIST_DIR = abspath(dirname(dirname(LOCAL_DIR)))
print(DIST_DIR)
fasta_filepath = join(DIST_DIR, "examples/data/sequence/dnaACGT.txt")

# Example: scan the first FASTA record with two matrices, first through the
# one-shot MOODS.search call and then through a reusable MOODSSearch object.
records = fasta.parseFasta(fasta_filepath)
seq = records[0][1]

matrix1 = [
    [0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
]
# NOTE(review): the first row of matrix2 has 6 entries while the other rows
# have 7 - looks like a missing value; confirm the intended matrix.
matrix2 = [
    [10, 0, 10, 3, 5, 5],
    [0, 5, 0, 3, 5, 0, 5],
    [0, 1, 0, 3, 0, 5, 0],
    [0, 4, 0, 1, 0, 0, 5],
]
matrices = [matrix1, matrix2]

# One-shot search with a single threshold shared by both matrices.
results = MOODS.search(seq, matrices, 0.011)
print("Matrix 1 results: %d" % len(results[0]))
print("Matrix 2 results: %d" % len(results[1]))

# Same scan through a pre-built search object with explicit settings.
thresholds = [0.011, 0.011]
bg = MOODS.bg_from_sequence(seq, 0.1)
q = 7
absolute_threshold = False
both_strands = False
ms = MOODS.MOODSSearch(matrices, thresholds, bg, q, absolute_threshold, both_strands)
results = ms.search(seq)
print("New Matrix 1 results: %d" % len(results[0]))
def main():
    """Scan every FASTA record with every PWM and write one BED file per PWM.

    Positional arguments: the PWM file, then the FASTA file. Matrices of
    width <= 20 are searched with the MOODS 'lf' algorithm, wider ones with
    'supera'. Each hit becomes one tab-separated line (chrom, start, end,
    PWM name, int(score * 100), strand) in ``<name-prefix><pwm>.bed``.
    """
    import sys

    p = optparse.OptionParser(__doc__)
    p.add_option('-t', '--thresh', action='store', dest='threshold',
                 default=0.0, help='determines threshold')
    p.add_option('-a', '--append', action='store', dest='name',
                 default='resultsfor',
                 help='appends pwm name to this when creating files')
    p.add_option('-A', '--absolute', action='store_true', dest='A',
                 default=False, help='absolute threshold')
    p.add_option('-s', '--standard_background', action='store_true',
                 dest='stdbg')
    p.add_option('-M', '--specific_Matrix', action='store', dest='specific')
    options, args = p.parse_args()
    if len(args) < 2:
        p.error('expected a PWM file and a FASTA file')

    # 'U' mode is rejected by Python 3.11+, where universal newlines are
    # already the default for text files; keep it on Python 2 only.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(args[0], read_mode) as pwm:
        index, matricies, sizes = pySeq.parsing.PWMparser.parse(pwm)
    with open(args[1], read_mode) as fa:
        pfa = list(Bio.SeqIO.parse(fa, 'fasta'))

    underorequal20 = []
    over20 = []
    under20names = []
    over20names = []
    fileout = {}
    bgt = False
    if options.stdbg:
        bgt = [0.25, 0.25, 0.25, 0.25]

    def write_hits(chrom, matrices, names, algorithm):
        """Search ``chrom`` with ``matrices`` and append hits to the BED files."""
        if not matrices:
            # No matrix in this size class, so there is nothing to write.
            return
        res = MOODS.search(chrom.seq, matrices, float(options.threshold),
                           absolute_threshold=options.A, both_strands=True,
                           bg=bgt, algorithm=algorithm)
        for name, hits in zip(names, res):
            out = fileout[name]
            for position, score in hits:
                start, end, strand = strand_adjust(position, sizes[name])
                # Add option to round the score values. Defaulting to int atm
                # since bedToBigBed only accepts integer values....
                # The '\n' is joined as a field, so each line keeps the
                # trailing tab that existing outputs already have.
                out.write('\t'.join([chrom.name, str(start), str(end), name,
                                     str(int(score * 100)), strand, '\n']))

    try:
        # Construct Matrices to search and files to write to.
        for k in index.keys():
            if options.specific and k != options.specific:
                continue
            fileout[k] = open(options.name + k + '.bed', 'w')
            if sizes[k] <= 20:
                underorequal20.append(matricies[k])
                under20names.append(k)
            else:
                over20.append(matricies[k])
                over20names.append(k)

        for chrom in pfa:
            print(chrom.name)
            # Should we sort the results as all downstream applications
            # require a sort first
            # Run under 20s
            write_hits(chrom, underorequal20, under20names, 'lf')
            # Run over 20s
            write_hits(chrom, over20, over20names, 'supera')
    finally:
        # Flush and close every BED file, also when a search fails.
        for handle in fileout.values():
            handle.close()
def find_motif_disruptions(
    position,
    ref,
    alt,
    genome_fasta,
    matrices,
):
    """
    Determine whether there is a difference between the ref and alt
    alleles for TF binding. Requires samtools in your path.

    Parameters
    ----------
    position : str
        Zero based genomic coordinates of the reference allele of the form
        chrom:start-end (chr5:100-101 for a SNV for instance). The value end -
        start should equal the length of the ref allele.

    ref : str
        Reference allele. This should match the reference sequence at "position"
        in genome_fasta.

    alt : str
        Alternate allele.

    genome_fasta : str
        Path to genome fasta file. This file should be indexed.

    matrices : dict
        Dict whose keys are motif names and whose values are pandas data frames
        or numpy arrays containing PWMs with columns ACGT.

    Returns
    -------
    out : pandas.DataFrame
        Pandas data frame with motifs whose best matches that overlapped the
        variant differed between the reference and alternate sequences. A score
        of zero and a strand of '' indicates that there was not a match for the
        motif on the given allele.

    """
    import subprocess
    import MOODS
    # import pybedtools as pbt
    max_motif_length = max([x.shape[0] for x in matrices.values()])
    flank = max_motif_length - 1
    chrom, coords = position.split(':')
    start, end = [int(x) for x in coords.split('-')]

    # Never reach before the first base of the chromosome; the variant
    # offset inside the fetched window shrinks accordingly.
    left_flank = min(flank, start)
    # samtools regions are 1-based and inclusive while ``position`` is
    # documented as zero-based half-open, hence ``+ 1`` on the start only.
    # (The previous window began one base too early, which shifted the
    # substituted allele by one position.)
    region = '{}:{}-{}'.format(chrom, start - left_flank + 1, end + flank)

    # Argument list instead of a shell string: no quoting or injection
    # issues. universal_newlines gives text on both Python 2 and 3.
    faidx_out = subprocess.check_output(
        ['samtools', 'faidx', genome_fasta, region],
        universal_newlines=True)
    seq_lines = faidx_out.strip().split()
    # faidx wraps long sequences over several lines; join all of them
    # rather than keeping only the first.
    ref_seq = ''.join(seq_lines[1:])
    alt_seq = ref_seq[:left_flank] + alt + ref_seq[left_flank + len(ref):]

    ref_variant_start = left_flank
    ref_variant_end = left_flank + len(ref)
    alt_variant_start = left_flank
    alt_variant_end = left_flank + len(alt)

    motif_names = list(matrices.keys())
    # Accept both DataFrames (via ``.values``) and plain numpy arrays, as
    # the docstring promises.
    ms = [getattr(matrices[x], 'values', matrices[x]).T.tolist()
          for x in motif_names]
    ref_res = MOODS.search(ref_seq, ms, 0.001, both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    ref_res = dict(zip(motif_names, ref_res))
    alt_res = MOODS.search(alt_seq, ms, 0.001, both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    alt_res = dict(zip(motif_names, alt_res))

    def best_hit(hits):
        """Return (score, strand) of the top-scoring hit, or (0, '')."""
        if len(hits) == 0:
            return 0, ''
        pos, score = max(hits, key=lambda x: x[1])
        # Position 0 is a valid forward-strand hit, so test ``>= 0``.
        # NOTE(review): assumes reverse-strand hits keep negative positions
        # after _filter_variant_motif_res - confirm.
        return score, ('+' if pos >= 0 else '-')

    # First we'll remove any motif matches that don't overlap the variant of
    # interest (and thus can't be affected by the variant and will be the same
    # for ref and alt). Then we'll get the best match for each motif for ref
    # and alt.
    rows = []
    for motif in motif_names:
        motif_length = matrices[motif].shape[0]
        ref_hits = _filter_variant_motif_res(
            ref_res[motif], ref_variant_start, ref_variant_end,
            motif_length, ref_seq)
        alt_hits = _filter_variant_motif_res(
            alt_res[motif], alt_variant_start, alt_variant_end,
            motif_length, alt_seq)
        ref_score, ref_strand = best_hit(ref_hits)
        alt_score, alt_strand = best_hit(alt_hits)
        if ref_score > 0 or alt_score > 0:
            diff = ref_score - alt_score
            rows.append([motif, ref_score, ref_strand, alt_score, alt_strand,
                         diff])

    out = pd.DataFrame(rows, columns=['motif', 'ref_score', 'ref_strand',
                                      'alt_score', 'alt_strand', 'score_diff'])
    out.index = out.motif
    out = out.drop('motif', axis=1)
    out = out[out.score_diff != 0]
    return out