def clean(self):
    """Validate the sequences submitted with the Part 3 form.

    The FASTA text is taken from ``sequenceF`` or, when that is empty, from
    ``sequenceS``. When the parsed input is non-empty, every record must use
    only the keys of ``myf.mapperDict`` plus ``N``, must not consist solely
    of ``N``, and ``numToMutate`` must not exceed the length of the shortest
    record.

    Raises:
        forms.ValidationError: if any of the checks above fails.
    """
    super(Part3Form, self).clean()
    numToMutate = self.cleaned_data.get("numToMutate")
    fastaText = (self.cleaned_data.get("sequenceF")
                 or self.cleaned_data.get("sequenceS"))
    if not fastaText:
        return

    sequence = mc.FastaFile(fastaText, fileName=False)
    if not sequence:
        return

    # Built as a set union so it works whether keys() returns a list
    # (Python 2) or a view (Python 3).
    allowedLetters = set(myf.mapperDict.keys()) | {"N"}
    for record in sequence:
        letters = set(record.sequence)
        if letters - allowedLetters:
            raise forms.ValidationError("Invalid letters!")
        if letters == {"N"}:
            raise forms.ValidationError(
                "Please do not enter a string only containing N!")

    # numToMutate is None when the key is missing from cleaned_data; there
    # is nothing to compare against in that case.
    if numToMutate is not None and numToMutate > sequence.getMinLength():
        raise forms.ValidationError(
            "The number of mutations is greater than the length of the smallest sequence"
        )
def clean(self):
    """Validate the FASTA input of the form.

    Checks that a sequence was supplied exactly once (uploaded via
    ``sequenceF`` or typed into ``sequenceS``) and that the parsed input
    respects ``self.sequenceLimit`` (number of records) and
    ``self.sequenceLengthLimit`` (longest record).

    Raises:
        forms.ValidationError: if both inputs are empty strings, if both
            are supplied, or if one of the limits is exceeded.
    """
    super(FastaForm, self).clean()
    fastaFile = self.cleaned_data.get("sequenceF")
    fastaSequence = self.cleaned_data.get("sequenceS")

    # The presence checks only run when both values are available; a None
    # (key missing from cleaned_data, or a value cleaned to None) skips them.
    if fastaFile is not None and fastaSequence is not None:
        if fastaFile == "" and fastaSequence == "":
            raise forms.ValidationError(
                "Enter a sequence ! Either upload or enter in directly!")
        if fastaFile and fastaSequence:
            raise forms.ValidationError(
                "Either upload or enter in directly ! Don't do both!")

    # The uploaded file takes precedence over the typed sequence.
    fastaText = fastaFile or fastaSequence
    if not fastaText:
        return

    sequence = mc.FastaFile(fastaText, fileName=False)
    if not sequence:
        return

    if len(sequence) > self.sequenceLimit:
        raise forms.ValidationError(
            "Currently only a maximum of %s sequences are allowed" %
            self.sequenceLimit)
    if sequence.getMaxLength() > self.sequenceLengthLimit:
        raise forms.ValidationError(
            "Currently a sequence can only be %s long." %
            self.sequenceLengthLimit)
def getMotifAndSequenceObjects(motifF, motifS, sequenceF, sequenceS):
    """Build the FastaFile objects for the motif and sequence inputs.

    For each pair the first truthy value is used: ``motifF`` before
    ``motifS`` and ``sequenceF`` before ``sequenceS``.

    Returns:
        tuple: ``(sequenceO, motifO)``. An element is ``None`` when neither
        of its two sources was provided (previously this raised
        ``UnboundLocalError``).
    """
    motifText = motifF or motifS
    sequenceText = sequenceF or sequenceS

    motifO = mc.FastaFile(motifText, fileName=False) if motifText else None
    sequenceO = (mc.FastaFile(sequenceText, fileName=False)
                 if sequenceText else None)
    return sequenceO, motifO
def getSequenceView(request):
    """Return FASTA content as a ``text/plain`` response.

    GET parameters:
        sampleFastaFile: when truthy, the raw contents of
            ``sample_fasta.fa`` (under ``settings.BASE_DIR``) are returned.
        fastaFile: otherwise, the name (without ``.fa``) of a file under
            ``fasta_files/``; every record is rendered as an HTML checkbox
            whose value is the record in FASTA form, followed by its name
            and sequence.

    Raises:
        Http404: if ``fastaFile`` is missing or points outside the
            ``fasta_files`` directory.
    """
    # Imported locally so this block does not depend on the module's import
    # section, which is not visible here.
    from django.http import Http404
    from django.utils.html import escape

    response = HttpResponse(content_type="text/plain")
    base_dir = settings.BASE_DIR

    if request.GET.get('sampleFastaFile'):
        with open(os.path.join(base_dir, "sample_fasta.fa")) as handle:
            response.write(handle.read())
        return response

    fastaFile = request.GET.get('fastaFile')
    if not fastaFile:
        raise Http404("No FASTA file requested")

    # The name comes straight from the query string: refuse anything that
    # resolves outside the fasta_files directory (e.g. "../../secret").
    fastaDir = os.path.abspath(os.path.join(base_dir, "fasta_files"))
    fastaPath = os.path.abspath(
        os.path.join(base_dir, "fasta_files/%s.fa" % fastaFile))
    if not fastaPath.startswith(fastaDir + os.sep):
        raise Http404("Unknown FASTA file")

    with open(fastaPath) as handle:
        fastaO = mc.FastaFile(handle.read(), fileName=False)

    # Names and sequences are escaped because they end up inside a
    # single-quoted attribute and in element text.
    snippets = []
    for record in fastaO:
        name = escape(record.name)
        sequence = escape(record.sequence)
        snippets.append(
            "<input type='checkbox' value='>%s\n%s'>>%s<br>%s<br><br>" %
            (name, sequence, name, sequence))
    response.write("".join(snippets))
    return response
def get_sequences(fasta):
    """Return the upper-cased sequences of a FASTA file and their reverse complements.

    ``fasta`` is passed to ``mycustom.FastaFile``. The result is a pair of
    lists in record order: the upper-cased sequences, and the string form of
    each one's ``Seq.reverse_complement()``.
    """
    records = mycustom.FastaFile(fasta)
    forward = [record.sequence.upper() for record in records]
    reverse_complements = [
        str(Seq(sequence).reverse_complement()) for sequence in forward
    ]
    return forward, reverse_complements
def clean(self):
    """Validate the motif, sequence, spacing and GC-content inputs.

    Checks performed:
      * ``motifS`` must not be an empty string and must not contain
        duplicate motifs;
      * when a sequence is supplied (``sequenceF`` or ``sequenceS``), all
        records must have the same length and
        ``len(motifs)**4 * len(sequences)`` must not exceed 100;
      * ``minSpacing`` must not exceed ``maxSpacing``;
      * ``minimumGCContent`` must not exceed ``maximumGCContent``.

    Values missing from ``cleaned_data`` are read as ``None`` and the
    checks that need them are skipped instead of raising ``KeyError``.

    Raises:
        forms.ValidationError: if any of the checks above fails.
    """
    super(Part1Form, self).clean()
    cleaned = self.cleaned_data
    minSpacing = cleaned.get('minSpacing')
    maxSpacing = cleaned.get('maxSpacing')
    minimumGCContent = cleaned.get('minimumGCContent')
    maximumGCContent = cleaned.get('maximumGCContent')
    motifS = cleaned.get('motifS')

    if motifS == "":
        raise forms.ValidationError("Enter motifs!")

    motifO = None
    if motifS:
        motifO = mc.FastaFile(motifS, fileName=False)
        if motifO.areDuplicatesPresent():
            raise forms.ValidationError("There are duplicate motifs!")

    maximumNumberOfMotifsTimesSequences = 100
    # The uploaded file takes precedence over the typed sequence.
    sequenceText = cleaned.get('sequenceF') or cleaned.get('sequenceS')
    if sequenceText:
        sequenceO = mc.FastaFile(sequenceText, fileName=False)
        if not sequenceO.lengthsSame():
            raise forms.ValidationError(
                "Sizes of the sequences should be the same")
        # The combined limit needs both inputs; skip it without motifs.
        if motifO is not None and (
                (len(motifO) ** 4) * len(sequenceO) >
                maximumNumberOfMotifsTimesSequences):
            raise forms.ValidationError(
                "Only a maximum of %s motifs^4*sequences allowed" %
                maximumNumberOfMotifsTimesSequences)

    if (minSpacing is not None and maxSpacing is not None
            and minSpacing > maxSpacing):
        raise forms.ValidationError(
            "Maximum spacing should be greater than min spacing")
    if (minimumGCContent is not None and maximumGCContent is not None
            and minimumGCContent > maximumGCContent):
        raise forms.ValidationError(
            "Minimum GC content is larger than maximum GC content")
def test_One(self):
    """
    Check that FastaFile parses the sample FASTA text.

    The sequences of the first two records are compared against their
    expected values, then the number of parsed records is checked.
    :return:
    """
    parsed = mc.FastaFile(self.sampleFastas, fileName=False)
    expectedSequences = ("AGAGATACATAGACAATGTGTTGCGTAGAGATAG", "TTTTGGAA")
    for position, expected in enumerate(expectedSequences):
        self.assertEqual(parsed[position].sequence, expected)
    self.assertEqual(len(parsed), len(expectedSequences))
import re
import os
import sys
import glob
import mycustom
import ushuffle
from Bio import SeqIO
import pdb

# Writes 100 shuffled copies of a FASTA file. Every record is shuffled with
# ushuffle.shuffle(sequence, len(sequence), 2) and written under its
# upper-cased name plus a "_control_bootstrap_<k>" suffix.

# path with the fasta file to be simulated
filed = "/nfs/compgen-04/team218/ilias/nullomers_hg38_v2/hg38.fa"
sequenceO = mycustom.FastaFile(filed)
sequencesL = [i.sequence.upper() for i in sequenceO]
names = [i.name.upper() for i in sequenceO]

# Number of simulations
for k in range(1, 101):
    outputPath = ("sims_genome_dinucleotide/hg38_bootstrap_number_" + str(k) +
                  "_controlling_dinucleotide_content.fasta")
    # "with" closes the file even if shuffling or writing fails part-way.
    with open(outputPath, "w") as datafile:
        for name, sequence in zip(names, sequencesL):
            seq_random = ushuffle.shuffle(sequence, len(sequence), 2)
            datafile.write(">" + name + '_control_bootstrap_' + str(k) + '\n')
            datafile.write(seq_random + '\n')
def _renderPart1Form(request, part1Form):
    """Re-render the Part 1 page with the given (bound) form."""
    context = {
        'part1form': part1Form,
        'boxes': ['restriction', 'adapter'],
    }
    return render(request, "iliasApp/part1.html", context)


def resultsView(request):
    """Build the MPRA output for a Part 1 form submission.

    Non-POST requests are redirected to ``iliasApp:ViewIndex``. An invalid
    form, or a submission without any sequence, re-renders the Part 1 page.
    Otherwise every motif combination is inserted into the (optionally
    reverse-complemented) background sequences, barcodes are generated and
    the result is returned either as a plain-text download (when the POST
    carries ``usingDownload``) or rendered through
    ``iliasApp/results.html``.
    """
    if request.method != "POST":
        return HttpResponseRedirect(
            urlresolvers.reverse(("iliasApp:ViewIndex")))

    part1Form = Part1Form(request.POST, request.FILES)
    if not part1Form.is_valid():
        return _renderPart1Form(request, part1Form)

    postDict = request.POST
    # The value is comma-terminated, so the last split element is dropped.
    # A missing value yields an empty ordering instead of an AttributeError.
    ordering = postDict.get('ordering', '').strip().split(",")[:-1]

    cleaned = part1Form.cleaned_data
    sequenceS = cleaned['sequenceS']
    sequenceF = cleaned['sequenceF']
    motifS = cleaned['motifS']
    reverseComplement = cleaned['reverseComplement']
    leftDistance = int(cleaned['leftDistance'])
    rightDistance = int(cleaned['rightDistance'])
    frequencyOfInsertion = int(cleaned['frequencyOfInsertion'])
    minSpacing = int(cleaned['minSpacing'])
    maxSpacing = int(cleaned['maxSpacing'])
    barCodeDistance = int(cleaned.get(
        'barCodeDistance')) if postDict.get('barCodeDistance') else None
    barCodeLength = int(cleaned.get(
        'barCodeLength')) if postDict.get('barCodeLength') else None
    # NOTE(review): the values below are taken raw from the POST data, not
    # from cleaned_data - confirm that part1.getBarCodes and
    # part1.createMPRAResultOutput expect strings.
    minimumGCContent = postDict.get('minimumGCContent')
    maximumGCContent = postDict.get('maximumGCContent')
    numOfBarCodesPerSequence = int(
        postDict['numOfBarCodesPerSequence']) if postDict.get(
            "numOfBarCodesPerSequence") else None
    restriction1 = postDict.get('restriction1')
    restriction2 = postDict.get('restriction2')
    adapter1 = postDict.get('adapter1')
    adapter2 = postDict.get('adapter2')

    # The uploaded file takes precedence over the typed sequence. Without
    # either there is nothing to build, so send the user back to the form
    # (this path used to fail on an unbound sequenceO).
    sequenceText = sequenceF or sequenceS
    if not sequenceText:
        part1Form.add_error(
            None, "Enter a sequence ! Either upload or enter in directly!")
        return _renderPart1Form(request, part1Form)

    motifO = mc.FastaFile(motifS, fileName=False)
    sequenceO = mc.FastaFile(sequenceText, fileName=False)

    motifsL = [motif.sequence for motif in motifO]
    allCombinations = part1.generateCombinations(motifsL)

    # TODO: only the first 20 sequences (truncated to 800 characters each)
    # are used for now. CHANGE THIS
    numOfSequencesToUse = 20
    backgroundSequencesL = [
        i.sequence[:800] for i in sequenceO[:numOfSequencesToUse]
    ]
    backgroundSequenceHeadersL = [
        i.name for i in sequenceO[:numOfSequencesToUse]
    ]

    # doing the reverse complement
    if reverseComplement:
        backgroundSequencesL = [
            myf.revcompl(backgroundSequence).lower()
            for backgroundSequence in backgroundSequencesL
        ]

    # getting the combinations
    finalOutput = []
    for header, backgroundSequence in zip(backgroundSequenceHeadersL,
                                          backgroundSequencesL):
        for combination in allCombinations:
            finalOutput += oligo.oligo(backgroundSequence, minSpacing,
                                       maxSpacing, combination, leftDistance,
                                       rightDistance, frequencyOfInsertion,
                                       header)

    # creating the barcodes. It can be a none value
    barCodes, numOfBarCodesPerSequence = part1.getBarCodes(
        barCodeLength, minimumGCContent, maximumGCContent,
        numOfBarCodesPerSequence, barCodeDistance, finalOutput)
    mpraOutput, sequenceHTMLL = part1.createMPRAResultOutput(
        finalOutput, numOfBarCodesPerSequence, barCodes, restriction1,
        restriction2, adapter1, adapter2, ordering)

    if postDict.get('usingDownload', False):
        response = HttpResponse(content_type="text/plain")
        response.write(mpraOutput)
        return response

    context = {
        'sequenceHTML': sequenceHTMLL,
        'forDownload': mpraOutput,
        'fileName': 'MPRA_Motif_results.txt',
    }
    return render(request, "iliasApp/results.html", context)
def part3RresultsView(request):
    """Apply the requested transformations to the submitted sequences.

    Non-POST requests are redirected to ``iliasApp:ViewPart3``. An invalid
    form, or a submission without any sequence, re-renders
    ``iliasApp/part3.html``. Otherwise each sequence is optionally
    scrambled, reversed and complemented (POST values ``scramble``,
    ``reverse`` and ``complement`` equal to ``"on"``), then mutated at
    ``numToMutate`` positions when that number is non-zero. The result is
    rendered through ``iliasApp/part3Results.html``.
    """
    if request.method != "POST":
        return HttpResponseRedirect(
            urlresolvers.reverse(("iliasApp:ViewPart3")))

    context = {}
    form = Part3Form(request.POST, request.FILES)
    if not form.is_valid():
        context['form'] = form
        return render(request, "iliasApp/part3.html", context)

    sequenceS = form.cleaned_data.get('sequenceS')
    sequenceF = form.cleaned_data.get('sequenceF')
    scrambleOption = request.POST.get('scramble')
    reverseOption = request.POST.get('reverse')
    compOption = request.POST.get('complement')
    # A missing value counts as "no mutation" instead of failing in int().
    rawNumToMutate = form.cleaned_data.get('numToMutate')
    numToMutate = int(rawNumToMutate) if rawNumToMutate else 0

    # The uploaded file takes precedence over the typed sequence. Without
    # either there is nothing to transform, so send the user back to the
    # form (this path used to fail on an unbound sequenceO).
    sequenceText = sequenceF or sequenceS
    if not sequenceText:
        form.add_error(
            None, "Enter a sequence ! Either upload or enter in directly!")
        context['form'] = form
        return render(request, "iliasApp/part3.html", context)

    sequenceO = mc.FastaFile(sequenceText, fileName=False)
    outputSequenceL = [i.sequence for i in sequenceO]

    scrambleHeader = "No"
    reverseHeader = "No"
    complementHeader = "No"
    if scrambleOption == "on":
        scrambleHeader = "Yes"
        outputSequenceL = [
            part3.scramble_motifs(seq) for seq in outputSequenceL
        ]
    if reverseOption == "on":
        reverseHeader = "Yes"
        outputSequenceL = [seq[::-1] for seq in outputSequenceL]
    if compOption == "on":
        complementHeader = "Yes"
        outputSequenceL = [myf.complement(seq) for seq in outputSequenceL]

    # Without mutations the plain and the HTML output are the same list.
    finalOutputSequenceL = outputSequenceL
    outputSequenceHTMLL = outputSequenceL
    if numToMutate:
        finalOutputSequenceL = []
        outputSequenceHTMLL = []
        for seq in outputSequenceL:
            mutatedString, positionMutated = part3.mutateString(
                seq, numToMutate)
            finalOutputSequenceL.append(mutatedString)
            outputSequenceHTMLL.append(
                myf.highlightString(mutatedString, positionMutated))

    # The description is the same for every record, so format it once.
    headerSuffix = (
        "| Mutated_nucleotides - %s | Scrambled - %s | Reversed - %s | Complemented - %s"
        % (numToMutate, scrambleHeader, reverseHeader, complementHeader))
    headers = [">" + seq.name + headerSuffix for seq in sequenceO]

    context['headers'] = headers
    # Materialised so the pairs can be iterated more than once on Python 3.
    context['zipped'] = list(zip(headers, outputSequenceHTMLL))

    # TODO: YOU NEED TO add new lines as the sequence will be displayed
    context['forDownload'] = "".join(
        header + '\n' + sequence + '\n'
        for header, sequence in zip(headers, finalOutputSequenceL))
    context['fileName'] = "Transmutation_results.txt"
    return render(request, "iliasApp/part3Results.html", context)
# 32 jobs ( 1 motif per job)
# The job number (first command-line argument, 1-based) selects which slice
# of the motif file this run handles.
jobNumber = int(sys.argv[1])
motifsPerJob = 1
lowerBound = (jobNumber - 1) * motifsPerJob
upperBound = jobNumber * motifsPerJob

# The genomes to scan for the motif occurrences, finds motifs in the plus
# orientation of the genome
files = glob.glob(
    "/lustre/scratch117/cellgen/team218/igs/properties/hg19/All_chr_hg19.fa")

# read the genomic files, in this case only the human genome hg19
for filed in files:
    # Single-argument print(...) calls give the same output on Python 2
    # and Python 3.
    print("This is the input argument: %s " % jobNumber)

    # reading the sequence file
    sequenceO = mycustom.FastaFile(filed)
    sequencesL = [i.sequence for i in sequenceO]
    del sequenceO

    # reading the motif file, provide the path to the motifs
    motifO = mycustom.FastaFile("polyN1.fa")
    motifsL = [i.sequence for i in motifO[lowerBound:upperBound]]
    del motifO

    # Two spaces before the value reproduce what the original
    # `print "...", value` statements emitted.
    print("lowerbound is  %s" % lowerBound)
    print("upperbound is  %s" % upperBound)

    # now finding the sequences
    result = Motif_combinatorics.findAllMotifAllSeqs(motifsL, sequencesL)
    # provide path to output here
'W': 'W', 'S': 'S', 'R': 'Y', 'Y': 'R' }[B] for B in x][::-1]) if 1 == 1: # provide path to the two json files generated for each strand json_file1 = "All_chr_hg19_di_same_strand_n1.json" json_file2 = "All_chr_hg19_di_opposite_strand_n1.json" # provide path to genome seqFileName1 = "/lustre/scratch117/cellgen/team218/igs/properties/hg19/All_chr_hg19.fa" seqFileName2 = "/lustre/scratch117/cellgen/team218/igs/properties/hg19/All_chr_hg19.fa" fastaO = mc.FastaFile(seqFileName1) SequencesL1 = [i.sequence for i in fastaO.getSequences()] NamesL1 = [i for i in fastaO.getNames()] sequence_length1 = len(SequencesL1[0]) nucs_seq1 = sequence_length1 * len(SequencesL1) fastaO = mc.FastaFile(seqFileName2) SequencesL2 = [i.sequence for i in fastaO.getSequences()] NamesL2 = [i for i in fastaO.getNames()] sequence_length2 = len(SequencesL2[1]) nucs_seq2 = sequence_length2 * len(SequencesL2) # provide file of polyN motifs (e.g. di-nucleotides or mono-nucleotides) motifO = mc.FastaFile("polyNs_di.fa") consensusL = [i.sequence for i in motifO.getSequences()] MotifsL = consensusL