def predict(self, inpseq, useCascade = True):
    """ Classify each symbol in a sequence.
        Return the predictions as a list of symbols. """
    W = self.nn1.ninput // len(self.inp_alpha)  # integer division so W can be used as an index offset
    if useCascade and self.cascade:
        nn1seq = self.predict(inpseq, useCascade = False)
        subseqs = slidewin(nn1seq, self.cascade)
        predsyms = ['C' for _ in range(len(inpseq))]  # use coil for positions in flanking regions
        for i in range(len(subseqs)):  # for each input sub-sequence of the primary NN
            input = numpy.zeros(self.cascade * len(self.outp_alpha))
            input[_onehotIndex(self.outp_alpha, subseqs[i])] = 1
            outvec = self.nn2.feedforward(input)
            d = prob.Distrib(self.outp_alpha)
            for k in range(len(outvec)):
                d.observe(self.outp_alpha[k], outvec[k])
            predsyms[i + self.cascade // 2] = d.getmax()  # use the symbol with the highest probability
        return sequence.Sequence(predsyms, self.outp_alpha)
    else:  # only predict using the first NN
        subseqs = slidewin(inpseq, W)
        predsyms = ['C' for _ in range(len(inpseq))]  # use coil for positions in flanking regions
        for i in range(len(subseqs)):  # for each input sub-sequence of the primary NN
            input = numpy.zeros(self.inp_len * len(self.inp_alpha))
            input[_onehotIndex(self.inp_alpha, subseqs[i])] = 1
            outvec = self.nn1.feedforward(input)
            d = prob.Distrib(self.outp_alpha)
            for k in range(len(outvec)):
                d.observe(self.outp_alpha[k], outvec[k])
            predsyms[i + W // 2] = d.getmax()  # use the symbol with the highest probability
        return sequence.Sequence(predsyms, self.outp_alpha)
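# Note: the helper _onehotIndex used above is not defined in this snippet. A minimal
# sketch, assuming it returns the flat indices of the "on" bits when one-hot encoding
# the symbol window 'sym' over alphabet 'alpha' (so the result can index the zeroed
# input vector directly):
import numpy

def _onehotIndex(alpha, sym):
    """ Hypothetical helper: flat one-hot indices for a window of symbols. """
    alphalen = len(alpha)
    # position i of the window contributes index i * |alphabet| + rank of its symbol
    return numpy.array([i * alphalen + alpha.index(sym[i]) for i in range(len(sym))])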
def sort_sequence(self, files):
    res = []
    currentSeq = sequence.Sequence()
    for file_item in files:
        name = sequence.SeqString(file_item.basename())
        sequenceSplit = False
        if not currentSeq.match(name, self.numPos):
            sequenceSplit = True
        if sequenceSplit:
            res.append(currentSeq)
            currentSeq = sequence.Sequence()
        currentSeq.append(name, file_item)
    if len(currentSeq) > 0:
        res.append(currentSeq)

    content_list = []
    for item in res:
        if len(item) <= 1:
            content_list.append(item[0])
        else:
            f = item[0]
            if isinstance(f, content_types.ImageFile):
                c = content_types.ImageSequence
            else:
                c = content_types.FileSequence
            content_item = c(dirname=f.dirname(),
                             sequence_object=item,
                             mimetype=f.mimetype())
            content_item.set_common_prefix(f.common_prefix())
            content_list.append(content_item)
    return content_list
def _backwardParsimony(self, aln, seq=None):
    """ Internal function that operates recursively to inspect scores to determine
        most parsimonious sequence, from root to leaves. """
    if self.sequence == None:  # no sequence has been assigned
        leftbuf = []
        rightbuf = []
        if self.left == None and self.right == None:  # no children, so terminal, cannot propagate scores
            raise RuntimeError("No sequence assigned to leaf node:", self.label)
        if seq == None:  # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
            currbuf = []
            for col in range(aln.alignlen):
                min_score = 999999
                min_symb = None
                left_symb = None
                right_symb = None
                for a_parent in range(len(aln.alphabet)):
                    if self.seqscores[col][a_parent] < min_score:
                        min_score = self.seqscores[col][a_parent]
                        min_symb = a_parent
                        left_symb = self.backleft[col][a_parent]
                        right_symb = self.backright[col][a_parent]
                currbuf.append(aln.alphabet[min_symb])
                leftbuf.append(aln.alphabet[left_symb])
                rightbuf.append(aln.alphabet[right_symb])
            self.sequence = sequence.Sequence(currbuf, aln.alphabet, self.label, gappy=True)
        else:  # Non-root, but not leaf
            self.sequence = seq
            col = 0
            for sym_parent in self.sequence:
                a_parent = aln.alphabet.index(sym_parent)
                left_symb = self.backleft[col][a_parent]
                right_symb = self.backright[col][a_parent]
                leftbuf.append(aln.alphabet[left_symb])
                rightbuf.append(aln.alphabet[right_symb])
                col += 1
        self.left._backwardParsimony(aln, sequence.Sequence(leftbuf, aln.alphabet, self.label, gappy=True))
        self.right._backwardParsimony(aln, sequence.Sequence(rightbuf, aln.alphabet, self.label, gappy=True))
    return self.sequence
def read(args):
    outputfile = output(args)
    orig_dict = {}
    if '.csv' in args.input:
        print("this is a CSV file")
        outputfile = outputfile + '.fa'
        with open(args.input, newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                orig_dict[row[0]] = row[1]
        seq_list = [sequence.Sequence(sequence=seq, name=seqname)
                    for seqname, seq in orig_dict.items()]
        sequence.writeFastaFile(outputfile, seq_list)
    elif '.tab' in args.input or '.tsv' in args.input:
        print("this is a TAB/TSV file")
        outputfile = outputfile + '.fa'
        with open(args.input) as tsv:
            for line in csv.reader(tsv, dialect="excel-tab"):
                orig_dict[line[0]] = line[1]
        seq_list = [sequence.Sequence(sequence=seq, name=seqname)
                    for seqname, seq in orig_dict.items()]
        sequence.writeFastaFile(outputfile, seq_list)
    elif '.fa' in args.input or '.fasta' in args.input:
        print("this is a FASTA file")
        outputfile = outputfile + '.csv'
        db100 = sequence.readFastaFile(args.input, sequence.Protein_Alphabet,
                                       ignore=True, parse_defline=False)
        with open(outputfile, 'w', newline='') as f:
            fieldnames = ['Name', 'Sequence']
            thewriter = csv.DictWriter(f, fieldnames=fieldnames)
            thewriter.writeheader()
            for seq in db100:
                s = ''.join(seq.sequence)
                thewriter.writerow({'Name': seq.name, 'Sequence': s})
def parse_sequences(raw_susceptible_file, raw_resistant_file):
    """ Parses paired raw sequence files into Sequence objects.

    Args:
        raw_susceptible_file: the filename where susceptible sequences are stored.
        raw_resistant_file: the filename where resistant sequences are stored.

    Returns:
        A list of Sequence objects.
    """
    print 'Parsing sequences.'
    sequences = []
    for susceptible, resistant in \
            zip(SeqIO.parse(raw_susceptible_file, "fasta"),
                SeqIO.parse(raw_resistant_file, "fasta")):
        sequences.append(sequence.Sequence(susceptible, resistant))
        print '.',
        sys.stdout.flush()
    print
    return sequences
def add_sequence(self, x, y):
    '''Add a sequence to the list, where x is the sequence of observations,
    and y is the sequence of states.'''
    num_seqs = len(self.seq_list)
    x_ids = [self.x_dict.get_label_id(name) for name in x]
    y_ids = [self.y_dict.get_label_id(name) for name in y]
    self.seq_list.append(seq.Sequence(self, x_ids, y_ids, num_seqs))
def KL_MC(self, p2, nb=100, lg=1000):
    """Compute the KL distance to Proportion p2 by Monte Carlo simulation
    over nb (default 100) sequences of length lg (default 1000)."""
    if nb <= 0:
        print "Too few sequences"
        return
    if lg <= 1:
        print "Too short sequences"
        return
    lx = lexique.Lexique()
    lx[1] = self.loglex(1)
    lx[2] = p2.loglex(2)
    g = sequence.Sequence()
    v = 0.0
    for i in range(nb):
        g.read_prop(self, long=lg)
        lv = lx.ls_evalue(g)
        v += lv[1] - lv[2]
    v /= nb
    return v / (lg - 1)
def KL_MC(self, lp2, nb=100, lg=1000):
    """Compute the Kullback-Leibler divergence to Lproportion lp2 by Monte Carlo
    simulation over nb (default 100) sequences of length lg (default 1000).
    """
    if nb <= 0:
        print "Too few sequences"
        return
    if lg <= 1:
        print "Too short sequences"
        return
    lx1 = lexique.Lexique()
    lx1.read_Lprop(self)
    lx2 = lexique.Lexique()
    lx2.read_Lprop(lp2)
    g = sequence.Sequence()
    p = partition.Partition()
    v = 0.0
    for i in range(nb):
        print i, '\r',
        sys.stdout.flush()
        g.read_Lprop(self, long=lg)
        p.viterbi(g, lx1)
        v += p.val()
        p.viterbi(g, lx2)
        v -= p.val()
    v /= nb
    return v / (lg - 1)
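# Both KL_MC variants above estimate the same quantity: sample sequences from the
# first model, score each sample under both models, and average the per-position
# log-likelihood difference. In the notation below (mine, not from the source),
# x_1 .. x_nb are the nb sampled sequences of length lg:
#
#   KL_estimate ~= (1 / (nb * (lg - 1))) * sum_i [ log P1(x_i) - log P2(x_i) ]
#
# which is what the loop accumulates in v before dividing by nb and (lg - 1).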
def allMotifs_fa(args):
    # check how many cols, check if all of them have a value
    # make a FASTA and make a CSV
    for i in range(len(args.input)):
        c = 0
        fasta = {}
        name = args.input[i]
        name = name.split('.')[0]
        name = name + '_reduced.fa'
        with open(args.input[i], newline='') as f:
            reader = csv.reader(f)
            header = next(reader)
            for row in reader:
                isEmpty = False
                for col in range(1, len(header) - 1):  # renamed from 'i' so the outer file index is not clobbered
                    if row[col] == "":
                        isEmpty = True
                        break
                if isEmpty == False:
                    fasta[row[0]] = row[len(header) - 1]
        seq_list = [sequence.Sequence(sequence=seq, name=seqname)
                    for seqname, seq in fasta.items()]
        sequence.writeFastaFile(name, seq_list)
        c += 1
        print(str(len(seq_list)) + " sequences kept after applying the requirements for " + name)
def _backwardParsimony(self, aln, seq=None):
    """ Internal function that operates recursively to inspect scores to determine
        most parsimonious sequence, from root to leaves. """
    if self.sequence == None:  # no sequence has been assigned
        childbuf = [[] for _ in range(self.nChildren())]
        if self.nChildren() == 0:  # no children, so terminal, cannot propagate scores
            raise RuntimeError("No sequence assigned to leaf node:", self.label)
        if seq == None:  # Only root can do this, no parents to consider, so we pick the lowest scoring symbol
            currbuf = []
            for col in range(aln.alignlen):
                min_score = 999999
                min_symb = None
                child_symb = [None for _ in range(self.nChildren())]
                for a_parent in range(len(aln.alphabet)):
                    if self.seqscores[col][a_parent] < min_score:
                        min_score = self.seqscores[col][a_parent]
                        min_symb = a_parent
                        for i in range(self.nChildren()):
                            child_symb[i] = self.backptr[i][col][a_parent]
                currbuf.append(aln.alphabet[min_symb])
                for i in range(self.nChildren()):
                    childbuf[i].append(aln.alphabet[child_symb[i]])
            self.sequence = sequence.Sequence(currbuf, aln.alphabet, self.label, gappy=True)
        else:  # Non-root, but not leaf
            self.sequence = seq
            col = 0
            for sym_parent in self.sequence:
                a_parent = aln.alphabet.index(sym_parent)
                child_symb = [None for _ in range(self.nChildren())]
                for i in range(self.nChildren()):
                    child_symb[i] = self.backptr[i][col][a_parent]
                    childbuf[i].append(aln.alphabet[child_symb[i]])  # append to the i-th child's buffer (was childbuf.append)
                col += 1
        for i in range(self.nChildren()):
            self.children[i]._backwardParsimony(aln, sequence.Sequence(childbuf[i], aln.alphabet, self.label, gappy=True))
    return self.sequence
def sample(hmm, observations):
    """
    Samples the given HMM a finite number of times (observations).
    Returns two sequences: the state path and the emission sequence.
    """
    random.seed()  # force reseeding
    state_path = seq.Sequence("State path", "")
    emission_sequence = seq.Sequence("Sequence", "")
    current_state = hmm.begin_state()
    for i in range(observations):
        current_state = current_state.sample_transition()
        if current_state.is_end():
            break
        state_path.append(current_state.short_name)
        emission_sequence.append(current_state.sample_emission())
    return alignment.Alignment(emission_sequence, state_path)
def run(channels, output, attributes, canvas, mask):
    seq = sequence.Sequence(attributes)
    seq.append(channels, mask, canvas, output['num_frames'], 0)
    seq.loop_to_beginning(output['num_loop_frames'])
    img = lapnorm.generate(seq, attributes, output, start_from=0, preview=False)
    return img
def dna_read_file(filename):
    gene = []
    with open(filename, 'r') as gene_a:  # load the pulses dna file
        lines_1 = gene_a.readlines()
        for line in lines_1:
            parts = line.split(',')
            gene.append(parts[0])
            genome_sequence = sequence.Sequence(parts[0], 'ACGT', 'ACTT', "genome_01.dat")
            all_bases = genome_sequence.all_base()
            print(f'The total number of base is: {all_bases}')
def read_sequences(self):
    sra = open(self.f)
    sra_lines = sra.readlines()
    c = 0
    for i in range(len(sra_lines)):
        if re.match(r'^>', sra_lines[i]):
            #s = dame_sig_linea()
            s_l = int(sra_lines[i].split()[2].split('=')[1])
            s = sra_lines[i+1]
            seq = sequence.Sequence(s, s_l)
            self.sequences.append(seq)
def __init__(self, label, *args, **kwargs):
    self.label = label
    self.protein_name = kwargs.get('protein_name', self.label)
    self.class_name = kwargs.get('class_name', self.label)
    self.struct_type = kwargs.get('struct_type', 'single')
    self._load_universe()
    self._update()
    self.structure_sequence = sequence.Sequence(self.protein)
    self.domains = sequence.SequenceDomain(self.structure_sequence)
    self._translation_vectors = []
def get_best_alignment(self):
    state_path = seq.Sequence("State path", "")
    current_cell = self.get_end_cell()
    score = current_cell.value
    if score > -INFINITY:
        current_cell = current_cell.parent
        while not current_cell.state.is_begin():
            state_path.append(current_cell.state.short_name)
            current_cell = current_cell.parent
        state_path.reverse()
    else:
        state_path = None
    return alignment.Alignment(self.sequence, state_path, score)
def posEqual_fa(args):
    fasta = {}
    name = args.input[0]
    name = name.split('.')[0]
    csvFile = name + '_reduced.csv'
    seqCol = 0
    with open(csvFile, newline='') as f:
        reader = csv.reader(f)
        header = next(reader)
        seqCol = len(header) - 1
        for row in reader:
            fasta[row[0]] = row[len(header) - 1]
    seq_list = [sequence.Sequence(sequence=seq, name=seqname)
                for seqname, seq in fasta.items()]
    sequence.writeFastaFile(name + '.fa', seq_list)
def generateSubsequence(sequence, itemK, itemList):
    string = getStringBetween(str(sequence), '<', '>')
    string = '{' + string[string.find(str(itemK)):len(string)]
    itemsetStrings = re.findall(REGEX_IS, string)
    # create sequence object
    sequenceObject = seq.Sequence()
    # create itemset objects and append to sequence object
    for itemset in itemsetStrings:
        tokens = itemset.split(',')
        intValues = [int(x) for x in tokens]
        # create a new ItemSet object
        temp = element.ItemSet()
        # add items and mis into itemset
        [temp.addItem(itemList.getItem(x)) for x in intValues]
        # add itemset to a sequence
        sequenceObject.addItemSet(temp)
    return sequenceObject
def call_samfile(self, samf):
    if type(samf) in (str, unicode):
        samf = Samfile(samf)
    for reference in samf.references:
        # stats
        self.coverage = {"max_coverage": None,
                         "min_coverage": None,
                         "avg_coverage": 0,
                         "column_count": 0}
        self.call_type_hist = collections.defaultdict(int)
        #
        pileup = samf.pileup(reference=reference)
        seq = self.call_pileup(pileup)
        name = "%s_consensus" % reference
        if self.coverage["column_count"]:
            self.coverage["avg_coverage"] = self.coverage["avg_coverage"] / float(self.coverage["column_count"])
        yield sequence.Sequence(seq, name=name)
def generate_sequence(nframes, npts, msm_noise=.02):
    np.random.seed(654)  # repeatability
    pt_radius = 1.
    R_pert = .02
    t_pert = .02

    # Generate points
    pts = (np.random.rand(npts, 3) * 2 - 1) * pt_radius

    # Generate cameras
    K = np.eye(3)
    R = np.eye(3)
    t = np.zeros(3)
    Rs = []
    ts = []
    measurements = []
    for i in range(nframes):
        Rs.append(R)
        ts.append(t)
        measurements.append(generate_measurements(K, R, t, pts, msm_noise))
        R = perturb_rotation(Rs[-1], R_pert)
        t = perturb_vector(ts[-1], t_pert)
    measurements = np.array(measurements)

    # Convert into tracks
    tracks = []
    for track_msms in np.transpose(measurements, (1, 0, 2)):
        tracks.append(sequence.Track(np.arange(nframes), track_msms))  # qualified as np.arange, matching the other numpy calls

    # Create the sequence
    seq = sequence.Sequence()
    seq.K = K
    seq.tracks = tracks
    seq.initial_Rs = Rs  # TODO: add noise
    seq.initial_ts = ts  # TODO: add noise
    seq.initial_xs = pts  # TODO: add noise
    return seq
def parseDataFile(dataFileName, misValueDictionary):
    # database object
    database = db.Database()
    dataFile = open(dataFileName, 'r')
    try:
        # for each line ending with newline char
        for line in dataFile.readlines():
            # get sequence string
            sequenceString = getStringBetween(line, '<', '>')
            # get all itemset strings
            itemsetStrings = re.findall('{([^{]*)}', sequenceString)
            # create sequence object
            sequenceObject = seq.Sequence()
            # create itemset objects and append to sequence object
            for itemset in itemsetStrings:
                tokens = itemset.split(',')
                intValues = [int(x) for x in tokens]
                # create a new ItemSet object
                temp = element.ItemSet()
                # add items and mis into itemset
                [temp.addItem(item.Item(x, misValueDictionary[x])) for x in intValues]
                # sort items based on mis values
                temp.sortItemSet()
                # add itemset to a sequence
                sequenceObject.addItemSet(temp)
            # append sequence to database
            database.sequenceList.append(sequenceObject)
    # else throw IOError
    except IOError as e:
        print(str(e))
        sys.exit(2)
    finally:
        dataFile.close()
    return database
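# For context, parseDataFile (and generateSubsequence above) assume each line of the
# data file encodes a sequence as '<...>' with brace-delimited itemsets of integer
# item IDs, e.g. '<{10,20}{30}{20,40}>'. A hypothetical call, assuming MIS values
# keyed by those IDs (the names and values below are illustrative only):
#
# mis_values = {10: 0.02, 20: 0.03, 30: 0.05, 40: 0.01}
# database = parseDataFile('data.txt', mis_values)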
def add_named_sequence(self, name):
    seq = sequence.Sequence(self.profile, name)
    seq.create_default_tracks()
    self.sequences.append(seq)
    self.next_seq_number += 1
test_element1.print_overview()
test_element2.print_overview()

#-------------------------continue-------------------------------
#-----------------------------------------------------------------
# viewing of the sequence for second check of timing etc
viewer.show_element_stlab(test_element1, delay=False, channels='all', ax=None)
viewer.show_element_stlab(test_element2, delay=False, channels='all', ax=None)
#-----------------------------------------------------------------

#-----------------------------------------------------------------
# now to send everything to the AWG, we perform the last step by putting
# everything into a sequence
seq = sequence.Sequence(sequence_name)
seq.append(name='first_element',
           wfname=sequence_name + '_element1',
           trigger_wait=True)
           #, goto_target='first_element', jump_target='first special element')
seq.append(name='second element',
           wfname=sequence_name + '_element1',
           trigger_wait=True)
           #, goto_target='third element', jump_target='second special element')

AWG.program_awg(seq, test_element1, test_element2, verbose=True)  #, test_element2)
AWG.AWGrun()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="FASTA file to query from", required=True)
    parser.add_argument("-q", "--query", help="Query FASTA file", required=True)
    parser.add_argument("-db", "--database", help="Database output file name", required=True)
    parser.add_argument("-r", "--reference", help="Reference database", default="uniprotkb")
    parser.add_argument("-o", "--output", help="Output path", default="matchmyseqs")
    args = parser.parse_args()

    seqDict = {}
    tier1seq = ''
    representative = ''
    fasta = {}
    seqsforCSV = {}
    progress = 0
    tier1 = {}
    tier1_annots = {}  # annotations that we want to include in the final dataset

    os.system('makeblastdb -dbtype prot -in ' + args.input + ' -out ' + args.database)

    db = sequence.readFastaFile(args.input, sequence.Protein_Alphabet,
                                ignore=True, parse_defline=False)
    db_map = {}        # map from "long" name to actual entry
    db_map_short = {}  # map from "short" name to entry
    for s in db:
        db_map[s.name] = s
        db_map_short[sequence.parseDefline(s.name)[0]] = s
    print("Database size is " + str(len(db_map)))

    print("Blast started, this might take a bit depending on your dataset size")
    os.system("blastp -db " + args.database +
              " -outfmt 3 -num_descriptions 1 -num_alignments 0 -query " +
              args.query + " -out query.txt")

    if args.reference == 'uniprotkb':
        os.system("grep -e \"^[st][pr]|\" query.txt | cut -d\' \' -f1 > UniProt_query.tab")
        # Extract the resulting sequence identifiers
        repSeqNames = set([])
        f = open('UniProt_query.tab', 'rt')
        for row in f:
            repSeqNames.add(sequence.parseDefline(row.strip())[0])
        f.close()
        print(str(len(repSeqNames)), " representative sequences have been found")

        # Annotate the representative sequences
        notfound = []
        for name in repSeqNames:
            if name in db_map_short:
                s = db_map_short[name]
                seqsforCSV[s.name] = "".join(s)
            else:
                notfound.append(name)
        print('Matched', len(repSeqNames) - len(notfound), 'of', len(repSeqNames))

        with open("query.txt", newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) > 0 and row[0].startswith('Query'):
                    querySeq = str(row).split("=")[1][:-2].strip()
                elif len(row) > 0 and (row[0].startswith('tr|') or row[0].startswith('sp|')):
                    representative = str(row).split(" ")[0][2:].strip()
                    seqDict[querySeq] = representative

    elif args.reference == 'refseq':
        grab = False
        repSeqNames = set([])
        with open("query.txt", newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) > 0 and row[0].startswith('Query'):
                    querySeq = str(row[0]).split("=")[1][:-2].strip().split(" ")[0]
                elif len(row) > 0 and row[0].startswith('Sequences'):
                    grab = True
                    continue
                elif grab == True:
                    if len(row) > 0 and not row[0].strip() == "":
                        representative = (row[0].split('.')[0] + "."
                                          + row[0].split('.')[1].split(" ")[0])
                        repSeqNames.add(representative)
                        seqDict[querySeq] = representative
                        grab = False

        #print(len(repSeqNames))
        notfound = []
        for name in repSeqNames:
            if name in db_map_short:
                s = db_map_short[name]
                seqsforCSV[s.name] = "".join(s)
            else:
                notfound.append(name)
        print('Matched', len(repSeqNames) - len(notfound), 'of', len(repSeqNames))
        print(len(repSeqNames), " representative sequences found for " + args.query)

    # done25 = False
    # done50 = False
    # done75 = False
    # for s, rep in seqDict.items():
    #     total = len(seqDict)
    #     seq = sequence.getSequence(rep, 'uniprotkb')
    #     seqsforCSV[rep] = str(seq).split(":")[1].strip()
    #     elem = rep + str(seq)
    #     progress += 1
    #     if (progress / total) * 100 > 25 and not done25:
    #         print("25% done")
    #         done25 = True
    #     elif (progress / total) * 100 > 50 and not done50:
    #         print("50% done")
    #         done50 = True
    #     elif (progress / total) * 100 > 75 and not done75:
    #         print("75% done")
    #         done75 = True

    faOut = args.output + '.fa'
    seq_list = [sequence.Sequence(sequence=seq, name=seqname)
                for seqname, seq in seqsforCSV.items()]
    sequence.writeFastaFile(faOut, seq_list)

    csvOut = args.output + '.csv'
    with open(csvOut, 'w', newline='') as f:
        fieldnames = ['Name', 'Representative', 'Sequence']
        thewriter = csv.DictWriter(f, fieldnames=fieldnames)
        thewriter.writeheader()
        for given, rep in seqDict.items():
            thewriter.writerow({'Name': given, 'Representative': rep, 'Sequence': seqsforCSV[rep]})
def _render_reverse_clip_dialog_callback(dialog, response_id, fb_widgets, media_file):
    if response_id == Gtk.ResponseType.ACCEPT:
        # speed, filename, folder
        speed = float(int(fb_widgets.hslider.get_value())) / 100.0
        file_name = fb_widgets.file_name.get_text()
        filenames = fb_widgets.out_folder.get_filenames()
        folder = filenames[0]
        write_file = folder + "/" + file_name + fb_widgets.extension_label.get_text()

        if os.path.exists(write_file):
            primary_txt = _("A File with given path exists!")
            secondary_txt = _("It is not allowed to render Motion Files with same paths as existing files.\nSelect another name for file.")
            dialogutils.warning_message(primary_txt, secondary_txt, dialog)
            return

        # Profile
        profile_index = fb_widgets.out_profile_combo.get_active()
        if profile_index == 0:
            # project_profile is first selection in combo box
            profile = PROJECT().profile
        else:
            profile = mltprofiles.get_profile_for_index(profile_index - 1)

        # Render consumer properties
        encoding_option_index = fb_widgets.encodings_cb.get_active()
        quality_option_index = fb_widgets.quality_cb.get_active()

        # Range
        range_selection = fb_widgets.render_range.get_active()

        dialog.destroy()

        # Create motion producer
        source_path = media_file.path
        if media_file.is_proxy_file == True:
            source_path = media_file.second_file_path

        motion_producer = mlt.Producer(profile, None, str("timewarp:" + str(speed) + ":" + str(source_path)))
        mltrefhold.hold_ref(motion_producer)

        # Create sequence and add motion producer into it
        seq = sequence.Sequence(profile)
        seq.create_default_tracks()
        track = seq.tracks[seq.first_video_index]
        track.append(motion_producer, 0, motion_producer.get_length() - 1)

        print "motion clip render starting..."

        consumer = renderconsumer.get_render_consumer_for_encoding_and_quality(write_file, profile, encoding_option_index, quality_option_index)

        # start and end frames
        start_frame = 0
        end_frame = motion_producer.get_length() - 1
        wait_for_producer_stop = True
        if range_selection == 1:
            start_frame = int(float(media_file.length - media_file.mark_out - 1) * (1.0 / -speed))
            end_frame = int(float(media_file.length - media_file.mark_out + (media_file.mark_out - media_file.mark_in) + 1) * (1.0 / -speed)) + int(1.0 / -speed)

            if end_frame > motion_producer.get_length() - 1:
                end_frame = motion_producer.get_length() - 1
            if start_frame < 0:
                start_frame = 0

            wait_for_producer_stop = False  # consumer won't stop automatically and needs to be stopped explicitly

        # Launch render
        global motion_renderer, motion_progress_update
        motion_renderer = renderconsumer.FileRenderPlayer(write_file, seq.tractor, consumer, start_frame, end_frame)
        motion_renderer.wait_for_producer_end_stop = wait_for_producer_stop
        motion_renderer.start()

        title = _("Rendering Reverse Clip")
        text = "<b>Motion Clip File: </b>" + write_file
        progress_bar = Gtk.ProgressBar()
        dialog = rendergui.clip_render_progress_dialog(_FB_render_stop, title, text, progress_bar, gui.editor_window.window)

        motion_progress_update = renderconsumer.ProgressWindowThread(dialog, progress_bar, motion_renderer, _REVERSE_render_stop)
        motion_progress_update.start()
    else:
        dialog.destroy()
        default=True,
        help="Specify whether reverse complementary patterns should be taken into account as well.",
    )

    return parser


if __name__ == "__main__":
    start_time = time.time()

    print("\nParsing input file...")
    parser = construct_argparser()
    args = parser.parse_args()
    contents = seq.parse_fasta_file(args.input_path)
    sequence = seq.Sequence(contents)
    print("\nSuccessfully parsed {} into a sequence of length {}.\n".format(
        args.input_path, sequence.length))

    print("\nDetermining the G-C skew minima...")
    skew = sequence.skew_graph()
    skew_minima = [str(minimum) for minimum in skew["Skew minima"]]
    skew_minima = " ".join(skew_minima)
    print("\nLocations of skew minima:\n\n{}\n".format(skew_minima))

    print("\nFinding clumps of patterns around the skew minima...")
    width = int(args.window_length / 2)
    dnaa_boxes = []
    for minimum in skew["Skew minima"]:
        min_skew = int(minimum)
        start_pos = (min_skew - width) if (min_skew >= width) else 0
def sequence(*steps): return s.Sequence(steps)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input FASTA file", required=True)
    parser.add_argument("-db", "--database", help="Database output file name", required=True)
    parser.add_argument("-r", "--redundancy", nargs='*', help="List of redundancy levels", default=[90, 80, 70])
    parser.add_argument("-t1", "--tier1", help="User's Tier1 sequences")
    parser.add_argument("-t2", "--tier2", help="User's Tier2 sequences")
    parser.add_argument("-ml", "--maxlength", help="Max length that the sequence can be", default=800)
    parser.add_argument("-e", "--eval", nargs='*', help="List of evalues",
                        default=[1e-100, 1e-75, 1e-50, 1e-20, 1e-10, 1e-5])
    args = parser.parse_args()

    tier2 = {}
    tier2_short = {}
    tier2_annots = {}  # annotations that we want to include in the final dataset

    if args.tier2:
        print("tier2 sequences have been provided")
        if '.fa' in args.tier2 or '.fasta' in args.tier2:
            print("tier2 sequences are FASTA file")
            tier2db = sequence.readFastaFile(args.tier2, sequence.Protein_Alphabet,
                                             ignore=True, parse_defline=False)
            print(str(len(tier2db)) + " sequences in tier2")  # count from tier2db; tier2_list is not defined until below
            tier2_list = {}        # map from "long" name to actual entry
            tier2_map_short = {}   # map from "short" name to entry
            for s in tier2db:
                tier2_list[s.name] = s
                tier2_map_short[sequence.parseDefline(s.name)[0]] = s
        else:
            print("Please provide FASTA file for tier-2")

    if args.tier1:
        tier1 = {}
        tier1_annots = {}  # annotations that we want to include in the final dataset
        print("Tier-1 sequences have been provided")
        if '.fa' in args.tier1 or '.fasta' in args.tier1:
            print("Tier-1 sequences are provided as a FASTA file")
            tier1db = sequence.readFastaFile(args.tier1, sequence.Protein_Alphabet,
                                             ignore=True, parse_defline=False)
            tier1_list = {}
            for s in tier1db:
                tier1_list[s.name] = "".join(s.sequence)
            print("Tier-1 has " + str(len(tier1_list)) + " sequences")
        else:
            print("Please provide FASTA file for tier-1")

    db100 = sequence.readFastaFile(args.input, sequence.Protein_Alphabet,
                                   ignore=True, parse_defline=False)
    db100_map = {}        # map from "long" name to actual entry
    db100_map_short = {}  # map from "short" name to entry
    for s in db100:
        db100_map[s.name] = s
        db100_map_short[sequence.parseDefline(s.name)[0]] = s
    print("Database has " + str(len(db100_map)) + " sequences")

    for rr in args.redundancy:
        rs = str(rr)
        os.system('cd-hit -i ' + args.input + ' -c 0.' + rs + ' -T 5 -o db' + rs + ' -d 0')

    selected = {}
    for rr in args.redundancy:
        selected[rr] = []
        filename = 'db' + str(rr) + '.clstr'
        clusters = readCDHIT(filename)
        for c in clusters:
            picked_one = False
            shortest = None
            reviewed = None
            for name in clusters[c]:
                if name in db100_map:
                    seq = db100_map[name]
                    if shortest:
                        if len(seq) < len(shortest) and not disqualified(seq, args):
                            shortest = seq
                    elif not disqualified(seq, args):
                        shortest = seq
                    if seq.name.startswith('sp|') and not disqualified(seq, args):
                        reviewed = seq
                    if name in tier1_list:
                        #print("this one orig" + str(seq))
                        selected[rr].append(seq)
                        picked_one = True
                else:
                    pass  #print('Did not find', name)
            # If no Tier-1, prefer "reviewed", then shortest length
            if not picked_one and reviewed:
                selected[rr].append(reviewed)
            elif not picked_one and shortest:
                selected[rr].append(shortest)

    for rr in args.redundancy:
        filename = 'db' + str(rr) + '.fa'
        sequence.writeFastaFile(filename, selected[rr])

    for rr in args.redundancy:
        os.system('makeblastdb -dbtype prot -in db' + str(rr) + '.fa -out db-' + str(rr))

    # for rr in args.redundancy:
    #     for evalue in args.evalue:
    #         result_file = "dataset-" + str(rr) + '-' + str(evalue)
    #         cmd1 = "blastp -db db-" + str(rr) + " -outfmt 3 -num_descriptions 20000 -num_alignments 0 -num_threads 5 -query " + args.tier1 + " -out " + result_file + ".txt -evalue " + str(evalue)
    #         print(cmd1)
    #         os.system(cmd1)

    grab = False
    for rr in args.redundancy:
        for evalue in args.eval:
            c = 0
            tpsIdentifier = set([])
            seqs = []
            result_file = "dataset-" + str(rr) + '-' + str(evalue)
            f = open(result_file + '.txt', 'rt')
            for row in f:
                if row.startswith('Sequences'):
                    grab = True
                    continue
                if grab == True:
                    if row.startswith('Lambda'):
                        grab = False
                    if not row.strip() == "":
                        identifier = row.split(' ')[0]
                        if identifier != "Lambda":
                            tpsIdentifier.add(identifier)
            for name in tpsIdentifier:
                try:
                    seq = db100_map[name]
                    info = ''
                    seqs.append(sequence.Sequence(seq.sequence, seq.alphabet, seq.name, info))
                except:
                    pass
            sequence.writeFastaFile(result_file + ".fa", seqs)
            print(result_file + " has " + str(len(seqs)) + " sequences")
    print('Done')

    totalSeqCount = []
    c = 0
    for evalue in args.eval:
        for rr in args.redundancy:
            output = []
            ev = str(evalue)
            ev = ev[1:]
            red = str(rr)
            result_file = "dataset-" + str(rr) + '-' + str(evalue)
            a = sequence.readFastaFile(result_file + '.fa', sequence.Protein_Alphabet,
                                       ignore=True, parse_defline=False)
            names = set([])
            for s in a:
                names.add(s.name)
            tier1_cnt = 0
            tier2_cnt = 0
            seqs = []
            for name in names:
                try:
                    seq = db100_map[name]
                    info = ''
                    if name in tier1_list:
                        tier1_cnt += 1
                        #info = seq.info + ' ' + tier1_annots[name]
                    elif name in tier2:
                        tier2_cnt += 1
                        #info = seq.info + ' ' + tier2_annots[name]
                    seqs.append(sequence.Sequence(seq.sequence, seq.alphabet, seq.name, info))
                except:
                    pass  #print('Did not find', name)
            print('Processed', len(seqs), 'for', result_file, ' Tier-1:', tier1_cnt, ' Tier-2:', tier2_cnt)
            output = [ev, red, len(seqs)]
            totalSeqCount.append(output)

    plotSeqs(totalSeqCount)
def setup_AWG_pulsed_spec_sequence(sequence_name='Cool_Sequence',
                                   measurement_trigger_delay=2e-6,
                                   SSB_modulation_frequency=-50e6,
                                   measurement_pulse_length=10e-6,
                                   cooling_pulse_length=200e-6,
                                   cooling_measurement_delay=5e-6,
                                   buffer_pulse_length=2.e-6,
                                   readout_trigger_length=1.0e-6,
                                   measurement_pulse_amp=0.5,
                                   doplot=True,
                                   devAWG=Tektronix_AWG520(name='AWG'),
                                   us_clock=True,
                                   trigger_first=False):
    '''
    Makes the AWG single-element sequences for the cooling experiment.
    Each element contains a cooling pulse, a readout trigger and a readout pulse.
    The readout trigger is the fixpoint, as it defines the timing we see on the
    signal analyzer. The readout pulse is defined with the IQ modulation of a
    vector source. The cooling pulse is a marker to a microwave switch.
    There is some funky stuff happening if there are no buffers around the
    sequence, therefore we have buffer pulses at the beginning and end such
    that the channels are zero there!
    '''
    if us_clock is True:
        measurement_trigger_delay = measurement_trigger_delay * 1e-3
        SSB_modulation_frequency = SSB_modulation_frequency * 1e-3
        measurement_pulse_length = measurement_pulse_length * 1e-3
        cooling_measurement_delay = cooling_measurement_delay * 1e-3
        cooling_pulse_length = cooling_pulse_length * 1e-3
        buffer_pulse_length = buffer_pulse_length * 1e-3
        readout_trigger_length = 1 * readout_trigger_length * 1e-3

    if trigger_first is True:
        left_reference_pulse_name = 'readout trigger'
    else:
        left_reference_pulse_name = 'pulsed spec'

    AWG = AWG_station.AWG_Station()
    AWG.AWG = devAWG

    clock = devAWG.get_clock()
    devAWG.set_run_mode('ENH')
    devAWG.set_refclock_ext()

    AWG.define_channels(id='ch1', name='RF1', type='analog',
                        high=0.541, low=-0.541, offset=0., delay=0, active=True)
    AWG.define_channels(id='ch2', name='RF2', type='analog',
                        high=0.541, low=-0.541, offset=0., delay=0, active=True)
    AWG.define_channels(id='ch2_marker1', name='MW_pulsemod', type='marker',
                        high=1.0, low=0, offset=0., delay=0, active=True)
    AWG.define_channels(id='ch1_marker1', name='readout_trigger', type='marker',
                        high=1, low=0, offset=0., delay=0, active=True)

    sin_pulse = pulse.CosPulse(channel='RF1', name='A sine pulse on RF')
    sin_pulse_2 = pulse.CosPulse(channel='RF2', name='A sine pulse on RF')
    SSB_pulse = pulse.MW_IQmod_pulse(I_channel='RF1', Q_channel='RF2', name='SSB pulse')
    pulsed_spec_pulse = pulse.SquarePulse(channel='MW_pulsemod', name='A square pulse on MW pmod')
    readout_trigger_pulse = pulse.SquarePulse(channel='readout_trigger', name='A square pulse on MW pmod')
    sq_pulse_ch1 = pulse.SquarePulse(channel='RF1', name='A square pulse on MW pmod')
    sq_pulse_ch2 = pulse.SquarePulse(channel='RF2', name='A square pulse on MW pmod')

    test_element1 = element.Element((sequence_name + '_element1'), pulsar=AWG)  #, ignore_offset_correction=True)
    test_element2 = element.Element((sequence_name + '_element2'), pulsar=AWG)  #, ignore_offset_correction=True)

    test_element1.add(pulse.cp(readout_trigger_pulse, amplitude=1., length=readout_trigger_length),
                      start=0.1e-6, name='readout trigger', refpoint='start')
    test_element1.add(pulse.cp(SSB_pulse, mod_frequency=SSB_modulation_frequency,
                               amplitude=measurement_pulse_amp, length=measurement_pulse_length),
                      start=measurement_trigger_delay, name='readout pulse',
                      refpulse='readout trigger', refpoint='start')
    test_element1.add(pulse.cp(pulsed_spec_pulse, amplitude=1., length=cooling_pulse_length),
                      start=-1 * cooling_measurement_delay - cooling_pulse_length,
                      name='pulsed spec', refpulse='readout pulse', refpoint='start')
    test_element1.add(pulse.cp(readout_trigger_pulse, amplitude=0., length=buffer_pulse_length),
                      start=-1 * buffer_pulse_length, name='buffer left',
                      refpulse=left_reference_pulse_name, refpoint='start')
    test_element1.add(pulse.cp(readout_trigger_pulse, amplitude=0., length=buffer_pulse_length),
                      start=0, name='buffer right', refpulse='readout pulse', refpoint='end')

    test_element2.add(pulse.cp(readout_trigger_pulse, amplitude=1., length=readout_trigger_length),
                      start=0.1e-6, name='readout trigger', refpoint='start')
    test_element2.add(pulse.cp(SSB_pulse, mod_frequency=SSB_modulation_frequency,
                               amplitude=measurement_pulse_amp, length=measurement_pulse_length),
                      start=measurement_trigger_delay, name='readout pulse',
                      refpulse='readout trigger', refpoint='start')
    test_element2.add(pulse.cp(pulsed_spec_pulse, amplitude=1., length=cooling_pulse_length),
                      start=-1 * cooling_measurement_delay - cooling_pulse_length,
                      name='pulsed spec', refpulse='readout pulse', refpoint='start')
    test_element2.add(pulse.cp(readout_trigger_pulse, amplitude=0., length=buffer_pulse_length),
                      start=-1 * buffer_pulse_length, name='buffer left',
                      refpulse=left_reference_pulse_name, refpoint='start')
    test_element2.add(pulse.cp(readout_trigger_pulse, amplitude=0., length=buffer_pulse_length),
                      start=0, name='buffer right', refpulse='readout pulse', refpoint='end')

    #print('Channel definitions: ')
    #test_element1.print_overview()
    #test_element2.print_overview()

    # -------------------------continue-------------------------------
    # -----------------------------------------------------------------
    # viewing of the sequence for second check of timing etc
    if doplot is True:
        viewer.show_element_stlab(test_element1, delay=False, channels='all', ax=None)
        viewer.show_element_stlab(test_element2, delay=False, channels='all', ax=None)
    # -----------------------------------------------------------------

    devAWG.init_dir()
    devAWG.clear_waveforms()

    seq = sequence.Sequence(sequence_name)
    seq.append(name='first_element', wfname=(sequence_name + '_element1'),
               trigger_wait=True, goto_target='second_element')  # target matches the element name defined below
    seq.append(name='second_element', wfname=(sequence_name + '_element2'),
               trigger_wait=True, goto_target='first_element')

    AWG.program_awg(seq, test_element1, test_element2, verbose=True)  #, test_element2)
def __init__(self, tx):
    self.txid = txid.Txid(tx)
    self.vout = vout.Vout(tx)
    self.script_sig_size = script_sig_size.ScriptSigSize(tx)
    self.script_sig = script_sig.ScriptSig(tx, self)
    self.sequence = sequence.Sequence(tx)