def getSequence():
    """-----------------------------------------------------------------------------------------
    Store an uploaded sequence file in the application state and render the dashboard.

    On a POST request carrying 'file1' or 'file2' the uploaded file is read as a Fasta
    sequence and stored in state['seq'][0] or state['seq'][1] respectively, with status
    'loaded'. Uploading file1 also sets the status of the second sequence to 'next'.
    A request without an uploaded file leaves the stored sequences unchanged.

    state['params']['seqtype'] is set to 'DNA' when both sequences are loaded and both report
    isACGT(), otherwise to 'protein'.

    :return: the rendered 'dashboard.html' template
    -----------------------------------------------------------------------------------------"""
    uploaded = None
    sequence = None
    if request.method == 'POST':
        if 'file1' in request.files:
            uploaded = request.files['file1']
            sequence = 0
            state['seq'][1]['status'] = 'next'
        elif 'file2' in request.files:
            uploaded = request.files['file2']
            sequence = 1

    # only read a sequence when a file was actually sent; without this guard the upload
    # handle and the sequence index would be undefined
    if uploaded is not None:
        fasta = Fasta(fh=uploaded)
        fasta.read()
        print(fasta.format())
        seq = state['seq'][sequence]
        seq['fasta'] = fasta
        seq['status'] = 'loaded'

    # if both sequences have been selected, check whether the sequences are DNA or protein
    state['params']['seqtype'] = 'protein'
    first, second = state['seq'][0], state['seq'][1]
    # status strings are compared by value; identity ('is') on strings is not reliable
    if first['status'] == 'loaded' and second['status'] == 'loaded':
        if first['fasta'].isACGT() and second['fasta'].isACGT():
            state['params']['seqtype'] = 'DNA'

    return render_template('dashboard.html', state=state)
def write_as_fasta(self, fh, n=None):
    """-----------------------------------------------------------------------------------------
    Write to a file in fasta format, if n is defined, write only the specified ORF in the list.
    Each record is followed by a newline. Nothing is written when n is not smaller than the
    number of ORFs.

    :param fh, open filehandle for writing
    :param n: integer, index of ORF to write, write all if not specified
    :return: integer, number of ORFs written
    -----------------------------------------------------------------------------------------"""
    if n is None:
        # print all ORFS
        selected = self.orf
    elif n < len(self.orf):
        # print the selected ORF
        selected = [self.orf[n]]
    else:
        selected = []

    # a single Fasta object is reused as the formatter for every record
    fasta = Fasta()
    nwritten = 0
    for orf in selected:
        fasta.id = orf['id']
        fasta.doc = 'len={} strand={} frame={} begin={} end={}'.format(
            orf['length'], orf['direction'], orf['frame'], orf['begin'], orf['end'])
        fasta.seq = orf['sequence']
        fh.write(fasta.format(linelen=60))
        fh.write('\n')
        nwritten += 1

    return nwritten
def __init__(self):
    """-----------------------------------------------------------------------------------------
    Constructor. Starts with two empty Fasta objects, two empty lists, a new Score object,
    undefined smat/pmat, and zero gap penalties.
    -----------------------------------------------------------------------------------------"""
    # the two sequences and their companion lists
    self.s1, self.s2 = Fasta(), Fasta()
    self.i1, self.i2 = [], []

    # scoring object and matrices (matrices are not allocated here)
    self.score = Score()
    self.smat = self.pmat = None

    # gap penalties: gi is length independent, gd is length dependent
    self.gi = self.gd = 0
def __init__(self):
    """-----------------------------------------------------------------------------------------
    Diagonal class constructor. Runs the Score initializer, then sets the defaults used for
    score calculation, data frames, plotting, and the two sequences.

    diagonal:   one diagonal of scores
    yinc:       direction of y axis, 1 or -1 means forward or reverse respectively
    window:     length of window for calculation
    threshold:  minimum value for window to be plotted
    s1, s2:     Fasta objects, s1 is horizontal, s2 is vertical
    i1, i2:     integer array representation of the sequences
    seqreverse: only applies to s2
    -----------------------------------------------------------------------------------------"""
    Score.__init__(self)

    # score calculation state
    self.diagonal = []
    self.single = False
    self.yinc = 1
    self.threshold = self.window = 0
    self.nscore = self.nrun = 0

    # data frames, and the functions for populating them
    self.frame = {}
    self.function = {}

    # plotting variables; sizes of panels are defined in setupBokeh()
    self.title = ''
    self.figure = {}
    self.grid = self.palette = self.cmap = None
    self.alpha = 0.5
    self.mindotsize, self.maxdotsize = 2, 10

    # sequences with their integer forms and lengths
    self.s1, self.s2 = Fasta(), Fasta()
    self.i1 = self.i2 = None
    self.l1 = self.l2 = 0
    self.seqreverse = False
def find(self, direction='+', frame=0, minlen=0, includeseq=False):
    """-----------------------------------------------------------------------------------------
    Find the open reading frames in a specific frame and direction. For the reverse complement,
    the coordinates are in terms of the reversed sequence.

    A reading frame runs from the codon after the previous stop codon (or from frame) up to,
    but not including, the next stop codon. The stretch after the last stop codon is handled
    the same way once the end of the sequence is reached.

    :param direction: string, '+' or '-'
    :param frame: int, 0 - 2
    :param minlen: int, only save if the reading frame is at least minlen letters long
    :param includeseq: boolean, if true, include the sequence in the identified reading frames
    :return: int, number of reading frames counted; frames shorter than minlen are counted
             but not saved in self.rflist
    -----------------------------------------------------------------------------------------"""
    seq = self.sequence
    if direction == '-':
        seq = Fasta.reverseComplement(self.sequence)

    def record(begin, end, shortest):
        """Count the frame begin:end if longer than shortest, save it if at least minlen."""
        length = end - begin
        if length <= shortest:
            return 0
        if length >= minlen:
            rf = {'direction': direction, 'frame': frame, 'begin': begin, 'end': end}
            if includeseq:
                rf['seq'] = seq[begin:end]
            self.rflist.append(rf)
        return 1

    nrf = 0
    pos = frame
    begin = pos
    last = len(seq) - 2
    while pos < last:
        if seq[pos:pos + 3] in Orf.stop:
            # end of an ORF
            # NOTE(review): frames closed by a stop codon must be longer than 3 letters, the
            # trailing frame below only longer than 2; both limits are kept as they were,
            # confirm the difference is intended
            nrf += record(begin, pos, 3)
            begin = pos + 3
        pos += 3

    # stretch between the last stop codon and the end of the sequence
    nrf += record(begin, pos, 2)

    return nrf
def findall(self, minlen=0, includeseq=False):
    """-----------------------------------------------------------------------------------------
    Find orfs in all six reading frames by calling find() for both directions and frames 0 - 2.

    :param minlen: int, passed to find(), only save if the orf is longer than minlen
    :param includeseq: boolean, if true, include the sequence in the identified reading frames
    :return: int, sum of the counts returned by find()
    -----------------------------------------------------------------------------------------"""
    nrf = 0
    # the sequence is not touched here: find() is given the direction and works from it
    for direction in ('+', '-'):
        for frame in range(3):
            nrf += self.find(direction=direction, frame=frame, minlen=minlen,
                             includeseq=includeseq)

    return nrf
scoremin = score[0] scoremax = score[0] for s in score: scoremin = min(scoremin, s) scoremax = max(scoremax, s) return scoremin, scoremax # -------------------------------------------------------------------------------------------------- # Testing # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': match = Diagonal() fasta = Fasta(filename=sys.argv[1]) fasta.read() fasta1 = fasta.copy() fasta2 = fasta.copy() fasta2.id = 'seq 2' fasta2.doc = 'Sequence 2' fasta1.seq = fasta1.seq[:200] fasta2.seq = fasta2.seq[:400] dataframes = [{ 'data': 'dots', 'fn': match.windowThreshold, 'var': ['x', 'y', 'score'] }, {
snplist, maxpos = read_snps_tabular(snp_file) # this version reads gff format # snp_file = 'C:/Users/michael/Desktop/apple/GDDH13_1-1_SNPs.gff3' # snplist, maxpos = read_snps_gff(snp_file) for chr in snplist: print('{} {} snps max: {}'.format(chr, len(snplist[chr]), maxpos[chr])) print() # output file from command line out = open(sys.argv[1], 'w') # read genome and match, one sequence at a time fastafile = 'C:/Users/michael/Desktop/apple/GDDH13_1-1_formatted.fasta' fasta = Fasta() fasta.open(fastafile) pad = 25 window = 2 * pad + 1 bases = 0 seqlen = {} sequence = '' seqbegin = 0 seqend = 0 snpcount = 0 for line in fasta.fh: line = line.rstrip() if line.startswith('>'): try:
# -------------------------------------------------------------------------------------------------- # Testing # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': import sys from sequence.fasta import Fasta from plotter import Plotter match = Windowmatch() print('done {}'.format(type(match))) print(match.alphabet) # match.readNCBI('table/NUC4.4.matrix') print(match.format()) fasta1 = Fasta(filename=sys.argv[1]) fasta1.read() fasta2 = Fasta() fasta2.id = 'seq2' fasta2.doc = ' bases 1:50' fasta2.seq = fasta1.seq[:50] fasta1.seq = fasta1.seq[:200] match.s1 = fasta1 match.s2 = fasta2 l1, l2 = match.seqToInt() print(l1, l2) match.window = 10
reformat the output from the apc.pl de-circularization program. Input is two lines, idline and sequence Output is 100 letters/line usage fasta_reformat.py *.fasta ---------------------------------------------------------------------------------------------------------------------""" import glob import sys from sequence.fasta import Fasta linelen = 100 # default target file name target = '*.fasta' if len(sys.argv) > 1: target = sys.argv[1] print(' target file:', target) for fastafile in glob.glob(target): # output file outfile = fastafile + '.reformatted' out = open(outfile, 'w') print(' input file:', fastafile, ' output file:', outfile) fasta = Fasta() fasta.open(fastafile) while fasta.next(): fasta.doc = ' len={}'.format(fasta.length()) out.write(fasta.format(linelen=linelen))
"""--------------------------------------------------------------------------------------------------------------------- Remove the Trinity path information from the id line usage fasta_reformat.py *.fasta ---------------------------------------------------------------------------------------------------------------------""" import glob import sys import re from sequence.fasta import Fasta linelen = 60 # default target file name target = '*.fasta' if len(sys.argv) > 1: target = sys.argv[1] print(' target file:', target) for fastafile in glob.glob(target): # output file outfile = fastafile + '.reformatted' out = open(outfile, 'w') print(' input file:', fastafile, ' output file:', outfile) fasta = Fasta() fasta.open(fastafile) while fasta.next(): fasta.doc = re.sub(r' path=\[[^]]+\]', '', fasta.doc) out.write(fasta.format(linelen=linelen))
nrf += self.find(direction=direction, frame=frame, minlen=minlen, includeseq=includeseq) return nrf # -------------------------------------------------------------------------------------------------- # Testing # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': orf = Orf() orf.sequence = 'TAAATGATGTGACCCTCACCGTGA' print(orf.sequence) nrf = orf.findall(includeseq=True) print(f'{nrf} reading frames found') for i in range(nrf): rf = orf.rflist[i] s = orf.sequence if rf['direction'] == '-': s = Fasta.reverseComplement(orf.sequence) begin = rf["begin"] end = rf["end"] # print(f'f:{rf["frame"]}{rf["direction"]}\tbegin:{begin:4d}\tend:{end:4d}\t{s[begin:end]}') print(f'f:{rf["frame"]}{rf["direction"]}\tbegin:{begin:4d}\tend:{end:4d}\t{rf["seq"]}') exit(0)
'''
Creates an empty Fasta object named trinity.
'''
from sequence.fasta import Fasta

# Fasta object created with no arguments; nothing is read or opened here
trinity = Fasta()
# --------------------------------------------------------------------------------------------------
# main
# Opens the GTF file named by the first command line argument, reads every sequence of the
# Fasta file named by the second argument into seq (id -> sequence), and reports what was
# read on stderr.
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # open files
    gtffile = sys.argv[1]
    try:
        gtf = open(gtffile, 'r')
    except OSError:
        # only a failure to open the file is reported here; any other error (including
        # KeyboardInterrupt) is no longer swallowed
        sys.stderr.write('Unable to open GTF file ({})\n'.format(gtffile))
        sys.exit(1)

    # read all sequences into memory, indexed by sequence id
    seq = {}
    fasta = Fasta()
    fasta.open(sys.argv[2])
    sys.stderr.write('Reading Fasta {}...\n'.format(sys.argv[2]))
    nseq = 0
    while fasta.next():
        seq[fasta.id] = fasta.seq
        nseq += 1

    # summary of the sequences read
    sys.stderr.write('\n{} Sequences read from {}\n'.format(nseq, sys.argv[2]))
    for s in seq:
        sys.stderr.write('\t{} len={}\n'.format(s, len(seq[s])))

    # run parameters
    sys.stderr.write('\ngtf2fasta\n')
    sys.stderr.write('\tGTF: {}\n'.format(gtffile))
    sys.stderr.write('\tFasta: {}\n'.format(sys.argv[2]))
return True # -------------------------------------------------------------------------------------------------- # Testing # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': from wordmatch import Match # for testing only from sequence.fasta import Fasta print('\ntest 1: identity matching, unequal length sequences') print('\texpect 11 matches\n') match = Match() fasta1 = Fasta() fasta1.id = 'test0.1' fasta1.doc = '5 letter DNA test' fasta1.seq = 'ACAGT' match.s1 = fasta1 fasta2 = Fasta() fasta2.id = 'test0.2' fasta2.doc = '7 letter DNA test' fasta2.seq = 'ACAGTAA' match.s2 = fasta2 nmatch = match.identity() plot = Plotter() plot.match = match
"phams":["56154"], "Start":15822, "Stop":16230, "Length":408, "Name":"24", "translation":"MTNVFTLDAMREETRKKYQPVKIGLSEDVTVELKPLLKLGKKAREAVADAVKEIEALPDEIDEDDEDSDELMDEVAEKICESIAKVFKLIATSPRKLLAELDTEEEPQIRAELYGAVLRTWMRET QLGEAAPSPN", "Orientation":"F", "Notes":"b'tail assembly chaperone'"} ... Michael Gribskov 10 April 2021 =================================================================================================""" import sys import json from sequence.fasta import Fasta # -------------------------------------------------------------------------------------------------- # main program # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': fp = open(sys.argv[1], 'r') phage = json.load(fp) for gene in phage['results']: f = Fasta() f.id = gene['GeneID'] f.seq = gene['translation'] f.doc = gene['Notes'][2:-1] print(f.format(linelen=100)) exit(0)
return the row and col corresponding to cell n :param l1: int, length of sequence 1 (col) :param n: int, cell n :return: int, int; row, col -----------------------------------------------------------------------------------------""" return (n - 1) // l1, (n - 1) % l1 # -------------------------------------------------------------------------------------------------- # testing # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': align = Alignment() align.s1 = Fasta() align.s2 = Fasta() # align.alphabet = 'ACGT' # align.identity(pos=3, neg=-3) # align.s1.seq = 'ACTTATCTTAT' # align.s1.seq = 'TATTCTATTCA' # align.s1.seq = 'TGGTATACTAT' # align.s1.seq = 'GATACTATCTA' # align.s2.seq = 'AGTATCATATT' # align.s2.seq = 'TTATACTATGG' # align.s2.seq = 'TACTATTTAGAT' # align.s2.seq = 'TTATACTATGA'
class Diagonal(Score, Fasta): """============================================================================================= =============================================================================================""" def __init__(self): """----------------------------------------------------------------------------------------- Diagonal class constructor. Subclass of Score Delegates to Fasta via self.s1 and self.s2 Delegates to pyplot via self.fig diagonal: one diagonal of scores yinc: direction of y axis, 1 or -1 means forward or reverse respectively window: length of window for calculation threshold: minimum value for window to be plotted -----------------------------------------------------------------------------------------""" Score.__init__(self) self.diagonal = [] self.single = False self.yinc = 1 self.threshold = 0 self.window = 0 self.nscore = 0 self.nrun = 0 self.frame = {} # data frames self.function = {} # functions for populating data frames # Plotting variables # sizes of panels are defined in setupBokeh() self.title = '' self.figure = {} self.grid = None self.palette = None self.cmap = None self.alpha = 0.5 self.mindotsize = 2 self.maxdotsize = 10 # sequences, s1 is horizontal, s2 is vertical self.s1 = Fasta() self.s2 = Fasta() self.i1 = None # integer array representation of sequences self.i2 = None self.l1 = 0 self.l2 = 0 self.seqreverse = False # only applies to s2 def setupCalculation(self, seq1, seq2, window=5, threshold=3, resetstat=True): """----------------------------------------------------------------------------------------- Load the sequences and do some basic setup for score calculations. Sequences are passed as Fasta object to make it easier to use multi fasta files. 
:param seq1: Fasta object :param seq2: Fasta object :param window: int, length of window for calculation :param threshold: float, minimum score in window to plot :param resetstat: boolean, if False, reset score and run counts to zero :return: True -----------------------------------------------------------------------------------------""" # sequence setup self.s1 = seq1 self.s2 = seq2 self.l1 = len(seq1.seq) self.l2 = len(seq2.seq) # move shorter sequence to s2 if necessary if self.l1 < self.l2: # shorter sequence is always s2 self.s1, self.s2 = self.s2, self.s1 self.l1, self.l2 = self.l2, self.l1 # reverse sequence 2 if necessary yinc = 1 if self.seqreverse: self.s2.seq = self.s2.reverseComplement() self.yinc = -1 # setup integer array version of sequence self.seqToInt() self.diagonal = [0 for _ in range(min(self.l1, self.l2))] self.window = window self.threshold = threshold # stat() histograms. nrun is always positive if resetstat: self.nscore = 0 self.nrun = 0 for frame in self.frame: self.resetFrame(frame) return True def setupBokeh(self, cbase=None, clevels=None, creverse=None): """----------------------------------------------------------------------------------------- SEt up four plot in 2 x 2 grid, but with differing sizes mainplot is the dotplot itself, upper right legend shows the colorbar legend scoreplot shows the window score distribution runploot shows the log of the run length distribution :param cbase: string, e.g. 
Greys, Blues, Reds, Viridis, etc :param clevels: int, usually 0-9 or 256 :param creverse: boolean, if True highest color is dark :return: True -----------------------------------------------------------------------------------------""" # turn off MISSING_RENDERERS warning caused by plotting colorbars in empty plot silence(MISSING_RENDERERS, True) self.palette = self.setupPalette(cbase=cbase, clevels=clevels, creverse=creverse) if self.title: titlestr = self.title else: now = date.today() titlestr = 'Dotplot of {} and {} - {}'.format( self.s1.id, self.s2.id, now) xlabel = '\n'.join([self.s1.id, self.s1.doc]) ylabel = '\n'.join([self.s2.doc, self.s2.id]) # account for sequence length difference, ylen scaling affects main and legend panels xlen = 800 ylen = xlen * self.l2 / self.l1 # define each panel as a figure label = '({}, {}, score)'.format(self.s1.id, self.s2.id) TIPS = [(label, '($x{0}, $y{0}, @score)')] self.figure['main'] = figure(title=titlestr, x_axis_label=xlabel, y_axis_label=ylabel, height=int(ylen), width=int(xlen), align='center', tooltips=TIPS) self.figure['legend'] = figure(height=int(ylen), width=200) TIPS = [('score, number', '$x{0}, $y{0.00}')] self.figure['scoredist'] = figure(height=300, width=500, tooltips=TIPS) TIPS = [('length,count', '$x{0}, $y{0}')] self.figure['rundist'] = figure(height=300, width=500, y_axis_type='log', tooltips=TIPS) # grid layout self.grid = layout([[self.figure['main'], self.figure['legend']], [self.figure['scoredist'], self.figure['rundist']]]) return True def setupFrame(self, defs): """----------------------------------------------------------------------------------------- Setup data frames for the defined analyses with empty ndata fields. 
Each def in defs defines name - name of data frame function - a callback function used to construct the data from a diagonal of scores variables - variables that will be populated As used here, a dataframes are stored in the object as self.frame[name] self.frame[name] = {function, var1: [], var2: [], var3: [], ...} :param defs: list, see above :return: int, number of frames define -----------------------------------------------------------------------------------------""" n = 0 for defin in defs: n += 1 self.frame[defin['data']] = {} self.function[defin['data']] = defin['fn'] for v in defin['var']: self.frame[defin['data']][v] = [] return n def resetFrame(self, framename): """----------------------------------------------------------------------------------------- Reset the data in one frame to empty lists. Needed for reverse plots :param framename: :return: True -----------------------------------------------------------------------------------------""" frame = self.frame[framename] for var in frame: frame[var] = [] return True def setupPalette(self, cbase, clevels, creverse): """----------------------------------------------------------------------------------------- Colormaps are used in multiple methods so this utility provides a unified safe method for setup. Bokeh handles colormaps a little differently than other plotting programs :param cbase: string, e.g. 
Greys, Blues, Reds, Viridis, etc :param clevels: int, usually 0-9 or 256 :param creverse: boolean, if True highest color is dark :return: -----------------------------------------------------------------------------------------""" from bokeh.palettes import all_palettes # the defaults are here instead of in definition so that they never change default_base = 'Greys' default_levels = 256 default_reverse = True try: palette = all_palettes[cbase][clevels] except (KeyError, IndexError) as error: # if lookup fails, use default palette = all_palettes[default_base][default_levels] creverse = default_reverse sys.stderr.write( 'Diagonal::setupPalettes - {}, color {} levels {} is undefined.\n' .format(error, cbase, clevels)) sys.stderr.write('\tUsing default {}{}\n'.format( default_base, default_levels)) if creverse: # reverse the orde of colors palette = palette[::-1] return palette def seqToInt(self): """----------------------------------------------------------------------------------------- Convert sequence strings to an integer arrays and stores in object. An integer array is more convenient for direct lookups in the scoring table than a string :return: int, int length of sequence lists -----------------------------------------------------------------------------------------""" a2i = self.a2i self.i1 = [a2i[c] for c in self.s1.seq] self.i2 = [a2i[c] for c in self.s2.seq] return len(self.i1), len(self.i2) def rle2coord(self): """----------------------------------------------------------------------------------------- Return a list of beginning and ending positions of each run. 
List is a list of four coordinates for each run [s1begin, s1end, s2begin, s2end] :return: 4 x int, beg1, end1, beg2, end2 -----------------------------------------------------------------------------------------""" coord = [] l2 = self.l2 for diag in range(len(self.diagonal)): for offset, length in self.diagonal[diag]: end1 = max(diag - l2 + 1, 0) + offset end2 = max(l2 - diag - 1, 0) + offset beg1 = end1 - length + 1 beg2 = end2 - length + 1 coord.append([beg1, end1, beg2, end2]) return coord def diagLenBegin(self, diag): """----------------------------------------------------------------------------------------- Calculates the length of diagonal diag and the beginning position of the diagonal in each sequence :param diag: int, diagonal number :return: int (diagonal length), int (seq1 begin), int (seq2 begin) -----------------------------------------------------------------------------------------""" pos1 = max(diag - self.l2 + 1, 0) pos2 = max(self.l2 - diag - 1, 0) diaglen = min(self.l1 - pos1, self.l2 - pos2) # if self.seqreverse: # pos2 = self.l2 - pos2 return diaglen, pos1, pos2 def diagonalScore(self, d): """----------------------------------------------------------------------------------------- Calculate the moving window sum of comparison score along one diagonal and store in the object. 
:param d: int, diagonal number :return: list, scores along diagonal -----------------------------------------------------------------------------------------""" diaglen, pos1, pos2 = self.diagLenBegin(d) i1 = self.i1 i2 = self.i2 window = self.window cmp = self.table diagonal = self.diagonal old1 = pos1 old2 = pos2 if diaglen < window: # skip diagonals shorter than window length return [] diagonal[:] = map(lambda i: 0, diagonal) # lambda much faster to set all values # to zero score = 0 # first window for offset in range(window): score += cmp[i1[pos1]][i2[pos2]] pos1 += 1 pos2 += 1 dpos = 0 diagonal[dpos] = score # rest of diagonal for offset in range(window, diaglen): # sys.stderr.write('{}\t{}\n'.format(pos1,pos2)) score -= cmp[i1[old1]][i2[old2]] score += cmp[i1[pos1]][i2[pos2]] dpos += 1 diagonal[dpos] = score old1 += 1 old2 += 1 pos1 += 1 pos2 += 1 return diagonal def random(self, n=10000): """----------------------------------------------------------------------------------------- Calculate random score distribution using current scoring table, window, and threshold. Use stat() to get distributions and run lengths. Use n = number of windows calculated for actual sequences. 
:param n: int, number of windows to calculate :return: list of n scores -----------------------------------------------------------------------------------------""" window = self.window cmp = self.table i1 = self.i1 i2 = self.i2 if n == 0: n = self.l1 * self.l2 self.diagonal = [0 for _ in range(n - window)] dist = self.diagonal win = [0 for _ in range(window)] wsum = 0 for i in range(window): a = choice(i1) b = choice(i2) score = cmp[a][b] win[i] = score wsum += score newpos = 0 pos = 0 for i in range(n - window): dist[pos] = wsum wsum -= win[newpos] a = choice(i1) b = choice(i2) score = cmp[a][b] wsum += score win[newpos] = score newpos = (newpos + 1) % window pos += 1 return dist def allDiagonals(self, select): """----------------------------------------------------------------------------------------- Iterate over all diagonals and apply specified actions to each diagonal. Each action is a tuple that specifies the name of the resulting data frame, and a function to process the diagonal. The frames are usable as Bokeh sources for plotting. :param select: list, names of dataframes to calculate from each diagonal :return: True -----------------------------------------------------------------------------------------""" frame = self.frame function = self.function for d in range(self.l1 + self.l2 - 1): dscore = self.diagonalScore(d) if not dscore: continue for data in select: # apply each selected function to this diagonal of scores to populate the # dataframes fxn = function[data] fxn(data, d) return True def windowThreshold(self, framename, d): """----------------------------------------------------------------------------------------- Callback function for allDiagonals. Savs windows with score >= threshold in dataframe framename. 
Works on the internally stored diagonal of scores calculated by diagonalScore() :param framename: string, name of a dataframe in self.frame :param d: int, diagonal number :return: True -----------------------------------------------------------------------------------------""" frame = self.frame[framename] dscore = self.diagonal window = self.window halfwindow = (window - 1) / 2.0 threshold = self.threshold yinc = self.yinc diaglen, xpos, ypos = self.diagLenBegin(d) if diaglen < window: return False xpos += halfwindow if self.yinc < 0: ypos = self.l2 - ypos - halfwindow - 1 else: ypos += halfwindow for pos in range(diaglen - window + 1): if dscore[pos] >= threshold: frame['x'].append(xpos) frame['y'].append(ypos) frame['score'].append(dscore[pos]) xpos += 1 ypos += yinc self.nscore += 1 return True def scaleColumn(self, framename, column_source, column_dest, value, scale): """----------------------------------------------------------------------------------------- Performs a simple linear scaling on a column :param framename: string, a data frame in self.frame :param column_source: string, the column in frame to be scaled :param column_dest: string, name for the scaled column (in frame) :param value: tuple, low and high value for the input data :param scale: tuple, low and high value for the scaled data :return: -----------------------------------------------------------------------------------------""" frame = self.frame[framename] values = frame[column_source] frame[column_dest] = [] # width = frame[column_dest] rangeval = value[1] - value[0] rangesize = scale[1] - scale[0] m = rangesize / rangeval for v in values: size = scale[0] + (v - value[0]) * m frame[column_dest].append(size) return def histogramScore(self, scoreframe, d): """----------------------------------------------------------------------------------------- Callback function for allDiagonals. Creates data frames with the score distribution. 
Works on the internally stored diagonal of scores calculated by diagonalScore() :param scoreframe: string, name of dataframe in self.frame :param d: int, diagonal number :return: int, number of values in columns of dataframe -----------------------------------------------------------------------------------------""" scoreframe = self.frame[scoreframe] diagonal = self.diagonal window = self.window if self.single: diaglen = len(diagonal) else: diaglen, xpos, ypos = self.diagLenBegin(d) diaglen -= window - 1 nscore = 0 score = {} for s in diagonal[:diaglen]: try: score[s] += 1 except KeyError: score[s] = 1 nscore += 1 # insert into data frame, the dateframe is randomly ordered for s in score: try: i = scoreframe['score'].index(s) scoreframe['count'][i] += score[s] except ValueError: scoreframe['score'].append(s) scoreframe['count'].append(score[s]) return len(scoreframe['score']) def histogramRun(self, runframe, d): """----------------------------------------------------------------------------------------- Callback function for allDiagonals. Create a dataframe with the run length distribution, apply the threshold stored in self.threshold. 
Works on the internally stored diagonal of scores calculated by diagonalScore() :param runframe: string, name of dataframe in self.frame :param d: int, diagonal number :return: int, number of values in columns of dataframe -----------------------------------------------------------------------------------------""" runframe = self.frame[runframe] diagonal = self.diagonal window = self.window threshold = self.threshold if self.single: diaglen = len(diagonal) else: diaglen, xpos, ypos = self.diagLenBegin(d) diaglen -= window - 1 run = {} nrun = 0 runlen = 0 for offset in range(diaglen): if diagonal[offset] >= threshold: runlen += 1 else: try: run[runlen] += 1 except KeyError: # runlen key doesn't exist yet run[runlen] = 1 runlen = 0 nrun += 1 if runlen: try: run[runlen] += 1 except KeyError: # runlen key doesn't exist yet run[runlen] = 1 nrun += 1 # insert into data frame, the dataframe is randomly ordered for r in run: try: i = runframe['len'].index(r) runframe['count'][i] += run[r] except ValueError: runframe['len'].append(r) runframe['count'].append(run[r]) return len(runframe['len']) def sortFrame(self, frame, keyvar): """----------------------------------------------------------------------------------------- Sort all the variables in the dataframe according to the order of keyvar TODO should this and return the min and max values? 
:param frame: string :param keyvar: string :return: True -----------------------------------------------------------------------------------------""" unsorted = self.frame[frame] # save the order so it can be applied to all viariables in the dataframe order = sorted(range(len(unsorted[keyvar])), key=lambda x: unsorted[keyvar][x]) sorted_frame = {} for column in unsorted: sorted_frame[column] = [] for i in order: sorted_frame[column].append(unsorted[column][i]) self.frame[frame] = sorted_frame return True def bdot(self, dataname, figurename, width=1, color=1, mode='dot', set_colormap=True): """----------------------------------------------------------------------------------------- Bokeh plot of dots in the main panel, and colorbar in the legend panel :param dataname: string, name of a dataframe in self.frame :param figurename: string, a figure defined in setupBokeh and stored in self.figure :param width: boolean, scale size of markers by the score :param color: boolean, scale the color of the markers by the score :param mode: string, if dot use the circle renderer, otherwise segment renderer :param set_colormap: boolean, set the colormap based on score range, turn off for second plot to use the same scale :return: True -----------------------------------------------------------------------------------------""" data = self.frame[dataname] figure = self.figure[figurename] legend = self.figure['legend'] window = self.window threshold = self.threshold alpha = self.alpha scoremin, scoremax = self.valueMinMax(data['score']) if width == 1: self.scaleColumn('dots', 'score', 'size', (threshold - 1, scoremax), (self.mindotsize, self.maxdotsize)) else: data['size'] = [self.mindotsize for _ in range(len(data['score']))] if color == 1: pass else: data['score'] = [scoremax for _ in range(len(data['score']))] if set_colormap: if color == 1: cmap = LinearColorMapper(self.palette, low=max(threshold - 1.0, scoremin - 1), high=scoremax) else: cmap = LinearColorMapper(self.palette, 
low=threshold - 0.1, high=threshold) self.cmap = cmap else: cmap = self.cmap source = ColumnDataSource(data) if mode == 'dot': figure.circle(source=source, x='x', y='y', size='size', line_color=transform('score', cmap), line_alpha=alpha, fill_color=transform('score', cmap), fill_alpha=alpha) else: # line mode figure.segment(source=source, x0='x', x1='x1', y0='y', y1='y1', line_width='size', line_color=transform('score', cmap), alpha=alpha) # color bar is in a separate window, self.legend, so it doesn't disturb the # aspect ratio if color: color_bar = ColorBar(color_mapper=cmap, label_standoff=3, bar_line_color='black', scale_alpha=alpha, width=20, margin=0, location=(0, 0), major_tick_in=20, major_tick_out=5, major_tick_line_color='black') legend.add_layout(color_bar, 'left') return True def bscoreDist(self, figurename, dataname, color): """----------------------------------------------------------------------------------------- Bokeh plot of score distribution and cumulative score distribution. 
:param figurename: string, name of figures (stored in self.figure) :param dataname: string, name of data frame (stored in self.frame) :param figurename: string, name of figures (stored in self.figure) :param dataname: string, name of data frame (stored in self.frame) :param color: string, and valid Bokeh color, used to fill bars :return: True -----------------------------------------------------------------------------------------""" data = self.frame[dataname] figure = self.figure[figurename] minp, maxp = self.valueMinMax(data['count']) source = ColumnDataSource(data) # observed score density figure.vbar(source=source, x='score', top='count', width=0.8, color=color, line_color='black', alpha=self.alpha, bottom=0.0) figure.y_range = Range1d(0.0, maxp * 1.1) return True def brunDist(self, figurename, dataname, color): """----------------------------------------------------------------------------------------- Bokeh plot of run length distribution :param figurename: string, name of figures (stored in self.figure) :param dataname: string, name of data frame (stored in self.frame) :param color: string, and valid Bokeh color, used to fill bars :return: True -----------------------------------------------------------------------------------------""" run = self.frame[dataname] figure = self.figure[figurename] source = ColumnDataSource(run) minrun = 1 # x = [i for i in range(minrun, maxrun + 1)] # observed and simulated run lengths, need bottom=1 because of log axis figure.vbar(source=source, x='len', top='count', width=0.8, color=color, line_color='black', alpha=self.alpha, line_width=0.5, bottom=0.1) return True def bscoreCumulative(self, figurename, dataname): """----------------------------------------------------------------------------------------- Bokeh plot of cumulative distribution as a line on right hand axis :param figurename: string, name of figures (stored in self.figure) :param dataname: string, name of data frame (stored in self.frame) :return: True 
-----------------------------------------------------------------------------------------""" data = self.frame[dataname] figure = self.figure[figurename] source = ColumnDataSource(data) figure.extra_y_ranges = {"cumulative": Range1d(start=0.0, end=1.0)} axis2 = LinearAxis(y_range_name="cumulative") axis2.ticker.num_minor_ticks = 10 figure.add_layout(axis2, 'right') figure.line(source=source, x='score', y='cumulative', y_range_name='cumulative', line_width=2, color='#1122cc') # shaded box showing 95% level box = BoxAnnotation(bottom=0.95, top=1.0, y_range_name='cumulative', fill_color='#FFBBBB', line_width=3, line_dash='dashed') figure.add_layout(box) return True def writeFrame(self, framename, key='x', out=sys.stdout): """----------------------------------------------------------------------------------------- Write the dataframe out as a table to the specified output file. Output file should be opened for writing in advance. TODO figure out how to format values more nicely :param framename: string, name of a dataframe in self.frame :param key: string, name of column to use as key (first column in table) :param out: open output file :return: True -----------------------------------------------------------------------------------------""" frame = self.frame[framename] out.write('\n{} dataframe\n'.format(framename)) out.write('\t{}'.format(key)) for column in frame: if column == key: continue out.write('\t{}'.format(column)) out.write('\n') n = len(frame[key]) for i in range(n): out.write('\t{}'.format(frame[key][i])) for column in frame: if column == key: continue out.write('\t{}'.format(frame[column][i])) out.write('\n') return True def show(self, *args, **kwargs): """----------------------------------------------------------------------------------------- Delegate to plt.show(). 
Makes syntax a little easier in application since the object is used instead of the plotting class :param args: arguments to pass to show() :param kwargs: arguments to pass to show() :return: True -----------------------------------------------------------------------------------------""" show(self.grid, *args, **kwargs) return True @staticmethod def cumulative(score, total): """----------------------------------------------------------------------------------------- Return cumulative score probability distribution as a list. :param score: list :param total: int, number of observations :return: list -----------------------------------------------------------------------------------------""" cumulative = [] wsum = 0 for i in range(len(score)): wsum += score[i] / total cumulative.append(wsum) return cumulative def addCumulative(self, data, sourcecol, destcol): """----------------------------------------------------------------------------------------- Add cumulative distribution to dataframe data, based on column sourcecol and stored in a new column named destcol :param data: string (dataframe in self.frames) :param sourcecol: string, column name in self.frame[data] :param destcol: string, new column name for cumulative distribution :return: True -----------------------------------------------------------------------------------------""" data = self.frame[data] source = data[sourcecol] cum = [] total = 0 for v in source: total += v cum.append(total) for i in range(len(cum)): cum[i] /= total data[destcol] = cum return True def addSegment(self, framename, xcol='x', ycol='y', xnew='x1', ynew='y1'): """----------------------------------------------------------------------------------------- convert x, y dot positions to line segments; the segment renderer requires beginning and ending points for each segment. The existing x and y are modified to be the beginning and new variables (xnew and ynew) are added for the end points. 
:param xcol: string, name of x column in data frame :param ycol: string, name of y column in data frame :param xnew: string, name of new x column in data frame (end of segment) :param ynew: string, name of new y column in data frame (end of segment) :return: True -----------------------------------------------------------------------------------------""" frame = self.frame[framename] frame[xnew] = [] frame[ynew] = [] # correct the direction when sequence 2 is reversed yinc = self.yinc dither = 0.5 ydither = [-dither * yinc, dither * yinc] for pos in range(len(frame[xcol])): frame[xnew].append(frame[xcol][pos] + dither) frame[ynew].append(frame[ycol][pos] + ydither[1]) frame[xcol][pos] -= dither frame[ycol][pos] += ydither[0] return True @staticmethod def density(score, total): """----------------------------------------------------------------------------------------- Convert a list representing the score distribution to a density by dividing by total :param score: list of int or float :param total: total number of scores (sum(score)) :return: -----------------------------------------------------------------------------------------""" maxp = 0.0 for i in range(len(score)): score[i] /= total maxp = max(maxp, score[i]) return maxp @staticmethod def scoreMinMax(score): """----------------------------------------------------------------------------------------- Returns the first and last non-zero positions in a list of scores. Use to get ranges for score histograms :param score: list :return: int, int -----------------------------------------------------------------------------------------""" scoremin = None scoremax = None for i in range(len(score)): if score[i] > 0: if scoremin is None: scoremin = i scoremax = i return scoremin, scoremax @staticmethod def valueMinMax(score): """----------------------------------------------------------------------------------------- Returns the minimum and maximum value in a list of values. 
:param score: list :return: float, float -----------------------------------------------------------------------------------------""" scoremin = score[0] scoremax = score[0] for s in score: scoremin = min(scoremin, s) scoremax = max(scoremax, s) return scoremin, scoremax
peptide_maxlen = 20000 args = arguments_get() sys.stderr.write('\nfasta_getorfs - Get ORFs from transcript sequences\n') sys.stderr.write('\tinput transcript file: {}\n'.format( args.transcript.name)) sys.stderr.write('\toutput ORF Fasta file: {}\n'.format(args.fasta.name)) if args.tabular: sys.stderr.write('\toutput ORF tabular file: {}\n'.format( args.tabular.name)) sys.stderr.write('\tORF histogram file: {}\n'.format(args.histogram.name)) sys.stderr.write('\tminimum ORF length: {}\n'.format(args.minlen)) if args.longest_only: sys.stderr.write('\tOnly longest ORFs will be reported\n') sys.stderr.write('\n') fasta = Fasta(fh=args.transcript) nsequence = 0 npeptide = 0 npeptide_total = 0 # initialize lists for histograms # lenhist is for all ORFs # longhist is for the longerst ORF in each transcript lenhist = [0 for _ in range(peptide_maxlen)] lentotal = 0 longhist = [0 for _ in range(peptide_maxlen)] longtotal = 0 n = None
# default target file name target = '*.fasta' if len(sys.argv) > 1: target = sys.argv[1] sys.stderr.write(' target file: {}\n\n'.format(target)) # read all the sequences and store in dictionary unique_seq = {} n_file = 0 n_perfile = 0 n_uniqueperfile = 0 n_total = 0 n_unique_total = 0 sys.stderr.write('{}\t{}\t\t{}\n'.format('file', 'per file', 'total')) for fastafile in glob.glob(target): fasta = Fasta() fasta.open(fastafile) n_file += 1 n_perfile = 0 while fasta.next(): n_perfile += 1 if fasta.id in unique_seq: continue else: n_uniqueperfile += 1 unique_seq[fasta.id] = fasta.format(linelen=100) n_total += n_perfile n_unique_total += n_uniqueperfile sys.stderr.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(n_file, fastafile, n_perfile,
# l1 = len(i1) # l2 = len(i2) # # score = bestscore # ipos, jpos = bestpos # while score > 0: target # -------------------------------------------------------------------------------------------------- # main # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': align = Alignment() align.s1 = Fasta(filename=sys.argv[1]) align.s2 = Fasta(filename=sys.argv[2]) align.readNCBI('../../dotplot/table/BLOSUM62.matrix') # testing # align.s1 = Fasta() # align.s1.seq = 'ACTGCC' # align.s2 = Fasta() # align.s2.seq = 'ATGCC' # align.readNCBI('../../dotplot/table/NUC4.4.matrix') align.seqToInt() # random.shuffle(align.i1) # uncomment to test scores for random alignments original_score, bestpos = align.localScore(-10, -1) print('original score: {} at {}'.format(original_score, bestpos)) beginscore, beginpos = align.localReverse(-10, -1, original_score, bestpos)
base = base.replace('.seq', '') sys.stdout.write('\n\tExpanded file: {}\n\tbasename: {}\n'.format( infilename, base)) outfilename = base + '.fasta' outfile = None try: outfile = open(outfilename, 'w') except: sys.stderr.write( 'Unable to open output file ({})\n'.format(outfilename)) exit(2) # process all sequences in the file n = 0 for seq in infile: fasta = Fasta() fasta.id = base + '_{}'.format(n) fasta.seq = seq.rstrip().upper() fasta.doc = 'length={}'.format(fasta.length()) outfile.write(fasta.format(linelen=100)) n += 1 infile.close() outfile.close() sys.stdout.write('\t{} sequences written to {}\n'.format( n, outfilename)) # end of loop over files exit(0)
for job in delete_list: del joblist[job] return text # ================================================================================================== # Main # ================================================================================================== args = arguments_get() args.logfile.write('\ninterpro_batch - interproscan of ORF sequences\n') args.logfile.write('\tinput ORF file: {}\n'.format(args.fasta_in.name)) args.logfile.write('\tminimum ORF length: {}\n\n'.format(args.minlen)) fasta = Fasta(fh=args.fasta_in) # The job list keeps track of the ips object that have been created and their current status # the joblist is a dictionary where the ips object is the key and the value is a status string joblist = {} # create a template for the jobs. The template is an interpro object with the metadata added template = Interpro(loglevel=1) template.log_fh = args.logfile template.email = '*****@*****.**' template.application_select(['Pfam', 'Panther', 'SignalP']) template.output_select = 'json' template.poll_time = 60 template.poll_max = 100 sequence_limit = 20
cl = commandline.parse_args() maxbases = cl.maxbases outbase = cl.prefix outsuffix = cl.suffix outsuffix = outsuffix.lstrip('.') # remove leading . if present trim = cl.trim print('\nsplit.py - split fasta file into chunks') print(" fasta file:", cl.fasta_file.name) print(" maximum characters:", maxbases) print(" output prefix:", outbase) print(" output suffix:", outsuffix) print(" doc trimmer:", trim) print('') fasta = Fasta() fasta.fh = cl.fasta_file trimre = re.compile(trim) # initialize counters base_total = 0 base_current = 0 n_out = 0 n_seq = 0 n_current = 0 while fasta.next(): if trimre: fasta.trimDocByRegex(trimre) if not n_seq or base_current + fasta.length() > maxbases:
counts, bases = feature.total() -----------------------------------------------------------------------------------------""" total_count = 0 total_len = 0 for lspace in self.space: total_count += self.space[lspace] total_len += self.space[lspace] * int(lspace) return total_count, total_len # ================================================================================================== # main/test # ================================================================================================== if __name__ == '__main__': fasta = Fasta(file=sys.argv[1]) feature = Feature() feature.feature = "A.C" c = 0; while fasta.next(): count = feature.count_space(fasta) print(count) c += 1 if c > 100: break fcount, flen = feature.total() print('count:{} len:{} avg:{}'.format( fcount, flen, flen/fcount)) exit(0)
# end of a run at end of diagonal (do not subtract because the end position is # the true end) filtered.append([pos, runlen]) diagonal[d] = filtered return nmatch # -------------------------------------------------------------------------------------------------- # Testing # -------------------------------------------------------------------------------------------------- if __name__ == '__main__': print('\ntest 0: identity matching') print('\texpect 7 matches\n') fasta = Fasta() fasta.id = 'test0' fasta.doc = '5 letter DNA test' fasta.seq = 'ACAGT' print('{}\n'.format(fasta.format())) match = Match() match.s1 = fasta match.s2 = fasta nmatch = match.identityPos() print('matches: {}'.format(nmatch)) print('\ntest 1: identity matching, unequal length sequences') print('\texpect 11 matches\n') match = Match()
# idlist, idlist will be an emtpy list if none is provided idlist = get_id_list(args) # read the sequences and store all that match the IDs # duplicates in sequence files will be stored twice n_match = {} # per file number of sequences in list n_notmatch = {} # per file number of sequences not in list n_sequence = {} # per file number of sequences n_found = {} # per ID, number of times found in all files n_file = 0 n_total = 0 n_written = 0 out = sys.stdout for fastafile in glob.glob(args.input_filename): fasta = Fasta() fasta.open(fastafile) if args.outsuffix: outfile = os.path.basename(fastafile) + f'{args.outsuffix}' out = opensafe(outfile, 'w') if not out: # if file can't be opened use stdout out = sys.stdout n_sequence[fastafile] = 0 n_match[fastafile] = 0 n_notmatch[fastafile] = 0 n_file += 1 while fasta.next(): n_sequence[fastafile] += 1
Michael Gribskov 20 April 2021 =================================================================================================""" import time import pymongo from sequence.fasta import Fasta # -------------------------------------------------------------------------------------------------- # main program # -------------------------------------------------------------------------------------------------- mongo = pymongo.MongoClient("mongodb://localhost:27017/") biocomputing = mongo['biocomputing'] biocomputing.drop_collection('phage') phage = biocomputing['phage'] fasta = Fasta('C:/Users/michael/Desktop/phage.fa') fasta_start_time = time.perf_counter() nseq = 0 all = [] while fasta.next(): nseq += 1 # if not nseq % 10001: # break all.append({ '_id': fasta.id, 'documentation': fasta.doc, 'sequence': fasta.seq }) fasta_end_time = time.perf_counter()