def calculateProteinRatioSignificance(self, num_nearest_proteins, ratio_field="ratio_hl_normalized", abundance_field="intensity"):
    # Limit significance calculations to proteins with ratios
    recs_with_ratios = [r for r in self.protein_data.values() if not na.isNA(getattr(r, ratio_field))]
    # Sort proteins by estimated abundance
    recs = sorted(recs_with_ratios, key=lambda x: getattr(x, abundance_field))
    rec_norm_hl = [getattr(r, ratio_field) for r in recs]
    n = num_nearest_proteins
    half_n = int(math.ceil(n/2.0))
    for ti in range(len(recs)):
        # Fetch the nearest n proteins by intensity
        if ti < half_n:
            beg_i = 0
            end_i = int(min(n, len(recs)))
        elif ti + half_n >= len(recs):
            beg_i = len(recs) - half_n
            end_i = len(recs)
        else:
            beg_i = ti - half_n
            end_i = ti + half_n
        log_ratios = [math.log(getattr(r, ratio_field)) for r in recs[beg_i:end_i]]
        my_log_ratio = math.log(getattr(recs[ti], ratio_field))
        # Use a distinct name for the window count so we don't clobber n (the window size)
        (win_n, m, sd, se) = stats.StatsSummary(log_ratios)
        z = (my_log_ratio - m)/sd
        p_z = stats.Prob_Z(z)
        recs[ti].significance = p_z
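# A minimal, self-contained sketch of the sliding-window z-test above, assuming
# stats.StatsSummary returns (n, mean, sd, se) and stats.Prob_Z gives a normal
# tail probability. The helpers here are illustrative stand-ins for those project
# functions, not their actual implementations.
import math

def _summary(values):
    n = len(values)
    mean = sum(values) / n
    var = sum((v - mean) ** 2 for v in values) / (n - 1) if n > 1 else 0.0
    sd = math.sqrt(var)
    return (n, mean, sd, sd / math.sqrt(n))

def _prob_z(z):
    # Upper-tail probability of a standard normal, via the complementary error function.
    return 0.5 * math.erfc(z / math.sqrt(2.0))

ratios = [0.8, 1.1, 0.9, 1.0, 3.5, 1.2, 0.95]   # one clear outlier
log_ratios = [math.log(r) for r in ratios]
(_, m, sd, _) = _summary(log_ratios)
z = (math.log(3.5) - m) / sd
print("z = {:.2f}, P(Z > z) = {:.3g}".format(z, _prob_z(z)))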
def inferHandlers(self, max_lines=100):
    # DAD: run through fields until we've seen at least one non-NA value for each.
    handlers_identified = False
    li = 0
    self.cur_line = self.cache.getLine(li)
    self.handlers = None
    inferred_string = []
    while not handlers_identified and li < max_lines and self.isValid():
        if not self.isComment() and not self.isBlank():
            # Not a comment line -- parse it.
            if self.strip:
                self.cur_line = self.cur_line.strip()
            flds = self.cur_line.split(self.delim)
            flds[-1] = flds[-1].strip()  # Get rid of \n
            # Initialize empty handler list if we haven't done so already
            if self.handlers is None:
                self.handlers = [None] * len(flds)
                inferred_string = ['X'] * len(flds)
            assert len(flds) == len(self.handlers), "Number of fields {} not equal to number of handlers {}".format(len(flds), len(self.handlers))
            for hi in range(len(self.handlers)):
                fld = flds[hi]
                if self.handlers[hi] is None:
                    if not na.isNA(fld):
                        handler_key = self.inferHandlerKey(fld)
                        inferred_string[hi] = handler_key
                        self.handlers[hi] = self.handler_dict[handler_key]
                else:
                    # Handler has already been found; just confirm, and upgrade if necessary
                    try:
                        self.handlers[hi](fld)
                    except ValueError:
                        # Upgrade the handler to one that can parse this field
                        handler_key = self.inferHandlerKey(fld)
                        inferred_string[hi] = handler_key
                        self.handlers[hi] = self.handler_dict[handler_key]
            # We're finished when all handlers are not None.
            handlers_identified = len([h for h in self.handlers if h is None]) == 0
        if not handlers_identified:
            li += 1
            try:
                self.cur_line = self.cache.getLine(li)
            except ReaderEOFError:
                # We've reached the end of the file with an inconclusive result -- some fields
                # still can't have types inferred. Just assume everything's a string.
                for hi in range(len(self.handlers)):
                    if self.handlers[hi] is None:
                        self.handlers[hi] = self.handler_dict["s"]
                handlers_identified = True
    if not handlers_identified and li >= max_lines:
        # Went past the allowed number of lines to look ahead; set all unset handlers to strings
        for hi in range(len(self.handlers)):
            if self.handlers[hi] is None:
                self.handlers[hi] = self.handler_dict["s"]
    inferred_string = ''.join(inferred_string)
    return inferred_string
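# Illustrative sketch of the first pass of column-type inference that inferHandlers
# performs, assuming handler keys like "d" (int), "f" (float) and "s" (string); the
# real handler_dict / inferHandlerKey and the handler-upgrade step are project-specific
# and omitted here.
def _infer_key(field):
    for (key, parser) in (("d", int), ("f", float)):
        try:
            parser(field)
            return key
        except ValueError:
            pass
    return "s"

rows = ["1\t2.5\talpha", "7\tNA\tbeta", "9\t3.1\tgamma"]
keys = [None] * 3
for row in rows:
    for (i, fld) in enumerate(row.split("\t")):
        if fld != "NA" and keys[i] is None:
            keys[i] = _infer_key(fld)
print("".join(k if k is not None else "s" for k in keys))  # -> "dfs"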
def naIntParser(x):
    v = None
    try:
        v = int(x)
    except ValueError as ve:
        if not na.isNA(x):
            raise ve
    return v
def naSciParser(x):
    v = None
    try:
        v = float(x)
    except ValueError as ve:
        if not na.isNA(x):
            raise ve
    return v
def looseIntParser(x):
    v = None
    try:
        v = int(x)
    except ValueError:
        if not na.isNA(x):
            v = naFloatParser(x)
    return v
def naFloatParser(x):
    v = None
    try:
        v = float(x)
    except ValueError as ve:
        if not na.isNA(x):
            raise ve
    return v
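# Quick usage sketch for the NA-aware parsers above, assuming na.isNA() treats a
# token such as "NA" as missing (the exact NA tokens are defined by the na module).
for raw in ["42", "3.14", "NA"]:
    print(raw, "->", naFloatParser(raw))   # 42.0, 3.14, None
print(naIntParser("NA"))                   # None: NA tokens parse to None rather than raising
try:
    naIntParser("abc")                     # not numeric and not NA -> ValueError propagates
except ValueError as ve:
    print("abc ->", ve)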
def add(self, x):
    if not na.isNA(x):
        self._sum += x
        self._sum_sq += x * x
        self._n += 1
        if self._store:
            self._data.append(x)
    else:
        self._na += 1
def __str__(self):
    res = None
    if not na.isNA(self.var):
        try:
            trans_var = self.transform(self.var)
            res = self.format.format(trans_var)
        except (ValueError, TypeError):
            pass
    else:
        res = na.NA
    return res
def add(self, x):
    if not na.isNA(x):
        self._sum += x
        self._sum_sq += x * x
        self._n += 1
        if self._min > x:
            self._min = x
        if self._max < x:
            self._max = x
        if self._store:
            self._data.append(x)
    else:
        self._na += 1
def naStringParser(x):
    """A parser that respects NA's."""
    v = None
    if not na.isNA(x):
        v = str(x)
    return v
def normalizeHeavyIntensity(self, weight):
    # Note: NA intensities are dropped, not preserved, by this normalization.
    new_int = [intens / weight for intens in self.intensity_h_list if not na.isNA(intens)]
    self.intensity_h_list = new_int
def normalized_ratio_hm(self):
    res = None
    med = self.getNormalizedHeavyMediumRatioSummary().median
    if not na.isNA(med):
        res = math.exp(med)
    return res
def normalizeMediumIntensity(self, weight):
    new_int = [intens / weight for intens in self.intensity_m_list if not na.isNA(intens)]
    self.intensity_m_list = new_int
if prot_ids == []:
    line = '{0}\tNA\t0'.format(pep.key)
else:
    line = '{0}\t{1}\t{2}'.format(p, ",".join(prot_ids), len(prot_ids))
output_fields = []
for rat in ['hl', 'ml', 'hm']:
    ratio_stats = pep.getRatioSummary(rat)
    ratio_norm_stats = pep.getNormalizedRatioSummary(rat)
    output_fields.append(util.FieldFormatter(ratio_stats.median, "{0:e}"))
    output_fields.append(util.FieldFormatter(ratio_stats.mean, "{0:e}"))
    output_fields.append(util.FieldFormatter(ratio_norm_stats.median, "{0:e}"))
    output_fields.append(util.FieldFormatter(ratio_norm_stats.mean, "{0:e}"))
    # 95% confidence interval for the normalized ratio, back-transformed from the log scale
    rn_lower_95 = None
    rn_upper_95 = None
    if not na.isNA(ratio_norm_stats.se):
        rn_lower_95 = math.exp(math.log(ratio_norm_stats.mean) - 1.96 * ratio_norm_stats.se)
        rn_upper_95 = math.exp(math.log(ratio_norm_stats.mean) + 1.96 * ratio_norm_stats.se)
    output_fields.append(util.FieldFormatter(rn_lower_95, "{0:e}"))
    output_fields.append(util.FieldFormatter(rn_upper_95, "{0:e}"))
    output_fields.append(util.FieldFormatter(ratio_stats.n, "{0:d}"))
    output_fields.append(util.FieldFormatter(ratio_stats.sd, "{0:e}"))
    output_fields.append(util.FieldFormatter(ratio_norm_stats.sd, "{0:e}"))
    # Intensity ratios -- no "normalized" ratios here.
    iratio_stats = pep.getIntensityRatioSummary(rat)
    output_fields.append(util.FieldFormatter(iratio_stats.median, "{0:e}"))
    output_fields.append(util.FieldFormatter(iratio_stats.mean, "{0:e}"))
    output_fields.append(util.FieldFormatter(iratio_stats.n, "{0:d}"))
    output_fields.append(util.FieldFormatter(iratio_stats.sd, "{0:e}"))
output_fields.append(util.FieldFormatter(pep.intensity, "{0:e}"))
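# The rn_lower_95/rn_upper_95 bounds above back-transform a log-scale interval:
# for a normalized ratio with mean m and log-scale standard error se, the 95% CI
# is exp(log(m) +/- 1.96*se). A small worked example with made-up numbers:
import math
m, se = 1.8, 0.12
lower = math.exp(math.log(m) - 1.96 * se)
upper = math.exp(math.log(m) + 1.96 * se)
print("{:.2f} [{:.2f}, {:.2f}]".format(m, lower, upper))  # 1.80 [1.42, 2.28]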
def ratio(self):
    res = None
    med = self.getHeavyLightRatioSummary().median
    if not na.isNA(med):
        res = math.exp(med)
    return res
def normalizeRatiosBy(self, ratio, norm_ratio):
    self.heavy_light_ratio_list = [x / ratio for x in self.heavy_light_ratio_list if not na.isNA(x)]
    self.heavy_light_normalized_ratio_list = [x / norm_ratio for x in self.heavy_light_normalized_ratio_list if not na.isNA(x)]
# Get directory of guide file
path = os.path.dirname(fname)
curwd = os.getcwd()
species_names = []
with open(fname, 'r') as inf:
    os.chdir(path)
    tab = util.readTable(inf, header=True)
    rows = tab.dictrows
    if options.debug:
        rows = [x for x in tab.dictrows][:2]
    just_started = True
    for row in rows:
        spec_fname = row['filename']
        if not na.isNA(spec_fname):
            spec_inf = util.readTable(open(spec_fname, 'r'), header=True)
            twig = phyloutil.treeFromClassificationTable(spec_inf)
            added = phyloutil.mergeTrees(tree_root, twig, add_to_leaf=just_started)
            if added:
                just_started = False
                species_names.append(row['updated.species'])
        else:
            info_outs.write("# Didn't add {}\n".format(spec_fname))
#phyloutil.printTree(tree_root)
# Testing:
# - Write tree
# - Read it back in
# - Extract leaf species
# - Check to make sure they're all the ones we expect
def isNA(x):
    sys.stderr.write("util.isNA() should be updated to na.isNA()\n")
    return na.isNA(x)
def add(self, x):
    if not na.isNA(x) and x > 0.0:
        super(LogAccumulator, self).add(math.log(x))
        self._nolog_sum += x
    else:
        self._na += 1
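# Why accumulate log(x): the mean of the logged ratios, exponentiated, is the
# geometric mean, which is the natural summary for fold-change data. A minimal
# standalone illustration (independent of the actual LogAccumulator API):
import math
ratios = [0.5, 2.0, 1.0, 4.0]
geo_mean = math.exp(sum(math.log(r) for r in ratios) / len(ratios))
print(geo_mean)  # sqrt(2) ~= 1.414; the arithmetic mean would give 1.875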
def isNA(x):
    print("util.isNA() should be updated to na.isNA()", file=sys.stderr)
    return na.isNA(x)
def normalized_ratio_ml(self):
    res = None
    med = self.getNormalizedMediumLightRatioSummary().median
    if not na.isNA(med):
        res = math.exp(med)
    return res
parser.add_argument("--mw", dest="do_mw", default=False, action="store_true", help="compute molecular weights?") parser.add_argument("--target-aas", dest="target_aas", type=str, default=translate.AAs(), help="amino acids (e.g. ACDEF) for frequency analysis") parser.add_argument("-p", "--pseudo", dest="pseudocount", type=float, default=0.0, help="pseudocount to add to all frequencies") parser.add_argument("-o", "--out", dest="out_fname", type=str, default=None, help="output filename") options = parser.parse_args() cdna_dict = biofile.readFASTADict(os.path.expanduser(options.cds_in_fname)) prot_dict = biofile.readFASTADict(os.path.expanduser(options.prot_in_fname)) # Read paralog data from Yeast Gene Order Browser file ygob_data = util.readTable(file(os.path.expanduser(options.paralog_fname),'r')) paralog_dict = {} for flds in ygob_data.dictrows: scer1 = flds['scer1'].strip() scer2 = flds['scer2'].strip() if not (na.isNA(scer1) or na.isNA(scer2)): paralog_dict[scer1] = scer2 paralog_dict[scer2] = scer1 # Read SGD data sgd_features = util.readTable(file(os.path.expanduser(options.feature_fname),'r'), header=False) ''' http://downloads.yeastgenome.org/curation/chromosomal_feature/SGD_features.README 1. Primary SGDID (mandatory) 2. Feature type (mandatory) 3. Feature qualifier (optional) 4. Feature name (optional) 5. Standard gene name (optional) 6. Alias (optional, multiples separated by |) 7. Parent feature name (optional) 8. Secondary SGDID (optional, multiples separated by |)
# Create mapping
mapping_dict = dict(zip(map_table['species'], map_table['updated.species']))

# Update the FASTA headers
seq_dict = {}
header_dict = {}
short_species_names = {}
for (i, h) in enumerate(headers):
    species_name = extractSpeciesName(h)
    short_name = makeShortSpeciesName(species_name)
    try:
        updated_species_name = mapping_dict[species_name]
        if not na.isNA(updated_species_name):
            new_header = "{}[{}]{}".format(h.split('[')[0], updated_species_name, h.split(']')[1])
            seq_dict[updated_species_name] = seqs[i]
            header_dict[updated_species_name] = new_header
            short_species_names[updated_species_name] = short_name
    except KeyError as ke:
        print(ke)

# Iterate over tree and write out FASTA in tree-sorted order
n_written = 0
sorted_headers = []
sorted_seqs = []
    for (pos, pep) in pep_list:
        n_peps += 1
        pepid = "{}-{}".format(options.target_orf, n_peps)
        line = gap * pos + pep.sequence + gap * (len_prot - (len(pep.sequence) + pos))
        outs.write(">{}\n{}\n".format(pepid, line))
elif options.output_type == 'ratio':
    outs.write("seq\tmod.seq\tbegin\tend\tratio\tint.ratio\tintensity\tintensity.h\tintensity.l\n")
    n_written = 0
    for (pos, pep) in pep_list:
        #ratio_stats = pep.getHeavyLightRatioSummary()
        for (ri, ratio) in enumerate(pep.heavy_light_ratio_list):
            inth = pep.intensity_h_list[ri]
            intl = pep.intensity_l_list[ri]
            inten = None
            # Ratio of intensities
            int_ratio = None
            if not (na.isNA(inth) or na.isNA(intl)):
                inten = inth + intl
                if intl > 0:
                    int_ratio = inth / float(intl)
            outs.write("{seq}\t{modseq}\t{begin}\t{end}\t{ratio}\t{intratio}\t{inten}\t{inth}\t{intl}\n".format(
                seq=pep.sequence,
                modseq=pep.modified_sequence,
                begin=pos + 1,
                end=pos + len(pep.sequence),
                ratio=na.formatNA(ratio),
                intratio=na.formatNA(int_ratio),
                inten=na.formatNA(inten),
                inth=na.formatNA(inth),
                intl=na.formatNA(intl)))
            n_written += 1
    info_outs.write("# Wrote {} peptide records\n".format(n_written))