def main():
    """Build Janin (chi1 vs chi2) 2D histograms from the Whatif CSV dump.

    Histograms are keyed per secondary-structure type and residue type, plus a
    per-ss combined histogram excluding PRO/GLY, and are pickled (protocol 2)
    to dbase_file_abs_name. Reads module globals: cingDirData,
    cvs_file_abs_name, dbase_file_abs_name, xRange, yRange, binCount.
    Returns None; on an out-of-range row it logs an error and aborts early.
    """
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif', cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)
    csvFile = open(cvs_file_abs_name, "rb")
    reader = csv.reader(csvFile, quoting=csv.QUOTE_NONE)
    valuesBySsAndResType = {}
    histJaninBySsAndResType = {}
    histJaninBySsAndCombinedResType = {}
    histJaninCtupleBySsAndResType = {}
    valuesByEntrySsAndResType = {}
    hrange = (xRange, yRange)
    for row in reader:
        # Example input rows:
        # 7a3h,A,VAL , 5,H, -62.8, -52.8
        # 7a3h,A,GLU , 7,H, -63.5, -41.6
        (entryId, _chainId, resType, _resNum, ssType, chi1, chi2, _max_bfactor) = row
        ssType = to3StateDssp(ssType)[0]
        resType = resType.strip()
        chi1 = floatParse(chi1.strip())
        chi2 = floatParse(chi2.strip())
        if isNaN(chi1) or isNaN(chi2):
            continue  # row lacks one of the two dihedrals
        if not inRange(chi1):
            nTerror("chi1 not in range for row: %s" % repr(row))
            return
        if not inRange(chi2):
            nTerror("chi2 not in range for row: %s" % repr(row))
            return
        if resType not in common20AADict:  # was: common20AADict.has_key(resType) (Py2-only idiom)
            nTdebug("Residue not in common 20 for row: %s" % repr(row))
            continue
        appendDeepByKeys(valuesBySsAndResType, chi1, ssType, resType, 'chi1')
        appendDeepByKeys(valuesByEntrySsAndResType, chi1, entryId, ssType, resType, 'chi1')
        appendDeepByKeys(valuesBySsAndResType, chi2, ssType, resType, 'chi2')
        appendDeepByKeys(valuesByEntrySsAndResType, chi2, entryId, ssType, resType, 'chi2')
    csvFile.close()  # explicit close instead of relying on del(reader)/GC
    os.unlink(cvs_file_abs_name)
    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            chi1 = valuesBySsAndResType[ssType][resType]['chi1']
            chi2 = valuesBySsAndResType[ssType][resType]['chi2']
            if not (chi1 and chi2):
                continue
            # x-axis gets chi2 so the [row][column] layout matches the imagery.
            hist2d, _xedges, _yedges = histogram2d(chi2, chi1, bins=binCount, range=hrange)
            setDeepByKeys(histJaninBySsAndResType, hist2d, ssType, resType)
            cTuple = getEnsembleAverageAndSigmaHis(hist2d)
            (c_av, c_sd, hisMin, hisMax) = cTuple
            cTuple += (str([ssType, resType]),)  # append the hash keys as a way of id.
            if c_sd is None:
                # Must be checked BEFORE the "%8.0f" debug format below:
                # "%8.0f" % None raises TypeError (bug fix vs. original order).
                nTdebug('Failed to get c_sd when testing not all residues are present in smaller sets.')
                continue
            nTdebug("For ssType %s residue type %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f" % (
                ssType, resType, c_av, c_sd, hisMin, hisMax))
            if c_sd == 0.:
                nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                continue
            setDeepByKeys(histJaninCtupleBySsAndResType, cTuple, ssType, resType)
    for ssType in valuesBySsAndResType.keys():
        chi1 = []
        chi2 = []
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue  # excluded from the combined histogram
            chi1 += valuesBySsAndResType[ssType][resType]['chi1']
            chi2 += valuesBySsAndResType[ssType][resType]['chi2']
        if chi1 and chi2:
            hist2d, _xedges, _yedges = histogram2d(chi2, chi1, bins=binCount, range=hrange)
            setDeepByKeys(histJaninBySsAndCombinedResType, hist2d, ssType)
    # shelve threw a verbose error on python 2.6.3 as per issue
    # http://code.google.com/p/cing/issues/detail?id=211 -- using cPickle instead.
    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase['histJaninBySsAndCombinedResType'] = histJaninBySsAndCombinedResType
    dbase['histJaninBySsAndResType'] = histJaninBySsAndResType
    dbase['histJaninCtupleBySsAndResType'] = histJaninCtupleBySsAndResType
    cPickle.dump(dbase, output, 2)
    output.close()
def main():
    """Collect chi1/chi2 (Janin) statistics from the Whatif CSV and pickle them.

    Produces 2D histograms per (ssType, resType), a PRO/GLY-free combined
    histogram per ssType, and average/sigma c-tuples per (ssType, resType).
    Uses module globals cingDirData, cvs_file_abs_name, dbase_file_abs_name,
    xRange, yRange, binCount. Aborts (returns None) on an out-of-range row.
    """
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif', cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)
    inputFile = open(cvs_file_abs_name, "rb")
    reader = csv.reader(inputFile, quoting=csv.QUOTE_NONE)
    valuesBySsAndResType = {}
    histJaninBySsAndResType = {}
    histJaninBySsAndCombinedResType = {}
    histJaninCtupleBySsAndResType = {}
    valuesByEntrySsAndResType = {}
    hrange = (xRange, yRange)
    for row in reader:
        # Rows look like: 7a3h,A,VAL , 5,H, -62.8, -52.8
        (entryId, _chainId, resType, _resNum, ssType, chi1, chi2, _max_bfactor) = row
        ssType = to3StateDssp(ssType)[0]
        resType = resType.strip()
        chi1 = floatParse(chi1.strip())
        chi2 = floatParse(chi2.strip())
        if isNaN(chi1) or isNaN(chi2):
            continue  # missing dihedral value
        if not inRange(chi1):
            nTerror("chi1 not in range for row: %s" % repr(row))
            return
        if not inRange(chi2):
            nTerror("chi2 not in range for row: %s" % repr(row))
            return
        if resType not in common20AADict:  # replaces Py2-only has_key()
            nTdebug("Residue not in common 20 for row: %s" % repr(row))
            continue
        appendDeepByKeys(valuesBySsAndResType, chi1, ssType, resType, 'chi1')
        appendDeepByKeys(valuesByEntrySsAndResType, chi1, entryId, ssType, resType, 'chi1')
        appendDeepByKeys(valuesBySsAndResType, chi2, ssType, resType, 'chi2')
        appendDeepByKeys(valuesByEntrySsAndResType, chi2, entryId, ssType, resType, 'chi2')
    inputFile.close()  # was: del(reader), which closed the handle only via GC
    os.unlink(cvs_file_abs_name)
    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            chi1 = valuesBySsAndResType[ssType][resType]['chi1']
            chi2 = valuesBySsAndResType[ssType][resType]['chi2']
            if not (chi1 and chi2):
                continue
            # chi2 is deliberately the x-axis; keeps [row][column] aligned with imagery.
            hist2d, _xedges, _yedges = histogram2d(chi2, chi1, bins=binCount, range=hrange)
            setDeepByKeys(histJaninBySsAndResType, hist2d, ssType, resType)
            cTuple = getEnsembleAverageAndSigmaHis(hist2d)
            (c_av, c_sd, hisMin, hisMax) = cTuple
            cTuple += (str([ssType, resType]),)  # append the hash keys as a way of id.
            if c_sd is None:
                # Guard moved ahead of the debug line: "%8.0f" % None would raise TypeError.
                nTdebug('Failed to get c_sd when testing not all residues are present in smaller sets.')
                continue
            nTdebug("For ssType %s residue type %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f" % (
                ssType, resType, c_av, c_sd, hisMin, hisMax))
            if c_sd == 0.:
                nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                continue
            setDeepByKeys(histJaninCtupleBySsAndResType, cTuple, ssType, resType)
    for ssType in valuesBySsAndResType.keys():
        chi1 = []
        chi2 = []
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue  # kept out of the combined histogram
            chi1 += valuesBySsAndResType[ssType][resType]['chi1']
            chi2 += valuesBySsAndResType[ssType][resType]['chi2']
        if chi1 and chi2:
            hist2d, _xedges, _yedges = histogram2d(chi2, chi1, bins=binCount, range=hrange)
            setDeepByKeys(histJaninBySsAndCombinedResType, hist2d, ssType)
    # shelve produced a verbose error on python 2.6.3 as per issue
    # https://github.com/VuisterLab/cing/issues/211 -- pickling directly instead.
    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase['histJaninBySsAndCombinedResType'] = histJaninBySsAndCombinedResType
    dbase['histJaninBySsAndResType'] = histJaninBySsAndResType
    dbase['histJaninCtupleBySsAndResType'] = histJaninCtupleBySsAndResType
    cPickle.dump(dbase, output, 2)
    output.close()
def main():
    """Build Ramachandran (phi vs psi) 2D histograms from the Whatif CSV dump.

    Produces histograms per (ssType, resType), per-ss combined histograms
    excluding PRO/GLY, one overall combined histogram, and average/sigma
    c-tuples; pickles all of them (protocol 2) to dbase_file_abs_name.
    Reads module globals: cvs_file_abs_name, dbase_file_abs_name, xRange,
    yRange, binCount, isRange360. Aborts (returns None) on a bad row.
    """
    cvs_file_abs_name_gz = cvs_file_abs_name + '.gz'
    gunzip(cvs_file_abs_name_gz)
    csvFile = open(cvs_file_abs_name, "rb")
    reader = csv.reader(csvFile, quoting=csv.QUOTE_NONE)
    valuesBySsAndResType = {}
    histRamaBySsAndResType = {}
    histRamaBySsAndCombinedResType = {}
    histRamaCtupleBySsAndResType = {}
    valuesByEntrySsAndResType = {}
    hrange = (xRange, yRange)
    rowCount = 0
    for row in reader:
        rowCount += 1
        # Example rows:
        # 7a3h,A,VAL , 5,H, -62.8, -52.8
        # 7a3h,A,GLU , 7,H, -63.5, -41.6
        (entryId, _chainId, resType, _resNum, ssType, phi, psi, _max_bfactor) = row
        ssType = to3StateDssp(ssType)[0]
        resType = resType.strip()
        phi = float(phi)
        psi = float(psi)
        if not (inRange(phi, isRange360=isRange360) and inRange(psi, isRange360=isRange360)):
            nTerror("phi and/or psi not in range for row: %s" % repr(row))
            return
        if resType not in common20AADict:  # was: common20AADict.has_key(resType) (Py2-only idiom)
            nTdebug("Residue not in common 20 for row: %s" % repr(row))
            rowCount -= 1  # keep the count limited to accepted residues
            continue
        appendDeepByKeys(valuesBySsAndResType, phi, ssType, resType, 'phi')
        appendDeepByKeys(valuesBySsAndResType, psi, ssType, resType, 'psi')
        appendDeepByKeys(valuesByEntrySsAndResType, phi, entryId, ssType, resType, 'phi')
        appendDeepByKeys(valuesByEntrySsAndResType, psi, entryId, ssType, resType, 'psi')
    csvFile.close()  # explicit close instead of relying on del(reader)/GC
    os.unlink(cvs_file_abs_name)
    nTdebug('Total number of included residues including PRO/GLY: %d' % rowCount)
    # Rescaling is disabled here; getRescaling(valuesByEntrySsAndResType) would
    # compute real values. Unity means zscaleHist would be a no-op anyway.
    (cAv, cSd) = (1.0, 1.0)
    nTdebug("Overall found av,sd: %r %r" % (cAv, cSd))
    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            # psi is deliberately the x-axis; keeps [row][column] aligned with imagery.
            hist2d, _xedges, _yedges = histogram2d(
                valuesBySsAndResType[ssType][resType]['psi'],
                valuesBySsAndResType[ssType][resType]['phi'],
                bins=binCount,
                range=hrange)
            setDeepByKeys(histRamaBySsAndResType, hist2d, ssType, resType)
            cTuple = getEnsembleAverageAndSigmaHis(hist2d)
            (c_av, c_sd, hisMin, hisMax) = cTuple
            cTuple += (str([ssType, resType]),)  # append the hash keys as a way of id.
            if c_sd is None:
                # Must be checked BEFORE the "%8.0f" debug format below:
                # "%8.0f" % None raises TypeError (bug fix vs. original order).
                nTdebug('Failed to get c_sd when testing not all residues are present in smaller sets.')
                continue
            nTdebug("For ssType %s residue type %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f" % (
                ssType, resType, c_av, c_sd, hisMin, hisMax))
            if c_sd == 0.:
                nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                continue
            setDeepByKeys(histRamaCtupleBySsAndResType, cTuple, ssType, resType)
    for ssType in valuesBySsAndResType.keys():
        phi = []
        psi = []
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue  # excluded from the combined histograms
            phi += valuesBySsAndResType[ssType][resType]['phi']
            psi += valuesBySsAndResType[ssType][resType]['psi']
        hist2d, _xedges, _yedges = histogram2d(psi, phi, bins=binCount, range=hrange)
        setDeepByKeys(histRamaBySsAndCombinedResType, hist2d, ssType)
    # Overall histogram over all ss types, still without PRO/GLY.
    phi = []
    psi = []
    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue
            phi += valuesBySsAndResType[ssType][resType]['phi']
            psi += valuesBySsAndResType[ssType][resType]['psi']
    nTdebug('Total number of residues without PRO/GLY: %d' % len(psi))
    hist2d, _xedges, _yedges = histogram2d(psi, phi, bins=binCount, range=hrange)
    nTdebug('hist2d : \n%s' % hist2d)
    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase['histRamaCombined'] = hist2d
    dbase['histRamaBySsAndCombinedResType'] = histRamaBySsAndCombinedResType
    dbase['histRamaBySsAndResType'] = histRamaBySsAndResType
    dbase['histRamaCtupleBySsAndResType'] = histRamaCtupleBySsAndResType
    # Protocol 2 on purpose: -1 (highest) caused issue 239. NB 2 is the highest
    # listed protocol; omitting the argument would fall back to protocol 0.
    cPickle.dump(dbase, output, 2)
    output.close()
def main():
    """Collect phi/psi (Ramachandran) statistics from the Whatif CSV and pickle them.

    Outputs: per-(ssType, resType) 2D histograms, per-ssType combined
    histograms minus PRO/GLY, one overall combined histogram, and
    average/sigma c-tuples. Uses module globals cvs_file_abs_name,
    dbase_file_abs_name, xRange, yRange, binCount, isRange360.
    Returns None; an out-of-range row aborts the run with an error.
    """
    cvs_file_abs_name_gz = cvs_file_abs_name + '.gz'
    gunzip(cvs_file_abs_name_gz)
    inputFile = open(cvs_file_abs_name, "rb")
    reader = csv.reader(inputFile, quoting=csv.QUOTE_NONE)
    valuesBySsAndResType = {}
    histRamaBySsAndResType = {}
    histRamaBySsAndCombinedResType = {}
    histRamaCtupleBySsAndResType = {}
    valuesByEntrySsAndResType = {}
    hrange = (xRange, yRange)
    rowCount = 0
    for row in reader:
        rowCount += 1
        # Rows look like: 7a3h,A,VAL , 5,H, -62.8, -52.8
        (entryId, _chainId, resType, _resNum, ssType, phi, psi, _max_bfactor) = row
        ssType = to3StateDssp(ssType)[0]
        resType = resType.strip()
        phi = float(phi)
        psi = float(psi)
        if not (inRange(phi, isRange360=isRange360) and inRange(psi, isRange360=isRange360)):
            nTerror("phi and/or psi not in range for row: %s" % repr(row))
            return
        if resType not in common20AADict:  # replaces Py2-only has_key()
            nTdebug("Residue not in common 20 for row: %s" % repr(row))
            rowCount -= 1  # only count accepted residues
            continue
        appendDeepByKeys(valuesBySsAndResType, phi, ssType, resType, 'phi')
        appendDeepByKeys(valuesBySsAndResType, psi, ssType, resType, 'psi')
        appendDeepByKeys(valuesByEntrySsAndResType, phi, entryId, ssType, resType, 'phi')
        appendDeepByKeys(valuesByEntrySsAndResType, psi, entryId, ssType, resType, 'psi')
    inputFile.close()  # was: del(reader), closing the handle only via GC
    os.unlink(cvs_file_abs_name)
    nTdebug('Total number of included residues including PRO/GLY: %d' % rowCount)
    # Rescaling disabled: real values would come from getRescaling(valuesByEntrySsAndResType).
    (cAv, cSd) = (1.0, 1.0)
    nTdebug("Overall found av,sd: %r %r" % (cAv, cSd))
    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            # psi on the x-axis keeps [row][column] aligned with the imagery.
            hist2d, _xedges, _yedges = histogram2d(
                valuesBySsAndResType[ssType][resType]['psi'],
                valuesBySsAndResType[ssType][resType]['phi'],
                bins=binCount,
                range=hrange)
            setDeepByKeys(histRamaBySsAndResType, hist2d, ssType, resType)
            cTuple = getEnsembleAverageAndSigmaHis(hist2d)
            (c_av, c_sd, hisMin, hisMax) = cTuple
            cTuple += (str([ssType, resType]),)  # append the hash keys as a way of id.
            if c_sd is None:
                # Guard moved ahead of the debug line: "%8.0f" % None would raise TypeError.
                nTdebug('Failed to get c_sd when testing not all residues are present in smaller sets.')
                continue
            nTdebug("For ssType %s residue type %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f" % (
                ssType, resType, c_av, c_sd, hisMin, hisMax))
            if c_sd == 0.:
                nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                continue
            setDeepByKeys(histRamaCtupleBySsAndResType, cTuple, ssType, resType)
    for ssType in valuesBySsAndResType.keys():
        phi = []
        psi = []
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue  # not part of the combined histograms
            phi += valuesBySsAndResType[ssType][resType]['phi']
            psi += valuesBySsAndResType[ssType][resType]['psi']
        hist2d, _xedges, _yedges = histogram2d(psi, phi, bins=binCount, range=hrange)
        setDeepByKeys(histRamaBySsAndCombinedResType, hist2d, ssType)
    # One histogram over everything, still without PRO/GLY.
    phi = []
    psi = []
    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue
            phi += valuesBySsAndResType[ssType][resType]['phi']
            psi += valuesBySsAndResType[ssType][resType]['psi']
    nTdebug('Total number of residues without PRO/GLY: %d' % len(psi))
    hist2d, _xedges, _yedges = histogram2d(psi, phi, bins=binCount, range=hrange)
    nTdebug('hist2d : \n%s' % hist2d)
    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase['histRamaCombined'] = hist2d
    dbase['histRamaBySsAndCombinedResType'] = histRamaBySsAndCombinedResType
    dbase['histRamaBySsAndResType'] = histRamaBySsAndResType
    dbase['histRamaCtupleBySsAndResType'] = histRamaCtupleBySsAndResType
    # Protocol 2 deliberately: -1/HIGHEST_PROTOCOL caused issue 239, and an
    # omitted protocol argument would mean protocol 0.
    cPickle.dump(dbase, output, 2)
    output.close()
def main():
    """Build 1D d1-dihedral histograms keyed on secondary structure and on the
    (current, previous) residue-type pair, plus cross-product c-tuples, and
    pickle everything (protocol 2) to dbase_file_abs_name.

    Reads module globals: cingDirData, cvs_file_abs_name, dbase_file_abs_name,
    lineCountMax, xRange, binCount, BFACTOR_COLUMN, IDX_COLUMN,
    DEFAULT_MAX_BFACTOR, DEFAULT_BFACTOR_PERCENTAGE_FILTER.
    Returns None; aborts with an error on unknown residue types or
    out-of-range d1 values.
    """
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif', cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)
    csvFile = open(cvs_file_abs_name, "rb")
    reader = csv.reader(csvFile, quoting=csv.QUOTE_NONE)
    valueBySs0AndResTypes = {}  # keys are SSi, RTi, RTi-1
    valueBySs1AndResTypes = {}  # keys are SSi-1, RTi, RTi-1
    valueByResTypes = {}
    valueBySs0 = {}  # keys are SSi
    valueBySs1 = {}  # keys are SSi-1
    histd1CtupleBySsAndResTypes = {}
    value = []  # NB is an array without being keyed.
    histd1BySs0AndResTypes = {}  # keys are SSi, RTi, RTi-1
    histd1BySs1AndResTypes = {}  # keys are SSi-1, RTi, RTi-1
    histd1ByResTypes = {}
    histd1BySs0 = {}
    histd1BySs1 = {}
    linesByEntry = {}
    lineCount = 0
    for row in reader:
        lineCount += 1
        if lineCount > lineCountMax:
            break
        entryId = row[0]
        if entryId not in linesByEntry:  # was: linesByEntry.has_key(entryId)
            linesByEntry[entryId] = []
        linesByEntry[entryId].append(row)
    csvFile.close()  # explicit close; the original left this to GC
    skippedResTypes = []
    entryIdList = sorted(linesByEntry.keys())
    # Pre-filter: per entry, drop residues above the absolute B-factor cap and
    # the worst DEFAULT_BFACTOR_PERCENTAGE_FILTER percent.
    for entryId2 in entryIdList:
        lineList = linesByEntry[entryId2]
        for idx, line in enumerate(lineList):
            line.append(idx)  # remember the original position (IDX_COLUMN)
        lineListSorted = NTsort(lineList, BFACTOR_COLUMN, inplace=False)
        n = len(lineListSorted)
        bad_count = int(round((n * DEFAULT_BFACTOR_PERCENTAGE_FILTER) / 100.))
        to_remove_count = n - bad_count
        # set() instead of list: O(1) membership tests in the deletion loop below.
        badIdxSet = set(lineItem[IDX_COLUMN] for lineItem in lineListSorted[to_remove_count:n])
        # Delete from the end so earlier indices stay valid while deleting.
        for i in reversed(range(n)):
            lineItem = lineList[i]
            max_bfactor = float(lineItem[BFACTOR_COLUMN])
            if max_bfactor > DEFAULT_MAX_BFACTOR:
                del lineList[i]
                continue
            if i in badIdxSet:
                del lineList[i]
                continue
        removed_count = n - len(lineList)
        if removed_count < bad_count:
            nTwarning("Failed to remove at least %d residues" % bad_count)
    for entryId2 in entryIdList:
        prevChainId = None
        prevResType = None
        prevResNum = None
        prevSsType = None
        for _r, row in enumerate(linesByEntry[entryId2]):
            # Rows look like (with the appended index in the last column):
            # 1zzk,A,GLN , 17,E, 205.2, 193.6
            (entryId, chainId, resType, resNum, ssType, d1, _d2, _max_bfactor, _idx) = row
            resNum = int(resNum)
            ssType = to3StateDssp(ssType)[0]
            resType = resType.strip()
            db = NTdb.getResidueDefByName(resType)
            if not db:
                nTerror("resType not in db: %s" % resType)
                return
            resType = db.nameDict['IUPAC']
            d1 = floatParse(d1.strip())
            if isNaN(d1):
                continue  # NB: prev* deliberately not updated here (as originally)
            if not inRange(d1):
                nTerror("d1 not in range for row: %s" % str(row))
                return
            if resType not in common20AAList:
                if resType not in skippedResTypes:
                    skippedResTypes.append(resType)
                continue
            # Only pair with the previous residue when truly sequential in the same chain.
            if isSibling(chainId, resNum, prevChainId, prevResNum):
                appendDeepByKeys(valueBySs0AndResTypes, d1, ssType, resType, prevResType)
                appendDeepByKeys(valueBySs1AndResTypes, d1, prevSsType, resType, prevResType)
                appendDeepByKeys(valueByResTypes, d1, resType, prevResType)
                appendDeepByKeys(valueBySs0, d1, ssType)
                appendDeepByKeys(valueBySs1, d1, prevSsType)
                value.append(d1)
            prevResType = resType
            prevResNum = resNum
            prevChainId = chainId
            prevSsType = ssType
    os.unlink(cvs_file_abs_name)
    nTmessage("Skipped skippedResTypes: %r" % skippedResTypes)
    nTmessage("Got count of values: %r" % len(value))
    # Fill FOUR types of histograms.
    # TODO: filter differently for pro/gly
    keyListSorted1 = sorted(valueBySs0AndResTypes.keys())
    for isI in (True, False):
        # isI selects keying on the current residue's ss type (Ss0) versus the
        # previous residue's (Ss1).
        if isI:
            valueBySs = valueBySs0
            valueBySsAndResTypes = valueBySs0AndResTypes
            histd1BySs = histd1BySs0
            histd1BySsAndResTypes = histd1BySs0AndResTypes
        else:
            valueBySs = valueBySs1
            valueBySsAndResTypes = valueBySs1AndResTypes
            histd1BySs = histd1BySs1
            histd1BySsAndResTypes = histd1BySs1AndResTypes
        for ssType in keyListSorted1:
            d1List = valueBySs[ssType]
            if not d1List:
                nTerror("Expected d1List from valueBySs[%s]" % (ssType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            nTmessage("Count %6d in valueBySs[%s]" % (sum(hist1d), ssType))
            setDeepByKeys(histd1BySs, hist1d, ssType)
            keyListSorted2 = sorted(valueBySsAndResTypes[ssType].keys())
            for resType in keyListSorted2:
                keyListSorted3 = sorted(valueBySsAndResTypes[ssType][resType].keys())
                for prevResType in keyListSorted3:
                    d1List = valueBySsAndResTypes[ssType][resType][prevResType]
                    if not d1List:
                        nTerror("Expected d1List from valueBySsAndResTypes[%s][%s][%s]" % (ssType, resType, prevResType))
                        continue
                    hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
                    setDeepByKeys(histd1BySsAndResTypes, hist1d, ssType, resType, prevResType)
        # Now that they are all in we can redo this.
        # Delete the reference -not- the object.
        valueBySs = None
        valueBySsAndResTypes = None
        histd1BySs = None
        histd1BySsAndResTypes = None
        for ssType in keyListSorted1:
            # NOTE(review): keyListSorted2 is left over from the last ssType of the
            # loop above; presumably it should be recomputed per ssType. Preserved
            # as-is to keep behavior identical -- confirm with the original author.
            for resType in keyListSorted2:
                keyListSorted3 = sorted(valueBySs0AndResTypes[ssType][resType].keys())
                for resTypePrev in keyListSorted3:
                    keyListSorted4 = keyListSorted3[:]  # take a copy
                    for resTypeNext in keyListSorted4:
                        hist1 = getDeepByKeys(histd1BySs0AndResTypes, ssType, resType, resTypePrev)  # x-axis
                        # This was a bug! It needs to be hashed on the ssType of
                        # resType -not- on resTypeNext.
                        hist2 = getDeepByKeys(histd1BySs1AndResTypes, ssType, resTypeNext, resType)
                        if hist1 is None:
                            nTdebug('skipping for hist1 is empty for [%s] [%s] [%s]' % (ssType, resTypePrev, resType))
                            continue
                        if hist2 is None:
                            nTdebug('skipping for hist2 is empty for [%s] [%s] [%s]' % (ssType, resType, resTypeNext))
                            continue
                        m1 = mat(hist1, dtype='float')
                        m2 = mat(hist2, dtype='float')
                        m2 = m2.transpose()  # pylint: disable=E1101
                        hist2d = multiply(m1, m2)
                        cTuple = getEnsembleAverageAndSigmaHis(hist2d)
                        (_c_av, c_sd, _hisMin, _hisMax) = cTuple  # @UnusedVariable
                        cTuple += (str([ssType, resType, resTypePrev, resTypeNext]),)  # append the hash keys as a way of id.
                        if c_sd is None:
                            nTdebug('Failed to get c_sd when testing not all residues are present in smaller sets.')
                            continue
                        if c_sd == 0.:
                            nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                            continue
                        setDeepByKeys(histd1CtupleBySsAndResTypes, cTuple, ssType, resType, resTypePrev, resTypeNext)
        # end for isI
    keyListSorted1 = sorted(valueByResTypes.keys())
    for resType in keyListSorted1:
        keyListSorted2 = sorted(valueByResTypes[resType].keys())
        for prevResType in keyListSorted2:
            d1List = valueByResTypes[resType][prevResType]
            if not d1List:
                nTerror("Expected d1List from valueByResTypes[%s][%s]" % (resType, prevResType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            setDeepByKeys(histd1ByResTypes, hist1d, resType, prevResType)
    histd1, _bins, _patches = hist(value, bins=binCount, range=xRange)
    nTmessage("Count %6d in value" % sum(histd1))
    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase['histd1BySs0AndResTypes'] = histd1BySs0AndResTypes  # 92 kb uncompressed in the case of ~1000 lines only
    dbase['histd1BySs1AndResTypes'] = histd1BySs1AndResTypes
    dbase['histd1CtupleBySsAndResTypes'] = histd1CtupleBySsAndResTypes
    dbase['histd1ByResTypes'] = histd1ByResTypes  # 56 kb
    dbase['histd1BySs0'] = histd1BySs0  # 4 kb
    dbase['histd1BySs1'] = histd1BySs1
    dbase['histd1'] = histd1  # 4 kb
    cPickle.dump(dbase, output, 2)
    output.close()
def main():
    """Collect d1-dihedral statistics keyed on ss type and (current, previous)
    residue-type pairs from the Whatif CSV and pickle the histograms.

    Consumes module globals cingDirData, cvs_file_abs_name,
    dbase_file_abs_name, lineCountMax, xRange, binCount, BFACTOR_COLUMN,
    IDX_COLUMN, DEFAULT_MAX_BFACTOR, DEFAULT_BFACTOR_PERCENTAGE_FILTER.
    Returns None; an unknown residue or out-of-range d1 aborts the run.
    """
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif', cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)
    inputFile = open(cvs_file_abs_name, "rb")
    reader = csv.reader(inputFile, quoting=csv.QUOTE_NONE)
    valueBySs0AndResTypes = {}  # keys are SSi, RTi, RTi-1
    valueBySs1AndResTypes = {}  # keys are SSi-1, RTi, RTi-1
    valueByResTypes = {}
    valueBySs0 = {}  # keys are SSi
    valueBySs1 = {}  # keys are SSi-1
    histd1CtupleBySsAndResTypes = {}
    value = []  # NB is an array without being keyed.
    histd1BySs0AndResTypes = {}  # keys are SSi, RTi, RTi-1
    histd1BySs1AndResTypes = {}  # keys are SSi-1, RTi, RTi-1
    histd1ByResTypes = {}
    histd1BySs0 = {}
    histd1BySs1 = {}
    linesByEntry = {}
    lineCount = 0
    for row in reader:
        lineCount += 1
        if lineCount > lineCountMax:
            break
        entryId = row[0]
        if entryId not in linesByEntry:  # replaces Py2-only has_key()
            linesByEntry[entryId] = []
        linesByEntry[entryId].append(row)
    inputFile.close()  # explicit close; previously left to garbage collection
    skippedResTypes = []
    entryIdList = sorted(linesByEntry.keys())
    # Pre-filter per entry: drop residues over the absolute B-factor cap plus
    # the worst DEFAULT_BFACTOR_PERCENTAGE_FILTER percent of each entry.
    for entryId2 in entryIdList:
        lineList = linesByEntry[entryId2]
        for idx, line in enumerate(lineList):
            line.append(idx)  # original position, read back via IDX_COLUMN
        lineListSorted = NTsort(lineList, BFACTOR_COLUMN, inplace=False)
        n = len(lineListSorted)
        bad_count = int(round((n * DEFAULT_BFACTOR_PERCENTAGE_FILTER) / 100.))
        to_remove_count = n - bad_count
        # A set gives O(1) membership checks in the deletion loop (was a list).
        badIdxSet = set(lineItem[IDX_COLUMN] for lineItem in lineListSorted[to_remove_count:n])
        # Walk backwards so deletions never shift the indices still to visit.
        for i in reversed(range(n)):
            lineItem = lineList[i]
            max_bfactor = float(lineItem[BFACTOR_COLUMN])
            if max_bfactor > DEFAULT_MAX_BFACTOR:
                del lineList[i]
                continue
            if i in badIdxSet:
                del lineList[i]
                continue
        removed_count = n - len(lineList)
        if removed_count < bad_count:
            nTwarning("Failed to remove at least %d residues" % bad_count)
    for entryId2 in entryIdList:
        prevChainId = None
        prevResType = None
        prevResNum = None
        prevSsType = None
        for _r, row in enumerate(linesByEntry[entryId2]):
            # e.g. 1zzk,A,GLN , 17,E, 205.2, 193.6 (index appended as last column)
            (entryId, chainId, resType, resNum, ssType, d1, _d2, _max_bfactor, _idx) = row
            resNum = int(resNum)
            ssType = to3StateDssp(ssType)[0]
            resType = resType.strip()
            db = NTdb.getResidueDefByName(resType)
            if not db:
                nTerror("resType not in db: %s" % resType)
                return
            resType = db.nameDict['IUPAC']
            d1 = floatParse(d1.strip())
            if isNaN(d1):
                continue  # NB: prev* intentionally not updated (original behavior)
            if not inRange(d1):
                nTerror("d1 not in range for row: %s" % str(row))
                return
            if resType not in common20AAList:
                if resType not in skippedResTypes:
                    skippedResTypes.append(resType)
                continue
            # Pair with the previous residue only when sequential in the same chain.
            if isSibling(chainId, resNum, prevChainId, prevResNum):
                appendDeepByKeys(valueBySs0AndResTypes, d1, ssType, resType, prevResType)
                appendDeepByKeys(valueBySs1AndResTypes, d1, prevSsType, resType, prevResType)
                appendDeepByKeys(valueByResTypes, d1, resType, prevResType)
                appendDeepByKeys(valueBySs0, d1, ssType)
                appendDeepByKeys(valueBySs1, d1, prevSsType)
                value.append(d1)
            prevResType = resType
            prevResNum = resNum
            prevChainId = chainId
            prevSsType = ssType
    os.unlink(cvs_file_abs_name)
    nTmessage("Skipped skippedResTypes: %r" % skippedResTypes)
    nTmessage("Got count of values: %r" % len(value))
    # Fill FOUR types of histograms.
    # TODO: filter differently for pro/gly
    keyListSorted1 = sorted(valueBySs0AndResTypes.keys())
    for isI in (True, False):
        # isI == True keys on the current residue's ss type (Ss0); False keys
        # on the previous residue's (Ss1).
        if isI:
            valueBySs = valueBySs0
            valueBySsAndResTypes = valueBySs0AndResTypes
            histd1BySs = histd1BySs0
            histd1BySsAndResTypes = histd1BySs0AndResTypes
        else:
            valueBySs = valueBySs1
            valueBySsAndResTypes = valueBySs1AndResTypes
            histd1BySs = histd1BySs1
            histd1BySsAndResTypes = histd1BySs1AndResTypes
        for ssType in keyListSorted1:
            d1List = valueBySs[ssType]
            if not d1List:
                nTerror("Expected d1List from valueBySs[%s]" % (ssType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            nTmessage("Count %6d in valueBySs[%s]" % (sum(hist1d), ssType))
            setDeepByKeys(histd1BySs, hist1d, ssType)
            keyListSorted2 = sorted(valueBySsAndResTypes[ssType].keys())
            for resType in keyListSorted2:
                keyListSorted3 = sorted(valueBySsAndResTypes[ssType][resType].keys())
                for prevResType in keyListSorted3:
                    d1List = valueBySsAndResTypes[ssType][resType][prevResType]
                    if not d1List:
                        nTerror("Expected d1List from valueBySsAndResTypes[%s][%s][%s]" % (ssType, resType, prevResType))
                        continue
                    hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
                    setDeepByKeys(histd1BySsAndResTypes, hist1d, ssType, resType, prevResType)
        # Now that they are all in we can redo this.
        # Delete the reference -not- the object.
        valueBySs = None
        valueBySsAndResTypes = None
        histd1BySs = None
        histd1BySsAndResTypes = None
        for ssType in keyListSorted1:
            # NOTE(review): keyListSorted2 here still holds the keys of the LAST
            # ssType from the loop above; it likely should be recomputed per
            # ssType. Kept unchanged to preserve behavior -- verify upstream.
            for resType in keyListSorted2:
                keyListSorted3 = sorted(valueBySs0AndResTypes[ssType][resType].keys())
                for resTypePrev in keyListSorted3:
                    keyListSorted4 = keyListSorted3[:]  # take a copy
                    for resTypeNext in keyListSorted4:
                        hist1 = getDeepByKeys(histd1BySs0AndResTypes, ssType, resType, resTypePrev)  # x-axis
                        # This was a bug! It needs to be hashed on the ssType of
                        # resType -not- on resTypeNext.
                        hist2 = getDeepByKeys(histd1BySs1AndResTypes, ssType, resTypeNext, resType)
                        if hist1 is None:
                            nTdebug('skipping for hist1 is empty for [%s] [%s] [%s]' % (ssType, resTypePrev, resType))
                            continue
                        if hist2 is None:
                            nTdebug('skipping for hist2 is empty for [%s] [%s] [%s]' % (ssType, resType, resTypeNext))
                            continue
                        m1 = mat(hist1, dtype='float')
                        m2 = mat(hist2, dtype='float')
                        m2 = m2.transpose()  # pylint: disable=E1101
                        hist2d = multiply(m1, m2)
                        cTuple = getEnsembleAverageAndSigmaHis(hist2d)
                        (_c_av, c_sd, _hisMin, _hisMax) = cTuple  # @UnusedVariable
                        cTuple += (str([ssType, resType, resTypePrev, resTypeNext]),)  # append the hash keys as a way of id.
                        if c_sd is None:
                            nTdebug('Failed to get c_sd when testing not all residues are present in smaller sets.')
                            continue
                        if c_sd == 0.:
                            nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                            continue
                        setDeepByKeys(histd1CtupleBySsAndResTypes, cTuple, ssType, resType, resTypePrev, resTypeNext)
        # end for isI
    keyListSorted1 = sorted(valueByResTypes.keys())
    for resType in keyListSorted1:
        keyListSorted2 = sorted(valueByResTypes[resType].keys())
        for prevResType in keyListSorted2:
            d1List = valueByResTypes[resType][prevResType]
            if not d1List:
                nTerror("Expected d1List from valueByResTypes[%s][%s]" % (resType, prevResType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            setDeepByKeys(histd1ByResTypes, hist1d, resType, prevResType)
    histd1, _bins, _patches = hist(value, bins=binCount, range=xRange)
    nTmessage("Count %6d in value" % sum(histd1))
    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase['histd1BySs0AndResTypes'] = histd1BySs0AndResTypes  # 92 kb uncompressed in the case of ~1000 lines only
    dbase['histd1BySs1AndResTypes'] = histd1BySs1AndResTypes
    dbase['histd1CtupleBySsAndResTypes'] = histd1CtupleBySsAndResTypes
    dbase['histd1ByResTypes'] = histd1ByResTypes  # 56 kb
    dbase['histd1BySs0'] = histd1BySs0  # 4 kb
    dbase['histd1BySs1'] = histd1BySs1
    dbase['histd1'] = histd1  # 4 kb
    cPickle.dump(dbase, output, 2)
    output.close()