def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit, tarpath, argseries, all=False, rep=False):
    '''Build a HighThroughputSoftFile holding the series plus one sample stanza per experiment ID.

    Parameters (described only as far as this function uses them):
        compositeTrack -- object providing .files (looked up by file name),
            .database, .organism and .url
        cv -- mapping of term -> mapping of attributes, indexed as cv[term][attribute]
        expIds -- mapping of experiment ID -> non-empty sequence of metadata stanzas
        expVars -- list of metadata variables; concatenated with mdbWhitelist to
            build the sample characteristics
        geoMapping -- mapping of experiment ID -> GEO accession; the value
            'Inconsistent' is treated as "no accession"
        series -- mapping; its 'geoSeriesAccession' entry is used when present
        datatype -- object providing .molecule, .strategy, .source and .selection
        replace -- mapping of SOFT field -> sequence; only element [0] of
            '!Sample_instrument_model' is read here
        audit -- when true, print audit messages about instrument models
        tarpath -- prefix under which tarred raw files are extracted; it is
            concatenated directly with the archive's base name, so it must end
            with a path separator
        argseries -- when true, return right after the series has been created
        all -- when true, skip raw files and supplementary checksums, and fall
            back to the stanza's geoSampleAccession (NOTE: shadows the builtin
            all(); the name is kept because callers may pass it by keyword)
        rep -- passed through to sampleTitle

    Returns a tuple (softfile, fileList) where fileList holds every raw and
    supplementary file object referenced by the samples.

    Raises IOError when a tarred raw file is found but tarpath is None, and
    KeyError when a cell protocol entry has no 'key:value' form.
    '''
    # Imported locally so this function brings its own dependency into scope
    # without touching the file-level import block.
    import subprocess

    print('Creating HighThroughput soft file')

    softfile = HighThroughputSoftFile()
    fileList = list()

    createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit, argseries, all)

    if argseries:
        return softfile, fileList

    # Raw-file extensions whose SOFT file type differs from the extension itself.
    rawFileTypes = {
        'txt': 'fastq',
        'csfasta': 'SOLiD_native_csfasta',
        'csqual': 'SOLiD_native_qual',
    }

    for idNum in expIds.iterkeys():

        expId = expIds[idNum]
        firstStanza = expId[0]
        if not all:
            print('Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')')
        sample = HighThroughputSampleStanza(softfile)

        sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1, rep)
        sample['!Sample_type'] = 'SRA'
        sample['!Sample_title'] = sample['^SAMPLE']

        if 'geoSeriesAccession' in series:
            sample['!Sample_series_id'] = series['geoSeriesAccession']

        # Figure out if the instrument model is consistent across the entire
        # sample; None means "unknown or inconsistent".
        instrumentModel = None
        for stanza in expId:
            if 'seqPlatform' not in stanza:
                continue
            stanzaModel = submission.instrumentModels[stanza['seqPlatform']]
            if instrumentModel is None:
                instrumentModel = stanzaModel
            elif instrumentModel != stanzaModel:
                instrumentModel = None
                if audit:
                    print('expId' + str(expId) + ': inconsistent instrument model')
                # Stop at the first mismatch so a later stanza cannot set the
                # model again.
                break

        # ---- raw files ----
        count = 1
        for stanza in expId:

            for fname in stanza['fileName'].split(','):

                trackFile = compositeTrack.files[fname]
                rawFiles = list()

                if trackFile.extension == 'fasta':
                    print('WARNING: fastas detected!!!')

                if not isRawFile(trackFile):
                    continue
                if all:
                    continue

                if trackFile.name.endswith(('.tgz', '.tar.gz')):

                    if tarpath is None:
                        raise IOError('this track contains tarred fastqs. Please specify a path through the -z option')
                    dirname = tarpath + trackFile.name.split('.')[0] + '/'
                    if os.path.exists(dirname):
                        print(dirname + ' already exists, so not unzipping')
                    else:
                        print('creating ' + dirname + '...')
                        os.mkdir(dirname)
                        # Argument lists (no shell) keep paths containing spaces
                        # or shell metacharacters intact.
                        subprocess.call(['tar', '-xf', trackFile.path + trackFile.name, '-C', dirname])

                    # First pass: gzip any uncompressed reads.
                    for root, dirnames, filenames in os.walk(dirname):
                        for filename in filenames:
                            if 'reject' in filename or 'md5sum' in filename:
                                continue
                            if filename.endswith(('.fastq', '.txt')):
                                print('gzipping ' + filename)
                                subprocess.call(['gzip', root + '/' + filename])

                    # Second pass: collect the extracted files, reusing md5sums
                    # already recorded in md5sum.txt and recording new ones.
                    for root, dirnames, filenames in os.walk(dirname):
                        md5Path = root + '/md5sum.txt'
                        rootmd5s = None
                        if os.path.isfile(md5Path):
                            rootmd5s = encodeUtils.readMd5sums(md5Path)
                        for filename in filenames:
                            if 'reject' in filename or 'md5sum' in filename:
                                continue
                            fullPath = root + '/' + filename
                            print(fullPath)
                            if rootmd5s is not None and filename in rootmd5s:
                                newmd5 = rootmd5s[filename]
                            else:
                                newmd5 = encodeUtils.hashFile(fullPath)
                                encodeUtils.writeMd5sums(md5Path, filename, newmd5)
                            rawFiles.append(track.TrackFile(fullPath, newmd5))

                else:
                    rawFiles.append(trackFile)

                for f in rawFiles:
                    suffix = str(count)

                    sample['!Sample_raw_file_' + suffix] = linkName(f, compositeTrack)
                    sample['!Sample_raw_file_type_' + suffix] = rawFileTypes.get(f.extension, f.extension)
                    sample['!Sample_raw_file_checksum_' + suffix] = f.md5sum

                    # Per-file model only when no sample-wide model was found.
                    if instrumentModel is None and 'seqPlatform' in stanza:
                        sample['!Sample_raw_file_instrument_model_' + suffix] = submission.instrumentModels[stanza['seqPlatform']]

                    fileList.append(f)
                    count += 1

        # ---- supplementary files and pooled metadata ----
        count = 1
        pooledStanza = dict()
        for stanza in expId:

            for fname in stanza['fileName'].split(','):
                trackFile = compositeTrack.files[fname]

                if not isSupplementaryFile(trackFile):
                    continue
                suffix = str(count)

                sample['!Sample_supplementary_file_' + suffix] = linkName(trackFile, compositeTrack)

                if not all:
                    if trackFile.md5sum is not None:
                        sample['!Sample_supplementary_file_checksum_' + suffix] = trackFile.md5sum

                sample['!Sample_supplementary_file_build_' + suffix] = compositeTrack.database

                if instrumentModel is None and 'seqPlatform' in stanza:
                    sample['!Sample_supplementary_file_instrument_model_' + suffix] = submission.instrumentModels[stanza['seqPlatform']]

                fileList.append(trackFile)
                count += 1

            # Stanzas carrying an objStatus do not contribute pooled metadata.
            if 'objStatus' in stanza:
                continue
            for k in stanza.iterkeys():
                if k not in pooledStanza:
                    pooledStanza[k] = set()
                pooledStanza[k].add(stanza[k])

        # Collapse each value set into one comma-separated string; sorting makes
        # the order (and therefore the generated file) reproducible between runs.
        for k in pooledStanza:
            pooledStanza[k] = ','.join(sorted(pooledStanza[k]))

        if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
            sample['!Sample_geo_accession'] = geoMapping[idNum]
        elif all and 'geoSampleAccession' in pooledStanza:
            sample['!Sample_geo_accession'] = pooledStanza['geoSampleAccession']

        sample['!Sample_source_name'] = pooledStanza['cell']
        sample['!Sample_organism'] = compositeTrack.organism

        # ---- characteristics ----
        sample['!Sample_characteristics'] = list()
        characteristics = sample['!Sample_characteristics']
        allVars = expVars + mdbWhitelist

        for var in allVars:
            if var not in pooledStanza:
                continue
            value = pooledStanza[var]
            characteristics.append(var + ': ' + value)

            # cvPretend can redirect a specific "var value" pair to another
            # cvDetails entry.
            detailKey = var
            for pretend in cvPretend.iterkeys():
                if var + ' ' + value == pretend:
                    detailKey = cvPretend[pretend]

            if detailKey in cvDetails:
                for cvVar in cvDetails[detailKey]:
                    if cvVar in cvOverride and cvVar in pooledStanza:
                        characteristics.append(var + ' ' + cvVar + ': ' + pooledStanza[cvVar])
                    elif cvVar in cv[value]:
                        characteristics.append(var + ' ' + cvVar + ': ' + cv[value][cvVar])
            else:
                for cvVar in cvDefaults:
                    if value in cv and cvVar in cv[value]:
                        characteristics.append(var + ' ' + cvVar + ': ' + cv[value][cvVar])

        cellTerm = cv[pooledStanza['cell']]
        sample['!Sample_biomaterial_provider'] = cellTerm['vendorName']

        if 'treatment' in pooledStanza:
            sample['!Sample_treatment_protocol'] = pooledStanza['treatment']

        if 'protocol' in cellTerm:
            for protocol in cellTerm['protocol'].split(' '):
                if protocol == 'missing':
                    continue
                if ':' not in protocol:
                    raise KeyError(protocol + ' is not valid')
                key, val = protocol.split(':')
                if key == 'ENCODE' or key == cv[pooledStanza['lab']]['labPi']:
                    sample['!Sample_growth_protocol'] = val

        # ---- molecule / library ----
        if datatype.molecule == 'RNA':
            if 'rnaExtract' not in pooledStanza:
                sample['!Sample_molecule'] = 'total RNA'
            elif pooledStanza['rnaExtract'] in submission.rnaExtractMapping:
                sample['!Sample_molecule'] = submission.rnaExtractMapping[pooledStanza['rnaExtract']]
            # NOTE(review): this lookup raises KeyError when the stanzas have no
            # 'localization', and no molecule is set when nothing matches --
            # confirm whether that is intended.
            elif pooledStanza['localization'] in submission.localizationMapping:
                sample['!Sample_molecule'] = submission.localizationMapping[pooledStanza['localization']]
        else:
            sample['!Sample_molecule'] = datatype.molecule

        if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown':
            sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (submission.instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url)
        else:
            sample['!Sample_extract_protocol'] = compositeTrack.url
        sample['!Sample_library_strategy'] = datatype.strategy
        sample['!Sample_library_source'] = datatype.source
        sample['!Sample_library_selection'] = datatype.selection

        # If the instrumentModel is consistent, just use that;
        # otherwise take the first seqPlatform value from metadata;
        # if that still fails, check the replacement file;
        # finally just make it say [REPLACE].
        if instrumentModel is not None:
            sample['!Sample_instrument_model'] = instrumentModel
        else:
            for stanza in expId:
                if 'seqPlatform' in stanza:
                    sample['!Sample_instrument_model'] = submission.instrumentModels[stanza['seqPlatform']]
                    break
        if '!Sample_instrument_model' not in sample:
            if '!Sample_instrument_model' in replace:
                sample['!Sample_instrument_model'] = submission.instrumentModels[replace['!Sample_instrument_model'][0]]
        if '!Sample_instrument_model' not in sample:
            sample['!Sample_instrument_model'] = '[REPLACE]'
            if audit:
                # 'stanza' is the last stanza visited by the loop above.
                print(stanza.name + ': no instrument')

        sample['!Sample_data_processing'] = compositeTrack.url

        softfile[sample['^SAMPLE']] = sample

    return softfile, fileList
def md5sum(self):
    '''The md5sum for this file.

    Returns self._md5sum when it has already been set (the original note says
    it is stored in the md5sum.txt file in the downloads directory -- not
    verifiable from this block). Otherwise the value is computed once with
    encodeUtils.hashFile(self.fullname), cached in self._md5sum, and returned.
    '''
    # Identity check: only a genuinely missing value triggers hashing, no matter
    # how the cached object implements equality.
    if self._md5sum is None:
        self._md5sum = encodeUtils.hashFile(self.fullname)
    return self._md5sum
def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit, tarpath, argseries, all=False, rep=False):
    '''Build a HighThroughputSoftFile holding the series plus one sample stanza per experiment ID.

    Parameters (described only as far as this function uses them):
        compositeTrack -- object providing .files (looked up by file name),
            .database, .organism and .url
        cv -- mapping of term -> mapping of attributes, indexed as cv[term][attribute]
        expIds -- mapping of experiment ID -> non-empty sequence of metadata stanzas
        expVars -- list of metadata variables; concatenated with mdbWhitelist to
            build the sample characteristics
        geoMapping -- mapping of experiment ID -> GEO accession; the value
            'Inconsistent' is treated as "no accession"
        series -- mapping; its 'geoSeriesAccession' entry is used when present
        datatype -- object providing .molecule, .strategy, .source and .selection
        replace -- mapping of SOFT field -> sequence; only element [0] of
            '!Sample_instrument_model' is read here
        audit -- when true, print audit messages about instrument models
        tarpath -- prefix under which tarred raw files are extracted; it is
            concatenated directly with the archive's base name, so it must end
            with a path separator
        argseries -- when true, return right after the series has been created
        all -- when true, skip raw files and supplementary checksums, and fall
            back to the stanza's geoSampleAccession (NOTE: shadows the builtin
            all(); the name is kept because callers may pass it by keyword)
        rep -- passed through to sampleTitle

    Returns a tuple (softfile, fileList) where fileList holds every raw and
    supplementary file object referenced by the samples.

    Raises IOError when a tarred raw file is found but tarpath is None, and
    KeyError when a cell protocol entry has no 'key:value' form.
    '''
    # Imported locally so this function brings its own dependency into scope
    # without touching the file-level import block.
    import subprocess

    print('Creating HighThroughput soft file')

    softfile = HighThroughputSoftFile()
    fileList = list()

    createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit, argseries, all)

    if argseries:
        return softfile, fileList

    # Raw-file extensions whose SOFT file type differs from the extension itself.
    rawFileTypes = {
        'txt': 'fastq',
        'csfasta': 'SOLiD_native_csfasta',
        'csqual': 'SOLiD_native_qual',
    }

    for idNum in expIds.iterkeys():

        expId = expIds[idNum]
        firstStanza = expId[0]
        if not all:
            print('Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')')
        sample = HighThroughputSampleStanza(softfile)

        sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1, rep)
        sample['!Sample_type'] = 'SRA'
        sample['!Sample_title'] = sample['^SAMPLE']

        if 'geoSeriesAccession' in series:
            sample['!Sample_series_id'] = series['geoSeriesAccession']

        # Figure out if the instrument model is consistent across the entire
        # sample; None means "unknown or inconsistent".
        instrumentModel = None
        for stanza in expId:
            if 'seqPlatform' not in stanza:
                continue
            stanzaModel = submission.instrumentModels[stanza['seqPlatform']]
            if instrumentModel is None:
                instrumentModel = stanzaModel
            elif instrumentModel != stanzaModel:
                instrumentModel = None
                if audit:
                    print('expId' + str(expId) + ': inconsistent instrument model')
                # Stop at the first mismatch so a later stanza cannot set the
                # model again.
                break

        # ---- raw files ----
        count = 1
        for stanza in expId:

            for fname in stanza['fileName'].split(','):

                trackFile = compositeTrack.files[fname]
                rawFiles = list()

                if trackFile.extension == 'fasta':
                    print('WARNING: fastas detected!!!')

                if not isRawFile(trackFile):
                    continue
                if all:
                    continue

                if trackFile.name.endswith(('.tgz', '.tar.gz')):

                    if tarpath is None:
                        raise IOError('this track contains tarred fastqs. Please specify a path through the -z option')
                    dirname = tarpath + trackFile.name.split('.')[0] + '/'
                    if os.path.exists(dirname):
                        print(dirname + ' already exists, so not unzipping')
                    else:
                        print('creating ' + dirname + '...')
                        os.mkdir(dirname)
                        # Argument lists (no shell) keep paths containing spaces
                        # or shell metacharacters intact.
                        subprocess.call(['tar', '-xf', trackFile.path + trackFile.name, '-C', dirname])

                    # First pass: gzip any uncompressed reads.
                    for root, dirnames, filenames in os.walk(dirname):
                        for filename in filenames:
                            if 'reject' in filename or 'md5sum' in filename:
                                continue
                            if filename.endswith(('.fastq', '.txt')):
                                print('gzipping ' + filename)
                                subprocess.call(['gzip', root + '/' + filename])

                    # Second pass: collect the extracted files, reusing md5sums
                    # already recorded in md5sum.txt and recording new ones.
                    for root, dirnames, filenames in os.walk(dirname):
                        md5Path = root + '/md5sum.txt'
                        rootmd5s = None
                        if os.path.isfile(md5Path):
                            rootmd5s = encodeUtils.readMd5sums(md5Path)
                        for filename in filenames:
                            if 'reject' in filename or 'md5sum' in filename:
                                continue
                            fullPath = root + '/' + filename
                            print(fullPath)
                            if rootmd5s is not None and filename in rootmd5s:
                                newmd5 = rootmd5s[filename]
                            else:
                                newmd5 = encodeUtils.hashFile(fullPath)
                                encodeUtils.writeMd5sums(md5Path, filename, newmd5)
                            rawFiles.append(track.TrackFile(fullPath, newmd5))

                else:
                    rawFiles.append(trackFile)

                for f in rawFiles:
                    suffix = str(count)

                    sample['!Sample_raw_file_' + suffix] = linkName(f, compositeTrack)
                    sample['!Sample_raw_file_type_' + suffix] = rawFileTypes.get(f.extension, f.extension)
                    sample['!Sample_raw_file_checksum_' + suffix] = f.md5sum

                    # Per-file model only when no sample-wide model was found.
                    if instrumentModel is None and 'seqPlatform' in stanza:
                        sample['!Sample_raw_file_instrument_model_' + suffix] = submission.instrumentModels[stanza['seqPlatform']]

                    fileList.append(f)
                    count += 1

        # ---- supplementary files and pooled metadata ----
        count = 1
        pooledStanza = dict()
        for stanza in expId:

            for fname in stanza['fileName'].split(','):
                trackFile = compositeTrack.files[fname]

                if not isSupplementaryFile(trackFile):
                    continue
                suffix = str(count)

                sample['!Sample_supplementary_file_' + suffix] = linkName(trackFile, compositeTrack)

                if not all:
                    if trackFile.md5sum is not None:
                        sample['!Sample_supplementary_file_checksum_' + suffix] = trackFile.md5sum

                sample['!Sample_supplementary_file_build_' + suffix] = compositeTrack.database

                if instrumentModel is None and 'seqPlatform' in stanza:
                    sample['!Sample_supplementary_file_instrument_model_' + suffix] = submission.instrumentModels[stanza['seqPlatform']]

                fileList.append(trackFile)
                count += 1

            # Stanzas carrying an objStatus do not contribute pooled metadata.
            if 'objStatus' in stanza:
                continue
            for k in stanza.iterkeys():
                if k not in pooledStanza:
                    pooledStanza[k] = set()
                pooledStanza[k].add(stanza[k])

        # Collapse each value set into one comma-separated string; sorting makes
        # the order (and therefore the generated file) reproducible between runs.
        for k in pooledStanza:
            pooledStanza[k] = ','.join(sorted(pooledStanza[k]))

        if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
            sample['!Sample_geo_accession'] = geoMapping[idNum]
        elif all and 'geoSampleAccession' in pooledStanza:
            sample['!Sample_geo_accession'] = pooledStanza['geoSampleAccession']

        sample['!Sample_source_name'] = pooledStanza['cell']
        sample['!Sample_organism'] = compositeTrack.organism

        # ---- characteristics ----
        sample['!Sample_characteristics'] = list()
        characteristics = sample['!Sample_characteristics']
        allVars = expVars + mdbWhitelist

        for var in allVars:
            if var not in pooledStanza:
                continue
            value = pooledStanza[var]
            characteristics.append(var + ': ' + value)

            # cvPretend can redirect a specific "var value" pair to another
            # cvDetails entry.
            detailKey = var
            for pretend in cvPretend.iterkeys():
                if var + ' ' + value == pretend:
                    detailKey = cvPretend[pretend]

            if detailKey in cvDetails:
                for cvVar in cvDetails[detailKey]:
                    if cvVar in cvOverride and cvVar in pooledStanza:
                        characteristics.append(var + ' ' + cvVar + ': ' + pooledStanza[cvVar])
                    elif cvVar in cv[value]:
                        characteristics.append(var + ' ' + cvVar + ': ' + cv[value][cvVar])
            else:
                for cvVar in cvDefaults:
                    if value in cv and cvVar in cv[value]:
                        characteristics.append(var + ' ' + cvVar + ': ' + cv[value][cvVar])

        cellTerm = cv[pooledStanza['cell']]
        sample['!Sample_biomaterial_provider'] = cellTerm['vendorName']

        if 'treatment' in pooledStanza:
            sample['!Sample_treatment_protocol'] = pooledStanza['treatment']

        if 'protocol' in cellTerm:
            for protocol in cellTerm['protocol'].split(' '):
                if protocol == 'missing':
                    continue
                if ':' not in protocol:
                    raise KeyError(protocol + ' is not valid')
                key, val = protocol.split(':')
                if key == 'ENCODE' or key == cv[pooledStanza['lab']]['labPi']:
                    sample['!Sample_growth_protocol'] = val

        # ---- molecule / library ----
        if datatype.molecule == 'RNA':
            if 'rnaExtract' not in pooledStanza:
                sample['!Sample_molecule'] = 'total RNA'
            elif pooledStanza['rnaExtract'] in submission.rnaExtractMapping:
                sample['!Sample_molecule'] = submission.rnaExtractMapping[pooledStanza['rnaExtract']]
            # NOTE(review): this lookup raises KeyError when the stanzas have no
            # 'localization', and no molecule is set when nothing matches --
            # confirm whether that is intended.
            elif pooledStanza['localization'] in submission.localizationMapping:
                sample['!Sample_molecule'] = submission.localizationMapping[pooledStanza['localization']]
        else:
            sample['!Sample_molecule'] = datatype.molecule

        if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown':
            sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (submission.instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url)
        else:
            sample['!Sample_extract_protocol'] = compositeTrack.url
        sample['!Sample_library_strategy'] = datatype.strategy
        sample['!Sample_library_source'] = datatype.source
        sample['!Sample_library_selection'] = datatype.selection

        # If the instrumentModel is consistent, just use that;
        # otherwise take the first seqPlatform value from metadata;
        # if that still fails, check the replacement file;
        # finally just make it say [REPLACE].
        if instrumentModel is not None:
            sample['!Sample_instrument_model'] = instrumentModel
        else:
            for stanza in expId:
                if 'seqPlatform' in stanza:
                    sample['!Sample_instrument_model'] = submission.instrumentModels[stanza['seqPlatform']]
                    break
        if '!Sample_instrument_model' not in sample:
            if '!Sample_instrument_model' in replace:
                sample['!Sample_instrument_model'] = submission.instrumentModels[replace['!Sample_instrument_model'][0]]
        if '!Sample_instrument_model' not in sample:
            sample['!Sample_instrument_model'] = '[REPLACE]'
            if audit:
                # 'stanza' is the last stanza visited by the loop above.
                print(stanza.name + ': no instrument')

        sample['!Sample_data_processing'] = compositeTrack.url

        softfile[sample['^SAMPLE']] = sample

    return softfile, fileList