def files(self):
    '''Map every regular file in the downloads directory to a TrackFile.

    The mapping is computed on first access and cached on the instance as
    ``_files``; later calls return that same dict.

    Returns:
        dict: file name -> TrackFile built from the file's full path, its
        md5sum (None when the md5 listing has no entry for the file) and
        the alphaMetaDb stanza whose 'fileName' field lists the file (None
        when no stanza lists it).
    '''
    try:
        return self._files
    except AttributeError:
        pass

    # releases() treats a None result from readMd5sums as "no checksums
    # available"; do the same here rather than failing on the membership
    # test below.
    md5sums = encodeUtils.readMd5sums(self._md5path)
    if md5sums is None:
        md5sums = dict()

    # Reverse index: file name -> the metadata stanza that lists it.
    # 'fileName' may hold several comma-separated names.
    stanzaByFile = dict()
    for stanza in self.alphaMetaDb.itervalues():
        if 'fileName' in stanza:
            for name in stanza['fileName'].split(','):
                stanzaByFile[name] = stanza

    # Build into a local dict and publish it only once it is complete, so
    # an error part-way through cannot leave a half-filled cache behind.
    found = dict()
    for name in os.listdir(self.downloadsDirectory):
        path = self.downloadsDirectory + name
        if not os.path.isfile(path):
            continue
        md5 = md5sums[name] if name in md5sums else None
        found[name] = TrackFile(path, md5, stanzaByFile.get(name))

    self._files = found
    return self._files
def files(self):
    '''Map every regular file in the downloads directory to a TrackFile.

    The mapping is computed on first access and cached on the instance as
    ``_files``; later calls return that same dict.

    Returns:
        dict: file name -> TrackFile built from the file's full path, its
        md5sum (None when the md5 listing has no entry for the file) and
        the alphaMetaDb stanza whose 'fileName' field lists the file (None
        when no stanza lists it).
    '''
    try:
        return self._files
    except AttributeError:
        pass

    # releases() treats a None result from readMd5sums as "no checksums
    # available"; do the same here rather than failing on the membership
    # test below.
    md5sums = encodeUtils.readMd5sums(self._md5path)
    if md5sums is None:
        md5sums = dict()

    # Reverse index: file name -> the metadata stanza that lists it.
    # 'fileName' may hold several comma-separated names.
    stanzaByFile = dict()
    for stanza in self.alphaMetaDb.itervalues():
        if 'fileName' in stanza:
            for name in stanza['fileName'].split(','):
                stanzaByFile[name] = stanza

    # Build into a local dict and publish it only once it is complete, so
    # an error part-way through cannot leave a half-filled cache behind.
    found = dict()
    for name in os.listdir(self.downloadsDirectory):
        path = self.downloadsDirectory + name
        if not os.path.isfile(path):
            continue
        md5 = md5sums[name] if name in md5sums else None
        found[name] = TrackFile(path, md5, stanzaByFile.get(name))

    self._files = found
    return self._files
def releases(self):
    '''Return the files of every numbered release directory.

    Looks for 'release1', 'release2', ... directly under the downloads
    directory and stops at the first number that does not exist.  The
    result is computed on first access and cached on the instance as
    ``_releaseFiles``.

    Returns:
        list of dict: one dict per release, in release order.  Each dict
        maps a file name to a TrackFile; entries found inside a
        sub-directory whose full path contains 'supplemental' are keyed as
        'subdir/name'.  Other sub-directories are skipped.  A checksum is
        attached only to plain files listed in the release's md5sum.txt
        (never to md5sum.txt itself or to supplemental entries).
    '''
    try:
        return self._releaseFiles
    except AttributeError:
        pass

    # Collect into a local list and publish it only when every release has
    # been scanned, so a failure part-way cannot leave a truncated cache.
    found = list()
    count = 1
    while os.path.exists(self.downloadsDirectory + 'release' + str(count)):
        releasepath = self.downloadsDirectory + 'release' + str(count) + '/'
        md5s = encodeUtils.readMd5sums(releasepath + 'md5sum.txt')
        releasefiles = dict()

        for name in os.listdir(releasepath):
            if not os.path.isdir(releasepath + name):
                if name != 'md5sum.txt' and md5s is not None and name in md5s:
                    md5 = md5s[name]
                else:
                    md5 = None
                releasefiles[name] = TrackFile(releasepath + name, md5)
            elif re.match('.*supplemental.*', releasepath + name):
                # Only one level of the supplemental directory is listed.
                for innerfile in os.listdir(releasepath + name):
                    pathfile = name + "/" + innerfile
                    releasefiles[pathfile] = TrackFile(releasepath + pathfile, None)

        found.append(releasefiles)
        count = count + 1

    self._releaseFiles = found
    return self._releaseFiles
def _consistentInstrumentModel(expId, audit):
    '''Return the instrument model shared by all stanzas of one experiment.

    Stanzas without a 'seqPlatform' key are ignored.  Returns None when no
    stanza names a platform, or when two stanzas map to different models
    (in which case a message is printed if audit is true).
    '''
    instrumentModel = None
    for stanza in expId:
        if 'seqPlatform' not in stanza:
            continue
        model = submission.instrumentModels[stanza['seqPlatform']]
        if instrumentModel is None:
            instrumentModel = model
        elif instrumentModel != model:
            if audit:
                print('expId' + str(expId) + ': inconsistent instrument model')
            return None
    return instrumentModel


def _unpackTarredRawFiles(tarball, tarpath):
    '''Extract a tarred raw-file archive and return a TrackFile per member.

    The archive is unpacked into tarpath + <archive name up to its first
    '.'> + '/'.  If that directory already exists, extraction is skipped.
    Members whose name contains 'reject' or 'md5sum' are ignored.  Each
    remaining member's checksum is taken from the md5sum.txt in its
    directory when listed there; otherwise it is hashed and the result is
    written to that md5sum.txt.

    Raises:
        IOError: if tarpath is None.
    '''
    import subprocess

    if tarpath is None:
        raise IOError('this track contains tarred fastqs. Please specify a path through the -z option')

    dirname = tarpath + tarball.name.split('.')[0] + '/'
    if os.path.exists(dirname):
        print(dirname + ' already exists, so not unzipping')
    else:
        print('creating ' + dirname + '...')
        os.mkdir(dirname)
        # Argument lists instead of a shell string: paths containing
        # spaces or shell metacharacters are passed through untouched.
        subprocess.call(['tar', '-xf', tarball.path + tarball.name, '-C', dirname])

        # NOTE(review): the gzip pass is assumed to belong to the
        # fresh-extract branch only -- confirm against the original layout.
        for root, dirnames, filenames in os.walk(dirname):
            for filename in filenames:
                if 'reject' in filename or 'md5sum' in filename:
                    continue
                if filename.endswith('.fastq') or filename.endswith('.txt'):
                    print('gzipping ' + filename)
                    subprocess.call(['gzip', root + '/' + filename])

    # Second walk, so that the names seen are the post-gzip ones.
    members = list()
    for root, dirnames, filenames in os.walk(dirname):
        rootmd5s = None
        if os.path.isfile(root + '/md5sum.txt'):
            rootmd5s = encodeUtils.readMd5sums(root + '/md5sum.txt')
        for filename in filenames:
            if 'reject' in filename or 'md5sum' in filename:
                continue
            print(root + '/' + filename)
            if rootmd5s is not None and filename in rootmd5s:
                newmd5 = rootmd5s[filename]
            else:
                newmd5 = encodeUtils.hashFile(root + '/' + filename)
                encodeUtils.writeMd5sums(root + '/md5sum.txt', filename, newmd5)
            members.append(track.TrackFile(root + '/' + filename, newmd5))
    return members


def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit, tarpath, argseries, all=False, rep=False):
    '''Build a HighThroughputSoftFile with one sample stanza per experiment.

    Args:
        compositeTrack: track object; its files, database, organism and url
            attributes are read here.
        cv: mapping of term -> mapping of detail name -> value.
        expIds: mapping of experiment id -> non-empty list of metadata
            stanzas belonging to that experiment.
        expVars: list of metadata keys to report as sample characteristics
            (mdbWhitelist is appended to it).
        geoMapping: mapping of experiment id -> GEO accession, or the
            string 'Inconsistent'.
        series: mapping; 'geoSeriesAccession' is used when present.
        datatype: object providing molecule, strategy, source, selection.
        replace: mapping of field name -> sequence of replacement values;
            only '!Sample_instrument_model' is consulted here.
        audit: when true, print diagnostics about instrument models.
        tarpath: directory prefix under which tarred raw files are
            unpacked; may be None if no raw file is a tarball.
        argseries: when true, return right after the series is created.
        all: when true, raw files and supplementary checksums are omitted
            and 'geoSampleAccession' metadata may supply the accession.
        rep: passed through to sampleTitle.

    Returns:
        tuple: (softfile, fileList) where fileList holds every raw and
        supplementary file object referenced by the samples.

    Raises:
        IOError: a raw file is a tarball and tarpath is None.
        KeyError: a growth-protocol entry lacks a ':' separator, or a
            required metadata/cv key is missing.
    '''
    print('Creating HighThroughput soft file')

    softfile = HighThroughputSoftFile()
    fileList = list()

    createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit, argseries, all)

    if argseries:
        return softfile, fileList

    # Extensions whose GEO raw-file type differs from the extension itself.
    rawFileTypes = {
        'txt': 'fastq',
        'csfasta': 'SOLiD_native_csfasta',
        'csqual': 'SOLiD_native_qual',
    }

    for idNum in expIds.iterkeys():
        expId = expIds[idNum]
        firstStanza = expId[0]
        if not all:
            print('Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')')
        sample = HighThroughputSampleStanza(softfile)

        sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1, rep)
        sample['!Sample_type'] = 'SRA'
        sample['!Sample_title'] = sample['^SAMPLE']

        if 'geoSeriesAccession' in series:
            sample['!Sample_series_id'] = series['geoSeriesAccession']

        # figure out if the instrument model is consistent across the entire sample
        instrumentModel = _consistentInstrumentModel(expId, audit)

        # --- raw files -----------------------------------------------------
        count = 1
        for stanza in expId:
            for fname in stanza['fileName'].split(','):
                trackFile = compositeTrack.files[fname]

                if trackFile.extension == 'fasta':
                    print('WARNING: fastas detected!!!')

                if not isRawFile(trackFile) or all:
                    continue

                if trackFile.name.endswith('.tgz') or trackFile.name.endswith('.tar.gz'):
                    rawFiles = _unpackTarredRawFiles(trackFile, tarpath)
                else:
                    rawFiles = [trackFile]

                for f in rawFiles:
                    num = str(count)
                    sample['!Sample_raw_file_' + num] = linkName(f, compositeTrack)
                    sample['!Sample_raw_file_type_' + num] = rawFileTypes.get(f.extension, f.extension)
                    sample['!Sample_raw_file_checksum_' + num] = f.md5sum

                    # Per-file model only when the sample has no single one.
                    if instrumentModel is None and 'seqPlatform' in stanza:
                        sample['!Sample_raw_file_instrument_model_' + num] = submission.instrumentModels[stanza['seqPlatform']]

                    fileList.append(f)
                    count = count + 1

        # --- supplementary files, and pooling of stanza metadata -----------
        count = 1
        pooledValues = dict()
        for stanza in expId:
            for fname in stanza['fileName'].split(','):
                trackFile = compositeTrack.files[fname]

                if isSupplementaryFile(trackFile):
                    num = str(count)
                    sample['!Sample_supplementary_file_' + num] = linkName(trackFile, compositeTrack)

                    if not all and trackFile.md5sum is not None:
                        sample['!Sample_supplementary_file_checksum_' + num] = trackFile.md5sum

                    sample['!Sample_supplementary_file_build_' + num] = compositeTrack.database

                    if instrumentModel is None and 'seqPlatform' in stanza:
                        sample['!Sample_supplementary_file_instrument_model_' + num] = submission.instrumentModels[stanza['seqPlatform']]

                    fileList.append(trackFile)
                    count = count + 1

            # Stanzas carrying an 'objStatus' key contribute files but not
            # pooled metadata.
            if 'objStatus' in stanza:
                continue
            for k in stanza.iterkeys():
                if k not in pooledValues:
                    pooledValues[k] = set()
                pooledValues[k].add(stanza[k])

        # Collapse each key's distinct values into one comma-separated
        # string; sorted so the output does not depend on set ordering.
        pooledStanza = dict()
        for k in pooledValues.iterkeys():
            pooledStanza[k] = ','.join(sorted(pooledValues[k]))

        if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
            sample['!Sample_geo_accession'] = geoMapping[idNum]
        elif all and 'geoSampleAccession' in pooledStanza:
            sample['!Sample_geo_accession'] = pooledStanza['geoSampleAccession']

        sample['!Sample_source_name'] = pooledStanza['cell']
        sample['!Sample_organism'] = compositeTrack.organism

        # --- characteristics -------------------------------------------------
        sample['!Sample_characteristics'] = list()
        allVars = expVars + mdbWhitelist

        for var in allVars:
            if var not in pooledStanza:
                continue

            sample['!Sample_characteristics'].append(var + ': ' + pooledStanza[var])

            # cvPretend can redirect a specific "<var> <value>" pair to the
            # cvDetails entry of another key.
            cvKey = var
            pretendKey = var + ' ' + pooledStanza[var]
            if pretendKey in cvPretend:
                cvKey = cvPretend[pretendKey]

            if cvKey in cvDetails:
                for cvVar in cvDetails[cvKey]:
                    if cvVar in cvOverride and cvVar in pooledStanza:
                        sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + pooledStanza[cvVar])
                    elif cvVar in cv[pooledStanza[var]]:
                        sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[pooledStanza[var]][cvVar])
            else:
                for cvVar in cvDefaults:
                    if pooledStanza[var] in cv and cvVar in cv[pooledStanza[var]]:
                        sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[pooledStanza[var]][cvVar])

        sample['!Sample_biomaterial_provider'] = cv[pooledStanza['cell']]['vendorName']

        if 'treatment' in pooledStanza:
            sample['!Sample_treatment_protocol'] = pooledStanza['treatment']

        # Growth protocol: space-separated "<owner>:<value>" entries; keep
        # the ones owned by ENCODE or by this lab's PI (the last one wins).
        if 'protocol' in cv[pooledStanza['cell']]:
            for protocol in cv[pooledStanza['cell']]['protocol'].split(' '):
                if protocol == 'missing':
                    continue
                if ':' not in protocol:
                    raise KeyError(protocol + ' is not valid')
                key, val = protocol.split(':')
                if key == 'ENCODE' or key == cv[pooledStanza['lab']]['labPi']:
                    sample['!Sample_growth_protocol'] = val

        # NOTE(review): for RNA with an unmapped 'rnaExtract', a missing
        # 'localization' key raises KeyError, and an unmapped localization
        # leaves '!Sample_molecule' unset -- confirm that is intended.
        if datatype.molecule == 'RNA':
            if 'rnaExtract' not in pooledStanza:
                sample['!Sample_molecule'] = 'total RNA'
            elif pooledStanza['rnaExtract'] in submission.rnaExtractMapping:
                sample['!Sample_molecule'] = submission.rnaExtractMapping[pooledStanza['rnaExtract']]
            elif pooledStanza['localization'] in submission.localizationMapping:
                sample['!Sample_molecule'] = submission.localizationMapping[pooledStanza['localization']]
        else:
            sample['!Sample_molecule'] = datatype.molecule

        if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown':
            sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (submission.instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url)
        else:
            sample['!Sample_extract_protocol'] = compositeTrack.url

        sample['!Sample_library_strategy'] = datatype.strategy
        sample['!Sample_library_source'] = datatype.source
        sample['!Sample_library_selection'] = datatype.selection

        # if the instrumentModel is consistent, just use that
        # otherwise take the first seqPlatform value from metadata
        # if that still fails, check the replacement file
        # finally just make it say [REPLACE]
        if instrumentModel is not None:
            sample['!Sample_instrument_model'] = instrumentModel
        else:
            for stanza in expId:
                if 'seqPlatform' in stanza:
                    sample['!Sample_instrument_model'] = submission.instrumentModels[stanza['seqPlatform']]
                    break
            if '!Sample_instrument_model' not in sample:
                if '!Sample_instrument_model' in replace:
                    sample['!Sample_instrument_model'] = submission.instrumentModels[replace['!Sample_instrument_model'][0]]
            if '!Sample_instrument_model' not in sample:
                sample['!Sample_instrument_model'] = '[REPLACE]'
                if audit:
                    # Reached only after the loop above ran to completion,
                    # so the stanza reported is the experiment's last one.
                    print(expId[-1].name + ': no instrument')

        sample['!Sample_data_processing'] = compositeTrack.url

        softfile[sample['^SAMPLE']] = sample

    return softfile, fileList
def _consistentInstrumentModel(expId, audit):
    '''Return the instrument model shared by all stanzas of one experiment.

    Stanzas without a 'seqPlatform' key are ignored.  Returns None when no
    stanza names a platform, or when two stanzas map to different models
    (in which case a message is printed if audit is true).
    '''
    instrumentModel = None
    for stanza in expId:
        if 'seqPlatform' not in stanza:
            continue
        model = submission.instrumentModels[stanza['seqPlatform']]
        if instrumentModel is None:
            instrumentModel = model
        elif instrumentModel != model:
            if audit:
                print('expId' + str(expId) + ': inconsistent instrument model')
            return None
    return instrumentModel


def _unpackTarredRawFiles(tarball, tarpath):
    '''Extract a tarred raw-file archive and return a TrackFile per member.

    The archive is unpacked into tarpath + <archive name up to its first
    '.'> + '/'.  If that directory already exists, extraction is skipped.
    Members whose name contains 'reject' or 'md5sum' are ignored.  Each
    remaining member's checksum is taken from the md5sum.txt in its
    directory when listed there; otherwise it is hashed and the result is
    written to that md5sum.txt.

    Raises:
        IOError: if tarpath is None.
    '''
    import subprocess

    if tarpath is None:
        raise IOError('this track contains tarred fastqs. Please specify a path through the -z option')

    dirname = tarpath + tarball.name.split('.')[0] + '/'
    if os.path.exists(dirname):
        print(dirname + ' already exists, so not unzipping')
    else:
        print('creating ' + dirname + '...')
        os.mkdir(dirname)
        # Argument lists instead of a shell string: paths containing
        # spaces or shell metacharacters are passed through untouched.
        subprocess.call(['tar', '-xf', tarball.path + tarball.name, '-C', dirname])

        # NOTE(review): the gzip pass is assumed to belong to the
        # fresh-extract branch only -- confirm against the original layout.
        for root, dirnames, filenames in os.walk(dirname):
            for filename in filenames:
                if 'reject' in filename or 'md5sum' in filename:
                    continue
                if filename.endswith('.fastq') or filename.endswith('.txt'):
                    print('gzipping ' + filename)
                    subprocess.call(['gzip', root + '/' + filename])

    # Second walk, so that the names seen are the post-gzip ones.
    members = list()
    for root, dirnames, filenames in os.walk(dirname):
        rootmd5s = None
        if os.path.isfile(root + '/md5sum.txt'):
            rootmd5s = encodeUtils.readMd5sums(root + '/md5sum.txt')
        for filename in filenames:
            if 'reject' in filename or 'md5sum' in filename:
                continue
            print(root + '/' + filename)
            if rootmd5s is not None and filename in rootmd5s:
                newmd5 = rootmd5s[filename]
            else:
                newmd5 = encodeUtils.hashFile(root + '/' + filename)
                encodeUtils.writeMd5sums(root + '/md5sum.txt', filename, newmd5)
            members.append(track.TrackFile(root + '/' + filename, newmd5))
    return members


def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit, tarpath, argseries, all=False, rep=False):
    '''Build a HighThroughputSoftFile with one sample stanza per experiment.

    Args:
        compositeTrack: track object; its files, database, organism and url
            attributes are read here.
        cv: mapping of term -> mapping of detail name -> value.
        expIds: mapping of experiment id -> non-empty list of metadata
            stanzas belonging to that experiment.
        expVars: list of metadata keys to report as sample characteristics
            (mdbWhitelist is appended to it).
        geoMapping: mapping of experiment id -> GEO accession, or the
            string 'Inconsistent'.
        series: mapping; 'geoSeriesAccession' is used when present.
        datatype: object providing molecule, strategy, source, selection.
        replace: mapping of field name -> sequence of replacement values;
            only '!Sample_instrument_model' is consulted here.
        audit: when true, print diagnostics about instrument models.
        tarpath: directory prefix under which tarred raw files are
            unpacked; may be None if no raw file is a tarball.
        argseries: when true, return right after the series is created.
        all: when true, raw files and supplementary checksums are omitted
            and 'geoSampleAccession' metadata may supply the accession.
        rep: passed through to sampleTitle.

    Returns:
        tuple: (softfile, fileList) where fileList holds every raw and
        supplementary file object referenced by the samples.

    Raises:
        IOError: a raw file is a tarball and tarpath is None.
        KeyError: a growth-protocol entry lacks a ':' separator, or a
            required metadata/cv key is missing.
    '''
    print('Creating HighThroughput soft file')

    softfile = HighThroughputSoftFile()
    fileList = list()

    createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit, argseries, all)

    if argseries:
        return softfile, fileList

    # Extensions whose GEO raw-file type differs from the extension itself.
    rawFileTypes = {
        'txt': 'fastq',
        'csfasta': 'SOLiD_native_csfasta',
        'csqual': 'SOLiD_native_qual',
    }

    for idNum in expIds.iterkeys():
        expId = expIds[idNum]
        firstStanza = expId[0]
        if not all:
            print('Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')')
        sample = HighThroughputSampleStanza(softfile)

        sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1, rep)
        sample['!Sample_type'] = 'SRA'
        sample['!Sample_title'] = sample['^SAMPLE']

        if 'geoSeriesAccession' in series:
            sample['!Sample_series_id'] = series['geoSeriesAccession']

        # figure out if the instrument model is consistent across the entire sample
        instrumentModel = _consistentInstrumentModel(expId, audit)

        # --- raw files -----------------------------------------------------
        count = 1
        for stanza in expId:
            for fname in stanza['fileName'].split(','):
                trackFile = compositeTrack.files[fname]

                if trackFile.extension == 'fasta':
                    print('WARNING: fastas detected!!!')

                if not isRawFile(trackFile) or all:
                    continue

                if trackFile.name.endswith('.tgz') or trackFile.name.endswith('.tar.gz'):
                    rawFiles = _unpackTarredRawFiles(trackFile, tarpath)
                else:
                    rawFiles = [trackFile]

                for f in rawFiles:
                    num = str(count)
                    sample['!Sample_raw_file_' + num] = linkName(f, compositeTrack)
                    sample['!Sample_raw_file_type_' + num] = rawFileTypes.get(f.extension, f.extension)
                    sample['!Sample_raw_file_checksum_' + num] = f.md5sum

                    # Per-file model only when the sample has no single one.
                    if instrumentModel is None and 'seqPlatform' in stanza:
                        sample['!Sample_raw_file_instrument_model_' + num] = submission.instrumentModels[stanza['seqPlatform']]

                    fileList.append(f)
                    count = count + 1

        # --- supplementary files, and pooling of stanza metadata -----------
        count = 1
        pooledValues = dict()
        for stanza in expId:
            for fname in stanza['fileName'].split(','):
                trackFile = compositeTrack.files[fname]

                if isSupplementaryFile(trackFile):
                    num = str(count)
                    sample['!Sample_supplementary_file_' + num] = linkName(trackFile, compositeTrack)

                    if not all and trackFile.md5sum is not None:
                        sample['!Sample_supplementary_file_checksum_' + num] = trackFile.md5sum

                    sample['!Sample_supplementary_file_build_' + num] = compositeTrack.database

                    if instrumentModel is None and 'seqPlatform' in stanza:
                        sample['!Sample_supplementary_file_instrument_model_' + num] = submission.instrumentModels[stanza['seqPlatform']]

                    fileList.append(trackFile)
                    count = count + 1

            # Stanzas carrying an 'objStatus' key contribute files but not
            # pooled metadata.
            if 'objStatus' in stanza:
                continue
            for k in stanza.iterkeys():
                if k not in pooledValues:
                    pooledValues[k] = set()
                pooledValues[k].add(stanza[k])

        # Collapse each key's distinct values into one comma-separated
        # string; sorted so the output does not depend on set ordering.
        pooledStanza = dict()
        for k in pooledValues.iterkeys():
            pooledStanza[k] = ','.join(sorted(pooledValues[k]))

        if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
            sample['!Sample_geo_accession'] = geoMapping[idNum]
        elif all and 'geoSampleAccession' in pooledStanza:
            sample['!Sample_geo_accession'] = pooledStanza['geoSampleAccession']

        sample['!Sample_source_name'] = pooledStanza['cell']
        sample['!Sample_organism'] = compositeTrack.organism

        # --- characteristics -------------------------------------------------
        sample['!Sample_characteristics'] = list()
        allVars = expVars + mdbWhitelist

        for var in allVars:
            if var not in pooledStanza:
                continue

            sample['!Sample_characteristics'].append(var + ': ' + pooledStanza[var])

            # cvPretend can redirect a specific "<var> <value>" pair to the
            # cvDetails entry of another key.
            cvKey = var
            pretendKey = var + ' ' + pooledStanza[var]
            if pretendKey in cvPretend:
                cvKey = cvPretend[pretendKey]

            if cvKey in cvDetails:
                for cvVar in cvDetails[cvKey]:
                    if cvVar in cvOverride and cvVar in pooledStanza:
                        sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + pooledStanza[cvVar])
                    elif cvVar in cv[pooledStanza[var]]:
                        sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[pooledStanza[var]][cvVar])
            else:
                for cvVar in cvDefaults:
                    if pooledStanza[var] in cv and cvVar in cv[pooledStanza[var]]:
                        sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[pooledStanza[var]][cvVar])

        sample['!Sample_biomaterial_provider'] = cv[pooledStanza['cell']]['vendorName']

        if 'treatment' in pooledStanza:
            sample['!Sample_treatment_protocol'] = pooledStanza['treatment']

        # Growth protocol: space-separated "<owner>:<value>" entries; keep
        # the ones owned by ENCODE or by this lab's PI (the last one wins).
        if 'protocol' in cv[pooledStanza['cell']]:
            for protocol in cv[pooledStanza['cell']]['protocol'].split(' '):
                if protocol == 'missing':
                    continue
                if ':' not in protocol:
                    raise KeyError(protocol + ' is not valid')
                key, val = protocol.split(':')
                if key == 'ENCODE' or key == cv[pooledStanza['lab']]['labPi']:
                    sample['!Sample_growth_protocol'] = val

        # NOTE(review): for RNA with an unmapped 'rnaExtract', a missing
        # 'localization' key raises KeyError, and an unmapped localization
        # leaves '!Sample_molecule' unset -- confirm that is intended.
        if datatype.molecule == 'RNA':
            if 'rnaExtract' not in pooledStanza:
                sample['!Sample_molecule'] = 'total RNA'
            elif pooledStanza['rnaExtract'] in submission.rnaExtractMapping:
                sample['!Sample_molecule'] = submission.rnaExtractMapping[pooledStanza['rnaExtract']]
            elif pooledStanza['localization'] in submission.localizationMapping:
                sample['!Sample_molecule'] = submission.localizationMapping[pooledStanza['localization']]
        else:
            sample['!Sample_molecule'] = datatype.molecule

        if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown':
            sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (submission.instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url)
        else:
            sample['!Sample_extract_protocol'] = compositeTrack.url

        sample['!Sample_library_strategy'] = datatype.strategy
        sample['!Sample_library_source'] = datatype.source
        sample['!Sample_library_selection'] = datatype.selection

        # if the instrumentModel is consistent, just use that
        # otherwise take the first seqPlatform value from metadata
        # if that still fails, check the replacement file
        # finally just make it say [REPLACE]
        if instrumentModel is not None:
            sample['!Sample_instrument_model'] = instrumentModel
        else:
            for stanza in expId:
                if 'seqPlatform' in stanza:
                    sample['!Sample_instrument_model'] = submission.instrumentModels[stanza['seqPlatform']]
                    break
            if '!Sample_instrument_model' not in sample:
                if '!Sample_instrument_model' in replace:
                    sample['!Sample_instrument_model'] = submission.instrumentModels[replace['!Sample_instrument_model'][0]]
            if '!Sample_instrument_model' not in sample:
                sample['!Sample_instrument_model'] = '[REPLACE]'
                if audit:
                    # Reached only after the loop above ran to completion,
                    # so the stanza reported is the experiment's last one.
                    print(expId[-1].name + ': no instrument')

        sample['!Sample_data_processing'] = compositeTrack.url

        softfile[sample['^SAMPLE']] = sample

    return softfile, fileList