def download_fastq(self):
    '''Download the FastQ files associated with this accession from ENA.

    Runs the prepared download command (self.download) and, on success,
    records the downloaded file paths in self.files. On any failure sets
    self.error = True and stores a reason under this accession in
    Sequence.__errors.

    :return: None (success/failure is reported via self.error/self.files)
    '''
    try:
        # makedirs_p is presumably path.py's idempotent mkdir -p — confirm.
        Path(self.dir).makedirs_p()
        # self.download is assumed to be an argv list for subprocess.call;
        # stdout is swallowed via PIPE.
        retcode = call(self.download, stdout=PIPE)
    except OSError as e:
        # Command could not be executed at all (e.g. binary missing).
        _logger.error('FastQ Failed: %s [%s]', self.accession, e)
        _logger.error('CMD: %s', self.download)
        Sequence.__errors[self.accession] = 'FastQ Failed'
        self.error = True
    else:
        if retcode < 0:
            # Negative return codes from call() mean the child was killed
            # by a signal.
            _logger.error('Child was terminated by signal')
            self.error = True
            # NOTE(review): these adjacent literals concatenate without
            # spaces, producing 'Child wasterminated(signal)'.
            Sequence.__errors[self.accession] = 'Child was'\
                                                'terminated'\
                                                '(signal)'
        else:
            _logger.info('Success: %s', self.accession)
            # Record absolute paths of everything the download produced.
            self.files = [ f.abspath() for f in Path(self.dir).files() ]
            Sequence.__sequence_id += 1
def ProcessExperimentSeparate(experiment_id, json, batch_dir, sample_dir_id,
                              preserve, failed_accession, skip_files=False):
    '''Process an experiment, storing every run as its own sample directory.

    :param experiment_id: Experiment (or run) accession to process.
    :param json: Metadata passed through to ExtractExperimentMetadata.
    :param batch_dir: Root directory holding numbered sample directories.
    :param sample_dir_id: Next numeric sample directory name to use.
    :param preserve: Keep files already present in an existing sample dir.
    :param failed_accession: List mutated in place with failed run IDs.
    :param skip_files: If True, sample dirs may be created without FastQ files.
    :return: The updated sample_dir_id (incremented once per stored run).
    '''
    m = ExtractExperimentMetadata(experiment_id, json)
    if m.valid_metadata():
        # Check if a run ID was submitted, and if so only process that
        if experiment_id in m.runIDs:
            m.runIDs = [experiment_id]
        # Process the runIDs as samples
        _logger.info("Found Following Runs: %s", ', '.join(m.runIDs))
        for runid in m.runIDs:
            with TemporaryDirectory() as tmpdir:
                os.chdir(batch_dir)
                sample_dir = "%s/%s/"%(batch_dir, sample_dir_id)
                # Reuse FastQ files already present in the sample dir.
                if os.path.exists(sample_dir):
                    sfiles = [x for x in os.listdir(sample_dir)
                              if any([y in x for y in ['fq','fastq']])]
                else:
                    sfiles = []
                # NOTE(review): this OR-chain re-downloads unless BOTH
                # preserve and skip_files are set; an AND of the negations
                # may have been intended — confirm against the combined
                # variant, which skips downloads whenever preserved files
                # exist.
                if not preserve or not skip_files or len(sfiles) == 0:
                    sfiles = DownloadRunFiles(runid, tmpdir)
                if sfiles is not None:
                    success = CreateSampleDir(sfiles, m, sample_dir,
                                              preserve, skip_files)
                    if success:
                        sample_dir_id += 1
                    else:
                        failed_accession.append(runid)
                else:
                    _logger.error("Files could not be retrieved! (%s)", runid)
                    failed_accession.append(runid)
    else:
        _logger.error("Metadata Invalid! (%s) - %s", experiment_id,
                      m.metadata.items())
        failed_accession.append(experiment_id)
    return sample_dir_id
def CreateSampleDir(sfiles, m, sample_dir, preserve=False, skip_files=False):
    '''Create or refresh a sample directory, move the run files into it and
    write the sample metadata file.

    :param sfiles: List of file paths to move into sample_dir.
    :param m: Metadata object; m.metadata["file_names"] is updated and
        m.save_metadata(sample_dir) writes the metadata file.
    :param sample_dir: Target sample directory path.
    :param preserve: If True, files already in an existing sample_dir are kept.
    :param skip_files: If True, an empty sfiles list is not an error.
    :return: True on success, False on failure.
    '''
    sample_dir = str(sample_dir)
    if not skip_files and len(sfiles) == 0:
        _logger.error("Error: No files were found! (%s)", sample_dir)
        return False
    if not os.path.exists(sample_dir):
        _logger.info("Create sample dir: %s", sample_dir)
        # Create 'sample' dir
        os.mkdir(sample_dir)
        # Move files from tmpdir to sample dir
        for sf in sfiles:
            move(sf, sample_dir)
    elif not preserve and not skip_files:
        # Empty sample directory before re-populating it.
        for fn in os.listdir(sample_dir):
            os.unlink("%s/%s"%(sample_dir, fn))
        # Move files from tmpdir to sample dir
        for sf in sfiles:
            move(sf, sample_dir)
    # Update and create metadata file
    try:
        m.metadata["file_names"] = ' '.join(
            [os.path.basename(sf).replace(' ','_') for sf in sfiles
             if not os.path.basename(sf) == 'meta.json']
        )
        m.save_metadata(sample_dir)
    except ValueError as e:  # BUGFIX: was Py2-only "except ValueError, e"
        _logger.error(e)
        return False
    # BUGFIX: the original fell off the end and returned None on success,
    # so callers testing "if success:" treated every sample as failed.
    return True
def download_fastq_from_list(accession_list, output, json, preserve=False,
                             all_runs_as_samples=False, skip_files=False):
    ''' Get Fastq from list of IDs.

    :param accession_list: Path to a file with one ENA/SRA accession per line.
    :param output: Output folder name, created under the current working dir.
    :param json: Metadata passed through to ProcessExperiment.
    :param preserve: Keep files already present in existing sample dirs.
    :param all_runs_as_samples: Store each run as its own sample instead of
        combining all runs of an experiment.
    :param skip_files: Allow samples without FastQ files (metadata only).
    '''
    cwd = os.getcwd()
    with open(accession_list, 'r') as f:
        # Setup batch dir
        batch_dir = "%s/%s/"%(cwd, output)
        if not os.path.exists(batch_dir):
            os.mkdir(batch_dir)
        os.chdir(batch_dir)
        # Set logging
        # NOTE(review): 'acceession' is misspelled but kept, since external
        # tooling may rely on the exact log file name.
        _logger.Set(filename="%s/download-acceession-list.log"%batch_dir)
        # Count samples in accession_list
        n_samples = sum(1 for l in f)
        f.seek(0)
        _logger.info("Number of samples to download: %s", n_samples)
        # Start progress bar
        pbar = ProgressBar(
            widgets = [ETA(), ' - ', Percentage(), ' : ', Bar()],
            maxval = n_samples
        ).start()
        pbar.update(0)
        failed_accession = []
        sample_dir_id = 0
        for i, l in enumerate(f):
            accession = l.strip()
            if accession == '':
                continue
            # Determine accession type from the 3-letter prefix.
            if accession[:3] in acctypes:
                accession_type = acctypes[accession[:3]]
            else:
                _logger.error("unknown accession type for '%s'!", accession)
                failed_accession.append(accession)
                continue
            _logger.info("Acc Found: %s (%s)", accession, accession_type)
            if accession_type in ['study', 'sample']:
                # Studies/samples expand to one or more experiments.
                for experiment_id in ExtractExperimentIDs_acc(accession):
                    sample_dir_id = ProcessExperiment(
                        experiment_id, json, batch_dir, sample_dir_id,
                        preserve, failed_accession, all_runs_as_samples,
                        skip_files)
            elif accession_type in ['experiment', 'run']:
                # Experiments and runs were handled by two identical
                # branches; merged into one.
                sample_dir_id = ProcessExperiment(
                    accession, json, batch_dir, sample_dir_id,
                    preserve, failed_accession, all_runs_as_samples,
                    skip_files)
            pbar.update(i)
        pbar.finish()
    if failed_accession:
        _logger.info("The following accessions were not downloaded!")
        _logger.info('\n'.join(failed_accession))
    else:
        _logger.info("All accessions downloaded succesfully!")
def ProcessExperiment(experiment_id, json, batch_dir, sample_dir_id,
                      preserve, failed_accession, all_runs_as_samples,
                      skip_files=False):
    '''Dispatch an experiment to the separate- or combined-run processor.

    :param experiment_id: Experiment (or run) accession to process.
    :param json: Metadata passed through to the selected processor.
    :param batch_dir: Root directory holding numbered sample directories.
    :param sample_dir_id: Next numeric sample directory name to use.
    :param preserve: Keep files already present in existing sample dirs.
    :param failed_accession: List mutated in place with failed accessions.
    :param all_runs_as_samples: Choose per-run samples over one combined one.
    :param skip_files: Allow samples without FastQ files.
    :return: The updated sample_dir_id returned by the selected processor.
    '''
    _logger.info("Processing %s...", experiment_id)
    # Pick the processing strategy, then forward the shared arguments.
    handler = (ProcessExperimentSeparate if all_runs_as_samples
               else ProcessExperimentCombined)
    return handler(experiment_id, json, batch_dir, sample_dir_id,
                   preserve, failed_accession, skip_files)
def DownloadRunFiles(runid, tmpdir):
    '''Download all FastQ files for a single run accession.

    :param runid: Run accession ID.
    :param tmpdir: Working directory for the Sequence downloader.
    :return: List of downloaded file paths, or None on failure.
    '''
    # Download run files
    try:
        s = Sequence(runid, tmpdir)
        s.download_fastq()
        if not s.error:
            _logger.info("Downloaded files: %s", ','.join(s.files))
            return s.files
        else:
            return None
    except ValueError as e:  # BUGFIX: was Py2-only "except ValueError, e"
        _logger.error(e)
        return None
def ExtractData(self, query):
    ''' Extract Sample Metadata.

    Queries two SRA endpoints (self.sra_url1: runinfo CSV; self.sra_url2:
    free-text record), fills in accessions, platform/type, sample
    attributes, location, isolation source and run IDs on self, and
    notifies curators by mail about unrecognised platforms/sources.

    :param query: Accession used to parameterise both SRA URLs.
    '''
    new_platforms = []
    new_seqtypes = []
    # New approach using runinfo list
    with openurl(self.sra_url1%(query)) as u:
        # First line is the CSV header; map the wanted columns to indexes.
        headers = u.readline().split(',')
        indexes = [(x, headers.index(x)) for x in
                   ["Run", "Experiment", "Sample", "SRAStudy", "BioSample",
                    "Platform", "LibraryLayout", "SampleName",
                    "ScientificName", "CenterName"]]
        for l in u:
            l = l.strip()
            if l == '': continue
            if l[0] == '#': continue
            d = l.split(',')
            self.accessions['run'] = d[indexes[0][1]]
            self.accessions['experiment'] = d[indexes[1][1]]
            self.accessions['sample'] = d[indexes[2][1]]
            self.accessions['study'] = d[indexes[3][1]]
            self.accessions['biosample'] = d[indexes[4][1]]
            # Map platform/type through the module-level lookup tables;
            # unknown values are collected for the curator mail below.
            platform = d[indexes[5][1]].lower()
            if platform in platforms:
                self['sequencing_platform'] = platforms[platform]
            else:
                self['sequencing_platform'] = 'unknown'
                if not platform in new_platforms:
                    new_platforms.append(platform)
            seqtype = d[indexes[6][1]].lower()
            if seqtype in sequencing_types:
                self['sequencing_type'] = sequencing_types[seqtype]
            else:
                self['sequencing_type'] = 'unknown'
                if not seqtype in new_seqtypes:
                    new_seqtypes.append(seqtype)
            self['sample_name'] = d[indexes[7][1]]
            self['organism'] = d[indexes[8][1]]
            self['collected_by'] = d[indexes[9][1]]
            self['biosample'] = self.accessions['biosample']
            break  # Just use the first entry!
            # Should be fixed to handle different query sequences!!!
    with openurl(self.sra_url2%(query)) as u:
        qdata = u.read()
    # Extract sample attributes
    match = re.findall(r'Sample Attributes: (.+)\n', qdata)
    lcs = {}  # location parts
    host = None
    source = None
    for answer in match:
        # Attributes come as ';'-separated 'key=value' pairs.
        # NOTE(review): entries without '=' would raise IndexError on
        # stat[1] — presumably the feed always provides both parts.
        for attributes in answer.split(';'):
            stat = attributes.split('=')
            att = stat[0].strip('/ ').lower().replace('\'', '')
            val = stat[1].strip('\' ').replace('\'', '\`')
            if att in ['geo_loc_name', 'geographic location']:
                self.__interpret_loc(val)
            elif att == 'serovar':
                self['subtype']['serovar'] = val
            elif att == 'mlst':
                self['subtype']['mlst'] = val
            elif att in ['scientific_name', 'scientific name']:
                self['organism'] = val
            elif att == 'strain':
                self['strain'] = val
            elif att in ['isolation_source', 'isolation source']:
                source = val
            elif att in ['host', 'specific_host', 'specific host']:
                host = val
            # NOTE(review): att is lower-cased above, so this mixed-case
            # branch can never match — likely meant 'biosample'.
            elif att == 'BioSample':
                self['biosample'] = val
            elif att in ['collection_date', 'collection date']:
                self['collection_date'] = self.__format_date(
                    *self.__interpret_date(val)
                )
                if self['collection_date'] == '':
                    _logger.warning(
                        'Date Empty: %s, %s', val, query
                    )
            elif att in ['collected_by', 'collected by']:
                self['collected_by'] = val
            elif att in ['country', 'region', 'city', 'zip_code']:
                lcs[att] = val
            else:
                # Anything unrecognised is appended to the notes field.
                self['notes'] = '%s %s: %s,' % (
                    self['notes'], att, val)
    if lcs != {}:
        # Build a single location string in fixed order for interpretation.
        h = ['country', 'region', 'city', 'zip_code']
        self.__interpret_loc(
            ','.join([lcs[x] for x in h if x in lcs]))
    # Handle Isolation source: try host first, then isolation source,
    # against the ontology lookup tables; track unknowns for curators.
    cats = []
    if host is not None:
        for d in ontology:
            cats = [d[k][0] for k in d.keys() if k in host.lower()]
            if cats: break
        if not cats and host not in self.new_ontologies:
            self.new_ontologies[host] = query
    if (not cats or cats[0] == 'unknown') and source is not None:
        for d in ontology:
            cats = [d[k][0] for k in d.keys() if k in source.lower()]
            if cats: break
        if not cats and source not in self.new_ontologies:
            self.new_ontologies[source] = query
    if cats:
        self['isolation_source'] = cats[0]
        _logger.warning(
            'Source identified: %s (%s, %s), %s',
            self['isolation_source'], host, source, query
        )
    else:
        if host is None: host = 'unknown'
        if source is None: source = 'unknown'
        _logger.warning(
            'Source not identified: (%s, %s), %s', host, source, query
        )
        self['source_note'] = source
    # Extract Run IDs associated with the sample
    # Run #1: ERR276921, 1356661 spots, 271332200 bases
    self.runIDs = re.findall(r'Run #\d+: (.+?),.+', qdata)
    # Notify Curators By Email
    _logger.info('Make mail? %s'%(mail is not None))
    if mail is not None:
        _logger.info('Any unknowns? %s'%(len(self.new_ontologies) > 0))
        if len(self.new_ontologies) > 0:
            _logger.debug(mail.test(
                'New isolation source...',
                'Sources not identified:\n%s\n'%(
                    '\n'.join(map(', '.join, self.new_ontologies.items()))
                )
            ))
            mail.send(
                'New isolation source...',
                'Sources not identified:\n%s\n'%(
                    '\n'.join(map(', '.join, self.new_ontologies.items()))
                )
            )
        if len(new_platforms) > 0:
            _logger.debug(mail.test(
                'New platforms...',
                'Platforms not accounted for:\n%s\n'%(
                    '\n'.join(new_platforms)
                )
            ))
            mail.send(
                'New platforms...',
                'Platforms not accounted for:\n%s\n'%(
                    '\n'.join(new_platforms)
                )
            )
    elif len(self.new_ontologies) > 0:
        # No mailer configured — only log the unknown sources.
        _logger.debug(
            "NO MAIL!",
            'Sources not identified:\n%s\n'%(
                '\n'.join(map(', '.join, self.new_ontologies.items()))
            )
        )
def ProcessExperimentCombined(experiment_id, json, batch_dir, sample_dir_id,
                              preserve, failed_accession, skip_files=False):
    '''Process an experiment, combining all of its runs into one sample.

    Each run's files are downloaded and, when an experiment has several
    runs, concatenated pairwise (file 1 with file 1, file 2 with file 2)
    into combined FastQ files before the sample directory is created.

    :param experiment_id: Experiment (or run) accession to process.
    :param json: Metadata passed through to ExtractExperimentMetadata.
    :param batch_dir: Root directory holding numbered sample directories.
    :param sample_dir_id: Next numeric sample directory name to use.
    :param preserve: Reuse FastQ files already present in the sample dir.
    :param failed_accession: List mutated in place with failed accessions.
    :param skip_files: If True, sample dirs may be created without files.
    :return: The updated sample_dir_id (incremented on success).
    '''
    m = ExtractExperimentMetadata(experiment_id, json)
    if m.valid_metadata():
        # Check if a run ID was submitted, and if so only process that
        if experiment_id in m.runIDs:
            m.runIDs = [experiment_id]
        # Process the runs as one sample
        _logger.info("Found Following Runs: %s", ', '.join(m.runIDs))
        with TemporaryDirectory() as tmpdir:
            os.chdir(batch_dir)
            sample_dir = "%s/%s/"%(batch_dir, sample_dir_id)
            csfiles = []
            if preserve and os.path.exists(sample_dir):
                csfiles = [x for x in os.listdir(sample_dir)
                           if any([y in x for y in ['fq','fastq']])]
            if csfiles == [] and not skip_files:
                # Download each run's files; sfiles becomes a list of lists.
                sfiles = []
                for runid in m.runIDs:
                    sf = DownloadRunFiles(runid, tmpdir)
                    if sf is not None:
                        sfiles.append(sf)
                    else:
                        _logger.error("Run files could not be retrieved! (%s)",
                                      runid)
                _logger.info("Found Following files sets:\n%s\n",
                             '\n'.join([', '.join(sf) for sf in sfiles]))
                # Combine sfiles into one entry
                if len(sfiles) > 1:
                    for file_no, file_set in enumerate(zip(*sfiles)):
                        ext = '.'.join(
                            file_set[0].split('/')[-1].split('.')[1:])
                        if len(sfiles[0]) > 1:
                            new_file = "%s_%s.combined.%s"%(experiment_id,
                                                            file_no+1, ext)
                        else:
                            new_file = "%s.combined.%s"%(experiment_id, ext)
                        # BUGFIX: open the output in binary mode to match the
                        # binary reads — plain 'w' corrupts gzipped FastQ on
                        # Python 3 (identical behavior on Python 2/Unix).
                        with open(new_file, 'wb') as nf:
                            for fn in file_set:
                                with open(fn, 'rb') as f:
                                    nf.write(f.read())
                        if os.path.exists(new_file):
                            csfiles.append(new_file)
                        else:
                            _logger.error(
                                "Combined file creation failed! (%s: %s)",
                                experiment_id, file_no)
                            break
                # BUGFIX: guard against sfiles == [] (every download failed),
                # which previously raised IndexError on sfiles[0].
                elif sfiles and isinstance(sfiles[0], list):
                    # Single run: its files are used directly.
                    csfiles = sfiles[0]
                if csfiles == []:
                    _logger.error("Files could not be combined! (%s)",
                                  experiment_id)
                    failed_accession.append(experiment_id)
            if csfiles != [] or skip_files:
                success = CreateSampleDir(csfiles, m, sample_dir,
                                          preserve, skip_files)
                if success:
                    sample_dir_id += 1
                else:
                    failed_accession.append(experiment_id)
            else:
                # NOTE(review): experiment_id may already have been appended
                # above ("could not be combined"); duplicates only affect the
                # final failure report.
                _logger.error("Files could not be retrieved! (%s)",
                              experiment_id)
                failed_accession.append(experiment_id)
    else:
        _logger.error("Metadata Invalid! (%s) - %s", experiment_id,
                      m.metadata.items())
        failed_accession.append(experiment_id)
    return sample_dir_id