Example #1
    def download_fastq(self):
        '''
        Download the FASTQ files associated with this accession from ENA.

        Sets self.error (and records the failure in Sequence.__errors) on
        failure; on success, populates self.files with the downloaded paths.
        '''
        try:
            Path(self.dir).makedirs_p()
            retcode = call(self.download, stdout=PIPE)
        except OSError as e:
            _logger.error('FastQ Failed: %s [%s]', self.accession, e)
            _logger.error('CMD: %s', self.download)
            Sequence.__errors[self.accession] = 'FastQ Failed'
            self.error = True
        else:
            if retcode < 0:
                _logger.error('Child was terminated by signal')
                self.error = True
                Sequence.__errors[self.accession] = 'Child was '\
                    'terminated '\
                    '(signal)'
            else:
                _logger.info('Success: %s', self.accession)
                self.files = [
                    f.abspath() for f in Path(self.dir).files()
                ]
                Sequence.__sequence_id += 1
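
A minimal usage sketch for download_fastq, assuming the Sequence constructor takes a run accession and a target directory (as it does in DownloadRunFiles below); the accession and directory are hypothetical:

from tempfile import mkdtemp

tmpdir = mkdtemp()
s = Sequence('ERR276921', tmpdir)  # hypothetical ENA run accession
s.download_fastq()
if not s.error:
    print(s.files)  # absolute paths of the downloaded FASTQ files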
def ProcessExperimentSeparate(experiment_id, json, batch_dir, sample_dir_id, preserve, failed_accession, skip_files=False):
    m = ExtractExperimentMetadata(experiment_id, json)
    if m.valid_metadata():
        # Check if a run ID was submitted, and if so only process that
        if experiment_id in m.runIDs: m.runIDs = [experiment_id]
        # Process the runIDs as samples
        _logger.info("Found Following Runs: %s", ', '.join(m.runIDs))
        for runid in m.runIDs:
            with TemporaryDirectory() as tmpdir:
                os.chdir(batch_dir)
                sample_dir = "%s/%s/"%(batch_dir, sample_dir_id)
                if os.path.exists(sample_dir):
                    sfiles = [x for x in os.listdir(sample_dir) if any([y in x for y in ['fq','fastq']])]
                else:
                    sfiles = []
                if not preserve or not skip_files or len(sfiles) == 0:
                    sfiles = DownloadRunFiles(runid, tmpdir)
                if sfiles is not None:
                    success = CreateSampleDir(sfiles, m, sample_dir, preserve, skip_files)
                    if success:
                        sample_dir_id += 1
                    else:
                        failed_accession.append(runid)
                else:
                    _logger.error("Files could not be retrieved! (%s)", runid)
                    failed_accession.append(runid)
    else:
        _logger.error("Metadata Invalid! (%s) - %s", experiment_id, m.metadata.items())
        failed_accession.append(experiment_id)
    return sample_dir_id
def CreateSampleDir(sfiles, m, sample_dir, preserve=False, skip_files=False):
    sample_dir = str(sample_dir)
    if not skip_files and len(sfiles) == 0:
        _logger.error("Error: No files were found! (%s)", sample_dir)
        return False
    if not os.path.exists(sample_dir):
        _logger.info("Create sample dir: %s", sample_dir)
        # Create 'sample' dir
        os.mkdir(sample_dir)
        # Move files from tmpdir to sample dir
        for sf in sfiles: move(sf, sample_dir)
    elif not preserve and not skip_files:
        # Empty sample directory
        for fn in os.listdir(sample_dir):
            os.unlink("%s/%s"%(sample_dir, fn))
        # Move files from tmpdir to sample dir
        for sf in sfiles: move(sf, sample_dir)
    # Update and create metadata file
    try:
        m.metadata["file_names"] = ' '.join(
            [os.path.basename(sf).replace(' ','_')
                for sf in sfiles
                if not os.path.basename(sf) == 'meta.json']
            )
        m.save_metadata(sample_dir)
    except ValueError as e:
        _logger.error(e)
        return False
    return True
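
As a standalone illustration of the "file_names" normalization above: spaces in basenames become underscores and meta.json is excluded (the file paths are hypothetical):

import os

sfiles = ['/tmp/run/ERR276921 1.fastq.gz', '/tmp/run/meta.json']
file_names = ' '.join(
    os.path.basename(sf).replace(' ', '_')
    for sf in sfiles
    if os.path.basename(sf) != 'meta.json')
# file_names == 'ERR276921_1.fastq.gz'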
def download_fastq_from_list(accession_list, output, json, preserve=False, all_runs_as_samples=False, skip_files=False):
    """
    Get Fastq from list of IDs

    :param accession_list: List of accessions
    :param dir: Output folder
    """
    metadata = []
    cwd = os.getcwd()
    with open(accession_list, 'r') as f:
        # Setup batch dir
        batch_dir = "%s/%s/"%(cwd, output)
        if not os.path.exists(batch_dir): os.mkdir(batch_dir)
        os.chdir(batch_dir)
        # Set logging
        _logger.Set(filename="%s/download-accession-list.log"%batch_dir)
        # Count samples in accession_list
        n_samples = sum(1 for l in f)
        f.seek(0)
        _logger.info("Number of samples to download: %s", n_samples)
        # Start progress bar
        pbar = ProgressBar(
            widgets = [ETA(), ' - ', Percentage(), ' : ', Bar()],
            maxval  = n_samples
        ).start()
        pbar.update(0)
        failed_accession = []
        sample_dir_id = 0
        for i, l in enumerate(f):
            accession = l.strip()
            if accession == '': continue
            # Determine accession type
            if accession[:3] in acctypes:
                accession_type = acctypes[accession[:3]]
            else:
                _logger.error("unknown accession type for '%s'!", accession)
                failed_accession.append(accession)
                continue
            _logger.info("Acc Found: %s (%s)", accession, accession_type)
            if accession_type in ['study', 'sample']:
                for experiment_id in ExtractExperimentIDs_acc(accession):
                    sample_dir_id = ProcessExperiment(
                        experiment_id, json, batch_dir, sample_dir_id, preserve,
                        failed_accession, all_runs_as_samples, skip_files)
            elif accession_type in ['experiment', 'run']:
                sample_dir_id = ProcessExperiment(
                    accession, json, batch_dir, sample_dir_id, preserve,
                    failed_accession, all_runs_as_samples, skip_files)
            pbar.update(i)
        pbar.finish()
        if failed_accession:
            _logger.info("The following accessions were not downloaded!")
            _logger.info('\n'.join(failed_accession))
        else:
            _logger.info("All accessions downloaded succesfully!")
def ProcessExperiment(experiment_id, json, batch_dir, sample_dir_id, preserve, failed_accession, all_runs_as_samples, skip_files=False):
    _logger.info("Processing %s...", experiment_id)
    if all_runs_as_samples:
        sample_dir_id = ProcessExperimentSeparate(
            experiment_id, json, batch_dir, sample_dir_id,
            preserve, failed_accession, skip_files)
    else:
        sample_dir_id = ProcessExperimentCombined(
            experiment_id, json, batch_dir, sample_dir_id,
            preserve, failed_accession, skip_files)
    return sample_dir_id
def DownloadRunFiles(runid, tmpdir):
    # Download run files
    try:
        s = Sequence(runid, tmpdir)
        s.download_fastq()
        if not s.error:
            _logger.info("Downloaded files: %s", ','.join(s.files))
            return s.files
        else:
            return None
    except ValueError as e:
        _logger.error(e)
        return None
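
A hedged usage sketch for DownloadRunFiles, reusing the TemporaryDirectory context manager exactly as ProcessExperimentSeparate does above (the accession is hypothetical):

with TemporaryDirectory() as tmpdir:
    files = DownloadRunFiles('ERR276921', tmpdir)
    if files is not None:
        print('\n'.join(files))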
Example #7
 def ExtractData(self, query):
     ''' Extract Sample Metadata '''
     new_platforms = []
     new_seqtypes = []
     # New approach using runinfo list
     with openurl(self.sra_url1%(query)) as u:
         headers = u.readline().split(',')
         indexes = [(x, headers.index(x)) for x in ["Run", "Experiment",
             "Sample", "SRAStudy", "BioSample", "Platform", "LibraryLayout",
             "SampleName", "ScientificName", "CenterName"]]
         for l in u:
             l = l.strip()
             if l == '': continue
             if l[0] == '#': continue
             d = l.split(',')
             self.accessions['run'] = d[indexes[0][1]]
             self.accessions['experiment'] = d[indexes[1][1]]
             self.accessions['sample'] = d[indexes[2][1]]
             self.accessions['study'] = d[indexes[3][1]]
             self.accessions['biosample'] = d[indexes[4][1]]
             platform = d[indexes[5][1]].lower()
             if platform in platforms:
                 self['sequencing_platform'] = platforms[platform]
             else:
                 self['sequencing_platform'] = 'unknown'
                 if not platform in new_platforms:
                     new_platforms.append(platform)
             seqtype = d[indexes[6][1]].lower()
             if seqtype in sequencing_types:
                 self['sequencing_type'] = sequencing_types[seqtype]
             else:
                 self['sequencing_type'] = 'unknown'
                 if not seqtype in new_seqtypes:
                     new_seqtypes.append(seqtype)
             self['sample_name'] = d[indexes[7][1]]
             self['organism'] = d[indexes[8][1]]
             self['collected_by'] = d[indexes[9][1]]
             self['biosample'] = self.accessions['biosample']
             break # Just use the first entry!
             # Should be fixed to handle different query sequences!!!
     with openurl(self.sra_url2%(query)) as u: qdata = u.read()
     # Extract sample attributes
     match = re.findall(r'Sample Attributes: (.+)\n', qdata)
     lcs = {} # location parts
     host = None
     source = None
     for answer in match:
         for attributes in answer.split(';'):
             stat = attributes.split('=', 1)
             if len(stat) < 2: continue
             att = stat[0].strip('/ ').lower().replace('\'', '')
             val = stat[1].strip('\' ').replace('\'', '`')
             if att in ['geo_loc_name', 'geographic location']:
                 self.__interpret_loc(val)
             elif att == 'serovar':
                 self['subtype']['serovar'] = val
             elif att == 'mlst':
                 self['subtype']['mlst'] = val
             elif att in ['scientific_name', 'scientific name']:
                 self['organism'] = val
             elif att == 'strain':
                 self['strain'] = val
             elif att in ['isolation_source', 'isolation source']:
                 source = val
             elif att in ['host', 'specific_host', 'specific host']:
                 host = val
             elif att == 'biosample':
                 self['biosample'] = val
             elif att in ['collection_date', 'collection date']:
                 self['collection_date'] = self.__format_date(
                     *self.__interpret_date(val)
                 )
                 if self['collection_date'] == '':
                     _logger.warning(
                         'Date Empty: %s, %s',
                         val, query
                     )
             elif att in ['collected_by', 'collected by']:
                 self['collected_by'] = val
             elif att in ['country', 'region', 'city', 'zip_code']:
                 lcs[att] = val
             else:
                 self['notes'] = '%s %s: %s,' % (
                     self['notes'], att, val)
         if lcs != {}:
             h = ['country', 'region', 'city', 'zip_code']
             self.__interpret_loc(','.join([lcs[x] for x in h if x in lcs]))
     # Handle Isolation source
     cats = []
     if host is not None:
         for d in ontology:
             cats = [d[k][0] for k in d.keys() if k in host.lower()]
             if cats:
                 break
     
     if not cats and host is not None and host not in self.new_ontologies:
         self.new_ontologies[host] = query
     
     if (not cats or cats[0] == 'unknown') and source is not None:
         for d in ontology:
             cats = [d[k][0] for k in d.keys() if k in source.lower()]
             if cats:
                 break
         
         if not cats and source not in self.new_ontologies:
             self.new_ontologies[source] = query
     
     if cats:
         self['isolation_source'] = cats[0]
         _logger.warning(
             'Source identified: %s (%s, %s), %s',
             self['isolation_source'], host, source, query
         )
     else:
         if host is None:   host   = 'unknown'
         if source is None: source = 'unknown'
         _logger.warning(
             'Source not identified: (%s, %s), %s',
             host, source, query
         )
     self['source_note'] = source
     
     # Extract Run IDs associated with the sample
     #Run #1: ERR276921, 1356661 spots, 271332200 bases
     self.runIDs = re.findall(r'Run #\d+: (.+?),.+', qdata)
     
     # Notify Curators By Email
     _logger.info('Make mail? %s'%(mail is not None))
     if mail is not None:
         _logger.info('Any unknowns? %s'%(len(self.new_ontologies) > 0))
         if len(self.new_ontologies) > 0:
             _logger.debug(mail.test(
                 'New isolation source...',
                 'Sources not identified:\n%s\n'%(
                     '\n'.join(map(', '.join, self.new_ontologies.items()))
                     )
                 ))
             mail.send(
                 'New isolation source...',
                 'Sources not identified:\n%s\n'%(
                     '\n'.join(map(', '.join, self.new_ontologies.items()))
                     )
                 )
         if len(new_platforms) > 0:
             _logger.debug(mail.test(
                 'New platforms...',
                 'Platforms not accounted for:\n%s\n'%(
                     '\n'.join(new_platforms)
                     )
                 ))
             mail.send(
                 'New platforms...',
                 'Platforms not accounted for:\n%s\n'%(
                     '\n'.join(new_platforms)
                     )
                 )
     elif len(self.new_ontologies) > 0:
         _logger.debug(
             'NO MAIL! Sources not identified:\n%s\n',
             '\n'.join(map(', '.join, self.new_ontologies.items()))
             )
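
The runinfo parsing above hinges on mapping header names to column positions before reading data rows. A self-contained sketch of that technique, using hypothetical CSV data:

header = 'Run,Experiment,Sample,SRAStudy,BioSample,Platform,LibraryLayout,SampleName,ScientificName,CenterName'
row = 'ERR276921,ERX000001,ERS000001,ERP000001,SAMEA000001,ILLUMINA,PAIRED,s1,Escherichia coli,CENTER'
headers = header.split(',')
index = {name: headers.index(name) for name in headers}
fields = row.split(',')
print(fields[index['Run']], fields[index['Platform']].lower())
# ERR276921 illumina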
def ProcessExperimentCombined(experiment_id, json, batch_dir, sample_dir_id, preserve, failed_accession, skip_files=False):
    m = ExtractExperimentMetadata(experiment_id, json)
    if m.valid_metadata():
        # Check if a run ID was submitted, and if so only process that
        if experiment_id in m.runIDs: m.runIDs = [experiment_id]
        # Process the runs as one sample
        _logger.info("Found Following Runs: %s", ', '.join(m.runIDs))
        with TemporaryDirectory() as tmpdir:
            os.chdir(batch_dir)
            sample_dir = "%s/%s/"%(batch_dir, sample_dir_id)
            csfiles = []
            if preserve and os.path.exists(sample_dir):
                csfiles = [x for x in os.listdir(sample_dir) if any([y in x for y in ['fq','fastq']])]
            if csfiles == [] and not skip_files:
                sfiles = []
                for runid in m.runIDs:
                    sf = DownloadRunFiles(runid, tmpdir)
                    if sf is not None:
                        sfiles.append(sf)
                    else:
                        _logger.error("Run files could not be retrieved! (%s)",
                                      runid)
                _logger.info("Found Following files sets:\n%s\n",
                             '\n'.join([', '.join(sf) for sf in sfiles]))
                # Combine sfiles into one entry
                if len(sfiles) > 1:
                    for file_no, file_set in enumerate(zip(*sfiles)):
                        ext = '.'.join(file_set[0].split('/')[-1].split('.')[1:])
                        if len(sfiles[0]) > 1:
                            new_file = "%s_%s.combined.%s"%(experiment_id, file_no+1, ext)
                        else:
                            new_file = "%s.combined.%s"%(experiment_id, ext)
                        with open(new_file, 'wb') as nf:
                            for fn in file_set:
                                with open(fn, 'rb') as f:
                                    nf.write(f.read())
                        if os.path.exists(new_file):
                            csfiles.append(new_file)
                        else:
                            _logger.error("Combined file creation failed! (%s: %s)",
                                          experiment_id, file_no)
                            break
                elif sfiles and isinstance(sfiles[0], list):
                    csfiles = sfiles[0]
                if csfiles == []:
                    _logger.error("Files could not be combined! (%s)",
                                  experiment_id)
                    failed_accession.append(experiment_id)
            if csfiles != [] or skip_files:
                success = CreateSampleDir(csfiles, m, sample_dir, preserve, skip_files)
                if success:
                    sample_dir_id += 1
                else:
                    failed_accession.append(experiment_id)
            else:
                _logger.error("Files could not be retrieved! (%s)",
                              experiment_id)
                failed_accession.append(experiment_id)
    else:
        _logger.error("Metadata Invalid! (%s) - %s", experiment_id, m.metadata.items())
        failed_accession.append(experiment_id)
    return sample_dir_id
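
The combining step above pairs the n-th file of every run via zip(*sfiles) and concatenates each group byte-for-byte; this works for .fq.gz inputs because concatenated gzip members form a legal gzip stream. A standalone sketch of the pattern with hypothetical file names, streaming with shutil.copyfileobj instead of f.read() to bound memory:

import shutil

sfiles = [['run1_1.fq.gz', 'run1_2.fq.gz'],
          ['run2_1.fq.gz', 'run2_2.fq.gz']]
for file_no, file_set in enumerate(zip(*sfiles)):
    new_file = 'EXP_%s.combined.fq.gz' % (file_no + 1)
    with open(new_file, 'wb') as nf:
        for fn in file_set:
            with open(fn, 'rb') as f:
                shutil.copyfileobj(f, nf)  # stream each run's file into the combined file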