Exemplo n.º 1
0
def get_git(name, description, source, url, commit, checksum, destination,
            preparation, checker):
    '''
    Get a dependency from git

    Clones ``url`` into ``destination`` (created if it does not exist), checks
    out ``commit``, initialises any git submodules and then runs each entry of
    ``preparation`` from inside the cloned repository.

    name : label used in the progress message.
    url : address of the git repository. The clone folder is taken to be the
        last path component of the url without a trailing '.git'.
    commit : revision handed to 'git checkout'.
    destination : folder to clone into.
    preparation : None, or an iterable of dicts each providing a 'function'
        callable and its 'arguments' (a dict is applied as keyword arguments,
        anything else as positional arguments).
    description, source, checksum, checker : not used by this function.

    The exit codes of the git commands are not checked; a failed clone shows
    up as an OSError when changing into the missing clone folder.

    On return the current directory is two levels above the cloned
    repository, i.e. the parent folder of ``destination``.
    '''

    if _os.path.realpath(_os.path.curdir) != destination:
        try:
            _os.chdir(destination)
        except OSError:
            _os.makedirs(destination)
            _os.chdir(destination)

    # folder git will clone into: last component of the url minus a trailing
    # '.git' (only the suffix is removed so names such as 'foo.github' survive)
    repo_name = url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]

    try:
        # clear any previous versions
        _shutil.rmtree(repo_name)
    except OSError:
        pass

    git_server = url.replace('https://', '').replace('http://',
                                                     '').split('/')[0]
    print('Downloading {} via git from {} . . .'.format(name, git_server))
    _subprocess.call(['git', 'clone', url])
    _os.chdir(repo_name)
    _subprocess.call(['git', 'checkout', commit])
    # if repo uses git submodules, those will be set to the correct revisions for this commit
    # else will do nothing
    _subprocess.call(['git', 'submodule', 'update', '--init'])

    # record the clone folder as a resolved absolute path so that it can be
    # compared with, and returned to from, wherever a preparation step ends
    # up (a relative destination would otherwise never match realpath())
    working_dir = _os.path.realpath(_os.path.curdir)

    if preparation is not None:
        for do_this in preparation:
            if isinstance(do_this['arguments'], dict):
                do_this['function'](**do_this['arguments'])
            else:
                do_this['function'](*do_this['arguments'])

            # restore position in path if a prepare changed it
            if working_dir != _os.path.realpath(_os.path.curdir):
                _os.chdir(working_dir)

    # step out of the clone folder and then out of the destination folder
    _os.chdir(_os.path.pardir)
    _os.chdir(_os.path.pardir)
Exemplo n.º 2
0
def get_git(name, description, source, url, commit, checksum, destination, preparation, checker):
    '''
    Get a dependency from git

    The repository at ``url`` is cloned inside ``destination`` (which is
    created when missing), ``commit`` is checked out, git submodules are
    initialised and finally every step listed in ``preparation`` is run with
    the clone as current directory.

    Arguments used here:
    name -- only used in the progress message
    url -- repository address; its last path component, without a trailing
        '.git', is the name of the clone folder
    commit -- revision passed to 'git checkout'
    destination -- folder in which the clone is made
    preparation -- None or an iterable of dicts with keys 'function' and
        'arguments'; dict arguments are passed as keywords, other arguments
        are unpacked positionally

    description, source, checksum and checker are accepted but not used.

    Return codes of the git commands are ignored. When the function returns,
    the current directory is the parent of ``destination``.
    '''

    if _os.path.realpath(_os.path.curdir) != destination:
        try:
            _os.chdir(destination)
        except OSError:
            _os.makedirs(destination)
            _os.chdir(destination)

    # name of the folder 'git clone' creates: tolerate a trailing slash and
    # drop '.git' only when it is the suffix, not wherever it occurs
    clone_folder = url.rstrip('/').rsplit('/', 1)[-1]
    if clone_folder.endswith('.git'):
        clone_folder = clone_folder[:-4]

    try:
        # clear any previous versions
        _shutil.rmtree(clone_folder)
    except OSError:
        pass

    git_server = url.replace('https://','').replace('http://','').split('/')[0]
    print('Downloading {} via git from {} . . .'.format(name, git_server))
    _subprocess.call(['git', 'clone', url])
    _os.chdir(clone_folder)
    _subprocess.call(['git', 'checkout', commit])
    # if repo uses git submodules, those will be set to the correct revisions for this commit
    # else will do nothing
    _subprocess.call(['git', 'submodule', 'update', '--init'])

    # we are inside the clone now: remember its resolved absolute path.
    # Joining the (possibly relative) destination with the folder name would
    # never equal realpath() and could not be chdir'd to from in here.
    working_dir = _os.path.realpath(_os.path.curdir)

    for step in (preparation if preparation is not None else ()):
        arguments = step['arguments']
        if isinstance(arguments, dict):
            step['function'](**arguments)
        else:
            step['function'](*arguments)

        # restore position in path if a prepare changed it
        if _os.path.realpath(_os.path.curdir) != working_dir:
            _os.chdir(working_dir)

    # leave the clone folder, then leave the destination folder
    for _level in range(2):
        _os.chdir(_os.path.pardir)
Exemplo n.º 3
0
    def getFromENA(self, run_acc_list, 
                         ftp_server_url = 'ftp.sra.ebi.ac.uk', 
                         local_reads_path = ['reads']):
        '''
        Given a list of 'run' accession numbers for paired end short read analyses, 
        download the read files from the European Nucleotide Archive.

        If using a mirror server, supply an alternative for 'ftp_server_url'.

        'local_reads_path' can be a path string or a list of folder names.

        Each accession is looked up with the ENA warehouse search to obtain
        the FTP paths and MD5 checksums of its two fastq files. A file that
        already exists locally with a matching checksum is not downloaded
        again.

        The result is stored in self.read_files as
        {run_accession: {1: local_path, 2: local_path}}; a file is only
        listed if its checksum matched. Accessions for which the ENA query
        gave an unexpected answer five times in a row are skipped and named
        in a warning at the end.
        '''
        # imported here because only the FTP class is known to be in scope
        from ftplib import all_errors as _ftp_errors

        if isinstance(local_reads_path, list):
            local_reads_path = _os.path.sep.join(local_reads_path)

        if not _os.path.exists(local_reads_path):
            _os.makedirs(local_reads_path)

        print('Logging in to %s' % ftp_server_url)
        ftp = _FTP(ftp_server_url)
        # anonymous login
        print(ftp.login())

        def check_connection(ftp):
            # a dropped session surfaces as EOFError or an ftplib error, not
            # only as IOError, so catch everything ftplib declares
            try:
                print('FTP: %s' % ftp.voidcmd("NOOP"))
                # http://docs.python.org/2/library/ftplib.html
                return(True)
            except _ftp_errors as e:
                print('Seems to be a problem with the connection to FTP server:')
                print('I/O error({0}): {1}'.format(
                                getattr(e, 'errno', None),
                                getattr(e, 'strerror', None) or e))
                return(False)

        def calc_checksum(filepath):
            # MD5 of a file, read in 64 KiB pieces; handle closed on exit
            hasher = _md5()
            with open(filepath, 'rb') as handle:
                buff = handle.read(65536)
                while len(buff) > 0:
                    hasher.update(buff)
                    buff = handle.read(65536)
            
            return(hasher.hexdigest())

        downloaded_read_files = {}

        query_url_base = 'http://www.ebi.ac.uk/ena/data/warehouse/search?query='
        max_tries = 5

        start_time = _time.time()
        failed = []
        for cnum,run_acc in enumerate(run_acc_list):
            
            rest_req = '"run_accession=%s"&result=read_run&fields=fastq_ftp,fastq_md5&display=report' % run_acc
            success = False
            tries = 0
            while not success:
                print('Sending query to ENA:\n%s' % rest_req)
                result = _urllib2.urlopen(query_url_base + rest_req).read()
                print('ENA accession numbers query result:\n%s' % result)
                # NOTE(review): a usable report is recognised by containing
                # 'ERR' exactly 7 times - presumably this only fits
                # accessions starting with ERR; confirm for other prefixes
                if result.count('ERR') == 7:
                    success = True
                else:
                    # count first so that attempts are reported from 1
                    tries += 1
                    print('Query result from ENA was unexpected on attempt %s of %s' % (tries, max_tries))
                    _time.sleep(0.5)
                    if tries == max_tries:
                        print('Attempt %s failed. Try again later and if problem persists, report bug.' % tries)
                        failed.append(run_acc)
                        break
            
            if not success:
                continue
            
            # last complete line of the report: the final two tab separated
            # fields hold the ';' separated FTP paths and MD5 checksums
            report_fields = result.split('\n')[-2].split('\t')
            md5s = report_fields[-1][:-1].split(';')
            ENA_paths = report_fields[-2][:-1].split(';')
            
            ENA_reads_pair_paths = {}
            local_reads_pair_paths = {}
            for f in (1,2):
                ENA_reads_pair_paths[f] = ENA_paths[f - 1].replace(ftp_server_url, '')
                local_reads_pair_paths[f] = local_reads_path + \
                                            _os.path.sep + \
                                            ENA_reads_pair_paths[f].split('/')[-1]
            
            downloaded_read_files[run_acc] = {}
            
            for f in (1,2):
                # ensure connection is still open
                while not check_connection(ftp):
                    _time.sleep(0.5)
                    print('Attempting to re-establish connection . . .')
                    ftp = _FTP(ftp_server_url)
                    # anonymous login
                    print(ftp.login())
                
                expected_checksum = md5s[f - 1]
                
                if _os.path.exists(local_reads_pair_paths[f]):
                    print('File %s for %s exists locally: %s' % (f, run_acc, local_reads_pair_paths[f]))
                    actual_checksum = calc_checksum(local_reads_pair_paths[f])
                    if actual_checksum == expected_checksum:
                        print('File checksum matches: %s. Skipping download' % (expected_checksum))
                        downloaded_read_files[run_acc][f] = local_reads_pair_paths[f]
                        continue
                    else:
                        print('Checksum mismatch')
                
                print('Downloading via %s: %s' % (ftp_server_url, ENA_reads_pair_paths[f]))
                # close the local file before it is read back for checksumming
                with open(local_reads_pair_paths[f], 'wb') as fout:
                    res = ftp.retrbinary('RETR %s' % ENA_reads_pair_paths[f], 
                                         fout.write)
                print('FTP: %s' % res)
                
                print('Calculating checksum . . .')
                actual_checksum = calc_checksum(local_reads_pair_paths[f])
                
                if actual_checksum == expected_checksum:
                    print('File checksum matches: %s.' % (expected_checksum))
                    downloaded_read_files[run_acc][f] = local_reads_pair_paths[f]
                else:
                    print('Checksum mismatch for: %s' % local_reads_pair_paths[f])
            
            if len(run_acc_list) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(run_acc_list))

        if len(failed) > 0:
            print('WARNING: some accession numbers did not return a result from ENA')
            print('Try searching http://www.ebi.ac.uk/ena in a web-browser for:')
            print(', '.join(failed))

        self.read_files = downloaded_read_files
Exemplo n.º 4
0
    def getFromENA(self,
                   run_acc_list,
                   ftp_server_url='ftp.sra.ebi.ac.uk',
                   local_reads_path=['reads']):
        '''
        Given a list of 'run' accession numbers for paired end short read analyses, 
        download the read files from the European Nucleotide Archive.

        If using a mirror server, supply an alternative for 'ftp_server_url'.

        'local_reads_path' can be a path string or a list of folder names.

        For every accession the ENA warehouse search is asked for the FTP
        paths and MD5 checksums of the two fastq files. Files found locally
        with the expected checksum are kept, everything else is fetched by
        anonymous FTP and then verified.

        On completion self.read_files is set to a dict of the form
        {run_accession: {1: local_path, 2: local_path}} containing only
        files that passed the checksum test. Accessions whose ENA query
        returned an unexpected answer on all five attempts are left out and
        reported in a final warning.
        '''
        # local import: the visible code only shows the FTP class in scope
        from ftplib import all_errors as _ftp_errors

        if isinstance(local_reads_path, list):
            local_reads_path = _os.path.sep.join(local_reads_path)

        if not _os.path.exists(local_reads_path):
            _os.makedirs(local_reads_path)

        def connect():
            # open a session on the server and log in anonymously
            connection = _FTP(ftp_server_url)
            print(connection.login())
            return connection

        def check_connection(ftp):
            # True if the server answers a NOOP. Lost sessions raise EOFError
            # or ftplib errors as well as IOError, hence the wide catch.
            try:
                print('FTP: %s' % ftp.voidcmd("NOOP"))
                # http://docs.python.org/2/library/ftplib.html
            except _ftp_errors as e:
                print(
                    'Seems to be a problem with the connection to FTP server:')
                print('I/O error({0}): {1}'.format(
                    getattr(e, 'errno', None),
                    getattr(e, 'strerror', None) or e))
                return False
            return True

        def calc_checksum(filepath):
            # hex MD5 digest of the file, read in 64 KiB blocks
            hasher = _md5()
            with open(filepath, 'rb') as handle:
                for buff in iter(lambda: handle.read(65536), b''):
                    hasher.update(buff)
            return hasher.hexdigest()

        print('Logging in to %s' % ftp_server_url)
        ftp = connect()

        query_url_base = 'http://www.ebi.ac.uk/ena/data/warehouse/search?query='
        max_tries = 5

        downloaded_read_files = {}
        failed = []
        start_time = _time.time()
        for cnum, run_acc in enumerate(run_acc_list):

            rest_req = '"run_accession=%s"&result=read_run&fields=fastq_ftp,fastq_md5&display=report' % run_acc
            result = None
            for attempt in range(1, max_tries + 1):
                print('Sending query to ENA:\n%s' % rest_req)
                response = _urllib2.urlopen(query_url_base + rest_req).read()
                print('ENA accession numbers query result:\n%s' % response)
                # NOTE(review): 'ERR' occurring exactly 7 times is what marks
                # a usable report - presumably tied to accessions that start
                # with ERR; verify before using other prefixes
                if response.count('ERR') == 7:
                    result = response
                    break
                print(
                    'Query result from ENA was unexpected on attempt %s of %s'
                    % (attempt, max_tries))
                _time.sleep(0.5)

            if result is None:
                print(
                    'Attempt %s failed. Try again later and if problem persists, report bug.'
                    % max_tries)
                failed.append(run_acc)
                continue

            # the last full line of the report ends with two tab separated
            # fields: ';' separated FTP paths, then ';' separated checksums
            last_row = result.split('\n')[-2].split('\t')
            md5s = last_row[-1][:-1].split(';')
            ENA_paths = last_row[-2][:-1].split(';')

            downloaded_read_files[run_acc] = {}

            for f in (1, 2):
                remote_path = ENA_paths[f - 1].replace(ftp_server_url, '')
                local_path = _os.path.sep.join(
                    [local_reads_path, remote_path.split('/')[-1]])
                expected_checksum = md5s[f - 1]

                # ensure connection is still open
                while not check_connection(ftp):
                    _time.sleep(0.5)
                    print('Attempting to re-establish connection . . .')
                    ftp = connect()

                if _os.path.exists(local_path):
                    print('File %s for %s exists locally: %s' %
                          (f, run_acc, local_path))
                    if calc_checksum(local_path) == expected_checksum:
                        print('File checksum matches: %s. Skipping download' %
                              (expected_checksum))
                        downloaded_read_files[run_acc][f] = local_path
                        continue
                    print('Checksum mismatch')

                print('Downloading via %s: %s' % (ftp_server_url, remote_path))
                # the with-block guarantees the data is flushed and the file
                # closed before it is read again for the checksum
                with open(local_path, 'wb') as fout:
                    res = ftp.retrbinary('RETR %s' % remote_path, fout.write)
                print('FTP: %s' % res)

                print('Calculating checksum . . .')
                if calc_checksum(local_path) == expected_checksum:
                    print('File checksum matches: %s.' % (expected_checksum))
                    downloaded_read_files[run_acc][f] = local_path
                else:
                    print('Checksum mismatch for: %s' % local_path)

            if len(run_acc_list) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(run_acc_list))

        if failed:
            print(
                'WARNING: some accession numbers did not return a result from ENA'
            )
            print(
                'Try searching http://www.ebi.ac.uk/ena in a web-browser for:')
            print(', '.join(failed))

        self.read_files = downloaded_read_files
Exemplo n.º 5
0
    def SPAdes(self,
               exe=[],
               output_folder=['assemblies', 'SPAdes'],
               mem_num_gigs=8,
               max_cpus=-1,
               single_assembly=False,
               careful=True,
               only_assembler=False):
        '''
        de novo assembly of short reads using SPAdes

        By default, the provided short reads in dictionary: self.read_files
        will be assembled separately, unless single_assembly set to True in 
        which case each set of paired read fastq files will be used in a 
        single assembly.

        Parameters:
        exe : list of path components of the SPAdes executable; if empty the
            path is obtained from _get_exe_path('spades').
        output_folder : list of folder names forming the base output path.
        mem_num_gigs : value passed to --memory.
        max_cpus : passed to _decide_max_processes(); the result is used for
            --threads.
        single_assembly : combine all read sets into one assembly.
        careful : add --careful.
        only_assembler : add --only-assembler.

        self.read_files maps a name to the read files of that set, either as
        a sequence or as a dict whose values are taken in key order: forward
        reads, reverse reads and optionally unpaired reads. If the instance
        has a read_files2 attribute it is used in the same way to add a
        second library. With single_assembly=True self.read_files must not
        be empty and its names must look like <sample>__<genome>_<rest>.

        Sets self.paths_to_contigs to a dict of paths to 'contigs.fasta',
        keyed by read set name (or 'multi_region' for a single assembly). The
        value is None where SPAdes exited with a non-zero return code; in
        that case some details are written to a '___SPAdes_..._bad_....log'
        file in the current directory.

        http://spades.bioinf.spbau.ru/release3.6.1/manual.html
        relevant inputs:
        -o <output_dir> Specify the output directory. Required option.
        --sc required for MDA (single-cell) data.
        --only-error-correction
        --only-assembler
        --careful reduce the number of mismatches and short indels. Run MismatchCorrector - a post processing tool. Recommended.
        --continue from the specified output folder starting from the last available check-point
        --restart-from <check_point>
            ec start from error correction
            as restart assembly module from the first iteration
            k<int> restart from the iteration with specified k values, e.g. k55
            mc restart mismatch correction
        --pe1-12 <file_name> interlaced forward and reverse paired-end reads.
        --pe1-1 <file_name> File with forward reads.
        --pe1-2 <file_name> File with reverse reads.
        --pe1-s <file_name> File with unpaired reads . . use --pe2-... for next library
        --threads <int>
        --memory <int> max memory in Gb
        -k <int,int,...>  Comma-separated list of odd ascending k-mers
        If --sc is set the default value are 21,33,55, for multicell data sets it is auto
        --cov-cutoff <float> positive float value, or 'auto', or 'off'. Default value is 'off'
        '''

        assert isinstance(
            output_folder,
            list), 'Provide output folder as list of folders forming path'

        base_output_path = _os.path.sep.join(output_folder)

        if not _os.path.exists(base_output_path):
            _os.makedirs(base_output_path)

        # max threads is slightly different to cpus
        # . . can probably use more
        max_processes = _decide_max_processes(max_cpus)

        # if an exe is not provided, use that stored in Dependencies
        if len(exe):
            use_exe = _os.path.sep.join(exe)
        else:
            from baga import Dependencies
            use_exe = _get_exe_path('spades')

        def new_cmd():
            # start a fresh command line for one SPAdes run
            if isinstance(use_exe, list):
                # allow for use of prepended executable with script to run
                return list(use_exe)
            # or just executable
            return [use_exe]

        def add_library(cmd, lib_num, files):
            # append the --pe<lib_num>-* options for one set of read files
            # allow use of tuples or dicts by converting dicts to lists
            if isinstance(files, dict):
                use_files = [v for k, v in sorted(files.items())]
            else:
                use_files = files
            cmd += ['--pe{}-1'.format(lib_num), use_files[0]]
            cmd += ['--pe{}-2'.format(lib_num), use_files[1]]
            try:
                # use unpaired reads if available
                cmd += ['--pe{}-s'.format(lib_num), use_files[2]]
            except IndexError:
                pass

        def add_second_library(cmd, lib_num, pairname):
            # add a second library if provided: read_files2 is optional
            try:
                files2 = self.read_files2[pairname]
            except AttributeError:
                return
            add_library(cmd, lib_num, files2)

        def add_common_options(cmd, this_output_path):
            # options shared by every run
            cmd += ['-o', this_output_path]
            cmd += ['--threads', str(max_processes)]
            cmd += ['--memory', str(mem_num_gigs)]
            if only_assembler:
                cmd += ['--only-assembler']
            if careful:
                cmd += ['--careful']

        def run_SPAdes(cmd, this_output_path, cnum, thetime):
            # Run one assembly and return the path to its contigs, or None if
            # SPAdes reported failure through its return code.
            proc = _subprocess.Popen(cmd,
                                     stdout=_subprocess.PIPE,
                                     stderr=_subprocess.PIPE)
            stdout_value, stderr_value = proc.communicate()
            path2contigs = _os.path.sep.join(
                [this_output_path, 'contigs.fasta'])

            # allow for failed SPAdes runs (possibly caused by small fastq files)
            # communicate() does not raise on a non-zero exit, so the return
            # code has to be inspected explicitly
            if proc.returncode != 0:
                print('SPAdes probably did not complete: error returned ({})'.
                      format(proc.returncode))
                logname = '___SPAdes_{}_bad_{}.log'.format(cnum, thetime)
                print('Writing some info relevent to SPAdes crash to {}'.
                      format(logname))
                with open(logname, 'w') as fout:
                    fout.write('return code: {}\n'.format(proc.returncode))
                    fout.write(path2contigs + '\n')
                    fout.write('{}\n'.format(stderr_value))
                return (None)

            # collect the lines between 'finished WITH WARNINGS!' and
            # 'Warnings saved to' in the SPAdes output
            checkthese = []
            getline = False
            for line in stdout_value.split('\n'):
                if 'Warnings saved to' in line:
                    getline = False
                if getline:
                    l = line.rstrip()
                    if len(l):
                        checkthese += [l]
                if 'SPAdes pipeline finished WITH WARNINGS!' in line:
                    getline = True

            if len(checkthese):
                print('SPAdes completed with warnings:\n{}\n'.format(
                    '\n'.join(checkthese)))
            else:
                print('SPAdes completed without warnings')

            return (path2contigs)

        contigs = {}
        if single_assembly:
            print(
                'Combining reads aligned at multiple regions into single assembly'
            )
            cmd = new_cmd()
            for cnum, (pairname, files) in enumerate(self.read_files.items()):
                add_library(cmd, cnum + 1, files)

            # NOTE(review): only the second library of the last read set is
            # added here, as the next free library number - confirm that
            # this is intended rather than one second library per read set
            add_second_library(cmd, cnum + 2, pairname)

            ## this isn't very flexible:
            # retain <sample>__<genome> from pairname:
            # pairname == <sample>__<genome>_<start>-<end>+<padding>
            # and replace with multiregion
            folder = '{}__{}_{}'.format(
                pairname.split('__')[0],
                pairname.split('__')[1].split('_')[0], 'multi_region')
            this_output_path = _os.path.sep.join(output_folder + [folder])
            if not _os.path.exists(this_output_path):
                _os.makedirs(this_output_path)

            add_common_options(cmd, this_output_path)
            thetime = _time.asctime(_time.localtime(_time.time()))
            print('about to launch SPAdes . . . at {}'.format(thetime))
            print(' '.join(cmd))
            contigs['multi_region'] = run_SPAdes(cmd, this_output_path, cnum,
                                                 thetime)
        else:
            start_time = _time.time()
            # prepare commandline and launch each SPAdes assembly
            for cnum, (pairname,
                       files) in enumerate(sorted(self.read_files.items())):
                cmd = new_cmd()
                add_library(cmd, 1, files)
                add_second_library(cmd, 2, pairname)

                this_output_path = _os.path.sep.join(output_folder +
                                                     [pairname])
                if not _os.path.exists(this_output_path):
                    _os.makedirs(this_output_path)

                add_common_options(cmd, this_output_path)
                thetime = _time.asctime(_time.localtime(_time.time()))
                print('about to launch SPAdes . . . at {}'.format(thetime))
                print(' '.join(cmd))
                contigs[pairname] = run_SPAdes(cmd, this_output_path, cnum,
                                               thetime)
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))

        self.paths_to_contigs = contigs
Exemplo n.º 6
0
def get_download(name, description, source, url, commit, checksum, destination,
                 preparation, checker):
    '''
    Download and unpack a dependency

    The archive at url is read into memory, verified against checksum (if
    given) and unpacked inside destination, which is created if missing.
    The working directory is changed while this runs and is restored on
    return, including when an error is raised.

    name: package name; selects how members are extracted when destination
        is 'local_packages'
    url: address of a .tar.gz, .tar.bz2 or .zip archive
    checksum: '<algorithm>=<hexdigest>' where algorithm is one of md5, sha1,
        sha224, sha256, sha384, sha512; a false value skips verification
    destination: folder to unpack into
    preparation: optional list of dicts, each with a 'function' to call and
        its 'arguments', run after unpacking
    description, source, commit, checker: not used here

    Raises ValueError for an unsupported checksum algorithm or archive type
    and AssertionError if the checksum does not match.
    '''
    # binary-safe in-memory buffer available on both Python 2 and Python 3
    import io as _io

    initialdir = _os.path.abspath(_os.curdir)
    try:
        _os.chdir(destination)
    except OSError:
        _os.makedirs(destination)
        _os.chdir(destination)

    try:
        hasher = None
        if checksum:
            hasher_algorithm = checksum.split('=')[0]
            expected_digest = checksum.split('=')[1]
            supported = ('md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512')
            if hasher_algorithm not in supported:
                # fail before downloading rather than with a NameError after
                print("{} checksums not implemented in Python's hashlib!".format(
                    hasher_algorithm))
                raise ValueError(
                    'Unsupported checksum algorithm: {}'.format(hasher_algorithm))
            hasher = getattr(_hashlib, hasher_algorithm)()

        print('Downloading: %s' % url)
        req = _urllib2.urlopen(url)
        CHUNK = 16 * 1024 * 16
        data = _io.BytesIO()
        c = 0
        try:
            while True:
                chunk = req.read(CHUNK)
                if not chunk:
                    break
                # count what actually arrived: the last chunk is usually short
                c += len(chunk)
                print("{:,} bytes".format(c))
                data.write(chunk)
        finally:
            req.close()

        print('Download complete . . .')
        data.seek(0)

        if checksum:
            buff = data.read(65536)
            while len(buff) > 0:
                hasher.update(buff)
                buff = data.read(65536)

            # raised explicitly so verification is not stripped by "python -O"
            if hasher.hexdigest() != expected_digest:
                raise AssertionError('. . . checksum fail!')
            print('. . . checksum passed!')
            data.seek(0)

        if url.endswith('tar.gz'):
            archive = _tarfile.open(mode="r:gz", fileobj=data)
        elif url.endswith('tar.bz2'):
            archive = _tarfile.open(mode="r:bz2", fileobj=data)
        elif url.endswith('.zip'):
            archive = _zipfile.ZipFile(data)
        else:
            raise ValueError(
                'Unrecognised archive type (expected .tar.gz, .tar.bz2 or '
                '.zip): {}'.format(url))

        if destination == 'local_packages':
            # extract as a pypi python package
            # NOTE: this branch uses the tarfile API and strips a 7 character
            # suffix ('.tar.gz') from the file name to get the release name
            release = url.split('/')[-1][:-7]
            print('Extracting {} to {}'.format(
                release, _os.path.sep.join([destination, name])))
            c = 0
            nostrip = {'pysam'}
            if name in nostrip:
                try:
                    # clear any previous extraction
                    _shutil.rmtree(archive.getnames()[0])
                except OSError:
                    pass
                # some python modules should not be stripped . . more complex install
                for member in archive.getmembers():
                    if member.isreg():
                        archive.extract(member)
                        print(member.name)
                        c += 1
            else:
                # others don't need additional compilation: keep only the
                # package folder and drop the leading release folder
                check_path1 = '{}/{}'.format(release, name)
                for member in archive.getmembers():
                    if member.isreg() and check_path1 in member.name:
                        member.name = _os.path.sep.join(
                            member.name.split(_os.path.sep)[1:])
                        archive.extract(member)
                        c += 1
            print('Extracted {} files'.format(c))

        else:
            # extract as a generic external program
            archive.extractall()

        if preparation:
            for do_this in preparation:
                if 'just_packages' in do_this['arguments']:
                    # this is the only thing that differentiates this prepare()
                    # from others that need some chdir <== this should be improved
                    # see dep dict
                    curdir = _os.path.abspath(_os.curdir)
                    _os.chdir(_os.path.pardir)
                    do_this['function'](*do_this['arguments']['package_list'])
                    # return to previous folder
                    _os.chdir(curdir)
                else:
                    # tar archives list members with getnames(), zip archives
                    # with namelist()
                    if hasattr(archive, 'getnames'):
                        member_names = archive.getnames()
                    else:
                        member_names = archive.namelist()
                    extracted_base_dir = member_names[0].split(
                        _os.path.sep)[0]
                    curdir = _os.path.abspath(_os.curdir)
                    # go to installed folder
                    _os.chdir(_os.path.sep.join([destination, extracted_base_dir]))
                    do_this['function'](**do_this['arguments'])
                    # return to previous folder
                    _os.chdir(curdir)
    finally:
        # always hand the caller back its original working directory
        _os.chdir(initialdir)
Exemplo n.º 7
0
    def SPAdes(self,
            exe = [],
            output_folder = ['assemblies','SPAdes'],
            mem_num_gigs = 8,
            max_cpus = -1,
            single_assembly = False,
            careful = True,
            only_assembler = False):
        '''
        de novo assembly of short reads using SPAdes

        By default, each set of paired read fastq files in the dictionary
        self.read_files is assembled separately. If single_assembly is True,
        every set is instead supplied as its own library to a single SPAdes
        run. A second library per set is added from self.read_files2 when that
        attribute exists.

        Results are stored in self.paths_to_contigs: a dict mapping each key
        of self.read_files (or 'multi_region' for a single assembly) to the
        expected path of contigs.fasta, or to None when SPAdes exited with a
        non-zero status.

        exe: path to the SPAdes executable as a list of path components; if
            empty the path is looked up with _get_exe_path('spades')
        output_folder: output path as a list of folders
        mem_num_gigs: passed to --memory
        max_cpus: passed through _decide_max_processes() to give --threads
        single_assembly: one combined assembly instead of one per read set
        careful: add --careful
        only_assembler: add --only-assembler

        http://spades.bioinf.spbau.ru/release3.6.1/manual.html
        relevent inputs:
        -o <output_dir> Specify the output directory. Required option.
        --sc required for MDA (single-cell) data.
        --only-error-correction
        --only-assembler
        --careful reduce the number of mismatches and short indels. Run MismatchCorrector - a post processing tool. Recommended.
        --continue from the specified output folder starting from the last available check-point
        --restart-from <check_point>
            ec start from error correction
            as restart assembly module from the first iteration
            k<int> restart from the iteration with specified k values, e.g. k55
            mc restart mismatch correction
        --pe1-12 <file_name> interlaced forward and reverse paired-end reads.
        --pe1-1 <file_name> File with forward reads.
        --pe1-2 <file_name> File with reverse reads.
        --pe1-s <file_name> File with unpaired reads . . use --pe2-... for next library
        --threads <int>
        --memory <int> max memory in Gb
        -k <int,int,...>  Comma-separated list of odd ascending k-mers
        If --sc is set the default value are 21,33,55, for multicell data sets it is auto
        --cov-cutoff <float> positive float value, or 'auto', or 'off'. Default value is 'off'
        '''

        assert isinstance(output_folder, list), 'Provide output folder as list of folders forming path'

        base_output_path = _os.path.sep.join(output_folder)

        if not _os.path.exists(base_output_path):
            _os.makedirs(base_output_path)

        # max threads is slightly different to cpus
        # . . can probably use more
        max_processes = _decide_max_processes( max_cpus )

        # if an exe is not provided, use that stored in Dependencies
        if len(exe):
            use_exe = _os.path.sep.join(exe)
        else:
            # import retained from the original; not referenced directly here
            from baga import Dependencies
            use_exe = _get_exe_path('spades')

        def new_cmd():
            # start a fresh argument list for one SPAdes run
            if isinstance(use_exe, list):
                # allow for use of prepended executable with script to run
                return list(use_exe)
            # or just executable
            return [use_exe]

        def add_library(cmd, lib_num, files):
            # append --pe<lib_num>-1/-2 (and -s if a third file exists) to cmd
            if isinstance(files, dict):
                # allow use of tuples or dicts: dict values are taken in key order
                use_files = [v for k,v in sorted(files.items())]
            else:
                use_files = files
            cmd.extend(['--pe{}-1'.format(lib_num), use_files[0]])
            cmd.extend(['--pe{}-2'.format(lib_num), use_files[1]])
            try:
                # use unpaired reads if available
                cmd.extend(['--pe{}-s'.format(lib_num), use_files[2]])
            except IndexError:
                pass

        def as_text(raw):
            # Popen pipes give bytes on Python 3 but str on Python 2
            if isinstance(raw, str):
                return raw
            return raw.decode('utf-8', 'replace')

        def run_SPAdes(cmd, output_path, run_num, launch_time):
            # run SPAdes; return the contigs path, or None on a non-zero exit
            proc = _subprocess.Popen(cmd, stdout=_subprocess.PIPE, stderr=_subprocess.PIPE)
            stdout_value, stderr_value = proc.communicate()
            path2contigs = _os.path.sep.join([output_path,'contigs.fasta'])

            # allow for failed SPAdes runs (possibly caused by small fastq files)
            # communicate() never raises for a failing child process, so the
            # exit status has to be inspected explicitly
            if proc.returncode != 0:
                log_name = '___SPAdes_{}_bad_{}.log'.format(run_num, launch_time)
                print('SPAdes probably did not complete: error returned ({})'.format(proc.returncode))
                print('Writing some info relevent to SPAdes crash to {}'.format(log_name))
                with open(log_name, 'w') as fout:
                    fout.write('command: {}\n'.format(' '.join(cmd)))
                    fout.write('return code: {}\n'.format(proc.returncode))
                    fout.write('expected contigs: {}\n'.format(path2contigs))
                    fout.write('stderr:\n{}\n'.format(as_text(stderr_value)))
                return(None)

            # collect the lines SPAdes prints between its "finished WITH
            # WARNINGS" banner and the "Warnings saved to" line
            checkthese = []
            getline = False
            for line in as_text(stdout_value).split('\n'):
                if 'Warnings saved to' in line:
                    getline = False
                if getline:
                    l = line.rstrip()
                    if len(l):
                        checkthese += [l]
                if 'SPAdes pipeline finished WITH WARNINGS!' in line:
                    getline = True

            if len(checkthese):
                print('SPAdes completed with warnings:\n{}\n'.format('\n'.join(checkthese)))
            else:
                print('SPAdes completed without warnings')

            return(path2contigs)

        def launch(cmd, output_path, run_num):
            # add the options shared by every run, then start SPAdes
            if not _os.path.exists(output_path):
                _os.makedirs(output_path)
            cmd.extend(['-o', output_path])
            cmd.extend(['--threads', str(max_processes)])
            cmd.extend(['--memory', str(mem_num_gigs)])
            if only_assembler:
                cmd.append('--only-assembler')
            if careful:
                cmd.append('--careful')
            thetime = _time.asctime( _time.localtime(_time.time()) )
            print('about to launch SPAdes . . . at {}'.format(thetime))
            print(' '.join(cmd))
            return run_SPAdes(cmd, output_path, run_num, thetime)

        contigs = {}
        if single_assembly:
            print('Combining reads aligned at multiple regions into single assembly')
            if not self.read_files:
                # the folder name below is derived from a read set name
                raise ValueError('No read files to assemble: self.read_files is empty')
            cmd = new_cmd()
            for cnum, (pairname, files) in enumerate(self.read_files.items()):
                add_library(cmd, cnum+1, files)
            try:
                # add a second library if provided
                # NOTE(review): only the last read set iterated contributes a
                # second library here - confirm this is intended
                add_library(cmd, cnum+2, self.read_files2[pairname])
            except AttributeError:
                # no read_files2 attribute: no second library
                pass

            ## this isn't very flexible:
            # retain <sample>__<genome> from pairname:
            # pairname == <sample>__<genome>_<start>-<end>+<padding>
            # and replace with multiregion
            folder = '{}__{}_{}'.format(pairname.split('__')[0],
                                        pairname.split('__')[1].split('_')[0],
                                        'multi_region')
            this_output_path = _os.path.sep.join(output_folder + [folder])
            contigs['multi_region'] = launch(cmd, this_output_path, cnum)
        else:
            start_time = _time.time()
            # prepare commandline and launch each SPAdes assembly
            for cnum, (pairname, files) in enumerate(sorted(self.read_files.items())):
                cmd = new_cmd()
                add_library(cmd, 1, files)
                try:
                    # add a second library if provided
                    add_library(cmd, 2, self.read_files2[pairname])
                except AttributeError:
                    # no read_files2 attribute: no second library
                    pass

                this_output_path = _os.path.sep.join(output_folder + [pairname])
                contigs[pairname] = launch(cmd, this_output_path, cnum)
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))

        self.paths_to_contigs = contigs
Exemplo n.º 8
0
    def align(self, insert_size = False,
                    path_to_exe = False,
                    local_alns_path = ['alignments'],
                    force = False,
                    max_cpus = -1):
        '''
        Align each pair of read files to the genome with "bwa mem"

        The genome sequence is written as fasta to
        genome_sequences/<genome_id>.fna and indexed with "bwa index" if any
        of its index files (.ann, .pac, .amb, .bwt, .sa) is missing. Each
        entry of self.read_files is then aligned and the output written to
        <local_alns_path>/<genome_id>/<pairname>__<genome_id>.sam.

        insert_size: value passed to bwa mem -I; if false BWA estimates it
        path_to_exe: path to bwa; if false uses _get_exe_path('bwa')
        local_alns_path: output path as a list of folders
        force: realign even if the SAM file already exists
        max_cpus: passed through _decide_max_processes() to give -t

        Sets self.aligned_read_files, a dict of pairname to SAM file path.
        '''

        if not path_to_exe:
            path_to_exe = _get_exe_path('bwa')

        # write genome sequence to a fasta file
        try:
            _os.makedirs('genome_sequences')
        except OSError:
            pass

        genome_fna = 'genome_sequences/%s.fna' % self.genome_id

        _SeqIO.write(_SeqRecord(_Seq(self.genome_sequence.tostring()), id = self.genome_id),
                    genome_fna,
                    'fasta')

        # make folder for alignments (BAMs)
        local_alns_path = _os.path.sep.join(local_alns_path)
        if not _os.path.exists(local_alns_path):
            _os.makedirs(local_alns_path)

        # make a subdir for this genome
        local_alns_path_genome = _os.path.sep.join([
                                local_alns_path,
                                self.genome_id])
        if not _os.path.exists(local_alns_path_genome):
            _os.makedirs(local_alns_path_genome)

        max_processes = _decide_max_processes( max_cpus )

        e1 = 'Could not find "read_files" attribute. Before aligning to genome, reads must be quality score trimmed. Please run trim() method on this Reads instance.'

        assert hasattr(self, 'read_files'), e1

        e2 = 'Could not find %s. Either run trim() again or ensure file exists'

        # each entry is indexed with 1 and 2 for the two files of a pair
        for pairname, files in self.read_files.items():
            assert _os.path.exists(files[1]), e2 % files[1]
            assert _os.path.exists(files[2]), e2 % files[2]

        have_index_files = [_os.path.exists(genome_fna + '.' + a) for a in ('ann','pac','amb','bwt','sa')]

        if not all(have_index_files):
            print('Writing BWA index files for %s' % genome_fna)
            exit_code = _subprocess.call([path_to_exe, 'index', genome_fna])
            if exit_code != 0:
                print('WARNING: bwa index exited with status %s' % exit_code)

        aligned_read_files = {}
        for pairname,files in self.read_files.items():
            RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname,pairname)
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a']
            if insert_size:
                # every argv entry must be a string: insert_size may be a number
                cmd += ['-I', str(insert_size)]
            # else BWA can estimate on-the-fly
            cmd += ['-R', RGinfo, genome_fna, files[1], files[2]]

            out_sam = _os.path.sep.join([local_alns_path_genome, '%s__%s.sam' % (pairname, self.genome_id)])

            if not _os.path.exists(out_sam) or force:
                print('Called: "%s"' % ' '.join(cmd))
                with open(out_sam, "wb") as out:
                    exit_code = _subprocess.call(cmd, stdout = out)
                if exit_code != 0:
                    # the SAM file is left in place, as before, but flagged
                    print('WARNING: bwa mem exited with status %s; %s may be incomplete' % (exit_code, out_sam))

            else:
                print('Found:')
                print(out_sam)
                print('use "force = True" to overwrite')

            print(' '.join(cmd))

            aligned_read_files[pairname] = out_sam

        self.aligned_read_files = aligned_read_files
Exemplo n.º 9
0
    def align(self,
              insert_size=False,
              path_to_exe=False,
              local_alns_path=['alignments'],
              force=False,
              max_cpus=-1):
        '''
        Align each pair of read files to the genome with "bwa mem"

        The genome sequence is written as fasta to
        genome_sequences/<genome_id>.fna and always (re)indexed with
        "bwa index". Each entry of self.read_files is then aligned and the
        output written to
        <local_alns_path>/<genome_id>/<pairname>__<genome_id>.sam.

        insert_size: value passed to bwa mem -I; if false BWA estimates it
        path_to_exe: path to bwa; if false uses _get_exe_path('bwa')
        local_alns_path: output path as a list of folders
        force: realign even if the SAM file already exists
        max_cpus: passed through _decide_max_processes() to give -t

        Sets self.aligned_read_files, a dict of pairname to SAM file path.
        '''

        if not path_to_exe:
            path_to_exe = _get_exe_path('bwa')

        # write genome sequence to a fasta file
        try:
            _os.makedirs('genome_sequences')
        except OSError:
            pass

        genome_fna = 'genome_sequences/%s.fna' % self.genome_id

        _SeqIO.write(
            _SeqRecord(_Seq(self.genome_sequence.tostring()),
                       id=self.genome_id), genome_fna, 'fasta')

        # make folder for alignments (BAMs)
        local_alns_path = _os.path.sep.join(local_alns_path)
        if not _os.path.exists(local_alns_path):
            _os.makedirs(local_alns_path)

        # make a subdir for this genome
        local_alns_path_genome = _os.path.sep.join(
            [local_alns_path, self.genome_id])
        if not _os.path.exists(local_alns_path_genome):
            _os.makedirs(local_alns_path_genome)

        max_processes = _decide_max_processes(max_cpus)

        e1 = 'Could not find "read_files" attribute. Before aligning to genome, reads must be quality score trimmed. Please run trim() method on this Reads instance.'

        assert hasattr(self, 'read_files'), e1

        e2 = 'Could not find %s. Either run trim() again or ensure file exists'

        # each entry is indexed with 1 and 2 for the two files of a pair
        for pairname, files in self.read_files.items():
            assert _os.path.exists(files[1]), e2 % files[1]
            assert _os.path.exists(files[2]), e2 % files[2]

        # always (re)index in case of upstream changes in data
        print('Writing BWA index files for %s' % genome_fna)
        exit_code = _subprocess.call([path_to_exe, 'index', genome_fna])
        if exit_code != 0:
            print('WARNING: bwa index exited with status %s' % exit_code)

        aligned_read_files = {}
        for pairname, files in self.read_files.items():
            RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a']
            if insert_size:
                # every argv entry must be a string: insert_size may be a number
                cmd += ['-I', str(insert_size)]
            # else BWA can estimate on-the-fly
            cmd += ['-R', RGinfo, genome_fna, files[1], files[2]]

            out_sam = _os.path.sep.join([
                local_alns_path_genome,
                '%s__%s.sam' % (pairname, self.genome_id)
            ])

            if not _os.path.exists(out_sam) or force:
                print('Called: "%s"' % ' '.join(cmd))
                with open(out_sam, "wb") as out:
                    exit_code = _subprocess.call(cmd, stdout=out)
                if exit_code != 0:
                    # the SAM file is left in place, as before, but flagged
                    print(
                        'WARNING: bwa mem exited with status %s; %s may be incomplete'
                        % (exit_code, out_sam))

            else:
                print('Found:')
                print(out_sam)
                print('use "force = True" to overwrite')

            print(' '.join(cmd))

            aligned_read_files[pairname] = out_sam

        self.aligned_read_files = aligned_read_files
Exemplo n.º 10
0
    def generateReads(self, path_to_exe = False,
                            paths_to_genomes = False,
                            readcov = 60,
                            readlen = 100,
                            fraglen = 350,
                            sterrfraglen = 20,
                            model = 4,
                            max_cpus = -1):
        '''
        Call GemSIM to generate reads

        Need to have written genome sequences to generate from, possibly with
        generated SNPs, small indels and large deletions.

        One GemSIM process is started per input genome, with at most
        _decide_max_processes(max_cpus) running at once. The resulting
        <prefix>_fir.fastq / <prefix>_sec.fastq files are moved to
        simulated_reads/<genome id>/ as <prefix>_R1.fastq / <prefix>_R2.fastq
        and compressed with gzip.

        path_to_exe: path to GemReads.py; if false uses _get_exe_path('gemsim')
        paths_to_genomes: fasta files to simulate from; if false uses
            self.written_genomes
        readcov: target coverage depth, relative to len(self.genome.sequence)
        readlen: read length used to work out the number of read pairs
        fraglen, sterrfraglen: passed to GemSIM as -u and -s
        model: 4 or 5, selecting models/ill100v4_p.gzip or
            models/ill100v5_p.gzip next to the executable

        Raises ValueError if no genomes are available or model is not 4 or 5.

        The equivalent manual command, with GemSIM v1.6 in subfolder GemSIM_v1.6:
        GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527 -l d -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip -c -q 33 -p -o GemSimLESB58_01
        '''

        #max_cpus etc

        if paths_to_genomes:
            use_genomes = sorted(paths_to_genomes)
        elif hasattr(self, 'written_genomes'):
            use_genomes = sorted(self.written_genomes)
        else:
            raise ValueError('provide either paths_to_genomes or generate some then .writeSequences()')

        if not path_to_exe:
            path_to_exe = _get_exe_path('gemsim')

        # -n is given as a whole number of read pairs
        num_pairs = int(len(self.genome.sequence) * readcov / (readlen*2))

        model_files = {4: 'ill100v4_p.gzip', 5: 'ill100v5_p.gzip'}
        if model not in model_files:
            raise ValueError('model must be 4 or 5, not {!r}'.format(model))
        path_to_model = _os.path.sep.join(path_to_exe.split(_os.path.sep)[:-1] + ['models', model_files[model]])

        print('Using error model: {}'.format(path_to_model))
        print('Generating {:,} {}bp read pairs for {}x coverage depth of a {}bp genome ({})'.format(
                num_pairs, readlen, readcov, len(self.genome.sequence), self.genome.id))

        processes = set()
        max_processes = _decide_max_processes( max_cpus )

        import time
        start = time.time()
        out_raw = []
        for i,genome_in in enumerate(use_genomes):
            # could use per genome length . . less consistent than using reference
            # genome_len = len(_SeqIO.read(genome_in,'fasta').seq)
            # num_pairs = genome_len * readcov / (readlen*2)
            outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i+1)
            cmd = [path_to_exe,
                        '-r', genome_in,
                        '-n', num_pairs,
                        '-l', 'd', '-u', fraglen, '-s', sterrfraglen,
                        '-m', path_to_model,
                        '-c',
                        '-q', 33, '-p',
                        '-o', outprefix]
            out_raw += [outprefix+'_fir.fastq', outprefix+'_sec.fastq']
            # this would be better to rename and compress all in one
            # maybe as a shell script? Then resuming (--force) would be easier.
            if _os.path.exists(outprefix+'_fir.fastq') and \
                    _os.path.exists(outprefix+'_sec.fastq'):
                print('Found output for {}_fir.fastq (and sec), not regenerating, '\
                'delete these to start from scratch'.format(outprefix))
            else:
                # a real list: it is both printed and handed to Popen
                cmd = [str(part) for part in cmd]
                print(' '.join(cmd))
                processes.add( _subprocess.Popen(cmd, shell=False) )
            if len(processes) >= max_processes:
                # block until any child exits, then forget the finished ones
                (pid, exit_status) = _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        missing = []
        for o in out_raw:
            if not _os.path.exists(o):
                missing += [o]

        assert len(missing) == 0, 'Could not find:\n{}'.format('\n'.join(missing))
        print('all finished after {} minutes'.format(int(round((time.time() - start)/60.0))))

        outdir = _os.path.sep.join(['simulated_reads',self.genome.id])
        try:
            _os.makedirs(outdir)
        except OSError:
            pass

        # only the trailing tag is renamed, so a genome id that happens to
        # contain 'fir' or 'sec' is left untouched
        new_suffixes = (('_fir.fastq', '_R1.fastq'), ('_sec.fastq', '_R2.fastq'))
        for o in out_raw:
            renamed = o
            for old_suffix, new_suffix in new_suffixes:
                if o.endswith(old_suffix):
                    renamed = o[:-len(old_suffix)] + new_suffix
                    break
            new = _os.path.sep.join([outdir, renamed])
            print('{} ==> {}'.format(o, new))
            _os.rename(o, new)
            cmd = ['gzip', new]
            print(' '.join(cmd))
            _subprocess.call(cmd)
Exemplo n.º 11
0
def get_download(name, description, source, url, commit, checksum, destination, preparation, checker):
    '''
    Download and unpack a dependency

    The archive at url is read into memory, verified against checksum (if
    given) and unpacked inside destination, which is created if missing.
    The working directory is changed while this runs and is restored on
    return, including when an error is raised.

    name: package name; selects how members are extracted when destination
        is 'local_packages'
    url: address of a .tar.gz, .tar.bz2 or .zip archive
    checksum: '<algorithm>=<hexdigest>' where algorithm is one of md5, sha1,
        sha224, sha256, sha384, sha512; a false value skips verification
    destination: folder to unpack into
    preparation: optional list of dicts, each with a 'function' to call and
        its 'arguments', run after unpacking
    description, source, commit, checker: not used here

    Raises ValueError for an unsupported checksum algorithm or archive type
    and AssertionError if the checksum does not match.
    '''
    # binary-safe in-memory buffer available on both Python 2 and Python 3
    import io as _io

    initialdir = _os.path.abspath(_os.curdir)
    try:
        _os.chdir(destination)
    except OSError:
        _os.makedirs(destination)
        _os.chdir(destination)

    try:
        hasher = None
        if checksum:
            hasher_algorithm = checksum.split('=')[0]
            expected_digest = checksum.split('=')[1]
            supported = ('md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512')
            if hasher_algorithm not in supported:
                # fail before downloading rather than with a NameError after
                print("{} checksums not implemented in Python's hashlib!".format(hasher_algorithm))
                raise ValueError('Unsupported checksum algorithm: {}'.format(hasher_algorithm))
            hasher = getattr(_hashlib, hasher_algorithm)()

        print('Downloading: %s' % url)
        req = _urllib2.urlopen(url)
        CHUNK = 16 * 1024 * 16
        data = _io.BytesIO()
        c = 0
        try:
            while True:
                chunk = req.read(CHUNK)
                if not chunk:
                    break
                # count what actually arrived: the last chunk is usually short
                c += len(chunk)
                print("{:,} bytes".format(c))
                data.write(chunk)
        finally:
            req.close()

        print('Download complete . . .')
        data.seek(0)

        if checksum:
            buff = data.read(65536)
            while len(buff) > 0:
                hasher.update(buff)
                buff = data.read(65536)

            # raised explicitly so verification is not stripped by "python -O"
            if hasher.hexdigest() != expected_digest:
                raise AssertionError('. . . checksum fail!')
            print('. . . checksum passed!')
            data.seek(0)

        if url.endswith('tar.gz'):
            archive = _tarfile.open(mode="r:gz", fileobj = data)
        elif url.endswith('tar.bz2'):
            archive = _tarfile.open(mode="r:bz2", fileobj = data)
        elif url.endswith('.zip'):
            archive = _zipfile.ZipFile(data)
        else:
            raise ValueError('Unrecognised archive type (expected .tar.gz, .tar.bz2 or .zip): {}'.format(url))

        if destination == 'local_packages':
            # extract as a pypi python package
            # NOTE: this branch uses the tarfile API and strips a 7 character
            # suffix ('.tar.gz') from the file name to get the release name
            release = url.split('/')[-1][:-7]
            print('Extracting {} to {}'.format(release, _os.path.sep.join([destination,name])))
            c = 0
            nostrip = {'pysam'}
            if name in nostrip:
                try:
                    # clear any previous extraction
                    _shutil.rmtree(archive.getnames()[0])
                except OSError:
                    pass
                # some python modules should not be stripped . . more complex install
                for member in archive.getmembers():
                    if member.isreg():
                        archive.extract(member)
                        print(member.name)
                        c += 1
            else:
                # others don't need additional compilation: keep only the
                # package folder and drop the leading release folder
                check_path1 = '{}/{}'.format(release,name)
                for member in archive.getmembers():
                    if member.isreg() and check_path1 in member.name:
                        member.name = _os.path.sep.join(member.name.split(_os.path.sep)[1:])
                        archive.extract(member)
                        c += 1
            print('Extracted {} files'.format(c))

        else:
            # extract as a generic external program
            archive.extractall()

        if preparation:
            for do_this in preparation:
                if 'just_packages' in do_this['arguments']:
                    # this is the only thing that differentiates this prepare()
                    # from others that need some chdir <== this should be improved
                    # see dep dict
                    curdir = _os.path.abspath(_os.curdir)
                    _os.chdir(_os.path.pardir)
                    do_this['function'](*do_this['arguments']['package_list'])
                    # return to previous folder
                    _os.chdir(curdir)
                else:
                    # tar archives list members with getnames(), zip archives
                    # with namelist()
                    if hasattr(archive, 'getnames'):
                        member_names = archive.getnames()
                    else:
                        member_names = archive.namelist()
                    extracted_base_dir = member_names[0].split(_os.path.sep)[0]
                    curdir = _os.path.abspath(_os.curdir)
                    # go to installed folder
                    _os.chdir(_os.path.sep.join([destination,extracted_base_dir]))
                    do_this['function'](**do_this['arguments'])
                    # return to previous folder
                    _os.chdir(curdir)
    finally:
        # always hand the caller back its original working directory
        _os.chdir(initialdir)
Exemplo n.º 12
0
    def generateReads(self,
                      path_to_exe=False,
                      paths_to_genomes=False,
                      readcov=60,
                      readlen=100,
                      fraglen=350,
                      sterrfraglen=20,
                      model=4,
                      max_cpus=-1):
        '''
        Call GemSIM to generate reads

        Need to have written genome sequences to generate from, possibly with 
        generated SNPs, small indels and large deletions.
        '''

        #max_cpus etc

        if paths_to_genomes:
            use_genomes = sorted(paths_to_genomes)
        elif hasattr(self, 'written_genomes'):
            use_genomes = sorted(self.written_genomes)
        else:
            raise ValueError(
                'provide either paths_to_genomes or generate some then .writeSequences()'
            )

        if not path_to_exe:
            path_to_exe = _get_exe_path('gemsim')

        comment2 = '''
        to generate reads put GemSIM v1.6 into subfolder GemSIM_v1.6 and issue these commands:
        GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527 -l d -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip -c -q 33 -p -o GemSimLESB58_01
        '''

        num_pairs = len(self.genome.sequence) * readcov / (readlen * 2)

        if model == 4:
            path_to_model = _os.path.sep.join(
                path_to_exe.split(_os.path.sep)[:-1] +
                ['models', 'ill100v4_p.gzip'])
        elif model == 5:
            path_to_model = _os.path.sep.join(
                path_to_exe.split(_os.path.sep)[:-1] +
                ['models', 'ill100v5_p.gzip'])

        print('Using error model: {}'.format(path_to_model))
        print(
            'Generating {:,} {}bp read pairs for {}x coverage depth of a {}bp genome ({})'
            .format(num_pairs, readlen, readcov, len(self.genome.sequence),
                    self.genome.id))

        processes = set()
        max_processes = _decide_max_processes(max_cpus)

        import time
        start = time.time()
        out_raw = []
        for i, genome_in in enumerate(use_genomes):
            # could use per genome length . . less consistent than using reference
            # genome_len = len(_SeqIO.read(genome_in,'fasta').seq)
            # num_pairs = genome_len * readcov / (readlen*2)
            outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i + 1)
            cmd = [
                path_to_exe, '-r', genome_in, '-n', num_pairs, '-l', 'd', '-u',
                fraglen, '-s', sterrfraglen, '-m', path_to_model, '-c', '-q',
                33, '-p', '-o', outprefix
            ]
            out_raw += [outprefix + '_fir.fastq', outprefix + '_sec.fastq']
            # this would be better to rename and compress all in one
            # maybe as a shell script? Then resuming (--force) would be easier.
            if _os.path.exists(outprefix+'_fir.fastq') and \
                    _os.path.exists(outprefix+'_sec.fastq'):
                print('Found output for {}_fir.fastq (and sec), not regenerating, '\
                'delete these to start from scratch'.format(outprefix))
            else:
                cmd = map(str, cmd)
                print(' '.join(cmd))
                processes.add(_subprocess.Popen(cmd, shell=False))
            if len(processes) >= max_processes:
                (pid, exit_status) = _os.wait()
                processes.difference_update(
                    [p for p in processes if p.poll() is not None])

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        missing = []
        for o in out_raw:
            if not _os.path.exists(o):
                missing += [o]

        assert len(missing) == 0, 'Could not find:\n{}'.format(
            '\n'.join(missing))
        print('all finished after {} minutes'.format(
            int(round((time.time() - start) / 60.0))))

        outdir = _os.path.sep.join(['simulated_reads', self.genome.id])
        try:
            _os.makedirs(outdir)
        except OSError:
            pass

        for o in out_raw:
            new = _os.path.sep.join(
                [outdir, o.replace('fir', 'R1').replace('sec', 'R2')])
            print('{} ==> {}'.format(o, new))
            _os.rename(o, new)
            cmd = ['gzip', new]
            print(' '.join(cmd))
            _subprocess.call(cmd)