Пример #1
0
def test_get_summary(monkeypatch, req, tmpdir):
    """Check get_summary with caching disabled, enabled, and already warm."""
    summary_url = (
        'https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/assembly_summary.txt')

    # Redirect the module-level cache directory into the pytest tmpdir.
    cache_root = tmpdir.mkdir('cache')
    monkeypatch.setattr(core, 'CACHE_DIR', str(cache_root))
    cached_summary = cache_root.join('refseq_bacteria_assembly_summary.txt')

    def fetch(use_cache):
        # Every call looks up the default URI again, like a real caller would.
        return core.get_summary('refseq', 'bacteria',
                                NgdConfig.get_default('uri'), use_cache)

    req.get(summary_url, text='test')

    # Cache disabled: content comes back, no cache file appears.
    assert fetch(False).read() == 'test'
    assert not cached_summary.check()

    # Cache enabled: content comes back and the cache file now exists.
    assert fetch(True).read() == 'test'
    assert cached_summary.check()

    # Change the mocked remote content; the cached call must still return
    # the text that was stored earlier.
    req.get(summary_url, text='never read')
    assert fetch(True).read() == 'test'
Пример #2
0
def test_filter_entries():
    """Check filter_entries with a default config and an accession filter."""
    config = NgdConfig()
    summary_path = _get_file('assembly_status.txt')
    with open(summary_path, 'r') as handle:
        parsed = list(core.parse_summary(handle))

    # A default config is expected to let every parsed entry through.
    unfiltered = core.filter_entries(parsed, config)
    assert unfiltered == parsed

    # Restricting to this accession is expected to keep only the last entry.
    config.assembly_accessions = "GCF_000203835.1"
    last_only = parsed[-1:]
    filtered = core.filter_entries(parsed, config)
    assert filtered == last_only
Пример #3
0
def test_filter_entries():
    """Check filter_entries with a default config and an accession filter."""
    config = NgdConfig()
    summary_path = _get_file('assembly_status.txt')
    with open(summary_path, 'r') as handle:
        parsed = list(core.parse_summary(handle))

    # A default config is expected to let every parsed entry through.
    unfiltered = core.filter_entries(parsed, config)
    assert unfiltered == parsed

    # Restricting to this accession is expected to keep only the last entry.
    config.assembly_accessions = "GCF_000203835.1"
    last_only = parsed[-1:]
    filtered = core.filter_entries(parsed, config)
    assert filtered == last_only
Пример #4
0
def prepare_create_downloadjob(req,
                               tmpdir,
                               format_map=NgdConfig._FORMATS,
                               human_readable=False,
                               create_local_file=False):
    """Set up a fake assembly plus the download jobs expected for it.

    For every ``(key, val)`` pair in ``format_map`` a file named
    ``fake<val>`` containing ``key`` is written into ``tmpdir``, a GET
    response for its fake URL is registered on ``req`` (presumably a
    requests-mock fixture), and a matching ``core.DownloadJob`` is built.
    A ``md5checksums.txt`` response listing all files is registered too.

    Args:
        req: Fixture used to register fake HTTP GET responses.
        tmpdir: pytest ``tmpdir``; an ``output`` directory is created in it.
        format_map: Mapping whose keys become file contents and whose
            values become file name suffixes.
        human_readable: Stored on the config; when true each job also gets
            a symlink path below ``human_readable``.
        create_local_file: When true, the expected local file is written
            up front with the same content as the fake remote file.

    Returns:
        Tuple ``(entry, config, download_jobs)``.
    """
    config = NgdConfig()
    outdir = tmpdir.mkdir('output')
    config.output = str(outdir)
    config.human_readable = human_readable

    download_jobs = []
    checksum_lines = []
    for content, suffix in format_map.items():
        source = tmpdir.join('fake{}'.format(suffix))
        source.write(content)
        payload = source.read()

        checksum = core.md5sum(str(source))
        filename = path.basename(str(source))
        full_url = 'https://fake/genomes/FAKE0.1/{}'.format(filename)

        local_file = outdir.join('refseq', 'bacteria', 'FAKE0.1', filename)
        if create_local_file:
            # ensure=True creates the missing parent directories.
            local_file.write(payload, ensure=True)

        symlink_path = (
            str(outdir.join('human_readable', 'refseq', 'bacteria',
                            'Example', 'species', 'ABC_1234', filename))
            if human_readable else None)

        job = core.DownloadJob(full_url, str(local_file), checksum,
                               symlink_path)
        download_jobs.append(job)
        checksum_lines.append('{}\t./{}\n'.format(checksum, filename))
        req.get(full_url, text=payload)

    req.get('https://fake/genomes/FAKE0.1/md5checksums.txt',
            text=''.join(checksum_lines))

    entry = {
        'assembly_accession': 'FAKE0.1',
        'organism_name': 'Example species',
        'infraspecific_name': 'strain=ABC 1234',
        'ftp_path': 'https://fake/genomes/FAKE0.1'
    }
    return entry, config, download_jobs
Пример #5
0
def test_get_summary(monkeypatch, req, tmpdir):
    """Check get_summary with caching disabled, enabled, and already warm."""
    summary_url = 'https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'

    # Redirect the module-level cache directory into the pytest tmpdir.
    cache_root = tmpdir.mkdir('cache')
    monkeypatch.setattr(core, 'CACHE_DIR', str(cache_root))
    cached_summary = cache_root.join('refseq_bacteria_assembly_summary.txt')

    def fetch(use_cache):
        # Every call looks up the default URI again, like a real caller would.
        return core.get_summary('refseq', 'bacteria', NgdConfig.get_default('uri'), use_cache)

    req.get(summary_url, text='test')

    # Cache disabled: content comes back, no cache file appears.
    assert fetch(False).read() == 'test'
    assert not cached_summary.check()

    # Cache enabled: content comes back and the cache file now exists.
    assert fetch(True).read() == 'test'
    assert cached_summary.check()

    # Change the mocked remote content; the cached call must still return
    # the text that was stored earlier.
    req.get(summary_url, text='never read')
    assert fetch(True).read() == 'test'
Пример #6
0
def test_get_summary_error_handling(monkeypatch, mocker, req, tmpdir):
    """Check that an OSError from os.makedirs propagates out of get_summary."""
    summary_url = 'https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
    req.get(summary_url, text='test')

    # tmpdir.join() only builds the path, so the cache directory is missing;
    # presumably that makes get_summary try to create it.
    missing_cache = tmpdir.join('cache')
    monkeypatch.setattr(core, 'CACHE_DIR', str(missing_cache))

    # Make any directory creation fail with EACCES.
    permission_denied = OSError(13, "Permission denied")
    monkeypatch.setattr(os, 'makedirs', mocker.MagicMock(side_effect=permission_denied))

    with pytest.raises(OSError):
        core.get_summary('refseq', 'bacteria', NgdConfig.get_default('uri'), True)
Пример #7
0
def test_get_summary_error_handling(monkeypatch, mocker, req, tmpdir):
    """Check that an OSError from os.makedirs propagates out of get_summary."""
    summary_url = 'https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
    req.get(summary_url, text='test')

    # tmpdir.join() only builds the path, so the cache directory is missing;
    # presumably that makes get_summary try to create it.
    missing_cache = tmpdir.join('cache')
    monkeypatch.setattr(core, 'CACHE_DIR', str(missing_cache))

    # Make any directory creation fail with EACCES.
    permission_denied = OSError(13, "Permission denied")
    monkeypatch.setattr(os, 'makedirs', mocker.MagicMock(side_effect=permission_denied))

    with pytest.raises(OSError):
        core.get_summary('refseq', 'bacteria', NgdConfig.get_default('uri'), True)
Пример #8
0
def prepare_create_downloadjob(req, tmpdir, format_map=NgdConfig._FORMATS, human_readable=False,
                               create_local_file=False):
    """Set up a fake assembly plus the download jobs expected for it.

    For every ``(key, val)`` pair in ``format_map`` a file named ``fake<val>`` containing
    ``key`` is written into ``tmpdir``, a GET response for its fake URL is registered on
    ``req`` (presumably a requests-mock fixture), and a matching ``core.DownloadJob`` is
    built. A ``md5checksums.txt`` response listing all files is registered too.

    Args:
        req: Fixture used to register fake HTTP GET responses.
        tmpdir: pytest ``tmpdir``; an ``output`` directory is created in it.
        format_map: Mapping whose keys become file contents and whose values become file
            name suffixes.
        human_readable: Stored on the config; when true each job also gets a symlink path
            below ``human_readable``.
        create_local_file: When true, the expected local file is written up front with the
            same content as the fake remote file.

    Returns:
        Tuple ``(entry, config, download_jobs)``.
    """
    config = NgdConfig()
    outdir = tmpdir.mkdir('output')
    config.output = str(outdir)
    config.human_readable = human_readable

    download_jobs = []
    checksum_lines = []
    for content, suffix in format_map.items():
        source = tmpdir.join('fake{}'.format(suffix))
        source.write(content)
        payload = source.read()

        checksum = core.md5sum(str(source))
        filename = path.basename(str(source))
        full_url = 'https://fake/genomes/FAKE0.1/{}'.format(filename)

        local_file = outdir.join('refseq', 'bacteria', 'FAKE0.1', filename)
        if create_local_file:
            # ensure=True creates the missing parent directories.
            local_file.write(payload, ensure=True)

        symlink_path = (
            str(outdir.join('human_readable', 'refseq', 'bacteria', 'Example', 'species',
                            'ABC_1234', filename))
            if human_readable else None)

        job = core.DownloadJob(full_url, str(local_file), checksum, symlink_path)
        download_jobs.append(job)
        checksum_lines.append('{}\t./{}\n'.format(checksum, filename))
        req.get(full_url, text=payload)

    req.get('https://fake/genomes/FAKE0.1/md5checksums.txt', text=''.join(checksum_lines))

    entry = {
        'assembly_accession': 'FAKE0.1',
        'organism_name': 'Example species',
        'infraspecific_name': 'strain=ABC 1234',
        'ftp_path': 'https://fake/genomes/FAKE0.1'
    }
    return entry, config, download_jobs
Пример #9
0
def test_get_name_and_checksum():
    """Check get_name_and_checksum against regular and unusually long file names.

    For each checksum listing and each of the genbank, fasta and cds-fasta
    file endings, the returned (filename, checksum) pair must equal the
    listing entry expected for that format.
    """
    regular_filenames = (
        {'checksum': 'd76c643ec4bbc34d2935eb0664156d99', 'file': 'GCF_000009605.1_ASM960v1_cds_from_genomic.fna.gz'},
        {'checksum': '42c1bb1447aea2512a17aeb3645b55e9', 'file': 'GCF_000009605.1_ASM960v1_genomic.fna.gz'},
        {'checksum': '8a685d49d826c4f0ad05152e906f3250', 'file': 'GCF_000009605.1_ASM960v1_genomic.gbff.gz'},
        {'checksum': 'e2d9e1cfa085cb462a73d3d2d2c22be5', 'file': 'GCF_000009605.1_ASM960v1_genomic.gff.gz'},
    )
    weird_filenames = (
        {'checksum': '4d5f39ceb7e113ad461f8370aaac4e41', 'file': 'GCF_003583405.1_CHULA_Jazt_1.1_for_version_1.1_of_the_Jishengella_sp._nov._AZ1-13_genome_from_a_lab_in_CHULA_cds_from_genomic.fna.gz'},
        {'checksum': 'e77c1e8bf0df2c353ce6a4899ae0cb5e', 'file': 'GCF_003583405.1_CHULA_Jazt_1.1_for_version_1.1_of_the_Jishengella_sp._nov._AZ1-13_genome_from_a_lab_in_CHULA_genomic.fna.gz'},
        {'checksum': 'c93ba924075c8b22210ac283d41207ad', 'file': 'GCF_003583405.1_CHULA_Jazt_1.1_for_version_1.1_of_the_Jishengella_sp._nov._AZ1-13_genome_from_a_lab_in_CHULA_genomic.gbff.gz'},
        {'checksum': 'd8394d0aff594ae962c88e1192238413', 'file': 'GCF_003583405.1_CHULA_Jazt_1.1_for_version_1.1_of_the_Jishengella_sp._nov._AZ1-13_genome_from_a_lab_in_CHULA_rna_from_genomic.fna.gz'},
    )

    # Format name paired with the index of the listing entry it must select.
    expected_index_by_format = (('genbank', 2), ('fasta', 1), ('cds-fasta', 0))

    # Build every case up front: (listing, file ending, expected entry).
    cases = []
    for listing in (regular_filenames, weird_filenames):
        for format_name, index in expected_index_by_format:
            ending = NgdConfig.get_fileending(format_name)
            cases.append((listing, ending, listing[index]))

    for listing, ending, wanted in cases:
        filename, checksum = core.get_name_and_checksum(listing, ending)
        assert filename == wanted['file']
        assert checksum == wanted['checksum']
Пример #10
0
    "-n",
    "name",
    help="input the phylum name or other. use ; to separate multiple ")
@click.option(
    "-t",
    "taxons",
    help=
    "input the taxon id. It will retrieve all the genomes desceding to the provided taxon; to separate multiple "
)
@click.option(
    "-F",
    "formats",
    help='Which formats to download (default: %(default)s).'
    'A comma-separated list of formats is also possible. For example: "fasta,assembly-report". '
    'Choose from: {choices}'.format(
        choices=NgdConfig.get_choices('file_formats')),
    default='fasta')
@click.option(
    "-o",
    "odir",
    help=
    f"Create output hierarchy in specified folder (default: {NgdConfig.get_default('output')})",
    default=NgdConfig.get_default('output'))
@click.option("-size",
              "size_of_batch",
              help=f"The size of each batch.",
              default=20)
@click.option("-p",
              "parallel",
              help=f"Run N downloads in parallel (default: 10)",
              default=5)