Example #1
0
                    default=85,
                    help='Kaju alignment score')
parser.add_argument('-l',
                    '--kaiju_minlen',
                    type=int,
                    default=15,
                    help='Kaju minimum length')
parser.add_argument('-i',
                    '--kaiju_mismatch',
                    type=int,
                    default=1,
                    help='Kaju allowed mismatches')
args = parser.parse_args()

# Check that dirs have "/" at the end
# NOTE(review): check_path presumably returns "/" when the path lacks a
# trailing separator and "" otherwise -- confirm against its definition.
args.output_dir += check_path(args.output_dir)
args.kraken_db += check_path(args.kraken_db)
args.kaiju_db += check_path(args.kaiju_db)

# Check args.output_dir exits else make dir
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# os.chdir(args.output_dir)

# Write a log_file:
# Record the run parameters inside the output directory so the
# analysis can be reproduced later.
log_filename = os.path.join(args.output_dir, "log_file.txt")
with open(log_filename, "w") as log_file:
    log_file.write("General parameters:\n"
                   "file1 = %s\n"
                   "file2 = %s\n"
Example #2
0
def main():
    """Run kodoja build.

    Parse the command line options, prepare the output and database
    directories, fetch the viral reference data (RefSeq assembly
    summary and, unless --all_viruses is set, the virus-host DB plant
    filter), then build the Kraken and Kaiju databases and write a
    log file into each database directory.

    Exits via sys.exit() with a message on invalid --extra_files /
    --extra_taxids input.
    """
    parser = argparse.ArgumentParser(
        description=help_text,
        epilog=help_epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--version',
                        action='version',
                        version='Kodoja v' + version)
    parser.add_argument('-o',
                        '--output_dir',
                        type=str,
                        required=True,
                        help='Output directory path, required')
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        default=1,
                        help='Number of threads, default=1')
    parser.add_argument('-p',
                        '--host_taxid',
                        type=int,
                        default=False,
                        help='Host tax ID')
    parser.add_argument('-d',
                        '--download_parallel',
                        type=int,
                        default=4,
                        help='Parallel genome download, default=4')
    parser.add_argument('-n',
                        '--no_download',
                        action='store_false',
                        help='Genomes have already been downloaded')
    parser.add_argument('-e',
                        '--extra_files',
                        type=str,
                        nargs='*',
                        help='List of extra files added to "extra" dir')
    parser.add_argument('-x',
                        '--extra_taxids',
                        type=str,
                        nargs='*',
                        help='List of taxID of extra files')
    parser.add_argument(
        '-v',
        '--all_viruses',
        action='store_true',
        help='Build databases with all viruses (not plant specific)')
    parser.add_argument('-b',
                        '--kraken_tax',
                        type=str,
                        default=False,
                        help='Path to taxonomy directory')
    parser.add_argument('-k',
                        '--kraken_kmer',
                        type=int,
                        default=31,
                        help='Kraken kmer size, default=31')
    parser.add_argument('-m',
                        '--kraken_minimizer',
                        type=int,
                        default=15,
                        help='Kraken minimizer size, default=15')
    parser.add_argument('-a',
                        '--db_tag',
                        type=str,
                        default=False,
                        help='Suffix for databases')
    args = parser.parse_args()

    tool_list = ['kraken', 'kaiju']

    # Ensure the output path ends with a path separator, then derive
    # the two database directory names from it.
    args.output_dir += check_path(args.output_dir)
    kraken_db_dir = os.path.join(args.output_dir, "krakenDB")
    kaiju_db_dir = os.path.join(args.output_dir, "kaijuDB")

    # Check args.output_dir exists, else make dir
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Name databases with the optional tag (e.g. krakenDB_test)
    if args.db_tag:
        kraken_db_dir += '_' + args.db_tag
        kaiju_db_dir += '_' + args.db_tag

    kraken_db_dir += check_path(kraken_db_dir)
    kaiju_db_dir += check_path(kaiju_db_dir)

    # Ensure extra files have taxIDs/are in the right format and copy them
    if args.extra_files:
        # Check there are the same number of files as taxids.
        # FIX: was an assert (stripped under "python -O") which also
        # raised TypeError when --extra_taxids was omitted entirely.
        if not args.extra_taxids or \
                len(args.extra_files) != len(args.extra_taxids):
            sys.exit("Each extra file provided needs to have a "
                     "corresponding ncbi taxid")
        # Check extra files provided are compressed and have the right
        # file extension.
        for f in args.extra_files:
            if not f.endswith((".fna.gz", ".faa.gz")):
                sys.exit(
                    "File extensions need to be either compressed '.fna.gz' "
                    "for genomic data, or '.faa.gz' for protein data. "
                    "Got %r" % f)
        # Make a copy of each file in extra_files into 'extra' directory.
        # exist_ok avoids crashing when re-running in the same output dir.
        os.makedirs(os.path.join(args.output_dir, "extra/"), exist_ok=True)
        for extraFile in args.extra_files:
            shutil.copy(extraFile, os.path.join(args.output_dir, 'extra/'))

    # Download virus assembly summary for refseq (only once per output dir)
    path_assembly_summary = os.path.join(args.output_dir,
                                         "viral_assembly_summary.txt")
    if not os.path.exists(path_assembly_summary):
        download_with_retries(
            'https://ftp.ncbi.nih.gov/genomes/refseq/viral/assembly_summary.txt',
            path_assembly_summary)
    # pd.read_table is deprecated; read_csv with sep='\t' is equivalent.
    vir_assembly = pd.read_csv(path_assembly_summary,
                               sep='\t',
                               skiprows=1,
                               header=0)
    vir_assembly = vir_assembly.rename(
        columns={'# assembly_accession': 'assembly_accession'})

    # Set subset_vir_assembly and vir_host
    # subset_vir_assembly - list of virus accession names which will be added to
    #                       databases (used in krakenDB_build and kaijuDB_build)
    # vir_host - list of viral taxIDs for plant viruses. A subset of the genomes
    #            in refseq, added to the database by setting 'subset_vir_assembly'.
    if args.all_viruses:
        subset_vir_assembly = False
        vir_host = False
    else:
        # Filter for viruses with a plant host using the virus-host DB table.
        virushostdb_path = os.path.join(args.output_dir, "virushostdb.tsv")
        if not os.path.exists(virushostdb_path):
            download_with_retries(
                'ftp://ftp.genome.jp/pub/db/virushostdb/virushostdb.tsv',
                virushostdb_path)
        virHost_table = pd.read_csv(virushostdb_path, sep="\t").fillna('')
        plnVir = virHost_table[virHost_table['host lineage'].str.contains(
            "Viridiplantae")]
        vir_host = list(plnVir['virus tax id'])

        subset_vir_assembly = list(vir_assembly.assembly_accession[
            vir_assembly['taxid'].isin(vir_host)])

    # Describe the database contents for the log files. Computed once,
    # before the loop, so the kaiju branch cannot hit a NameError (the
    # original only defined these inside the kraken branch and relied
    # on kraken running first).
    if subset_vir_assembly:
        vir_genomes_text = 'plant viruses'
    else:
        vir_genomes_text = 'all viruses'
    other_genomes_text = ''
    if args.host_taxid:
        other_genomes_text += str(args.host_taxid) + ', '
    if args.extra_taxids:
        other_genomes_text += str(args.extra_taxids)

    for tool in tool_list:
        if args.no_download:
            # Download NCBI genomes (skipped when -n/--no_download given)
            ncbi_download(tool, args.output_dir, args.download_parallel,
                          args.host_taxid)
            print("DONE with downloading")
        # Rename downloaded genomic files for Kraken or protein file for Kaiju
        ncbi_rename_customDB(tool, args.output_dir, args.host_taxid,
                             args.extra_files, args.extra_taxids)
        print("DONE with renaming")
        if tool == "kraken":
            # Make Kraken (nucleotide k-mer) database
            krakenDB_build(args.output_dir, kraken_db_dir, args.threads,
                           args.kraken_kmer, args.kraken_minimizer,
                           subset_vir_assembly, args.kraken_tax)
            print("DONE with kraken db")
            with open(os.path.join(kraken_db_dir, "log_file.txt"),
                      "w") as out_file:
                text = 'output_dir = ' + args.output_dir + '\n'
                text += 'kraken_kmer = ' + str(args.kraken_kmer) + '\n'
                text += 'kraken_minimizer = ' + str(
                    args.kraken_minimizer) + '\n'
                text += 'Viral genomes added to db = ' + vir_genomes_text + '\n'
                text += 'Other genome added to db = ' + other_genomes_text + '\n'
                out_file.write(text)
        elif tool == "kaiju":
            # Make Kaiju (protein) database
            kaijuDB_build(args.output_dir, kaiju_db_dir, subset_vir_assembly)
            with open(os.path.join(kaiju_db_dir, "log_file.txt"),
                      "w") as out_file:
                text = 'output_dir = ' + args.output_dir + '\n'
                text += 'Viral genomes added to db = ' + vir_genomes_text + '\n'
                text += 'Other genome added to db = ' + other_genomes_text + '\n'
                out_file.write(text)
Example #3
0
def main():
    """Run kodoja retrieve.

    Subset the original sequencing reads down to those classified as
    viral by kodoja_search, optionally restricted to one tax ID (plus
    its genus relatives) or to reads agreed on by both tools.
    """
    parser = argparse.ArgumentParser(
        description=help_text,
        epilog=help_epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--version', action='version',
                        version='Kodoja v' + version)
    parser.add_argument('-o', '--file_dir', type=str, required=True,
                        help='Path to directory of kodoja_search results, required')
    parser.add_argument('-r1', '--read1', type=str, required=True,
                        help='Read 1 file path, required')
    parser.add_argument('-r2', '--read2', type=str, default=False,
                        help='Read 2 file path, default: False')
    parser.add_argument('-f', '--user_format', type=str, default='fastq',
                        help='Sequence data format, default: fastq')
    parser.add_argument('-t', '--taxID', type=int, default=False,
                        help='Virus tax ID for subsetting, default: All viral sequences')
    parser.add_argument('-g', '--genus', action='store_true',
                        help='Include sequences classified at genus')
    parser.add_argument('-s', '--stringent', action='store_true',
                        help='Only subset sequences identified by both tools')
    args = parser.parse_args()

    # Load the kodoja_search result tables.
    virus_table = pd.read_csv(os.path.join(args.file_dir, "virus_table.txt"),
                              sep="\t", header=0, index_col=False)
    vrl_table = pd.read_csv(os.path.join(args.file_dir, "kodoja_VRL.txt"),
                            sep="\t", header=0, index_col=False).fillna('')
    args.file_dir += check_path(args.file_dir)
    output_dir = os.path.join(args.file_dir, "subset_files/")

    # Create the subset output directory if needed.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if args.taxID:
        wanted_taxids = [args.taxID]
        label = 'virus_' + str(args.taxID)
        if args.genus:
            # Expand the requested species to every taxID that shares
            # its genus, using the pickled genus -> taxIDs mapping.
            with open(os.path.join(args.file_dir, 'genus_taxid.pkl'),
                      'rb') as handle:
                genus_taxid = pickle.load(handle)
            extra_ids = []
            for species_id in wanted_taxids:
                genus_name = virus_table.Genus[
                    virus_table['Species TaxID'] == species_id].values[0]
                for candidate in genus_taxid.get(genus_name, []):
                    if candidate not in extra_ids:
                        extra_ids.append(candidate)
            wanted_taxids.extend(extra_ids)
    else:
        # No tax ID requested: keep every virus/viroid classification
        # seen by either tool.
        viral_ids = list(vrl_table.kraken_tax_ID[
            vrl_table['kraken_seq_tax'].str.contains("Viruses")])
        viral_ids += list(vrl_table.kraken_tax_ID[
            vrl_table['kraken_seq_tax'].str.contains("Viroids")])
        viral_ids += list(vrl_table.kaiju_tax_ID[
            vrl_table['kaiju_seq_tax'].str.contains("Viruses")])
        wanted_taxids = sorted(set(viral_ids))
        label = 'virus_all'

    # Stringent mode requires both tools to agree on the taxID.
    if args.stringent:
        keep_rows = vrl_table['combined_result'].isin(wanted_taxids)
    else:
        keep_rows = (vrl_table['kraken_tax_ID'].isin(wanted_taxids)
                     | vrl_table['kaiju_tax_ID'].isin(wanted_taxids))

    # Since kodoja v0.0.8 the Seq_ID column has been just the ID, but
    # earlier versions stored the full description line - so keep only
    # the token before the first whitespace.
    wanted_ids = set(seq_id.rstrip("\n").split(None, 1)[0]
                     for seq_id in vrl_table.loc[keep_rows, 'Seq_ID'])

    if args.read2:
        # Paired reads.
        # Since kodoja v0.0.8 in paired-end mode the /1 or /2 suffix is
        # stripped (like Kraken does); remove it here too in case the
        # input came from an older version of kodoja.
        wanted_ids = set(seq_id[:-2] if seq_id.endswith("/1") else seq_id
                         for seq_id in wanted_ids)

        filter_sequence_file(
            args.read1,
            os.path.join(output_dir,
                         label + "_sequences1." + args.user_format),
            args.user_format, wanted_ids, ignore_suffix="/1")
        filter_sequence_file(
            args.read2,
            os.path.join(output_dir,
                         label + "_sequences2." + args.user_format),
            args.user_format, wanted_ids, ignore_suffix="/2")
    else:
        # Single reads.
        filter_sequence_file(
            args.read1,
            os.path.join(output_dir,
                         label + "_sequences1." + args.user_format),
            args.user_format, wanted_ids)
Example #4
0
                    help='Read 2 file path, default: False')
parser.add_argument('-f', '--user_format', type=str, default='fastq',
                    help='Sequence data format, default: fastq')
parser.add_argument('-t', '--taxID', type=int, default=False,
                    help='Virus tax ID for subsetting, default: All viral sequences')
parser.add_argument('-g', '--genus', action='store_true',
                    help='Include sequences classified at genus')
parser.add_argument('-s', '--stringent', action='store_true',
                    help='Only subset sequences identified by both tools')
args = parser.parse_args()

# Load the kodoja_search result tables from the results directory.
table_summary = pd.read_csv(args.file_dir + "virus_table.txt", sep="\t", header=0,
                            index_col=False)
kodoja_vrl = pd.read_csv(args.file_dir + "kodoja_VRL.txt", sep="\t", header=0,
                         index_col=False).fillna('')
# NOTE(review): check_path presumably appends "/" when missing; the two
# reads above run before it, so file_dir apparently must already end in
# "/" for them to work -- confirm against check_path's definition.
args.file_dir += check_path(args.file_dir)
output_dir = args.file_dir + 'subset_files/'


# Create directory
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

if args.taxID:
    # Subset to the single requested virus tax ID.
    TaxId_out = [args.taxID]
    label = 'virus_' + str(args.taxID)
    if args.genus:
        # Expand to genus level via the pickled genus -> taxIDs mapping.
        more_taxids = []
        with open(args.file_dir + 'genus_taxid.pkl', 'rb') as id_dict:
            genus_taxid = pickle.load(id_dict)
        for sp_taxid in TaxId_out:
Example #5
0
                    help='Kraken minimizer size, default=15')
parser.add_argument('-a',
                    '--db_tag',
                    type=str,
                    default=False,
                    help='Suffix for databases')
args = parser.parse_args()

# extra_files = ["Rubus_occidentalis_v1.0.a1.scaffolds.fna.gz",]
# extra_taxid = [75079,]
# args.extra_files = ['/home/ae42909/Scratch/GCF_000147415.1_v_1.0_genomic.fna.gz']
# args.extra_taxids = [554065]

# Both databases are built in one run, kraken first then kaiju.
tool_list = ['kraken', 'kaiju']

# NOTE(review): check_path presumably returns "/" when the path lacks a
# trailing separator and "" otherwise -- confirm against its definition.
args.output_dir += check_path(args.output_dir)
kraken_db_dir = args.output_dir + "krakenDB"
kaiju_db_dir = args.output_dir + "kaijuDB"

# Check args.output_dir exits, else make dir
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# Name databases with tag (e.g. krakenDB_test)
if args.db_tag:
    kraken_db_dir += '_' + args.db_tag
    kaiju_db_dir += '_' + args.db_tag

kraken_db_dir += check_path(kraken_db_dir)
kaiju_db_dir += check_path(kaiju_db_dir)