Пример #1
0
                if re.match(r".*txt$", file) and os.path.getsize(filePath) > 0:
                    with open(filePath) as infile:
                        for line in infile:
                            sampleName = ntpath.basename(filePath).split(".txt")[0]
                            outfile.write(line.strip() + "\t" + sampleName + "\t" + args.cohort_name + "\n")

    # Eval spHMMs
    rpackages.importr('base')
    #packageNames = ('tidyverse','ggsci','ggpubr')
    #utils = rpackages.importr('utils')
    #utils.chooseCRANmirror(ind=1)
    #packnames_to_install = [x for x in packageNames if not rpackages.isinstalled(x)]
    #if len(packnames_to_install) > 0:
    #    utils.install_packages(StrVector(packnames_to_install))

    rpackages.importr('tidyverse')
    rpackages.importr('ggsci')
    rpackages.importr('ggpubr')

    hp_hmm_directory = os.path.join(build_op_dir, 'HiPer_spHMMs')
    os.makedirs(hp_hmm_directory,0o777,True)
    with open('EvaluateSpHMMs.R', 'r') as f:
        rStr = f.read()
    myfunc = STAP(rStr, "EvaluateSpHMM")
    myfunc.EvaluateSpHMM(allHMMResult, allBLASTResult, gene_pos_file, args.prot_family_name, float(args.F1_Thresh), hmm_directory, hp_hmm_directory)

    timeTaken = time.time() - startTime
    mins = int(timeTaken / 60)
    secs = int(timeTaken) % 60
    print("\nTotal time taken : " + str(mins) + " mins " + str(secs) + " seconds")
Пример #2
0
def mbgcbuild(prot_alignment, prot_family_name, cohort_name,
              nucl_seq_directory, prot_seq_directory, seq_fmt, pair_fmt, r1_file_suffix,
              r2_file_suffix, tp_genes_nucl, blast_db_directory_map_file, blastn_search_directory, hmm_search_directory, f1_thresh,
              output_directory, cpu):
    """Build the spHMMs for a protein family and evaluate them.

    The steps run in this order: translate the true-positive genes, align
    them together with the supplied protein alignment (MUSCLE), generate the
    spHMM files and interval positions, preprocess the reads (translating
    them when no protein directory is available), run the HMMER and BLAST
    searches, merge the per-file search outputs into two combined tables and
    pass those tables to the ``EvaluateSpHMM`` R function.

    Args:
        prot_alignment: FASTA file whose records are appended to the
            translated true-positive genes before the alignment step.
        prot_family_name: Family name forwarded to spHMM generation, the
            HMMER search and the R evaluation.
        cohort_name: Label forwarded to the HMMER search and written as the
            last column of every merged BLAST row.
        nucl_seq_directory: Passed to ``PreProcessReadsPar``; the value that
            helper returns is used for the rest of the run.
        prot_seq_directory: Directory of translated reads. When it is
            ``None`` or not an existing directory, ``TranseqReadsDir`` is
            called and its return value is used instead.
        seq_fmt: Forwarded unchanged to ``PreProcessReadsPar``.
        pair_fmt: Forwarded unchanged to ``PreProcessReadsPar``.
        r1_file_suffix: Read-file suffix; ``None`` is treated as ``""``.
        r2_file_suffix: Read-file suffix; ``None`` is treated as ``""``.
        tp_genes_nucl: Nucleotide file of true-positive genes; translated
            with ``runTranSeq`` and passed to ``RunMakeDBandBlastN``.
        blast_db_directory_map_file: Optional mapping file forwarded to
            ``RunMakeDBandBlastN``; ``None`` is replaced by ``""``.
        blastn_search_directory: Directory with BLAST results; defaults to
            ``<output_directory>/build/blastn_result``. BLAST is only run
            when this directory does not exist yet.
        hmm_search_directory: Directory with HMMER results; defaults to
            ``<output_directory>/build/hmm_result``.
        f1_thresh: Threshold converted with ``float`` and handed to the R
            evaluation.
        output_directory: Root directory; everything is written below its
            ``build`` sub-directory.
        cpu: Number of worker threads; ``None`` selects 4.

    Returns:
        Path of the ``HiPer_spHMMs`` directory passed to the R evaluation.

    Raises:
        SystemExit: With status 1 when any step raises an ``Exception``; the
            traceback is printed before the support message.
    """
    try:
        startTime = time.time()
        # Fall back to four threads when the caller gives no explicit count.
        CPU_THREADS = 4 if cpu is None else int(cpu)

        # setup paths
        build_op_dir = output_directory + os.sep + "build"
        hmm_directory = os.path.join(build_op_dir, 'spHMMs')
        tp_genes_prot = build_op_dir + os.sep + "TPGenes.faa"
        alnOutput = os.path.join(build_op_dir, "TP_Homolog_Alignment.afa")
        gene_pos_file = os.path.join(build_op_dir, 'Gene_Interval_Pos.txt')
        gene_pos_file_aa = os.path.join(build_op_dir, 'Gene_Interval_Pos_AA.txt')
        if hmm_search_directory is None:
            hmm_search_directory = os.path.join(build_op_dir, 'hmm_result')
        allHMMResult = os.path.join(build_op_dir, "CombinedHmmSearch.txt")
        if blastn_search_directory is None:
            blastn_search_directory = os.path.join(build_op_dir, 'blastn_result')
        allBLASTResult = os.path.join(build_op_dir, "CombinedBLASTSearch.txt")

        # Create OP dirs
        os.makedirs(hmm_directory, mode=0o777, exist_ok=True)

        # Translate protein sequence
        runTranSeq(tp_genes_nucl, "1", tp_genes_prot)

        # Join true positives in the sample with the BGC proteins
        tmpFile = os.path.join(build_op_dir, "TP_Homolog.faa")
        joinedSeqs = []
        # Removing _1 added by TranSeq
        for seq in SeqIO.parse(tp_genes_prot, "fasta"):
            seq.id = seq.id[:-2]
            seq.description = ""
            joinedSeqs.append(seq)
        # Rewrite the translated genes with the cleaned identifiers before
        # the alignment records are appended to the same list.
        SeqIO.write(joinedSeqs, tp_genes_prot, "fasta")
        joinedSeqs.extend(SeqIO.parse(prot_alignment, "fasta"))
        SeqIO.write(joinedSeqs, tmpFile, "fasta")

        # MUSCLE align TP genes with markers
        runMUSCLE(tmpFile, alnOutput)

        # Gen spHMMs and interval pos
        # Extract spHMM coordinates from MUSCLE alignment
        hmmDict = gensphmmfiles(prot_family_name, alnOutput, tp_genes_prot,
                                hmm_directory, gene_pos_file, gene_pos_file_aa)

        if r1_file_suffix is None:
            r1_file_suffix = ""
        if r2_file_suffix is None:
            r2_file_suffix = ""

        # Preprocess synthetic reads
        nucl_seq_directory = PreProcessReadsPar(nucl_seq_directory,
                                                seq_fmt, pair_fmt,
                                                r1_file_suffix.strip(),
                                                r2_file_suffix.strip(),
                                                build_op_dir,
                                                CPU_THREADS)

        # Check if BLAST DB directory mapping file is provided or not
        if blast_db_directory_map_file is None:
            blast_db_directory_map_file = ""

        # Translate nucleotide seq. os.path.isdir(None) raises TypeError, so
        # a missing argument is handled explicitly like the other optionals.
        if prot_seq_directory is None or not os.path.isdir(prot_seq_directory):
            prot_seq_directory = TranseqReadsDir(build_op_dir, nucl_seq_directory, CPU_THREADS)

        # HMMER Search (skipped when a combined result already exists)
        if not os.path.exists(allHMMResult):
            os.makedirs(hmm_search_directory, mode=0o777, exist_ok=True)
            for hmmFileObj in hmmDict.values():
                hmmInterval = str(hmmFileObj.intervalStart) + "_" + str(hmmFileObj.intervalEnd)
                RunHMMDirectoryParallel(prot_seq_directory, hmmFileObj.hmmFile, cohort_name, prot_family_name, "30_10", hmmInterval, hmm_search_directory, CPU_THREADS)

            with open(allHMMResult, 'w') as outfile:
                for subdir, _dirs, files in os.walk(hmm_search_directory):
                    for file_name in files:
                        filePath = os.path.join(subdir, file_name)
                        if re.match(r".*txt$", file_name) and os.path.getsize(filePath) > 0:
                            with open(filePath) as infile:
                                outfile.writelines(infile)

        # BLAST Alignment (skipped when a combined result already exists)
        if not os.path.exists(allBLASTResult):
            if not os.path.isdir(blastn_search_directory):
                print("Constructing BLAST Search Dir:" + blastn_search_directory)
                os.makedirs(blastn_search_directory, mode=0o777, exist_ok=True)
                RunMakeDBandBlastN(nucl_seq_directory, blast_db_directory_map_file,
                                   tp_genes_nucl, "blastn", "-max_target_seqs 10000 -perc_identity 90.0 -outfmt \"6 sseqid slen sstart send qseqid qlen qstart qend pident evalue\" ",
                                   blastn_search_directory, CPU_THREADS)

            with open(allBLASTResult, 'w') as outfile:
                outfile.write("sseqid\tslen\tsstart\tsend\tqseqid\tqlen\tqstart\tqend\tpident\tevalue\tSample\tsampleType\n")
                for subdir, _dirs, files in os.walk(blastn_search_directory):
                    for file_name in files:
                        filePath = os.path.join(subdir, file_name)
                        if re.match(r".*txt$", file_name) and os.path.getsize(filePath) > 0:
                            # The sample name depends only on the file, so
                            # compute it once instead of once per line.
                            sampleName = os.path.basename(filePath).split(".txt")[0]
                            rowSuffix = "\t" + sampleName + "\t" + cohort_name + "\n"
                            with open(filePath) as infile:
                                for line in infile:
                                    outfile.write(line.strip() + rowSuffix)

        # Eval spHMMs
        rpackages.importr('base')
        utils = rpackages.importr('utils')
        packageNames = ('tidyverse', 'ggsci', 'ggpubr', 'dplyr', 'ggplot2')
        packnames_to_install = [x for x in packageNames if not rpackages.isinstalled(x)]
        if packnames_to_install:
            utils.install_packages(StrVector(packnames_to_install))
        for packageName in packageNames:
            rpackages.importr(packageName)

        hp_hmm_directory = os.path.join(build_op_dir, 'HiPer_spHMMs')
        os.makedirs(hp_hmm_directory, mode=0o777, exist_ok=True)
        # The R script is looked up next to the createhmm module.
        module_dir = os.path.dirname(os.path.abspath(createhmm.__file__))
        print("\nR-script path : " + module_dir)
        r_script = os.path.join(module_dir, 'EvaluateSpHMMs.R')

        with open(r_script, 'r') as f:
            rStr = f.read()
        myfunc = STAP(rStr, "EvaluateSpHMM")
        myfunc.EvaluateSpHMM(allHMMResult, allBLASTResult, gene_pos_file, prot_family_name, float(f1_thresh), hmm_directory, hp_hmm_directory)
        timeTaken = time.time() - startTime
        mins = int(timeTaken / 60)
        secs = int(timeTaken) % 60
        print("\nTotal time taken : " + str(mins) + " mins " + str(secs) + " seconds")
        return hp_hmm_directory
    except Exception:
        # Imported here so the handler does not rely on file-level imports.
        import sys
        import traceback

        # Show the real cause; the generic message alone is not actionable.
        traceback.print_exc()
        print("Metabgc-build has failed. Please check your inputs and contact support on : https://github.com/donia-lab/MetaBGC")
        # Non-zero status so shells and pipelines can detect the failure.
        sys.exit(1)
Пример #3
0
def mbgcbuild(prot_alignment, prot_family_name, cohort_name,
              nucl_seq_directory, prot_seq_directory, seq_fmt, pair_fmt,
              r1_file_suffix, r2_file_suffix, tp_genes_nucl,
              blastn_search_directory, hmm_search_directory, f1_thresh,
              output_directory, cpu):
    """Build the spHMMs for a protein family and evaluate them.

    The steps run in this order: copy the protein alignment and generate the
    spHMM files from it, translate the true-positive genes, align them
    together with the alignment records (MUSCLE) to derive the gene interval
    positions, preprocess the reads (translating them when no protein
    directory is available), run the HMMER and BLAST searches, merge the
    per-file search outputs into two combined tables and pass those tables
    to the ``EvaluateSpHMM`` R function.

    Args:
        prot_alignment: FASTA alignment file; copied into the ``spHMMs``
            directory and used from there.
        prot_family_name: Family name forwarded to spHMM generation, the
            HMMER search and the R evaluation.
        cohort_name: Label forwarded to the HMMER search and written as the
            last column of every merged BLAST row.
        nucl_seq_directory: Passed to ``PreProcessReadsPar``; the value that
            helper returns is used for the rest of the run.
        prot_seq_directory: Directory of translated reads. When it is
            ``None`` or not an existing directory, ``TranseqReadsDir`` is
            called and its return value is used instead.
        seq_fmt: Forwarded unchanged to ``PreProcessReadsPar``.
        pair_fmt: Forwarded unchanged to ``PreProcessReadsPar``.
        r1_file_suffix: Read-file suffix; ``None`` is treated as ``""``.
        r2_file_suffix: Read-file suffix; ``None`` is treated as ``""``.
        tp_genes_nucl: Nucleotide file of true-positive genes; translated
            with ``runTranSeq`` and passed to ``RunBLASTNDirectoryPar``.
        blastn_search_directory: Directory with BLAST results; defaults to
            ``<output_directory>/build/blastn_result``. BLAST is only run
            when this directory does not exist yet.
        hmm_search_directory: Directory with HMMER results; defaults to
            ``<output_directory>/build/hmm_result``.
        f1_thresh: Threshold converted with ``float`` and handed to the R
            evaluation.
        output_directory: Root directory; everything is written below its
            ``build`` sub-directory.
        cpu: Number of worker threads; ``None`` selects 4.

    Returns:
        Path of the ``HiPer_spHMMs`` directory passed to the R evaluation.
    """
    startTime = time.time()
    # Always bind CPU_THREADS: leaving it unset when cpu is None made every
    # later use fail with UnboundLocalError.
    CPU_THREADS = 4 if cpu is None else int(cpu)

    # setup paths
    build_op_dir = output_directory + os.sep + "build"
    hmm_directory = os.path.join(build_op_dir, 'spHMMs')
    prot_aln_file = os.path.join(hmm_directory,
                                 ntpath.basename(prot_alignment))
    tp_genes_prot = build_op_dir + os.sep + "TPGenes.faa"
    alnOutput = os.path.join(build_op_dir, "tmp.afa")
    gene_pos_file = os.path.join(build_op_dir, 'Gene_Interval_Pos.txt')
    if hmm_search_directory is None:
        hmm_search_directory = os.path.join(build_op_dir, 'hmm_result')
    allHMMResult = hmm_search_directory + os.sep + "CombinedHmmSearch.txt"
    if blastn_search_directory is None:
        blastn_search_directory = os.path.join(build_op_dir, 'blastn_result')
    allBLASTResult = blastn_search_directory + os.sep + "CombinedBLASTSearch.txt"

    # Gen spHMMs and interval pos
    os.makedirs(hmm_directory, mode=0o777, exist_ok=True)
    copyfile(prot_alignment, prot_aln_file)
    hmmDict = gensphmmfiles(prot_family_name, prot_aln_file, hmm_directory)

    runTranSeq(tp_genes_nucl, "1", tp_genes_prot)
    tmpFile = os.path.join(build_op_dir, "tmp.fa")

    # Join true positives in the sample with the BGC proteins
    joinedSeqs = []
    # Removing _1 added by TranSeq
    for seq in SeqIO.parse(tp_genes_prot, "fasta"):
        seq.id = seq.id[:-2]
        seq.description = ""
        joinedSeqs.append(seq)
    # Kept as a list because it is reused for the interval extraction below.
    protAlnSeqs = list(SeqIO.parse(prot_aln_file, "fasta"))
    joinedSeqs.extend(protAlnSeqs)
    SeqIO.write(joinedSeqs, tmpFile, "fasta")

    # MUSCLE align TP genes with markers
    runMUSCLE(tmpFile, alnOutput)
    # Extract spHMM coordinates from MUSCLE alignment
    gengeneposlist(prot_family_name, protAlnSeqs, hmmDict, alnOutput,
                   gene_pos_file)

    if r1_file_suffix is None:
        r1_file_suffix = ""
    if r2_file_suffix is None:
        r2_file_suffix = ""

    # Preprocess synthetic reads
    nucl_seq_directory = PreProcessReadsPar(nucl_seq_directory, seq_fmt,
                                            pair_fmt, r1_file_suffix.strip(),
                                            r2_file_suffix.strip(),
                                            build_op_dir, CPU_THREADS)
    # Translate nucleotide seq. os.path.isdir(None) raises TypeError, so a
    # missing argument is handled explicitly like the other optionals.
    if prot_seq_directory is None or not os.path.isdir(prot_seq_directory):
        prot_seq_directory = TranseqReadsDir(build_op_dir, nucl_seq_directory,
                                             CPU_THREADS)

    # HMMER Search
    os.makedirs(hmm_search_directory, mode=0o777, exist_ok=True)
    for hmmFileObj in hmmDict.values():
        hmmInterval = (str(hmmFileObj.intervalStart) + "_" +
                       str(hmmFileObj.intervalEnd))
        RunHMMDirectory(prot_seq_directory, hmmFileObj.hmmFile, cohort_name,
                        prot_family_name, "30_10", hmmInterval,
                        hmm_search_directory, CPU_THREADS)

    # The combined file lives inside the directory being walked; it must be
    # skipped or the merge would read back its own partially written output.
    combinedHmmAbs = os.path.abspath(allHMMResult)
    with open(allHMMResult, 'w') as outfile:
        for subdir, _dirs, files in os.walk(hmm_search_directory):
            for file_name in files:
                filePath = os.path.join(subdir, file_name)
                if os.path.abspath(filePath) == combinedHmmAbs:
                    continue
                if re.match(r".*txt$", file_name) and os.path.getsize(filePath) > 0:
                    with open(filePath) as infile:
                        outfile.writelines(infile)

    # BLAST Alignment
    if not os.path.isdir(blastn_search_directory):
        os.makedirs(blastn_search_directory, mode=0o777, exist_ok=True)
        RunBLASTNDirectoryPar(nucl_seq_directory, tp_genes_nucl,
                              blastn_search_directory, CPU_THREADS)

    # Same self-inclusion hazard as for the HMMER merge above.
    combinedBlastAbs = os.path.abspath(allBLASTResult)
    with open(allBLASTResult, 'w') as outfile:
        outfile.write(
            "sseqid\tslen\tsstart\tsend\tqseqid\tqlen\tqstart\tqend\tpident\tevalue\tSample\tsampleType\n"
        )
        for subdir, _dirs, files in os.walk(blastn_search_directory):
            for file_name in files:
                filePath = os.path.join(subdir, file_name)
                if os.path.abspath(filePath) == combinedBlastAbs:
                    continue
                if re.match(r".*txt$", file_name) and os.path.getsize(filePath) > 0:
                    # The sample name depends only on the file, so compute
                    # it once instead of once per line.
                    sampleName = ntpath.basename(filePath).split(".txt")[0]
                    rowSuffix = "\t" + sampleName + "\t" + cohort_name + "\n"
                    with open(filePath) as infile:
                        for line in infile:
                            outfile.write(line.strip() + rowSuffix)

    # Eval spHMMs
    rpackages.importr('base')
    utils = rpackages.importr('utils')
    packageNames = ('tidyverse', 'ggsci', 'ggpubr', 'dplyr', 'ggplot2')
    packnames_to_install = [
        x for x in packageNames if not rpackages.isinstalled(x)
    ]
    if packnames_to_install:
        utils.install_packages(StrVector(packnames_to_install))
    for packageName in packageNames:
        rpackages.importr(packageName)

    hp_hmm_directory = os.path.join(build_op_dir, 'HiPer_spHMMs')
    os.makedirs(hp_hmm_directory, mode=0o777, exist_ok=True)
    # NOTE(review): the script is resolved relative to sys.path[0], so this
    # presumably only works when launched from the directory that contains
    # the 'metabgc' package - confirm against the intended entry point.
    r_script = os.path.join(sys.path[0], 'metabgc', 'src', 'EvaluateSpHMMs.R')

    with open(r_script, 'r') as f:
        rStr = f.read()
    myfunc = STAP(rStr, "EvaluateSpHMM")
    myfunc.EvaluateSpHMM(allHMMResult,
                         allBLASTResult, gene_pos_file, prot_family_name,
                         float(f1_thresh), hmm_directory, hp_hmm_directory)
    timeTaken = time.time() - startTime
    mins = int(timeTaken / 60)
    secs = int(timeTaken) % 60
    print("\nTotal time taken : " + str(mins) + " mins " + str(secs) +
          " seconds")
    return hp_hmm_directory