def paraAT(in_dir):
    nucleotide_dir_path = in_dir / "nucleotide"
    homolog_dir_path = in_dir / "homolog"
    protein_dir_path = in_dir / "aminoacid"
    out_dir_path = directory_creater(in_dir / "ParaAT_out")
    ParaAT_stdout = directory_creater(in_dir / "ParaAT_stdout")
    ParaAT_stderr = directory_creater(in_dir / "ParaAT_stderr")
    cmds_list = []
    for gene_id in nucleotide_dir_path.iterdir():
        f = NamedTemporaryFile('w+t', delete=False)
        f.write('6')  # number of processors handed to ParaAT via -p
        f.flush()  # flush so ParaAT can actually read the value from the file
        cmds_list.append(([
            "/mnt/d/zhes_learning_space/software_in_ubuntu/ParaAT2.0/ParaAT.pl",
            "-h",
            str(homolog_dir_path / (gene_id.stem + ".txt")), "-n",
            str(nucleotide_dir_path / (gene_id.stem + ".fasta")), "-a",
            str(protein_dir_path / (gene_id.stem + ".fasta")), "-p", f.name,
            "-m", "muscle", "-f", "axt", "-g", "-k", "-o",
            str(out_dir_path / gene_id.stem)
        ], gene_id.stem))
    procs_list = [
        subprocess.Popen(cmd[0],
                         stdout=(ParaAT_stdout / (cmd[1] + ".txt")).open('w'),
                         stderr=(ParaAT_stderr / (cmd[1] + ".txt")).open('w'))
        for cmd in cmds_list
    ]
    for proc in procs_list:
        proc.wait()
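directory_creater is imported from Directory_creater elsewhere in this collection but its body is never shown; a minimal sketch of what it presumably does, so the examples read stand-alone (a hypothetical implementation, not the project's own code):

from pathlib import Path


def directory_creater(dir_path):
    # presumed behaviour: create the directory (parents included) if missing
    # and hand it back as a Path so it can be chained into "/" joins
    dir_path = Path(dir_path)
    dir_path.mkdir(parents=True, exist_ok=True)
    return dir_path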
Example #2
def extract_ortholog_gene(ortholog_blast_path_name):
    '''
    input 1: strain_db_dir_path
    input 2: contig_path
    input 3: orthogroup_sequence_path
    output 1: ortholog_blast_path_name
    '''
    general_out_path = Path(ortholog_blast_path_name)
    blast_general = general_out_path / "blast_general"
    if blast_general.exists() is False:
        blast_general.mkdir()
    all_row_gene_dir_path = directory_creater(general_out_path /
                                              "all_row_gene")
    all_row_gene_fasta_dir = directory_creater(all_row_gene_dir_path / "fasta")
    all_row_gene_list_dir = directory_creater(all_row_gene_dir_path / "list")
    all_row_gene_list_dir_all = directory_creater(all_row_gene_dir_path /
                                                  "all_list")
    blast_db_path = blast_general / "blast_db"
    if blast_db_path.exists() is False:
        blast_db_path.mkdir()
    blast_out_path = blast_general / "blast_out"
    if blast_out_path.exists() is False:
        blast_out_path.mkdir()
    blast_out_xml_path = blast_out_path / "blast_out_xml"
    if blast_out_xml_path.exists() is False:
        blast_out_xml_path.mkdir()
    blast_out_asn_path = blast_out_path / "blast_out_asn"
    if blast_out_asn_path.exists() is False:
        blast_out_asn_path.mkdir()
    blast_out_txt_path = blast_out_path / "blast_out_txt"
    if blast_out_txt_path.exists() is False:
        blast_out_txt_path.mkdir()
    blast_identity_value_dir = blast_general / "blast_identity_value"
    if blast_identity_value_dir.exists() is False:
        blast_identity_value_dir.mkdir()

    base = importr("base")
    utils = importr("utils")
    # R_blast_vlaue_list=base.list()
    # Parallel(n_jobs=12)(delayed(generate_row_list)(orthogroup_file,all_row_gene_list_dir_all) for orthogroup_file in orthogroup_sequence_path.iterdir()
    # Parallel(n_jobs=12)(delayed(extract_gene_gff)(orthogroup_file,strain_db_dir_path,contig_path,all_row_gene_fasta_dir) for orthogroup_file in orthogroup_sequence_path.iterdir())
    calls = []
    no_mgg_fl = (general_out_path / "no_MGG.txt").open('w')
    for all_row_gene_fasta_file in all_row_gene_fasta_dir.iterdir():
        calls.append(
            delayed(ortholog_blast)(
                all_row_gene_fasta_file,
                blast_identity_value_dir,
                # no_mgg_fl,
                utils.read_table,
                utils.write_table,
                blast_db_path,
                blast_out_xml_path,
                blast_out_asn_path,
                blast_out_txt_path,
                base.list,
                base.do_call))
    dask.compute(*calls)
Example #3
def blast(species_path, query_file_1, species_out_path_name, pav_excel_name):
    '''
    input 1: contig directory (species_path)
    input 2: blastn query file (pan genes)
    intermediate out: species_out_path = Path('../Pan_genome_data/c_blast_present_contig/')
    output 1: pav_excel_name
    '''
    global pan_sh
    global query_file
    query_file = query_file_1
    species_out_path = Path(species_out_path_name)
    excel_book = Workbook()
    pan_sh = excel_book.active
    species_count = 1

    global species_db_dir, species_out_xml_dir, species_out_asn_dir, species_out_txt_dir
    species_db_dir = directory_creater(species_out_path / "blastdb")
    species_out_xml_dir = directory_creater(species_out_path / "xml")
    species_out_asn_dir = directory_creater(species_out_path / "asn")
    species_out_txt_dir = directory_creater(species_out_path / "txt")

    Parallel(n_jobs=1)(delayed(run_blast)(i)
                       for i in species_path.glob("1106.2.fasta"))
    for species_file in species_path.glob("1106.2.fasta"):
        # print (str(species_file)+"\n")
        species_count = species_count + 1
        species_name = species_file.stem

        excel_species_name(species_count + 1, species_name)
        with open(species_out_xml_dir / (species_name + ".xml")) as fl:
            for record in NCBIXML.parse(fl):
                gene_name = record.query.split()[0]
                if record.alignments:
                    max_flag = -1
                    #out_to_excel(species_count,record.query,1)
                    for alignment in record.alignments:
                        for hsp in alignment.hsps:
                            if max_flag == -1:
                                identity_discriminant_for_length = hsp.align_length / record.query_length
                                identity_discriminant_for_identity_perscent = hsp.identities / hsp.align_length
                                max_flag = max_flag + 2
                                if hsp.align_length < 100 and identity_discriminant_for_length < 0.5:
                                    out_to_excel(species_count, gene_name, 0)
                                elif identity_discriminant_for_length == 1 and identity_discriminant_for_identity_perscent == 1:
                                    out_to_excel(species_count, gene_name, 4)
                                elif identity_discriminant_for_length == 1 and hsp.gaps == 0:
                                    out_to_excel(species_count, gene_name, 3)
                                elif hsp.align_length > record.query_length or record.query_length - hsp.align_length <= 50:
                                    out_to_excel(species_count, gene_name, 2)
                                else:
                                    out_to_excel(species_count, gene_name, 1)
                else:
                    out_to_excel(species_count, gene_name, 0)
    excel_book.save(pav_excel_name)
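blast() leans on two helpers defined elsewhere in the project; a hypothetical sketch of how they could fill the shared openpyxl worksheet pan_sh (the gene_row_index bookkeeping is an assumption, not the author's code):

gene_row_index = {}  # hypothetical: query gene name -> worksheet row


def excel_species_name(column, species_name):
    # label the strain's column in the header row of the shared worksheet
    pan_sh.cell(row=1, column=column, value=species_name)


def out_to_excel(column, gene_name, category):
    # one row per query gene; the PAV category (0..4) goes under the strain's column
    row = gene_row_index.setdefault(gene_name, len(gene_row_index) + 2)
    pan_sh.cell(row=row, column=1, value=gene_name)
    pan_sh.cell(row=row, column=column, value=category)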
Example #4
def prepare_for_ParaAT(joined_df_file_name,coding_gene_base_file_path,protein_base_file_path,out_dir):
    '''
    input 1: joined_df_file_name
    input 2: coding_gene_base_file_path
    input 3: protein_base_file_path
    output 1: out_dir
    '''
    global coding_gene_base,protein_base,gene_fasta_fl,protein_fasta_fl
    gene_dir_path=directory_creater(out_dir/"nucleotide")
    protein_dir_path=directory_creater(out_dir/"aminoacid")
    fasta_base_dir_path=directory_creater(out_dir/"gene_protein_base")
    homolog_dir_path=directory_creater(out_dir/"homolog")
    coding_gene_base=SeqIO.index(
        str(coding_gene_base_file_path),
        "fasta"
        )
    protein_base=SeqIO.index(
        str(protein_base_file_path),
        "fasta"
        )
    base=importr("base")
    utils=importr("utils")
    ortholog_joined_df=utils.read_table(
        joined_df_file_name,
        sep = "\t",
        header = True,
        **{'stringsAsFactors': False},
        **{'check.names': False}
        )
    ortholog_joined_df_sub=ortholog_joined_df.rx(True,-1)
    for i in range(1,(int(base.nrow(ortholog_joined_df)[0])+1)):
                df_row=ortholog_joined_df_sub.rx(i, True)
                df_row_iter=iter(df_row)
                head_list=next(df_row_iter)[0].split()
                if len(head_list)==1:
                    gene_fasta=gene_dir_path/(head_list[0]+".fasta")
                    protein_fasta=protein_dir_path/(head_list[0]+".fasta")
                    homolog_file_path=homolog_dir_path/(head_list[0]+".txt")
                    if gene_fasta.is_file() is True:continue
                    with gene_fasta.open('w') as gene_fasta_fl:
                        with protein_fasta.open('w') as protein_fasta_fl:
                            with homolog_file_path.open('w') as homolog_fl:
                                extract_gene(head_list[0])
                                homolog_fl.write(head_list[0]+"\t")
                                for homolog_id in one_head(df_row_iter):
                                    homolog_fl.write(homolog_id+"\t")
                                homolog_fl.write("\n")
                else:
                    two_head(head_list,df_row_iter)
Example #5
def format_paraAT_parameter(in_dir):
    nucleotide_dir_path=in_dir/"nucleotide"
    homolog_dir_path=in_dir/"homolog"
    protein_dir_path=in_dir/"aminoacid"
    out_dir_path=directory_creater(in_dir/"ParaAT_out")
    for gene_id in nucleotide_dir_path.iterdir():
        f=NamedTemporaryFile('w+t',delete=False)
        f.write('1')  # number of processors handed to ParaAT via -p
        f.flush()  # flush so ParaAT can actually read the value from the file
        yield ([
            "/mnt/d/zhes_learning_space/software_in_ubuntu/ParaAT2.0/ParaAT.pl",
            "-h",
            str(homolog_dir_path/(gene_id.stem+".txt")),
            "-n",
            str(nucleotide_dir_path/(gene_id.stem+".fasta")),
            "-a",
            str(protein_dir_path/(gene_id.stem+".fasta")),
            "-p",
            f.name,
            "-m",
            "muscle",
            "-f",
            "axt",
            # "-g",
            "-k",
            "-o",
            str(out_dir_path/gene_id.stem)
        ],
        gene_id.stem
        )
def extract_ortholog_gene(strain_db_dir_path,contig_path,joined_df_file_name,ortholog_blast_path_name):
    '''
    input 1: strain_db_dir_path
    input 2: contig_path
    input 3: joined_df_file_name
    output 1: ortholog_blast_path_name
    '''
    general_out_path=Path(ortholog_blast_path_name)
    blast_general=general_out_path / "blast_general"
    if blast_general.exists() is False:
        blast_general.mkdir()
    all_row_gene_dir_path=directory_creater(general_out_path/"all_row_gene")
    all_row_gene_fasta_dir=directory_creater(all_row_gene_dir_path/"fasta")
    all_row_gene_list_dir=directory_creater(all_row_gene_dir_path/"list")
    all_row_gene_list_dir_all=directory_creater(all_row_gene_dir_path/"all_list")
    blast_db_path=blast_general/"blast_db"
    if blast_db_path.exists() is False:
        blast_db_path.mkdir()
    blast_out_path=blast_general/"blast_out"
    if blast_out_path.exists() is False:
        blast_out_path.mkdir()
    blast_out_xml_path=blast_out_path/"blast_out_xml"
    if blast_out_xml_path.exists() is False:
        blast_out_xml_path.mkdir()
    blast_out_asn_path=blast_out_path/"blast_out_asn"
    if blast_out_asn_path.exists() is False:
        blast_out_asn_path.mkdir()
    blast_out_txt_path=blast_out_path/"blast_out_txt"
    if blast_out_txt_path.exists() is False:
        blast_out_txt_path.mkdir()
    blast_identity_value_dir=blast_general/"blast_identity_value"
    if blast_identity_value_dir.exists() is False:
        blast_identity_value_dir.mkdir()

    base=importr("base")
    utils=importr("utils")
    ortholog_joined_df=utils.read_table(
        str(joined_df_file_name),
        sep = "\t",
        header = True,
        **{'stringsAsFactors': False},
        **{'check.names': False}
        )
    na_count=0
    R_blast_vlaue_list=base.list()
    Parallel(n_jobs=12)(delayed(generate_row_list)(ortholog_joined_df.rx(i, True),all_row_gene_list_dir,all_row_gene_list_dir_all) for i in range(1,(int(base.nrow(ortholog_joined_df)[0])+1)))
    Parallel(n_jobs=12)(delayed(extract_gene_gff)(all_row_gene_list_file,strain_db_dir_path,contig_path,all_row_gene_fasta_dir) for all_row_gene_list_file in all_row_gene_list_dir_all.iterdir())
Example #7
def calculate_nucleotide_diversity(gene_category_dir_name, output_dir):
    '''
    input 1: gene category dir path
    output 1: nucleotide_diversity_dir
    '''
    global MGG_db
    MGG_db = gffutils.FeatureDB(
        "../Pan_genome_data/ortholog/gffutils_db/MGGdb.db")
    gene_category_dir_path = Path(gene_category_dir_name)
    category_bed_dir_path = directory_creater(output_dir / "category_bed")
    pi_result_dir_path = directory_creater(output_dir / "pi_result")
    pi_std_out_err_dir_path = directory_creater(output_dir / "pi_std_out_err")
    for category_file in gene_category_dir_path.iterdir():
        bed_file_path = category_bed_dir_path / (category_file.stem + ".bed")
        if bed_file_path.is_file() is False:
            pybedtools.BedTool(generate_interval(
                category_file.open())).saveas(bed_file_path)
        vcftool(
            bed_file_path, pi_result_dir_path / category_file.stem,
            pi_std_out_err_dir_path / (category_file.stem + "_out_err.txt"))
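vcftool() is not shown in this collection; a minimal sketch assuming it wraps vcftools with --vcf/--bed/--site-pi/--out (the VCF path below is a placeholder, not the project's real input):

import subprocess


def vcftool(bed_file_path, out_prefix, std_out_err_path):
    # per-site nucleotide diversity (pi) restricted to the category's BED intervals
    with open(std_out_err_path, 'w') as log:
        subprocess.run([
            "vcftools",
            "--vcf", "../Pan_genome_data/snp/call_snp_gatk.vcf",  # assumed input VCF
            "--bed", str(bed_file_path),
            "--site-pi",
            "--out", str(out_prefix),
        ], stdout=log, stderr=subprocess.STDOUT, check=False)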
Example #8
def CombineGVCFs(snp_dir):
    '''
    input 1: snp_dir generated by the call-snp step
    '''
    vcfdb_dir = directory_creater(snp_dir / "CombineGVCFs_vcf")
    std_out_err = directory_creater(snp_dir / "intermediate_files" /
                                    "CombineGVCFs_std_out_err")
    call_list = [
        "/mnt/d/zhes_learning_space/software_in_ubuntu/gatk-4.1.8.0/gatk",
        "--java-options", "-Xmx20g -Xms20g", "CombineGVCFs", "-R",
        "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/70-15_supercontigs.fasta",
        "-O",
        str(vcfdb_dir / "pan_genome_CombineGVCFs.vcf")
    ]
    gvcf_dir_path = snp_dir / "gvcf"  # same layout used by GenomicsDBImport below; was undefined here
    for vcf_file in gvcf_dir_path.glob("*.vcf"):
        call_list.append("-V")
        call_list.append(str(vcf_file))
    call_CombineGVCFs = subprocess.Popen(
        call_list,
        stdout=(std_out_err / "CombineGVCFs_stdout.txt").open('w'),
        stderr=(std_out_err / "CombineGVCFs_stderr.txt").open('w'))
    call_CombineGVCFs.wait()
Example #9
def GenotypeGVCFs(snp_dir):
    input_vcf_file = snp_dir / "CombineGVCFs_vcf" / "pan_genome_CombineGVCFs.vcf"
    output_vcf_file = snp_dir / "call_snp_gatk.vcf"
    std_out_err_dir = directory_creater(snp_dir / "intermediate_files" /
                                        "GenotypeGVCFs_out_err")
    call_GenotypeGVCFs = subprocess.Popen([
        "/mnt/d/zhes_learning_space/software_in_ubuntu/gatk-4.1.8.0/gatk",
        "--java-options", "-Xmx20g -Xms20g", "GenotypeGVCFs", "-R",
        "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/70-15_supercontigs.fasta",
        "-V",
        str(input_vcf_file), "-O",
        str(output_vcf_file)
    ],
                                          stdout=(std_out_err_dir /
                                                  "GenotypeGVCFs_out.txt"
                                                  ).open('w'),
                                          stderr=(std_out_err_dir /
                                                  "GenotypeGVCFs_err.txt"
                                                  ).open('w'))
    call_GenotypeGVCFs.wait()
Example #10
def GenomicsDBImport(snp_dir):
    '''
    input 1: snp_dir generated by the call-snp step
    '''
    gvcf_dir_path = snp_dir / "gvcf"
    vcfdb_dir = snp_dir / "vcfdb"
    std_out_err = directory_creater(snp_dir / "intermediate_files" /
                                    "GenomicsDBImport_std_out_err")
    call_list = [
        "/mnt/d/zhes_learning_space/software_in_ubuntu/gatk-4.1.8.0/gatk",
        "--java-options", "-Xmx20g -Xms20g", "GenomicsDBImport",
        "--genomicsdb-workspace-path",
        str(vcfdb_dir)
    ]
    for vcf_file in gvcf_dir_path.glob("*.vcf"):
        call_list.append("-V")
        call_list.append(str(vcf_file))
    call_GenomicsDBImport = subprocess.Popen(
        call_list,
        stdout=(std_out_err / "GenomicsDBImport_stdout.txt").open('w'),
        stderr=(std_out_err / "GenomicsDBImport_stderr.txt").open('w'))
    call_GenomicsDBImport.wait()
def pav_contig_present_blast_gth_main (species_path,query_file_1,species_out_path_name,pav_excel_name):
    '''
    input 1: contig directory (species_path)
    input 2: blastn query file (pan genes)
    intermediate out: species_out_path = Path('../Pan_genome_data/c_blast_present_contig/')
    output 1: pav_excel_name
    '''
    global query_file
    query_file=query_file_1
    species_out_path=Path(species_out_path_name)
    
    global species_db_dir,species_out_xml_dir,species_out_asn_dir,species_out_txt_dir
    species_db_dir=directory_creater(species_out_path/"blastdb")
    species_out_xml_dir=directory_creater(species_out_path/"xml")
    species_out_asn_dir=directory_creater(species_out_path/"asn")
    species_out_txt_dir=directory_creater(species_out_path/"txt")
    species_out_gth_dir=directory_creater(species_out_path/"gth_out")
    species_out_gth_stdout_dir=directory_creater(species_out_path/"gth_stdout")
    gff_db_dir_path=directory_creater(species_out_path/"gff_db")
    # Parallel(n_jobs=12)(delayed(run_blast)(query_file,i,species_out_path) for i in species_path.glob("*.fasta"))
    # Parallel(n_jobs=12)(delayed(run_gth)(query_file,i,species_out_path) for i in species_path.glob("*.fasta"))
    # Parallel(n_jobs=12)(delayed(update_gff_db)(i,gff_db_dir_path) for i in species_out_gth_dir.glob("*.gff"))
    excel(species_out_xml_dir,species_out_path,pav_excel_name)
def extract_ortholog_gene(strain_db_dir_path,contig_path,joined_df_file_name,ortholog_blast_path_name):
    '''
    input 1: strain_db_dir_path
    input 2: contig_path
    input 3: joined_df_file_name
    output 1: ortholog_blast_path_name
    '''
    general_out_path=Path(ortholog_blast_path_name)
    blast_general=general_out_path / "blast_general"
    if blast_general.exists() is False:
        blast_general.mkdir()
    all_row_gene_dir_path=directory_creater(general_out_path/"all_row_gene")
    all_row_gene_fasta_dir=directory_creater(all_row_gene_dir_path/"fasta")
    all_row_gene_list_dir=directory_creater(all_row_gene_dir_path/"list")
    all_row_gene_list_dir_all=directory_creater(all_row_gene_dir_path/"all_list")
    blast_db_path=blast_general/"blast_db"
    if blast_db_path.exists() is False:
        blast_db_path.mkdir()
    blast_out_path=blast_general/"blast_out"
    if blast_out_path.exists() is False:
        blast_out_path.mkdir()
    blast_out_xml_path=blast_out_path/"blast_out_xml"
    if blast_out_xml_path.exists() is False:
        blast_out_xml_path.mkdir()
    blast_out_asn_path=blast_out_path/"blast_out_asn"
    if blast_out_asn_path.exists() is False:
        blast_out_asn_path.mkdir()
    blast_out_txt_path=blast_out_path/"blast_out_txt"
    if blast_out_txt_path.exists() is False:
        blast_out_txt_path.mkdir()
    blast_identity_value_dir=blast_general/"blast_identity_value"
    if blast_identity_value_dir.exists() is False:
        blast_identity_value_dir.mkdir()
    lower_blast_value_dir_path=directory_creater(general_out_path/"lower_blast_value")
    best_sequence_id_list_dir_path=directory_creater(general_out_path/"best_sequence_id_list")
    single_copy_fasta_dir_path=directory_creater(general_out_path.parent/"MAFFT_ortholog_MGG"/"in_put_fasta")
    base=importr("base")
    utils=importr("utils")
    
    # na_count=0
    # R_blast_vlaue_list=base.list()
    # # client = Client(processes=False)
    
    # generate_row_list_calls=[]
    # extract_gene_gff_calls=[]
    # for i in range(1,(int(base.nrow(ortholog_joined_df)[0])+1)):
    #     df_row=ortholog_joined_df.rx(i, True)
    #     df_row_iter=iter(df_row[2:])
    #     head_id=df_row.rx(1,2)[0]
    #     if re.search(",",head_id) is not None:
    #         continue
    #     all_row_gene_fasta_file=all_row_gene_fasta_dir / (head_id+"_"+"_all_row_gene_fasta.fasta")
        # generate_row_list_calls.append(dask.delayed(generate_row_list)(
        #     df_row_iter,
        #     head_id,
        #     all_row_gene_list_dir,
        #     all_row_gene_list_dir_all
        # ))
        # extract_gene_gff_calls.append(dask.delayed(extract_gene_gff)(
        #     all_row_gene_list_dir_all/(head_id+".txt"),
        #     strain_db_dir_path,
        #     contig_path,
        #     all_row_gene_fasta_dir
        # ))
    # dask.compute(*generate_row_list_calls)
    # dask.compute(*extract_gene_gff_calls)
    '''
    generate_row_list
    '''
    # ortholog_joined_df=utils.read_table(
    #     str(joined_df_file_name),
    #     sep = "\t",
    #     header = True,
    #     **{'stringsAsFactors': False},
    #     **{'check.names': False}
    #     )
    # head_id_list=[]
    # strain_num_list=[]
    # Parallel(n_jobs=1)(delayed(generate_row_list)(
    #     ortholog_joined_df.rx(i, True),
    #     all_row_gene_list_dir,
    #     all_row_gene_list_dir_all,
    #     head_id_list,
    #     strain_num_list
    #     ) for i in range(1,(int(base.nrow(ortholog_joined_df)[0])+1)))
    # with (general_out_path/"MGG_strain_num.txt").open('w') as MGG_strain_num_fl:
    #     for head_id_1,strain_num_1 in zip(head_id_list,strain_num_list):
    #         MGG_strain_num_fl.write("{}\t{}\n".format(head_id_1,strain_num_1))
    '''
    extract_gene_gff
    '''
    # Parallel(n_jobs=12)(delayed(extract_gene_gff)(all_row_gene_list_file,strain_db_dir_path,contig_path,all_row_gene_fasta_dir) for all_row_gene_list_file in all_row_gene_list_dir_all.iterdir())
    # Parallel(n_jobs=1)(delayed(extract_gene_gff)(all_row_gene_list_file,strain_db_dir_path,contig_path,all_row_gene_fasta_dir) for all_row_gene_list_file in all_row_gene_list_dir_all.glob("MGG_16565T0*"))
    '''
    ortholog_blast
    '''
    # output_list=[]
    # lineList=[]
    # rdocall=base.do_call
    # rread_table=utils.read_table
    # rwrite_table=utils.write_table
    # rlist=base.list
    # rlength=base.length
    # only_itself_list=[]
    # length_0_list=[]
    # no_mgg_list=[]
    # for all_row_gene_fasta_file in all_row_gene_fasta_dir.glob("MGG_01742T0*"):
    # # for all_row_gene_fasta_file in all_row_gene_fasta_dir.iterdir():
    #     R_blast_vlaue_df=dask.delayed(ortholog_blast)(
    #     # R_blast_vlaue_df=ortholog_blast(
    #         all_row_gene_fasta_file,
    #         # all_row_gene_fasta_dir,
    #         blast_identity_value_dir,
    #         only_itself_list,
    #         length_0_list,
    #         no_mgg_list,
    #         rread_table,
    #         rwrite_table,
    #         blast_db_path,
    #         blast_out_xml_path,
    #         blast_out_asn_path,
    #         blast_out_txt_path,
    #         rlist,
    #         rlength,
    #         rdocall
    #         )
    #     output_list.append(R_blast_vlaue_df)
    # total=dask.delayed(merge_df)(output_list,base.list,utils.write_table,base.do_call,general_out_path)
    # # total.visualize()
    # total.compute()
    '''
    parse_blast_result
    '''
    # mem=Memory(general_out_path/"parse_blast_result_cache")
    # parse_blast_result_mem=mem.cache(parse_blast_result,verbose=0)
    # reddd=Parallel(n_jobs=10)(delayed(parse_blast_result_mem)(
    #     all_row_gene_list_dir/(blast_identity_value_tsv.stem+"_singlecopy_list.txt"),
    #     all_row_gene_list_dir/(blast_identity_value_tsv.stem+"_multicopy_list.txt"),
    #     blast_identity_value_tsv,
    #     lower_blast_value_dir_path/(blast_identity_value_tsv.stem+".txt"),
    #     best_sequence_id_list_dir_path/(blast_identity_value_tsv.stem+".txt")
    # ) for blast_identity_value_tsv in blast_identity_value_dir.iterdir())
    # # ) for blast_identity_value_tsv in blast_identity_value_dir.glob("MGG_00010T0*"))
    # with (general_out_path/"best_strain_num.txt").open('w') as best_strain_num_fl:
    #     for head_id_1,strain_num_1 in reddd:
    #         best_strain_num_fl.write("{}\t{}\n".format(head_id_1,strain_num_1))

    '''
    extract_gene
    '''
    Parallel(n_jobs=12)(delayed(extract_gene)(
        best_sequence_id_list,
        all_row_gene_fasta_dir/(best_sequence_id_list.stem+".fasta"),
        single_copy_fasta_dir_path/(best_sequence_id_list.stem+".fasta")
    ) for best_sequence_id_list in best_sequence_id_list_dir_path.iterdir())
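The three-argument extract_gene() used above is defined elsewhere; a hypothetical sketch that keeps only the best-hit IDs (assumed one ID per line in the list file) from the per-orthogroup FASTA:

from Bio import SeqIO


def extract_gene(best_sequence_id_list, all_row_fasta, single_copy_fasta):
    # read the wanted sequence IDs, then write just those records to the output FASTA
    with open(best_sequence_id_list) as fl:
        wanted = {line.strip().split("\t")[0] for line in fl if line.strip()}
    records = (rec for rec in SeqIO.parse(str(all_row_fasta), "fasta")
               if rec.id in wanted)
    SeqIO.write(records, str(single_copy_fasta), "fasta")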
Example #13
def call_snp_HaplotypeCaller(id_file, contig_dir_name, out_dir):
    '''
    input 1: strain 95 file
    input 2: contig_dir_name
    output 1: out dir
    '''
    global mummer_stderr_dir, mummer_stdout_dir, contig_dir_path, sam_out_dir
    sam_out_dir = directory_creater(out_dir / "mummer_sam_files")
    raw_bam_out_dir = directory_creater(out_dir / "raw_bam_files")
    RG_bam_dir = directory_creater(out_dir / "RG_bam_files")
    intermediate_dir = directory_creater(out_dir / "intermediate_files")
    mummer_stderr_dir = directory_creater(intermediate_dir / "mummer_err")
    mummer_stdout_dir = directory_creater(intermediate_dir / "mummer_out")
    samview_stderr_dir = directory_creater(intermediate_dir / "sam_view_err")
    add_RG_stdout_dir = directory_creater(intermediate_dir / "add_RG_out")
    add_RG_stderr_dir = directory_creater(intermediate_dir / "add_RG_err")
    strain_95_list = extract_strain_id(id_file)
    strain_95_list.append("ina168")
    strain_95_list.remove("magnaporthe_oryzae_70-15_8_proteins_T0")
    contig_dir_path = Path(contig_dir_name)
    replace_header_sam_dir = directory_creater(intermediate_dir /
                                               "replace_header_sam")
    bam_sort_dir_path = directory_creater(intermediate_dir / "bam_sort")
    bam_sort_stdout_dir_path = directory_creater(intermediate_dir /
                                                 "bam_sort_out")
    bam_sort_stderr_dir_path = directory_creater(intermediate_dir /
                                                 "bam_sort_err")
    bam_index_stdout_dir_path = directory_creater(intermediate_dir /
                                                  "bam_index_out")
    bam_index_stderr_dir_path = directory_creater(intermediate_dir /
                                                  "bam_index_err")
    gvcf_dir = directory_creater(out_dir / "gvcf")
    haplotypecaller_stdout_dir_path = directory_creater(
        intermediate_dir / "haplotypecaller_stdout")
    haplotypecaller_stderr_dir_path = directory_creater(
        intermediate_dir / "haplotypecaller_stderr")
    for strain_95_id in strain_95_list:
        mummer_sam_out = sam_out_dir / (strain_95_id + ".sam")
        if mummer_sam_out.is_file() is False:
            mummer_call(strain_95_id, str(mummer_sam_out))
        replace_header_sam_file_path = replace_header_sam_dir / (strain_95_id +
                                                                 ".sam")
        if replace_header_sam_file_path.is_file() is False:
            replace_header(mummer_sam_out, replace_header_sam_file_path)
        raw_bam_file_path = raw_bam_out_dir / (strain_95_id + ".bam")
        if raw_bam_file_path.is_file() is False:
            sam2bam(replace_header_sam_file_path, raw_bam_file_path,
                    samview_stderr_dir / (strain_95_id + "_samview_err.txt"))
        RG_bam_file_path = RG_bam_dir / (strain_95_id + ".bam")
        if RG_bam_file_path.is_file() is False:
            add_RG(raw_bam_file_path, RG_bam_file_path,
                   add_RG_stdout_dir / (strain_95_id + "_out.txt"),
                   add_RG_stderr_dir / (strain_95_id + "_err.txt"))
        bam_RG_sort_file_path = bam_sort_dir_path / (strain_95_id +
                                                     "_sort.bam")
        if bam_RG_sort_file_path.is_file() is False:
            bam_sort(RG_bam_file_path, bam_RG_sort_file_path,
                     bam_sort_stdout_dir_path / (strain_95_id + "_out.txt"),
                     bam_sort_stderr_dir_path / (strain_95_id + "_err.txt"))
            bam_index(bam_RG_sort_file_path,
                      bam_index_stdout_dir_path / (strain_95_id + "_out.txt"),
                      bam_index_stderr_dir_path / (strain_95_id + "_err.txt"))
        gvcf_file_path = gvcf_dir / (strain_95_id + ".vcf")
        if gvcf_file_path.is_file() is False:
            haplotypecaller(
                bam_RG_sort_file_path, gvcf_file_path,
                haplotypecaller_stdout_dir_path / (strain_95_id + "_out.txt"),
                haplotypecaller_stderr_dir_path / (strain_95_id + "_err.txt"))
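The samtools wrappers this example assumes (sam2bam, bam_sort, bam_index) are not shown; a minimal sketch built on the standard samtools subcommands, matching the call signatures above:

import subprocess


def sam2bam(sam_path, bam_path, err_path):
    # samtools view -bS: convert SAM to BAM, logging stderr per strain
    with open(bam_path, 'wb') as bam, open(err_path, 'w') as err:
        subprocess.run(["samtools", "view", "-bS", str(sam_path)],
                       stdout=bam, stderr=err, check=False)


def bam_sort(bam_path, sorted_bam_path, out_path, err_path):
    with open(out_path, 'w') as out, open(err_path, 'w') as err:
        subprocess.run(["samtools", "sort", "-o", str(sorted_bam_path), str(bam_path)],
                       stdout=out, stderr=err, check=False)


def bam_index(sorted_bam_path, out_path, err_path):
    with open(out_path, 'w') as out, open(err_path, 'w') as err:
        subprocess.run(["samtools", "index", str(sorted_bam_path)],
                       stdout=out, stderr=err, check=False)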
Example #14
# draw_stack()
# set_minus(str(R_result)+"/")
# drawer_curve(str(R_result)+"/")

pav_with_cluster_out_file_name = "../Pan_genome_data/pav_with_cluster.tsv"
clade_set = ["1", "2", "3", "4"]
color_set = "Set2"
cluster_clade_file_name = "../Pan_genome_data/R_result/strain_clade_category_ID.txt"

# cluster(pav_excel,clade_set,color_set,pav_with_cluster_out_file_name,cluster_clade_file_name,str(R_result)+"/")
# write_cluster_result()
# set_minus_with_cluster(str(R_result)+"/",cluster_clade_file_name,clade_set,color_set)

orthofinder_assianed_tsv_file_name = "../Pan_genome_data/Results_May22_1/Orthogroups/Orthogroups.tsv"
orthofinder_unassianed_tsv_file_name = "../Pan_genome_data/Results_May22_1/Orthogroups/Orthogroups_UnassignedGenes.tsv"
set_minus_orthofinder_result = directory_creater(
    "../Pan_genome_data/set_minus_orthofinder_result")
pav_orthofinder_file_name = "../Pan_genome_data/set_minus_orthofinder_result/pav_orthofinder.xlsx"
# R_set_minus_cut_orthofinder(orthofinder_assianed_tsv_file_name,orthofinder_unassianed_tsv_file_name,pan_id_file,str(set_minus_orthofinder_result)+"/")
# set_minus(str(set_minus_orthofinder_result)+"/")
# drawer_curve(str(set_minus_orthofinder_result)+"/")

MGG_70_15 = "../../70-15_refference_genome/magnaporthe_oryzae_70-15_8_genes.fasta"
Augustus_70_15 = "../../GFF/70-15_gene.fasta"
MGG_Augustus_70_15_dir = directory_creater(
    "../Pan_genome_data/70-15_MGG_Augustus")
MGG_Augustus_70_15_blast_db_dir = directory_creater(MGG_Augustus_70_15_dir /
                                                    "blast_db")
blast_out_asn_file = MGG_Augustus_70_15_dir / "blast_out.asn"
blast_out_xml_file = MGG_Augustus_70_15_dir / "blast_out.xml"
blast_out_txt_file = MGG_Augustus_70_15_dir / "blast_out.txt"
MGG_unpresent_Augustus = MGG_Augustus_70_15_dir / "MGG_unpresent_Augustus_list.txt"
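The asn/xml/txt files declared above are presumably produced with the standard BLAST+ tools; a hypothetical sketch (the database prefix and output formats are assumptions):

import subprocess


def blast_MGG_vs_Augustus_70_15():
    # build a nucleotide database from the Augustus gene set, search the MGG genes
    # against it into an ASN.1 archive, then reformat the archive as XML and tabular
    db_prefix = str(MGG_Augustus_70_15_blast_db_dir / "Augustus_70_15")
    subprocess.run(["makeblastdb", "-in", Augustus_70_15, "-dbtype", "nucl",
                    "-out", db_prefix], check=False)
    subprocess.run(["blastn", "-query", MGG_70_15, "-db", db_prefix,
                    "-outfmt", "11", "-out", str(blast_out_asn_file)], check=False)
    subprocess.run(["blast_formatter", "-archive", str(blast_out_asn_file),
                    "-outfmt", "5", "-out", str(blast_out_xml_file)], check=False)
    subprocess.run(["blast_formatter", "-archive", str(blast_out_asn_file),
                    "-outfmt", "6", "-out", str(blast_out_txt_file)], check=False)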
def extract_ortholog_gene(gene_base_name, joined_df_file_name, id_file,
                          ortholog_blast_path_name):
    '''
    input 1: gene_base_name
    input 2: joined_df_file_name
    input 3: id_file
    output 1: ortholog_blast_path_name
    '''
    global gene_base
    gene_base = SeqIO.index(gene_base_name, "fasta", key_function=get_id_gene)
    blast_path = Path(ortholog_blast_path_name)
    global blast_general, blast_per_MGG_gene, blast_body_sequence_path, blast_head_sequence_path, blast_db_path, blast_out_path, blast_out_xml_path, blast_out_txt_path, blast_sequence, fst_sequence, blast_result, blast_out_asn_path
    sequence_out_dir = directory_creater(blast_path.parent / "sequence_out")
    blast_general = blast_path / "blast_general"
    if blast_general.exists() is False:
        blast_general.mkdir()
    blast_per_MGG_gene = blast_path / "blast_per_MGG_gene"
    if blast_per_MGG_gene.exists() is False:
        blast_per_MGG_gene.mkdir()
    blast_body_sequence_path = blast_path / "blast_body_sequence"
    if blast_body_sequence_path.exists() is False:
        blast_body_sequence_path.mkdir()
    blast_head_sequence_path = blast_path / "blast_head_sequence"
    if blast_head_sequence_path.exists() is False:
        blast_head_sequence_path.mkdir()
    blast_db_path = blast_path / "blast_db"
    if blast_db_path.exists() is False:
        blast_db_path.mkdir()
    blast_out_path = blast_path / "blast_out"
    if blast_out_path.exists() is False:
        blast_out_path.mkdir()
    blast_out_xml_path = blast_out_path / "blast_out_xml"
    if blast_out_xml_path.exists() is False:
        blast_out_xml_path.mkdir()
    blast_out_asn_path = blast_out_path / "blast_out_asn"
    if blast_out_asn_path.exists() is False:
        blast_out_asn_path.mkdir()
    blast_out_txt_path = blast_out_path / "blast_out_txt"
    if blast_out_txt_path.exists() is False:
        blast_out_txt_path.mkdir()

    blast_sequence = blast_path / "blast_sequence"
    if blast_sequence.exists() is False:
        blast_sequence.mkdir()
    fst_sequence = sequence_out_dir / "fst_sequence"
    if fst_sequence.exists() is False:
        fst_sequence.mkdir()
    global fst_sequence_two_head
    fst_sequence_two_head = sequence_out_dir / "fst_sequence_two_head"
    if fst_sequence_two_head.exists() is False:
        fst_sequence_two_head.mkdir()
    blast_result = blast_path / "blast_result"
    if blast_result.exists() is False:
        blast_result.mkdir()
    global blast_exception_path_name
    blast_exception_path_name = directory_creater(blast_path /
                                                  "blast_exception")
    global gffutils_db_dir_path_name
    gffutils_db_dir_path_name = directory_creater(blast_path.parent /
                                                  "gffutils_db")
    global json_dir_path_name
    json_dir_path_name = Path("../Pan_genome_data/contig_length_json/")
    global contig_dir_path_name
    contig_dir_path_name = Path("../../contig/")
    global gff_path_name
    gff_path_name = Path("../../GFF/")

    gff_out_path = directory_creater(sequence_out_dir / "gene_slop_1K_gff")
    global fst_sequence_slop_1K_dir
    fst_sequence_slop_1K_dir = directory_creater(sequence_out_dir /
                                                 "fst_sequence_slop_1K")
    global protein_id_vs_chrom_dir
    protein_id_vs_chrom_dir = directory_creater(sequence_out_dir /
                                                "protein_id_vs_chrom")

    species_95_list = extract_strain_id(id_file)
    species_95_list.append("ina168")
    species_95_list.remove("magnaporthe_oryzae_70-15_8_proteins_T0")
    global_names = globals()
    for json_file in json_dir_path_name.iterdir():
        with json_file.open() as json_fl:
            global_names[json_file.stem + "_json"] = json.load(json_fl)
    for strain_id in species_95_list:
        for gff_file in gff_path_name.glob(strain_id + ".gff"):
            gff_db_name = gffutils_db_dir_path_name / (strain_id + ".db")
            if gff_db_name.is_file() is False:
                gffutils.create_db(str(gff_file),
                                   str(gff_db_name),
                                   force=True,
                                   id_spec=None)
            global_names[strain_id + "_db"] = gffutils.FeatureDB(gff_db_name)

    MGG_db_file_path = gffutils_db_dir_path_name / "MGGdb.db"
    if MGG_db_file_path.is_file() is False:
        gffutils.create_db(
            "../../70-15_refference_genome/70-15_Gff/magnaporthe_oryzae_70-15_8_genome_summary_per_gene_amend.txt",
            str(MGG_db_file_path),
            id_spec=':source:')
    global MGG_db
    MGG_db = gffutils.FeatureDB(MGG_db_file_path)
    global strain_protein_id_pattern
    strain_protein_id_pattern = re.compile("(.+)_protein_(.+)_")

    base = importr("base")
    utils = importr("utils")
    ortholog_joined_df = utils.read_table(joined_df_file_name,
                                          sep="\t",
                                          header=True,
                                          **{'stringsAsFactors': False},
                                          **{'check.names': False})
    blast_general_same_strain_list = blast_general / "same_strain_list.txt"
    blast_general_value_list = blast_general / "value_list.txt"
    global blast_general_same_strain_list_fl, blast_general_value_list_fl
    ortholog_joined_df_sub = ortholog_joined_df.rx(True, -1)
    with blast_general_same_strain_list.open(
            'w+') as blast_general_same_strain_list_fl:
        with blast_general_value_list.open(
                'w+') as blast_general_value_list_fl:
            for i in range(1, (int(base.nrow(ortholog_joined_df)[0]) + 1)):
                df_row = ortholog_joined_df_sub.rx(i, True)
                df_row_iter = iter(df_row)
                head_list = next(df_row_iter)[0].split()
                if len(head_list) == 1:
                    one2one(head_list[0], df_row_iter)
                    bedtool_file = pybedtools.BedTool(slop_list2gff()).saveas(
                        gff_out_path / (head_list[0] + ".gff"))
                else:
                    two_head(head_list, df_row_iter)
Example #16
    #         blast_identity_value_dir/(head_id+".tsv"),
    #         utils.read_table,
    #         utils.write_table,
    #         blast_db_path,
    #         blast_out_xml_path,
    #         blast_out_asn_path,
    #         blast_out_txt_path,
    #         base
    #         )
    #     R_blast_vlaue_list.rx2[i]=robjects.DataFrame(R_blast_vlaue_df)
    #     i=i+1
    #     # Check: why are the results not all True? Are there outliers in multi_copy_gene_Vector? Also check whether using only one HSP per BLAST hit is sufficient
    #     # best_blast_value_list=list(R_parse_blast_result.parse_blast_result(
    #     #     single_copy_gene_list,
    #     #     multi_copy_gene_list,
    #     #     R_blast_vlaue_df,
    #     #     str(blast_general/(head_id+"_single_err.txt"))
    #     #     ))
    #     # extract_gene(best_blast_value_list,all_row_gene_fasta_file,blast_general/(head_id+"_single_copy.fasta"))
    # R_blast_vlaue_df=base.do_call("rbind",R_blast_vlaue_list)
    # utils.write_table(R_blast_vlaue_df,**{'file': str(general_out_path/"all.txt")},**{'append': False},**{'quote': False},**{'sep': "\t"},**{'row.names': False},**{'col.names': True})
# extract_ortholog_gene(directory_creater("/gpfshome/home/Baojd/wangzhe/ortho_blast"))
extract_ortholog_gene(
    "strain_db_dir_path",
    "contig_path",
    "orthogroup_sequence_path", 
    "pav_file_path", 
    150,
    directory_creater("../Pan_genome_data_2/ortholog_blast_2/")
    )
Example #17
def extract_ortholog_gene(strain_db_dir_path,contig_path,joined_df_file_name,ortholog_blast_path_name):
    '''
    input 1: strain_db_dir_path
    input 2: contig_path
    input 3: joined_df_file_name
    output 1: ortholog_blast_path_name
    '''
    general_out_path=Path(ortholog_blast_path_name)
    blast_general=general_out_path / "blast_general"
    if blast_general.exists() is False:
        blast_general.mkdir()
    all_row_gene_dir_path=directory_creater(general_out_path/"all_row_gene")
    all_row_gene_fasta_dir=directory_creater(all_row_gene_dir_path/"fasta")
    all_row_gene_list_dir=directory_creater(all_row_gene_dir_path/"list")
    all_row_gene_list_dir_all=directory_creater(all_row_gene_dir_path/"all_list")
    blast_db_path=blast_general/"blast_db"
    if blast_db_path.exists() is False:
        blast_db_path.mkdir()
    blast_out_path=blast_general/"blast_out"
    if blast_out_path.exists() is False:
        blast_out_path.mkdir()
    blast_out_xml_path=blast_out_path/"blast_out_xml"
    if blast_out_xml_path.exists() is False:
        blast_out_xml_path.mkdir()
    blast_out_asn_path=blast_out_path/"blast_out_asn"
    if blast_out_asn_path.exists() is False:
        blast_out_asn_path.mkdir()
    blast_out_txt_path=blast_out_path/"blast_out_txt"
    if blast_out_txt_path.exists() is False:
        blast_out_txt_path.mkdir()
    blast_identity_value_dir=blast_general/"blast_identity_value"
    if blast_identity_value_dir.exists() is False:
        blast_identity_value_dir.mkdir()

    base=importr("base")
    utils=importr("utils")
    # ortholog_joined_df=utils.read_table(
    #     str(joined_df_file_name),
    #     sep = "\t",
    #     header = True,
    #     **{'stringsAsFactors': False},
    #     **{'check.names': False}
    #     )
    na_count=0
    R_blast_vlaue_list=base.list()
    # client = Client(processes=False)
    generate_row_list_calls=[]
    extract_gene_gff_calls=[]
    # for i in range(1,(int(base.nrow(ortholog_joined_df)[0])+1)):
    #     df_row=ortholog_joined_df.rx(i, True)
    #     df_row_iter=iter(df_row[2:])
    #     head_id=df_row.rx(1,2)[0]
    #     if re.search(",",head_id) is not None:
    #         continue
    #     all_row_gene_fasta_file=all_row_gene_fasta_dir / (head_id+"_"+"_all_row_gene_fasta.fasta")
        # generate_row_list_calls.append(dask.delayed(generate_row_list)(
        #     df_row_iter,
        #     head_id,
        #     all_row_gene_list_dir,
        #     all_row_gene_list_dir_all
        # ))
        # extract_gene_gff_calls.append(dask.delayed(extract_gene_gff)(
        #     all_row_gene_list_dir_all/(head_id+".txt"),
        #     strain_db_dir_path,
        #     contig_path,
        #     all_row_gene_fasta_dir
        # ))
    # dask.compute(*generate_row_list_calls)
    # dask.compute(*extract_gene_gff_calls)
    # Parallel(n_jobs=12)(delayed(generate_row_list)(
    #     ortholog_joined_df.rx(i, True),
    #     all_row_gene_list_dir,
    #     all_row_gene_list_dir_all
    #     ) for i in range(1,(int(base.nrow(ortholog_joined_df)[0])+1)))
    # Parallel(n_jobs=12)(delayed(extract_gene_gff)(all_row_gene_list_file,strain_db_dir_path,contig_path,all_row_gene_fasta_dir) for all_row_gene_list_file in all_row_gene_list_dir_all.iterdir())
    output_list=[]
    lineList=[]
    rdocall=base.do_call
    rread_table=utils.read_table
    rwrite_table=utils.write_table
    rlist=base.list
    rlength=base.length
    only_itself_list=[]
    length_0_list=[]
    no_mgg_list=[]
    # for all_row_gene_fasta_file in all_row_gene_fasta_dir.glob("MGG_05204T0*"):
    for all_row_gene_fasta_file in all_row_gene_fasta_dir.iterdir():
        R_blast_vlaue_df=dask.delayed(ortholog_blast)(
        # R_blast_vlaue_df=ortholog_blast(
            all_row_gene_fasta_file,
            # all_row_gene_fasta_dir,
            blast_identity_value_dir,
            only_itself_list,
            length_0_list,
            no_mgg_list,
            rread_table,
            rwrite_table,
            blast_db_path,
            blast_out_xml_path,
            blast_out_asn_path,
            blast_out_txt_path,
            rlist,
            rlength,
            rdocall
            )
        output_list.append(R_blast_vlaue_df)
    total=dask.delayed(merge_df)(output_list,base.list,utils.write_table,base.do_call,general_out_path)
    # total.visualize()
    total.compute()
Example #18
from set_minus import drawer_curve, set_minus
from set_minus_orthofinder import R_set_minus_cut_orthofinder
from copy_contig import Copy_contig
from Directory_creater import directory_creater
from annotation_secreted_proteins_2 import drawer_secreted_protein, secreted_protein  # annotation_secreted_proteins_2 is the variant without mRNA_protein_mapping_table_file_name
from fgenesh_result_to_fasta_pro import fgenesh_result_to_fasta
from CD_HIT import cd_hit
from fgenesh_GFF import writen_name
from extract_sequence_from_Orthogroups import extract_sequence_from_Orthogroups
'''
copy
'''
contig_path = Path("../../contig/")
MGG_70_15_contig = "../../70-15_refference_genome/70-15_supercontigs.fasta"
ina168_contig = "../../contig/ina168.fasta"
general_out = directory_creater("../Pan_genome_data_2/")
copy_std_out_err = directory_creater(general_out / "copy_out_err")
contig_156_path = directory_creater(general_out / "156_contig")
# Copy(contig_path,MGG_70_15_contig,ina168_contig,contig_156_path,copy_std_out_err)
'''
phase 2: analyse the predicted-gene results
70-15 has to be skipped; it was removed from the molquest results (already done)
ina168 has to be handled separately (already done)
manually put 70-15 into protein_base and mRNA_base
FR13: gene and protein files are all replaced manually; "genome" is changed to FR13 and the file names are renamed to FR13 as well

only three strains need special attention:
70-15
ina168
FR13
'''
Example #19
def prepare_for_ParaAT(joined_df_file_name, id_file, out_dir):
    '''
    input 1: joined_df_file_name
    input 2: id_file
    output 1: out_dir
    '''
    global coding_gene_base, protein_base, gene_fasta_fl, protein_fasta_fl
    gene_dir_path = directory_creater(out_dir / "nucleotide")
    protein_dir_path = directory_creater(out_dir / "aminoacid")
    fasta_base_dir_path = directory_creater(out_dir / "gene_protein_base")
    homolog_dir_path = directory_creater(out_dir / "homolog")
    protein_base_file_path = fasta_base_dir_path / "protein_base.fasta"
    coding_gene_base_file_path = fasta_base_dir_path / "coding_gene_base.fasta"
    if coding_gene_base_file_path.is_file() is False:
        coding_gene_base_list = [
            'cat',
            '/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/magnaporthe_oryzae_70-15_8_transcripts.fasta',
            "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/wangzhe2/New_add_ina168/ina168_CDS.fasta"
        ]
        protein_base_list = [
            'cat',
            '/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/magnaporthe_oryzae_70-15_8_proteins_T0.fasta',
            "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/wangzhe2/New_add_ina168/ina168_protein.fasta"
        ]
        merge_to_one(coding_gene_base_list, id_file, "_CDS.fasta",
                     fasta_base_dir_path / "coding_gene_list.txt",
                     coding_gene_base_file_path,
                     fasta_base_dir_path / "coding_gene_cat_err.txt")
        merge_to_one(protein_base_list, id_file, "_protein.fasta",
                     fasta_base_dir_path / "protein_list.txt",
                     protein_base_file_path,
                     fasta_base_dir_path / "protein_cat_err.txt")
    coding_gene_base = SeqIO.index(str(coding_gene_base_file_path),
                                   "fasta",
                                   key_function=get_id_protein)
    protein_base = SeqIO.index(str(protein_base_file_path),
                               "fasta",
                               key_function=get_id_protein)
    base = importr("base")
    utils = importr("utils")
    ortholog_joined_df = utils.read_table(joined_df_file_name,
                                          sep="\t",
                                          header=True,
                                          **{'stringsAsFactors': False},
                                          **{'check.names': False})
    ortholog_joined_df_sub = ortholog_joined_df.rx(True, -1)
    for i in range(1, (int(base.nrow(ortholog_joined_df)[0]) + 1)):
        df_row = ortholog_joined_df_sub.rx(i, True)
        df_row_iter = iter(df_row)
        head_list = next(df_row_iter)[0].split()
        if len(head_list) == 1:
            gene_fasta = gene_dir_path / (head_list[0] + ".fasta")
            protein_fasta = protein_dir_path / (head_list[0] + ".fasta")
            homolog_file_path = homolog_dir_path / (head_list[0] + ".txt")
            if gene_fasta.is_file() is True: continue
            with gene_fasta.open('w') as gene_fasta_fl:
                with protein_fasta.open('w') as protein_fasta_fl:
                    with homolog_file_path.open('w') as homolog_fl:
                        extract_gene(head_list[0])
                        homolog_fl.write(head_list[0] + "\t")
                        for homolog_id in one_head(df_row_iter):
                            homolog_fl.write(homolog_id + "\t")
                        homolog_fl.write("\n")
        else:
            two_head(head_list, df_row_iter)
Example #20
def run_paraAT(in_dir):
    ParaAT_stdout=directory_creater(in_dir/"ParaAT_stdout")
    ParaAT_stderr=directory_creater(in_dir/"ParaAT_stderr")
    Parallel(n_jobs=12)(delayed(Popen_paraAT)(i,ParaAT_stdout,ParaAT_stderr) for i in format_paraAT_parameter(in_dir))
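Popen_paraAT is not shown; a minimal sketch that runs one (command, gene stem) pair yielded by format_paraAT_parameter and logs per-gene output, mirroring the paraAT() example at the top of this page:

import subprocess


def Popen_paraAT(cmd_and_stem, ParaAT_stdout, ParaAT_stderr):
    # hypothetical wrapper: one ParaAT job per gene, stdout/stderr captured per gene
    cmd, stem = cmd_and_stem
    with (ParaAT_stdout / (stem + ".txt")).open('w') as out_fl, \
         (ParaAT_stderr / (stem + ".txt")).open('w') as err_fl:
        subprocess.run(cmd, stdout=out_fl, stderr=err_fl, check=False)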
Example #21
def call_snp_mummer(id_file, contig_dir_name, out_dir):
    '''
    input 1: strain 95 file
    input 2: contig_dir_name
    output 1: out dir
    '''
    strain_95_list = extract_strain_id(id_file)
    strain_95_list.append("ina168")
    strain_95_list.remove("magnaporthe_oryzae_70-15_8_proteins_T0")

    contig_dir_path = Path(contig_dir_name)
    intermediate_dir_path = directory_creater(out_dir / "intermediate_files")
    nucmer_stdout_dir_path = directory_creater(intermediate_dir_path /
                                               "nucmer_std_out")
    nucmer_stderr_dir_path = directory_creater(intermediate_dir_path /
                                               "nucmer_std_err")
    delta_dir_path = directory_creater(intermediate_dir_path / "delta")
    delta_filter_err_dir_path = directory_creater(intermediate_dir_path /
                                                  "filter_err")
    delta_filter_file_dir_path = directory_creater(intermediate_dir_path /
                                                   "filter")
    delta_snp_dir_path = directory_creater(intermediate_dir_path / "snp")
    delta_snp_err_path = directory_creater(intermediate_dir_path / "snp_err")
    delta_vcf_file_path = directory_creater(intermediate_dir_path / "vcf")
    delta_vcf_err_path = directory_creater(intermediate_dir_path /
                                           "To_vcf_err")
    merge_std_out_err_dir_path = directory_creater(intermediate_dir_path /
                                                   "merge_std_out_err")
    bgzip_out_err_dir_path = directory_creater(intermediate_dir_path / "bgzip")
    index_out_err_dir_path = directory_creater(intermediate_dir_path / "index")
    global merge_list
    merge_list = ["bcftools", "merge"]
    for strain_95_id in strain_95_list:
        delta_file_path = delta_dir_path / (strain_95_id + ".delta")
        if delta_file_path.is_file() is False:
            nucmer(str(contig_dir_path / (strain_95_id + ".fasta")),
                   str(delta_file_path),
                   nucmer_stdout_dir_path / (strain_95_id + '_stdout.txt'),
                   nucmer_stderr_dir_path / (strain_95_id + '_stderr.txt'))
        delta_filter_file_path = delta_filter_file_dir_path / (strain_95_id +
                                                               "_filter.txt")
        if delta_filter_file_path.is_file() is False:
            call_filter(
                delta_file_path, delta_filter_file_path,
                delta_filter_err_dir_path / (strain_95_id + "_filter_err.txt"))
        delta_snp_file_path = delta_snp_dir_path / (strain_95_id + "_snp.txt")
        if delta_snp_file_path.is_file() is False:
            snp(delta_filter_file_path, delta_snp_file_path,
                delta_snp_err_path / (strain_95_id + "_snp_err.txt"))
        delta_snp2vcf_path = delta_vcf_file_path / (strain_95_id + ".vcf")
        delta_snp2vcf_gz_path = delta_vcf_file_path / (strain_95_id +
                                                       ".vcf.gz")
        if delta_snp2vcf_gz_path.is_file() is False:
            To_vcf(
                delta_snp_file_path, delta_snp2vcf_path,
                delta_vcf_err_path / (strain_95_id + "_to_vcf_err.txt"),
                bgzip_out_err_dir_path / (strain_95_id + "_bgzip_out_err.txt"),
                index_out_err_dir_path / (strain_95_id + "_index_out_err.txt"))
        merge_list.append(str(delta_snp2vcf_path) + ".gz")
    merge_list.extend(["-O", "v", "-o", str(out_dir / "merge.vcf")])
    call_merge = subprocess.Popen(merge_list,
                                  stdout=(merge_std_out_err_dir_path /
                                          "merge_out.txt").open('w'),
                                  stderr=subprocess.STDOUT)
    call_merge.wait()
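The MUMmer wrappers assumed by call_snp_mummer (nucmer, call_filter, snp) are defined elsewhere; a hypothetical sketch on top of the standard MUMmer command line (the reference path is copied from the GATK examples above and is an assumption; To_vcf is omitted):

import subprocess

REFERENCE = "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/70-15_supercontigs.fasta"  # assumed reference


def nucmer(query_fasta, delta_file, stdout_path, stderr_path):
    # nucmer writes <prefix>.delta, so strip the extension to get the prefix
    prefix = str(delta_file)[:-len(".delta")]
    with open(stdout_path, 'w') as out, open(stderr_path, 'w') as err:
        subprocess.run(["nucmer", "--prefix", prefix, REFERENCE, str(query_fasta)],
                       stdout=out, stderr=err, check=False)


def call_filter(delta_file, filter_file, err_path):
    # delta-filter -1: keep 1-to-1 alignments only
    with open(filter_file, 'w') as out, open(err_path, 'w') as err:
        subprocess.run(["delta-filter", "-1", str(delta_file)],
                       stdout=out, stderr=err, check=False)


def snp(filter_file, snp_file, err_path):
    # show-snps -Clr: SNPs from the filtered delta, sorted by reference position
    with open(snp_file, 'w') as out, open(err_path, 'w') as err:
        subprocess.run(["show-snps", "-Clr", str(filter_file)],
                       stdout=out, stderr=err, check=False)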
def extract_ortholog_gene(strain_db_dir_path, contig_path,
                          orthogroup_sequence_path, ortholog_blast_path_name):
    '''
    input 1: strain_db_dir_path
    input 2: contig_path
    input 3: orthogroup_sequence_path
    output 1: ortholog_blast_path_name
    '''
    general_out_path = Path(ortholog_blast_path_name)
    blast_general = general_out_path / "blast_general"
    if blast_general.exists() is False:
        blast_general.mkdir()
    all_row_gene_dir_path = directory_creater(general_out_path /
                                              "all_row_gene")
    all_row_gene_fasta_dir = directory_creater(all_row_gene_dir_path / "fasta")
    all_row_gene_list_dir = directory_creater(all_row_gene_dir_path / "list")
    all_row_gene_list_dir_all = directory_creater(all_row_gene_dir_path /
                                                  "all_list")
    blast_db_path = blast_general / "blast_db"
    if blast_db_path.exists() is False:
        blast_db_path.mkdir()
    blast_out_path = blast_general / "blast_out"
    if blast_out_path.exists() is False:
        blast_out_path.mkdir()
    blast_out_xml_path = blast_out_path / "blast_out_xml"
    if blast_out_xml_path.exists() is False:
        blast_out_xml_path.mkdir()
    blast_out_asn_path = blast_out_path / "blast_out_asn"
    if blast_out_asn_path.exists() is False:
        blast_out_asn_path.mkdir()
    blast_out_txt_path = blast_out_path / "blast_out_txt"
    if blast_out_txt_path.exists() is False:
        blast_out_txt_path.mkdir()
    blast_identity_value_dir = blast_general / "blast_identity_value"
    if blast_identity_value_dir.exists() is False:
        blast_identity_value_dir.mkdir()

    base = importr("base")
    utils = importr("utils")
    R_blast_vlaue_list = base.list()
    # Parallel(n_jobs=12)(delayed(generate_row_list)(orthogroup_file,all_row_gene_list_dir_all) for orthogroup_file in orthogroup_sequence_path.iterdir()
    # Parallel(n_jobs=12)(delayed(extract_gene_gff)(orthogroup_file,strain_db_dir_path,contig_path,all_row_gene_fasta_dir) for orthogroup_file in orthogroup_sequence_path.iterdir())
    R_blast_vlaue_list = base.list()
    i = 1
    no_mgg_fl = (general_out_path / "no_mgg.txt").open('w')
    only_itself_fl = (general_out_path / "only_itself.txt").open('w')
    length_0_fl = (general_out_path / "length_0.txt").open('w')
    for all_row_gene_fasta_file in blast_identity_value_dir.iterdir():
        R_blast_vlaue_df = ortholog_blast(
            all_row_gene_fasta_file, blast_identity_value_dir, only_itself_fl,
            length_0_fl, no_mgg_fl, utils.read_table, utils.write_table,
            blast_db_path, blast_out_xml_path, blast_out_asn_path,
            blast_out_txt_path, base)
        if R_blast_vlaue_df == "NA" or R_blast_vlaue_df.rx2("V1") == "NA":
            continue
        R_blast_vlaue_list.rx2[i] = robjects.DataFrame(R_blast_vlaue_df)
        i = i + 1
    R_blast_vlaue_df = base.do_call("rbind", R_blast_vlaue_list)
    utils.write_table(R_blast_vlaue_df,
                      **{'file': str(general_out_path / "all.txt")},
                      **{'append': False}, **{'quote': False}, **{'sep': "\t"},
                      **{'row.names': False}, **{'col.names': True})
Example #23
    #     i=i+1
    #     # Check: why are the results not all True? Are there outliers in multi_copy_gene_Vector? Also check whether using only one HSP per BLAST hit is sufficient
    #     # best_blast_value_list=list(R_parse_blast_result.parse_blast_result(
    #     #     single_copy_gene_list,
    #     #     multi_copy_gene_list,
    #     #     R_blast_vlaue_df,
    #     #     str(blast_general/(head_id+"_single_err.txt"))
    #     #     ))
    #     # extract_gene(best_blast_value_list,all_row_gene_fasta_file,blast_general/(head_id+"_single_copy.fasta"))
    # R_blast_vlaue_df=base.do_call("rbind",R_blast_vlaue_list)
    # utils.write_table(R_blast_vlaue_df,**{'file': str(general_out_path/"all.txt")},**{'append': False},**{'quote': False},**{'sep': "\t"},**{'row.names': False},**{'col.names': True})
if __name__ == '__main__':
    # client = Client(n_workers=12, threads_per_worker=1)
    cluster = PBSCluster(
        job_extra=["-l nodes=1:ppn=24","-l mem=5000MB"],
        header_skip=["select"],
        processes=24,
        walltime='25:00:00'
        )
    cluster.scale(jobs=10)
    from dask.distributed import Client
    client = Client(cluster)

    print(cluster.job_script())
    extract_ortholog_gene(
        Path("../Pan_genome_data_2/contig_gff3_gffutils_db"),
        Path("../Pan_genome_data_2/156_contig"),
        Path("../Pan_genome_data_2/ortho/joined_df.tsv"),
        directory_creater("/gpfshome/home/Baojd/wangzhe/ortholog_blast_mgg_key"),
    )
Example #24
def extract_ortholog_gene(strain_db_dir_path, contig_path, joined_df_file_name,
                          ortholog_blast_path_name):
    '''
    input 1: strain_db_dir_path
    input 2: contig_path
    input 3: joined_df_file_name
    output 1: ortholog_blast_path_name
    '''
    sequence_out_dir = directory_creater(ortholog_blast_path_name /
                                         "sequence_out")
    gff_out_path = directory_creater(sequence_out_dir / "gene_slop_1K_gff")
    global fst_sequence_slop_1K_dir
    fst_sequence_slop_1K_dir = directory_creater(sequence_out_dir /
                                                 "fst_sequence_slop_1K")
    global protein_id_vs_chrom_dir
    protein_id_vs_chrom_dir = directory_creater(sequence_out_dir /
                                                "protein_id_vs_chrom")
    blast_path = Path(ortholog_blast_path_name)
    global blast_general, blast_per_MGG_gene, blast_body_sequence_path, blast_head_sequence_path, blast_db_path, blast_out_path, blast_out_xml_path, blast_out_txt_path, blast_sequence, fst_sequence, blast_result, blast_out_asn_path
    sequence_out_dir = directory_creater(blast_path.parent / "sequence_out")
    blast_general = blast_path / "blast_general"
    if blast_general.exists() is False:
        blast_general.mkdir()
    blast_per_MGG_gene = blast_path / "blast_per_MGG_gene"
    if blast_per_MGG_gene.exists() is False:
        blast_per_MGG_gene.mkdir()
    blast_body_sequence_path = blast_path / "blast_body_sequence"
    if blast_body_sequence_path.exists() is False:
        blast_body_sequence_path.mkdir()
    blast_head_sequence_path = blast_path / "blast_head_sequence"
    if blast_head_sequence_path.exists() is False:
        blast_head_sequence_path.mkdir()
    blast_db_path = blast_path / "blast_db"
    if blast_db_path.exists() is False:
        blast_db_path.mkdir()
    blast_out_path = blast_path / "blast_out"
    if blast_out_path.exists() is False:
        blast_out_path.mkdir()
    blast_out_xml_path = blast_out_path / "blast_out_xml"
    if blast_out_xml_path.exists() is False:
        blast_out_xml_path.mkdir()
    blast_out_asn_path = blast_out_path / "blast_out_asn"
    if blast_out_asn_path.exists() is False:
        blast_out_asn_path.mkdir()
    blast_out_txt_path = blast_out_path / "blast_out_txt"
    if blast_out_txt_path.exists() is False:
        blast_out_txt_path.mkdir()

    blast_sequence = blast_path / "blast_sequence"
    if blast_sequence.exists() is False:
        blast_sequence.mkdir()
    fst_sequence = sequence_out_dir / "fst_sequence"
    if fst_sequence.exists() is False:
        fst_sequence.mkdir()
    global fst_sequence_two_head
    fst_sequence_two_head = sequence_out_dir / "fst_sequence_two_head"
    if fst_sequence_two_head.exists() is False:
        fst_sequence_two_head.mkdir()
    blast_result = blast_path / "blast_result"
    if blast_result.exists() is False:
        blast_result.mkdir()
    global blast_exception_path_name
    blast_exception_path_name = directory_creater(blast_path /
                                                  "blast_exception")
    global gffutils_db_dir_path_name
    gffutils_db_dir_path_name = directory_creater(blast_path.parent /
                                                  "gffutils_db")
    # global json_dir_path_name
    # json_dir_path_name=Path("../Pan_genome_data/contig_length_json/")
    global contig_dir_path_name
    contig_dir_path_name = Path("../../contig/")
    global gff_path_name
    gff_path_name = Path("../../GFF/")

    global_names = globals()
    # for json_file in json_dir_path_name.iterdir():
    #     with json_file.open() as json_fl:
    #         global_names[json_file.stem+"_json"]=json.load(json_fl)
    # for strain_id in species_95_list:
    #     for gff_file in gff_path_name.glob(strain_id+".gff"):
    #         gff_db_name=gffutils_db_dir_path_name/(strain_id+".db")
    #         if gff_db_name.is_file() is False:
    #             gffutils.create_db(str(gff_file),str(gff_db_name),force=True,id_spec=None)
    #         global_names[strain_id+"_db"]=gffutils.FeatureDB(gff_db_name)

    MGG_db_file_path = gffutils_db_dir_path_name / "MGGdb.db"
    if MGG_db_file_path.is_file() is False:
        gffutils.create_db(
            "../../70-15_refference_genome/70-15_Gff/magnaporthe_oryzae_70-15_8_genome_summary_per_gene_amend.txt",
            str(MGG_db_file_path),
            id_spec=':source:')
    global MGG_db
    MGG_db = gffutils.FeatureDB(MGG_db_file_path)
    # global strain_protein_id_pattern
    # strain_protein_id_pattern=re.compile("(.+)_protein_(.+)_")

    base = importr("base")
    utils = importr("utils")
    ortholog_joined_df = utils.read_table(str(joined_df_file_name),
                                          sep="\t",
                                          header=True,
                                          **{'stringsAsFactors': False},
                                          **{'check.names': False})
    blast_general_same_strain_list = blast_general / "same_strain_list.txt"
    blast_general_value_list = blast_general / "value_list.txt"
    global blast_general_same_strain_list_fl, blast_general_value_list_fl
    ortholog_joined_df_sub = ortholog_joined_df.rx(True, -1)
    with blast_general_same_strain_list.open(
            'w+') as blast_general_same_strain_list_fl:
        with blast_general_value_list.open(
                'w+') as blast_general_value_list_fl:
            for i in range(1, (int(base.nrow(ortholog_joined_df)[0]) + 1)):
                df_row = ortholog_joined_df_sub.rx(i, True)
                df_row_iter = iter(df_row)
                one2one(df_row_iter)
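
For reference, a stripped-down sketch of the rpy2 row-iteration pattern used above: read a TSV into an R data.frame, drop the first column with rx(True, -1), then pull each row with rx(i, True). "example.tsv" is a placeholder and is assumed to have a header row and at least three columns.

from rpy2.robjects.packages import importr

base = importr("base")
utils = importr("utils")

df = utils.read_table("example.tsv", sep="\t", header=True,
                      **{'stringsAsFactors': False, 'check.names': False})
df_sub = df.rx(True, -1)                       # all rows, minus column 1 (R negative index)
for i in range(1, int(base.nrow(df)[0]) + 1):  # R rows are 1-based
    df_row = df_sub.rx(i, True)                # a one-row data.frame
    columns = list(iter(df_row))               # iterating a row yields its column vectors
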
def extract_ortholog_gene(strain_db_dir_path, contig_path,
                          orthogroup_sequence_path, pav_file_path,
                          filter_lower, ortholog_blast_path_name):
    '''
    input 1: strain_db_dir_path
    input 2: contig_path
    input 3: orthogroup_sequence_path
    input 4: pav_file_path
    input 5: filter_lower
    output 1: ortholog_blast_path_name
    '''
    general_out_path = Path(ortholog_blast_path_name)
    blast_general = general_out_path / "blast_general"
    if blast_general.exists() is False:
        blast_general.mkdir()
    all_row_gene_dir_path = directory_creater(general_out_path /
                                              "all_row_gene")
    all_row_gene_fasta_dir = directory_creater(all_row_gene_dir_path / "fasta")
    all_row_gene_list_dir = directory_creater(all_row_gene_dir_path / "list")
    all_row_gene_list_dir_all = directory_creater(all_row_gene_dir_path /
                                                  "all_list")
    blast_db_path = blast_general / "blast_db"
    if blast_db_path.exists() is False:
        blast_db_path.mkdir()
    blast_out_path = blast_general / "blast_out"
    if blast_out_path.exists() is False:
        blast_out_path.mkdir()
    blast_out_xml_path = blast_out_path / "blast_out_xml"
    if blast_out_xml_path.exists() is False:
        blast_out_xml_path.mkdir()
    blast_out_asn_path = blast_out_path / "blast_out_asn"
    if blast_out_asn_path.exists() is False:
        blast_out_asn_path.mkdir()
    blast_out_txt_path = blast_out_path / "blast_out_txt"
    if blast_out_txt_path.exists() is False:
        blast_out_txt_path.mkdir()
    blast_identity_value_dir = blast_general / "blast_identity_value"
    if blast_identity_value_dir.exists() is False:
        blast_identity_value_dir.mkdir()

    base = importr("base")
    utils = importr("utils")
    # Parallel(n_jobs=12)(delayed(generate_row_list)(orthogroup_file,all_row_gene_list_dir_all) for orthogroup_file in orthogroup_sequence_path.iterdir()
    # Parallel(n_jobs=12)(delayed(extract_gene_gff)(orthogroup_file,strain_db_dir_path,contig_path,all_row_gene_fasta_dir) for orthogroup_file in orthogroup_sequence_path.iterdir())
    no_mgg_fl = (general_out_path / "no_mgg.txt").open('w')
    only_itself_fl = (general_out_path / "only_itself.txt").open('w')
    length_0_fl = (general_out_path / "length_0.txt").open('w')
    filter_lower_list_file = general_out_path / ("filter_" +
                                                 str(filter_lower) + ".txt")
    if filter_lower_list_file.exists() is False:
        R_filter_strain_num.filter_strain_num(str(pav_file_path), filter_lower,
                                              str(filter_lower_list_file))
    # cluster = LocalCluster()
    # client = Client(cluster)
    calls = []
    with filter_lower_list_file.open() as filter_lower_list_fl:
        #     for all_row_gene_fasta_file in filter_lower_list_fl:
        #         calls.append(dask.delayed(ortholog_blast)(
        #                 all_row_gene_fasta_file,
        #                 blast_identity_value_dir,
        #                 only_itself_fl,
        #                 length_0_fl,
        #                 no_mgg_fl,
        #                 utils.read_table,
        #                 utils.write_table,
        #                 blast_db_path,
        #                 blast_out_xml_path,
        #                 blast_out_asn_path,
        #                 blast_out_txt_path,
        #                 base
        #                 ))
        # dask.compute(*calls)
        for all_row_gene_fasta_file in filter_lower_list_fl:
            ortholog_blast(all_row_gene_fasta_file.strip(),
                           all_row_gene_fasta_dir, blast_identity_value_dir,
                           only_itself_fl, length_0_fl, no_mgg_fl,
                           utils.read_table, utils.write_table, blast_db_path,
                           blast_out_xml_path, blast_out_asn_path,
                           blast_out_txt_path, base.list, base.length,
                           base.do_call)
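
The commented-out block above sketches a dask.delayed version of this loop. A self-contained illustration of that pattern follows; blast_one_file and the orthogroup names are hypothetical stand-ins for ortholog_blast and the real file list. One likely reason the serial loop is used instead is that the rpy2 objects and open file handles passed to ortholog_blast cannot be pickled and shipped to dask workers.

import dask
from dask.distributed import Client

def blast_one_file(name):
    # stand-in for the real per-orthogroup BLAST work
    return name.strip()

if __name__ == '__main__':
    client = Client(n_workers=4, threads_per_worker=1)  # or the PBSCluster shown earlier
    lazy_calls = [dask.delayed(blast_one_file)(n) for n in ["OG0001\n", "OG0002\n"]]
    results = dask.compute(*lazy_calls)                  # evaluate all tasks on the cluster
    print(results)
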
Example #26
    #     #     break
    #     extract_gene_gff(all_row_gene_list,strain_db_dir_path,contig_path,all_row_gene_fasta_file)
    #     R_blast_vlaue_df=ortholog_blast(
    #         head_id,
    #         all_row_gene_fasta_file,
    #         blast_identity_value_dir/(head_id+".tsv"),
    #         utils.read_table,
    #         utils.write_table,
    #         blast_db_path,
    #         blast_out_xml_path,
    #         blast_out_asn_path,
    #         blast_out_txt_path,
    #         base
    #         )
    #     R_blast_vlaue_list.rx2[i]=robjects.DataFrame(R_blast_vlaue_df)
    #     i=i+1
    #     # Check why the result is not all True; whether multi_copy_gene_Vector contains abnormal values, and whether using only one HSP from the BLAST result is sufficient
    #     # best_blast_value_list=list(R_parse_blast_result.parse_blast_result(
    #     #     single_copy_gene_list,
    #     #     multi_copy_gene_list,
    #     #     R_blast_vlaue_df,
    #     #     str(blast_general/(head_id+"_single_err.txt"))
    #     #     ))
    #     # extract_gene(best_blast_value_list,all_row_gene_fasta_file,blast_general/(head_id+"_single_copy.fasta"))
    # R_blast_vlaue_df=base.do_call("rbind",R_blast_vlaue_list)
    # utils.write_table(R_blast_vlaue_df,**{'file': str(general_out_path/"all.txt")},**{'append': False},**{'quote': False},**{'sep': "\t"},**{'row.names': False},**{'col.names': True})


extract_ortholog_gene(
    directory_creater("/gpfshome/home/Baojd/wangzhe/ortho_blast"))