示例#1
0
def run_fanngo(config):
    """Generate the FANN-GO MATLAB run script from its template and execute it.

    Substitutes the $PATH, $INPUT_FASTA and $OUTPUT_SCORE placeholders in the
    configured template, writes the resulting .m file, and runs MATLAB on it
    via check_output_and_run (which skips the run when out_score exists).

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    workdir = config["input"]["gomap_dir"] + "/"
    fanngo_sw_conf = config["data"]["mixed-method"]["fanngo"]
    fanngo_conf = config["software"]["fanngo"]
    fanngo_template = fanngo_conf["path"] + "/" + fanngo_conf["template"]
    run_file_path = workdir + fanngo_sw_conf["out_dir"] + "/" + config[
        "input"]["basename"] + ".fanngo.m"
    # read the template up front so both files are never open at once
    with open(fanngo_template, "r") as template:
        conf_lines = template.readlines()
    cwd = os.getcwd()
    out_score = workdir + fanngo_sw_conf["out_dir"] + "/" + config["input"][
        "basename"] + ".score.txt"
    input_fasta = workdir + "input/" + config["input"]["fasta"]

    # BUG FIX: the original used Python 2 `print >> run_file, ...` syntax,
    # which is a SyntaxError under Python 3 (the rest of the file uses print()).
    with open(run_file_path, "w") as run_file:
        for line in conf_lines:
            line = line.strip()
            if line.find("$PATH") > -1:
                code_path = cwd + "/" + fanngo_conf["path"] + "/code"
                run_file.write(line.replace("$PATH", code_path) + "\n")
            elif line.find("$INPUT_FASTA") > -1:
                run_file.write(line.replace("$INPUT_FASTA", input_fasta) + "\n")
            elif line.find("$OUTPUT_SCORE") > -1:
                run_file.write(line.replace("$OUTPUT_SCORE", out_score) + "\n")
            else:
                run_file.write(line + "\n")
    cmd = ["/matlab/bin/matlab", "-nojvm", "-nodisplay", "-nosplash"]
    print(" ".join(cmd))
    check_output_and_run(out_score, cmd, run_file_path)
示例#2
0
def run_fanngo_split(config, split_fa):
    """Generate and run a FANN-GO Octave script for one split FASTA chunk.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    split_fa : str
        Path to the split FASTA file to process.
    """
    workdir = config["input"]["gomap_dir"] + "/"
    fanngo_sw_conf = config["data"]["mixed-method"]["fanngo"]
    fanngo_conf = config["software"]["fanngo"]
    fanngo_template = fanngo_conf["template"]
    out_base = os.path.basename(split_fa.replace(r".fa", ""))
    run_file_path = workdir + fanngo_sw_conf[
        "out_dir"] + "/split/" + out_base + ".fanngo.m"
    with open(fanngo_template, "r") as template:
        conf_lines = template.readlines()
    cwd = os.getcwd()
    out_score = workdir + fanngo_sw_conf[
        "out_dir"] + "/split/" + out_base + ".score.txt"
    print(split_fa)
    # BUG FIX: the original opened run_file_path twice (the first handle was
    # leaked) and redundantly called close() inside the with-block.
    with open(run_file_path, "w") as run_file:
        generate_fanngo_file(conf_lines, cwd, fanngo_conf, split_fa, out_score,
                             run_file)
    cmd = [
        "octave", "--norc", "--no-window-system", "--quiet", "--no-history",
        "--traditional", "--verbose"
    ]
    # FANN-GO reads its parallelism from the NPROC environment variable
    os.environ["NPROC"] = str(config["input"]["cpus"])
    print(run_file_path)
    check_output_and_run(out_score, cmd, run_file_path)
示例#3
0
def get_rbh_annotations(config):
    """Run the reciprocal-best-hit R script to produce RBH-based annotations.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    uniprot_tmpdir = config["data"]["seq-sim"]["uniprot"]["tmpdir"]
    out_file = config["input"]["gomap_dir"] + "/" + uniprot_tmpdir + "/test"
    command = [
        "Rscript", "code/pipeline/run_rbh.r", config["input"]["config_file"]
    ]
    check_output_and_run(out_file, command)
示例#4
0
def _run_blastp_search(blast_bin, db_fa, query_fa, out_file, cpus):
    """Run one blastp search; check_output_and_run skips it if out_file exists."""
    cmd = [
        blast_bin, "-outfmt", outcols, "-db", db_fa, "-query", query_fa,
        "-out", out_file, "-num_threads",
        str(cpus)
    ]
    check_output_and_run(out_file, cmd)


def run_uniprot_blast(config):
    """Run reciprocal blastp searches between the input proteome and Uniprot.

    Performs main-vs-uniprot and uniprot-vs-main searches so reciprocal best
    hits can be computed downstream.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    pipeline_loc = config["pipeline"]["pipeline_loc"] + "/"
    uniprot_config = config["data"]["seq-sim"]["uniprot"]
    uniprot_fa = pipeline_loc + uniprot_config[
        "basedir"] + "/" + uniprot_config["basename"] + ".fa"
    blast_config = config["software"]["blast"]
    blast_bin = pipeline_loc + blast_config["bin"] + "/blastp"
    workdir = config["input"]["gomap_dir"] + "/"
    input_fa = workdir + "/input/" + config["input"]["fasta"]
    tmp_base_dir = workdir + uniprot_config["tmpdir"]
    cpus = config["input"]["cpus"]

    # main vs uniprot
    main2other_file = tmp_base_dir + "/" + config["input"][
        "basename"] + "-vs-" + uniprot_config["basename"] + ".bl.out"
    _run_blastp_search(blast_bin, uniprot_fa, input_fa, main2other_file, cpus)

    # uniprot vs main
    other2maize_file = tmp_base_dir + "/" + uniprot_config[
        "basename"] + "-vs-" + config["input"]["basename"] + ".bl.out"
    _run_blastp_search(blast_bin, input_fa, uniprot_fa, other2maize_file, cpus)
示例#5
0
def run_pannzer(config):
    """Generate a PANNZER config per preprocessed BLAST XML file and run PANNZER.

    For each .xml BLAST result a config file is derived from the template,
    written to the conf_dir, and PANNZER's run.py is invoked on it.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    workdir = config["input"]["gomap_dir"] + "/"
    pannzer_data = config["data"]["mixed-method"]["pannzer"]
    pannzer_conf = config["software"]["pannzer"]
    blast_dir = workdir + pannzer_data["preprocess"]["blast"]
    blast_files = glob(blast_dir + "/*.xml")
    cwd = os.getcwd()
    # PANNZER's run.py must be invoked from its own install directory
    os.chdir(cwd + "/" + pannzer_conf["path"])
    try:
        for blast_file in blast_files:
            blank_config = ConfigParser.ConfigParser()
            blank_config.read(pannzer_conf["conf_template"])
            blank_config.set("GENERAL_SETTINGS", "INPUT_FOLDER",
                             cwd + "/" + pannzer_data["preprocess"]["blast"])
            blank_config.set("GENERAL_SETTINGS", "INPUT_FILE", blast_file)
            blank_config.set("GENERAL_SETTINGS", "RESULT_FOLDER",
                             workdir + pannzer_data["result_dir"])
            blank_config.set("GENERAL_SETTINGS", "QUERY_TAXON",
                             config["input"]["taxon"])
            out_base = os.path.basename(blast_file).replace(".xml", "")
            out_conf = workdir + pannzer_data[
                "conf_dir"] + "/" + out_base + ".conf"
            blank_config.set("GENERAL_SETTINGS", "RESULT_BASE_NAME", out_base)
            # copy the MySQL connection settings from the software config
            for key in ("SQL_DB_HOST", "SQL_DB_PORT", "SQL_DB_USER",
                        "SQL_DB_SOCKET", "SQL_DB"):
                blank_config.set("MYSQL", key, pannzer_conf["database"][key])
            # BUG FIX: blank_config.write(open(out_conf, "w")) leaked the handle
            with open(out_conf, "w") as conf_fh:
                blank_config.write(conf_fh)
            pannzer_out = blank_config.get(
                "GENERAL_SETTINGS",
                "RESULT_FOLDER") + "/" + out_base + "_results.GO"
            pannzer_cmd = ["python", "run.py", out_conf]
            check_output_and_run(pannzer_out, pannzer_cmd)
    finally:
        # restore the original working directory even if a run fails
        os.chdir(cwd)
示例#6
0
def iprs2gaf(config):
    """Filter InterProScan TSV output to GO-bearing rows and convert it to GAF.

    Writes a <basename>.go.tsv file containing only lines that mention a GO
    term, then calls the iprs2gaf.r script to produce the GAF file.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    dom_config = config["data"]["domain"]
    workdir = config["input"]["gomap_dir"] + "/"
    tsv_base = workdir + dom_config["tmpdir"] + "/" + config["input"][
        "basename"]
    infile = tsv_base + ".tsv"
    tmpfile = tsv_base + ".go.tsv"
    gaf_dir = workdir + config["data"]["gaf"]["raw_dir"] + "/"

    # keep only rows that carry at least one GO term
    with open(tmpfile, "w") as tmp_iprs, open(infile, "r") as raw_iprs:
        for line in raw_iprs:
            if "GO:" in line:
                tmp_iprs.write(line)

    out_gaf = gaf_dir + os.path.basename(infile)
    tool_ext = "." + config["data"]["domain"]["tool"]["name"] + ".gaf"
    # BUG FIX: "." was an unescaped regex wildcard; match the literal ".tsv"
    out_gaf = re.sub(r"\.tsv", tool_ext, out_gaf)
    cmd = [
        "Rscript", "code/pipeline/iprs2gaf.r", config["input"]["config_file"]
    ]
    check_output_and_run(out_gaf, cmd)
示例#7
0
def setup(config):
    """
    setup(config)

    Download the **GOMAP-data.tar.gz** content from CyVerse with irsync and
    extract it into the **data** directory: gzip members listed in
    data/compress_files.txt and tar.gz archives listed in data/tar_files.txt.
    Archives that are already extracted are skipped; extracted archives are
    removed.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    # BUG FIX: the docstring originally followed this call, so it was not the
    # function's docstring at all.
    setlogging(config, "setup")

    outdir = "data/"
    cmd = ["irsync", "-rv", cyverse_path, outdir]
    logging.info("Downloading file from Cyverse using irsync")
    # irsync checksums both ends and only transfers what is missing or stale,
    # so this step may take time even when nothing needs downloading
    print(os.getcwd())
    print(" ".join(cmd))
    check_output_and_run("outfile", cmd)

    # extract the plain gzip members
    with open("data/compress_files.txt", "r") as comp_files:
        for infile in comp_files:
            outfile = outdir + infile.strip()
            gzfile = outfile + ".gz"
            if not os.path.exists(gzfile):
                print(gzfile + " doesn't exist")
            elif os.path.exists(outfile):
                print(gzfile + " already extracted")
            else:
                print("Extracting " + gzfile)
                with gzip.open(gzfile, "rb") as in_f:
                    with open(outfile, "wb") as out_f:
                        shutil.copyfileobj(in_f, out_f)
                os.remove(gzfile)

    # extract the tar.gz archives
    with open("data/tar_files.txt", "r") as comp_files:
        for infile in comp_files:
            outfile = outdir + infile.strip()
            tar_f = outfile + ".tar.gz"
            if not os.path.exists(tar_f):
                print(tar_f + " doesn't exist")
            elif os.path.exists(outfile):
                print(tar_f + " already extracted")
            else:
                print("Extracting " + tar_f)
                # NOTE(review): extractall on a downloaded archive trusts its
                # member paths; the data comes from the project's own CyVerse
                # store, but verify before pointing this at other sources
                with tarfile.open(tar_f) as tar:
                    tar.extractall("data/")
                os.remove(tar_f)
示例#8
0
def make_blastdb(in_fasta, config):
    """Build a protein BLAST database from in_fasta.

    check_output_and_run skips the build when the .phr index already exists.

    Parameters
    ----------
    in_fasta : str
        Path to the protein FASTA file; also used as the database name.
    config : dict
        The config dict generated in the gomap.py script.
    """
    blast_bin_dir = (config["pipeline"]["pipeline_loc"] + "/" +
                     config["software"]["blast"]["bin"])
    makedb_command = [
        blast_bin_dir + "/makeblastdb",
        "-in", in_fasta,
        "-dbtype", "prot",
        "-out", in_fasta,
        "-title", in_fasta,
        "-hash_index",
    ]
    check_output_and_run(in_fasta + ".phr", makedb_command)
示例#9
0
def run_hmmer(config):
    """Re-chunk split FASTA files and run hmmscan on each chunk.

    First merges the natsorted split FASTA files into chunks of num_seqs
    sequences under the argot2 hmmer preprocess directory, then runs hmmscan
    on each chunk and zips the tblout result.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    workdir = config["input"]["gomap_dir"] + "/"
    fa_dir = workdir + config["input"]["split_path"]
    fa_files = natsorted(glob(fa_dir + "/*fa"))
    hmmer_bin = config["software"]["hmmer"]["path"] + "/hmmscan"
    hmmerdb = config["data"]["mixed-method"]["preprocess"]["hmmerdb"]
    cpu = str(config["input"]["cpus"])
    tmp_file = workdir + "hmmscan.tmp"
    num_seqs = int(config["input"]["num_seqs"])

    # accumulate sequences until a multiple of num_seqs (or the last file),
    # then flush them as one numbered chunk
    chunk_seqs = 0
    chunk_count = 0
    all_seqs = []
    for fa_file in fa_files:
        seqs = list(SeqIO.parse(fa_file, "fasta"))
        chunk_seqs = chunk_seqs + len(seqs)
        all_seqs = all_seqs + seqs

        if chunk_seqs % num_seqs == 0 or fa_file == fa_files[-1]:
            chunk_count = chunk_count + 1
            out_fa = workdir + config["data"]["mixed-method"]["argot2"][
                "preprocess"]["hmmer"] + "/" + config["input"][
                    "basename"] + "." + str(chunk_count) + ".fa"
            print(out_fa)
            SeqIO.write(all_seqs, out_fa, "fasta")
            all_seqs = []
            chunk_seqs = 0

    hmmer_dir = workdir + config["data"]["mixed-method"]["argot2"][
        "preprocess"]["hmmer"]
    fa_files = glob(hmmer_dir + "/*fa")
    for infile in fa_files:
        # raw string: "\.fa" in a plain literal is an invalid escape sequence
        outfile = re.sub(r"\.fa", ".hmm.out", infile)
        cmd = [
            hmmer_bin, "-o", tmp_file, "--tblout", outfile, "--cpu", cpu,
            hmmerdb, infile
        ]
        zipfile_loc = outfile + ".zip"
        check_output_and_run(zipfile_loc, cmd)
        if os.path.exists(outfile):
            # BUG FIX: the ZipFile was never closed, risking a truncated archive
            with zipfile.ZipFile(zipfile_loc, 'w', zipfile.ZIP_DEFLATED) as zf:
                zf.write(outfile, os.path.basename(outfile))
        if os.path.isfile(tmp_file):
            os.remove(tmp_file)
示例#10
0
def run_iprs(fa_file, config, iprs_loc=None):
    """Run InterProScan on one split FASTA file.

    Parameters
    ----------
    fa_file : str
        Path to the FASTA chunk to annotate.
    config : dict
        The config dict generated in the gomap.py script.
    iprs_loc : str, optional
        InterProScan install directory; defaults to the configured path.
    """
    dom_config = config["data"]["domain"]
    iprs_config = config["software"]["iprs"]
    workdir = config["input"]["gomap_dir"] + "/"
    split_name = os.path.basename(fa_file)
    out_file = re.sub(
        r"\.fa$", "",
        workdir + "/" + dom_config["split_path"] + "/" + split_name)
    temp_dir = workdir + dom_config["tmpdir"] + "/temp"

    if iprs_loc is None:
        iprs_loc = iprs_config["path"]

    base_cmd = [
        iprs_loc + "/interproscan.sh", "-goterms", "-pa", "-i", fa_file, "-dp",
        "-b", out_file, "-T", temp_dir, "-cpu",
        str(config["input"]["cpus"])
    ]
    # the .tsv output is the sentinel that lets finished chunks be skipped
    check_output_and_run(out_file + ".tsv", base_cmd + iprs_config["options"])
示例#11
0
def run_blast(fa_file, blast_db, config):
    """BLAST one split FASTA file against blast_db, writing XML (outfmt 5).

    The search is skipped when check_bl_out reports that the existing output
    already covers every input sequence.

    Parameters
    ----------
    fa_file : str
        Path to the query FASTA chunk.
    blast_db : str
        Path/name of the BLAST database to search.
    config : dict
        The config dict generated in the gomap.py script.
    """
    in_file = fa_file
    out_file = re.sub(r'fa$', "xml", fa_file)
    blast_config = config["software"]["blast"]
    workdir = config["input"]["gomap_dir"] + "/"

    blast_opts = config["data"]["mixed-method"]["preprocess"]["blast_opts"]

    if check_bl_out(in_file, out_file):
        logging.info(
            out_file +
            " already exists.\n The number of sequences in output match input")
        return

    blast_cmd = [
        blast_config["bin"] + "/blastp", "-outfmt", "5", "-db", blast_db,
        "-query", in_file, "-out", out_file, "-num_threads",
        str(config["input"]["cpus"])
    ]
    blast_cmd = blast_cmd + blast_opts
    print(" ".join(blast_cmd))
    check_output_and_run(out_file, blast_cmd)
示例#12
0
def download_mysql_data(config):
    """Download the PANNZER MySQL data from CyVerse and gunzip it in place.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script (kept for a uniform
        pipeline-step signature; not read here).
    """
    outdir = "/var/lib/mysql/"

    # BUG FIX: the original condition was inverted -- it tried to create the
    # directory only when it already existed
    if not os.path.isdir(outdir):
        os.mkdir(outdir)

    outfile = "/var/lib/mysql/pannzer/uniprot.MYI"
    cmd = ["irsync", "-rv", cyverse_path, outdir]
    logging.info("Downloading file from Cyverse using irsync")
    print(" ".join(cmd))
    check_output_and_run(outfile, cmd)

    gz_files = glob("/var/lib/mysql/pannzer/*.gz")
    if gz_files:
        for gz_file in gz_files:
            print("Extracting " + gz_file)
            outfile = gz_file.replace(".gz", "")
            # BUG FIX: the original referenced the undefined name `gzfile`
            # (the loop variable is `gz_file`), a guaranteed NameError
            with gzip.open(gz_file, "rb") as in_f:
                with open(outfile, "wb") as out_f:
                    shutil.copyfileobj(in_f, out_f)
            # remove the archive only after the handles are closed
            os.remove(gz_file)
    else:
        print("No mysql files to extract")
示例#13
0
def make_uniprotdb(config):
    """Create the Uniprot BLAST database unless one already exists.

    Looks for existing database index files under mixed-method/data/blastdb/
    and only runs makeblastdb when none match.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    uniprot_fa = config["mixed-method"]["preprocess"]["uniprot_db"] + ".fa"
    uniprot_db = config["mixed-method"]["preprocess"]["uniprot_db"]

    files = os.listdir("mixed-method/data/blastdb/")
    # NOTE(review): protein BLAST databases normally produce .phr/.pin/.psq
    # files; ".*phd" may be a typo for ".*phr" -- confirm before changing
    db_pattern = re.compile(os.path.basename(uniprot_db) + ".*phd")

    makedb_cmd = [
        "makeblastdb", "-in", uniprot_fa, "-dbtype", "prot", "-out",
        uniprot_db, "-parse_seqids", "-hash_index", "-max_file_sz", "10GB"
    ]
    # any() replaces the original hand-rolled 1/0 list + `1 in` membership test
    if any(db_pattern.match(tmp_file) for tmp_file in files):
        # logging.warn is a deprecated alias of logging.warning
        logging.warning(
            "The Uniprot blast database already exists, if not remove the database files to recreate the database"
        )
        logging.info(makedb_cmd)
    else:
        check_output_and_run("temp/uniprotdb", makedb_cmd)
示例#14
0
def tair_go2gaf(in_go, out_gaf, config_file):
    """Convert a TAIR GO annotation file to GAF via the tair2gaf.r script.

    Parameters
    ----------
    in_go : str
        TAIR GO input path (kept for interface compatibility; the R script
        reads its input from the config file).
    out_gaf : str
        Expected GAF output path used to skip an already-finished conversion.
    config_file : str
        Path to the pipeline config file passed to the R script.
    """
    logging.info("Converting TAIR GO file to GAF format")
    cmd = ["Rscript", "code/R/tair2gaf.r", config_file]
    basic_utils.check_output_and_run(out_gaf, cmd)
示例#15
0
def clean_redundant(config):
    """Remove redundant annotations via the clean_redundancy.r script.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    config_file = config["input"]["config_file"]
    check_output_and_run(
        "test.pod",
        ["Rscript", "code/pipeline/clean_redundancy.r", config_file])
示例#16
0
def aggregate_datasets(config):
    """Aggregate the component GAF datasets via the aggregate_datasets.r script.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    config_file = config["input"]["config_file"]
    check_output_and_run(
        "test.pod",
        ["Rscript", "code/pipeline/aggregate_datasets.r", config_file])
示例#17
0
def fanngo2gaf(config):
    """Convert FANN-GO score output to GAF via the fanngo2gaf.R script.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    config_file = config["input"]["config_file"]
    check_output_and_run(
        "test.pod", ["Rscript", "code/pipeline/fanngo2gaf.R", config_file])
示例#18
0
def clean_duplicate(config):
    """Remove duplicate annotations via the clean_duplicate.r script.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    config_file = config["input"]["config_file"]
    check_output_and_run(
        "test.pod",
        ["Rscript", "code/pipeline/clean_duplicate.r", config_file])
示例#19
0
def filter_mixed(config):
    """Filter the mixed-method predictions via the filter_mixed.r script.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    config_file = config["input"]["config_file"]
    check_output_and_run(
        "test.pod", ["Rscript", "code/pipeline/filter_mixed.r", config_file])