Exemplos de find_files em Python, exemplos de metawibele.utilities.find_files em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: ddi_DOMINE_protein.py Projeto: biobakery/metawibele

def DDI_annotation (extension, pfam_path, filter_flag, spe_level, interact, pfams, human_pfam, suffix):
    """Run the interaction assignment on every annotation file found under pfam_path.

    For each file returned by ``utilities.find_files`` the output path is
    derived by replacing ``extension`` with ``suffix``; a second "detail"
    path is derived by replacing ``.tsv`` with ``.detail.tsv``.

    Args:
        extension: File extension used to locate the input files; also the
            text replaced by ``suffix`` to build the output file name.
        pfam_path: Directory searched for input files.
        filter_flag, spe_level, interact, pfams, human_pfam: Passed through
            unchanged to ``assign_interaction``.
        suffix: Text that replaces ``extension`` in the output file name.
    """
    filelist = utilities.find_files(pfam_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File not exist: " + myfile)
            continue
        # Literal replacement: the extension is a file-name fragment, not a
        # regular expression, so "." must not match arbitrary characters.
        myout = myfile.replace(extension, suffix)
        myout_detail = myout.replace(".tsv", ".detail.tsv")
        peptide = collect_pfam_info(myfile)
        assign_interaction(filter_flag, spe_level, pfams, human_pfam, interact, peptide, myout, myout_detail)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: interproscan_pfam_protein_family.py Projeto: biobakery/metawibele

def collect_pfam_info(cluster_mem, extension, ann_path, outfile):  # list.txt
    """Collect per-protein annotations from tab-separated files and write a summary.

    Each input line is split on tabs: column 0 is the protein ID, column 1
    the annotation accession and column 2 its description. Lines starting
    with ``utilities.PROTEIN_ID`` (the header) and proteins not present in
    ``cluster_mem`` are skipped.

    Args:
        cluster_mem: Container of protein IDs to keep (tested with ``in``).
        extension: File extension passed to ``utilities.find_files``.
        ann_path: Directory searched for annotation files.
        outfile: Output path; the summary is written to this path with
            ``_proteinfamilies.`` rewritten to ``_proteinfamilies.ORF.``.

    Returns:
        dict mapping protein ID -> {accession: description}.
    """
    pfams = {}
    filelist = utilities.find_files(ann_path, extension, None)
    header_pattern = "^" + utilities.PROTEIN_ID
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File not exist: " + myfile)
            continue
        # Context manager guarantees the handle is released even if a
        # malformed line raises while parsing.
        with open(myfile, "r") as open_file:
            for line in open_file:
                line = line.strip()
                if not line:
                    continue
                if re.search(header_pattern, line):
                    continue
                info = line.split("\t")
                myid = info[0]
                if myid not in cluster_mem:
                    continue
                pfams.setdefault(myid, {})[info[1]] = info[2]

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    with open(outfile1, "w") as open_out:
        open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\n")
        for myid in sorted(pfams):
            accessions = sorted(pfams[myid])
            # join keeps the accession list and the description list aligned
            # position by position, even when an entry is an empty string.
            myacc = ";".join(accessions)
            myann = ";".join(pfams[myid][acc] for acc in accessions)
            open_out.write(myid + "\tPfam_PfamDomain\t" + myacc + "\t" + myann +
                           "\n")

    return pfams

Exemplo n.º 3

0

Exibir arquivo

def collect_counts(map_path, extension, gene_cluster):
    """Collect non-zero per-sample counts for the genes listed in gene_cluster.

    The sample name is the file's base name with ``"." + extension`` removed
    (the pattern is applied as a regular expression, as before). In each file,
    blank lines, lines starting with ``#`` and lines starting with ``Geneid``
    are skipped; the gene ID is the first tab-separated field and the count
    is the last one.

    Args:
        map_path: Directory searched for count files.
        extension: File extension passed to ``utilities.find_files``.
        gene_cluster: Container of gene IDs to keep (tested with ``in``).

    Returns:
        tuple ``(counts, mysample)`` where ``counts`` maps
        gene ID -> {sample: count string} and ``mysample`` maps every sample
        whose file exists to an empty string.
    """
    counts = {}
    mysample = {}

    filelist = utilities.find_files(map_path, extension, None)
    for myfile in filelist:
        mym = re.search(r"([^/]+)$", myfile)
        sample = mym.group(1)
        sample = re.sub("." + extension, "", sample)
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File not exist: " + myfile)
            continue
        mysample[sample] = ""
        # Context manager releases the handle even if a line fails to parse.
        with open(myfile, "r") as open_file:
            for line in open_file:
                line = line.strip()
                if not line:
                    continue
                if line.startswith("#") or line.startswith("Geneid"):
                    continue
                info = line.split("\t")
                myid = info[0]
                if myid not in gene_cluster:  # not specified genes
                    continue
                mycount = info[-1]
                if mycount == "0":  # no counts
                    continue
                counts.setdefault(myid, {})[sample] = mycount

    return counts, mysample

Exemplo n.º 4

0

Exibir arquivo

def format_contig_info(contig_path, extension, outfile):
    """Merge per-sample contig FASTA files into one file with sample-prefixed IDs.

    For every file found under ``contig_path`` the sample name is the file's
    base name with ``"." + extension`` removed. Each header ``>ID ...`` is
    rewritten to ``><sample>_contig_<ID>`` and the sequence lines that follow
    are concatenated onto a single line. Contigs are written in the order
    their headers first appear; sequence under a repeated header is appended
    to the first occurrence.

    Args:
        contig_path: Directory searched for contig files.
        extension: File extension passed to ``utilities.find_files``.
        outfile: Path of the combined FASTA file to write.
    """
    filelist = utilities.find_files(contig_path, extension, None)
    # Context managers close both the output and each input on any exit path.
    with open(outfile, "w") as open_out:
        for myfile in filelist:
            myfile = myfile.strip()
            if not myfile:
                continue
            sample = re.search(r"([^/]+)$", myfile).group(1)
            sample = re.sub("." + extension, "", sample)
            # collect seq info
            if not os.path.isfile(myfile):
                config.logger.info("WARNING! Contig file doesn't exist!\t" +
                                   myfile)
                continue
            contigs = {}
            contig_order = []
            myid = ""
            with open(myfile, "r") as open_contig:
                for line in open_contig:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith(">"):
                        mym = re.search(r">(\S+)", line)
                        myid = ">" + sample + "_contig_" + mym.group(1)
                        if myid not in contigs:
                            contig_order.append(myid)
                            contigs[myid] = []
                        continue
                    # Collect chunks and join once: avoids quadratic string
                    # growth on long contigs.
                    contigs[myid].append(line)
            # output contig sequence
            for myid in contig_order:
                open_out.write(myid + "\n" + "".join(contigs[myid]) + "\n")

Exemplo n.º 5

0

Exibir arquivo

def collect_transmembrane_info (cluster_mem, extension, ann_path, outfile): # list.txt
    """Collect the "Prediction" column for selected proteins and write a summary.

    Each input file is tab-separated with a header line starting with
    ``utilities.PROTEIN_ID``; the header is used to locate the protein-ID
    and ``Prediction`` columns. Proteins not in ``cluster_mem`` are skipped.

    Args:
        cluster_mem: Container of protein IDs to keep (tested with ``in``).
        extension: File extension passed to ``utilities.find_files``.
        ann_path: Directory searched for annotation files.
        outfile: Output path; the summary is written to this path with
            ``_proteinfamilies.`` rewritten to ``_proteinfamilies.ORF.``.

    Returns:
        dict mapping sample (protein ID with a trailing ``_<digits>``
        removed) -> {protein ID: prediction}.
    """
    transmem = {}
    details = {}
    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File not exist: " + myfile)
            continue
        titles = {}
        # Context manager releases the handle even if a row is malformed.
        with open(myfile, "r") as open_file:
            for line in open_file:
                line = line.strip()
                if not line:
                    continue
                info = line.split("\t")
                if re.search("^" + utilities.PROTEIN_ID, line):
                    # Header row: remember the first position of each title.
                    for item in info:
                        titles[item] = info.index(item)
                    continue
                myid = info[titles[utilities.PROTEIN_ID]]
                if myid not in cluster_mem:
                    continue
                prediction = info[titles["Prediction"]]
                details[myid] = prediction
                sample = re.sub(r"_[\d]+$", "", myid)
                transmem.setdefault(sample, {})[myid] = prediction

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    with open(outfile1, "w") as open_out:
        open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\n")
        for myid in sorted(details):
            open_out.write(myid + "\tTMHMM_transmembrane\tTMHMM_transmembrane\t" + details[myid] + "\n")

    return transmem

Exemplo n.º 6

0

Exibir arquivo

def gene_calling(workflow, assembly_dir, assembly_extentsion, input_dir,
                 extension, extension_paired, gene_call_type, prokka_dir,
                 prodigal_dir, threads, gene_file, gene_PC_file, protein_file,
                 protein_sort, gene_info, complete_gene, complete_protein):
    """
    Add the gene-calling tasks to a workflow.

    Sample names are derived from the files found in ``input_dir``; for each
    sample the contig file is expected at
    ``<assembly_dir>/<sample>/<sample><assembly_extentsion>``. Depending on
    ``gene_call_type``, one Prodigal and/or one Prokka task is added per
    contig file and the per-sample outputs are symlinked into the top-level
    output directory. Summary tasks then combine gene sequences, format and
    sort protein sequences, extract protein-coding genes and extract
    complete ORFs.

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        assembly_dir: The directory path of assembly results.
        assembly_extentsion: Suffix appended to the sample name to form the
            contig file name.
        input_dir: Directory searched for the input files from which sample
            names are derived.
        extension: File extension of the input files (used when
            ``extension_paired`` is empty).
        extension_paired: Comma-separated extensions of paired input files;
            only the first entry is used to find files and name samples.
        gene_call_type: "prodigal", "prokka" or "both".
        prokka_dir: The directory path of prokka results.
        prodigal_dir: The directory path of prodigal results.
        threads: Number of cores requested for each prodigal/prokka task.
        gene_file: The fasta file of gene nucleotide sequences.
        gene_PC_file: The fasta file of protein coding gene nucleotide sequences.
        protein_file: The fasta file of protein sequences.
        protein_sort: The sorted fasta file of protein sequences.
        gene_info: The summarized gene calling file.
        complete_gene: The fasta file of gene nucleotide sequences for complete ORFs.
        complete_protein: The fasta file of protein sequences for complete ORFs.

    Requires:
        prokka 1.14-dev: rapid prokaryotic genome annotation (recommend to close '-c' parameter in prodigal)
        prodigal v2.6: gene prediction
        usearch (tested with usearch v9.0.2132_i86linux64)
        assembled contig files

    Returns:
        tuple: ``(complete_gene, complete_protein)``, the two paths passed in.

    Example:
        from anadama2 import Workflow

        # create an anadama2 workflow instance
        workflow = Workflow()

        # add gene calling tasks
        mygene, myprotein = gene_calling(
            workflow, assembly_dir, assembly_extentsion, input_dir,
            extension, extension_paired, gene_call_type, prokka_dir,
            prodigal_dir, threads, gene_file, gene_PC_file, protein_file,
            protein_sort, gene_info, complete_gene, complete_protein)

        # run the workflow
        workflow.go()
    """

    config.logger.info("###### Start gene_calling module ######")

    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    # ================================================
    # collect sequences
    # ================================================
    # Sample names come from the input files; for paired input only the first
    # listed extension is used.
    if extension_paired:
        extension_paireds = extension_paired.split(",")
        sample_files = utilities.find_files(input_dir, extension_paireds[0],
                                            None)
        samples = utilities.sample_names(sample_files, extension_paireds[0],
                                         None)
    else:
        sample_files = utilities.find_files(input_dir, extension, None)
        samples = utilities.sample_names(sample_files, extension, None)
    sequence_files = []
    for mysample in samples:
        myfile = os.path.join(assembly_dir, mysample,
                              mysample + "%s" % assembly_extentsion)
        sequence_files.append(myfile)
    # foreach sample

    filtered_contigs = sequence_files

    # ================================================
    # Gene calling
    # ================================================
    # *_tmp lists hold per-sample outputs inside each annotation directory;
    # the lists without the suffix hold the symlinked top-level paths.
    fna_file = []
    faa_file = []
    gff_files = []
    fna_file_tmp = []
    faa_file_tmp = []
    gff_files_tmp = []

    ## Using Prodigal
    if gene_call_type == "prodigal" or gene_call_type == "both":
        os.system("mkdir -p " + prodigal_dir)
        for contig in filtered_contigs:
            contig_base = os.path.basename(contig).split(os.extsep)[0]
            annotation_dir = os.path.join(prodigal_dir, contig_base)
            os.system("mkdir -p " + annotation_dir)
            gff_file = os.path.join(annotation_dir, '%s.gff' % contig_base)
            cds_file = os.path.join(annotation_dir, '%s.fna' % contig_base)
            cds_aa = os.path.join(annotation_dir, '%s.faa' % contig_base)
            score = os.path.join(annotation_dir,
                                 '%s.gene_score.txt' % contig_base)
            stdout_log = os.path.join(annotation_dir,
                                      '%s.stdout.log' % contig_base)
            faa_file_tmp.append(cds_aa)

            workflow.add_task_gridable(
                'prodigal -m -p meta -i [depends[0]] '
                '-f gff -o [targets[0]] -d [targets[1]] -s [targets[3]] '
                '-a [targets[2]] '
                '>[args[0]] 2>&1',
                depends=[contig, TrackedExecutable("prodigal")],
                targets=[gff_file, cds_file, cds_aa, score],
                args=[stdout_log],
                cores=threads,
                mem=mem_equation,
                time=time_equation,
                name=contig_base + "__prodigal")

        # Symlink each sample's .faa/.fna/.gff into prodigal_dir.
        for myfile in faa_file_tmp:
            myname = os.path.basename(myfile)
            myfile_new = os.path.join(prodigal_dir, myname)
            faa_file.append(myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfile],
                              targets=[myfile_new],
                              cores=1,
                              name="ln__" + myname)
            myfna = re.sub(".faa", ".fna", myfile)
            myfna_new = re.sub(".faa", ".fna", myfile_new)
            if gene_call_type == "prodigal":
                # Prodigal-only run: the prodigal outputs also serve as the
                # nucleotide/gff inputs, and prokka_dir is pointed at them.
                fna_file.append(myfna_new)
                mygff_new = re.sub(".faa", ".gff", myfile_new)
                gff_files.append(mygff_new)
                prokka_dir = prodigal_dir
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfna],
                              targets=[myfna_new],
                              cores=1,
                              name="ln__" + os.path.basename(myfna))
            mygff = re.sub(".faa", ".gff", myfile)
            mygff_new = re.sub(".faa", ".gff", myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[mygff],
                              targets=[mygff_new],
                              cores=1,
                              name="ln__" + os.path.basename(mygff))

    if gene_call_type == "prokka" or gene_call_type == "both":
        ## Calling genes with Prokka
        os.system("mkdir -p " + prokka_dir)
        for contig in filtered_contigs:
            contig_base = os.path.basename(contig).split(os.extsep)[0]
            sample = os.path.basename(contig_base)
            annotation_dir = os.path.join(prokka_dir, sample)
            os.system("mkdir -p " + annotation_dir)
            stdout_log = os.path.join(
                annotation_dir, '%s.prokka.bacteria.stdout.log' % contig_base)
            score = os.path.join(annotation_dir,
                                 '%s.gene_score.txt' % contig_base)
            gene_nuc = os.path.join(annotation_dir, '%s.ffn' % contig_base)
            gene_aa = os.path.join(annotation_dir, '%s.faa' % contig_base)
            gff_file = os.path.join(annotation_dir, '%s.gff' % contig_base)
            fna_file_tmp.append(gene_nuc)
            gff_files_tmp.append(gff_file)

            workflow.add_task_gridable(
                'prokka --prefix [args[0]] --addgenes --addmrna --force --metagenome '
                '--cpus [args[2]] '
                '--outdir [args[1]] [depends[0]] '
                '>[args[3]] 2>&1 ',
                depends=[contig, TrackedExecutable("prokka")],
                targets=[gene_nuc, gene_aa, gff_file],
                args=[sample, annotation_dir, threads, stdout_log],
                cores=threads,
                mem=mem_equation,
                time=time_equation,
                name=contig_base + "__prokka")

        # Symlink each sample's .ffn/.faa/.gff into prokka_dir.
        for myfile in gff_files_tmp:
            myname = os.path.basename(myfile)
            myfile_new = os.path.join(prokka_dir, myname)
            gff_files.append(myfile_new)
        for myfile in fna_file_tmp:
            myname = os.path.basename(myfile)
            myfile_new = os.path.join(prokka_dir, myname)
            fna_file.append(myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfile],
                              targets=[myfile_new],
                              cores=1,
                              name="ln__" + myname)
            myfaa = re.sub(".ffn", ".faa", myfile)
            myfaa_new = re.sub(".ffn", ".faa", myfile_new)
            if gene_call_type == "prokka":
                # Prokka-only run: the prokka proteins are used, and
                # prodigal_dir is pointed at the prokka directory.
                faa_file.append(myfaa_new)
                prodigal_dir = prokka_dir
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfaa],
                              targets=[myfaa_new],
                              cores=1,
                              name="ln__" + os.path.basename(myfaa))
            mygff = re.sub(".ffn", ".gff", myfile)
            mygff_new = re.sub(".ffn", ".gff", myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[mygff],
                              targets=[mygff_new],
                              cores=1,
                              name="ln__" + os.path.basename(mygff))

    # ================================================
    # Summarize sequences
    # ================================================
    #mem_equation = "50000"
    ### combine gene sequences ###
    # Nucleotide files end in .ffn unless only prodigal was run (.fna).
    nuc_type = "ffn"
    if gene_call_type == "prodigal":
        nuc_type = "fna"
    mylog = re.sub(".fna", ".log", gene_file)
    workflow.add_task(
        'metawibele_combine_gene_sequences -p [args[0]] -e [args[1]] -o [targets[0]] > [args[2]] 2>&1 ',
        depends=utilities.add_to_list(
            fna_file, TrackedExecutable("metawibele_combine_gene_sequences")) +
        fna_file_tmp + gff_files + gff_files_tmp,
        targets=[gene_file],
        args=[prokka_dir, nuc_type, mylog],
        cores=1,
        name="combine_gene_sequences")

    ### combine protein sequences ###
    ## collect sequences
    mylog = re.sub(".faa", ".log", protein_file)
    workflow.add_task(
        'metawibele_format_protein_sequences -p [args[0]] -q [args[1]] -e faa -o [targets[0]] '
        '-m [targets[1]] >[args[2]] 2>&1 ',
        depends=utilities.add_to_list(
            faa_file, TrackedExecutable("metawibele_format_protein_sequences"))
        + faa_file_tmp + gff_files + gff_files_tmp,
        targets=[protein_file, gene_info],
        args=[prokka_dir, prodigal_dir, mylog],
        cores=1,
        name="format_protein_sequences")

    ## sort by length and filter out short-length sequence
    # NOTE(review): -minseqlength is 0 here, so the minimum-length threshold
    # passed to usearch is zero.
    mylog = re.sub(".faa", ".log", protein_sort)
    workflow.add_task(
        'usearch -sortbylength [depends[0]] '
        '-fastaout [targets[0]] -minseqlength 0 >[args[0]] 2>&1 ',
        depends=[protein_file, TrackedExecutable("usearch")],
        targets=[protein_sort],
        args=[mylog],
        cores=1,
        name="usearch__sorting")

    ## extract nucleotide sequence for protein coding genes
    mylog = re.sub(".fna", ".log", gene_PC_file)
    workflow.add_task(
        'metawibele_extract_protein_coding_genes -g [depends[0]] -p [depends[1]] -o [targets[0]] > [args[0]] 2>&1 ',
        depends=[
            gene_file, protein_sort,
            TrackedExecutable("metawibele_extract_protein_coding_genes")
        ],
        targets=[gene_PC_file],
        args=[mylog],
        cores=1,
        name="extract_protein_coding_genes")

    ## extract sequences
    # NOTE(review): the two tasks below (nucleotide and protein) are
    # registered under the same name 'extract_complete_ORF_seq'.
    mylog = re.sub(".fna", ".log", complete_gene)
    workflow.add_task(
        'metawibele_extract_complete_ORF_seq -t complete -m [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1',
        depends=[
            gene_info, gene_PC_file,
            TrackedExecutable("metawibele_extract_complete_ORF_seq")
        ],
        targets=[complete_gene],
        args=[mylog],
        cores=1,
        name='extract_complete_ORF_seq')

    mylog = re.sub(".faa", ".log", complete_protein)
    workflow.add_task(
        'metawibele_extract_complete_ORF_seq -t complete -m [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1',
        depends=[
            gene_info, protein_sort,
            TrackedExecutable("metawibele_extract_complete_ORF_seq")
        ],
        targets=[complete_protein],
        args=[mylog],
        cores=1,
        name='extract_complete_ORF_seq')

    return complete_gene, complete_protein

Exemplo n.º 7

0

Exibir arquivo

def collect_sequence(ann_path, extension, partial_path, outfile):
    """Collect protein sequences per sample together with their GFF gene info.

    For every sequence file found under ``ann_path`` the matching ``.gff``
    file is parsed for gene/CDS records, then the corresponding file under
    ``partial_path`` (same path with ``ann_path`` substituted) is read and
    each sequence is matched to a CDS by contig, start, stop and strand.
    If not every sequence could be matched, the file under ``ann_path`` is
    read as well and its not-yet-seen sequences are added with partial flag
    "00". Sequences with an internal ``*`` go to a separate
    "abnormal_seq" file; all others go to ``outfile``.

    Args:
        ann_path: Directory searched for sequence files (and their .gff).
        extension: File extension passed to ``utilities.find_files``.
        partial_path: Directory holding the files whose headers carry the
            ``partial=`` information.
        outfile: Path of the FASTA file for normal sequences.

    Returns:
        tuple ``(gff, types, partial)``; each maps
        sample -> {gene ID: value}, the values being a multi-line gene
        description, the feature type, and the partial flag respectively.
    """
    filelist = utilities.find_files(ann_path, extension, None)
    # Derive the side file for abnormal sequences. The second substitution
    # must build on the result of the first; applying it to `outfile` again
    # discarded the ".faa" rewrite and made both handles point at one file.
    outfile2 = re.sub(r"\.faa", ".abnormal_seq.faa", outfile)
    outfile2 = re.sub(r"\.fasta", ".abnormal_seq.fasta", outfile2)
    if outfile2 == outfile:
        # Neither extension present: still keep the two outputs distinct.
        outfile2 = outfile + ".abnormal_seq"
    open_out = open(outfile, "w")
    open_out2 = open(outfile2, "w")
    gff = {}
    types = {}
    partial = {}
    for myfile in filelist:
        sample = myfile
        mym = re.search(r"([^\/]+)$", sample)
        sample = mym.group(1)
        sample = re.sub("." + extension, "", sample)

        # collect gff info that corresponds to the sequence file
        contigs = {}
        mapping = {}
        mygff = re.sub("." + extension, ".gff", myfile)
        if not os.path.isfile(mygff):
            config.logger.info("ERROR! Gff file doesn't exist!\t" + mygff)
            continue
        config.logger.info("Read gff file: " + mygff)
        open_gff = open(mygff, "r")
        for line in open_gff:
            line = line.strip()
            if not len(line):
                continue
            if re.search("^#", line):
                # Comment lines may carry the contig coordinates/length.
                if re.search("^##sequence-region", line):
                    mym = re.search(
                        r"##sequence-region\s+([\S]+)\s+([\d]+)\s+([\d]+)",
                        line)
                    tmp_contig = mym.group(1)
                    contigs[tmp_contig] = str(mym.group(2)) + "\t" + str(
                        mym.group(3))
                if re.search("^# Sequence Data", line):
                    mytmp = line.split(";")
                    mym = re.sub("\"", "", mytmp[-1])
                    mym = re.search(r"^seqhdr\=([\S]+)[\s]+[\s\S]+len\=([\d]+)",
                                    mym)
                    tmp_contig = mym.group(1)
                    contigs[tmp_contig] = str(1) + "\t" + str(mym.group(2))
                continue
            if re.search("^>", line):
                # Start of an embedded FASTA section: no more feature rows.
                break
            info = line.split("\t")
            feature = info[2]
            start = info[3]
            stop = info[4]
            strand = info[6]
            desc = info[8]
            myinfo = desc.split(";")
            if re.search(r"ID\=([^\;]+)", desc):
                myid = re.search(r"ID\=([^\;]+)", desc)
                myid = myid.group(1)
            else:
                continue
            gene_name = "NA"
            gene_id = "NA"
            gene_num = "NA"
            sample_id = "NA"
            contig_id = info[0]
            contig_len = "NA\tNA"
            if contig_id in contigs:
                contig_len = str(contigs[contig_id])
            for item in myinfo:
                if re.search("locus_tag=", item):
                    mym = re.search(r"locus_tag=([\S]+)", item)
                    gene_id = mym.group(1)
                if re.search("Name=", item):
                    mym = re.search(r"Name=([\S]+)", item)
                    gene_name = mym.group(1)
            # foreach item
            # Without a locus_tag/Name attribute, fall back to an ID built
            # from the sample name and the record ID.
            if not re.search("locus_tag=", desc):
                gene_id = sample + "_" + re.sub("_", "-", myid)
            if not re.search("Name=", desc):
                gene_name = sample + "_" + re.sub("_", "-", myid)
            if re.search(r"\_", gene_id):
                mym = re.search(r"^([^\_]+)\_([\S]+)", gene_id)
                sample_id = mym.group(1)
                gene_num = mym.group(2)
            if not re.search("locus_tag=", desc):
                gene = gene_id
                sample_id = sample
            else:
                gene = sample + "_" + gene_num
            contig = sample + "_contig_" + contig_id
            # Three-line description stored per gene: gene, contig, sample.
            gene_record = (gene + "\t" + gene_id + "\t" + gene_name + "\t" +
                           start + "\t" + stop + "\t" + strand + "\n" +
                           contig + "\t" + contig_id + "\t" + contig_len +
                           "\n" + sample + "\t" + sample_id)
            if feature == "gene":
                if not sample in gff:
                    gff[sample] = {}
                if not gene_id in gff[sample]:
                    gff[sample][gene_id] = gene_record
            if feature != "gene" and feature != "mRNA":
                if not sample in types:
                    types[sample] = {}
                if not gene_id in types[sample]:
                    types[sample][gene_id] = feature
                if feature == "CDS":
                    new_id = contig_id + "\t" + start + "\t" + stop + "\t" + strand
                    mapping[new_id] = gene + "\t" + gene_id
                    if not re.search("locus_tag=", desc):
                        if not sample in gff:
                            gff[sample] = {}
                        if not gene_id in gff[sample]:
                            gff[sample][gene_id] = gene_record
        # foreach line
        open_gff.close()

        # collect sequences from the partial_path file, including partial info
        myfile1 = re.sub(ann_path, partial_path, myfile)
        open_file = open(myfile1, "r")
        AA_seq = {}
        myname = ""
        flag = 0
        hit_num = 0
        total_num = 0
        for line in open_file.readlines():
            line = line.strip()
            if not len(line):
                continue
            if re.search("^>", line):  # sequence id
                total_num = total_num + 1
                line = re.sub("^>", "", line)
                info = line.split(" # ")
                if len(info) < 4:
                    # debug
                    config.logger.info("WARNING! No info!\t" + myfile1 + "\t" +
                                       line)
                    continue
                myref = re.sub(r"_[\d]+$", "", info[0])
                mystart = info[1]
                mystop = info[2]
                mystrand = "+"
                if info[3] == "-1":
                    mystrand = "-"
                myid = myref + "\t" + mystart + "\t" + mystop + "\t" + mystrand
                myname = myid
                flag = 0
                if myid in mapping:
                    hit_num = hit_num + 1
                    gene, gene_id = mapping[myid].split("\t")
                    myname = ">" + gene
                    if not myname in AA_seq:
                        AA_seq[myname] = ""
                    mym = re.search(r"partial=([\d]+)", info[-1])
                    mypartial = mym.group(1)
                    if not sample in partial:
                        partial[sample] = {}
                    partial[sample][gene_id] = mypartial
                    flag = 1
                else:
                    # debug
                    config.logger.info("No mapping info!\t" + line)
                continue
            else:
                # Sequence line: kept only when its header was mapped.
                if flag == 1:
                    if myname in AA_seq:
                        AA_seq[myname] = AA_seq[myname] + line
        # foreach line
        open_file.close()

        # Some headers had no CDS match: add the sequences from the ann_path
        # file that have not been collected yet.
        if hit_num != total_num:
            open_file = open(myfile, "r")
            myname = ""
            for line in open_file.readlines():
                line = line.strip()
                if not len(line):
                    continue
                if re.search("^>", line):  # sequence id
                    mym = re.search(r"^>([^\_]+)\_([\S]+)", line)
                    sample_id = mym.group(1)
                    gene_num = mym.group(2)
                    gene = sample + "_" + gene_num
                    mym = re.search(r"^>([\S]+)", line)
                    gene_id = mym.group(1)
                    myname = ">" + gene
                    if not myname in AA_seq:
                        AA_seq[myname] = ""
                    else:
                        # Already collected: blank the name so the following
                        # sequence lines are ignored.
                        myname = ""
                        continue
                    if not sample in partial:
                        partial[sample] = {}
                    partial[sample][gene_id] = "00"
                else:
                    if myname in AA_seq:
                        AA_seq[myname] = AA_seq[myname] + line
            # foreach line
            open_file.close()

        for myname in sorted(AA_seq.keys()):
            myseq = AA_seq[myname]
            myseq = re.sub(r"\*$", "", myseq)
            AA_seq[myname] = myseq
            if re.search(r"\*", myseq):  # terminal codon in CDS
                open_out2.write(myname + "\n" + AA_seq[myname] + "\n")
                continue
            else:
                open_out.write(myname + "\n" + AA_seq[myname] + "\n")
    # foreach sample
    open_out.close()
    open_out2.close()

    return gff, types, partial

Exemplo n.º 8

0

Exibir arquivo

def collect_DDI_info(cluster_mem, extension, ann_path, level, label,
                     outfile):  # list.txt
    """Collect domain-domain interaction rows for selected proteins and write two summaries.

    Input files are tab-separated with a header line starting with
    ``utilities.PROTEIN_ID``; the header locates the ``Interaction``,
    ``Pfam1_ID``, ``Pfam2_ID``, ``Pfam1_ann`` and ``Pfam2_ann`` columns.
    The column positions learned from one file's header remain in effect
    for later files until another header line is read.

    Two files are written: one row per protein/interaction pair, and a
    "detail" file with one row per protein whose pairs are joined by ``;``.
    The detail file only includes pairs whose interaction value equals
    ``level``, unless ``level`` is "no" or the value is "NA".

    Args:
        cluster_mem: Container of protein IDs to keep (tested with ``in``).
        extension: File extension passed to ``utilities.find_files``.
        ann_path: Directory searched for annotation files.
        level: Interaction value to keep in the detail file, or "no" to
            keep everything.
        label: Text written in the type column of both outputs.
        outfile: Output path; ``_proteinfamilies.`` is rewritten to
            ``_proteinfamilies.ORF.`` and the detail file additionally
            has ``.tsv`` rewritten to ``.detail.tsv``.

    Returns:
        tuple ``(DDIs, anns)`` where ``DDIs`` maps protein ID ->
        {"<interaction>\\t<pfam1>:<pfam2>": ""} and ``anns`` maps
        "<pfam1>:<pfam2>" -> "<ann1>:<ann2>".
    """
    DDIs = {}
    anns = {}
    titles = {}
    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File not exist: " + myfile)
            continue
        # Context manager releases the handle even if a row is malformed.
        with open(myfile, "r") as open_file:
            for line in open_file:
                line = line.strip()
                if not line:
                    continue
                info = line.split("\t")
                if re.search("^" + utilities.PROTEIN_ID, line):
                    # Header row: remember the first position of each title.
                    for item in info:
                        titles[item] = info.index(item)
                    continue
                myid = info[titles[utilities.PROTEIN_ID]]
                if myid not in cluster_mem:
                    continue
                mylevel = info[titles["Interaction"]]
                mypfam = info[titles["Pfam1_ID"]] + ":" + info[titles["Pfam2_ID"]]
                myann = info[titles["Pfam1_ann"]] + ":" + info[titles["Pfam2_ann"]]
                DDIs.setdefault(myid, {})[mylevel + "\t" + mypfam] = ""
                anns[mypfam] = myann

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    outfile2 = re.sub(".tsv", ".detail.tsv", outfile1)
    with open(outfile1, "w") as open_out1, open(outfile2, "w") as open_out2:
        open_out2.write(utilities.PROTEIN_ID +
                        "\ttype\tdetail\tannotation\tinteraction\n")
        open_out1.write(
            utilities.PROTEIN_ID +
            "\tType\tInteraction\tPfam1_ID\tPfam2_ID\tPfam1_ann\tPfam2_ann\n")
        for myid in sorted(DDIs):
            kept_pfams = []
            kept_anns = []
            kept_levels = []
            for item in sorted(DDIs[myid]):
                tmp = item.split("\t")
                myl = tmp[0]
                pair = tmp[1]
                pfam1, pfam2 = pair.split(":")
                ann1 = anns.get(pair, "NA:NA").replace(":", "\t")
                open_out1.write(myid + "\t" + label + "\t" + myl + "\t" +
                                pfam1 + "\t" + pfam2 + "\t" + ann1 + "\n")
                # The detail file only keeps the requested level ("NA"
                # entries always pass; level "no" disables the filter).
                if level != "no" and myl != "NA" and myl != level:
                    continue
                kept_pfams.append(pair)
                kept_levels.append(myl)
                kept_anns.append(anns.get(pair, "NA"))
            # foreach DDI
            mypfam = ";".join(kept_pfams)
            if mypfam == "":
                continue
            open_out2.write(myid + "\t" + label + "\t" + mypfam + "\t" +
                            ";".join(kept_anns) + "\t" +
                            ";".join(kept_levels) + "\n")
        # foreach seqID

    return DDIs, anns

Exemplo n.º 9

0

Exibir arquivo

def collect_ann_info(cluster_mem, extension, ann_path, types,
                     outfile):  # list.txt
    """Collect InterProScan annotations for the proteins in ``cluster_mem``.

    Every path returned by ``utilities.find_files(ann_path, extension, None)``
    is combined with every suffix in ``types``: ``extension`` is substituted
    by the suffix (via ``re.sub``, so ``extension`` is interpreted as a
    regular expression) and the resulting file is read if it exists. Missing
    files are skipped silently.

    A per-protein summary is written to ``outfile`` with the substring
    ``_proteinfamilies.`` rewritten to ``_proteinfamilies.ORF.``.

    Args:
        cluster_mem: Container of protein IDs to keep (membership-tested).
        extension: Pattern identifying the part of each path to substitute.
        ann_path: Directory handed to ``utilities.find_files``.
        types: Iterable of file suffixes; each is expected to match
            ``interpro.<name>.tsv`` and yields the type
            ``InterProScan_<name>``.
        outfile: Base path from which the output file name is derived.

    Returns:
        tuple: ``(anns, anns_info)``. ``anns[protein][type][accession]`` and
        ``anns_info[type][accession]`` both hold the third and fourth input
        columns joined by a tab, with empty values replaced by ``"NA"``.

    Raises:
        AttributeError: If a suffix in ``types`` whose file exists does not
            match ``interpro.<name>.tsv``.
    """
    anns = {}
    anns_info = {}
    for myfile in utilities.find_files(ann_path, extension, None):
        for suffix in types:
            myfile1 = re.sub(extension, suffix, myfile)
            if not os.path.isfile(myfile1):
                continue
            # Resolve the annotation type before opening the file so that a
            # bad suffix cannot leave an open handle behind.
            mytype = "InterProScan_" + re.search(
                r"interpro.([\S]+).tsv", suffix).group(1)
            with open(myfile1, "r") as open_file:
                for line in open_file:
                    line = line.strip()
                    if not line:
                        continue
                    if re.search("^" + utilities.PROTEIN_ID, line):
                        continue  # header row
                    info = line.split("\t")
                    if len(info) < 2:
                        continue  # no accession column at all
                    # strip() above drops trailing empty columns; pad them
                    # back so they become "NA" instead of an IndexError.
                    info += [""] * (4 - len(info))
                    myid = info[0]
                    if myid not in cluster_mem:
                        continue
                    acc = info[1]
                    detail = (info[2] or "NA") + "\t" + (info[3] or "NA")
                    anns.setdefault(myid, {}).setdefault(mytype, {})[acc] = detail
                    anns_info.setdefault(mytype, {})[acc] = detail

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    with open(outfile1, "w") as open_out:
        open_out.write(utilities.PROTEIN_ID +
                       "\ttype\tdetail\tdescription\tInterPro_accession\n")
        for myid in sorted(anns):
            for mytype in sorted(anns[myid]):
                accessions = sorted(anns[myid][mytype])
                pairs = [anns[myid][mytype][acc].split("\t")
                         for acc in accessions]
                myinfo1 = ";".join(accessions)
                if myinfo1 == "":
                    continue
                myinfo2 = ";".join(pair[0] for pair in pairs)
                myinfo3 = ";".join(pair[1] for pair in pairs)
                open_out.write(myid + "\t" + mytype + "\t" + myinfo1 + "\t" +
                               myinfo2 + "\t" + myinfo3 + "\n")
    return anns, anns_info

Exemplo n.º 10

0

Exibir arquivo

def _read_psortb_locations(path, cluster_mem, per_sample, per_protein,
                           location, score):
    """Merge one PSORTb location table into the running result dicts.

    Rows are tab-separated ``name``, ``type``, ``score``; blank lines and
    lines starting with ``name`` (the header) are ignored, as are proteins
    not in ``cluster_mem``. ``location``/``score`` keep, per protein, the
    prediction with the highest score seen so far across all tables.
    """
    with open(path, "r") as open_file:
        for line in open_file:
            line = line.strip()
            if not line or line.startswith("name"):
                continue
            info = line.split("\t")
            myid = info[0]
            if myid not in cluster_mem:
                continue
            # Sample name = protein ID without its trailing "_<digits>".
            sample = re.sub(r"_[\d]+$", "", myid)
            if sample not in per_sample:
                per_sample[sample] = {}
            if myid not in location or float(info[2]) > float(score[myid]):
                location[myid] = info[1]
                score[myid] = info[2]
            per_protein[myid] = info[1]
            per_sample[sample][myid] = info[1]


def collect_localizing_info(cluster_mem, extension, ann_path,
                            outfile):  # list.txt
    """Collect PSORTb localization predictions for clustered proteins.

    For every file found by ``utilities.find_files(ann_path, extension,
    None)`` (treated as the gram-positive table) the sibling gram-negative
    and archaea tables are derived by substituting the file-name stem. Each
    existing table is read; a missing one is logged and skipped. The best
    prediction per protein is written to ``outfile`` with
    ``_proteinfamilies.`` rewritten to ``_proteinfamilies.ORF.``.

    Args:
        cluster_mem: Container of protein IDs to keep (membership-tested).
        extension: Extension handed to ``utilities.find_files``.
        ann_path: Directory handed to ``utilities.find_files``.
        outfile: Base path from which the output file name is derived.

    Returns:
        tuple: ``(gram_p, gram_n, archaea, location, score, location_p,
        location_n, location_a)``. The first three map sample -> protein ->
        prediction for each table kind; ``location`` and ``score`` hold the
        highest-scoring prediction per protein; ``location_p/n/a`` hold the
        per-table prediction per protein.
    """
    gram_p = {}
    gram_n = {}
    archaea = {}
    location = {}
    score = {}
    location_n = {}
    location_p = {}
    location_a = {}

    positive_name = "psortb.gram_positive.out.location.tsv"
    for myfile in utilities.find_files(ann_path, extension, None):
        sources = (
            (myfile, gram_p, location_p),
            (re.sub(positive_name, "psortb.gram_negative.out.location.tsv",
                    myfile), gram_n, location_n),
            (re.sub(positive_name, "psortb.archaea.out.location.tsv",
                    myfile), archaea, location_a),
        )
        for path, per_sample, per_protein in sources:
            if not os.path.isfile(path):
                config.logger.info("ERROR! File not exist: " + path)
                continue
            _read_psortb_locations(path, cluster_mem, per_sample,
                                   per_protein, location, score)

    # PSORTb label -> label used in the output table; anything else is
    # passed through unchanged.
    labels = {
        "Unknown": "PSORTb_unknown",
        "Cytoplasmic": "PSORTb_cytoplasmic",
        "CytoplasmicMembrane": "PSORTb_cytoplasmicMembrane",
        "Extracellular": "PSORTb_extracellular",
        "Cellwall": "PSORTb_cellWall",
        "OuterMembrane": "PSORTb_outerMembrane",
        "Periplasmic": "PSORTb_periplasmic",
    }

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    with open(outfile1, "w") as open_out:
        open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tscore\n")
        for myid in sorted(location):
            mytype = labels.get(location[myid], location[myid])
            myscore = score.get(myid, "NA")
            mydetail = re.sub("PSORTb_", "", mytype)
            open_out.write(myid + "\t" + mytype + "\t" + mydetail + "\t" +
                           str(myscore) + "\n")

    return gram_p, gram_n, archaea, location, score, location_p, location_n, location_a

Exemplo n.º 11

0

Exibir arquivo

Arquivo: psortb_protein.py Projeto: biobakery/metawibele

def _parse_psortb_report(path):
    """Return ``"<id>\\t<prediction>\\t<score>"`` rows parsed from ``path``.

    A line containing ``SeqID:`` sets the current sequence ID. A line
    containing ``Final`` marks the next non-empty line as the prediction
    line: its first whitespace-separated token is the prediction and its
    last token the score.
    """
    rows = []
    myid = ""
    expect_prediction = False
    with open(path, "r") as open_file:
        for line in open_file:
            line = line.strip()
            if not line:
                continue
            if re.search("SeqID:", line):
                myid = re.search(r"SeqID:\s+([\S]+)", line).group(1)
                continue
            if re.search("Final", line):
                expect_prediction = True
                continue
            if not expect_prediction:
                continue
            expect_prediction = False
            # The line is stripped and non-empty, so the first token is
            # never blank; it is always the prediction.
            info = re.sub(r"\s+", "\t", line).split("\t")
            mypredict = info[0]
            myscore = info[-1]
            if mypredict == myscore:
                myscore = 0  # single-token line: no score present
            if re.search("Unknown", mypredict):
                mypredict = "Unknown"
            myscore = re.sub(r"\s+", "", str(myscore))
            if re.search("[a-zA-Z]+", myscore):
                myscore = 0  # last token is text, not a number
            rows.append(myid + "\t" + mypredict + "\t" + str(myscore))
    return rows


def _write_psortb_locations(path, rows):
    """Write the ``name/type/score`` header followed by ``rows`` to ``path``."""
    with open(path, "w") as open_out:
        open_out.write("name\ttype\tscore\n")
        for item in rows:
            open_out.write(item + "\n")


def extract_psortb_info (extension, psortb_path):
    """Convert PSORTb text reports into ``*.location.tsv`` tables.

    Each file found by ``utilities.find_files(psortb_path, extension, None)``
    is treated as the gram-positive report. The gram-negative and archaea
    report names are derived from it by substituting the file-name stem. For
    every report that exists, a tab-separated table with the columns
    ``name``, ``type`` and ``score`` is written next to it; a missing report
    is logged and skipped.

    Args:
        extension: Extension handed to ``utilities.find_files``.
        psortb_path: Directory handed to ``utilities.find_files``.

    Returns:
        None. The function only writes files and logs.
    """
    filelist = utilities.find_files(psortb_path, extension, None)
    for myfile in filelist:
        # gram+
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File not exist: " + myfile)
        else:
            config.logger.info("OK!\t" + myfile)
            # NOTE: ".txt" is a regex here (the dot matches any character);
            # kept as-is so that output paths do not change.
            _write_psortb_locations(re.sub(".txt", ".location.tsv", myfile),
                                    _parse_psortb_report(myfile))

        # gram-
        myfile1 = re.sub("psortb.gram_positive.out.txt",
                         "psortb.gram_negative.out.txt", myfile)
        if not os.path.isfile(myfile1):
            config.logger.info("ERROR! File not exist: " + myfile1)
        else:
            config.logger.info("OK!\t" + myfile1)
            _write_psortb_locations(
                re.sub("gram_negative.out.txt",
                       "gram_negative.out.location.tsv", myfile1),
                _parse_psortb_report(myfile1))

        # archaea
        myfile1 = re.sub("psortb.gram_positive.out.txt",
                         "psortb.archaea.out.txt", myfile)
        if not os.path.isfile(myfile1):
            config.logger.info("ERROR! File not exist: " + myfile1)
            continue
        config.logger.info("OK!\t" + myfile1)
        _write_psortb_locations(re.sub(".txt", ".location.tsv", myfile1),
                                _parse_psortb_report(myfile1))

Exemplo n.º 12

0

Exibir arquivo

def collect_sequence(gene_path, extension, outfile):
    """Concatenate per-sample gene FASTA files, renaming the sequence IDs.

    For every file found by ``utilities.find_files(gene_path, extension,
    None)`` the sample name is the file's base name with ``"." + extension``
    removed, and the matching annotation is the same path with
    ``"." + extension`` replaced by ``.gff``. All sequences are written to
    ``outfile``; the GFF is used only to log sequences that are not
    annotated as ``CDS``.

    Header rewriting:
      * a header containing ``ID=<gene>;`` has its first token replaced by
        ``<sample>_<gene with "_" turned into "-">``;
      * any other header has the part of its first token before the first
        ``_`` replaced by the sample name.

    Args:
        gene_path: Directory handed to ``utilities.find_files``.
        extension: File extension without the leading dot. NOTE: it is
            interpreted as a regular expression when deriving the sample
            name and the GFF path.
        outfile: Path of the combined FASTA file to write.

    Returns:
        dict: sample name -> sample name (``ID=`` headers) or the original
        ID prefix that was replaced (other headers).
    """
    sampleid = {}
    filelist = utilities.find_files(gene_path, extension, None)
    with open(outfile, "w") as open_out:
        for myfile in filelist:
            sample = re.search(r"([^/]+)$", myfile).group(1)
            sample = re.sub("." + extension, "", sample)
            mygff = re.sub("." + extension, ".gff", myfile)

            # collect protein-coding IDs
            gffs = set()
            with open(mygff, "r") as open_gff:
                for line in open_gff:
                    line = line.strip()
                    if not line or line.startswith("#"):
                        continue
                    if line.startswith(">"):
                        break  # embedded FASTA section: annotations are over
                    info = line.split("\t")
                    if len(info) < 3:
                        continue  # not a feature row
                    if info[2] == "CDS":  # protein-coding genes
                        mym = re.search(r"^ID=([^;]+)", info[-1])
                        if mym:
                            gffs.add(mym.group(1))
                    else:
                        config.logger.info("Skip non-CDS\t" + info[2])

            # output sequences
            with open(myfile, "r") as open_file:
                for line in open_file:
                    line = line.strip()
                    if not line:
                        continue
                    if not line.startswith(">"):
                        open_out.write(line + "\n")
                        continue
                    # sequence id
                    if re.search("ID=", line):
                        mygene = re.search(r"ID=([^;]+)", line).group(1)
                        myid = re.search(r">([\S]+)", line).group(1)
                        myid_new = sample + "_" + mygene.replace("_", "-")
                        sampleid[sample] = sample
                        # Literal replacement: IDs may contain characters
                        # that are special in regular expressions.
                        line = line.replace(myid, myid_new)
                    else:
                        mygene = re.search(r">([\S]+)", line).group(1)
                        myid = re.search(r">([^_]+)", line).group(1)
                        sampleid[sample] = myid
                        line = line.replace(myid, sample)
                    open_out.write(line + "\n")
                    if mygene not in gffs:
                        config.logger.info(
                            "Skip non protein coding sequences: " + mygene +
                            "\t" + line)

    return sampleid

Exemplo n.º 13

0

Exibir arquivo

Arquivo: interproscan_protein.py Projeto: biobakery/metawibele

def extract_interproscan_info (extension, interproscan_path):
    """Split InterProScan TSV results into one table per analysis.

    For each file found by ``utilities.find_files(interproscan_path,
    extension, None)`` twenty sibling files are created by replacing
    ``.interproscan.txt`` in the path: SignalP, TMHMM and the two Phobius
    tables (columns ``SP``/``TM``, ``Prediction``, ``Start``, ``End``) plus
    one ``.interpro.<DB>.tsv`` table per member database (columns ``<DB>``,
    ``Description``, ``InterPro``, ``Evalue``). Every table is created with
    its header even if no row ends up in it. Rows whose status column is not
    ``T`` are dropped.

    Args:
        extension: Extension handed to ``utilities.find_files``.
        interproscan_path: Directory handed to ``utilities.find_files``.

    Returns:
        None. The function only writes files and logs.
    """
    # (key, output suffix, first header label) for the prediction tables.
    prediction_tables = (
        ("signalp", ".signalp.signaling.tsv", "SP"),
        ("tmhmm", ".tmhmm.transmembrane.tsv", "TM"),
        ("phobius_sp", ".phobius.signaling.tsv", "SP"),
        ("phobius_tm", ".phobius.transmembrane.tsv", "TM"),
    )
    # Analysis name (4th input column, also used as header label) -> suffix.
    domain_tables = {
        "Pfam": ".interpro.PfamDomain.tsv",
        "SUPERFAMILY": ".interpro.SUPERFAMILY.tsv",
        "ProSiteProfiles": ".interpro.PROSITEPROFILES.tsv",
        "Gene3D": ".interpro.Gene3D.tsv",
        "PANTHER": ".interpro.PANTHER.tsv",
        "TIGRFAM": ".interpro.TIGRFAM.tsv",
        "SFLD": ".interpro.SFLD.tsv",
        "ProDom": ".interpro.ProDom.tsv",
        "Hamap": ".interpro.Hamap.tsv",
        "SMART": ".interpro.SMART.tsv",
        "CDD": ".interpro.CDD.tsv",
        "ProSitePatterns": ".interpro.PROSITEPATTERNS.tsv",
        "PRINTS": ".interpro.PRINTS.tsv",
        "PIRSF": ".interpro.PIRSF.tsv",
        "MobiDBLite": ".interpro.MobiDBLite.tsv",
        "Coils": ".interpro.Coils.tsv",
    }

    filelist = utilities.find_files(interproscan_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File not exist: " + myfile)
            continue
        config.logger.info("OK!\t" + myfile)

        outs = {}
        try:
            for key, suffix, label in prediction_tables:
                outs[key] = open(
                    re.sub(".interproscan.txt", suffix, myfile), "w")
                outs[key].write(utilities.PROTEIN_ID + "\t" + label +
                                "\tPrediction\tStart\tEnd\n")
            for analysis, suffix in domain_tables.items():
                outs[analysis] = open(
                    re.sub(".interproscan.txt", suffix, myfile), "w")
                outs[analysis].write(utilities.PROTEIN_ID + "\t" + analysis +
                                     "\tDescription\tInterPro\tEvalue\n")

            with open(myfile, "r") as open_file:
                for line in open_file:
                    line = line.strip()
                    if not line or line.startswith("#"):
                        continue
                    info = line.split("\t")
                    if len(info) < 10:
                        config.logger.info(
                            "Skip malformed InterProScan line: " + line)
                        continue
                    myid = info[0]
                    mytype = info[3]
                    myacc = info[4]
                    mydec = info[5] or "NA"
                    start = info[6] or "NA"
                    end = info[7] or "NA"
                    myscore = info[8]
                    mystatus = info[9]
                    # The InterPro accession and description are optional
                    # trailing columns. strip() above can remove an empty
                    # description, so each one is checked on its own.
                    interproacc = (info[11] if len(info) > 11 else "") or "NA"
                    interprodec = (info[12] if len(info) > 12 else "") or "NA"
                    if interprodec != "NA":
                        mydec = interprodec
                    if mystatus != "T":  # not reliable prediction
                        continue

                    position_row = "\t" + start + "\t" + end + "\n"
                    if mytype in ("SignalP_GRAM_NEGATIVE",
                                  "SignalP_GRAM_POSITIVE"):
                        outs["signalp"].write(
                            myid + "\t" + mytype + "\t" + myacc + position_row)
                    elif mytype == "TMHMM":
                        outs["tmhmm"].write(
                            myid + "\t" + mytype + "\t" + myacc + position_row)
                    elif mytype == "Phobius":
                        if re.search("SIGNAL_PEPTIDE", myacc):
                            outs["phobius_sp"].write(
                                myid + "\t" + myacc + "\t" + mydec +
                                position_row)
                        if re.search("TRANSMEMBRANE", myacc):
                            outs["phobius_tm"].write(
                                myid + "\t" + myacc + "\t" + mydec +
                                position_row)
                    elif mytype in domain_tables:
                        outs[mytype].write(
                            myid + "\t" + myacc + "\t" + mydec + "\t" +
                            interproacc + "\t" + myscore + "\n")
        finally:
            # Close every table that was opened, even after an error.
            for handle in outs.values():
                handle.close()

Exemplo n.º 14

0

Exibir arquivo

def assembly(workflow, input_dir, extension, extension_paired, threads,
             output_folder, contigs):
    """Add MEGAHIT assembly tasks for every sample found in ``input_dir``.

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        input_dir: The directory path of fastq files.
        extension: The extension for all reads files, e.g. .fastq.gz
        extension_paired: Comma-separated extensions for paired reads,
            e.g. _R1.fastq.gz,_R2.fastq.gz. When empty, files are located
            with ``extension`` alone.
        threads (int): The number of threads/cores for each MEGAHIT task.
        output_folder (string): The path of the output folder.
        contigs: The summarized contig file (target of the final task).

    Requires:
        megahit v1.1.3: A program for assembling metagenomic sequencing reads
        fastq files

    Returns:
        list: Per-sample contig file paths,
        ``<output_folder>/<sample>/<sample>.contigs.fa``.

    Example:
        from anadama2 import Workflow

        # create an anadama2 workflow instance
        workflow = Workflow()

        # add assembly tasks
        mycontigs = assembly(workflow, input_dir, ".fastq.gz",
                             "_R1.fastq.gz,_R2.fastq.gz", threads,
                             assembly_dir, contigs)
        # run the workflow
        workflow.go()
    """

    config.logger.info("###### Start assembly module ######")

    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    # ================================================
    # collect sequences
    # ================================================
    pair_identifier = None
    pair_identifier2 = None
    if extension_paired:
        extension_paireds = extension_paired.split(",")
        pair_identifier = re.sub(extension, "", extension_paireds[0])
        pair_identifier2 = re.sub("1", "2", pair_identifier)
        sample_files = utilities.find_files(input_dir, extension_paireds[0],
                                            None)
        samples = utilities.sample_names(sample_files, extension_paireds[0],
                                         None)
    else:
        extension_paireds = [extension]
        sample_files = utilities.find_files(input_dir, extension, None)
        samples = utilities.sample_names(sample_files, extension, None)
    split_dir = input_dir
    assembly_dir = output_folder

    split_files = []
    contigs_list = []
    for sample in samples:
        mypair = "none"
        myorphan = "none"
        mypair_tmp = []
        for item in extension_paireds:
            if item == "none":
                continue
            myfile = os.path.join(split_dir, sample + item)
            if os.path.isfile(myfile):
                mypair_tmp.append(myfile)
            else:
                sys.exit("File not exist! " + myfile)
        if len(mypair_tmp) == 1:
            # split into paired reads files
            mypair_tmp = utilities.split_paired_reads(mypair_tmp[0], extension,
                                                      pair_identifier)
            if len(mypair_tmp) == 1:
                myorphan = mypair_tmp[0]
            elif len(mypair_tmp) == 2:
                mypair = ",".join(mypair_tmp)
            elif len(mypair_tmp) == 3:
                mypair = ",".join(mypair_tmp[0:2])
                myorphan = mypair_tmp[2]
        elif len(mypair_tmp) == 2:
            mypair = ",".join(mypair_tmp)
        elif len(mypair_tmp) == 3:
            # Files matching either pair identifier are paired reads; the
            # rest are orphans.
            tmp1 = []
            tmp2 = []
            for i in mypair_tmp:
                if re.search(pair_identifier, i) or re.search(
                        pair_identifier2, i):
                    tmp1.append(i)
                else:
                    tmp2.append(i)
            if tmp1:
                mypair = ",".join(tmp1)
            if tmp2:
                myorphan = ",".join(tmp2)
        split_files.append((sample, mypair, myorphan))
        contigs_list.append(
            os.path.join(assembly_dir, sample, '%s.contigs.fa' % sample))

    ## run MEGAHIT
    # os.makedirs instead of shelling out to "mkdir -p": works for paths
    # containing spaces and raises on failure instead of ignoring it.
    if assembly_dir:
        os.makedirs(assembly_dir, exist_ok=True)
    for (sample, mypair, myorphan) in split_files:
        seq_base = sample
        megahit_contig_dir = os.path.join(assembly_dir, seq_base)
        megahit_contig = os.path.join(megahit_contig_dir,
                                      '%s.contigs.fa' % seq_base)
        mylog = os.path.join(assembly_dir, '%s.log' % seq_base)

        paired = mypair.split(",") if mypair != "none" else []
        if len(paired) == 2 and myorphan != "none":
            # paired reads plus orphan reads
            command = "megahit -1 [depends[0]] -2 [depends[1]] -r [args[2]] -t [args[0]] -o [args[3]] --out-prefix [args[1]] >[args[4]] 2>&1"
            depends = [paired[0], paired[1], TrackedExecutable("megahit")]
            args = [threads, seq_base, myorphan, megahit_contig_dir, mylog]
        elif len(paired) == 2:
            # paired reads only
            command = "megahit -1 [depends[0]] -2 [depends[1]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1"
            depends = [paired[0], paired[1], TrackedExecutable("megahit")]
            args = [threads, seq_base, megahit_contig_dir, mylog]
        elif mypair != "none" or myorphan != "none":
            # single-end input: the whole "pair" string if it is not exactly
            # two files, otherwise the orphan reads
            reads = mypair if mypair != "none" else myorphan
            command = "megahit -r [depends[0]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1"
            depends = [reads, TrackedExecutable("megahit")]
            args = [threads, seq_base, megahit_contig_dir, mylog]
        else:
            # No usable reads: no task produces this sample's contig file.
            config.logger.info("No reads found for assembly: " + sample)
            continue

        # MEGAHIT refuses to write into an existing output directory, hence
        # the removal first.
        workflow.add_task_gridable(
            "rm -rf " + megahit_contig_dir + " && " + command,
            depends=depends,
            targets=[megahit_contig],
            args=args,
            cores=threads,
            mem=mem_equation,
            time=time_equation,
            name=sample + "__megahit")

    # link every per-sample contig file into the top of the output folder
    for myfile in contigs_list:
        myname = os.path.basename(myfile)
        myfile_new = os.path.join(assembly_dir, myname)
        workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                          depends=[myfile],
                          targets=[myfile_new],
                          cores=1,
                          name="ln__" + myname)

    ## combine contigs sequences
    mylog = contigs + ".log"
    workflow.add_task(
        "metawibele_format_contig_sequences -p [args[0]] -e contigs.fa -o [targets[0]] > [args[1]] 2>&1",
        depends=utilities.add_to_list(
            contigs_list,
            TrackedExecutable("metawibele_format_contig_sequences")),
        targets=[contigs],
        args=[assembly_dir, mylog],
        cores=1,
        name="format_contig_table")

    return contigs_list

Exemplo n.º 15

0

Exibir arquivo

Arquivo: interproscan_signalp_protein_family.py Projeto: biobakery/metawibele

def collect_signaling_info(cluster_mem, extension, ann_path,
                           outfile):  # split.list
    """Collect SignalP annotations for the proteins listed in ``cluster_mem``.

    Every file returned by ``utilities.find_files(ann_path, extension, None)``
    is read as a tab-separated table.  A row starting with
    ``utilities.PROTEIN_ID`` is treated as the header: its column names are
    used to locate the ``utilities.PROTEIN_ID``, ``Prediction`` and ``SP``
    columns of the rows that follow.  Rows whose protein ID is not in
    ``cluster_mem`` are ignored.

    A per-ORF summary is written to the path obtained from ``outfile`` by
    substituting ``_proteinfamilies.`` with ``_proteinfamilies.ORF.``.

    Args:
        cluster_mem: Container of protein IDs to keep (only tested with
            ``in``).
        extension: File extension handed to ``utilities.find_files``.
        ann_path: Directory handed to ``utilities.find_files``.
        outfile: Path from which the per-ORF output file name is derived.

    Returns:
        tuple: ``(gram_p, gram_n, signals, signals_p, signals_n)`` where

        * ``gram_p`` / ``gram_n`` map a sample name (the protein ID with a
          trailing ``_<digits>`` removed) to the number of rows whose ``SP``
          value is ``SignalP_GRAM_POSITIVE`` / ``SignalP_GRAM_NEGATIVE``;
        * ``signals`` maps a protein ID to a dict whose keys are the
          ``Prediction`` values seen for it;
        * ``signals_p`` / ``signals_n`` map a protein ID to its (last seen)
          ``Prediction`` value for the Gram-positive / Gram-negative type.

    Raises:
        KeyError: If a data row is met before a header row, or the header
            lacks one of the required columns.
    """
    gram_p = {}
    gram_n = {}
    signals = {}
    signals_n = {}
    signals_p = {}
    # Raw string: "\d" in a normal string literal is an invalid escape.
    sample_suffix = re.compile(r"_[\d]+$")

    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File not exist: " + myfile)
            continue
        titles = {}
        # The context manager closes the file even if a row fails to parse.
        with open(myfile, "r") as open_file:
            for line in open_file:
                line = line.strip()
                if not line:
                    continue
                info = line.split("\t")
                if re.search("^" + utilities.PROTEIN_ID, line):
                    # Header row: remember the position of each column name.
                    for item in info:
                        titles[item] = info.index(item)
                    continue
                protein_id = info[titles[utilities.PROTEIN_ID]]
                if protein_id not in cluster_mem:
                    continue
                prediction = info[titles["Prediction"]]
                signals.setdefault(protein_id, {})[prediction] = ""
                mytype = info[titles["SP"]]
                sample = sample_suffix.sub("", protein_id)
                if mytype == "SignalP_GRAM_POSITIVE":
                    signals_p[protein_id] = prediction
                    gram_p[sample] = gram_p.get(sample, 0) + 1
                if mytype == "SignalP_GRAM_NEGATIVE":
                    signals_n[protein_id] = prediction
                    gram_n[sample] = gram_n.get(sample, 0) + 1
        # foreach line
    # foreach sample

    # output details for each ORF
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.", outfile)
    with open(outfile1, "w") as open_out:
        open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\n")
        for myid in sorted(signals):
            myinfo = ";".join(signals[myid])
            open_out.write(myid + "\tSignalP_signaling\tSignalP_signaling\t" +
                           myinfo + "\n")
        # foreach seqID

    return gram_p, gram_n, signals, signals_p, signals_n

Exemplo n.º 16

0

Exibir arquivo

def gene_catalog(workflow, complete_gene, complete_protein, input_dir,
                 extension, extension_paired, threads, prefix_gene_catalog,
                 gene_catalog, gene_catalog_nuc, gene_catalog_prot,
                 mapping_dir, gene_catalog_saf, gene_catalog_count):
    """
    This set of tasks will build gene catalogs and quantify them per sample.

    Tasks are added to the workflow in this order: cd-hit-est clustering,
    cluster extraction, non-redundant protein extraction, reference indexing,
    one gene-abundance task per sample, one symlink task per sample result,
    and a final task that collects all abundances.

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        complete_gene: The fasta file of gene nucleotide sequences for complete ORFs.
        complete_protein: The fasta file of protein sequences for complete ORFs.
        input_dir: The directory searched for the per-sample sequence files.
        extension: Comma-separated extension(s) of the sequence files; used
            when extension_paired is empty. The literal string "none" means
            no extension is available.
        extension_paired: Comma-separated extensions of paired sequence
            files; when set it takes precedence over extension and its first
            entry is used to discover the samples.
        threads: Cores requested for cd-hit-est, also passed as "-t" to
            metawibele_gene_abundance.
        prefix_gene_catalog: The prefix of gene catalog file.
        gene_catalog: The gene catalog file.
        gene_catalog_nuc: The file of nucleotide sequences for gene catalogs
            (target of cd-hit-est).
        gene_catalog_prot: The file of protein sequences for gene catalogs.
        mapping_dir: The directory path of mapping results.
        gene_catalog_saf: The SAF gtf file for gene catalogs.
        gene_catalog_count: The count file for gene catalogs.

    Requires:
        cd-hit-est and the metawibele_* executables tracked below
        bowtie2 (tested with 2.3.2)
        samtools (tested with 1.5)
        featureCounts (tested with Version 1.6.2)
        the nucleotide and amino acid sequences for gene catalogs
        sequence files for each sample

    Returns:
        tuple: (gene_catalog, gene_catalog_count), the two file names that
        were passed in.

    Raises:
        OSError: If a per-sample mapping directory cannot be created.

    Example:
        from anadama2 import Workflow

        # create an anadama2 workflow instance
        workflow=Workflow()

        # add the gene catalog tasks
        mygene_catalog, mycounts = gene_catalog(
            workflow, complete_gene, complete_protein, input_dir,
            extension, extension_paired, threads, prefix_gene_catalog,
            gene_catalog, gene_catalog_nuc, gene_catalog_prot,
            mapping_dir, gene_catalog_saf, gene_catalog_count)

        # run the workflow
        workflow.go()
    """

    config.logger.info("###### Start gene_catalog module ######")

    # NOTE(review): these two values are read but not passed to any task in
    # this function -- confirm whether the tasks were meant to be gridable.
    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    ### run gene-catalog workflow
    mylog = gene_catalog_nuc + ".log"
    myclust = gene_catalog_nuc + ".clstr"
    workflow.add_task(
        'cd-hit-est -i [depends[0]] [args[0]] -o [targets[0]] >[args[1]] 2>&1 ',
        depends=[complete_gene, TrackedExecutable("cd-hit-est")],
        targets=[gene_catalog_nuc, myclust],
        args=[config.cd_hit_gene_opts, mylog],
        cores=threads,
        name="cd-hit-est")

    mylog = gene_catalog + ".log"
    workflow.add_task(
        'metawibele_extract_cluster -c [depends[0]] -o [targets[0]] >[args[0]] 2>&1 ',
        depends=[myclust,
                 TrackedExecutable("metawibele_extract_cluster")],
        targets=[gene_catalog],
        args=[mylog],
        cores=1,
        name="extract_cluster_CD-hit")

    mylog = gene_catalog_prot + ".log"
    workflow.add_task(
        'metawibele_extract_non_redundance_seq -r [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1 ',
        depends=[
            gene_catalog_nuc, complete_protein,
            TrackedExecutable("metawibele_extract_non_redundance_seq")
        ],
        targets=[gene_catalog_prot],
        args=[mylog],
        cores=1,
        name="extract_non_redundance_seq")

    ### get the abundance of gene catalog
    # run gene-abundance workflow
    mylog = gene_catalog_saf + ".log"
    workflow.add_task(
        'metawibele_gene_abundance_indexRef -r [depends[0]] -t gene -b [args[0]] -o [targets[0]] >[args[1]] 2>&1 ',
        depends=[
            gene_catalog_nuc,
            TrackedExecutable("metawibele_gene_abundance_indexRef")
        ],
        targets=[gene_catalog_saf],
        args=[prefix_gene_catalog, mylog],
        cores=1,
        name="gene_abundance_indexRef")

    ## collect sequences
    # seq_extensions is the list of suffixes appended to each sample name to
    # form its sequence file path(s); it is the same for every sample, so it
    # is computed once here.
    if extension_paired:
        seq_extensions = extension_paired.split(",")
        sample_files = utilities.find_files(input_dir, seq_extensions[0],
                                            None)
        samples = utilities.sample_names(sample_files, seq_extensions[0],
                                         None)
    else:
        sample_files = utilities.find_files(input_dir, extension, None)
        samples = utilities.sample_names(sample_files, extension, None)
        # With extension == "none" there is nothing to append; the sequence
        # file then stays "NA" instead of hitting an unbound variable.
        seq_extensions = [] if extension == "none" else extension.split(",")

    ## bowtie2 will map reads to gene categories
    flt_seqs = []
    for sample in samples:
        seq_paths = [
            os.path.join(input_dir, sample + item) for item in seq_extensions
        ]
        seq_file = ",".join(seq_paths) if seq_paths else "NA"
        flt_seqs.append((sample, seq_file))
    # foreach sample

    ## Now run bowtie2 to map reads to gene categories
    mappings = []
    mappings_tmp = []
    for (sample, seq_file) in flt_seqs:
        seq_base = sample
        mydir = os.path.join(mapping_dir, sample)
        # Create the directory without going through a shell so that paths
        # containing spaces or shell metacharacters are handled correctly.
        os.makedirs(mydir, exist_ok=True)
        sample_counts = os.path.join(mydir, seq_base + ".sort.bed")
        stdout_log = os.path.join(mydir, '%s.mapping.stdout.log' % seq_base)
        mappings_tmp.append(sample_counts)

        workflow.add_task(
            'metawibele_gene_abundance -r [depends[0]] -u [args[0]] -t [args[1]] -s [args[2]] -w [args[3]] '
            '> [args[4]] 2>&1 ',
            depends=[
                gene_catalog_nuc, gene_catalog_saf,
                TrackedExecutable("metawibele_gene_abundance")
            ],
            targets=[sample_counts],
            args=[seq_file, threads, seq_base, mydir, stdout_log],
            cores=1,
            name=sample + "__gene_abundance")

    # Link every per-sample result into mapping_dir itself; these links are
    # the dependencies of the final collection task.
    for myfile in mappings_tmp:
        myname = os.path.basename(myfile)
        myfile_new = os.path.join(mapping_dir, myname)
        mappings.append(myfile_new)
        workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                          depends=[myfile],
                          targets=[myfile_new],
                          cores=1,
                          name="ln__" + myname)

    # collect abundance
    mylog = gene_catalog_count + ".log"
    workflow.add_task(
        'metawibele_gene_catalog_abundance -p [args[0]] -s sort.bed -c [args[1]] -o [targets[0]] >[args[2]] 2>&1 ',
        depends=utilities.add_to_list(
            mappings, TrackedExecutable("metawibele_gene_catalog_abundance")),
        targets=[gene_catalog_count],
        args=[mapping_dir, gene_catalog, mylog],
        cores=1,
        name="gene_catalog_abundance")

    return gene_catalog, gene_catalog_count

Exemplo n.º 17

0

Exibir arquivo

Arquivo: interproscan_phobius_protein_family.py Projeto: biobakery/metawibele

def _load_phobius_table(myfile, cluster_mem, per_sample, per_protein):
    """Read one Phobius TSV file into ``per_sample`` and ``per_protein``."""
    if not os.path.isfile(myfile):
        config.logger.info("ERROR! File not exist: " + myfile)
        return
    # The context manager guarantees the handle is closed for both input
    # files, including when a malformed row raises.
    with open(myfile, "r") as open_file:
        for line in open_file:
            line = line.strip()
            if not line:
                continue
            # Skip the header row.
            if re.search("^" + utilities.PROTEIN_ID, line):
                continue
            info = line.split("\t")
            myid = info[0]
            if myid not in cluster_mem:
                continue
            # Sample name = protein ID without its trailing "_<digits>".
            sample = re.sub(r"_[\d]+$", "", myid)
            per_sample.setdefault(sample, {})[myid] = info[2]
            per_protein[myid] = info[2]
    # foreach line


def collect_phobius_info(cluster_mem, extension, ann_path,
                         outfile):  # list.txt
    """Collect Phobius signaling and transmembrane annotations.

    For every file returned by
    ``utilities.find_files(ann_path, extension, None)`` the first column
    (protein ID) and third column of each non-header row are read as the
    signaling annotation.  The matching transmembrane file is located by
    substituting ``extension`` (used as a regular expression) with
    ``phobius.transmembrane.tsv`` in the file path and is read the same way.
    Rows whose protein ID is not in ``cluster_mem`` are ignored.

    Two per-ORF tables are written, with ``_proteinfamilies.`` in ``outfile``
    replaced by ``_proteinfamilies.ORF.signaling.`` and
    ``_proteinfamilies.ORF.transmembrane.`` respectively.

    Args:
        cluster_mem: Container of protein IDs to keep (only tested with
            ``in``).
        extension: File extension handed to ``utilities.find_files``; also
            the pattern replaced to derive the transmembrane file name.
        ann_path: Directory handed to ``utilities.find_files``.
        outfile: Path from which the two output file names are derived.

    Returns:
        tuple: ``(signal, transmem)``; each maps a sample name (protein ID
        with a trailing ``_<digits>`` removed) to a dict of
        ``{protein ID: third-column value}``.
    """
    transmem = {}
    signal = {}
    detail_signal = {}
    detail_trans = {}
    filelist = utilities.find_files(ann_path, extension, None)
    for myfile in filelist:
        _load_phobius_table(myfile, cluster_mem, signal, detail_signal)
        myfile = re.sub(extension, "phobius.transmembrane.tsv", myfile)
        _load_phobius_table(myfile, cluster_mem, transmem, detail_trans)
    # foreach samplelist

    # output details
    outfile1 = re.sub("_proteinfamilies.", "_proteinfamilies.ORF.signaling.",
                      outfile)
    with open(outfile1, "w") as open_out:
        open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\n")
        for myid in sorted(detail_signal):
            myinfo = detail_signal[myid]
            open_out.write(myid + "\tPhobius_signaling\tPhobius_signaling\t" +
                           myinfo + "\n")
        # foreach seqID
    outfile1 = re.sub("_proteinfamilies.",
                      "_proteinfamilies.ORF.transmembrane.", outfile)
    with open(outfile1, "w") as open_out:
        open_out.write(utilities.PROTEIN_ID + "\ttype\tdetail\tdescription\n")
        for myid in sorted(detail_trans):
            myinfo = detail_trans[myid]
            open_out.write(myid +
                           "\tPhobius_transmembrane\tPhobius_transmembrane\t" +
                           myinfo + "\n")
        # foreach seqID

    return signal, transmem