Exemplo n.º 1
0
]
allowed_modes = ["query", "database", "both"]

# Validate the parsed command-line options. The checks run in a fixed order
# and the first one that fails aborts the script with a ValueError.
if args.format not in allowed_formats:
    raise ValueError("This format of input file is not allowed")

# Exactly one of the two id files (white list or black list) must be given.
has_white_list = args.white_list_id_file is not None
has_black_list = args.black_list_id_file is not None

if not (has_white_list or has_black_list):
    raise ValueError(
        "Both files with ids from black and white list were not set")

if has_white_list and has_black_list:
    raise ValueError("Both files with ids from black and white list were set")

if args.mode not in allowed_modes:
    raise ValueError("This filtering mode is not allowed")

if args.white_list_id_file:
    white_list = read_ids(args.white_list_id_file)
    if args.mode == "query":

        def iterator(blast_dict):
            for entry in blast_dict:
                if entry in white_list:
                    yield blast_dict[entry]
    elif args.mode == "database":

        def iterator(blast_dict):
            for entry in blast_dict:
                entry_hits = []
                for hit in blast_dict[entry].hits:
                    if hit.id in white_list:
                        # filter hits
                        entry_hits.append(hit)
Exemplo n.º 2
0
5 	internal and leaf branches + leaf names
6 	internal branches + leaf names
7 	leaf branches + all names
8 	all names
9 	leaf names
100 	topology only""")

# -d/--id_file: path to the file listing the ids of the nodes to remove.
# NOTE(review): the option is not marked as required and has no default, so
# args.id_file is None when it is omitted -- confirm read_ids() copes with
# that, or make the option required.
parser.add_argument("-d",
                    "--id_file",
                    action="store",
                    dest="id_file",
                    help="File with ids of nodes to remove")

args = parser.parse_args()

# Load the ids from the file given with -d/--id_file.
id_list = read_ids(args.id_file)

print("Nodes with ids present in %s file will be removed" % args.id_file)

tree_index = 1
with open(args.input_tree_file, "r") as in_fd:
    with open(args.output_tree_file, "w") as out_fd:
        for line in in_fd:
            tree_line = line.strip()
            tree = Tree(tree_line, format=args.input_tree_format)
            print("Totaly %i leaves in tree %i" % (len(tree), tree_index))
            #print(tree.write())
            for node in tree.traverse():
                #if node.is_leaf():
                #    print node.features
                # node.name
Exemplo n.º 3
0
                "dog",
                "human",
                "mouse",
                "opossum"]

# NOTE(review): every path below is an absolute, machine-specific location.
sys.path.append("/Users/mahajrod/genetics/MAVR/")

workdir = "/Users/mahajrod/genetics/Projects/Dobrzhansky/project/gene_families/cafe_run_no_selenocystein/"
data_dir = "/Users/mahajrod/genetics/data/"
treefam_output = "%sgene_families_of_eightSpeices" % workdir

output_cafe = "cafe_input.cafe"

# Map each species to the ids read from
# <data_dir>/<species>/<species>_selenocystein_proteins_id.t
selenocystein_ids_dict = {}
for species in species_list:
    selenocystein_ids_dict[species] = read_ids(
        "%s%s/%s_selenocystein_proteins_id.t" % (data_dir, species, species))

# From here on, relative file names are resolved inside workdir.
os.chdir(workdir)

# One initially empty set per species, kept in species_list order.
selenocystein_families_dict = OrderedDict()
for species in species_list:
    selenocystein_families_dict[species] = set()

# Output files created in workdir. No header row is written to counts.t here
# (the former header write was left disabled).
# NOTE(review): both handles are opened without a context manager -- confirm
# they are closed further down in the script.
counts_fd = open("counts.t", "w")
full_selenocystein_fd = open("full_selonocystein_families", "w")

with open(treefam_output, "r") as in_fd:
    with open(output_cafe, "w") as out_fd:
        header = "FAMILYDESC\tFAMILY\t%s\n" % "\t".join(species_list)
        out_fd.write(header)
        for line in in_fd:
Exemplo n.º 4
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import os
from RouToolPa.Tools.GATK import FastaAlternateReferenceMaker
from RouToolPa.Routines.File import read_ids

# Reference data: directory, reference FASTA and the GFF3 annotation file.
# NOTE(review): all paths in this section are absolute and machine-specific.
ref_dir = "/home/mahajrod/Genetics/Projects/desaminases/data/LAN210_v0.10m/"
reference = ref_dir + "LAN210_v0.10m.fasta"
ref_annotations = ref_dir + "annotations/merged_annotations_Nagalakshmi_tranf_to_LAN210_v0.10m.gff3"
gene_ids_file = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/polymorphisms/gene.ids"
# Directory of the GATK installation handed to FastaAlternateReferenceMaker.
gatk_dir = "/home/mahajrod/Repositories/genetic/NGS_tools/GenomeAnalysisTK-3.2-0/"

# Directory that holds one sub-directory per sample set (see the list below).
work_dir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/"

# NOTE(review): gene_ids (like ref_annotations) is not used anywhere in the
# visible part of this script -- confirm whether it is still needed.
gene_ids = read_ids(gene_ids_file)

# Names of the sample-set sub-directories of work_dir that get processed.
sample_set_names_list = [
    "PmCDA1_3d", "HAP", "PmCDA1_sub1_3d", "PmCDA1_6d", "HAP_sub1",
    "PmCDA1_sub1_6d", "A1_3d", "A1_6d", "A3G_3d", "AID_3d", "AID_6d"
]

# For every sample set, build one alternate-reference FASTA per VCF file found
# in <work_dir>/<sample>/per_sample_vcf/ and write it to
# <work_dir>/<sample>/genome/<vcf base name>.fasta.
for sample in sample_set_names_list:
    # The relative paths used below are resolved against the sample directory.
    os.chdir(os.path.join(work_dir, sample))

    # Create the output directory directly instead of spawning a shell.
    if not os.path.isdir("genome"):
        os.makedirs("genome")

    for vcf_file in os.listdir("per_sample_vcf/"):
        base_name, extension = os.path.splitext(vcf_file)
        # Only real VCF files are processed. Anything else in the directory
        # (e.g. an index file written next to a VCF) used to be passed on as a
        # variant file, with an output name mangled by a blind [:-4] slice.
        if extension.lower() != ".vcf":
            print("Skipping %s: not a .vcf file" % vcf_file)
            continue
        FastaAlternateReferenceMaker.correct_reference(
            gatk_dir, reference, "genome/%s.fasta" % base_name,
            "per_sample_vcf/%s" % vcf_file)