def main(argv):
    # parse arguments
    args = vars(parser.parse_args())

    ## read input mapping file (.mapping.txt) to create read2origin map
    # contig2origin: dictionary mapping input sequences to their original header, e.g. >[Header]
    contig2origin = create_contig2origin(args["mapping_file"])

    ## create preferred mapping
    # hash map from given taxonomy to the preferred one used by MEGAN
    ncbi_megan_map = {}
    with open(args["ncbi_megan_map"], 'r') as meganfile:
        for line in meganfile:
            fields = [f.strip() for f in line.split("\t")]
            ncbi_megan_map[fields[0]] = fields[1]

    # contig_to_taxa data structure
    contig_to_taxa = {}

    # Read blast table
    with open(args["parsed_blast"], "r") as fh:
        for l in fh:
            if l.startswith("#"):
                continue  # skip header/comment lines
            fields = clean_tab_lines(l)
            contig_hits = contig_pattern.search(fields[0])
            if not contig_hits:
                continue
            contig = contig_hits.group(1)
            orf = contig_hits.group(2)
            # add to data structure if it doesn't exist
            if contig not in contig_to_taxa:
                contig_to_taxa[contig] = {}
            if orf not in contig_to_taxa[contig]:
                contig_to_taxa[contig][orf] = []
            # pull taxonomy out of annotation
            taxa_hits = taxonomy_pattern.search(fields[9])
            if taxa_hits:
                taxa = taxa_hits.group(1)
                bitscore = float(fields[3])
                contig_to_taxa[contig][orf].append((taxa, bitscore))

    ## Load contig references (if available or applicable)
    # read contig taxa reference if available
    contig_to_taxa_ref = None
    if args["contig_taxa_ref"]:
        contig_to_taxa_ref = {}
        with open(args["contig_taxa_ref"], "r") as fh:
            for l in fh:
                fields = clean_tab_lines(l)
                contig_id = fields[0]
                contig_origin = fields[1]
                contig_to_taxa_ref[contig_id] = contig_origin

    # all contigs hypothetically have the same reference origin (i.e., single cells)
    sample_ref = None
    if args["sample_taxa_ref"]:
        sample_ref = args["sample_taxa_ref"]

    ## Build the LCA Star NCBI Tree
    lcastar = LCAStar(args["ncbi_tree"])
    lcastar.setLCAStarParameters(min_depth=1, alpha=args["alpha"], min_reads=1)

    ## Calculate LCA for each ORF
    contig_to_lca = {}
    for contig in contig_to_taxa:
        for orf in contig_to_taxa[contig]:
            if contig not in contig_to_lca:
                contig_to_lca[contig] = {}
            if orf not in contig_to_lca[contig]:
                contig_to_lca[contig][orf] = None
            contig_taxas = contig_to_taxa[contig][orf]
            if len(contig_taxas) == 0:
                contig_to_lca[contig][orf] = "root"
            else:
                if args['orf_summary'] == 'besthit':
                    # highest bitscore wins
                    contig_taxas.sort(key=operator.itemgetter(1), reverse=True)
                    best_blast_taxa = contig_taxas[0][0]
                    contig_to_lca[contig][orf] = best_blast_taxa
                elif args['orf_summary'] == 'orf_majority':
                    majority_list = []
                    for t in contig_taxas:
                        majority_list.append(t[0])
                    # TODO: update to check for alternative taxonomy names
                    contig_to_lca[contig][orf] = lcastar.simple_majority(majority_list)
                else:
                    # create a list of lists for LCA calculation
                    lca_list = []
                    for t in contig_taxas:
                        lca_list.append([t[0]])
                    contig_to_lca[contig][orf] = lcastar.getTaxonomy(lca_list)

    ## calculate taxonomy statistics (LCA^2, Majority, and LCA*) for each ORF
    writeout(args, contig_to_lca, contig_to_taxa_ref, sample_ref, lcastar, ncbi_megan_map)
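# NOTE: speculative sketch, not part of the original source. The main() above
# relies on module-level names defined elsewhere in the script (the argument
# `parser`, the regexes `contig_pattern` and `taxonomy_pattern`, and helpers
# such as `clean_tab_lines`, `create_contig2origin`, and `writeout`). The
# definitions below only illustrate the shapes those names are assumed to have,
# based on how main() uses them; the actual patterns and field layouts may differ.
import re

# Assumed: ORF IDs look like "<contig>_<orf_index>", e.g. "my_contig_7_2".
contig_pattern = re.compile(r"^(.+)_(\d+)$")

# Assumed: the annotation field carries the source organism in square
# brackets, e.g. "... hypothetical protein [Escherichia coli]".
taxonomy_pattern = re.compile(r"\[(.+?)\]")

def clean_tab_lines(line):
    """Split a tab-delimited line into whitespace-stripped fields."""
    return [field.strip() for field in line.split("\t")]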
def main(argv):
    args = vars(parser.parse_args())

    # read the input mapping file to build the read2origin map
    fh = open(args["input_mapping"], "r")
    lines = fh.readlines()
    fh.close()

    # create preferred MEGAN mapping
    ncbi_megan_map = {}
    with open(args["ncbi_megan_map"], 'r') as meganfile:
        for line in meganfile:
            fields = [f.strip() for f in line.split("\t")]
            ncbi_megan_map[fields[0]] = fields[1]

    # read2origin: hash mapping input sequences to their original header
    read2origin = {}
    for l in lines:
        fields = [f.strip() for f in l.split("\t")]
        read_name = fields[0]
        # fields[1] is a GI-style header (gi|<gi>|ref|<accession>|);
        # take the accession and drop its trailing version number
        gi_fields = fields[1].split("|")
        ncbi_id = re.sub(r"\.[0-9]+$", "", gi_fields[3])
        read2origin[read_name] = ncbi_id

    # parse the NCBI summary table
    fh = open(args["sum_table"], "r")
    lines = fh.readlines()
    fh.close()
    ncbiID_to_taxaID = {}  # ncbiID to NCBI Tree taxaID
    taxaID_to_taxa = {}    # NCBI Tree taxaID to full taxa name
    for l in lines:
        fields = [f.strip() for f in l.split("\t")]
        ncbi_id = re.sub(r"\.[0-9]+$", "", fields[0])
        tax_id = fields[3]
        tax = fields[5]
        if ncbi_id not in ncbiID_to_taxaID:
            ncbiID_to_taxaID[ncbi_id] = tax_id
        if tax_id not in taxaID_to_taxa:
            taxaID_to_taxa[tax_id] = tax

    # read functional and taxonomic table
    fh = open(args["ft_table"], "r")
    header = fh.readline()
    header = header.split("\t")  # list of headers
    lines = fh.readlines()
    fh.close()

    contig_to_orfs = {}  # list of ORFs for a specific contig
    orfs_to_lca = {}     # LCA taxonomy for each ORF
    for l in lines:
        fields = l.split("\t")
        # ORF_ID ORF_length start end Contig_Name Contig_length strand ec taxonomy product
        orf_id = fields[0]
        contig = fields[4]
        lca = fields[8]
        if contig not in contig_to_orfs:
            contig_to_orfs[contig] = []
        contig_to_orfs[contig].append(orf_id)
        if orf_id not in orfs_to_lca:
            orfs_to_lca[orf_id] = lca
        # get original taxonomy
        # print taxaID_to_taxa[ncbiID_to_taxaID[read2origin[contig]]]

    # Build the LCA Star NCBI Tree
    print "Loading LCAStar:"
    lcastar = LCAStar(args["ncbi_tree"])
    print "Done."
    # set LCAStar parameters
    lcastar.setLCAStarParameters(min_depth=1, alpha=0.5, min_reads=1)

    # small helper to translate a list of ORF IDs into their LCA taxa
    def get_orfs_taxa(orfs):
        taxa = []
        for o in orfs:
            taxa.append(orfs_to_lca[o])
        return taxa

    output = open(args["output"], "w")
    header = "\t".join(["contig", "real", "taxa", "method", "dist", "wtd", "real_lineage"])
    output.write(header + "\n")

    for c in contig_to_orfs:
        # the read taxa
        contig = c
        real = taxaID_to_taxa[ncbiID_to_taxaID[read2origin[c]]]
        taxa_list = get_orfs_taxa(contig_to_orfs[c])
        lca_list = []
        for t in taxa_list:
            lca_list.append([t])

        # preferred-name lineage of the true (reference) taxon
        real_lineage = lcastar.get_lineage(lcastar.get_a_Valid_ID([real]))
        lineage = []
        for taxa_id in real_lineage:
            lineage.append(translate_to_prefered_name(taxa_id, ncbi_megan_map, lcastar))
        real_lineage = lineage

        # LCA* prediction
        taxon = lcastar.lca_star(taxa_list)
        line = "\t".join([contig, real,
                          translate_to_prefered_name(lcastar.get_a_Valid_ID([taxon]), ncbi_megan_map, lcastar),
                          "LCA_Star",
                          str(lcastar.get_distance(taxon, real)),
                          str(lcastar.wtd_distance(real, taxon)),
                          ";".join(real_lineage[::-1])])
        output.write(line + "\n")

        # Majority prediction
        taxon = lcastar.lca_majority(taxa_list)
        line = "\t".join([contig, real,
                          translate_to_prefered_name(lcastar.get_a_Valid_ID([taxon]), ncbi_megan_map, lcastar),
                          "Majority",
                          str(lcastar.get_distance(taxon, real)),
                          str(lcastar.wtd_distance(real, taxon)),
                          ";".join(real_lineage[::-1])])
        output.write(line + "\n")

        # LCA^2 prediction
        taxon = lcastar.getTaxonomy(lca_list)
        line = "\t".join([contig, real,
                          translate_to_prefered_name(lcastar.get_a_Valid_ID([taxon]), ncbi_megan_map, lcastar),
                          "LCA_Squared",
                          str(lcastar.get_distance(taxon, real)),
                          str(lcastar.wtd_distance(real, taxon)),
                          ";".join(real_lineage[::-1])])
        output.write(line + "\n")

    output.close()
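# NOTE: speculative sketch, not the original helper. Both main() drivers call
# translate_to_prefered_name(<NCBI taxonomy ID>, ncbi_megan_map, lcastar), but
# its definition is not shown here. The version below only illustrates the
# assumed contract: given a taxonomy ID, return the MEGAN-preferred display
# name when the map provides one, otherwise fall back to the ID. The real
# helper may first resolve the ID to its NCBI scientific name through the
# `lcastar` tree; that step is omitted because the relevant LCAStar method is
# not shown in this file.
def translate_to_prefered_name(taxa_id, ncbi_megan_map, lcastar):
    # `lcastar` is accepted for signature compatibility; the real helper
    # likely uses it to look names up in the NCBI tree.
    key = str(taxa_id)
    if key in ncbi_megan_map:
        return ncbi_megan_map[key]
    # no preferred name known; return the ID unchanged
    return key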