def main():
    """Extract the IDs of reads that have no alignments.

    Loads the taxonomy tree and the alignment file named on the command
    line, assigns tax IDs to every alignment, writes the ID of each read
    lacking alignments to ``args.read_ids_out`` (one per line), and prints
    summary statistics.
    """
    # ---- Input arguments ----
    argparser = ArgParser()
    args = argparser.parse_args()

    # ---- Database access (tax ID resolution for alignments) ----
    dataAccess = DataAccess(args)

    # ---- Taxonomy tree ----
    print('1. Loading tax tree...')
    start = time.time()
    tax_tree = TaxTree()
    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ---- Alignment file ----
    print('2. Loading alignment file...')
    start = time.time()
    read_container = ReadContainer()
    read_container.load_alignment_data(args.alignment_file)
    # Set tax IDs for all alignments.
    read_container.set_taxids(dataAccess)
    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ---- Collect reads with no alignments ----
    no_aln_count = 0
    # `with` guarantees the output file is closed even if iteration raises
    # (the original left the handle open on error).
    with open(args.read_ids_out, 'w') as out_file:
        for read in read_container.fetch_all_reads(format=list):
            if not read.has_alignments():
                out_file.write("{0}\n".format(read.id))
                no_aln_count += 1

    total_read_count = read_container.get_read_count()
    print("total number of reads : {0}".format(total_read_count))
    print("reads without alignments: {0}".format(no_aln_count))
    print("")
    if total_read_count:
        print("no aln percentage: {0:.2f}%".format(
            no_aln_count * 100 / float(total_read_count)))
    else:
        # Guard against ZeroDivisionError on an empty alignment file.
        print("no aln percentage: n/a (no reads)")
def main():
    '''
    Script to run binner in one of the most common usage scenarios.
    * load alignment data
    * load taxonomy data
    * do basic alignment data filtering (remove host reads ecc)
    '''
    # ----------------------------------#
    # ------ INPUT ARGUMENTS -----------#
    argparser = TestRunArgParser()
    args = argparser.parse_args()
    # ----------------------------------#
    # ------- STATIC DATA SOURCE -------#
    # CDS - GI2TAXID -- NAMES -- NODES #
    dataAccess = DataAccess(args)
    # raw_input('Data access created')
    # ----------------------------------#
    # -------- TAXONOMY TREE -----------#
    print '1. Loading tax tree...'
    tax_tree = TaxTree()
    # tax_tree.load_taxonomy_data(dataAccess)
    print 'done.'
    # ----------------------------------#
    # ------- ALIGNMENT DATA SOURCE ----#
    print '2. Loading alignment file...'
    read_container = ReadContainer()
    read_container.load_alignment_data(args.input)
    # ---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)
    # Remember total number of reads BEFORE any filtering; used below for
    # the host-read count and XML percentages.
    total_read_num = read_container.get_read_count()
    print 'done'
    # ------- FILTER HOST READS -------#
    print '3. Filtering host reads & alignments...'
    # First pass: drop reads whose alignments are predominantly host.
    # NOTE(review): the commented-out keyword arguments suggest the helper's
    # signature changed at some point — confirm the remaining positional
    # argument (perc_of_host_alignments_larger_than) lands where intended.
    new_reads = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        # delete_host_alignments = True,
        # filter_unassigned = True,
        # unassigned_taxid= -1,
        host_filter.perc_of_host_alignments_larger_than)
    dataAccess.clear_cache()  # deletes gi2taxid cache
    # Second pass: strip remaining host alignments from the surviving reads.
    reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
        new_reads,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1)     # unassigned taxid
    # Host count = reads present before filtering minus reads that survived.
    # Must be computed BEFORE set_new_reads replaces the container contents.
    host_read_count = len(read_container.fetch_all_reads(format=list)) - len(reads_with_no_host_alignments)
    read_container.set_new_reads(reads_with_no_host_alignments)
    print 'done'
    # ----------------------------------#
    # ------- LOAD ALL RECORDS -------#
    print '4. Loading referenced records...'
    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    # Pull the CDS records referenced by the remaining reads' alignments.
    record_container.populate(read_container.fetch_all_reads_versions(), table='cds')
    print 'done'
    # ----------------------------------#
    # -- MAP ALIGNMENTS TO GENES -----#
    print '5. Mapping alignments to genes...'
    read_container.populate_cdss(record_container)
    # ----------------------------------#
    # - RECORD ALL ALIGNEMENTS TO GENE -#
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    print 'done'
    print '6. Estimating organisms present in sample...'
    # NOTE(review): hard-coded NCBI tax IDs of the organisms of interest —
    # presumably the pathogens this run targets; confirm and consider making
    # this configurable.
    target_organisms = [633, 632, 263, 543, 86661, 1392, 55080, 1386]
    # What is this part?
    print 'done.'
    print '7. Annotating reads...'
    annotated_reads = rstate.annotate_reads(
        read_container.fetch_all_reads(format=list),
        cds_aln_container.read2cds,
        tax_tree,
        target_organisms)
    read_container.set_new_reads(annotated_reads)
    print 'done'
    print '8. Binning reads...'
    # Assign reads to the target organisms using the CDS/alignment mappings.
    orgs = bin_reads(
        read_container.fetch_all_reads(format=list),
        cds_aln_container.cds_repository,
        cds_aln_container.read2cds,
        tax_tree,
        target_organisms,
        None, None, False)
    '''
    for org in orgs.values():
        print org.name
        print len(set(org.get_reads()))
        print len(org.identified_coding_regions)
        print 'done.'
    '''
    print ("total_read_num: " + str(total_read_num))
    print '9. Generating XML...'
    dataset = Dataset(args.xml_description_file)
    xml_organisms = []
    # Synthetic "Host" entry so host reads appear in the report alongside
    # the detected organisms (amount = count and fraction of all reads).
    host = Organism (host_read_count, host_read_count/float(total_read_num),
                     None, None, "Host", None, None, [], [], [], is_host=True)
    xml_organisms.append(host)
    for org in orgs.values():
        xml_organisms.append(org.to_xml_organism(tax_tree, total_read_num))
    # Largest read counts first in the output.
    xml_organisms.sort(key=operator.attrgetter("amount_count"), reverse=True)
    xml = XMLOutput(dataset, xml_organisms, args.output)
    xml.xml_output();