예제 #1
0
def main():
    '''
    Script to extract reads without any alignments
    '''

    # Input arguments
    argparser   = ArgParser()
    args        = argparser.parse_args()

    # Access database
    dataAccess = DataAccess(args)

    # ------------------ #

    print '1. Loading tax tree...'
    start = time.time()

    tax_tree = TaxTree()

    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------ #

    print '2. Loading alignment file...'
    start = time.time()

    read_container = ReadContainer()
    read_container.load_alignment_data(args.alignment_file)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)

    end = time.time()
    print("done: {0:.2f} sec".format(end - start))

    # ------------------ #

    # Loop through reads and take those with no alignments
    out_file = open(args.read_ids_out, 'w')
    
    no_aln_count = 0
    for read in read_container.fetch_all_reads(format=list):
        if not read.has_alignments():
            out_file.write("{0}\n".format(read.id))
            no_aln_count += 1

    out_file.close()
    
    total_read_count = read_container.get_read_count()

    print("total number of reads   : {0}".format( total_read_count ))
    print("reads without alignments: {0}".format(no_aln_count))
    print
    print("no aln percentage: {0:.2f}%".format(no_aln_count * 100 / float(total_read_count)))
예제 #2
0
def main():
    '''
    Script to run binner in one of the most common
    usage scenarios.
    * load alignment data
    * load taxonomy data
    * do basic alignment data filtering (remove host reads ecc)
    '''

    #----------------------------------#
    #------ INPUT ARGUMENTS -----------#
    argparser = TestRunArgParser()
    args  = argparser.parse_args()

    #----------------------------------#
    #------- STATIC DATA SOURCE -------#
    # CDS - GI2TAXID -- NAMES -- NODES #
    dataAccess = DataAccess(args)
    #raw_input('Data access created')
    #----------------------------------#

    #-------- TAXONOMY TREE -----------#
    print '1. Loading tax tree...'
    tax_tree = TaxTree()
    # tax_tree.load_taxonomy_data(dataAccess)
    print 'done.'

    #----------------------------------#
    #------- ALIGNMENT DATA SOURCE ----#
    print '2. Loading alignment file...'
    read_container = ReadContainer()
    read_container.load_alignment_data(args.input)
    #---SET TAXIDS FOR ALL ALIGNMENTS--#
    read_container.set_taxids(dataAccess)
    # Remember total number of reads
    total_read_num = read_container.get_read_count()
    print 'done'

    #------- FILTER HOST READS -------#
    print '3. Filtering host reads & alignments...'
    new_reads = host_filter.filter_potential_host_reads(
        read_container.fetch_all_reads(format=list),
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        #delete_host_alignments =
        True,
        #filter_unassigned =
        True,
        #unassigned_taxid=
        -1,
        host_filter.perc_of_host_alignments_larger_than)

    dataAccess.clear_cache()    # deletes gi2taxid cache

    reads_with_no_host_alignments = host_filter.filter_potential_hosts_alignments(
        new_reads,
        tax_tree.tax2relevantTax,
        tax_tree.potential_hosts,
        True,   # delete host alignments
        True,   # filter unassigned
        -1)     # unassigned taxid

    host_read_count = len(read_container.fetch_all_reads(format=list)) - len(reads_with_no_host_alignments)
    read_container.set_new_reads(reads_with_no_host_alignments)
    print 'done'

    #----------------------------------#
    #------- LOAD ALL RECORDS   -------#
    print '4. Loading referenced records...'
    record_container = RecordContainer()
    record_container.set_db_access(dataAccess)
    record_container.populate(read_container.fetch_all_reads_versions(), table='cds')
    print 'done'
    #----------------------------------#
    #-- MAP ALIGNMENTS TO GENES   -----#
    print '5. Mapping alignments to genes...'
    read_container.populate_cdss(record_container)
    #----------------------------------#
    #- RECORD ALL ALIGNEMENTS TO GENE -#
    cds_aln_container = CdsAlnContainer()
    cds_aln_container.populate(read_container.fetch_all_reads(format=list))
    print 'done'

    print '6. Estimating organisms present in sample...'
    target_organisms = [633, 632, 263, 543, 86661, 1392, 55080, 1386] # What is this part?
    print 'done.'
   
    print '7. Annotating reads...' 
    annotated_reads = rstate.annotate_reads(
                    read_container.fetch_all_reads(format=list),
                    cds_aln_container.read2cds, 
                    tax_tree, 
                    target_organisms)
    read_container.set_new_reads(annotated_reads)
    print 'done'
   
    print '8. Binning reads...' 
    orgs = bin_reads(
        read_container.fetch_all_reads(format=list),
        cds_aln_container.cds_repository,
        cds_aln_container.read2cds, 
        tax_tree,
        target_organisms,
        None,
        None,
        False) 

    '''
    for org in orgs.values():
        print org.name
        print len(set(org.get_reads()))
        print len(org.identified_coding_regions)
    print 'done.'
    '''

    print ("total_read_num: " + str(total_read_num))

    print '9. Generating XML...'
    dataset = Dataset(args.xml_description_file)
    xml_organisms = []
    host = Organism (host_read_count, host_read_count/float(total_read_num), None, None, "Host",
                 None, None, [], [], [], is_host=True)
    xml_organisms.append(host)
    for org in orgs.values():
        xml_organisms.append(org.to_xml_organism(tax_tree, total_read_num))
    xml_organisms.sort(key=operator.attrgetter("amount_count"), reverse=True)
    xml = XMLOutput(dataset, xml_organisms, args.output) 
    xml.xml_output();