# Stand-alone sanity script (Python 2): load the miRBase hairpin and mature
# FASTA files, strip the entries listed in the mirtron/"other types" file and
# report how many hairpin ids remain once hairpins and matures are combined.

# Input file names (resolved relative to the current working directory).
hairpin_file = "hairpin.fa"
mature_seq_file = "mature.fa"
high_conf_file = "high_conf_hairpin.fa"
other_types = "mirTrons_other.txt"

# Each read returns a pair; judging by the names the first element holds the
# human ("hsa") entries and the second all other species -- TODO confirm
# against mirbase.read_miRNA_fasta.
hsa_to_hairpin, other_to_hairpin = mirbase.read_miRNA_fasta(hairpin_file)
hsa_to_mature, other_to_mature = mirbase.read_miRNA_fasta(mature_seq_file)
# NOTE(review): miRNA_high_conf is not used again in this script.
miRNA_high_conf = miRNA.read_high_confidence(high_conf_file) 


# Remember the sizes before filtering so the effect of the filter can be
# verified below.
before = len(hsa_to_hairpin)
before2 = len(hsa_to_mature)

# The return value is ignored and the asserts below require the sizes to
# change, so remove_mirTrons is expected to modify its first argument in place.
special_types.remove_mirTrons(hsa_to_hairpin, other_types)
special_types.remove_mirTrons(hsa_to_mature, other_types)



# Fail loudly if the filter removed nothing from either collection.
assert before != len(hsa_to_hairpin)
assert before2 != len(hsa_to_mature)



 
# miRNA_species = mirbase.similar_hairpins(hsa_to_hairpin, other_to_hairpin)
harpinID_to_mature, harpinID_to_matseqs = mirbase.combine_hairpin_mature(hsa_to_hairpin, hsa_to_mature)


# Number of distinct elements obtained by iterating harpinID_to_matseqs
# (its keys, if it is a dict -- TODO confirm).
print len(set(harpinID_to_matseqs) )
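# ---------------------------------------------------------------------------
# Example #2
# (A stray "0" followed this header in the scraped source; it looks like a
# counter from the hosting page and is not part of the program.)
# ---------------------------------------------------------------------------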
def main():
    start_time = time.clock()
    print "starting miRNA analysis"
    

    
#     fasta2 = ["Demux.SRhi10002.Adipocyte", "Demux.SRhi10002.Alveolar", "Demux.SRhi10002.Amniotic",
#              "Demux.SRhi10002.Dendritic1", "Demux.SRhi10002.Dendritic2", "Demux.SRhi10002.Endothelial",
#              "Demux.SRhi10002.Fibroblast1", "Demux.SRhi10002.Fibroblast2", "Demux.SRhi10002.Fibroblast3",
#              "Demux.SRhi10002.Intestinal", "Demux.SRhi10002.Meningeal", "Demux.SRhi10002.Mesenchymal",
#              "Demux.SRhi10002.Osteoblast", "Demux.SRhi10002.Pericytes", "Demux.SRhi10002.Renal",
#              "Demux.SRhi10002.Sebocyte1", "Demux.SRhi10002.Sebocyte2", "Demux.SRhi10002.SmoothBrachiocephalic",
#              "Demux.SRhi10002.SmoothProstate", "Demux.SRhi10002.SmoothSubclavian", "Demux.SRhi10002.SmoothUterine"]
#     
#     fasta3 = ["Demux.SRhi10003.Adipocyte", "Demux.SRhi10003.Amniotic%20Epithelial", "Demux.SRhi10003.amniotic%20membrane",
#               "Demux.SRhi10003.Endothelial0", "Demux.SRhi10003.Endothelial1", "Demux.SRhi10003.Endothelial2",
#               "Demux.SRhi10003.Fibroblast1", "Demux.SRhi10003.Fibroblast2", "Demux.SRhi10003.Fibroblast3",
#               "Demux.SRhi10003.Keratinocyte", "Demux.SRhi10003.Mesenchymaladipose", "Demux.SRhi10003.Mesenchymalbone",
#               "Demux.SRhi10003.Osteoblast", "Demux.SRhi10003.Pancreatic", "Demux.SRhi10003.Peripheral",
#               "Demux.SRhi10003.Prostate", "Demux.SRhi10003.Renal", "Demux.SRhi10003.Sertoli",
#               "Demux.SRhi10003.Skeletal", "Demux.SRhi10003.SmoothBrain", "Demux.SRhi10003.SmoothPulmonary",
#               "Demux.SRhi10003.SmoothUmbilical"]
#     
#     fasta2 = ["hg19/"+n for n in fasta2]
#     fasta3 = ["hg19/"+n for n in fasta3]
# 
#     fasta4 = ["hg19/Demux.SRhi10004."+str(i) for i in range(1,23)]
#     fasta5 = ["hg19/Demux.SRhi10005."+str(i) for i in range(1,24)]
#     fasta_files.extend(fasta2)
#     
#     fasta2.extend(fasta3)
#     fasta2.extend(fasta4)
#     fasta2.extend(fasta5)
  
  
#     
#     fasta_files = ["SRR797059.collapsed", "SRR797060.collapsed", "SRR797061.collapsed",
#                     "SRR797062.collapsed", "SRR797063.collapsed", "SRR797064.collapsed",
#                     "SRR207110.collapsed", "SRR207111.collapsed", "SRR207112.collapsed"]    
# 
#     fasta_files = ["SRR797060.collapsed", "SRR797061.collapsed",
#                     "SRR797062.collapsed", "SRR797063.collapsed", "SRR797064.collapsed"]
#     fasta_files = ["SRR797060.collapsed", "SRR797061.collapsed", "SRR207111.collapsed"]
#     fasta_files = ["SRR797060.collapsed", "SRR797061.collapsed"]
  
#     fasta_files =  ["SRR207110.collapsed", "SRR207111.collapsed", "SRR207112.collapsed"] 
#     fasta_file = "SRR797062.fa"



  
    fasta_files = ["SRR797062.collapsed"] #  small file for fast testing
    fasta_files_large_folder = ["fastas/Demux.SRhi." + str(i) + ".collapsed"  for i in range(296)]
      
  
    fasta_files_small = ["SRR797059.collapsed", "SRR797060.collapsed", "SRR797061.collapsed",
                    "SRR797062.collapsed", "SRR797063.collapsed", "SRR797064.collapsed",
                    "SRR207110.collapsed", "SRR207111.collapsed", "SRR207112.collapsed",
                    "SRR207113.collapsed", "SRR207114.collapsed", "SRR207115.collapsed",
                    "SRR207116.collapsed", "SRR207117.collapsed", "SRR207118.collapsed",
                    "SRR207119.collapsed"]
      
  
    fasta_files = fasta_files_large_folder
#     fasta_files = fasta_files_large_folder[:40]
  
    hairpin_file = "hairpin.fa"
    mature_seq_file = "mature.fa"
    miRNA_file_name = "mirnas.fa"
    high_conf_file = "high_conf_hairpin.fa"
    miRNA_family_file = "miFam.dat"
    other_types = "mirTrons_other.txt"
#     dead_mirnas = "miRNA.dead"
    dead_mirnas = "dead_list"
    dead_mirna_hairpins = "dead_hairpins.txt"
    dead_mirna_bowtie_file = "dead_hairpin_bowtie.fa"
    dead_mirna_bowtie_out = "dead_hairpin_locations.map"
      
      
    all_reads_file = "all.collapsed"
    bowtie_output = "bowtie_out.map"
      
    miRNA_bowtie_output = "miRNA.map"
    ml_folds = 10
    
    is_new_run = True
    is_new_run = False
    
    #===========================================================================
    # #  making data for mirdeep2
    #===========================================================================
    # not_human_file(mature_seq_file, "other_matures.fa")
    # assert 0
    # human_only_file(mature_seq_file, "human_matures.fa")
    #  
    #  
    # hsa_to_hairpin, other_to_hairpin = mirbase.read_miRNA_fasta(hairpin_file)
    #  
    # write_human_hairpins(hsa_to_hairpin, "human_hairpins.fa")
    # assert 0
    #  
    #  
    # assert 0
    #  
    # one_large_fasta(fasta_files, "all_fasta.fa")
    # assert 0
    #   
    #  
    #===========================================================================
    
    
    if is_new_run:
        
        id_to_dead_hp, id_to_dead_mature = dead_mirna.get_hairpin(dead_mirnas)
#         assert False
        
        print "merging",len(fasta_files), "collapsed files" if len(fasta_files)>1 else ""
            
        dict_collapsed = merge.collapse_collapsed(fasta_files, min_len=8, min_count=2)
            
    #     split small and larger sequences
    #     write reads to file
            
        reads, reads_count, small_reads, small_reads_count = merge.filter_seqeunces(dict_collapsed, 18)
        merge.write_collapsed(all_reads_file, reads, reads_count)
        
        print "long reads:", len(reads), "small:", len(small_reads), len(small_reads_count)
        print "fraction small / all:", len(small_reads)*1.0 / (len(small_reads) + len(reads))

            
            
    #     aligning to genome using bowtie
        
        _align_bowtie(bowtie_output, all_reads_file)
        print "finished bowtie in ", time.clock() - start_time, " seconds" 
            
    #     read genome alignment from bowtie
        fixed_lines = [line.strip().split("\t") for line in open(bowtie_output)] 
        print "read positions in ", time.clock() - start_time, " seconds"
            
            
        print "loading miRNA hairpins:"
        hsa_to_hairpin, other_to_hairpin = mirbase.read_miRNA_fasta(hairpin_file)
        hsa_to_mature, other_to_mature = mirbase.read_miRNA_fasta(mature_seq_file)
            
        special_types.remove_mirTrons(hsa_to_hairpin, other_types)
        special_types.remove_mirTrons(hsa_to_mature, other_types)
        miRNA_species = mirbase.similar_hairpins(hsa_to_hairpin, other_to_hairpin)
            
        hairpinID_to_mature, harpinID_to_matseqs = mirbase.combine_hairpin_mature(hsa_to_hairpin, hsa_to_mature)
        miRNA_high_conf = miRNA.read_high_confidence(high_conf_file)
         
         
#         pickle.dump(harpinID_to_matseqs, open("harpinID_to_matseqs.p", "wb"))
#         pickle.dump(hsa_to_hairpin, open("hsa_to_hairpin.p", "wb"))
            
        print "\nhigh confidence set:", len(miRNA_high_conf),
        print miRNA_high_conf.issubset(miRNA_species.keys())
        print "\nreading miRNA family info (mifam)"
        miRNA_fam = miRNA.read_family(miRNA_family_file)
        
    #     print len(set(miRNA_species.keys()) & set(miRNA_fam.keys()))
    #     run write micro rnas to file
        
        mirbase.write_miRNA(hsa_to_hairpin, miRNA_file_name)
        print "\nwrote human miRNAs to file", time.clock() - start_time, " seconds"
            
    #     run bowtie to find miRNA positions
        _align_bowtie(miRNA_bowtie_output, miRNA_file_name)
            
        print "aligned miRNAs in", time.clock() - start_time, " seconds"
        miRNA_bowtie_hits = [line.strip().split("\t") for line in open(miRNA_bowtie_output)] 
            
        unique_mirna_hits = set([x[0] for x in miRNA_bowtie_hits])
            
        print "miRNA bowtie hits:", len(miRNA_bowtie_hits)
        print "unique miRNA hits:", len(unique_mirna_hits)
            
            
        print "\nDead mirnas"
        mirbase.write_dead_mirna(id_to_dead_hp, dead_mirna_bowtie_file)
        _align_bowtie(dead_mirna_bowtie_out, dead_mirna_bowtie_file)
        dead_miRNA_hits = [line.strip().split("\t") for line in open(dead_mirna_bowtie_out)]
            
        print "TOTAL dead mirnas:", len(dead_miRNA_hits)
#         assert 0 
        
    #     using sequence tree to find possible candidates
    #     candidate_tree, sequence_tree, candidates, seq_to_candidates = interval_tree_search.find_candidates(fixed_lines)
        candidate_tree, sequence_tree, candidates, seq_to_candidates = interval_tree_search.find_candidates_2(fixed_lines)
            

        print "\n\tfound candidates in ", time.clock() - start_time, " seconds"
        print "\tbowtie hits", len(fixed_lines)
        print "\tcandidate tree", len(candidate_tree)
        print "\tcandidates", len(candidates)
        print "\tsequence tree", len(sequence_tree)
        print "\tmapped seqs", len(candidates[0].all_mapped_sequences)
        
        
    # 0            1   2[0] [1]      [2] [3]
    # ['1-15830', '-', 'gi|224589818|ref|NC_000006.11|',
    #         NC_000006.11

        
        print "\naligning miRNAs to sequences"
        candidate_to_miRNA = interval_tree_miRNA.align_miRNAs(miRNA_bowtie_hits,
                                                               hairpinID_to_mature,
                                                               harpinID_to_matseqs,
                                                               candidate_tree,
                                                               candidates,
                                                               sequence_tree,
                                                               seq_to_candidates,
                                                               miRNA_species,
                                                               miRNA_high_conf)
        
        mirdeep_new = mirdeep_make_roc_data(candidate_tree, candidate_to_miRNA, miRNA_high_conf)
        
        candidate_to_dead = interval_tree_dead.align_dead_miRNAs(dead_miRNA_hits,
                                                                 id_to_dead_hp,
                                                                 id_to_dead_mature,
                                                                 candidate_tree,
                                                                 candidates,
                                                                 sequence_tree,
                                                                 seq_to_candidates)
        

            
        print "\npadding all miRNA and Candidates"
        gene.include_padding(candidates)
        print "padded all candidates in ", time.clock() - start_time, " seconds"
            
        
        print "\nrunning vienna rnafold"
        vienna.energy_fold2(candidates)
          
        print "finished vienna folding"
          
    #     align_small_seqs(candidates, small_reads, small_reads_count)
    #     small_seq_stats(candidates)
    #     
    #     plot_any.plot(candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, "ratio_short_long_5p", False )
    #     assert 0
        
        print "saving 123"
        
        pickle.dump(mirdeep_new, open("mirdeep_new.p", "wb"))
        pickle.dump(candidate_tree, open("candidate_tree.p", "wb"))
        pickle.dump(candidates, open("candidates_pre.p", "wb"))
        pickle.dump(candidate_to_miRNA, open("candidate_to_miRNA.p", "wb"))
        pickle.dump(miRNA_high_conf, open("miRNA_high_conf.p", "wb"))
           
        print "saving 234"
           
        pickle.dump(candidate_to_dead, open("candidate_to_dead.p", "wb"))
        pickle.dump(miRNA_fam, open("miRNA_fam.p", "wb"))
        pickle.dump(small_reads, open("small_reads.p", "wb"))
        pickle.dump(small_reads_count, open("small_reads_count.p", "wb"))
        pickle.dump(seq_to_candidates, open("seq_to_candidates.p", "wb"))
        
        pickle.dump(reads, open("reads.p", "wb"))
        pickle.dump(reads_count, open("reads_count.p", "wb"))
        
          
        print "saved 456"
        
        
    candidate_to_dead = pickle.load( open("candidate_to_dead.p", "rb"))
#     print len(candidate_to_dead)
#     
#     print len(candidate_to_dead.values())
#     print len(set(candidate_to_dead.values()))
#     assert 0

    print "loading miRNAs"
    mirdeep_new = pickle.load( open("mirdeep_new.p", "rb"))
    print len(mirdeep_new)
#     assert 0

    
    harpinID_to_matseqs = pickle.load( open("harpinID_to_matseqs.p", "rb"))
    hsa_to_hairpin = pickle.load( open("hsa_to_hairpin.p", "rb"))
    
#     print "loading tree..."

#     candidate_tree = pickle.load( open("candidate_tree.p", "rb"))
    
    print "loading picled stuff ...", time.clock() - start_time
    candidate_to_miRNA = pickle.load( open("candidate_to_miRNA.p", "rb"))
    miRNA_high_conf = pickle.load( open("miRNA_high_conf.p", "rb"))
    
    
#     mirdeep_make_roc_data(candidate_tree, candidate_to_miRNA, miRNA_high_conf)
    
    candidates = pickle.load( open("candidates_pre.p", "rb"))
    hp_50 = pickle.load( open("candidate_classified_miRNA.p", "rb"))
    hp_99 = pickle.load( open("candidate_classified_99.p", "rb"))
    
    
       


    
    
    candidate_to_dead = pickle.load( open("candidate_to_dead.p", "rb"))
    miRNA_fam = pickle.load( open("miRNA_fam.p", "rb"))
    
    
    small_reads = pickle.load( open("small_reads.p", "rb"))
    small_reads_count = pickle.load( open("small_reads_count.p", "rb"))
    seq_to_candidates = pickle.load( open("seq_to_candidates.p", "rb"))
    
    reads = pickle.load( open("reads.p", "rb"))
    reads_count = pickle.load( open("reads_count.p", "rb"))
    
    

    

    print "loaded back", time.clock() - start_time
    

    def get_miRNAid(c):
        hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start)
        return candidate_to_miRNA[hashval] if hashval in candidate_to_miRNA else None

    
    annotated_data, annotations, low_confidence_data = create_folds(candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, miRNA_fam, ml_folds)
    
    names = [map(get_miRNAid, data_fold) for data_fold in annotated_data]
#     names = map(get_miRNAid, annotated_data[0])
    pickle.dump(names, open("names_data.p", "wb"))
    print len(names[0]), len(annotated_data[0])
#     assert 0
    
    fix_miRNA_training_test(annotated_data, annotations, low_confidence_data, hsa_to_hairpin, harpinID_to_matseqs, candidate_to_miRNA)
    
#     length_distribution(small_reads, small_reads_count)
#     
#     assert 0
    # overhang calculated using fold seq.
    overhang.get_alignment(candidates)
    

    # calculating hairpin stats (length + overhang using pairing prob.)
    hairpin_stats(candidates, candidate_to_miRNA, miRNA_high_conf)
    
    

    
    def _is_miRNA(c):
        hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start)
        return hashval in candidate_to_miRNA
        
    
    def _is_hc(c):
        hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start)
        if hashval in candidate_to_miRNA:
            if candidate_to_miRNA[hashval] in miRNA_high_conf:
                return True
        return False
    
    def _is_dead(c):
        hashval = c.chromosome+c.chromosome_direction+str(c.hairpin_start)
        return hashval in candidate_to_dead
    

    def is_good_candidate(c):
        return c.has_hairpin_struct and not _is_dead(c) and not _is_miRNA(c)
 
 
    hp_candidates = [c for c in candidates if is_good_candidate(c)]
    
    print len(hp_candidates), 572, len(hp_50), len(hp_99)
    
    assert len(hp_candidates) == len(hp_50) == len(hp_99)
    
    
    hp_99_candidates = [h for h, is_99 in zip(hp_candidates, hp_99) if is_99]
    hp_50_candidates = [h for h, is_50 in zip(hp_candidates, hp_50) if is_50]
    
    
    def hash_val(c):
        return c.chromosome+c.chromosome_direction+str(c.hairpin_start)
    
    hp99_to_candidate = {hash_val(c):c for c in hp_99_candidates}
    hp50_to_candidate = {hash_val(c):c for c in hp_50_candidates}
    
#     print hp50_to_candidate.issubset(candidate_to_miRNA)
#     print hp99_to_candidate.issubset(candidate_to_miRNA)
#     
#     assert hp50_to_candidate.issubset(candidate_to_miRNA)
#     assert hp99_to_candidate.issubset(candidate_to_miRNA)
#     
    
    
    print "all candidates+miRNA+other:", len(candidates)
    print "\twith hairpin struct:\t", len([c for c in candidates if c.has_hairpin_struct])
    
    _mirnas = [c for c in candidates if _is_miRNA(c)]
#     _mirnas2 = [c for c in candidates if c.miRNAid != None]
    _mirna_hc = [c for c in candidates if _is_hc(c)]
    
    _mirna_lc = [c for c in _mirnas if not _is_hc(c)]
    
    _mirna_dead = [c for c in candidates if _is_dead(c)]
    
    _not_mirnas = [c for c in candidates if not _is_miRNA(c) and not _is_dead(c)]
#     _not_mirnas2 = [c for c in candidates if c.miRNAid == None]
    
    print "mirnas:", len(_mirnas), len(candidate_to_miRNA)
    print "\twith hairpin struct:\t", len([m for m in _mirnas if m.has_hairpin_struct])
    
    print "candidates:", len(_not_mirnas)
    print "\twith hairpin struct:\t", len([m for m in _not_mirnas if m.has_hairpin_struct])
    
    print "HC mirnas:", len(_mirna_hc), len(miRNA_high_conf)
    print "\twith hairpin struct:\t", len([m for m in _mirna_hc if m.has_hairpin_struct])
    
    print "LC mirnas:", len(_mirna_lc)
    print "\twith hairpin struct:\t", len([m for m in _mirna_lc if m.has_hairpin_struct])
    


    
    
#     fail_hc = [c for c in _mirna_hc if not c.has_hairpin_struct]
#     fail_lc = [c for c in _mirnas if not c.has_hairpin_struct and not _is_hc(c)]
#     
#     hairpin.hairpin_stats(fail_hc, candidate_to_miRNA, miRNA_high_conf)
#     hairpin.hairpin_stats(fail_lc, candidate_to_miRNA, miRNA_high_conf)
#     hairpin.hairpin_stats(_mirnas, candidate_to_miRNA, miRNA_high_conf)

    
    


    
    not_mapped_reads = [structure.Sequence(i,n,read) for i,(read,n) in 
                        enumerate(zip(reads, reads_count))
                        if read not in seq_to_candidates]
    
    


#     aligning small sequences against hairpins
    align_small_seqs(candidates, small_reads, small_reads_count)
    
    
#     lc_10  = pickle.load( open("lc_scores_10.p", "rb"))
#     lc_10_all = pickle.load( open("lc_scores_10_w_cand.p", "rb"))
#     lc_names = pickle.load( open("save_low_confidence_names.p", "rb"))
# #     
# #     
# #     
#     small_seq_stats(_mirna_lc, lc_10, lc_names, candidate_to_miRNA)
#     print lc_names[0]
#     assert 0



    
    
#     small_seq_stats(candidates)    
    small_seq_stats(_mirna_hc) # for testing only



#     A/U ends for all remaining candidates
    tailing.tailing_au_fast(candidates, not_mapped_reads)
    

#     degree of entropy in structure and nucleotides
    entropy.entropy(candidates)
    

#      
#     heterogenity (position counting)
    heterogenity.heterogenity(candidates)

#      
#     candidate quality: nr of sequence hits / all candidate hits for given sequences
    quality.candidate_quality(candidates, seq_to_candidates)
    
    
    
    
    
    # save candidates again here ?
    
#     pickle.dump(candidates, open("candidates_with_features.p", "wb"))

    
    
#     plotting all features

    
       
#     FEATURES = ["hairpin_energy", "hairpin_energy_10", "hairpin_energy_40",
#                 "entropy_nucleotides", "entropy_structure", "heterogenity_5_begin",
#                 "heterogenity_5_end", "heterogenity_3_begin", "heterogenity_3_end",
#                 "quality", "bindings_max_10", "overhang_level_outer_10",
#                 "overhang_outer_10", "overhang_level_inner_10", "overhang_inner_10",
#                 "bulge_factor"]
#      
#      
#     log_scaled = [False]*16 + [True]*3
#     print log_scaled
#     for feat_name, logs in zip(FEATURES, log_scaled):
#       
#         plot_any.plot(candidates, candidate_to_miRNA, candidate_to_dead,
#                       miRNA_high_conf, feat_name, logs )
    
#     removed_nonvalues = [c for c in candidates if c.ratio_short_long_5p != 1.0]
#     
#     print "nonvalues left", len( [c for c in candidates if c.ratio_short_long_5p == 1.0])
#     
#     plot_any.plot(removed_nonvalues, candidate_to_miRNA, candidate_to_dead,
#                       miRNA_high_conf, "ratio_short_long_5p", isLog=True )
    
#     short_correlate_13_17_max = []
#     maxlen_range = range(13, 18)
#     
#     for i in maxlen_range:
#         res = plot_any.plot(candidates, candidate_to_miRNA, candidate_to_dead,
#                   miRNA_high_conf, "short_seq_align_10_"+str(i), isLog=False )
#         
#         short_correlate_13_17_max.append(res)
#        
#     for l in short_correlate_13_17_max:
#         print l 
#     
#     print
#     for l in zip(*short_correlate_13_17_max):
#         print l
#         
#     (ks_val, p_2s_ks, t_student, p_student, t_welch, p_welch) = zip(*short_correlate_13_17_max)
#     
#     
#     plot_kstest(ks_val, maxlen_range, True)
#     plot_ttest(t_student, t_welch, maxlen_range, True)
#     
# 
#     short_correlate_8_17_min = []
#     minlen_range = range(8, 18)
#     for i in minlen_range:
#         res = plot_any.plot(candidates, candidate_to_miRNA, candidate_to_dead,
#                   miRNA_high_conf, "short_seq_align_" + str(i) + "_17", isLog=False )
#         
#         short_correlate_8_17_min.append(res)
#        
#     for l in short_correlate_8_17_min:
#         print l 
#     
#     print
#     for l in zip(*short_correlate_8_17_min):
#         print l
#         
#     (ks_val, p_2s_ks, t_student, p_student, t_welch, p_welch) = zip(*short_correlate_8_17_min)
#     
#     plot_kstest(ks_val, minlen_range, False)
#     plot_ttest(t_student, t_welch, minlen_range, False)
    
    
    d_hp = 0
    d_tot = 0
    
    c_hp = 0
    c_tot = 0
    
    h_hp = 0
    h_tot = 0
    
    l_hp = 0
    l_tot = 0
    
    m_hp = 0
    m_tot = 0
    
    for c in candidates:
        hashval = c.chromosome + c.chromosome_direction + str(c.hairpin_start)
        
        hairpin_one = 1 if c.has_hairpin_struct else 0
        
        if hashval in candidate_to_dead:
            d_hp += hairpin_one
            d_tot += 1
        
        elif hashval in candidate_to_miRNA:
            mi = candidate_to_miRNA[hashval]
            
            if mi in miRNA_high_conf:
                h_hp += hairpin_one
                h_tot += 1
            else:
                l_hp += hairpin_one
                l_tot += 1
        elif hashval in mirdeep_new:
            m_hp += hairpin_one
            m_tot += 1
        else:
            c_hp += hairpin_one
            c_tot += 1
            
    print "==============================================================================="
    print "dead:    \t", d_hp, d_tot, (d_hp*1.0 / d_tot)
    print "candidates:\t", c_hp, c_tot, (c_hp*1.0 / c_tot)
    print "high conf:\t", h_hp, h_tot, (h_hp*1.0 / h_tot)
    print "low conf:\t", l_hp, l_tot, (l_hp*1.0 / l_tot)
    print "mirDeep2:\t", m_hp, m_tot, (m_hp*1.0 / m_tot)
    print "==============================================================================="


    FEATURES_old = [ "hairpin_energy_10", "entropy_nucleotides", "entropy_structure", "heterogenity_5_begin",
                "heterogenity_5_end", "heterogenity_3_begin", "heterogenity_3_end",
                "quality", "overhang_level_outer_10",
                "overhang_outer_10", "overhang_level_inner_10", "overhang_inner_10", ]
    FEATURES_old = [ "hairpin_energy_10", "entropy_nucleotides", "entropy_structure", "heterogenity_5_begin",
                "heterogenity_5_end", "heterogenity_3_begin", "heterogenity_3_end",
                 "overhang_level_outer_10",
                "overhang_outer_10", "overhang_level_inner_10", "overhang_inner_10", "quality" ]

    FEATURES = [
#                 "ratio_short_long",
                "short_seq_align", # 0 until best val is found
#                 "ratio_short_long_logval",
#                 "leading_au",
#                 "tailing_au",
                "overhang_inner",
                "overhang_outer",
                "loop_size",
                "folds_5p",
                "folds_3p",
                "folds_before",
                "folds_after", 
                ]
    
    all_features = FEATURES + FEATURES_old
    logvals = [True] + [False]*10 + [False]*len(FEATURES_old)
#     plot_pearson_correlation(candidates, FEATURES)
#     plot_spearman_correlation(candidates, FEATURES)
    plot_spearman_correlation(candidates, all_features)



    LC_100_best_all  = pickle.load( open("LC_100_best_all.p", "rb"))
    LC_100_best_nonhp  = pickle.load( open("LC_100_best_nonhp.p", "rb"))
    LC_100_worst_all  = pickle.load( open("LC_100_worst_all.p", "rb"))
    LC_100_worst_nonhp  = pickle.load( open("LC_100_worst_nonhp.p", "rb"))
    
    
    
    c_scores = []
    mir_scores = []
    
    
    for feat_name, lv in zip(all_features, logvals):
        
#         (ks_mirdeep, ks_classify) = plot_candidate_results(candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf,
#                                mirdeep_new, hp50_to_candidate, hp99_to_candidate, feat_name, lv)
        
        plot_LC_results(candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf,
                         LC_100_worst_all, LC_100_worst_nonhp, LC_100_best_all, LC_100_best_nonhp, feat_name, lv)
        
#         c_scores.append(ks_classify)
#         mir_scores.append(ks_mirdeep)
       
       
    
    
    pyplot.plot(c_scores)
    pyplot.plot(mir_scores)
    pyplot.title("mirDeep")
    pyplot.show()
        #=======================================================================
        # plot_any.plot(candidates, candidate_to_miRNA, candidate_to_dead,
        #               miRNA_high_conf, feat_name )
        #=======================================================================

    #===========================================================================
    # for feat_name, lv in zip(all_features, logvals):
    #     
    #     plot_mirdeep(candidates, candidate_to_miRNA, candidate_to_dead,
    #                   miRNA_high_conf, mirdeep_new, feat_name, lv)
    #  
    #===========================================================================
    print
    print " features finished:", time.clock() - start_time, " seconds"




#===============================================================================
# saving data for use in classification
#===============================================================================
    
    
    def is_good_or_miRNA(c):
        return _is_miRNA(c) or c.has_hairpin_struct or _is_dead(c)
    
    
    def is_bad_or_miRNA(c):
        return _is_miRNA(c) or _is_dead(c) or not c.has_hairpin_struct

    
#     def is_good_candidate(c): # du
#         return c.has_hairpin_struct and not _is_dead(c) and not _is_miRNA(c)
#     
    
    
#     # removed bad candidates 
#     removed_bad_candidates = [c for c in candidates if is_good_or_miRNA(c)] # classifying LC, not using low quality miRNA
#     annotated_data, annotations, low_confidence_data = create_folds(removed_bad_candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, miRNA_fam, ml_folds)
   
    # all candidates
    annotated_data, annotations, low_confidence_data = create_folds(candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, miRNA_fam, ml_folds)
    
    # removed the good candidates (which are classified)
    print "candidates?"    
    not_good_candidates = [c for c in candidates if is_bad_or_miRNA(c)] # classifying bad candidates
    annotated_data_new, annotations_new, _low_confidence_data = create_folds(not_good_candidates, candidate_to_miRNA, candidate_to_dead, miRNA_high_conf, miRNA_fam, ml_folds)
    
    
    
    low_confidence_names = [map(get_miRNAid, l) for l in low_confidence_data]
    
    
    
    def sum_reads(c):
        reads = 0.0
        for el in c.mapped_sequences:
            name = el.data[1]
            reads += float(name.split("-")[1])
        
        return reads
        
            
    
    low_confidence_reads = [sum_reads(c) for fold in low_confidence_data for c in fold ]
    print low_confidence_reads
    
    pickle.dump(low_confidence_reads, open("low_confidence_reads.p", "wb"))
#     assert 0
    
    
    def scale_data(data):
        return preprocessing.scale(data, with_mean=False, with_std=False)
        
    
    
    # data for LC classification: 
    vector_data = map(vectorize.candidates_to_array, annotated_data)
    scaled_data = map(scale_data, vector_data)
    
    
        # low confidence miRNA
    low_confidence_data = map(vectorize.candidates_to_array, low_confidence_data)
    low_confidence_data = map(scale_data, low_confidence_data)
    
    
    # data for new miRNA classification:
    vector_data_new = map(vectorize.candidates_to_array, annotated_data_new)
    scaled_data_new = map(scale_data, vector_data_new)
    
    
        # good candidates (not miRNA, has hairpin struct)
    hp_candidates = [c for c in candidates if is_good_candidate(c)]
    print "hp_candidates", len(hp_candidates)
    hp_candidates = vectorize.candidates_to_array(hp_candidates)
    hp_candidates = map(scale_data, hp_candidates)
    
    print
    print "saving data ...",
    #===========================================================================
    # data used for classifying LOW CONFIDENCE:
    #===========================================================================
    pickle.dump(scaled_data, open("save_scaled_data.p", "wb")) # candidates, HC, DEAD
    pickle.dump(annotations, open("save_an.p", "wb")) # annotations for data over [00001111000]
    pickle.dump(low_confidence_data, open("save_low_confidence_data.p", "wb")) # low confidence miRNA
    pickle.dump(low_confidence_names, open("save_low_confidence_names.p", "wb")) # mirbase names, like >hsa-mir-516a-1
#     pickle.dump(annotated_data, open("save_da.p", "wb")) # extra stuff not needed
    
    #===========================================================================
    # data used for classifying new miRNA:
    #===========================================================================
    pickle.dump(scaled_data_new, open("save_scaled_data_new.p", "wb")) # non-hp candidates, HC, DEAD
    pickle.dump(annotations_new, open("save_an_new.p", "wb")) # annotations for data over [001111000]
    pickle.dump(hp_candidates, open("save_hp_candidates_new.p", "wb")) # annotations for data over [001111000]
    #TODO: position in genome for all candidates probably...
    
    
    print "...saved."
    print "Now loading back for testing ...",
    
    annotations = pickle.load( open("save_an.p", "rb"))
    scaled123 = pickle.load( open("save_scaled_data.p", "rb"))
    low_confidence_data = pickle.load( open("save_low_confidence_data.p", "rb"))
    low_confidence_names = pickle.load( open("save_low_confidence_names.p", "rb"))
#     annotated_data = pickle.load( open("save_da.p", "rb"))

    scaled_data_new = pickle.load( open("save_scaled_data_new.p", "rb"))
    annotations_new = pickle.load( open("save_an_new.p", "rb"))
    hp_candidates = pickle.load( open("save_hp_candidates_new.p", "rb"))


     
    print "... done", (len(scaled123))
    print "finished:", time.clock() - start_time, "seconds"