def test_remove_identical_seqs():
    """Check sequence counts before and after aligning blast hits.

    Loads a precooked data object and accession map, reads precooked blast
    results into a ``PhyscraperScrape`` instance, and asserts that:

    * ``new_seqs`` is empty and the alignment still holds 5 sequences,
    * ``new_seqs_otu_id`` holds 17 entries,
    * after writing the alignment and the unaligned sequences and then
      aligning the query sequences, the alignment holds 22 sequences.
    """
    # NOTE: pickle is only loaded from fixture files under tests/data here.
    # Context managers make sure the fixture files are closed after loading.
    with open("tests/data/precooked/tiny_dataobj.p", 'rb') as dataobj_file:
        data_obj = pickle.load(dataobj_file)
    data_obj.workdir = absworkdir

    ids = IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as acc_map_file:
        ids.acc_ncbi_dict = pickle.load(acc_map_file)

    scraper = PhyscraperScrape(data_obj, ids)
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1

    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper.read_blast_wrapper(blast_dir=blast_dir)

    assert len(scraper.new_seqs) == 0
    assert len(scraper.data.aln) == 5
    assert len(scraper.new_seqs_otu_id) == 17

    # Now that we are pulling the full remote sequences, we don't have any
    # identical sequences in the test.
    # TODO find an example where we do get identical sequences and need to
    # discard them.
    #
    # Disabled checks, kept as a sketch for the TODO above:
    #
    # seqset = set()
    # for otu in scraper.new_seqs_otu_id:
    #     seq = scraper.new_seqs_otu_id[otu]
    #     if seq in seqset:
    #         print otu
    #     seqset.add(seq)
    #
    # Check that every new sequence is unique in the new seqs set, and is
    # not a substring of another sequence.
    #
    # for otu in scraper.new_seqs_otu_id:
    #     qseq = scraper.new_seqs_otu_id[otu]
    #     count = 0
    #     for seq in seqset:
    #         if qseq in seq:
    #             count += 1
    #     assert count == 1
    #
    # for taxon in scraper.data.tre.taxon_namespace:
    #     assert(taxon.label in scraper.data.otu_dict)
    #     status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
    #     assert(status in ('original', 'query'))

    # The returned paths are not needed; the calls are made for their effects
    # before the query sequences are aligned.
    scraper.data.write_aln()
    scraper.write_all_unaligned('test.fas')
    scraper.align_query_seqs()

    assert len(scraper.data.aln) == 22