def test_generic(self):
    sll1 = self.generate_sll([1, 2, 2, 3, 3, 3])
    self.assertEqual([1, 2, 3], self.get_values_from_sll(remove_duplicates(sll1)))
    sll2 = self.generate_sll([1, 1, 1])
    self.assertEqual([1], self.get_values_from_sll(remove_duplicates(sll2)))
    sll3 = None
    self.assertEqual([], self.get_values_from_sll(remove_duplicates(sll3)))
import glob

import clean_bad_words
import remove_duplicates


def merge():
    read_files = glob.glob("song_lyrics_raw/*.txt")
    with open("merged_lyrics.txt", "wb") as outfile:
        for f in read_files:
            with open(f, "rb") as infile:
                outfile.write(infile.read())
            outfile.write(b"\n")  # must be bytes: the outfile is opened in binary mode
    clean_bad_words.clean_bad_words("merged_lyrics.txt", "merged_lyrics_clean.txt")
    remove_duplicates.remove_duplicates("merged_lyrics_clean.txt", "merged_lyrics_unique.txt")
def exon_coords(args):
    line_number1 = []
    line_number2 = []
    exon_file = open(args.exon).read().splitlines()
    # parse the coordinates once; assumes split_coords yields (chrom, start, end)
    chrom, start, end = split_coords(args.coords)
    # enumerate gives the true line index; list.index would return the first
    # matching line even when identical lines repeat in the GTF
    for i, line in enumerate(exon_file):
        col = line.split("\t")
        if col[0] == chrom and col[2] == "exon" and col[3] == str(start):
            line_number1.append(i)
        if col[0] == chrom and col[2] == "exon" and col[4] == str(end):
            line_number2.append(i)
    # Making list of exons composing circRNA
    exon_list = []
    for line in exon_file[line_number1[0]:line_number2[0] + 1]:
        col = line.split("\t")
        if col[2] == "exon":
            exon_list.append(col[0] + ":" + col[3] + "-" + col[4])
    exon_list = remove_duplicates(exon_list)
    return exon_list
def exon_lists(line_number1, line_number2, gtf_file):
    exon_list = []
    for line in gtf_file[line_number1[0]:line_number2[0] + 1]:
        col = line.split("\t")
        if col[2] == "exon":
            exon_list.append(col[0] + ":" + col[3] + "-" + col[4])
    exon_list = remove_duplicates(exon_list)
    return exon_list
def test_duplicates(self):
    inputs = [('a', 'a', 'a', 'a'), (1, 2, 2, 3, 2, 1)]
    outputs = [('a',), (1, 2, 3)]  # note the comma: ('a') is just the string 'a'
    for in_list, out_list in zip(inputs, outputs):
        in_head = node.Node(in_list[0])
        in_n = in_head
        for value in in_list[1:]:  # skip the first value, already held by the head
            in_n.next = node.Node(value)
            in_n = in_n.next
        head = remove_duplicates.remove_duplicates(in_head)
        n = head
        for value in out_list:
            self.assertEqual(n.data, value)
            n = n.next
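# A sketch of the linked-list remove_duplicates these tests assume (node.Node
# exposing .data and .next): keep the first occurrence of each value, unlink
# every later repeat, and return the (possibly None) head. This is a minimal
# reconstruction, not necessarily the implementation under test.
def remove_duplicates(head):
    seen = set()
    prev = None
    n = head
    while n is not None:
        if n.data in seen:
            prev.next = n.next  # unlink the repeated node
        else:
            seen.add(n.data)
            prev = n
        n = n.next
    return head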
from bisect import bisect_left


def reverse_pair(array):
    """Find words in a sorted array whose reversal also appears in the array."""
    half = (len(array) + 1) // 2  # ceil(len/2): checking the first half suffices
    pairs = []
    for word in array[:half]:
        reversed_word = word[::-1]  # avoid shadowing the built-in reversed()
        # bisect returns an insertion index, not a boolean, so comparing it to
        # True is wrong; locate the slot and check the element itself
        i = bisect_left(array, reversed_word)
        if i < len(array) and array[i] == reversed_word:
            pairs.append(word)
            pairs.append(reversed_word)
            # print("pairs = ", pairs)
    final_pairs = remove_duplicates(pairs)
    return final_pairs
def run_locarnap(seqsin, numkept, cpus=1, foldless=False):
    """Runs locarna-p on a set of sequences in MinimalFastaParser format
    [(header, seq), (header, seq)] and returns alignment and structure"""
    seqs, headers = remove_duplicates(seqsin)
    # blank headers to save memory
    headers = 0
    # make sure group has enough sequences before continuing
    if len(seqs) < numkept and not foldless:
        return "", ""
    # headers come out in format Header_# so split to get # and sort by abundance
    seqs.sort(reverse=True, key=lambda count: int(count[0].split("_")[1]))
    # cut to numkept most abundant sequences
    if len(seqs) > numkept:
        seqs = seqs[:numkept]
    return create_locarnap_alignment(seqs, RNA, struct=True, params={"--cpus": cpus})
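# Hypothetical reconstruction of the remove_duplicates helper these scripts
# share: collapse identical sequences in a [(header, seq)] list, rewrite each
# kept header as "<base>_<count>" so callers can sort by abundance via
# header.split("_")[1], and return the collapsed list plus a map from kept
# header to the original headers. Assumes original headers contain no
# underscore; the real PyCogent-era helper may differ.
def remove_duplicates(seqsin):
    by_seq = {}  # seq -> list of original headers
    order = []   # unique sequences in first-seen order
    for header, seq in seqsin:
        if seq not in by_seq:
            by_seq[seq] = []
            order.append(seq)
        by_seq[seq].append(header)
    seqs = []
    headers = {}
    for seq in order:
        origs = by_seq[seq]
        kept = origs[0].split("_")[0] + "_" + str(len(origs))
        seqs.append((kept, seq))
        headers[kept] = origs
    return seqs, headers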
def run_locarnap_for_infernal(currgroup, clusters, otus, basefolder):
    '''Function for multithreading: creates the final locarna-p alignment,
    writes it to files, then makes the r2r struct'''
    # run locarna-p on the superclusters to get the alignment and consensus structure
    # skip if already run and the program just crashed or whatever
    currotufolder = basefolder + "group_" + str(currgroup)
    if exists(currotufolder):
        return ""
    seqs = []
    out = "group " + str(currgroup) + ": "
    count = 0  # initialized before the loop so the total covers all clusters
    for cluster in clusters:
        out += cluster + " "
        for header, seq in MinimalFastaParser(open(otus[cluster], 'rU')):
            seqs.append((header.split()[0], seq))
            count += int(header.split("_")[1])
    out += "\n" + str(count) + " sequences\n"
    # make sure group has enough sequences before continuing:
    # run locarna-p on the at most 50 most abundant sequences in the group
    aln, struct = run_locarnap(seqs, 50, cpus=2, foldless=True)
    # create output folder for group
    mkdir(currotufolder)
    if aln.getNumSeqs() < 50:
        out += str(aln.getNumSeqs()) + " unique sequences\n"
        fout = open(currotufolder + "/unique.fasta", 'w')
        fout.write(aln.toFasta())
        fout.close()
    else:
        s, h = remove_duplicates(seqs)
        out += str(len(s)) + " unique sequences\n"
        write_fasta_list(s, currotufolder + "/unique.fasta")
    out += "Structure: " + struct + "\n"
    # write out alignment and structure in fasta and stockholm formats
    logout = open(currotufolder + "/log.txt", 'w')
    logout.write(out)
    logout.close()
    alnout = open(currotufolder + "/locarnap-aln.fasta", 'w')
    alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
    alnout.close()
    alnout = open(currotufolder + "/locarnap-aln.sto", 'w')
    struct_dict = {'SS_cons': struct}
    alnout.write(stockholm_from_alignment(aln, GC_annotation=struct_dict))
    alnout.close()
    # make R2R secondary structure for alignment
    make_r2r(currotufolder + "/locarnap-aln.sto", currotufolder, "group_" + str(currgroup))
def test_continuous_duplicates(self):
    test = "aaabbbcccddd"
    self.assertEqual(remove_duplicates(test), "abcd")

def test_null_string(self):
    test = ""
    self.assertEqual(remove_duplicates(test), "")

def test_all_duplicates(self):
    test = "aaa"
    self.assertEqual(remove_duplicates(test), "a")
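# These string tests (together with test_no_duplicates further down) imply a
# remove_duplicates(s) that keeps the first occurrence of each character in
# order. A minimal sketch, relying on dict preserving insertion order
# (Python 3.7+):
def remove_duplicates(s):
    return "".join(dict.fromkeys(s))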
def test_remove_duplicates(self):
    remove_duplicates(self.default_list)
    self.assertEqual(self.default_list.size(), 4)
import os
import sys
import traceback
import ntpath

from PIL import Image

from match_files import match_files
from categorize_files import categorize_files
from convert_to_jpg import convert_to_jpg
from read_pgm import read_pgm
from remove_duplicates import remove_duplicates

if __name__ == '__main__':
    path = sys.argv[1]
    print "Matching files"
    match_files(path)
    print "Removing duplicates"
    remove_duplicates(path)
    print "Converting to JPEG"
    convert_to_jpg(path)
    print "Categorizing files"
    categorize_files(path)
    print "Finished."
def main(argv, seed=None):
    # (module-level imports assumed: os, sys, imp, ast, pickle, random,
    #  astunparse, argtracer, plus the timer/LogWriter/MutTransformer helpers)
    random.seed(seed)
    global current_config
    current_config = get_default_config()
    arg = argv[1]
    arg = arg.replace("\\", "/")
    ar1 = arg
    orig_file = ar1
    mut_dir = arg[arg.rfind("/")+1:arg.rfind(".")] if arg.rfind("/") >= 0 else arg[:arg.rfind(".")]
    script_name = mut_dir
    mut_dir = (current_config["default_mut_dir"]+"/").replace("//", "/") + mut_dir + "/"
    # Add script's directory to path
    sys.path.insert(0, mut_dir)
    # Store the reason why the mutation was completed
    mutants_with_cause = []
    # Timeout since our modifications may cause infinite loops
    # (int() on argv[3]: the later max() comparison needs a number, not a string)
    timeout = int(current_config["min_timeout"]) if len(argv) < 4 or not argv[3] else int(argv[3])
    if not os.path.exists(mut_dir):
        os.makedirs(mut_dir)
    else:
        cleanup(mut_dir)
    # Index of the string currently processed
    str_cnt = 0
    # Mutation counter
    mut_cnt = 0
    pick_file = argv[2] if len(argv) > 2 else current_config["default_rejected"]
    pick_handle = open(pick_file, 'rb')
    rej_strs = pickle.load(pick_handle)
    if not rej_strs:
        raise SystemExit("File: " + pick_file + " contains no inputs.")
    # Precompute the locations of conditions and the lines of their then and
    # else cases, and format the file properly
    global manual_errs
    manual_errs = argtracer.compute_base_ast(ar1, mut_dir + script_name + ".py")
    ar1 = mut_dir + script_name + ".py"
    # Record how long the slowest execution takes to better predict the required timeout
    slowest_run = 0
    # Get base values from the non-crashing run with the most conditions traversed
    progress = 1
    base_conds = []
    ln_cond = -1
    for cand in rej_strs:
        pos = 0
        print("Mutated string:", repr(cand[0]), flush=True)
        print("Valid string:", repr(cand[1]), flush=True)
        base_index = 0
        for str_inpt in cand:
            start_time = timer()
            try:
                print("Tracing:", progress, "/", 2*len(rej_strs), flush=True)
                (_, base_cond, _, someerror) = argtracer.trace(ar1, str_inpt)
                if pos == 1:
                    base_conds.append(base_cond)
                    if len(base_cond) > ln_cond:
                        basein = cand[1]
                        base_pos = base_index
                        ln_cond = len(base_cond)
                if someerror:
                    raise SystemExit("Invalid input: " + repr(str_inpt) + ".\nAborted.")
                base_index += 1
            finally:
                pos += 1
                time_elapsed = timer() - start_time
                if time_elapsed > slowest_run:
                    slowest_run = time_elapsed
                progress += 1
    # Choose a timeout that is very likely to let valid mutants finish
    timeout = max(timeout, int(int(current_config["timeout_slow_multi"])*slowest_run)+1)
    try:
        (_, b_cdict, _, err) = argtracer.trace(ar1, basein, timeout=timeout)
    except Timeout:
        print("Execution timed out on basestring! Try increasing timeout (currently", timeout, "seconds)")
        err = True  # treat the timeout as a failure so we exit below
    if err:
        raise SystemExit("Exiting: " + pick_file + " contains no valid inputs for " + ar1)
    # Remove duplicates (same condition trace) from valid inputs
    idxl = 0
    while idxl < len(base_conds):
        idxr = idxl + 1
        while idxr < len(base_conds):
            if get_frozen(base_conds[idxl]) == get_frozen(base_conds[idxr]):
                del base_conds[idxr]
            else:
                idxr += 1
        idxl += 1
    print("Amount of unique base strings:", len(base_conds), flush=True)
    print("Used baseinput:", repr(basein))
    # Log the inputs since they are determined already
    input_log = LogWriter(mut_dir[:-1] + "_inputs.log")
    for i in range(len(rej_strs)):
        input_log.append_line(str(i) + ": " + repr(rej_strs[i][0]) + "\n")
    input_log.append_line("The baseinput was: " + repr(basein))
    lwriter = LogWriter(mut_dir[:-1] + ".log")
    lwriter.append_line("Mutating script: " + repr(orig_file) + "\n")
    all_generated = {int_key: [] for int_key in range(len(base_conds))}
    # Run the mutation process for every rejected string
    for s in rej_strs:
        s = s[0]
        if int(current_config["variable_base"]) == 0:
            queue = [(ar1, [], 0, None, None, None, base_index)]
        else:
            queue = []
            for base_index in range(len(base_conds)):
                queue.append((ar1, [], 0, None, None, None, base_index))
        discarded = set()
        # Save which exception the first execution of the rejected string produced
        original_ex_str = None
        # Stores which exceptions the valid string caused
        except_set = set()
        # The set of final lines observed by mutants rejecting the valid string
        rej_sigs = set()
        while queue:
            (arg, history, retries, pidx, pmstate, scstate, b_cindex) = queue.pop(0)
            b_cdict = base_conds[b_cindex]
            print("Current script:", arg, flush=True)
            # Check whether the chosen correct string is now rejected
            try:
                _mod = imp.load_source('mymod', arg)
            except:
                print("Discarded script:", arg, "(import error)", flush=True)
                os.remove(arg)
                continue
            print("Executing basestring...", flush=True)
            try:
                (lines, _, _, berr) = argtracer.trace(arg, basein, timeout=timeout)
            except argtracer.Timeout:
                print("Discarding:", arg, "(basestring timed out)", flush=True)
                os.remove(arg)
                continue
            # Remove lines used to construct custom exceptions
            lines = manual_errs.remove_custom_lines(lines)
            # If the crash happens on a condition we modified there is a high
            # chance it's invalid, so we remove it.
            if lines[0] in history:
                print("Removed:", arg, "(potentially corrupted condition)", flush=True)
                os.remove(arg)
                continue
            # Mutation guided by rejected strings
            try:
                (lines, cdict, _, err) = argtracer.trace(arg, s, timeout=timeout)
            except:
                print("Discarding:", arg, "(mutated string timed out)", flush=True)
                os.remove(arg)
                continue
            # Remove lines used to construct custom exceptions
            lines = manual_errs.remove_custom_lines(lines)
            # If the crash happens on a condition we modified there is a high
            # chance it's invalid, so we remove it.
            if lines[0] in history:
                print("Removed:", arg, "(potentially corrupted condition)", flush=True)
                os.remove(arg)
                continue
            if original_ex_str is None:
                if not err:
                    print("Skipping string:", s, "(not rejected)!", flush=True)
                    continue
                else:
                    original_ex_str = str(err.__class__)
            # Check whether the modification changed the condition state
            skip = (pmstate is not None and cdict.get(history[-1]) is not None
                    and cdict.get(history[-1]) == pmstate)
            if skip:
                print("Removed:", arg, "(unsuccessful modification)", flush=True)
                if retries < int(current_config["mut_retries"]) and pidx:
                    # Try again
                    full_str = manual_errs.get_if_from_line(history[-1], ar1)
                    cond_str = full_str[full_str.find("if")+3:full_str.rfind(":")]
                    inpt_ast = ast.fix_missing_locations(ast.parse(cond_str))
                    mtrans = MutTransformer(pidx)
                    res = mtrans.visit(inpt_ast)
                    fix = full_str[:full_str.find("if")+2] + " " + astunparse.unparse(res).lstrip().rstrip() + ":"
                    if not fix.endswith("\n"):
                        fix = fix + "\n"
                    mods = {history[-1]: fix}
                    cand = mut_dir + script_name + "_" + str(str_cnt) + "_" + str(mut_cnt) + ".py"
                    # hand pmstate on to the retry (the flattened source said
                    # pstate, which is undefined at this point)
                    queue.insert(0, (cand, history.copy(), retries+1, pidx, pmstate, None, b_cindex))
                    file_copy_replace(cand, arg, mods)
                    mut_cnt += 1
                elif retries >= int(current_config["mut_retries"]):
                    print("Retries exceeded:", arg, flush=True)
                os.remove(arg)
                continue
            sskip = (scstate is not None and cdict.get(history[-1]) is not None
                     and cdict.get(history[-1]) == scstate)
            # Retries would be possible here as well, but since our search is
            # blind for these conditions it's skipped
            if sskip:
                print("Removed:", arg, "(unsuccessful modification) (sec)", flush=True)
                os.remove(arg)
                continue
            if berr and (lines[0] not in rej_sigs or berr not in except_set):
                print("Mutation complete:", arg, "(base rejected)", flush=True)
                print("Exception for base on", arg, ":", repr(berr), flush=True)
                mutants_with_cause.append((arg, "valid string rejected"))
                lwriter.append_line(repr(mutants_with_cause[-1]) + "\n")
            (prim, sec) = get_left_diff(cdict, b_cdict)
            # Remove all elements that have been explored (history) or do not
            # belong to the actual code (i.e. error constructor - lines)
            prim = [e for e in prim if e[0] not in history and e[0] in lines]
            sec = [e for e in sec if e[0] not in history and e[0] in lines] if int(current_config["blind_continue"]) else []
            # Don't create mutants if their line combination is already in the queue
            prim = [] if not prim else rm_dups(prim, history, all_generated, b_cindex)
            # Sec will never be progressed if prim is not empty
            sec = [] if not sec or len(prim) > 0 else rm_dups(sec, history, all_generated, b_cindex)
            print("Used string:", repr(s), flush=True)
            print("Queue length:", len(queue), flush=True)
            print("Change history:", history, flush=True)
            print("Difference to base (flipped):", prim, flush=True)
            print("Difference to base (new):", sec, flush=True)
            print("Final line:", str(lines[0]), flush=True)
            print("", flush=True)
            diff_err = False
            if err:
                # Check whether the exception is different from the first encountered one
                diff_err = str(err.__class__) != original_ex_str
                err = True
            print("Mutated string rejected:", err, "different:", diff_err, flush=True)
            if (err and not diff_err) or int(current_config["early_stop"]) == 0:
                all_fixes = get_possible_fixes((prim, sec), arg)
                if all_fixes:
                    for (fix_list, fix_line, pstate, sstate) in all_fixes:
                        # Create a mutant for every possible fix
                        for (fix, permindex) in fix_list:
                            if not fix.endswith("\n"):
                                fix = fix + "\n"
                            cand = mut_dir + script_name + "_" + str(str_cnt) + "_" + str(mut_cnt) + ".py"
                            mods = {fix_line: fix}
                            queue.insert(0, (cand, history.copy()+[fix_line], 0, permindex, pstate, sstate, b_cindex))
                            file_copy_replace(cand, arg, mods)
                            mut_cnt += 1
            # Check whether the mutant is valid (rejects base or accepts the
            # mutated string) and record its behaviour
            if arg != ar1:
                if not err or diff_err:
                    print("Mutation complete:", arg, "(mutated string accepted)", flush=True)
                    mutants_with_cause.append((arg, "mutated string accepted"))
                    lwriter.append_line(repr(mutants_with_cause[-1]) + "\n")
                elif not berr or (berr and (lines[0] in rej_sigs and berr in except_set)):
                    discarded.add(arg)
                rej_sigs.add(lines[0])
                except_set.add(berr)
        # Don't delete the original script, we need it to create mutants from
        # whenever a new rejected string is processed
        discarded.discard(ar1)
        # Remove all scripts that neither reject the base string nor accept the mutated string
        for scrpt in discarded:
            print("Removed:", scrpt, flush=True)
            os.remove(scrpt)
        # Adjust the file naming
        str_cnt += 1
        mut_cnt = 0
        print("Processing string number:", str(str_cnt), "/", str(len(rej_strs)), flush=True)
    # Move the copy of the original script since it is not a mutant
    orig_out = current_config["default_mut_dir"] + ar1[ar1.rfind("/")+1:]
    if os.path.exists(orig_out):
        os.remove(orig_out)
    os.rename(ar1, orig_out)
    print("Done. The final mutants are in:", mut_dir)
    # Remove duplicates and update the log accordingly
    mutants_with_cause = remove_duplicates(mut_dir, ".py", mutants_with_cause)
    lwriter = LogWriter(mut_dir[:-1] + ".log")
    lwriter.append_line("Mutating script: " + repr(orig_file) + "\n")
    for e in mutants_with_cause:
        lwriter.append_line(repr(e) + "\n")
def phonetise_word(arabic_word):
    utterances = [arabic_word]
    arabic_word = arabic_utils.remove_diacritics(arabic_word)
    result = ''
    # Pronunciations Dictionary
    utterances_pronunciations = []  # Most likely pronunciation for all utterances
    utterances_pronunciations_with_boundaries = []  # Most likely pronunciation for all utterances
    pronunciations = []
    phones = []
    # -----------------------------------------------------------------------
    # Loop through utterances -------------------------------------------------
    # -----------------------------------------------------------------------
    utterance_number = 1
    for utterance in utterances:
        utterance_number += 1
        # Add empty entries that will hold this utterance's pronunciation
        utterances_pronunciations.append('')
        utterances_pronunciations_with_boundaries.append('')
        utterance = convert(utterance)
        # Loop through words
        word_index = -1
        for word in utterance:
            word_index += 1
            if word not in [u'-', u'sil']:
                # Start with an empty set of possible pronunciations of the current word
                pronunciations = []
                # Add fixed irregular pronunciations if possible
                result = isFixedWord2(word, result, word, pronunciations)
                # Indicates whether the current character is in an emphatic context. Starts as False
                emphaticContext = False
                # '##' marks the beginning/end of the word, just for convenience
                word = u'##' + word + u'##'
                # Empty list which will hold one possible pronunciation of the word
                phones = []
                # -----------------------------------------------------------------
                # MAIN LOOP: the Modern Standard Arabic phonetisation rule-set -----
                # -----------------------------------------------------------------
                for index in range(2, len(word) - 2):
                    letter = word[index]                       # Current character
                    nextCharacter = word[index + 1]            # Next character
                    afterNextCharacter = word[index + 2]       # Next-next character
                    previousCharacter = word[index - 1]        # Previous character
                    beforePreviousCharacter = word[index - 2]  # Before-previous character
                    emphaticContext = emphatic_context.getState(letter, nextCharacter)
                    if letter in constants.unambiguousConsonantMap:
                        phones.append(constants.unambiguousConsonantMap[letter])
                    # Lam is a consonant which requires special treatment
                    if letter == u'l':
                        phones += handle_characters.lam(beforePreviousCharacter, previousCharacter,
                                                        nextCharacter, afterNextCharacter)
                    # Shadda just doubles the letter before it
                    if letter == u'~' and previousCharacter not in [u'w', u'y'] and len(phones) > 0:
                        phones[-1] += phones[-1]
                    # Madda only changes based on emphaticness
                    if letter == u'|':
                        phones += handle_characters.madda(emphatic_context)
                    # Ta' marboota is determined by whether the following character is a diacritic
                    if letter == u'p':
                        phones += handle_characters.p(nextCharacter)
                    if letter in constants.vowelMap:
                        # Waw and Ya are complex: they could be consonants or vowels, and
                        # their gemination could be a combination of a vowel and consonants
                        phones += handle_characters.handle_vowels(
                            previousCharacter, letter, nextCharacter,
                            afterNextCharacter, emphaticContext)
                        # Kasra and Damma could be mildened if before a final silent consonant
                        if letter in [u'u', u'i']:
                            phones += handle_characters.kasra_and_damma(
                                word, letter, emphaticContext, nextCharacter, afterNextCharacter)
                        # Alif could be omitted in the definite article and at the beginning of some words
                        if letter in [u'a', u'A', u'Y']:
                            phones += handle_characters.alef(beforePreviousCharacter, previousCharacter,
                                                             letter, nextCharacter, emphaticContext)
                pronunciations += get_different_possible_pronounciations(phones)
    pronunciations = remove_duplicates(pronunciations)
    return [' '.join(item) for item in pronunciations if len(item) >= len(arabic_word)]
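# phonetise_word deduplicates a list of phone *lists*, which are unhashable, so
# a plain set() over them would raise TypeError. A minimal sketch that keys on
# tuples while preserving order (a reconstruction, not necessarily the
# project's own helper):
def remove_duplicates(pronunciations):
    seen = set()
    unique = []
    for item in pronunciations:
        key = tuple(item)
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique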
def test_output(self):
    """Are all duplicates removed?"""
    duplicates = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
    output = remove_duplicates(duplicates)
    self.assertEqual(output, [1, 2, 3, 4])
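# test_output above expects an order-preserving dedup that returns a new list.
# A minimal sketch of such a helper:
def remove_duplicates(items):
    seen = set()
    result = []
    for x in items:
        if x not in seen:
            seen.add(x)
            result.append(x)
    return result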
def test_no_duplicates(self):
    test = "abcd"
    self.assertEqual(remove_duplicates(test), "abcd")
lines_to_keep = []
current_run = []
for line in file:
    Q = float(line.split(",")[0])
    if Q < -1:
        # finished a run, so randomly pick a number:
        # with probability 1/4 pick the maximum
        if random.random() < 0.25:
            lines_to_keep.append(current_run[-1])
        else:
            lines_to_keep.append(pick_random_partition(current_run))
        current_run = []
    current_run.append(line)
file.close()

if options.ensure_uniqueness:
    lines_to_keep = remove_duplicates(lines_to_keep)

if options.filename is None:
    for line in lines_to_keep:
        print line,
else:
    output_file = open(options.filename, "w")
    for line in lines_to_keep:
        output_file.write(line)
    output_file.close()
# run locarna-p on the superclusters to get the alignment and consensus structure
seqs = []
out = "group " + str(currgroup) + ": "
for cluster in structgroups[currstruct]:
    out += cluster + " "
    for header, seq in MinimalFastaParser(open(otus[cluster], "rU")):
        seqs.append((header, seq))
print out
print str(len(seqs)) + " sequences"
# make sure group has enough sequences before continuing:
# run locarna-p on the at most 50 most abundant sequences in the group
aln, struct = run_locarnap(seqs, 50, cpus=args.c, foldless=True)
if aln.getNumSeqs() < 50:
    print str(aln.getNumSeqs()) + " unique sequences"
else:
    s, h = remove_duplicates(seqs)
    print str(len(s)) + " unique sequences"
    s = 0
    h = 0
print "Structure: " + struct
# print out alignment and structure in fasta and stockholm formats
# create output folder for group
currotufolder = otufolder + "group_" + str(currgroup)
if not exists(currotufolder):
    mkdir(currotufolder)
# write the alignment files
alnout = open(currotufolder + "/locarnap-aln.fasta", "w")
alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
alnout.close()
alnout = open(currotufolder + "/locarnap-aln.sto", "w")
# Get list of OTUs from file, populate list of (name, filepath) tuples
# need to add -i for input, -o for out folder, -c for cpus, -r current round
otus = []
fn = open(argv[1], "rU")
for line in fn:
    lineinfo = line.strip().split()
    otus.append((lineinfo[0], lineinfo[1]))
fn.close()
for currotu in otus:
    otu = currotu[0]
    print "==" + otu + "=="
    print "Reading in 30 most abundant sequences"
    # assuming that the fasta has more than 30 sequences in it.
    # Safe assumption if this is a significant cluster
    seqs = [(header, seq) for header, seq in MinimalFastaParser(open(currotu[1], "rU"))]
    seqs, headers = remove_duplicates(seqs)
    # blank headers to save memory
    headers = 0
    # headers come out in format Header_# so split to get # and sort by abundance
    seqs.sort(reverse=True, key=lambda count: int(count[0].split("_")[1]))
    # cut to 30 most abundant sequences
    seqs = seqs[:30]
    print "Running locarna-p on sequences"
    args = {"--cpus": "24"}
    aln, struct = create_locarnap_alignment(seqs, RNA, struct=True, params=args)
    # create output folder for OTU
    otufolder = "/Users/Ely/Desktop/Ely_selection/R7/lead_clusters/"
    if not exists(otufolder):
        mkdir(otufolder)
    otufolder += otu
    if not exists(otufolder):
def test_one_node(self):
    inputs = ['a', 'b', 'c']
    for head_value in inputs:
        head = node.Node(head_value)
        self.assertEqual(remove_duplicates.remove_duplicates(head), head)
def test_remove_duplicates(self):
    self.assertEqual(remove_duplicates([0, 0, 1, 1, 1, 2, 2, 3, 3, 4]), 5)
    self.assertEqual(remove_duplicates([1, 1, 2]), 2)
    self.assertEqual(remove_duplicates([0, 1, 2, 3, 4]), 5)
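# These assertions match the in-place "remove duplicates from a sorted array"
# variant that returns the count of unique elements rather than a new list.
# A minimal two-pointer sketch of the function under test:
def remove_duplicates(nums):
    if not nums:
        return 0
    write = 1  # everything before `write` is already deduplicated
    for read in range(1, len(nums)):
        if nums[read] != nums[write - 1]:
            nums[write] = nums[read]
            write += 1
    return write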
          str(len(rem)) + " sequences removed")
print str(len(kept)) + " sequences left, " + \
    str(len(rem)) + " sequences removed. " + str((time() - secs)/60) + " minutes\n"
write_fasta_list(kept, currfolder + "-Stripped.fasta")
write_fasta_list(rem, currfolder + "-NotStripped.fasta")
rem = 0
# remove all sequences with Ns and short sequences
print "Remove short and ambiguous sequences"
secs = time()
kept = rem_N_short(kept, args.l)
log.write("Remove short and ambiguous sequences\n" + str(len(kept)) + " sequences left\n")
print str(len(kept)) + " sequences left. " + str((time() - secs)/60) + " minutes"
write_fasta_list(kept, currfolder + "-CleanStripped.fasta")
# remove duplicate sequences from the fasta file and store for later
print "Remove duplicates"
secs = time()
kept, headers = remove_duplicates(kept)
write_fasta_list(kept, currfolder + "-Unique.fasta")
# write out file holding headers keyed to a sequence
keyfile = open(currfolder + "-seqtoheaders.txt", 'w')
for key in headers:
    keyfile.write(key + "\t")
    for item in headers[key]:
        keyfile.write(item + ",")
    keyfile.write("\n")
keyfile.close()
log.write("Remove duplicates\n" + str(len(kept)) + " sequences left")
print str(len(kept)) + " sequences left. " + str((time() - secs)/60) + " minutes\n"
log.close()
def test_empty(self):
    inputs = [None]
    for head in inputs:
        self.assertEqual(remove_duplicates.remove_duplicates(head), head)