def main():
    parser = argparse.ArgumentParser(
        description="Calculate sequence-level epitope coverage for "
                    "a library design.")
    parser.add_argument('-d', '--design',
                        help="Name of design file to compare against "
                             "reference file.")
    parser.add_argument('-r', '--reference',
                        help="Reference dataset from which 'design' was "
                             "created; the script calculates epitope coverage "
                             "for each sequence in the reference.")
    parser.add_argument('-o', '--output',
                        help="Name of file to write output to.")
    parser.add_argument('--delimiter', default='|',
                        help="Delimiter to place between a sequence name "
                             "and its percent epitope coverage.")
    parser.add_argument('-e', '--epitope_size', type=int,
                        help="Integer size of epitopes to use for coverage "
                             "information gathering.")
    args = parser.parse_args()

    ref_names, ref_sequences = oligo.read_fasta_lists(args.reference)
    design_names, design_seqs = oligo.read_fasta_lists(args.design)

    header = "Sequence Name%sPercent Epitopes (%d-mers) Covered in design" % (
        args.delimiter, args.epitope_size)
    out_strings = list()

    # Collect every epitope-sized kmer present anywhere in the design.
    design_kmers = set()
    for item in design_seqs:
        design_kmers |= oligo.subset_lists_iter(item, args.epitope_size, 1)

    for index in range(len(ref_names)):
        current_name = ref_names[index]
        current_seq = ref_sequences[index]
        current_kmers = oligo.subset_lists_iter(current_seq,
                                                args.epitope_size, 1)

        # Fraction of this reference sequence's kmers found in the design;
        # sequences shorter than the epitope size have no kmers to cover.
        if len(current_kmers) > 0:
            perc_cov = len(current_kmers & design_kmers) / len(current_kmers)
        else:
            perc_cov = 0.0

        out_strings.append("%s%s%f\n" % (current_name, args.delimiter,
                                         perc_cov))

    with open(args.output, 'w') as out_file:
        out_file.write("%s\n" % header)
        for item in out_strings:
            out_file.write(item)
def main():
    names, sequences = oligo.read_fasta_lists( sys.argv[ 1 ] )
    kmer_set = set()
    for seq in sequences:
        kmer_set |= oligo.subset_lists_iter( seq, 9, 1 )
    print( "%s|%d" % ( sys.argv[ 1 ], len( kmer_set ) ) )
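# Nearly every snippet in this section leans on two oligo helpers:
# read_fasta_lists and the three-argument form of subset_lists_iter.
# Below is a minimal, hypothetical sketch of what those helpers appear to do,
# inferred only from how they are called here; the real module may differ
# (one later script also calls a four-argument variant not covered here).
def read_fasta_lists(filename):
    """Return parallel lists of record names and sequences from a fasta file."""
    names, sequences, current = [], [], []
    with open(filename) as handle:
        for line in handle:
            line = line.strip()
            if line.startswith('>'):
                if current:
                    sequences.append(''.join(current))
                    current = []
                names.append(line[1:])
            elif line:
                current.append(line)
    if current:
        sequences.append(''.join(current))
    return names, sequences


def subset_lists_iter(sequence, window_size, step_size):
    """Return the set of kmers of length window_size taken every step_size."""
    return {sequence[i:i + window_size]
            for i in range(0, len(sequence) - window_size + 1, step_size)}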
def fasta_to_dict(filename):
    names, sequences = oligo.read_fasta_lists(filename)
    out_dict = {}
    for index, name in enumerate(names):
        out_dict[name] = sequences[index]
    return out_dict
def parse(self, filename):
    out_fasta = Fasta(filename=filename)
    names, sequences = oligo.read_fasta_lists(filename)
    for name, seq in zip(names, sequences):
        out_fasta.add_seq(Sequence(name=name, sequence=seq))
    return out_fasta
def parse(self, filename):
    names, sequences = oligo.read_fasta_lists(filename)
    out_seqs = list()
    for index, name in enumerate(names):
        out_seqs.append(Sequence(name=name, sequence=sequences[index]))
    return out_seqs
def parse_fasta(fname):
    names, sequences = oligo.read_fasta_lists(fname)
    # Strip alignment gap characters before building Sequence records.
    sequences = [Sequence(n, s.replace('-', ''))
                 for n, s in zip(names, sequences)]
    return Collection(data=sequences)
def main():
    if len(sys.argv) != 4:
        print("Verify correctness of exact-match output.")
        print("USAGE: check_correct_counts.py designed_fasta "
              "pepsirf_demux_output reads_fastq")
        sys.exit(1)

    design_lib = sys.argv[1]
    demux_output = sys.argv[2]
    reads_fastq = sys.argv[3]

    start_index = 43
    length = 90

    demux_counts = get_counts(demux_output)
    d_n, d_s = oligo.read_fasta_lists(design_lib)
    fastq_counts = get_fastq_counts(reads_fastq, start_index, length)

    seq_dict = {}
    for name, seq in zip(d_n, d_s):
        seq_dict[name] = seq

    for name, count in demux_counts.items():
        if seq_dict[name] in fastq_counts:
            try:
                assert count == fastq_counts[seq_dict[name]]
            except AssertionError:
                print(seq_dict[name], count, fastq_counts[seq_dict[name]])
def _count_kmers_in_file(self, filename):
    oligo_set = set()
    names, sequences = oligo.read_fasta_lists(self._dir_name + '/' + filename)
    for item in sequences:
        oligo_set |= oligo.subset_lists_iter(item, self._kmer_size, 1)
    return len(oligo_set)
def get_kmer_sizes(input_dir, k):
    output = list()
    for f_in in os.listdir(input_dir):
        fname = f'{input_dir}/{f_in}'
        names, seqs = oligo.read_fasta_lists(fname)
        output.append((f_in, seqs_to_kmers(seqs, k)))
    return output
def main():
    in_file = sys.argv[1]
    names, sequences = oligo.read_fasta_lists(in_file)

    print("Number of oligos in original design: %d" % len(names))
    print("Number of unique oligos: %d" % len(set(sequences)))

    names, sequences = oligo.get_unique_sequences(names, sequences)
    oligo.write_fastas(names, sequences, in_file + "_unique")
def main():
    original = sys.argv[1]
    clusters = sys.argv[2]

    orig_dict = {}
    orig_names, orig_seqs = oligo.read_fasta_lists(original)
    clusters_names, clusters_seqs = oligo.read_fasta_lists(clusters)

    assert len(orig_names) == len(orig_seqs)
    assert len(clusters_names) == len(clusters_seqs)

    for current in range(len(orig_names)):
        orig_dict[orig_names[current]] = orig_seqs[current]

    successful = 0
    for current in range(len(clusters_names)):
        name = clusters_names[current]
        seq = clusters_seqs[current]
        if name in orig_dict:
            assert orig_dict[name] == seq
            successful += 1
def count_kmers_in_file(filename, k, count_total=False):
    names, sequences = oligo.read_fasta_lists(filename)
    kmers = set()
    ret_val = 0
    for seq in sequences:
        kmer_set = oligo.subset_lists_iter(seq, k, 1)
        if count_total:
            ret_val += len(kmer_set)
        else:
            kmers |= kmer_set
            ret_val = len(kmers)
    return ret_val
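# A quick usage sketch for count_kmers_in_file; "design.fasta" is a
# placeholder path, not a file from this repository.
unique_count = count_kmers_in_file("design.fasta", 9)
total_count = count_kmers_in_file("design.fasta", 9, count_total=True)
print("unique 9-mers: %d, total 9-mers: %d" % (unique_count, total_count))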
def get_records_from_fasta(fname):
    records = dict()
    # Capture the sequence ID and the second comma-separated taxid in the
    # OXX= field of each fasta header.
    line_re = re.compile(r"ID=([\S]+)\s[\S]+\sOXX=\d*,(\d*),\d*,\d*")
    if fname:
        names, seqs = oligo.read_fasta_lists(fname)
        for name in names:
            name_search = re.search(line_re, name)
            if name_search:
                id = name_search.group(1)
                ox = name_search.group(2)
                records[id] = str(ox)
            else:
                print("No match: ", name)
    return records
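# The regular expression above expects headers of roughly the shape shown
# below. This header is invented purely to illustrate the captured groups;
# real headers in the dataset may carry different fields.
import re

example = "ID=ABC123 OS=SomeVirus OXX=11234,11234,11118,11102"
match = re.search(r"ID=([\S]+)\s[\S]+\sOXX=\d*,(\d*),\d*,\d*", example)
assert match.group(1) == "ABC123"   # sequence ID
assert match.group(2) == "11234"    # second OXX taxid value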
def main(): arg_parser = argparse.ArgumentParser( description= "Simple creation of a library, both gap spanning and non-gap spanning algorithms" ) arg_parser.add_argument('-q', '--query', help="Fasta query file.") arg_parser.add_argument('-o', '--output', help="Fasta file to output") arg_parser.add_argument('-g', '--gap_span', help="Fasta query file.", default=False, action="store_true") arg_parser.add_argument('-w', '--window_size', help="Window Size to use for grabbing oligos.", default=19, type=int) arg_parser.add_argument( '-s', '--step_size', help="Number of amino acids to step after each window.", default=10, type=int) args = arg_parser.parse_args() names, sequences = oligo.read_fasta_lists(args.query) seqs = list() for name, sequence in zip(names, sequences): seqs.append(Sequence(name=name, sequence=sequence)) print("Number of input sequences: ", len(seqs)) if args.gap_span: designer = GapSpanningLibraryDesigner(window_size=args.window_size, step_size=args.step_size) else: designer = LibraryDesigner(window_size=args.window_size, step_size=args.step_size) library = designer.design(seqs) print("Number of output Kmers: ", len(library)) write_sequences(args.output, library)
def main():
    if len(sys.argv) != 2:
        print("Perform IUPred analysis on a fasta containing "
              "more than one sequence. IUPred must be executable with "
              "'./iupred2a.py'. Output is sent to standard out.")
        print("USAGE: do_pred.py input_file")
        sys.exit(1)

    names, sequences = oligo.read_fasta_lists(sys.argv[1])
    for name, sequence in zip(names, sequences):
        # Write each record to a temporary single-sequence fasta so IUPred
        # can be run on it individually.
        with tempfile.NamedTemporaryFile(mode='w') as of:
            of.write('>%s\n%s' % (name, sequence))
            of.flush()
            print('>' + name)
            print(subprocess.check_output('./iupred2a.py %s long' % of.name,
                                          shell=True).decode('ascii'))
def parse_fasta(fname):
    names, sequences = oligo.read_fasta_lists(fname)
    return [Sequence(a, b) for a, b in zip(names, sequences)]
def main(): usage = "usage: %prog [options]" option_parser = optparse.OptionParser(usage) add_program_options(option_parser) options, arguments = option_parser.parse_args() if options.query is None: print("ERROR: Fasta query file must be provided.") sys.exit(0) names, sequences = oligo.read_fasta_lists(options.query) names, sequences = oligo.sort_sequences_by_length(names, sequences) cluster_dict = {} # Get the sequences sorted in decreasing order names.reverse() sequences.reverse() num_seqs = len(sequences) ymer_dict = {} for index in range(num_seqs): ymer_dict[sequences[index]] = oligo.subset_lists_iter( sequences[index], options.XmerWindowSize, 1) num_seqs = len(sequences) out_list = np.empty(1) for current_seq in range(num_seqs): for inner_index in range(current_seq + 1, num_seqs): out_list = np.append( out_list, oligo.get_single_sequence_dist( ymer_dict[sequences[current_seq]], ymer_dict[sequences[inner_index]], options.XmerWindowSize, 1)) out_list = np.delete(out_list, 0) if options.verbose: print("Distance Matrix complete") Z = linkage(out_list, 'single') cluster = fcluster(Z, options.clusters, criterion='maxclust') out_file = open(options.output, 'w') if options.verbose: print("Clustering Complete") for sequence in range(len(names)): if cluster[sequence] not in cluster_dict: cluster_dict[cluster[sequence]] = list() cluster_dict[cluster[sequence]].append( (names[sequence], sequences[sequence])) out_file.write("%d %s\n" % (cluster[sequence], names[sequence])) if options.verbose: display_cluster_information(cluster_dict, out_list, options.XmerWindowSize, 1, ymer_dict) out_file.close()
def count_sequences_in_file(filename):
    names, sequences = oligo.read_fasta_lists(filename)
    return len(sequences)
def parse_fasta( fname ):
    n, s = oligo.read_fasta_lists( fname )
    seqs = [ Sequence( name = na, seq = se.replace( '-', '' ) )
             for na, se in zip( n, s ) ]
    return SequenceCollection( seqs )
def cluster_taxonomically(options, sequence_dict, kmer_dict):
    """
    Clusters sequences based on taxonomic rank. Ranks that have more than the
    options.number amount of sequences will be split up evenly.

    :param options: options object to get the program preferences from.
    :param sequence_dict: dictionary of sequences with name: sequence
                          key-value pairs.
    :returns clusters: dictionary of cluster: sequence pairings.
    """
    names, sequences = seq_dict_to_names_and_seqs(sequence_dict)

    if options.start is None:
        start_ranks = ['FAMILY']
    else:
        start_ranks = options.start

    # Get the ranks in descending order.
    ranks = reversed(
        sorted([oligo.Rank[item.upper()].value for item in start_ranks]))
    ranks = [oligo.Rank(item).name for item in ranks]

    sequence_dict = copy.deepcopy(sequence_dict)
    current_rank = ranks[0]

    reference_names, reference_seqs = oligo.read_fasta_lists(
        options.unclustered)
    ref_dict = create_seq_dict(reference_names, reference_seqs)

    # Inverted dictionaries for missing id resolutions.
    ref_dict_inverted = create_seq_dict(reference_names, reference_seqs,
                                        key='sequences')
    seq_dict_inverted = create_seq_dict(names, sequences, key='sequences')
    combined_dictionaries = combine_dicts(ref_dict_inverted, seq_dict_inverted)

    rank_map = oligo.parse_rank_map(options.rank_map)

    created_clusters = {}
    clusters_created = list()
    missing_seqs = list()
    sequence_tax_id = set()

    for current_name, current_seq in sequence_dict.items():
        if 'TaxID' not in current_name and 'OX' not in current_name:
            taxid = resolve_missing_taxid(current_name, current_seq,
                                          combined_dictionaries)
            if taxid:
                sequence_tax_id.add(taxid)
            else:
                missing_seqs.append((current_name, current_seq))
        else:
            sequence_tax_id.add(oligo.get_taxid_from_name(current_name))

    tax_data = oligo.get_taxdata_from_file(options.lineage)
    tax_data = oligo.fill_tax_gaps(tax_data, rank_map)

    del reference_seqs

    # Taxids that have been merged into other taxids.
    merged_ids = {
        10969: 444185, 11619: 2169991, 11630: 2169993, 11806: 353765,
        45218: 2169996, 45222: 2169994, 45709: 2169992, 489502: 10407,
        587201: 10255, 587202: 10255, 587203: 10255, 1173522: 11723,
        1554474: 1511807, 1554482: 1330068, 1554483: 1330491,
        1554492: 1330066, 1554494: 1307800, 1554498: 1511784,
        1559366: 1513237, 1560037: 1131483, 2169701: 11027
    }

    current_rank = oligo.Rank[ranks[len(ranks) - 1]].value
    index = 0

    for index in range(len(ranks)):
        current_rank = oligo.Rank[ranks[index]].value
        rank_data = oligo.group_seq_from_taxid(sequence_tax_id, merged_ids,
                                               tax_data, current_rank)
        deleted_clusters = list()

        if len(sequence_dict) > 0:
            for current_name in list(sequence_dict.keys()):
                current_id = oligo.get_taxid_from_name(current_name)
                if current_id:
                    current_id = int(current_id)
                else:
                    current_id = resolve_missing_taxid(
                        current_name, sequence_dict[current_name],
                        combined_dictionaries)

                if current_id:
                    current_id = check_for_id_in_merged_ids(merged_ids,
                                                            current_id)
                    current_rank_data = rank_data[current_id].lower()

                    if (current_id in rank_data
                            and current_rank_data not in deleted_clusters):
                        if current_rank_data not in created_clusters:
                            new_cluster = cluster.Cluster(current_rank_data)
                            created_clusters[current_rank_data] = new_cluster

                        created_clusters[
                            current_rank_data].add_sequence_and_its_kmers(
                                current_name, sequence_dict[current_name],
                                kmer_dict[current_name])

                        if (created_clusters[current_rank_data].get_num_kmers()
                                > options.number and index < len(ranks) - 1):
                            # Put the items back in the pool of choices if our
                            # cluster becomes too large.
                            put_large_cluster_back_in_pool(
                                created_clusters, sequence_dict,
                                current_rank_data)
                            deleted_clusters.append(current_rank_data)
                        else:
                            del sequence_dict[current_name]

    if missing_seqs:
        for name, seq in missing_seqs:
            best_cluster = seq_cluster_best_match(created_clusters.values(),
                                                  kmer_dict[name],
                                                  options.kmerSize)
            best_cluster.add_sequence_and_its_kmers(name, seq, kmer_dict[name])

    return created_clusters
def main(): usage = "usage %prog [options]" option_parser = optparse.OptionParser(usage) add_program_options(option_parser) options, arguments = option_parser.parse_args() if options.query is None: print("Fasta query file must be provided.") sys.exit() names, sequences = oligo.read_fasta_lists(options.query) num_seqs = len(names) assert len(names) == len(sequences) sequence_dict = create_seq_dict(names, sequences) ymer_dict = {} total_ymers = set() for current_seq in range(len(sequences)): current_ymers = frozenset( oligo.subset_lists_iter(sequences[current_seq], options.kmerSize, 1)) total_ymers |= current_ymers ymer_dict[names[current_seq]] = current_ymers if 'tax' in options.clustering: if options.lineage: clusters_with_kmers = {} created_clusters = cluster_taxonomically(options, sequence_dict, ymer_dict) else: print( "Lineage file must be provided for taxonomic clustering, exiting" ) sys.exit() else: sorted_ids = sorted(options.id.split(',')) created_clusters = cluster_by_kmers(float(sorted_ids[0]), sequence_dict, ymer_dict) for current_id in sorted_ids[1::]: current_id = float(current_id) max_cluster_size = max( [item.get_num_kmers() for item in created_clusters.values()]) if max_cluster_size > options.number: re_cluster_kmers(sequence_dict, ymer_dict, created_clusters, current_id, options.number) print("Id threshold: %s." % options.id) min_cluster_size, median_cluster_size, avg_cluster_size, max_cluster_size = get_cluster_stats( created_clusters, total_ymers) print("Number of unique ymers: %d." % len(total_ymers)) print("Number of clusters: %d." % len(created_clusters.keys())) print("Minimum cluster size: %d." % min_cluster_size) print("Median cluster size: %.2f." % median_cluster_size) print("Average cluster size: %.2f." % avg_cluster_size) print("Maximum cluster size: %d." % max_cluster_size) write_outputs(options.output, created_clusters, options.number)
def parse_fasta(fname):
    names, sequences = oligo.read_fasta_lists(fname)
    return [SequenceWithLocation(n, s) for n, s in zip(names, sequences)]
def parse_fasta(fname):
    n, s = oligo.read_fasta_lists(fname)
    return [Sequence(na, se) for na, se in zip(n, s)]
def main(): usage = "usage: %prog [options]" option_parser = optparse.OptionParser(usage) add_program_options(option_parser) options, arguments = option_parser.parse_args() names, sequences = oligo.read_fasta_lists(options.query) min_ymers = 999999999999999999999999999999999 for i in range(options.iterations): xmer_seq_dict = {} # create list of Xmer sequences for index in range(len(sequences)): name, sequence = oligo.subset_lists_iter(names[index], sequences[index], options.XmerWindowSize, options.stepSize) for index in range(len(sequence)): if oligo.is_valid_sequence(sequence[index], options.minLength, options.percentValid): value = [options.redundancy, name[index]] xmer_seq_dict[sequence[index]] = value # create dict of Ymer sequences ymer_seq_dict = {} # Break each ymer up into subsets of xmer size for index in range(len(sequences)): name, sequence = oligo.subset_lists_iter(names[index], sequences[index], options.YmerWindowSize, options.stepSize) for index in range(len(sequence)): if oligo.is_valid_sequence(sequence[index], options.minLength, options.percentValid): ymer_seq_dict[sequence[index]] = name[index] total_ymers = len(ymer_seq_dict) array_design = {} array_xmers = {} to_add = [] ymer_xmers = [] iter_count = 0 while True: #reset max score at the beginning of each iteration max_score = 0 for current_ymer in ymer_seq_dict.keys(): # calculate the score of this ymer score, subset_ymer = calculate_score(current_ymer, xmer_seq_dict, options.XmerWindowSize, 1) if score > max_score: to_add = list() max_score = score to_add.append(current_ymer) ymer_xmers = [subset_ymer] elif score == max_score: to_add.append(current_ymer) ymer_xmers.append(subset_ymer) random_index = random.choice(range(len(to_add))) oligo_to_remove = to_add[random_index] chosen_xmers = ymer_xmers[random_index] # array_xmers.update(chosen_xmers) for each in chosen_xmers: array_xmers[each] = array_xmers.get(each, 0) + 1 # subtract from the score of each xmer within the chosen ymer for item in chosen_xmers: if item in xmer_seq_dict: # We dont' want negative scores if xmer_seq_dict[item][0] > 0: xmer_seq_dict[item][0] -= 1 else: print("%s - not found in xmer dict!!!" 
% (item)) iter_count += 1 if len(ymer_seq_dict) == 0 or max_score <= 0: print("Final design includes %d %d-mers (%.1f%% of total) " % (len(array_design), options.YmerWindowSize, (len(array_design) / float(total_ymers)) * 100)) # average_redundancy = sum( xmer_seq_dict[ item ][ 0 ] for item in xmer_seq_dict ) / len( xmer_seq_dict ) print("%d unique %d-mers in final %d-mers (%.2f%% of total)" % (len(array_xmers), options.XmerWindowSize, options.YmerWindowSize, (float(len(array_xmers)) / len(xmer_seq_dict)) * 100)) print("Average redundancy of %d-mers in %d-mers: %.2f" % (options.XmerWindowSize, options.YmerWindowSize, sum(array_xmers.values()) / float(len(array_xmers)))) if len(array_design) < min_ymers: min_ymers = len(array_design) best_xmer_seq_dict = xmer_seq_dict del (xmer_seq_dict) best_array_design = array_design del (array_design) break try: array_design[oligo_to_remove] = ymer_seq_dict[oligo_to_remove] del ymer_seq_dict[oligo_to_remove] except KeyError: continue if not iter_count % 250: print("Current Iteration: " + str(iter_count)) # print( "Number of output ymers: " + str( len( array_design ) ) ) print("Current xmer dictionary score: " + str(sum(item[0] for item in xmer_seq_dict.values()))) write_outputs(best_xmer_seq_dict, options.outPut) names = [] sequences = [] # Write resulting oligos to file for sequence, name in best_array_design.items(): names.append(name) sequences.append(sequence) oligo.write_fastas(names, sequences, output_name=options.outPut + "_R" + str(options.redundancy) + ".fasta")
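# calculate_score is not shown in this section. Judging from its call site
# above, it appears to return a ymer's score together with the xmers it
# contains. A minimal hypothetical sketch under that assumption (the real
# helper may compute the score differently):
def calculate_score(ymer, xmer_seq_dict, xmer_size, step):
    # Score a ymer by the remaining redundancy of each xmer it contains.
    xmers = {ymer[i:i + xmer_size]
             for i in range(0, len(ymer) - xmer_size + 1, step)}
    score = sum(xmer_seq_dict[x][0] for x in xmers if x in xmer_seq_dict)
    return score, xmers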
def parse(self):
    names, sequences = oligo.read_fasta_lists(self.filename)
    return self.seq_factory.create_seq_list(names, sequences)
def parse_peptides(fname):
    names, sequences = oligo.read_fasta_lists(fname)
    return names
def main(): arg_parser = argparse.ArgumentParser( description= "Parse representative output map to produce a sprot/trembl file") arg_parser.add_argument('-s', '--sprot', help="Input sprot fasta to parse") arg_parser.add_argument('-t', '--trembl', help="Input trembl file to parse") arg_parser.add_argument('-m', '--map_file', help="Input map file to parse.") args = arg_parser.parse_args() out_sprot_name = args.map_file + "_sprot" out_trembl_name = args.map_file + "_trembl" sprot_names, sprot_seqs = oligo.read_fasta_lists(args.sprot) trembl_names, trembl_seqs = oligo.read_fasta_lists(args.trembl) in_sprot_seqs = {} in_trembl_seqs = {} out_sprot_seqs = {} out_trembl_seqs = {} for index in range(len(sprot_names)): current_name = sprot_names[index] current_seq = sprot_seqs[index] in_sprot_seqs[current_name] = current_seq for index in range(len(trembl_names)): current_name = trembl_names[index] current_seq = trembl_seqs[index] in_trembl_seqs[current_name] = current_seq map_items = parse_map(args.map_file) for current in map_items: added = False for inner in current: if inner in in_sprot_seqs: added = True out_sprot_seqs[inner.strip()] = in_sprot_seqs[inner.strip()] break if not added: out_trembl_seqs[current[0].strip()] = in_trembl_seqs[ current[0].strip()] out_sprot_names = list() out_sprot_sequences = list() if len(out_sprot_seqs): for key, value in out_sprot_seqs.items(): out_sprot_names.append(key) out_sprot_sequences.append(value) out_trembl_names = list() out_trembl_sequences = list() if len(out_trembl_seqs): for key, value in out_trembl_seqs.items(): out_trembl_names.append(key) out_trembl_sequences.append(value) oligo.write_fastas(out_sprot_names, out_sprot_sequences, out_sprot_name) oligo.write_fastas(out_trembl_names, out_trembl_sequences, out_trembl_name)
def main():
    arg_parser = argparse.ArgumentParser(
        description = ( "Determines and outputs the number of unique "
                        "sequences/species/genera/families in a fasta file" ) )
    arg_parser.add_argument( '-f', '--fasta',
                             help = "Input fasta from which to gather data" )
    arg_parser.add_argument( '-t', '--tax_db',
                             help = "Name of file containing mappings of "
                                    "taxids -> rank data" )
    arg_parser.add_argument( '-g', '--gap_file',
                             help = "File containing mappings of taxid->rank "
                                    "for use in filling gaps" )
    arg_parser.add_argument( '--oligo_file', help = "Parsed oligo map file" )

    args = arg_parser.parse_args()

    taxid_dict = {}
    gap_dict = {}
    missing_id_key = { 10969: 444185, 11619: 2169991, 11630: 2169993,
                       11806: 353765, 45218: 2169996, 45222: 2169994,
                       45709: 2169992, 489502: 10407, 587201: 10255,
                       587202: 10255, 587203: 10255, 1173522: 11723,
                       1554474: 1511807, 1554482: 1330068, 1554483: 1330491,
                       1554492: 1330066, 1554494: 1307800, 1554498: 1511784,
                       1559366: 1513237, 1560037: 1131483 }

    # Parse and store tax_db and gap_file info.
    with open( args.gap_file ) as gap_file:
        for line in gap_file:
            line = line.split( '|' )
            gap_dict[ line[ 0 ] ] = line[ 1 ].strip()

    with open( args.tax_db, 'r' ) as tax_db:
        for line in tax_db:
            line = line.split( '|' )
            taxid_dict[ line[ 0 ].strip() ] = [ item.strip()
                                                for item in line[ 1:: ] ]

    # Fill gaps in the rank data.
    for id, info in taxid_dict.items():
        if str( id ) in gap_dict:
            if gap_dict[ str( id ) ] == "SPECIES":
                taxid_dict[ id ][ 1 ] = taxid_dict[ id ][ 0 ]
                taxid_dict[ id ][ 0 ] = ""
            elif gap_dict[ str( id ) ] == "GENUS":
                taxid_dict[ id ][ 2 ] = taxid_dict[ id ][ 0 ]
                taxid_dict[ id ][ 0 ] = ""
            elif gap_dict[ str( id ) ] == "FAMILY":
                taxid_dict[ id ][ 3 ] = taxid_dict[ id ][ 0 ]
                taxid_dict[ id ][ 0 ] = ""

    num_seqs = 0
    unique_species = set()
    unique_genera = set()
    unique_families = set()

    names, sequences = oligo.read_fasta_lists( args.fasta )
    num_seqs = len( names )

    tax_ids = set( [ oligo.get_taxid_from_name( item ) for item in names ] )
    missing_ids = set()

    for current in tax_ids:
        try:
            if int( current ) in missing_id_key:
                current = str( missing_id_key[ int( current ) ] )
            unique_species.add( taxid_dict[ current ][ 1 ] )
            unique_genera.add( taxid_dict[ current ][ 2 ] )
            unique_families.add( taxid_dict[ current ][ 3 ] )
        except KeyError:
            missing_ids.add( current )

    unique_species = [ item for item in unique_species if len( item ) > 0 ]
    unique_genera = [ item for item in unique_genera if len( item ) > 0 ]
    unique_families = [ item for item in unique_families if len( item ) > 0 ]

    print( "Number of sequences: %d" % num_seqs )
    print( "Number of unique species: %d" % len( unique_species ) )
    print( "Number of unique genera: %d" % len( unique_genera ) )
    print( "Number of unique families: %d" % len( unique_families ) )
    print( "Missing ids: %s" % ",".join( list( missing_ids ) ) )

    species_from_file = set()
    genera_from_file = set()
    families_from_file = set()

    with open( args.oligo_file, 'r' ) as oligo_file:
        counter = 0
        for line in oligo_file:
            # Skip the header line.
            if not counter:
                counter += 1
                continue
            line = line.split( '\t' )
            for item in line[ 5 ].split( ',' ):
                if len( item.strip() ) > 0:
                    species_from_file.add( item.strip() )
            for item in line[ 6 ].split( ',' ):
                if len( item.strip() ) > 0:
                    genera_from_file.add( item.strip() )
            for item in line[ 7 ].split( ',' ):
                if len( item.strip() ) > 0:
                    families_from_file.add( item.strip() )

    print( "Number of species from oligo table: %d" % len( species_from_file ) )
    print( "Number of genera from oligo table: %d" % len( genera_from_file ) )
    print( "Number of families from oligo table: %d" % len( families_from_file ) )

    print( "Species missing from reference dataset: %s "
           % "|".join( species_from_file - set( unique_species ) ) )
    print( "Genera missing from reference dataset: %s "
           % "|".join( genera_from_file - set( unique_genera ) ) )
    print( "Families missing from reference dataset: %s "
           % "|".join( families_from_file - set( unique_families ) ) )
    print()
    print( "Species missing from oligo table: %s "
           % "|".join( set( unique_species ) - species_from_file ) )
    print( "Genera missing from oligo table: %s "
           % "|".join( set( unique_genera ) - genera_from_file ) )
    print( "Families missing from oligo table: %s "
           % "|".join( set( unique_families ) - families_from_file ) )
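# For orientation, the parsing loops above imply pipe-delimited inputs roughly
# like the rows below. These example lines are invented to show the expected
# shape (taxid, then rank fields where index 1 is species, 2 genus, 3 family);
# real gap_file and tax_db rows may carry different values.
gap_line = "11234|SPECIES"
tax_line = "11234|unranked name|Some species|Somegenus|Someviridae"

taxid, fill_rank = [ field.strip() for field in gap_line.split( '|' ) ]
rank_fields = [ item.strip() for item in tax_line.split( '|' )[ 1: ] ]
# rank_fields[ 1 ] -> species, rank_fields[ 2 ] -> genus, rank_fields[ 3 ] -> family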