예제 #1
0
def main():
    """Compute per-sequence epitope coverage of a library design.

    For every sequence in the reference fasta, calculate the fraction of
    its epitope-sized kmers that appear anywhere in the design fasta, and
    write one ``name<delimiter>fraction`` line per sequence to the output
    file (after a header line).
    """
    parser = argparse.ArgumentParser(
        description="Calculate sequence-level epitope coverage for "
        "some library design.")

    parser.add_argument('-d',
                        '--design',
                        help="Name of design file to compare against "
                        "reference file.")
    parser.add_argument('-r',
                        '--reference',
                        help="Reference dataset from which 'design' was "
                        "created, script calculates epitope coverage "
                        "for each sequence in the reference.")
    parser.add_argument('-o',
                        '--output',
                        help="Name of file to write output to. ")
    parser.add_argument('--delimiter',
                        default='|',
                        help="Delimiter to place between sequence name "
                        "and its percent epitope coverage.")
    parser.add_argument('-e',
                        '--epitope_size',
                        type=int,
                        help="Integer size of epitopes to use for coverage "
                        "information gathering.")

    args = parser.parse_args()

    ref_names, ref_sequences = oligo.read_fasta_lists(args.reference)
    design_names, design_seqs = oligo.read_fasta_lists(args.design)

    header = "Sequence Name%sPercent Epitopes (%d-mers) Covered in design\n" % (
        args.delimiter, args.epitope_size)

    # Union of every epitope-sized kmer present anywhere in the design.
    design_kmers = set()
    for item in design_seqs:
        design_kmers |= oligo.subset_lists_iter(item, args.epitope_size, 1)

    out_strings = list()
    for current_name, current_seq in zip(ref_names, ref_sequences):
        current_kmers = oligo.subset_lists_iter(current_seq,
                                                args.epitope_size, 1)

        if current_kmers:
            perc_cov = len(current_kmers & design_kmers) / len(current_kmers)
        else:
            # BUG FIX: was `perc_cov = 4`, a nonsensical value for a
            # coverage fraction. A sequence too short to yield any kmers
            # has zero coverage.
            perc_cov = 0.0

        out_strings.append("%s%s%f\n" %
                           (current_name, args.delimiter, perc_cov))

    with open(args.output, 'w') as out_file:
        # BUG FIX: header already ends with '\n'; the original wrote
        # "%s\n" % header, emitting a stray blank line after the header.
        out_file.write(header)
        out_file.writelines(out_strings)
예제 #2
0
def main():
    """Print 'filename|count' where count is the number of distinct
    9-mers across every sequence in the fasta given as argv[1]."""
    fasta_path = sys.argv[ 1 ]
    names, sequences = oligo.read_fasta_lists( fasta_path )

    # Accumulate the union of 9-mers (step size 1) over all sequences.
    all_kmers = set()
    for current in sequences:
        all_kmers.update( oligo.subset_lists_iter( current, 9, 1 ) )

    print( "%s|%d" % ( fasta_path, len( all_kmers ) ) )
예제 #3
0
def fasta_to_dict(filename):
    """Read a fasta file and return a {name: sequence} mapping."""
    names, sequences = oligo.read_fasta_lists(filename)
    # Names and sequences come back as parallel lists; pair them up.
    return dict(zip(names, sequences))
예제 #4
0
    def parse(self, filename):
        """Parse *filename* into a Fasta object holding one Sequence
        per record in the file."""
        names, sequences = oligo.read_fasta_lists(filename)

        result = Fasta(filename=filename)
        for seq_name, seq_data in zip(names, sequences):
            result.add_seq(Sequence(name=seq_name, sequence=seq_data))
        return result
예제 #5
0
    def parse(self, filename):
        """Return a list of Sequence objects, one per fasta record in
        *filename*."""
        names, sequences = oligo.read_fasta_lists(filename)
        return [
            Sequence(name=seq_name, sequence=seq_data)
            for seq_name, seq_data in zip(names, sequences)
        ]
예제 #6
0
def parse_fasta(fname):
    """Parse a fasta file into a Collection, stripping gap ('-')
    characters from every sequence."""
    names, raw_seqs = oligo.read_fasta_lists(fname)

    cleaned = []
    for seq_name, seq_data in zip(names, raw_seqs):
        cleaned.append(Sequence(seq_name, seq_data.replace('-', '')))

    return Collection(data=cleaned)
def main():
    """Verify exact-match demux counts.

    For every design sequence reported by the demultiplexer, compare its
    count against the number of exact matches found directly in the reads
    fastq (at a fixed offset/length), printing any mismatches.
    """
    if len(sys.argv) != 4:
        print("Verify correctness of exact-match output.")
        print(
            "USAGE: check_correct_counts.py designed_fasta pepsirf_demux_output reads_fastq"
        )
        sys.exit(1)

    design_lib = sys.argv[1]
    demux_output = sys.argv[2]
    reads_fastq = sys.argv[3]

    # Position and size of the encoded oligo within each read.
    start_index = 43
    length = 90

    demux_counts = get_counts(demux_output)
    d_n, d_s = oligo.read_fasta_lists(design_lib)
    fastq_counts = get_fastq_counts(reads_fastq, start_index, length)

    # name -> sequence lookup for the design library.
    seq_dict = dict(zip(d_n, d_s))

    for name, count in demux_counts.items():
        if seq_dict[name] in fastq_counts:
            # BUG FIX: the original used assert/except AssertionError to
            # report mismatches; asserts are stripped under `python -O`,
            # which would silently disable the whole check. Compare
            # explicitly instead.
            if count != fastq_counts[seq_dict[name]]:
                print(seq_dict[name], count, fastq_counts[seq_dict[name]])
예제 #8
0
    def _count_kmers_in_file(self, filename):
        """Return the number of distinct kmers (size self._kmer_size,
        step 1) across every sequence in *filename*, resolved relative
        to self._dir_name."""
        _, sequences = oligo.read_fasta_lists(self._dir_name + '/' +
                                              filename)

        kmers = set()
        for seq in sequences:
            kmers.update(oligo.subset_lists_iter(seq, self._kmer_size, 1))
        return len(kmers)
예제 #9
0
def get_kmer_sizes(input_dir, k):
    """For every file in *input_dir*, return (filename, kmer_data)
    tuples where kmer_data is seqs_to_kmers applied to that file's
    sequences with kmer size *k*."""
    results = []
    for entry in os.listdir(input_dir):
        _, seqs = oligo.read_fasta_lists(f'{input_dir}/{entry}')
        results.append((entry, seqs_to_kmers(seqs, k)))
    return results
예제 #10
0
def main():
    """Report duplicate oligos in a fasta (argv[1]) and write a
    de-duplicated copy to '<input>_unique'."""
    in_file = sys.argv[1]
    names, sequences = oligo.read_fasta_lists(in_file)

    print("Number of oligos in original design: %d" % len(names))
    print("Number of unique oligos:             %d" % len(set(sequences)))

    unique_names, unique_seqs = oligo.get_unique_sequences(names, sequences)
    oligo.write_fastas(unique_names, unique_seqs, in_file + "_unique")
예제 #11
0
def main():
    """Verify that every cluster sequence whose name also appears in
    the original fasta carries an identical sequence there."""
    original = sys.argv[1]
    clusters = sys.argv[2]

    orig_names, orig_seqs = oligo.read_fasta_lists(original)
    clusters_names, clusters_seqs = oligo.read_fasta_lists(clusters)

    # Sanity check: names and sequences come back as parallel lists.
    assert len(orig_names) == len(orig_seqs)
    assert len(clusters_names) == len(clusters_seqs)

    orig_dict = dict(zip(orig_names, orig_seqs))

    successful = 0
    for name, seq in zip(clusters_names, clusters_seqs):
        if name in orig_dict:
            assert orig_dict[name] == seq
            successful += 1
예제 #12
0
def count_kmers_in_file(filename, k, count_total=False):
    """Count kmers of size *k* (step 1) in *filename*.

    When *count_total* is True, return the sum of per-sequence kmer
    counts (duplicates counted per sequence); otherwise return the
    number of distinct kmers across the whole file.
    """
    _, sequences = oligo.read_fasta_lists(filename)

    if count_total:
        total = 0
        for seq in sequences:
            total += len(oligo.subset_lists_iter(seq, k, 1))
        return total

    distinct = set()
    for seq in sequences:
        distinct |= oligo.subset_lists_iter(seq, k, 1)
    return len(distinct)
예제 #13
0
def get_records_from_fasta(fname):
    """Map each sequence ID in *fname*'s record names to a taxid.

    Record names are expected to look like
    ``ID=<id> <...> OXX=a,b,c,d``; the captured taxid is the second OXX
    field. Names that do not match are reported to stdout.

    :param fname: fasta filename; a falsy value yields an empty dict.
    :returns: dict of {sequence_id: taxid_string}
    """
    records = dict()
    # BUG FIX: raw string literal — '\S' and '\d' inside a plain string
    # are invalid escape sequences (DeprecationWarning today, a syntax
    # error in future Python versions).
    line_re = re.compile(r"ID=([\S]+)\s[\S]+\sOXX=\d*,(\d*),\d*,\d*")
    if fname:
        names, seqs = oligo.read_fasta_lists(fname)
        for name in names:
            name_search = line_re.search(name)
            if name_search:
                # 'record_id' avoids shadowing the builtin 'id'.
                record_id = name_search.group(1)
                ox = name_search.group(2)

                records[record_id] = str(ox)
            else:
                print("No match: ", name)
    return records
예제 #14
0
def main():
    """Design a peptide library from a query fasta using either the
    gap-spanning or the plain sliding-window algorithm, then write the
    resulting kmers to the output fasta."""
    arg_parser = argparse.ArgumentParser(
        description=
        "Simple creation of a library, both gap spanning and non-gap spanning algorithms"
    )

    arg_parser.add_argument('-q', '--query', help="Fasta query file.")
    arg_parser.add_argument('-o', '--output', help="Fasta file to output")
    arg_parser.add_argument('-g',
                            '--gap_span',
                            help="Fasta query file.",
                            default=False,
                            action="store_true")
    arg_parser.add_argument('-w',
                            '--window_size',
                            help="Window Size to use for grabbing oligos.",
                            default=19,
                            type=int)
    arg_parser.add_argument(
        '-s',
        '--step_size',
        help="Number of amino acids to step after each window.",
        default=10,
        type=int)

    args = arg_parser.parse_args()

    names, sequences = oligo.read_fasta_lists(args.query)
    seqs = [
        Sequence(name=cur_name, sequence=cur_seq)
        for cur_name, cur_seq in zip(names, sequences)
    ]

    print("Number of input sequences: ", len(seqs))

    # Pick the designer implementation, then configure it identically.
    designer_cls = (GapSpanningLibraryDesigner
                    if args.gap_span else LibraryDesigner)
    designer = designer_cls(window_size=args.window_size,
                            step_size=args.step_size)

    library = designer.design(seqs)

    print("Number of output Kmers: ", len(library))

    write_sequences(args.output, library)
예제 #15
0
def main():
    """Run IUPred2a disorder prediction on every sequence in a fasta.

    Each sequence is written to a temporary single-record fasta, the
    predictor is invoked on it, and its output is printed to stdout
    under a '>name' header line.
    """
    if len(sys.argv) != 2:
        print(
            "Perform IUPred analysis on a fasta containing "
            "more than one sequence. IUPred must be executable with './iupred2a'. "
            "Output is sent to standard out.")

        print("USAGE: do_pred.py input_file")

        sys.exit(1)

    names, sequences = oligo.read_fasta_lists(sys.argv[1])

    for name, sequence in zip(names, sequences):
        with tempfile.NamedTemporaryFile(mode='w') as of:
            of.write('>%s\n%s' % (name, sequence))
            # Flush so the subprocess sees the full record on disk.
            of.flush()

            print('>' + name)

            # ROBUSTNESS FIX: invoke without a shell so the temp-file
            # path is passed as a literal argument instead of being
            # interpreted by /bin/sh (original used shell=True with a
            # %-interpolated command string).
            print(
                subprocess.check_output(
                    ['./iupred2a.py', of.name, 'long']).decode('ascii'))
예제 #16
0
def parse_fasta(fname):
    """Read *fname* and return a list of Sequence(name, seq) records."""
    names, sequences = oligo.read_fasta_lists(fname)

    records = []
    for seq_name, seq_data in zip(names, sequences):
        records.append(Sequence(seq_name, seq_data))
    return records
예제 #17
0
def main():
    """Hierarchically cluster fasta sequences by pairwise xmer distance.

    Builds a condensed pairwise distance vector from each sequence's
    xmer set, runs single-linkage clustering cut at options.clusters,
    and writes one 'cluster_id sequence_name' line per sequence.
    """
    usage = "usage: %prog [options]"
    option_parser = optparse.OptionParser(usage)
    add_program_options(option_parser)
    options, arguments = option_parser.parse_args()

    if options.query is None:
        print("ERROR: Fasta query file must be provided.")
        sys.exit(0)

    names, sequences = oligo.read_fasta_lists(options.query)
    names, sequences = oligo.sort_sequences_by_length(names, sequences)

    cluster_dict = {}

    # Get the sequences sorted in decreasing order
    names.reverse()
    sequences.reverse()

    num_seqs = len(sequences)

    # Pre-compute each sequence's xmer set once.
    ymer_dict = {}
    for index in range(num_seqs):
        ymer_dict[sequences[index]] = oligo.subset_lists_iter(
            sequences[index], options.XmerWindowSize, 1)

    # Condensed pairwise distance vector (upper triangle, row-major).
    # PERFORMANCE FIX: the original called np.append inside the double
    # loop, reallocating and copying the whole array for every pair;
    # accumulate in a list and convert once.
    distances = []
    for current_seq in range(num_seqs):
        for inner_index in range(current_seq + 1, num_seqs):
            distances.append(
                oligo.get_single_sequence_dist(
                    ymer_dict[sequences[current_seq]],
                    ymer_dict[sequences[inner_index]], options.XmerWindowSize,
                    1))
    out_list = np.array(distances)

    if options.verbose:
        print("Distance Matrix complete")

    Z = linkage(out_list, 'single')

    cluster = fcluster(Z, options.clusters, criterion='maxclust')

    if options.verbose:
        print("Clustering Complete")

    # RESOURCE FIX: context manager guarantees the output file is closed
    # even if an exception is raised while writing.
    with open(options.output, 'w') as out_file:
        for sequence in range(len(names)):
            if cluster[sequence] not in cluster_dict:
                cluster_dict[cluster[sequence]] = list()
            cluster_dict[cluster[sequence]].append(
                (names[sequence], sequences[sequence]))

            out_file.write("%d %s\n" % (cluster[sequence], names[sequence]))

    if options.verbose:
        display_cluster_information(cluster_dict, out_list,
                                    options.XmerWindowSize, 1, ymer_dict)
예제 #18
0
def count_sequences_in_file(filename):
    """Return the number of sequences *filename* contains."""
    _, sequences = oligo.read_fasta_lists(filename)
    return len(sequences)
예제 #19
0
def parse_fasta( fname ):
    """Read *fname*, strip gap ('-') characters from every sequence,
    and wrap the records in a SequenceCollection."""
    names, raw = oligo.read_fasta_lists( fname )

    cleaned = []
    for cur_name, cur_seq in zip( names, raw ):
        cleaned.append( Sequence( name = cur_name, seq = cur_seq.replace( '-', '' ) ) )

    return SequenceCollection( cleaned )
예제 #20
0
def cluster_taxonomically(options, sequence_dict, kmer_dict):
    """
        Clusters sequences based on taxonomic rank. Ranks that have more than
        the options.number amount of sequences will be split up evenly.

        :param options: options object to get the program preferences from.
        :param sequence_dict: dictionary of sequences with name: sequence key-value pairs
        :param kmer_dict: dictionary of name: kmer-set pairs, one per sequence

        :returns clusters: dictionary of cluster: sequence pairings
    """

    names, sequences = seq_dict_to_names_and_seqs(sequence_dict)

    if options.start is None:
        start_ranks = ['FAMILY']
    else:
        start_ranks = options.start
    # Get the ranks in descending order
    ranks = reversed(
        sorted([oligo.Rank[item.upper()].value for item in start_ranks]))

    ranks = [oligo.Rank(item).name for item in ranks]
    # Work on a copy so the caller's dict is not mutated as sequences
    # are assigned to clusters.
    sequence_dict = copy.deepcopy(sequence_dict)

    reference_names, reference_seqs = oligo.read_fasta_lists(
        options.unclustered)

    ref_dict = create_seq_dict(reference_names, reference_seqs)

    # inverted dictionaries for missing id resolutions
    ref_dict_inverted = create_seq_dict(reference_names,
                                        reference_seqs,
                                        key='sequences')
    seq_dict_inverted = create_seq_dict(names, sequences, key='sequences')

    combined_dictionaries = combine_dicts(ref_dict_inverted, seq_dict_inverted)

    rank_map = oligo.parse_rank_map(options.rank_map)

    created_clusters = {}

    missing_seqs = list()

    # Collect the taxid of every input sequence, resolving ids for names
    # that carry no 'TaxID'/'OX' tag by looking the sequence up in the
    # combined (reference + input) inverted dictionaries.
    sequence_tax_id = set()
    for current_name, current_seq in sequence_dict.items():
        if 'TaxID' not in current_name and 'OX' not in current_name:
            taxid = resolve_missing_taxid(current_name, current_seq,
                                          combined_dictionaries)

            if taxid:
                sequence_tax_id.add(taxid)
            else:
                missing_seqs.append((current_name, current_seq))
        else:
            sequence_tax_id.add(oligo.get_taxid_from_name(current_name))

    tax_data = oligo.get_taxdata_from_file(options.lineage)
    tax_data = oligo.fill_tax_gaps(tax_data, rank_map)

    # Free the (potentially large) raw reference sequences early.
    del reference_seqs

    # NCBI taxids that have been merged into a replacement id.
    merged_ids = {
        10969: 444185,
        11619: 2169991,
        11630: 2169993,
        11806: 353765,
        45218: 2169996,
        45222: 2169994,
        45709: 2169992,
        489502: 10407,
        587201: 10255,
        587202: 10255,
        587203: 10255,
        1173522: 11723,
        1554474: 1511807,
        1554482: 1330068,
        1554483: 1330491,
        1554492: 1330066,
        1554494: 1307800,
        1554498: 1511784,
        1559366: 1513237,
        1560037: 1131483,
        2169701: 11027
    }

    # Walk the ranks from broadest to narrowest; sequences whose cluster
    # grows too large are put back in the pool and retried at the next rank.
    for index in range(len(ranks)):
        current_rank = oligo.Rank[ranks[index]].value

        rank_data = oligo.group_seq_from_taxid(sequence_tax_id, merged_ids,
                                               tax_data, current_rank)
        deleted_clusters = list()

        if len(sequence_dict) > 0:
            # Iterate over a snapshot of the keys — entries are deleted
            # from sequence_dict as they are clustered.
            for current_name in list(sequence_dict.keys()):

                current_id = oligo.get_taxid_from_name(current_name)
                if current_id:
                    current_id = int(current_id)
                else:
                    current_id = resolve_missing_taxid(
                        current_name, sequence_dict[current_name],
                        combined_dictionaries)
                if current_id:
                    current_id = check_for_id_in_merged_ids(
                        merged_ids, current_id)

                    # BUG FIX: the original evaluated
                    # rank_data[current_id] BEFORE testing
                    # 'current_id in rank_data', raising KeyError for any
                    # id with no data at this rank; test membership first.
                    if current_id in rank_data:
                        current_rank_data = rank_data[current_id].lower()

                        if current_rank_data not in deleted_clusters:
                            if current_rank_data not in created_clusters:
                                new_cluster = cluster.Cluster(
                                    current_rank_data)
                                created_clusters[
                                    current_rank_data] = new_cluster

                            created_clusters[
                                current_rank_data].add_sequence_and_its_kmers(
                                    current_name,
                                    sequence_dict[current_name],
                                    kmer_dict[current_name])

                            if created_clusters[
                                    current_rank_data].get_num_kmers(
                                    ) > options.number and index < len(
                                        ranks) - 1:

                                # Put the items back in the pool of choices if our cluster becomes too large
                                put_large_cluster_back_in_pool(
                                    created_clusters, sequence_dict,
                                    current_rank_data)
                                deleted_clusters.append(current_rank_data)

                            else:
                                del sequence_dict[current_name]

    # Sequences whose taxid could not be resolved are assigned to the
    # existing cluster whose kmers they best match.
    if missing_seqs:
        for name, seq in missing_seqs:
            best_cluster = seq_cluster_best_match(created_clusters.values(),
                                                  kmer_dict[name],
                                                  options.kmerSize)
            best_cluster.add_sequence_and_its_kmers(name, seq, kmer_dict[name])

    return created_clusters
예제 #21
0
def main():
    """Cluster fasta sequences taxonomically or by kmer identity, then
    report cluster statistics and write the clusters to the output."""

    usage = "usage %prog [options]"
    option_parser = optparse.OptionParser(usage)

    add_program_options(option_parser)

    options, arguments = option_parser.parse_args()

    if options.query is None:
        print("Fasta query file must be provided.")
        sys.exit()

    names, sequences = oligo.read_fasta_lists(options.query)

    # Names and sequences must be parallel lists.
    assert len(names) == len(sequences)

    sequence_dict = create_seq_dict(names, sequences)

    # Per-sequence kmer sets plus their union over the whole input.
    ymer_dict = {}
    total_ymers = set()

    for current_seq in range(len(sequences)):
        current_ymers = frozenset(
            oligo.subset_lists_iter(sequences[current_seq], options.kmerSize,
                                    1))
        total_ymers |= current_ymers
        ymer_dict[names[current_seq]] = current_ymers

    if 'tax' in options.clustering:
        if options.lineage:
            # CLEANUP: removed the unused 'clusters_with_kmers' local.
            created_clusters = cluster_taxonomically(options, sequence_dict,
                                                     ymer_dict)

        else:
            print(
                "Lineage file must be provided for taxonomic clustering, exiting"
            )
            sys.exit()
    else:
        # Kmer clustering: cluster at the lowest id threshold first, then
        # re-cluster oversized clusters at each higher threshold in turn.
        sorted_ids = sorted(options.id.split(','))
        created_clusters = cluster_by_kmers(float(sorted_ids[0]),
                                            sequence_dict, ymer_dict)

        for current_id in sorted_ids[1::]:
            current_id = float(current_id)
            max_cluster_size = max(
                [item.get_num_kmers() for item in created_clusters.values()])

            if max_cluster_size > options.number:
                re_cluster_kmers(sequence_dict, ymer_dict, created_clusters,
                                 current_id, options.number)

        print("Id threshold: %s." % options.id)

    min_cluster_size, median_cluster_size, avg_cluster_size, max_cluster_size = get_cluster_stats(
        created_clusters, total_ymers)

    print("Number of unique ymers: %d." % len(total_ymers))
    print("Number of clusters: %d." % len(created_clusters.keys()))
    print("Minimum cluster size: %d." % min_cluster_size)
    print("Median cluster size: %.2f." % median_cluster_size)
    print("Average cluster size: %.2f." % avg_cluster_size)
    print("Maximum cluster size: %d." % max_cluster_size)

    write_outputs(options.output, created_clusters, options.number)
예제 #22
0
def parse_fasta(fname):
    """Read *fname* and return SequenceWithLocation records, one per
    fasta entry."""
    names, sequences = oligo.read_fasta_lists(fname)
    return [
        SequenceWithLocation(seq_name, seq_data)
        for seq_name, seq_data in zip(names, sequences)
    ]
예제 #23
0
def parse_fasta(fname):
    """Read *fname* and return one Sequence per fasta record."""
    names, seqs = oligo.read_fasta_lists(fname)

    result = []
    for cur_name, cur_seq in zip(names, seqs):
        result.append(Sequence(cur_name, cur_seq))
    return result
예제 #24
0
def main():
    """Greedy set-cover style oligo library design.

    Over options.iterations randomized runs, repeatedly pick the ymer
    (long window) covering the highest-scoring set of xmers (short
    windows), decrement the redundancy budget of the xmers it covers,
    and keep the smallest resulting design. Ties between equal-scoring
    ymers are broken at random, which is why multiple iterations can
    yield different-sized designs.
    """
    usage = "usage: %prog [options]"
    option_parser = optparse.OptionParser(usage)

    add_program_options(option_parser)

    options, arguments = option_parser.parse_args()

    names, sequences = oligo.read_fasta_lists(options.query)

    # Sentinel larger than any plausible design size; the best (smallest)
    # design across iterations replaces it.
    min_ymers = 999999999999999999999999999999999

    for i in range(options.iterations):

        # xmer -> [remaining redundancy budget, source name]
        xmer_seq_dict = {}

        # create list of Xmer sequences
        for index in range(len(sequences)):

            name, sequence = oligo.subset_lists_iter(names[index],
                                                     sequences[index],
                                                     options.XmerWindowSize,
                                                     options.stepSize)

            # NOTE: inner loop reuses the name 'index', shadowing the
            # outer loop variable; safe here because the outer range()
            # reassigns it each pass, but fragile to edit.
            for index in range(len(sequence)):
                if oligo.is_valid_sequence(sequence[index], options.minLength,
                                           options.percentValid):
                    value = [options.redundancy, name[index]]
                    xmer_seq_dict[sequence[index]] = value

        # create dict of Ymer sequences
        ymer_seq_dict = {}

        # Break each ymer up into subsets of xmer size
        for index in range(len(sequences)):

            name, sequence = oligo.subset_lists_iter(names[index],
                                                     sequences[index],
                                                     options.YmerWindowSize,
                                                     options.stepSize)

            for index in range(len(sequence)):

                if oligo.is_valid_sequence(sequence[index], options.minLength,
                                           options.percentValid):
                    ymer_seq_dict[sequence[index]] = name[index]

        total_ymers = len(ymer_seq_dict)

        # array_design: chosen ymer -> its name; array_xmers: xmer ->
        # number of chosen ymers covering it.
        array_design = {}
        array_xmers = {}
        to_add = []
        ymer_xmers = []
        iter_count = 0

        while True:
            #reset max score at the beginning of each iteration
            max_score = 0
            # Find all ymers tied for the best score this round.
            for current_ymer in ymer_seq_dict.keys():
                # calculate the score of this ymer
                score, subset_ymer = calculate_score(current_ymer,
                                                     xmer_seq_dict,
                                                     options.XmerWindowSize, 1)

                if score > max_score:
                    to_add = list()
                    max_score = score
                    to_add.append(current_ymer)
                    ymer_xmers = [subset_ymer]
                elif score == max_score:
                    to_add.append(current_ymer)
                    ymer_xmers.append(subset_ymer)

            # Break score ties randomly — the source of run-to-run variance.
            random_index = random.choice(range(len(to_add)))
            oligo_to_remove = to_add[random_index]
            chosen_xmers = ymer_xmers[random_index]
            #        array_xmers.update(chosen_xmers)
            for each in chosen_xmers:
                array_xmers[each] = array_xmers.get(each, 0) + 1

            # subtract from the score of each xmer within the chosen ymer
            for item in chosen_xmers:
                if item in xmer_seq_dict:
                    # We dont' want negative scores
                    if xmer_seq_dict[item][0] > 0:
                        xmer_seq_dict[item][0] -= 1
                else:
                    print("%s - not found in xmer dict!!!" % (item))

            iter_count += 1

            # Stop when the pool is empty or nothing scores positively.
            if len(ymer_seq_dict) == 0 or max_score <= 0:
                print("Final design includes %d %d-mers (%.1f%% of total) " %
                      (len(array_design), options.YmerWindowSize,
                       (len(array_design) / float(total_ymers)) * 100))
                #            average_redundancy = sum( xmer_seq_dict[ item ][ 0 ] for item in xmer_seq_dict ) / len( xmer_seq_dict )
                print("%d unique %d-mers in final %d-mers (%.2f%% of total)" %
                      (len(array_xmers), options.XmerWindowSize,
                       options.YmerWindowSize,
                       (float(len(array_xmers)) / len(xmer_seq_dict)) * 100))
                print("Average redundancy of %d-mers in %d-mers: %.2f" %
                      (options.XmerWindowSize, options.YmerWindowSize,
                       sum(array_xmers.values()) / float(len(array_xmers))))
                # Keep only the smallest design seen across iterations.
                # NOTE(review): best_xmer_seq_dict/best_array_design are
                # only bound when an iteration improves min_ymers — a
                # NameError at write_outputs if options.iterations == 0;
                # confirm callers always pass iterations >= 1.
                if len(array_design) < min_ymers:
                    min_ymers = len(array_design)
                    best_xmer_seq_dict = xmer_seq_dict
                    del (xmer_seq_dict)
                    best_array_design = array_design
                    del (array_design)
                break

            try:
                array_design[oligo_to_remove] = ymer_seq_dict[oligo_to_remove]
                del ymer_seq_dict[oligo_to_remove]
            except KeyError:
                continue

            # Progress report every 250 picks.
            if not iter_count % 250:
                print("Current Iteration: " + str(iter_count))
                #            print( "Number of output ymers: " + str( len( array_design ) ) )
                print("Current xmer dictionary score: " +
                      str(sum(item[0] for item in xmer_seq_dict.values())))

    write_outputs(best_xmer_seq_dict, options.outPut)

    names = []
    sequences = []

    # Write resulting oligos to file
    for sequence, name in best_array_design.items():
        names.append(name)
        sequences.append(sequence)

    oligo.write_fastas(names,
                       sequences,
                       output_name=options.outPut + "_R" +
                       str(options.redundancy) + ".fasta")
예제 #25
0
    def parse(self):
        """Read self.filename and build sequence objects through the
        configured factory."""
        names, sequences = oligo.read_fasta_lists(self.filename)
        return self.seq_factory.create_seq_list(names, sequences)
예제 #26
0
def parse_peptides(fname):
    """Return only the record names from the peptide fasta *fname*;
    the sequences themselves are discarded."""
    names, _ = oligo.read_fasta_lists(fname)
    return names
예제 #27
0
def main():
    """Split map entries between sprot and trembl outputs.

    For each line of the representative map, if any representative
    appears in the sprot fasta, the first such one goes to the sprot
    output; otherwise the line's first entry goes to the trembl output.
    Both outputs are written as fastas named after the map file.
    """
    arg_parser = argparse.ArgumentParser(
        description=
        "Parse representative output map to produce a sprot/trembl file")

    arg_parser.add_argument('-s', '--sprot', help="Input sprot fasta to parse")
    arg_parser.add_argument('-t',
                            '--trembl',
                            help="Input trembl file to parse")
    arg_parser.add_argument('-m',
                            '--map_file',
                            help="Input map file to parse.")

    args = arg_parser.parse_args()

    out_sprot_name = args.map_file + "_sprot"
    out_trembl_name = args.map_file + "_trembl"

    sprot_names, sprot_seqs = oligo.read_fasta_lists(args.sprot)
    trembl_names, trembl_seqs = oligo.read_fasta_lists(args.trembl)

    # IDIOM: name -> sequence lookups built with dict(zip(...)) instead
    # of the original index loops.
    in_sprot_seqs = dict(zip(sprot_names, sprot_seqs))
    in_trembl_seqs = dict(zip(trembl_names, trembl_seqs))

    out_sprot_seqs = {}
    out_trembl_seqs = {}

    map_items = parse_map(args.map_file)

    for current in map_items:
        added = False
        for inner in current:
            if inner in in_sprot_seqs:
                added = True
                out_sprot_seqs[inner.strip()] = in_sprot_seqs[inner.strip()]
                break
        if not added:
            out_trembl_seqs[current[0].strip()] = in_trembl_seqs[
                current[0].strip()]

    # Dicts preserve insertion order, so keys()/values() stay parallel.
    out_sprot_names = list(out_sprot_seqs.keys())
    out_sprot_sequences = list(out_sprot_seqs.values())

    out_trembl_names = list(out_trembl_seqs.keys())
    out_trembl_sequences = list(out_trembl_seqs.values())

    oligo.write_fastas(out_sprot_names, out_sprot_sequences, out_sprot_name)
    oligo.write_fastas(out_trembl_names, out_trembl_sequences, out_trembl_name)
예제 #28
0
def main():
    """Report unique sequence/species/genus/family counts for a fasta.

    Cross-references the fasta's taxids against a taxid->rank database
    (with gap-filling), then compares the resulting species/genera/
    families against those listed in a parsed oligo map table, printing
    the differences in both directions.
    """
    arg_parser = argparse.ArgumentParser( description = (
                                                            "Determines and outputs number of unique "
                                                            "sequences/species/genera/families in a fasta file"
                                                        )
                                        )

    arg_parser.add_argument( '-f',  '--fasta',    help = "Input fasta from which to gather data")
    arg_parser.add_argument( '-t', '--tax_db',   help = "Name of file containing mappings of taxids -> rank data")
    arg_parser.add_argument( '-g',  '--gap_file', help = "File containing mappings of taxid->rank for use in filling gaps" )

    arg_parser.add_argument( '--oligo_file', help = "Parsed oligo map file" )

    args = arg_parser.parse_args()

    # taxid -> [rank fields...]; taxid -> rank name used for gap filling
    taxid_dict = {}
    gap_dict   = {}

    # Old NCBI taxids that have been merged into a replacement id.
    missing_id_key = {
                       10969	: 444185,
                       11619	: 2169991,
                       11630	: 2169993,
                       11806	: 353765,
                       45218	: 2169996,
                       45222	: 2169994,
                       45709	: 2169992,
                       489502	: 10407,
                       587201	: 10255,
                       587202	: 10255,
                       587203	: 10255,
                       1173522	: 11723,
                       1554474	: 1511807,
                       1554482	: 1330068,
                       1554483	: 1330491,
                       1554492	: 1330066,
                       1554494	: 1307800,
                       1554498	: 1511784,
                       1559366	: 1513237,
                       1560037	: 1131483
                     }

    # parse and store tax_db and gap_file info
    # Both files are '|'-delimited: gap_file is "taxid|RANK", tax_db is
    # "taxid|field1|field2|..." — assumed formats; confirm against the
    # files' producers.
    with open( args.gap_file ) as gap_file:
        for line in gap_file:
            line = line.split( '|' )
            gap_dict[ line[ 0 ] ] = line[ 1 ].strip()

    with open( args.tax_db, 'r' ) as tax_db:
        for line in tax_db:
            line = line.split( '|' )
            taxid_dict[ line[ 0 ].strip() ] = [ item.strip() for item in line[ 1:: ] ]

    # Fill gaps
    # Move the first rank field into the slot the gap file names
    # (1=species, 2=genus, 3=family), blanking the original slot.
    for id, info in taxid_dict.items():
        if str(id) in gap_dict:
            if gap_dict[str(id)] == "SPECIES":
                taxid_dict[id][1] = taxid_dict[id][0]
                taxid_dict[id][0] = ""
            elif gap_dict[str(id)] == "GENUS":
                taxid_dict[id][2] = taxid_dict[id][0]
                taxid_dict[id][0] = ""
            elif gap_dict[str(id)] == "FAMILY":
                taxid_dict[id][3] = taxid_dict[id][0]
                taxid_dict[id][0] = ""

    num_seqs = 0
    unique_species  = set()
    unique_genera   = set()
    unique_families = set()


    names, sequences = oligo.read_fasta_lists( args.fasta )

    num_seqs = len( names )

    # One taxid per record name (duplicates collapse in the set).
    tax_ids = set( [ oligo.get_taxid_from_name( item ) \
                     for item in names 
                   ]
                 )

    missing_ids = set()

    # Translate merged ids, then collect each taxid's species/genus/
    # family fields; ids absent from the database are reported later.
    for current in tax_ids:
        try:
            if int( current ) in missing_id_key:
                current = str( missing_id_key[ int( current ) ] )

            unique_species.add(   taxid_dict[ current ][ 1 ] )
            unique_genera.add(    taxid_dict[ current ][ 2 ] )
            unique_families.add(  taxid_dict[ current ][ 3 ] )
        except KeyError:
            missing_ids.add( current )
    

    # Drop the empty strings introduced by gap filling.
    unique_species  = [ item for item in unique_species if len( item ) > 0 ]
    unique_genera   = [ item for item in unique_genera if len( item ) > 0 ]
    unique_families = [ item for item in unique_families if len( item ) > 0 ]

    print( "Number of sequences:       %d" %  num_seqs )
    print( "Number of unique species:  %d" %  len( unique_species  ) )
    print( "Number of unique genera:   %d" %  len( unique_genera   ) )
    print( "Number of unique families: %d" %  len( unique_families ) )

    print( "Missing ids: %s" % ",".join( list( missing_ids ) ) )

    species_from_file  = set()
    genera_from_file   = set()
    families_from_file = set()

    # The oligo map is tab-separated with a header row; columns 5/6/7
    # hold comma-separated species/genera/families — assumed layout,
    # confirm against the map file writer.
    with open( args.oligo_file, 'r' ) as oligo_file:
        counter = 0
        for line in oligo_file:
            # Skip the header row only.
            if not counter:
                counter += 1
                continue 

            line = line.split( '\t' )

            for item in line[ 5 ].split( ',' ):
                if len( item.strip() ) > 0:
                    species_from_file.add( item.strip() )
            for item in line[ 6 ].split( ',' ):
                if len( item.strip() ) > 0:
                    genera_from_file.add( item.strip() )
            for item in line[ 7 ].split( ',' ):
                if len( item.strip() ) > 0:
                    families_from_file.add( item.strip() )

    print( "Number of species from oligo table:  %d" % len( species_from_file ) )
    print( "Number of genera from oligo table:   %d" % len( genera_from_file ) )
    print( "Number of families from oligo table: %d" % len( families_from_file ) )


    # Set differences in both directions between the reference fasta's
    # taxa and the taxa listed in the oligo table.
    print( "Species missing from reference dataset:  %s "   % "|".join( species_from_file  - set( unique_species ) ) )
    print( "Genera missing from reference dataset:   %s "   % "|".join( genera_from_file   - set( unique_genera ) ) )
    print( "Families missing from refernece dataset: %s "   % "|".join( families_from_file - set( unique_families ) ) )

    print()

    print( "Species missing from oligo table:  %s "   % "|".join( set( unique_species )  - species_from_file ) )
    print( "Genera missing from oligo table:   %s "   % "|".join( set( unique_genera )   - genera_from_file ) )
    print( "Families missing from oligo table: %s "   % "|".join( set( unique_families ) - families_from_file ) )