Python process_fasta_file示例，rosalind_utils.process_fasta_file Python示例

示例#1

0

显示文件

文件： rosalind_grph.py 项目： riemannzetagambit/rosalind_problems

def solve_problem(sequence_data):
    '''
    Print and return the adjacency list of the overlap graph O_3 for a collection of FASTA reads.

    Assumptions: file is of the form
    >read_id_0
    [arbitrary number of lines of ATCG]
    >read_id_1
    [arbitrary number of lines of ATCG]

    Args:
        sequence_data: FASTA content, passed unchanged to process_fasta_file.

    Returns:
        A list of (read_id, read_id) tuples, one per directed edge. A tuple (a, b) is recorded whenever
        _has_k_overlap(read_a, read_b, 3) is truthy. The same edges are printed one per line as 'a b'.
    '''
    adjacency_list = []
    k = 3  # want to generate the adjacency list for O_3, so set overlap to k = 3
    read_dict = process_fasta_file(sequence_data)

    for first_id, second_id in combinations(read_dict.keys(), 2):
        first_read, second_read = read_dict[first_id], read_dict[second_id]
        if _has_k_overlap(first_read, second_read, k):
            adjacency_list.append((first_id, second_id))
        # The overlap graph is directed, so the reverse direction is tested independently: two reads may
        # overlap both ways, and both edges must then be reported (an `elif` here would drop the second one).
        if _has_k_overlap(second_read, first_read, k):
            adjacency_list.append((second_id, first_id))

    print('\n'.join([' '.join(pair) for pair in adjacency_list]))
    return adjacency_list

示例#2

0

显示文件

def solve_problem(protein_data):
    '''
    Print and return the 1-based start positions of every _N_GLYCOSYLATION_REGEX match in each protein.

    Assumptions: input data is of the form
    uniprot_id_0
    uniprot_id_1
    ...

    For every id, the FASTA record is downloaded from _UNIPROT_URL, parsed with process_fasta_file, and the
    first sequence in the parsed result is searched. Proteins without any match are omitted from the output.

    Args:
        protein_data: iterable of UniProt ids, each substituted into _UNIPROT_URL as `uid`.

    Returns:
        A string with two lines per matching protein: the id, then its space-separated match positions.

    Raises:
        requests.HTTPError: if the server answers with an error status for any id.
        requests.Timeout: if the server does not respond within the timeout.
    '''
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    request_timeout_seconds = 30

    n_glycosylation_matches = {}
    for uniprot_id in protein_data:
        resp = requests.get(_UNIPROT_URL.format(uid=uniprot_id),
                            timeout=request_timeout_seconds)
        # fail loudly on 4xx/5xx instead of trying to parse an error page as FASTA
        resp.raise_for_status()
        # clean up their response files into lines expected for fasta; python 3 returns these as bytes, so we decode
        fasta_lines = resp.content.decode('utf-8').strip().split('\n')
        seq_dict = process_fasta_file(fasta_lines)
        protein_seq = list(seq_dict.values())[0]
        # add 1 since python indexes from 0
        motif_loci = [
            motif.start() + 1
            for motif in _N_GLYCOSYLATION_REGEX.finditer(protein_seq)
        ]
        if motif_loci:
            n_glycosylation_matches[uniprot_id] = motif_loci

    # build the answer once so the printed and returned text cannot drift apart
    answer = '\n'.join(
        '{uid}\n{loci}'.format(uid=uid, loci=' '.join(map(str, loci)))
        for uid, loci in n_glycosylation_matches.items())
    print(answer)
    return answer

示例#3

0

显示文件

文件： rosalind_tran.py 项目： riemannzetagambit/rosalind_problems

def solve_problem(sequence_data: list) -> float:
    '''
    Print and return the transition/transversion ratio between two equal-length reads.

    Assumptions: input file is of the form
    >read_id_0
    [arbitrary number of lines of ATCG]
    ...

    Each mismatching position is sorted into a tuple and looked up in _TRANSITIONS and then _TRANSVERSIONS.

    Args:
        sequence_data: FASTA content, passed unchanged to process_fasta_file.

    Returns:
        The number of transitions divided by the number of transversions, as a float.

    Raises:
        ValueError: if there are not exactly two reads, if their lengths differ, or if a mismatching pair is
            in neither _TRANSITIONS nor _TRANSVERSIONS.
        ZeroDivisionError: if the reads contain no transversions, so the ratio is undefined.
    '''
    reads_dict = process_fasta_file(sequence_data)
    reads = list(reads_dict.values())

    # Check the count before indexing: reads[1] does not exist when fewer than two reads were supplied.
    num_reads = len(reads)
    if num_reads != 2:
        raise ValueError(
            'Expected only two reads/sequences to process, which must be of equal length. Got {} reads.'
            .format(num_reads))
    len_s1, len_s2 = len(reads[0]), len(reads[1])
    if len_s1 != len_s2:
        raise ValueError(
            'Expected only two reads/sequences to process, which must be of equal length. Got {} reads, '
            'with first two having lengths {} and {}.'.format(
                num_reads, len_s1, len_s2))

    transitions, transversions = 0, 0
    for pair in zip(*reads):
        if pair[0] == pair[1]:
            continue
        # we have a mismatch at this element-- sort it so we can check if an element in transitions/versions
        sp = tuple(sorted(pair))
        if sp in _TRANSITIONS:
            transitions += 1
        elif sp in _TRANSVERSIONS:
            transversions += 1
        else:
            raise ValueError(
                'Got mismatch that was not in DNA transitions/transversions: {}'
                .format(sp))

    if transversions == 0:
        # same exception type the bare division would raise, but with an explanation attached
        raise ZeroDivisionError(
            'Transition/transversion ratio is undefined: found {} transitions and 0 transversions.'
            .format(transitions))

    ratio = transitions / transversions
    print(ratio)
    return ratio

示例#4

0

显示文件

文件： rosalind_orf.py 项目： riemannzetagambit/rosalind_problems

def solve_problem(sequence_data):
    '''
    Print and return, one per line, everything get_open_reading_frames yields for the first read.

    Assumptions: file is of the form
    >read_id_0
    [arbitrary number of lines of ATCG]

    Only the first sequence returned by process_fasta_file is used; any further reads are ignored.
    '''
    parsed_reads = process_fasta_file(sequence_data)
    first_read = list(parsed_reads.values())[0]

    # materialise the results up front, then build the answer text a single time
    reading_frames = [*get_open_reading_frames(first_read)]
    answer = '\n'.join(reading_frames)

    print(answer)
    return answer

示例#5

0

显示文件

文件： rosalind_gc.py 项目： riemannzetagambit/rosalind_problems

def solve_problem(sequence_data):
    '''
    Print and return the id of the read whose _get_gc_content value is largest, together with that value.

    Assumptions: file is of the form
    >read_id_0
    [arbitrary number of lines of ATCG]
    >read_id_1
    [arbitrary number of lines of ATCG]

    Returns:
        A (read_id, gc_content) tuple; on ties the read that appears first in the parsed input wins.
    '''
    gc_by_read_id = {}
    for read_id, read in process_fasta_file(sequence_data).items():
        gc_by_read_id[read_id] = _get_gc_content(read)

    # max() keeps the first of several equal maxima, so ties resolve in input order
    best_id, best_gc = max(gc_by_read_id.items(), key=lambda entry: entry[1])
    print(f'{best_id}\n{best_gc}')

    return best_id, best_gc

示例#6

0

显示文件

def _get_consensus_profile_matrix(sequence_data):
    '''
    Count nucleotides column by column across a set of equal-length sequences.

    The input is parsed with process_fasta_file; read ids are discarded and only the sequences are used.

    Returns:
        A tuple of two dictionaries describing the same counts from two angles:
            1. position -> {nucleotide: count}, with an entry (possibly 0) for every symbol in DNA_ALPHABET
            2. nucleotide -> [count at position 0, count at position 1, ...]
        Symbols that are not in DNA_ALPHABET are not reported in either dictionary.

    Raises:
        ValueError: if the sequences do not all have the same length as the first one.
    '''
    sequences = list(process_fasta_file(sequence_data).values())

    # every sequence is measured against the first one
    expected_len = len(sequences[0])
    if any(len(seq) != expected_len for seq in sequences):
        raise ValueError(
            "Assumption violated that all sequences are of equal length! Some sequences are not same len"
        )

    # tally each column; indexing a Counter with an unseen symbol gives 0, which covers absent nucleotides
    counts_by_position = {}
    for position in range(expected_len):
        column_tally = Counter(''.join(seq[position] for seq in sequences))
        counts_by_position[position] = {
            nt: column_tally[nt]
            for nt in DNA_ALPHABET
        }

    # pivot the per-position view into one list of counts per nucleotide
    counts_by_nucleotide = {
        nt: [counts_by_position[position][nt] for position in range(expected_len)]
        for nt in DNA_ALPHABET
    }

    return counts_by_position, counts_by_nucleotide

示例#7

0

显示文件

文件： rosalind_long.py 项目： riemannzetagambit/rosalind_problems

def solve_problem(sequence_data):
    '''
    Greedily assemble the reads into a single chromosome string, then print and return it.

    Assumptions: file is of the form
    >read_id_0
    [arbitrary number of lines of ATCG]
    >read_id_1
    [arbitrary number of lines of ATCG]

    The only assumption we are allowed to make: "there exists a unique way to reconstruct the entire chromosome
    from these reads by gluing together pairs of reads that overlap by more than half their length."

    The first read seeds the chromosome. On every round _get_read_overlap(chromosome, read) is evaluated for each
    unused read; reads for which it returns None are skipped, and the read giving the shortest result replaces
    the chromosome with that result. All reads are assumed to be used in the assembly; leftover unaligned reads
    are not supported.

    Args:
        sequence_data: FASTA content, passed unchanged to process_fasta_file.

    Returns:
        The assembled chromosome.

    Raises:
        ValueError: if reads remain but none of them overlaps the current chromosome (this also happens when
            the input contains duplicate reads).
    '''
    # materialise once: the reads are indexed, iterated and counted repeatedly below
    reads = list(process_fasta_file(sequence_data).values())

    # use the first read as the seed for the chromosome and add on from there
    # in principle you could start with any read as a seed but randomizing this process could lead to variability in
    # performance and consistency
    chromosome = reads[0]
    # keep track of what reads we've already put into the chromosome; a set gives O(1) membership tests
    reads_assembled = {chromosome}

    while len(reads_assembled) != len(reads):
        # we have not used all the reads in our assembly, so keep assembling, running over all the reads for
        # matches to the current chromosome each time
        overlaps_by_read = {}
        for read in reads:
            if read in reads_assembled:
                continue
            overlap = _get_read_overlap(chromosome, read)
            # get rid of reads with no overlap
            if overlap is not None:
                overlaps_by_read[read] = overlap

        if not overlaps_by_read:
            # min() on an empty dict would raise the same exception type with an unhelpful message
            raise ValueError(
                'Could not extend the chromosome: none of the {} remaining reads overlaps it.'
                .format(len(reads) - len(reads_assembled)))

        shortest_overlap_read = min(
            overlaps_by_read, key=lambda read: len(overlaps_by_read[read]))
        # reuse the result computed above rather than calling _get_read_overlap a second time
        chromosome = overlaps_by_read[shortest_overlap_read]
        reads_assembled.add(shortest_overlap_read)

    print(chromosome)
    return chromosome

示例#8

0

显示文件

def solve_problem(sequence_data: list) -> str:
    '''
    Splice the introns out of the first read, translate the result, and print and return the proteins.

    Assumptions: input file is of the form
    >read_id_0
    [arbitrary number of lines of ATCG]
    ...

    The first parsed sequence is treated as the read and every later one as an intron, so this relies on
    process_fasta_file preserving the order of the records.

    Returns:
        The strings produced by convert_dna_to_protein, joined with newlines.
    '''
    fasta_sequences = list(process_fasta_file(sequence_data).values())
    read = fasta_sequences[0]
    introns = fasta_sequences[1:]

    exons_only = _splice_read(read, introns)
    # collect the translation results up front so the answer text is built exactly once
    protein_strings = [
        *convert_dna_to_protein(exons_only,
                                include_reverse_complement=False,
                                include_offsets=False,
                                include_overlapping_solutions=False)
    ]

    answer = '\n'.join(protein_strings)
    print(answer)
    return answer

示例#9

0

显示文件

文件： rosalind_lcsm.py 项目： riemannzetagambit/rosalind_problems

def solve_problem(sequence_data):
    '''
    Print and return a longest substring shared by every read.

    Assumptions: file is of the form
    >read_id_0
    [arbitrary number of lines of ATCG]
    >read_id_1
    [arbitrary number of lines of ATCG]

    Returns:
        The first k-mer of the shortest read (trying the largest k first) that occurs in all reads, or None
        when the reads have nothing in common.
    '''
    reads = process_fasta_file(sequence_data).values()

    # a common motif can never be longer than the shortest read, so candidates are drawn from that read
    seed = min(reads, key=len)

    # walk k downwards from the full seed length; the first hit is therefore a longest one
    for k in reversed(range(1, len(seed) + 1)):
        for candidate in _get_kmers_from_sequence(seed, k):
            if any(candidate not in read for read in reads):
                continue
            print(candidate)
            return candidate

    return None

示例#10

0

显示文件

def solve_problem(sequence_data):
    '''
    Find every window of length 4 to 12 in the first read that equals its own reverse complement.

    Assumptions: file is of the form
    >read_id_0
    [arbitrary number of lines of ATCG]

    Only the first parsed sequence is examined.

    Returns:
        A list of (position, length) tuples with 1-based positions, ordered by length and then by position.
        The same pairs are printed one per line as 'position length'.
    '''
    sequence = list(process_fasta_file(sequence_data).values())[0]

    # problem asks us to restrict to between 4 and 12
    shortest, longest = 4, 12

    hits = []
    for length in range(shortest, longest + 1):
        last_start = len(sequence) - length
        for start in range(last_start + 1):
            window = sequence[start:start + length]
            if window != get_reverse_complement(window):
                continue
            # positions start at 1, not 0, for Rosalind
            hits.append((start + 1, length))

    print('\n'.join([' '.join(map(str, hit)) for hit in hits]))
    return hits

示例#11

0

显示文件

文件： rosalind_corr.py 项目： riemannzetagambit/rosalind_problems

def solve_problem(sequence_data):
    '''
    Pair every erroneous read with the correct read it differs from by one symbol; print and return the pairs.

    Assumptions: file is of the form
    >read_id_0
    [arbitrary number of lines of ATCG]
    >read_id_1
    [arbitrary number of lines of ATCG]

    if correctly sequenced, then appears in the dataset at least twice (possibly as a reverse complement)
    if incorrectly sequenced, then appears in the dataset exactly once, and its Hamming distance is 1 with respect to
        exactly one correct read in the dataset (or its reverse complement)

    Returns:
        The corrections as text, one 'error_read->corrected_read' per line, in the format Rosalind expects.

    Raises:
        ValueError: if some error read has no correct partner, or some correct read has no error partner.
    '''
    reads = process_fasta_file(sequence_data).values()

    # _get_read_ordered_by_rc maps a read to the form used for counting; reads sharing that form are counted
    # together. Keep the spelling found in the data for each form (the last one seen wins) so that error reads,
    # which are unique, can be reported exactly as they were given.
    original_by_canonical = {}
    canonical_counts = Counter()
    for read in reads:
        canonical = _get_read_ordered_by_rc(read)
        original_by_canonical[canonical] = read
        canonical_counts[canonical] += 1

    # two or more appearances -> correct read (kept in canonical form)
    # exactly one appearance -> error read (restored to the spelling from the data)
    correct_reads = []
    error_reads = []
    for canonical, count in canonical_counts.items():
        if count >= 2:
            correct_reads.append(canonical)
        elif count == 1:
            error_reads.append(original_by_canonical[canonical])

    # The error read is the ground truth for orientation: report whichever of the correct read or its reverse
    # complement sits at Hamming distance 1 from it, trying the correct read as-is first.
    corrected_reads = []
    for error_read in error_reads:
        for correct_read in correct_reads:
            if get_hamming_distance(error_read, correct_read) == 1:
                corrected_reads.append((error_read, correct_read))
                continue
            flipped = get_reverse_complement(correct_read)
            if get_hamming_distance(error_read, flipped) == 1:
                corrected_reads.append((error_read, flipped))

    # catch errors: every error read and every correct read must appear (up to reverse complement) in a pair
    paired_errors = {error for (error, _) in corrected_reads}
    for er in error_reads:
        if er not in paired_errors and get_reverse_complement(
                er) not in paired_errors:
            raise ValueError(
                'No paired correct read found for error read:\n\t{}'.format(
                    er))
    paired_corrections = {correction for (_, correction) in corrected_reads}
    for cr in correct_reads:
        if cr not in paired_corrections and get_reverse_complement(
                cr) not in paired_corrections:
            raise ValueError(
                'No paired error read found for correct read:\n\t{}'.format(
                    cr))

    # return the string solution that Rosalind expects rather than the intermediate structure -- makes unit
    # testing easier
    answer = '\n'.join(
        ['{}->{}'.format(er, cr) for (er, cr) in corrected_reads])
    print(answer)
    return answer