Пример #1
0
def get_donor_acceptor_sequences(genome_fasta_file,
                                 scaffold_gff3_file,
                                 window_size=10):
    """
    Collect the donor/acceptor sequence pair of every intron in a genome.

    The FASTA file and the GFF3 file must describe the same genome. Exon
    coordinates are read per transcript; each pair of consecutive exons
    contributes one (donor, acceptor) location, where the donor is the end of
    the upstream exon and the acceptor is the start of the downstream exon.
    Identical location pairs on a scaffold (e.g. shared between isoforms) are
    counted once.

    A transcript is treated as reverse complemented when its first exon has
    start > end; its locations are then stored negated.
    NOTE(review): presumably slice_window() interprets negative locations as
    reverse-strand positions - confirm against its definition.

    NOTE(review): `window_size` is accepted but never referenced in this
    function; slice_window() is called with its own defaults.

    Returns a list of (donor_seq, acceptor_seq) tuples; pairs for which
    slice_window() yields a falsy value on either side are dropped.
    """
    genome_fasta = parse_fasta.get_all_sequences(genome_fasta_file, 'fasta')
    scaffold_gff3 = parse_gff3.parse_gff3(scaffold_gff3_file, 'exon')

    # donor_acceptor_locs['scaffold_name'] = sorted list of unique
    # (donor_loc, acceptor_loc) tuples
    donor_acceptor_locs = {}
    for scaf in scaffold_gff3:
        junctions = set()
        for gene in scaffold_gff3[scaf]:
            transcripts = scaffold_gff3[scaf][gene].mRNAs
            for tx in transcripts:
                exons = transcripts[tx].details['exon']

                if exons[0][0] > exons[0][1]:
                    # reverse-complemented transcript: store negated coords
                    exon_ends = [-end for start, end in exons]
                    exon_starts = [-start for start, end in exons]
                else:
                    exon_ends = [end for start, end in exons]
                    exon_starts = [start for start, end in exons]

                # the last exon has no donor; the first exon has no acceptor
                junctions.update(zip(exon_ends[:-1], exon_starts[1:]))

        donor_acceptor_locs[scaf] = sorted(junctions)

    # translate every location pair into a pair of sequences
    donor_acceptor_sequences = []
    for scaf in scaffold_gff3:
        for donor_loc, acceptor_loc in donor_acceptor_locs[scaf]:
            donor_seq = slice_window(genome_fasta[scaf], donor_loc)
            acceptor_seq = slice_window(genome_fasta[scaf], acceptor_loc)
            if donor_seq and acceptor_seq:
                donor_acceptor_sequences.append((donor_seq, acceptor_seq))

    return donor_acceptor_sequences
Пример #2
0
def get_genomic_context(genome_fasta_file, pos_of_interest_file, window):
    """
    Extract the sequence surrounding each position of interest.

    `pos_of_interest_file` is handed to csv.reader as tab-separated text;
    column 1 is the scaffold name and column 2 the 1-based position. Rows
    with fewer than two columns are ignored.

    For every position, `window` bases on either side are taken from the
    scaffold. Where the scaffold is too short on one side, the window is
    padded with 'N' so that it still spans (2 x window + 1) characters.

    Returns a dictionary mapping '<scaffold>_<1-based position>' to the
    window sequence.
    """
    window_seqs = {}

    # read genome sequences
    genome_fasta = parse_fasta.get_all_sequences(genome_fasta_file, 'fasta')

    for row in csv.reader(pos_of_interest_file, delimiter='\t'):
        if len(row) < 2:
            continue

        scaf = row[0]
        pos = int(row[1]) - 1  # 0-based index of the position of interest

        scaf_len = len(genome_fasta[scaf])
        window_sequence = genome_fasta[scaf][max(0, pos - window):
                                             min(scaf_len, pos + window + 1)]

        # number of Ns required to make up for bases that would fall before
        # the start / after the end of the scaffold
        front_n_needed = max(0, window - pos)
        back_n_needed = max(0, window - (scaf_len - (pos + 1)))

        if front_n_needed:
            window_sequence = 'N' * front_n_needed + window_sequence

        if back_n_needed:
            window_sequence += 'N' * back_n_needed

        window_seqs[scaf + '_' + str(pos + 1)] = window_sequence

    return window_seqs
Пример #3
0
parser = argparse.ArgumentParser(description="""
CDS files do not contain intronic regions - this script reads the gff3 file
containing start and end coordinates for genes, and extracts the genic 
(i.e. exonic + intronic) sequences for each gene.""")

parser.add_argument('genome_fasta',
                    metavar="fasta_file",
                    type=argparse.FileType('r'),
                    help="FASTA file of the genome.")
parser.add_argument('scaffold_gff3',
                    metavar="gff3_file",
                    type=argparse.FileType('r'),
                    help="corresponding gff3 file of the genome.")

args = parser.parse_args()

# load the scaffold sequences and the gene-level annotation into memory
genome_fasta = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta')
scaffold_gff3 = parse_gff3.parse_gff3(args.scaffold_gff3, 'gene')

# print one FASTA record (annotation line + sequence) per annotated gene
for scaf in scaffold_gff3:
    for gene in scaffold_gff3[scaf]:
        gene_coords = scaffold_gff3[scaf][gene].coords

        # a gene whose first coordinate exceeds its second is treated as
        # lying on the Crick strand
        on_crick = gene_coords[0] > gene_coords[1]

        # slice with the coordinates in ascending order, whatever the strand
        genic_seq = genome_fasta[scaf][min(gene_coords):max(gene_coords)]
        if on_crick:
            genic_seq = reverse_complement(genic_seq)

        print('>' + gene)
        print(genic_seq)
Пример #4
0
import tempfile

import parse_fasta

parser = argparse.ArgumentParser(description="""
Script takes in two FASTA files of equal number of sequences, and does pairwise
BLASTP (#1 vs. #1, #2 vs. #2, ...). Prints results to stdout.""")
parser.add_argument('protein_fastas', metavar='fasta_file',
                    type=argparse.FileType('r'), nargs=2,
                    help='Pair of protein FASTA files.')

args = parser.parse_args()

# parse both files, in the order they were given on the command line
first_fasta, second_fasta = [
    parse_fasta.get_all_sequences(fasta_handle, 'fasta')
    for fasta_handle in args.protein_fastas
]

# pairing #1 with #1, #2 with #2, ... only makes sense when both files hold
# the same number of sequences
assert len(first_fasta) == len(second_fasta), \
    'number of unique sequences in both files are different!'

# all systems go! turn both parsed results into indexable lists of items
first_fasta, second_fasta = (list(first_fasta.items()),
                             list(second_fasta.items()))

# header line
print('Query',
      'Hit accession',
      'Hit description',
      'Query length',
      'Hit length',
parser = argparse.ArgumentParser(description="""
Based on the blastp results of the transcripts vs. sprot/trembl,
create a GO annotation file for the transcripts.""")

parser.add_argument('species', metavar="species_code",
                    help="3/4-letter code for species in question.")
parser.add_argument('prot_file', metavar="fasta_file",
                    type=argparse.FileType('r'),
                    help="protein FASTA file of the gene models.")
parser.add_argument('-n', '--no_nr', action='store_true',
                    help='script works without nr too!')
parser.add_argument('-p', '--no_parents', action='store_true',
                    help='use the annotation file that do not contain parents.')
args = parser.parse_args()

all_transcripts = parse_fasta.get_all_sequences(args.prot_file, 'fasta')

# BLAST result tables, named after the species code and expected in the
# current working directory. These handles are left open on purpose:
# nothing in this section reads them.
# NOTE(review): presumably they are consumed further down the script.
sprot_tsv = open('{}_vs_sprot.tGO.tsv'.format(args.species))
trembl_tsv = open('{}_vs_trembl.tGO.tsv'.format(args.species))
if not args.no_nr:
    nr_tsv = open('{}_vs_nr.t1.tsv'.format(args.species))

# transcript_go_terms[column 1] = column 2 of the GO annotation table.
# The table is read in full right here, so the handle is closed as soon as
# the loop finishes instead of being leaked for the rest of the run.
transcript_go_terms = {}
with open('{}_go_annots.{}all.tsv'.format(
        args.species, 'no_parents.' if args.no_parents else '')) as go_tsv:
    for line in go_tsv:
        cols = line.strip().split('\t')
        transcript_go_terms[cols[0]] = cols[1]

def get_go_terms(transcript):
    if transcript in transcript_go_terms:
Пример #6
0
                    help='Tallied files are FASTQ, not FASTA.')
parser.add_argument('--gzip',
                    '-g',
                    action='store_true',
                    default=False,
                    help='Tallied files are gzip-compressed.')

args = parser.parse_args()

# header row
print('File', 'A', 'C', 'G', 'T', 'N', 'ACGT', 'ACGTN', 'GC%', sep='\t')

for f in args.fasta_files:
    # per-file tally of every (upper-cased) character seen in the sequences
    base_composition = collections.Counter()
    seqs = parse_fasta.get_all_sequences(f,
                                         'fastq' if args.fastq else 'fasta',
                                         gzip_compressed=args.gzip,
                                         sequences_only=True)

    for s in seqs:
        # update() adds to the running tally in place, avoiding a temporary
        # Counter (and a merged copy) for every sequence
        base_composition.update(s.upper())

    a = base_composition['A']
    c = base_composition['C']
    g = base_composition['G']
    t = base_composition['T']
    acgt = a + c + g + t
    acgtn = sum(base_composition.values())
    # everything that is not A/C/G/T
    non_acgt = acgtn - acgt

    # GC% is relative to unambiguous bases only; a file without any A/C/G/T
    # (e.g. an empty file) reports 0.0 instead of dividing by zero
    gc_pct = round((c + g) / acgt * 100, 3) if acgt else 0.0
Пример #7
0
            min([float(x.split('\t')[13]) for x in lines_per_query[q]]))

        o = [
            q, hit_accessions, hit_descriptions, query_length, hit_lengths,
            query_coords, hit_coords, frames, max_bit_scores, total_bit_scores,
            identities, id_pct, coverage_pct, expects
        ]

        c_output += '\t'.join([str(x) for x in o]) + '\n'

    return c_output


# the query sequences are only loaded when --remove_N supplied a FASTA file
if args.remove_N:
    import parse_fasta
    query_fasta_seq = parse_fasta.get_all_sequences(args.remove_N, 'fasta')

tree = xml.etree.ElementTree.parse(args.blast_xml)
root = tree.getroot()

# collect every <Iteration></Iteration> element
blastoutput_iterations = root.find('BlastOutput_iterations')
iterations = blastoutput_iterations.findall('Iteration')

# drop, from the XML tree, iterations that contain
#   "<Iteration_message>No hits found</Iteration_message>"
# (the loop walks the separate `iterations` list, so removing children from
# the parent element does not disturb the iteration)
for i in iterations:
    iteration_message = i.find('Iteration_message')
    if iteration_message is None:
        continue
    if iteration_message.text == 'No hits found':
        blastoutput_iterations.remove(i)
Пример #8
0
                       dest='file_format',
                       const='collapsed_fasta',
                       help='input file is in collapsed FASTA format.')

args = parser.parse_args()

# file exists, counting begins
read_lengths = {}  # read_lengths[length] = total number
nucleotide_stats = {}  # nucleotide_stats[length] = {1: {'A': m, 'T': n ...},
#                             2: {'A': m2, 'T': n2 ...}}
VALID_NUCLEOTIDES = ['A', 'T', 'U', 'G', 'C', 'N']

# parse input file. Only the last five characters of the format option are
# passed on, which turns "collapsed_fasta" into "fasta" - parse_fasta.py
# doesn't discriminate between those two formats
import parse_fasta
sequences = parse_fasta.get_all_sequences(args.reads_file[0],
                                          args.file_format[-5:])

# handle read inclusions/exclusions
if args.include is None:
    # no inclusion list given: start from every read in the input file
    included_reads = set(sequences.keys())
else:
    # union of the (whitespace-stripped) lines of every inclusion file
    included_reads = set()
    for i in args.include:
        included_reads.update(x.strip() for x in i)

    # make sure that the reads exist in the original reads file
    included_reads &= set(sequences.keys())

if args.exclude is not None:
    # remove every read named in any of the exclusion files
    for e in args.exclude:
        included_reads.difference_update(x.strip() for x in e)
Пример #9
0
    # gene itself. Thus, there's a possibility that
    #     gene_info != AND exon_info == 0 !!
    if not ei_integer: return 'no_info'

    ei_integer = abs(int(ei_integer))  # NumPy ints aren't... really ints.
    ei = 'Exon' if ei_integer % 2 else 'Intron'

    # round() is dangerous. Banker's rounding.
    ei_number = (ei_integer + 1) // 2
    ei_inverse = ei_number - exon_count[gene_id] - (ei_integer % 2)
    return '_'.join([ei, str(ei_number), str(ei_inverse)])


# read sequences (only their lengths are requested from the parser)
sequence_lengths = parse_fasta.get_all_sequences(
    args.genome_fasta, 'fasta', lengths_only=True)
if args.verbose:
    print('[{}] Lengths for {} sequences parsed.'.format(time.asctime(),
                                                         len(sequence_lengths)),
          file=sys.stderr)

# read coordinates of genes and exons from the .gff3 file, then
#   1. as genes might contain overlapping isoforms, keep the longest isoform
#      if multiples exist;
#   2. make sure features in all mRNAs are sorted properly (for exon
#      numbering).
scaffold_gff3 = parse_gff3.sort_features(
    parse_gff3.pick_longest_mRNA(
        parse_gff3.parse_gff3(args.genome_gff3, 'exon')))

# genic regions are denoted in a NumPy array as follows:
Пример #10
0
                       dest='file_format',
                       const='fasta',
                       help='input file is in FASTA format.')
fasta_opt.add_argument('--fastq', action='store_const', dest='file_format',
                       const='fastq', help='input file is in FASTQ format.')

args = parser.parse_args()

import collections
import parse_fasta

# grab only the sequences - all annotations are ignored by the parser
sequences = parse_fasta.get_all_sequences(args.reads_file[0],
                                          args.file_format,
                                          sequences_only=True)

# tally how many times each distinct sequence occurs
sequences = collections.Counter(sequences)

# reads_counter is the y/z/a counter mentioned in the script description: it
# holds the number of reads printed so far, i.e. the running sum of the
# frequencies of all previously printed sequences
reads_counter = 0
species_identifier = args.species[0][:3]
for read_seq, read_count in sequences.most_common():
    # most frequent sequence first
    print('>{}_{}_x{}'.format(species_identifier, reads_counter, read_count))
    print(read_seq)

    reads_counter += read_count
Пример #11
0
                        action='store_true',
                        help='ORFs need not start with Met.')
    parser.add_argument('--nt', action='store_true',
                        help='print equivalent nucleotide sequences.')
    parser.add_argument('--print_length', action='store_true',
                        help='include the length of the sequence in the annot.')
    parser.add_argument('--nosort', action='store_true',
                        help='disable natural sorting on output.')

    args = parser.parse_args()

    fasta_seqs = parse_fasta.get_all_sequences(args.infile, 'fasta')

    # with --nosort the parsed container is iterated as-is; otherwise its
    # annotations are put through natural_sort() first
    sorted_seqs = fasta_seqs if args.nosort else natural_sort(fasta_seqs)

    if args.longest:
        for s in sorted_seqs:
            print(
                find_longest_orf(s,
                                 fasta_seqs[s],
                                 relaxed=args.relaxed,
                                 display_length=args.print_length))
    else:
        for s in sorted_seqs:
Пример #12
0
                        help='follow order of singular file in --include.')
    
    args = parser.parse_args()

    # sanity checking: --start and --end must be given together
    if bool(args.start) != bool(args.end):
        raise ValueError('--start and --end has to be used in conjuction!')

    # --order_include follows the order of a single --include file, so it is
    # switched off when --include is absent or names more than one file.
    # (This has to be an assignment: a bare `==` comparison leaves the flag
    # untouched. The `not args.include` test also short-circuits before
    # len() can be called on a missing --include.)
    if args.order_include and (not args.include or len(args.include) > 1):
        args.order_include = False

    # start of script - get sequence data
    sequences = parse_fasta.get_all_sequences(
        args.reads_file, 'fastq' if args.fastq else 'fasta')

    # handle read inclusions/exclusions
    if not args.include:
        # no inclusion list: every read in the input is a candidate
        included_reads = set(sequences.keys())
    else:
        # union of the (whitespace-stripped) lines of every inclusion file
        included_reads = set()
        for i in args.include:
            included_reads.update(x.strip() for x in i)

        # make sure that the reads exist in the original reads file
        included_reads &= set(sequences.keys())

    if args.exclude: