Пример #1
0
def main():
    """Translate RNA sequences and report their transmembrane helices.

    Prompts for an input file name, loads its records with ``readfasta``,
    then prompts for an output file name. For every record the element at
    index 2 (treated by this code as the RNA sequence) is written to the
    output file, followed by its translation from ``translate`` and by
    whatever ``findTMD`` writes for that translation.
    """
    # Read in the RNA sequences from a file specified by user input
    filename = input("Please enter the input file name: ")
    rnainfo = readfasta(filename)

    # Prepare to re-write the RNA sequences to an output file specified by user input
    outfilename = input("Please enter the output file name: ")

    # The context manager closes the output file even when translation or
    # the helix scan raises part-way through the records.
    with open(outfilename, mode="w") as handle:
        # Records are numbered from 1 in the report
        for number, record in enumerate(rnainfo, start=1):
            rnaseq = record[2]

            # Specify gene that is being evaluated
            handle.write("Gene " + str(number) + ": " + rnaseq + "\n\n")

            # Translate the RNA sequence to its single-letter amino acid
            # sequence and write it to the output file
            translatedseq = translate(rnaseq)
            handle.write("Protein Sequence " + str(number) + ": " +
                         translatedseq + "\n\n")

            # Scan the amino acid sequence for transmembrane helices;
            # findTMD writes its own results to the output file
            findTMD(translatedseq, handle)
Пример #2
0
#!/usr/bin/python
import sys
from fasta import readfasta

# Symbols counted as unambiguous bases and as IUPAC ambiguity (SNP) codes.
# They never change, so they are defined once instead of on every iteration.
homozygote = 'ATCG'
SNPs = 'RYSWKMBDHV'

# Parse the fasta file named by the first command-line argument; the handle
# is closed as soon as readfasta() has returned.
with open(sys.argv[1], "r") as f:
    fd = readfasta(f)

for key in fd:
    # Fixed window of the sequence: positions 15720 up to (not including) 15725
    sequence = fd[key][15720:15725]
    # print(x) with a single argument behaves the same on Python 2 and 3
    print(sequence)
    # Case-insensitive counts of plain bases and of ambiguity codes
    homozygote_count = sum(
        1 for base in sequence if base.upper() in homozygote)
    SNPs_count = sum(
        1 for base in sequence if base.upper() in SNPs)
    print(homozygote_count)
    print(SNPs_count)
Пример #3
0
    GCs = 'GC'
    homozygote_count = len(
        [base.upper() for base in sequence if base.upper() in homozygote])
    GC_count = len([base.upper() for base in sequence if base.upper() in GCs])
    return homozygote_count, GC_count


#************************************************
# Read bed and fasta input files
#************************************************
overlapping = open(sys.argv[1], "r")
fasta_seq = open(sys.argv[2], "r")
out = open(sys.argv[3], "w")
#************************************************
# Load the fasta records through readfasta()
fasta_dict = readfasta(fasta_seq)
#************************************************
#************************************************
# Group the overlap records by a "field0:field1:field2" key
#************************************************
overlapping_list = []  # Create a list to keep order when printing
overlapping_dict = {}
multi_window_intervals = {}
for line in overlapping:
    line = line.strip("\n").split("\t")
    # Key is built from the first three tab-separated fields
    key = ":".join((line[0], line[1], line[2]))
    # Fields 4, 5 and 6 are stored as integers; field 3 is not used here
    value = [int(line[4]), int(line[5]), int(line[6])]
    # Start a new list on first sight of the key, then append
    overlapping_dict.setdefault(key, []).append(value)
Пример #4
0
# use them for resampling.
# Usage: ./compile_sequences.py seq.fasta seq.coordinates seq.scaf.chro

# The three inputs are positional command-line arguments (see the usage line
# above). Each is opened for reading; none is closed in this part of the script.
fastaseq = open(sys.argv[1], "r")  # File one is the fasta sequence
intergenic = open(sys.argv[2], "r")  # File two is the intergenic file
chrfile = open(sys.argv[3], "r")  # File three is the chromosome file that
# contains all the scaffolds for that chromosome.


def chunks(s, n):
    """Yield consecutive slices of *s*, each holding at most *n* items.

    The final slice is shorter when ``len(s)`` is not a multiple of *n*;
    an empty *s* yields nothing.
    """
    for begin in range(0, len(s), n):
        end = begin + n
        yield s[begin:end]


# Read the fasta file into a dictionary by handing the open handle to
# readfasta(); the layout of the returned mapping is defined by that helper.
fastaDict = readfasta(fastaseq)


#print fastaDict
# Read the intergenic coordinates into a dictionary
def intergenicCoord(intergenic):
    """Group tab-separated coordinate records by their first field.

    Args:
        intergenic: iterable of text lines, e.g. an open file. Each line is
            stripped of surrounding whitespace and split on tabs.

    Returns:
        A dict mapping the first field of each line to a list holding, in
        input order, the remaining fields of every line with that first
        field (each entry is itself a list of strings).
    """
    intergenicDict = {}
    for line in intergenic:
        stripped = line.strip()
        if not stripped:
            # A blank or whitespace-only line (often the last one in a file)
            # would otherwise create a bogus entry under the empty key.
            continue
        fields = stripped.split("\t")
        key, value = fields[0], fields[1:]
        # One lookup: create the list on first sight of the key, then append
        intergenicDict.setdefault(key, []).append(value)
    return intergenicDict
Пример #5
0
#!/usr/bin/python
from __future__ import division
import sys
from fasta import readfasta
from het import heterozygosity

# Path of the fasta file comes from the first command-line argument
fasta = sys.argv[1]

# Parse the fasta file; the handle is closed once readfasta() has returned
with open(fasta, 'r') as f:
    fasta_dict = readfasta(f)

# Print one tab-separated line per entry: its key followed by the first
# three values returned by heterozygosity()
for key in fasta_dict:
    het = heterozygosity(fasta_dict[key])
    columns = [key, str(het[0]), str(het[1]), str(het[2])]
    print("\t".join(columns))
Пример #6
0
# It reads the intron coordinates generated by extract_from_gff.py into a dictionary,
# extracts the corresponding sequences from a fasta file and then runs the heterozygosity
# function to count the number of SNPs and bases.
# Usage: ./intron_heterozygosity.py intron_coordinates.txt fasta.fa > output
##########################################################################################

#***************************************************
# Read the intron coordinates and the fasta sequence
#***************************************************
intron_coord = open(sys.argv[1], 'r')
fasta = open(sys.argv[2], 'r')

#******************************************
# Load the fasta records through readfasta()
#******************************************
fastaseq = readfasta(fasta)

#**********************************************
# Group the intron records by their first field
#**********************************************
intron_dict = {}
for line in intron_coord:
    line = line.strip('\n').split('\t')
    key = line[0]
    value = line[1:]
    # Create the list on first sight of the key, then append the other fields
    intron_dict.setdefault(key, []).append(value)

#**************************************************************************
# Extract the intronic sequences from fasta and read them into a dictionary
Пример #7
0
    if not isinstance(sequence, str):
        raise Exception("Sequence is not a string")
    R = len([base.upper() for base in sequence if base.upper() == "R"])
    Y = len([base.upper() for base in sequence if base.upper() == "Y"])
    S = len([base.upper() for base in sequence if base.upper() == "S"])
    W = len([base.upper() for base in sequence if base.upper() == "W"])
    K = len([base.upper() for base in sequence if base.upper() == "K"])
    M = len([base.upper() for base in sequence if base.upper() == "M"])

    return (R, Y, S, W, K, M)


################################################################################
# Fasta file named by the first command-line argument
fastafile = open(sys.argv[1], "r")

fastadict = readfasta(fastafile)

# One list per counted IUPAC code; each receives one count per sequence
R_l = []
Y_l = []
S_l = []
W_l = []
K_l = []
M_l = []
for key in fastadict:
    # Count once per sequence and pick the six results from that single
    # call, rather than rescanning the whole sequence for every code.
    counts = trans_tranv_count(fastadict[key])
    R = counts[0]
    Y = counts[1]
    S = counts[2]
    W = counts[3]
    K = counts[4]
    M = counts[5]
    R_l.append(R)
Пример #8
0
# If run: ./Open_reading_frame.py CDS.fa SW > output
# The script counts only S and W sites.
# CDS.fa is produced by extract_CDS_from_fasta.py and contains a fasta sequence with IUPAC
# coded SNPs.
##########################################################################################

#*******************
# Specify the inputs
#*******************
# Open the fasta file named by the first command-line argument
inFile = open(sys.argv[1], 'r')
# Second command-line argument; the usage example in the header passes "SW"
argument = sys.argv[2]

#************************************************************
# Read the fasta file into a dictionary
#************************************************************
fastaseq = readfasta(inFile)
#************************************************************
# Specify four nucleotides
#************************************************************
nucs = ["A", "T", "C", "G"]
#************************************************************
# Specify IUPAC codes as a dictionary: each two-base ambiguity
# code maps to the list of the two nucleotides it stands for
#************************************************************
IUPAC_code = {
    'R': ['A', 'G'],
    'Y': ['C', 'T'],
    'S': ['G', 'C'],
    'W': ['A', 'T'],
    'K': ['G', 'T'],
    'M': ['A', 'C']
}  # Three-base codes are commented out: 'B':['C', 'G', 'T'], 'D':['A', 'G', 'T'], 'H':['A', 'C', 'T'], 'V':['A', 'C', 'G']
# Four fold degenerate sites
Пример #9
0
# The script counts only S and W sites.
# CDS.fa is produced by extract_CDS_from_fasta.py and contains a fasta sequence with IUPAC
# coded SNPs.
##########################################################################################

#*******************
# Specify the inputs
#*******************
# Open the fasta file named by the first command-line argument
inFile = open(sys.argv[1], 'r')
# Second command-line argument
# NOTE(review): presumably selects which IUPAC codes are counted — confirm against the code that reads it
argument = sys.argv[2]
# Disabled third input:
#strand = open(sys.argv[3], "r")

#************************************************************
# Read the fasta file into a dictionary
#************************************************************
dna_orf = readfasta(inFile)
#************************************************************
# Specify four nucleotides
#************************************************************
nucs = ["A", "T", "C", "G"]
#************************************************************
# Specify IUPAC codes as a dictionary: each two-base ambiguity
# code maps to the list of the two nucleotides it stands for
#************************************************************
IUPAC_code = {
    'R': ['A', 'G'],
    'Y': ['C', 'T'],
    'S': ['G', 'C'],
    'W': ['A', 'T'],
    'K': ['G', 'T'],
    'M': ['A', 'C']
}  # Three-base codes are commented out: 'B':['C', 'G', 'T'], 'D':['A', 'G', 'T'], 'H':['A', 'C', 'T'], 'V':['A', 'C', 'G']
# Four fold degenerate sites
Пример #10
0
# dictionary, it then extracts the CDS coordinates from the fasta file and finally   #
# concatenates each sequence to the other to create a CDS with SNPs marked           #
# as IUPAC code.                                                                     #
# Usage: ./extract_CDS_from_Fasta.py cds.txt fasta.fa > output                       #
######################################################################################

#*******************
# Specify the inputs
#*******************
cds_file = open(sys.argv[1], 'r')
fasta = open(sys.argv[2], 'r')

#************************************************************
# Load the fasta records through readfasta()
#************************************************************
fastaDict = readfasta(fasta)
# Per the original author's note: with the current readfasta()
# function, a single-line fasta sequence is much faster to read

#*******************************************
# Group the CDS records by their first field
#*******************************************
CDS_dict = {}
for line in cds_file:
    line = line.strip('\n').split('\t')
    key = line[0]
    # Only the second and third fields are kept for each record
    value = line[1:3]
    # Create the list on first sight of the key, then append
    CDS_dict.setdefault(key, []).append(value)
#print CDS_dict
Пример #11
0
# synonymous, missense and nonsense according to the following rule:
#*******************************************************************
# Inputs: sequence.fa annotation.gff
#*******************************************************************
# Run: annotate_vcf.py sequence.fa annotation.gff > output.txt
#*******************************************************************

#*******************************************************************
# Open inputs
#*******************************************************************
fastafile = open(sys.argv[1], "r")
gff_file = open(sys.argv[2], "r")
#*******************************************************************
# Read fasta file into dictionary
#*******************************************************************
g_fasta = readfasta(fastafile)
#*******************************************************************
# Read gff file into dictionary
#*******************************************************************
gff_dict = gff_to_dict(gff_file)
#*******************************************************************
# Build the degeneracy tables with a single count_sites() call; both
# tables below are elements of that one result, so the work is not
# repeated just to read a second element.
#*******************************************************************
_degeneracy_tables = count_sites()
#*******************************************************************
# Create the degeneracy count table
#*******************************************************************
degeneracy_table_counts = _degeneracy_tables[0]
#*******************************************************************
# Create the degeneracy base table
#*******************************************************************
degeneracy_table_bases = _degeneracy_tables[1]
#*******************************************************************

#*******************************************************************
Пример #12
0
#*******************************************************************************
# Written by Homa Papoli - October 2017
#*******************************************************************************
# Script contains functions to:
# 1. Generate a fasta sequence without N
# 2. Perform resampling from the fasta sequence to generate new sequence
# 3. Calculate heterozygosity from the new sequence
#*******************************************************************************

# Command-line inputs, all positional
f1 = open(sys.argv[1], "r")  # fasta file, parsed by readfasta() below
seq = sys.argv[2]  # indicate which chromosome to resample (used as a key into fastadict)
replicates = int(sys.argv[3])  # number of resampling replicates
num = int(sys.argv[4])  # indicate the length from which to sample

# Read the fasta file into a dictionary.
fastadict = readfasta(f1)
#def resampling_f(fastadict, seq, num):
#	fastadict[seq] = fastadict[seq].replace("N","").replace("n","")
#	l = []
#	# If sampling the sequence as long as the original one
#	# new_seq = ''.join([random.choice(fastadict[seq]) for nuc in fastadict[seq]])
#	# If sampling the sequence for a specific set of number
#	new_seq = ''.join([random.choice(fastadict[seq]) for i in range(num)]) # New sequences
#	new_seq_het = list(heterozygosity(new_seq))[2] # Het of the new sequence
#	l.append(new_seq_het)
#	return l


def resampling_f(fastadict, seq, n, k):
    fastadict[seq] = fastadict[seq].replace("N", "").replace("n", "")
    seq_list = np.random.choice(tuple(fastadict[seq]),