-
Notifications
You must be signed in to change notification settings - Fork 0
/
GeneTree.py
130 lines (115 loc) · 7.28 KB
/
GeneTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
# University of California, Santa Cruz
# Biomolecular Engineering and Bioinformatics
# Names: Zachary Mason (zmmason), Justin Chan (jumchan)
import os

from Bio import SeqIO, AlignIO, Phylo
from Bio.Align import MultipleSeqAlignment
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from sequenceAnalysis import OrfFinder, FastAreader
# PRELIMINARY PROGRAM
class GeneTree:
    """
    Build gene trees from ORFs shared by a set of coronavirus genomes.

    The class turns per-genome ORF lists into gap-padded SeqRecords, writes them to a
    PHYLIP (.phy) file, and draws a UPGMA tree computed from that file.
    input: A fasta file name; it is only used to derive the name of the .phy file
    output: A gene tree, i.e. a visual representation of the evolutionary relationships
            of the viruses through a single gene.
    """

    def __init__(self, fname=''):
        """
        Constructor: store the input file name and derive the .phy output name.

        input: fname, name or path of the input fasta file (default: empty string)
        output: sets self.fname and self.newPhylip
        """
        self.fname = fname
        # splitext removes only the last extension of the last path component, so
        # dots in directory names ('./data/x.fasta') or in the base name
        # ('a.b.fasta') no longer truncate the derived name.
        self.newPhylip = os.path.splitext(fname)[0] + '.phy'

    def geneSpecificRecord(self, orfList, headList, num):
        """
        Create SeqRecords for one target gene, padded to a common length.

        The ORF at position num is taken from every genome's ORF list, every sequence is
        right-padded with '-' up to the length of the longest one, and each padded
        sequence is wrapped in a SeqRecord whose id is the genome's position in orfList.
        input: orfList, one list of ORF sequences (strings) per genome
               headList, fasta headers of the genomes (not used by this method)
               num, index of the ORF to extract from every genome
        output: A generator of SeqRecords (empty when orfList is empty)
        raises: IndexError when a genome has no ORF at index num
        """
        sequenceInfo = []
        for genomeIndex, genomeOrfs in enumerate(orfList):  # Finds target gene in each genome
            try:
                sequenceInfo.append(genomeOrfs[num])  # ***any gene can be utilized***
            except IndexError as err:
                raise IndexError(
                    "genome {} has no ORF at index {}".format(genomeIndex, num)
                ) from err
        # default=0 keeps an empty ORF list from failing inside max()
        longestLength = max((len(s) for s in sequenceInfo), default=0)
        paddedSequences = [s.ljust(longestLength, '-') for s in sequenceInfo]  # Adds gap characters
        # The id is the genome's own position, so two genomes with identical
        # sequences still get distinct ids (list.index() returned the first match
        # for both) and no linear search is needed per record.
        records = (
            SeqRecord(Seq(s), id=str(position))
            for position, s in enumerate(paddedSequences)
        )
        return records

    def fastaToPhylip(self, records):
        """
        Write the given SeqRecords to the .phy file named by self.newPhylip.

        The resulting file is what printGeneTree() reads to construct the gene tree.
        input: An iterable of SeqRecords, e.g. the result of geneSpecificRecord()
        output: A .phy file containing the records (nothing is returned)
        """
        SeqIO.write(records, self.newPhylip, 'phylip')  # Writes a new .phy file for the specific gene

    def printGeneTree(self):
        """
        Print the distance matrix and gene tree for the alignment stored in the .phy file.

        Reads the .phy file, computes a distance matrix with the 'identity' model, builds
        a tree from it with UPGMA, and shows the tree both through Phylo.draw and as ASCII
        art in the terminal.
        input: The .phy file named by self.newPhylip
        output: The distance matrix and the tree printed to the terminal, plus a drawn tree
        """
        align = AlignIO.read(self.newPhylip, 'phylip')  # Reads created .phy file containing the SeqRecords
        calculator = DistanceCalculator('identity')
        dm = calculator.get_distance(align)  # Calculate the distance matrix
        print('\n======================================== DISTANCE MATRIX =======================================\n')
        print(dm,"\n\n")  # Print the distance matrix
        constructor = DistanceTreeConstructor()
        tree = constructor.upgma(dm)  # Construct the phylogenetic tree using the UPGMA algorithm
        print('\n========================================= GENE TREE ===========================================\n')
        Phylo.draw(tree)  # Draw the phylogenetic tree (must install matplotlib to use this formatting)
        Phylo.draw_ascii(tree)  # Print the phylogenetic tree in terminal
def main(inCL=None):
    """
    Use a fasta file to prepare gene trees for the first four ORFs of related virus strains.

    This program takes a "combined" fasta file containing multiple genomes from similar virus
    strains and searches each genome for ORFs. The ORF at the same position in every genome is
    collected into gap-padded SeqRecords that can be converted to a .phy file and used to
    create a phylogenetic tree based on the variation between particular genes. In theory, the
    variation in the molecular composition of the genes will determine the trend in the
    variation of the genomes. This allows the "Gene Tree" to provide a rough outline of what
    the "Phylogenetic Tree" would look like having aligned full genomes.
    input: inCL is accepted but not used; the fasta file name is fixed inside the function
    output: A "GENE n:" line for each of the four genes and a key correlating record numbers
            to virus headers. Writing the .phy file and printing the distance matrix and tree
            are currently disabled (see the commented-out calls below).
    Assumptions:
        - input file must follow fasta format and must be a DNA nucleotide sequence.
        - input file contains solely corona viruses since they contain only 4 genes that are
          conserved among each other
    """
    fastaPath = 'Combined-ALL-SARS-CoV.fasta'
    validNucs = frozenset('ACGT')
    headList = []  # Stores header of coronavirus sequences in fasta file
    orfList = []  # Stores, per genome, the sequences of its ORFs
    myReader = FastAreader(fastaPath)
    for head, seq in myReader.readFasta():  # Using FastAreader to read in .fasta files
        headList.append(head)
        # Drop every non-ACGT character in one pass (case-sensitive, as before)
        seq = ''.join(base for base in seq if base in validNucs)
        # presumably 300 is the minimum ORF length and True keeps only the longest
        # ORF per stop codon; verify against sequenceAnalysis.OrfFinder
        orf = OrfFinder(seq, 300, True)
        geneOrfList = orf.getOrfs()
        # NOTE(review): assumes openFrame[1] / openFrame[2] are 1-based start / stop
        # coordinates; if the stop is inclusive this slice drops the last base - confirm.
        orfList.append([seq[openFrame[1]-1:openFrame[2]-1] for openFrame in geneOrfList])
    # Calls methods to create SeqRecords and then a .phy file to print gene trees.
    # Passing the fasta name makes the derived .phy name follow the input file.
    myPhylo = GeneTree(fastaPath)
    for geneIndex in range(4):  # Loops over the first four ORFs of every sequence
        # Raises IndexError when a genome has fewer than four ORFs; the records are
        # consumed by the disabled steps below.
        records = myPhylo.geneSpecificRecord(orfList, headList, geneIndex)
        # alignments = myPhylo.fastaToPhylip(records) # Makes a .phy file from the records
        print("GENE " + str(geneIndex+1) + ":")
        # printTree = myPhylo.printGeneTree() # Prints Gene Trees
    print('\n\n============================================ K E Y ============================================\n')
    for x, header in enumerate(headList):  # Record number x corresponds to the x-th header
        print("{} = {}".format(x, header.split(',')[0]))  # Only the text before the first comma
if __name__ == "__main__":
    # Executed as a script (not imported): run the gene-tree pipeline
    main()