def create_matched_genomes():
    """Build two GenomePair objects whose paired genomes share an id.

    The first pair holds two "Trixie" genomes (the first of them with a
    "draft" annotation status); the second pair holds two "Alice" genomes
    (the first of them with a "final" annotation status).

    :returns: The two pairs, in the order Trixie, Alice.
    :rtype: list
    """
    pairs = []
    for phage_id, status in (("Trixie", "draft"), ("Alice", "final")):
        # Only the first genome of each pair carries an annotation status.
        first = genome.Genome()
        first.id = phage_id
        first.annotation_status = status

        second = genome.Genome()
        second.id = phage_id

        pair = genomepair.GenomePair()
        pair.genome1 = first
        pair.genome2 = second
        pairs.append(pair)
    return pairs
def setUp(self):
    """Create an empty bundle, a flat-file genome, a MySQL genome and a ticket."""
    self.bndl = bundle.Bundle()

    flat_file_gnm = genome.Genome()
    flat_file_gnm.type = "flat_file"
    self.genome1 = flat_file_gnm

    mysql_gnm = genome.Genome()
    mysql_gnm.type = "mysql"
    self.genome2 = mysql_gnm

    self.tkt = ticket.ImportTicket()
def setUp(self):
    """Create two genomes joined in a GenomePair, a ticket and three dates.

    ``date_feb1`` and ``date_feb1_b`` are separate objects parsed from the
    same date string.
    """
    date_format = '%m/%d/%Y'

    self.genome1 = genome.Genome()
    self.genome2 = genome.Genome()
    self.tkt = ticket.ImportTicket()

    pair = genomepair.GenomePair()
    pair.genome1 = self.genome1
    pair.genome2 = self.genome2
    self.genome_pair = pair

    self.date_jan1 = datetime.strptime('1/1/2000', date_format)
    self.date_feb1 = datetime.strptime('2/1/2000', date_format)
    self.date_feb1_b = datetime.strptime('2/1/2000', date_format)
def setUp(self):
    """Create a bundle holding a ticket, two genomes and two genome pairs.

    Three Source, Cds, Trna and Tmrna features are created; only the first
    two of each kind are attached to the flat-file genome.  Four Evaluation
    objects (two "correct", two "error") are also prepared.
    """
    def _with_id(ftr, ftr_id):
        # Set the feature's id and hand the same object back.
        ftr.id = ftr_id
        return ftr

    self.ticket1 = ticket.ImportTicket()

    self.src1 = _with_id(source.Source(), "L5_SRC_1")
    self.src2 = _with_id(source.Source(), "L5_SRC_2")
    self.src3 = _with_id(source.Source(), "L5_SRC_3")

    self.cds1 = _with_id(cds.Cds(), "L5_CDS_1")
    self.cds2 = _with_id(cds.Cds(), "L5_CDS_2")
    self.cds3 = _with_id(cds.Cds(), "L5_CDS_3")

    self.trna1 = _with_id(trna.Trna(), "L5_TRNA_1")
    self.trna2 = _with_id(trna.Trna(), "L5_TRNA_2")
    self.trna3 = _with_id(trna.Trna(), "L5_TRNA_3")

    self.tmrna1 = _with_id(tmrna.Tmrna(), "L5_TMRNA_1")
    self.tmrna2 = _with_id(tmrna.Tmrna(), "L5_TMRNA_2")
    self.tmrna3 = _with_id(tmrna.Tmrna(), "L5_TMRNA_3")

    flat_file_gnm = genome.Genome()
    flat_file_gnm.type = "flat_file"
    flat_file_gnm.cds_features = [self.cds1, self.cds2]
    flat_file_gnm.source_features = [self.src1, self.src2]
    flat_file_gnm.trna_features = [self.trna1, self.trna2]
    flat_file_gnm.tmrna_features = [self.tmrna1, self.tmrna2]
    self.genome1 = flat_file_gnm

    mysql_gnm = genome.Genome()
    mysql_gnm.type = "mysql"
    self.genome2 = mysql_gnm

    self.genome_pair1 = genomepair.GenomePair()
    self.genome_pair2 = genomepair.GenomePair()

    # Genomes are keyed by their type, genome pairs by a fixed label.
    bndl = bundle.Bundle()
    bndl.ticket = self.ticket1
    bndl.genome_dict[self.genome1.type] = self.genome1
    bndl.genome_dict[self.genome2.type] = self.genome2
    bndl.genome_pair_dict["genome_pair1"] = self.genome_pair1
    bndl.genome_pair_dict["genome_pair2"] = self.genome_pair2
    self.bndl = bndl

    self.eval_correct1 = evaluation.Evaluation(status="correct")
    self.eval_correct2 = evaluation.Evaluation(status="correct")
    self.eval_error1 = evaluation.Evaluation(status="error")
    self.eval_error2 = evaluation.Evaluation(status="error")
def get_genome(tkt, gnm_type=""):
    """Construct a pdm_utils Genome object from a pdm_utils ImportTicket object.

    The genome id and name both come from ``tkt.phage_id``.  Every other
    field is copied from ``tkt.data_dict`` only when its name is listed in
    ``tkt.data_add``.

    :param tkt: A pdm_utils ImportTicket object.
    :type tkt: ImportTicket
    :param gnm_type: Identifier for the type of genome.
    :type gnm_type: str
    :returns: A pdm_utils Genome object.
    :rtype: Genome
    """
    gnm = genome.Genome()
    gnm.type = gnm_type
    gnm.set_id(value=tkt.phage_id)
    gnm.name = tkt.phage_id

    # Ticket field -> name of the Genome setter that stores it, in the
    # order the fields are applied.  None means the value is assigned
    # directly to the attribute of the same name.
    field_setters = (
        ("host_genus", "set_host_genus"),
        ("accession", "set_accession"),
        ("annotation_status", None),
        ("cluster", "set_cluster"),
        ("subcluster", "set_subcluster"),
        ("annotation_author", "set_annotation_author"),
        ("retrieve_record", "set_retrieve_record"),
    )
    for field, setter_name in field_setters:
        if field not in tkt.data_add:
            continue
        value = tkt.data_dict[field]
        if setter_name is None:
            gnm.annotation_status = value
        else:
            getattr(gnm, setter_name)(value)
    return gnm
def test_create_phage_table_insert_1(self):
    """Verify phage table INSERT statement is created correctly."""
    # Note: even though this function returns a string and doesn't
    # actually utilize a MySQL database, this test ensures
    # that the returned statement will function properly in MySQL.
    gnm = genome.Genome()
    genome_attributes = (
        ("id", "L5"),
        ("name", "L5_Draft"),
        ("host_genus", "Mycobacterium"),
        ("annotation_status", "final"),
        ("accession", "ABC123"),
        ("seq", Seq("ATCG", IUPAC.ambiguous_dna)),
        ("length", 4),
        ("gc", 0.5001),
        ("date", constants.EMPTY_DATE),
        ("retrieve_record", 1),
        ("annotation_author", 1),
        ("cluster", "Singleton"),
        ("subcluster", "A2"),
    )
    for attr_name, value in genome_attributes:
        setattr(gnm, attr_name, value)

    statement = mysqldb.create_phage_table_insert(gnm)
    test_db_utils.execute(statement)
    phage_data = test_db_utils.get_data(test_db_utils.phage_table_query)
    results = phage_data[0]

    # The "Singleton" cluster is expected to be written as NULL.
    exp = ("INSERT INTO phage "
           "(PhageID, Accession, Name, HostGenus, Sequence, "
           "Length, GC, Status, DateLastModified, RetrieveRecord, "
           "AnnotationAuthor, Cluster, Subcluster) "
           "VALUES "
           "('L5', 'ABC123', 'L5_Draft', 'Mycobacterium', 'ATCG', "
           f"4, 0.5001, 'final', '{constants.EMPTY_DATE}', 1, "
           "1, NULL, 'A2');")

    # Columns compared by plain equality, in reporting order.
    expected_columns = (
        ("PhageID", "L5"),
        ("Accession", "ABC123"),
        ("Name", "L5_Draft"),
        ("HostGenus", "Mycobacterium"),
        ("Sequence", "ATCG"),
        ("Length", 4),
        ("GC", 0.5001),
        ("Status", "final"),
        ("DateLastModified", constants.EMPTY_DATE),
        ("RetrieveRecord", 1),
        ("AnnotationAuthor", 1),
    )

    with self.subTest():
        self.assertEqual(statement, exp)
    for column, expected in expected_columns:
        with self.subTest():
            actual = results[column]
            if column == "Sequence":
                # The sequence comes back as bytes, so decode before comparing.
                actual = actual.decode("utf-8")
            self.assertEqual(actual, expected)
    with self.subTest():
        self.assertIsNone(results["Cluster"])
    with self.subTest():
        self.assertEqual(results["Subcluster"], "A2")
def parse_genome_data(data_dict, gnm_type="", seq=False):
    """Parse a dictionary of PhagesDB genome data into a pdm_utils Genome.

    :param data_dict: Dictionary of data retrieved from PhagesDB.
    :type data_dict: dict
    :param gnm_type: Identifier for the type of genome.
    :type gnm_type: str
    :param seq: Indicates whether the genome sequence should be retrieved.
    :type seq: bool
    :returns: A pdm_utils Genome object with the parsed data.
    :rtype: Genome
    """
    gnm = genome.Genome()
    gnm.type = gnm_type

    # The phage name doubles as the PhageID.
    phage_name = parse_phage_name(data_dict)
    gnm.name = phage_name
    gnm.set_id(value=phage_name)

    gnm.set_host_genus(parse_host_genus(data_dict), "empty_string")
    gnm.set_accession(parse_accession(data_dict), "empty_string")
    gnm.set_cluster(parse_cluster(data_dict))
    gnm.set_subcluster(parse_subcluster(data_dict))

    # The fasta file URL is recorded as the genome's filename.
    fasta_url = parse_fasta_filename(data_dict)
    gnm.set_filename(pathlib.Path(fasta_url))

    # The fasta record is only downloaded when a URL is present and the
    # caller asked for the sequence.
    # NOTE: the comparison is deliberately `== True`, so only True (or 1)
    # enables retrieval.
    wants_sequence = fasta_url != "" and seq == True
    if wants_sequence:
        fasta_data = retrieve_url_data(fasta_url)

        # TODO unit test - not sure how to test this, since this function
        # retrieves and parses files from PhagesDB.
        if fasta_data != "":
            header, sequence = parse_fasta_data(fasta_data)
            gnm.set_sequence(sequence)
            gnm.description = header
            gnm.parse_description()

    # Keep the raw PhagesDB data on the genome.
    gnm.misc = data_dict
    return gnm
def setUp(self):
    """Create objects for unit testing of the export_db pipeline."""
    # Test phage names.
    self.names = ["TestPhage_1", "TestPhage_2", "TestPhage_3"]

    # One Genome per name, in the same order.
    test_genomes = []
    for phage_name in self.names:
        gnm = genome.Genome()
        gnm.name = phage_name
        test_genomes.append(gnm)
    self.genomes = test_genomes

    # Three forward Cds objects spanning (1, 2), (2, 3) and (3, 4).
    test_cds_features = []
    for start in (1, 2, 3):
        cds_ftr = cds.Cds()
        cds_ftr.start = start
        cds_ftr.stop = start + 1
        cds_ftr.coordinate_format = "0_half_open"
        cds_ftr.orientation = 1
        test_cds_features.append(cds_ftr)
    self.cds_list = test_cds_features

    # A SeqRecord whose "comment" annotation is an empty tuple.
    record = SeqRecord(Seq("ATGC"))
    record.annotations["comment"] = ()
    self.test_seqrecord = record

    self.test_version_dictionary = {"Version": "Test",
                                    "SchemaVersion": "Test"}
def setUp(self):
    """Store the API URL constants and create an "add" genome and a bundle."""
    self.API_PREFIX = constants.API_PREFIX
    self.API_SUFFIX = constants.API_SUFFIX

    gnm = genome.Genome()
    gnm.id = "L5"
    gnm.type = "add"
    gnm.host_genus = "Gordonia"
    gnm.cluster = "B"
    self.genome1 = gnm

    self.bundle1 = bundle.Bundle()
def setUp(self):
    """Create a bundle holding a ticket, two genomes and two genome pairs.

    Three Source and three Cds features are created; only the first two of
    each are appended to the flat-file genome.  Four Eval objects (two
    "correct", two "error") are also prepared.
    """
    def _with_id(ftr, ftr_id):
        # Set the feature's id and hand the same object back.
        ftr.id = ftr_id
        return ftr

    self.ticket1 = ticket.ImportTicket()

    self.src1 = _with_id(source.Source(), "L5_SRC_1")
    self.src2 = _with_id(source.Source(), "L5_SRC_2")
    self.src3 = _with_id(source.Source(), "L5_SRC_3")

    self.cds1 = _with_id(cds.Cds(), "L5_CDS_1")
    self.cds2 = _with_id(cds.Cds(), "L5_CDS_2")
    self.cds3 = _with_id(cds.Cds(), "L5_CDS_3")

    # Features are added to the genome's existing lists rather than
    # replacing them.
    flat_file_gnm = genome.Genome()
    flat_file_gnm.type = "flat_file"
    flat_file_gnm.cds_features.extend([self.cds1, self.cds2])
    flat_file_gnm.source_features.extend([self.src1, self.src2])
    self.genome1 = flat_file_gnm

    mysql_gnm = genome.Genome()
    mysql_gnm.type = "mysql"
    self.genome2 = mysql_gnm

    self.genome_pair1 = genomepair.GenomePair()
    self.genome_pair2 = genomepair.GenomePair()

    # Genomes are keyed by their type, genome pairs by a fixed label.
    bndl = bundle.Bundle()
    bndl.ticket = self.ticket1
    bndl.genome_dict[self.genome1.type] = self.genome1
    bndl.genome_dict[self.genome2.type] = self.genome2
    bndl.genome_pair_dict["genome_pair1"] = self.genome_pair1
    bndl.genome_pair_dict["genome_pair2"] = self.genome_pair2
    self.bndl = bndl

    self.eval_correct1 = eval.Eval(status="correct")
    self.eval_correct2 = eval.Eval(status="correct")
    self.eval_error1 = eval.Eval(status="error")
    self.eval_error2 = eval.Eval(status="error")
def setUp(self):
    """Create a fresh ``test_db`` database and the objects the tests use.

    Any existing ``test_db`` database is dropped and recreated.  A Genome
    with placeholder attribute values, a SeqRecord and a ``DELETE_ME``
    working directory under the current directory are then created.

    The setup connection is closed before this method returns, including
    when a statement fails.
    """
    # Creates a test database, dropping any leftover copy first.
    self.connection = pymysql.connect(
        host="localhost",
        user="******",
        password="******",
        cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = self.connection.cursor()
        cur.execute(
            "SELECT SCHEMA_NAME FROM "
            "INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = 'test_db'")
        if cur.fetchall():
            cur.execute("DROP DATABASE test_db")
            self.connection.commit()

        cur.execute("CREATE DATABASE test_db;")
        self.connection.commit()
    finally:
        # close() must actually be called; the bare attribute reference
        # used previously was a no-op that left the connection open.
        # NOTE(review): assumes no test reuses self.connection after setUp.
        self.connection.close()

    # Creates Genome object
    gnm = genome.Genome()
    gnm.id = "TestID"
    gnm.accession = "TestAccession"
    gnm.name = "Test"
    gnm.host_genus = "TestHost"
    gnm.length = "TestLength"
    gnm.date = "TestDate"
    gnm.description = "TestDescription"
    gnm.gc = "TestGC"
    gnm.cluster = "TestCluster"
    gnm.subcluster = "TestSubcluster"
    gnm.annotation_status = "TestStatus"
    gnm.retrieve_record = "TestRecord"
    gnm.annotation_author = "TestAuthor"
    self.genome = gnm

    # Creates SeqRecord object
    seqrecord = SeqRecord(Seq("ATGC"))
    seqrecord.seq.alphabet = IUPACAmbiguousDNA()
    seqrecord.id = "Test_Accession"
    seqrecord.name = "Test"
    self.test_record = seqrecord

    # Creates working test directory
    self.test_cwd = Path.cwd().joinpath("DELETE_ME")
    self.test_cwd.mkdir()
def setUp(self):
    """Create three separate, default-initialised Genome objects."""
    self.genome1, self.genome2, self.genome3 = (
        genome.Genome() for _ in range(3))
def parse_phage_table_data(data_dict, trans_table=11, gnm_type=""):
    """Parse a MySQL database dictionary to create a Genome object.

    Parsing is best-effort: a column that is missing from ``data_dict``,
    or whose value cannot be converted or stored, is skipped and the
    matching Genome attribute keeps its default value.

    :param data_dict: Dictionary of data retrieved from the phage table.
    :type data_dict: dict
    :param trans_table:
        The translation table that can be used to translate CDS features.
    :type trans_table: int
    :param gnm_type: Identifier for the type of genome.
    :type gnm_type: str
    :returns: A pdm_utils Genome object.
    :rtype: Genome
    """
    gnm = genome.Genome()

    # Each column is parsed in its own try block so one missing or
    # malformed column does not stop the others from being parsed.
    # Exception is caught instead of using a bare except so that
    # KeyboardInterrupt and SystemExit still propagate.
    try:
        gnm.id = data_dict["PhageID"]
    except Exception:
        pass

    try:
        gnm.accession = data_dict["Accession"]
    except Exception:
        pass

    try:
        gnm.name = data_dict["Name"]
    except Exception:
        pass

    try:
        gnm.host_genus = data_dict["HostGenus"]
    except Exception:
        pass

    try:
        # Sequence data is stored as MEDIUMBLOB, so decode to string.
        gnm.set_sequence(data_dict["Sequence"].decode("utf-8"))
    except Exception:
        pass

    try:
        gnm.length = int(data_dict["Length"])
    except Exception:
        pass

    try:
        # DateLastModified gets returned as a datetime.datetime object.
        # TODO some phages have no date, so it will be returned NULL.
        gnm.date = data_dict["DateLastModified"]
    except Exception:
        pass

    try:
        gnm.description = data_dict["Notes"].decode("utf-8")
    except Exception:
        pass

    try:
        gnm.gc = float(data_dict["GC"])
    except Exception:
        pass

    try:
        # Singletons are stored in the MySQL database as NULL, which gets
        # returned as None.
        gnm.set_cluster(data_dict["Cluster"])
    except Exception:
        pass

    try:
        # Non-subclustered phages are stored in the MySQL database as
        # NULL, which gets returned as None.
        gnm.set_subcluster(data_dict["Subcluster"])
    except Exception:
        pass

    try:
        gnm.annotation_status = data_dict["Status"]
    except Exception:
        pass

    try:
        gnm.set_retrieve_record(data_dict["RetrieveRecord"])
    except Exception:
        pass

    try:
        gnm.set_annotation_author(data_dict["AnnotationAuthor"])
    except Exception:
        pass

    gnm.translation_table = trans_table
    gnm.type = gnm_type
    return gnm
def setUp(self):
    """Create one default Cds, one default Source and one default Genome."""
    cds_feature = cds.Cds()
    source_feature = source.Source()
    test_genome = genome.Genome()

    self.cds_ftr = cds_feature
    self.src_ftr = source_feature
    self.gnm = test_genome
def parse_genome_data(seqrecord, filepath=pathlib.Path(),
                      translation_table=11, genome_id_field="_organism_name",
                      gnm_type="", host_genus_field="_organism_host_genus"):
    """Parse data from a Biopython SeqRecord object into a Genome object.

    Source, CDS and tRNA features are parsed into Source, Cds and Trna
    objects and attached to the genome.  tmRNA features are parsed but not
    yet attached, and only Cds and Source feature ids are assigned.

    Record fields that are missing or cannot be read fall back to an empty
    string (or ``constants.EMPTY_DATE`` for the date) instead of raising.

    :param seqrecord: A Biopython SeqRecord object.
    :type seqrecord: SeqRecord
    :param filepath: A filename associated with the returned Genome object.
    :type filepath: Path
    :param translation_table:
        The applicable translation table for the genome's CDS features.
    :type translation_table: int
    :param genome_id_field:
        The Genome attribute from which the genome identifier/name is
        taken.  If it is an empty string, the SeqRecord name and id are
        used instead.
    :type genome_id_field: str
    :param gnm_type: Identifier for the type of genome.
    :type gnm_type: str
    :param host_genus_field:
        The attribute name passed to the Genome's host genus setter.
    :type host_genus_field: str
    :returns: A pdm_utils Genome object.
    :rtype: Genome
    """
    # Keep track of the file from which the record is derived.
    gnm = genome.Genome()
    gnm.set_filename(filepath)
    gnm.type = gnm_type

    # The fallbacks below catch Exception rather than using a bare except,
    # so KeyboardInterrupt and SystemExit still propagate.
    try:
        gnm.organism = seqrecord.annotations["organism"]
    except Exception:
        gnm.organism = ""

    # Identifies host and phage name from organism field.
    gnm.parse_organism()

    try:
        # Since accessions are stored in a list, there may be more than
        # one accessions associated with this file.
        # The first accession in the list is assumed to be the most recent.
        accession = seqrecord.annotations["accessions"][0]
    except Exception:
        accession = ""
    gnm.set_accession(accession)

    try:
        gnm.description = seqrecord.description
        # It appears that if description is not present, Biopython
        # auto-populates this attribute as "<unknown description>"
        if gnm.description == "<unknown description>":
            gnm.description = ""
    except Exception:
        gnm.description = ""

    # Identifies host and phage name from description field.
    gnm.parse_description()

    try:
        gnm.source = seqrecord.annotations["source"]
    except Exception:
        gnm.source = ""

    # Identifies host and phage name from record source field.
    gnm.parse_source()

    try:
        # The retrieved authors can be stored in multiple Reference elements.
        refs = seqrecord.annotations["references"]
        # Note: Reference objects are instantiated with an empty
        # authors attribute. So if no authors are present in a Reference,
        # it will still concatenate an empty string, resulting in an
        # author_string = ";;;" etc. So only add the authors info if
        # it is not an empty string.
        authors_list = [ref.authors for ref in refs if ref.authors != ""]
        gnm.authors = ";".join(authors_list)
    except Exception:
        gnm.authors = ""

    # Biopython requires the parsed record contains a sequence, so
    # no need to test whether the seq attribute is present or not.
    # Nucleotide sequence, length, and % GC.
    gnm.set_sequence(seqrecord.seq)

    try:
        date = seqrecord.annotations["date"]
        gnm.date = datetime.strptime(date, "%d-%b-%Y")
    except Exception:
        gnm.date = constants.EMPTY_DATE

    # Now that record fields are parsed, set the genome name, id,
    # and host_genus.
    if genome_id_field != "":
        gnm.name = getattr(gnm, genome_id_field)
        gnm.set_id(value=gnm.name)
    else:
        # The seqrecord name and id are used if genome_id_field is empty.
        try:
            gnm.name = seqrecord.name
            # It appears that if name is not present, Biopython
            # auto-populates this attribute as "<unknown name>"
            if gnm.name == "<unknown name>":
                gnm.name = ""
        except Exception:
            gnm.name = ""

        try:
            gnm.id = seqrecord.id
            # It appears that if id is not present, Biopython
            # auto-populates this attribute as "<unknown id>"
            if gnm.id == "<unknown id>":
                gnm.id = ""
        except Exception:
            gnm.id = ""

    gnm.set_host_genus(attribute=host_genus_field)

    # Create lists of parsed features.
    # Note: Biopython instantiates the features attribute with
    # an empty list, so no need to test if features attribute is
    # present or not.
    seqfeature_dict = create_seqfeature_dictionary(seqrecord.features)

    cds_list = []
    if "CDS" in seqfeature_dict:
        for seqfeature in seqfeature_dict["CDS"]:
            cds_ftr = parse_cds_seqfeature(seqfeature)
            cds_ftr.genome_id = gnm.id
            cds_ftr.genome_length = gnm.length
            cds_ftr.set_nucleotide_sequence(parent_genome_seq=gnm.seq)
            cds_list.append(cds_ftr)

    source_list = []
    if "source" in seqfeature_dict:
        for seqfeature in seqfeature_dict["source"]:
            src_ftr = parse_source_seqfeature(seqfeature)
            src_ftr.genome_id = gnm.id
            source_list.append(src_ftr)

    # TODO unit test after functions are constructed.
    # TODO implement for trnas
    trna_list = []
    if "tRNA" in seqfeature_dict:
        for seqfeature in seqfeature_dict["tRNA"]:
            trna_ftr = parse_trna_seqfeature(seqfeature)
            trna_list.append(trna_ftr)

    # TODO unit test after functions are constructed.
    # NOTE(review): the key checked here is lower-case "tmrna"; confirm it
    # matches the key create_seqfeature_dictionary uses for tmRNA features.
    tmrna_list = []
    if "tmrna" in seqfeature_dict:
        for seqfeature in seqfeature_dict["tmrna"]:
            tmrna_ftr = parse_tmrna_seqfeature(seqfeature)
            tmrna_list.append(tmrna_ftr)

    gnm.translation_table = translation_table
    gnm.set_cds_features(cds_list)
    gnm.set_source_features(source_list)
    gnm.set_trna_features(trna_list)
    # TODO attach tmRNA features.
    # gnm.set_tmrna_features(tmrna_list)

    # The Cds.id is constructed from the Genome.id and the Cds order.
    gnm.set_feature_ids(use_type=True, use_cds=True)
    gnm.set_feature_ids(use_type=True, use_source=True)

    # TODO set tRNA feature ids.
    # gnm.set_feature_ids(use_type=True, use_trna=True)
    return gnm
def setUp(self):
    """Store the API URL constants and create a default Genome."""
    api_prefix = constants.API_PREFIX
    api_suffix = constants.API_SUFFIX

    self.API_PREFIX = api_prefix
    self.API_SUFFIX = api_suffix
    self.gnm = genome.Genome()
def setUp(self):
    """Create one genome plus two Cds, two Trna and two Tmrna features.

    The two Trna objects are given identical attribute values, as are the
    two Tmrna objects.  Features are collected in ``cds_features``,
    ``trna_features`` and ``tmrna_features``; they are not attached to the
    genome here.
    """
    def _populate(obj, **attributes):
        # Assign the attributes in the order given and return the object.
        for attr_name, value in attributes.items():
            setattr(obj, attr_name, value)
        return obj

    self.genome1 = _populate(
        genome.Genome(),
        id="L5",
        name="L5_Draft",
        host_genus="Mycobacterium",
        annotation_status="final",
        accession="ABC123",
        seq="ATCG",
        length=4,
        gc=0.5001,
        date='1/1/2000',
        retrieve_record="1",
        annotation_author="1",
        cluster="A",
        subcluster="A2",
    )

    self.cds1 = _populate(
        cds.Cds(),
        genome_id="L5",
        start=10,
        stop=100,
        parts=1,
        length=1000,
        name="1",
        type="CDS",
        translation="AGGPT",
        orientation="F",
        description="description",
        locus_tag="SEA_L5_001",
    )
    self.cds2 = _populate(
        cds.Cds(),
        genome_id="L5",
        start=100,
        stop=1000,
        parts=1,
        length=10000,
        name="2",
        type="CDS",
        translation="AKKQE",
        orientation="R",
        description="description",
        locus_tag="SEA_L5_002",
    )
    self.cds_features = [self.cds1, self.cds2]

    # Both Trna objects share exactly the same values.
    trna_values = dict(
        id="Trixie_1",
        genome_id="Trixie",
        name="1",
        locus_tag="TAG1",
        start=5,
        stop=10,
        length=200,
        orientation="F",
        note="misc",
        amino_acid="Ala",
        anticodon="AAA",
        structure="random",
        use="aragorn",
    )
    self.trna1 = _populate(trna.Trna(), **trna_values)
    self.trna2 = _populate(trna.Trna(), **trna_values)
    self.trna_features = [self.trna1, self.trna2]

    # Both Tmrna objects share exactly the same values.
    tmrna_values = dict(
        id="Trixie_1",
        genome_id="Trixie",
        name="1",
        locus_tag="TAG1",
        start=5,
        stop=10,
        length=200,
        orientation="F",
        note="misc",
        peptide_tag="random",
    )
    self.tmrna1 = _populate(tmrna.Tmrna(), **tmrna_values)
    self.tmrna2 = _populate(tmrna.Tmrna(), **tmrna_values)
    self.tmrna_features = [self.tmrna1, self.tmrna2]