def process_sam_alignment(db_config, path_config, sam_id, log_file):
    """Entry point for the SAM-alignment pipeline.

    Steps:
      1. create the alignment working directory and per-organism
         sequence files for the aligner
      2. run the LastZ alignments (run_sam_alignment)
      3. parse every produced SAM file (parse_sam) into one output file
      4. upload the parsed data into the database (upload_sam_data)
    """
    upload_path = path_config.upload_dir
    lastz_program = path_config.lastz
    logger = logging_utility.logger_function(__name__, log_file)
    sam_path = create_sam_alignment_directory(upload_path)
    # Alignments already present in the DB are skipped by run_sam_alignment.
    sam_file_list = find_uploaded_sam_files(db_config, sam_path)
    organism_hierarchy_dct = get_organism_hierarchy_map(db_config)
    sequence_file_list = create_sequence_file_for_gal_using_dct(
        db_config, sam_path)
    sam_output_file_list = run_sam_alignment(lastz_program,
                                             organism_hierarchy_dct,
                                             sequence_file_list,
                                             sam_file_list, sam_path,
                                             log_file)
    if sam_output_file_list:
        parsed_sam_file = get_sam_parsed_file_name(sam_path)
        # Fix: the original opened the handle without ever closing it and
        # uploaded the file while it was still open (possibly unflushed).
        # A context manager guarantees the data is on disk before upload.
        with open(parsed_sam_file, 'w') as parsed_sam_fh:
            for sam_output in sam_output_file_list:
                sam_file_path = os.path.join(sam_path, sam_output)
                # parse_sam returns the next free id so ids stay unique
                # across files.
                sam_id = parse_sam(sam_file_path, sam_id, parsed_sam_fh)
        upload_sam_data(db_config, parsed_sam_file)
def process_interpro_data(db_config, org_config, path_config, taxonomy_id,
                          log_file):
    """Upload InterProScan results when a result file is configured.

    Silently does nothing when the configured InterProScan file is
    missing from disk.
    """
    logger = logging_utility.logger_function(__name__, log_file)
    interpro_file = org_config.interproscan
    if not os.path.exists(interpro_file):
        return
    logger.info('processing Interproscan data')
    interproscan_parser.process_interpro_data(db_config,
                                              path_config.upload_dir,
                                              interpro_file, taxonomy_id,
                                              org_config.version)
def process_protein_feature_algorithm_data(config, random_str, taxonomy_dct,
                                           log_file):
    """Run the protein-feature algorithms on the organism's proteins.

    Executes HMMSCAN (Pfam), SignalP and TMHMM on a freshly exported
    protein FASTA file, parses each raw result and uploads the parsed
    rows into the protein-feature table.
    """
    logger = logging_utility.logger_function(__name__, log_file)
    db_config = config.db_config
    org_config = config.org_config
    path_config = config.path_config

    # Export every protein of this organism/version to <random_str>.aa.
    protein_path = os.path.join(path_config.upload_dir, random_str + '.aa')
    taxonomy_id = taxonomy_dct['TAXON_ID']
    organism_function.create_protein_file(db_config, taxonomy_id,
                                          org_config.version, protein_path)
    feature_paths = directory_utility.ProteinFeatureFileName(
        path_config.upload_dir, random_str)
    organism_type = organism_function.find_organism_type(
        org_config.organism, taxonomy_dct, db_config)

    # Pfam domain scan.
    logger.info('External Program: HMMPFAM started')
    external_program.run_hmmpfam(path_config.HMMSCAN, path_config.hmm_db,
                                 protein_path, feature_paths.PFam_out,
                                 logger)
    logger.info('HMMpFAM result: {}'.format(feature_paths.PFam_out))

    # Signal-peptide prediction; the organism type selects the model.
    logger.info('External Program: SignalP started')
    external_program.run_signal_p(path_config.signalp, protein_path,
                                  feature_paths.SignalP_out, logger,
                                  organism_type)
    logger.info('Signalp result: {}'.format(feature_paths.SignalP_out))

    # Transmembrane-helix prediction.
    logger.info('External Program: TMHMM started')
    external_program.run_tmhmm(path_config.TMHMM, protein_path,
                               feature_paths.TmHmm_out, logger)
    logger.info('TMHMM result: {}'.format(feature_paths.TmHmm_out))

    # Parse the three raw outputs into upload-ready files and push them.
    feature_id_list = db_table.get_protein_feature_table_status(db_config)
    protein_algorithm_utility.parse_hmmscan_result(
        feature_paths.PFam_out, feature_paths.PFam, feature_id_list)
    protein_algorithm_utility.process_signalp_result(
        feature_paths.SignalP_out, feature_paths.SignalP, feature_id_list)
    protein_algorithm_utility.process_tmhmm_result(
        feature_paths.TmHmm_out, feature_paths.TmHmm, feature_id_list)
    db_table.upload_protein_feature_table_data(db_config, feature_paths)
def process_type2_data(org_config, path_config, random_name, gal_dir,
                       log_file):
    """Handle "no annotation" data: only a FASTA file is provided.

    Runs a gene-prediction program (Augustus by default, GeneMark when
    configured) against the reference genome, BLASTs the predicted
    proteins, and builds the GAL model dictionaries.

    Returns (sequence_dct, model_gff_dct).
    """
    logger = logging_utility.logger_function(__name__, log_file)
    sequence_file = org_config.fasta
    reference_genome = org_config.RefOrg
    upload_dir = path_config.upload_dir
    file_name = os.path.join(upload_dir, random_name)
    gff_file_name = file_name + ".gff"
    program_name = org_config.program

    def _predict_with_augustus():
        # Shared by the explicit 'augustus' choice and the default branch
        # (the original duplicated these two calls verbatim).
        external_program.run_augustus(path_config.augustus, reference_genome,
                                      sequence_file, gff_file_name, logger)
        external_program.fetch_protein(gal_dir, gff_file_name)

    if not program_name or program_name.lower() == 'augustus':
        _predict_with_augustus()
    elif program_name.lower() == 'genemark':
        external_program.run_genemark(path_config.genmark,
                                      path_config.genmark_model,
                                      reference_genome, sequence_file,
                                      file_name, logger)
    else:
        logger.error('Please check the gene prediction program name')
        # Fix: exit with a non-zero status on a configuration error
        # (the original exited with 0, which signals success to callers).
        sys.exit(1)

    protein_file_name = os.path.join(upload_dir, random_name + ".aa")
    blast_file_name = os.path.join(upload_dir, random_name + ".blast")
    external_program.run_blast(path_config.blastp, protein_file_name,
                               blast_file_name, gal_dir, logger)
    sequence_dct = bu.read_fasta_to_dictionary(org_config.fasta)
    gff_dct = gff_parser.read_gff3_genbank(gff_file_name)
    blast_dct = blastparser.parse_file(blast_file_name)
    model_gff_dct = ppf.create_gal_model_dct(sequence_dct, gff_dct, blast_dct)
    return sequence_dct, model_gff_dct
def run_sam_alignment(program, organism_hierarchy_dct, file_name_list,
                      uploaded_sam_file_list, path, log_file):
    """Run LastZ on every ordered pair of sequence files whose organisms
    match at the deepest available taxonomy level.

    Pairs whose output file name already appears in
    ``uploaded_sam_file_list`` are skipped.

    Returns the list of SAM output paths produced by this run.
    """
    logger = logging_utility.logger_function(__name__, log_file)

    def _checking_level(h1, h2):
        # Walk down CLASS -> ORDERS -> FAMILY -> GENUS and stop at the
        # first level both organisms have a (non-None) value for; CLASS
        # is only considered when both hierarchies actually carry the key.
        level = 'CLASS'
        if 'CLASS' in h1 and 'CLASS' in h2:
            for current, deeper in (('CLASS', 'ORDERS'),
                                    ('ORDERS', 'FAMILY'),
                                    ('FAMILY', 'GENUS')):
                if h1[current] is None or h2[current] is None:
                    level = deeper
                else:
                    break
        return level

    run_sam_list = []
    for file1 in file_name_list:
        hierarchy1 = organism_hierarchy_dct[file1]
        for file2 in file_name_list:
            if file1 == file2:
                continue  # never align a file against itself
            hierarchy2 = organism_hierarchy_dct[file2]
            level = _checking_level(hierarchy1, hierarchy2)
            if hierarchy1[level] != hierarchy2[level]:
                continue  # organisms too distant to be worth aligning
            output_filename = "{}__{}.out".format(file1, file2)
            if output_filename in uploaded_sam_file_list:
                continue  # already aligned and uploaded in a previous run
            logger.info(output_filename)
            sam_file_path = os.path.join(path, output_filename)
            run_sam_list.append(sam_file_path)
            external_program.run_lastz(program,
                                       os.path.join(path, file1),
                                       os.path.join(path, file2),
                                       sam_file_path, logger)
    return run_sam_list
def process_central_dogma_data(config, random_string, id_list, present_dir,
                               log_file):
    """Dispatch on the detected annotation data type, build the sequence
    and feature dictionaries, then upload the annotation data.

    Returns the detected data-type string produced by check_data_type.
    """
    db_config = config.db_config
    org_config = config.org_config
    path_config = config.path_config
    data_type = check_data_type(org_config, path_config)
    logger = logging_utility.logger_function(__name__, log_file)

    if data_type == 'type1':  # GenBank annotation
        logger.info('Processing GenBank type Data...')
        sequence_dct, feature_dct = process_type1_data(org_config)
    elif data_type == 'type2':  # no annotation: run gene prediction
        logger.info("Processing No Annotation type Data...")
        sequence_dct, feature_dct = process_type2_data(
            org_config, path_config, random_string, present_dir, log_file)
    elif data_type == "type3":  # minimal annotation
        logger.info("Processing Minimal Annotation type Data...")
        sequence_dct, feature_dct = process_type3_data(org_config)
    elif data_type == 'type4':  # partial annotation
        logger.info("Processing Complete Annotation type Data...")
        sequence_dct, feature_dct = process_type4_data(org_config)
    else:
        # Unrecognised type: nothing to upload.
        return data_type

    # Every recognised branch performed the same two calls; do them once.
    process_minimal_annotation_data(db_config, org_config, path_config,
                                    sequence_dct, feature_dct, id_list,
                                    logger)
    db_table.upload_gal_table_data(db_config, path_config.upload_dir, logger)
    # Fix: the original ended with ``return type`` — the *builtin* ``type``
    # (a copy/paste slip); return the detected data type instead.
    return data_type
def update_organism_table(db_config, org_name, org_ver, log_file):
    """Insert one row into the Organism table for this organism/version.

    The taxonomy hierarchy (genus, order, phylum, class, family,
    superkingdom) is looked up from the organism name.  The first two
    whitespace-separated tokens of ``org_name`` form the species name;
    any remaining tokens are stored as the strain.

    Returns the taxonomy dictionary (wrapped in NoneDict, so missing
    keys yield None instead of raising).
    """
    logger = logging_utility.logger_function(__name__, log_file)
    taxonomy_dct = NoneDict(get_org_hierarchy(db_config, org_name))
    org = re.split(r'\s', org_name)
    species = org[0] + " " + org[1]
    # Everything after "Genus species" is treated as the strain name.
    strain = " ".join(org[2:]) if len(org) > 2 else ''
    db_dots = create_db_connection(db_config)
    taxonomy_id = taxonomy_dct['TAXON_ID']
    genus = taxonomy_dct['genus']
    order = taxonomy_dct['order']
    phylum = taxonomy_dct['phylum']
    class_name = taxonomy_dct['class']
    family = taxonomy_dct['family']
    super_kingdom = taxonomy_dct['superkingdom']
    # NOTE(review): the query is built by string interpolation; switch to
    # the DB layer's parameter binding if db_dots supports it — organism
    # names containing a quote would break (or abuse) this statement.
    query = """ INSERT INTO Organism(TAXON_ID, SPECIES, STRAIN, PHYLUM,
    FAMILY, GENUS, ORDERS, CLASS, SUPERKINGDOM, VERSION) VALUES
    (%s, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')
    """ % (taxonomy_id, species, strain, phylum, family, genus, order,
           class_name, super_kingdom, org_ver)
    db_dots.insert(query)
    logger.info(" Organism Table update complete")
    return taxonomy_dct
def database_schema(db_config, main_path, log_filename):
    """Upload the database schema unless it already exists."""
    logger = logging_utility.logger_function(__name__, log_filename)
    schema = UploadSchema(db_config, main_path)
    if schema.check_schema_existence():
        logger.debug('Database Schema already exist')
        return
    logger.debug('Uploading Database Scheme : Processing')
    schema.upload_shared_resource_schema(logger)
    logger.debug('Shared resource Schema upload complete')
    schema.upload_dots_schema(logger)
    logger.debug('DOTS Schema upload complete')
    schema.add_database_constrain()
    logger.debug('Uploading Database Scheme : Complete')
def get_table_status(db_config, log_filename):
    """Return the current MAX primary-key id of the five core DoTS tables.

    Returns a list in the order:
    [NASequenceImp, NAFeatureImp, NALocation, GeneInstance, Protein].
    """
    logger = logging_utility.logger_function(__name__, log_filename)
    db_name = db_function.DbNames(db_config.db_prefix)
    db_dots = db_function.Database(db_config.host, db_config.db_username,
                                   db_config.db_password, db_name.dots, 0)
    queries = (
        "SELECT MAX(NA_SEQUENCE_ID) as LAST_ID FROM NASequenceImp",
        "SELECT MAX(NA_FEATURE_ID) as LAST_ID FROM NAFeatureImp",
        "SELECT MAX(NA_LOCATION_ID) as LAST_ID FROM NALocation",
        "SELECT MAX(GENE_INSTANCE_ID) as LAST_ID FROM GeneInstance",
        "SELECT MAX(PROTEIN_ID) as LAST_ID FROM Protein",
    )
    row_list = [get_max_table_value(db_dots, sql) for sql in queries]
    (row_na_sequence, row_na_feature, row_na_location, row_gene_instance,
     row_protein) = row_list
    print_str = """Getting Max IDs of each table..
    NASequenceImp ID: {}
    NAFeatureImp ID: {}
    NALocation ID: {}
    GeneInstance ID: {}
    Protein ID: {}
    """.format(row_na_sequence, row_na_feature, row_na_location,
               row_gene_instance, row_protein)
    logger.info(print_str)
    # Fix: the original returned [sequence, feature, feature, feature,
    # feature] — a copy/paste slip that discarded the NALocation,
    # GeneInstance and Protein ids it had just queried.
    return row_list
def check_organism_existence(db_config, org_name, org_ver, log_file):
    """Return True when this organism/version must NOT be uploaded.

    True means either the name is missing/unknown or the exact
    name + version pair already exists in the Organism table; False
    means this is a new organism.
    """
    logger = logging_utility.logger_function(__name__, log_file)
    if not org_name:
        logger.info("Error: Organism Name does not exist \n")
        return True
    logger.info('Organism: {} version: {}'.format(org_name, org_ver))
    taxonomy_id = get_taxonomy_id(db_config, org_name)
    if not taxonomy_id:
        logger.info("Error: Please check the organism name")
        return True
    db_dots = create_db_connection(db_config)
    sql_query = "select * from Organism where TAXON_ID = %s and VERSION = %s" % (
        taxonomy_id, org_ver)
    if db_dots.rowcount(sql_query) == 1:
        logger.info("Error: Organism Name and same version already exists \n")
        return True
    logger.info("New Organism")
    return False
def create_row_files(db_config, taxonomy_id, org_name, org_version,
                     path_config, log_file):
    """Export genomic and protein FASTA files and build BLAST databases.

    Writes one scaffold-sequence file and one protein file (named after
    the organism's short name) into the blast directory tree, then runs
    the database-formatting tool on each.
    """
    logger = logging_utility.logger_function(__name__, log_file)
    org_info = organism_function.OrganismInfo(org_name, taxonomy_id,
                                              org_version)
    nucleotide_dir, protein_dir = \
        directory_utility.create_blast_feature_directory(
            path_config.blast_path)
    genomic_seq_file = PurePosixPath(nucleotide_dir, org_info.org_short_name)
    protein_seq_file = PurePosixPath(protein_dir, org_info.org_short_name)
    db_dots = db_function.create_db_dots_connection(db_config)
    create_scaffold_sequence_file(db_dots, taxonomy_id, org_version,
                                  genomic_seq_file)
    create_protein_file(db_dots, taxonomy_id, org_version, protein_seq_file)
    formatter = path_config.db_creator
    external_program.nucleotide_format_db(formatter, genomic_seq_file,
                                          genomic_seq_file, logger)
    external_program.protein_format_db(formatter, protein_seq_file,
                                       protein_seq_file, logger)
from __future__ import print_function
import os
import sys
from pathlib import Path, PurePosixPath
from galpy import data_schedule_utility
from galpy import logging_utility, command_argument, gal_function
import main

# Script entry: parse command-line arguments relative to this script's
# directory and set up logging.
CurrDir = Path(__file__).parent.absolute()
arg = command_argument.ProcessArguments(CurrDir)
logger = logging_utility.logger_function(__name__, arg.log_file)
logger.info('GAL Upload started \n')
# configuration file existence check and configuration parser.
config = gal_function.ConfigFileHandler(arg.db_config_file,
                                        arg.path_config_file,
                                        arg.org_config_file, logger)
db_config = config.db_config
path_config = config.path_config
org_config = config.org_config
# check database connection, upload schema and common data upload
main.process_schema_common_data(db_config, arg, CurrDir)
# For a fresh upload, record this organism/version in the schedule log.
if arg.new_upload:
    status_log = data_schedule_utility.StatusLog(db_config)
    status_log.submit_log(org_config.organism, org_config.version,
                          arg.org_config_file)