Example #1
def process_sam_alignment(db_config, path_config, sam_id, log_file):
    """This is the first need to call. This function does the following things
     1. call create_fasta_for_gal() function that creates the fasta files for the program
     2. call run_sam_alignment() function : ran lastz program
     3. call parse_sam() function : parse sam file
     4. call upload_sam_data() function : upload the parsed data into the database.
     """
    upload_path = path_config.upload_dir
    lastz_program = path_config.lastz
    logger = logging_utility.logger_function(__name__, log_file)
    sam_path = create_sam_alignment_directory(upload_path)
    sam_file_list = find_uploaded_sam_files(db_config, sam_path)
    organism_hierarchy_dct = get_organism_hierarchy_map(db_config)

    sequence_file_list = create_sequence_file_for_gal_using_dct(
        db_config, sam_path)
    sam_output_file_list = run_sam_alignment(lastz_program,
                                             organism_hierarchy_dct,
                                             sequence_file_list, sam_file_list,
                                             sam_path, log_file)

    if sam_output_file_list:
        parsed_sam_file = get_sam_parsed_file_name(sam_path)
        # Context manager ensures the parsed file is flushed and closed
        # before it is uploaded.
        with open(parsed_sam_file, 'w') as parsed_sam_fh:
            for sam_output in sam_output_file_list:
                sam_file_path = os.path.join(sam_path, sam_output)
                sam_id = parse_sam(sam_file_path, sam_id, parsed_sam_fh)

        upload_sam_data(db_config, parsed_sam_file)
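
# Usage sketch (hedged): drive the whole SAM pipeline for one run. The config
# objects come from the project's config parser; the starting sam_id and the
# log path below are illustrative assumptions, not values from this module.
#
#     config = gal_function.ConfigFileHandler(db_ini, path_ini, org_ini, logger)
#     process_sam_alignment(config.db_config, config.path_config,
#                           sam_id=1, log_file='gal_upload.log')
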
def process_interpro_data(db_config, org_config, path_config, taxonomy_id,
                          log_file):
    logger = logging_utility.logger_function(__name__, log_file)

    if os.path.exists(org_config.interproscan):
        logger.info('processing Interproscan data')
        interproscan_parser.process_interpro_data(db_config,
                                                  path_config.upload_dir,
                                                  org_config.interproscan,
                                                  taxonomy_id,
                                                  org_config.version)

def process_protein_feature_algorithm_data(config, random_str, taxonomy_dct,
                                           log_file):
    logger = logging_utility.logger_function(__name__, log_file)

    db_config = config.db_config
    org_config = config.org_config
    path_config = config.path_config

    protein_path = os.path.join(path_config.upload_dir, random_str + '.aa')
    taxonomy_id = taxonomy_dct['TAXON_ID']
    organism_function.create_protein_file(db_config, taxonomy_id,
                                          org_config.version, protein_path)
    protein_feature_path = directory_utility.ProteinFeatureFileName(
        path_config.upload_dir, random_str)

    organism_type = organism_function.find_organism_type(
        org_config.organism, taxonomy_dct, db_config)

    hmmpfam_program = path_config.HMMSCAN
    pfam_db = path_config.hmm_db
    logger.info('External Program: HMMPFAM started')
    external_program.run_hmmpfam(hmmpfam_program, pfam_db, protein_path,
                                 protein_feature_path.PFam_out, logger)
    logger.info('HMMPFAM result: {}'.format(protein_feature_path.PFam_out))

    signalp_program = path_config.signalp
    logger.info('External Program: SignalP started')
    external_program.run_signal_p(signalp_program, protein_path,
                                  protein_feature_path.SignalP_out, logger,
                                  organism_type)
    logger.info('SignalP result: {}'.format(protein_feature_path.SignalP_out))

    tmhmm_path = path_config.TMHMM
    logger.info('External Program: TMHMM started')
    external_program.run_tmhmm(tmhmm_path, protein_path,
                               protein_feature_path.TmHmm_out, logger)
    logger.info('TMHMM result: {}'.format(protein_feature_path.TmHmm_out))

    protein_feature_id_list = db_table.get_protein_feature_table_status(
        db_config)

    protein_algorithm_utility.parse_hmmscan_result(
        protein_feature_path.PFam_out, protein_feature_path.PFam,
        protein_feature_id_list)

    protein_algorithm_utility.process_signalp_result(
        protein_feature_path.SignalP_out, protein_feature_path.SignalP,
        protein_feature_id_list)
    protein_algorithm_utility.process_tmhmm_result(
        protein_feature_path.TmHmm_out, protein_feature_path.TmHmm,
        protein_feature_id_list)

    db_table.upload_protein_feature_table_data(db_config, protein_feature_path)
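
# Hedged sketch: HMMSCAN, SignalP and TMHMM above all follow the same
# "log start -> run tool -> log result" shape. A small wrapper like this
# (an illustrative assumption, not part of the original module) removes the
# repeated logging:
def run_tool_and_log(tool_name, run_fn, result_path, logger):
    """run_fn: zero-argument callable that executes the external program."""
    logger.info('External Program: {} started'.format(tool_name))
    run_fn()
    logger.info('{} result: {}'.format(tool_name, result_path))

# e.g. run_tool_and_log(
#          'TMHMM',
#          lambda: external_program.run_tmhmm(tmhmm_path, protein_path,
#                                             protein_feature_path.TmHmm_out,
#                                             logger),
#          protein_feature_path.TmHmm_out, logger)
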
def process_type2_data(org_config, path_config, random_name, gal_dir,
                       log_file):
    """
        No Annotation type data
        Only fasta file is provided
        This type of data reference genome name
    """
    logger = logging_utility.logger_function(__name__, log_file)
    sequence_file = org_config.fasta
    reference_genome = org_config.RefOrg
    upload_dir = path_config.upload_dir
    file_name = os.path.join(upload_dir, random_name)
    gff_file_name = file_name + ".gff"
    program_name = org_config.program

    if not program_name:
        program_name = 'augustus'  # default gene prediction program

    if program_name.lower() == 'augustus':
        program = path_config.augustus
        external_program.run_augustus(program, reference_genome,
                                      sequence_file, gff_file_name, logger)
        external_program.fetch_protein(gal_dir, gff_file_name)
    elif program_name.lower() == 'genemark':
        program = path_config.genmark
        model_file_dir = path_config.genmark_model
        external_program.run_genemark(program, model_file_dir,
                                      reference_genome, sequence_file,
                                      file_name, logger)
    else:
        logger.error('Please check the gene prediction program name')
        sys.exit(1)  # non-zero exit: unknown program name is an error

    protein_file_name = os.path.join(path_config.upload_dir,
                                     random_name + ".aa")
    blast_file_name = os.path.join(path_config.upload_dir,
                                   random_name + ".blast")
    blast_program = path_config.blastp
    external_program.run_blast(blast_program, protein_file_name,
                               blast_file_name, gal_dir, logger)

    sequence_dct = bu.read_fasta_to_dictionary(org_config.fasta)
    # gff_dct = gff_parser.read_gff3_augustus(gff_file_name)
    gff_dct = gff_parser.read_gff3_genbank(gff_file_name)
    blast_dct = blastparser.parse_file(blast_file_name)

    model_gff_dct = ppf.create_gal_model_dct(sequence_dct, gff_dct, blast_dct)

    return sequence_dct, model_gff_dct
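
# Usage sketch (hedged): predict genes for a bare-FASTA upload. random_name
# is the run's temporary file prefix; the literal values here are
# illustrative assumptions.
#
#     sequence_dct, model_gff_dct = process_type2_data(
#         org_config, path_config, random_name='a1b2c3',
#         gal_dir=CurrDir, log_file='gal_upload.log')
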
Example #5
def run_sam_alignment(program, organism_hierarchy_dct, file_name_list,
                      uploaded_sam_file_list, path, log_file):
    """
        This function RUNs the LastZ program and parse the SAM file.
    """
    logger = logging_utility.logger_function(__name__, log_file)
    class_level = 'CLASS'  # Default level
    order_level = 'ORDERS'
    family_level = 'FAMILY'
    genus_level = 'GENUS'

    run_sam_list = []
    for file1 in file_name_list:
        for file2 in file_name_list:
            checking_level = class_level
            if class_level in organism_hierarchy_dct[file1].keys() and class_level in \
                    organism_hierarchy_dct[file2].keys():

                if (organism_hierarchy_dct[file1][class_level] is None) or \
                        (organism_hierarchy_dct[file2][class_level] is None):
                    checking_level = order_level

                    if (organism_hierarchy_dct[file1][order_level] is None) or \
                            (organism_hierarchy_dct[file2][order_level] is None):
                        checking_level = family_level

                        if (organism_hierarchy_dct[file1][family_level] is None) or \
                                (organism_hierarchy_dct[file2][family_level] is None):
                            checking_level = genus_level

                if organism_hierarchy_dct[file1][
                        checking_level] == organism_hierarchy_dct[file2][
                            checking_level]:
                    if file1 != file2:
                        output_filename = "{}__{}.out".format(file1, file2)
                        if output_filename not in uploaded_sam_file_list:
                            logger.info(output_filename)
                            sam_file_path = os.path.join(path, output_filename)
                            file1_path = os.path.join(path, file1)
                            file2_path = os.path.join(path, file2)
                            run_sam_list.append(sam_file_path)
                            external_program.run_lastz(program, file1_path,
                                                       file2_path,
                                                       sam_file_path, logger)

    return run_sam_list
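
# Hedged refactor sketch: the cascaded if-blocks above walk down the taxonomy
# ranks until both organisms have a non-None value at that rank. The same
# fallback reads more directly as a loop (the organism_hierarchy_dct layout
# used above is assumed):
def pick_checking_level(hierarchy, file1, file2,
                        levels=('CLASS', 'ORDERS', 'FAMILY', 'GENUS')):
    """Return the first rank present and non-None for both organisms,
    falling back to the last rank."""
    for level in levels:
        if hierarchy[file1].get(level) is not None \
                and hierarchy[file2].get(level) is not None:
            return level
    return levels[-1]
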
def process_central_dogma_data(config, random_string, id_list, present_dir,
                               log_file):
    db_config = config.db_config
    org_config = config.org_config
    path_config = config.path_config

    data_type = check_data_type(org_config, path_config)
    logger = logging_utility.logger_function(__name__, log_file)
    if data_type == 'type1':
        logger.info('Processing GenBank type Data...')
        (sequence_dct,
         feature_dct) = process_type1_data(org_config)  # GenBank Annotation
        process_minimal_annotation_data(db_config, org_config, path_config,
                                        sequence_dct, feature_dct, id_list,
                                        logger)
        db_table.upload_gal_table_data(db_config, path_config.upload_dir,
                                       logger)

    elif data_type == 'type2':  # No Annotation
        logger.info("Processing No Annotation type Data...")
        (sequence_dct, gff_dct) = process_type2_data(org_config, path_config,
                                                     random_string,
                                                     present_dir, log_file)
        process_minimal_annotation_data(db_config, org_config, path_config,
                                        sequence_dct, gff_dct, id_list, logger)
        db_table.upload_gal_table_data(db_config, path_config.upload_dir,
                                       logger)

    elif data_type == "type3":  # Minimal Annotation
        logger.info("Processing Minimal Annotation type Data...")
        (sequence_dct, gff_dct) = process_type3_data(org_config)
        process_minimal_annotation_data(db_config, org_config, path_config,
                                        sequence_dct, gff_dct, id_list, logger)
        db_table.upload_gal_table_data(db_config, path_config.upload_dir,
                                       logger)

    elif data_type == 'type4':  # Partial Annotation
        logger.info("Processing Complete Annotation type Data...")
        (sequence_dct, gff_dct) = process_type4_data(org_config)
        process_minimal_annotation_data(db_config, org_config, path_config,
                                        sequence_dct, gff_dct, id_list, logger)
        db_table.upload_gal_table_data(db_config, path_config.upload_dir,
                                       logger)

    return data_type
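
# Hedged refactor sketch: every branch above ends with the same
# process_minimal_annotation_data / upload_gal_table_data pair, so only the
# type-specific parser needs dispatching. The signatures mirror the calls
# above; the wrapper itself is an illustrative assumption.
def parse_annotation_data(data_type, org_config, path_config, random_string,
                          present_dir, log_file):
    """Return (sequence_dct, feature_dct) for the detected data_type."""
    if data_type == 'type2':  # No Annotation: needs gene prediction inputs
        return process_type2_data(org_config, path_config, random_string,
                                  present_dir, log_file)
    handlers = {
        'type1': process_type1_data,  # GenBank Annotation
        'type3': process_type3_data,  # Minimal Annotation
        'type4': process_type4_data,  # Partial Annotation
    }
    return handlers[data_type](org_config)
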
Example #7
def update_organism_table(db_config, org_name, org_ver, log_file):
    """
    NEED THE BELLOW INFORMATION FROM ONE SPECIES
        1. genus
        2. order
        3. phylum
        4. class
        5. subclass
        6. family
        7. superkingdom
    """
    logger = logging_utility.logger_function(__name__, log_file)
    taxonomy_dct = get_org_hierarchy(db_config, org_name)
    taxonomy_dct = NoneDict(taxonomy_dct)

    strain = ''
    org = re.split(r'\s', org_name)
    org_size = len(org)
    species = " ".join(org[:2])  # genus + specific epithet, if present
    if org_size > 2:
        strain = " ".join(org[2:])

    db_dots = create_db_connection(db_config)

    taxonomy_id = taxonomy_dct['TAXON_ID']
    genus = taxonomy_dct['genus']
    order = taxonomy_dct['order']
    phylum = taxonomy_dct['phylum']
    class_name = taxonomy_dct['class']
    # subclass = taxonomy_dct['subclass']
    family = taxonomy_dct['family']
    super_kingdom = taxonomy_dct['superkingdom']

    query = """
            INSERT INTO Organism(TAXON_ID, SPECIES, STRAIN, PHYLUM, FAMILY, GENUS, ORDERS, CLASS, SUPERKINGDOM, VERSION)
            VALUES (%s, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')
            """ % (taxonomy_id, species, strain, phylum, family, genus, order, class_name, super_kingdom, org_ver)
    db_dots.insert(query)
    logger.info(" Organism Table update complete")
    return taxonomy_dct
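
# Hedged alternative: if the underlying connection follows the DB-API
# (MySQLdb/PyMySQL style), the INSERT above can use driver-side parameter
# binding instead of string interpolation, which handles quoting and avoids
# SQL injection. The `connection` object is an assumption, not the real
# db_dots wrapper.
def insert_organism_row(connection, values):
    """values: 10-tuple in (TAXON_ID, SPECIES, STRAIN, PHYLUM, FAMILY,
    GENUS, ORDERS, CLASS, SUPERKINGDOM, VERSION) order."""
    query = ("INSERT INTO Organism(TAXON_ID, SPECIES, STRAIN, PHYLUM, FAMILY,"
             " GENUS, ORDERS, CLASS, SUPERKINGDOM, VERSION)"
             " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    cursor = connection.cursor()
    cursor.execute(query, values)
    connection.commit()
    cursor.close()
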
def database_schema(db_config, main_path, log_filename):
    logger = logging_utility.logger_function(__name__, log_filename)

    schema = UploadSchema(db_config, main_path)
    schema_existence = schema.check_schema_existence()

    if not schema_existence:
        logger.debug('Uploading Database Schema: Processing')

        schema.upload_shared_resource_schema(logger)
        logger.debug('Shared resource Schema upload complete')

        schema.upload_dots_schema(logger)
        logger.debug('DOTS Schema upload complete')

        schema.add_database_constrain()
        logger.debug('Uploading Database Schema: Complete')
    else:
        logger.debug('Database Schema already exists')

def get_table_status(db_config, log_filename):
    logger = logging_utility.logger_function(__name__, log_filename)

    db_name = db_function.DbNames(db_config.db_prefix)
    db_dots = db_function.Database(db_config.host, db_config.db_username,
                                   db_config.db_password, db_name.dots, 0)

    sql_1 = "SELECT MAX(NA_SEQUENCE_ID) as LAST_ID FROM NASequenceImp"
    sql_2 = "SELECT MAX(NA_FEATURE_ID) as LAST_ID FROM NAFeatureImp"
    sql_3 = "SELECT MAX(NA_LOCATION_ID) as LAST_ID FROM NALocation"
    sql_4 = "SELECT MAX(GENE_INSTANCE_ID) as LAST_ID FROM GeneInstance"
    sql_5 = "SELECT MAX(PROTEIN_ID) as LAST_ID FROM Protein"

    row_na_sequence = get_max_table_value(db_dots, sql_1)
    row_na_feature = get_max_table_value(db_dots, sql_2)
    row_na_location = get_max_table_value(db_dots, sql_3)
    row_gene_instance = get_max_table_value(db_dots, sql_4)
    row_protein = get_max_table_value(db_dots, sql_5)

    print_str = """Getting Max IDs of each table..
        NASequenceImp ID: {}
        NAFeatureImp ID: {}
        NALocation ID: {}
        GeneInstance ID: {}
        Protein ID: {}
        """.format(row_na_sequence, row_na_feature, row_na_location,
                   row_gene_instance, row_protein)

    logger.info(print_str)
    # print("\t\t  NASequenceImp ID is: %d " % row_na_sequence)
    # print("\t\t  NAFeatureimp ID is: %d " % row_na_feature)
    # print("\t\t  NALocation ID is: %d " % row_na_location)
    # print("\t\t  GeneInstance ID is: %d " % row_gene_instance)
    # print("\t\t  Protein ID is: %d " % row_protein)

    row_list = [
        row_na_sequence, row_na_feature, row_na_location, row_gene_instance,
        row_protein
    ]
    return row_list
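
# Hedged sketch of the get_max_table_value helper relied on above: MAX()
# returns NULL on an empty table, so default to 0. The db.query_one accessor
# is an assumed DB-API-style method, not necessarily the real
# db_function.Database interface.
def get_max_table_value(db, sql):
    """Return the MAX id from a single-row query, or 0 for an empty table."""
    row = db.query_one(sql)
    return row[0] if row and row[0] is not None else 0
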
Example #10
def check_organism_existence(db_config, org_name, org_ver, log_file):
    logger = logging_utility.logger_function(__name__, log_file)

    if not org_name:
        logger.error("Error: Organism name is missing \n")
        return True
    else:
        logger.info('Organism: {} version: {}'.format(org_name, org_ver))
        taxonomy_id = get_taxonomy_id(db_config, org_name)
        if taxonomy_id:
            db_dots = create_db_connection(db_config)
            sql_query = "select * from Organism where TAXON_ID = %s and VERSION = %s" % (taxonomy_id, org_ver)
            row_count = db_dots.rowcount(sql_query)
            if row_count == 1:
                logger.info("Error: Organism Name and same version already exists \n")
                return True
            else:
                logger.info("New Organism")
                return False
        else:
            logger.info("Error: Please check the organism name")
            return True
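
# Usage sketch (hedged): the function answers "should this upload be
# skipped?" -- it returns True both when the organism/version pair already
# exists and when the name is missing or unknown. A thin wrapper (an
# illustrative assumption) makes that polarity explicit at call sites:
def is_new_organism(db_config, org_name, org_ver, log_file):
    return not check_organism_existence(db_config, org_name, org_ver,
                                        log_file)
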
Example #11
def create_row_files(db_config, taxonomy_id, org_name, org_version,
                     path_config, log_file):
    logger = logging_utility.logger_function(__name__, log_file)
    org_info = organism_function.OrganismInfo(org_name, taxonomy_id,
                                              org_version)
    upload_path = path_config.blast_path

    nucleotide_dir, protein_dir = directory_utility.create_blast_feature_directory(
        upload_path)

    genomic_seq_file = PurePosixPath(nucleotide_dir, org_info.org_short_name)
    protein_seq_file = PurePosixPath(protein_dir, org_info.org_short_name)

    db_dots = db_function.create_db_dots_connection(db_config)

    create_scaffold_sequence_file(db_dots, taxonomy_id, org_version,
                                  genomic_seq_file)
    create_protein_file(db_dots, taxonomy_id, org_version, protein_seq_file)
    db_creator_program = path_config.db_creator
    external_program.nucleotide_format_db(db_creator_program, genomic_seq_file,
                                          genomic_seq_file, logger)
    external_program.protein_format_db(db_creator_program, protein_seq_file,
                                       protein_seq_file, logger)

Example #12
from __future__ import print_function
import os
import sys
from pathlib import Path, PurePosixPath
from galpy import data_schedule_utility
from galpy import logging_utility, command_argument, gal_function
import main

CurrDir = Path(__file__).parent.absolute()
arg = command_argument.ProcessArguments(CurrDir)

logger = logging_utility.logger_function(__name__, arg.log_file)
logger.info('GAL Upload started \n')

# Check that the configuration files exist and parse them.
config = gal_function.ConfigFileHandler(arg.db_config_file,
                                        arg.path_config_file,
                                        arg.org_config_file, logger)

db_config = config.db_config
path_config = config.path_config
org_config = config.org_config

# Check the database connection, upload the schema, and upload the common data.
main.process_schema_common_data(db_config, arg, CurrDir)

if arg.new_upload:
    status_log = data_schedule_utility.StatusLog(db_config)
    status_log.submit_log(org_config.organism, org_config.version,
                          arg.org_config_file)