示例#1
0
def _random_pham_color():
    """
    Generate a random hex color string for a multi-member pham.
    :return: color formatted as '#rrggbb' (lowercase hex digits)
    """
    hue = saturation = value = 0
    # Redraw each HSV component until it is acceptable: hue must be
    # positive, saturation at least 0.5 and value at least 0.8.  With
    # saturation >= 0.5 the result cannot be the orpham color (white).
    while hue <= 0:
        hue = random.random()
    while saturation < 0.5:
        saturation = random.random()
    while value < 0.8:
        value = random.random()

    red, green, blue = colorsys.hsv_to_rgb(hue, saturation, value)
    return "#{:02x}{:02x}{:02x}".format(int(red * 255), int(green * 255),
                                        int(blue * 255))


def fix_miscolored_phams(engine):
    """
    Repairs pham colors that disagree with the pham's member count.
    Phams with more than one gene that are colored '#FFFFFF' get a new
    random color; phams with exactly one gene (orphams) that are not
    colored '#FFFFFF' are reset to '#FFFFFF'.  Each of the two repairs
    is sent to the database as its own transaction.
    :param engine: the Engine allowing access to the database
    :return: None
    """
    print("Phixing Phalsely Hued Phams...")
    # Phams which are colored as though they are orphams, when really
    # they are multi-member phams
    query = ("SELECT * FROM (SELECT g.PhamID, COUNT(GeneID) AS count, "
             "p.Color FROM gene AS g INNER JOIN pham AS p ON g.PhamID "
             "= p.PhamID GROUP BY PhamID) AS c WHERE Color = '#FFFFFF' "
             "AND count > 1")

    results = mysqldb.query_dict_list(engine, query)

    print(f"Found {len(results)} miscolored phams to fix")

    commands = []
    for dictionary in results:
        pham_id = dictionary["PhamID"]
        new_color = _random_pham_color()
        commands.append(
            f"UPDATE pham SET Color = '{new_color}' WHERE PhamID = '{pham_id}'"
        )

    mysqldb.execute_transaction(engine, commands)

    print("Phixing Phalsely Phlagged Orphams...")
    # Phams which are colored as though they are multi-member phams
    # when really they are orphams
    query = ("SELECT * FROM (SELECT g.PhamID, COUNT(GeneID) AS count, "
             "p.Color FROM gene AS g INNER JOIN pham AS p ON g.PhamID "
             "=p.PhamID GROUP BY PhamID) AS c WHERE Color != '#FFFFFF' "
             "AND count = 1")

    results = mysqldb.query_dict_list(engine, query)
    print(f"Found {len(results)} miscolored orphams to fix...")

    new_color = "#FFFFFF"
    commands = []
    for dictionary in results:
        pham_id = dictionary["PhamID"]
        commands.append(
            f"UPDATE pham SET Color = '{new_color}' WHERE PhamID = '{pham_id}'"
        )

    mysqldb.execute_transaction(engine, commands)
示例#2
0
    def test_6_map_translations_to_geneids(self):
        """map_translations_to_geneids returns a dict with one key per
        distinct translation in the gene table"""
        mapping = map_translations_to_geneids(self.engine)

        distinct_rows = mysqldb.query_dict_list(
            self.engine, "SELECT distinct(Translation) FROM gene")

        # The returned container must be exactly a dict
        with self.subTest():
            self.assertEqual(type(mapping), dict)
        # Key count must match the number of distinct translations
        with self.subTest():
            self.assertEqual(len(mapping), len(distinct_rows))
示例#3
0
def get_new_geneids(engine):
    """
    Queries the database for those genes that are not yet phamerated,
    i.e. rows of the gene table whose PhamID is NULL.
    :param engine: the Engine allowing access to the database
    :return: new_geneids, the set of GeneIDs lacking a PhamID
    """
    gene_query = "SELECT GeneID FROM gene WHERE PhamID IS NULL"
    gene_results = mysqldb.query_dict_list(engine, gene_query)

    # Build the set in a single pass; unioning a one-element set onto the
    # accumulator would copy the whole set on every iteration.
    new_geneids = {dictionary["GeneID"] for dictionary in gene_results}

    print(f"Found {len(new_geneids)} genes not in phams...")

    return new_geneids
示例#4
0
def map_geneids_to_translations(engine):
    """
    Builds a GeneID -> Translation lookup from every row of the gene
    table.
    :param engine: the Engine allowing access to the database
    :return: gs_to_ts
    """
    rows = mysqldb.query_dict_list(
        engine, "SELECT GeneID, Translation FROM gene")

    # One entry per GeneID; a repeated GeneID keeps the last row seen
    gs_to_ts = {row["GeneID"]: row["Translation"] for row in rows}

    print(f"Found {len(rows)} genes in the database...")

    return gs_to_ts
示例#5
0
def get_pham_colors(engine):
    """
    Builds a PhamID -> Color lookup from every row of the pham table.
    :param engine: the Engine allowing access to the database
    :return: pham_colors
    """
    rows = mysqldb.query_dict_list(engine, "SELECT PhamID, Color FROM pham")

    print(f"Found colors for {len(rows)} phams...")

    # One entry per PhamID; a repeated PhamID keeps the last row seen
    pham_colors = {row["PhamID"]: row["Color"] for row in rows}

    return pham_colors
示例#6
0
def map_translations_to_geneids(engine):
    """
    Groups the GeneIDs of the gene table by their translation, giving a
    dictionary of translation -> list of GeneIDs sharing it.
    :param engine: the Engine allowing access to the database
    :return: ts_to_gs
    """
    rows = mysqldb.query_dict_list(
        engine, "SELECT GeneID, Translation FROM gene")

    ts_to_gs = {}
    for row in rows:
        gene_id, translation = row["GeneID"], row["Translation"]
        # Start a new group the first time a translation is seen
        ts_to_gs.setdefault(translation, []).append(gene_id)

    print(f"Found {len(ts_to_gs)} unique translations in the database...")

    return ts_to_gs
示例#7
0
def get_pham_geneids(engine):
    """
    Queries the database for those genes that are already phamerated,
    i.e. rows of the gene table whose PhamID is not NULL, and groups
    their GeneIDs by pham.
    :param engine: the Engine allowing access to the database
    :return: pham_geneids, a dictionary of PhamID -> set of GeneIDs
    """
    pham_geneids = dict()

    geneid_query = "SELECT GeneID, PhamID FROM gene WHERE PhamID IS NOT NULL"
    geneid_results = mysqldb.query_dict_list(engine, geneid_query)

    print(f"Found {len(geneid_results)} genes in phams...")

    for dictionary in geneid_results:
        pham_id = dictionary["PhamID"]
        geneid = dictionary["GeneID"]

        # Add to the pham's set in place; building a new union for each
        # gene would copy the pham's whole set every time.
        pham_geneids.setdefault(pham_id, set()).add(geneid)

    return pham_geneids
示例#8
0
def main(argument_list):
    """
    Runs the find_domains pipeline: parses the arguments, sets up the
    results directory and log file, connects to the database, searches
    the translations of the genes returned by GET_GENES_FOR_CDD for
    conserved domains with rpsblast, and inserts the results into the
    database.

    Exits with status 1 if the results directory cannot be created, and
    returns early if the CDD database name cannot be learned from the
    CDD directory.
    :param argument_list: command line arguments (strings) handed to the
        argument parser
    :return: None
    """
    # Setup argument parser
    cdd_parser = setup_argparser()

    # Use argument parser to parse argument_list
    args = cdd_parser.parse_args(argument_list)

    # Store arguments in more easily accessible variables
    database = args.db
    cdd_dir = expand_path(args.dir)
    cdd_name = learn_cdd_name(cdd_dir)
    threads = args.threads
    evalue = args.evalue
    rpsblast = args.rpsblast
    tmp_dir = args.tmp_dir
    output_folder = args.output_folder
    log_file = args.log_file
    reset = args.reset

    # Set up directory.
    output_folder = basic.set_path(output_folder, kind="dir", expect=True)
    results_folder = pathlib.Path(RESULTS_FOLDER)
    results_path = basic.make_new_dir(output_folder,
                                      results_folder,
                                      attempt=10)
    if results_path is None:
        print("Unable to create output_folder.")
        sys.exit(1)

    # The log file lives inside the freshly created results directory.
    log_file = pathlib.Path(results_path, log_file)

    # Set up root logger.
    logging.basicConfig(
        filename=log_file,
        filemode="w",
        level=logging.DEBUG,
        format="pdm_utils find_domains: %(levelname)s: %(message)s")
    logger.info(f"pdm_utils version: {VERSION}")
    logger.info(f"CDD run date: {constants.CURRENT_DATE}")
    logger.info(f"Command line arguments: {' '.join(argument_list)}")
    logger.info(f"Results directory: {results_path}")

    # Early exit if we were unable to learn the CDD database name
    if cdd_name == "":
        msg = (f"Unable to learn CDD database name. Make sure the files in "
               f"{cdd_dir} all have the same basename.")
        logger.error(msg)
        print(msg)
        return

    # Get the rpsblast command and path if none was given.
    if rpsblast == "":
        command = get_rpsblast_command()
        rpsblast = get_rpsblast_path(command)

    # Verify database connection and schema compatibility.
    engine = mysqldb.connect_to_db(database)
    try:
        logger.info(f"Connected to database: {database}.")
        mysqldb.check_schema_compatibility(engine,
                                           "the find_domains pipeline")
        logger.info("Schema version is compatible.")
        logger.info("Command line arguments verified.")

        if reset:
            logger.info("Clearing all domain data currently in the database.")
            clear_domain_data(engine)

        # Get gene data that needs to be processed
        # in dict format where key = column name, value = stored value.
        cdd_genes = mysqldb.query_dict_list(engine, GET_GENES_FOR_CDD)
        msg = f"{len(cdd_genes)} genes to search for conserved domains..."
        logger.info(msg)
        print(msg)

        # Only run the pipeline if there are genes returned that need it
        if cdd_genes:
            log_gene_ids(cdd_genes)

            # Create temp_dir
            make_tempdir(tmp_dir)

            # TODO dev
            # translations = get_unique_translations(cdd_genes)

            # TODO dev
            # translation_id = 0
            # for translation in translations:
            #     translation_id += 1
            #     jobs.append((rpsblast, cdd_name, tmp_dir, evalue,
            #                  translation_id, translation))

            # Build jobs list: one rpsblast search per gene
            jobs = [(rpsblast, cdd_name, tmp_dir, evalue,
                     cdd_gene["GeneID"], cdd_gene["Translation"])
                    for cdd_gene in cdd_genes]

            results = parallelize(jobs, threads, search_and_process)
            print("\n")

            # TODO dev
            # results_dict = create_results_dict(results)
            # map_results_to_genes(cdd_genes, results_dict)

            insert_domain_data(engine, results)
    finally:
        # Release the engine on every path, including when there were
        # no genes to process or a pipeline step raised.
        engine.dispose()
    return