def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up parser and logger, and coordinate retrieval of UniProt data."""
    time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")  # used in naming files
    start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)

    # Program preparation
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # parse the configuration data (cache the uniprot data as .csv files)
    connection, logger_name, cache_dir = connect_existing_db(args, time_stamp, start_time)

    # build cache directory
    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        cache_dir = cache_dir / "uniprot_data_retrieval"
        make_output_directory(cache_dir, args.force, args.nodelete_cache)

    (
        config_dict,
        class_filters,
        family_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        ec_filters,
    ) = get_expansion_configuration(args)

    # add log to the local CAZyme database
    logger.info("Adding log of scrape to the local CAZyme database")

    retrieved_annotations = "UniProt accessions, Protein names"
    if args.ec:
        retrieved_annotations += ", EC numbers"
    if args.pdb:
        retrieved_annotations += ", PDB accessions"
    if args.sequence:
        retrieved_annotations += ", Protein sequence"
    if args.seq_update:
        retrieved_annotations += ", Updated UniProt protein sequences"

    with sql_orm.Session(bind=connection) as session:
        sql_interface.log_scrape_in_db(
            time_stamp,
            config_dict,
            taxonomy_filter_dict,
            kingdom_filters,
            ec_filters,
            'UniProt',
            retrieved_annotations,
            session,
            args,
        )

    # retrieve dict of genbank accessions and genbank db ids from the local CAZyme db
    if args.genbank_accessions is not None:
        logger.warning(f"Getting GenBank accessions from file: {args.genbank_accessions}")
        with open(args.genbank_accessions, "r") as fh:
            lines = fh.read().splitlines()
        accessions = [line.strip() for line in lines]
        accessions = set(accessions)
        gbk_dict = get_selected_gbks.get_ids(accessions, connection)
    else:
        gbk_dict = get_selected_gbks.get_genbank_accessions(
            class_filters,
            family_filters,
            taxonomy_filter_dict,
            kingdom_filters,
            ec_filters,
            connection,
        )

    logger.warning(f"Retrieving UniProt data for {len(gbk_dict)} proteins")

    # if using a cache, skip accession retrieval
    if args.use_uniprot_cache is not None:
        logger.warning(f"Using UniProt data from cache: {args.use_uniprot_cache}")
        with open(args.use_uniprot_cache, "r") as fh:
            uniprot_dict = json.load(fh)
        if args.ec:
            all_ecs = get_ecs_from_cache(uniprot_dict)
        else:
            all_ecs = set()
    else:
        # Get the UniProt accessions/IDs for the corresponding GenBank accessions
        if args.skip_uniprot_accessions is not None:
            logger.warning(f"Using UniProt accessions from cache: {args.skip_uniprot_accessions}")
            with open(args.skip_uniprot_accessions, "r") as fh:
                uniprot_gkb_dict = json.load(fh)
        else:
            uniprot_gkb_dict = get_uniprot_accessions(gbk_dict, args)
            # {uniprot_acc: {'gbk_acc': str, 'db_id': int}}
            uniprot_acc_cache = cache_dir / f"uniprot_accessions_{time_stamp}.json"
            with open(uniprot_acc_cache, "w") as fh:
                json.dump(uniprot_gkb_dict, fh)

        # get data from UniProt
        uniprot_dict, all_ecs = get_uniprot_data(uniprot_gkb_dict, cache_dir, args)

        # convert sets to lists for JSON serialisation
        for uniprot_accession in uniprot_dict:
            try:
                uniprot_dict[uniprot_accession]['ec'] = list(uniprot_dict[uniprot_accession]['ec'])
            except KeyError:
                pass
            try:
                uniprot_dict[uniprot_accession]['pdb'] = list(uniprot_dict[uniprot_accession]['pdb'])
            except KeyError:
                pass

        uniprot_acc_cache = cache_dir / f"uniprot_data_{time_stamp}.json"
        with open(uniprot_acc_cache, "w") as fh:
            json.dump(uniprot_dict, fh)

    # add uniprot accessions (and sequences if seq retrieval is enabled)
    logger.warning("Adding data to the local CAZyme database")
    add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args)

    # add ec numbers
    if (args.ec) and (len(all_ecs) != 0):
        add_ec_numbers(uniprot_dict, all_ecs, gbk_dict, connection, args)

    # add pdb accessions
    if args.pdb:
        add_pdb_accessions(uniprot_dict, gbk_dict, connection, args)

    closing_message("get_uniprot_data", start_time, args)
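# Hedged aside (not part of the original module): the set-to-list conversion above can
# also be handled by json.dump itself via its `default` hook, which is called for any
# object the encoder cannot serialise; passing `default=list` converts sets on the fly.
# Minimal standalone sketch with illustrative data:
import json

example = {"P12345": {"ec": {"3.2.1.4"}, "pdb": {"1ABC"}}}
print(json.dumps(example, default=list))  # sets emitted as JSON arrays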
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up parser and logger, and coordinate overall scraping of CAZy."""
    cazy_home_url = "http://www.cazy.org"

    time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")  # used in naming files
    start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)

    # Program preparation
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # check if printing out version or citation information
    if args.version:
        print(VERSION_INFO)
        return

    if args.citation:
        print(CITATION_INFO)
        return

    # check correct output was provided, exit if not operable
    if args.database is not None and args.db_output is not None:
        warning_message = (
            "Target path for a NEW database (--db_output, -d) and\n"
            "a path to an EXISTING database (--database, -D) were provided.\n"
            "Please provide one OR the other.\n"
            "Terminating program."
        )
        logger.warning(termcolour(warning_message, "red"))
        closing_message("cazy_webscraper", start_time, args)
        return

    if args.db_output is not None and args.db_output.exists():
        if args.force:
            logger.warning(
                f"Local db {args.db_output} already exists\n"
                "Force is True\n"
                "Overwriting existing database."
            )
            os.remove(args.db_output)
        else:
            logger.warning(
                f"Local db {args.db_output} already exists\n"
                "Force is False\n"
                "Not overwriting existing database\n"
                "Terminating program"
            )
            closing_message("cazy_webscraper", start_time, args)
            return

    Entrez.email = args.email

    logger.info("Parsing configuration")
    (
        excluded_classes,
        config_dict,
        cazy_class_synonym_dict,
        class_filters,
        fam_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        taxonomy_filter_set,
    ) = parse_configuration.parse_configuration(args)

    scrape_config_message = (
        "Configuration:\n"
        f"Classes to scrape: {config_dict['classes']}\n"
        f"GH fams to scrape: {config_dict['Glycoside Hydrolases (GHs)']}\n"
        f"GT fams to scrape: {config_dict['GlycosylTransferases (GTs)']}\n"
        f"PL fams to scrape: {config_dict['Polysaccharide Lyases (PLs)']}\n"
        f"CE fams to scrape: {config_dict['Carbohydrate Esterases (CEs)']}\n"
        f"AA fams to scrape: {config_dict['Auxiliary Activities (AAs)']}\n"
        f"CBM fams to scrape: {config_dict['Carbohydrate-Binding Modules (CBMs)']}\n"
        f"Scraping subfamilies: {args.subfamilies}"
    )

    if len(taxonomy_filter_set) != 0:
        scrape_config_message += "\nTaxonomy filters applied."

    if len(kingdom_filters) < 5:
        scrape_config_message += f"\nScraping only tax kingdoms: {kingdom_filters}"

    logger.info(termcolour(scrape_config_message, "cyan"))

    if args.database is not None:  # adding data to an EXISTING database
        connection, logger_name, cache_dir = connect_existing_db(args, time_stamp, start_time)
    else:  # build a new database
        connection, logger_name, cache_dir = connect_to_new_db(args, time_stamp, start_time)

    logger.info("Adding log of scrape to the local CAZyme database")
    with sql_orm.Session(bind=connection) as session:
        sql_interface.log_scrape_in_db(
            time_stamp,
            config_dict,
            kingdom_filters,
            taxonomy_filter_dict,
            set(),  # ec_filters not applied when scraping CAZy
            'CAZy',
            'CAZy annotations',
            session,
            args,
        )

    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    logger.warning(f"Created cache dir: {cache_dir}")

    if args.log is not None:  # write additional log files to user specified dir
        logger_name = args.log.name
        if logger_name.endswith(".log"):
            logger_name = logger_name[:-4]
        make_output_directory(args.log, args.force, args.nodelete_log)
    else:
        # write the additional log files to the .cazy_webscraper cache dir
        logger_name = "log"

    logger.info("Starting retrieval of data from CAZy")

    if args.cazy_data is not None:
        logger.warning(
            f"Retrieving CAZy data from predownloaded CAZy db dump at:\n{args.cazy_data}"
        )

    get_cazy_data(
        cazy_home_url,
        excluded_classes,
        cazy_class_synonym_dict,
        config_dict,
        class_filters,
        fam_filters,
        kingdom_filters,
        taxonomy_filter_set,
        connection,
        cache_dir,
        logger_name,
        time_stamp,
        args,
    )

    closing_message("cazy_webscraper", start_time, args)
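# Hedged aside (illustrative only, not part of the original module): building
# scrape_config_message above indexes config_dict directly, which raises KeyError if a
# CAZy class key is absent from the parsed configuration; dict.get() returns None
# instead, keeping the summary printable. Standalone sketch with an illustrative dict:
example_config = {"classes": ["GH"], "Glycoside Hydrolases (GHs)": {"GH5"}}
print(example_config.get("Polysaccharide Lyases (PLs)"))  # None rather than KeyError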
def get_validation_data(
    cazy_home_url,
    excluded_classes,
    cazy_synonym_dict,
    config_dict,
    cache_dir,
    connection_failures_logger,
    time_stamp,
    args,
):
    """Coordinate retrieving the population sizes of CAZy families from the CAZy website.

    :param cazy_home_url: str, URL to CAZy home page
    :param excluded_classes: list of CAZy classes NOT to scrape
    :param cazy_synonym_dict: dict of accepted CAZy class name synonyms
    :param config_dict: dict keyed by CAZy classes, valued by set of CAZy families to scrape
    :param cache_dir: path to cache dir
    :param connection_failures_logger: logger, logs incorrect URLs and URLs to which a
        connection could not be made
    :param time_stamp: str, time cazy_webscraper was invoked
    :param args: cmd-line args parser

    Return dict, keyed by CAZy family (str) and valued by population size (int)
    """
    # make dir for caching HTML files
    cache_dir = cache_dir / "html"
    file_io.make_output_directory(cache_dir, args.force, args.nodelete_cache)

    cazy_fam_populations = {}  # {fam(str): population(int)}

    # retrieve list of CAZy class instances, one instance per class to be scraped
    cazy_classes = get_cazy_classes(
        cazy_home_url,
        excluded_classes,
        cazy_synonym_dict,
        cache_dir,
        time_stamp,
        args,
    )
    if cazy_classes is None:
        return

    for cazy_class in tqdm(cazy_classes, desc="Retrieving CAZy family population sizes"):

        # first attempt of scraping, retrieve URLs to CAZy families
        if len(list(cazy_class.failed_families.keys())) == 0:
            fam_pops_to_retrieve = config_dict[cazy_class.name]  # retrieve user specified fams
        else:
            fam_pops_to_retrieve = list(cazy_class.failed_families.keys())  # retry failed connections

        family_populations, err_message, incorrect_urls, failed_families = get_cazy_family_pops(
            cazy_class.name,
            cazy_class.url,
            cazy_home_url,
            fam_pops_to_retrieve,
            cache_dir,
            time_stamp,
            args,
        )

        if incorrect_urls is not None:  # log families for which the compiled URL is incorrect
            [connection_failures_logger.warning(url_message) for url_message in incorrect_urls]

        if family_populations is None:  # couldn't retrieve family populations
            cazy_class.tries += 1

            # check if the maximum number of attempts to connect has been met
            if cazy_class.tries == (args.retries + 1):  # maximum number of tries met
                connection_failures_logger.warning(
                    f"{cazy_class.url}\t"
                    f"{cazy_class.name}\t"
                    f"CAZy family populations not retrieved from {cazy_class.name}\t"
                    f"{err_message}"
                )
            else:
                for fam in failed_families:
                    try:
                        cazy_class.failed_families[fam] += 1
                        if cazy_class.failed_families[fam] == (args.retries + 1):
                            # max number of attempts made, do not retry connection
                            del cazy_class.failed_families[fam]
                    except KeyError:
                        cazy_class.failed_families[fam] = 1

                cazy_classes.append(cazy_class)  # retry retrieving family populations later

            continue  # go onto next CAZy class

        else:  # retrieved CAZy family populations
            cazy_fam_populations.update(family_populations)

    # log any errors that meant no family population could be retrieved
    for cazy_class in cazy_classes:
        for fam in list(cazy_class.failed_families.keys()):
            connection_failures_logger.warning(
                f"{fam}\t"
                "Retrieved no family population for data retrieval validation\n"
                f"Failed to connect to CAZy after {(args.retries + 1) * (args.retries + 1)} attempts"
            )

    return cazy_fam_populations
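# Hedged note on the retry mechanism above: appending to the underlying list while
# iterating over it extends the iteration, so a CAZy class re-appended to cazy_classes
# is visited again on a later pass. A minimal standalone illustration:
visited = []
queue = ["first"]
for item in queue:
    visited.append(item)
    if item == "first":
        queue.append("retry")  # picked up by the same for-loop
print(visited)  # ['first', 'retry']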
def connect_to_new_db(args, time_stamp, start_time):
    """Build and connect to a new local CAZyme database.

    :param args: cmd-line args parser
    :param time_stamp: str, time cazy_webscraper was invoked
    :param start_time: pd date-time obj, time cazy_webscraper was invoked

    Return connection to the database, name of the logger, and path to the cache dir
    """
    logger = logging.getLogger(__name__)

    if args.db_output is not None:  # user defined target output for the NEW database

        if os.path.isfile(args.db_output):  # target file exists
            if args.force:
                logger.warning(
                    "Overwriting existing local CAZyme database at:\n"
                    f"{args.db_output}"
                )
            else:
                logger.warning(
                    "Target path for new database already exists.\n"
                    "Either enable forced overwriting (-f) or add to the existing database (-D).\n"
                    "Terminating program."
                )
                closing_message("cazy_webscraper", start_time, args)
                sys.exit(1)

        else:  # may need to build dirs
            logger.info(
                "Building new local CAZyme database\n"
                f"Output directory: {(args.db_output).parent}\n"
                f"Force overwriting existing output file: {args.force}"
            )
            if str((args.db_output).parent) != '.':  # dirs defined in output path
                output_dir = (args.db_output).parent
                make_output_directory(output_dir, args.force, args.nodelete)
                cache_dir = Path(f"{str(output_dir)}/.cazy_webscraper_{time_stamp}")
            else:  # writing to cwd
                cache_dir = Path(f".cazy_webscraper_{time_stamp}")

        logger_name = str(args.db_output).split('.')[0]
        db_path = args.db_output

    else:
        logger.info("Using default database name and writing to cwd")
        db_path = Path(f"cazy_webscraper_{time_stamp}.db")
        cache_dir = Path(f".cazy_webscraper_{time_stamp}")
        logger_name = f'cazy_webscraper_{time_stamp}'

    try:
        connection = sql_orm.get_db_connection(db_path, args, new=True)
        logger.warning(f"Built new local CAZyme database at\n{db_path}")
    except Exception:
        logger.error(
            "Failed to build new SQL database.\n"
            "Terminating program",
            exc_info=True,
        )
        closing_message("cazy_webscraper", start_time, args)
        sys.exit(1)

    return connection, logger_name, cache_dir
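# Hedged aside (assumes args.db_output is a pathlib.Path): pathlib's Path.stem returns
# the file name without its final suffix, which is a more robust way to derive a logger
# name than str(path).split('.')[0], since the latter truncates at the first dot
# anywhere in the path. Standalone illustration:
from pathlib import Path

example = Path("output.dir/cazy_webscraper_2024.db")
print(example.stem)                # cazy_webscraper_2024
print(str(example).split('.')[0])  # output  (truncated at the directory's dot)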
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up parser and logger, and coordinate calculating the local db's GenBank coverage."""
    time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")  # used in naming files
    start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)

    # Program preparation
    if argv is None:
        parser = genbank_cov_parser.build_parser()
        args = parser.parse_args()
    else:
        parser = genbank_cov_parser.build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    Entrez.email = args.email

    # check if need to build output dir
    if os.getcwd() != args.output_dir:
        make_output_directory(args.output_dir, args.force, args.nodelete)

    # connect to the local CAZyme database
    connection, logger_name, cache_dir = cazy_scraper.connect_existing_db(args, time_stamp)

    # make cache_dir
    make_output_directory(cache_dir, args.force_cache, args.nodelete_cache)

    no_accession_logger = cache_dir / f"no_genomic_accession_retrieved_{time_stamp}.log"

    # load GenBank and Kingdom records from the db
    logger.warning("Retrieving Genbanks, Taxs and Kingdoms records from the local CAZyme db")

    genbank_kingdom_dict = get_table_dicts.get_gbk_kingdom_dict(connection)
    # {kingdom: {genus: {species: {protein_accessions}}}}

    logger.warning("Retrieved Genbanks, Taxs and Kingdoms records from the local CAZyme db")

    nucleotide_accessions_dict = get_nucleotide_accessions(
        genbank_kingdom_dict,
        no_accession_logger,
        args,
    )

    output_path = cache_dir / f"nucleotide_accessions_{time_stamp}.json"
    with open(output_path, 'w') as fh:
        json.dump(nucleotide_accessions_dict, fh)

    genomic_accession_dict = get_genomic_accessions(nucleotide_accessions_dict, no_accession_logger, args)

    output_path = cache_dir / f"genomic_accession_numbers_{time_stamp}.json"
    with open(output_path, 'w') as fh:
        json.dump(genomic_accession_dict, fh)

    write_out_genomic_accessions(genomic_accession_dict, time_stamp, args)

    ncbi_genomes_totals = get_ncbi_counts(args)

    write_out_genome_coverage(ncbi_genomes_totals, genomic_accession_dict, time_stamp, args)

    end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    end_time = pd.to_datetime(end_time)
    total_time = end_time - start_time

    if args.verbose:
        logger.info(
            "Finished calculating the local CAZyme db's coverage of GenBank\n"
            f"Scrape initiated at {start_time}\n"
            f"Scrape finished at {end_time}\n"
            f"Total run time: {total_time}\n"
            f"Version: {cazy_scraper.VERSION_INFO}\n"
            f"Citation: {cazy_scraper.CITATION_INFO}"
        )
    else:
        print(
            "=====================cazy_webscraper=====================\n"
            "Finished calculating the local CAZyme db's coverage of GenBank\n"
            f"Scrape initiated at {start_time}\n"
            f"Scrape finished at {end_time}\n"
            f"Total run time: {total_time}\n"
            f"Version: {cazy_scraper.VERSION_INFO}\n"
            f"Citation: {cazy_scraper.CITATION_INFO}"
        )
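# Minimal standalone illustration (assumes pandas is installed) of the run-time
# calculation above: subtracting two pandas Timestamps yields a Timedelta, which is
# what the closing report interpolates as the total run time.
import pandas as pd

start = pd.to_datetime("2024-01-01 10:00:00")
end = pd.to_datetime("2024-01-01 10:05:30")
print(end - start)  # 0 days 00:05:30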
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up parser and logger, and coordinate querying of the local CAZyme database."""
    time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")  # used in naming files
    start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)

    # Program preparation
    if argv is None:
        parser = api_parser.build_parser()
        args = parser.parse_args()
    else:
        parser = api_parser.build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    connection, logger_name, cache_dir = cazy_scraper.connect_existing_db(args, time_stamp, start_time)

    if args.output_dir is not None:
        file_io.make_output_directory(args.output_dir, args.force, args.nodelete)

    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        file_io.make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        cache_dir = cache_dir / "uniprot_data_retrieval"
        file_io.make_output_directory(cache_dir, args.force, args.nodelete_cache)

    (
        config_dict,
        class_filters,
        family_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        ec_filters,
    ) = get_expansion_configuration(args)

    output_path = compile_output_name(args)

    existing_files = ""

    if 'json' in args.file_types:
        json_output_path = output_path + ".json"
        logger.warning(f"JSON output path: {json_output_path}")
        if Path(json_output_path).exists():
            existing_files = existing_files + " " + f"{json_output_path}\n"

    if 'csv' in args.file_types:
        csv_output_path = output_path + ".csv"
        logger.warning(f"CSV output path: {csv_output_path}")
        if Path(csv_output_path).exists():
            existing_files = existing_files + " " + f"{csv_output_path}\n"

    existing_files = existing_files.strip()

    if len(existing_files) != 0:
        if args.overwrite:
            logger.warning(
                "The output files\n"
                f"{existing_files}\n"
                "exist already. Overwrite is True. Overwriting output files"
            )
        else:
            logger.warning(
                "The output files\n"
                f"{existing_files}\n"
                "exist already. Overwrite is False\n"
                "To overwrite the files use the --overwrite flag, or "
                "change the output file prefix using the --prefix flag\n"
                "Terminating program"
            )
            closing_message("cw_query_database", start_time, args)
            sys.exit(1)

    # get the records of GenBank accessions matching the criteria of interest
    # {gbk_acc: gbk_id}
    gbk_dict = get_selected_gbks.get_genbank_accessions(
        class_filters,
        family_filters,
        taxonomy_filter_dict,
        kingdom_filters,
        ec_filters,
        connection,
    )

    query_data = get_query_data(gbk_dict, connection, args)

    logger.warning(f"Retrieved {len(list(query_data.keys()))} proteins from the local db")

    if 'json' in args.file_types:
        write_json_output(json_output_path, query_data, args)

    if 'csv' in args.file_types:
        write_csv_output(query_data, args, csv_output_path)

    closing_message("cw_query_database", start_time, args)
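# Hedged sketch (hypothetical helper, not part of the original module) of the
# existing-output check above, using pathlib to make the suffix handling and existence
# tests explicit for the supported file types.
from pathlib import Path


def existing_outputs(prefix, file_types):
    """Return output paths that already exist for the requested file types."""
    found = []
    for ext in ("json", "csv"):
        path = Path(f"{prefix}.{ext}")
        if ext in file_types and path.exists():
            found.append(path)
    return found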
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up programme and initiate run."""
    time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)

    # parse cmd-line arguments
    if argv is None:
        parser = pdb_strctre_parser.build_parser()
        args = parser.parse_args()
    else:
        args = pdb_strctre_parser.build_parser(argv).parse_args()

    if logger is None:
        logger = logging.getLogger(__package__)
        config_logger(args)

    connection, logger_name, cache_dir = connect_existing_db(args, time_stamp, start_time)

    (
        config_dict,
        class_filters,
        family_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        ec_filters,
    ) = parse_configuration.get_expansion_configuration(args)

    gbk_dict = {}  # {gbk_acc: gbk_id}

    gbk_table_dict = get_table_dicts.get_gbk_table_dict(connection)
    # {genbank_accession: {'taxa_id': str, 'gbk_id': int}}

    if args.genbank_accessions is not None:
        logger.warning(
            f"Retrieving PDB structures for GenBank accessions listed in {args.genbank_accessions}"
        )
        gbk_dict.update(get_user_genbank_sequences(gbk_table_dict, args))

    if args.uniprot_accessions is not None:
        logger.warning(
            f"Retrieving PDB structures for UniProt accessions listed in {args.uniprot_accessions}"
        )
        uniprot_table_dict = get_table_dicts.get_uniprot_table_dict(connection)
        gbk_dict.update(get_user_uniprot_sequences(gbk_table_dict, uniprot_table_dict, args))

    pdb_accessions = get_selected_pdbs.get_pdb_accessions(
        class_filters,
        family_filters,
        taxonomy_filter_dict,
        kingdom_filters,
        ec_filters,
        gbk_table_dict,
        connection,
    )

    if len(pdb_accessions) == 0:
        logger.warning(
            "No PDB accessions matched the criteria provided.\n"
            "Retrieving no protein structure files from PDB"
        )
    else:
        logger.warning(f"Retrieving {len(pdb_accessions)} structure files from PDB")

    # make output and cache dirs
    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        cache_dir = cache_dir / "pdb_retrieval"
        make_output_directory(cache_dir, args.force, args.nodelete_cache)

    download_pdb_structures(pdb_accessions, args)

    cache_path = cache_dir / f"pdb_retrieval_{time_stamp}.txt"
    with open(cache_path, 'a') as fh:
        for acc in pdb_accessions:
            fh.write(f"{acc}\n")

    closing_message("Get PDB structure files", start_time, args)
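# Minimal standalone illustration (hypothetical accessions) of the dict.update pattern
# used above to merge GenBank record IDs gathered from user-supplied accession lists:
# later updates overwrite earlier entries that share a key.
gbk_ids = {}
gbk_ids.update({"WP_000000001.1": 1})
gbk_ids.update({"WP_000000001.1": 7, "WP_000000002.1": 2})
print(gbk_ids)  # {'WP_000000001.1': 7, 'WP_000000002.1': 2}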
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up programme and initiate run."""
    time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")  # used in naming files
    start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)
    date_today = datetime.now().strftime("%Y-%m-%d")  # used as seq_update_date in the db

    # parse cmd-line arguments
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        args = build_parser(argv).parse_args()

    if logger is None:
        logger = logging.getLogger(__package__)
        config_logger(args)

    logger.info("Providing user email address to NCBI.Entrez")
    Entrez.email = args.email

    if args.seq_update:
        logger.warning("Enabled updating sequences")

    connection, logger_name, cache_dir = connect_existing_db(args, time_stamp, start_time)

    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        cache_dir = cache_dir / "genbank_data_retrieval"
        make_output_directory(cache_dir, args.force, args.nodelete_cache)

    (
        config_dict,
        class_filters,
        family_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        ec_filters,
    ) = get_expansion_configuration(args)

    # add log to the local CAZyme database
    logger.info("Adding log of scrape to the local CAZyme database")
    with sql_orm.Session(bind=connection) as session:
        retrieved_data = "GenBank protein sequences"
        sql_interface.log_scrape_in_db(
            time_stamp,
            config_dict,
            kingdom_filters,
            taxonomy_filter_dict,
            ec_filters,
            'GenBank',
            retrieved_data,
            session,
            args,
        )

    # retrieve dict of genbank accessions and genbank db ids from the local CAZyme db
    if args.genbank_accessions is not None:
        logger.warning(f"Getting GenBank accessions from file: {args.genbank_accessions}")
        with open(args.genbank_accessions, "r") as fh:
            lines = fh.read().splitlines()
        accessions = [line.strip() for line in lines]
        accessions = set(accessions)
        gbk_dict = get_selected_gbks.get_ids(accessions, connection)
    else:
        gbk_dict = get_selected_gbks.get_genbank_accessions(
            class_filters,
            family_filters,
            taxonomy_filter_dict,
            kingdom_filters,
            ec_filters,
            connection,
        )

    genbank_accessions = list(gbk_dict.keys())
    logger.warning(f"Retrieving GenBank sequences for {len(gbk_dict)} proteins")

    if args.seq_dict:
        logger.warning(f"Getting sequences from cache: {args.seq_dict}")
        with open(args.seq_dict, "r") as fh:
            cache_dict = json.load(fh)

        # convert cached strings back into Seq objects
        seq_dict = {}
        for key in cache_dict:
            seq_dict[key] = Seq(cache_dict[key])

        no_seq = []  # all sequences were loaded from the cache, so none are missing

    else:
        seq_dict, no_seq = get_sequences(genbank_accessions, args)  # {gbk_accession: seq}

        # only cache the sequence. Seq obj is not JSON serializable
        cache_dict = {}
        for key in seq_dict:
            cache_dict[key] = str(seq_dict[key])

        # cache the retrieved sequences
        cache_path = cache_dir / f"genbank_seqs_{time_stamp}.json"
        with open(cache_path, "w") as fh:
            json.dump(cache_dict, fh)

    if len(no_seq) != 0:
        no_seq_cache = cache_dir / f"no_seq_retrieved_{time_stamp}.txt"
        logger.warning(
            f"No protein sequence retrieved for {len(no_seq)}\n"
            f"The GenBank accessions for these proteins have been written to: {no_seq_cache}"
        )
        with open(no_seq_cache, "a") as fh:
            for acc in no_seq:
                fh.write(f"{acc}\n")

    logger.warning(f"Adding {len(list(seq_dict.keys()))} protein seqs to the db")

    add_genbank_data.add_gbk_seqs_to_db(seq_dict, date_today, gbk_dict, connection, args)

    closing_message("get_genbank_sequences", start_time, args)
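# Standalone illustration (requires biopython) of the sequence-caching round trip above:
# Bio.Seq.Seq objects are not JSON serialisable, so they are cached as plain strings and
# rebuilt with Seq() when the cache is reloaded.
from Bio.Seq import Seq

cached = str(Seq("MKTAYIAKQR"))   # what gets written to the JSON cache
restored = Seq(cached)            # what the cache branch rebuilds
assert str(restored) == cached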
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up programme and initiate run."""
    time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # used in terminating message
    start_time = pd.to_datetime(start_time)

    # parse cmd-line arguments
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        args = build_parser(argv).parse_args()

    if logger is None:
        logger = logging.getLogger(__package__)
        config_logger(args)

    validate_user_options(args)

    # make output directories
    if args.fasta_file:
        target_dir = args.fasta_file.parent
        make_output_directory(target_dir, args.force, args.nodelete)
    if args.fasta_dir:
        make_output_directory(args.fasta_dir, args.force, args.nodelete)

    connection, logger_name, cache_dir = connect_existing_db(args, time_stamp, start_time)

    if args.cache_dir is not None:  # use user defined cache dir
        cache_dir = args.cache_dir
        make_output_directory(cache_dir, args.force, args.nodelete_cache)
    else:
        cache_dir = cache_dir / "sequence_extraction"
        make_output_directory(cache_dir, args.force, args.nodelete_cache)

    (
        config_dict,
        class_filters,
        family_filters,
        kingdom_filters,
        taxonomy_filter_dict,
        ec_filters,
    ) = parse_configuration.get_expansion_configuration(args)

    gbk_table_dict = get_table_dicts.get_gbk_table_dict(connection)
    # {genbank_accession: {'taxa_id': str, 'gbk_id': int}}

    # check which additional tables need to be loaded
    if any((
        (args.genbank_accessions is not None),
        (args.uniprot_accessions is not None),
        ('genbank' in args.source),
    )):
        logger.info("Loading the GenBank table")
        gbk_seq_dict = get_table_dicts.get_gbk_table_seq_dict(connection)
        # {genbank_accession: {'sequence': str, 'seq_date': str}}

    if any((
        (args.uniprot_accessions is not None),
        ('uniprot' in args.source),
    )):
        logger.info("Loading the UniProt table")
        uniprot_table_dict = get_table_dicts.get_uniprot_table_dict(connection)
        # {acc: {'name': str, 'gbk_id': int, 'seq': str, 'seq_date': str}}

    # build dict {gbk_acc: db_id} matching the user's specified criteria,
    # either via a list in a file or parameters provided via config file and/or command line
    gbk_dict = {}  # {gbk_acc: gbk_id}

    if args.genbank_accessions is not None:
        logger.warning(
            f"Extracting protein sequences for GenBank accessions listed in {args.genbank_accessions}"
        )
        gbk_dict.update(get_user_genbank_sequences(gbk_table_dict, args))

    if args.uniprot_accessions is not None:
        logger.warning(
            f"Extracting protein sequences for UniProt accessions listed in {args.uniprot_accessions}"
        )
        gbk_dict.update(get_user_uniprot_sequences(gbk_table_dict, uniprot_table_dict, args))

    if len(list(gbk_dict.keys())) == 0:
        gbk_dict = get_selected_gbks.get_genbank_accessions(
            class_filters,
            family_filters,
            taxonomy_filter_dict,
            kingdom_filters,
            ec_filters,
            connection,
        )

    # extract protein sequences from the database
    extracted_sequences = {}  # {accession: {'db': str, 'seq': str}}

    if 'genbank' in args.source:
        extracted_sequences.update(get_genbank_sequences(gbk_seq_dict, gbk_dict))

    if 'uniprot' in args.source:
        extracted_sequences.update(get_uniprot_sequences(uniprot_table_dict, gbk_dict))

    protein_records = []

    for protein_accession in tqdm(extracted_sequences, "Compiling SeqRecords"):
        try:
            new_record = SeqRecord(
                Seq(extracted_sequences[protein_accession]['seq']),
                id=protein_accession,
                description=extracted_sequences[protein_accession]['db'],
            )
            protein_records.append(new_record)
        except TypeError:
            if extracted_sequences[protein_accession]['seq'] is not None:
                logger.warning(
                    f"Seq for {protein_accession} retrieved as type "
                    f"{type(extracted_sequences[protein_accession]['seq'])}\n"
                    "Not adding to FASTA file"
                )
            pass  # passed when sequence is None

    # write out the sequences to the specified outputs
    logger.warning(f"Extracted {len(protein_records)} protein sequences")
    write_output(protein_records, cache_dir, args)

    closing_message("extract_sequences", start_time, args)
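# Minimal standalone example (requires biopython; values illustrative) of the SeqRecord
# construction used above: a Seq plus an id and description, which Biopython can render
# straight to FASTA.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

record = SeqRecord(Seq("MKTAYIAKQR"), id="WP_000000001.1", description="genbank")
print(record.format("fasta"))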