def check_adm2_values(config):
    """Validate every adm2 value in the query csv against the accepted list.

    Loads the accepted adm2 names into config["clean_locs"] (via
    get_acceptable_adm2), then scans the query csv.  An adm2 cell may be a
    single region, empty, or several regions joined with "|" to express
    ambiguity.  Exits with an error message on the first invalid value.
    """
    get_acceptable_adm2(config)
    accepted_adm2 = config["clean_locs"]
    with open(config["query"], "r") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        if "adm2" in header:
            for row in reader:
                # Accepted values are stored upper-cased with underscores,
                # so normalise before the membership test.
                if row["adm2"].upper().replace(
                        " ", "_") not in accepted_adm2 and "|" not in row[
                            "adm2"] and row["adm2"] != "":
                    adm2_value = row["adm2"]
                    sys.stderr.write(
                        qcfunk.cyan(
                            f'Error: {adm2_value} not a valid adm2 region.\n Find a list of valid adm2 values at:\nhttps://artic-network.github.io/civet/geographic_data.html\n'
                        ))
                    sys.stderr.write(
                        qcfunk.cyan(
                            f'Please note: if you have a region that encompasses multiple adm2 regions eg West Midlands, list the correct adm2 regions separated by the "|" symbol to indicate ambiguity.\n'
                        ))
                    sys.exit(-1)
                elif "|" in row["adm2"]:
                    adm2_value = row["adm2"].split("|")
                    for i in adm2_value:
                        # Bug fix: apply the same upper-case/underscore
                        # normalisation as the single-value branch above;
                        # previously the raw component was compared, so
                        # e.g. "greater london|essex" was wrongly rejected.
                        if i.upper().replace(" ", "_") not in accepted_adm2:
                            sys.stderr.write(
                                qcfunk.cyan(
                                    f'Error: {i} found in the ambiguity code {row["adm2"]} not a valid adm2 region.\n Find a list of valid adm2 values at:\nhttps://artic-network.github.io/civet/geographic_data.html\n'
                                ))
                            sys.exit(-1)
def get_datadir(args_datadir, args_metadata, cwd, config):
    """Resolve the background data directory and background metadata file.

    Precedence: -m/--background-metadata from the command line, then
    `background_metadata` from the config file, then -d/--datadir, then
    `datadir` from the config, then the default ./civet-cat.  On success
    populates config["datadir"], config["data_date"],
    config["background_metadata"], config["background_seqs"] and
    config["background_tree"]; exits if any expected file is missing.
    """
    data_dir = ""
    background_metadata = ""
    if args_metadata:
        # expanduser so "~" paths from the command line work
        expanded_path = os.path.expanduser(args_metadata)
        background_metadata = os.path.join(cwd, expanded_path)
        if not os.path.exists(background_metadata):
            sys.stderr.write(
                qcfunk.cyan(
                    f"Error: can't find metadata file at {background_metadata}.\n"
                ))
            sys.exit(-1)
    elif "background_metadata" in config:
        if config["background_metadata"]:
            expanded_path = os.path.expanduser(config["background_metadata"])
            # config-file paths are resolved relative to the query location
            background_metadata = os.path.join(config["path_to_query"],
                                               expanded_path)
            if not os.path.exists(background_metadata):
                sys.stderr.write(
                    qcfunk.cyan(
                        f"Error: can't find metadata file at {background_metadata}.\n"
                    ))
                sys.exit(-1)
    elif args_datadir:
        data_dir = os.path.join(cwd, args_datadir)
    elif "datadir" in config:
        if config["datadir"]:
            expanded_path = os.path.expanduser(config["datadir"])
            data_dir = os.path.join(config["path_to_query"], expanded_path)
    else:
        data_dir = os.path.join(cwd, "civet-cat")
    # NOTE(review): when a metadata file is supplied, the elif chain above
    # never sets data_dir, so it stays "" and the existence check below
    # fails — confirm whether metadata and datadir are really meant to be
    # mutually exclusive here.
    if not os.path.exists(data_dir):
        print_data_error(data_dir)
        sys.exit(-1)
    # NOTE(review): this 2-arg call / 4-value unpack differs from the
    # 4-arg / 5-value form used in get_remote_data — confirm which
    # get_background_files signature is current.
    background_seqs, background_tree, background_metadata, data_date = get_background_files(
        data_dir, background_metadata)
    config["datadir"] = data_dir
    config["data_date"] = data_date
    if not os.path.isfile(background_tree) or not os.path.isfile(
            background_seqs) or not os.path.isfile(background_metadata):
        print_data_error(data_dir)
        sys.exit(-1)
    else:
        config["background_metadata"] = background_metadata
        config["background_seqs"] = background_seqs
        config["background_tree"] = background_tree
        print("Found data:")
        print(" -", background_seqs)
        print(" -", background_metadata)
        print(" -", background_tree, "\n")
def check_cluster_dependencies(config):
    """Validate prerequisites for `cluster` mode.

    `cluster` needs an input query csv and is mutually exclusive with
    `update`; exits with an error message if either constraint is broken.
    """
    problem = None
    if "query" not in config:
        problem = 'Error: input.csv required to run `cluster` civet\n'
    elif config["update"]:
        problem = 'Error: specify one of either `cluster` or `update`\n'
    if problem:
        sys.stderr.write(qcfunk.cyan(problem))
        sys.exit(-1)
def print_data_error(data_dir):
    # Print guidance about the expected contents of the background data
    # directory to stderr.
    # NOTE(review): `data_dir` is accepted but never used in the message.
    # NOTE(review): this definition is shadowed by a later re-definition of
    # print_data_error in this file — confirm which one is intended.
    # NOTE(review): the `\ ` sequences inside the f-strings look like
    # collapsed line-continuations; confirm the intended literal text.
    sys.stderr.write(
        qcfunk.cyan(
            f"Error: data directory should contain the following files or additionally supply a background metadata file:\n"
        ) + f"\ - global_2020-XX-YY_tree.nexus\n\ - global_2020-XX-YY_metadata.csv\n\ - global_2020-XX-YY_alignment.fasta\n" +
        qcfunk.cyan(f"\ To run civet please specify a local directory with the appropriate files, optionally supply a custom metadata file\n\n"
                    ""))
def map_sequences_config(config):
    """Validate the --map-sequences options against the metadata headers.

    Requires --map-info (either an x,y coordinate column pair plus an
    --input-crs, or a single outer-postcode/adm2 column, for which the CRS
    defaults to EPSG:4326).  Every referenced column, including any
    --colour-map-by column, must exist in the query or background metadata.
    Exits on any validation failure.
    """
    bg_cols = config["background_metadata_header"]
    query_cols = config["query_metadata_header"]
    if not config["map_sequences"]:
        return
    if not config["map_info"]:
        sys.stderr.write(
            qcfunk.cyan(
                'Error: coordinates or outer postcode or adm2 not supplied for mapping sequences. Please provide either x and y columns as a comma separated string, or column header containing outer postcode.'
            ))
        sys.exit(-1)
    map_inputs = config["map_info"].replace(" ", "")
    columns = map_inputs.split(",")
    if len(columns) == 2:
        # An x/y coordinate pair was supplied, so a CRS is mandatory.
        if not config["input_crs"]:
            sys.stderr.write(
                qcfunk.cyan(
                    'Error: input coordinate system not provided for mapping. Please provide --input-crs eg EPSG:3395'
                ))
            sys.exit(-1)
    else:
        # Outer postcode / adm2 input: default to WGS84.
        config["input_crs"] = "EPSG:4326"
    wanted = list(columns)
    if config["colour_map_by"]:
        wanted.append(config["colour_map_by"])
    for column in wanted:
        column = column.replace(" ", "")
        if column not in query_cols and column not in bg_cols:
            sys.stderr.write(
                qcfunk.cyan(
                    f"Error: {column} column not found in metadata file or background database for mapping sequences"
                ))
            sys.exit(-1)
    if config["colour_map_by"]:
        if map_inputs == "adm2":
            print(
                qcfunk.cyan(
                    f"NOTE: --colour-map-by not set up to colour by adm2. Please provide outer postcode or coordinates"
                ))
        else:
            print(qcfunk.green(f"Colouring map by: {config['colour_map_by']}"))
def check_update_dependencies(config):
    """Ensure `update` mode has a --from-metadata search term.

    A missing key and an empty value are treated identically: both abort
    with the same error message.
    """
    if not config.get("from_metadata"):
        sys.stderr.write(
            qcfunk.cyan(
                'Error: `--from-metadata` search term required to run in `update` mode\n'
            ))
        sys.exit(-1)
def print_data_error(data_dir):
    # Print guidance about the expected contents of the CLIMB background
    # data directory to stderr.
    # NOTE(review): `data_dir` is accepted but never used in the message.
    # NOTE(review): this definition shadows the earlier print_data_error in
    # this file (the two list different expected filenames) — confirm which
    # version callers should get.
    # NOTE(review): the `\ ` sequences inside the f-strings look like
    # collapsed line-continuations; confirm the intended literal text.
    sys.stderr.write(
        qcfunk.cyan(
            f"Error: data directory should contain the following files or additionally supply a background metadata file:\n"
        ) + f"\ - cog_global_2020-XX-YY_tree.newick\n\ - cog_global_2020-XX-YY_metadata.csv\n\ - cog_global_2020-XX-YY_alignment.fasta\n\n" +
        qcfunk.cyan(
            "Please also check that the data directory is correctly specified.\n\n"
        ) + qcfunk.cyan(f"\ To run civet please either\n1) ssh into CLIMB and run with --CLIMB flag\n\ 2) Run using `--remote` flag and your CLIMB username specified e.g. `-uun climb-covid19-smithj`\n\ 3) Specify a local directory with the appropriate files, optionally supply a custom metadata file, custom background tree or custom background fasta file\n\n"
                        ""))
def get_sequencing_centre_header(config):
    """Resolve the report header image for the requested sequencing centre.

    If config["sequencing_centre"] is a known centre code (or "DEFAULT"),
    records the packaged header png's source path and its destination paths
    under the report output directory in config.  Otherwise prints the list
    of valid codes and exits.
    """
    # Fix: "NIRE" was listed twice, which also duplicated a line in the
    # user-facing error listing below; quote style normalised.
    sc_list = [
        "PHEC", "LIVE", "BIRM", "PHWC", "CAMB", "NORW", "GLAS", "EDIN",
        "SHEF", "EXET", "NOTT", "PORT", "OXON", "NORT", "NIRE", "GSTT",
        "LOND", "SANG"
    ]
    sequencing_centre = config["sequencing_centre"]
    if sequencing_centre in sc_list or sequencing_centre == "DEFAULT":
        package_png = os.path.join("data", "headers",
                                   f"{sequencing_centre}.png")
        sequencing_centre_source = pkg_resources.resource_filename(
            'civet', package_png)
        print(qcfunk.green(f"Using header file from:") + f" {package_png}\n")
        config["sequencing_centre_source"] = sequencing_centre_source
        # Destination inside the report output tree, plus the relative path
        # the report markdown will reference.
        config["sequencing_centre_dest"] = os.path.join(
            config["outdir"], "report", "figures", f"{sequencing_centre}.png")
        config["sequencing_centre_file"] = os.path.join(
            ".", "figures", f"{sequencing_centre}.png")
        config["sequencing_centre"] = sequencing_centre
    else:
        sc_string = "\n".join(sc_list)
        sys.stderr.write(
            qcfunk.cyan(
                f'Error: sequencing centre must be one of the following:\n{sc_string}\n'
            ))
        sys.exit(-1)
def get_remote_data(uun, background_metadata, background_trees,
                    background_seqs, data_dir, config):
    """Sync the civet background data from CLIMB and register it in config.

    Username precedence: the uun argument, then config["username"], then
    config["uun"]; with no username an unauthenticated rsync using the
    ssh-config host alias is attempted.  Exits if the sync fails or the
    expected files are missing; otherwise fills config["datadir"],
    config["data_date"] and the background file paths.
    """
    config["remote"] = True
    if uun:
        config["username"] = uun
        rsync_data_from_climb(uun, data_dir)
    elif "username" in config:
        uun = config["username"]
        rsync_data_from_climb(uun, data_dir)
    elif "uun" in config:
        uun = config["uun"]
        rsync_data_from_climb(uun, data_dir)
    else:
        # No username available: rely on the user's ssh config providing a
        # Host entry for bham.covid19.climb.ac.uk.
        rsync_command = f"rsync -avzh --exclude 'cog' --delete-after bham.covid19.climb.ac.uk:/cephfs/covid/bham/results/phylogenetics/latest/civet/ '{data_dir}'"
        print(f"Syncing civet data to {data_dir}")
        status = os.system(rsync_command)
        if status != 0:
            sys.stderr.write(
                qcfunk.cyan(
                    "Error: rsync command failed.\nCheck your ssh is configured with Host bham.covid19.climb.ac.uk\nAlternatively enter your CLIMB username with -uun e.g. climb-covid19-smithj\nAlso, check if you have access to CLIMB from this machine and check if you are in the UK\n\n"
                ))
            sys.exit(-1)
    # NOTE(review): this 4-arg call / 5-value unpack differs from the
    # 2-arg form used in get_datadir — confirm which get_background_files
    # signature is current; background_metadata_all is unused below.
    background_seqs, background_tree, background_metadata, data_date, background_metadata_all = get_background_files(
        data_dir, background_metadata, background_trees, background_seqs)
    config["datadir"] = data_dir
    config["data_date"] = data_date
    if not os.path.exists(config["datadir"]):
        print(qcfunk.cyan(f"Error: data directory not found at {data_dir}.\n"))
        sys.exit(-1)
    if not os.path.isfile(background_tree) or not os.path.isfile(
            background_seqs) or not os.path.isfile(background_metadata):
        print_data_error(data_dir)
        sys.exit(-1)
    else:
        config["background_metadata"] = background_metadata
        config["background_seqs"] = background_seqs
        config["background_tree"] = background_tree
        print(qcfunk.green("Found data:"))
        print(" -", background_seqs)
        print(" -", background_metadata)
        print(" -", background_tree, "\n")
def get_package_data(thisdir, config):
    """Locate the static data files shipped with the civet package.

    Resolves each packaged resource with pkg_resources and stores its path
    in config under the key the pipeline expects; also checks that the
    report template exists under thisdir/scripts, exiting if it does not.
    """
    # config key -> path of the resource inside the civet package
    resources = {
        "reference_fasta": 'data/reference.fasta',
        "outgroup_fasta": 'data/outgroup.fasta',
        "polytomy_figure": 'data/polytomies.png',
        "report_args": 'data/report_arguments.txt',
        "footer": 'data/footer.png',
        "appendix": 'data/appendix.txt',
        "clean_locs_file": 'data/mapping_files/adm2_cleaning.csv',
        "uk_map": 'data/mapping_files/gadm36_GBR_2.json',
        "channels_map": 'data/mapping_files/channel_islands.json',
        "ni_map": 'data/mapping_files/NI_counties.geojson',
        "uk_map_d3": 'data/mapping_files/Mainland_HBs_gapclosed_mapshaped_d3.json',
        "urban_centres": 'data/mapping_files/urban_areas_UK.geojson',
        "pc_file": 'data/mapping_files/UK_outPC_coords.csv',
        "HB_translations": 'data/mapping_files/HB_Translation.pkl',
        "PC_translations": 'data/mapping_files/adm2_regions_to_coords.csv',
    }
    for key, relpath in resources.items():
        config[key] = pkg_resources.resource_filename('civet', relpath)
    report_template = os.path.join(thisdir, 'scripts', 'civet_template.pmd')
    if not os.path.exists(report_template):
        sys.stderr.write(
            qcfunk.cyan(
                f'Error: cannot find report_template at {report_template}\n'))
        sys.exit(-1)
    config["report_template"] = report_template
def rsync_data_from_climb(uun, data_dir):
    """Mirror the latest civet background data from CLIMB into data_dir.

    Runs rsync as the given CLIMB user and exits with an error message if
    the command returns a non-zero status.
    """
    cmd = f"rsync -avzh --exclude 'cog' --delete-after {uun}@bham.covid19.climb.ac.uk:/cephfs/covid/bham/results/phylogenetics/latest/civet/ '{data_dir}'"
    print(qcfunk.green(f"Syncing civet data to {data_dir}"))
    if os.system(cmd) != 0:
        sys.stderr.write(
            qcfunk.cyan(
                "Error: rsync command failed.\nCheck your user name is a valid CLIMB username e.g. climb-covid19-smithj\nAlso, check if you have access to CLIMB from this machine and are in the UK\n\n"
            ))
        sys.exit(-1)
def check_for_new_in_cluster(config):
    """Count the rows flagged as new in the cluster civet output csv.

    Opens <outdir>/<output_prefix>.csv and returns
    (number of rows whose `new` column is "True", path to that csv).
    Exits with an error if the csv has no `new` column, i.e. cluster civet
    has not been run yet.
    """
    prefix = config["output_prefix"]
    background_metadata = config["background_metadata"]
    cluster_csv = os.path.join(config["outdir"], f"{prefix}.csv")
    new_count = 0
    with open(cluster_csv, "r") as f:
        reader = csv.DictReader(f)
        if "new" not in reader.fieldnames:
            sys.stderr.write(
                qcfunk.cyan(
                    'Error: `cluster` civet has not run, require column `new` in csv\n'
                ))
            sys.exit(-1)
        new_count = sum(1 for row in reader if row["new"] == "True")
    return new_count, cluster_csv
def main(sysargs = sys.argv[1:]):
    # Entry point for the civet command line tool.  Builds the argument
    # parser, merges command line options with any yaml config supplied via
    # -i, runs all input/data QC, then drives the snakemake pipeline(s).
    # NOTE(review): relies on module-level names defined outside this block
    # (cwd, thisdir, today, _program, __version__, and the cfunk / qcfunk /
    # rfunk / lh / custom_logger / snakemake modules) — confirm against the
    # file header.
    parser = argparse.ArgumentParser(add_help=False, prog = _program,
                                     description=cfunk.preamble(__version__),
                                     usage='''
\tcivet -i <config.yaml> [options]
\tcivet -i input.csv [options]
\tcivet -i ID1,IS2 [options]
\tcivet -fm <column=match> [options]\n\n''')

    io_group = parser.add_argument_group('input output options')
    io_group.add_argument('-i',"--input", action="store",help="Input config file in yaml format, csv file (with minimally an input_column header, Default=`name`) or comma-separated id string with one or more query ids. Example: `EDB3588,EDB3589`.", dest="input")
    io_group.add_argument('-fm','--from-metadata',nargs='*', dest="from_metadata",help="Generate a query from the metadata file supplied. Define a search that will be used to pull out sequences of interest from the large phylogeny. E.g. -fm adm2=Edinburgh sample_date=2020-03-01:2020-04-01")
    io_group.add_argument('-o','--output-prefix',action="store",help="Prefix of output directory & report name: Default: civet",dest="output_prefix")
    io_group.add_argument('--outdir', action="store",help="Output directory. Default: current working directory")
    io_group.add_argument('-f','--fasta', action="store",help="Optional fasta query.", dest="fasta")
    io_group.add_argument('--max-ambiguity', action="store", type=float,help="Maximum proportion of Ns allowed to attempt analysis. Default: 0.5",dest="max_ambiguity")
    io_group.add_argument('--min-length', action="store", type=int,help="Minimum query length allowed to attempt analysis. Default: 10000",dest="min_length")

    data_group = parser.add_argument_group('data source options')
    data_group.add_argument('-d','--datadir', action="store",help="Local directory that contains the data files. Default: civet-cat")
    data_group.add_argument("-m","--background-metadata",action="store",dest="background_metadata",help="Custom metadata file that corresponds to the large global tree/ alignment. Should have a column `sequence_name`.")
    data_group.add_argument("--background-tree", action="store", dest="background_tree", help="Custom tree file.")
    data_group.add_argument("--background-sequences", action="store", dest="background_sequences", help="Custom background fasta file.")
    data_group.add_argument('--CLIMB', action="store_true",dest="climb",help="Indicates you're running CIVET from within CLIMB, uses default paths in CLIMB to access data")
    data_group.add_argument("-r",'--remote', action="store_true",dest="remote",help="Remotely access lineage trees from CLIMB")
    data_group.add_argument("-uun","--your-user-name", action="store", help="Your CLIMB COG-UK username. Required if running with --remote flag", dest="uun")
    data_group.add_argument('--input-column', action="store",help="Column in input csv file to match with database. Default: name", dest="input_column")
    data_group.add_argument('--data-column', action="store",help="Option to search COG database for a different id type. Default: COG-UK ID", dest="data_column")

    report_group = parser.add_argument_group('report customisation')
    report_group.add_argument('-sc',"--sequencing-centre", action="store",help="Customise report with logos from sequencing centre.", dest="sequencing_centre")
    report_group.add_argument("--display-name", action="store", help="Column in input csv file with display names for seqs. Default: same as input column", dest="display_name")
    report_group.add_argument("--sample-date-column", action="store", help="Column in input csv with sampling date in it. Default='sample_date'", dest="sample_date_column")
    report_group.add_argument("--database-sample-date-column", action="store", help="Colum in background metadata containing sampling date. Default='sample_date'", dest="database_sample_date_column")
    report_group.add_argument('--colour-by', action="store", help="Comma separated string of fields to display as coloured dots rather than text in report trees. Optionally add colour scheme eg adm1=viridis", dest="colour_by")
    report_group.add_argument('--tree-fields', action="store",help="Comma separated string of fields to display in the trees in the report. Default: country", dest="tree_fields")
    report_group.add_argument('--label-fields', action="store", help="Comma separated string of fields to add to tree report labels.", dest="label_fields")
    report_group.add_argument("--date-fields", action="store", help="Comma separated string of metadata headers containing date information.", dest="date_fields")
    report_group.add_argument("--node-summary", action="store", help="Column to summarise collapsed nodes by. Default = Global lineage", dest="node_summary")
    report_group.add_argument("--table-fields", action="store", help="Fields to include in the table produced in the report. Query ID, name of sequence in tree and the local tree it's found in will always be shown", dest="table_fields")
    report_group.add_argument("--remove-snp-table", action="store_true", help="Include information about closest sequence in database in table. Default is False", dest="remove_snp_table")
    report_group.add_argument('--no-snipit', action="store_true",help="Don't run snipit graph", dest="no_snipit")
    report_group.add_argument('--include-bars', action="store_true",help="Render barcharts in the output report", dest="include_bars")
    report_group.add_argument('--omit-appendix', action="store_true", help="Omit the appendix section. Default=False", dest="omit_appendix")
    report_group.add_argument('--omit-trees', action="store_true", help="Omit trees.", dest="omit_trees")
    report_group.add_argument('--context-table-summary', help="Provide a field to summarise the context by", dest='context_table_summary')

    tree_group = parser.add_argument_group('tree context options')
    tree_group.add_argument('--distance', action="store",help="Extraction from large tree radius. Default: 2", dest="distance",type=int)
    tree_group.add_argument('--up-distance', action="store",help="Upstream distance to extract from large tree. Default: 2", dest="up_distance",type=int)
    tree_group.add_argument('--down-distance', action="store",help="Downstream distance to extract from large tree. Default: 2", dest="down_distance",type=int)
    tree_group.add_argument('--collapse-threshold', action='store',help="Minimum number of nodes to collapse on. Default: 1", dest="collapse_threshold",type=int)
    tree_group.add_argument('-p','--protect',nargs='*', dest="protect",help="Protect nodes from collapse if they match the search query in the metadata file supplied. E.g. -p adm2=Edinburgh sample_date=2020-03-01:2020-04-01")

    map_group = parser.add_argument_group('map rendering options')
    map_group.add_argument('--local-lineages',action="store_true",dest="local_lineages",help="Contextualise the cluster lineages at local regional scale. Requires at least one adm2 value in query csv.")
    map_group.add_argument('--date-restriction',action="store_true",dest="date_restriction",help="Chose whether to date-restrict comparative sequences at regional-scale.")
    map_group.add_argument('--date-range-start',action="store", type=str, dest="date_range_start", help="Define the start date from which sequences will COG sequences will be used for local context. YYYY-MM-DD format required.")
    map_group.add_argument('--date-range-end', action="store", type=str, dest="date_range_end", help="Define the end date from which sequences will COG sequences will be used for local context. YYYY-MM-DD format required.")
    map_group.add_argument('--date-window',action="store", type=int, dest="date_window",help="Define the window +- either side of cluster sample collection date-range. Default is 7 days.")
    map_group.add_argument("--map-sequences", action="store_true", dest="map_sequences", help="Map the sequences themselves by adm2, coordinates or outer postcode.")
    map_group.add_argument("--map-info", required=False, dest="map_info", help="columns containing EITHER x and y coordinates as a comma separated string OR outer postcodes for mapping sequences OR Adm2")
    map_group.add_argument("--input-crs", required=False, dest="input_crs", help="Coordinate reference system for sequence coordinates")
    map_group.add_argument("--colour-map-by", required=False, dest="colour_map_by", help="Column to colour mapped sequences by")

    run_group = parser.add_argument_group('run options')
    run_group.add_argument("--cluster",action="store_true",help="Run cluster civet pipeline. Requires -i/--input csv",dest="cluster")
    run_group.add_argument("--update",action="store_true",help="Check for changes from previous run of civet. Requires -fm/--from-metadata option in a config.yaml file from previous run",dest="update")
    # NOTE(review): "--udpate" looks like a deliberately-kept typo alias —
    # it is consumed by cfunk.configure_update below; confirm before removing.
    run_group.add_argument("--udpate",action="store_true",help="Check for changes from previous run of civet. Requires -fm/--from-metadata option in a config.yaml file from previous run",dest="udpate")
    run_group.add_argument('-c','--generate-config',dest="generate_config",action="store_true",help="Rather than running a civet report, just generate a config file based on the command line arguments provided")
    run_group.add_argument('-b','--launch-browser', action="store_true",help="Optionally launch md viewer in the browser using grip",dest="launch_browser")

    misc_group = parser.add_argument_group('misc options')
    misc_group.add_argument("--safety-level", action="store", type=int, dest="safety_level",help="Level of anonymisation for users. Options: 0 (no anonymity), 1 (no COGIDs on background data), 2 (no adm2 on data). Default: 1")
    misc_group.add_argument('--tempdir',action="store",help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    misc_group.add_argument("--no-temp",action="store_true",help="Output all intermediate files, for dev purposes.",dest="no_temp")
    misc_group.add_argument("--verbose",action="store_true",help="Print lots of stuff to screen")
    misc_group.add_argument("--art",action="store_true",help="Print art")
    misc_group.add_argument('-t', '--threads', action='store',dest="threads",type=int,help="Number of threads")
    misc_group.add_argument("-v","--version", action='version', version=f"civet {__version__}")
    misc_group.add_argument("-h","--help",action="store_true",dest="help")

    """
    Exit with help menu if no args supplied
    """
    if len(sysargs)<1:
        parser.print_help()
        sys.exit(0)
    else:
        args = parser.parse_args(sysargs)
        if args.help:
            parser.print_help()
            sys.exit(0)
        if args.art:
            cfunk.be_arty()
            sys.exit(0)

    """
    Initialising dicts
    """
    # get the default values from civetfunk
    config = cfunk.get_defaults()

    """
    Input file (-i/--input)
    Valid inputs are input.csv; ID1,ID2,ID3; config.yaml/config.yml
    If there's an input fasta file- add to the config dict
    """
    # find the query csv, or string of ids, or config file
    query,configfile = qcfunk.type_input_file(args.input,cwd,config)
    # if a yaml file is detected, add everything in it to the config dict
    if configfile:
        qcfunk.parse_yaml_file(configfile, config)

    """
    Report options and args added to config, seq header file retrieved
    """
    # check args for report group options
    cfunk.report_group_to_config(args,config)
    # update and cluster options
    cfunk.configure_update(args.update,args.udpate,config)
    qcfunk.add_arg_to_config("cluster",args.cluster, config)

    """
    Get outdir, tempdir and data dir.
    Check if data has the right columns needed.
    The following rely on things that come out of the input config or csv
    files so shouldnt be moved up above that.
    - tempdir
    - datadir
    """
    # default output dir
    qcfunk.get_outdir(args.outdir,args.output_prefix,cwd,config)
    # specifying temp directory, outdir if no_temp (tempdir becomes working dir)
    tempdir = qcfunk.get_temp_dir(args.tempdir, args.no_temp,cwd,config)
    qcfunk.add_arg_to_config("remote",args.remote, config)
    # find the data dir
    cfunk.get_datadir(args.climb,args.uun,args.datadir,args.background_metadata, args.background_tree, args.background_sequences, cwd,config)
    # add data and input columns to config
    qcfunk.data_columns_to_config(args,config)
    # check if metadata has the right columns, background_metadata_header added to config
    qcfunk.check_metadata_for_search_columns(config)

    """
    from metadata parsing relies on the data dir being found
    """
    # generate query from metadata
    qcfunk.add_arg_to_config("from_metadata",args.from_metadata, config)
    if config["from_metadata"]:
        qcfunk.from_metadata_checks(config)
        metadata = config["background_metadata"]
        config["no_snipit"]=True
        if config["update"]:
            query_file = os.path.join(config["outdir"], "update_query.csv")
            run_update = cfunk.check_for_update(query_file,config)
            if not run_update:
                print(qcfunk.cyan('Note: no new sequences to report.\nExiting.'))
                sys.exit(0)
            else:
                query = config["query"] # gets added updated in the check_for_update function
        else:
            query_file = os.path.join(config["outdir"], "from_metadata_query.csv")
            query = qcfunk.generate_query_from_metadata(query_file,args.from_metadata,metadata,config)
    else:
        if config["update"]:
            cfunk.check_update_dependencies(config)

    """
    The query file could have been from one of
    - input.csv
    - id string input, created csv
    - from_metadata generated query csv
    (all either specified in config or via command line)
    """
    # check query exists or add ids to temp query file
    qcfunk.check_query_file(query, cwd, config)
    # check if metadata has the right columns, background_metadata_header added to config
    qcfunk.check_query_for_input_column(config)

    """
    Input fasta file sourcing and qc checks
    """
    # find the query fasta
    qcfunk.get_query_fasta(args.fasta,cwd, config)
    # run qc on the input sequence file
    num_seqs = qcfunk.input_file_qc(args.min_length,args.max_ambiguity,config)

    """
    Quick check in background data
    """
    if num_seqs == 0:
        # check if any queries in background or if fasta supplied
        qcfunk.check_background_for_queries(config)

    """
    Accessing the civet package data and selecting the mapping files,
    the sequencing centre header
    """
    # accessing package data and adding to config dict
    cfunk.get_package_data(thisdir,config)
    # get seq centre header file from pkg data
    cfunk.get_sequencing_centre_header(config)

    """
    Mapping options parsing and qc of the input
    """
    # check args for mapping group options
    cfunk.map_group_to_config(args,config)
    # check args for data group options
    # NOTE(review): data_columns_to_config was already called above —
    # confirm the second call is intentional.
    qcfunk.data_columns_to_config(args,config)
    # parse the input csv, check col headers and get fields if fields specified
    qcfunk.check_label_and_tree_and_date_fields(config)
    # map sequences configuration
    cfunk.map_sequences_config(config)
    # local lineages qc
    cfunk.local_lineages_qc(config)
    #check adm2s
    if config["local_lineages"] or config["map_sequences"]:
        cfunk.check_adm2_values(config)

    """
    Parsing the tree_group arguments, config or default options
    """
    # global now the only search option
    cfunk.define_seq_db(config)
    # extraction radius configuration
    qcfunk.distance_config(args.distance,args.up_distance,args.down_distance,config)
    # extraction radius configuration
    qcfunk.collapse_config(args.collapse_threshold,config)
    qcfunk.parse_protect(args.protect,config["background_metadata"],config)

    """
    Parsing the report_group arguments, config or default options
    """
    # make title
    rfunk.make_title(config)
    # deal with free text
    rfunk.free_text_args(config)
    #get table headers
    qcfunk.check_table_fields(config)
    # summarising collapsed nodes config
    qcfunk.check_summary_field("node_summary",config)
    qcfunk.collapse_summary_path_to_config(config)

    """
    Miscellaneous options parsing
    """
    qcfunk.add_arg_to_config("launch_browser",args.launch_browser,config)
    # don't run in quiet mode if verbose specified
    if args.verbose:
        quiet_mode = False
        config["log_string"] = ""
    else:
        quiet_mode = True
        lh_path = os.path.realpath(lh.__file__)
        config["log_string"] = f"--quiet --log-handler-script {lh_path} "

    qcfunk.add_arg_to_config("threads",args.threads,config)
    try:
        config["threads"]= int(config["threads"])
    except:
        sys.stderr.write(qcfunk.cyan('Error: Please specifiy an integer for variable `threads`.\n'))
        sys.exit(-1)
    threads = config["threads"]

    if args.safety_level != None:
        config["safety_level"]= args.safety_level
    # remote runs are forced to the most anonymised level
    if config["remote"]:
        config["safety_level"] = 2
    try:
        safety_level = int(config["safety_level"])
    except:
        sys.stderr.write(qcfunk.cyan('Error: Please specifiy either 0, 1 or 2 for variable `safety_level`.\n'))
        sys.exit(-1)
    if safety_level in [0,1,2]:
        config["safety_level"]= int(config["safety_level"])
    else:
        sys.stderr.write(qcfunk.cyan('Error: Please specifiy either 0, 1 or 2 for variable `safety_level`.\n'))
        sys.exit(-1)

    if args.generate_config:
        qcfunk.make_config_file("civet_config.yaml",config)

    """
    cluster civet checks
    - arg, config, default
    - is there a query?
    - check if new things in the local tree
    - if new sequences, run main civet with updated query
    - if no new sequences, exit
    """
    # cluster civet
    if config["cluster"]:
        config["today"] = today
        cfunk.configure_cluster(config)
        cluster_snakefile = qcfunk.get_cluster_snakefile(thisdir)
        if args.verbose:
            print("\n**** CONFIG ****")
            for k in sorted(config):
                print(qcfunk.green(k), config[k])
            status = snakemake.snakemake(cluster_snakefile, printshellcmds=True, forceall=True, force_incomplete=True, workdir=tempdir,config=config, cores=threads,lock=False )
        else:
            logger = custom_logger.Logger()
            status = snakemake.snakemake(cluster_snakefile, printshellcmds=False, forceall=True,force_incomplete=True,workdir=tempdir, config=config, cores=threads,lock=False,quiet=True,log_handler=logger.log_handler )
        if not status:
            print(qcfunk.cyan(f"Error: Cluster civet did not successfully run"))
            sys.exit(-1)
        new_seqs, cluster_csv = cfunk.check_for_new_in_cluster(config)
        print(qcfunk.green(f"\nNew sequences found in cluster {today}: ") + f"{new_seqs}")
        if not new_seqs:
            print(qcfunk.cyan(f"No new sequences in cluster today, {today}"))
            sys.exit(0)
        else:
            # the cluster output csv becomes the query for the main run
            config["query"] = cluster_csv

    # find the master Snakefile
    snakefile = qcfunk.get_snakefile(thisdir)
    if args.verbose:
        print("\n**** CONFIG ****")
        for k in sorted(config):
            print(qcfunk.green(k), config[k])
        status = snakemake.snakemake(snakefile, printshellcmds=True, forceall=True, force_incomplete=True, workdir=tempdir,config=config, cores=threads,lock=False )
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(snakefile, printshellcmds=False, forceall=True,force_incomplete=True,workdir=tempdir, config=config, cores=threads,lock=False,quiet=True,log_handler=logger.log_handler )

    if status: # translate "success" into shell exit code of 0
        return 0
    return 1
def check_cog_db():
    """Match query ids against the COG-UK background data (script entry point).

    Reads ids from args.query, scans args.cog_metadata for rows whose
    args.field column matches a query id, then writes three outputs:
    the matched metadata rows (args.in_metadata), the matching fasta
    records from args.cog_seqs (args.in_seqs), and the ids that were not
    found (args.not_in_cog).
    """
    args = parse_args()
    found = []
    in_cog_metadata = []
    in_cog_names = {}
    column_to_match = args.field
    input_column = args.input_column
    query_names = []
    with open(args.query, newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            query_names.append(row[input_column])
    with open(args.cog_metadata, newline="") as f:
        reader = csv.DictReader(f)
        header_names = reader.fieldnames
        # NOTE(review): O(rows * queries) scan — fine for typical query
        # list sizes, consider a set lookup if they grow large.
        for row in reader:
            for seq in query_names:
                cog_id = row[column_to_match]
                if seq == cog_id:
                    # annotate the matched row with both ids and its source
                    row["query_id"] = seq
                    row["cog_id"] = row[column_to_match]
                    row["query"] = row["sequence_name"]
                    row["closest"] = row["sequence_name"]
                    if args.all_cog:
                        row["source"] = "on CLIMB; not in phylogeny"
                    else:
                        row["source"] = "phylogeny"
                    in_cog_metadata.append(row)
                    in_cog_names[row[column_to_match]] = row["sequence_name"]
    print(
        qcfunk.green(f"Number of query records found in tree:") +
        f" {len(in_cog_metadata)}")
    with open(args.in_metadata, "w") as fw:
        # extend the original header with the annotation columns added above
        header_names.append("query_id")
        header_names.append("cog_id")
        header_names.append("query")
        header_names.append("closest")
        header_names.append("source")
        writer = csv.DictWriter(fw, fieldnames=header_names,
                                lineterminator='\n')
        writer.writeheader()
        writer.writerows(in_cog_metadata)
    with open(args.in_seqs, "w") as fw:
        # pull the fasta record for each matched sequence name
        for record in SeqIO.parse(args.cog_seqs, "fasta"):
            for name in in_cog_names:
                sequence_name = in_cog_names[name]
                if sequence_name == record.id:
                    found.append(name)
                    if args.all_cog:
                        status = "on_climb"
                    else:
                        status = "in_phylogeny"
                    fw.write(
                        f">{name} sequence_name={record.id} status={status}\n{record.seq}\n"
                    )
    with open(args.not_in_cog, "w") as fw:
        c = 0
        not_found_str = ""
        fw.write(f"{input_column}\n")
        for query in query_names:
            if query not in found:
                fw.write(query + '\n')
                not_found_str += (f"\t- {query}\n")
                c += 1
        if c != 0:
            print(qcfunk.cyan("\nNot found in phylogeny:"))
            print(not_found_str)
def local_lineages_qc(config):
    """Validate the configuration for the local lineage analysis.

    Requires "adm2" and "uk_lineage" columns in the background metadata
    header and checks that any date-restriction options are well-formed
    (YYYY-MM-DD). Prints what restriction will apply; writes an error to
    stderr and exits(-1) on any invalid configuration. No-op when
    local_lineages is off.
    """
    date_format = "%Y-%m-%d"

    # Nothing to validate when the analysis is not requested.
    if not config["local_lineages"]:
        return

    if "adm2" not in config["background_metadata_header"]:
        sys.stderr.write(
            qcfunk.cyan(
                'Error: no geographic information found for local lineage analysis. Please provide a column in the background metadata with the header "adm2"\n'
            ))
        sys.exit(-1)
    elif "uk_lineage" not in config["background_metadata_header"]:
        sys.stderr.write(
            qcfunk.cyan(
                'Error: no uk lineage information found for local lineage analysis. Please provide a column in the background metadata with the header "uk_lineage"\n'
            ))
        sys.exit(-1)

    if config["date_restriction"]:
        if config["date_range_start"] and isinstance(
                config["date_range_start"], str):
            try:
                # Parse only to validate the format; the value is unused here.
                dt.datetime.strptime(config["date_range_start"],
                                     date_format).date()
            except ValueError:  # was a bare except; narrow to parse failures
                print(
                    qcfunk.cyan(
                        f'date range start in incorrect format. Please use i.e. YYYY-MM-DD'
                    ))
                sys.exit(-1)
        if config["date_range_end"] and isinstance(config["date_range_end"],
                                                   str):
            try:
                dt.datetime.strptime(config["date_range_end"],
                                     date_format).date()
            except ValueError:  # was a bare except; narrow to parse failures
                print(
                    qcfunk.cyan(
                        f'date range end in incorrect format. Please use i.e. YYYY-MM-DD'
                    ))
                sys.exit(-1)

        if config["date_range_start"] and config["date_range_end"]:
            print(
                qcfunk.green(
                    f"Local lineage analysis restricted to {config['date_range_start']} to {config['date_range_end']}"
                ))
        elif config["date_range_start"]:
            print(
                qcfunk.green(
                    f"Local lineage analysis restricted to {config['date_range_start']} to present"
                ))
        else:
            print(
                qcfunk.green(
                    f"Local lineage analysis restricted to {config['date_window']} days around the sampling range"
                ))
    elif config['date_range_start'] or config["date_range_end"]:
        # Dates supplied without the flag that enables them: fail loudly.
        print(
            qcfunk.cyan(
                "Date restriction data provided but --date-restriction flag not used. Please use --date-restriction flag in config or command line."
            ))
        sys.exit(-1)
    else:
        print(
            qcfunk.green(
                f"Local lineage analysis not restricted by time, will show background lineage composition for the whole of the epidemic"
            ))
def _resolve_background_file(arg_value, config_key, label, cwd, config):
    """Resolve one user-supplied background file path.

    The CLI argument (relative to cwd) takes priority over the config entry
    (relative to config["path_to_query"]). Returns "" when neither is set;
    writes an error naming *label* and exits(-1) if the chosen path does
    not exist.
    """
    if arg_value:
        path = os.path.join(cwd, os.path.expanduser(arg_value))
    elif config_key in config and config[config_key]:
        path = os.path.join(config["path_to_query"],
                            os.path.expanduser(config[config_key]))
    else:
        return ""
    if not os.path.exists(path):
        sys.stderr.write(
            qcfunk.cyan(f"Error: can't find {label} file at {path}.\n"))
        sys.exit(-1)
    return path


def get_datadir(args_climb, args_uun, args_datadir, args_metadata, args_tree,
                args_fasta, cwd, config):
    """Locate the background data (metadata, tree, fasta) for the run.

    Resolves any user-supplied override files, then finds the data
    directory from --climb, --datadir, the config, or the default
    "civet-cat". For local data, verifies all three files exist and stores
    them (plus datadir/data_date) in config; for remote data, delegates to
    get_remote_data. Exits(-1) on any missing path.
    """
    remote = config["remote"]
    cog_all = False

    background_metadata = _resolve_background_file(args_metadata,
                                                   "background_metadata",
                                                   "metadata", cwd, config)
    background_tree = _resolve_background_file(args_tree, "background_tree",
                                               "tree", cwd, config)
    # Fix: a missing --fasta path was previously reported as a "metadata
    # file" error; now says "fasta", matching the config-branch wording.
    background_seqs = _resolve_background_file(args_fasta,
                                               "background_sequences",
                                               "fasta", cwd, config)

    data_dir = ""
    if args_climb:
        # Hard-wired CLIMB path; implies local (non-remote) access.
        data_dir = "/cephfs/covid/bham/results/phylogenetics/latest/civet/cog"
        if os.path.exists(data_dir):
            config["remote"] = False
            config["username"] = ""
        else:
            sys.stderr.write(
                qcfunk.cyan(
                    f"Error: --CLIMB argument called, but CLIMB data path doesn't exist.\n"
                ))
            sys.exit(-1)
    elif args_datadir:
        data_dir = os.path.join(cwd, args_datadir)
    elif "datadir" in config:
        if config["datadir"]:
            expanded_path = os.path.expanduser(config["datadir"])
            data_dir = os.path.join(config["path_to_query"], expanded_path)
    else:
        data_dir = os.path.join(cwd, "civet-cat")

    # NOTE(review): `remote` was captured before --climb could flip
    # config["remote"] to False, so a remote config combined with --climb
    # still takes the remote branch — behavior preserved; confirm intended.
    if not remote:
        if not os.path.exists(data_dir):
            print_data_error(data_dir)
            sys.exit(-1)

        background_seqs, background_tree, background_metadata, data_date, background_metadata_all = get_background_files(
            data_dir, background_metadata, background_tree, background_seqs,
            cog_all)

        config["datadir"] = data_dir
        config["data_date"] = data_date

        if not os.path.isfile(background_tree) or not os.path.isfile(
                background_seqs) or not os.path.isfile(background_metadata):
            print_data_error(data_dir)
            sys.exit(-1)
        else:
            config["background_metadata"] = background_metadata
            config["background_seqs"] = background_seqs
            config["background_tree"] = background_tree
            config["background_metadata_all"] = background_metadata_all

            print("Found data:")
            print(" -", background_seqs)
            print(" -", background_metadata)
            print(" -", background_metadata_all)
            print(" -", background_tree, "\n")
    elif remote:
        get_remote_data(args_uun, background_metadata, background_tree,
                        background_seqs, data_dir, config)

    config["datadir"] = data_dir