Exemplo n.º 1
0
def check_adm2_values(config):
    """Validate the ``adm2`` column of the query csv against the accepted names.

    Calls ``get_acceptable_adm2(config)`` and then reads the accepted names
    from ``config["clean_locs"]``. Every non-empty ``adm2`` value in the csv
    at ``config["query"]`` must be an accepted name once upper-cased and with
    spaces replaced by underscores. A value containing ``|`` is treated as an
    ambiguity code and each ``|``-separated part is validated the same way.

    Returns without checking anything if the csv has no ``adm2`` column.
    On the first invalid value an error is written to stderr and the process
    exits with status -1.
    """
    get_acceptable_adm2(config)
    accepted_adm2 = config["clean_locs"]

    def _normalise(value):
        # Form in which values are looked up in the accepted list.
        return value.upper().replace(" ", "_")

    with open(config["query"], "r") as f:
        reader = csv.DictReader(f)
        # fieldnames is None for an empty file; treat that as "no adm2 column".
        if "adm2" not in (reader.fieldnames or []):
            return

        for row in reader:
            # DictReader fills cells missing from a short row with None.
            adm2_value = row["adm2"] or ""
            if not adm2_value:
                continue

            if "|" in adm2_value:
                for region in adm2_value.split("|"):
                    # Parts of an ambiguity code get the same normalisation as
                    # a single value, so they are not rejected purely because
                    # of letter case or spaces.
                    if _normalise(region) not in accepted_adm2:
                        sys.stderr.write(
                            qcfunk.cyan(
                                f'Error: {region} found in the ambiguity code {adm2_value} not a valid adm2 region.\n Find a list of valid adm2 values at:\nhttps://artic-network.github.io/civet/geographic_data.html\n'
                            ))
                        sys.exit(-1)
            elif _normalise(adm2_value) not in accepted_adm2:
                sys.stderr.write(
                    qcfunk.cyan(
                        f'Error: {adm2_value} not a valid adm2 region.\n Find a list of valid adm2 values at:\nhttps://artic-network.github.io/civet/geographic_data.html\n'
                    ))
                sys.stderr.write(
                    qcfunk.cyan(
                        'Please note: if you have a region that encompasses multiple adm2 regions eg West Midlands, list the correct adm2 regions separated by the "|" symbol to indicate ambiguity.\n'
                    ))
                sys.exit(-1)
Exemplo n.º 2
0
def get_datadir(args_datadir, args_metadata, cwd, config):
    """Locate the data directory and background files and record them in config.

    Args:
        args_datadir: data directory given on the command line (may be falsy).
        args_metadata: custom background metadata file given on the command
            line (may be falsy).
        cwd: directory that command-line paths are joined onto.
        config: run configuration. On success ``datadir``, ``data_date``,
            ``background_metadata``, ``background_seqs`` and
            ``background_tree`` are written to it.

    The metadata file and the data directory are resolved independently of
    each other; for each, the command-line value is preferred over the config
    value. Config paths are joined onto ``config["path_to_query"]``.

    Exits with status -1 if a custom metadata file was named but does not
    exist, if the data directory does not exist, or if any of the background
    tree / sequences / metadata returned by ``get_background_files`` is not a
    file.
    """
    data_dir = ""
    background_metadata = ""

    # Custom background metadata: command line first, then config.
    if args_metadata:
        background_metadata = os.path.join(cwd,
                                           os.path.expanduser(args_metadata))
    elif "background_metadata" in config and config["background_metadata"]:
        background_metadata = os.path.join(
            config["path_to_query"],
            os.path.expanduser(config["background_metadata"]))

    if background_metadata and not os.path.exists(background_metadata):
        sys.stderr.write(
            qcfunk.cyan(
                f"Error: can't find metadata file at {background_metadata}.\n"
            ))
        sys.exit(-1)

    # Data directory: resolved separately from the metadata above. When both
    # lookups shared one if/elif chain, supplying a metadata file left
    # data_dir empty, which always failed the existence check below.
    if args_datadir:
        # Expand "~" here too, as is done for every other path in this block.
        data_dir = os.path.join(cwd, os.path.expanduser(args_datadir))
    elif "datadir" in config:
        if config["datadir"]:
            data_dir = os.path.join(config["path_to_query"],
                                    os.path.expanduser(config["datadir"]))
        else:
            data_dir = os.path.join(cwd, "civet-cat")

    if not os.path.exists(data_dir):
        print_data_error(data_dir)
        sys.exit(-1)

    background_seqs, background_tree, background_metadata, data_date = get_background_files(
        data_dir, background_metadata)

    config["datadir"] = data_dir
    config["data_date"] = data_date

    found_files = (background_tree, background_seqs, background_metadata)
    if not all(os.path.isfile(path) for path in found_files):
        print_data_error(data_dir)
        sys.exit(-1)

    config["background_metadata"] = background_metadata
    config["background_seqs"] = background_seqs
    config["background_tree"] = background_tree

    print("Found data:")
    print("    -", background_seqs)
    print("    -", background_metadata)
    print("    -", background_tree, "\n")
Exemplo n.º 3
0
def check_cluster_dependencies(config):
    """Check that the config allows a `cluster` run.

    Writes an error to stderr and exits with status -1 when ``config`` has no
    ``query`` key, or when ``config["update"]`` is truthy. Returns None
    otherwise.
    """
    problem = None
    if "query" not in config:
        problem = 'Error: input.csv required to run `cluster` civet\n'
    elif config["update"]:
        problem = 'Error: specify one of either `cluster` or `update`\n'

    if problem is not None:
        sys.stderr.write(qcfunk.cyan(problem))
        sys.exit(-1)
Exemplo n.º 4
0
def print_data_error(data_dir):
    """Write the "data directory is incomplete" message to stderr.

    The message lists the tree, metadata and alignment file names expected in
    the data directory. ``data_dir`` is accepted but does not appear in the
    message.
    """
    heading = qcfunk.cyan(
        "Error: data directory should contain the following files or additionally supply a background metadata file:\n"
    )
    expected_files = ("    - global_2020-XX-YY_tree.nexus\n"
                      "    - global_2020-XX-YY_metadata.csv\n"
                      "    - global_2020-XX-YY_alignment.fasta\n")
    advice = qcfunk.cyan(
        "To run civet please specify a local directory with the appropriate files, optionally supply a custom metadata file\n\n"
    )
    sys.stderr.write(heading + expected_files + advice)
Exemplo n.º 5
0
def map_sequences_config(config):
    """Validate the sequence-mapping options held in ``config``.

    Does nothing beyond reading the two header lists unless
    ``config["map_sequences"]`` is truthy. When it is:

    - ``config["map_info"]`` must be set; spaces are stripped from it and it
      is split on commas into column names.
    - Exactly two columns are taken as x/y coordinates and require
      ``config["input_crs"]``; any other count sets ``config["input_crs"]``
      to "EPSG:4326".
    - Every mapping column, plus ``config["colour_map_by"]`` when set, must be
      present in the query or background metadata headers.

    Any failed check writes an error to stderr and exits with status -1.
    When a colour column is set, a note about it is printed.
    """
    background_headers = config["background_metadata_header"]
    query_headers = config["query_metadata_header"]

    if not config["map_sequences"]:
        return

    map_info = config["map_info"]
    if not map_info:
        sys.stderr.write(
            qcfunk.cyan(
                'Error: coordinates or outer postcode or adm2 not supplied for mapping sequences. Please provide either x and y columns as a comma separated string, or column header containing outer postcode.'
            ))
        sys.exit(-1)
    map_inputs = map_info.replace(" ", "")

    columns = map_inputs.split(",")
    if len(columns) != 2:
        # A single column (outer postcode or adm2) rather than an x,y pair.
        config["input_crs"] = "EPSG:4326"
    elif not config["input_crs"]:
        sys.stderr.write(
            qcfunk.cyan(
                'Error: input coordinate system not provided for mapping. Please provide --input-crs eg EPSG:3395'
            ))
        sys.exit(-1)

    colour_column = config["colour_map_by"]
    if colour_column:
        columns.append(colour_column)

    for column in columns:
        column = column.replace(" ", "")
        if column in query_headers or column in background_headers:
            continue
        sys.stderr.write(
            qcfunk.cyan(
                f"Error: {column} column not found in metadata file or background database for mapping sequences"
            ))
        sys.exit(-1)

    if not colour_column:
        return
    if map_inputs == "adm2":
        print(
            qcfunk.cyan(
                "NOTE: --colour-map-by not set up to colour by adm2. Please provide outer postcode or coordinates"
            ))
    else:
        print(qcfunk.green(f"Colouring map by: {colour_column}"))
Exemplo n.º 6
0
def check_update_dependencies(config):
    """Check that the config allows an `update` run.

    An `update` run needs a non-empty ``from_metadata`` entry in ``config``.
    If the key is absent or its value is falsy, an error is written to stderr
    and the process exits with status -1. Returns None otherwise.
    """
    has_search_term = "from_metadata" in config and bool(
        config["from_metadata"])
    if has_search_term:
        return

    sys.stderr.write(
        qcfunk.cyan(
            'Error: `--from-metadata` search term required to run in `update` mode\n'
        ))
    sys.exit(-1)
Exemplo n.º 7
0
def print_data_error(data_dir):
    """Write the "data directory is incomplete" message to stderr.

    The message lists the tree, metadata and alignment file names expected in
    the data directory, asks the user to check the directory, and lists the
    three ways of giving civet access to the data. ``data_dir`` is accepted
    but does not appear in the message.
    """
    heading = qcfunk.cyan(
        "Error: data directory should contain the following files or additionally supply a background metadata file:\n"
    )
    expected_files = ("    - cog_global_2020-XX-YY_tree.newick\n"
                      "    - cog_global_2020-XX-YY_metadata.csv\n"
                      "    - cog_global_2020-XX-YY_alignment.fasta\n\n")
    check_directory = qcfunk.cyan(
        "Please also check that the data directory is correctly specified.\n\n"
    )
    how_to_run = qcfunk.cyan(
        "To run civet please either\n"
        "1) ssh into CLIMB and run with --CLIMB flag\n"
        "2) Run using `--remote` flag and your CLIMB username specified e.g. `-uun climb-covid19-smithj`\n"
        "3) Specify a local directory with the appropriate files, optionally supply a custom metadata file, custom background tree or custom background fasta file\n\n"
    )
    sys.stderr.write(heading + expected_files + check_directory + how_to_run)
Exemplo n.º 8
0
def get_sequencing_centre_header(config):
    """Select the report header image for ``config["sequencing_centre"]``.

    For a recognised centre code, or "DEFAULT", the packaged image
    ``data/headers/<code>.png`` is resolved through ``pkg_resources`` and the
    following keys are written to ``config``:

    - ``sequencing_centre_source``: the resolved packaged image path
    - ``sequencing_centre_dest``: ``<outdir>/report/figures/<code>.png``
    - ``sequencing_centre_file``: ``./figures/<code>.png``
    - ``sequencing_centre``: the code itself

    For any other value the valid codes are listed on stderr and the process
    exits with status -1.
    """
    # Each code is listed once so the error message below has no repeats.
    sc_list = [
        "PHEC", "LIVE", "BIRM", "PHWC", "CAMB", "NORW", "GLAS", "EDIN", "SHEF",
        "EXET", "NOTT", "PORT", "OXON", "NORT", "NIRE", "GSTT", "LOND", "SANG",
    ]

    sequencing_centre = config["sequencing_centre"]
    if sequencing_centre != "DEFAULT" and sequencing_centre not in sc_list:
        sc_string = "\n".join(sc_list)
        sys.stderr.write(
            qcfunk.cyan(
                f'Error: sequencing centre must be one of the following:\n{sc_string}\n'
            ))
        sys.exit(-1)

    header_name = f"{sequencing_centre}.png"
    package_png = os.path.join("data", "headers", header_name)
    sequencing_centre_source = pkg_resources.resource_filename(
        'civet', package_png)
    print(qcfunk.green("Using header file from:") + f" {package_png}\n")

    config["sequencing_centre_source"] = sequencing_centre_source
    config["sequencing_centre_dest"] = os.path.join(config["outdir"],
                                                    "report", "figures",
                                                    header_name)
    config["sequencing_centre_file"] = os.path.join(".", "figures",
                                                    header_name)
    config["sequencing_centre"] = sequencing_centre
Exemplo n.º 9
0
def get_remote_data(uun, background_metadata, background_trees,
                    background_seqs, data_dir, config):
    """Sync the civet data from CLIMB into ``data_dir`` and register it.

    Args:
        uun: CLIMB username from the command line (may be falsy).
        background_metadata, background_trees, background_seqs: passed
            through unchanged to ``get_background_files``.
        data_dir: local directory to sync into.
        config: run configuration. ``remote`` is set to True; on success
            ``datadir``, ``data_date``, ``background_metadata``,
            ``background_seqs`` and ``background_tree`` are written.

    The username is taken from ``uun``, else ``config["username"]``, else
    ``config["uun"]``, and the sync is delegated to
    ``rsync_data_from_climb``. If none of those is available rsync is run
    against the bare host name.

    Exits with status -1 if the rsync fails, if ``data_dir`` does not exist
    afterwards, or if any of the background tree / sequences / metadata
    returned by ``get_background_files`` is not a file.
    """
    # Imported locally so this block does not rely on a file-level import.
    import subprocess

    config["remote"] = True

    use_named_login = True
    if uun:
        config["username"] = uun
    elif "username" in config:
        # NOTE(review): only key presence is tested, so an empty configured
        # username is still passed on - confirm that is intended.
        uun = config["username"]
    elif "uun" in config:
        uun = config["uun"]
    else:
        use_named_login = False

    if use_named_login:
        rsync_data_from_climb(uun, data_dir)
    else:
        # Argument list instead of a shell string: data_dir is passed to
        # rsync verbatim, so quotes or shell metacharacters in the path can
        # neither break nor alter the command.
        rsync_command = [
            "rsync", "-avzh", "--exclude", "cog", "--delete-after",
            "bham.covid19.climb.ac.uk:/cephfs/covid/bham/results/phylogenetics/latest/civet/",
            str(data_dir),
        ]
        print(f"Syncing civet data to {data_dir}")
        try:
            status = subprocess.run(rsync_command).returncode
        except OSError:
            # rsync could not be started at all; report it as a failed sync.
            status = 1
        if status != 0:
            sys.stderr.write(
                qcfunk.cyan(
                    "Error: rsync command failed.\nCheck your ssh is configured with Host bham.covid19.climb.ac.uk\nAlternatively enter your CLIMB username with -uun e.g. climb-covid19-smithj\nAlso, check if you have access to CLIMB from this machine and check if you are in the UK\n\n"
                ))
            sys.exit(-1)

    config["datadir"] = data_dir
    # Check the directory before it is handed to get_background_files.
    if not os.path.exists(data_dir):
        print(qcfunk.cyan(f"Error: data directory not found at {data_dir}.\n"))
        sys.exit(-1)

    # The fifth returned value is not used here.
    seqs_path, tree_path, metadata_path, data_date, _ = get_background_files(
        data_dir, background_metadata, background_trees, background_seqs)
    config["data_date"] = data_date

    if not all(
            os.path.isfile(path)
            for path in (tree_path, seqs_path, metadata_path)):
        print_data_error(data_dir)
        sys.exit(-1)

    config["background_metadata"] = metadata_path
    config["background_seqs"] = seqs_path
    config["background_tree"] = tree_path

    print(qcfunk.green("Found data:"))
    print("    -", seqs_path)
    print("    -", metadata_path)
    print("    -", tree_path, "\n")
Exemplo n.º 10
0
def get_package_data(thisdir, config):
    """Record the paths of civet's packaged data files and report template.

    Each packaged file is resolved with ``pkg_resources.resource_filename``
    against the ``civet`` package and stored in ``config`` under its own key.
    The report template is expected at
    ``<thisdir>/scripts/civet_template.pmd`` and stored under
    ``config["report_template"]``; if it is missing an error is written to
    stderr and the process exits with status -1.
    """
    # (config key, path inside the civet package), listed in the order the
    # keys are written to config.
    packaged_files = (
        ("reference_fasta", 'data/reference.fasta'),
        ("outgroup_fasta", 'data/outgroup.fasta'),
        ("polytomy_figure", 'data/polytomies.png'),
        ("report_args", 'data/report_arguments.txt'),
        ("footer", 'data/footer.png'),
        ("appendix", 'data/appendix.txt'),
        ("clean_locs_file", 'data/mapping_files/adm2_cleaning.csv'),
        ("uk_map", 'data/mapping_files/gadm36_GBR_2.json'),
        ("channels_map", 'data/mapping_files/channel_islands.json'),
        ("ni_map", 'data/mapping_files/NI_counties.geojson'),
        ("uk_map_d3",
         'data/mapping_files/Mainland_HBs_gapclosed_mapshaped_d3.json'),
        ("urban_centres", 'data/mapping_files/urban_areas_UK.geojson'),
        ("pc_file", 'data/mapping_files/UK_outPC_coords.csv'),
        ("HB_translations", 'data/mapping_files/HB_Translation.pkl'),
        ("PC_translations", 'data/mapping_files/adm2_regions_to_coords.csv'),
    )

    # Resolve every path first, then write to config, so config is only
    # touched once all lookups have succeeded.
    resolved = [(key, pkg_resources.resource_filename('civet', relative_path))
                for key, relative_path in packaged_files]
    for key, path in resolved:
        config[key] = path

    report_template = os.path.join(thisdir, 'scripts', 'civet_template.pmd')
    if os.path.exists(report_template):
        config["report_template"] = report_template
        return

    sys.stderr.write(
        qcfunk.cyan(
            f'Error: cannot find report_template at {report_template}\n'))
    sys.exit(-1)
Exemplo n.º 11
0
def rsync_data_from_climb(uun, data_dir):
    """Rsync the civet data from CLIMB into ``data_dir`` as user ``uun``.

    Runs ``rsync -avzh --exclude cog --delete-after`` from the CLIMB civet
    directory to ``data_dir``. If rsync exits with a non-zero status, or
    cannot be started, an error is written to stderr and the process exits
    with status -1.
    """
    # Imported locally so this block does not rely on a file-level import.
    import subprocess

    # Argument list instead of a shell string: uun and data_dir are passed to
    # rsync verbatim, so quotes or shell metacharacters in either value can
    # neither break nor alter the command.
    rsync_command = [
        "rsync", "-avzh", "--exclude", "cog", "--delete-after",
        f"{uun}@bham.covid19.climb.ac.uk:/cephfs/covid/bham/results/phylogenetics/latest/civet/",
        str(data_dir),
    ]
    print(qcfunk.green(f"Syncing civet data to {data_dir}"))
    try:
        status = subprocess.run(rsync_command).returncode
    except OSError:
        # rsync could not be started at all; report it as a failed sync.
        status = 1

    if status != 0:
        sys.stderr.write(
            qcfunk.cyan(
                "Error: rsync command failed.\nCheck your user name is a valid CLIMB username e.g. climb-covid19-smithj\nAlso, check if you have access to CLIMB from this machine and are in the UK\n\n"
            ))
        sys.exit(-1)
Exemplo n.º 12
0
def check_for_new_in_cluster(config):
    """Count the rows flagged as new in the cluster csv.

    Reads ``<config["outdir"]>/<config["output_prefix"]>.csv``.

    Returns:
        A ``(new_count, cluster_csv)`` tuple: the number of rows whose ``new``
        column is the string "True", and the path of the csv that was read.

    If the csv has no ``new`` column (which includes an empty file) an error
    is written to stderr and the process exits with status -1.
    """
    prefix = config["output_prefix"]
    cluster_csv = os.path.join(config["outdir"], f"{prefix}.csv")

    with open(cluster_csv, "r") as f:
        reader = csv.DictReader(f)
        # fieldnames is None for an empty file; treat that as a missing column.
        if "new" not in (reader.fieldnames or []):
            sys.stderr.write(
                qcfunk.cyan(
                    'Error: `cluster` civet has not run, require column `new` in csv\n'
                ))
            sys.exit(-1)
        new_count = sum(1 for row in reader if row["new"] == "True")

    return new_count, cluster_csv
Exemplo n.º 13
0
def main(sysargs = sys.argv[1:]):

    parser = argparse.ArgumentParser(add_help=False, prog = _program, 
    description=cfunk.preamble(__version__), 
    usage='''
\tcivet -i <config.yaml> [options]
\tcivet -i input.csv [options]
\tcivet -i ID1,IS2 [options]
\tcivet -fm <column=match> [options]\n\n''')

    io_group = parser.add_argument_group('input output options')
    io_group.add_argument('-i',"--input", action="store",help="Input config file in yaml format, csv file (with minimally an input_column header, Default=`name`) or comma-separated id string with one or more query ids. Example: `EDB3588,EDB3589`.", dest="input")
    io_group.add_argument('-fm','--from-metadata',nargs='*', dest="from_metadata",help="Generate a query from the metadata file supplied. Define a search that will be used to pull out sequences of interest from the large phylogeny. E.g. -fm adm2=Edinburgh sample_date=2020-03-01:2020-04-01")
    io_group.add_argument('-o','--output-prefix',action="store",help="Prefix of output directory & report name: Default: civet",dest="output_prefix")
    io_group.add_argument('--outdir', action="store",help="Output directory. Default: current working directory")
    io_group.add_argument('-f','--fasta', action="store",help="Optional fasta query.", dest="fasta")
    io_group.add_argument('--max-ambiguity', action="store", type=float,help="Maximum proportion of Ns allowed to attempt analysis. Default: 0.5",dest="max_ambiguity")
    io_group.add_argument('--min-length', action="store", type=int,help="Minimum query length allowed to attempt analysis. Default: 10000",dest="min_length")
    
    data_group = parser.add_argument_group('data source options')
    data_group.add_argument('-d','--datadir', action="store",help="Local directory that contains the data files. Default: civet-cat")
    data_group.add_argument("-m","--background-metadata",action="store",dest="background_metadata",help="Custom metadata file that corresponds to the large global tree/ alignment. Should have a column `sequence_name`.")
    data_group.add_argument("--background-tree", action="store", dest="background_tree", help="Custom tree file.")
    data_group.add_argument("--background-sequences", action="store", dest="background_sequences", help="Custom background fasta file.")
    data_group.add_argument('--CLIMB', action="store_true",dest="climb",help="Indicates you're running CIVET from within CLIMB, uses default paths in CLIMB to access data")
    data_group.add_argument("-r",'--remote', action="store_true",dest="remote",help="Remotely access lineage trees from CLIMB")
    data_group.add_argument("-uun","--your-user-name", action="store", help="Your CLIMB COG-UK username. Required if running with --remote flag", dest="uun")
    data_group.add_argument('--input-column', action="store",help="Column in input csv file to match with database. Default: name", dest="input_column")
    data_group.add_argument('--data-column', action="store",help="Option to search COG database for a different id type. Default: COG-UK ID", dest="data_column")

    report_group = parser.add_argument_group('report customisation')
    report_group.add_argument('-sc',"--sequencing-centre", action="store",help="Customise report with logos from sequencing centre.", dest="sequencing_centre")
    report_group.add_argument("--display-name", action="store", help="Column in input csv file with display names for seqs. Default: same as input column", dest="display_name")
    report_group.add_argument("--sample-date-column", action="store", help="Column in input csv with sampling date in it. Default='sample_date'", dest="sample_date_column")
    report_group.add_argument("--database-sample-date-column", action="store", help="Colum in background metadata containing sampling date. Default='sample_date'", dest="database_sample_date_column")
    report_group.add_argument('--colour-by', action="store", help="Comma separated string of fields to display as coloured dots rather than text in report trees. Optionally add colour scheme eg adm1=viridis", dest="colour_by")
    report_group.add_argument('--tree-fields', action="store",help="Comma separated string of fields to display in the trees in the report. Default: country", dest="tree_fields")
    report_group.add_argument('--label-fields', action="store", help="Comma separated string of fields to add to tree report labels.", dest="label_fields")
    report_group.add_argument("--date-fields", action="store", help="Comma separated string of metadata headers containing date information.", dest="date_fields")
    report_group.add_argument("--node-summary", action="store", help="Column to summarise collapsed nodes by. Default = Global lineage", dest="node_summary")
    report_group.add_argument("--table-fields", action="store", help="Fields to include in the table produced in the report. Query ID, name of sequence in tree and the local tree it's found in will always be shown", dest="table_fields")
    report_group.add_argument("--remove-snp-table", action="store_true", help="Include information about closest sequence in database in table. Default is False", dest="remove_snp_table")
    report_group.add_argument('--no-snipit', action="store_true",help="Don't run snipit graph", dest="no_snipit")
    report_group.add_argument('--include-bars', action="store_true",help="Render barcharts in the output report", dest="include_bars")
    report_group.add_argument('--omit-appendix', action="store_true", help="Omit the appendix section. Default=False", dest="omit_appendix")
    report_group.add_argument('--omit-trees', action="store_true", help="Omit trees.", dest="omit_trees")
    report_group.add_argument('--context-table-summary', help="Provide a field to summarise the context by", dest='context_table_summary')

    tree_group = parser.add_argument_group('tree context options')
    tree_group.add_argument('--distance', action="store",help="Extraction from large tree radius. Default: 2", dest="distance",type=int)
    tree_group.add_argument('--up-distance', action="store",help="Upstream distance to extract from large tree. Default: 2", dest="up_distance",type=int)
    tree_group.add_argument('--down-distance', action="store",help="Downstream distance to extract from large tree. Default: 2", dest="down_distance",type=int)
    tree_group.add_argument('--collapse-threshold', action='store',help="Minimum number of nodes to collapse on. Default: 1", dest="collapse_threshold",type=int)
    tree_group.add_argument('-p','--protect',nargs='*', dest="protect",help="Protect nodes from collapse if they match the search query in the metadata file supplied. E.g. -p adm2=Edinburgh sample_date=2020-03-01:2020-04-01")

    map_group = parser.add_argument_group('map rendering options')
    map_group.add_argument('--local-lineages',action="store_true",dest="local_lineages",help="Contextualise the cluster lineages at local regional scale. Requires at least one adm2 value in query csv.")
    map_group.add_argument('--date-restriction',action="store_true",dest="date_restriction",help="Chose whether to date-restrict comparative sequences at regional-scale.")
    map_group.add_argument('--date-range-start',action="store", type=str, dest="date_range_start", help="Define the start date from which sequences will COG sequences will be used for local context. YYYY-MM-DD format required.")
    map_group.add_argument('--date-range-end', action="store", type=str, dest="date_range_end", help="Define the end date from which sequences will COG sequences will be used for local context. YYYY-MM-DD format required.")
    map_group.add_argument('--date-window',action="store", type=int, dest="date_window",help="Define the window +- either side of cluster sample collection date-range. Default is 7 days.")
    map_group.add_argument("--map-sequences", action="store_true", dest="map_sequences", help="Map the sequences themselves by adm2, coordinates or outer postcode.")
    map_group.add_argument("--map-info", required=False, dest="map_info", help="columns containing EITHER x and y coordinates as a comma separated string OR outer postcodes for mapping sequences OR Adm2")
    map_group.add_argument("--input-crs", required=False, dest="input_crs", help="Coordinate reference system for sequence coordinates")
    map_group.add_argument("--colour-map-by", required=False, dest="colour_map_by", help="Column to colour mapped sequences by")
    
    run_group = parser.add_argument_group('run options')
    run_group.add_argument("--cluster",action="store_true",help="Run cluster civet pipeline. Requires -i/--input csv",dest="cluster")
    run_group.add_argument("--update",action="store_true",help="Check for changes from previous run of civet. Requires -fm/--from-metadata option in a config.yaml file from previous run",dest="update")
    run_group.add_argument("--udpate",action="store_true",help="Check for changes from previous run of civet. Requires -fm/--from-metadata option in a config.yaml file from previous run",dest="udpate")
    run_group.add_argument('-c','--generate-config',dest="generate_config",action="store_true",help="Rather than running a civet report, just generate a config file based on the command line arguments provided")
    run_group.add_argument('-b','--launch-browser', action="store_true",help="Optionally launch md viewer in the browser using grip",dest="launch_browser")

    misc_group = parser.add_argument_group('misc options')
    misc_group.add_argument("--safety-level", action="store", type=int, dest="safety_level",help="Level of anonymisation for users. Options: 0 (no anonymity), 1 (no COGIDs on background data), 2 (no adm2 on data). Default: 1")
    misc_group.add_argument('--tempdir',action="store",help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    misc_group.add_argument("--no-temp",action="store_true",help="Output all intermediate files, for dev purposes.",dest="no_temp")
    misc_group.add_argument("--verbose",action="store_true",help="Print lots of stuff to screen")
    misc_group.add_argument("--art",action="store_true",help="Print art")
    misc_group.add_argument('-t', '--threads', action='store',dest="threads",type=int,help="Number of threads")
    misc_group.add_argument("-v","--version", action='version', version=f"civet {__version__}")
    misc_group.add_argument("-h","--help",action="store_true",dest="help")
    
    """
    Exit with help menu if no args supplied
    """
    if len(sysargs)<1: 
        parser.print_help()
        sys.exit(0)
    else:
        args = parser.parse_args(sysargs)
        if args.help:
            parser.print_help()
            sys.exit(0)
    
    if args.art:
        cfunk.be_arty()
        sys.exit(0) 
    
    """
    Initialising dicts
    """

    # get the default values from civetfunk
    config = cfunk.get_defaults()

    """
    Input file (-i/--input) 
    Valid inputs are input.csv; ID1,ID2,ID3; config.yaml/config.yml
    
    If there's an input fasta file- add to the config dict

    """
    # find the query csv, or string of ids, or config file
    query,configfile = qcfunk.type_input_file(args.input,cwd,config)

    # if a yaml file is detected, add everything in it to the config dict
    if configfile:
        qcfunk.parse_yaml_file(configfile, config)
    
    """
    Report options and args added to config, seq header file retrieved
    """
    # check args for report group options
    cfunk.report_group_to_config(args,config)


    # update and cluster options

    cfunk.configure_update(args.update,args.udpate,config)
    qcfunk.add_arg_to_config("cluster",args.cluster, config)

    """
    Get outdir, tempdir and data dir. 
    Check if data has the right columns needed.
    The following rely on things that come out of the 
    input config or csv files so shouldnt be moved up above that.

    - tempdir
    - datadir
    """
    # default output dir
    qcfunk.get_outdir(args.outdir,args.output_prefix,cwd,config)

    # specifying temp directory, outdir if no_temp (tempdir becomes working dir)
    tempdir = qcfunk.get_temp_dir(args.tempdir, args.no_temp,cwd,config)

    qcfunk.add_arg_to_config("remote",args.remote, config)

    # find the data dir
    cfunk.get_datadir(args.climb,args.uun,args.datadir,args.background_metadata, args.background_tree, args.background_sequences, cwd,config)

    # add data and input columns to config
    qcfunk.data_columns_to_config(args,config)

    # check if metadata has the right columns, background_metadata_header added to config
    qcfunk.check_metadata_for_search_columns(config)


    """
    from metadata parsing 

    relies on the data dir being found 
    """
    # generate query from metadata
    qcfunk.add_arg_to_config("from_metadata",args.from_metadata, config)
    if config["from_metadata"]:
        
        qcfunk.from_metadata_checks(config)

        metadata = config["background_metadata"]
        config["no_snipit"]=True

        if config["update"]:
            query_file = os.path.join(config["outdir"], "update_query.csv")
            run_update = cfunk.check_for_update(query_file,config)
            if not run_update:
                print(qcfunk.cyan('Note: no new sequences to report.\nExiting.'))
                sys.exit(0)
            else:
                query = config["query"] # gets added updated in the check_for_update function
        else:
            query_file = os.path.join(config["outdir"], "from_metadata_query.csv")
            query = qcfunk.generate_query_from_metadata(query_file,args.from_metadata,metadata,config)
    else:
        if config["update"]:
            cfunk.check_update_dependencies(config)
            
    """
    The query file could have been from one of
    - input.csv
    - id string input, created csv
    - from_metadata generated query csv

    (all either specified in config or via command line)
    """
    # check query exists or add ids to temp query file
    qcfunk.check_query_file(query, cwd, config)

    # check if metadata has the right columns, background_metadata_header added to config
    qcfunk.check_query_for_input_column(config)

    """
    Input fasta file 
    sourcing and qc checks
    """
    # find the query fasta
    qcfunk.get_query_fasta(args.fasta,cwd, config)
    
    # run qc on the input sequence file
    num_seqs = qcfunk.input_file_qc(args.min_length,args.max_ambiguity,config)
    
    """
    Quick check in background data
    """
    if num_seqs == 0:
        # check if any queries in background or if fasta supplied
        qcfunk.check_background_for_queries(config)

    """
    Accessing the civet package data and 
    selecting the mapping files, 
    the sequencing centre header
    """
    # accessing package data and adding to config dict
    cfunk.get_package_data(thisdir,config)

    # get seq centre header file from pkg data
    cfunk.get_sequencing_centre_header(config)

    
    """
    Mapping options parsing and 
    qc of the input
    """
    
    # check args for mapping group options
    cfunk.map_group_to_config(args,config)

    # check args for data group options
    qcfunk.data_columns_to_config(args,config)

    # parse the input csv, check col headers and get fields if fields specified
    qcfunk.check_label_and_tree_and_date_fields(config)
        
    # map sequences configuration
    cfunk.map_sequences_config(config)
    
    # local lineages qc
    cfunk.local_lineages_qc(config)

    #check adm2s
    if config["local_lineages"] or config["map_sequences"]:
        cfunk.check_adm2_values(config)

    """
    Parsing the tree_group arguments, 
    config or default options
    """

    # global now the only search option
    cfunk.define_seq_db(config)

    # extraction radius configuration
    qcfunk.distance_config(args.distance,args.up_distance,args.down_distance,config) 

    # extraction radius configuration
    qcfunk.collapse_config(args.collapse_threshold,config) 


    qcfunk.parse_protect(args.protect,config["background_metadata"],config)

    """
    Parsing the report_group arguments, 
    config or default options
    """

    # make title
    rfunk.make_title(config)
    # deal with free text
    rfunk.free_text_args(config)

    #get table headers
    qcfunk.check_table_fields(config) 
        
    # summarising collapsed nodes config
    qcfunk.check_summary_field("node_summary",config)

    qcfunk.collapse_summary_path_to_config(config)

   

    """
    Miscellaneous options parsing

    """

    qcfunk.add_arg_to_config("launch_browser",args.launch_browser,config)

    # don't run in quiet mode if verbose specified
    if args.verbose:
        quiet_mode = False
        config["log_string"] = ""
    else:
        quiet_mode = True
        lh_path = os.path.realpath(lh.__file__)
        config["log_string"] = f"--quiet --log-handler-script {lh_path} "

    qcfunk.add_arg_to_config("threads",args.threads,config)
    
    try:
        config["threads"]= int(config["threads"])
    except:
        sys.stderr.write(qcfunk.cyan('Error: Please specifiy an integer for variable `threads`.\n'))
        sys.exit(-1)
    threads = config["threads"]


    if args.safety_level != None:
        config["safety_level"]= args.safety_level
    
    if config["remote"]:
        config["safety_level"] = 2
    
    try:
        safety_level = int(config["safety_level"])
    except:
        sys.stderr.write(qcfunk.cyan('Error: Please specifiy either 0, 1 or 2 for variable `safety_level`.\n'))
        sys.exit(-1)

    if safety_level in [0,1,2]:
        config["safety_level"]= int(config["safety_level"])
    else:
        sys.stderr.write(qcfunk.cyan('Error: Please specifiy either 0, 1 or 2 for variable `safety_level`.\n'))
        sys.exit(-1)

    if args.generate_config:
        qcfunk.make_config_file("civet_config.yaml",config)
    

    """
    cluster civet checks
    - arg, config, default
    - is there a query?
    - check if new things in the local tree
    - if new sequences, run main civet with updated query
    - if no new sequences, exit
    
    """
    # cluster civet 

    if config["cluster"]:

        config["today"] = today
        cfunk.configure_cluster(config)

        cluster_snakefile = qcfunk.get_cluster_snakefile(thisdir)

        if args.verbose:
            print("\n**** CONFIG ****")
            for k in sorted(config):
                print(qcfunk.green(k), config[k])
            status = snakemake.snakemake(cluster_snakefile, printshellcmds=True, forceall=True, force_incomplete=True,
                                        workdir=tempdir,config=config, cores=threads,lock=False
                                        )
        else:
            logger = custom_logger.Logger()
            status = snakemake.snakemake(cluster_snakefile, printshellcmds=False, forceall=True,force_incomplete=True,workdir=tempdir,
                                        config=config, cores=threads,lock=False,quiet=True,log_handler=logger.log_handler
                                        )

        if not status:
            print(qcfunk.cyan(f"Error: Cluster civet did not successfully run"))
            sys.exit(-1)

        new_seqs, cluster_csv = cfunk.check_for_new_in_cluster(config)

        print(qcfunk.green(f"\nNew sequences found in cluster {today}: ") + f"{new_seqs}")

        if not new_seqs:
            print(qcfunk.cyan(f"No new sequences in cluster today, {today}"))
            sys.exit(0)
        else:
            config["query"] = cluster_csv
        
    # find the master Snakefile
    snakefile = qcfunk.get_snakefile(thisdir)

    if args.verbose:
        print("\n**** CONFIG ****")
        for k in sorted(config):
            print(qcfunk.green(k), config[k])
        status = snakemake.snakemake(snakefile, printshellcmds=True, forceall=True, force_incomplete=True,
                                    workdir=tempdir,config=config, cores=threads,lock=False
                                    )
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(snakefile, printshellcmds=False, forceall=True,force_incomplete=True,workdir=tempdir,
                                    config=config, cores=threads,lock=False,quiet=True,log_handler=logger.log_handler
                                    )

    if status: # translate "success" into shell exit code of 0
       return 0

    return 1
Exemplo n.º 14
0
def check_cog_db():
    """Match query identifiers against the COG metadata and sequence files.

    All inputs come from ``parse_args()``:

    * ``args.query`` - CSV whose ``args.input_column`` holds the query ids.
    * ``args.cog_metadata`` - CSV searched on the ``args.field`` column; it
      must also provide a ``sequence_name`` column.
    * ``args.cog_seqs`` - FASTA whose record ids are compared with
      ``sequence_name``.

    Three files are written:

    * ``args.in_metadata`` - the matching metadata rows with the extra
      columns ``query_id``, ``cog_id``, ``query``, ``closest`` and ``source``.
    * ``args.in_seqs`` - the matching sequences, renamed to the query id.
    * ``args.not_in_cog`` - one query id per line (under an
      ``args.input_column`` header) for queries with no sequence written.

    ``args.all_cog`` only switches the ``source`` / ``status`` labels.
    """
    args = parse_args()

    column_to_match = args.field
    input_column = args.input_column

    with open(args.query, newline="") as f:
        query_names = [row[input_column] for row in csv.DictReader(f)]

    # Occurrences of each query id. A metadata row is emitted once per
    # occurrence, so repeated queries still yield repeated rows, but each
    # metadata row now costs one dict lookup instead of a scan of every query.
    query_counts = {}
    for query in query_names:
        query_counts[query] = query_counts.get(query, 0) + 1

    source = "on CLIMB; not in phylogeny" if args.all_cog else "phylogeny"
    in_cog_metadata = []
    in_cog_names = {}

    with open(args.cog_metadata, newline="") as f:
        reader = csv.DictReader(f)
        header_names = reader.fieldnames
        # Without any query nothing can match, so the rows are not scanned.
        if query_counts:
            for row in reader:
                cog_id = row[column_to_match]
                hits = query_counts.get(cog_id, 0)
                if not hits:
                    continue
                row["query_id"] = cog_id
                row["cog_id"] = row[column_to_match]
                row["query"] = row["sequence_name"]
                row["closest"] = row["sequence_name"]
                row["source"] = source
                in_cog_metadata.extend([row] * hits)
                in_cog_names[row[column_to_match]] = row["sequence_name"]

    print(
        qcfunk.green(f"Number of query records found in tree:") +
        f" {len(in_cog_metadata)}")
    with open(args.in_metadata, "w") as fw:
        header_names.extend(
            ["query_id", "cog_id", "query", "closest", "source"])
        writer = csv.DictWriter(fw,
                                fieldnames=header_names,
                                lineterminator='\n')
        writer.writeheader()
        writer.writerows(in_cog_metadata)

    # Invert query id -> sequence_name so each FASTA record is resolved with
    # a single lookup; the per-sequence lists keep the original match order.
    names_by_sequence = {}
    for name, sequence_name in in_cog_names.items():
        names_by_sequence.setdefault(sequence_name, []).append(name)

    status = "on_climb" if args.all_cog else "in_phylogeny"
    found = set()
    with open(args.in_seqs, "w") as fw:
        for record in SeqIO.parse(args.cog_seqs, "fasta"):
            for name in names_by_sequence.get(record.id, ()):
                found.add(name)
                fw.write(
                    f">{name} sequence_name={record.id} status={status}\n{record.seq}\n"
                )

    # A query counts as found only if its sequence was written above.
    missing = [query for query in query_names if query not in found]
    with open(args.not_in_cog, "w") as fw:
        fw.write(f"{input_column}\n")
        fw.writelines(f"{query}\n" for query in missing)

    if missing:
        print(qcfunk.cyan("\nNot found in phylogeny:"))
        print("".join(f"\t- {query}\n" for query in missing))
Exemplo n.º 15
0
def _check_date_range_value(value, label, date_format):
    """Exit with status -1 if string ``value`` does not match ``date_format``.

    Empty and non-string values are left alone; ``label`` ("start" or "end")
    only feeds the error message.
    """
    if value and isinstance(value, str):
        try:
            dt.datetime.strptime(value, date_format)
        except ValueError:
            print(
                qcfunk.cyan(
                    f'date range {label} in incorrect format. Please use i.e. YYYY-MM-DD'
                ))
            sys.exit(-1)


def local_lineages_qc(config):
    """Validate the configuration for local lineage analysis.

    Does nothing unless ``config["local_lineages"]`` is truthy. Otherwise it:

    * requires ``adm2`` and ``uk_lineage`` in
      ``config["background_metadata_header"]``;
    * when ``config["date_restriction"]`` is set, checks that string values
      of ``date_range_start`` / ``date_range_end`` are ``YYYY-MM-DD`` and
      reports the restriction that applies;
    * rejects date range values supplied without ``date_restriction``.

    Every failed check prints a message and exits with status -1.
    """
    if not config["local_lineages"]:
        return

    date_format = "%Y-%m-%d"

    if "adm2" not in config["background_metadata_header"]:
        sys.stderr.write(
            qcfunk.cyan(
                'Error: no geographic information found for local lineage analysis. Please provide a column in the background metadata with the header "adm2"\n'
            ))
        sys.exit(-1)
    elif "uk_lineage" not in config["background_metadata_header"]:
        sys.stderr.write(
            qcfunk.cyan(
                'Error: no uk lineage information found for local lineage analysis. Please provide a column in the background metadata with the header "uk_lineage"\n'
            ))
        sys.exit(-1)

    if config["date_restriction"]:
        _check_date_range_value(config["date_range_start"], "start",
                                date_format)
        _check_date_range_value(config["date_range_end"], "end", date_format)

        if config["date_range_start"] and config["date_range_end"]:
            print(
                qcfunk.green(
                    f"Local lineage analysis restricted to {config['date_range_start']} to {config['date_range_end']}"
                ))
        elif config["date_range_start"]:
            print(
                qcfunk.green(
                    f"Local lineage analysis restricted to {config['date_range_start']} to present"
                ))
        else:
            # No usable start date: the message reports the configured
            # date_window instead.
            print(
                qcfunk.green(
                    f"Local lineage analysis restricted to {config['date_window']} days around the sampling range"
                ))

    elif config['date_range_start'] or config["date_range_end"]:
        # Dates without the flag are treated as a configuration mistake.
        print(
            qcfunk.cyan(
                "Date restriction data provided but --date-restriction flag not used. Please use --date-restriction flag in config or command line."
            ))
        sys.exit(-1)

    else:
        print(
            qcfunk.green(
                f"Local lineage analysis not restricted by time, will show background lineage composition for the whole of the epidemic"
            ))
Exemplo n.º 16
0
def _resolve_background_file(arg_value, config_key, label, cwd, config):
    """Return the path of one optional background file, or "" when unset.

    A command-line value (``arg_value``) takes precedence and is resolved
    against ``cwd``; otherwise a truthy ``config[config_key]`` is resolved
    against ``config["path_to_query"]``. ``~`` is expanded in both cases.
    ``label`` ("metadata", "tree" or "fasta") only feeds the error message.

    Exits with status -1 when the resolved path does not exist.
    """
    if arg_value:
        file_path = os.path.join(cwd, os.path.expanduser(arg_value))
    elif config_key in config and config[config_key]:
        file_path = os.path.join(config["path_to_query"],
                                 os.path.expanduser(config[config_key]))
    else:
        return ""

    if not os.path.exists(file_path):
        sys.stderr.write(
            qcfunk.cyan(f"Error: can't find {label} file at {file_path}.\n"))
        sys.exit(-1)
    return file_path


def get_datadir(args_climb, args_uun, args_datadir, args_metadata, args_tree,
                args_fasta, cwd, config):
    """Locate the background data and record the result in ``config``.

    Explicit background files are resolved first (command-line argument,
    then the ``background_metadata`` / ``background_tree`` /
    ``background_sequences`` config entries). The data directory is then
    chosen from, in order: the fixed CLIMB path when ``args_climb`` is set,
    ``args_datadir`` relative to ``cwd``, ``config["datadir"]`` relative to
    ``config["path_to_query"]``, or ``<cwd>/civet-cat`` when the config entry
    is empty.

    For a local run (``config["remote"]`` falsy on entry) the directory must
    exist and ``get_background_files`` must yield an existing tree, sequence
    and metadata file; their paths and the data date are stored in
    ``config``. For a remote run the work is handed to ``get_remote_data``
    together with ``args_uun``.

    Exits with status -1 when a named file, the CLIMB path or the local data
    cannot be found. ``config["datadir"]`` is always set on return.
    """
    # Read once on entry; the --CLIMB branch below resets config["remote"]
    # but the local/remote decision further down uses this value.
    remote = config["remote"]
    cog_all = False

    background_metadata = _resolve_background_file(
        args_metadata, "background_metadata", "metadata", cwd, config)
    background_tree = _resolve_background_file(
        args_tree, "background_tree", "tree", cwd, config)
    # The --fasta branch used to report a missing "metadata file"; it now
    # names the fasta file it was actually looking for.
    background_seqs = _resolve_background_file(
        args_fasta, "background_sequences", "fasta", cwd, config)

    data_dir = ""
    if args_climb:
        data_dir = "/cephfs/covid/bham/results/phylogenetics/latest/civet/cog"
        if not os.path.exists(data_dir):
            sys.stderr.write(
                qcfunk.cyan(
                    f"Error: --CLIMB argument called, but CLIMB data path doesn't exist.\n"
                ))
            sys.exit(-1)
        config["remote"] = False
        config["username"] = ""

    elif args_datadir:
        data_dir = os.path.join(cwd, args_datadir)

    elif "datadir" in config:
        if config["datadir"]:
            expanded_path = os.path.expanduser(config["datadir"])
            data_dir = os.path.join(config["path_to_query"], expanded_path)
        else:
            data_dir = os.path.join(cwd, "civet-cat")

    # NOTE(review): with --CLIMB on a run that started as remote,
    # config["remote"] is now False yet the remote branch is still taken
    # because `remote` was read on entry - confirm this is intended.
    if not remote:
        if not os.path.exists(data_dir):
            print_data_error(data_dir)
            sys.exit(-1)

        background_seqs, background_tree, background_metadata, data_date, background_metadata_all = get_background_files(
            data_dir, background_metadata, background_tree, background_seqs,
            cog_all)

        config["datadir"] = data_dir
        config["data_date"] = data_date

        required_files = (background_tree, background_seqs,
                          background_metadata)
        if not all(os.path.isfile(path) for path in required_files):
            print_data_error(data_dir)
            sys.exit(-1)

        config["background_metadata"] = background_metadata
        config["background_seqs"] = background_seqs
        config["background_tree"] = background_tree
        config["background_metadata_all"] = background_metadata_all

        print("Found data:")
        print("    -", background_seqs)
        print("    -", background_metadata)
        print("    -", background_metadata_all)
        print("    -", background_tree, "\n")

    else:
        get_remote_data(args_uun, background_metadata, background_tree,
                        background_seqs, data_dir, config)

    config["datadir"] = data_dir