def test_fi1(fi1_df, measure, expected_type):
    """Classify the selected fi1 measure and check the expected type."""
    series = fi1_df[measure]
    clf = MeasureClassifier(default_config())
    report = MeasureClassifier.meta_measures(series)
    assert clf.classify(report) == expected_type
def parse_config(args):
    """Build a phenotype-import configuration from parsed CLI arguments.

    Starts from ``default_config()`` and overlays every option the user
    supplied on the command line.

    :param args: an ``argparse.Namespace`` produced by ``main``'s parser.
    :returns: the populated configuration object.
    """
    config = default_config()
    config.verbose = args.verbose
    config.instruments.dir = args.instruments
    config.pedigree = args.pedigree
    config.db.filename = args.output

    # Measures to skip may come from a file, from a comma-separated CLI
    # list, or from both; the result is the union of the two sources.
    skip_columns = set()
    if args.skip_file:
        # NOTE(review): assert is stripped under `python -O`; a raised
        # error would be more robust, but callers may rely on
        # AssertionError, so the check is kept as-is.
        assert os.path.exists(args.skip_file)
        with open(args.skip_file, "r") as infile:
            skip_columns |= {line.strip() for line in infile}
    if args.skip_columns:
        skip_columns |= set(args.skip_columns.split(","))
    config.skip.measures = skip_columns

    if args.composite_fids:
        config.family.composite_key = args.composite_fids

    if args.role:
        config.person.role.type = args.role
        assert config.person.role.type in {"column", "guess"}
    if args.role_mapping:
        config.person.role.mapping = args.role_mapping
        assert config.person.role.mapping in {"SPARK", "SSC", "INTERNAL"}
    if args.person_column:
        config.person.column = args.person_column

    # Classification boundaries: override only when the user passed an
    # explicit non-negative value.
    if args.min_individuals is not None and args.min_individuals >= 0:
        config.classification.min_individuals = args.min_individuals
    if args.categorical is not None and args.categorical >= 0:
        config.classification.categorical.min_rank = args.categorical
    if args.ordinal is not None and args.ordinal >= 0:
        config.classification.ordinal.min_rank = args.ordinal
    if args.continuous is not None and args.continuous >= 0:
        config.classification.continuous.min_rank = args.continuous

    if args.tab_separated:
        config.instruments.tab_separated = True
    if args.report_only:
        # Report-only runs must not create a database file on disk.
        config.db.filename = "memory"
        config.report_only = args.report_only
    if args.parallel:
        config.parallel = args.parallel
    return config
def test_fake_phenotype_data_ordinal_m4(fake_phenotype_data):
    """Measure i1.m4 (9 distinct values over 195 rows) classifies as ordinal."""
    mid = "i1.m4"
    values_df = fake_phenotype_data.get_measure_values_df(mid)

    assert len(values_df[mid].unique()) == 9
    assert len(values_df) == 195

    clf = MeasureClassifier(default_config())
    meta = clf.meta_measures(values_df[mid])
    assert clf.classify(meta) == MeasureType.ordinal
def parse_phenotype_data_config(args):
    """Build the phenotype-data configuration from parsed CLI arguments.

    The config is dumped (for logging/inspection) and validated before
    being returned; the validation result is not acted upon here.
    """
    config = default_config()

    config.verbose = args.verbose
    config.pedigree = args.pedigree
    config.instruments.dir = args.instruments
    config.db.filename = args.pheno_db_filename

    dump_config(config)
    check_phenotype_data_config(config)

    return config
def test_should_convert_to_numeric_cutoff():
    """Raising the non-numeric cutoff flips classification to ordinal."""
    values = pd.Series(data=["1", "2", "1", "1", "1", "1", "2", "2", "a"])
    report = MeasureClassifier.meta_measures(values)

    config = default_config()
    config.classification.min_individuals = 1
    config.classification.ordinal.min_rank = 2

    # With the default cutoff the single "a" keeps the measure categorical.
    assert MeasureClassifier(config).classify(report) == MeasureType.categorical

    # Tolerating up to 20% non-numeric values lets it be treated as ordinal.
    config.classification.non_numeric_cutoff = 0.2
    assert MeasureClassifier(config).classify(report) == MeasureType.ordinal
def test_fake_background_classify(fake_background_df):
    """All non-ID columns classify as text/raw/categorical and stringify cleanly."""
    # Skip the first column — presumably the person/ID column; verify
    # against the fixture if this assumption ever changes.
    for column in list(fake_background_df.columns)[1:]:
        series = fake_background_df[column]

        clf = MeasureClassifier(default_config())
        meta = MeasureClassifier.meta_measures(series)
        mtype = clf.classify(meta)
        assert mtype in (
            MeasureType.text,
            MeasureType.raw,
            MeasureType.categorical,
        )

        converted = clf.convert_to_string(series.values)
        non_null = [v for v in converted if v is not None]
        assert all(isinstance(v, str) for v in non_null)
def main(argv=None):  # IGNORE:C0111
    """Command line options."""
    # Entry point for the pheno-import CLI: parses arguments, builds the
    # configuration, and runs pedigree/variable preparation.
    # Returns 0 on success, 1 on Ctrl-C, 2 on any other error.
    if argv is None:
        argv = sys.argv
    else:
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    # Second line of the calling script's module docstring is used as the
    # short description in the usage text.
    program_shortdesc = __import__("__main__").__doc__.split("\n")[1]
    program_license = """%s

USAGE
""" % (program_shortdesc, )

    try:
        defaults = default_config()
        # Setup argument parser
        parser = ArgumentParser(description=program_license)
        # formatter_class=RawDescriptionHelpFormatter
        # formatter_class=ArgumentDefaultsHelpFormatter)
        parser.add_argument("-V", "--verbose", dest="verbose", action="count",
                            help="set verbosity level", default=0)
        parser.add_argument(
            "-i", "--instruments",
            dest="instruments",
            help="directory where all instruments are located",
            metavar="path",
        )
        parser.add_argument(
            "-p", "--pedigree",
            dest="pedigree",
            help="pedigree file where families descriptions are located",
            metavar="path",
        )
        parser.add_argument(
            "-d", "--description",
            help="standardized tsv file that contains measure descriptions",
        )
        parser.add_argument(
            "-o", "--output",
            dest="output",
            help="output file",
            metavar="filename",
        )
        # Classification boundaries default to the values in default_config().
        parser.add_argument(
            "-C", "--continuous",
            type=int,
            dest="continuous",
            default=defaults["classification"]["continuous"]["min_rank"],
            help="minimal count of unique values for a measure to be "
            "classified as continuous (default: %(default)s)",
        )
        parser.add_argument(
            "-O", "--ordinal",
            type=int,
            dest="ordinal",
            default=defaults["classification"]["ordinal"]["min_rank"],
            help="minimal count of unique values for a measure to be "
            "classified as ordinal (default: %(default)s)",
        )
        parser.add_argument(
            "-A", "--categorical",
            type=int,
            dest="categorical",
            default=defaults["classification"]["categorical"]["min_rank"],
            help="minimal count of unique values for a measure to be "
            "classified as categorical (default: %(default)s)",
        )
        parser.add_argument(
            "-I", "--min-individuals",
            type=int,
            dest="min_individuals",
            default=defaults["classification"]["min_individuals"],
            help="minimal number of individuals for a measure to be "
            "considered for classification (default: %(default)s)",
        )
        parser.add_argument(
            "-S", "--skip-columns",
            type=str,
            dest="skip_columns",
            help="comma separated list of instruments columns to skip",
        )
        parser.add_argument(
            "--skip-file",
            type=str,
            dest="skip_file",
            help="file with list of instruments columns to skip",
        )
        parser.add_argument(
            "--composite-fids",
            action="store_true",
            dest="composite_fids",
            help="builds composite family IDs from parents' IDs"
            " (default: %(default)s)",
        )
        parser.add_argument(
            "-r", "--role",
            dest="role",
            default=defaults["person"]["role"]["type"],
            help='sets role handling; available choices: "column", "guess"'
            " (default: %(default)s)",
        )
        parser.add_argument(
            "--role-mapping",
            dest="role_mapping",
            default=defaults["person"]["role"]["mapping"],
            help="sets role column mapping rules; "
            'available choices "SPARK", "SSC", "INTERNAL"'
            " (default: %(default)s)",
        )
        parser.add_argument(
            "-P", "--person-column",
            dest="person_column",
            # default=defaults['person']['role']['column'],
            help="sets name of a column in instrument's files, "
            "containing personId (default: %(default)s)",
        )
        parser.add_argument(
            "-T", "--tab-separated",
            dest="tab_separated",
            action="store_true",
            help="instruments file are tab separated"
            " (default: %(default)s)",
        )
        parser.add_argument(
            "--report-only",
            dest="report_only",
            action="store_true",
            help="runs the tool in report only mode (default: %(default)s)",
        )
        parser.add_argument(
            "--parallel",
            type=int,
            dest="parallel",
            default=defaults["parallel"],
            help="size of executors pool to use for processing"
            " (default: %(default)s)",
        )

        # Process arguments
        args = parser.parse_args()

        # Map the -V count onto a logging level; no -V means errors only.
        if args.verbose == 1:
            logging.basicConfig(level=logging.WARNING)
        elif args.verbose == 2:
            logging.basicConfig(level=logging.INFO)
        elif args.verbose >= 3:
            logging.basicConfig(level=logging.DEBUG)
        else:
            logging.basicConfig(level=logging.ERROR)

        # An output filename is mandatory unless running report-only;
        # report-only runs fall back to a placeholder name.
        if not args.output and not args.report_only:
            raise CLIError("output filename should be specified")
        if not args.output:
            args.output = "output.db"
        if not args.pedigree:
            raise CLIError("pedigree file must be specified")
        if not args.instruments:
            raise CLIError("instruments directory should be specified")

        config = parse_config(args)
        dump_config(config)
        if not check_phenotype_data_config(config):
            raise Exception("bad classification boundaries")

        # Refuse to clobber an existing output database.
        if os.path.exists(args.output):
            raise CLIError("output file already exists")

        prep = PrepareVariables(config)
        prep.build_pedigree(args.pedigree)
        prep.build_variables(args.instruments, args.description)

        return 0
    except KeyboardInterrupt:
        return 1
    except Exception as e:
        # Top-level boundary: print the traceback, report the error on
        # stderr, and exit with a distinct status code.
        traceback.print_exc()

        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + " for help use --help\n")
        return 2
def test_config(temp_dbfile):
    """Default configuration pointed at a temporary database file."""
    cfg = default_config()
    cfg.db.filename = temp_dbfile
    return Box(cfg.to_dict())