class ImportanceAnalysis(object):
    """Model for random forest based importance analysis.

    Thin Python wrapper around a JVM-side importance analysis object
    (``_jia``); every method delegates to it.
    """

    def __init__(self, _jia, sql):
        """Store the JVM analysis handle and the SQL context.

        :param _jia: the JVM-side importance analysis object.
        :param sql: the object whose ``table`` method is used to look up
            the temporary view registered by :meth:`variable_importance`.
        """
        self._jia = _jia
        self.sql = sql

    @params(self=object, limit=Nullable(int))
    def important_variables(self, limit=10):
        """Get the top ``limit`` important variables.

        :param int limit: value forwarded to the JVM as the number of
            variables to fetch.
        :return: a list of tuples ``(name, importance)`` sorted by
            importance in descending order, where:

            - name: string - variable name
            - importance: double - gini importance
        """
        jimpvarmap = self._jia.importantVariablesJavaMap(limit)
        return sorted(jimpvarmap.items(), key=lambda x: x[1], reverse=True)

    def oob_error(self):
        """OOB (Out of Bag) error estimate for the model.

        :rtype: float
        """
        return self._jia.oobError()

    def variable_importance(self):
        """Return a DataFrame with the gini importance of variables.

        The DataFrame has two columns:

        - variable: string - variable name
        - importance: double - gini importance

        The JVM DataFrame is registered under the temporary view name
        ``"df"`` and then read back through ``self.sql``.
        """
        jdf = self._jia.variableImportance()
        # NOTE(review): presumably forces evaluation of the frame before it
        # is registered -- confirm whether this is still required.
        jdf.count()
        # `createTempView` raises if a view called "df" already exists, so a
        # second call on the same session used to fail; replacing the view
        # makes the method safely repeatable.
        jdf.createOrReplaceTempView("df")
        return self.sql.table("df")
class ImportanceAnalysis(object):
    """Random forest based importance analysis model (Hail flavour).

    Wraps a JVM-side analysis object and exposes its results through
    Hail types.
    """

    def __init__(self, hc, _jia):
        """Keep the Hail context and the JVM analysis handle.

        :param hc: context object handed to :py:class:`hail.KeyTable`.
        :param _jia: the JVM-side importance analysis object.
        """
        self._jia = _jia
        self.hc = hc

    @property
    def oob_error(self):
        """OOB (Out of Bag) error estimate for the model.

        :rtype: float
        """
        estimate = self._jia.oobError()
        return estimate

    @params(self=object, n_limit=Nullable(int))
    def important_variants(self, n_limit=1000):
        """Get the top n most important loci.

        :param int n_limit: the limit of the number of loci to return
        :return: A KeyTable with the variant in the first column and
            importance in the second.
        :rtype: :py:class:`hail.KeyTable`
        """
        context = self.hc
        jtable = self._jia.variantImportance(n_limit)
        return KeyTable(context, jtable)
class ParameterDoc(object):
    """The documentation data of a parameter or return value for an Eluna method."""

    # The integer ranges that each C++ type is valid for.
    # None means valid for all numbers.
    valid_ranges = {
        'float': None,
        'double': None,
        # This should be -32767..32767, but it's pretty safe to assume 32-bit.
        'int': ('-2,147,483,647', '2,147,483,647'),
        'int8': ('-127', '127'),
        'uint8': ('0', '255'),
        'int16': ('-32,767', '32,767'),
        'uint16': ('0', '65,535'),
        'int32': ('-2,147,483,647', '2,147,483,647'),
        'uint32': ('0', '4,294,967,295'),
    }

    @params(self=object, name=unicode, data_type=str, description=unicode,
            default_value=Nullable(unicode))
    def __init__(self, name, data_type, description, default_value=None):
        """Build the documentation entry and normalise its type and text.

        :param name: the name of the parameter or returned value.
        :param data_type: the C++/Lua type name as written in the source
            comment; rewritten to its Lua equivalent where one is known.
        :param description: free text; when non-empty, its first letter is
            capitalised, a period is appended and the result is rendered
            as Markdown.
        :param default_value: the default value of the parameter, if any.
        """
        self.name = name
        self.data_type = data_type
        self.default_value = default_value

        if description:
            # Capitalize the first letter, add a period, and parse as Markdown.
            self.description = '{}{}. '.format(description[0].capitalize(),
                                               description[1:])
            self.description = markdown.markdown(self.description)
        else:
            self.description = ''

        # If the data type is a C++ number, convert to Lua number and add
        # range info to description.  Membership is tested against the
        # table itself so that every documented type (including plain
        # `int`, which the former hard-coded list omitted) is handled.
        if self.data_type in ParameterDoc.valid_ranges:
            bounds = ParameterDoc.valid_ranges[self.data_type]
            if bounds:
                self.description += '<p><em>Valid numbers</em>: integers from {0} to {1}.</p>'.format(
                    bounds[0], bounds[1])
            else:
                self.description += '<p><em>Valid numbers</em>: all decimal numbers.</p>'
            self.data_type = 'number'
        elif self.data_type == 'bool':
            self.data_type = 'boolean'
        elif self.data_type == 'uint64' or self.data_type == 'int64':
            # 64-bit integers are documented as strings.
            self.data_type = 'string'
def test_returns_nullable(self):
    """A `Nullable(int)` return type accepts ints and None, rejects others."""

    @returns(Nullable(int))
    def foo(x):
        return x

    # should not raise anything
    for accepted in (1, None):
        foo(accepted)

    self.assertRaises(TypeError, foo, 'a')
def test_params_nullable_type(self):
    """A `Nullable(int)` parameter accepts ints and None, rejects others."""

    @params(a=Nullable(int))
    def foo(a=None):
        pass

    # should not raise anything
    for accepted in (0, None):
        foo(accepted)

    self.assertRaises(TypeError, foo, 'a')
class FeatureSource(object):
    """Python handle on a JVM feature source used to train importance models."""

    def __init__(self, _jvm, _vs_api, _jsql, sql, _jfs):
        """Remember the JVM handles needed to launch an analysis.

        :param _jvm: JVM gateway used to reach ``RandomForestParams``.
        :param _vs_api: JVM API object exposing ``ImportanceAnalysis``.
        :param _jsql: JVM-side SQL context passed to the analysis.
        :param sql: Python-side SQL context handed to the resulting model.
        :param _jfs: the JVM-side feature source.
        """
        self._jvm = _jvm
        self._vs_api = _vs_api
        self._jsql = _jsql
        self.sql = sql
        self._jfs = _jfs

    @params(self=object, label_source=object, n_trees=Nullable(int),
            mtry_fraction=Nullable(float), oob=Nullable(bool),
            seed=Nullable(Union(int, long)), batch_size=Nullable(int),
            var_ordinal_levels=Nullable(int), max_depth=int,
            min_node_size=int)
    def importance_analysis(self, label_source, n_trees=1000,
                            mtry_fraction=None, oob=True, seed=None,
                            batch_size=100, var_ordinal_levels=3,
                            max_depth=java.MAX_INT, min_node_size=1):
        """Builds random forest classifier.

        :param label_source: The ingested label source
        :param int n_trees: The number of trees to build in the forest.
        :param float mtry_fraction: The fraction of variables to try at each split.
        :param bool oob: Should OOB error be calculated.
        :param long seed: Random seed to use.  A random value is drawn on
            every call and handed to ``java.jlong_or`` as the fallback.
        :param int batch_size: The number of trees to build in one batch.
        :param int var_ordinal_levels: forwarded unchanged to the JVM
            ``ImportanceAnalysis`` call.
        :param int max_depth: forwarded unchanged to ``RandomForestParams``.
        :param int min_node_size: forwarded unchanged to ``RandomForestParams``.

        :return: Importance analysis model.
        :rtype: :py:class:`ImportanceAnalysis`
        """
        make_rf_params = self._jvm.au.csiro.variantspark.algo.RandomForestParams

        # The literal True / java.NAN / True / False / 0 arguments are fixed
        # settings of the JVM constructor; their meaning is defined there.
        use_oob = bool(oob)
        jmtry = java.jfloat_or(mtry_fraction)
        nan_value = java.NAN
        fallback_seed = randint(java.MIN_LONG, java.MAX_LONG)
        jseed = java.jlong_or(seed, fallback_seed)
        forest_params = make_rf_params(use_oob, jmtry, True, nan_value, True,
                                       jseed, max_depth, min_node_size,
                                       False, 0)

        analysis = self._vs_api.ImportanceAnalysis(
            self._jsql, self._jfs, label_source, forest_params, n_trees,
            batch_size, var_ordinal_levels)
        return ImportanceAnalysis(analysis, self.sql)
class VariantsDatasetFunctions(object):
    """Extension to hail.VariantDataset with variant-spark related functions."""

    def __init__(self, *args, **kwargs):
        # Re-assigning the attributes to themselves checks that the
        # VariantDataset fields we rely on have been initialized: reading a
        # missing one raises AttributeError.
        self.hc = self.hc
        self._jvds = self._jvds

        # Create the Java bridge object
        hail_package = getattr(self.hc._jvm, 'au.csiro.variantspark.hail')
        self._vshf_cache = hail_package.VSHailFunctions(self._jvds)

    @params(self=object, y_expr=str, n_trees=Nullable(int),
            mtry_fraction=Nullable(float), oob=Nullable(bool),
            seed=Nullable(Union(int, long)), batch_size=Nullable(int))
    def importance_analysis(self, y_expr, n_trees=1000, mtry_fraction=None,
                            oob=True, seed=None, batch_size=100):
        """Builds random forest classifier for the response variable defined with y_expr.

        :param str y_expr: Response expression. Must evaluate to Boolean or
            numeric with all values 0 or 1.
        :param int n_trees: The number of trees to build in the forest.
        :param float mtry_fraction: The fraction of variables to try at each split.
        :param bool oob: Should OOB error be calculated.
        :param long seed: Random seed to use.
        :param int batch_size: The number of trees to build in one batch.

        :return: Importance analysis model.
        :rtype: :py:class:`ImportanceAnalysis`
        """
        context = self.hc
        run_analysis = self._vshf_cache.importanceAnalysis

        # The seed is widened to `long` before being wrapped; an absent
        # seed is wrapped as None.
        if seed is not None:
            seed_value = long(seed)
        else:
            seed_value = None

        jia = run_analysis(y_expr, n_trees, joption(mtry_fraction), oob,
                           joption(seed_value), batch_size)
        return ImportanceAnalysis(context, jia)

    @params(self=object, operation_name=str)
    def pairwise_operation(self, operation_name):
        """Computes a pairwise operation on encoded genotypes.

        Currently implemented operations include:

        - `manhattan` : the Manhattan distance
        - `euclidean` : the Euclidean distance
        - `sharedAltAlleleCount`: count of shared alternative alleles
        - `anySharedAltAlleleCount`: count of variants that share at least
          one alternative allele

        :param operation_name: name of the operation. One of `manhattan`,
            `euclidean`, `sharedAltAlleleCount`, `anySharedAltAlleleCount`
        :return: A symmetric `no_of_samples x no_of_samples` matrix with
            the result of the pairwise computation.
        :rtype: :py:class:`hail.KinshipMatrix`
        """
        jmatrix = self._vshf_cache.pairwiseOperation(operation_name)
        return KinshipMatrix(jmatrix)
class ClassParser(object):
    """Parses a file line-by-line and collects methods once enough information is received to build them.

    The parser is a small state machine: `last_regex` is the current state,
    `next_regexes` lists which patterns may match the following line, and
    `regex_handlers` says what to do with a match.  Completed methods are
    appended to `self.methods`.
    """

    # Various regular expressions to parse different parts of the doc string.

    # These are used to parse the class's description.
    class_start_regex = re.compile(
        r"\s*/\*\*\*")  # The start of class documentation, i.e. /***
    class_body_regex = re.compile(
        r"\s*\*\s*(.*)"
    )  # The "body", i.e. a * and optionally some descriptive text.
    class_end_regex = re.compile(
        r"\s*\*/")  # The end of the comment portion, i.e. */

    # These are used to parse method documentation.
    start_regex = re.compile(
        r"\s*/\*\*")  # The start of documentation, i.e. /**
    body_regex = re.compile(
        r"\s*\s?\*\s*(.*)"
    )  # The "body", i.e. a * and optionally some descriptive text.
    # An extra optional space (\s?) was thrown in to make it different from `class_body_regex`.
    param_regex = re.compile(
        r"""\s*\*\s@param\s  # The @param tag starts with opt. whitespace followed by "* @param ".
            ([&\w]+)\s(\w+)  # The data type, a space, and the name of the param.
            (?:\s=\s(\w+))?  # The default value: a = surrounded by spaces, followed by text.
            (?:\s:\s(.+))?   # The description: a colon surrounded by spaces, followed by text.
        """, re.X)
    # This is the same as the @param tag, minus the default value part.
    return_regex = re.compile(
        r"""\s*\*\s@return\s
            ([&\w]+)\s(\w+)
            (?:\s:\s(.+))?
        """, re.X)
    # NOTE(review): this pattern is identical to `class_end_regex`.  If
    # `re.compile` hands back the same cached object for both, they are a
    # single key in the two tables below and the later entry wins -- confirm.
    comment_end_regex = re.compile(
        r"\s*\*/")  # The end of the comment portion, i.e. */
    end_regex = re.compile(
        r"\s*int\s(\w+)\s*\("
    )  # The end of the documentation, i.e. int MethodName(

    def __init__(self, class_name):
        """Create a parser for the class called `class_name`."""
        # The tables below are keyed on the compiled-regex objects, so the
        # two body patterns must be distinct objects.
        assert ClassParser.class_body_regex is not ClassParser.body_regex

        # The methods that have been parsed.
        self.methods = []
        # The name of the class being parsed.
        self.class_name = class_name
        # The description of the class being parsed.
        self.class_description = ''
        # Reset the parser's state machine.
        self.reset()

    def reset(self):
        """Return to the initial state and discard any partially parsed method."""
        # What the last handled regex was, to determine what the next should be.
        self.last_regex = None

        # These are used to piece together the next `Method`.
        self.description = ''
        self.params = []
        self.returned = []
        self.method_name = None

    def handle_class_body(self, match):
        """Append one line of text to the class description."""
        text = match.group(1)
        self.class_description += text + '\n'

    def handle_body(self, match):
        """Append one line of text to the current method's description."""
        text = match.group(1)
        self.description += text + '\n'

    def handle_param(self, match):
        """Record a `@param` line as a `ParameterDoc` of the current method."""
        data_type, name, default, description = match.group(1), match.group(
            2), match.group(3), match.group(4)
        self.params.append(ParameterDoc(name, data_type, description, default))

    def handle_return(self, match):
        """Record a `@return` line as a `ParameterDoc` of the current method."""
        data_type, name, description = match.group(1), match.group(
            2), match.group(3)
        self.returned.append(ParameterDoc(name, data_type, description))

    def handle_end(self, match):
        """Finish the current method: take its name from the `int Name(` line and store it."""
        self.method_name = match.group(1)
        self.methods.append(
            MethodDoc(self.method_name, self.description, self.params,
                      self.returned))

    # Table of which handler is used to handle each regular expression.
    # A value of None means the match only advances the state machine.
    regex_handlers = {
        class_start_regex: None,
        class_body_regex: handle_class_body,
        class_end_regex: None,
        start_regex: None,
        body_regex: handle_body,
        param_regex: handle_param,
        return_regex: handle_return,
        comment_end_regex: None,
        end_regex: handle_end,
    }

    # Table of which regular expressions can follow the last handled regex.
    # `body_regex` must always come LAST when used, since it also matches param, return, and comment_end.
    # An empty list means the next line always resets the parser.
    next_regexes = {
        None: [class_start_regex, start_regex, end_regex],
        class_start_regex: [class_end_regex, class_body_regex],
        class_body_regex: [class_end_regex, class_body_regex],
        class_end_regex: [],
        start_regex:
        [param_regex, return_regex, comment_end_regex, body_regex],
        body_regex: [param_regex, return_regex, comment_end_regex, body_regex],
        param_regex: [param_regex, return_regex, comment_end_regex],
        return_regex: [return_regex, comment_end_regex],
        comment_end_regex: [end_regex],
        end_regex: [],
    }

    @returns(Nullable(MethodDoc))
    @params(self=object, line=str)
    def next_line(self, line):
        """Parse the next line of the file.

        The line is tried against each pattern allowed in the current
        state; the first match runs its handler (if any) and becomes the
        new state.  If nothing matches, the parser is reset.

        This method has no `return` statement, so it always yields None;
        a completed method is appended to `self.methods` by `handle_end`.
        """
        # Get the list of expected regular expressions using the last one handled.
        valid_regexes = self.next_regexes[self.last_regex]

        # Try to find a match.
        for regex in valid_regexes:
            match = regex.match(line)

            if match:
                handler = self.regex_handlers[regex]

                if handler:
                    # Handlers are stored as plain functions taken from the
                    # class body, so `self` is passed explicitly.
                    handler(self, match)

                # Not every regex has a handler, but keep track of where we are anyway.
                self.last_regex = regex
                # Break at the first match.
                break
        else:
            # No valid regex was found, reset everything.
            self.reset()

    @returns(MangosClassDoc)
    def to_class_doc(self):
        """Create an instance of `MangosClassDoc` from the parser's data.

        Is called by `parse_file` once parsing is finished.
        """
        return MangosClassDoc(self.class_name, self.class_description,
                              self.methods)

    @staticmethod
    @returns(MangosClassDoc)
    @params(file=FileType)
    def parse_file(file):
        """Parse the file `file` into a documented class."""
        # Get the class name from "ClassMethods.h" by stripping off "Methods.h".
        # The slice simply drops the last len('Methods.h') characters of
        # `file.name`; it does not check that the name has that suffix.
        class_name = file.name[:-len('Methods.h')]
        parser = ClassParser(class_name)

        line = file.readline()

        # `readline` returns an empty string at end of file, ending the loop.
        while line:
            parser.next_line(line)
            line = file.readline()

        return parser.to_class_doc()
class ClassParser(object):
    """Parses a file line-by-line and collects methods once enough information is received to build them.

    The parser is a small state machine: `last_regex` is the current state,
    `next_regexes` lists which patterns may match the following line, and
    `regex_handlers` says what to do with a match.  Completed methods are
    appended to `self.methods`.
    """

    # Various regular expressions to parse different parts of the doc string.

    # These are used to parse the class's description.
    class_start_regex = re.compile(
        r"\s*/\*\*\*")  # The start of class documentation, i.e. /***
    class_body_regex = re.compile(
        r"\s*\*\s*(.*)"
    )  # The "body", i.e. a * and optionally some descriptive text.
    class_end_regex = re.compile(
        r"\s*\*/")  # The end of the comment portion, i.e. */

    # These are used to parse method documentation.
    start_regex = re.compile(
        r"\s*/\*\*")  # The start of documentation, i.e. /**
    body_regex = re.compile(
        r"\s*\s?\*\s?(.*)"
    )  # The "body", i.e. a * and optionally some descriptive text.
    # An extra optional space (\s?) was thrown in to make it different from `class_body_regex`.
    param_regex = re.compile(
        r"""\s*\*\s@param\s  # The @param tag starts with opt. whitespace followed by "* @param ".
            ([^\s]+)\s(\w+)?  # The data type, a space, and the name of the param.
            (?:\s=\s(\w+))?  # The default value: a = surrounded by spaces, followed by text.
            (?:\s:\s(.+))?   # The description: a colon surrounded by spaces, followed by text.
        """, re.X)
    # This is the same as the @param tag, minus the default value part.
    return_regex = re.compile(
        r"""\s*\*\s@return\s
            ([\[\]\w]+)\s(\w+)
            (?:\s:\s(.+))?
        """, re.X)
    # NOTE: `handle_proto` treats group 1 (before the "= ") as the return
    # values and group 2 (inside the parentheses) as the parameters, i.e.
    # the expected form is "@proto ret1, ret2 = (arg1, arg2)".  The labels
    # in the pattern's inline comments read the other way round.
    proto_regex = re.compile(
        r"""\s*\*\s@proto\s
            ([\w\s,]+)?          # The list of arguments.
            (?:=\s)?             # An equals sign and a space separate the args and returns.
            (?:\(([\w\s,]+)\))?  # The list of return values, in parens.
        """, re.X)
    # NOTE(review): this pattern is identical to `class_end_regex`.  If
    # `re.compile` hands back the same cached object for both, they are a
    # single key in the two tables below and the later entry wins -- confirm.
    comment_end_regex = re.compile(
        r"\s*\*/")  # The end of the comment portion, i.e. */
    end_regex = re.compile(
        r"\s*int\s(\w+)\s*\("
    )  # The end of the documentation, i.e. int MethodName(

    def __init__(self, class_name):
        """Create a parser for the class called `class_name`."""
        # The tables below are keyed on the compiled-regex objects, so the
        # two body patterns must be distinct objects.
        assert ClassParser.class_body_regex is not ClassParser.body_regex

        # The methods that have been parsed.
        self.methods = []
        # The name of the class being parsed.
        self.class_name = class_name
        # The description of the class being parsed.
        self.class_description = ''
        # Reset the parser's state machine.
        self.reset()

    def reset(self):
        """Return to the initial state and discard any partially parsed method."""
        # What the last handled regex was, to determine what the next should be.
        self.last_regex = None

        # These are used to piece together the next `Method`.
        self.description = ''
        self.params = []
        self.returned = []
        self.method_name = None
        self.prototypes = []

    def handle_class_body(self, match):
        """Append one line of text to the class description."""
        text = match.group(1)
        self.class_description += text + '\n'

    def handle_body(self, match):
        """Append one line of text to the current method's description."""
        text = match.group(1)
        self.description += text + '\n'

    def handle_param(self, match):
        """Record a `@param` line as a `ParameterDoc` of the current method."""
        data_type, name, default, description = (
            match.group(1), match.group(2), match.group(3), match.group(4))
        self.params.append(ParameterDoc(name, data_type, description, default))

    def handle_return(self, match):
        """Record a `@return` line as a `ParameterDoc` of the current method."""
        data_type, name, description = (
            match.group(1), match.group(2), match.group(3))
        self.returned.append(ParameterDoc(name, data_type, description))

    def handle_proto(self, match):
        """Record an explicit `@proto` line as a prototype template.

        The method name is not known yet, so a literal `{0}` placeholder is
        left in the stored string; `handle_end` fills it in.
        """
        return_values, parameters = match.group(1), match.group(2)
        parameters = ' ' + parameters + ' ' if parameters else ''
        return_values = return_values + '= ' if return_values else ''

        if self.class_name == 'Global':
            prototype = '{0}{{0}}({1})'.format(return_values, parameters)
        else:
            prototype = '{0}{1}:{{0}}({2})'.format(
                return_values, self.class_name, parameters)

        self.prototypes.append(prototype)

    def _make_prototype(self, parameters):
        """Format one prototype of the current method for the given parameter list text."""
        if parameters != '':
            parameters = ' ' + parameters + ' '

        # Global functions are called bare; everything else as Class:Method.
        if self.class_name == 'Global':
            callee = self.method_name
        else:
            callee = '{0}:{1}'.format(self.class_name, self.method_name)

        prototype = '{0}({1})'.format(callee, parameters)

        if self.returned:
            return_values = ', '.join([param.name for param in self.returned])
            prototype = '{0} = {1}'.format(return_values, prototype)

        return prototype

    def handle_end(self, match):
        """Finish the current method and append it to `self.methods`.

        If no `@proto` lines were given, prototypes are generated from the
        parameters: one prototype when there are no defaults (or when a
        required parameter follows a defaulted one), otherwise one for the
        required parameters alone and one more for each additional
        defaulted parameter.
        """
        self.method_name = match.group(1)

        # If there's no prototype, make one with all params and returns.
        if not self.prototypes:
            names = [param.name for param in self.params]

            # Whether any parameter has a default value.
            has_default = False
            # The index of the last non-default parameter; -1 when every
            # parameter has a default.  (Starting from 0 made that case
            # indistinguishable from "only the first one is required" and
            # dropped the zero-argument prototype.)
            last_non_default_i = -1
            # If False, a parameter WITHOUT a default value follows one WITH a default value.
            # In this case, don't bother generating prototypes.
            simple_order = True

            for i, param in enumerate(self.params):
                if param.default_value:
                    has_default = True
                else:
                    last_non_default_i = i
                    if has_default:
                        simple_order = False

            if not has_default or not simple_order:
                # Just generate one prototype with all the parameters.
                self.prototypes.append(self._make_prototype(', '.join(names)))
            else:
                # Generate a prototype for all the non-default parameters,
                # then one for each default parameter with all the previous parameters.
                for count in range(last_non_default_i + 1, len(names) + 1):
                    self.prototypes.append(
                        self._make_prototype(', '.join(names[:count])))
        else:
            # Format the method name into each prototype.
            self.prototypes = [
                proto.format(self.method_name) for proto in self.prototypes
            ]

        self.methods.append(
            MethodDoc(self.method_name, self.description, self.prototypes,
                      self.params, self.returned))

    # Table of which handler is used to handle each regular expression.
    # A value of None means the match only advances the state machine.
    regex_handlers = {
        class_start_regex: None,
        class_body_regex: handle_class_body,
        class_end_regex: None,
        start_regex: None,
        body_regex: handle_body,
        param_regex: handle_param,
        return_regex: handle_return,
        proto_regex: handle_proto,
        comment_end_regex: None,
        end_regex: handle_end,
    }

    # Table of which regular expressions can follow the last handled regex.
    # `body_regex` must always come LAST when used, since it also matches param, return, and comment_end.
    # An empty list means the next line always resets the parser.
    next_regexes = {
        None: [class_start_regex, start_regex, end_regex],
        class_start_regex: [class_end_regex, class_body_regex],
        class_body_regex: [class_end_regex, class_body_regex],
        class_end_regex: [],
        start_regex: [
            param_regex, return_regex, proto_regex, comment_end_regex,
            body_regex
        ],
        body_regex: [
            param_regex, return_regex, proto_regex, comment_end_regex,
            body_regex
        ],
        proto_regex: [
            param_regex, return_regex, proto_regex, comment_end_regex,
            body_regex
        ],
        param_regex:
        [param_regex, return_regex, comment_end_regex, body_regex],
        return_regex: [return_regex, comment_end_regex],
        comment_end_regex: [end_regex],
        end_regex: [],
    }

    @returns(Nullable(MethodDoc))
    @params(self=object, line=str)
    def next_line(self, line):
        """Parse the next line of the file.

        The line is tried against each pattern allowed in the current
        state; the first match runs its handler (if any) and becomes the
        new state.  If nothing matches, the parser is reset.

        Always returns None; a completed method is appended to
        `self.methods` by `handle_end`.
        """
        # Get the list of expected regular expressions using the last one handled.
        valid_regexes = self.next_regexes[self.last_regex]

        # Try to find a match.
        for regex in valid_regexes:
            match = regex.match(line)

            if match:
                handler = self.regex_handlers[regex]

                if handler:
                    # Handlers are stored as plain functions taken from the
                    # class body, so `self` is passed explicitly.
                    handler(self, match)

                # Not every regex has a handler, but keep track of where we are anyway.
                self.last_regex = regex
                # Break at the first match.
                break
        else:
            # No valid regex was found, reset everything.
            self.reset()

    @returns(MangosClassDoc)
    def to_class_doc(self):
        """Create an instance of `MangosClassDoc` from the parser's data.

        Is called by `parse_file` once parsing is finished.
        """
        return MangosClassDoc(self.class_name, self.class_description,
                              self.methods)

    @staticmethod
    @returns(MangosClassDoc)
    @params(file=FileType)
    def parse_file(file):
        """Parse the file `file` into a documented class."""
        # Get the class name from "ClassMethods.h" by stripping off "Methods.h".
        # The slice simply drops the last len('Methods.h') characters of
        # `file.name`; it does not check that the name has that suffix.
        class_name = file.name[:-len('Methods.h')]
        parser = ClassParser(class_name)

        line = file.readline()

        # `readline` returns an empty string at end of file, ending the loop.
        while line:
            parser.next_line(line)
            line = file.readline()

        return parser.to_class_doc()
class ParameterDoc(object):
    """The documentation data of a parameter or return value for an Eluna method."""

    # The integer ranges that each C++ type is valid for.
    # None means valid for all numbers.
    valid_ranges = {
        'float': None,
        'double': None,
        # This should be -32767..32767, but it's pretty safe to assume 32-bit.
        'int': ('-2,147,483,647', '2,147,483,647'),
        'int8': ('-127', '127'),
        'uint8': ('0', '255'),
        'int16': ('-32,767', '32,767'),
        'uint16': ('0', '65,535'),
        'int32': ('-2,147,483,647', '2,147,483,647'),
        'uint32': ('0', '4,294,967,295'),
        'int64': ('-9,223,372,036,854,775,808', '9,223,372,036,854,775,807'),
        'uint64': ('0', '18,446,744,073,709,551,615'),
        'ObjectGuid': ('0', '18,446,744,073,709,551,615'),
    }

    @params(self=object, name=Nullable(unicode), data_type=str,
            description=unicode, default_value=Nullable(unicode))
    def __init__(self, name, data_type, description, default_value=None):
        """Build the documentation entry and normalise its type and text.

        :param name: the name of the parameter or returned value.  May be
            None only for the variadic type `...`, in which case the name
            is set to `...` as well.
        :param data_type: the C++/Lua type name as written in the source
            comment; rewritten to its Lua equivalent where one is known.
        :param description: free text; when non-empty, its first letter is
            capitalised, a period is appended and the result is rendered
            as Markdown.
        :param default_value: the default value of the parameter, if any.
        """
        self.name = name
        self.data_type = data_type
        self.default_value = default_value

        if self.data_type == '...':
            self.name = '...'
        else:
            assert (self.name is not None)

        if description:
            # Capitalize the first letter, add a period, and parse as Markdown.
            self.description = '{}{}. '.format(description[0].capitalize(),
                                               description[1:])
            self.description = markdown.markdown(self.description)
        else:
            self.description = ''

        # If the data type is a C++ number, convert to Lua number and add
        # range info to description.  `int64` and `uint64` are keys of the
        # table, so they are handled here too; the separate branch that
        # used to follow for them could never be reached and was removed.
        if self.data_type in self.valid_ranges:
            bounds = self.valid_ranges[self.data_type]
            if bounds:
                self.description += '<p><em>Valid numbers</em>: integers from {0} to {1}.</p>'.format(
                    bounds[0], bounds[1])
            else:
                self.description += '<p><em>Valid numbers</em>: all decimal numbers.</p>'
            self.data_type = 'number'
        elif self.data_type == 'bool':
            self.data_type = 'boolean'
        elif (self.data_type not in [
                'nil', 'boolean', 'number', 'string', 'table', 'function',
                '...'
        ] and self.data_type[:1] != '['):
            # Any other type is expected to be written in [brackets]; warn
            # when it is not.  A single parenthesised argument prints the
            # same text under both the print statement and print function.
            print("Missing angle brackets [] around the data type name: `" +
                  self.data_type + "`")