def test_read_mapping_file_invalid_input(self):
    """Malformed mapping input must raise MappingFileFormatError."""
    # Fixture presumably holds a mapping file with empty cells (going by
    # its name; the fixture itself is defined outside this method).
    self.assertRaises(
        MappingFileFormatError,
        read_mapping_file,
        self.map_empty_cells_lines,
    )
    # Fixture presumably holds a mapping file with duplicated sample IDs
    # (going by its name).
    self.assertRaises(
        MappingFileFormatError,
        read_mapping_file,
        self.map_dup_samples_lines,
    )
def build_problem_data(group_map_files, mapping_file, prediction_field,
                       start_level, include_only, negate, n_processes):
    """Build the ProblemData and the initial FeatureVector.

    Parameters:
        group_map_files: sequence of inputs, each handed to
            read_split_file; one entry per scope.
        mapping_file: input handed to read_mapping_file; yields a map
            from sample name to that sample's fields.
        prediction_field: mapping-file field used as each sample's response.
        start_level: index into group_map_files of the scope whose groups
            seed the sample names and the feature vector.
        include_only: None to keep every sample, otherwise a pair
            (field, values); a sample is kept when its value for `field`
            is in `values`, XOR-ed with `negate`.
        negate: flips the include_only test when true.
        n_processes: forwarded unchanged to ProblemData.

    Returns:
        A (problem_data, feature_vector) tuple.
    """
    # One (group -> objects, object -> group) pair per scope.
    split_maps = [read_split_file(map_file) for map_file in group_map_files]
    group_to_object = [forward for forward, _ in split_maps]
    object_to_group = [backward for _, backward in split_maps]

    start_groups = group_to_object[start_level]

    # Derive the sample names from the objects of the starting scope.
    # Using samplemap.keys() instead could include records without features.
    sample_names = {
        parse_object_string_sample(obj)
        for group in start_groups
        for obj in start_groups[group]
    }

    # Map of sample name to its properties.
    samplemap = read_mapping_file(mapping_file)

    sample_to_response = {}
    for name in sample_names:
        fields = samplemap[name]
        if include_only is not None:
            matches = fields[include_only[0]] in include_only[1]
            if not (matches ^ negate):
                continue
        sample_to_response[name] = fields[prediction_field]

    problem_data = ProblemData(group_to_object, object_to_group,
                               sample_to_response, n_processes)

    # One record per starting-scope group, sized by its object count.
    records = [
        FeatureRecord(group, start_level, len(start_groups[group]))
        for group in start_groups
    ]
    feature_vector = FeatureVector(records)

    return problem_data, feature_vector
def build_problem_data(
    group_map_files,
    mapping_file,
    prediction_field,
    start_level,
    include_only,
    negate,
    n_processes,
    parse_object_string=parse_object_string_sample,
):
    """Validate the inputs and build the ProblemData for a run.

    Parameters:
        group_map_files: list of inputs, each handed to read_split_file;
            one entry per scope.
        mapping_file: input handed to read_mapping_file; yields a map
            from sample name to that sample's fields.
        prediction_field: string naming the mapping-file field used as
            each sample's response.
        start_level: int index into group_map_files of the scope whose
            objects determine the sample names.
        include_only: None to keep every sample, otherwise a list/tuple
            whose first item is a field name (string) and whose second
            item is a list of string values; a sample is kept when its
            value for that field is in the list, XOR-ed with `negate`.
        negate: boolean; flips the include_only test when true.
        n_processes: int forwarded unchanged to ProblemData.
        parse_object_string: callable mapping an object string to its
            sample name; also forwarded to ProblemData.

    Returns:
        The constructed ProblemData instance.

    Raises:
        InputTypeError: if include_only is malformed or start_level is not
            a valid index into group_map_files. (check_input_type
            presumably raises for the simple type checks as well; its
            behaviour is not visible here.)
        KeyError: if a sample name derived from the group files is missing
            from the mapping file, or if include_only[0] or
            prediction_field is not a field of a sample.
    """
    # Pair each simple argument with its expected type explicitly instead of
    # fishing the values back out of locals().
    simple_var_checks = [
        ("n_processes", n_processes, types.IntType),
        ("start_level", start_level, types.IntType),
        ("negate", negate, types.BooleanType),
        ("prediction_field", prediction_field, types.StringType),
        ("include_only", include_only,
         (types.NoneType, types.ListType, types.TupleType)),
        ("group_map_files", group_map_files, types.ListType),
    ]
    for var_name, var_value, var_type in simple_var_checks:
        check_input_type(var_name, var_value, var_type)

    if include_only is not None:
        # Guard the indexing below so a too-short value is reported as an
        # input error rather than leaking an IndexError.
        if len(include_only) < 2:
            raise InputTypeError(
                "include_only should contain a field name and a list of values"
            )
        if not isinstance(include_only[0], types.StringType):
            raise InputTypeError("include_only[0] should be of type string")
        if not isinstance(include_only[1], types.ListType) or not all(
            isinstance(value, types.StringType) for value in include_only[1]
        ):
            raise InputTypeError("include_only[1] should be a list of strings")

    if start_level >= len(group_map_files) or start_level < 0:
        raise InputTypeError(
            "start_level (%s) is not a valid scope index; group_map_files is of length %s"
            % (start_level, len(group_map_files))
        )

    # For each scope, build a map from group to object and vice versa.
    group_to_object = []
    object_to_group = []
    for map_file in group_map_files:
        g_to_o, o_to_g = read_split_file(map_file)
        # Internal sanity check on the helper's contract, not input validation.
        assert isinstance(g_to_o, types.DictType), "read_split_file did not return a dict type"
        assert isinstance(o_to_g, types.DictType), "read_split_file did not return a dict type"
        group_to_object.append(g_to_o)
        object_to_group.append(o_to_g)

    # Find the sample names from the objects of the starting scope.
    # An alternative is samplemap.keys(), but that may have records
    # without features.
    start_groups = group_to_object[start_level]
    samplenames = set()
    for grp in start_groups:
        for obj in start_groups[grp]:
            samplenames.add(parse_object_string(obj))

    # Get a map of sample name to its properties.
    samplemap = read_mapping_file(mapping_file)

    sample_to_response = {}
    for samplename in samplenames:
        # Look the sample up once, before filtering, so a missing sample is
        # always reported with the descriptive message below, including when
        # include_only is set.
        try:
            sample_fields = samplemap[samplename]
        except KeyError:
            raise KeyError(
                "A sample name (%s) found in the group files is not a sample in mapping_file."
                % samplename
            )

        if include_only is not None:
            try:
                field_value = sample_fields[include_only[0]]
            except KeyError:
                raise KeyError("include_only[0] is not a field in mapping_file")
            if not ((field_value in include_only[1]) ^ negate):
                continue

        try:
            sample_to_response[samplename] = sample_fields[prediction_field]
        except KeyError:
            raise KeyError("prediction_field is not a field in mapping_file.")

    problem_data = ProblemData(
        group_to_object, object_to_group, sample_to_response, n_processes, parse_object_string
    )
    return problem_data
def test_read_mapping_file(self):
    """A mapping file parses into a dict of per-sample field dicts."""
    expected = {
        'A': {'Foo': 'f1', 'Bar': 'b1'},
        'B': {'Foo': 'f2', 'Bar': 'b2'},
    }
    observed = read_mapping_file(self.map1_lines)
    self.assertEqual(observed, expected)