def load(cls, input_file, md5sum=None, sha256sum=None, force=False):
    """Loads a classifier from a file.

    Args:
        input_file: open binary file object holding a pickled
            (classifier, feature_labels) tuple
        md5sum: expected md5 hex digest of the file, or None
        sha256sum: expected sha256 hex digest of the file, or None
        force: if True, hash mismatches are logged as warnings instead
            of raising

    Returns:
        SampleClassifier wrapping the unpickled classifier and labels

    Raises:
        ValueError: if neither hash is supplied and force is False, or
            if a supplied hash does not match and force is False
    """
    logger = logging.getLogger(__name__)
    filename = try_and_get_filename(input_file)
    if (md5sum, sha256sum, force) == (None, None, False):
        # SECURITY: unpickling untrusted data can execute arbitrary code,
        # so refuse to load without a checksum unless explicitly forced.
        message = "Loading unknown pickled objects is dangerous. Either \
provide an md5 or sha256 for '%s' or explicitly 'force' if you want to \
live dangerously" % filename
        logger.error(message)
        raise ValueError(message)
    (actual_md5sum, actual_sha256sum) = cls._calculate_hashes(input_file)
    if md5sum is not None:  # fixed: 'not x is None' -> 'x is not None' (PEP 8 / E714)
        if actual_md5sum != md5sum:
            message = "Expected md5sum of '%s' to be '%s' but got '%s'" % (
                filename, md5sum, actual_md5sum)
            if force:
                # fixed: logger.warn is deprecated; logger.warning is canonical
                logger.warning("FORCED so ignoring: %s" % message)
            else:
                logger.error(message)
                raise ValueError(message)
    if sha256sum is not None:
        if actual_sha256sum != sha256sum:
            message = "Expected sha256sum of '%s' to be '%s' but got '%s'" % (
                filename, sha256sum, actual_sha256sum)
            if force:
                logger.warning("FORCED so ignoring: %s" % message)
            else:
                logger.error(message)
                raise ValueError(message)
    # hashing consumed the stream; rewind before unpickling
    input_file.seek(0)
    classifier, feature_labels = pickle.load(input_file)
    return SampleClassifier(classifier, feature_labels)
def load(cls, input_file, md5sum=None, sha256sum=None, force=False):
    """Loads a classifier from a file.

    Args:
        input_file: open binary file object holding a pickled
            (classifier, feature_labels) tuple
        md5sum: expected md5 hex digest of the file, or None
        sha256sum: expected sha256 hex digest of the file, or None
        force: if True, hash mismatches are logged as warnings instead
            of raising

    Returns:
        SampleClassifier wrapping the unpickled classifier and labels

    Raises:
        ValueError: if neither hash is supplied and force is False, or
            if a supplied hash does not match and force is False
    """
    logger = logging.getLogger(__name__)
    filename = try_and_get_filename(input_file)
    if (md5sum, sha256sum, force) == (None, None, False):
        # SECURITY: unpickling untrusted data can execute arbitrary code,
        # so refuse to load without a checksum unless explicitly forced.
        message = "Loading unknown pickled objects is dangerous. Either \
provide an md5 or sha256 for '%s' or explicitly 'force' if you want to \
live dangerously" % filename
        logger.error(message)
        raise ValueError(message)
    (actual_md5sum, actual_sha256sum) = cls._calculate_hashes(input_file)
    if md5sum is not None:  # fixed: 'not x is None' -> 'x is not None' (PEP 8 / E714)
        if actual_md5sum != md5sum:
            message = "Expected md5sum of '%s' to be '%s' but got '%s'" % (
                filename, md5sum, actual_md5sum)
            if force:
                # fixed: logger.warn is deprecated; logger.warning is canonical
                logger.warning("FORCED so ignoring: %s" % message)
            else:
                logger.error(message)
                raise ValueError(message)
    if sha256sum is not None:
        if actual_sha256sum != sha256sum:
            message = "Expected sha256sum of '%s' to be '%s' but got '%s'" % (
                filename, sha256sum, actual_sha256sum)
            if force:
                logger.warning("FORCED so ignoring: %s" % message)
            else:
                logger.error(message)
                raise ValueError(message)
    # hashing consumed the stream; rewind before unpickling
    input_file.seek(0)
    classifier, feature_labels = pickle.load(input_file)
    return SampleClassifier(classifier, feature_labels)
def _extract_features(self, features_file):
    """Parses a feature file and returns the sample_names and features.

    Args:
        features_file: open CSV file; row 0 is a header of feature labels,
            column 0 holds sample names, remaining cells are 0/1 features

    Returns:
        (sample_names, features): a 1-D array of sample name strings and
        a 2-D int array of binary feature values

    Raises:
        ValueError: if rows have inconsistent column counts, or if any
            feature value is not 0 or 1
    """
    features_file.seek(0)
    filename = try_and_get_filename(features_file)
    feature_file_lines = [line for line in csv.reader(features_file)]
    data = np.array(feature_file_lines)
    try:
        # shape unpacks into two values only when every row has the same
        # number of columns; ragged input produces a 1-element shape and
        # this unpacking raises ValueError.
        number_of_rows, number_of_columns = data.shape
    except ValueError as e:
        # chain the original exception so the ragged-shape cause is kept
        raise ValueError(
            "Issue parsing '%s', some samples have more features than others"
            % filename) from e
    sample_names = data[1:, 0]
    str_to_int = np.vectorize(int)
    try:
        features = str_to_int(data[1:, 1:])
    except ValueError:
        raise ValueError(
            "Issue parsing '%s', expected features to be 0 or 1" % filename)
    if not self._check_features_binary(features):
        raise ValueError(
            "Issue parsing '%s', expected features to be 0 or 1" % filename)
    return sample_names, features
def write_vcf_file(self, vcf_output_file):
    """Streams this object's VCF records into vcf_output_file."""
    vcf_reader = self._get_records_from_vcf()
    # Declare a GT FORMAT header up front; it might not be accurate for
    # every record, but including it is harmless.
    vcf_reader.add_GT_format_header()
    vcf_writer = vcf.Writer(vcf_output_file, vcf_reader)
    output_name = try_and_get_filename(vcf_output_file)
    self.logger.info("Writing VCF records to '%s'" % output_name)
    for vcf_record in vcf_reader:
        vcf_writer.write_record(vcf_record)
def export(self, output_file):
    """Writes the classifier to a file"""
    output_name = try_and_get_filename(output_file)
    self.logger.info("Writing classifier to '%s'" % output_name)
    pickle.dump((self.classifier, self.feature_labels), output_file)
    output_file.flush()
    # Re-open by name so the hashes reflect exactly what reached disk.
    # NOTE(review): assumes output_file corresponds to a real on-disk
    # path — confirm for in-memory file objects.
    with open(output_name, 'rb') as hash_check_file:
        md5sum, sha256sum = self._calculate_hashes(hash_check_file)
    self.logger.info(
        "Wrote classifier with md5sum '%s' to '%s'" % (md5sum, output_name))
    self.logger.info(
        "Wrote classifier with sha256sum '%s' to '%s'" % (sha256sum, output_name))
def get_feature_labels_from_file(self, feature_file):
    """Parses a file of features and returns the feature labels.

    Some files have more features others don't. This is a useful
    pre-computation step which extracts the feature labels used in one
    file which can be used to filter the features in another.

    Args:
        feature_file: open CSV file whose header row starts with the
            literal 'Features' followed by the feature labels

    Returns:
        numpy array of feature label strings

    Raises:
        ValueError: if the header row does not start with 'Features'
    """
    feature_file.seek(0)
    # fixed: reader.next() is Python 2 only; use the next() builtin
    features_line = next(csv.reader(feature_file))
    if features_line[0] != 'Features':
        filename = try_and_get_filename(feature_file)
        raise ValueError(
            "Issue parsing '%s', expected first element to be 'Features'"
            % filename)
    feature_labels = features_line[1:]
    return np.array(feature_labels)
def get_feature_labels_from_file(self, feature_file):
    """Parses a file of features and returns the feature labels.

    Some files have more features others don't. This is a useful
    pre-computation step which extracts the feature labels used in one
    file which can be used to filter the features in another.

    Args:
        feature_file: open CSV file whose header row starts with the
            literal 'Features' followed by the feature labels

    Returns:
        numpy array of feature label strings

    Raises:
        ValueError: if the header row does not start with 'Features'
    """
    feature_file.seek(0)
    # fixed: reader.next() is Python 2 only; use the next() builtin
    features_line = next(csv.reader(feature_file))
    if features_line[0] != 'Features':
        filename = try_and_get_filename(feature_file)
        raise ValueError(
            "Issue parsing '%s', expected first element to be 'Features'"
            % filename)
    feature_labels = features_line[1:]
    return np.array(feature_labels)
def _extract_features(self, features_file):
    """Parses a feature file and returns the sample_names and features.

    Args:
        features_file: open CSV file; row 0 is a header of feature labels,
            column 0 holds sample names, remaining cells are 0/1 features

    Returns:
        (sample_names, features): a 1-D array of sample name strings and
        a 2-D int array of binary feature values

    Raises:
        ValueError: if rows have inconsistent column counts, or if any
            feature value is not 0 or 1
    """
    features_file.seek(0)
    filename = try_and_get_filename(features_file)
    feature_file_lines = [line for line in csv.reader(features_file)]
    data = np.array(feature_file_lines)
    try:
        # shape unpacks into two values only when every row has the same
        # number of columns; ragged input produces a 1-element shape and
        # this unpacking raises ValueError.
        number_of_rows, number_of_columns = data.shape
    except ValueError as e:
        # chain the original exception so the ragged-shape cause is kept
        raise ValueError(
            "Issue parsing '%s', some samples have more features than others"
            % filename) from e
    sample_names = data[1:, 0]
    str_to_int = np.vectorize(int)
    try:
        features = str_to_int(data[1:, 1:])
    except ValueError:
        raise ValueError(
            "Issue parsing '%s', expected features to be 0 or 1" % filename)
    if not self._check_features_binary(features):
        raise ValueError(
            "Issue parsing '%s', expected features to be 0 or 1" % filename)
    return sample_names, features