Example #1
    def load(cls, input_file, md5sum=None, sha256sum=None, force=False):
        """Loads a classifier from a file"""
        logger = logging.getLogger(__name__)
        filename = try_and_get_filename(input_file)
        if (md5sum, sha256sum, force) == (None, None, False):
            message = "Loading unknown pickled objects is dangerous.  Either \
provide an md5 or sha256 for '%s' or explicitly 'force' if you want to \
live dangerously" % filename
            logger.error(message)
            raise ValueError(message)
        (actual_md5sum, actual_sha256sum) = cls._calculate_hashes(input_file)
        if md5sum is not None:
            if actual_md5sum != md5sum:
                message = "Expected md5sum of '%s' to be '%s' but got '%s'" % (
                    filename, md5sum, actual_md5sum)
                if force:
                    logger.warn("FORCED so ignoring: %s" % message)
                else:
                    logger.error(message)
                    raise ValueError(message)
        if sha256sum is not None:
            if actual_sha256sum != sha256sum:
                message = "Expected sha256sum of '%s' to be '%s' but got '%s'" % (
                    filename, sha256sum, actual_sha256sum)
                if force:
                    logger.warn("FORCED so ignoring: %s" % message)
                else:
                    logger.error(message)
                    raise ValueError(message)

        input_file.seek(0)
        classifier, feature_labels = pickle.load(input_file)
        return SampleClassifier(classifier, feature_labels)
Example #2
  def load(cls, input_file, md5sum=None, sha256sum=None, force=False):
    """Loads a classifier from a file"""
    logger = logging.getLogger(__name__)
    filename = try_and_get_filename(input_file)
    if (md5sum, sha256sum, force) == (None, None, False):
      message = "Loading unknown pickled objects is dangerous.  Either \
provide an md5 or sha256 for '%s' or explicitly 'force' if you want to \
live dangerously" % filename
      logger.error(message)
      raise ValueError(message)
    (actual_md5sum, actual_sha256sum) = cls._calculate_hashes(input_file)
    if md5sum is not None:
      if actual_md5sum != md5sum:
        message = "Expected md5sum of '%s' to be '%s' but got '%s'" % (filename,
                                                                       md5sum,
                                                                       actual_md5sum)
        if force:
          logger.warn("FORCED so ignoring: %s" % message)
        else:
          logger.error(message)
          raise ValueError(message)
    if sha256sum is not None:
      if actual_sha256sum != sha256sum:
        message = "Expected sha256sum of '%s' to be '%s' but got '%s'" % (filename,
                                                                          sha256sum,
                                                                          actual_sha256sum)
        if force:
          logger.warn("FORCED so ignoring: %s" % message)
        else:
          logger.error(message)
          raise ValueError(message)

    input_file.seek(0)
    classifier, feature_labels = pickle.load(input_file)
    return SampleClassifier(classifier, feature_labels)
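A minimal usage sketch for the load classmethod shown above; the file name and md5 digest below are placeholders, and it assumes load is exposed as a classmethod on SampleClassifier:

with open("classifier.pkl", "rb") as input_file:
    # Placeholder digest purely for illustration; pass the real md5 of your file.
    classifier = SampleClassifier.load(
        input_file, md5sum="d41d8cd98f00b204e9800998ecf8427e")

# Or skip verification explicitly (the 'live dangerously' path):
with open("classifier.pkl", "rb") as input_file:
    classifier = SampleClassifier.load(input_file, force=True)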
Example #3
 def _extract_features(self, features_file):
     """Parses a feature file and returns the sample_names and features"""
     features_file.seek(0)
     filename = try_and_get_filename(features_file)
     feature_file_lines = [line for line in csv.reader(features_file)]
     data = np.array(feature_file_lines)
     try:
         number_of_rows, number_of_columns = data.shape
         # shape returns a 2 element tuple if there are a consistent number of
         # columns.  Otherwise it returns a 1 element tuple which would trigger a
         # ValueError.
     except ValueError:
         raise ValueError(
             "Issue parsing '%s', some samples have more features than others"
             % filename)
     feature_labels = data[0, 1:]
     sample_names = data[1:, 0]
     str_to_int = np.vectorize(int)
     try:
         features = str_to_int(data[1:, 1:])
     except ValueError:
         raise ValueError(
             "Issue parsing '%s', expected features to be 0 or 1" %
             filename)
     if not self._check_features_binary(features):
         raise ValueError(
             "Issue parsing '%s', expected features to be 0 or 1" %
             filename)
     return sample_names, features
Example #4
 def write_vcf_file(self, vcf_output_file):
     reader = self._get_records_from_vcf()
     # This might not be true but it doesn't hurt
     reader.add_GT_format_header()
     writer = vcf.Writer(vcf_output_file, reader)
     filename = try_and_get_filename(vcf_output_file)
     self.logger.info("Writing VCF records to '%s'" % filename)
     for record in reader:
         writer.write_record(record)
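A hedged sketch of calling write_vcf_file; the 'mapper' object and the output path are assumptions standing in for whatever class defines this method in the original code:

# Hypothetical caller; 'mapper' is whatever object provides write_vcf_file
# and _get_records_from_vcf in the original code base.
with open("output.vcf", "w") as vcf_output_file:
    mapper.write_vcf_file(vcf_output_file)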
Example #5
 def export(self, output_file):
   """Writes the classifier to a file"""
   filename = try_and_get_filename(output_file)
   self.logger.info("Writing classifier to '%s'" % filename)
   pickle.dump((self.classifier, self.feature_labels), output_file)
   output_file.flush()
   with open(filename, 'rb') as hash_check_file:
     (md5sum, sha256sum) = self._calculate_hashes(hash_check_file)
   self.logger.info("Wrote classifier with md5sum '%s' to '%s'" % (md5sum,
                                                                   filename))
   self.logger.info("Wrote classifier with sha256sum '%s' to '%s'" % (sha256sum,
                                                                   filename))
Example #6
 def export(self, output_file):
     """Writes the classifier to a file"""
     filename = try_and_get_filename(output_file)
     self.logger.info("Writing classifier to '%s'" % filename)
     pickle.dump((self.classifier, self.feature_labels), output_file)
     output_file.flush()
     with open(filename, 'rb') as hash_check_file:
         (md5sum, sha256sum) = self._calculate_hashes(hash_check_file)
     self.logger.info("Wrote classifier with md5sum '%s' to '%s'" %
                      (md5sum, filename))
     self.logger.info("Wrote classifier with sha256sum '%s' to '%s'" %
                      (sha256sum, filename))
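A sketch of the export/load round trip, assuming 'classifier' is an existing SampleClassifier instance and 'classifier.pkl' is a placeholder path:

import hashlib

with open("classifier.pkl", "wb") as output_file:
    classifier.export(output_file)

# Recompute the md5 independently and hand it to load for verification.
with open("classifier.pkl", "rb") as input_file:
    md5sum = hashlib.md5(input_file.read()).hexdigest()
    input_file.seek(0)
    restored = SampleClassifier.load(input_file, md5sum=md5sum)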
Example #7
  def get_feature_labels_from_file(self, feature_file):
    """Parses a file of features and returns the feature labels

    Some files have more features than others.  This is a useful
    pre-computation step which extracts the feature labels used in one file
    so that they can be used to filter the features in another"""
    feature_file.seek(0)
    features_line = next(csv.reader(feature_file))
    if features_line[0] != 'Features':
      filename = try_and_get_filename(feature_file)
      raise ValueError("Issue parsing '%s', expected first element to be 'Features'" % filename)
    feature_labels = features_line[1:]
    return np.array(feature_labels)
Example #8
    def get_feature_labels_from_file(self, feature_file):
        """Parses a file of features and returns the feature labels

    Some files have more features than others.  This is a useful
    pre-computation step which extracts the feature labels used in one file
    so that they can be used to filter the features in another"""
        feature_file.seek(0)
        features_line = next(csv.reader(feature_file))
        if features_line[0] != 'Features':
            filename = try_and_get_filename(feature_file)
            raise ValueError(
                "Issue parsing '%s', expected first element to be 'Features'" %
                filename)
        feature_labels = features_line[1:]
        return np.array(feature_labels)
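The header layout implied by the 'Features' check might look like the sketch below; the CSV content and the 'parser' object are illustrative assumptions:

import io

feature_csv = io.StringIO(
    "Features,geneA,geneB,geneC\n"
    "sample_1,1,0,1\n"
    "sample_2,0,0,1\n")

# 'parser' stands in for whatever object defines get_feature_labels_from_file.
labels = parser.get_feature_labels_from_file(feature_csv)
# labels would be an array of ['geneA', 'geneB', 'geneC']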
Example #9
 def _extract_features(self, features_file):
   """Parses a feature file and returns the sample_names and features"""
   features_file.seek(0)
   filename = try_and_get_filename(features_file)
   feature_file_lines = [line for line in csv.reader(features_file)]
   data = np.array(feature_file_lines)
   try:
     number_of_rows, number_of_columns = data.shape
     # shape returns a 2 element tuple if there are a consistent number of
     # columns.  Otherwise it returns a 1 element tuple which would trigger a
     # ValueError.
   except ValueError:
     raise ValueError("Issue parsing '%s', some samples have more features than others" % filename)
   feature_labels = data[0,1:]
   sample_names = data[1:,0]
   str_to_int = np.vectorize(int)
   try:
     features = str_to_int(data[1:,1:])
   except ValueError:
     raise ValueError("Issue parsing '%s', expected features to be 0 or 1" % filename)
   if not self._check_features_binary(features):
     raise ValueError("Issue parsing '%s', expected features to be 0 or 1" % filename)
   return sample_names, features
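And a matching sketch for _extract_features on the same kind of file, again with the CSV content and the 'parser' object as assumptions; well-formed rows yield sample names and a 0/1 feature matrix:

import io

feature_csv = io.StringIO(
    "Features,geneA,geneB,geneC\n"
    "sample_1,1,0,1\n"
    "sample_2,0,0,1\n")

sample_names, features = parser._extract_features(feature_csv)
# sample_names -> ['sample_1', 'sample_2']
# features     -> 2x3 integer array of 0s and 1s; a ragged or non-binary
#                 file raises ValueError as shown above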