Example #1
    def _open_sequences(self):
        """Loads the raw sequences, storing them as lists of strings."""
        
        logging.info("loading %s ..." % basename(self._sequencefile))
        tic()

        # Read entire data file into one big string.
        # Manually scanning the string for newlines and tabs is 3x faster than
        # using readlines() and then calling split() on each line.
        with open(self._sequencefile) as f:
            f.readline() # discard header
            txt = f.read()
            assert txt[-1] == '\n', "Sequence file must end with a newline."

        for name in self.sequence_names:
            logging.info("   %s" % name)

        foldfilter = self._foldfilter
        maxrows = self._maxrows
        seqloop = range(len(self.sequence_names)-1) # Precomputed for the innermost loop (all but the last column)
        revcomp = self._reverse_complement  # Note: unused in this method

        # Store each column as its own list of sequences.
        # Scan through the txt string until we've hit the end.
        sequences = [[] for _ in self.sequence_names]
        rowidx    = []
        i, j = 0, txt.find('\n')
        for row_index in xrange(len(txt)):  # len(txt) is a safe upper bound on the row count
            if j == -1 or len(rowidx) >= maxrows:
                break
            
            # Keep this row only if its FoldID (the first char of the line) is wanted
            if txt[i] in foldfilter:

                # Add each sequence in this row to its corresponding list
                k = txt.find('\t', i+2) # k = index of the tab preceding the first sequence in this row
                for s in seqloop:
                    i, k = k+1, txt.find('\t', k+1)
                    sequences[s].append(txt[i:k])  # Pull out column 's' sequence
                i, k = k+1, txt.find('\t', k+1)
                if k == -1 or k > j:  # If the next tab is on a later line, end this field at the newline
                    k = j
                sequences[-1].append(txt[i:k])   # Pull out the last column's sequence

                rowidx.append(row_index)  # Also remember the original row index of this example.

            # Advance so that txt[i:j] is the next line. The last character of the file must be a '\n'.
            i,j = j+1,txt.find('\n',j+1)

        txt = None  # Drop the reference so the huge string can be freed immediately (also keeps debuggers responsive)

        # Convert row indices to a numpy array for faster indexing when loading features/targets
        self.rowidx = np.asarray(rowidx, np.uint32).reshape((-1,1))
        self._sequences = sequences

        logging.info("... load took %.2fs" % toc())
Example #2
    def _open_targets(self):
        if self._targetfile is None:
            return

        _log_xform_warned = False
        logging.info("loading %s ..." % basename(self._targetfile))
        tic()

        # Read the entire targets file, convert it to a numpy array from the string, and slice
        # out just the rows that we're using.
        # It turns out this strategy is MUCH faster than using numpy.loadtxt:
        #     targets = np.loadtxt(self._targetfile, np.float32,
        #                          delimiter='\t', skiprows=1, ndmin=2)
        with open(self._targetfile) as f:
            f.readline() # discard header
            txt = f.read()
        
        ntarget = len(self.target_names)
        ntask   = len(self._task_ids)
        rowidx  = self.rowidx
        maxrows_to_read = int(rowidx[-1]) + 1  # int(): rowidx is a column vector, so rowidx[-1] is a 1-element array

        if self._targetfile_cols:
            # np.fromstring is fast but doesn't support the presence of non-numeric columns
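            # (assumes _targetfile_cols is a slice; a list of column indices would need
            #  np.asarray(line.split('\t'))[self._targetfile_cols] instead)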
            targets = np.asarray([[float(x) for x in line.split('\t')[self._targetfile_cols]]
                                            for line in txt.split('\n',maxrows_to_read)[:-1]])
        else:
            targets = np.fromstring(txt, np.float32, sep='\t', 
                                    count=ntarget*maxrows_to_read).reshape(-1, ntarget)
        txt = None

        if len(targets) > len(rowidx):
            targets = targets[rowidx.ravel(),:]

        # Select columns using '_task_ids' no matter what, since the order
        # might be different.
        usecols = np.asarray([self.target_names.index(name) for name in self._task_ids])  # ndarray for faster indexing below
        targets = targets[:,usecols]

        # Normalize targets by scaling min/max range to [0,1]
        if targets.size > 0:
            # OPTIONAL: clamp all originally negative values at zero
            #targets = np.maximum(0, targets)

            # For each individual column, get lo/hi percentile 
            # and then normalize the non-NaN values in that column
            a,b = [],[]
            for i in range(ntask):
                target_i = targets[:,i]
                mask_i = ~np.isnan(target_i)

                is_boolean = np.all(np.logical_or(target_i[mask_i] == 0, target_i[mask_i] == 1))
                if is_boolean:
                    # Automatically assume 0/1 classification target
                    logging.info("   %s \t(classification)" % self._task_ids[i])
                    ai,bi = 1,0
                else:
                    # Automatically assume regression target
                    logging.info("   %s \t(regression)" % self._task_ids[i])

                    if "log" in self._preprocess:
                        if (not np.all(target_i[mask_i] >= 0)):
                            if not _log_xform_warned:
                                _log_xform_warned = True
                                print "Warning: log transform requires all original targets to be non-negative; biasing the data and proceeding anyway."
                            target_i[mask_i] -= target_i[mask_i].min()
                        target_i[mask_i] = np.log(1+target_i[mask_i])
                    elif "sqrt" in self._preprocess:
                        if (not np.all(target_i[mask_i] >= 0)):
                            if not _log_xform_warned:
                                _log_xform_warned = True
                                print "Warning: sqrt transform requires all original targets to be non-negative; biasing the data and proceeding anyway."
                            target_i[mask_i] -= target_i[mask_i].min()
                        target_i[mask_i] = np.sqrt(target_i[mask_i])

                    # NOTE: np.percentile takes percentile ranks, so [0.0, 1.0] spans only
                    # the 0th-1st percentile of the data; the commented alternative below
                    # gives a wider, more conventional robust range.
                    lo_i,hi_i = np.percentile(target_i[mask_i], [0.0, 1.0])
                    #lo_i,hi_i = np.percentile(target_i[mask_i], [0.05, 99.95])
                    if lo_i == hi_i:
                        hi_i += 1 # Avoid divide by zero for degenerate (constant) targets

                    # Convert everything below the "lo" threshold to NaNs
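                    # (fancy indexing returns a copy, so modify the copy and write it back through the mask)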
                    tmp = target_i[mask_i]
                    tmp[tmp < lo_i] = np.nan
                    target_i[mask_i] = tmp
                    mask_i = ~np.isnan(target_i)

                    # Convert everything above the "hi" threshold to NaNs
                    tmp = target_i[mask_i]
                    tmp[tmp > hi_i] = np.nan
                    target_i[mask_i] = tmp
                    mask_i = ~np.isnan(target_i)

                    # Clamp everything to the range [lo,hi]
                    #target_i[mask_i] = np.maximum(lo_i, target_i[mask_i])
                    #target_i[mask_i] = np.minimum(hi_i, target_i[mask_i])  # Assume anything above hi_i is a "large" outlier

                    # Choose the intercept: anchor logistic targets at lo_i, otherwise center on the mean
                    if self._requirements.get('target',None) == 'logistic':
                        intercept_i = lo_i
                    else:
                        intercept_i = np.mean(target_i[mask_i])
                    ai = 1./(hi_i-lo_i)
                    bi = -intercept_i*ai
                    target_i[mask_i] = ai*target_i[mask_i] + bi
                    #mask_pos = target_i[mask_i] > 0
                    #target_i[mask_i][mask_pos] **= 0.5
                a.append(ai)
                b.append(bi)

            if "log" in self._preprocess:
                self._targets_preprocess.append(('log',))
            self._targets_preprocess.append(('normalize', np.asarray(a).reshape((1,-1)), 
                                                          np.asarray(b).reshape((1,-1))) )

            #targets[self._targets_mask] = np.maximum(0,targets[self._targets_mask])
            #targets[self._targets_mask] = np.minimum(1,targets[self._targets_mask])

        self._targets = targets
        self._targets_mask = ~np.isnan(targets)


        logging.info("... load took %.2fs" % toc())
Example #3
    def _open_features(self):
        if self._featurefile is None:
            return

        logging.info("loading %s ..." % basename(self._featurefile))
        tic()

        # Read the entire features file, convert it to a numpy array from the string, and slice
        # out just the rows that we're using.
        # It turns out this strategy is MUCH faster than using numpy.loadtxt:
        #     features = np.loadtxt(self._featurefile, np.float32, 
        #                           delimiter='\t', skiprows=1, ndmin=2)
        with open(self._featurefile) as f:
            f.readline() # discard header
            txt = f.read()

        for name in self.feature_names:
            logging.info("   %s" % name)

        nfeature = len(self.feature_names)
        rowidx   = self.rowidx
        maxrows_to_read = int(rowidx[-1]) + 1  # int(): rowidx is a column vector, so rowidx[-1] is a 1-element array

        if self._featurefile_cols:
            # np.fromstring is fast but doesn't support the presence of non-numeric columns.
            # The fallback below is kept for reference but is untested, hence the raise.
            raise NotImplementedError("This code should work but has not been tested.")
            features = np.asarray([[float(x) for x in line.split('\t')[self._featurefile_cols]]
                                             for line in txt.split('\n',maxrows_to_read)[:-1]])
        else:
            features = np.fromstring(txt, np.float32, sep='\t', 
                                     count=nfeature*maxrows_to_read).reshape(-1, nfeature)
        txt = None

        if len(features) > len(rowidx):
            features = features[rowidx.ravel(),:]

        # Preprocess each feature column: scale its range to 1 and center its mean at 0
        a,b = [],[]
        for i in range(nfeature):
            col = features[:,i:i+1]
            mask = ~np.isnan(col)
            lo = np.min(col[mask], axis=0)
            hi = np.max(col[mask], axis=0)
            if lo == hi:
                hi += 1 # Avoid divide by zero for degenerate (constant) features
            meani = np.mean(col[mask])

            ai = 1./(hi-lo)
            bi = -meani*ai

            col[mask] = ai*col[mask] + bi
            a.append(ai)
            b.append(bi)

        self._feature_preprocess = [ ('normalize', np.asarray(a).reshape((1,-1)), 
                                                   np.asarray(b).reshape((1,-1))) ]

        nsequence = len(self._sequences[0])
        assert len(features) == nsequence, "Number of rows in Features file must match number of rows in Sequences file."
        self._features = features
        
        logging.info("... load took %.2fs" % toc())