def _open_sequences(self):
    """Loads the raw sequences, storing them as lists of strings."""
    logging.info("loading %s ..." % basename(self._sequencefile))
    tic()

    # Read the entire data file into one big string.
    # Manually scanning the string for newlines and tabs is 3x faster than
    # using readlines() and then calling split() on each line.
    with open(self._sequencefile) as f:
        f.readline()  # discard header
        txt = f.read()
    assert txt[-1] == '\n', "Sequence file must end with a newline."

    for name in self.sequence_names:
        logging.info(" %s" % name)

    foldfilter = self._foldfilter
    maxrows = self._maxrows
    seqloop = range(len(self.sequence_names)-1)  # Used in innermost loop
    revcomp = self._reverse_complement  # Currently unused in this method

    # Store each column as its own list of sequences.
    # Scan through the txt string until we've hit the end.
    sequences = [[] for s in self.sequence_names]
    rowidx = []
    i, j = 0, txt.find('\n')
    for row_index in xrange(len(txt)):  # len(txt) is a safe upper bound on the number of lines
        if j == -1 or len(rowidx) >= maxrows:
            break
        # Check that this row's FoldID (first char of any new line) is wanted.
        if txt[i] in foldfilter:
            # Add each sequence in this row to its corresponding list.
            k = txt.find('\t', i+2)  # k = index of first char of first sequence in this row
            for s in seqloop:
                i, k = k+1, txt.find('\t', k+1)
                sequences[s].append(txt[i:k])  # Pull out column 's' sequence
            i, k = k+1, txt.find('\t', k+1)
            if k == -1 or k > j:
                k = j  # The next tab is on the next line, so break at the newline.
            sequences[-1].append(txt[i:k])  # Pull out the last column's sequence
            rowidx.append(row_index)  # Also remember the original row index of this example.
        # Advance so that txt[i:j] is the next line. The last character of the file must be '\n'.
        i, j = j+1, txt.find('\n', j+1)
    txt = None  # Release the gigantic string immediately (helps debugger stability).

    # Convert row indices to a numpy array for faster indexing when loading features/targets.
    self.rowidx = np.asarray(rowidx, np.uint32).reshape((-1, 1))
    self._sequences = sequences
    logging.info("... load took %.2fs" % toc())
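# The scanning idiom above is easier to see in isolation. The sketch below is
# illustrative only (scan_rows is a hypothetical helper, not part of this class):
# it walks one big string with str.find the same way _open_sequences does, though
# for brevity it falls back to the per-line split() that the real loop avoids.
def scan_rows(txt, foldfilter):
    """Return the tab-separated fields of each row whose FoldID passes the filter."""
    rows = []
    i, j = 0, txt.find('\n')
    while j != -1:
        if txt[i] in foldfilter:               # FoldID is the first char of the line
            rows.append(txt[i:j].split('\t'))  # split only the accepted rows
        i, j = j+1, txt.find('\n', j+1)        # advance to the next line
    return rows

# Example: keep only rows whose FoldID is 'A' or 'B':
#   scan_rows("A\tACGT\tTTTT\nC\tGGGG\tCCCC\nB\tAAAA\tACGT\n", "AB")
#   -> [['A', 'ACGT', 'TTTT'], ['B', 'AAAA', 'ACGT']]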
def _open_targets(self):
    if self._targetfile is None:
        return
    _log_xform_warned = False
    logging.info("loading %s ..." % basename(self._targetfile))
    tic()

    # Read the entire targets file as one string, convert it to a numpy array,
    # and slice out just the rows that we're using.
    # It turns out this strategy is MUCH faster than using numpy.loadtxt:
    #    targets = np.loadtxt(self._targetfile, np.float32,
    #                         delimiter='\t', skiprows=1, ndmin=2)
    with open(self._targetfile) as f:
        f.readline()  # discard header
        txt = f.read()
    ntarget = len(self.target_names)
    ntask = len(self._task_ids)
    rowidx = self.rowidx
    maxrows_to_read = int(rowidx[-1]) + 1
    if self._targetfile_cols:
        # np.fromstring is fast but doesn't support the presence of non-numeric columns.
        targets = np.asarray([[float(x) for x in line.split('\t')[self._targetfile_cols]]
                              for line in txt.split('\n', maxrows_to_read)[:-1]])
    else:
        targets = np.fromstring(txt, np.float32, sep='\t',
                                count=ntarget*maxrows_to_read).reshape(-1, ntarget)
    txt = None
    if len(targets) > len(rowidx):
        targets = targets[rowidx.ravel(), :]

    # Select columns using '_task_ids' no matter what, since the order might be different.
    usecols = np.asarray([self.target_names.index(name) for name in self._task_ids])  # ndarray for faster indexing
    targets = targets[:, usecols]

    # Normalize targets by scaling the lo/hi range to [0,1].
    if targets.size > 0:
        # OPTIONAL: clamp all originally negative values at zero
        #targets = np.maximum(0, targets)

        # For each individual column, get the lo/hi percentiles
        # and then normalize the non-NaN values in that column.
        a, b = [], []
        for i in range(ntask):
            target_i = targets[:, i]
            mask_i = ~np.isnan(target_i)
            is_boolean = np.all(np.logical_or(target_i[mask_i] == 0, target_i[mask_i] == 1))
            if is_boolean:
                # Automatically assume a 0/1 classification target.
                logging.info(" %s \t(classification)" % self._task_ids[i])
                ai, bi = 1, 0
            else:
                # Automatically assume a regression target.
                logging.info(" %s \t(regression)" % self._task_ids[i])
                if "log" in self._preprocess:
                    if not np.all(target_i[mask_i] >= 0):
                        if not _log_xform_warned:
                            _log_xform_warned = True
                            logging.warning("log transform requires all original targets to be "
                                            "non-negative; biasing the data and proceeding anyway.")
                        target_i[mask_i] -= target_i[mask_i].min()
                    target_i[mask_i] = np.log(1 + target_i[mask_i])
                elif "sqrt" in self._preprocess:
                    if not np.all(target_i[mask_i] >= 0):
                        if not _log_xform_warned:
                            _log_xform_warned = True
                            logging.warning("sqrt transform requires all original targets to be "
                                            "non-negative; biasing the data and proceeding anyway.")
                        target_i[mask_i] -= target_i[mask_i].min()
                    target_i[mask_i] = np.sqrt(target_i[mask_i])

                lo_i, hi_i = np.percentile(target_i[mask_i], [0.0, 1.0])
                #lo_i, hi_i = np.percentile(target_i[mask_i], [0.05, 99.95])
                if lo_i == hi_i:
                    hi_i += 1  # Avoid divide by zero for degenerate targets.

                # Convert everything below the "lo" threshold to NaNs.
                tmp = target_i[mask_i]
                tmp[tmp < lo_i] = np.nan
                target_i[mask_i] = tmp
                mask_i = ~np.isnan(target_i)

                # Convert everything above the "hi" threshold to NaNs.
                tmp = target_i[mask_i]
                tmp[tmp > hi_i] = np.nan
                target_i[mask_i] = tmp
                mask_i = ~np.isnan(target_i)

                # Clamp everything to the range [lo,hi]; assume anything above hi_i is a "large" outlier.
                #target_i[mask_i] = np.maximum(lo_i, target_i[mask_i])
                #target_i[mask_i] = np.minimum(hi_i, target_i[mask_i])

                # Subtract the mean (or lo_i, for logistic-output models).
                if self._requirements.get('target', None) == 'logistic':
                    intercept_i = lo_i
                else:
                    intercept_i = np.mean(target_i[mask_i])
                ai = 1. / (hi_i - lo_i)
                bi = -intercept_i * ai
                target_i[mask_i] = ai*target_i[mask_i] + bi
                #mask_pos = target_i[mask_i] > 0
                #target_i[mask_i][mask_pos] **= 0.5
            a.append(ai)
            b.append(bi)
        if "log" in self._preprocess:
            self._targets_preprocess.append(('log',))
        self._targets_preprocess.append(('normalize',
                                         np.asarray(a).reshape((1, -1)),
                                         np.asarray(b).reshape((1, -1))))
        #targets[self._targets_mask] = np.maximum(0, targets[self._targets_mask])
        #targets[self._targets_mask] = np.minimum(1, targets[self._targets_mask])

    self._targets = targets
    self._targets_mask = ~np.isnan(targets)
    logging.info("... load took %.2fs" % toc())
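# Each ('normalize', a, b) entry recorded in _targets_preprocess encodes the affine
# map y' = a*y + b, with a = 1/(hi-lo) and b = -intercept*a, so model outputs can be
# mapped back to the original target scale by inverting it. A minimal sketch
# (undo_normalize is a hypothetical helper, not part of this class):
def undo_normalize(preprocessed, a, b):
    """Invert the affine normalization; a and b are the (1,ntask) arrays stored above."""
    return (preprocessed - b) / a

# Example: with a = 1/(hi-lo) = 0.5 and b = -1.0, a normalized value of 0.5 maps
# back to (0.5 - (-1.0)) / 0.5 = 3.0 on the original scale.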
def _open_features(self):
    if self._featurefile is None:
        return
    logging.info("loading %s ..." % basename(self._featurefile))
    tic()

    # Read the entire features file as one string, convert it to a numpy array,
    # and slice out just the rows that we're using.
    # It turns out this strategy is MUCH faster than using numpy.loadtxt:
    #    features = np.loadtxt(self._featurefile, np.float32,
    #                          delimiter='\t', skiprows=1, ndmin=2)
    with open(self._featurefile) as f:
        f.readline()  # discard header
        txt = f.read()

    for name in self.feature_names:
        logging.info(" %s" % name)

    nfeature = len(self.feature_names)
    rowidx = self.rowidx
    maxrows_to_read = int(rowidx[-1]) + 1
    if self._featurefile_cols:
        # np.fromstring is fast but doesn't support the presence of non-numeric columns.
        raise NotImplementedError("This code should work but has not been tested.")
        features = np.asarray([[float(x) for x in line.split('\t')[self._featurefile_cols]]
                               for line in txt.split('\n', maxrows_to_read)[:-1]])
    else:
        features = np.fromstring(txt, np.float32, sep='\t',
                                 count=nfeature*maxrows_to_read).reshape(-1, nfeature)
    txt = None
    if len(features) > len(rowidx):
        features = features[rowidx.ravel(), :]

    # Preprocess each feature column by scaling its range to [0,1] and centering its mean at 0.
    a, b = [], []
    for i in range(nfeature):
        col = features[:, i:i+1]
        mask = ~np.isnan(col)
        lo = np.min(col[mask], axis=0)
        hi = np.max(col[mask], axis=0)
        if lo == hi:
            hi += 1  # Avoid divide by zero for degenerate features.
        meani = np.mean(col[mask])
        ai = 1. / (hi - lo)
        bi = -meani * ai
        col[mask] = ai*col[mask] + bi
        a.append(ai)
        b.append(bi)
    self._feature_preprocess = [('normalize',
                                 np.asarray(a).reshape((1, -1)),
                                 np.asarray(b).reshape((1, -1)))]

    nsequence = len(self._sequences[0])
    assert len(features) == nsequence, "Number of rows in Features file must match number of rows in Sequences file."
    self._features = features
    logging.info("... load took %.2fs" % toc())
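# The stored _feature_preprocess list can be replayed on new feature rows so they are
# scaled exactly like the training data. A minimal sketch, assuming the same
# ('normalize', a, b) convention as above (apply_feature_preprocess is a hypothetical
# helper, not part of this class):
def apply_feature_preprocess(preprocess, features):
    """Apply each recorded ('normalize', a, b) step as features*a + b, leaving NaNs alone."""
    features = np.array(features, np.float32, copy=True)
    for step in preprocess:
        if step[0] == 'normalize':
            _, a, b = step
            mask = ~np.isnan(features)
            features[mask] = (features*a + b)[mask]  # a, b broadcast (1,nfeature) over rows
    return features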