Example No. 1
  def fetch(self, data=None, headers=None):
    """Fetch http file from network.

    Args:
      data: {str:*} of data to be sent via HTTP
      headers: {str:str} of additional request HTTP headers
    Returns:
      [*str] file-pointer-like HTTP response stream, or None if the URL type is unknown.
    """
    # Fetch request.
    if self.type == "http":
      rsp = self._fetch_http(data, headers)
    elif self.type == "ftp":
      rsp = self._fetch_ftp()
    else:
      Log.warning("Unknown type, cannot fetch %s for %s." % self.url, self)
      return None

    self.status = 200
    # Convert header keys into all lower case.
    self.headers = {}
    for key, value in dict(rsp.info()).items():
      self.headers[key.lower()] = value
    self.url_rsp = rsp.geturl()
    
    return rsp
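
The fetch() above delegates the actual network call to _fetch_http(), which is
not shown here. A hedged sketch of what such a helper might look like, assuming
Python 2's urllib2 and a self.url attribute (the caller only relies on the
response supporting info(), geturl(), and read()):

  def _fetch_http(self, data=None, headers=None):
    """Hypothetical sketch of the HTTP branch used by fetch()."""
    import urllib
    import urllib2
    # URL-encode dict data for a POST body; leaving it None keeps a GET request.
    body = urllib.urlencode(data) if data is not None else None
    req = urllib2.Request(self.url, body, headers or {})
    # urlopen() returns a file-pointer-like response with info() and geturl().
    return urllib2.urlopen(req)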
Example No. 2
  def close(self):
    """Close any open file pointers, close and finalize cache file.
    """
    # Ignore repeated calls to close()
    if self.closed:
      Log.info("Redundant call to close(), Ignored for %s." % self)
      return
    else:
      Log.info("Closing %s..." % self)

    # Handle finalize requests to complete download to buffer.
    if self.finalize:
      if not self.completed and self.cache:
        Log.info("Finalizing download of %s." % self)
        # Read remaining buffer unconditionally. Use iterator if reporting.
        if self.report:
          while True:
            try:
              self.next()
            except StopIteration:
              break
        else:
          self.read()
        # If not closed in previous read(), try another read().
        if not self.closed:
          # This closes self since the previous read flushed the buffer.
          self.read()
        if not self.closed:
          Log.warning("Close sequence not completed as expected for %s." % self)
        # Exit: prior reads in the finalize process already closed self.
        return

    # Do not call self.buffer.close(): it causes bugs with FTP. Python sockets
    #   clean up after themselves during garbage collection, so just remove the
    #   reference to the buffer instead.
    # self.buffer.close()
    self.buffer = None
    self.fp_out.close()

    if self.completed:
      Log.info("Download complete. %d bytes read." % (self.bytes_read))
      # Finalize cache.
      if self.cache:
        os.rename(self.tmp_filepath, self.dest_filepath)
        Log.info("Cache finalized as '%s'." % (self.dest_filepath))
    else:
      Log.info("Download closed before completion. %d bytes read." % \
               (self.bytes_read))
      # Flush cache.
      if self.cache:
        os.remove(self.tmp_filepath)
        Log.info("Incomplete cache '%s' deleted." % (self.tmp_filepath))
        
    # Flag self as closed to prevent redundant .close() calls.
    self.closed = True
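
A hedged usage sketch of the lifecycle this close() implements; the HTTPFetcher
name and constructor arguments are assumptions for illustration, not the
project's actual API:

  # Hypothetical usage: read part of a cached download, then let close() finish it.
  fetcher = HTTPFetcher(url, cache=True, finalize=True)  # assumed constructor
  header_bytes = fetcher.read(4096)  # partial read; remainder still buffered
  fetcher.close()   # drains the buffer, then renames the temporary cache file
  fetcher.close()   # redundant call: logged and ignored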
Example No. 3
  def __init__(self, gse, merge_cols=True, percentile=.75):
    """Initialize filter. Requires populated gse.

    Args:
      gse: GSE instance associated with row_iter
      merge_cols: bool if to merge columns if able
      percentile: float 0<x<=1 of top percent by std to keep
    """
    # 1. Require that GSE is populated and is of correct type.
    # ==========
    if not gse.populated:
      raise geo.NotPopulatedError, "%s must be populated to filter rows." % gse
    if gse.type != "eQTL":
      raise geo.StudyTypeMismatch, "%s must be type 'eQTL', not '%s'." % \
        (gse, gse.type)

    # 2. Set Attributes.
    # ==========
    self.gse = gse
    self.col_titles = self.gse.col_titles[:]
    self.col_map = None
    self.rows_filtered = []
    self.rows_per_gene = {}
    self.row_stats = {}
    self.merge_cols = merge_cols
    self.percentile = percentile
    
    # 3. Get column map for column merging.
    # ==========
    n_samples = len(self.gse.samples)
    n_uniques = len(self.gse.subject_gsms)

    # If there are more samples than unique subjects, then create column map.
    if self.merge_cols and n_samples > n_uniques:
      self.col_map = self._make_col_map()
      rx_str = self.gse.parameters['rx_gsm_subject_str']
      Log.info(("Created column merge map for %s (%d samples to %d subjects)" +\
        " with rx '%s'") % \
        (self.gse, n_samples, n_uniques, rx_str))
      # Verify that column merge map is reasonable (num uniques + 1 for ID column)
      if len(self.col_map) != n_uniques + 1:
        Log.warning("Column merge map has %d classes, expected %d in %s." % \
                    (len(self.col_map), n_uniques + 1, self))
        
    # Otherwise, merging is disabled or unnecessary. Do not create a col_map.
    else:
      # Retrieve the regular expression used
      rx_str = self.gse.parameters['rx_gsm_subject_str']
      Log.info("No column merge map created for %s using rx '%s'. Merge_cols flag is %s" % \
        (self.gse, rx_str, self.merge_cols))
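
_make_col_map() is not shown in this example. As a hedged illustration of the
grouping it is expected to produce (one class per subject plus one for the ID
column, built by matching a regex against the sample column titles), a sketch
might look like this; the use of col_titles and of a capture group in
rx_gsm_subject_str are assumptions:

  def _make_col_map(self):
    """Hypothetical sketch: map each subject to the indices of its sample
    columns. Column 0 (ID_REF) keeps its own class."""
    import re
    rx = re.compile(self.gse.parameters['rx_gsm_subject_str'])
    col_map = {'ID_REF': [0]}
    for i, title in enumerate(self.col_titles[1:], 1):
      m = rx.search(title)
      # Assumes the regex captures the subject identifier in group 1.
      subject = m.group(1) if m else title
      col_map.setdefault(subject, []).append(i)
    return col_map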
Example No. 4
  def get_rows(self):
    """Return filtered row iterator.
    CLEAN THIS UP
    It may be best to break this into multiple filters?
    Fix to return [str]
    
    Returns:
      *[str] of filtered rows of data split by columns
    """
    Log.info("Initiated filter %s for rows of %s" % (self, self.gse))
    if self.col_map:
      Log.info("self.col_map exists. Merge %d to %d columns for %s" % \
               (len(self.col_titles), len(self.col_map), self))
    else:
      Log.info("No col_map. Will not merge %d columns for %s." % \
               (len(self.col_titles), self))

    # 0. Determine best gene name column in case GENE_SYMBOL does not exist.
    # ==========
    gene_symbol_name = None
    # Traverse column names in preferred order.
    for name in geo.GPL.EQTL_GENE_NAME_LIST:
      # Skip columns without assignments. Continue
      if self.gse.platform.special_cols[name] is None:
        continue
      # Choose the first column that has an acceptable assignment. Break.
      else:
        actual_column_name = self.gse.platform.special_cols[name]
        gene_symbol_name = name
        break
    # Verify that a column was chosen to identify the row.
    if gene_symbol_name:
      Log.info("Selected column '%s=>%s' to best represent gene name for %s." %\
        (gene_symbol_name, actual_column_name, self.gse.platform))
    else:
      raise MalformedFilterError, "Cannot select gene symbol column from %s" % \
        (self.gse.platform)
    
    # 1. Update column titles accounting for merged columns.
    # ==========
    if self.col_map:
      self.col_titles = self._merge_cols(self.col_titles, merge_titles)
      
    # Insert generated column titles (AFTER merging columns)
    # self.col_titles[0] should always be "ID_REF"
    col_titles_prefix = ["ID_REF", gene_symbol_name, "NUM_VALUES", "MEAN", "STD"]
    self.col_titles = col_titles_prefix + self.col_titles[1:]
    Log.info("Added %s, NUM_VALUES, MEAN, STD to col titles for %s." %\
             (gene_symbol_name, self))
             
    # Open new temporary file. XXX RENAME
    filepath = temp_file_name("%s.rowmerge" % self.gse.id)
    fp_out = open(filepath, "w")

    # 2: @DATAPASS 1: Merge columns, add gene symbol, filter non-genes.
    # ==========
    Log.info(("Started filter 1 in %s for %s: find and add gene, merge cols. " +
             "(This may take a while.)") % (self, self.gse))
      
    num_rows = 0
    for row in self.gse.get_rows():
      # TODO: Add status reporting to console
      num_rows += 1

      # Determine gene symbol for this row. Filter if no gene symbol exists.
      row_id = row[0] # Row ID should always be the first entry in a row.
      gene_sym = self.gse.platform.get_column(row_id, gene_symbol_name)
      if not gene_sym:
        self.rows_filtered.append(row_id)
        continue # skip this row
      else:
        self.rows_per_gene.setdefault(gene_sym, set()).add(row_id)
      
      # Merge columns using column mapping of series matrix columns.
      # Also, transform row into "floats" and None
      if self.col_map:
        # XXX: _merge_cols is slow, perhaps due to float conversions.
        row = self._merge_cols(row, merge_floats)
      else:
        row = map(get_float, row)

      # Compute mean and standard deviation of all non-ID columns
      # check for None specifically since a valid value could be 0
      filtered_row = filter(lambda x: x is not None, row[1:])
      std = calc_std(filtered_row)
      mean = calc_mean(filtered_row)
      num_values = len(filtered_row)
      # Store row statistics
      self.row_stats[row_id] = \
        {'num_values': num_values, 'mean': mean, 'std': std}

      # Insert (gene_sym, size, mean, std) into second column
      row = [row_id, gene_sym, num_values, mean, std] + row[1:]

      # Write row to temporary file.
      # TODO: I may want to compress my row by converting it to a pickle.
      # pickling a list of floats uses 2/3 space and takes 1/2 compute time.
      fp_out.write("\t".join(map(str, row)))
      fp_out.write("\n")
    fp_out.close()

    # Log results of filter pass 1
    # ==========
    n = len(self.rows_filtered)
    n_gene_rows = num_rows-n
    mean_rows_per_gene = float(num_rows-n)/len(self.rows_per_gene)
    
    if num_rows != self.gse.est_num_row:
      Log.warning("Num rows read(%d) not num rows expected(%d) for %s" % \
                  (num_rows, self.gse.est_num_row, self))
    Log.info(("Filter 1 complete for %s. " + \
      "%d of %d (%.2f%%) rows removed for no gene symbol. %d rows remain.") % \
      (self, n, num_rows, (n/float(num_rows))*100, n_gene_rows))
    Log.info("Number of unique genes: %d, %.1f mean num rows per gene." % \
      (len(self.rows_per_gene), mean_rows_per_gene))

    # 3: Choose representative genes from self.row_stats and self.rows_per_gene
    # ==========
    # Select one representative row per gene: if a gene has multiple rows,
    #   keep the row with the highest mean value.
    selected_row_ids = []
    for gene, row_ids in self.rows_per_gene.items():
      # If only a single row for this gene exists, choose it.
      if len(row_ids) == 1:
        best_row_id = row_ids.pop()
      # Else, choose row with the highest mean value. 
      else:
        s = sorted(row_ids, key=lambda x: self.row_stats[x]['mean'])
        best_row_id = s[-1]
      # Add this row_id to the accepted list
      selected_row_ids.append(best_row_id)

    n_single_gene_rows = len(selected_row_ids)
    Log.info("Selected %d of %d rows for %d genes by maximum row mean." % \
      (n_single_gene_rows, n_gene_rows, len(self.rows_per_gene)))

    # Sort row_ids by row standard deviation in decreasing order.
    selected_row_ids.sort(key=lambda x: self.row_stats[x]['std'], reverse=True)
    
    # Select top percentile by std. Convert type to set for easier membership tests.
    x = int(len(selected_row_ids)*self.percentile)
    selected_row_ids = set(selected_row_ids[:x])
    threshold_num_rows = len(selected_row_ids)
    assert(x == threshold_num_rows)
    Log.info("Selected top %d%% of rows (%d of %d) by standard deviation." % 
      (self.percentile*100, threshold_num_rows, n_single_gene_rows))
      
    # FINAL PASS: YIELD FILTERED LINES
    # ===========
    # Open temporary file generated in first pass.
    fp = open(filepath, "r")

    # Yield (modified) column titles.
    yield self.col_titles[:]
    
    # For each line, only yield if the row_id is in the selected_row_ids list.
    num_yielded_rows = 0
    for line in fp:
      row = line.strip().split("\t")
      row_id = row[0]
      if row_id in selected_row_ids:
        num_yielded_rows += 1
        yield row

    # All lines yielded. Check number of lines yielded with expected value.
    if num_yielded_rows != threshold_num_rows:
      Log.warning("%d yielded rows != %d expected number of rows." % \
        (num_yielded_rows, threshold_num_rows))
    else:
      Log.info("Filter complete. yielded %d rows." % (num_yielded_rows))
Example No. 5
  def __iter__(self):
    """Called at the start of iterator read loops."""
    if self.completed or self.closed:
      Log.warning("Iterator opened on closed or completed %s" % self)
    return self
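
Because __iter__() returns self, the class is expected to implement the
Python 2 iterator protocol's next() as well; close() in Example No. 2 drives it
until StopIteration. A hedged sketch of the shape such a method might take
(chunk_size is an assumed attribute, not taken from the source):

  def next(self):
    """Hypothetical sketch: return the next chunk, or raise StopIteration at EOF."""
    if self.closed:
      raise StopIteration
    chunk = self.read(self.chunk_size)  # assumed attribute for chunked reads
    if not chunk:
      raise StopIteration
    return chunk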