def _findFreeRow(self, record):
    """Find the next free (empty) row in the month sheet for `record`.

    The target sheet is selected from the record's date (month number ->
    sheet name via constants.MONTH_NUM_TO_STRING).  Results are cached in
    self.emptyRowsPerSheet as {sheetName: (row_number, column_letter)}; on a
    cache hit the cached row is simply advanced by one, on a miss the sheet
    is scanned for the expense-table header signature and then for the first
    fully-empty row below it.

    :param record: object exposing getDate() -> time.struct_tm (tm_mon used)
    :return: tuple (first_empty_row_number, column_letter) for the sheet
    """
    expenseStartColumn, expenseStartRow, expenseEndColumn, expenseEndRow = 0, 0, 0, 0
    sheetName = constants.MONTH_NUM_TO_STRING[record.getDate().tm_mon]
    # Check if we already found the empty row for this sheet; if so, the next
    # free row is simply one past the previously returned one.
    if sheetName in self.emptyRowsPerSheet:
        self.emptyRowsPerSheet[sheetName] = (self.emptyRowsPerSheet[sheetName][0] + 1, self.emptyRowsPerSheet[sheetName][1])
    else:
        # Cache miss: scan the sheet for the 4-cell header signature
        # (Date/Category/Description/Cost) that marks the expense table.
        self.ws = self.wb.sheet_by_name(sheetName)
        recordsSignature = [constants.DATE_SIG, constants.CATEGORY_SIG, constants.DESCRIPTION_SIG, constants.COST_SIG]
        num_rows = self.ws.nrows - 1
        num_cells = self.ws.ncols - 1
        curr_row = -1
        while curr_row < num_rows:
            curr_row += 1
            curr_cell = -1
            while curr_cell < num_cells:
                # Stop scanning columns past the configured maximum.
                if curr_cell > constants.MAX_COL_NUM:
                    break
                curr_cell += 1
                # Read 4 consecutive cells and compare to the header signature.
                # NOTE(review): curr_cell + 3 may exceed ncols-1 near the right
                # edge — presumably MAX_COL_NUM guards this; confirm.
                rowTuple = [self.ws.cell_value(curr_row, curr_cell + i) for i in range(4)]
                if rowTuple == recordsSignature:
                    # Data begins on the row directly below the header.
                    expenseStartColumn = curr_cell
                    expenseStartRow = curr_row + 1
                    expenseEndColumn = curr_cell + 4
        # Collect the table's data rows (signature-wide slices) below the header.
        wsRows = [self.ws.row_values(rowx=expenseStartRow + i, start_colx=expenseStartColumn, end_colx=expenseEndColumn) for i in range(num_rows - expenseStartRow)]
        # Track the FIRST empty row that is not followed by more data: a later
        # non-empty row resets the flag so trailing blanks win.
        foundEmptyLine = False
        emptyRow = -1
        for rowNumber, line in enumerate(wsRows):
            if all(item == "" for item in line) and not foundEmptyLine:
                foundEmptyLine = True
                emptyRow = rowNumber + 1
            elif not all(item == "" for item in line) and foundEmptyLine:
                foundEmptyLine = False
        # Convert the table-relative row index back to an absolute sheet row.
        firstEmptyRow = emptyRow + int(expenseStartRow)
        # colname() converts the 0-based column index to a spreadsheet letter.
        self.emptyRowsPerSheet[sheetName] = (firstEmptyRow, colname(colx=expenseStartColumn))
    return self.emptyRowsPerSheet[sheetName]
def read_datacolumns(book):
    ''' @return an array of data column definition dicts

    Reads the "Data Columns" worksheet (one definition per worksheet COLUMN,
    labels down the first column) into dicts keyed by the canonical field
    names in `labels`, then matches each definition to a column on the
    "Data" worksheet (by explicit worksheet_column letter, or by
    name/display_name), falling back to `default_reagent_columns` for
    unmatched Data-sheet headers.

    :param book: an open xlrd Workbook containing "Data Columns" and "Data"
    :raises Exception: if a required field (name, data_type) is blank or missing
    '''
    data_column_sheet = book.sheet_by_name('Data Columns')
    # Maps the human-readable row labels in column A to DataColumn field names.
    labels = {
        'Worksheet Column': 'worksheet_column',
        '"Data" Worksheet Column': 'worksheet_column',
        'Display Order': 'display_order',
        'Display Name': 'display_name',
        'Name': 'name',
        'Data Type': 'data_type',
        'Decimal Places': 'precision',
        'Description': 'description',
        'Replicate Number': 'replicate',
        'Unit': 'unit',
        'Assay readout type': 'readout_type',
        'Comments': 'comments',
    }
    dc_definitions = []
    # Build per-field type converters from the DataColumn model fields so cell
    # values can be coerced to the right Python type.
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']
    logger.info('read the data column definitions...')
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)
        if i == 0:
            # One definition dict per data column (cells after the label cell).
            for val in row_values[1:]:
                dc_definitions.append({})
        label_read = row_values[0]
        # Case-insensitive match of the row label against the known labels.
        recognized_label = next(
            (field_name for label, field_name in labels.items()
                if label_read and label.lower() == label_read.lower()), None)
        if recognized_label:
            logger.debug('label: %r, recognized_label: %r' % (label_read, recognized_label))
            for j, val in enumerate(row_values[1:]):
                dc_dict = dc_definitions[j]
                logger.debug('data column %s:%d:%d:%r' % (recognized_label, i, j, val))
                final_val = util.convertdata(
                    val, type_lookup.get(recognized_label, None))
                if final_val != None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = (dc_dict['display_order'] + 10)
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+', dc_dict['name'])
                        # convert multi-word names to camelCase, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    # Blank cell: only an error if the field is required.
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r' %
                            (recognized_label, colname(j + 1)))
        else:
            logger.debug('unrecognized label in "Data Columns" sheet %r' % label_read)
    # Verify every definition has all required fields.
    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception('required "Data Column" label not defined %r' % label)
    logger.info('find the data columns on the "Data" sheet...')
    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i, data_label in enumerate(data_sheet_labels):
        # A blank header cell terminates the header row scan.
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break
        data_label = data_label.upper()
        col_letter = colname(i)
        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:
                # Explicit column letter wins over name matching.
                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            elif 'name' in dc_dict or 'display_name' in dc_dict:
                # Otherwise match the header text by name or display_name.
                if (dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            if _dict and 'display_order' not in _dict:
                # +10 keeps parity with the offset applied to explicit orders.
                _dict['display_order'] = i + 10
                logger.warn('auto assigning "display_order" for col %r as %d'
                    % (_dict['name'], i + 10))
        if i not in data_labels_found:
            # Fall back to the built-in reagent column definitions.
            logger.debug(('Data sheet label not found %r,'
                ' looking in default reagent definitions %s')
                % (data_label, default_reagent_columns.keys()))
            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
    # Anything still unmatched (and not a known meta column) is reported.
    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip()
        and i not in data_labels_found and data_label not in meta_columns]
    if data_labels_not_found:
        logger.warn('data sheet labels not recognized %s' % data_labels_not_found)
    # for legacy datasets: make sure the small molecule column 1 is always created
    small_mol_col = None
    for dc_dict in dc_definitions_found:
        if dc_dict['data_type'] == 'small_molecule':
            small_mol_col = dc_dict
            break
    if not small_mol_col:
        dc_definitions_found.append(default_reagent_columns['Small Molecule Batch'])
    logger.info('data column definitions found: %s'
        % [x['display_name'] for x in dc_definitions_found])
    return dc_definitions_found
dc_definition['display_order'] = i datacolumn = DataColumn(**dc_definition) datacolumn.save() if not small_molecule_col and datacolumn.data_type == 'small_molecule': small_molecule_col = datacolumn logger.debug('datacolumn created: %r' % datacolumn) if datacolumn.worksheet_column: col_to_dc_map[int_for_col( datacolumn.worksheet_column)] = datacolumn logger.debug('final data columns: %s' % col_to_dc_map) logger.debug('read the Data sheet') data_sheet = book.sheet_by_name('Data') for i, label in enumerate(data_sheet.row_values(0)): logger.debug('find datasheet label %r:%r' % (colname(i), label)) if label in meta_columns: meta_columns[label] = i continue logger.debug('meta_columns: %s, datacolumnList: %s' % (meta_columns, col_to_dc_map)) logger.debug('read the data sheet, save_interval: %d' % save_interval) loopStart = time.time() pointsSaved = 0 rows_read = 0 col_to_dc_items = col_to_dc_map.items() for i in xrange(data_sheet.nrows - 1): current_row = i + 2
def read_data(book, col_to_dc_map, first_small_molecule_column, dataset):
    """Read the "Data" worksheet and bulk-create DataRecords/DataPoints.

    Each data row becomes a DataRecord (with optional control_type) plus one
    DataPoint per mapped data column; records are flushed to the database in
    batches of `save_interval` via bulk_create_datarecords /
    bulk_create_datapoints.  Also locates the meta columns (by mutating the
    module-level `meta_columns` dict with their column indexes) and reads
    plate/well info when a 'Plate' meta column is present.

    :param book: open xlrd Workbook containing a "Data" sheet
    :param col_to_dc_map: {0-based column index: DataColumn} mapping
    :param first_small_molecule_column: DataColumn used for plate/well linkage
    :param dataset: the owning DataSet model instance
    """
    datarecord_batch = []
    save_interval = 1000  # rows per DB flush
    logger.debug('read the Data sheet')
    data_sheet = book.sheet_by_name('Data')
    # Record the column index of each recognized meta column from the header row.
    for i, label in enumerate(data_sheet.row_values(0)):
        logger.debug('find datasheet label %r:%r' % (colname(i), label))
        if label in meta_columns:
            meta_columns[label] = i
            continue
    logger.debug('meta_columns: %s, datacolumnList: %s' % (meta_columns, col_to_dc_map))
    logger.debug('read the data sheet, save_interval: %d' % save_interval)
    loopStart = time.time()
    pointsSaved = 0
    rows_read = 0
    for i in xrange(data_sheet.nrows - 1):
        # current_row is the 1-based spreadsheet row (header is row 1).
        current_row = i + 2
        row = data_sheet.row_values(i + 1)
        r = util.make_row(row)
        datarecord = DataRecord(dataset=dataset)
        # meta_columns values are -1 until found in the header scan above.
        if meta_columns['Control Type'] > -1:
            datarecord.control_type = util.convertdata(r[meta_columns['Control Type']])
        datapoint_batch = []
        small_molecule_datapoint = None
        # NOTE(review): this inner loop reuses the name `i`, shadowing the
        # outer row index; harmless here only because the outer `i` is not
        # read again after this point in the iteration.
        for i, dc in col_to_dc_map.items():
            value = r[i]
            logger.debug('reading column %r, %s, val: %r' % (colname(i), dc, value))
            value = value.strip()
            value = util.convertdata(value)
            # Empty cells produce no datapoint.
            if not value:
                continue
            datapoint = _create_datapoint(dc, dataset, datarecord, value)
            datapoint_batch.append(datapoint)
            pointsSaved += 1
            # Remember the first small-molecule datapoint for plate/well linkage.
            if not small_molecule_datapoint and dc.data_type == 'small_molecule':
                small_molecule_datapoint = datapoint
        if meta_columns['Plate'] > -1:
            _read_plate_well(
                meta_columns['Plate'], r, current_row, datarecord,
                first_small_molecule_column, small_molecule_datapoint, datapoint_batch)
        datarecord_batch.append((datarecord, datapoint_batch))
        rows_read += 1
        # Flush a full batch: records first (to get PKs), then their points.
        if (rows_read % save_interval == 0):
            bulk_create_datarecords(datarecord_batch)
            logger.debug('datarecord batch created, rows_read: %d , time (ms): %d'
                % (rows_read, time.time() - loopStart))
            count = bulk_create_datapoints(datarecord_batch)
            logger.debug('datapoints created in batch: %d ' % count)
            datarecord_batch = []
    # Flush the final partial batch.
    bulk_create_datarecords(datarecord_batch)
    et = time.time() - loopStart
    logger.debug('final datarecord batch created, '
        'rows_read: %d, time (ms): %d' % (rows_read, et))
    count = bulk_create_datapoints(datarecord_batch)
    logger.debug('created dps %d' % count)
    print 'Finished reading, rows_read: ', rows_read, ', points Saved: ', pointsSaved
    # NOTE(review): et / rows_read raises ZeroDivisionError for an empty sheet.
    print 'elapsed: ', et, 'avg: ', et / rows_read
    # Drop any DataColumns that ended up with no datapoints.
    cleanup_unused_datacolumns(dataset)
dc_dict[recognized_label] = final_val if recognized_label == 'display_order': # add 10 to the order, so default reagent cols can go first dc_dict['display_order'] = ( dc_dict['display_order'] + 10) if recognized_label == 'name': # split on non-alphanumeric chars temp = re.split(r'[^a-zA-Z0-9]+',dc_dict['name']) # convert, if needed if len(temp) > 1: dc_dict['name'] = camel_case_dwg(dc_dict['name']) else: if recognized_label in required_labels: raise Exception( 'Error, data column field is required: %s, col: %r' % ( recognized_label, colname(j+1) ) ) else: logger.debug( 'unrecognized label in "Data Columns" sheet %r' % label_read) for dc_dict in dc_definitions: for label in required_labels: if label not in dc_dict: raise Exception( 'required "Data Column" label not defined %r' % label) logger.info('find the data columns on the "Data" sheet...') data_sheet = book.sheet_by_name('Data') data_sheet_labels = data_sheet.row_values(0) dc_definitions_found = []
def read_datacolumns(book):
    ''' @return an array of data column definition dicts

    Reads the "Data Columns" worksheet (one definition per worksheet COLUMN,
    labels down the first column) into dicts keyed by the canonical field
    names in `labels`, then matches each definition to a column on the
    "Data" worksheet (by explicit worksheet_column letter, or by
    name/display_name), falling back to `default_reagent_columns` for
    unmatched Data-sheet headers.

    :param book: an open xlrd Workbook containing "Data Columns" and "Data"
    :raises Exception: if a required field (name, data_type) is blank or missing
    '''
    data_column_sheet = book.sheet_by_name('Data Columns')
    # Maps the human-readable row labels in column A to DataColumn field names.
    labels = {
        'Worksheet Column':'worksheet_column',
        '"Data" Worksheet Column':'worksheet_column',
        'Display Order':'display_order',
        'Display Name':'display_name',
        'Name':'name',
        'Data Type':'data_type',
        'Decimal Places':'precision',
        'Description':'description',
        'Replicate Number':'replicate',
        'Unit':'unit',
        'Assay readout type':'readout_type',
        'Comments':'comments',
    }
    dc_definitions = []
    # Build per-field type converters from the DataColumn model fields so cell
    # values can be coerced to the right Python type.
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']
    logger.info('read the data column definitions...')
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)
        if i == 0:
            # One definition dict per data column (cells after the label cell).
            for val in row_values[1:]:
                dc_definitions.append({})
        label_read = row_values[0]
        # Case-insensitive match of the row label against the known labels.
        recognized_label = next(
            (field_name for label, field_name in labels.items()
                if label_read and label.lower() == label_read.lower()), None)
        if recognized_label:
            logger.debug(
                'label: %r, recognized_label: %r' % (label_read, recognized_label))
            for j, val in enumerate(row_values[1:]):
                dc_dict = dc_definitions[j]
                logger.debug('data column %s:%d:%d:%r' % (
                    recognized_label, i, j, val))
                final_val = util.convertdata(
                    val, type_lookup.get(recognized_label, None))
                if final_val != None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = (
                            dc_dict['display_order'] + 10)
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+', dc_dict['name'])
                        # convert multi-word names to camelCase, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    # Blank cell: only an error if the field is required.
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r' % (
                                recognized_label,
                                colname(j + 1)))
        else:
            logger.debug(
                'unrecognized label in "Data Columns" sheet %r' % label_read)
    # Verify every definition has all required fields.
    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception(
                    'required "Data Column" label not defined %r' % label)
    logger.info('find the data columns on the "Data" sheet...')
    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i, data_label in enumerate(data_sheet_labels):
        # A blank header cell terminates the header row scan.
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break
        data_label = data_label.upper()
        col_letter = colname(i)
        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:
                # Explicit column letter wins over name matching.
                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            elif 'name' in dc_dict or 'display_name' in dc_dict:
                # Otherwise match the header text by name or display_name.
                if (dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            if _dict and 'display_order' not in _dict:
                # +10 keeps parity with the offset applied to explicit orders.
                _dict['display_order'] = i + 10
                logger.warn(
                    'auto assigning "display_order" for col %r as %d'
                    % (_dict['name'], i + 10))
        if i not in data_labels_found:
            # Fall back to the built-in reagent column definitions.
            logger.debug(
                ('Data sheet label not found %r,'
                 ' looking in default reagent definitions %s')
                % (data_label, default_reagent_columns.keys()))
            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
    # Anything still unmatched (and not a known meta column) is reported.
    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip()
        and i not in data_labels_found and data_label not in meta_columns]
    if data_labels_not_found:
        logger.warn(
            'data sheet labels not recognized %s' % data_labels_not_found)
    # for legacy datasets: make sure the small molecule column 1 is always created
    small_mol_col = None
    for dc_dict in dc_definitions_found:
        if dc_dict['data_type'] == 'small_molecule':
            small_mol_col = dc_dict
            break
    if not small_mol_col:
        dc_definitions_found.append(default_reagent_columns['Small Molecule Batch'])
    logger.info('data column definitions found: %s'
        % [x['display_name'] for x in dc_definitions_found])
    return dc_definitions_found
or dc_definition['display_order']==None): dc_definition['display_order']=i datacolumn = DataColumn(**dc_definition) datacolumn.save() if not small_molecule_col and datacolumn.data_type == 'small_molecule': small_molecule_col = datacolumn logger.debug('datacolumn created: %r' % datacolumn) if datacolumn.worksheet_column: col_to_dc_map[int_for_col(datacolumn.worksheet_column)] = datacolumn logger.debug('final data columns: %s' % col_to_dc_map) logger.debug('read the Data sheet') data_sheet = book.sheet_by_name('Data') for i,label in enumerate(data_sheet.row_values(0)): logger.debug('find datasheet label %r:%r' % (colname(i), label)) if label in meta_columns: meta_columns[label] = i continue logger.debug('meta_columns: %s, datacolumnList: %s' % (meta_columns, col_to_dc_map) ) logger.debug('read the data sheet, save_interval: %d' % save_interval) loopStart = time.time() pointsSaved = 0 rows_read = 0 col_to_dc_items = col_to_dc_map.items() for i in xrange(data_sheet.nrows-1): current_row = i + 2