def readDataColumns(path):
    # Read in the DataColumn sheet
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)

    # TODO: use the import_utils methods here
    # TODO: compare and combine this with the fieldinformation entity
    labels = {'Worksheet Column': 'worksheet_column',
              'Display Order': 'display_order',
              'Name': 'name',
              'Data Type': 'data_type',
              'Decimal Places': 'precision',
              'Description': 'description',
              'Replicate Number': 'replicate',
              'Time point': 'time_point',
              'Assay readout type': 'readout_type',
              'Comments': 'comments'}

    # create an array of dicts; each dict defines a DataColumn
    dataColumnDefinitions = []
    # first put the label row in (it contains the worksheet column, and it is unique)
    for v in dataColumnSheet.labels[1:]:
        dataColumnDefinitions.append({labels['Worksheet Column']: v})

    # now, for each row, create the appropriate dictionary entry in dataColumnDefinitions
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        for i, cellText in enumerate(rowAsUnicode[1:]):
            for key, fieldName in labels.items():
                if re.match(key, keyRead, re.M | re.I):
                    # the row is one of the DataColumn fields: add it to the dict,
                    # converting the data to the model field type
                    dataColumnDefinitions[i][fieldName] = util.convertdata(
                        cellText, _typelookup.get(fieldName, None))
                else:
                    logger.debug(str(('Data Column definition not used: ', cellText)))

    logger.debug(str(('definitions: ', dataColumnDefinitions)))
    return dataColumnDefinitions
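# Editorial note (illustrative only): each entry in the list returned by
# readDataColumns is a plain dict keyed by DataColumn model field names.
# The example below is made up to show the expected shape; the values do not
# come from any real 'Data Columns' worksheet.
EXAMPLE_DATA_COLUMN_DEFINITION = {
    'worksheet_column': 'E',       # taken from the sheet's label row
    'display_order': 1,
    'name': 'percent_inhibition',
    'data_type': 'numeric',
    'precision': 2,                # from the 'Decimal Places' row
    'description': 'Percent inhibition relative to control',
    'replicate': 1,
    'readout_type': 'luminescence',
}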
def readDataColumns(path):
    # Read in the DataColumn Sheet
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)

    labels = {'Worksheet Column': 'worksheet_column',
              'Name': 'name',
              'Data Type': 'data_type',
              'Decimal Places': 'precision',
              'Description': 'description',
              'Replicate Number': 'replicate',
              'Time point': 'time_point',
              'Assay readout type': 'readout_type',
              'Comments': 'comments'}

    # create an array of dict's, each dict defines a DataColumn
    dataColumnDefinitions = []
    # first put the label row in (it contains the worksheet column, and its unique)
    for v in dataColumnSheet.labels[1:]:
        dataColumnDefinitions.append({labels['Worksheet Column']: v})

    # now, for each row, create the appropriate dictionary entry in the dataColumnDefinitions
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        for i, cellText in enumerate(rowAsUnicode[1:]):
            for key, fieldName in labels.items():
                if re.match(key, keyRead, re.M | re.I):
                    # if the row is one of the DataColumn fields, then add it to the dict
                    # Note: convert the data to the model field type
                    dataColumnDefinitions[i][fieldName] = util.convertdata(
                        cellText, _typelookup.get(fieldName, None))
                else:
                    pass  # print '"Data Column definition not used: ', cellText

    print "definitions: ", dataColumnDefinitions
    return dataColumnDefinitions
def readDataColumns(path):
    # Read in the DataColumn Sheet
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    # Look up all of the field types of the DataColumn table.
    # These will be used to validate input type by converting on read
    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)

    # TODO: use the import_utils methods here
    # TODO: compare and combine this with the fieldinformation entity
    labels = {'Worksheet Column': 'worksheet_column',
              'Display Order': 'display_order',
              'Name': 'name',
              'Display Name': 'display_name',
              'Data Type': 'data_type',
              'Decimal Places': 'precision',
              'Description': 'description',
              'Replicate Number': 'replicate',
              'Unit': 'unit',
              'Assay readout type': 'readout_type',
              'Comments': 'comments',
              'Protein HMS LINCS ID': 'protein',
              'Cell HMS LINCS ID': 'cell'}

    # create an array of dicts; each dict defines a DataColumn
    dataColumnDefinitions = []
    # Note we also allow a list of pro
    # first the label row (it contains the worksheet column; it is unique)
    for v in dataColumnSheet.labels[1:]:
        dataColumnDefinitions.append({labels['Worksheet Column']: v})
    logger.debug(str(('========== datacolumns:', dataColumnDefinitions)))

    # for each row, create the dictionary entry in the dataColumnDefinitions
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        for i, cellText in enumerate(rowAsUnicode[1:]):
            try:
                for key, fieldName in labels.items():
                    # if one of the DataColumn fields, add it to the dict
                    if re.match(key, keyRead, re.M | re.I):
                        if re.match('Protein HMS LINCS ID', keyRead, re.M | re.I):
                            facility_id = util.convertdata(cellText, int)
                            if facility_id:
                                dataColumnDefinitions[i][fieldName] = \
                                    Protein.objects.get(lincs_id=facility_id)
                        elif re.match('Cell HMS LINCS ID', keyRead, re.M | re.I):
                            facility_id = util.convertdata(cellText, int)
                            if facility_id:
                                dataColumnDefinitions[i][fieldName] = \
                                    Cell.objects.get(facility_id=facility_id)
                        else:
                            # Use the type from the fieldinformation table
                            # to read in the data for each DC field
                            dataColumnDefinitions[i][fieldName] = \
                                util.convertdata(
                                    cellText, _typelookup.get(fieldName, None))
                    else:
                        logger.debug(str(('Data Column definition not used: ', cellText)))
            except Exception, e:
                logger.error(str(('Exception reading data for cell', i, cellText, e)))
                raise e

    logger.debug(str(('definitions: ', dataColumnDefinitions)))
    return dataColumnDefinitions
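# Editorial sketch: the readers above and below reference module-level names
# that are not part of this excerpt: 're', 'logger', the helper modules 'iu'
# (import utils) and 'util', and the Django models DataColumn / Protein / Cell.
# read_datacolumns additionally assumes the helpers camel_case_dwg and colname
# and the constants default_reagent_columns and meta_columns are defined in
# this module. A minimal setup sketch follows; the import paths are
# placeholders, not the project's actual module layout.
import logging
import re

from db import import_utils as iu                 # placeholder path for 'iu'
from db import util                               # placeholder path for 'util'
from db.models import DataColumn, Protein, Cell   # placeholder path

logger = logging.getLogger(__name__)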
def read_datacolumns(book):
    '''
    @return an array of data column definition dicts
    '''
    data_column_sheet = book.sheet_by_name('Data Columns')

    labels = {
        'Worksheet Column': 'worksheet_column',
        '"Data" Worksheet Column': 'worksheet_column',
        'Display Order': 'display_order',
        'Display Name': 'display_name',
        'Name': 'name',
        'Data Type': 'data_type',
        'Decimal Places': 'precision',
        'Description': 'description',
        'Replicate Number': 'replicate',
        'Unit': 'unit',
        'Assay readout type': 'readout_type',
        'Comments': 'comments',
    }

    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)
        if i == 0:
            for val in row_values[1:]:
                dc_definitions.append({})
        label_read = row_values[0]
        recognized_label = next(
            (field_name for label, field_name in labels.items()
                if label_read and label.lower() == label_read.lower()), None)
        if recognized_label:
            logger.debug('label: %r, recognized_label: %r'
                % (label_read, recognized_label))
            for j, val in enumerate(row_values[1:]):
                dc_dict = dc_definitions[j]
                logger.debug('data column %s:%d:%d:%r'
                    % (recognized_label, i, j, val))
                final_val = util.convertdata(
                    val, type_lookup.get(recognized_label, None))
                if final_val != None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = dc_dict['display_order'] + 10
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+', dc_dict['name'])
                        # convert, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r'
                            % (recognized_label, colname(j + 1)))
        else:
            logger.debug('unrecognized label in "Data Columns" sheet %r'
                % label_read)

    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception(
                    'required "Data Column" label not defined %r' % label)

    logger.info('find the data columns on the "Data" sheet...')
    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i, data_label in enumerate(data_sheet_labels):
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break
        data_label = data_label.upper()
        col_letter = colname(i)
        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:
                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            elif 'name' in dc_dict or 'display_name' in dc_dict:
                if (dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            if _dict and 'display_order' not in _dict:
                _dict['display_order'] = i + 10
                logger.warn('auto assigning "display_order" for col %r as %d'
                    % (_dict['name'], i + 10))
        if i not in data_labels_found:
            logger.debug(('Data sheet label not found %r,'
                ' looking in default reagent definitions %s')
                % (data_label, default_reagent_columns.keys()))
            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)

    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip()
        and i not in data_labels_found
        and data_label not in meta_columns]
    if data_labels_not_found:
        logger.warn('data sheet labels not recognized %s' % data_labels_not_found)

    # for legacy datasets: make sure the small molecule column 1 is always created
    small_mol_col = None
    for dc_dict in dc_definitions_found:
        if dc_dict['data_type'] == 'small_molecule':
            small_mol_col = dc_dict
            break
    if not small_mol_col:
        dc_definitions_found.append(default_reagent_columns['Small Molecule Batch'])

    logger.info('data column definitions found: %s'
        % [x['display_name'] for x in dc_definitions_found])

    return dc_definitions_found
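# Usage sketch (assumption): read_datacolumns takes an already-opened workbook
# rather than a file path, and the accessors it uses (sheet_by_name, nrows,
# row_values) correspond to the xlrd Book/Sheet API. The file name below is
# illustrative only.
if __name__ == '__main__':
    import xlrd

    book = xlrd.open_workbook('dataset_template.xls')  # hypothetical file
    for dc in read_datacolumns(book):
        print dc.get('display_name'), dc.get('worksheet_column'), dc.get('data_type')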