def get_string_from_excel_cell(_xl_sheet, _row_index, _column_index):
    """Return the stripped text of one cell, or "" if the cell is not a text cell.

    Uses xlrd's ``ctype_text`` mapping to classify the cell type; any
    non-text cell (numbers, dates, blanks, errors) yields the empty string.
    """
    cell = _xl_sheet.cell(_row_index, _column_index)
    is_text = ctype_text.get(cell.ctype, "unknown type") == "text"
    return cell.value.strip() if is_text else ""
def write_table(s,row_offset,col1,num_cols,refs,ref_col,wr):
    """Write one worksheet table to a CSV writer, tracking parent-row references.

    Parameters
    ----------
    s : xlrd sheet object to read from.
    row_offset : hint passed to ``get_rowheader`` to find the header row.
    col1 : name of the first data column (resolved via ``get_col_index``).
    num_cols : number of consecutive columns to export starting at ``col1``.
    refs : list of reference values; mutated in place (see below) and returned.
    ref_col : name of the referenced column, or the literal string "NULL"
        when this is a child table that consumes ``refs`` instead.
    wr : csv.writer-like object receiving one list per row.

    Behavior (as implemented):
    - When ``ref_col != "NULL"`` (parent table): the value of ``ref_col`` for
      every exported row is appended to ``refs`` so child tables can join on it.
    - When ``ref_col == "NULL"`` (child table): ``refs[ref_row]`` is prepended
      to each output row; ``ref_row`` advances whenever the first data column
      transitions from non-empty to empty, i.e. blank rows delimit groups.
    Returns the (possibly grown) ``refs`` list.
    """
    rowheader = get_rowheader(s,row_offset)
    col1_index = get_col_index(s,rowheader,col1)
    # Parent tables have a referenced column; composed keys are not handled yet.
    if ref_col != "NULL":
        ref_index = get_col_index(s,rowheader,ref_col)
        refs.append(ref_col)
    ref_row=0
    for rowid in range(rowheader,s.nrows):
        if rowid == rowheader:
            # Header row: convert field names via format_header
            # (lower-case, blanks replaced by '_').
            head = []
            if ref_col == "NULL":
                head.append(format_header(refs[ref_row]))
            for colid in range(col1_index,(col1_index+num_cols)):
                head.append(format_header(s.cell(rowid,colid).value))
            wr.writerow(head)
        else:
            row = []
            cell_obj=s.cell(rowid,col1_index)
            cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
            # A non-empty -> empty transition in the first column marks the start
            # of the next reference group (child-table mode only).
            if cell_type_str == 'empty' and ctype_text.get(s.cell(rowid-1,col1_index).ctype, 'unknown type') != 'empty' and ref_col == "NULL":
                ref_row+=1
            if cell_type_str != 'empty':
                if ref_col == "NULL":
                    row.append(refs[ref_row])
                for colid in range(col1_index,(col1_index+num_cols)):
                    row.append(s.cell(rowid,colid).value)
                wr.writerow(row)
                # Parent table: remember this row's reference value for children.
                if ref_col != "NULL":
                    refs.append(s.cell(rowid,ref_index).value)
    return refs
def extract_po(wb: xlrd.book.Book) -> str: sheet = wb.sheet_by_index(0) # try col 9, 10 for i in range(9, 11): cell: xlrd.sheet.Cell = sheet.cell(2, i) if ctype_text.get(cell.ctype) != 'text': continue po = cell.value if po_re.match(po): return po return None
def show_column_names(xl_sheet):
    """Print the header row of *xl_sheet* and return its values.

    Prints one ``(index) value [type]`` line per header cell, bracketed by
    dashed separator banners, and returns a list of the cell values each
    prefixed with a single space (preserved from the original behavior).

    BUG FIX: the banner string contained a literal ``n`` where the newline
    escape ``\\n`` was clearly intended (``'----n(Column #) value [type]n----'``);
    the escapes were lost when the source was flattened.
    """
    a = []
    row = xl_sheet.row(0)  # 1st row
    print(60 * '-' + '\n(Column #) value [type]\n' + 60 * '-')
    for idx, cell_obj in enumerate(row):
        cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
        # NOTE(review): assumes every header cell value is a string —
        # ' ' + value raises TypeError on numeric cells; confirm with callers.
        a.append(' ' + cell_obj.value)
        print('(%s) %s [%s]' % (idx, cell_obj.value, cell_type_str,))
    return a
def getColumnNamesListFromSheet(sheet):
    """Return the values of the first (header) row of *sheet* as a list.

    Terminates the whole program with exit code 1 if any header cell is
    not a text cell — only text column names are supported.
    """
    names = []
    for header_cell in sheet.row(0):
        if ctype_text.get(header_cell.ctype, 'unknown type') != "text":
            print("ERROR: Only 'text' data types for column names is currently supported")
            sys.exit(1)
        names.append(header_cell.value)
    return names
def clean_data(self):
    """Clean the loaded tweet data: strip punctuation, drop stop words, and
    stem the remaining words.

    Reads ``self.raw_data`` (an xlrd sheet: column 0 = tweet text, column 1 =
    numeric sentiment label) and fills ``self.cleaned_tweets_X`` (cleaned
    tweet strings) and ``self.cleaned_tweets_Y`` (int sentiment labels).
    Rows whose cell types are not (text, number) are skipped.
    """
    self.cleaned_tweets_X = []
    self.cleaned_tweets_Y = []
    for row_idx in range(0, self.raw_data.nrows):
        # Fetch tweet text and sentiment label for this row.
        original_tweet = self.raw_data.cell(row_idx, 0)
        cell_type_tweet = ctype_text.get(original_tweet.ctype, 'unknown type')
        sentiment = self.raw_data.cell(row_idx, 1)
        cell_type_sentiment = ctype_text.get(sentiment.ctype, 'unknown type')
        # Tweet must be text and the sentiment value a number.
        if cell_type_tweet == "text" and cell_type_sentiment == "number":
            original_tweet = original_tweet.value.lower()
            sentiment = int(sentiment.value)
            count=0
            # Replace punctuation/garbage characters (.,!"<>{}[]? etc.) with
            # spaces; apostrophes are removed outright.
            tweet = original_tweet.replace('. ',' ').replace('..',' ').replace(';',' ').replace(', ',' ').replace('!',' ').replace('"',' ')\
                .replace('<',' ').replace('>',' ').replace('{',' ').replace('}',' ').replace('[',' ').replace(']',' ')\
                .replace('?',' ').replace("'",'').replace(',',' ').replace(',',' ').replace('%',' ')
            # Tokenize into words on whitespace.
            wordList = tweet.split()
            sentence = ""
            for word in wordList:
                # Skip stop words (per self.isStopWord).
                value = self.isStopWord(word)
                if value == False:
                    # Skip residue of markup tags such as </e>, </a>.
                    if len(word)==2 and word[0:1]=='/':
                        continue
                    # Only keep words longer than one character.
                    if len(word)>1:
                        word = self.stemmer.stem(word)
                        # NOTE(review): encode('utf8') yields bytes — the
                        # following str concatenation only works on Python 2;
                        # confirm the target interpreter.
                        word = word.encode('utf8')
                        sentence = sentence + word + ' '
                        count = count + 1
            self.cleaned_tweets_X.append(sentence)
            self.cleaned_tweets_Y.append(sentiment)
def spreadsheet_text_encode(f_bytes, encoding):
    """Flatten an Excel workbook (given as raw bytes) into one string.

    Every text-typed cell across all sheets is collected; cells within a row
    are space-joined, and rows/sheets are space-joined in turn.

    NOTE: the *encoding* parameter is currently unused — UTF-8 is assumed
    by xlrd, which isn't great; may want to modify later.
    """
    workbook = xlrd.open_workbook(file_contents=f_bytes)
    row_strings = []
    for sheet in workbook.sheets():
        for row in sheet.get_rows():
            cell_values = [c.value for c in row
                           if ctype_text.get(c.ctype, 'not_text') == 'text']
            row_strings.append(" ".join(cell_values))
    return " ".join(row_strings)
def xls2pg():
    # Load the module-level XLS file into a new PostgreSQL table named TABLE:
    # header row becomes the column names (all varchar), remaining rows are
    # inserted verbatim.  Python 2 code (print statements, `unicode`).
    #
    # SECURITY NOTE(review): both CREATE TABLE and INSERT statements are built
    # by string concatenation from spreadsheet contents — only text cells get
    # quote-doubling.  Safe only for trusted input; parameterized queries
    # (psycopg2 placeholders) would be the robust fix.
    from xlrd.sheet import ctype_text
    global XLS_FILE
    global XLS_ENCODE
    global PG_CONN
    bk = None
    print "Reading XLS"
    # Honor an optional encoding override for legacy .xls files.
    if (XLS_ENCODE == None):
        bk = xlrd.open_workbook(XLS_FILE)
    else:
        bk = xlrd.open_workbook(XLS_FILE, encoding_override=XLS_ENCODE)
    print "Creating " + TABLE + " table"
    sheet = bk.sheet_by_index(0)
    row = sheet.row(0)
    # Build CREATE TABLE from the header row: strip accents, snake_case,
    # every column typed varchar.
    create_table = "CREATE TABLE " + TABLE + "("
    for idx, cell_obj in enumerate(row):
        col = remove_accents(cell_obj.value)
        col = col.replace(" ", "_").lower()
        create_table = create_table + col + " varchar,"
    create_table = create_table[0:-1] + ");"  # drop trailing comma
    num_cols = sheet.ncols
    print "Inserting data"
    try:
        cur = PG_CONN.cursor()
        cur.execute(create_table)
        # Insert every data row (skipping the header at index 0).
        for row_idx in range(1, sheet.nrows):
            insert = "INSERT INTO " + TABLE + " VALUES("
            row = sheet.row(row_idx)
            for idx, cell_obj in enumerate(row):
                cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
                if cell_type_str == "text":
                    cell = unicode(cell_obj.value)
                    cell = cell.encode('UTF-8')
                    cell = cell.replace("'", "''")  # escape single quotes
                else:
                    cell = str(cell_obj.value)
                insert = insert + "'" + cell + "',"
            insert = insert[0:-1] + ");"
            cur.execute(insert)
            # Progress dot every 100 rows.
            if ((row_idx % 100) == 0):
                print("."),
        print "Commiting data"
        cur.close()
        PG_CONN.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        closeConn()
def readData(self):
    """Read the worksheet into ``self.data``, one list per column.

    Initializes 22 empty column lists, then walks rows 1..1630 (sheet rows
    are accessed as ``row(i - 1)``) appending each cell value to the list
    for its column index.  A row whose first cell is empty terminates that
    row's processing early.

    NOTE(review): 22 (column count) and 1632 (row bound) are hard-coded —
    presumably matched to one specific spreadsheet; confirm against the data
    file before reuse.
    """
    for i in range(0, 22):
        self.data[i] = []
    self.row1 = self.sheet.row(0)
    for i in range(2, 1632):
        self.row1 = self.sheet.row(i - 1)
        for idx, cell_obj in enumerate(self.row1):
            cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
            # An empty leading cell marks the end of useful data in this row.
            if cell_obj.value == "" and idx == 0:
                break
            else:
                self.data[idx].append(cell_obj.value)
def locate_row_descriptions(xl_workbook, first_col, first_row, sheet_name, data_cols):
    """Locate the column(s), left of the data, that hold the row headings.

    The table layout is inferred from how many data columns exist: exactly
    one means "long format" (headings sit one row above ``first_row``),
    more than one means "wide format" (headings are on ``first_row``),
    and none means "no data".  Returns ``(row_descriptions, table_type)``;
    on failure the first element is the string
    ``"Could not locate row headings"`` (possibly a pivot table).
    """
    n_data_cols = len(data_cols)
    if n_data_cols == 1:
        table_type = "long format"
    elif n_data_cols > 1:
        table_type = "wide format"
    else:
        table_type = "no data"
    sheet = xl_workbook.sheet_by_name(sheet_name)
    # Long-format tables keep their headings one row higher.
    if table_type == "wide format":
        header_row = first_row
    elif table_type == "long format":
        header_row = first_row - 1
    else:
        header_row = None
    row_descriptions = []
    if header_row is not None:
        for c in range(0, first_col):
            cell = sheet.row(header_row)[c]
            # Text or date cells count as descriptor columns.
            if ctype_text.get(cell.ctype, 'unknown type') in ["text", "xldate"]:
                row_descriptions.append(c)
    if not row_descriptions:
        print("Something went wrong. Could not locate row headings")
        return "Could not locate row headings", table_type
    return row_descriptions, table_type
def load_file(file_name):
    """Load an .xls file into a ``Table`` of ``Row`` objects, attaching to each
    cell a value and a color derived from the cell's background fill.

    Columns are driven by the module-level ``cols_name`` list; the last
    column's value comes from ``get_links(file_name)`` (hyperlinks keyed by
    1-based row number) instead of the cell text.

    NOTE(review): ``tbl`` is re-created for every sheet and only the final
    ``tbl`` is returned — effectively only the last sheet's data survives;
    confirm whether multi-sheet files are expected.
    """
    # formatting_info=True is required so background colors are available.
    book = xlrd.open_workbook(file_name, formatting_info=True)
    sheets = book.sheet_names()
    links = get_links(file_name)
    for index, sh in enumerate(sheets):
        sheet = book.sheet_by_index(index)
        rows, cols = sheet.nrows, sheet.ncols
        # Iterate through rows, skipping the header row 0.
        tbl = Table()
        for row_idx in range(1, sheet.nrows):
            row = Row(row_idx)
            for col_idx in range(len(cols_name)):
                cell_obj = sheet.cell(row_idx, col_idx)
                cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
                if col_idx == len(cols_name) - 1:
                    # Last column: take the hyperlink target, trimmed of its
                    # wrapper text; a missing link skips this cell entirely.
                    try:
                        value = links[row_idx + 1][11:-9]
                    except KeyError:
                        value = None
                        continue
                else:
                    value = str(cell_obj.value)
                # Map the cell's background pattern color to a Color enum.
                xfx = sheet.cell_xf_index(row_idx, col_idx)
                xf = book.xf_list[xfx]
                bgx = xf.background.pattern_colour_index
                color = None
                if bgx == 10:
                    color = Color.RED
                elif bgx == 13:
                    color = Color.YELLOW
                elif bgx == 57:
                    color = Color.GREEN
                else:
                    color = Color.WHITE
                row.update_cell(col_idx, value, color)
            tbl.append(row)
    return tbl
def _get_cell_value(self, cell, field_type=False):
    """Convert an xlrd cell to a value suitable for Odoo import.

    If Odoo's field type is known, convert to a valid string for import;
    if not known, just return the value as-is.

    :param cell: xlrd cell object.
    :param field_type: Odoo field type name ('date', 'datetime', 'integer',
        'float', 'many2one', 'boolean', ...) or False when unknown.
    :return: converted value; '' for empty non-boolean values.

    NOTE(review): uses the Python 2 ``long`` builtin and ``str.encode`` on the
    result — this block targets Python 2; confirm before running on Python 3.
    """
    value = False
    datemode = 0  # From book.datemode, but we fix it for simplicity
    if field_type in ['date', 'datetime']:
        ctype = ctype_text.get(cell.ctype, 'unknown type')
        if ctype == 'number':
            # Excel stores dates as serial numbers; convert via xlrd.
            time_tuple = xlrd.xldate_as_tuple(cell.value, datemode)
            date = datetime(*time_tuple)
            if field_type == 'date':
                value = date.strftime("%Y-%m-%d")
            elif field_type == 'datetime':
                value = date.strftime("%Y-%m-%d %H:%M:%S")
        else:
            # Already text (or other); pass through unchanged.
            value = cell.value
    elif field_type in ['integer', 'float']:
        # Normalize thousands separators before testing for numeric form.
        value_str = str(cell.value).strip().replace(',', '')
        if len(value_str) == 0:
            value = ''
        elif value_str.replace('.', '', 1).isdigit():  # Is number
            if field_type == 'integer':
                value = int(float(value_str))
            elif field_type == 'float':
                value = float(value_str)
        else:  # Is string, no conversion
            value = value_str
    elif field_type in ['many2one']:
        # If number, change to string
        if isinstance(cell.value, (int, long, float, complex)):
            value = str(cell.value)
        else:
            value = cell.value
    else:
        value = cell.value
    # If string, cleanup: UTF-8 encode and strip a float-artifact '.0' suffix.
    if isinstance(value, str):
        value = value.encode('utf-8')
        if value[-2:] == '.0':
            value = value[:-2]
    # Except boolean, when no value, we should return as ''
    if field_type not in ['boolean']:
        if not value:
            value = ''
    return value
def parse_mds_leader_organogram(filename):
    """Parse an organogram spreadsheet and create the region/GA/area/chapter/
    district hierarchy via the ``try_create_*`` helpers.

    Data rows start at sheet row 8; columns are (0) region, (1) GA, (2) area,
    (3) chapter, (4) district, (6) structure type.  Only rows whose structure
    column equals 'district' (case-insensitive) are persisted.  Returns the
    number of districts added.  Python 2 code (print statement).
    """
    # Open worksheet (first sheet only).
    xl_wb = xl_workbook = xlrd.open_workbook(filename)
    sheet_names = xl_workbook.sheet_names()
    xl_sheet = xl_workbook.sheet_by_index(0)
    from dialogues.models import *
    row = xl_sheet.row(0)
    district_count = 0
    for i in range(8,xl_sheet.nrows):
        row = xl_sheet.row(i)
        valid_dist = False
        for idx, cell_obj in enumerate(row):
            cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
            if idx == 0:
                region = cell_obj.value.strip()
            if idx == 1:
                ga = cell_obj.value.strip()
            if idx == 2:
                area = cell_obj.value.strip()
            if idx == 3:
                chapter = cell_obj.value.strip()
            if idx == 4:
                district = cell_obj.value.strip()
            if idx == 6:
                structure = cell_obj.value.strip()
                if structure.lower() == 'district':
                    district_count += 1
                    valid_dist = True
        # Persist the hierarchy top-down only for confirmed district rows.
        if valid_dist:
            try_create_region(region)
            try_create_ga(ga, region)
            try_create_area(area, ga)
            try_create_chapter(chapter, area)
            try_create_district(district, chapter)
    print "Total number of districts added: %d"%district_count
    return district_count
def open_file(path): """ Open and read an Excel file """ # Open the workbook xl_workbook = xlrd.open_workbook(path) # List sheet names, and pull a sheet by name # sheet_names = xl_workbook.sheet_names() print('Sheet Names', sheet_names) xl_sheet = xl_workbook.sheet_by_name(sheet_names[0]) # Or grab the first sheet by index # (sheets are zero-indexed) # xl_sheet = xl_workbook.sheet_by_index(0) print('Sheet name: %s' % xl_sheet.name) # Pull the first row by index # (rows/columns are also zero-indexed) # row = xl_sheet.row(0) # 1st row # Print 1st row values and types # from xlrd.sheet import ctype_text print('(Column #) type:value') for idx, cell_obj in enumerate(row): cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type') print('(%s) %s %s' % (idx, cell_type_str, cell_obj.value)) # Print all values, iterating through rows and columns # num_cols = xl_sheet.ncols # Number of columns for row_idx in range(0, xl_sheet.nrows): # Iterate through rows print('-' * 40) print('Row: %s' % row_idx) # Print row number for col_idx in range(0, num_cols): # Iterate through columns cell_obj = xl_sheet.cell(row_idx, col_idx) # Get cell object by row, col print('Column: [%s] cell_obj: [%s]' % (col_idx, cell_obj))
def parse_mds_leader_organogram(filename):
    """Parse an organogram spreadsheet and create the region/GA/area/chapter/
    district hierarchy via the ``try_create_*`` helpers; return the district
    count.  Python 2 code.

    NOTE(review): this file contains another, near-identical definition of
    this same function — consider deduplicating.
    """
    # Open worksheet (first sheet only).
    xl_wb = xl_workbook = xlrd.open_workbook(filename)
    sheet_names = xl_workbook.sheet_names()
    xl_sheet = xl_workbook.sheet_by_index(0)
    from dialogues.models import *
    row = xl_sheet.row(0)
    district_count = 0
    # Data rows start at sheet row 8.
    for i in range(8, xl_sheet.nrows):
        row = xl_sheet.row(i)
        valid_dist = False
        for idx, cell_obj in enumerate(row):
            cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
            if idx == 0:
                region = cell_obj.value.strip()
            if idx == 1:
                ga = cell_obj.value.strip()
            if idx == 2:
                area = cell_obj.value.strip()
            if idx == 3:
                chapter = cell_obj.value.strip()
            if idx == 4:
                district = cell_obj.value.strip()
            if idx == 6:
                structure = cell_obj.value.strip()
                if structure.lower() == 'district':
                    district_count += 1
                    valid_dist = True
        # Persist the hierarchy top-down only for confirmed district rows.
        if valid_dist:
            try_create_region(region)
            try_create_ga(ga, region)
            try_create_area(area, ga)
            try_create_chapter(chapter, area)
            try_create_district(district, chapter)
    print "Total number of districts added: %d" % district_count
    return district_count
def get_column_stats(xl_sheet, col_idx):
    """
    Print every value in one column, a missing-value summary, and the
    twenty most common values.

    :param xl_sheet: Sheet object from Excel Workbook, extracted using xlrd
    :param col_idx: zero-indexed column number, passed as a STRING
        (validated with ``.isdigit()`` before conversion)

    BUG FIX: the banner string used a literal ``n`` where the newline escape
    ``\\n`` was intended ('----n Top Twenty Values n----'); escapes were lost
    when the source was flattened.
    """
    if xl_sheet is None:
        print('xl_sheet is None')
        return
    # col_idx arrives as a string; reject anything that isn't a digit run.
    if not col_idx.isdigit():
        print('Please enter a valid column number (0-%d)' % (xl_sheet.ncols - 1))
        return
    col_idx = int(col_idx)
    if col_idx < 0 or col_idx >= xl_sheet.ncols:
        print('Please enter a valid column number (0-%d)' % (xl_sheet.ncols - 1))
        return
    # Iterate through rows, and print out the column values.
    row_vals = []
    for row_idx in range(0, xl_sheet.nrows):
        cell_obj = xl_sheet.cell(row_idx, col_idx)
        cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
        print('(row %s) %s (type:%s)' % (row_idx, cell_obj.value, cell_type_str))
        row_vals.append(cell_obj.value)
    # Retrieve non-empty rows (falsy values count as missing).
    nonempty_row_vals = [x for x in row_vals if x]
    num_rows_missing_vals = xl_sheet.nrows - len(nonempty_row_vals)
    print('Vals: %d; Rows Missing Vals: %d' % (len(nonempty_row_vals), num_rows_missing_vals))
    # Count occurrences of values.
    counts = Counter(nonempty_row_vals)
    # Display value counts.
    print('-' * 40 + '\n', 'Top Twenty Values', '\n' + '-' * 40)
    print('Value [count]')
    for val, cnt in counts.most_common(20):
        print('%s [%s]' % (val, cnt))
def open_file(path): """ Open and read an Excel file """ # Open the workbook xl_workbook = xlrd.open_workbook(path) # List sheet names, and pull a sheet by name # sheet_names = xl_workbook.sheet_names() print('Sheet Names', sheet_names) xl_sheet = xl_workbook.sheet_by_name(sheet_names[0]) # Or grab the first sheet by index # (sheets are zero-indexed) # xl_sheet = xl_workbook.sheet_by_index(0) print ('Sheet name: %s' % xl_sheet.name) # Pull the first row by index # (rows/columns are also zero-indexed) # row = xl_sheet.row(0) # 1st row # Print 1st row values and types # from xlrd.sheet import ctype_text print('(Column #) type:value') for idx, cell_obj in enumerate(row): cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type') print('(%s) %s %s' % (idx, cell_type_str, cell_obj.value)) # Print all values, iterating through rows and columns # num_cols = xl_sheet.ncols # Number of columns for row_idx in range(0, xl_sheet.nrows): # Iterate through rows print ('-'*40) print ('Row: %s' % row_idx) # Print row number for col_idx in range(0, num_cols): # Iterate through columns cell_obj = xl_sheet.cell(row_idx, col_idx) # Get cell object by row, col print ('Column: [%s] cell_obj: [%s]' % (col_idx, cell_obj))
def extract_column_headings(xl_workbook, table, spreadsheet_type):
    """Extract the column headings (and, for wide format, sub-headings) for
    one table description.

    :param xl_workbook: open xlrd workbook.
    :param table: table descriptor object (sheet_name, cols, rows,
        column_titles, table_type, merged/extra metadata attributes).
    :param spreadsheet_type: e.g. "Time series" — selects the extraction path.
    :return: ``(column_headings, column_subheadings)``; the second element is
        ``{}`` except in the wide-format case.
    """
    sheet = xl_workbook.sheet_by_name(table.sheet_name)
    if spreadsheet_type in ["Time series"]:
        # Time series: build a DataFrame with one column per title row,
        # indexed by the data column's position within table.cols.
        column_headings = pd.DataFrame()
        for r, title in table.column_titles.items():
            for c in table.cols:
                cell = sheet.row(r)[c]
                row_position = c - min(table.cols)
                if ctype_text.get(cell.ctype, 'unknown type') == "xldate":
                    # 25569 = days between 1899-12-30 (Excel epoch) and
                    # 1970-01-01 (Unix epoch); convert serial days -> seconds.
                    column_headings.loc[row_position, title] = pd.to_datetime((cell.value - 25569) * 86400.0, unit='s').strftime('%d/%m/%Y')
                else:
                    column_headings.loc[row_position, title] = cell.value
        return column_headings, {}
    else:
        first_row = min(table.rows)
        if table.table_type == "long format":
            # Long format: headings sit one row above the first data row.
            column_headings = []
            for c in list(table.cols) + table.row_descriptions:
                col_header = sheet.row(first_row-1)[c].value
                column_headings.append(col_header)
            return column_headings, {}
        # If it is wide format: delegate to the merged-cell helpers.
        column_headings = merged_data_function(xl_workbook,
                                               sheet_name=table.sheet_name,
                                               merged_data_cols=table.merged_meta_data,
                                               data_cols=table.cols,
                                               data_rows=table.rows,
                                               extra_rows=table.extra_meta_data,
                                               spreadsheet_type=spreadsheet_type,
                                               column_header_locations=table.column_header_locations,
                                               last_row_in_sheet=table.last_row_in_sheet)
        column_subheadings = merged_data_subheadings_function(xl_workbook,
                                                              sheet_name=table.sheet_name,
                                                              merged_data_cols=table.merged_meta_data,
                                                              data_cols=table.cols,
                                                              data_rows=table.rows,
                                                              rows=table.column_header_locations,
                                                              extra_rows=table.extra_meta_data,
                                                              top_row=table.top_row)
        return column_headings, column_subheadings
def read_xls(fname, display):
    """Load the first sheet of an .xls file into a list-of-lists of cell values.

    When *display* is truthy, also print the sheet name, a typed view of the
    second row (with the first row's values as labels), and every cell while
    reading.  Returns a ``nrows x ncols`` nested list.
    """
    workbook = xlrd.open_workbook(fname)
    sheet = workbook.sheet_by_index(0)
    if display:
        print('Sheet name: %s' % sheet.name)
    if display:
        header = sheet.row(0)   # 1st row
        sample = sheet.row(1)   # 2nd row
        # Print 2nd-row types labelled by the 1st-row values.
        from xlrd.sheet import ctype_text
        print('(Column #) type:value')
        for col, cell in enumerate(sample):
            kind = ctype_text.get(cell.ctype, 'unknown type')
            print('(%s) %s \t: %s' % (col, header[col].value, kind))
    # Pre-size the result, then fill it cell by cell.
    table = [['' for _ in range(sheet.ncols)] for _ in range(sheet.nrows)]
    for r in range(0, sheet.nrows):
        if display:
            print('-' * 40)
            print('Row: %s' % r)
        for c in range(0, sheet.ncols):
            cell = sheet.cell(r, c)
            table[r][c] = sheet.cell(r, c).value
            if display:
                print('Column: [%s] cell_obj: [%s]' % (c, cell.value))
    return table
def __init__(self):
    """Open a Firefox session on heb.com.mx and load the shopping list from
    ``untitled/heblista.xlsx`` next to this project.

    The first spreadsheet row's values go into ``self.arreglo2`` and the
    second row's into ``self.arreglo3`` (both also reachable via the
    ``self.arreglo`` tuple).  Every cell value is printed while reading.
    """
    # Browser setup: fresh Firefox window, maximized, cookies cleared.
    self.driver = webdriver.Firefox()
    self.driver.maximize_window()
    self.driver.delete_all_cookies()
    self.driver.get("https://www.heb.com.mx/")
    # Workbook lives two directories above this file, under 'untitled/'.
    self.fname = join(dirname(dirname(abspath(__file__))), 'untitled', 'heblista.xlsx')
    self.xl_workbook = xlrd.open_workbook(self.fname)
    # List sheet names, and pull the first sheet by name.
    self.sheet_names = self.xl_workbook.sheet_names()
    self.xl_sheet = self.xl_workbook.sheet_by_name(self.sheet_names[0])
    self.row = self.xl_sheet.row(0)  # 1st row
    self.arreglo2 = []
    self.arreglo3 = []
    self.arreglo = self.arreglo2, self.arreglo3
    self.contador3 = 0
    # NOTE(review): this loop only computes a type string for the last header
    # cell; its result is never used afterwards.
    for self.idx, self.cell_obj in enumerate(self.row):
        self.cell_type_str = ctype_text.get(self.cell_obj.ctype, 'unknown type')
    # Iterate through all rows and columns, printing each value; rows 1 and 2
    # (contador3 == 1 / == 2) are captured into arreglo2 / arreglo3.
    self.num_cols = self.xl_sheet.ncols  # Number of columns
    for self.row_idx in range(0, self.xl_sheet.nrows):  # Iterate through rows
        self.contador3 += 1
        for self.col_idx in range(0, self.num_cols):  # Iterate through columns
            self.cell_obj = self.xl_sheet.cell(self.row_idx, self.col_idx).value  # Get cell value by row, col
            print(self.cell_obj)
            if self.contador3 == 1:
                self.arreglo2.append(self.cell_obj)
            if self.contador3 == 2:
                self.arreglo3.append(self.cell_obj)
def checkveolia():
    """Log into the Veolia Eau customer site, download the consumption-history
    XLS export, and push the latest reading to Domoticz device 1.

    Credentials come from the plugin ``Parameters`` dict.  The export is
    written to a local 'temp.xls', re-read with xlrd, and the value in
    column 1 of the last row is sent via ``UpdateDevice``.
    """
    url = URL()
    urlConnect = 'https://www.service-client.veoliaeau.fr/home.loginAction.do#inside-space'
    urlConso1 = 'https://www.service-client.veoliaeau.fr/home/espace-client/votre-consommation.html'
    urlConso2 = 'https://www.service-client.veoliaeau.fr/home/espace-client/votre-consommation.html?vueConso=historique'
    urlXls = 'https://www.service-client.veoliaeau.fr/home/espace-client/votre-consommation.exportConsommationData.do?vueConso=historique'
    urlDisconnect = 'https://www.service-client.veoliaeau.fr/logout'
    # Connect to Veolia website.
    Domoticz.Log('Connection au site Veolia Eau')
    params = {
        'veolia_username': Parameters["Username"],
        'veolia_password': Parameters["Password"],
        'login': '******'
    }
    referer = 'https://www.service-client.veoliaeau.fr/home.html'
    url.call(urlConnect, params, referer)
    # 'Your consumption' page.
    url.call(urlConso1)
    # 'Your consumption: history' page.
    url.call(urlConso2)
    # Download the XLS export.
    Domoticz.Log('Telechargement du fichier')
    response = url.call(urlXls)
    content = response.read()
    # Log out of the site.
    Domoticz.Log('Deconnection du site Veolia Eau')
    url.call(urlDisconnect)
    # Persist the download, then re-open it with xlrd (cp1252 per the export).
    file = open('temp.xls', 'wb')
    file.write(content)
    file.close()
    book = xlrd.open_workbook('temp.xls', encoding_override="cp1252")
    sheet = book.sheet_by_index(0)
    last_rows = sheet.nrows
    row = sheet.row(last_rows - 1)
    # Column 1 of the last row holds the most recent consumption value.
    for idx, cell_obj in enumerate(row):
        cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
        if idx == 1:
            UpdateDevice(1, 0, cell_obj.value)
def read_xls(fname, display):
    """Load the first sheet of an .xls file into a nested list of cell values;
    optionally print everything while reading.

    NOTE(review): this file contains another, near-identical definition of
    this same function — consider deduplicating.
    """
    # Open the workbook.
    xl_workbook = xlrd.open_workbook(fname)
    # Grab the first sheet by index (sheets are zero-indexed).
    xl_sheet = xl_workbook.sheet_by_index(0)
    if display:
        print ('Sheet name: %s' % xl_sheet.name)
    # Pull the first rows by index (rows/columns are also zero-indexed).
    if display:
        row0 = xl_sheet.row(0)  # 1st row
        row = xl_sheet.row(1)   # 2nd row
        # Print 2nd-row types labelled with the 1st-row values.
        from xlrd.sheet import ctype_text
        print('(Column #) type:value')
        for idx, cell_obj in enumerate(row):
            cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
            print('(%s) %s \t: %s' % (idx, row0[idx].value, cell_type_str))
    # Collect all values, iterating through rows and columns.
    tb=[['' for i in range(xl_sheet.ncols)] for j in range(xl_sheet.nrows)]
    for row_idx in range(0, xl_sheet.nrows):  # Iterate through rows
        if display:
            print ('-'*40)
            print ('Row: %s' % row_idx)  # Print row number
        for col_idx in range(0, xl_sheet.ncols):  # Iterate through columns
            cell_obj = xl_sheet.cell(row_idx, col_idx)  # Get cell object by row, col
            tb[row_idx][col_idx] = xl_sheet.cell(row_idx, col_idx).value
            if display:
                print ('Column: [%s] cell_obj: [%s]' % (col_idx, cell_obj.value))
    return tb
def get_column_stats(xl_sheet, col_idx):
    """
    Print every value in one column, a missing-value summary, and the
    twenty most common values.

    :param xl_sheet: Sheet object from Excel Workbook, extracted using xlrd
    :param col_idx: zero-indexed column number, passed as a STRING
        (validated with ``.isdigit()`` before conversion)

    BUG FIX: the banner string used a literal ``n`` where the newline escape
    ``\\n`` was intended; escapes were lost when the source was flattened.
    NOTE(review): near-duplicate of another get_column_stats in this file.
    """
    if xl_sheet is None:
        print ('xl_sheet is None')
        return
    # col_idx arrives as a string; reject anything that isn't a digit run.
    if not col_idx.isdigit():
        print ('Please enter a valid column number (0-%d)' % (xl_sheet.ncols-1))
        return
    col_idx = int(col_idx)
    if col_idx < 0 or col_idx >= xl_sheet.ncols:
        print ('Please enter a valid column number (0-%d)' % (xl_sheet.ncols-1))
        return
    # Iterate through rows, and print out the column values.
    row_vals = []
    for row_idx in range(0, xl_sheet.nrows):
        cell_obj = xl_sheet.cell(row_idx, col_idx)
        cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
        print ('(row %s) %s (type:%s)' % (row_idx, cell_obj.value, cell_type_str))
        row_vals.append(cell_obj.value)
    # Retrieve non-empty rows (falsy values count as missing).
    nonempty_row_vals = [x for x in row_vals if x]
    num_rows_missing_vals = xl_sheet.nrows - len(nonempty_row_vals)
    print ('Vals: %d; Rows Missing Vals: %d' % (len(nonempty_row_vals), num_rows_missing_vals))
    # Count occurrences of values.
    counts = Counter(nonempty_row_vals)
    # Display value counts.
    print ('-'*40 + '\n', 'Top Twenty Values', '\n' + '-'*40 )
    print ('Value [count]')
    for val, cnt in counts.most_common(20):
        print ('%s [%s]' % (val, cnt))
def excelConverter(path, tname):
    """Locate the spreadsheet *tname* under *path* and append its contents to
    numbered '<N>combinedExfil.txt' files, rolling to a new file whenever the
    running byte budget (module-level ``numBytes``) would exceed 45000.

    Only the first sheet is processed: first the header row with ctype names,
    then every cell as 'Row-Column[r:c] Content[...]'.

    BUG FIX: the rollover branch opened ``open(+str(numFiles)+...)`` — a stray
    unary ``+`` applied to a string, which raises
    ``TypeError: bad operand type for unary +`` the first time the 45000-byte
    budget is exceeded.  The stray ``+`` is removed.
    """
    excelFileInf="temp"
    global numBytes, numExfilFiles, numFiles, dT, eD, gpath, flg
    # Walk the tree until the target filename is found; break out of both loops.
    breakit=""
    for root, dirs, files in os.walk(path):
        if breakit=="true":
            break
        for name in files:
            if name == tname:
                print (os.path.abspath(os.path.join(root, name)))
                excelFileInf= os.path.abspath(os.path.join(root, name))
                breakit="true"
                break
    fileFil = open(str(numFiles)+"combinedExfil.txt", "a")
    SheetList = xlrd.open_workbook(excelFileInf)
    fnames = SheetList.sheet_names()
    fileFil.write("[Excel-Spreadsheet]")
    SpreadSheet = SheetList.sheet_by_index(0)
    fileFil.write('Sheet name:'+str(SpreadSheet.name))
    row = SpreadSheet.row(0)
    # Header row: write "(idx) <ctype> <value>" entries, tracking output size.
    for idx, spreadsheetCells in enumerate(row):
        ContentType = ctext.get(spreadsheetCells.ctype, 'Null')
        writtenText=str(spreadsheetCells.value)
        if numBytes+sys.getsizeof(writtenText)<45000:
            fileFil.write(("("+str(idx)+")"+" "+str(ContentType)+writtenText))
            numBytes+=sys.getsizeof(writtenText)
        else:
            # Budget exceeded: roll over to the next numbered output file.
            numFiles+=1
            fileFil=open(str(numFiles)+"combinedExfil.txt", "a")  # was: open(+str(numFiles)+...)
            fileFil.write(("("+str(idx)+")"+" "+str(ContentType)+writtenText))
            numBytes=sys.getsizeof(writtenText)
    # Body: dump every cell of the first sheet.
    num_cols = SpreadSheet.ncols  # Number of columns
    for row_idx in range(0, SpreadSheet.nrows):
        for col_idx in range(0, num_cols):
            spreadsheetCells = SpreadSheet.cell(row_idx, col_idx)  # Get cell object by row, col
            fileFil.write('Row-Column[%s:%s] Content[%s]' % (row_idx, col_idx, spreadsheetCells))
num_rows = first_sheet.nrows num_cols = first_sheet.ncols print("No of Rows : " + str(num_rows)) print("No of Columns : " + str(num_cols)) header_found = 'N' txn_count = 0 # Read through all Rows/Columns/Cells print(100 * '=') for row_num in range(0, num_rows): # Get First Cell details for each row current_row = first_sheet.row(row_num) first_cell = first_sheet.cell(row_num, 0) first_cell_type = ctype_text.get(first_cell.ctype, 'Unknown Type') first_cell_value = first_cell.value # Once header is found extract the txn records if header_found == 'Y': if first_cell_type != 'text' and \ first_cell_value != '\t': print('Row number ' + str(row_num) + ' : ' + str(current_row)) txn_count = txn_count + 1 for col_num in range(0, num_cols): cell = first_sheet.cell(row_num, col_num) cell_type = ctype_text.get(cell.ctype, 'Unknown Type') cell_value = cell.value print('cell(%d,%d) = (type = %s, value = %s)' % (row_num, col_num, cell_type, cell_value)) if first_cell_type == 'text' and \
def getCellDetails(row_num, col_num):
    """Return ``(cell, type_name, value)`` for one cell of the module-level
    ``first_sheet``: the xlrd cell object, its human-readable ctype name
    (or 'Unknown Type'), and its raw value."""
    cell_obj = first_sheet.cell(row_num, col_num)
    type_name = ctype_text.get(cell_obj.ctype, 'Unknown Type')
    return (cell_obj, type_name, cell_obj.value)
def show_column_names(xl_sheet):
    """Print each header-row cell of *xl_sheet* as ``(index) value [type]``,
    bracketed by dashed separator banners.  Returns None.

    BUG FIX: the banner string contained a literal ``n`` where the newline
    escape ``\\n`` was clearly intended; the escapes were lost when the
    source was flattened.
    """
    row = xl_sheet.row(0)  # 1st row
    print(60 * "-" + "\n(Column #) value [type]\n" + 60 * "-")
    for idx, cell_obj in enumerate(row):
        cell_type_str = ctype_text.get(cell_obj.ctype, "unknown type")
        print("(%s) %s [%s]" % (idx, cell_obj.value, cell_type_str))
def import_xls(self, model, file, column_name=None, column_value=None):
    """Import a base64-encoded .xls payload into Odoo model *model*.

    The workbook is written to a temp .xls, converted row by row to a temp
    CSV (with per-ctype value normalization), then fed to Odoo's
    ``base_import.import``.  Optionally prepends an extra column
    (*column_name*/*column_value*) to every row.  Raises ValidationError on
    an empty file or on import errors; returns the original *file* argument.

    NOTE(review): Python 2 code (``xrange``, ``base64.decodestring``); the
    local name ``type`` shadows the builtin.
    """
    # Materialize the upload into a uniquely-named temp .xls file.
    decoded_data = base64.decodestring(file)
    ftemp = 'temp' + datetime.utcnow().strftime('%H%M%S%f')[:-3]
    f = open(ftemp + '.xls', 'wb+')
    f.write(decoded_data)
    f.seek(0)
    f.close()
    wb = xlrd.open_workbook(f.name)
    st = wb.sheet_by_index(0)
    # Re-emit the sheet as UTF-8 CSV, fully quoted.
    csv_file = open(ftemp + '.csv', 'wb')
    csv_out = unicodecsv.writer(csv_file, encoding='utf-8',
                                quoting=unicodecsv.QUOTE_ALL)
    if st._cell_values:
        # Row 0 supplies the header fields passed to base_import later.
        _HEADER_FIELDS = st._cell_values[0]
        for nrow in xrange(st.nrows):
            if nrow > 0:
                row_values = st.row_values(nrow)
                for index, val in enumerate(row_values):
                    ctype = st.cell(nrow, index).ctype
                    type = ctype_text.get(ctype, 'unknown type')
                    if type == 'empty' or type == 'text' \
                            or type == 'bool' or type == 'error' \
                            or type == 'blank':
                        row_values[index] = st.cell(nrow, index).value
                    elif type == 'number':
                        # Falsy numbers become 0; whole floats become ints.
                        if not val:
                            row_values[index] = 0
                        else:
                            if not str(val).isdigit():
                                row_values[index] = int(val)
                            else:
                                row_values[index] = val
                    elif type == 'xldate':
                        # Excel serial date -> datetime string.
                        str_date = self.xldate_to_datetime(
                            st.cell(nrow, index).value)
                        row_values[index] = str_date
                    else:
                        row_values[index] = st.cell(nrow, index).value
                csv_out.writerow(row_values)
            else:
                csv_out.writerow(st.row_values(nrow))
    csv_file.close()
    # Read the CSV text back and clean up both temp files.
    csv_file = open(ftemp + '.csv', 'r')
    file_txt = csv_file.read()
    csv_file.close()
    os.unlink(ftemp + '.xls')
    os.unlink(ftemp + '.csv')
    if not file_txt:
        raise ValidationError(_(str("File Not found.")))
    # Optional extra column prepended to header and every data row.
    if column_name and column_value:
        _HEADER_FIELDS.insert(0, str(column_name))
        file_txt = self._add_column(column_name, column_value, file_txt)
    # Hand off to Odoo's generic CSV importer.
    Import = self.env['base_import.import']
    imp = Import.create({
        'res_model': model,
        'file': file_txt,
    })
    [errors] = imp.do(
        _HEADER_FIELDS,
        {'headers': True, 'separator': ',', 'quoting': '"', 'encoding': 'utf-8'})
    if errors:
        raise ValidationError(_(str(errors[0]['message'])))
    return file
xl_workbook = xlrd.open_workbook("excelfile.xlsx") print(xl_workbook) sheet_names = xl_workbook.sheet_names() print('Sheet Names', sheet_names) xl_sheet = xl_workbook.sheet_by_index(4) print('Sheet name: %s' % xl_sheet.name) from xlrd.sheet import ctype_text inten = [] energy = [] sum = 0 for row_index in range(1, xl_sheet.nrows): if (ctype_text.get(xl_sheet.cell(row_index, 3).ctype) == "empty" or xl_sheet.cell(row_index, 3).value == "-"): continue inten.append(float(xl_sheet.cell(row_index, 6).value) * 0.3) energy.append(float(xl_sheet.cell(row_index, 3).value)) sum+=float(xl_sheet.cell(row_index, 6).value) * 0.3 print(int(sum)) creationism(energy, inten) IntAndE = [] for i in range(0,4): IntAndE.append([]) for i in range(0, len(inten)):
def read_curriculum():
    """Read the EE curriculum spreadsheet and return a flat list of course
    entries taken from hard-coded row bands.

    For each semester band (rows 7-13, 19-24, 30-35, 41-45) the values of
    columns 0 and 7 are appended in alternation, plus the single cell (15, 7)
    at the end.  NOTE(review): the row ranges and the workbook filename are
    hard-coded to one specific spreadsheet layout; confirm before reuse.
    """
    # Workbook lives under 'crs/' two directories above this file.
    cname = join(dirname(dirname(abspath(__file__))), 'crs', 'EE-curriculum-October2014 (11).xls')
    cl_workbook = xlrd.open_workbook(cname)
    sheet_names = cl_workbook.sheet_names()
    cl_sheet = cl_workbook.sheet_by_name(sheet_names[0])
    row = cl_sheet.row(0)
    # NOTE(review): this loop only computes a type string that is never used.
    for idx, cell_obj in enumerate(row):
        cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
    num_cols = cl_sheet.ncols
    list_of_courses = []
    # NOTE(review): this full scan fetches every cell but discards the result.
    for row_idx in range(1, cl_sheet.nrows):
        for col_idx in range(0, num_cols):  # Iterate through columns
            cell_obj = cl_sheet.cell(row_idx, col_idx)  # Get cell object by row, col
    # Semester band 1: rows 7-13, columns 0 and 7.
    for row_idx in range(7, 14):
        list_of_courses.append(cl_sheet.cell(row_idx,0).value)
        list_of_courses.append(cl_sheet.cell(row_idx,7).value)
    # Semester band 2: rows 19-24.
    for row_idx in range(19, 25):
        list_of_courses.append(cl_sheet.cell(row_idx,0).value)
        list_of_courses.append(cl_sheet.cell(row_idx,7).value)
    # Semester band 3: rows 30-35.
    for row_idx in range(30, 36):
        list_of_courses.append(cl_sheet.cell(row_idx,0).value)
        list_of_courses.append(cl_sheet.cell(row_idx,7).value)
    # Semester band 4: rows 41-45.
    for row_idx in range(41, 46):
        list_of_courses.append(cl_sheet.cell(row_idx,0).value)
        list_of_courses.append(cl_sheet.cell(row_idx,7).value)
    # One stray course cell outside the bands.
    list_of_courses.append(cl_sheet.cell(15,7).value)
    return list_of_courses
def handle(self, **options):
    """Management-command entry point: import a Risk Data Table (Excel)
    into the database table backing a RiskAnalysis layer.

    One workbook sheet per 'x'-axis dimension value (scenario); for each
    'y'-axis dimension value (return period) the matching data column is
    located and one DB row per administrative division is inserted.

    Required options: ``--region``, ``--excel_file``, ``--risk_analysis``.
    Returns the risk_analysis name.
    """
    commit = options.get('commit')
    region = options.get('region')
    excel_file = options.get('excel_file')
    risk_analysis = options.get('risk_analysis')
    excel_metadata_file = options.get('excel_metadata_file')
    risk_app = options.get('risk_app')
    app = RiskApp.objects.get(name=risk_app)
    if region is None:
        raise CommandError(
            "Input Destination Region '--region' is mandatory")
    if risk_analysis is None:
        raise CommandError("Input Risk Analysis associated to the File \
'--risk_analysis' is mandatory")
    if not excel_file or len(excel_file) == 0:
        raise CommandError(
            "Input Risk Data Table '--excel_file' is mandatory")
    risk = RiskAnalysis.objects.get(name=risk_analysis, app=app)
    wb = xlrd.open_workbook(filename=excel_file)
    region = Region.objects.get(name=region)
    # Code of the region's top-level (parent-less) administrative division.
    region_code = region.administrative_divisions.filter(
        parent=None)[0].code
    scenarios = RiskAnalysisDymensionInfoAssociation.objects.filter(
        riskanalysis=risk, axis='x')
    round_periods = RiskAnalysisDymensionInfoAssociation.objects.filter(
        riskanalysis=risk, axis='y')
    # Strip the workspace prefix, e.g. "geonode:layer" -> "layer".
    table_name = risk.layer.typename.split(":")[1] \
        if ":" in risk.layer.typename else risk.layer.typename
    for scenario in scenarios:
        # Resolve the OGC server's vector datastore connection settings.
        datastore = settings.OGC_SERVER['default']['DATASTORE']
        if (datastore):
            ogc_db_name = settings.DATABASES[datastore]['NAME']
            ogc_db_user = settings.DATABASES[datastore]['USER']
            ogc_db_passwd = settings.DATABASES[datastore]['PASSWORD']
            ogc_db_host = settings.DATABASES[datastore]['HOST']
            ogc_db_port = settings.DATABASES[datastore]['PORT']
        # One sheet per scenario; row 0 holds the return-period headers.
        # NOTE(review): if DATASTORE is falsy, the ogc_db_* names below are
        # never bound and get_db_conn() would raise NameError — confirm the
        # deployment always sets a datastore.
        sheet = wb.sheet_by_name(scenario.value)
        row_headers = sheet.row(0)
        for rp_idx, rp in enumerate(round_periods):
            col_num = -1
            if app.name == RiskApp.APP_DATA_EXTRACTION:
                # Find the header column matching this return period.
                for idx, cell_obj in enumerate(row_headers):
                    try:
                        print('{} =? {}'.format(rp.value, cell_obj.value))
                        if self.to_int_if_number(
                                cell_obj.value) == self.to_int_if_number(
                                rp.value):
                            col_num = idx
                            break
                    except:
                        # NOTE(review): bare except — any comparison error is
                        # printed and the header cell silently skipped.
                        traceback.print_exc()
                        pass
            elif app.name == RiskApp.APP_COST_BENEFIT:
                col_num = 0
            if col_num >= 0:
                conn = self.get_db_conn(ogc_db_name, ogc_db_user,
                                        ogc_db_port, ogc_db_host,
                                        ogc_db_passwd)
                try:
                    if app.name == RiskApp.APP_DATA_EXTRACTION:
                        for row_num in range(1, sheet.nrows):
                            # Column 5: administrative-division code;
                            # column 2: ISO country (first two chars used).
                            cell_obj = sheet.cell(row_num, 5)
                            iso_country = str(
                                sheet.cell(row_num, 2).value)[:2]
                            cell_type_str = ctype_text.get(
                                cell_obj.ctype, 'unknown type')
                            if cell_obj.value:
                                # Numeric adm codes are zero-padded to 5
                                # digits and prefixed with the country code.
                                adm_code = cell_obj.value \
                                    if cell_type_str == 'text' \
                                    else iso_country + '{:05d}'.format(int(cell_obj.value))
                                print('adm code read from cell: {}'.format(
                                    adm_code))
                                try:
                                    adm_div = AdministrativeDivision.objects.get(
                                        code=adm_code)
                                except ObjectDoesNotExist:
                                    # NOTE(review): execution continues and
                                    # adm_div keeps the previous row's value
                                    # (or is unbound on the first row).
                                    traceback.print_exc()
                                    pass
                                value = sheet.cell_value(row_num, col_num)
                                print('[%s] (%s) %s (%s) / %s' % (scenario.value,
                                                                  rp.value,
                                                                  adm_div.name,
                                                                  adm_code,
                                                                  value))
                                db_values = {
                                    'table': table_name,  # From rp.layer
                                    'the_geom': geos.fromstr(adm_div.geom,
                                                             srid=adm_div.srid),
                                    'dim1': scenario.value,
                                    'dim1_order': scenario.order,
                                    'dim2': rp.value,
                                    'dim2_order': rp.order,
                                    'dim3': None,
                                    'dim4': None,
                                    'dim5': None,
                                    'risk_analysis': risk_analysis,
                                    'hazard_type': risk.hazard_type.mnemonic,
                                    # NOTE(review): .encode().replace(str, str)
                                    # only works on Python 2 — on Python 3 this
                                    # raises TypeError; confirm target runtime.
                                    'admin': adm_div.name.encode('utf-8').replace(
                                        "'", "''"),
                                    'adm_code': adm_div.code,
                                    'region': region.name,
                                    'value': value
                                }
                                self.insert_db(conn, db_values)
                                # Link the division to the analysis once.
                                risk_adm = RiskAnalysisAdministrativeDivisionAssociation.\
                                    objects.\
                                    filter(riskanalysis=risk,
                                           administrativedivision=adm_div)
                                if len(risk_adm) == 0:
                                    RiskAnalysisAdministrativeDivisionAssociation.\
                                        objects.\
                                        create(riskanalysis=risk,
                                               administrativedivision=adm_div)
                    elif app.name == RiskApp.APP_COST_BENEFIT:
                        # Cost-benefit sheets: one row per return period,
                        # value always in column 1.
                        cell_obj = sheet.cell(rp_idx + 1, 0)
                        cell_type_str = ctype_text.get(
                            cell_obj.ctype, 'unknown type')
                        if cell_obj.value:
                            adm_div = AdministrativeDivision.objects.get(
                                name=region)
                            value = sheet.cell_value(rp_idx + 1, 1)
                            print('[%s] (%s) %s / %s' % (scenario.value,
                                                         rp.value,
                                                         adm_div.name,
                                                         value))
                            db_values = {
                                'table': table_name,  # From rp.layer
                                'the_geom': geos.fromstr(adm_div.geom,
                                                         srid=adm_div.srid),
                                'dim1': scenario.value,
                                'dim1_order': scenario.order,
                                'dim2': rp.value,
                                'dim2_order': rp.order,
                                'dim3': None,
                                'dim4': None,
                                'dim5': None,
                                'risk_analysis': risk_analysis,
                                'hazard_type': risk.hazard_type.mnemonic,
                                'admin': adm_div.name.encode('utf-8').replace(
                                    "'", "''"),
                                'adm_code': adm_div.code,
                                'region': region.name,
                                'value': value
                            }
                            self.insert_db(conn, db_values)
                            risk_adm = RiskAnalysisAdministrativeDivisionAssociation.\
                                objects.\
                                filter(riskanalysis=risk,
                                       administrativedivision=adm_div)
                            if len(risk_adm) == 0:
                                RiskAnalysisAdministrativeDivisionAssociation.\
                                    objects.\
                                    create(riskanalysis=risk,
                                           administrativedivision=adm_div)
                    # Finished Import: Commit on DB
                    conn.commit()
                except Exception:
                    try:
                        conn.rollback()
                    except:
                        pass
                    traceback.print_exc()
                finally:
                    conn.close()
    # Import or Update Metadata if Metadata File has been specified/found
    if excel_metadata_file:
        call_command('importriskmetadata',
                     region=region.name,
                     excel_file=excel_metadata_file,
                     risk_analysis=risk_analysis,
                     risk_app=[app.name])
        risk.metadata_file = excel_metadata_file
    # Finalize
    risk.data_file = excel_file
    if commit:
        risk.save()
    return risk_analysis
workbook = xlrd.open_workbook(filename=args.excel_file) print workbook.sheet_names() worksheet = workbook.sheet_by_name(u"Sheet 1") num_rows = worksheet.nrows - 1 curr_row = -1 matches = [] while curr_row < num_rows: curr_row += 1 row = worksheet.row(curr_row) n0 = row[0] eid0 = row[2] cell_type_str = ctype_text.get(n0.ctype, "unknown type") if cell_type_str == "number": matches.append([int(n0.value), eid0.value]) n0 = row[5] eid0 = row[7] cell_type_str = ctype_text.get(n0.ctype, "unknown type") if cell_type_str == "number": matches.append([int(n0.value), eid0.value]) # Connect to AgBase ab = AgBase() ab.set_logging_on(True) user = ab.connect(args.user, args.passwd, args.server) if user is None: print ("ERROR: Login failed.")
def extract_data(table, xl_workbook, spreadsheet_type, df):
    """Copy the cell values of `table` into `df` and attach row descriptions.

    Args:
        table: a located-table object (see locate_data) exposing sheet_name,
            rows, cols, table_type, row_descriptions, indentation_levels,
            columns_with_indentation, merged_meta_data_row_headings,
            row_titles and top_header_row.
        xl_workbook: open xlrd workbook (must have been opened with
            formatting_info for xf_list / cell_xf_index to be populated).
        spreadsheet_type: e.g. "Time series" — triggers a column rename.
        df: the pandas DataFrame to fill; returned (possibly rebound).

    Returns:
        The populated DataFrame with description columns joined on.
    """
    sheet = xl_workbook.sheet_by_name(table.sheet_name)
    # Add data to the dataframe
    first_row = min(table.rows)
    first_col = min(table.cols)
    for r in table.rows:
        for c in table.cols:
            cell = sheet.row(r)[c]
            df.loc[r - first_row, c - first_col] = cell.value
    # Reset row indices so they go 0,1,2...n
    df = df.reset_index(drop=True)
    # Add row descriptions to the dataframe
    row_descriptions = table.row_descriptions
    indentation_levels = table.indentation_levels
    if table.table_type == "long format":
        # Descriptions come straight from the merged-cell helper.
        row_headings = merged_data_row_headings_function(
            xl_workbook,
            sheet_name=table.sheet_name,
            merged_data_rows=table.merged_meta_data_row_headings,
            data_rows=table.rows,
            row_descriptions=table.row_descriptions,
            row_titles=table.row_titles,
            top_header_row=table.top_header_row)
        df = df.join(row_headings)
    if table.table_type == "wide format":
        # Insert empty columns into dataframe, where the descriptions will go
        if table.columns_with_indentation:
            for c in table.columns_with_indentation:
                for idx, i in enumerate(indentation_levels[c]):
                    df.insert(loc=idx,
                              column='descriptor_col_' + str(c) + str(i),
                              value=['' for j in range(df.shape[0])])
            row = 0
            top_cell_row = min(table.rows)
            for r in table.rows:
                column = 0
                for c in table.columns_with_indentation:
                    for i in indentation_levels[c]:
                        cell_row = r
                        cell = sheet.row(cell_row)[c]
                        indentation_cell = int(
                            xl_workbook.xf_list[sheet.cell_xf_index(
                                cell_row, c)].alignment.indent_level)
                        # Walk upwards until a cell at (or below) the target
                        # indentation level with a value is found.
                        # NOTE(review): xlrd empty cells usually hold '' not
                        # None, so the `is None` guard may never fire —
                        # confirm against the source spreadsheets.
                        while indentation_cell > i or cell.value is None:
                            cell_row -= 1
                            if cell_row < 1:
                                break
                            cell = sheet.row(cell_row)[c]
                            indentation_cell = int(
                                xl_workbook.xf_list[sheet.cell_xf_index(
                                    cell_row, c)].alignment.indent_level)
                        if cell_row < top_cell_row:
                            top_cell_row = cell_row
                        if indentation_cell == i:
                            # Excel serial date -> dd/mm/YYYY string
                            # (25569 = days between 1899-12-30 and epoch).
                            if ctype_text.get(cell.ctype,
                                              'unknown type') == "xldate":
                                df.loc[row, df.columns[column]] = pd.to_datetime(
                                    (cell.value - 25569) * 86400.0,
                                    unit='s').strftime('%d/%m/%Y')
                            else:
                                df.loc[row, df.columns[column]] = cell.value
                        column += 1
                row += 1
        # Description columns without indentation are handled by the
        # merged-cell helper instead.
        other_columns = set(i for i in table.row_descriptions
                            if i not in table.columns_with_indentation)
        if other_columns:
            row_headings = merged_data_row_headings_function(
                xl_workbook,
                sheet_name=table.sheet_name,
                merged_data_rows=table.merged_meta_data_row_headings,
                data_rows=table.rows,
                row_descriptions=other_columns,
                row_titles=table.row_titles,
                top_header_row=table.top_header_row)
            if spreadsheet_type == "Time series":
                row_headings.rename(columns={'Series ID': 'Date'},
                                    inplace=True)
            df = df.join(row_headings)
    return df
def transPerFile(self, infile, outfile):
    """Called on a per file basis from transformAll.

    Converts the first sheet of the Excel file `infile` into a
    "museumPlusExport" XML document written to `outfile`. The record type
    (element tag and id attribute) is chosen from the file name prefix
    (so/pk/mm/aus); each non-empty cell of a row becomes a child element
    named after its (sanitised) column header.

    Raises:
        ValueError: if a sanitised column header starts with a digit.
    Side effects:
        sets self.mtime; exits the process for unknown file prefixes.
    """
    self.mtime = os.path.getmtime(infile)
    wb = xlrd.open_workbook(filename=infile, on_demand=True)
    sheet = wb.sheet_by_index(0)
    root = ET.Element(
        "museumPlusExport",
        attrib={
            "version": "2.0",
            "level": "dirty",
        },
    )
    tree = ET.ElementTree(root)
    columns = [sheet.cell(0, c).value for c in range(sheet.ncols)]
    base = os.path.basename(infile)
    # invalid xml characters: will be stripped
    remove_re = re.compile(u"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")
    for r in range(1, sheet.nrows):  # leave out column headers
        # Record type is determined by the file-name prefix.
        if re.match("so", base, flags=re.I):
            tag = "sammlungsobjekt"
            attrib = "objId"
        elif re.match("pk", base, flags=re.I):
            tag = "personKörperschaft"
            attrib = "kueId"
        elif re.match("mm", base, flags=re.I):
            tag = "multimediaobjekt"
            attrib = "mulId"
        elif re.match("aus", base, flags=re.I):
            tag = "ausstellung"
            attrib = "ausId"
        else:
            print("Error: Unknown file %s" % infile)
            sys.exit(1)
        index = sheet.cell(r, columns.index(attrib)).value
        if index:
            index = str(int(index))
            if index != "":  # Dont include rows without meaningful index
                t = datetime.fromtimestamp(
                    self.mtime, timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
                doc = ET.SubElement(root,
                                    tag,
                                    attrib={
                                        attrib: index,
                                        "exportdatum": str(t)
                                    })
                print("INDEX: %s" % index)  # should this become verbose?
                row_dict = {}
                for c in range(sheet.ncols):
                    cell = sheet.cell(r, c)
                    cellTypeStr = ctype_text.get(cell.ctype, "unknown type")
                    tag = sheet.cell(0, c).value
                    tag = (tag[0].lower() + tag[1:]
                           )  # I want lowercase initial for all element names
                    tag = re.sub(
                        r"\W|&|<|>|:", "",
                        tag)  # xml spec: strip illegal chars for elements
                    if re.search(r"\A[0-9]", tag):
                        raise ValueError(
                            "XML spec doesn't allow elements to begin with numbers"
                        )
                    # BUGFIX: previously `val` was only assigned for the
                    # number/xldate/text branches, so 'bool'/'blank'/'error'
                    # cells silently reused the previous column's value (or
                    # raised NameError on the very first cell). Default to
                    # the raw cell value before the type conversions.
                    val = cell.value
                    # type conversions
                    if cellTypeStr == "number":
                        val = int(cell.value)
                    elif cellTypeStr == "xldate":
                        val = xlrd.xldate.xldate_as_datetime(cell.value, 0)
                    elif cellTypeStr == "text":
                        # escape() here would lead to double escaping
                        val = remove_re.sub("", cell.value)  # rm illegal xml char
                    if cellTypeStr != "empty":  # write non-empty elements
                        val = str(val).strip(
                        )  # rm leading and trailing whitespace; turn into str
                        if tag != attrib and val != "":
                            row_dict[tag] = val
                # Emit the row's elements in deterministic (sorted) order.
                for tag in sorted(row_dict.keys()):
                    ET.SubElement(doc, tag).text = row_dict[tag]
    self.indent(root)
    tree.write(outfile, encoding="UTF-8", xml_declaration=True)
def merged_data_row_headings_function(xl_workbook, sheet_name,
                                      merged_data_rows, data_rows,
                                      row_descriptions, row_titles,
                                      top_header_row):
    """
    Function to extract data from merged cells
    merged_data_rows is a list of tuples. Each tuple is in the format used by
    xlrd function merged_cells

    Builds a DataFrame of row-heading values aligned to the data rows:
    merged (vertically spanning) description cells are expanded so every
    covered data row receives the heading; descriptions found in non-data
    rows are forward-filled down onto the data rows afterwards.
    """
    sheet = xl_workbook.sheet_by_name(sheet_name)
    # Rows between the header and the last data row that hold no data —
    # they may still carry descriptions that apply to following rows.
    other_rows = set(i for i in range(top_header_row + 1, max(data_rows))
                     if i not in data_rows)
    all_rows = data_rows.union(other_rows)
    # sheet.merged_cells returns a list of tuples. Each tuple has 4 elements
    # a,b,c,d:
    # a,c is the top-left coordinate (row / col, starting with 0) where the
    # merge starts.
    # b,d is the bottom right coordinate (row / col, starting with 1) where
    # the merge finishes
    row_headings = pd.DataFrame()
    # Every (row, col) of interest expressed as a degenerate 1x1 "merge".
    all_positions = []
    for i in all_rows:
        for j in row_descriptions:
            all_positions.append((i, i + 1, j, j + 1))
    merged_meta_data_row_headings = list(
        filter(lambda x: x[0] in data_rows, merged_data_rows))
    # Enumerate every sub-span covered by a real merge so those positions
    # are not also treated as standalone cells below.
    all_merged_positions = []
    for i in merged_meta_data_row_headings:
        j = i[0]  # start row
        k = 1
        while j < i[1]:
            for cells in range(i[0], i[0] + k):
                all_merged_positions.append((cells, i[0] + k, i[2], i[3]))
            j += 1
            k += 1
    merged_meta_data_extended = copy.copy(merged_meta_data_row_headings)
    merged_meta_data_extended.extend(i for i in all_positions
                                     if i not in all_merged_positions)
    # Needs to be sorted to ensure the descriptions line up properly with
    # the data
    merged_meta_data_extended.sort(key=itemgetter(0, 2))
    # Map of "how many empty (non-data) rows precede" -> row number, used
    # to shift DataFrame positions past gaps in the data rows.
    values = [0]
    values.extend(row for row in range(min(data_rows), max(data_rows))
                  if row not in data_rows)
    keys = [0]
    k = 1
    for v in values:
        keys.append(k)
        k += 1
    empty_rows = dict(zip(keys, values))
    descriptions_in_other_rows = []
    columns_included = set()
    first_data_row = min(data_rows)
    for c in row_descriptions:
        column_heading = row_titles[c]
        for i in merged_meta_data_extended:
            if i[2] == c:
                row_position = i[
                    0] - first_data_row  # row position in df (in other words, the row number)
                # Filter out entries that occur in empty columns
                empty_rows_filtered = dict(
                    filter(lambda elem: elem[1] < i[1], empty_rows.items()))
                if empty_rows_filtered:
                    row_position = row_position - max(
                        empty_rows_filtered, key=empty_rows_filtered.get)
                # Remember non-empty descriptions so the ones living in
                # non-data rows can be forward-filled later.
                for k in range(i[0], i[1]):
                    cell_value = sheet.row(i[0])[i[2]].value
                    if cell_value != '':
                        descriptions_in_other_rows.append({
                            'Row': i[1],
                            'Col': i[2],
                            'row_position': row_position,
                            'Desc_row' + str(i[2]): cell_value
                        })
                # Write the (top-left) merged value onto every covered row.
                for k in range(i[0], i[1]):
                    cell = sheet.row(i[0])[i[2]]
                    if row_position >= 0 and i[0] in data_rows:
                        # Excel serial date -> dd/mm/YYYY string.
                        if ctype_text.get(cell.ctype,
                                          'unknown type') == "xldate":
                            row_headings.loc[row_position,
                                             column_heading] = pd.to_datetime(
                                                 (cell.value - 25569) * 86400.0,
                                                 unit='s').strftime('%d/%m/%Y')
                        else:
                            row_headings.loc[row_position,
                                             column_heading] = cell.value
                    if cell.value != '' and i[0] in data_rows:
                        columns_included.add(i[2])
                    row_position += 1
    # Keep only descriptions that sit in non-data rows of columns that did
    # not already contribute an in-row heading.
    if descriptions_in_other_rows:
        descriptions_in_other_rows = list(
            filter(
                lambda i: i['Row'] not in data_rows and i['Col'] not in
                columns_included, descriptions_in_other_rows))
        for d in descriptions_in_other_rows:
            del d['Col']
    if descriptions_in_other_rows:
        # Map spreadsheet row numbers onto DataFrame positions, forward-fill
        # the out-of-band descriptions down to the data rows, then join.
        spreadsheet_rows = list(range(min(data_rows), max(data_rows) + 1))
        correspondence = {}
        k = 0
        for i in spreadsheet_rows:
            if i in data_rows:
                correspondence[i] = k
                k += 1
        correspondence = pd.DataFrame(correspondence.items(),
                                      columns=['index', 'New_index'])
        descriptions_in_other_rows = pd.DataFrame(descriptions_in_other_rows)
        descriptions_in_other_rows = descriptions_in_other_rows.sort_values(
            by=['Row'])
        descriptions_in_other_rows.set_index('Row', inplace=True)
        descriptions_in_other_rows = descriptions_in_other_rows.reindex(
            range(max(data_rows) + 1))
        descriptions_in_other_rows.ffill(axis=0, inplace=True)
        descriptions_in_other_rows['index'] = descriptions_in_other_rows.index
        descriptions_in_other_rows = descriptions_in_other_rows.merge(
            correspondence, on='index', how='left')
        descriptions_in_other_rows.drop(['row_position', 'index'],
                                        axis=1,
                                        inplace=True)
        descriptions_in_other_rows.set_index('New_index', inplace=True)
        descriptions_in_other_rows.rename_axis(None, inplace=True)
        row_headings = row_headings.join(descriptions_in_other_rows)
    return row_headings
def show_column_names(xl_sheet): row = xl_sheet.row(0) # 1st row print(60*'-' + 'n(Column #) value [type]n' + 60*'-') for idx, cell_obj in enumerate(row): cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type') print('(%s) %s [%s]' % (idx, cell_obj.value, cell_type_str, ))
lastname = sheet.row(row)[0].value.strip().title() else: name = sheet.row(row)[0].value.title() if ',' in name: names = name.split(',') if '/' in name: names = name.split('/') else: name = name.split(' ') lastname = names[0].strip() firstname = names[1].strip() print lastname, firstname element = sheet.row(row)[2] if ctype_text.get(element.ctype) == 'xldate': bdate = xldate_to_datetime(element.value) else: bdate = element.value.strip().decode('utf-8') print bdate element = sheet.row(row)[4] if ctype_text.get(element.ctype) == 'number': audio_clip = int(element.value) else: audio_clip = element.value print 'audio_clip', audio_clip element = sheet.row(row)[5] program = '' if element.value != '':
##### # There are 2 ways to open the desired excel worksheet: # by index -----> xl_wrkbk.sheet_by_index(0) # by name -----> xl_wrkbk.sheet_by_name('Sheet1') xl_sheet = xl_wrkbk.sheet_by_index(0) ##### # Extract a row from the opened excel-sheet. # NOTE: Starting index for both rows and columns is 0(zero). a_row = xl_sheet.row(0) ##### # A good way to print row meta-information and values print('(Column #) type:value') for idx, cell_obj in enumerate(a_row): cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type') print('(%s) %s %s' % (idx, cell_type_str, cell_obj.value)) ##### # Iterate through each cell and print the value # Number of columns num_cols = xl_sheet.ncols # Iterate through rows for row_idx in range(0, xl_sheet.nrows): print ('-'*40) print ('Row: %s' % row_idx) # Iterate through columns for col_idx in range(0, num_cols): cell_obj = xl_sheet.cell(row_idx, col_idx) # Get cell object by row, col
def callback(ch, method, properties, body):
    """RabbitMQ consumer callback: decode a base64 credit-card statement
    (Isracard or Visa, legacy .xls or .xlsx), normalise it into a fixed
    5-column .xlsx file on disk and forward a message via publisher.sender().

    NOTE(review): `base64_message` is read below BEFORE it is ever assigned
    in this function — building `message_format` raises NameError unless a
    module-level `base64_message` exists, and the forwarded `message` never
    contains the freshly converted file. Presumably message_format/message
    were meant to be built after the conversion; confirm with the author.
    """
    print(" [x] Received %r" % body)
    excel = body
    excel_dict = json.loads(excel)
    encoding_data = excel_dict["base64buffer"]
    decoded_data = base64.b64decode(encoding_data)
    file_extension = excel_dict["fileName"]
    userId = excel_dict["userId"]
    filename = excel_dict["fileName"]
    curr_date = excel_dict["curr_date"]
    message_format = {
        "userid": userId,
        "base64buffer": base64_message,  # NOTE(review): see docstring
        "date": curr_date,
        "fileName": filename
    }
    message = json.dumps(message_format)
    print(userId)
    print(filename)
    if "base64buffer" in excel_dict:
        if str(file_extension).split(".")[1] == 'xls':
            print('excel format : xls')
            try:
                # Copy the legacy .xls (read with xlrd) cell-by-cell into a
                # fresh openpyxl workbook so it can be re-saved as .xlsx.
                wbb1 = Workbook()
                ws = wbb1.active
                xl_workbook = open_workbook(file_contents=decoded_data,
                                            on_demand=True)
                sheet_names = xl_workbook.sheet_names()
                xl_sheet = xl_workbook.sheet_by_name(sheet_names[0])
                xl_sheet = xl_workbook.sheet_by_index(0)
                row = xl_sheet.row(0)
                for idx, cell_obj in enumerate(row):
                    # Header-cell type inspection; result is unused.
                    cell_type_str = ctype_text.get(cell_obj.ctype,
                                                   'unknown type')
                num_cols = xl_sheet.ncols  # Number of columns
                for row_idx in range(0, xl_sheet.nrows):  # Iterate through rows
                    for col_idx in range(0, num_cols):  # Iterate through columns
                        cell_obj = xl_sheet.cell(
                            row_idx, col_idx)  # Get cell object by row, col
                        # openpyxl is 1-based, xlrd is 0-based.
                        ws.cell(row=row_idx + 1,
                                column=col_idx + 1).value = cell_obj.value
                f1 = 'c:\\OM\\output-xls-' + str(uuid.uuid4()) + '.xlsx'
                wbb1.save(filename=f1)
                workbook = openpyxl.load_workbook(f1)
                wb1 = workbook.active
                mr = wb1.max_row
                mc = wb1.max_column
                # An empty A1 cell is used as the Isracard fingerprint.
                match = wb1.cell(row=1, column=1)
                if match.value == None:
                    print("ISRACARD")
                    wb1.delete_cols(3, 2)
                    sheet = wb1
                    alist = []  # rows where a card-number line appears
                    blist = []  # the card numbers themselves
                    wbb = Workbook()
                    ws = wbb.active
                    # Output header row (Hebrew): transaction date, merchant,
                    # amount, category, card number.
                    x = [
                        'תאריך עסקה', 'שם בית העסק', 'סכום עסקה', 'קטגוריה',
                        'מספר כרטיס'
                    ]
                    counter = 1
                    for i in x:
                        ws.cell(row=1, column=counter, value=i)
                        counter += 1
                    # Locate the per-card section headers (lines containing
                    # 4-digit card fragments).
                    for i in range(4, mr):
                        b3 = sheet.cell(row=i, column=1)
                        s = str(b3.value).split(" ")
                        r = re.compile('[0-9]{4}')
                        newlist = list(filter(r.match, s))
                        list1 = [newlist, i]
                        if newlist != []:
                            alist.append(
                                i)  # add row number for cars number to list
                            blist.append(
                                ', '.join(newlist))  # add cards number to list
                        else:
                            continue
                    alist.append(mr + 5)  # sentinel end marker
                    # Copy each card's transaction block, tagging every row
                    # with its card number in column 5.
                    for a in range(0, len(alist) - 1):
                        for i in range(alist[a] + 3, alist[a + 1] - 2):
                            for j in range(1, 4):
                                # reading cell value from source excel file
                                c = wb1.cell(row=i, column=j)
                                # writing the read value to destination excel file
                                ws.cell(row=i - 5, column=j).value = c.value
                            ws.cell(row=i - 5, column=5).value = str(blist[a])
                    # Drop rows whose first column stayed empty.
                    # NOTE(review): deleting rows while iterating by index
                    # skips the row that shifts into the deleted slot.
                    for j in range(1, 4):
                        for i in range(1, mr):
                            if ws.cell(row=i, column=1).value is None:
                                ws.delete_rows(i)
                            else:
                                continue
                    f = 'c:\\OM\\output-isracard-' + str(
                        uuid.uuid4()) + '.xlsx'
                    wbb.save(filename=f)
                    with open(f, 'rb') as binary_file:
                        binary_file_data = binary_file.read()
                        base64_encoded_data = base64.b64encode(
                            binary_file_data)
                        base64_message = base64_encoded_data.decode('utf-8')
                        print("Before publish to yogev")
                        publisher.sender(message)
                        print("send message to sender function")
                else:
                    print("VISA")
                    wb1.delete_cols(3)
                    sheet = wb1
                    clist = []
                    # Card number: row 2, column A, 4th word after the comma.
                    b3 = sheet.cell(row=2, column=1)
                    s = str(b3.value).split(",")[1].split(" ")[3]
                    print(s)
                    r = re.compile('[0-9]{4}')
                    # NOTE(review): list(s) splits into single characters and
                    # the compiled regex `r` is unused here — confirm intent.
                    newlist = list(s)
                    list1 = [newlist, 2]
                    if newlist != []:
                        clist.append(
                            ', '.join(newlist))  # add cards number to list
                    wbb = Workbook()
                    ws = wbb.active
                    x = [
                        'תאריך עסקה', 'שם בית העסק', 'סכום עסקה', 'קטגוריה',
                        'מספר כרטיס'
                    ]
                    counter = 1
                    for i in x:
                        ws.cell(row=1, column=counter, value=i)
                        counter += 1
                    for i in range(4, mr):
                        for j in range(1, 4):
                            c = wb1.cell(row=i, column=j)
                            ws.cell(row=i - 2, column=j).value = c.value
                        ws.cell(row=i - 2, column=5).value = s
                    f = 'c:\\OM\\output-visa-' + str(uuid.uuid4()) + '.xlsx'
                    wbb.save(filename=f)
                    with open(f, 'rb') as binary_file:
                        binary_file_data = binary_file.read()
                        base64_encoded_data = base64.b64encode(
                            binary_file_data)
                        base64_message = base64_encoded_data.decode('utf-8')
                        print("Before publish to yogev")
                        publisher.sender(message)
            except:
                # NOTE(review): bare except — any failure (not only corrupt
                # files) is swallowed and misreported.
                print('cannot parsing xlsx file, the file is corrupted')
        if str(file_extension).split(".")[1] == 'xlsx':
            print('excel format : xlsx')
            try:
                # Modern .xlsx path: load directly with openpyxl, then apply
                # the same Isracard/Visa normalisation as above.
                xls_filelike = io.BytesIO(decoded_data)
                workbook = openpyxl.load_workbook(xls_filelike)
                wbb2 = Workbook()
                ws = wbb2.active
                data = workbook
                f2 = 'c:\\OM\\output-xlsx-' + str(uuid.uuid4()) + '.xlsx'
                workbook.save(filename=f2)
                workbook = openpyxl.load_workbook(f2)
                wb1 = workbook.active
                mr = wb1.max_row
                mc = wb1.max_column
                match = wb1.cell(row=1, column=1)
                if match.value == None:
                    print("ISRACARD")
                    wb1.delete_cols(3, 2)
                    sheet = wb1
                    alist = []
                    blist = []
                    wbb = Workbook()
                    ws = wbb.active
                    x = [
                        'תאריך עסקה', 'שם בית העסק', 'סכום עסקה', 'קטגוריה',
                        'מספר כרטיס'
                    ]
                    counter = 1
                    for i in x:
                        ws.cell(row=1, column=counter, value=i)
                        counter += 1
                    for i in range(4, mr):
                        b3 = sheet.cell(row=i, column=1)
                        s = str(b3.value).split(" ")
                        r = re.compile('[0-9]{4}')
                        newlist = list(filter(r.match, s))
                        list1 = [newlist, i]
                        if newlist != []:
                            alist.append(
                                i)  # add row number for cars number to list
                            blist.append(
                                ', '.join(newlist))  # add cards number to list
                        else:
                            continue
                    alist.append(mr + 5)
                    for a in range(0, len(alist) - 1):
                        for i in range(alist[a] + 3, alist[a + 1] - 2):
                            for j in range(1, 4):
                                # reading cell value from source excel file
                                c = wb1.cell(row=i, column=j)
                                # writing the read value to destination excel file
                                ws.cell(row=i - 5, column=j).value = c.value
                            ws.cell(row=i - 5, column=5).value = str(blist[a])
                    for j in range(1, 4):
                        for i in range(1, mr):
                            if ws.cell(row=i, column=1).value is None:
                                ws.delete_rows(i)
                            else:
                                continue
                    f = 'c:\\OM\\output-isracard' + str(uuid.uuid4()) + '.xlsx'
                    wbb.save(filename=f)
                    with open(f, 'rb') as binary_file:
                        binary_file_data = binary_file.read()
                        base64_encoded_data = base64.b64encode(
                            binary_file_data)
                        base64_message = base64_encoded_data.decode('utf-8')
                        print("Before publish to yogev")
                        publisher.sender(message)
                else:
                    print("VISA")
                    wb1.delete_cols(3)
                    sheet = wb1
                    clist = []
                    b3 = sheet.cell(row=2, column=1)
                    s = str(b3.value).split(",")[1].split(" ")[3]
                    print(s)
                    r = re.compile('[0-9]{4}')
                    newlist = list(s)
                    list1 = [newlist, 2]
                    if newlist != []:
                        clist.append(
                            ', '.join(newlist))  # add cards number to list
                    wbb = Workbook()
                    ws = wbb.active
                    x = [
                        'תאריך עסקה', 'שם בית העסק', 'סכום עסקה', 'קטגוריה',
                        'מספר כרטיס'
                    ]
                    counter = 1
                    for i in x:
                        ws.cell(row=1, column=counter, value=i)
                        counter += 1
                    for i in range(4, mr):
                        for j in range(1, 4):
                            c = wb1.cell(row=i, column=j)
                            ws.cell(row=i - 2, column=j).value = c.value
                        ws.cell(row=i - 2, column=5).value = s
                    f = 'c:\\OM\\output-visa-' + str(uuid.uuid4()) + '.xlsx'
                    wbb.save(filename=f)
                    with open(f, 'rb') as binary_file:
                        binary_file_data = binary_file.read()
                        base64_encoded_data = base64.b64encode(
                            binary_file_data)
                        base64_message = base64_encoded_data.decode('utf-8')
                        print("Before publish to yogev")
                        publisher.sender(message)
            except:
                print('cannot parsing xlsx file, the file is corrupted')
    else:
        print('Cannot parsing json from rabbit')
def merged_data_function(xl_workbook, sheet_name, merged_data_cols, data_cols,
                         data_rows, extra_rows, last_row_in_sheet,
                         spreadsheet_type, column_header_locations,
                         column_position=1):
    """
    Function to extract data from merged cells
    merged_data_cols is a list of tuples. Each tuple is in the format used by
    xlrd function merged_cells

    Builds a DataFrame of column headings aligned to the data columns:
    horizontally merged header cells are expanded so every covered data
    column receives the heading; merges that repeat the same column span in
    non-header rows are treated as subheadings and excluded. Rows listed in
    `extra_rows` contribute one extra (column-A) description per data column.
    """
    sheet = xl_workbook.sheet_by_name(sheet_name)
    last_row = max(data_rows)
    # Non-data rows below the header block — candidates for subheadings.
    other_rows = set(
        i for i in range(last_row_in_sheet + 1)
        if i not in data_rows and i > max(column_header_locations))
    all_rows = column_header_locations.union(other_rows)
    column_headings = pd.DataFrame()
    merged_meta_data = list(
        filter(lambda x: x[0] in all_rows, merged_data_cols))
    # Get the merged items that have the same column dimensions. These are
    # understood to be subheadings.
    other_rows = set(
        i for i in range(last_row)
        if i not in data_rows and i > max(column_header_locations) - 1)
    all_rows = column_header_locations.union(other_rows)
    merged_meta_data_subheadings_potential = list(
        filter(lambda x: x[0] in other_rows, merged_data_cols))
    merged_meta_data_last_2_elements = [
        el[2:] for el in merged_meta_data_subheadings_potential
    ]
    # Column spans that occur more than once -> subheading spans.
    duplicates = list(
        set([
            ele for ele in merged_meta_data_last_2_elements
            if merged_meta_data_last_2_elements.count(ele) > 1
        ]))
    merged_meta_data_subheadings = []
    for i in merged_meta_data_subheadings_potential:
        for j in duplicates:
            if i[2] == j[0] and i[3] == j[1]:
                merged_meta_data_subheadings.append(i)
    # Remove the subheading rows
    merged_meta_data = [
        x for x in merged_meta_data if x not in merged_meta_data_subheadings
    ]
    subheading_rows = [i[0] for i in merged_meta_data_subheadings]
    rows_not_subheadings = [
        x for x in column_header_locations if x not in subheading_rows
    ]
    # Find how the merged data relates to the columns
    first_col = min(data_cols)
    # Every (row, col) of interest expressed as a degenerate 1x1 "merge".
    all_positions = []
    for i in rows_not_subheadings:
        for j in data_cols:
            all_positions.append((i, i + 1, j, j + 1))
    # Enumerate every sub-span covered by a real merge so those positions
    # are not also treated as standalone cells below.
    all_merged_positions = []
    for i in merged_meta_data:
        j = i[2]  # start column
        k = 1
        while j < i[3]:
            for cells in range(i[2], i[2] + k):
                all_merged_positions.append((i[0], i[1], cells, i[2] + k))
            j += 1
            k += 1
    merged_meta_data_extended = copy.copy(merged_meta_data)
    merged_meta_data_extended.extend(i for i in all_positions
                                     if i not in all_merged_positions)
    # Needs to be sorted to ensure the descriptions line up properly with
    # the data
    merged_meta_data_extended.sort(key=itemgetter(0, 2))
    # Map used to shift DataFrame positions past gaps in the data columns.
    values = [0]
    values.extend(col for col in range(min(data_cols), max(data_cols))
                  if col not in data_cols)
    keys = [0]
    k = 1
    for v in values:
        keys.append(k)
        k += 1
    empty_cols = dict(zip(keys, values))
    # sheet.merged_cells returns a list of tuples. Each tuple has 4 elements
    # a,b,c,d:
    # a,c is the top-left coordinate (row / col, starting with 0) where the
    # merge starts.
    # b,d is the bottom right coordinate (row / col, starting with 1) where
    # the merge finishes (who knows why?)
    column_titles = {}
    if spreadsheet_type == "Census TableBuilder":
        # Title of each header row is taken from its column-A cell, with a
        # synthetic fallback when that cell is blank.
        i = 1
        for r in column_header_locations:
            column_titles[r] = sheet.cell_value(rowx=r, colx=0)
            if column_titles[r] == '':
                column_titles[r] = "Column_description_title_" + str(i)
            i += 1
    else:
        for i, r in enumerate(rows_not_subheadings):
            column_titles[r] = "Row_description_title_" + str(i)
    for r in rows_not_subheadings:
        column_heading = column_titles[r]
        for i in merged_meta_data_extended:
            if i[0] == r:
                row_position = i[
                    2] - first_col  # row position in df (in other words, the column number)
                # Filter out entries that occur in empty columns
                empty_cols_filtered = dict(
                    filter(lambda elem: elem[1] < i[3], empty_cols.items()))
                if empty_cols_filtered:
                    row_position = row_position - max(
                        empty_cols_filtered, key=empty_cols_filtered.get)
                # Write the (top-left) merged value onto every covered col.
                for k in range(i[2], i[3]):
                    cell = sheet.row(i[0])[i[2]]
                    if row_position >= 0:
                        # Excel serial date -> dd/mm/YYYY string.
                        if ctype_text.get(cell.ctype,
                                          'unknown type') == "xldate":
                            column_headings.loc[row_position, column_heading] = \
                                pd.to_datetime((cell.value - 25569) * 86400.0,
                                               unit='s').\
                                strftime('%d/%m/%Y')
                        else:
                            column_headings.loc[row_position,
                                                column_heading] = cell.value
                    row_position += 1
    columns_to_evaluate = [0]  # Assume the extra row info is all in column A
    for j in extra_rows:
        column_heading = 'Col_desc_' + str(column_position)
        for i in columns_to_evaluate:
            cell = sheet.row(j)[i]
            # Broadcast the single extra-row value onto every data column.
            for row_position in range(0, len(data_cols)):
                if ctype_text.get(cell.ctype, 'unknown type') == "xldate":
                    column_headings.loc[row_position, column_heading] = \
                        pd.to_datetime((cell.value - 25569) * 86400.0,
                                       unit='s'). \
                        strftime('%d/%m/%Y')
                else:
                    column_headings.loc[row_position,
                                        column_heading] = cell.value
        column_position += 1
    return column_headings
def locate_data(xl_workbook, data_sheets, allowed_blank_rows, data_type):
    """ Function to locate the data in the spreadsheet and assign it to a TableData class

    Scans each named sheet for numeric cells that look like table data and records
    their row/column positions in TableData objects (project class, defined elsewhere).

    :param xl_workbook: an open xlrd workbook (opened with formatting_info, since
        xf_list / font_list are consulted — presumably; confirm at the call site)
    :param data_sheets: iterable of sheet names to scan
    :param allowed_blank_rows: number of consecutive blank rows tolerated before the
        current table is considered finished
    :param data_type: "Time series" triggers a pre-scan for a "Series ID" header row
    :return: list of TableData objects, one per table found
    """
    # Initiate the first table
    tables = []
    table_number = -1
    for s in data_sheets:
        sheet = xl_workbook.sheet_by_name(s)
        # Per-sheet state machine flags.
        found_table = False
        r = 0
        found_data = False
        looking_for_multiple_tables = False
        blank_row = False
        blank_row_count = 0
        quit_loop = False
        date_cols = []
        if data_type == "Time series":
            # For time-series sheets, data starts one row below the "Series ID" label.
            for i in range(sheet.nrows):
                row = sheet.row(i)
                if quit_loop:
                    break
                for idx, cell_obj in enumerate(row):
                    if cell_obj.value == "Series ID":
                        start_row = i + 1
                        quit_loop = True
                        break
                    else:
                        start_row = 0
            # Reset so the data-finding loop below is not skipped.
            quit_loop = False
        # NOTE(review): start_row may be unset for non-time-series sheets; the
        # locals() check below papers over that. It also carries over a value
        # from a previous sheet in the loop — presumably intended; confirm.
        if 'start_row' not in locals():
            start_row = 0
        # Find data
        for i in range(start_row, sheet.nrows):
            # if i < 9:
            #     continue
            if quit_loop:
                break
            row = sheet.row(i)
            c = 0
            # Track runs of blank rows; a long enough run ends the current table.
            if blank_row:
                blank_row_count += 1
            else:
                blank_row_count = 0
            if blank_row_count >= allowed_blank_rows:
                found_data = False
                found_table = False
                blank_row = False
                date_cols = []
                continue
            if found_data:
                looking_for_multiple_tables = True
                blank_row = True  # Temporary assignment, will be made false if something is found in the row
                blank_row_count = 0
                r += 1
                found_data = False
            for idx, cell_obj in enumerate(row):
                if isinstance(cell_obj.value, str):
                    # if "STANDARD ERROR" in cell_obj.value:
                    #     quit_loop = True
                    #     break
                    # Remember columns whose header names them as date-like so the
                    # numeric values in them are not mistaken for data.
                    if not found_data:
                        if cell_obj.value in [
                                "Year", "year", "Years", "Date", "Month", "Day"
                        ]:
                            date_cols.append(idx)
                    continue
                # Return the cell 'weight' (700 is bold, 400 is 'normal')
                rd_xf = xl_workbook.xf_list[sheet.cell_xf_index(i, idx)]
                cell_font = xl_workbook.font_list[rd_xf.font_index].weight
                #cell_format = xl_workbook.format_map[rd_xf.format_key].format_str
                cell_type = ctype_text.get(cell_obj.ctype, 'unknown type')
                # Check if it is a total row (p.s. this is very specific, sometimes
                # total row is at the top and bold
                if cell_type == "number" and not found_data and idx != 0:
                    # Top left data cell
                    left_of_data = sheet.row(i)[idx - 1].value
                # NOTE(review): left_of_data is only assigned above when idx != 0;
                # the next condition relies on short-circuiting (cell_font != 700)
                # to avoid a NameError when idx == 0 — TODO confirm this holds for
                # all inputs.
                #if cell_type == "number" and cell_format != "General" and \
                if cell_type == "number" and (
                        cell_font != 700
                        or left_of_data == "Total") and idx not in date_cols:
                    # Store info on location of data column
                    found_data = True
                    if not found_table:
                        tables.append(TableData(sheet_name=sheet.name))
                        table_number += 1
                        found_table = True
                    tables[table_number].add_row(i)
                    tables[table_number].add_col(idx)
                    c += 1
                # Sometimes there is a 'total' row that is bold. Let's not skip it.
                # If the table has been started, allow bold rows.
                # p.s this might be redundant if the total row is always called 'Total' as per above if statement
                #if found_table and cell_type == "number" and cell_format != "General" and cell_font == 700:
                if found_table and cell_type == "number" and cell_font == 700:
                    found_data = True
                    tables[table_number].add_row(i)
                    tables[table_number].add_col(idx)
                    c += 1
                # Any content at all cancels the provisional blank-row flag set
                # above once we are searching for subsequent tables.
                if cell_type in ["number", "text"
                                 ] and looking_for_multiple_tables is True:
                    blank_row = False
                    blank_row_count = 0
    return tables
def describe_col_headings(xl_workbook, sheet_name, data_rows, data_cols,
                          row_descriptions, top_row, top_header_row,
                          row_descriptions_header_row, spreadsheet_type,
                          last_row_in_sheet):
    """ Find column headings. There might be multiple column headings (above each other)
    that might be units ($, %, etc) or they might be merged cells

    Works upward from the first data row, classifying rows above the data as column
    headers (text or bold content over the data columns) and collecting extra
    metadata rows over the row-description columns.

    :param xl_workbook: open xlrd workbook (with formatting info — xf_list is used)
    :param sheet_name: name of the sheet to inspect
    :param data_rows: iterable of row indices containing data
    :param data_cols: iterable of column indices containing data
    :param row_descriptions: column indices holding row labels
    :param spreadsheet_type: "Data cube" selects row_descriptions-based scanning;
        anything else uses a Census-TableBuilder-style scan
    :return: (merged_meta_data, column_header_locations, extra_meta_data) where
        merged_meta_data is the subset of sheet.merged_cells relevant to the data
    """
    first_row = min(data_rows)
    last_col = max(data_cols)
    first_col = min(data_cols)
    sheet = xl_workbook.sheet_by_name(sheet_name)
    # Find merged column headings
    # sheet.merged_cells returns a list of tuples. Each tuple has 4 elements a,b,c,d
    # a,c is the top-left coordinate (row / col, starting with 0) where the merge starts.
    # b,d is the bottom right coordinate (row / col, starting with 1) where the merge finishes (who knows why?)
    assert isinstance(sheet.merged_cells, object)
    all_mergers = sheet.merged_cells
    rows_above_data = range(0, first_row)
    column_header_locations = set()
    blank_row = False
    found_a_non_blank_row = False
    found_a_column_header = False

    def check_rows(idx, merged_meta_data):
        # True when the column index lies in the data region, or starts a merged
        # range on this row (so merged headers left of the data still count).
        if merged_meta_data:
            merged_col1 = list(zip(*merged_meta_data))[2]
            if idx >= first_col or idx in merged_col1:
                return True
            else:
                return False
        else:
            if idx >= first_col:
                return True
            else:
                return False

    blank_row = False
    found_a_non_blank_row = False
    # Walk upward from just above the data; stop once a blank row is hit after at
    # least one header row has been found.
    for r in reversed(rows_above_data):
        if blank_row and found_a_non_blank_row and found_a_column_header:
            break
        row = sheet.row(r)
        merged_meta_data = list(filter(lambda x: x[0] == r, all_mergers))
        for idx, cell_obj in enumerate(row):
            # Return the cell 'weight' (700 is bold, 400 is 'normal')
            rd_xf = xl_workbook.xf_list[sheet.cell_xf_index(r, idx)]
            cell_font = xl_workbook.font_list[rd_xf.font_index].weight
            cell_type = ctype_text.get(cell_obj.ctype, 'unknown type')
            # If the column index is greater than the first data column or is in the first column of merged cells
            if check_rows(idx, merged_meta_data):
                # As soon as something is found in a cell, it is not a blank row and break the loop, go to next row
                if cell_obj.value == "":
                    blank_row = True
                # At least one of the column headers must be text or bold for this to work
                elif cell_type == "text" or cell_font == 700:
                    blank_row = False
                    found_a_non_blank_row = True
                    found_a_column_header = True
                    column_header_locations.add(r)
                    break
                else:
                    blank_row = False
                    found_a_non_blank_row = True
    #column_header_locations.discard(top_row)
    # Extra meta data above top left cell
    extra_meta_data = set()
    if spreadsheet_type == "Data cube":
        # Rows already accounted for as headers are excluded from the extra-meta scan.
        if row_descriptions_header_row:
            column_headers_already_included = {
                top_row, row_descriptions_header_row
            }
        else:
            column_headers_already_included = {top_row}
        if all_mergers:
            for r in column_header_locations:
                mergers_filtered = [tup for tup in all_mergers if tup[0] == r]
                if mergers_filtered:
                    for c in row_descriptions:
                        if c in list(zip(*mergers_filtered))[2]:
                            column_headers_already_included.add(r)
        rows_above_data = list(
            filter(lambda i: i not in column_headers_already_included,
                   [*rows_above_data]))
        # First pass: bottom-up, stopping at the first blank row.
        blank_row = False
        for r in reversed(rows_above_data):
            if blank_row:
                break
            row = sheet.row(r)
            for idx, cell_obj in enumerate(row):
                # If the column index is greater than the first data column or is in the first column of merged cells
                if idx in row_descriptions:
                    rd_xf = xl_workbook.xf_list[sheet.cell_xf_index(r, idx)]
                    cell_format = xl_workbook.format_map[
                        rd_xf.format_key].format_str
                    # As soon as something is found in a cell, it is not a blank row and break the loop, go to next row
                    if cell_obj.value == "":
                        blank_row = True
                    # At least one of the column headers must be text for this to work
                    elif ctype_text.get(
                            cell_obj.ctype, 'unknown type'
                    ) == "text" or cell_format == 'General':
                        blank_row = False
                        extra_meta_data.add(r)
                        break
                    else:
                        blank_row = False
        # Second pass: same scan top-down, so rows reachable from the top of the
        # sheet (before the first blank) are also captured.
        blank_row = False
        for r in rows_above_data:
            if blank_row:
                break
            row = sheet.row(r)
            for idx, cell_obj in enumerate(row):
                # If the column index is greater than the first data column or is in the first column of merged cells
                if idx in row_descriptions:
                    rd_xf = xl_workbook.xf_list[sheet.cell_xf_index(r, idx)]
                    cell_format = xl_workbook.format_map[
                        rd_xf.format_key].format_str
                    # As soon as something is found in a cell, it is not a blank row and break the loop, go to next row
                    if cell_obj.value == "":
                        blank_row = True
                    # At least one of the column headers must be text for this to work
                    elif ctype_text.get(
                            cell_obj.ctype, 'unknown type'
                    ) == "text" or cell_format == 'General':
                        blank_row = False
                        extra_meta_data.add(r)
                        break
                    else:
                        blank_row = False
    else:
        # Non-"Data cube" (e.g. Census TableBuilder) path: scan the columns left of
        # the row descriptions, above the top header row.
        columns_to_evaluate = range(0, max(row_descriptions))
        rows_above_top_header_row = range(0, top_header_row)
        column_headers_already_included = {top_header_row}
        if all_mergers:
            for r in column_header_locations:
                mergers_filtered = [tup for tup in all_mergers if tup[0] == r]
                if mergers_filtered:
                    for c in columns_to_evaluate:
                        if c in list(zip(*mergers_filtered))[2]:
                            column_headers_already_included.add(r)
        # for c in columns_to_evaluate:
        #     if r in list(zip(*all_mergers))[0] and c in list(zip(*all_mergers))[2]:
        #         column_headers_already_included.add(r)
        blank_row = False
        rows_above_data = list(
            filter(lambda i: i not in column_headers_already_included,
                   [*rows_above_top_header_row]))
        # NOTE(review): unlike the branches above, blank_row is never used to
        # break out of this loop — every row is scanned; presumably intentional.
        for r in reversed(rows_above_data):
            row = sheet.row(r)
            for idx, cell_obj in enumerate(row):
                # If the column index is greater than the first data column or is in the first column of merged cells
                if idx in columns_to_evaluate:
                    # As soon as something is found in a cell, it is not a blank row and break the loop, go to next row
                    if cell_obj.value == "":
                        blank_row = True
                    # At least one of the column headers must be text for this to work
                    elif ctype_text.get(cell_obj.ctype,
                                        'unknown type') == "text":
                        blank_row = False
                        extra_meta_data.add(r)
                        break
                    else:
                        blank_row = False
    # Rows below the header block that are not data rows (footnotes etc.).
    other_rows = set(
        i for i in range(last_row_in_sheet + 1)
        if i not in data_rows and i > max(column_header_locations))
    all_rows = column_header_locations.union(other_rows)
    merged_meta_data = []
    for i in all_mergers:
        # Only keep the merged cells that are above the data; not to the left and not to the right
        #if i[0] in column_headers_and_rows_above_data and \
        #   i[2] >= first_col and i[1] - 1 in column_headers_and_rows_above_data and i[3] - 1 <= last_col:
        if i[0] in all_rows and i[2] <= last_col:
            merged_meta_data.append(i)
    return merged_meta_data, column_header_locations, extra_meta_data
worksheet.write(0, 0, 'Telephone Number') worksheet.write(0, 1, 'Review Note') siteType = '_rev_Reviewnotes.xlsx' # Set column to A:A, the first column. worksheet.set_column('A:A', 13) # Read the slice from the first cell to the last accessible row in Excel. col = xl_sheet.col_slice(0, 1, 1048576) # Read each string line by line. for (idx, cell_obj) in enumerate(col): cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type') cell_obj_str = str(cell_obj) # Cut the numbers to their appropriate format. # Does a dash, parenthesis, or none of those exist? That will decide the numFormat. if '-' in cell_obj_str: firstStart = cell_obj_str.index('-') - 3 firstEnd = firstStart + 3 secondStart = cell_obj_str.index('-') + 1 secondEnd = secondStart + 3 thirdStart = cell_obj_str.index('-') + 5 thirdEnd = thirdStart + 4 teleWho = cell_obj_str[firstStart:firstEnd] + cell_obj_str[ secondStart:secondEnd] + cell_obj_str[thirdStart:thirdEnd]
def callback(ch, method, properties, body):
    """RabbitMQ consumer callback: decode a base64 Excel attachment from the JSON
    message, detect the credit-card statement format (Otsar Hahayal / Isracard /
    Visa) by probing fixed cells, normalise it into a 5-column workbook, and
    publish the re-encoded result (or an error message) via `publisher`.

    :param ch/method/properties: pika delivery arguments (unused here)
    :param body: JSON bytes with keys userId, fileName, curr_date, base64buffer
    """
    print(" [x] Received %r" % body)
    excel = body
    excel_dict = json.loads(excel)
    encoding_data = excel_dict["base64buffer"]
    decoded_data = base64.b64decode(encoding_data)
    # NOTE(review): file_extension is the full file name, split on "." later.
    file_extension = excel_dict["fileName"]
    userId = excel_dict["userId"]
    fileName = excel_dict["fileName"]
    curr_date = excel_dict["curr_date"]
    #message_format = {"userid": userId, "base64buffer": base64_message,
    #                  "date": curr_date, "fileName": filename}
    #message = json.dumps(message_format)
    print(userId)
    print(fileName)
    if "base64buffer" in excel_dict:
        if str(file_extension).split(".")[1] == 'xls':
            print('excel format : xls')
            try:
                # Convert the legacy .xls (read via xlrd) into an openpyxl
                # workbook cell-by-cell, fixing date cells along the way.
                wbb1 = Workbook()
                ws = wbb1.active
                #xl_workbook = open_workbook('c:\om\exampleMasterCard.xls')  # for read a file type
                xl_workbook = open_workbook(file_contents=decoded_data,
                                            on_demand=True)
                sheet_names = xl_workbook.sheet_names()
                #print(sheet_names)
                xl_sheet = xl_workbook.sheet_by_name(sheet_names[0])
                xl_sheet = xl_workbook.sheet_by_index(0)
                #print('Sheet name: %s' % xl_sheet.name)
                row = xl_sheet.row(0)
                # print (row)
                #print('(Column #) type:value')
                for idx, cell_obj in enumerate(row):
                    cell_type_str = ctype_text.get(cell_obj.ctype,
                                                   'unknown type')
                    #print('(%s) %s %s' % (idx, cell_type_str, cell_obj.value))
                num_cols = xl_sheet.ncols  # Number of columns
                # Raw copy of every cell value into the new workbook.
                for row_idx in range(0, xl_sheet.nrows):  # Iterate through rows
                    #print('-' * 40)
                    #print('Row: %s' % row_idx)  # Print row number
                    for col_idx in range(0, num_cols):  # Iterate through columns
                        cell_obj = xl_sheet.cell(row_idx, col_idx)  # Get cell object by row, col
                        #print('Column: [%s] cell_obj: [%s]' % (col_idx, cell_obj))
                        ws.cell(row=row_idx + 1,
                                column=col_idx + 1).value = cell_obj.value
                # Pass 1: reformat xldate cells in column 0 (date parsed as %Y-%d-%m).
                try:
                    for row_idx in range(0, xl_sheet.nrows):  # Iterate through rows
                        #print('-' * 40)
                        #print('Row: %s' % row_idx)  # Print row number
                        for col_idx in range(0, 1):  # Iterate through columns
                            cell_obj = xl_sheet.cell(row_idx, col_idx)  # Get cell object by row, col
                            #print('Column: [%s] cell_obj: [%s]' % (col_idx, cell_obj))
                            #print(cell_obj)
                            # xlrd cell repr looks like "xldate:43831.0" — split on ":".
                            a = str(cell_obj).split(":")
                            #print(a[0])
                            c = 'xldate'
                            try:
                                if a[0] == 'xldate':
                                    #print('AAAAAAA')
                                    d = str(a[1]).split(".")
                                    e = int(d[0])
                                    #print(e)
                                    datetime_date = xlrd.xldate_as_datetime(e, 0)
                                    date_object = datetime_date.date()
                                    string_date = date_object.isoformat()
                                    date_time_originl = datetime.datetime.strptime(
                                        string_date, '%Y-%d-%m')
                                    date_time_convert = datetime.date.strftime(
                                        date_time_originl, "%d/%m/%y")
                                    #print(date_time_convert)
                                    ws.cell(row=row_idx + 1,
                                            column=col_idx + 1).value = date_time_convert
                            except:
                                continue
                except:
                    print("can't convert the first format ")
                # Pass 2: same for column 1, trying %Y-%m-%d instead.
                try:
                    for row_idx in range(0, xl_sheet.nrows):  # Iterate through rows
                        #print('-' * 40)
                        #print('Row: %s' % row_idx)  # Print row number
                        for col_idx in range(1, 2):  # Iterate through columns
                            cell_obj = xl_sheet.cell(row_idx, col_idx)  # Get cell object by row, col
                            #print('Column: [%s] cell_obj: [%s]' % (col_idx, cell_obj))
                            #print(cell_obj)
                            a = str(cell_obj).split(":")
                            c = 'xldate'
                            try:
                                if a[0] == 'xldate':
                                    d = str(a[1]).split(".")
                                    e = int(d[0])
                                    datetime_date = xlrd.xldate_as_datetime(e, 0)
                                    date_object = datetime_date.date()
                                    string_date = date_object.isoformat()
                                    try:
                                        date_time_originl = datetime.datetime.strptime(
                                            string_date, '%Y-%m-%d')
                                        date_time_convert = datetime.date.strftime(
                                            date_time_originl, "%d/%m/%y")
                                        ws.cell(row=row_idx + 1,
                                                column=col_idx + 1).value = date_time_convert
                                    except:
                                        # NOTE(review): on parse failure this still
                                        # writes date_time_convert from a previous
                                        # iteration — looks like a bug; confirm.
                                        ws.cell(row=row_idx + 1,
                                                column=col_idx + 1).value = date_time_convert
                            except:
                                continue
                except:
                    print("can't convert the second format ")
                # Persist the converted workbook and reopen it with openpyxl.
                f1 = 'c:\\OM\\output-xls-' + str(uuid.uuid4()) + '.xlsx'
                wbb1.save(filename=f1)
                workbook = openpyxl.load_workbook(f1)
                wb1 = workbook.active
                mr = wb1.max_row
                mc = wb1.max_column
                # Probe cells used to detect the statement format.
                match = wb1.cell(row=1, column=1)
                match2 = wb1.cell(row=4, column=1)
                match3 = wb1.cell(row=3, column=1)
                if match2.value == None:
                    print("OTSAR HAHAYAL")
                    wb1.delete_cols(1)
                    wb1.delete_cols(3)
                    wbb = Workbook()
                    ws = wbb.active
                    # Output header (Hebrew): transaction date, merchant, amount,
                    # category, card number.
                    x = ['תאריך עסקה', 'שם בית העסק', 'סכום עסקה', 'קטגוריה', 'מספר כרטיס']
                    counter = 1
                    for i in x:
                        ws.cell(row=1, column=counter, value=i)
                        counter += 1
                    # Card number is embedded in cell A4 after a ":".
                    card = wb1.cell(row=4, column=1)
                    s = str(card.value).split(":")[1].split(" ")
                    card_match = s[0]
                    for i in range(7, mr - 1):
                        for j in range(1, 4):
                            # reading cell value from source excel file
                            c = wb1.cell(row=i, column=j)
                            # writing the read value to destination excel file
                            ws.cell(row=i - 5, column=j).value = c.value
                        ws.cell(row=i - 5, column=5).value = card_match
                    # Drop rows whose first column ended up empty.
                    for j in range(1, 4):
                        for i in range(1, mr):
                            if ws.cell(row=i, column=1).value is None:
                                ws.delete_rows(i)
                            else:
                                continue
                    f = 'c:\\OM\\output-otsar-hahayal-' + str(uuid.uuid4()) + '.xlsx'
                    wbb.save(filename=f)
                    with open(f, 'rb') as binary_file:
                        binary_file_data = binary_file.read()
                        base64_encoded_data = base64.b64encode(binary_file_data)
                        base64_message = base64_encoded_data.decode('utf-8')
                    print("Before publish to yogev")
                    # NOTE(review): key is "userid" here but "userId" in every
                    # other branch — likely a bug; confirm with the consumer.
                    message_format = {"userid": userId,
                                      "base64buffer": base64_message,
                                      "curr_date": curr_date,
                                      "fileName": fileName}
                    message = json.dumps(message_format)
                    publisher.sender(message)
                else:
                    if match.value == None:
                        print("ISRACARD")
                        wb1.delete_cols(3, 2)
                        sheet = wb1
                        alist = []
                        blist = []
                        wbb = Workbook()
                        ws = wbb.active
                        x = ['תאריך עסקה', 'שם בית העסק', 'סכום עסקה', 'קטגוריה', 'מספר כרטיס']
                        counter = 1
                        for i in x:
                            ws.cell(row=1, column=counter, value=i)
                            counter += 1
                        # Rows containing a 4-digit group mark the start of a new
                        # card's section; collect their row numbers and card digits.
                        for i in range(4, mr):
                            b3 = sheet.cell(row=i, column=1)
                            s = str(b3.value).split(" ")
                            r = re.compile('[0-9]{4}')
                            newlist = list(filter(r.match, s))
                            list1 = [newlist, i]
                            if newlist != []:
                                alist.append(i)  # add row number for cars number to list
                                blist.append(', '.join(newlist))  # add cards number to list
                            else:
                                continue
                        # Sentinel so the last section is also processed.
                        alist.append(mr + 5)
                        for a in range(0, len(alist) - 1):
                            for i in range(alist[a] + 3, alist[a + 1] - 2):
                                for j in range(1, 4):
                                    # reading cell value from source excel file
                                    c = wb1.cell(row=i, column=j)
                                    # writing the read value to destination excel file
                                    ws.cell(row=i - 5, column=j).value = c.value
                                ws.cell(row=i - 5, column=5).value = str(blist[a])
                        for j in range(1, 4):
                            for i in range(1, mr):
                                if ws.cell(row=i, column=1).value is None:
                                    ws.delete_rows(i)
                                else:
                                    continue
                        f = 'c:\\OM\\output-isracard-' + str(uuid.uuid4()) + '.xlsx'
                        wbb.save(filename=f)
                        with open(f, 'rb') as binary_file:
                            binary_file_data = binary_file.read()
                            base64_encoded_data = base64.b64encode(binary_file_data)
                            base64_message = base64_encoded_data.decode('utf-8')
                        print("Before publish to yogev")
                        message_format = {"userId": userId,
                                          "base64buffer": base64_message,
                                          "curr_date": curr_date,
                                          "fileName": fileName}
                        message = json.dumps(message_format)
                        print(message)
                        publisher.sender(message)
                        #print("send message to sender function")
                    elif match3.value != None:
                        print("VISA")
                        wb1.delete_cols(3)
                        sheet = wb1
                        clist = []
                        # Card digits are the 4th space-separated token after the
                        # first comma in cell A2.
                        b3 = sheet.cell(row=2, column=1)
                        s = str(b3.value).split(",")[1].split(" ")[3]
                        #print(s)
                        r = re.compile('[0-9]{4}')
                        # print(r)
                        newlist = list(s)
                        # rint(newlist)
                        list1 = [newlist, 2]
                        if newlist != []:
                            clist.append(', '.join(newlist))  # add cards number to list
                        wbb = Workbook()
                        ws = wbb.active
                        x = ['תאריך עסקה', 'שם בית העסק', 'סכום עסקה', 'קטגוריה', 'מספר כרטיס']
                        counter = 1
                        for i in x:
                            ws.cell(row=1, column=counter, value=i)
                            counter += 1
                        for i in range(4, mr):
                            for j in range(1, 4):
                                c = wb1.cell(row=i, column=j)
                                ws.cell(row=i - 2, column=j).value = c.value
                            ws.cell(row=i - 2, column=5).value = s
                        f = 'c:\\OM\\output-visa-' + str(uuid.uuid4()) + '.xlsx'
                        wbb.save(filename=f)
                        with open(f, 'rb') as binary_file:
                            binary_file_data = binary_file.read()
                            base64_encoded_data = base64.b64encode(binary_file_data)
                            base64_message = base64_encoded_data.decode('utf-8')
                        print("Before publish to yogev")
                        message_format = {"userId": userId,
                                          "base64buffer": base64_message,
                                          "curr_date": curr_date,
                                          "fileName": fileName}
                        message = json.dumps(message_format)
                        publisher.sender(message)
                    else:
                        # No known format matched: report the error back.
                        print('cannot parsing file, Doesnt match any card format')
                        error_type = 'cannot parsing file, Doesnt match any card format'
                        message_error_format = {"userId": userId,
                                                "errtype": error_type,
                                                "curr_date": curr_date,
                                                "fileName": fileName}
                        message_err = json.dumps(message_error_format)
                        publisher.sender_err(message_err)
            except:
                # NOTE(review): bare except hides real errors (including the
                # NameError/IndexError cases above) behind a generic message.
                print('cannot parsing to xlsx file, the xls file is corrupted')
                error_type = 'cannot parsing xls file, the file is corrupted'
                message_error_format = {"userId": userId,
                                        "errtype": error_type,
                                        "curr_date": curr_date,
                                        "fileName": fileName}
                message_err = json.dumps(message_error_format)
                publisher.sender_err(message_err)
        if str(file_extension).split(".")[1] == 'xlsx':
            print('excel format : xlsx')
            try:
                # .xlsx path: load directly with openpyxl from memory.
                xls_filelike = io.BytesIO(decoded_data)
                workbook = openpyxl.load_workbook(xls_filelike)
                wbb2 = Workbook()
                ws = wbb2.active
                data = workbook
                # parser.read_file(data)
                f2 = 'c:\\OM\\output-xlsx-' + str(uuid.uuid4()) + '.xlsx'
                workbook.save(filename=f2)
                workbook = openpyxl.load_workbook(f2)
                wb1 = workbook.active
                mr = wb1.max_row
                mc = wb1.max_column
                match = wb1.cell(row=1, column=1)
                match2 = wb1.cell(row=4, column=1)
                match3 = wb1.cell(row=3, column=1)
                if match2.value == None:
                    print("OTSAR HAHAYAL")
                    wb1.delete_cols(1)
                    wb1.delete_cols(3)
                    card = wb1.cell(row=4, column=1)
                    s = str(card.value).split(":")[1].split(" ")
                    print(s[0])
                    card_match = s[0]
                    for i in range(7, mr - 1):
                        for j in range(1, 4):
                            # reading cell value from source excel file
                            c = wb1.cell(row=i, column=j)
                            # writing the read value to destination excel file
                            ws.cell(row=i - 5, column=j).value = c.value
                        ws.cell(row=i - 5, column=5).value = card_match
                    f = 'c:\\OM\\output-otsar-hahayal-' + str(uuid.uuid4()) + '.xlsx'
                    # NOTE(review): `wbb` is not defined in this branch (only
                    # `wbb2` is) — this raises NameError, swallowed by the outer
                    # bare except. Looks like it should be wbb2.save; confirm.
                    wbb.save(filename=f)
                    with open(f, 'rb') as binary_file:
                        binary_file_data = binary_file.read()
                        base64_encoded_data = base64.b64encode(binary_file_data)
                        base64_message = base64_encoded_data.decode('utf-8')
                    print("Before publish to yogev")
                    message_format = {"userId": userId,
                                      "base64buffer": base64_message,
                                      "curr_date": curr_date,
                                      "fileName": fileName}
                    message = json.dumps(message_format)
                    publisher.sender(message)
                else:
                    if match.value == None:
                        print("ISRACARD")
                        wb1.delete_cols(3, 2)
                        sheet = wb1
                        alist = []
                        blist = []
                        wbb = Workbook()
                        ws = wbb.active
                        x = ['תאריך עסקה', 'שם בית העסק', 'סכום עסקה', 'קטגוריה', 'מספר כרטיס']
                        counter = 1
                        for i in x:
                            ws.cell(row=1, column=counter, value=i)
                            counter += 1
                        for i in range(4, mr):
                            b3 = sheet.cell(row=i, column=1)
                            s = str(b3.value).split(" ")
                            r = re.compile('[0-9]{4}')
                            newlist = list(filter(r.match, s))
                            list1 = [newlist, i]
                            if newlist != []:
                                alist.append(i)  # add row number for cars number to list
                                blist.append(', '.join(newlist))  # add cards number to list
                            else:
                                continue
                        alist.append(mr + 5)
                        for a in range(0, len(alist) - 1):
                            for i in range(alist[a] + 3, alist[a + 1] - 2):
                                for j in range(1, 4):
                                    # reading cell value from source excel file
                                    c = wb1.cell(row=i, column=j)
                                    # writing the read value to destination excel file
                                    ws.cell(row=i - 5, column=j).value = c.value
                                ws.cell(row=i - 5, column=5).value = str(blist[a])
                        for j in range(1, 4):
                            for i in range(1, mr):
                                if ws.cell(row=i, column=1).value is None:
                                    ws.delete_rows(i)
                                else:
                                    continue
                        f = 'c:\\OM\\output-isracard' + str(uuid.uuid4()) + '.xlsx'
                        wbb.save(filename=f)
                        #channel.queue_declare(queue='Yogevhello', durable=True)
                        with open(f, 'rb') as binary_file:
                            # print(binary_file)
                            binary_file_data = binary_file.read()
                            base64_encoded_data = base64.b64encode(binary_file_data)
                            base64_message = base64_encoded_data.decode('utf-8')
                        print("Before publish to yogev")
                        message_format = {"userId": userId,
                                          "base64buffer": base64_message,
                                          "curr_date": curr_date,
                                          "fileName": fileName}
                        message = json.dumps(message_format)
                        publisher.sender(message)
                    # NOTE(review): the .xls path tests `match3.value != None`
                    # here, but this path tests `== None` — one of the two is
                    # likely inverted; confirm against real VISA files.
                    if match3.value == None:
                        #match3.value==None:
                        print("VISA")
                        wb1.delete_cols(3)
                        sheet = wb1
                        clist = []
                        b3 = sheet.cell(row=2, column=1)
                        s = str(b3.value).split(",")[1].split(" ")[3]
                        print(s)
                        r = re.compile('[0-9]{4}')
                        #print(r)
                        newlist = list(s)
                        #rint(newlist)
                        list1 = [newlist, 2]
                        if newlist != []:
                            clist.append(', '.join(newlist))  # add cards number to list
                        wbb = Workbook()
                        ws = wbb.active
                        x = ['תאריך עסקה', 'שם בית העסק', 'סכום עסקה', 'קטגוריה', 'מספר כרטיס']
                        counter = 1
                        for i in x:
                            ws.cell(row=1, column=counter, value=i)
                            counter += 1
                        for i in range(4, mr):
                            for j in range(1, 4):
                                c = wb1.cell(row=i, column=j)
                                ws.cell(row=i - 2, column=j).value = c.value
                            ws.cell(row=i - 2, column=5).value = s
                        f = 'c:\\OM\\output-visa-' + str(uuid.uuid4()) + '.xlsx'
                        wbb.save(filename=f)
                        with open(f, 'rb') as binary_file:
                            binary_file_data = binary_file.read()
                            base64_encoded_data = base64.b64encode(binary_file_data)
                            base64_message = base64_encoded_data.decode('utf-8')
                        print("Before publish to yogev")
                        message_format = {"userId": userId,
                                          "base64buffer": base64_message,
                                          "curr_date": curr_date,
                                          "fileName": fileName}
                        message = json.dumps(message_format)
                        publisher.sender(message)
                    else:
                        print('cannot parsing file, Doesnt match any card format')
                        error_type = 'cannot parsing file, Doesnt match any card format'
                        message_error_format = {"userId": userId,
                                                "errtype": error_type,
                                                "curr_date": curr_date,
                                                "fileName": fileName}
                        message_err = json.dumps(message_error_format)
                        publisher.sender_err(message_err)
            except:
                print('cannot parsing xlsx file, the file is corrupted')
                error_type = 'cannot parsing xls file, the file is corrupted'
                message_error_format = {"userId": userId,
                                        "errtype": error_type,
                                        "curr_date": curr_date,
                                        "fileName": fileName}
                message_err = json.dumps(message_error_format)
                publisher.sender_err(message_err)
    else:
        # Message did not carry a base64buffer at all.
        print('Cannot parsing json from rabbit')
        error_type = 'cannot parsing Json Message, the Json Message is corrupted'
        message_error_format = {"userId": userId,
                                "errtype": error_type,
                                "curr_date": curr_date,
                                "fileName": fileName}
        message_err = json.dumps(message_error_format)
        publisher.sender_err(message_err)
def import_xls(self, model, file, header_map=None, extra_columns=None):
    """ To map user column with database column
        - header_map = {'Name': 'name',
                        'Document', 'doc_id', }
        If there is additional fixed column value
        - extra_columns = [('name', 'ABC'), ('id', 10), ]
        If the import file have column id, we will use this column
        to create external id, and hence possible to return
        record id being created

    Converts a base64-encoded .xls file to CSV on disk, then feeds the CSV to
    Odoo's base_import.import for `model`, returning the external (xml) ids of
    the created records.

    NOTE(review): uses `xrange` and `base64.decodestring` — this is Python 2 /
    old-Odoo code; keep the runtime consistent or port deliberately.
    """
    decoded_data = base64.decodestring(file)
    # Temp filenames keyed on the current time to avoid collisions.
    ftemp = 'temp' + datetime.utcnow().strftime('%H%M%S%f')[:-3]
    f = open(ftemp + '.xls', 'wb+')
    f.write(decoded_data)
    f.seek(0)
    f.close()
    try:
        wb = xlrd.open_workbook(f.name)
    except xlrd.XLRDError:
        raise ValidationError(
            _('Invalid file format, only .xls or .xlsx file allowed!'))
    except Exception:
        raise
    st = wb.sheet_by_index(0)
    csv_file = open(ftemp + '.csv', 'wb')
    csv_out = unicodecsv.writer(csv_file,
                                encoding='utf-8',
                                quoting=unicodecsv.QUOTE_ALL)
    if st._cell_values:
        _HEADER_FIELDS = st._cell_values[0]
        id_index = -1  # -1 means no id
        xml_ids = []
        for nrow in xrange(st.nrows):
            if nrow == 0:
                # Header, find id field
                header_values = [
                    x.lower().strip() for x in st.row_values(nrow)
                ]
                if 'id' in header_values:
                    id_index = header_values.index('id')
            if nrow > 0:
                row_values = st.row_values(nrow)
                for index, val in enumerate(row_values):
                    # Normalise each cell by its xlrd cell type.
                    ctype = st.cell(nrow, index).ctype
                    type = ctype_text.get(ctype, 'unknown type')
                    if id_index == index and val:
                        # UUID replace id
                        xml_id = '%s.%s' % ('pabi_xls', uuid.uuid4())
                        row_values[index] = xml_id
                        xml_ids.append(xml_id)
                    elif type == 'empty' or type == 'text' \
                            or type == 'bool' or type == 'error' \
                            or type == 'blank':
                        row_values[index] = st.cell(nrow, index).value
                    elif type == 'number':
                        if not val:
                            row_values[index] = 0
                        else:
                            # NOTE(review): str(float).isdigit() is False for
                            # values like "12.5", so int(val) truncates the
                            # fraction — presumably meant to strip ".0" from
                            # integral floats only; confirm.
                            if not str(val).isdigit():
                                row_values[index] = int(val)
                            else:
                                row_values[index] = val
                    elif type == 'xldate':
                        str_date = self.xldate_to_datetime(
                            st.cell(nrow, index).value)
                        row_values[index] = str_date
                    else:
                        row_values[index] = st.cell(nrow, index).value
                csv_out.writerow(row_values)
            else:
                csv_out.writerow(st.row_values(nrow))
    csv_file.close()
    # Re-read the CSV as text and clean up the temp files.
    csv_file = open(ftemp + '.csv', 'r')
    file_txt = csv_file.read()
    csv_file.close()
    os.unlink(ftemp + '.xls')
    os.unlink(ftemp + '.csv')
    if not file_txt:
        raise ValidationError(_(str("File Not found.")))
    # Create xml_ids if not already assigned
    if id_index == -1:
        _HEADER_FIELDS.insert(0, 'id')
        xml_id = '%s.%s' % ('pabi_xls', uuid.uuid4())
        file_txt = self._add_column('id', xml_id, file_txt)
        xml_ids.append(xml_id)
    # Map column name
    if header_map:
        # NOTE(review): the key is looked up with .lower().strip() but then
        # re-fetched with only .lower() — headers with surrounding whitespace
        # would KeyError; confirm whether headers are pre-stripped upstream.
        _HEADER_FIELDS = [
            header_map.get(x.lower().strip(), False) and
            header_map[x.lower()] or False for x in _HEADER_FIELDS
        ]
    # Add extra column
    if extra_columns:
        for column in extra_columns:
            _HEADER_FIELDS.insert(0, str(column[0]))
            file_txt = self._add_column(column[0], column[1], file_txt)
    Import = self.env['base_import.import']
    imp = Import.create({
        'res_model': model,
        'file': file_txt,
    })
    [errors] = imp.do(_HEADER_FIELDS, {
        'headers': True,
        'separator': ',',
        'quoting': '"',
        'encoding': 'utf-8'
    })
    if errors:
        raise ValidationError(_(str(errors[0]['message'])))
    return list(set(xml_ids))
pn_header_cols = [(i, m.group('mfgr')) for i, col in enumerate(header_row) if i not in desc_header_cols and (m := header_pn_re.match(col))] for price_col, col in enumerate(header_row): if 'PRICE' in col or 'ABR' in col: break else: raise Exception('failed to find price column') for i in range(1, sheet.nrows): doc = {'retailer': retailer, 'file': str(f)[12:], 'file_modified_at': f.stat().st_mtime} row = sheet.row(i) doc['description'] = [{'title': header_row[i], 'value': None} for i in desc_header_cols] price_c = row[price_col] t = ctype_text.get(price_c.ctype) if t == 'number': price = price_c.value elif t == 'empty': price = None elif t == 'error': error_text_from_code.get(price_c.value) price = None else: raise Exception('you f****d up') #assert ctype_text.get(price.ctype) != 'number', 'invalid price cell' doc['price'] = price for j in desc_header_cols: cell = row[j] t = ctype_text.get(cell.ctype)