def export_cells_not_in(inTable, noTable, outTable, inSheet, noSheet, inFID, noFID):
    """
    Export to a new file the cells of inTable not in noTable.

    Rows of inSheet (in inTable) whose inFID value also appears in the
    noFID column of noSheet (in noTable) are excluded; all remaining rows
    are written to outTable. Returns the path of the written file.
    """
    import xlrd
    import xlwt
    from gasp.fm import tbl_to_obj
    from gasp.pyt.xls.fld import col_name, get_columns_position
    from gasp.pyt.xls.summ import list_unique_values_column

    # TODO: check if tables are xls

    # Read the input rows and the column names of the input sheet
    rows = tbl_to_obj(inTable, sheet=inSheet, output='array')
    cols = col_name(inTable, sheet_name=inSheet)

    # Collect the FID values present in noTable; any input row whose
    # inFID value appears in this set is excluded from the output
    no_book = xlrd.open_workbook(noTable)
    no_sheet_obj = no_book.sheet_by_name(noSheet)
    no_fid_pos = get_columns_position(no_sheet_obj, noFID)
    excluded_ids = list_unique_values_column(no_sheet_obj, no_fid_pos[noFID])

    # Build the output workbook with the same sheet name
    out_book = xlwt.Workbook()
    sheet_out = out_book.add_sheet(inSheet)

    # Header row with the original column titles
    for idx, col in enumerate(cols):
        sheet_out.write(0, idx, col)

    # Data rows: keep only rows whose FID is not in excluded_ids
    out_row = 1
    for row in rows:
        if row[inFID] in excluded_ids:
            continue
        for idx, col in enumerate(cols):
            sheet_out.write(out_row, idx, row[col])
        out_row += 1

    out_book.save(outTable)
    return outTable
def groups_to_lines(xls_data, xls_sheet, group_column, name_colum, cls_column, interest_column, column_labels, xls_output):
    """
    Convert a "long" table of grouped rows into one output line per group.

    Input table (cls_column semantics: 0 = group row, 1 = master unit row,
    2 = regular unit row):
          | group | label | class | interest
        0 | AT    | ATLA1 | 0     | 50
        1 | AT    | ATLA2 | 1     | 70
        2 | AT    | ATLA3 | 2     | 80
        ...
        5 | PT    | PTLA1 | 0     | 20
        ...

    Output (one sheet named after interest_column):
          | GROUP_NAME | GROUP_VALUE | MASTER_UNIT_VALUE | UNITS_1 | ... | UNITS_N
        0 | ATLA1      | 50          | 70                | 80      | ... | 10
        1 | PTLA1      | 20          | 61                | 30      | ... | 12

    column_labels are written on the second header line; groups are ordered
    alphabetically by their class-0 label. Returns an error string when the
    interest column is missing or a class value is not 0/1/2.
    """
    from gasp.pyt.xls.fld import get_columns_position

    interest_columns = [group_column, cls_column, name_colum, interest_column]

    # Open input (xlrd/xlwt are expected to be imported at module level)
    __xls_data = xlrd.open_workbook(xls_data)
    sheet = __xls_data.sheet_by_name(xls_sheet)

    # cols_position = {column_name: position_index, ...}
    cols_position = get_columns_position(sheet, interest_columns)
    if interest_column not in cols_position:
        return "Interest column not encountered in data file"

    # data = {group_id: {group_name, group_value, master_unit_value,
    #                    units_value: [...]}}
    data = {}
    # Maps group label (class-0 name) -> group id; used for alphabetical order
    groups_name = {}
    for row in range(1, sheet.nrows):
        group_id = str(sheet.cell(row, cols_position[group_column]).value)
        row_class = int(sheet.cell(row, cols_position[cls_column]).value)
        row_name = sheet.cell(row, cols_position[name_colum]).value
        row_value = sheet.cell(row, cols_position[interest_column]).value

        if group_id not in data:
            groups_name[row_name] = group_id
            if row_class == 0:
                data[group_id] = {
                    'group_name': row_name,
                    'group_value': row_value,
                    'units_value': []
                }
            elif row_class == 1:
                data[group_id] = {
                    'master_unit_value': row_value,
                    'units_value': []
                }
            elif row_class == 2:
                # FIX: keep the value of a group's first class-2 row; the
                # original initialised units_value empty, silently dropping it
                data[group_id] = {
                    'units_value': [row_value]
                }
            else:
                return 'Class value not recognized'
        else:
            if row_class == 0:
                groups_name[row_name] = group_id
                data[group_id].update({
                    'group_name': row_name,
                    'group_value': row_value
                })
            elif row_class == 1:
                data[group_id].update({'master_unit_value': row_value})
            elif row_class == 2:
                data[group_id]['units_value'].append(row_value)
            else:
                return 'Class value not recognized'

    # Create output
    out_xls = xlwt.Workbook()
    out_sheet = out_xls.add_sheet(interest_column)

    # First line: unit indexes 1..N, N being the largest unit count
    col_to_write = 0
    for k in data:
        col_to_write = max(col_to_write, len(data[k]['units_value']))
    for i in range(col_to_write):
        out_sheet.write(0, i + 3, i + 1)

    # Second line: user-given labels plus the auxiliary statistic titles
    for i in range(len(column_labels)):
        out_sheet.write(1, i, column_labels[i])
    # NOTE(review): 'EU28_Average' is titled but never filled below — confirm
    aux_titles = ['Min', 'Max', 'EU28_Average']
    for i in range(len(aux_titles)):
        out_sheet.write(1, col_to_write + 3 + i, aux_titles[i])

    # Organize groups alphabetically based on names
    groups = [groups_name[name] for name in sorted(groups_name)]

    # Write data rows; gather every value to compute the global min/max.
    # NOTE: a group with no class-0 row raises KeyError here (as before).
    all_values = []
    for k in range(len(groups)):
        out_sheet.write(k + 2, 0, data[groups[k]]['group_name'])
        out_sheet.write(k + 2, 1, data[groups[k]]['group_value'])
        if 'master_unit_value' not in data[groups[k]]:
            # Fall back to the group value when no master unit row was seen
            out_sheet.write(k + 2, 2, data[groups[k]]['group_value'])
            master_unit_key = 'group_value'
        else:
            out_sheet.write(k + 2, 2, data[groups[k]]['master_unit_value'])
            master_unit_key = 'master_unit_value'
        for i in range(len(data[groups[k]]['units_value'])):
            out_sheet.write(k + 2, i + 3, data[groups[k]]['units_value'][i])
        all_values += data[groups[k]]['units_value'] + \
            [data[groups[k]][master_unit_key]]

    # Write global min and max (numeric values only)
    numeric = [v for v in all_values if isinstance(v, (int, float))]
    out_sheet.write(2, col_to_write + 3, min(numeric))
    out_sheet.write(2, col_to_write + 4, max(numeric))

    out_xls.save(xls_output)
def split_col_by_year(dataFile, sheetName, codeCol, yearCol, out_file):
    """
    Transpose a year column into per-year variable columns.

    For a table:
           | code | year | U | LTU
         0 | PT1  | 2000 | x | x
         1 | PT2  | 2000 | x | x
         ...
        14 | PT3  | 2004 | x | x

    The output will be:
           | code | U_2000 | U_2001 | ... | LTU_2000 | ... | LTU_2004
         0 | PT1  | x      | x      | ... | x        | ... | x

    out_file should be a CSV path; the file is written tab-separated.
    Returns out_file.
    """
    import codecs
    from gasp.pyt.xls.fld import get_columns_position, get_columns_position_outside_options

    # Open input (xlrd is expected to be imported at module level)
    xls = xlrd.open_workbook(dataFile)
    sheet = xls.sheet_by_name(sheetName)

    # Get code and year columns position
    codePos_yearPos = get_columns_position(sheet, [codeCol, yearCol])
    codePos, yearPos = codePos_yearPos[codeCol], codePos_yearPos[yearCol]

    # Every other column is a variable to be split by year
    varPos = get_columns_position_outside_options(sheet, [codeCol, yearCol])

    # data = {entity: {year: {var: value}}}
    data = {}
    VARS = []
    YEARS = []
    for row in range(1, sheet.nrows):
        entity = sheet.cell(row, codePos).value
        year = int(sheet.cell(row, yearPos).value)
        if year not in YEARS:
            YEARS.append(year)
        if entity not in data:
            data[entity] = {}
        if year not in data[entity]:
            data[entity][year] = {}
        for var in varPos:
            if var not in data[entity][year]:
                data[entity][year][var] = sheet.cell(row, varPos[var]).value
            else:
                print('Variable is already there for that entity and year')
            if var not in VARS:
                VARS.append(var)
    VARS.sort()
    YEARS.sort()

    with codecs.open(out_file, mode='w', encoding='utf-8') as f:
        # Header: code column plus one <var>_<year> column per pair
        f.write(codeCol + '\t')
        for var in VARS:
            for year in YEARS:
                f.write('{v}_{y}\t'.format(v=str(var), y=str(year)))
        f.write('\n')
        # Data rows; entities missing a given year get empty cells
        for entity in data:
            # FIX: entity may be numeric (xlrd cell value); formatting it
            # avoids a TypeError on str + float concatenation
            f.write('{}\t'.format(entity))
            for var in VARS:
                for year in YEARS:
                    if year in data[entity]:
                        f.write('{}\t'.format(str(data[entity][year][var])))
                    else:
                        f.write('\t')
            f.write('\n')
        # FIX: removed the redundant f.close() (the with statement closes the
        # file) and the dangling module-level triple-quote that followed it

    return out_file
def transpose_groups_in_sheets(excel, datasheet, group, _id, output):
    """
    Generate one transposed sheet per group of an input table.

    For an input table like:
          | group | id | variable_1 | variable_2
        0 | A     | 1  | V1_A1      | V2_A1
        1 | A     | 2  | V1_A2      | V2_A2
        2 | B     | 1  | V1_B1      | V2_B1
        3 | B     | 2  | V1_B2      | V2_B2

    one sheet per group is written, with one line per variable:

        * GROUP A
              | MIN | Max | variables  | id1_value | id2_value
            0 |     |     | variable_1 | V1_A1     | V1_A2
            1 |     |     | variable_2 | V2_A1     | V2_A2

    Returns the path of the written file.
    """
    import xlrd
    import xlwt
    from gasp.pyt.xls.fld import get_columns_position
    from gasp.pyt.xls.fld import get_columns_position_outside_options

    # Get data from excel
    xls = xlrd.open_workbook(excel)
    sdata = xls.sheet_by_name(datasheet)

    # groupsChild = {group: {var_name: [val_1, ..., val_n]}}
    position_grp_id = get_columns_position(sdata, [group, _id])
    position_var = get_columns_position_outside_options(sdata, [group, _id])

    # Variable names ordered by their column position, so the first sheet
    # column becomes the first output line.
    # FIX (py3): dict.values() is a view with no .sort(); sort the names by
    # their position instead of sorting positions and mapping them back
    var_cols = sorted(position_var, key=lambda col: position_var[col])

    groupsChild = {}
    for row in range(1, sdata.nrows):
        grp = sdata.cell(row, position_grp_id[group]).value
        if grp not in groupsChild:
            groupsChild[grp] = {}
        for col in var_cols:
            value = sdata.cell(row, position_var[col]).value
            if col not in groupsChild[grp]:
                groupsChild[grp][col] = [value]
            else:
                groupsChild[grp][col].append(value)

    # Write output - one sheet for each group
    out_xls = xlwt.Workbook()
    for grp in groupsChild:
        grp_sheet = out_xls.add_sheet(grp)

        # First line: fixed titles plus one index per entity
        titles = ['MIN', 'Max', 'component/area of concern/dimension']
        # FIX (py3): dict.keys() is not subscriptable; take any variable to
        # know how many entity values each one holds
        first_var = next(iter(groupsChild[grp]))
        for i in range(len(groupsChild[grp][first_var])):
            titles.append(i + 1)
        for i in range(len(titles)):
            grp_sheet.write(0, i, titles[i])

        # One line per variable: min, max, name, then the entity values
        for i in range(len(var_cols)):
            grp_sheet.write(i + 1, 2, var_cols[i])
            for e in range(len(groupsChild[grp][var_cols[i]])):
                grp_sheet.write(i + 1, 2 + e + 1, groupsChild[grp][var_cols[i]][e])
            grp_sheet.write(i + 1, 0, min(groupsChild[grp][var_cols[i]]))
            grp_sheet.write(i + 1, 1, max(groupsChild[grp][var_cols[i]]))

    out_xls.save(output)
    return output
def year_to_col_by_attr(data_file, data_spreadsheet, field_id, fields_to_mantain, field_year, output_file):
    """
    Transpose a table by attribute and year.

    For a given table such as:
          | group | fid   | label | year | attribute_x | attribute_y
        0 | AT    | AT_01 | ATLA1 | 2000 | val_x       | val_y
        1 | AT    | AT_01 | ATLA1 | 2001 | val_x       | val_y
        2 | AT    | AT_02 | ATLA2 | 2000 | val_x       | val_y
        ...

    this method generates one spreadsheet per attribute column, with one
    line per unit (field_id) and one column per year:

        Table attribute_x
              | group | fid   | label | 2000  | 2001
            0 | AT    | AT_01 | ATLA1 | val_x | val_x
            1 | AT    | AT_02 | ATLA2 | val_x | val_x
            ...

    fields_to_mantain are copied unchanged into every output sheet.
    """
    from gasp.pyt.xls.fld import get_columns_position
    from gasp.pyt.xls.fld import get_columns_position_outside_options
    from gasp.pyt.xls.summ import list_unique_values_column
    from gasp.pyt.xls.fm import get_unit_attributes

    # Open input (xlrd/xlwt presumably imported at module level — TODO confirm)
    xls_data = xlrd.open_workbook(data_file)
    sheet = xls_data.sheet_by_name(data_spreadsheet)

    # Get interest columns position
    interest_position = get_columns_position(sheet, fields_to_mantain)

    # Get year position
    year_position = get_columns_position(sheet, [field_year])

    # Get id position
    id_position = get_columns_position(sheet, [field_id])

    # Get attributes columns position - indicators (every column that is not
    # the id, the year, or one of the fields to maintain)
    attributes_position = get_columns_position_outside_options(
        sheet, fields_to_mantain + [field_year, field_id]
    )

    # List years
    years = list_unique_values_column(sheet, year_position[field_year])

    # List entities
    # NOTE(review): 'units' is never used below — dead assignment
    units = list_unique_values_column(sheet, id_position[field_id])

    # units_atributtes is presumably {unit_id: {field: value, ...}} — the
    # exact shape depends on gasp's get_unit_attributes; verify there
    units_atributtes = get_unit_attributes(
        sheet, id_position[field_id], interest_position
    )

    # add years to data dict
    for unit in units_atributtes:
        for year in years:
            units_atributtes[unit][year] = {}

    # Get values by year for each unit
    for line in range(1, sheet.nrows):
        __year = sheet.cell(line, year_position[field_year]).value
        __id = sheet.cell(line, id_position[field_id]).value
        if __id not in units_atributtes.keys():
            continue
        for indicator in attributes_position:
            # NOTE(review): this condition does not depend on 'indicator'
            # and could be hoisted out of this inner loop
            if __year not in years:
                continue
            units_atributtes[__id][__year][indicator] = sheet.cell(
                line, attributes_position[indicator]).value

    # Write output
    out_xls = xlwt.Workbook()
    # for each attribute/indicator column
    for indicator in attributes_position:
        # Create new sheet
        out_sheet = out_xls.add_sheet(indicator)

        # Write columns: id, then the maintained fields, then one per year
        out_sheet.write(0, 0, field_id)
        c = 1
        for field in interest_position:
            out_sheet.write(0, c, field)
            c += 1
        for year in years:
            out_sheet.write(0, c, year)
            c += 1

        # Write real data (column order matches the header written above)
        lnh = 1
        for unit in units_atributtes:
            out_sheet.write(lnh, 0, unit)
            col = 1
            for field in interest_position:
                out_sheet.write(
                    lnh, col, units_atributtes[unit][field]
                )
                col += 1
            for year in years:
                out_sheet.write(
                    lnh, col, units_atributtes[unit][year][indicator]
                )
                col += 1
            lnh += 1
    out_xls.save(output_file)
def frequencies_by_place(xls, entities_sheet, entities_id, entities_place, data_sheet, output, null=None, out_cols_basename=None, entities_filter=None, cols_exclusion=None):
    """
    Count, for every attribute column of data_sheet, how many entities hold
    each value — overall ('total') and per place/location.

    Entities table:
        ID_Aluno | codigo_postal
        xxxxxxxx | xxxx-xxx

    data_sheet:
        ID_Aluno | disciplina_1 | disciplina_2 | disciplina_n
        xxxxxxxx | nota_1       | xxxxxxx      | xxxxxxx

    Output (one sheet per attribute column plus a 'general' aggregation):
                        | nota_1   | nota_2   | ... | mean | stdesviation
        total           | NrAlunos | NrAlunos | ... | x    | y
        codigo_postal_1 | NrAlunos | NrAlunos | ... | x    | y

    Entities must be unique in both sheets. entities_filter has the form
    "col=value;col2=value2" and keeps only entities matching ALL filters;
    null is a NoData identifier removed from the counting; cols_exclusion
    is a column name or list of names to skip; out_cols_basename prefixes
    every output column title.
    """
    import numpy
    from decimal import Decimal
    from gasp.pyt.xls.fld import get_columns_position
    from gasp.pyt.xls.summ import list_unique_values_column
    from gasp.pyt.xls.summ import count_values_column
    from gasp.pyt.xls.summ import count_values_column_if_entity_exists
    from gasp.pyt.xls.summ import count_values_column_by_entity_property

    def map_dict(histogram):
        """
        Add total / mean / stdesviation keys to a {value: count} histogram.
        Previously computed statistics are removed first so the helper is
        idempotent; only non-string values enter the mean/std computation.
        """
        for stat in ('mean', 'stdesviation', 'total'):
            if stat in histogram:
                del histogram[stat]
        # Get the total of entities
        histogram['total'] = sum(histogram.values())
        # Get mean and expand the sample for the standard deviation
        numerator = 0
        denominator = 0
        std_sample = []
        for v in histogram:
            if type(v) != str:
                numerator += Decimal(v) * Decimal(histogram[v])
                denominator += histogram[v]
                std_sample += [v for x in range(histogram[v])]
        if numerator and denominator:
            histogram['mean'] = numerator / Decimal(denominator)
            histogram['stdesviation'] = Decimal(numpy.std(std_sample))
        return histogram

    # Open xls file (xlrd/xlwt expected at module level)
    __xls_file = xlrd.open_workbook(xls, on_demand=True)

    # Map entities with location: {entity_id: location}
    if entities_filter:
        filters = entities_filter.split(';')
        filters_map = {}
        for flt in filters:
            col_filter, val_filter = flt.split('=')
            filters_map[col_filter] = val_filter
        # FIX (py3): dict.keys() cannot be concatenated to a list
        interest_col = [entities_id, entities_place] + list(filters_map.keys())
    else:
        filters_map = 0
        interest_col = [entities_id, entities_place]

    __entities_sheet = __xls_file.sheet_by_name(entities_sheet)
    id_place_position = get_columns_position(__entities_sheet, interest_col)
    if entities_id not in id_place_position:
        raise ValueError('Couldn\'t find the given column id')
    elif entities_place not in id_place_position:
        raise ValueError('Couldn\'t find the position of the place id')

    entities_map = {}
    for row in range(1, __entities_sheet.nrows):
        __id = __entities_sheet.cell(
            row, id_place_position[entities_id]).value
        location = __entities_sheet.cell(
            row, id_place_position[entities_place]).value
        if filters_map:
            c = 0
            for col_filter in filters_map:
                __filter = __entities_sheet.cell(
                    row, id_place_position[col_filter]).value
                # FIX: compare against THIS filter's value; the original used
                # the leftover val_filter (last parsed filter) for every column
                if __filter != filters_map[col_filter]:
                    c += 1
                    break
            if c:
                continue
        entities_map[__id] = location

    # Count entities by value in each column:
    # data = {col: {'total': {value: count, ...}, location: {...}, ...}, ...}
    __data_sheet = __xls_file.sheet_by_name(data_sheet)
    data = {}
    cols_exclusion = cols_exclusion if type(cols_exclusion) == list else \
        [cols_exclusion] if type(cols_exclusion) == str else 0
    for col in range(1, __data_sheet.ncols):
        column_name = __data_sheet.cell(0, col).value
        if cols_exclusion:
            if column_name in cols_exclusion:
                continue

        # List unique values of the column
        values = list_unique_values_column(__data_sheet, col)

        # Del NoData identifier if defined (0 is a valid identifier)
        if null or null == 0:
            if null in values:
                values.remove(null)

        # Count occurrences of each value in that column (overall)
        val_count = count_values_column_if_entity_exists(
            __data_sheet, col, list(entities_map.keys()), values)
        for v in values:
            if v not in val_count:
                val_count[v] = 0
        data[column_name] = {'total': map_dict(val_count)}

        # Do the same for each location
        locations = list(set(entities_map.values()))
        for place in locations:
            val_count = count_values_column_by_entity_property(
                __data_sheet, col, place, entities_map, values)
            for v in values:
                if v not in val_count:
                    val_count[v] = 0
            data[column_name].update({place: map_dict(val_count)})

    # 'general': element-wise aggregation of every column's histograms
    general = {}
    for col_sheet in data:
        for k in data[col_sheet]:
            if k not in general:
                general[k] = {}
                for _k in data[col_sheet][k]:
                    general[k][_k] = data[col_sheet][k][_k]
            else:
                for _k in data[col_sheet][k]:
                    if _k in general[k]:
                        general[k][_k] += data[col_sheet][k][_k]
                    else:
                        general[k][_k] = data[col_sheet][k][_k]
    data['general'] = general
    # Recompute the statistics of the aggregated histograms
    for __dict in data['general']:
        data['general'][__dict] = map_dict(data['general'][__dict])

    # Write data in a new xls file: one sheet per column (plus 'general')
    out_xls = xlwt.Workbook()
    for col_sheet in data:
        new_sheet = out_xls.add_sheet(col_sheet)
        # FIX (py3): sort the keys once (key=str because values may mix
        # numbers and strings) and derive the titles in the SAME order; the
        # original sorted names and titles independently, which could
        # misalign titles and data columns
        cols_name = sorted(data[col_sheet]['total'].keys(), key=str)
        if out_cols_basename:
            cols_basename = [
                '{b}_{_k}'.format(b=out_cols_basename, _k=str(k))
                for k in cols_name
            ]
        else:
            cols_basename = cols_name

        # Write columns names
        for c in range(len(cols_name)):
            new_sheet.write(0, c + 1, cols_basename[c])

        # Write lines ('total' plus each location, sorted)
        lnh = 1
        lines_name = sorted(data[col_sheet].keys(), key=str)
        for line in lines_name:
            new_sheet.write(lnh, 0, line)
            for cln in range(len(cols_name)):
                if cols_name[cln] in data[col_sheet][line]:
                    new_sheet.write(
                        lnh, cln + 1, data[col_sheet][line][cols_name[cln]])
            lnh += 1

    out_xls.save(output)
    __xls_file.release_resources()
    del __xls_file
def frequencies_table(xls, data_sheet, output, entities_sheet=None, entities_id=None, entities_filter=None, values_filter=None):
    """
    Count value occurrences in each attribute column.

    input table:
        entity | attr_1 | attr_2 | ... | attr_n
        0      | a      | b      | ... | a
        1      | b      | c      | ... | b

    output table (one line per attribute column):
               | a | b | c | ...
        attr_1 | 2 | 2 | 1 | ...
        attr_2 | 1 | 1 | 2 | ...

    entities_sheet/entities_id/entities_filter ("col=val;col2=val2", AND
    logic) restrict counting to the entities matching every filter;
    values_filter (value or list of values) restricts which values count.
    """
    from gasp.pyt.xls.fld import get_columns_position
    from gasp.pyt.xls.summ import count_values_column
    from gasp.pyt.xls.summ import count_values_column_if_entity_exists

    # Open file (xlrd/xlwt/os expected at module level)
    __xls = xlrd.open_workbook(xls, on_demand=True)

    # Build the list of entities passing every filter (None = no filtering)
    if entities_sheet and entities_id and entities_filter:
        filters = entities_filter.split(';')
        filters_map = {}
        for flt in filters:
            col_filter, val_filter = flt.split('=')
            filters_map[col_filter] = val_filter
        __entities_sheet = __xls.sheet_by_name(entities_sheet)
        # FIX (py3): dict.keys() cannot be concatenated to a list
        int_col = [entities_id] + list(filters_map.keys())
        cols_position = get_columns_position(__entities_sheet, int_col)
        entities_map = []
        for row in range(1, __entities_sheet.nrows):
            __id = __entities_sheet.cell(
                row, cols_position[entities_id]).value
            c = 0
            for col_filter in filters_map:
                __filter = __entities_sheet.cell(
                    row, cols_position[col_filter]).value
                if __filter != filters_map[col_filter]:
                    c += 1
                    break
            if c:
                continue
            entities_map.append(__id)
    else:
        entities_map = None

    # Count the values of every attribute column
    attr_sheet = __xls.sheet_by_name(data_sheet)
    data = {}
    registed_values = []
    values_filter = values_filter if type(values_filter) == list else \
        [values_filter] if type(values_filter) == str else None
    for col in range(1, attr_sheet.ncols):
        col_name = attr_sheet.cell(0, col).value
        if not entities_map:
            values_count = count_values_column(attr_sheet, col, values_filter)
        else:
            values_count = count_values_column_if_entity_exists(
                attr_sheet, col, entities_map, values_filter)
        registed_values.extend(values_count.keys())
        data[col_name] = values_count

    # FIX: sort for a deterministic column order (key=str because the
    # registered values may mix numbers and strings)
    registed_values = sorted(set(registed_values), key=str)

    # Write output
    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(
        os.path.splitext(os.path.basename(output))[0])

    # columns
    for i in range(len(registed_values)):
        new_sheet.write(0, i + 1, registed_values[i])

    # Write values
    r = 1
    for row in data:
        new_sheet.write(r, 0, row)
        for col in range(len(registed_values)):
            if registed_values[col] in data[row]:
                new_sheet.write(r, col + 1, data[row][registed_values[col]])
        r += 1

    out_xls.save(output)
    __xls.release_resources()
    del __xls
def frequencies_by_entity_attr(xls, entities_sheet, entities_id, attr_sheet, interest_values, output, entities_filter=None, attr_exclusion=None):
    """
    Count entities by the number of attribute cells holding interest values.

    Example (interest_values = [A, S]) — from an attributes table:
        entity | attr_1 | attr_2 | ... | attr_n
        0      | S      | S      | ... | S
        1      | A      | B      | ... | S
        ...

    we get, per distinct occurrence count, how many entities share it:
        values | entities_number | values_occurences
        S;A    | 2               | 3
        S;A    | 2               | 2
        S;A    | 1               | 1

    entities_filter ("col=val;col2=val2", AND logic) restricts the entities
    considered; attr_exclusion is a list of attribute columns to skip.
    E.g. of application: number of students with a number of negatives.
    """
    from gasp.pyt.xls.fld import get_columns_position

    # Open file (xlrd/xlwt/os expected at module level)
    __xls = xlrd.open_workbook(xls, on_demand=True)

    # Map entities
    if entities_filter:
        filters = entities_filter.split(';')
        filters_map = {}
        for flt in filters:
            col_filter, val_filter = flt.split('=')
            filters_map[col_filter] = val_filter
        # FIX (py3): dict.keys() cannot be concatenated to a list
        interest_col = [entities_id] + list(filters_map.keys())
    else:
        filters_map = 0
        # FIX: get_columns_position receives a list of columns everywhere
        # else in this module; the original passed the bare string here
        interest_col = [entities_id]

    __entities_sheet = __xls.sheet_by_name(entities_sheet)
    cols_position = get_columns_position(__entities_sheet, interest_col)
    if entities_id not in cols_position:
        raise ValueError('Couldn\'t find the given column id')

    entities_map = {}
    for row in range(1, __entities_sheet.nrows):
        __id = __entities_sheet.cell(row, cols_position[entities_id]).value
        if filters_map:
            c = 0
            for col_filter in filters_map:
                __filter = __entities_sheet.cell(
                    row, cols_position[col_filter]).value
                if __filter != filters_map[col_filter]:
                    c += 1
                    break
            if c:
                continue
        entities_map[__id] = 0

    # Count occurrences of the specific/interest values by entity
    data_sheet = __xls.sheet_by_name(attr_sheet)
    interest_values = interest_values if type(interest_values) == list else \
        [interest_values] if type(interest_values) == str else 0
    if not interest_values:
        raise ValueError('interest_values should be a list or string')
    for col in range(1, data_sheet.ncols):
        column_name = data_sheet.cell(0, col).value
        if attr_exclusion and type(attr_exclusion) == list:
            if column_name in attr_exclusion:
                continue
        for row in range(1, data_sheet.nrows):
            __id = data_sheet.cell(row, 0).value
            value = data_sheet.cell(row, col).value
            if value in interest_values:
                if __id in entities_map:
                    entities_map[__id] += 1

    # Count the number of entities sharing each occurrence count.
    # FIX: key by the occurrence count (unique per row); the original keyed
    # by the entity count, so two distinct occurrence counts with the same
    # number of entities overwrote each other. Also list() the values for
    # py3 (dict_values has no .count()).
    occurence_count = list(entities_map.values())
    entities_countv = {}
    for countv in set(occurence_count):
        entities_countv[countv] = occurence_count.count(countv)

    # Write output
    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(
        os.path.splitext(os.path.basename(output))[0])

    # columns
    c = ['values', 'entities_number', 'values_occurences']
    for i in range(len(c)):
        new_sheet.write(0, i, c[i])

    # values: one line per distinct occurrence count
    l = 1
    for countv in entities_countv:
        new_sheet.write(l, 0, ';'.join([str(x) for x in interest_values]))
        new_sheet.write(l, 1, entities_countv[countv])
        new_sheet.write(l, 2, countv)
        l += 1

    out_xls.save(output)
    __xls.release_resources()
    del __xls
def identify_cells_with_char(xls, sheetname, col_id, col_interest, interest_char, output):
    """
    Identify the cells with a specific character in one column of interest.

    Record these cells (ID and interest column value) in a new file and
    return its path.
    """
    import xlrd
    import xlwt
    from gasp.pyt.xls.fld import get_columns_position

    __xls = xlrd.open_workbook(xls)
    sheet = __xls.sheet_by_name(sheetname)
    cols_position = get_columns_position(sheet, [col_id, col_interest])

    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(sheetname)
    new_sheet.write(0, 0, col_id)
    new_sheet.write(0, 1, col_interest)

    Nrow = 1
    for row in range(1, sheet.nrows):
        val = sheet.cell(row, cols_position[col_interest]).value
        # Non-text cells (floats, dates, ...) cannot contain a character
        # and would raise TypeError on the 'in' test — skip them
        if isinstance(val, str) and interest_char in val:
            new_sheet.write(
                Nrow, 0, sheet.cell(row, cols_position[col_id]).value)
            new_sheet.write(Nrow, 1, val)
            Nrow += 1

    # FIX: the original never saved the workbook, so no output was produced
    out_xls.save(output)
    return output
def xlstimedelta_to_pddf(inXls, timecol, sheet_name=None, sheet_index=None, columnsToMantain=None):
    """
    Convert a table with a column with timedelta values to a valid Pandas
    DataFrame.

    timecol may be a column name or an int column index; columnsToMantain
    optionally restricts the columns kept in the DataFrame (the time column
    is appended when given by name and missing from the list).
    """
    import datetime
    import xlrd
    from xlrd import xldate_as_tuple
    from gasp.pyt import obj_to_lst
    from gasp.pyt.xls.sheet import get_sheet_obj
    from gasp.pyt.xls.fld import col_name, get_columns_position

    __xls = xlrd.open_workbook(inXls)
    sheet = get_sheet_obj(__xls, name=sheet_name, index=sheet_index)

    # Get Cols name
    COLS_NAME = col_name(sheet) if not columnsToMantain else \
        obj_to_lst(columnsToMantain)
    if type(timecol) != int:
        # timecol given by name must be part of the selected columns
        if timecol not in COLS_NAME:
            COLS_NAME.append(timecol)

    # Get Cols position
    COLS_POS = get_columns_position(sheet, COLS_NAME)
    POS_COLS = COLS_POS.values()
    if type(timecol) == int:
        # NOTE(review): when timecol is an int and columnsToMantain is set,
        # this index may be absent from POS_COLS and the time column would
        # be skipped by the loop below — TODO confirm intended
        COL_TIME_POSITION = timecol
    else:
        COL_TIME_POSITION = COLS_POS[timecol]

    data = []
    for row in range(1, sheet.nrows):
        l_col = []
        for col in range(sheet.ncols):
            # Keep only the selected columns
            if col not in POS_COLS:
                continue
            if col == COL_TIME_POSITION:
                # Decode the Excel serial number into a tuple
                # (year, month, day, hour, minute, second)
                tt = xldate_as_tuple(
                    sheet.cell(row, col).value, __xls.datemode)
                # NOTE(review): tt[1] is the MONTH component but is passed
                # as weeks — assumed deliberate for this duration encoding;
                # verify against the data this is used with
                l_col.append(
                    datetime.timedelta(
                        weeks=tt[1], days=tt[2], hours=tt[3],
                        minutes=tt[4], seconds=tt[5]))
            else:
                l_col.append(sheet.cell(row, col).value)
        data.append(l_col)

    # NOTE(review): cells are appended in sheet-column order while the
    # DataFrame labels follow COLS_NAME order; these can differ when
    # columnsToMantain is not in sheet order — verify against callers.
    # (pandas presumably imported at module level — TODO confirm)
    df = pandas.DataFrame(data, columns=COLS_NAME)

    return df