def split_dataset(dataset, p_train_data, split_mode):
    """Split a petl table into a train partition and a test partition.

    Parameters
    ----------
    dataset : petl table container
        Source table to split.
    p_train_data : float
        Fraction (0..1) of data rows assigned to the training partition.
    split_mode : str
        Splitting strategy. Only 'normal' (contiguous head/tail split) is
        implemented.

    Returns
    -------
    tuple
        (train_data, test_data); test_data is [] when no rows are left over.

    Raises
    ------
    ValueError
        If split_mode is not a supported mode.
    """
    fields = list(fieldnames(dataset))
    # Row count is derived from the values of the first field.
    size_dataset = len(values(dataset, fields[0]))
    size_train_data = int(round(size_dataset * p_train_data))
    size_test_data = max(size_dataset - size_train_data, 0)
    if split_mode == 'normal':
        # petl head()/tail() take the NUMBER of data rows to keep; the
        # original passed n - 1, silently dropping one row per partition.
        train_data = head(dataset, size_train_data)
        if size_test_data == 0:
            test_data = []
        else:
            test_data = tail(dataset, size_test_data)
    else:
        # Original fell through and hit a NameError on undefined train_data.
        # TODO: implement the shuffle mode noted in the original comment.
        raise ValueError("unsupported split_mode: %r" % (split_mode,))
    return train_data, test_data
# Demo: petl tail() — keep only the last N data rows of a table.
from petl import tail, look, sort

table1 = [['foo', 'bar'],
          ['a', 1], ['b', 2], ['c', 5], ['d', 7], ['f', 42],
          ['f', 3], ['h', 90], ['k', 12], ['l', 77], ['q', 2]]
look(table1)
table2 = tail(table1, 4)
look(table2)

# Demo: petl sort() — order the data rows by the 'foo' field.
table1 = [['foo', 'bar'],
          ['C', 2], ['A', 9], ['A', 6], ['F', 1], ['D', 10]]
look(table1)
table2 = sort(table1, 'foo')
def main(argv): parser = argparse.ArgumentParser() parser.add_argument("--input-csv-filename", required=True, help="Input UTF8 CSV to summarize") parser.add_argument( "--sep-columns", required=False, nargs='*', default=argparse.SUPPRESS, help= "Column names of columns containing comma- or semi-colon-separated values" ) parser.add_argument("--sep-character", required=False, help="Character used to separate values in multi-value " \ "fields. Defaults to ';' if not specified.") parser.add_argument("--skip-columns", required=False, nargs='*', default=argparse.SUPPRESS, help="Column names to NOT generate stats for") parser.add_argument("--skip-num-rows", required=False, type=int, help="Skip specified number " "of header rows") parser.add_argument( "--first-ccb-column", required=False, help="String name of first CCB column. If " "specified, all preceeding columns will be labeled 'Servant Keeper' and this column " "and all subsequent will be labeled 'CCB'") args = parser.parse_args() if args.first_ccb_column is not None: column_prefix = 'Servant Keeper ' else: column_prefix = '' assert os.path.isfile( args.input_csv_filename ), "Error: cannot open file '" + args.input_csv_filename + "'" table = petl.fromcsv(args.input_csv_filename) # Skip header rows if args.skip_num_rows: skip_num = args.skip_num_rows assert skip_num > 0, "--skip-num-rows value '" + str( skip_num) + "' is invalid. Must be positive." 
it = iter(table) while skip_num >= 0: row = next(it) skip_num -= 1 table = petl.setheader(table, row) table = petl.tail(table, petl.nrows(table) - args.skip_num_rows) # Print nicely formatted stats for each column sep = '' args_dict = vars(args) skip_columns_specified = 'skip_columns' in args_dict sep_char_specified = 'sep_character' in args_dict for column in petl.header(table): if args.first_ccb_column is not None and column == args.first_ccb_column: column_prefix = 'CCB ' if not skip_columns_specified or column not in args.skip_columns: output_str = column_prefix + "Column '" + column + "'" print sep + output_str print >> sys.stderr, output_str if args.sep_columns is not None and column in args.sep_columns: if sep_char_specified: sep_character = args.sep_character else: sep_character = ';' output_str = num_dict2str( dict_dump(sep_valuecounter(table, column, sep_character))) print output_str else: output_str = num_dict2str(dict_dump(valuecounts(table, column))) print output_str sep = '\n' # Flush to ensure all output is written sys.stdout.flush() sys.stderr.flush()
def _load_wine_table(csv_name, wine_type):
    """Read a wine-quality CSV, apply the shared header, convert numeric
    strings to numbers, and tag every row with the given wine type."""
    raw = etl.fromcsv(csv_name)
    headed = etl.setheader(raw, table_header)
    numeric = etl.convertnumbers(headed)
    return etl.addfield(numeric, "Type", wine_type)

table1 = _load_wine_table('winequality-red.csv', "Red")
table2 = _load_wine_table('winequality-white.csv', "White")

# Keep only wines above a per-type quality threshold, then combine both sets.
table1_filtered = etl.select(table1, "Quality", lambda v: v > 6)
table2_filtered = etl.select(table2, "Quality", lambda v: v > 4)
good_wines = etl.cat(table1_filtered, table2_filtered)

# Derived columns: combined acidity and the SO2 that is not free.
good_wines_enhanced = etl.addfields(
    good_wines,
    [("Max Acidity",
      lambda rec: rec["Fixed Acidity"] + rec["Volatile Acidity"]),
     ("Locked SO2",
      lambda rec: rec["Total SO2"] - rec["Free SO2"])])

# Sort by quality, then sugar, and show the last 500 rows in full.
gwe_sorted = etl.sort(good_wines_enhanced, key=["Quality", "Sugar"])
print(etl.lookall(etl.tail(gwe_sorted, 500)))
def main(argv): parser = argparse.ArgumentParser() parser.add_argument("--input-csv-filename", required=True, help="Input UTF8 CSV to summarize") parser.add_argument("--sep-columns", required=False, nargs = '*', default=argparse.SUPPRESS, help="Column names of columns containing comma- or semi-colon-separated values") parser.add_argument("--sep-character", required=False, help="Character used to separate values in multi-value " \ "fields. Defaults to ';' if not specified.") parser.add_argument("--skip-columns", required=False, nargs='*', default=argparse.SUPPRESS, help="Column names to NOT generate stats for") parser.add_argument("--skip-num-rows", required=False, type=int, help="Skip specified number " "of header rows") parser.add_argument("--first-ccb-column", required=False, help="String name of first CCB column. If " "specified, all preceeding columns will be labeled 'Servant Keeper' and this column " "and all subsequent will be labeled 'CCB'") args = parser.parse_args() if args.first_ccb_column is not None: column_prefix = 'Servant Keeper ' else: column_prefix = '' assert os.path.isfile(args.input_csv_filename), "Error: cannot open file '" + args.input_csv_filename + "'" table = petl.fromcsv(args.input_csv_filename) # Skip header rows if args.skip_num_rows: skip_num = args.skip_num_rows assert skip_num > 0, "--skip-num-rows value '" + str(skip_num) + "' is invalid. Must be positive." 
it = iter(table) while skip_num >= 0: row = next(it) skip_num -= 1 table = petl.setheader(table, row) table = petl.tail(table, petl.nrows(table) - args.skip_num_rows) # Print nicely formatted stats for each column sep = '' args_dict = vars(args) skip_columns_specified = 'skip_columns' in args_dict sep_char_specified = 'sep_character' in args_dict for column in petl.header(table): if args.first_ccb_column is not None and column == args.first_ccb_column: column_prefix = 'CCB ' if not skip_columns_specified or column not in args.skip_columns: output_str = column_prefix + "Column '" + column + "'" print sep + output_str print >> sys.stderr, output_str if args.sep_columns is not None and column in args.sep_columns: if sep_char_specified: sep_character = args.sep_character else: sep_character = ';' output_str = num_dict2str(dict_dump(sep_valuecounter(table, column, sep_character))) print output_str else: output_str = num_dict2str(dict_dump(valuecounts(table, column))) print output_str sep = '\n' # Flush to ensure all output is written sys.stdout.flush() sys.stderr.flush()
# Demo: petl tail() — keep only the last n data rows.
import petl as etl

table1 = [['foo', 'bar'],
          ['a', 1], ['b', 2], ['c', 5], ['d', 7], ['f', 42],
          ['f', 3], ['h', 90], ['k', 12], ['l', 77], ['q', 2]]
table2 = etl.tail(table1, 4)
table2

# Demo: petl skipcomments() — drop rows whose first value starts with the
# given prefix ('#foo' survives because the prefix here is '##').
table1 = [['##aaa', 'bbb', 'ccc'],
          ['##mmm',],
          ['#foo', 'bar'],
          ['##nnn', 1],
          ['a', 1],
          ['b', 2]]
table2 = etl.skipcomments(table1, '##')
table2
# coding:utf8
import petl as etl

table1 = [('foo', 'bar', 'baz'),
          ('apple', 1, 2.5),
          ('orange', 3, 4.5),
          ('pears', 5, 6.5),
          ('bananer', 7, 8.5),
          ('cat', 9, 10.5)]

# First 4 data rows.
table_head = etl.head(table1, 4)
print(table_head)

# Last 4 data rows.
table_tail = etl.tail(table1, 4)
print(table_tail)

# rowslice(): positional windows over the data rows.
rowsliceTb = etl.rowslice(table1, 2)
print(rowsliceTb)

rowsliceTb_2_4 = etl.rowslice(table1, 2, 4)
print(rowsliceTb_2_4)

# Start at row 1, stop before row 5, step 2 (translated from the original
# Chinese comment).
rowsliceTb_1_2_5 = etl.rowslice(table1, 1, 5, 2)
print(rowsliceTb_1_2_5)

# cut(): select columns by field name ...
cutTb = etl.cut(table1, 'foo', 'bar')
print(cutTb)

# ... or by zero-based column index.
cutTb_0_2 = etl.cut(table1, 0, 2)