def split_dataset(dataset, p_train_data, split_mode):
    """Split a table into a leading training part and a trailing test part.

    The number of data rows is measured on the first column of ``dataset``.
    ``round(rows * p_train_data)`` rows go to the training part and the
    remaining rows go to the test part.

    NOTE(review): ``fieldnames``, ``values``, ``head`` and ``tail`` are
    assumed to be the petl functions of the same names, whose row counts
    refer to data rows only (the header is not counted) -- confirm against
    the imports of this module.

    Args:
        dataset: Table to split.
        p_train_data: Fraction of the data rows assigned to the training
            part (e.g. ``0.8``).
        split_mode: Splitting strategy. Only ``'normal'`` (keep the original
            row order) is implemented.

    Returns:
        A ``(train_data, test_data)`` tuple. ``test_data`` is an empty list
        when no rows are left for the test part.

    Raises:
        ValueError: If ``split_mode`` is not ``'normal'``. Previously this
            case fell through to the ``return`` with unbound locals.
    """
    # Shuffle mode still has to be implemented; fail early and clearly
    # instead of reaching the return statement with undefined variables.
    if split_mode != 'normal':
        raise ValueError(
            "unsupported split_mode %r (only 'normal' is implemented)"
            % (split_mode,))

    fields = list(fieldnames(dataset))

    size_dataset = len(values(dataset, fields[0]))
    size_train_data = int(round(size_dataset * p_train_data))
    size_test_data = abs(size_train_data - size_dataset)

    # head()/tail() take the number of data rows wanted, so the sizes are
    # passed unchanged; subtracting one lost a row from each part and
    # produced a negative count for an empty training part.
    train_data = head(dataset, size_train_data)

    if size_test_data == 0:
        test_data = []
    else:
        test_data = tail(dataset, size_test_data)

    return train_data, test_data
示例#2
0
文件: examples.py 项目: datamade/petl
# Keep only the last four data rows of a ten-row table.
from petl import look, tail

table1 = [
    ['foo', 'bar'],
    ['a', 1], ['b', 2], ['c', 5], ['d', 7], ['f', 42],
    ['f', 3], ['h', 90], ['k', 12], ['l', 77], ['q', 2],
]
look(table1)
table2 = tail(table1, 4)
look(table2)


# sort

# Order the rows by the values of the 'foo' field.
from petl import look, sort

table1 = [
    ['foo', 'bar'],
    ['C', 2], ['A', 9], ['A', 6], ['F', 1], ['D', 10],
]
look(table1)
table2 = sort(table1, 'foo')
示例#3
0
# Keep only the last four data rows of a ten-row table.
from petl import look, tail

table1 = [
    ['foo', 'bar'],
    ['a', 1], ['b', 2], ['c', 5], ['d', 7], ['f', 42],
    ['f', 3], ['h', 90], ['k', 12], ['l', 77], ['q', 2],
]
look(table1)
table2 = tail(table1, 4)
look(table2)


# sort

# Order the rows by the values of the 'foo' field.
from petl import look, sort

table1 = [
    ['foo', 'bar'],
    ['C', 2], ['A', 9], ['A', 6], ['F', 1], ['D', 10],
]
look(table1)
table2 = sort(table1, 'foo')
示例#4
0
def main(argv):
    """Print value-count statistics for every column of a CSV file.

    Parses the command-line options, loads the CSV with petl, optionally
    promotes a later row to be the header (``--skip-num-rows``), and then
    prints a title plus value counts for each column that is not listed in
    ``--skip-columns``. Column titles are also echoed to stderr. Columns
    listed in ``--sep-columns`` are counted per separated value, using
    ``--sep-character`` (``';'`` when not given).

    Args:
        argv: Accepted for the caller's convenience. NOTE(review): the
            options are parsed by ``parser.parse_args()`` without arguments,
            i.e. from ``sys.argv``; ``argv`` itself is not consulted.

    Raises:
        AssertionError: If the input file does not exist or
            ``--skip-num-rows`` is negative.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv-filename",
                        required=True,
                        help="Input UTF8 CSV to summarize")
    parser.add_argument(
        "--sep-columns",
        required=False,
        nargs='*',
        default=argparse.SUPPRESS,
        help=
        "Column names of columns containing comma- or semi-colon-separated values"
    )
    parser.add_argument("--sep-character", required=False, help="Character used to separate values in multi-value " \
        "fields.  Defaults to ';' if not specified.")
    parser.add_argument("--skip-columns",
                        required=False,
                        nargs='*',
                        default=argparse.SUPPRESS,
                        help="Column names to NOT generate stats for")
    parser.add_argument("--skip-num-rows",
                        required=False,
                        type=int,
                        help="Skip specified number "
                        "of header rows")
    parser.add_argument(
        "--first-ccb-column",
        required=False,
        help="String name of first CCB column.  If "
        "specified, all preceeding columns will be labeled 'Servant Keeper' and this column "
        "and all subsequent will be labeled 'CCB'")
    args = parser.parse_args()

    # --sep-columns and --skip-columns are declared with
    # default=argparse.SUPPRESS, so the attribute does not exist at all when
    # the flag is omitted; reading args.sep_columns directly raised
    # AttributeError in that case.
    sep_columns = getattr(args, 'sep_columns', None) or ()
    skip_columns = getattr(args, 'skip_columns', None) or ()

    # --sep-character has an ordinary (None) default, so it is always present
    # in the namespace; apply the documented ';' fallback explicitly.
    if args.sep_character is None:
        sep_character = ';'
    else:
        sep_character = args.sep_character

    if args.first_ccb_column is not None:
        column_prefix = 'Servant Keeper '
    else:
        column_prefix = ''

    assert os.path.isfile(
        args.input_csv_filename
    ), "Error: cannot open file '" + args.input_csv_filename + "'"

    table = petl.fromcsv(args.input_csv_filename)

    # Skip header rows: the row found after skipping becomes the new header
    # and only the rows that follow it are kept as data.
    if args.skip_num_rows:
        skip_num = args.skip_num_rows
        assert skip_num > 0, "--skip-num-rows value '" + str(
            skip_num) + "' is invalid.  Must be positive."
        it = iter(table)
        while skip_num >= 0:
            row = next(it)
            skip_num -= 1
        table = petl.setheader(table, row)
        table = petl.tail(table, petl.nrows(table) - args.skip_num_rows)

    # Print nicely formatted stats for each column
    sep = ''
    for column in petl.header(table):
        # From the first CCB column onwards every title gets the CCB prefix.
        if args.first_ccb_column is not None and column == args.first_ccb_column:
            column_prefix = 'CCB '
        if column not in skip_columns:
            output_str = column_prefix + "Column '" + column + "'"
            # Single-argument print(...) and an explicit stream write behave
            # the same on Python 2 and Python 3.
            print(sep + output_str)
            sys.stderr.write(output_str + '\n')
            if column in sep_columns:
                counts = sep_valuecounter(table, column, sep_character)
            else:
                counts = valuecounts(table, column)
            print(num_dict2str(dict_dump(counts)))
        sep = '\n'

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()
示例#5
0
# Red wines: load the CSV, apply the shared header, convert numeric text to
# numbers and tag every row with its wine type.
table1 = etl.fromcsv('winequality-red.csv')
table1 = etl.setheader(table1, table_header)
table1 = etl.convertnumbers(table1)
table1 = etl.addfield(table1, "Type", "Red")

# White wines: same pipeline, different source file and tag.
table2 = etl.fromcsv('winequality-white.csv')
table2 = etl.setheader(table2, table_header)
table2 = etl.convertnumbers(table2)
table2 = etl.addfield(table2, "Type", "White")

# Keep reds with Quality above 6 and whites with Quality above 4.
table1_filtered = etl.select(
    table1, "Quality", lambda quality: quality > 6)
table2_filtered = etl.select(
    table2, "Quality", lambda quality: quality > 4)

# Stack both selections into a single table.
good_wines = etl.cat(table1_filtered, table2_filtered)

# Derived columns: the sum of both acidity figures, and the SO2 that is
# not free.
good_wines_enhanced = etl.addfields(
    good_wines,
    [
        ("Max Acidity",
         lambda row: row["Fixed Acidity"] + row["Volatile Acidity"]),
        ("Locked SO2",
         lambda row: row["Total SO2"] - row["Free SO2"]),
    ])

# Order by Quality, then Sugar, and show the last 500 rows in full.
gwe_sorted = etl.sort(good_wines_enhanced, key=["Quality", "Sugar"])

print(etl.lookall(etl.tail(gwe_sorted, 500)))
示例#6
0
def main(argv):
    """Print value-count statistics for every column of a CSV file.

    Parses the command-line options, loads the CSV with petl, optionally
    promotes a later row to be the header (``--skip-num-rows``), and then
    prints a title plus value counts for each column that is not listed in
    ``--skip-columns``. Column titles are also echoed to stderr. Columns
    listed in ``--sep-columns`` are counted per separated value, using
    ``--sep-character`` (``';'`` when not given).

    Args:
        argv: Accepted for the caller's convenience. NOTE(review): the
            options are parsed by ``parser.parse_args()`` without arguments,
            i.e. from ``sys.argv``; ``argv`` itself is not consulted.

    Raises:
        AssertionError: If the input file does not exist or
            ``--skip-num-rows`` is negative.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv-filename", required=True, help="Input UTF8 CSV to summarize")
    parser.add_argument("--sep-columns", required=False, nargs = '*', default=argparse.SUPPRESS,
        help="Column names of columns containing comma- or semi-colon-separated values")
    parser.add_argument("--sep-character", required=False, help="Character used to separate values in multi-value " \
        "fields.  Defaults to ';' if not specified.")
    parser.add_argument("--skip-columns", required=False, nargs='*', default=argparse.SUPPRESS,
        help="Column names to NOT generate stats for")
    parser.add_argument("--skip-num-rows", required=False, type=int, help="Skip specified number "
        "of header rows")
    parser.add_argument("--first-ccb-column", required=False, help="String name of first CCB column.  If "
        "specified, all preceeding columns will be labeled 'Servant Keeper' and this column "
        "and all subsequent will be labeled 'CCB'")
    args = parser.parse_args()

    # --sep-columns and --skip-columns are declared with
    # default=argparse.SUPPRESS, so the attribute does not exist at all when
    # the flag is omitted; reading args.sep_columns directly raised
    # AttributeError in that case.
    sep_columns = getattr(args, 'sep_columns', None) or ()
    skip_columns = getattr(args, 'skip_columns', None) or ()

    # --sep-character has an ordinary (None) default, so it is always present
    # in the namespace; apply the documented ';' fallback explicitly.
    if args.sep_character is None:
        sep_character = ';'
    else:
        sep_character = args.sep_character

    if args.first_ccb_column is not None:
        column_prefix = 'Servant Keeper '
    else:
        column_prefix = ''

    assert os.path.isfile(args.input_csv_filename), "Error: cannot open file '" + args.input_csv_filename + "'"

    table = petl.fromcsv(args.input_csv_filename)

    # Skip header rows: the row found after skipping becomes the new header
    # and only the rows that follow it are kept as data.
    if args.skip_num_rows:
        skip_num = args.skip_num_rows
        assert skip_num > 0, "--skip-num-rows value '" + str(skip_num) + "' is invalid.  Must be positive."
        it = iter(table)
        while skip_num >= 0:
            row = next(it)
            skip_num -= 1
        table = petl.setheader(table, row)
        table = petl.tail(table, petl.nrows(table) - args.skip_num_rows)

    # Print nicely formatted stats for each column
    sep = ''
    for column in petl.header(table):
        # From the first CCB column onwards every title gets the CCB prefix.
        if args.first_ccb_column is not None and column == args.first_ccb_column:
            column_prefix = 'CCB '
        if column not in skip_columns:
            output_str = column_prefix + "Column '" + column + "'"
            # Single-argument print(...) and an explicit stream write behave
            # the same on Python 2 and Python 3.
            print(sep + output_str)
            sys.stderr.write(output_str + '\n')
            if column in sep_columns:
                counts = sep_valuecounter(table, column, sep_character)
            else:
                counts = valuecounts(table, column)
            print(num_dict2str(dict_dump(counts)))
        sep = '\n'

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()
示例#7
0
文件: basics.py 项目: DeanWay/petl
# tail()
########

# Select the last four data rows of a ten-row table.
import petl as etl

table1 = [
    ['foo', 'bar'],
    ['a', 1], ['b', 2], ['c', 5], ['d', 7], ['f', 42],
    ['f', 3], ['h', 90], ['k', 12], ['l', 77], ['q', 2],
]
table2 = etl.tail(table1, 4)
table2


# skipcomments()
################

# Drop the leading rows whose first value starts with the '##' prefix.
import petl as etl

table1 = [
    ['##aaa', 'bbb', 'ccc'],
    ['##mmm',],
    ['#foo', 'bar'],
    ['##nnn', 1],
    ['a', 1],
    ['b', 2],
]
table2 = etl.skipcomments(table1, '##')
table2
# coding:utf8

# Short tour of petl's row- and column-selection helpers on a small table.
import petl as etl

# Sample table: the first tuple is the header, followed by five data rows.
table1 = [('foo', 'bar', 'baz'), ('apple', 1, 2.5), ('orange', 3, 4.5),
          ('pears', 5, 6.5), ('bananer', 7, 8.5), ('cat', 9, 10.5)]
# head 4: the first four data rows
table_head = etl.head(table1, 4)
print(table_head)

# tail 4: the last four data rows
table_tail = etl.tail(table1, 4)
print(table_tail)

# rowslice with a single argument: that argument is the stop index
rowsliceTb = etl.rowslice(table1, 2)
print(rowsliceTb)

# rowslice with start=2 and stop=4
rowsliceTb_2_4 = etl.rowslice(table1, 2, 4)
print(rowsliceTb_2_4)

# Start at index 1 (so the 2nd data row is selected first), stop at 5,
# step size 2.
rowsliceTb_1_2_5 = etl.rowslice(table1, 1, 5, 2)
print(rowsliceTb_1_2_5)

# cut: select columns by field name
cutTb = etl.cut(table1, 'foo', 'bar')
print(cutTb)

# cut also accepts column positions; index starts from 0
cutTb_0_2 = etl.cut(table1, 0, 2)