def write_syn_dataset(csvPathname, rowCount, colCount=1, SEED='12345678',
        colSepChar=",", rowSepChar="\n", quoteChars=""):
    """Write a synthetic delimited dataset of enum columns plus a 0/1 output column.

    Each of the ``rowCount`` rows holds ``colCount`` values picked with
    ``random.choice`` from ``create_enum_list(quoteChars=quoteChars)``,
    followed by one integer in {0, 1} drawn from ``random.Random(SEED)``.
    Values are converted with ``str``, joined with ``colSepChar`` and
    terminated with ``rowSepChar``.

    Args:
        csvPathname: path of the file to create (opened with mode "w+",
            so existing content is truncated).
        rowCount: number of rows to write.
        colCount: number of enum columns per row (the output column is extra).
        SEED: seed for the private generator that produces the output column.
        colSepChar: string placed between columns.
        rowSepChar: string appended after every row.
        quoteChars: forwarded unchanged to ``create_enum_list``.

    Returns:
        None. The only effect is the file written at ``csvPathname``.
    """
    r1 = random.Random(SEED)
    enumList = create_enum_list(quoteChars=quoteChars)

    # The context manager guarantees the file is flushed and closed even if
    # row generation raises part-way through.
    with open(csvPathname, "w+") as dsf:
        for row in range(rowCount):
            # Sampling with replacement: 10000 rows are not guaranteed to
            # contain 10000 unique enums in a column.
            rowData = []
            for _ in range(colCount):
                # NOTE(review): enum values come from the module-level
                # `random`, not from `r1`, so SEED only controls the output
                # column. Confirm whether that is intended before changing it.
                ri = random.choice(enumList)
                if row < 2:
                    # The first two rows can't tolerate a single/double quote
                    # and can't be solely white space either, so redraw until
                    # the value is acceptable.
                    # NOTE(review): this never terminates if every entry of
                    # enumList is rejected by the condition below.
                    while ("'" in ri or '"' in ri
                            or h2o_util.might_h2o_think_whitespace(ri)):
                        ri = random.choice(enumList)
                rowData.append(ri)

            # output column
            rowData.append(r1.randint(0, 1))

            # colSepChar lets the caller pick the separator (e.g. the Hive one)
            dsf.write(colSepChar.join(map(str, rowData)) + rowSepChar)