forked from RSDO-DS3/SuperGLUE
/
csv2jsonl.py
61 lines (49 loc) · 2 KB
/
csv2jsonl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
import os
from unflatten import unflatten
import jsonlines
import argparse
def test_jsonl(org, rev, file):
    """Compare two JSON Lines files and print an OK/FAIL verdict.

    Both files are loaded into DataFrames, their columns are put into sorted
    order so that column order does not influence the result, and the frames
    are compared with ``DataFrame.equals``.

    Args:
        org: Path (or buffer) of the reference ``.jsonl`` file.
        rev: Path (or buffer) of the regenerated ``.jsonl`` file.
        file: Label printed in front of the verdict, padded to 15 characters.
    """
    def _load_sorted(source):
        # Read one JSON Lines source and order its columns alphabetically.
        frame = pd.read_json(source, lines=True)
        return frame.reindex(sorted(frame.columns), axis=1)

    reference = _load_sorted(org)
    candidate = _load_sorted(rev)

    # equals() also fails if two entries are of a different dtype
    if not reference.equals(candidate):
        # important: inspect failed files manually
        print(f'{file:15}FAIL')
        return
    print(f'{file:15}OK')
def csv2jsonl(json_eng, csv_eng):
    """Convert every CSV file of one dataset directory to JSON Lines.

    Each ``<name>.csv`` found directly in ``csv_eng`` is written to
    ``<json_eng>/<name>.jsonl``. Directory entries whose extension is not
    ``.csv`` (case-insensitive) are skipped instead of being parsed as CSV.

    For the datasets listed in ``nested_datasets`` the file is written row by
    row: empty (NaN) cells are dropped and the remaining row dict is passed
    through ``unflatten`` (presumably rebuilding nested objects from the
    flattened column names -- confirm against the ``unflatten`` package).
    All other datasets are dumped directly with ``DataFrame.to_json``.

    Args:
        json_eng: Existing output directory for the ``.jsonl`` files.
        csv_eng: Input directory; its last path component is used as the
            dataset name.
    """
    nested_datasets = ('MultiRC', 'WSC', 'ReCoRD')

    # normpath removes a trailing separator and understands the platform's
    # separator, so the dataset name no longer depends on '/' being used.
    dataset = os.path.basename(os.path.normpath(csv_eng))
    is_nested = dataset in nested_datasets

    for file in os.listdir(csv_eng):
        stem, extension = os.path.splitext(file)
        if extension.lower() != '.csv':
            # e.g. .DS_Store, README files, editor backups
            continue

        source = os.path.join(csv_eng, file)
        save_to = os.path.join(json_eng, stem + '.jsonl')
        df = pd.read_csv(source, encoding='utf-8')

        if is_nested:
            with jsonlines.open(save_to, mode='w') as writer:
                for _, row in df.iterrows():
                    # NaN marks a cell that does not exist for this sample
                    writer.write(unflatten(row.dropna().to_dict()))
        else:
            # force_ascii=False keeps non-ASCII characters unescaped
            df.to_json(path_or_buf=save_to, orient='records', lines=True, force_ascii=False)

        # # test (if you want to compare)
        # org_json = os.path.join('combined-json-eng', dataset, stem + '.jsonl')
        # rev_json = save_to
        # test_jsonl(org_json, rev_json, file)
if __name__ == '__main__':
    # Command-line entry point: every sub-directory of --csv is treated as one
    # dataset and converted into a directory of the same name under --jsonl.
    parser = argparse.ArgumentParser(
        description='Format conversion tool'
    )
    parser.add_argument('--csv', required=True,
                        help='root directory with one sub-directory of CSV files per dataset')
    parser.add_argument('--jsonl', required=True,
                        help='root directory that receives the converted JSON Lines files')
    args = parser.parse_args()

    # variables
    CSV = args.csv
    JSON = args.jsonl

    for dataset in os.listdir(CSV):
        csv_eng = os.path.join(CSV, dataset)
        if not os.path.isdir(csv_eng):
            # A stray file in the root would make os.listdir() inside
            # csv2jsonl raise NotADirectoryError; skip it before any output
            # directory is created for it.
            continue
        print(dataset)
        json_eng = os.path.join(JSON, dataset)
        os.makedirs(json_eng, exist_ok=True)
        csv2jsonl(json_eng, csv_eng)