def __init__(self, config, mode, encoding="utf8", *args, **params):
    """Load every JSON-lines record for ``mode`` into ``self.data``.

    Args:
        config: ConfigParser-style object. The ``data`` section is read for
            ``<mode>_data_path``, ``<mode>_file_list`` (comma-separated,
            spaces are stripped), ``reduce`` and, when reduction is active,
            ``reduce_ratio``.
        mode: Split name used to build the config keys. ``"train"``
            additionally shuffles the records and allows ``reduce``.
        encoding: Text encoding used to open each data file.
        *args: Ignored.
        **params: Ignored.
    """
    self.config = config
    self.mode = mode
    self.file_list = []
    self.data_path = config.get("data", "%s_data_path" % mode)
    self.encoding = encoding

    filename_list = config.get("data", "%s_file_list" % mode).replace(
        " ", "").split(",")
    recursive = False  # this loader never searches sub-directories

    # dfs_search is a project helper; it is expected to return a list of
    # file paths for the given entry.
    for name in filename_list:
        self.file_list.extend(
            dfs_search(os.path.join(self.data_path, name), recursive))
    self.file_list.sort()

    self.data = []
    for filename in self.file_list:
        # The context manager closes the handle even if a line fails to
        # parse; the previous code never closed these files.
        with open(filename, "r", encoding=encoding) as handle:
            self.data.extend(json.loads(line) for line in handle)

    if mode == "train":
        random.shuffle(self.data)

    # "reduce" is always read from the config but only honoured in training.
    self.reduce = config.getboolean("data", "reduce")
    if mode != "train":
        self.reduce = False
    if self.reduce:
        self.reduce_ratio = config.getfloat("data", "reduce_ratio")
def __init__(self, config, mode, encoding="utf8", *args, **params):
    """Count the records accepted by ``self.check`` across all data files.

    The count is stored in ``self.total``; ``self.init_zero()`` is called
    once counting has finished.

    Args:
        config: ConfigParser-style object. The ``data`` section is read for
            ``<mode>_data_path``, ``<mode>_file_list`` (comma-separated,
            spaces are stripped), ``recursive`` and ``json_format``.
        mode: Split name used to build the config keys.
        encoding: Text encoding used to open each data file.
        *args: Ignored.
        **params: Ignored.
    """
    self.config = config
    self.mode = mode
    self.file_list = []
    self.data_path = config.get("data", "%s_data_path" % mode)
    self.encoding = encoding

    filename_list = config.get("data", "%s_file_list" % mode).replace(
        " ", "").split(",")
    recursive = config.getboolean("data", "recursive")

    # dfs_search is a project helper; it is expected to return a list of
    # file paths for the given entry.
    for name in filename_list:
        self.file_list.extend(
            dfs_search(os.path.join(self.data_path, name), recursive))
    self.file_list.sort()

    self.json_format = config.get("data", "json_format")

    self.total = 0
    for filename in self.file_list:
        # "with" guarantees the handle is released in both formats; the
        # "single" branch used to leak it and the line branch skipped
        # close() whenever parsing or self.check raised.
        with open(filename, "r", encoding=encoding) as handle:
            if self.json_format == "single":
                # The whole file is one JSON document (expected: an array).
                records = json.load(handle)
            else:
                # Any other format value: one JSON document per line,
                # parsed lazily while the file is still open.
                records = (json.loads(line) for line in handle)
            for record in records:
                if self.check(record):
                    self.total += 1

    self.init_zero()
def __init__(self, config, mode, encoding="utf8", *args, **params):
    """Load all records into ``self.data`` and then apply ``self.filter_data()``.

    The number of records left in ``self.data`` after filtering is printed.

    Args:
        config: ConfigParser-style object. The ``data`` section is read for
            ``<mode>_data_path``, ``<mode>_file_list`` (comma-separated,
            spaces are stripped), ``recursive``, ``load_into_mem`` and
            ``json_format``.
        mode: Split name used to build the config keys.
        encoding: Text encoding used to open each data file.
        *args: Ignored.
        **params: Ignored.
    """
    self.config = config
    self.mode = mode
    self.file_list = []
    self.data_path = config.get("data", "%s_data_path" % mode)
    self.encoding = encoding

    filename_list = config.get("data", "%s_file_list" % mode).replace(
        " ", "").split(",")
    recursive = config.getboolean("data", "recursive")

    # dfs_search is a project helper; it is expected to return a list of
    # file paths for the given entry.
    for name in filename_list:
        self.file_list.extend(
            dfs_search(os.path.join(self.data_path, name), recursive))
    self.file_list.sort()

    # Stored on the instance but not consulted here: this constructor
    # always loads everything into memory.
    self.load_mem = config.getboolean("data", "load_into_mem")
    self.json_format = config.get("data", "json_format")

    self.data = []
    for filename in self.file_list:
        # The context manager closes the handle in both formats; the
        # previous code left every file open.
        with open(filename, "r", encoding=encoding) as handle:
            if self.json_format == "single":
                # Whole file is one JSON document, expected to be an array.
                self.data.extend(json.load(handle))
            else:
                # Any other format value: one JSON document per line.
                self.data.extend(json.loads(line) for line in handle)

    self.filter_data()
    print(len(self.data))
def __init__(self, config, mode, encoding="utf8", *args, **params):
    """Index the data files, either loading them fully or preparing lazy access.

    When ``load_into_mem`` is true every record is stored in ``self.data``.
    Otherwise only bookkeeping is built: ``self.prefix_file_cnt`` holds the
    running (cumulative) record count per file and ``self.total`` the
    overall count.

    Args:
        config: ConfigParser-style object. The ``data`` section is read for
            ``<mode>_data_path``, ``<mode>_file_list`` (comma-separated,
            spaces are stripped), ``recursive``, ``load_into_mem`` and
            ``json_format``.
        mode: Split name used to build the config keys.
        encoding: Text encoding used to open each data file.
        *args: Ignored.
        **params: Ignored.

    Raises:
        IndexError: In lazy mode when no data file was found (the first
            file and the last prefix count are accessed unconditionally).
    """
    self.config = config
    self.mode = mode
    self.file_list = []
    self.data_path = config.get("data", "%s_data_path" % mode)
    self.encoding = encoding

    filename_list = config.get("data", "%s_file_list" % mode).replace(" ", "").split(",")
    recursive = config.getboolean("data", "recursive")

    # dfs_search is a project helper; it is expected to return a list of
    # file paths for the given entry.
    for name in filename_list:
        self.file_list.extend(dfs_search(os.path.join(self.data_path, name), recursive))
    self.file_list.sort()

    self.load_mem = config.getboolean("data", "load_into_mem")
    self.json_format = config.get("data", "json_format")

    if self.load_mem:
        self.data = []
        for filename in self.file_list:
            # Handles are now closed deterministically in both formats.
            with open(filename, "r", encoding=encoding) as handle:
                if self.json_format == "single":
                    # Whole file is one JSON document, expected to be an array.
                    self.data.extend(json.load(handle))
                else:
                    # Any other format value: one JSON document per line.
                    self.data.extend(json.loads(line) for line in handle)
    else:
        self.total = 0
        self.prefix_file_cnt = []

        if self.json_format == "single":
            # Keep the first file's content parsed, tagged with its index.
            with open(self.file_list[0], "r", encoding=encoding) as handle:
                first_content = json.load(handle)
            self.temp_data = {
                "data": first_content,
                "file_id": 0
            }
        else:
            self.temp_file_list = []

        for filename in self.file_list:
            if self.json_format == "single":
                with open(filename, "r", encoding=encoding) as handle:
                    self.prefix_file_cnt.append(len(json.load(handle)))
            else:
                # First pass only counts the lines of the file.
                with open(filename, "r", encoding=encoding) as handle:
                    cnt = sum(1 for _ in handle)
                # This second handle is stored open on purpose so it can be
                # read later. NOTE(review): nothing here closes it, and
                # "cnt" presumably tracks the lines consumed so far -
                # confirm against the code that reads temp_file_list.
                self.temp_file_list.append({
                    "file": open(filename, "r", encoding=encoding),
                    "cnt": 0
                })
                self.prefix_file_cnt.append(cnt)

        # Turn per-file counts into cumulative counts, in place.
        for a in range(1, len(self.prefix_file_cnt)):
            self.prefix_file_cnt[a] += self.prefix_file_cnt[a - 1]
        self.total = self.prefix_file_cnt[-1]
def __init__(self, config, mode, *args, **params):
    """Collect and sort the files listed for ``mode`` into ``self.file_list``.

    Args:
        config: ConfigParser-style object. The ``data`` section is read for
            ``<mode>_data_path``, ``<mode>_file_list`` (comma-separated,
            spaces are stripped) and ``recursive``.
        mode: Split name used to build the config keys.
        *args: Ignored.
        **params: Ignored.
    """
    self.config = config
    self.mode = mode
    self.file_list = []
    # NOTE(review): data_path is stored but not joined onto the entries
    # below - confirm the listed entries are meant to be usable paths as-is.
    self.data_path = config.get("data", "%s_data_path" % mode)

    raw_entries = config.get("data", "%s_file_list" % mode)
    entries = raw_entries.replace(" ", "").split(",")
    recursive = config.getboolean("data", "recursive")

    # dfs_search is a project helper; each result is concatenated onto the
    # list collected so far.
    for entry in entries:
        matches = dfs_search(entry, recursive)
        self.file_list = self.file_list + matches

    self.file_list.sort()
def __init__(self, config, mode, encoding="utf8", *args, **params):
    """Load every JSON file found for ``mode`` into ``self.data``.

    Each file is parsed as a single JSON document (expected to be an array)
    and its items are added to ``self.data`` in sorted file order.

    Args:
        config: ConfigParser-style object. The ``data`` section is read for
            ``<mode>_data_path`` and ``<mode>_file_list`` (comma-separated,
            spaces are stripped).
        mode: Split name used to build the config keys.
        encoding: Text encoding used to open each data file.
        *args: Ignored.
        **params: Ignored.
    """
    self.config = config
    self.mode = mode
    self.file_list = []
    self.data_path = config.get("data", "%s_data_path" % mode)
    self.encoding = encoding

    filename_list = config.get("data", "%s_file_list" % mode).replace(" ", "").split(",")

    # dfs_search is a project helper, always called here with its second
    # argument set to True; it is expected to return a list of file paths.
    for name in filename_list:
        self.file_list.extend(dfs_search(os.path.join(self.data_path, name), True))
    self.file_list.sort()

    self.data = []
    for filename in self.file_list:
        # Close each file as soon as it is parsed; json.load(open(...))
        # used to leave every handle open.
        with open(filename, "r", encoding=encoding) as handle:
            self.data.extend(json.load(handle))
def __init__(self, config, mode, encoding="utf8", *args, **params):
    """Load JSON-lines records for ``mode``, cleaning answers outside test mode.

    In ``"test"`` mode the file list from the config is ignored and the
    fixed entry ``/input/`` is searched instead; records are stored as
    parsed. In every other mode the ``"answer"`` list of each record has
    the ``"。"`` entries removed before the record is stored.

    Args:
        config: ConfigParser-style object. The ``data`` section is read for
            ``<mode>_data_path``, ``<mode>_file_list`` (comma-separated,
            spaces are stripped; not read in test mode), ``reduce`` and,
            when reduction is active, ``reduce_ratio``.
        mode: Split name used to build the config keys. ``"train"``
            additionally shuffles the records and allows ``reduce``.
        encoding: Text encoding used to open each data file.
        *args: Ignored.
        **params: Ignored.
    """
    self.config = config
    self.mode = mode
    self.file_list = []
    self.data_path = config.get("data", "%s_data_path" % mode)
    self.encoding = encoding
    # self.siglemulti = SingleMulti('gbt/statement_tfidf.model', 'gbt/statement_som_gbt.model')

    if mode != "test":
        filename_list = config.get("data", "%s_file_list" % mode).replace(
            " ", "").split(",")
    else:
        # Must be a list: iterating the bare string "/input/" searched one
        # path per character instead of the directory itself. Because the
        # entry is absolute, os.path.join below discards data_path.
        filename_list = ["/input/"]
    recursive = False  # this loader never searches sub-directories

    # dfs_search is a project helper; it is expected to return a list of
    # file paths for the given entry.
    for name in filename_list:
        self.file_list.extend(
            dfs_search(os.path.join(self.data_path, name), recursive))
    self.file_list.sort()

    self.data = []
    for filename in self.file_list:
        # The context manager closes the handle; it was never closed before.
        with open(filename, "r", encoding=encoding) as handle:
            for line in handle:
                data = json.loads(line)
                if mode != "test":
                    # Clean up answers. (No single/multiple-option filtering
                    # is performed here.)
                    data["answer"] = [a for a in data["answer"] if a != "。"]
                # Store the record that was just parsed (and cleaned);
                # appending a fresh json.loads(line) threw the cleanup away.
                self.data.append(data)

    if mode == "train":
        random.shuffle(self.data)

    # "reduce" is always read from the config but only honoured in training.
    self.reduce = config.getboolean("data", "reduce")
    if mode != "train":
        self.reduce = False
    if self.reduce:
        self.reduce_ratio = config.getfloat("data", "reduce_ratio")
import json
import os

import pandas

from tools.dataset_tool import dfs_search

# Build a (statement, option text, is-correct) table from the training
# JSON-lines files found under data_path.
data_path = "../input/"
recursive = False

# Keep only the paths that contain "train", in sorted order.
file_list = [
    path
    for path in dfs_search(os.path.join(data_path, ''), recursive)
    if 'train' in path
]
file_list.sort()

rawinput = []
for filename in file_list:
    # The context manager closes each file; handles used to be left open.
    with open(filename, "r", encoding='utf8') as f:
        for line in f:
            data = json.loads(line)
            # Clean up answers. (No single/multiple-option filtering is
            # performed here.)
            data["answer"] = [a for a in data["answer"] if a != "。"]
            # Keep the cleaned record; appending a fresh json.loads(line)
            # discarded the cleanup above.
            rawinput.append(data)

# Collect one row per (record, option) and build the frame once.
# DataFrame.append copied the whole frame on every call and no longer
# exists in pandas 2.x.
rows = []
for item in rawinput:
    for option in "ABCD":
        rows.append({
            "q": item['statement'],
            "a": item['option_list'][option],
            "r": option in item['answer'],
        })
df = pandas.DataFrame(rows, columns=["q", "a", "r"])