def prepare_ranking_file(src, tar, item_num, threshold=None, sampling_num=None, unified_num=False,
                         excluded_files=None):
    """
    Build a per-user ranking file from a jsonl file of ``[user, item, rating]`` records.

    Every line written to ``tar`` is a JSON array ``[user, like, unlabeled, dislike]``
    where the last three entries are lists of item ids.

    Parameters
    ----------
    src:
        Source opened with ``as_io``; each line is a JSON array ``[user, item, rating]``.
    tar:
        Target opened with ``as_out_io``; receives one JSON array per user.
    item_num: int
        Size of the item universe; candidate items are ``range(item_num)``.
    threshold: number or None
        When given, ratings ``<= threshold`` go to ``dislike`` and the others to ``like``.
        When ``None``, every rated item goes to ``like``.
    sampling_num: int or None
        Number of unlabeled items to sample per user. When falsy, all unlabeled
        items are kept.
    unified_num: bool
        When true, the number of sampled items is ``sampling_num`` minus the number of
        items the user already has, so that every user ends up with about
        ``sampling_num`` items in total.
    excluded_files:
        Optional source in the same ``[user, item, rating]`` jsonl format; the listed
        (user, item) pairs are never used as unlabeled items.

    Notes
    -----
    The requested sample size is clamped to ``[0, number of candidates]`` so that users
    with few remaining candidates (or, with ``unified_num``, users that already have more
    than ``sampling_num`` items) do not abort the whole run with ``ValueError``.
    """
    user_items = {}
    with as_io(src) as f:
        for line in tqdm(f, "preparing ranking file"):
            user, item, rating = json.loads(line)
            user = int(user)
            item = int(item)
            if user not in user_items:
                user_items[user] = [[], [], []]  # like, unlabeled, dislike
            rating = float(rating)
            if threshold is not None and rating <= threshold:
                pos = 2  # dislike
            else:
                pos = 0  # like
            user_items[user][pos].append(item)

    excluded_user_items = defaultdict(set)
    if excluded_files:
        with as_io(excluded_files) as f:
            for line in f:
                user, item, _ = json.loads(line)
                excluded_user_items[int(user)].add(int(item))

    # The item universe is the same for every user: build it once.
    all_items = set(range(item_num))
    no_exclusion = frozenset()

    for user, items in tqdm(user_items.items(), "sampling"):
        current_items = set(items[0]) | set(items[2]) | set(items[1])
        unlabeled = all_items - current_items - excluded_user_items.get(user, no_exclusion)
        if sampling_num:
            if unified_num:
                _sampling_num = sampling_num - len(current_items)
            else:
                _sampling_num = sampling_num
            # random.sample() no longer accepts a set (TypeError on Python 3.11+);
            # sorting also makes the draw reproducible for a fixed random seed.
            candidates = sorted(unlabeled)
            _sampling_num = max(0, min(_sampling_num, len(candidates)))
            items[1].extend(random.sample(candidates, _sampling_num))
        else:
            items[1].extend(unlabeled)

    with as_out_io(tar) as wf:
        for user, items in tqdm(user_items.items(), "write to %s" % tar):
            _data = [user] + items
            print(json.dumps(_data), file=wf)
def load_jsonl(src: PATH_IO_TYPE):
    """
    Lazily read a jsonl source, yielding one decoded JSON value per line.

    Examples
    --------
    Suppose demo.jsonl contains:

    .. code-block::

        {"a": 1}
        {"a": 2}

    .. code-block:: python

        for line in load_jsonl('demo.jsonl'):
            print(line)

    .. code-block::

        {"a": 1}
        {"a": 2}
    """
    with as_io(src) as f:
        # Decode lazily so only the current line is held in memory.
        yield from (json.loads(raw) for raw in f)
def load_csv(src: PATH_IO_TYPE, delimiter=",", **kwargs):
    """
    Lazily read a csv source, yielding one mapping per data row.

    The first line of the source is parsed as the header and supplies the keys.
    ``delimiter`` and ``**kwargs`` are forwarded to both ``csv.reader`` (for the
    header) and ``csv.DictReader`` (for the rows). Values are returned exactly as
    the csv module parses them, i.e. as strings unless ``kwargs`` say otherwise.

    Examples
    --------
    Suppose demo.csv contains:

    .. code-block::

        a,b,c
        1,2,3
        2,4,6

    .. code-block:: python

        for line in load_csv('demo.csv'):
            print(line)

    .. code-block::

        {'a': '1', 'b': '2', 'c': '3'}
        {'a': '2', 'b': '4', 'c': '6'}
    """
    with as_io(src) as f:
        # Parse the header line on its own so it uses the same dialect options as the rows.
        header_rows = list(csv.reader([f.readline()], delimiter=delimiter, **kwargs))
        field_names = header_rows[0]
        yield from csv.DictReader(f, field_names, delimiter=delimiter, **kwargs)
def load_file(src: PATH_IO_TYPE):
    """
    Lazily read raw text from a source, yielding each line unchanged.

    Examples
    --------
    Suppose demo.txt contains:

    .. code-block::

        hello
        world

    Read it with:

    .. code-block:: python

        for line in load_file('demo.txt'):
            print(line, end="")

    which prints:

    .. code-block::

        hello
        world
    """
    with as_io(src) as f:
        yield from f
def extract_eval(src):
    """
    Load a whole ranking file into memory.

    Each line of ``src`` must be a JSON array with exactly four entries,
    ``[user, like, unlabeled, dislike]``; the function returns the list of those
    four-element lists in file order.
    """
    with as_io(src) as f:
        # Unpacking into four names enforces the expected record width.
        return [
            [user, like, unlabeled, dislike]
            for user, like, unlabeled, dislike in map(json.loads, f)
        ]
def test_loading(tmpdir):
    """Check ``loading`` against csv/jsonl paths, a generator, a text path, an open handle and a callable."""
    csv_path = path_append(tmpdir, "test.csv")
    jsonl_path = path_append(tmpdir, "test.json")

    # Round-trip the demo data: text -> csv -> jsonl -> csv.
    text_to_csv(csv_path)
    csv2jsonl(csv_path, jsonl_path)
    jsonl2csv(jsonl_path, csv_path)

    expected_names = {0: "Tom", 1: "Jerry"}
    for source in (csv_path, jsonl_path, load_jsonl(jsonl_path)):
        for idx, record in enumerate(loading(source)):
            assert int(record["id"]) == idx, record
            if idx in expected_names:
                assert record["name"] == expected_names[idx], record

    # Plain text: first addressed by path, then by an already opened handle.
    text_path = path_append(tmpdir, "test")
    with as_out_io(text_path) as wf:
        print(DEMO_TEXT.strip(), file=wf)
    assert [row.strip() for row in loading(text_path)] == DEMO_TEXT.strip().split("\n")
    with as_io(text_path) as f:
        assert [row.strip() for row in loading(f)] == DEMO_TEXT.strip().split("\n")

    # A callable source.
    assert "hello world" == loading(lambda: "hello world")
def extract(data_src, threshold=3):
    """
    Read ``[user, item, rating]`` jsonl records and binarize the rating.

    Parameters
    ----------
    data_src:
        Source opened with ``as_io``; each line is a JSON array
        ``[user_id, item_id, rating]``.
    threshold: int
        Ratings whose integer value is ``<= threshold`` are labelled ``0``, the
        others ``1``. Defaults to ``3``, the previously hard-coded cut-off.

    Returns
    -------
    list of [int, int, int]
        One ``[user_id, item_id, label]`` entry per input line, in file order.
    """
    user_item_rating = []
    with as_io(data_src) as f:
        for line in tqdm(f, "extracting file"):
            user_id, item_id, rating = json.loads(line)
            user_id = int(user_id)
            item_id = int(item_id)
            # The rating is truncated with int() before the comparison, as before.
            label = 0 if int(rating) <= threshold else 1
            user_item_rating.append([user_id, item_id, label])
    return user_item_rating
def load_ks_from_csv(edges):
    """Lazily yield each row of the comma-separated source ``edges`` as a list of strings."""
    with as_io(edges) as f:
        yield from csv.reader(f, delimiter=",")
def movielens(src, tar, separator):
    """
    Convert a four-column separated text file into jsonl.

    Each input line is split on ``separator`` into exactly four fields; the first
    three (user, item, rating) are converted to ``int`` and written to ``tar`` as a
    JSON array, while the fourth field is discarded.
    """
    with as_io(src) as f, as_out_io(tar) as wf:
        progress = tqdm(f, "reformatting from %s to %s" % (src, tar))
        for raw in progress:
            fields = raw.strip().split(separator)
            user, item, rating, _ = fields
            record = [int(user), int(item), int(rating)]
            print(json.dumps(record), file=wf)
def iter_from_file():
    """Lazily yield the decoded JSON value of every line in ``filename`` (taken from the enclosing scope)."""
    with as_io(filename) as f:
        yield from map(json.loads, f)