def __gen_aspect_noun_filter_dict_file(sents_file, tok_texts_file, pos_tags_file, common_words_file, dst_file):
    sents = utils.load_json_objs(sents_file)
    tok_texts = utils.read_lines(tok_texts_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)

    # Count how often each rule-extracted noun phrase occurs (sys_cnt) and
    # how often it matches a gold aspect term (hit_cnt).
    term_sys_cnts, term_hit_cnts = dict(), dict()
    for sent, tok_text, pos_tags in zip(sents, tok_texts, pos_tags_list):
        sent_words = tok_text.split(' ')
        noun_phrases = rules.rec_rule1(sent_words, pos_tags, None)
        term_objs = sent.get('terms', list())
        terms_true = {term_obj['term'].lower() for term_obj in term_objs}
        for n in noun_phrases:
            term_sys_cnts[n] = term_sys_cnts.get(n, 0) + 1
            if n in terms_true:
                term_hit_cnts[n] = term_hit_cnts.get(n, 0) + 1

    # Filter out common words and any term whose hit rate is below 0.4.
    common_words = utils.read_lines(common_words_file)
    filter_terms = set(common_words)
    for term, sys_cnt in term_sys_cnts.items():
        hit_cnt = term_hit_cnts.get(term, 0)
        if hit_cnt / sys_cnt < 0.4:
            filter_terms.add(term)

    with open(dst_file, 'w', encoding='utf-8', newline='\n') as fout:
        for t in filter_terms:
            fout.write('{}\n'.format(t))

def __run_with_mined_rules(mine_helper, rule_patterns_file, term_hit_rate_file, dep_tags_file, pos_tags_file,
                           sent_texts_file, filter_terms_vocab_file, term_hit_rate_thres=0.6,
                           dst_result_file=None, sents_file=None):
    l1_rules, l2_rules = rulescommon.load_rule_patterns_file(rule_patterns_file)
    term_vocab = rulescommon.get_term_vocab(term_hit_rate_file, term_hit_rate_thres)
    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sent_texts = utils.read_lines(sent_texts_file)
    filter_terms_vocab = set(utils.read_lines(filter_terms_vocab_file))

    terms_sys_list = list()
    for sent_idx, (dep_tag_seq, pos_tag_seq, sent_text) in enumerate(
            zip(dep_tags_list, pos_tags_list, sent_texts)):
        terms = set()
        # Apply the mined L1 patterns first.
        l1_terms_new = set()
        for p in l1_rules:
            terms_new = rulescommon.find_terms_by_l1_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_helper, filter_terms_vocab)
            terms.update(terms_new)
            l1_terms_new.update(terms_new)
        # L2 patterns may build on the terms found by the L1 patterns.
        for p in l2_rules:
            terms_new = rulescommon.find_terms_by_l2_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_helper, filter_terms_vocab, l1_terms_new)
            terms.update(terms_new)
        # Add direct matches against the mined term vocabulary.
        terms_new = mine_helper.get_terms_by_matching(dep_tag_seq, pos_tag_seq, sent_text, term_vocab)
        terms.update(terms_new)
        terms_sys_list.append(terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if dst_result_file is not None:
        __write_rule_results(terms_sys_list, sent_texts, dst_result_file)

    if sents_file is not None:
        sents = utils.load_json_objs(sents_file)
        terms_list_true = mine_helper.terms_list_from_sents(sents)
        sent_texts = [sent['text'] for sent in sents]
        __evaluate(terms_sys_list, terms_list_true, dep_tags_list, pos_tags_list, sent_texts)

def __load_data(dep_tags_file, pos_tags_file, sents_file, train_valid_split_file):
    tvs_line = utils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]
    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sents = utils.load_json_objs(sents_file)
    assert len(tvs_arr) == len(dep_tags_list)

    dep_tags_list_train, dep_tags_list_valid = list(), list()
    pos_tags_list_train, pos_tags_list_valid = list(), list()
    sents_train, sents_valid = list(), list()
    for tvs_label, dep_tags, pos_tags, sent in zip(tvs_arr, dep_tags_list, pos_tags_list, sents):
        if tvs_label == 0:
            dep_tags_list_train.append(dep_tags)
            pos_tags_list_train.append(pos_tags)
            sents_train.append(sent)
        else:
            dep_tags_list_valid.append(dep_tags)
            pos_tags_list_valid.append(pos_tags)
            sents_valid.append(sent)

    data_train = RuleMineData(dep_tags_list_train, pos_tags_list_train, sents_train)
    data_valid = RuleMineData(dep_tags_list_valid, pos_tags_list_valid, sents_valid)
    return data_train, data_valid

def solution(file): print("Input: ", file) lines = ut.read_lines(file) i = 0 fields = [] while len(lines[i]) > 0: m = re.match('^(.*):\s(\d+)-(\d+)\sor\s(\d+)-(\d+)$', lines[i]) if m == None: break field = [m.group(1)] + [int(m.group(x)) for x in range(2, 6)] fields.append(field) i += 1 i += 1 ticket = [int(x) for x in lines[i].split(',')] i += 2 tickets = [] while i < len(lines) > 0: tickets.append([int(x) for x in lines[i].split(',')]) i += 1 p1 = part1(fields, ticket, tickets) print("Part1: ", p1) p2 = part2(fields, ticket, tickets) print("Part2: ", p2)
def solution(file):
    print("Input: ", file)
    data = ut.read_lines(file, True)

    # Parse rules such as '0: 4 1 5', '1: 2 3 | 3 2' and '4: "a"'.
    # A rule is stored as [0, char] for a literal, or [1, alternatives].
    rules = {}
    for i, line in enumerate(data):
        if len(line.strip()) == 0:
            break
        r = line.split(':')
        if len(r[1]) == 4:
            # Literal rule like '4: "a"': keep just the character.
            rules[int(r[0])] = [0, r[1][2]]
        else:
            rules[int(r[0])] = [1, [x.strip() for x in r[1].split('|')]]

    messages = data[i + 1:]  # everything after the blank line
    memo = {}
    p1 = part1(rules, memo, messages)
    print("Part1: ", p1)
    p2 = part2(rules, memo, messages)
    print("Part2: ", p2)

def read_sents_to_word_idx_seqs(tok_texts_file, word_idx_dict):
    texts = utils.read_lines(tok_texts_file)
    word_idx_seq_list = list()
    for sent_text in texts:
        words = sent_text.strip().split(' ')
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])
    return word_idx_seq_list

def load_train_data_bert_ol(sents_file, train_valid_split_file, valid_bert_tokens_file):
    from utils.utils import read_lines

    aspect_terms_list, opinion_terms_list = datautils.load_terms_list(sents_file, True)
    tvs_line = read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    token_seqs_valid = datautils.read_tokens_file(valid_bert_tokens_file)
    aspect_terms_list_train, aspect_terms_list_valid = list(), list()
    opinion_terms_list_train, opinion_terms_list_valid = list(), list()
    assert len(tvs_arr) == len(aspect_terms_list)
    for i, tvs_label in enumerate(tvs_arr):
        if tvs_label == 0:
            aspect_terms_list_train.append(aspect_terms_list[i])
            opinion_terms_list_train.append(opinion_terms_list[i])
        else:
            aspect_terms_list_valid.append(aspect_terms_list[i])
            opinion_terms_list_valid.append(opinion_terms_list[i])

    data_valid = ValidDataBertOL(token_seqs_valid, aspect_terms_list_valid, opinion_terms_list_valid)
    return len(aspect_terms_list_train), data_valid

def solution(file):
    print("Input: ", file)
    data = ut.read_lines(file, True)

    # Parse 'Tile NNNN:' headers followed by ten rows of '.'/'#' pixels,
    # storing each tile's pixels as a flat list of '0'/'1' characters.
    tiles = []
    i = 0
    while i < len(data):
        line = data[i]
        if line.startswith('Tile'):
            tile = Tile()
            tile.bits = []
            tile.id = int(line.split()[1][0:-1])  # strip the trailing ':'
            for j in range(10):
                i += 1
                line = data[i]
                tile.bits += list(line.replace('.', '0').replace('#', '1'))
            tiles.append(tile)
        i += 1

    p1 = part1(tiles)
    print("Part1: ", p1)
    p2 = part2(tiles)

def solution(file):
    print("Input:", file)
    data = ut.read_lines(file, True)

    # Tokenize each line into hex-grid moves; the two-character directions
    # must be checked before the one-character 'w' and 'e'.
    paths = []
    for d in data:
        i = 0
        dirs = []
        while i < len(d):
            if d[i:i+2] in ('nw', 'ne', 'sw', 'se'):
                dirs.append(d[i:i+2])
                i += 2
            else:
                dirs.append(d[i])
                i += 1
        paths.append(dirs)

    p1 = part1(paths)
    part2(p1[0], p1[1])

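# An equivalent, more compact tokenizer for the same move strings; a sketch,
# not the code used above. Regex alternation tries branches left to right, so
# listing the two-character directions before 'w' and 'e' preserves the greedy
# matching of the loop in solution().
import re

def parse_dirs(line):
    """Split e.g. 'nwwswee' into ['nw', 'w', 'sw', 'e', 'e']."""
    return re.findall(r'nw|ne|sw|se|w|e', line)
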
def solution(file):
    print("Input: ", file)
    lines = ut.read_lines(file)
    p1 = part1b(lines, 6)
    print("Part1: ", p1)
    p2 = part2b(lines, 6)
    print("Part2: ", p2)

def __rule_result_differ():
    idxs_rule = utils.read_lines('d:/data/aspect/semeval14/rules-correct.txt')
    idxs_neu = utils.read_lines('d:/data/aspect/semeval14/lstmcrf-correct.txt')
    idxs_rule = [int(idx) for idx in idxs_rule]
    idxs_neu = [int(idx) for idx in idxs_neu]
    print(idxs_rule)
    print(idxs_neu)

    idxs_rule_only = [i for i in idxs_rule if i not in idxs_neu]
    idxs_neu_only = [i for i in idxs_neu if i not in idxs_rule]

    print(idxs_rule_only)
    print(len(idxs_rule_only))
    print(idxs_neu_only)
    print(len(idxs_neu_only))

def solution(file, do1, do2):
    print("Input: ", file)
    data = ut.read_lines(file)
    if do1:
        p1 = part1(data)
        print("Part1: ", p1)
    if do2:
        p2 = part2(data)
        print("Part2: ", p2)

def gen_train_valid_sample_idxs_file(tok_texts_file, n_valid_samples, output_file):
    tok_texts = utils.read_lines(tok_texts_file)
    n_samples = len(tok_texts)
    # Fixed seed so the same split can be reproduced.
    np.random.seed(3719)
    perm = np.random.permutation(n_samples)
    n_train = n_samples - n_valid_samples
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]
    with open(output_file, 'w', encoding='utf-8') as fout:
        fout.write('{}\n'.format(' '.join([str(idx) for idx in idxs_train])))
        fout.write('{}\n'.format(' '.join([str(idx) for idx in idxs_valid])))

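# A minimal sketch of reading the file written above: line one holds the
# space-separated training indices, line two the validation indices. The
# helper name load_train_valid_idxs is hypothetical, not from the codebase.
def load_train_valid_idxs(sample_idxs_file):
    with open(sample_idxs_file, encoding='utf-8') as f:
        idxs_train = [int(v) for v in f.readline().split()]
        idxs_valid = [int(v) for v in f.readline().split()]
    return idxs_train, idxs_valid
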
def solution(file):
    print("Input: ", file)
    lines = ut.read_lines(file)
    nums = [int(x) for x in lines]
    p1 = part1(nums)
    print("Part1: ", p1)
    p2 = part2(nums)
    print("Part2: ", p2)

def __merge_train_test(train_sents_file, test_sents_file, train_valid_split_file, dst_sents_file, dst_datasplit_file):
    train_sents = utils.load_json_objs(train_sents_file)
    test_sents = utils.load_json_objs(test_sents_file)
    all_sents = train_sents + test_sents
    utils.save_json_objs(all_sents, dst_sents_file)

    # Extend the 0/1 train/valid labels with label 2 for the test sentences.
    train_valid_split_labels = utils.read_lines(train_valid_split_file)[0]
    train_valid_split_labels = [int(v) for v in train_valid_split_labels.split(' ')]
    all_data_split_labels = train_valid_split_labels + [2 for _ in range(len(test_sents))]
    with open(dst_datasplit_file, 'w', encoding='utf-8') as fout:
        fout.write('{}\n'.format(' '.join([str(v) for v in all_data_split_labels])))

def __get_manual_feat(tok_texts_file, terms_file):
    tok_texts = utils.read_lines(tok_texts_file)
    terms_list = utils.load_json_objs(terms_file)
    feat_list = list()
    for terms_true, tok_text in zip(terms_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = modelutils.label_sentence(words, terms_true)
        # One-hot encode each token's label (values 0, 1 or 2).
        feat_seq = np.zeros([len(label_seq), 3], np.int32)
        for i, v in enumerate(label_seq):
            feat_seq[i][v] = 1
        feat_list.append(feat_seq)
    return feat_list

def solution(file):
    print("Input: ", file)
    lines = ut.read_lines(file)
    dirs = []
    for line in lines:
        dirs.append([line[0], int(line[1:])])
    p1 = part1(dirs)
    print("Part1: ", p1)
    p2 = part2(dirs)
    print("Part2: ", p2)

def main():
    if len(sys.argv) < 2:
        print('Provide input file name')
        exit(-1)
    if len(sys.argv) < 3:
        print('Provide preamble length')
        exit(-1)
    lines = ut.read_lines(sys.argv[1])
    nums = [int(x) for x in lines]
    value = part1(nums, int(sys.argv[2]))
    part2(nums, value)

def solution(file):
    print("Input: ", file)
    lines = ut.read_lines(file)
    start = int(lines[0])
    buses = lines[1].split(',')
    p1 = part1(start, buses)
    print("Part1: ", p1)
    # Earlier attempts: part2(buses) and part2_crt(buses).
    p2 = part2_again(buses)
    print("Part2: ", p2)

def solution(file):
    print("Input: ", file)
    lines = ut.read_lines(file)

    # Pad the grid with a border of '.' so neighbour checks need no
    # bounds handling.
    rows = []
    rows.append(list('.' * (len(lines[0]) + 2)))
    for line in lines:
        rows.append(list('.' + line + '.'))
    rows.append(list('.' * (len(lines[0]) + 2)))

    p1 = part1(rows)
    print("Part1: ", p1)
    p2 = part2(rows)
    print("Part2: ", p2)

def main():
    if len(sys.argv) < 2:
        print('Provide input file name')
        exit(-1)
    lines = ut.read_lines(sys.argv[1])

    # Parse instructions such as 'acc +7' into ['acc', 7, False].
    prog = []
    for line in lines:
        m = re.match(r'^(\w+)\s+(.)(\d+)$', line)
        sign = 1 if m.group(2) == '+' else -1
        inst = [m.group(1), sign * int(m.group(3)), False]
        prog.append(inst)

    # Copy each instruction list so part1 can mutate them without
    # touching prog.
    part1([p[:] for p in prog])

def solution(file):
    print("Input:", file)
    data = ut.read_lines(file, True)

    # Parse lines such as 'mxmxvkd kfcds (contains dairy, fish)' into Food
    # objects with a set of ingredients and a set of allergens. The xxx_/eng_
    # prefixes below distinguish the unknown-language ingredient names from
    # the English allergen names.
    foods = []
    for line in data:
        temp = line.replace('(contains', ':').replace(',', ' ').replace(')', ' ').strip().split(':')
        food = Food()
        food.ings = set(temp[0].split())
        food.alls = set(temp[1].split())
        foods.append(food)

    xxx_ingredients = set.union(*[f.ings for f in foods])
    eng_allergens = set.union(*[f.alls for f in foods])
    # Each allergen can only be in ingredients common to all foods listing it.
    eng_all_to_xxx_ings = {
        a: set.intersection(*[f.ings for f in foods if a in f.alls])
        for a in eng_allergens
    }
    xxx_allergens = set.union(*[eng_all_to_xxx_ings[a] for a in eng_allergens])

    # Part 1: ingredients that cannot contain any allergen.
    safe = xxx_ingredients.difference(xxx_allergens)
    print('Part1:', sum([len(safe.intersection(f.ings)) for f in foods]))

    # Part 2: map each allergen to its single ingredient.
    eng_all_to_ing = {}
    while True:
        # Find allergens whose candidate set has exactly one ingredient left.
        pairs = {a: i for (a, i) in eng_all_to_xxx_ings.items() if len(i) == 1}
        for p in pairs:
            eng_all_to_ing[p] = list(pairs[p])[0]
        # No singleton left: either we're done, or the remaining constraints
        # can't be resolved.
        if len(pairs) == 0:
            break
        # Remove the resolved ingredients from the remaining candidate sets.
        ings = set.union(*[pairs[a] for a in pairs])
        eng_all_to_xxx_ings = {
            a: i.difference(ings) for (a, i) in eng_all_to_xxx_ings.items()
        }

    print('Part2:', ','.join([i for i in sorted(eng_all_to_ing)]))
    print('Part2:', ','.join([eng_all_to_ing[i] for i in sorted(eng_all_to_ing)]))

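# The while-loop above is iterative constraint elimination: repeatedly pin
# down each allergen with exactly one candidate ingredient, then remove that
# ingredient from every other allergen's candidate set. A minimal standalone
# sketch of the same idea with made-up data; resolve() is hypothetical, not
# part of the solution above.
def resolve(candidates):
    """candidates maps each key to a set of possible values; returns a
    key -> value dict for everything that can be pinned down uniquely."""
    resolved = {}
    while candidates:
        singles = {k: next(iter(v)) for k, v in candidates.items() if len(v) == 1}
        if not singles:
            break  # no singleton left; remaining constraints are ambiguous
        resolved.update(singles)
        taken = set(singles.values())
        candidates = {k: v - taken for k, v in candidates.items() if k not in singles}
    return resolved

# resolve({'dairy': {'mxmxvkd'}, 'fish': {'mxmxvkd', 'sqjhc'}})
# -> {'dairy': 'mxmxvkd', 'fish': 'sqjhc'}
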
def __opinion_rule_insight(dep_tags_file, pos_tags_file, sent_text_file, terms_vocab, dst_result_file=None, sents_file=None):
    print('loading data ...')
    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sent_texts = utils.read_lines(sent_text_file)
    assert len(dep_tags_list) == len(sent_texts)
    assert len(pos_tags_list) == len(dep_tags_list)
    print('done.')

    opinions_sys_list = list()
    for sent_idx, sent_text in enumerate(sent_texts):
        dep_tags = dep_tags_list[sent_idx]
        pos_tags = pos_tags_list[sent_idx]
        assert len(dep_tags) == len(pos_tags)

        # Only rule2 and __match_terms were used for pretraining; rule1 and
        # rule4 were tried and dropped.
        opinion_terms = set()
        terms_new = opinionrules.rule2(dep_tags, pos_tags)
        opinion_terms.update(terms_new)
        terms_new = __match_terms(sent_text, terms_vocab)
        opinion_terms.update(terms_new)
        opinions_sys_list.append(opinion_terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if dst_result_file is not None:
        __write_rule_results(opinions_sys_list, sent_texts, dst_result_file)

    if sents_file is not None:
        sents = utils.load_json_objs(sents_file)
        opinions_true_list = list()
        for sent in sents:
            opinions_true_list.append([t.lower() for t in sent.get('opinions', list())])
        __evaluate(opinions_sys_list, opinions_true_list, dep_tags_list, pos_tags_list, sent_texts)

def load_train_data_bert(bert_embed_file, sents_file, train_valid_split_file):
    from utils.utils import read_lines

    token_seqs, token_embed_seqs = __load_bert_embed_data(bert_embed_file)
    aspect_terms_list, opinion_terms_list = datautils.load_terms_list(sents_file, True)
    tvs_line = read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    token_seqs_train, token_seqs_valid = list(), list()
    token_embed_seqs_train, token_embed_seqs_valid = list(), list()
    aspect_terms_list_train, aspect_terms_list_valid = list(), list()
    opinion_terms_list_train, opinion_terms_list_valid = list(), list()
    assert len(tvs_arr) == len(token_seqs)
    for i, tvs_label in enumerate(tvs_arr):
        if tvs_label == 0:
            token_seqs_train.append(token_seqs[i])
            token_embed_seqs_train.append(token_embed_seqs[i])
            aspect_terms_list_train.append(aspect_terms_list[i])
            opinion_terms_list_train.append(opinion_terms_list[i])
        else:
            token_seqs_valid.append(token_seqs[i])
            token_embed_seqs_valid.append(token_embed_seqs[i])
            aspect_terms_list_valid.append(aspect_terms_list[i])
            opinion_terms_list_valid.append(opinion_terms_list[i])

    # Label the training token sequences, counting aspect terms that could
    # not be aligned with any token.
    cnt_miss = 0
    label_seqs_train = list()
    for i, (aspect_terms, opinion_terms) in enumerate(
            zip(aspect_terms_list_train, opinion_terms_list_train)):
        y = datautils.label_sentence(token_seqs_train[i], aspect_terms, opinion_terms)
        label_seqs_train.append(y)
        cnt_miss += len(aspect_terms) - np.count_nonzero(y == 1)
    print(cnt_miss, 'missed')

    data_train = TrainDataBert(label_seqs_train, token_embed_seqs_train)
    data_valid = get_valid_data(token_embed_seqs_valid, token_seqs_valid,
                                aspect_terms_list_valid, opinion_terms_list_valid)
    return data_train, data_valid

def __missing_terms():
    opinion_terms_file = 'd:/data/aspect/semeval14/opinion-terms-full.txt'
    opinion_terms_vocab = set(utils.read_lines(opinion_terms_file))
    train_sents = utils.load_json_objs(config.SE15R_FILES['train_sents_file'])
    test_sents = utils.load_json_objs(config.SE15R_FILES['test_sents_file'])

    train_terms = set()
    test_terms = dict()
    for s in train_sents:
        for t in s['opinions']:
            train_terms.add(t.lower())
    for s in test_sents:
        for t in s['opinions']:
            cnt = test_terms.get(t.lower(), 0)
            test_terms[t.lower()] = cnt + 1

    # Test-set opinion terms never seen in training, with their counts and
    # whether the opinion-term vocabulary covers them.
    for t, cnt in test_terms.items():
        if t not in train_terms:
            print(t, cnt, t in opinion_terms_vocab)

def __rand_laptops(n_sents):
    tok_texts_file = config.AMAZON_TOK_TEXTS_FILE
    aspect_terms_file = config.AMAZON_RM_TERMS_FILE
    opinion_terms_file = config.AMAZON_TERMS_TRUE4_FILE
    dst_tok_texts_file = 'd:/data/amazon/rand-laptops/laptops-tok-texts-{}.txt'.format(n_sents)
    dst_at_file = 'd:/data/amazon/rand-laptops/laptops-aspect-terms-{}.txt'.format(n_sents)
    dst_ot_file = 'd:/data/amazon/rand-laptops/laptops-opinion-terms-{}.txt'.format(n_sents)

    # Sample n_sents random sentences and write the corresponding lines of
    # each of the three parallel files.
    tok_texts = utils.read_lines(tok_texts_file)
    n_sents_total = len(tok_texts)
    rand_perm = np.random.permutation(n_sents_total)
    rand_idxs = rand_perm[:n_sents]
    __write_lines(tok_texts_file, dst_tok_texts_file, rand_idxs)
    __write_lines(aspect_terms_file, dst_at_file, rand_idxs)
    __write_lines(opinion_terms_file, dst_ot_file, rand_idxs)

def get_bboxes(img, gt_path):
    h, w = img.shape[0:2]
    lines = read_lines(gt_path)
    bboxes = []
    tags = []
    for line in lines:
        line = remove_all(line, '\xef\xbb\xbf')  # strip UTF-8 BOM bytes
        gt = split(line, ',')
        x1 = int(gt[0])
        y1 = int(gt[1])
        # Fields 4..31 hold 14 (x, y) points relative to (x1, y1); shift them
        # to absolute coordinates, then normalize by the image size.
        # (int replaces the deprecated np.int.)
        bbox = [int(gt[i]) for i in range(4, 32)]
        bbox = np.asarray(bbox) + ([x1 * 1.0, y1 * 1.0] * 14)
        bbox = np.asarray(bbox) / ([w * 1.0, h * 1.0] * 14)
        bboxes.append(bbox)
        tags.append(True)
    return np.array(bboxes), tags

def get_data_amazon_ao(vocab, aspect_terms_file, opinion_terms_file, tok_texts_file):
    aspect_terms_list = utils.load_json_objs(aspect_terms_file)
    opinion_terms_list = utils.load_json_objs(opinion_terms_file)
    tok_texts = utils.read_lines(tok_texts_file)
    assert len(aspect_terms_list) == len(tok_texts)
    assert len(opinion_terms_list) == len(tok_texts)

    # Index 0 is reserved for unknown words.
    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}
    label_seq_list = list()
    word_idx_seq_list = list()
    for aspect_terms, opinion_terms, tok_text in zip(aspect_terms_list, opinion_terms_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = label_sentence(words, aspect_terms, opinion_terms)
        label_seq_list.append(label_seq)
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])

    # Hold out 2000 random samples for validation, with a fixed seed.
    np.random.seed(3719)
    perm = np.random.permutation(len(label_seq_list))
    n_train = len(label_seq_list) - 2000
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]

    label_seq_list_train = [label_seq_list[idx] for idx in idxs_train]
    word_idx_seq_list_train = [word_idx_seq_list[idx] for idx in idxs_train]
    train_data = TrainData(label_seq_list_train, word_idx_seq_list_train)

    label_seq_list_valid = [label_seq_list[idx] for idx in idxs_valid]
    word_idx_seq_list_valid = [word_idx_seq_list[idx] for idx in idxs_valid]
    tok_texts_valid = [tok_texts[idx] for idx in idxs_valid]
    aspects_list_valid = [aspect_terms_list[idx] for idx in idxs_valid]
    opinions_list_valid = [opinion_terms_list[idx] for idx in idxs_valid]
    valid_data = ValidData(label_seq_list_valid, word_idx_seq_list_valid, tok_texts_valid,
                           aspects_list_valid, opinions_list_valid)
    return train_data, valid_data

def get_data_semeval(train_sents_file, train_tok_text_file, train_valid_split_file, test_sents_file,
                     test_tok_text_file, vocab, n_train, task):
    tvs_line = utils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]
    sents = utils.load_json_objs(train_sents_file)
    tok_texts, word_span_seqs = load_token_pos_file(train_tok_text_file)

    sents_train, tok_texts_train = list(), list()
    sents_valid, tok_texts_valid = list(), list()
    word_span_seqs_train, word_span_seqs_valid = list(), list()
    for label, s, t, span_seq in zip(tvs_arr, sents, tok_texts, word_span_seqs):
        if label == 0:
            sents_train.append(s)
            tok_texts_train.append(t)
            word_span_seqs_train.append(span_seq)
        else:
            sents_valid.append(s)
            tok_texts_valid.append(t)
            word_span_seqs_valid.append(span_seq)

    labels_list_train, word_idxs_list_train = data_from_sents_file(
        sents_train, tok_texts_train, word_span_seqs_train, vocab, task)
    # n_train == -1 means use all training samples.
    if n_train > -1:
        labels_list_train = labels_list_train[:n_train]
        word_idxs_list_train = word_idxs_list_train[:n_train]
    train_data = TrainData(labels_list_train, word_idxs_list_train)

    valid_data = __get_valid_data(sents_valid, tok_texts_valid, word_span_seqs_valid, vocab, task)

    sents_test = utils.load_json_objs(test_sents_file)
    texts_test, word_span_seqs_test = load_token_pos_file(test_tok_text_file)
    test_data = __get_valid_data(sents_test, texts_test, word_span_seqs_test, vocab, task)
    return train_data, valid_data, test_data

def __process_hl04():
    filenames = utils.read_lines(config.DATA_FILE_LIST_FILE_HL04)
    reviews, sents = list(), list()
    for filename in filenames:
        tmp_revs, tmp_sents = __process_huliu04_file(filename, len(reviews))
        reviews += tmp_revs
        sents += tmp_sents

    with open(config.SENT_TEXT_FILE_HL04, 'w', encoding='utf-8', newline='\n') as fout:
        for s in sents:
            assert '\n' not in s['text']
            fout.write('{}\n'.format(s['text']))

    with open(config.REVIEWS_FILE_HL04, 'w', encoding='utf-8', newline='\n') as fout:
        for r in reviews:
            fout.write('{}\n'.format(json.dumps(r, ensure_ascii=False)))

    with open(config.SENTS_FILE_HL04, 'w', encoding='utf-8', newline='\n') as fout:
        for s in sents:
            fout.write('{}\n'.format(json.dumps(s, ensure_ascii=False)))