def preprocessing(self):
    filenames = os.listdir(self.prefix)
    images = os.listdir(self.img_path)
    for file in filenames:
        with open(os.path.join(self.prefix, file), 'r') as f:
            for item in jsonlines.Reader(f):
                if item['file_name'] in images:
                    self.valid_files_num += 1
                    self.items.append(item)
                    for sentence in item['annotations']:
                        for character in sentence:
                            self.classes.add(character['text'])
                else:
                    print("Invalid:", item['file_name'])
    self.classes_num = len(self.classes) + 1  # TODO
    self.classes = list(self.classes)
    self.classes.append('ignore')

def load_vector_map(file_path):
    entity_vector = {}
    width = 0
    with open(file_path, "r", encoding="utf-8") as f:
        for item in jsonlines.Reader(f):
            words = ""
            vector = []
            for feature in item["features"]:
                if feature["token"] == "[CLS]" or feature["token"] == "[SEP]":
                    continue
                else:
                    words += feature["token"]
                    vector.append(feature["layers"][0]["values"])
                    width = len(feature["layers"][0]["values"])
            entity_vector[words] = np.mean(vector, axis=0)
    return entity_vector, width

def read_jsonl_gz(path):
    with jsonlines.Reader(gzip.open(path)) as reader:
        raw_politician_tweets = list(reader)
    tweet_df = pd.DataFrame(
        data={
            'tweet': [t['full_text'] for t in raw_politician_tweets],
            'author': [t['user']['screen_name'] for t in raw_politician_tweets],
            'date': [
                parse_twitter_datetime(t['created_at'])
                for t in raw_politician_tweets
            ],
            'id': [t['id'] for t in raw_politician_tweets]
        })
    # set_index returns a new DataFrame, so reassign instead of discarding the result
    tweet_df = tweet_df.set_index('id')
    return tweet_df

def main(args):
    ## dummy block for dependency file (requirements.txt) testing
    lines = ['1', '2', '3']
    reader = jsonlines.Reader(lines)
    print('----------- requirements.txt test start ----------------')
    for obj in reader:
        print(obj, "\n")
    print('----------- requirements.txt test end ------------------')
    ##########################################

    logging.info("getting data")
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()

    logging.info("configuring model")
    model = keras_model_fn(args.learning_rate, args.weight_decay,
                           args.optimizer, args.momentum)
    callbacks = []
    # callbacks.append(ModelCheckpoint(args.model_dir + '/checkpoint-{epoch}.h5'))
    callbacks.append(
        ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))

    logging.info("Starting training")
    model.fit(train_dataset,
              steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size),
              epochs=args.epochs,
              validation_data=validation_dataset,
              validation_steps=(num_examples_per_epoch('validation') // args.batch_size),
              callbacks=callbacks)

    score = model.evaluate(eval_dataset,
                           steps=num_examples_per_epoch('eval') // args.batch_size,
                           verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # return save_model(model, args.model_dir)
    return save_model(model, args.model_output_dir)

def detect_face(self, image_list_file, image_with_landmark_list_file):
    # detect
    frames = []
    landmarks = []
    probs = []
    boxes = []
    image_infos = []
    with open(image_list_file) as fin:
        for image_info in tqdm(jsonlines.Reader(fin)):
            image_infos.append(image_info)
    for image_info in tqdm(image_infos):
        frame = cv2.imread(image_info["file_path"])
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = Image.fromarray(frame)
        frames.append(frame)
        # When batch is full, detect faces and reset batch list
        if len(frames) >= self.batch_size:
            batch_boxes, batch_probs, batch_landmarks = self.mtcnn.detect(frames, landmarks=True)
            probs.extend(batch_probs)
            landmarks.extend(batch_landmarks)
            boxes.extend(batch_boxes)
            frames = []
    if len(frames):
        batch_boxes, batch_probs, batch_landmarks = self.mtcnn.detect(frames, landmarks=True)
        probs.extend(batch_probs)
        landmarks.extend(batch_landmarks)
        boxes.extend(batch_boxes)

    # save
    with jsonlines.open(image_with_landmark_list_file, 'w') as fout:
        for image_info, prob, box, landmark in zip(image_infos, probs, boxes, landmarks):
            fail_to_detect = landmark is None
            try:
                fout.write({"file_path": image_info["file_path"],
                            "identity_name": image_info["identity_name"],
                            "identity_id": image_info["identity_id"],
                            "prob": 'none' if fail_to_detect else prob[0].tolist(),
                            "box": 'none' if fail_to_detect else box[0].tolist(),
                            "landmark": 'none' if fail_to_detect else landmark[0].tolist()})
            except:
                print(type(prob), prob, image_info["file_path"])

def execute(self, cmdopts):
    cmd = [APISH.BIN] + cmdopts
    self.log.info("APISH CALL: %s" % ' '.join(cmd))
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        c_out, c_err = proc.communicate(timeout=self.opts_driver['timeout'])
    except subprocess.TimeoutExpired:
        proc.kill()
        c_out, c_err = proc.communicate()
    if proc.returncode:
        raise RuntimeError('FAIL: %s' % ' '.join(cmd), c_out, c_err)
    return list(jsonlines.Reader(BytesIO(c_out)))

def _load_jsonlines(self, dataset):
    """
    Function to encapsulate a dataset into a jsonlines Reader object.

    Args:
        dataset (HTTPResponse): HTTPResponse encapsulating a dataset

    Returns:
        jsonlines.Reader: JSON Lines Reader object wrapping a HTTPResponse
            object containing the full dataset

    Raises:
        TypeError: Invalid data was supplied to the function
    """
    try:
        reader = jsonlines.Reader(dataset)
    except TypeError:
        _logger.error("Invalid data supplied to function")
        raise
    return reader

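# Illustrative sketch, not part of the original module: jsonlines.Reader accepts any
# file-like object or iterable of lines, which is why an HTTPResponse can be wrapped
# directly. The in-memory lines below stand in for a streamed dataset.
import jsonlines

example_lines = ['{"id": 1, "text": "first record"}', '{"id": 2, "text": "second record"}']
with jsonlines.Reader(example_lines) as example_reader:
    for record in example_reader:
        print(record["id"], record["text"])
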
def deal_dict(file1, file2):
    f1 = open(file1, 'r', encoding='utf-8')
    f2 = open(file2, 'w', encoding='utf-8')
    items = jsonlines.Reader(f1)  # read the input file line by line
    final = []
    for item in items:
        temp = []
        str1 = item["more"]
        for idx, ch in enumerate(str1):
            if ch == "【":
                temp.append(str1[idx + 1])
        temp = set(temp)
        new_list = [i for i in temp]
        item["more"] = new_list
        final.append(item)
    json.dump(final, f2, ensure_ascii=False, indent=1)
    # print(final)
    f1.close()
    f2.close()

def read_webhose_dataset(dataset_file, num_docs=None):
    # webhose datasets: https://webhose.io/free-datasets/
    docs = []
    dataset_path = join('datasets', dataset_file)
    # read the zip file containing the webhose dataset
    with zipfile.ZipFile(dataset_path) as dataset_zipfile:
        # find the name of the JSON file inside the zip file
        json_filename = dataset_zipfile.namelist()[0]
        with dataset_zipfile.open(json_filename) as json_file:
            # each line in the file is a JSON document
            json_reader = jsonlines.Reader(json_file)
            # read only the initial num_docs documents
            for doc in itertools.islice(json_reader, num_docs):
                docs.append(doc['text'])
    return docs

def parse_nodes(read_file):
    data = []
    with open(read_file, 'r', encoding='utf-8') as f:
        for item in jsonlines.Reader(f):
            if 'en' not in item['labels'] or len(item['labels']['en']) == 0:
                labels = ['None']
            else:
                labels = item['labels']['en']
            if 'en' not in item['descriptions'] or len(item['descriptions']['en']) == 0:
                desc = 'None'
            else:
                desc = item['descriptions']['en'][0]
            qnode = item['qnode']
            props = item['wd_prop_vals']
            data.append([qnode, desc, labels, props])
    return data

def save(c):
    """Save node ids for the current docker context into the environment's context.json"""
    try:
        ec = EnvironmentContext(Path("."))
    except ValueError as exc:
        print(exc)
        sys.exit(1)
    cmd = "docker node ls --format '{{json .}}'"
    res = c.run(cmd, hide=True, warn=True)
    if res.exited != 0:
        print("WARNING: Current Docker Host is not in Swarm mode!!!")
        return
    nodes = list(jsonlines.Reader(res.stdout.splitlines()))
    nodes = {node["ID"]: node for node in nodes}
    context_json_fname = "context.json"
    with (ec.env_dir / context_json_fname).open("w", encoding="utf-8") as f:
        json.dump(nodes, f)
    print(f"{context_json_fname} updated.")

def read_sampling_data(self):
    """
    Example input line:
    {"tp":"ar","cid":"diekwochen-1848-02-12-a-i0004","len":3899,"orig_lg":"lb","langdetect":[{"lang":"fr","prob":1}],"langid":[{"lang":"fr","prob":1}]}
    """
    json_reader = jsonlines.Reader(sys.stdin)
    for jdata in json_reader:
        m = re.search(
            r"^(?P<COLLECTION>.+)-(?P<YEAR>\d{4})-(?P<MONTH>\d{2})-(?P<DAY>\d{2})-(?P<EDITION>[a-z])-i(?P<CONTENTITEM>\d{4})$",
            jdata["id"],
        )
        if m:
            self.ids_per_coll_year[(m["COLLECTION"], m["YEAR"])].append(jdata["id"])
            self.id2data[jdata["id"]] = jdata
        else:
            log.error(f'NO MATCH FOR CONTENTITEM {jdata["id"]}')
    for k in self.ids_per_coll_year:
        self.ids_per_coll_year[k].sort()

def request_export(self, method, url=None, path=None, params=None, json=None, **kwargs):
    if not self.__verified:
        self.__verified = self.check_access()

    if url and path:
        url = '{}/{}'.format(url, path)
    elif path and not url:
        url = 'https://data.mixpanel.com/api/2.0/{}'.format(path)

    if 'endpoint' in kwargs:
        endpoint = kwargs['endpoint']
        del kwargs['endpoint']
    else:
        endpoint = 'export'

    if 'headers' not in kwargs:
        kwargs['headers'] = {}
    kwargs['headers']['Accept'] = 'application/json'
    if self.__user_agent:
        kwargs['headers']['User-Agent'] = self.__user_agent
    if method == 'POST':
        kwargs['headers']['Content-Type'] = 'application/json'
    kwargs['headers']['Authorization'] = 'Basic {}'.format(
        str(base64.urlsafe_b64encode(self.__api_secret.encode("utf-8")), "utf-8"))

    with metrics.http_request_timer(endpoint) as timer:
        response = self.perform_request(method=method,
                                        url=url,
                                        params=params,
                                        json=json,
                                        stream=True,
                                        **kwargs)
        timer.tags[metrics.Tag.http_status_code] = response.status_code

        # export endpoint returns jsonl results;
        # other endpoints return json with array of results
        # jsonlines reference: https://jsonlines.readthedocs.io/en/latest/
        reader = jsonlines.Reader(response.iter_lines())
        for record in reader.iter(allow_none=True, skip_empty=True):
            yield record

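# Illustrative sketch, not from the original tap: allow_none and skip_empty let the export
# stream contain blank keep-alive lines and literal "null" rows without raising an error.
import jsonlines

raw_stream = [b'{"event": "signup"}', b'', b'null', b'{"event": "login"}']
records = list(jsonlines.Reader(raw_stream).iter(allow_none=True, skip_empty=True))
print(records)  # [{'event': 'signup'}, None, {'event': 'login'}]
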
def fetch_s2orc_keys_from_scirex_ids(scirex_doc_ids, data_download_commands):
    scirex_s2orc_mapping_file = os.path.join(caches_directory, "s2orc_hash_to_struct_mapping.pkl")
    if os.path.exists(scirex_s2orc_mapping_file):
        s2orc_hash_to_struct_mapping = pickle.load(open(scirex_s2orc_mapping_file, 'rb'))
    else:
        # If we don't have a cache file already, then manually match s2orc and scirex entries
        # (takes several hours and requires downloading and purging hundreds of GB of data)
        s2orc_hash_to_struct_mapping = {}
        start = time.perf_counter()
        for i, s2orc_shard_command in enumerate(data_download_commands):
            output_path = f"s2orc_downloads/{s2orc_shard_command[2]}"
            data_url = eval(s2orc_shard_command[3])
            shard_id = get_shard_id_from_path(output_path, data_type="full_text")

            wget.download(data_url, out=output_path)

            end = time.perf_counter()
            print(f"Took {end - start} seconds to download shard {i}\n")
            start = end

            shard = gzip.open(output_path, 'rt')
            s2orc_full = jsonlines.Reader(shard)
            hits = 0
            for doc in s2orc_full:
                doc_hash = doc["_pdf_hash"]
                if doc['_pdf_hash'] in scirex_doc_ids:
                    doc_id = doc["paper_id"]
                    s2orc_hash_to_struct_mapping[doc_hash] = S2OrcEntry(shard_id, doc_id, doc_hash)
                    hits += 1

            print(f"{hits} matching documents found!")
            end = time.perf_counter()
            print(f"Took {end - start} seconds to process pdf parses")
            start = end

            if os.path.exists(output_path):
                os.remove(output_path)
                print(f"Deleted {output_path}")
            print("\n")
        pickle.dump(s2orc_hash_to_struct_mapping, open("s2orc_hash_to_struct_mapping.pkl", 'wb'))
    return s2orc_hash_to_struct_mapping

def init_data(request):
    with open(MOVIE_PATH) as f:
        for line in jsonlines.Reader(f):
            name = line['name']
            score = float(line['score']) if line['score'] else 0.0
            director = ','.join(line['director'])
            actor = ','.join(line['actor'])
            area = line['area']
            length = line['length']
            # print(length)
            # the length field looks like "120分钟" ("minutes"); keep only the number
            length = int(length.split('分钟')[0]) if length else 0
            brief = line['brief']
            img = IMG_PATH + name + '.jpg'
            release_date = line['release_date'].split('(')[0]
            release_date = datetime.date(*map(int, release_date.split('-')))
            types = line['type']
            if not Play.objects.filter(name=name):
                play = Play(name=name, play_time=release_date, length=length,
                            desc=brief, director=director, actors=actor,
                            area=area, img=img, score=score)
                play.save()
                for type_name in types:
                    # print(type_name)
                    play_type = Type.objects.filter(name=type_name).first()
                    # print(play_type)
                    if play_type:
                        play.types.add(play_type.id)
                    else:
                        type_new = Type(name=type_name)
                        type_new.save()
                        play.types.add(type_new.id)
                play.save()
    return redirect(reverse('play_list'))

def read_data(file_path, labels_path, data_num):
    # read the data
    all_data, labels_data = [], []
    with open(file_path, 'r+', encoding='utf-8') as f:
        n = 0
        for item in jsonlines.Reader(f):
            n += 1
            if n <= data_num:
                pre_text = item['ctx']
                end_text = item['ending_options']
                all_data.append([pre_text, end_text[0], end_text[1], end_text[2], end_text[3]])
    with open(labels_path) as f:
        answer = [i.strip() for i in f.readlines()]
        m = 0
        for j in answer:
            m += 1
            if m <= data_num:
                labels_data.append(int(j))
    return all_data, labels_data

def open_dataset() -> dict:
    """
    Open a dataset file.
    :return: dict response content
    """
    if request.method == "GET":
        file_path = request.args.get("path", "")
        if not os.path.exists(file_path):
            return {"code": 0, "msg": "Failed to open the file!",
                    "data": {"msg": "No file exists at the given path; please check the file path!"}}
        try:
            print(file_path)
            with open(file_path, "r", encoding="utf-8") as file:
                dataset = [line for line in jsonlines.Reader(file)]
            data_handler = SequenceHandler()
            data_handler.set_dataset(dataset)
            data_handler.file_path = file_path
            uid = str(uuid4())
            dataset_dict[uid] = data_handler
            feedback = {
                "code": 1,
                "msg": "File opened successfully!",
                "data": {
                    "length": len(dataset),
                    "uid": uid
                }
            }
        except Exception as error:
            feedback = {
                "code": 0,
                "msg": "Failed to open the file!",
                "data": {
                    "msg": str(error)
                }
            }
    else:
        feedback = {"code": 0, "msg": "Invalid request method!", "data": {}}
    return feedback

def load_label(
    path: str, flavor: str, small_data: bool = False
) -> List[List[List[List[str]]]]:
    label_path = path.replace("docs", flavor)
    print(f"reading labels from {label_path}")
    labels = defaultdict(dict)
    evidences = defaultdict(list)
    label_toi = {"entailment": 0, "contradiction": 1, "neutral": 2}
    reader = jsonlines.Reader(open(label_path))
    for line in reader:
        label = line["classification"]
        idx = line["annotation_id"]
        labels[idx] = label_toi[label]
        evidences[label + "_hypothesis"] = []
        evidences[label + "_premise"] = []
        for evi in line["evidences"][0]:
            evidences[evi["docid"]].append((evi["start_token"], evi["end_token"]))
        if small_data and len(labels) > 1500:
            break
    return labels, evidences

def read_dev_data(self):
    with open('../hellaswag-train-dev/valid.jsonl', 'r+', encoding='utf-8') as f:
        for item in jsonlines.Reader(f):
            pre_text = item['ctx']
            end_text = item['ending_options']
            self.test_text_left.append(pre_text)
            self.test_text_left.append(pre_text)
            self.test_text_left.append(pre_text)
            self.test_text_left.append(pre_text)
            self.test_text_right.append(end_text[0])
            self.test_text_right.append(end_text[1])
            self.test_text_right.append(end_text[2])
            self.test_text_right.append(end_text[3])
    with open('../hellaswag-train-dev/valid-labels.lst') as f:
        answer = [i.strip() for i in f.readlines()]
        for j in answer:
            one_line_label = [0, 0, 0, 0]
            one_line_label[int(j)] = 1
            self.test_label += one_line_label

def main(args):
    count = 0
    image_idx = 1
    cropped_image_dir = "/home1/sxy/datasets/face_recognition/CASIA-WebFace/cropped_images"
    with open(args.image_with_landmark_list_file) as fin, jsonlines.open(args.cropped_image_list_file, 'w') as fout:
        for image_info in jsonlines.Reader(fin):
            frame = cv2.imread(image_info["file_path"])
            if image_info["landmark"] == 'none':
                count += 1
                continue
            cropped_frame = align(frame, np.array(image_info["landmark"], dtype=np.float32))
            cropped_file_path = os.path.join(cropped_image_dir, f"{image_idx}.jpg")
            cv2.imwrite(cropped_file_path, cropped_frame)
            cropped_image_info = {"cropped_file_path": cropped_file_path}
            cropped_image_info.update(image_info)
            fout.write(cropped_image_info)
            image_idx += 1
            if image_idx % 10000 == 0:
                print(f"already processed {image_idx} images")
    print(count)  # 2471

def load_personachat(data_dir='data'):
    import os
    import subprocess
    filename = os.path.join(data_dir, 'personachat_all_sentences_train.jsonl')
    if not os.path.exists(filename):
        os.makedirs(data_dir, exist_ok=True)
        url = "https://nyu.box.com/shared/static/q4nvswb0szelivhgyx87vd1056ttqfyi.jsonl"
        args = ['wget', '-O', filename, url]
        subprocess.call(args)
        url = "https://nyu.box.com/shared/static/8krcizo8sms1m0ppy7uiwfcx4a3l5nsq.jsonl"
        args = [
            'wget', '-O',
            os.path.join(data_dir, 'personachat_all_sentences_valid.jsonl'), url
        ]
        subprocess.call(args)

    raw_datasets = {}
    for name in ['train', 'valid']:
        raw_datasets[name] = [
            x['tokens'] for x in jsonlines.Reader(
                open(os.path.join(data_dir, 'personachat_all_sentences_%s.jsonl' % name)))
        ]

    if os.path.exists(os.path.join(data_dir, 'vocab.pkl')):
        vocab = pickle.load(open(os.path.join(data_dir, 'vocab.pkl'), 'rb'))
    else:
        vocab = Dictionary(raw_datasets, include_valid=False)
        pickle.dump(vocab, open(os.path.join(data_dir, 'vocab.pkl'), 'wb'))

    tokenized_datasets = tokenize_dataset(raw_datasets, vocab)
    datasets = {name: SequenceDataset(ds) for name, ds in tokenized_datasets.items()}
    print("Vocab size: %d" % (len(vocab)))
    return raw_datasets, datasets, vocab

def parseJson(lang, total=False):
    ####
    # TODO: later, read directly from the database instead of the file
    ####
    dataset = []
    time_v = []
    with open(filename, 'r+', encoding='utf8') as f:
        for item in jsonlines.Reader(f):
            l = ''
            n = 0
            if total:
                n = float(item['repo_num'])
            else:
                for i in range(0, 10):
                    if item['n%dlang' % (i + 1)] == lang:
                        n = float(item['n%dnum' % (i + 1)])
            timestamp = datetime.datetime.strptime(item['timestamp'], "%Y-%m-%dT%H:%M:%S")
            # startday = datetime.datetime(2009,1,1,0,0)
            # days = int((timestamp-startday).days)
            if n != 0:
                dataset.append([n])
                # time-value pairs; TODO: later, fetch these directly from the database
                time_v.append([timestamp, n])
    time_v = np.array(time_v)
    time_v = pd.DataFrame(time_v, columns=['timestamp', 'repo_number'])
    time_v.to_csv(path + '/datas/%sdata.csv' % lang, encoding='gb18030')
    dataset = np.array(dataset)
    trainSeq = dataset[0:int(len(dataset) * 0.8)]
    testSeq = dataset[int(len(dataset) * 0.8):len(dataset)]
    del time_v
    del f
    return trainSeq, testSeq, dataset

def read_hellaswag_dev_data(text_path, labels_path, file_name):
    all_pre_text, all_endings, all_label = [], [], []
    with open(text_path, 'r+', encoding='utf-8') as f:
        for item in jsonlines.Reader(f):
            pre_text = item['ctx']
            end_text = item['ending_options']
            all_pre_text.append(pre_text)
            all_endings.append(end_text)
    with open(labels_path) as f:
        answer = [i.strip() for i in f.readlines()]
        for j in answer:
            all_label.append(int(j))
    print(len(all_pre_text), len(all_endings), len(all_label))
    wb = Workbook()
    ws = wb.active
    for i in range(len(all_label)):
        ws.append([all_pre_text[i], all_endings[i][0], all_endings[i][1],
                   all_endings[i][2], all_endings[i][3], all_label[i]])
    wb.save('data/' + file_name)

def virus_report_for(path_to_zipfile):
    '''
    Return an object representing the data report.

    path_to_zipfile: The relative path to the zipfile containing the virus data report

    old code below:
        with zipfile.ZipFile(path_to_zipfile, 'r') as zip:
            virus_report_as_dict = yaml.safe_load(zip.read('ncbi_dataset/data/data_report.yaml'))
            virus_report = virus_report_pb2.VirusReport()
            ParseDict(virus_report_as_dict, virus_report)
            return virus_report
    '''
    genomesArray = []
    with zipfile.ZipFile(path_to_zipfile, 'r') as zip:
        report_file_handle = zip.open('ncbi_dataset/data/data_report.jsonl')
        reader = jsonlines.Reader(report_file_handle)
        for json_dict in reader.iter(type=dict, skip_invalid=True):
            # json_dict is a single report - all fields should be there.
            genomesArray.append(json_dict)
    virus_report = {"genomes": genomesArray}
    return virus_report

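# Illustrative sketch, not from the original source: reader.iter(type=dict, skip_invalid=True),
# as used above, silently drops lines that are not valid JSON objects, so a single bad line
# in data_report.jsonl does not abort the whole report. The field name below is made up.
import io
import jsonlines

demo_lines = io.StringIO('{"accession": "A1"}\nnot-json\n{"accession": "A2"}\n')
demo_reader = jsonlines.Reader(demo_lines)
print([d["accession"] for d in demo_reader.iter(type=dict, skip_invalid=True)])  # ['A1', 'A2']
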
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode from file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    _flatten_dir = os.path.expanduser(flatten_dir)
    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(_flatten_dir, lang, '{}.{}'.format(mode, attr))
        os.makedirs(os.path.dirname(attr_file), exist_ok=True)
        attr_writers[attr] = open(attr_file, 'w')

    with open(raw_file, 'r') as reader:
        for line in jsonlines.Reader(reader):
            for attr, info in line.items():
                if attr in attr_writers:
                    print(ujson.dumps(info, ensure_ascii=False), file=attr_writers[attr])

def process_file(self, input_file, top_k: int = 1000):
    data = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in jsonlines.Reader(f):
            data.append(line)
    for article in tqdm(data):
        question = article['question']
        passage = article['passage']
        sentences = sentence_tokenizer.tokenize(passage)
        self.get_sim(sentences, question, 1)
    self.sort(top_k)
    for article in tqdm(data):
        article['sentence_id'] = self.evidence[0][0]
        self.evidence = self.evidence[1:]
    assert 'sentence_id' in data[0]
    return data

def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner='\n\n'):
    with open(file, 'rb') as fh:
        self.fh = fh
        cctx = zstandard.ZstdDecompressor()
        reader = io.BufferedReader(cctx.stream_reader(fh))
        rdr = jsonlines.Reader(reader)
        for ob in rdr:
            # naive jsonl where each object is just the string itself, with no meta.
            # For legacy compatibility.
            if isinstance(ob, str):
                assert not get_meta
                yield ob
                continue
            text = ob['text']
            if autojoin_paragraphs and isinstance(text, list):
                text = para_joiner.join(text)
            if get_meta:
                yield text, (ob['meta'] if 'meta' in ob else {})
            else:
                yield text

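# Illustrative, self-contained sketch (not from the original source) of the same
# zstandard + jsonlines pattern used by read() above: compress two JSON lines in
# memory, then stream-decode and iterate them back.
import io
import json
import jsonlines
import zstandard

payload = b"".join(json.dumps({"text": t}).encode() + b"\n" for t in ["hello", "world"])
compressed = io.BytesIO(zstandard.ZstdCompressor().compress(payload))
stream = io.BufferedReader(zstandard.ZstdDecompressor().stream_reader(compressed))
for ob in jsonlines.Reader(stream):
    print(ob["text"])
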
def process_jsonlines_hotpotqa(filename):
    """
    This is the process_jsonlines method for the intro-only processed Wikipedia file.
    Item example:
    {"id": "45668011", "url": "https://en.wikipedia.org/wiki?curid=45668011", "title": "Flouch Roundabout",
     "text": ["Flouch Roundabout is a roundabout near Penistone, South Yorkshire, England, where the A628 meets the A616."],
     "charoffset": [[[0, 6],...]],
     "text_with_links": ["Flouch Roundabout is a roundabout near <a href=\"Penistone\">Penistone</a>, <a href=\"South%20Yorkshire\">South Yorkshire</a>, England, where the <a href=\"A628%20road\">A628</a> meets the <a href=\"A616%20road\">A616</a>."],
     "charoffset_with_links": [[[0, 6], ... [213, 214]]]}
    """
    # item should be nested list
    extracted_items = []
    # with jsonlines.open(filename) as reader:
    with bzopen(filename, "r") as bzfin:
        for obj in jsonlines.Reader(bzfin):
            wiki_id = obj["id"]
            title = obj["title"]
            title_id = make_wiki_id(title, 0)
            plain_text = "\t".join(obj["text"])
            text_with_links = "\t".join(obj["text_with_links"])

            hyper_linked_titles = find_hyper_linked_titles(text_with_links)
            if len(hyper_linked_titles) > 0:
                hyper_linked_titles_text = "\t".join(hyper_linked_titles)
            else:
                hyper_linked_titles_text = ""

            extracted_items.append({
                "wiki_id": wiki_id,
                "title": title_id,
                "plain_text": plain_text,
                "hyper_linked_titles": hyper_linked_titles_text,
                "original_title": title
            })
    return extracted_items

def filter_datasets(paths, out_path: str, require_fields=[]):
    logger.debug(f"Requiring fields {require_fields}")
    total_lines = 0
    examples = []
    for path in paths:
        full_path = pathlib.Path(path).resolve()
        f = gzip.open(full_path, "rb") if path.endswith(".jsonl.gz") else full_path.open("r")
        reader = jsonlines.Reader(f)
        logger.debug(f"Loading {full_path}")
        for json_dict in tqdm.tqdm(reader, desc=full_path.name):
            total_lines += 1
            # Check all required fields are present
            if any([field not in json_dict or not json_dict[field] for field in require_fields]):
                continue
            # We need the identifier (method name) as a label. Filter invalid identifiers
            if "identifier" in require_fields and _valid_identifier_regex.match(json_dict["identifier"]) is None:
                print(f"WARN: Invalid identifier {json_dict['identifier']}, skipping record")
                continue
            examples.append(json_dict)
            if total_lines % 100000 == 0:
                logger.debug(f"Filtered jsonl to {len(examples)}/{total_lines}")
        f.close()
    logger.debug(f"DONE: Filtered jsonl to {len(examples)}/{total_lines}")

    # TODO: Subsample

    # Write output
    full_out_path = pathlib.Path(out_path).resolve()
    f = gzip.open(full_out_path, "wb") if out_path.endswith(".jsonl.gz") else full_out_path.open("w")
    writer = jsonlines.Writer(f)
    logger.debug(f"Writing output to {full_out_path}...")
    writer.write_all(examples)
    logger.debug("DONE writing")
    f.close()

def read_data(f_in, f_out):
    out_file = open(os.path.join(root, f_out), 'w+', encoding='utf-8')
    label2id = {'neutral': 0, 'contradiction': 1, 'entailment': 2}
    writer = csv.writer(out_file)
    writer.writerow(('label', 'sentence1', 'sentence2'))
    with open(os.path.join(root, f_in), 'r+', encoding='utf-8') as f:
        count = 0
        for item in jsonlines.Reader(f):
            label = item['gold_label']
            if label in label2id.keys():
                label = label2id[label]
            else:
                continue
            sen1 = item['sentence1']
            sen1 = process(sen1.lower())
            sen2 = item['sentence2']
            sen2 = process(sen2.lower())
            writer.writerow((label, sen1, sen2))
            count += 1
            print(count)
            if count == 150000:
                break
    out_file.close()