def save_tfrecords(save_dir, train_list, eval_list, test_list, idx):
    with TFRecordWriter(os.path.join(save_dir, f"{idx}_train_.tfrecords")) as writer:
        for e in train_list:
            writer.write(e)
    with TFRecordWriter(os.path.join(save_dir, f"{idx}_test_.tfrecords")) as writer:
        for e in test_list:
            writer.write(e)
    with TFRecordWriter(os.path.join(save_dir, f"{idx}_eval_.tfrecords")) as writer:
        for e in eval_list:
            writer.write(e)
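# A hedged usage sketch (assumed, not part of the original snippet): the three
# lists are expected to already hold serialized tf.train.Example bytes, since
# writer.write() is called on each element directly.
example = tf.train.Example(features=tf.train.Features(feature={
    'x': tf.train.Feature(float_list=tf.train.FloatList(value=[1.0])),
}))
save_tfrecords('/tmp/records', [example.SerializeToString()], [], [], idx=0)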
def write_tfr_batches(data, label, batch_size, num_batches, savepath, dataset_type):
    for batch in range(num_batches):
        start = batch * batch_size
        filename = '{}_0{}.tfrecord'.format(dataset_type, batch)
        filepath = os.path.join(savepath, filename)
        if batch != num_batches - 1:
            next_start = (batch + 1) * batch_size
        else:
            next_start = len(data)
        # Let TFRecordWriter create the file itself instead of opening it with
        # open() first, and close it via the context manager.
        with TFRecordWriter(filepath) as writer:
            for i in range(start, next_start):
                record = sequence_to_tfexample(sequence=data[i], sentiment=label[i])
                writer.write(record.SerializeToString())
def preprocess(dataset, destination_folder, split_adj):
    """Preprocesses Tox21 data.

    Args:
        dataset: path to the Tox21 CSV file.
        destination_folder: directory the TFRecord files are written to.
        split_adj: passed through to molecule_to_example.

    Returns:
        None
    """
    tox21_df = pd.read_csv(dataset)
    task_names = list(tox21_df.columns)
    task_names.remove('smiles')
    task_names.remove('mol_id')
    if not os.path.exists(destination_folder):
        os.mkdir(destination_folder)

    # 80/10/10 train/eval/test split, assigned randomly per row
    train_num = int(len(tox21_df) * 0.8)
    eval_num = int(len(tox21_df) * 0.1)
    test_num = len(tox21_df) - train_num - eval_num
    split = ['train'] * train_num + ['eval'] * eval_num + ['test'] * test_num
    shuffle(split)
    tox21_df['split'] = split

    Molecule = recordclass('Molecule', 'mol label mask_label')
    for split in ['train', 'eval', 'test']:
        split_df = tox21_df[tox21_df['split'] == split]
        molecules = []
        for index, row in split_df.iterrows():
            mol = Chem.MolFromSmiles(row['smiles'])
            label = (row[:12].to_dense().values == 1).astype(np.float32).tolist()
            mask_label = np.invert(np.isnan(row[:12].values.astype(np.float32))).astype(np.float32)
            molecule = Molecule(mol, label, mask_label)
            molecules.append(molecule)
            with TFRecordWriter(os.path.join(destination_folder,
                                             row['mol_id'] + '_' + split + '.tfrecords')) as single_writer:
                ex = molecule_to_example(molecule, split_adj)
                single_writer.write(ex.SerializeToString())
        # with tf.python_io.TFRecordWriter(os.path.join(dir_name, '_' + split + '.tfrecords')) as dataset_writer:
        #     for molecule in molecules:
        #         ex = molecule_to_example(molecule, split_adj)
        #         dataset_writer.write(ex.SerializeToString())

    task_names = "\n".join(task_names)
    with open(os.path.join(destination_folder, 'tasks.txt'), 'w') as text_file:
        text_file.write(task_names)
def make_dataset(data_path, output_dir, name, bert_client, training=True,
                 label2int=None, class_weight=None, n_split=1):
    """
    data_path: path to the data (csv)
    label2int: dict
    class_weight: list
    n_split: save the dataset to `n_split` separate files

    Writes the dataset to ${output_dir}/${name}_${seq}.tfrecord (seq = 0 ~ n_split-1).
    Returns the file names of the created datasets (list) and the size of the dataset.
    """
    data = pd.read_csv(data_path)
    # Replace empty titles with 'none'
    data['title1_en'] = data['title1_en'].apply(lambda x: 'none' if x.strip() == '' else x)
    data['title2_en'] = data['title2_en'].apply(lambda x: 'none' if x.strip() == '' else x)

    n_samples = math.ceil(len(data) / n_split)
    filenames = []
    with tqdm(total=len(data)) as pbar:
        for i in range(n_split):
            filenames.append(f"{name}_{i}.tfrecord")
            with TFRecordWriter(os.path.join(output_dir, filenames[-1])) as writer:
                examples = create_examples(data=data[i * n_samples:(i + 1) * n_samples],
                                           bert_client=bert_client,
                                           training=training,
                                           label2int=label2int,
                                           class_weight=class_weight)
                for example in examples:
                    writer.write(example.SerializeToString())
                    pbar.update()
    return filenames, len(data)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input_files', nargs='+', metavar='INPUT-FILE')
    parser.add_argument('output_file', metavar='OUTPUT-FILE')
    parser.add_argument('-i', '--instrument-re', type=re.compile, default=re.compile('.*'),
                        metavar='REGEX',
                        help='a regular expression matching the instrument name')
    parser.add_argument('--instrument-id', type=lambda l: [int(x) for x in l.split(',')],
                        default=None, metavar='ID',
                        help='the integer ID(s) of the instrument(s)')
    parser.add_argument('-p', '--program', type=lambda l: [int(x) for x in l.split(',')],
                        default=None, metavar='PRG',
                        help='the MIDI program number(s)')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--drums', action='store_true', help='include only drums')
    group.add_argument('--no-drums', action='store_false', dest='drums', help='exclude drums')
    group.set_defaults(drums=None)
    args = parser.parse_args()

    tf.enable_eager_execution()
    with TFRecordWriter(args.output_file) as writer:
        for record in tf.data.TFRecordDataset(args.input_files):
            sequence = music_pb2.NoteSequence.FromString(record.numpy())
            filter_sequence(sequence,
                            instrument_re=args.instrument_re,
                            instrument_ids=args.instrument_id,
                            programs=args.program,
                            drums=args.drums)
            writer.write(sequence.SerializeToString())
def write_examples_to_tfrecord(examples, label_list, max_seq_length, tokenizer,
                               output_file, is_testing, pbar_desc=None):
    """Write a set of `InputExample`s to a TFRecord file."""

    def create_int_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    label_map = {label: i for i, label in enumerate(label_list)}
    with TFRecordWriter(output_file) as writer:
        for example in tqdm(examples, desc=pbar_desc):
            feature = _convert_single_example(example=example,
                                              label_map=label_map,
                                              max_seq_length=max_seq_length,
                                              tokenizer=tokenizer,
                                              is_testing=is_testing)
            tf_features = {
                'input_ids': create_int_feature(feature.input_ids),
                'input_mask': create_int_feature(feature.input_mask),
                'segment_ids': create_int_feature(feature.segment_ids),
                'label_id': create_int_feature([feature.label_id]),
                'is_real_example': create_int_feature([int(feature.is_real_example)])
            }
            tf_example = tf.train.Example(features=tf.train.Features(feature=tf_features))
            writer.write(tf_example.SerializeToString())
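# A minimal reading-side sketch (assumption, not from the original code): records
# written by write_examples_to_tfrecord can be parsed back with a matching feature
# spec; `max_seq_length` and the file name below are placeholders.
def parse_record(serialized, max_seq_length=128):
    feature_spec = {
        'input_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'label_id': tf.io.FixedLenFeature([], tf.int64),
        'is_real_example': tf.io.FixedLenFeature([], tf.int64),
    }
    return tf.io.parse_single_example(serialized, feature_spec)

# dataset = tf.data.TFRecordDataset('train.tfrecord').map(parse_record)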
def build(
    cls,
    dump_db: DumpDB,
    tokenizer: PreTrainedTokenizer,
    sentence_tokenizer: SentenceTokenizer,
    entity_vocab: EntityVocab,
    output_dir: str,
    max_seq_length: int,
    max_entity_length: int,
    max_mention_length: int,
    min_sentence_length: int,
    include_sentences_without_entities: bool,
    include_unk_entities: bool,
    pool_size: int,
    chunk_size: int,
    max_num_documents: int,
):
    target_titles = [
        title
        for title in dump_db.titles()
        if not (":" in title and title.lower().split(":")[0] in ("image", "file", "category"))
    ]
    random.shuffle(target_titles)
    if max_num_documents is not None:
        target_titles = target_titles[:max_num_documents]

    max_num_tokens = max_seq_length - 2  # 2 for [CLS] and [SEP]

    tokenizer.save_pretrained(output_dir)
    entity_vocab.save(os.path.join(output_dir, ENTITY_VOCAB_FILE))

    number_of_items = 0
    tf_file = os.path.join(output_dir, DATASET_FILE)
    options = tf.io.TFRecordOptions(tf.compat.v1.io.TFRecordCompressionType.GZIP)
    with TFRecordWriter(tf_file, options=options) as writer:
        with tqdm(total=len(target_titles)) as pbar:
            initargs = (
                dump_db,
                tokenizer,
                sentence_tokenizer,
                entity_vocab,
                max_num_tokens,
                max_entity_length,
                max_mention_length,
                min_sentence_length,
                include_sentences_without_entities,
                include_unk_entities,
            )
            with closing(
                Pool(pool_size,
                     initializer=WikipediaPretrainingDataset._initialize_worker,
                     initargs=initargs)) as pool:
                for ret in pool.imap(WikipediaPretrainingDataset._process_page,
                                     target_titles,
                                     chunksize=chunk_size):
                    for data in ret:
                        writer.write(data)
                        number_of_items += 1
                    pbar.update()

    with open(os.path.join(output_dir, METADATA_FILE), "w") as metadata_file:
        json.dump(
            dict(
                number_of_items=number_of_items,
                max_seq_length=max_seq_length,
                max_entity_length=max_entity_length,
                max_mention_length=max_mention_length,
                min_sentence_length=min_sentence_length,
                tokenizer_class=tokenizer.__class__.__name__,
                language=dump_db.language,
            ),
            metadata_file,
            indent=2,
        )
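# Reading-side note (a sketch under assumptions, not part of the original class):
# because the writer above uses GZIP TFRecordOptions, the file has to be read back
# with a matching compression_type, e.g.:
#
#     dataset = tf.data.TFRecordDataset(tf_file, compression_type="GZIP")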
def merge_shards(filename, num_shards_to_merge, out_tmp_dir, batch_size, ensure_batch_multiple):
    np.random.seed([int.from_bytes(os.urandom(4), byteorder='little') for i in range(5)])
    tfoptions = TFRecordOptions(compression_type='ZLIB')
    record_writer = TFRecordWriter(filename, tfoptions)

    binaryInputNCHWPackeds = []
    globalInputNCs = []
    policyTargetsNCMoves = []
    globalTargetsNCs = []
    scoreDistrNs = []
    valueTargetsNCHWs = []
    for input_idx in range(num_shards_to_merge):
        shard_filename = os.path.join(out_tmp_dir, str(input_idx) + ".npz")
        with np.load(shard_filename) as npz:
            assert set(npz.keys()) == set(keys)
            binaryInputNCHWPacked = npz["binaryInputNCHWPacked"]
            globalInputNC = npz["globalInputNC"]
            policyTargetsNCMove = npz["policyTargetsNCMove"].astype(np.float32)
            globalTargetsNC = npz["globalTargetsNC"]
            scoreDistrN = npz["scoreDistrN"].astype(np.float32)
            valueTargetsNCHW = npz["valueTargetsNCHW"].astype(np.float32)
        binaryInputNCHWPackeds.append(binaryInputNCHWPacked)
        globalInputNCs.append(globalInputNC)
        policyTargetsNCMoves.append(policyTargetsNCMove)
        globalTargetsNCs.append(globalTargetsNC)
        scoreDistrNs.append(scoreDistrN)
        valueTargetsNCHWs.append(valueTargetsNCHW)

    ###
    # WARNING - if adding anything here, also add it to joint_shuffle below!
    ###
    binaryInputNCHWPacked = np.concatenate(binaryInputNCHWPackeds)
    globalInputNC = np.concatenate(globalInputNCs)
    policyTargetsNCMove = np.concatenate(policyTargetsNCMoves)
    globalTargetsNC = np.concatenate(globalTargetsNCs)
    scoreDistrN = np.concatenate(scoreDistrNs)
    valueTargetsNCHW = np.concatenate(valueTargetsNCHWs)

    num_rows = binaryInputNCHWPacked.shape[0]
    assert globalInputNC.shape[0] == num_rows
    assert policyTargetsNCMove.shape[0] == num_rows
    assert globalTargetsNC.shape[0] == num_rows
    assert scoreDistrN.shape[0] == num_rows
    assert valueTargetsNCHW.shape[0] == num_rows

    [binaryInputNCHWPacked, globalInputNC, policyTargetsNCMove,
     globalTargetsNC, scoreDistrN, valueTargetsNCHW] = (
        joint_shuffle_take_first_n(
            num_rows,
            [binaryInputNCHWPacked, globalInputNC, policyTargetsNCMove,
             globalTargetsNC, scoreDistrN, valueTargetsNCHW]))

    assert binaryInputNCHWPacked.shape[0] == num_rows
    assert globalInputNC.shape[0] == num_rows
    assert policyTargetsNCMove.shape[0] == num_rows
    assert globalTargetsNC.shape[0] == num_rows
    assert scoreDistrN.shape[0] == num_rows
    assert valueTargetsNCHW.shape[0] == num_rows

    # Just truncate and lose the batch at the end, it's fine
    num_batches = (num_rows // (batch_size * ensure_batch_multiple)) * ensure_batch_multiple
    for i in range(num_batches):
        start = i * batch_size
        stop = (i + 1) * batch_size
        example = tfrecordio.make_tf_record_example(
            binaryInputNCHWPacked,
            globalInputNC,
            policyTargetsNCMove,
            globalTargetsNC,
            scoreDistrN,
            valueTargetsNCHW,
            start,
            stop)
        record_writer.write(example.SerializeToString())

    jsonfilename = os.path.splitext(filename)[0] + ".json"
    with open(jsonfilename, "w") as f:
        json.dump({"num_rows": num_rows, "num_batches": num_batches}, f)

    record_writer.close()
    return num_batches * batch_size
def _int64_feature(value):  # helper name assumed; only the return statement survives in the fragment
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


meta_file = "data/raw/photo.json"
image_dir = "data/raw/photos"
record_file = "data/preprocessed/yelp_photos_{}.tfrecords"

data = open(meta_file).readlines()
n = len(data)
split_idx = n * np.array([0, 0.16, 0.32, 0.48, 0.64, 0.80, 1.0])
split_idx = split_idx.astype('int32')
idx = np.arange(n)
np.random.shuffle(idx)

for i, split in enumerate(tqdm(['train1', 'train2', 'train3', 'train4', 'train5', 'test'])):
    with TFRecordWriter(record_file.format(split)) as writer:
        for j in tqdm(idx[split_idx[i]:split_idx[i + 1]]):
            datapoint = json.loads(data[j])
            image_file = os.path.join(image_dir, datapoint["photo_id"] + '.jpg')
            # tf.io.read_file takes only the file path
            image = tf.io.decode_jpeg(tf.io.read_file(image_file))
            # Center-crop to a square before re-encoding
            h, w, _ = image.shape
            res = min(h, w)
            h0, w0 = (h - res) // 2, (w - res) // 2
            image = tf.image.crop_to_bounding_box(image, h0, w0, res, res)
            image_string = tf.io.encode_jpeg(image)
            label = YELP_CLASSES[datapoint["label"]]
            feature = {
                'image': _bytes_feature(image_string.numpy()),
import addressbook_pb2
import tensorflow as tf
from tensorflow.io import TFRecordWriter, read_file
from tensorflow.data import TFRecordDataset
from google.protobuf.json_format import MessageToDict
from google.protobuf.json_format import MessageToJson

if __name__ == '__main__':
    filename = "test.tfrecords"

    # Create a TFRecord file
    with TFRecordWriter(filename) as writer:
        for i in range(1, 5):
            person = addressbook_pb2.Person()
            person.id = 1000 + i
            person.name = "John Doe " + str(i)
            phone = person.phones.add()
            phone.type = addressbook_pb2.Person.PhoneType.HOME
            phone.number = "333-431" + str(i)
            phone = person.phones.add()
            phone.type = addressbook_pb2.Person.PhoneType.WORK
            phone.number = "444-431" + str(i)
            writer.write(person.SerializeToString())

    # Read back TFRecord file
    tf.enable_eager_execution()
    dataset = TFRecordDataset(filename)
    for record in dataset.take(2):
        person = addressbook_pb2.Person()
        person.ParseFromString(record.numpy())
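# A small hedged continuation (assumed, not shown in the original snippet): the
# imported MessageToJson helper can dump each parsed record inside the read loop:
#
#     print(MessageToJson(person))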
def main():
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_dir', metavar='INPUT-DIR')
    parser.add_argument('output_prefix', metavar='OUTPUT-PREFIX')
    parser.add_argument('-b', '--bars-per-segment',
                        type=lambda l: [int(x) for x in l.split(',')], default=[8], metavar='NUM',
                        help='the number of bars per segment (default: 8)')
    parser.add_argument('-n', '--min-notes-per-segment', type=int, default=1, metavar='NUM',
                        help='discard segments with less than the given number of notes '
                             '(default: 1)')
    parser.add_argument('-t', '--force-tempo', type=float, default=None, metavar='BPM',
                        help='warp the sequences to match the given tempo')
    parser.add_argument('--skip-bars', type=int, default=0, metavar='NUM',
                        help='skip the given number of bars at the beginning')
    parser.add_argument('--merge-instruments', action='store_true',
                        help='causes equivalent instruments to be merged')
    args = parser.parse_args()

    # Collect all paths
    paths = []
    for dir_path, dirnames, filenames in os.walk(args.input_dir):
        dirnames.sort()
        filenames.sort()
        for fname in filenames:
            paths.append(os.path.join(dir_path, fname))

    metadata = []
    with TFRecordWriter(args.output_prefix + '.tfrecord') as writer:
        for path in paths:
            rel_path = os.path.relpath(path, args.input_dir)
            print(rel_path, file=sys.stderr, flush=True)

            midi = pretty_midi.PrettyMIDI(path)
            sequence = midi_io.midi_to_note_sequence(midi)
            sequence.filename = rel_path

            # Record the downbeat times so that they get updated by normalize_tempo later
            for time in midi.get_downbeats():
                annotation = sequence.text_annotations.add()
                annotation.time = time
                annotation.annotation_type = BEAT
                annotation.text = DOWNBEAT

            if args.merge_instruments:
                merge_equivalent_instruments(sequence)
            if args.force_tempo:
                sequence = note_sequence_utils.normalize_tempo(sequence, args.force_tempo)

            # Get the updated downbeats
            downbeats = [a.time for a in sequence.text_annotations
                         if (a.annotation_type, a.text) == (BEAT, DOWNBEAT)]
            del sequence.text_annotations[-len(downbeats):]

            for start, end, segment in note_sequence_utils.split_on_downbeats(
                    sequence,
                    downbeats=downbeats,
                    bars_per_segment=args.bars_per_segment,
                    skip_bars=args.skip_bars,
                    min_notes_per_segment=args.min_notes_per_segment,
                    include_span=True):
                writer.write(segment.SerializeToString())
                metadata.append({
                    'filename': rel_path,
                    'segment_id': f'bar_{start}-{end}'
                })

    with gzip.open(args.output_prefix + '_meta.json.gz', 'wt', encoding='utf-8') as f:
        json.dump(metadata, f, separators=(',', ':'))