def assignHImass(aa, save=False):
    '''Assign HI masses to halos.'''
    zz = atoz(aa)
    if rank == 0:
        print('Redshift = %0.2f' % zz)

    halocat = readincatalog(aa)
    hmass = halocat['Mass'].compute()
    hpos = halocat['Position'].compute()

    # Do hod
    ofolder = myscratch + '/%s/fastpm_%0.4f/' % (sim, aa)
    try:
        os.makedirs(ofolder)
    except OSError:
        pass

    h1mass = HI_hod(hmass, aa)
    halocat['HImass'] = h1mass

    if save:
        colsave = [cols for cols in halocat.columns]
        colsave = ['ID', 'Position', 'Mass', 'HImass']
        if rank == 0:
            print(colsave)
        halocat.save(ofolder + 'halocat', colsave)
        if rank == 0:
            print('Halos saved at path\n%s' % ofolder)
    return halocat

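# Many snippets in this collection guard directory creation with a bare
# try/except around os.makedirs. A minimal sketch of the more idiomatic
# alternative available since Python 3.2: pass exist_ok=True, which skips the
# error when the directory already exists. The path below is a hypothetical
# placeholder used only for illustration.
import os

output_folder = '/tmp/example_output'  # hypothetical path, illustration only
os.makedirs(output_folder, exist_ok=True)  # no FileExistsError if it already exists
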
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    for task_name in FLAGS.tasks:
        logging.info('Starting task: %s.', task_name)
        df = probing_utils.read_senteval_data(FLAGS.senteval_path, task_name)

        test_indices_list = probing_utils.split_with_wasserstein(
            df['text'],
            test_set_size=FLAGS.test_set_size,
            no_of_trials=FLAGS.trial_count,
            min_df=FLAGS.feature_vector_min_df,
            leaf_size=FLAGS.nn_leaf_size)

        experiment_base_dir = os.path.join(FLAGS.base_out_dir,
                                           FLAGS.split_name) + '-{}'

        for trial_id in range(FLAGS.trial_count):
            split_dir = experiment_base_dir.format(trial_id)
            probing_dir = os.path.join(split_dir, 'probing')
            settings_path = os.path.join(probing_dir,
                                         '{}-settings.json'.format(task_name))
            data_out_path = os.path.join(probing_dir, '{}'.format(task_name))

            logging.info('Starting run: %d.', trial_id)
            test_indices = test_indices_list[trial_id]
            all_indices = set(range(len(df)))
            train_dev_indices = all_indices - set(test_indices)
            dev_indices = random.sample(train_dev_indices, FLAGS.dev_set_size)
            train_indices = list(train_dev_indices - set(dev_indices))

            logging.info('Writing output to file: %s.', data_out_path)

            # Set new labels.
            df.loc[df.index[train_indices], 'set'] = 'tr'
            df.loc[df.index[dev_indices], 'set'] = 'va'
            df.loc[df.index[test_indices], 'set'] = 'te'

            os.makedirs(probing_dir)
            with open(settings_path, 'w') as settings_file:
                settings = {
                    'task_name': task_name,
                    'trial_id': trial_id,
                    'train_size': len(train_indices),
                    'dev_size': len(dev_indices),
                    'test_size': len(test_indices),
                }
                logging.info('Settings:\n%r', settings)
                json.dump(settings, settings_file, indent=2)

            with open(data_out_path, 'w') as data_file:
                # Don't add quoting to retain the original format unaltered.
                df[['set', 'target', 'text']].to_csv(data_file,
                                                     sep='\t',
                                                     header=False,
                                                     index=False,
                                                     quoting=csv.QUOTE_NONE,
                                                     doublequote=False)

def __init__(self, outputDirName, scale=1.0):
    PDFHandler.__init__(self)
    self.outputDirName = outputDirName
    try:
        os.makedirs(os.path.join(self.outputDirName, 'images'))
    except OSError:
        pass
    self.scale = scale

def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    for task_name in FLAGS.tasks:
        logging.info('Starting task: %s.', task_name)
        df = probing_utils.read_senteval_data(FLAGS.senteval_path, task_name)
        df['text_len'] = df['text'].str.split().apply(len)

        # There is only 1 trial. Therefore, in order to stay consistent in
        # directory names we manually add '0' here.
        split_dir = os.path.join(FLAGS.base_out_dir, FLAGS.split_name) + '-0'
        probing_dir = os.path.join(split_dir, 'probing')
        settings_path = os.path.join(probing_dir,
                                     '{}-settings.json'.format(task_name))
        data_out_path = os.path.join(probing_dir, '{}'.format(task_name))

        length_threshold, test_lengths, test_mask = (
            probing_utils.split_by_length_threshold(df, FLAGS.test_set_size))
        train_dev_idxs = np.nonzero(~test_mask)[0]
        assert len(train_dev_idxs) > FLAGS.dev_set_size, 'Dev set size too large.'
        dev_idxs = random.sample(train_dev_idxs.tolist(), FLAGS.dev_set_size)
        train_idxs = list(set(train_dev_idxs) - set(dev_idxs))

        logging.info('Writing output to file: %s.', data_out_path)

        # Set new labels.
        df.loc[test_mask, 'set'] = 'te'
        df.loc[df.index[train_idxs], 'set'] = 'tr'
        df.loc[df.index[dev_idxs], 'set'] = 'va'

        os.makedirs(probing_dir)
        with open(settings_path, 'w') as settings_file:
            settings = {
                'task_name': task_name,
                'length_threshold': length_threshold,
                'all_lengths': sorted(set(df['text_len'])),
                'test_lengths': sorted(test_lengths),
                'train/dev_lengths':
                    sorted(set(df['text_len']) - set(test_lengths)),
                'train_size': len(train_idxs),
                'dev_size': len(dev_idxs),
                'test_size': int(test_mask.sum()),
                'test_mask': test_mask.values.tolist(),
            }
            logging.info('Settings:\n%r', settings)
            json.dump(settings, settings_file, indent=2)

        with open(data_out_path, 'w') as data_file:
            # Don't add quoting to retain the original format unaltered.
            df[['set', 'target', 'text']].to_csv(
                data_file,
                sep='\t',
                header=False,
                index=False,
                quoting=csv.QUOTE_NONE,
                doublequote=False)

def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    for task_name in FLAGS.tasks:
        logging.info('Starting task: %s.', task_name)
        df = probing_utils.read_senteval_data(FLAGS.senteval_path, task_name)

        experiment_base_dir = os.path.join(
            FLAGS.base_out_dir,
            '{}{}'.format(FLAGS.num_folds, FLAGS.split_name) + '-{}')
        skf = model_selection.StratifiedKFold(n_splits=FLAGS.num_folds)

        for current_fold_id, (train_indexes, test_indexes) in enumerate(
                skf.split(df['text'], df['target'])):
            split_dir = experiment_base_dir.format(current_fold_id)
            probing_dir = os.path.join(split_dir, 'probing')
            settings_path = os.path.join(probing_dir,
                                         '{}-settings.json'.format(task_name))
            data_out_path = os.path.join(probing_dir, '{}'.format(task_name))

            logging.info('Starting run: %d.', current_fold_id)

            # Use the same data for train and dev, because the probing code does
            # some hyperparameter search on dev. We don't want to tune on the
            # test portion.
            train_set = df.iloc[train_indexes].copy()
            train_set.loc[:, 'set'] = 'tr'
            dev_set = df.iloc[train_indexes].copy()
            dev_set.loc[:, 'set'] = 'va'
            test_set = df.iloc[test_indexes].copy()
            test_set.loc[:, 'set'] = 'te'
            new_data = pd.concat([train_set, dev_set, test_set],
                                 ignore_index=True)

            logging.info('Writing output to file: %s.', data_out_path)

            os.makedirs(probing_dir)
            with open(settings_path, 'w') as settings_file:
                settings = {
                    'task_name': task_name,
                    'fold_id': current_fold_id,
                    'train_size': len(train_indexes),
                    'dev_size': len(train_indexes),
                    'test_size': len(test_indexes),
                }
                logging.info('Settings:\n%r', settings)
                json.dump(settings, settings_file, indent=2)

            with open(data_out_path, 'w') as data_file:
                # Don't add quoting to retain the original format unaltered.
                new_data[['set', 'target', 'text']].to_csv(data_file,
                                                           sep='\t',
                                                           header=False,
                                                           index=False,
                                                           quoting=csv.QUOTE_NONE,
                                                           doublequote=False)

def download(self, url, save_directory):
    file_name = url.split('/')[-1]
    if not os.path.exists(save_directory):
        os.makedirs(save_directory)
    save_location = os.path.join(save_directory, file_name)
    r = self.session.get(url, stream=True)
    with open(save_location, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

def create_qr_codes(self, output_dir, overwrite=False):
    # Go through all the houses/contacts. If the qr month isn't populated,
    # create a qr code pointing to the desired link and with the desired
    # filename, save the code in the output_dir, and update the qr month.
    try:
        os.makedirs(output_dir)
    except OSError:
        pass

    raw_cmd = 'qrencode -o "{}/{}" "{}"'
    date_string = datetime.datetime.now().date().strftime('%Y%m')

    sheet_data = self.sheets_data[sc.sheet_names['houses']]
    for i, row in enumerate(sheet_data):
        if i == 0:
            continue
        if overwrite or not row[sc.house_data_header.index(sc.qr_date_column_name)]:
            cmd = raw_cmd.format(
                output_dir,
                row[sc.house_data_header.index(sc.qr_file_name_column_name)],
                row[sc.house_data_header.index(sc.unique_url_column_name)])
            rc = os.system(cmd)
            if rc != 0:
                pdb.set_trace()
                print('Error creating qr code with filename {}'.format(
                    row[sc.house_data_header.index(sc.qr_file_name_column_name)]))
                continue
            j = sheet_data[0].index(sc.qr_date_column_name)
            self.update_cell(i, j, date_string, sc.sheet_names['houses'])

    # TODO: import the house/contact data named tuple instead of indexing into
    # the header.

    # Contacts
    sheet_data = self.sheets_data[sc.sheet_names['contacts']]
    for i, row in enumerate(sheet_data):
        if i == 0:
            continue
        # At least address/email/links need to be populated.
        if (overwrite or not row[sc.contact_data_header.index(sc.qr_date_column_name)]) \
                and (row[sc.contact_data_header.index(sc.address_column_name)]
                     or row[sc.contact_data_header.index(sc.email_column_name)]
                     or row[sc.contact_data_header.index(sc.links_column_name)]
                     or row[sc.contact_data_header.index(sc.name_column_name)]):
            cmd = raw_cmd.format(
                output_dir,
                row[sc.contact_data_header.index(sc.qr_file_name_column_name)],
                row[sc.contact_data_header.index(sc.unique_url_column_name)])
            rc = os.system(cmd)
            if rc != 0:
                pdb.set_trace()
                print('Error creating qr code with filename {}'.format(
                    row[sc.contact_data_header.index(sc.qr_file_name_column_name)]))
                continue
            j = sheet_data[0].index(sc.qr_date_column_name)
            self.update_cell(i, j, date_string, sc.sheet_names['contacts'])

def savemesh(aa, bs=bs, nc=nc, kmin=0.1, R=1, savecat=False):
    '''Save HI mesh with modes removed as in the function "removemodes".'''
    pm = ParticleMesh(BoxSize=bs, Nmesh=[nc, nc, nc])
    hcat = assignHImass(aa, save=savecat)
    #mesh = pm.paint(hcat['Position'], mass=hcat['HImass'])
    mesh = hcat.to_mesh(BoxSize=bs, Nmesh=[nc, nc, nc],
                        position='Position', value='HImass').to_real_field()
    meshG = mesh.r2c().apply(
        lambda k, v: v * np.exp(-sum(ki**2 for ki in k) * R**2)).c2r()

    path = myscratch + sim + '/fastpm_%0.4f/' % aa + '/HImesh_N%04d' % (nc)
    try:
        os.makedirs(path)
    except OSError:
        pass
    if rank == 0:
        print('Save mesh to path \n', path)

    meshkpar, meshkparG = removemodes(mesh, kmin, R)

    mesh = FieldMesh(mesh)
    mesh.save(path, dataset='HI', mode='real')
    meshG = FieldMesh(meshG)
    meshG.save(path, dataset='HI-R%dp%d' % (int(R), (R * 10) % 10), mode='real')
    meshkpar = FieldMesh(meshkpar)
    dataset = 'kpar%dp%d' % (int(kmin), kmin * 10)
    # meshkpar.save(path, dataset=dataset, mode='real')
    meshkparG = FieldMesh(meshkparG)
    dataset = 'kpar%dp%d-R%dp%d' % (int(kmin), (kmin * 10) % 10,
                                    int(R), (R * 10) % 10)
    # meshkparG.save(path, dataset=dataset, mode='real')

import os
import random

import pandas as pd

in_dir = '../larger-processed/'
out_dir = '../larger-processed/'
try:
    os.makedirs(out_dir)
except OSError:
    pass

checks = pd.read_csv(in_dir + 'large-interventions-2015-12-20-checks.csv')
interventions = pd.read_csv(
    in_dir + 'large-interventions-2015-12-20-interventions.csv')

num = len(str(max(checks['intervention_num'])))

ids = checks['coin_name'].map(str) + '-' + checks['intervention_num'].map(
    lambda x: str(x).zfill(num))
checks['ids'] = ids

ids = interventions['coin_name'].map(str) + '-' + interventions[
    'intervention_num'].map(lambda x: str(x).zfill(num))
interventions['ids'] = ids

ids = list(set(checks['ids']))
random.shuffle(ids)

def run_fasttext(args) -> None:
    """
    Run the fasttext classifier with the given command line arguments.

    Parameters
    ----------
    args:
        The namespace of command line arguments generated by the fasttext
        argument parser.
    """
    if not path_exists(abs_path(args.train_file)):
        raise ValueError("Invalid input train file path given!")
    elif args.test_file and not path_exists(abs_path(args.test_file)):
        raise ValueError("Invalid input test file path given!")
    elif args.predict_test_split and not args.test_file:
        raise ValueError("Cannot predict on test_file without specifying its "
                         "path!")

    train_file_name = abs_path(args.train_file)
    classifier = ft.supervised(train_file_name, args.input_binary)

    output_file = None
    if args.metrics:
        from sys import stdout
        output_file = stdout
    else:
        if not path_exists(abs_path(args.output_dir)):
            make_dirs(abs_path(args.output_dir))
        output_file = open(
            abs_path(path_join(args.output_dir, args.metrics_file)), "w"
        )

    if args.test_file:
        test_file_name = abs_path(args.test_file)
        results = classifier.test(test_file_name)
        print("Number of test slices:", results.nexamples, file=output_file)
        print("Precision:", results.precision, file=output_file)
        print("Recall:", results.recall, file=output_file)
        print(file=output_file)

    if args.predict_strip or args.predict_test_split or args.predict_test_texts:
        print("Prediction results:\n", file=output_file)

    if args.predict_strip:
        print(fancy_prediction(classifier, args.predict_strip), file=output_file)
    elif args.predict_test_split and args.test_file:
        with open(test_file_name) as f:
            for line in f.read().splitlines():
                print(
                    fancy_prediction(classifier, ' '.join(line.split()[1:])),
                    '\n', sep='', file=output_file
                )
    elif args.predict_test_texts:
        with open(args.predict_test_texts) as f:
            for line in f.read().splitlines()[1:]:
                filename, label, tokens = line.split(',')
                print(filename, ':', sep='', file=output_file)
                print(fancy_prediction(classifier, tokens), '\n', sep='',
                      file=output_file)

def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    word_content_data = probing_utils.read_senteval_data(
        FLAGS.senteval_path, _TASK_NAME)
    all_sentences = list(
        probing_utils.get_strings_from_sharded_recordio(FLAGS.gutenberg_path))

    train_set = word_content_data.loc[word_content_data['set'] == 'tr', :]
    dev_set = word_content_data.loc[word_content_data['set'] == 'va', :]

    experiment_base_dir = os.path.join(FLAGS.base_out_dir,
                                       f'{FLAGS.split_name}-{{}}')
    for trial_id in range(FLAGS.num_trials):
        split_dir = experiment_base_dir.format(trial_id)
        probing_dir = os.path.join(split_dir, 'probing')
        settings_path = os.path.join(probing_dir, f'{_TASK_NAME}-settings.json')
        data_out_path = os.path.join(probing_dir, _TASK_NAME)

        logging.info('Starting run: %d.', trial_id)

        sampled_sentences = random.sample(all_sentences, FLAGS.test_set_size)
        data_sample = []
        for sentence in sampled_sentences:
            # Swap bigrams of 50% of the sentences.
            if random.randint(0, 1):
                tokens = sentence.split()
                index_to_swap = random.randint(0, len(tokens) - 2)
                tmp = tokens[index_to_swap + 1]
                tokens[index_to_swap + 1] = tokens[index_to_swap]
                tokens[index_to_swap] = tmp
                sentence = ' '.join(tokens)
                target = 'I'
            else:
                target = 'O'
            data_sample.append(('te', target, sentence))

        test_set = pd.DataFrame(data_sample, columns=train_set.columns)
        new_data = pd.concat([train_set, dev_set, test_set], ignore_index=True)

        logging.info('Writing output to file: %s.', data_out_path)

        os.makedirs(probing_dir)
        with open(settings_path, 'w') as settings_file:
            settings = {
                'task_name': _TASK_NAME,
                'trial_id': trial_id,
                'train_size': len(train_set),
                'dev_size': len(dev_set),
                'test_size': len(test_set),
            }
            logging.info('Settings:\n%r', settings)
            json.dump(settings, settings_file, indent=2)

        with open(data_out_path, 'w') as data_file:
            # Don't add quoting to retain the original format unaltered.
            new_data[['set', 'target', 'text']].to_csv(data_file,
                                                       sep='\t',
                                                       header=False,
                                                       index=False,
                                                       quoting=csv.QUOTE_NONE,
                                                       doublequote=False)

#!/usr/bin/env python
# coding: utf-8

# In[12]:

import os

get_ipython().run_line_magic('run', 'embeddings.ipynb')
get_ipython().run_line_magic('run', 'dataset.ipynb')
get_ipython().run_line_magic('run', 'label_generator.ipynb')

# **Parse raw data into different folders**

# In[23]:

if not os.path.exists('../data/parsed_data/cnn/'):
    os.makedirs('../data/parsed_data/cnn/')
if not os.path.exists('../data/parsed_data/dailymail'):
    os.makedirs('../data/parsed_data/dailymail')

parse_all('../data/neuralsum/cnn/', '../data/parsed_data/cnn/')
parse_all('../data/neuralsum/dailymail/', '../data/parsed_data/dailymail/')

# **Convert parsed training documents to embedding indices**

# In[24]:

if not os.path.exists('../data/processed/cnn/'):
    os.makedirs('../data/processed/cnn/')
if not os.path.exists('../data/processed/dailymail'):
    os.makedirs('../data/processed/dailymail')

data_converter = NeuralSumToEmbedding(
    '/home/ramkishore.s/meta/glove/glove.6B.200d.txt')
data_converter.root_convert('../data/parsed_data/cnn/', '../data/processed/cnn/')
data_converter.root_convert('../data/parsed_data/dailymail/',
                            '../data/processed/dailymail/')

def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    for task_name in FLAGS.tasks:
        logging.info('Starting task: %s.', task_name)
        df = probing_utils.read_senteval_data(FLAGS.senteval_path, task_name)
        df['text_len'] = df['text'].str.split().apply(len)

        experiment_base_dir = os.path.join(FLAGS.base_out_dir,
                                           FLAGS.split_name) + '-{}'

        for trial_id in range(FLAGS.trial_count):
            split_dir = experiment_base_dir.format(trial_id)
            probing_dir = os.path.join(split_dir, 'probing')
            settings_path = os.path.join(probing_dir,
                                         '{}-settings.json'.format(task_name))
            data_out_path = os.path.join(probing_dir, '{}'.format(task_name))

            logging.info('Starting run: %d.', trial_id)
            test_lengths, test_mask = probing_utils.split_by_random_length(
                df, FLAGS.test_set_size)
            train_dev_indexes = np.nonzero(~test_mask)[0]
            dev_indexes = random.sample(train_dev_indexes.tolist(),
                                        FLAGS.dev_set_size)
            train_indexes = list(set(train_dev_indexes) - set(dev_indexes))

            logging.info('Writing output to file: %s.', data_out_path)

            # Set new labels.
            df.loc[test_mask, 'set'] = 'te'
            df.loc[df.index[train_indexes], 'set'] = 'tr'
            df.loc[df.index[dev_indexes], 'set'] = 'va'

            os.makedirs(probing_dir)
            with open(settings_path, 'w') as settings_file:
                settings = {
                    'task_name': task_name,
                    'trial_id': trial_id,
                    'all_lengths': sorted(set(df['text_len'])),
                    'test_lengths': sorted(test_lengths),
                    'train/dev_lengths':
                        sorted(set(df['text_len']) - set(test_lengths)),
                    'train_size': len(train_indexes),
                    'dev_size': len(dev_indexes),
                    'test_size': int(test_mask.sum()),
                    'test_mask': test_mask.values.tolist(),
                }
                logging.info('Settings:\n%r', settings)
                json.dump(settings, settings_file, indent=2)

            with open(data_out_path, 'w') as data_file:
                # Don't add quoting to retain the original format unaltered.
                df[['set', 'target', 'text']].to_csv(data_file,
                                                     sep='\t',
                                                     header=False,
                                                     index=False,
                                                     quoting=csv.QUOTE_NONE,
                                                     doublequote=False)

print("\tgunzipping", fastqfile) print( run_timed( [ "gunzip", "-q", fastqfile] ) ) print("\n>>> GUNZIP: unzipping _R2.fastq.gz files:") for fastqfile in samples2: print("\tgunzipping", fastqfile) print( run_timed( [ "gunzip", "-q", fastqfile] ) ) # strip the .gz here? unzipped1 = [f.replace('.gz', '') for f in samples1] unzipped2 = [f.replace('.gz', '') for f in samples2] # TRIMMOMATIC to remove unwanted sequences # FASTQC to determine quality print("\n>>> TRIMMOMATIC: Trimming excess sequences from .fastq file") os.make_dirs("%s01_trimmomatic" % outputdir, exit_ok=True) # FASTQC to determine quality print("\n>>> FASTQC: analyzing quality of each .fastq file") os.make_dirs( os.path.join(outputdir , "02_fastqc" ), exist_ok=True) for samplename, unzippedfile1, unzippedfile2 in zip(names, unzipped1, unzipped2): #Make output directories sample_1_outputdir = os.path.join( outputdir , "02_fastqc/" , samplename + "_1" ) sample_2_outputdir = os.path.join( outputdir , "02_fastqc/" , samplename + "_2" ) inputfile = os.path.join(inputdir, unzippedfile1) # R1 cmd1_args = [ 'fastqc', '-o', sample_1_outputdir, "-t 20", inputfile ]
cc = "g++" #scheme_src = """ #zengine/scheme/bignum.c #zengine/scheme/eval.c #zengine/scheme/gc.c #zengine/scheme/opcodes.c #zengine/scheme/sexp.c #zengine/scheme/vm.c #zengine/scheme/include/chibi/sexp-hufftabs.c #""" scheme_src = "" ############################ #create tempdir if not os.path.exists(temp_dir): os.make_dirs(temp_dir) ######################################### #split line cpp_files = [] for one_dir in dirs: cpp_files = cpp_files + [ one_dir + "/" + cpp for cpp in os.listdir(one_dir) if cpp.endswith('.cpp') or cpp.endswith('.c') or cpp.endswith('.cc') ] cpp_files = cpp_files + scheme_src.split() get_temp_o = lambda cpp: temp_dir + "/" + cpp.replace("./", "").replace( ".cpp", ".o").replace(".cc", ".o").replace(".c", ".o").replace("/", ".")
""" cosmic_gene_list cosmic_raw_data_folder - remember to add slash at the end """ cosmic_gene_list = sys.argv[1] base_folder = global_stuff.base_folder cosmic_raw_data_folder = sys.argv[2] f = open(cosmic_gene_list, 'r') for line in f: gene = line.strip() print gene folder = base_folder + gene + '/' try: os.makedirs(folder) except Exception, err: print err # find the sequence in cosmic downloaded seq folder seq_folder = cosmic_raw_data_folder + gene[0].upper() + '/' try: os.make_dirs(seq_folder) except Exception, err: pass seq_file = seq_folder + gene + '_protein.txt' new_seq_file = folder + 'seq' subprocess.call(['cp', seq_file, new_seq_file]) f.close()
# Python program to convert
# text file to pdf file

from fpdf import FPDF
import os

"""import glob

# file name in array
file_path = glob.glob("./code_file/*")
"""

file_path = []
if not os.path.exists('code_file'):
    os.makedirs('code_file')
for dir in os.listdir('code_file'):
    file = './code_file/' + dir
    file_path.append(file)

# save FPDF() class into
# a variable pdf
pdf = FPDF()
#print(file_path)

# Add a page
pdf.add_page()

# set style and size of font
# that you want in the pdf
pdf.set_font("Arial", size=15)

# open the text file in read mode
for f in file_path:
    f = open(f, "r")

def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    word_content_data = probing_utils.read_senteval_data(
        FLAGS.senteval_path, _TASK_NAME)
    target_words = set(word_content_data['target'])
    senteval_sentences = set(word_content_data['text'])
    target_word_to_sentences = probing_utils.get_target_word_to_sentence_mapping(
        target_words, senteval_sentences,
        probing_utils.get_strings_from_sharded_recordio(FLAGS.gutenberg_path))

    class_count = len(target_words)
    items_per_class = FLAGS.test_set_size // class_count
    target_word_counts = {
        k: len(v) for k, v in target_word_to_sentences.items()
    }
    valid_target_words = {
        k for k, v in target_word_counts.items() if v >= items_per_class
    }
    logging.info('Number of target words for which we have enough data: %d.',
                 len(valid_target_words))
    assert valid_target_words

    train_set = word_content_data.loc[word_content_data['set'] == 'tr', :]
    dev_set = word_content_data.loc[word_content_data['set'] == 'va', :]

    experiment_base_dir = os.path.join(FLAGS.base_out_dir,
                                       f'{FLAGS.split_name}-{{}}')
    for trial_id in range(FLAGS.num_trials):
        split_dir = experiment_base_dir.format(trial_id)
        probing_dir = os.path.join(split_dir, 'probing')
        settings_path = os.path.join(probing_dir, f'{_TASK_NAME}-settings.json')
        data_out_path = os.path.join(probing_dir, _TASK_NAME)

        logging.info('Starting run: %d.', trial_id)

        data_sample = []
        for valid_target_word in valid_target_words:
            sentences = target_word_to_sentences[valid_target_word]
            current_sample = random.sample(sentences, items_per_class)
            data_sample.extend(
                ('te', valid_target_word, sample) for sample in current_sample)

        test_set = pd.DataFrame(data_sample, columns=train_set.columns)
        new_data = pd.concat([train_set, dev_set, test_set], ignore_index=True)

        logging.info('Writing output to file: %s.', data_out_path)

        os.makedirs(probing_dir)
        with open(settings_path, 'w') as settings_file:
            settings = {
                'task_name': _TASK_NAME,
                'trial_id': trial_id,
                'train_size': len(train_set),
                'dev_size': len(dev_set),
                'test_size': len(test_set),
                'valid_target_words_size': len(valid_target_words),
                'valid_target_words': sorted(valid_target_words),
            }
            logging.info('Settings:\n%r', settings)
            json.dump(settings, settings_file, indent=2)

        with open(data_out_path, 'w') as data_file:
            # Don't add quoting to retain the original format unaltered.
            new_data[['set', 'target', 'text']].to_csv(data_file,
                                                       sep='\t',
                                                       header=False,
                                                       index=False,
                                                       quoting=csv.QUOTE_NONE,
                                                       doublequote=False)

def concatenate(enc_dir1, enc_dir2, output_dir):
    def _check_consistency(enc_dir1, enc_dir2):
        mp1 = _get_model_name(enc_dir1)
        mp2 = _get_model_name(enc_dir2)
        if mp1 != mp2:
            raise RuntimeError("Inconsistent model names, %s and %s" % (mp1, mp2))
        et1 = _get_enc_transform(enc_dir1)
        et2 = _get_enc_transform(enc_dir2)
        if et1 != et2:
            raise RuntimeError("Inconsistent encoding transforms, %s and %s"
                               % (et1, et2))

    def _check_paths(enc_dir):
        paths = [
            pj(enc_dir, "model_path.txt"),
            pj(enc_dir, "encoding_chip_list.txt"),
            pj(enc_dir, "encoding_transform.txt"),
            pj(enc_dir, "encoding.npy")
        ]
        for p in paths:
            if not pe(p):
                raise RuntimeError("Expecting to find path %s but it does not "
                                   "exist" % (p))

    def _get_enc_transform(enc_dir):
        with open(pj(enc_dir, "encoding_transform.txt")) as fp:
            enc_tr = next(fp).strip()
        return enc_tr

    def _get_files(enc_dir):
        files = []
        with open(pj(enc_dir, "encoding_chip_list.txt")) as fp:
            for line in fp:
                files.append(line.strip())
        return files

    def _get_model_name(enc_dir):
        with open(pj(enc_dir, "model_path.txt")) as fp:
            mp = next(fp).strip()
        while mp.endswith("/"):
            mp = mp[:-1]
        name = os.path.basename(mp)
        return name

    def _write_ancillary(enc_dir1, enc_dir2, output_dir):
        shutil.copy(pj(enc_dir1, "encoding_transform.txt"), output_dir)
        shutil.copy(pj(enc_dir1, "model_path.txt"), output_dir)
        with open(pj(output_dir, "original_encodings.txt"), "w") as fp:
            fp.write(enc_dir1 + "\n")
            fp.write(enc_dir2 + "\n")

    def _write_chip_list(enc_dir1, enc_dir2, output_dir):
        files1 = _get_files(enc_dir1)
        files2 = _get_files(enc_dir2)
        files = files1 + files2
        with open(pj(output_dir, "encoding_chip_list.txt"), "w") as fp:
            for f in files:
                fp.write(f + "\n")

    def _write_encs(enc_dir1, enc_dir2, output_dir):
        encs1 = np.load(pj(enc_dir1, "encoding.npy"))
        encs2 = np.load(pj(enc_dir2, "encoding.npy"))
        encs = np.concatenate((encs1, encs2))
        np.save(pj(output_dir, "encoding.npy"), encs)

    _check_paths(enc_dir1)
    _check_paths(enc_dir2)
    _check_consistency(enc_dir1, enc_dir2)
    if not pe(output_dir):
        os.makedirs(output_dir)
    _write_encs(enc_dir1, enc_dir2, output_dir)
    _write_chip_list(enc_dir1, enc_dir2, output_dir)
    _write_ancillary(enc_dir1, enc_dir2, output_dir)