def run_inference(model):
    '''
    Run inference on tweet embeddings and save probabilities.

    [model] : scikit-learn model object
    '''
    n_batches = len(glob(abspath(join(cf.TWEETS_DIR, '*.npz'))))
    print('{} batches found...'.format(n_batches))

    # iterate through batches, skipping any whose output already exists
    for i in tqdm(range(n_batches)):
        output_file_path = abspath(
            join(cf.TWEETS_DIR, 'probabilities{}.npy'.format(i)))
        if is_available(output_file_path):
            print('\tProbabilities already calculated...')
            continue
        probs = infer_single_batch(i)
        np.save(output_file_path, probs)
    return None
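# The is_available helper used throughout these functions is defined
# elsewhere in the project; the sketch below assumes it is a plain
# existence check, and the joblib path in the usage example is
# hypothetical rather than the project's actual file name.
from os.path import exists

import joblib


def is_available(file_path):
    # A file that already exists means the step completed earlier,
    # so the caller can skip recomputing it.
    return exists(file_path)


# Hypothetical invocation: restore a previously fitted scikit-learn
# classifier and score every embedding batch found on disk.
classifier = joblib.load('models/classifier.joblib')
run_inference(classifier)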
def extract_questions_embeddings(model, tokenizer):
    '''
    Extract Language-agnostic BERT Sentence Embeddings of the 'Questions' dataset.

    [model]     : tf.keras.Model
    [tokenizer] : BERT tokenizer
    '''
    # load 'Questions' dataset
    questions = read_questions_dataset()

    # skip extraction if the embeddings file is already available
    output_file_path = abspath(join(cf.QUESTIONS_DIR, cf.questions_embeddings))
    if is_available(output_file_path):
        print('\tFile already available...')
        return None

    # extract LaBSE embeddings and save them
    embeddings = encode(list(questions['text']), model, tokenizer).numpy()
    np.save(output_file_path, embeddings)
    print('Embeddings extracted and saved.')
    return None
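# The encode helper is not shown in this module. A rough sketch of one
# way it could be written, assuming a Hugging Face TFBertModel-style
# LaBSE export (the project may instead use the TF Hub model with its
# own BERT tokenizer), is given below.
import tensorflow as tf


def encode(sentences, model, tokenizer, max_seq_length=64):
    # Tokenize the batch into fixed-length TensorFlow tensors
    inputs = tokenizer(sentences,
                       padding='max_length',
                       truncation=True,
                       max_length=max_seq_length,
                       return_tensors='tf')
    outputs = model(inputs)
    # LaBSE sentence embeddings are the pooled [CLS] vectors, L2-normalized
    # so that dot products between them behave like cosine similarities
    return tf.math.l2_normalize(outputs.pooler_output, axis=1)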
def extract_tweet_embeddings(model, tokenizer):
    '''
    Extract Language-agnostic BERT Sentence Embeddings of tweets.
    Processes the tweets in batches so they fit into GPU memory.

    [model]     : tf.keras.Model
    [tokenizer] : BERT tokenizer
    '''
    # read tweets
    tweets = pd.read_csv(abspath(join(cf.TWEETS_DIR, cf.cleaned_tweets_file)),
                         header=None,
                         names=['date', 'tweet'])
    tweets.dropna(inplace=True)
    print(tweets.shape)

    # split the tweet list into chunks that fit into GPU memory
    tweet_chunks = list(chunks(list(tweets['tweet']), cf.batch_size))
    print('Data has been split into {} batches of size {}.'.format(
        len(tweet_chunks), cf.batch_size))

    # iterate over batches and extract sentence embeddings
    for batch_id, batch in enumerate(tqdm(tweet_chunks)):
        print('Processing batch {}...'.format(batch_id))
        output_file_path = abspath(
            join(cf.TWEETS_DIR, 'compressed_batch{}.npz'.format(batch_id)))
        if is_available(output_file_path):
            print('\tBatch already extracted...')
            continue
        # split the batch further so each forward pass stays small
        smaller_chunks = chunks(batch, int(cf.batch_size / cf.split_size))
        embeddings = np.dstack([
            encode(smaller_batch, model, tokenizer).numpy()
            for smaller_batch in smaller_chunks
        ])  # 3D numpy array
        np.savez_compressed(output_file_path, embeddings)
    print('Embeddings extracted and saved.')
    return None
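# The chunks utility splits a list into consecutive slices of a fixed
# size; a minimal sketch is shown below (the project's own version may
# differ). Note that np.dstack above requires every sub-batch to have
# the same number of rows, so cf.batch_size is presumably an exact
# multiple of cf.split_size.
def chunks(items, chunk_size):
    # Yield consecutive slices of at most chunk_size elements;
    # the final slice may be shorter than the rest.
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]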
    for index_file in indexlist:
        if index_file != '':
            download_file(index_file, target_dir, aspera)


if __name__ == '__main__':
    parser = set_parser()
    args = parser.parse_args()

    accession = args.accession
    format = args.format
    dest_dir = args.dest
    fetch_meta = args.meta
    fetch_index = args.index
    aspera = args.aspera

    if not utils.is_run(accession) and not utils.is_experiment(accession):
        print('Error: Invalid accession. An INSDC run or experiment accession must be provided')
        sys.exit(1)

    if not utils.is_available(accession):
        print('Record does not exist or is not available for accession provided')
        sys.exit(1)

    try:
        download_files(accession, format, dest_dir, fetch_index, fetch_meta, aspera)
        print('Completed')
    except Exception:
        utils.print_error()
        sys.exit(1)
def check_availability(accession, output_format):
    if not utils.is_available(accession, output_format):
        sys.stderr.write(
            'ERROR: Record does not exist or is not available for accession provided\n')
        sys.exit(1)
    extract_wgs = args.extract_wgs
    expanded = args.expanded
    fetch_meta = args.meta
    fetch_index = args.index
    aspera = args.aspera
    aspera_settings = args.aspera_settings

    if aspera or aspera_settings is not None:
        aspera = utils.set_aspera(aspera_settings)

    try:
        if utils.is_wgs_set(accession):
            if output_format is not None:
                sequenceGet.check_format(output_format)
            sequenceGet.download_wgs(dest_dir, accession, output_format)
        elif not utils.is_available(accession, output_format):
            sys.stderr.write(
                'ERROR: Record does not exist or is not available for accession provided\n')
            sys.exit(1)
        elif utils.is_sequence(accession):
            if output_format is not None:
                sequenceGet.check_format(output_format)
            sequenceGet.download_sequence(dest_dir, accession, output_format, expanded)
        elif utils.is_analysis(accession):
            if output_format is not None:
                readGet.check_read_format(output_format)
            readGet.download_files(accession, output_format, dest_dir,
                                   fetch_index, fetch_meta, aspera)
        elif utils.is_run(accession) or utils.is_experiment(accession):
                          shuffle=True,
                          batch_size=batch_size,
                          num_workers=num_workers)
valid_loader = DataLoader(dataset=valid_dataset,
                          shuffle=False,
                          batch_size=valid_batch_size,
                          num_workers=num_workers)
print('data_loader end......')

# model
print('model load......')
# net = getattr(import_module('torchvision.models'), module_name)
net = getattr(import_module('lightcnn'), module_name)
model = net(num_classes=num_classes, channels=18)
is_available(model)
# is_adaptive(model, fc_num=fc_num, num_classes=num_classes, num_channels=num_channels)
# channels_conv(model, fc_num=fc_num, num_classes=num_classes)
# load(model, train_mode, pretrained_path=pretrained_path)

# In[28]:

# optimizer
print('create optimizer......')
optimizer = torch.optim.Adam(model.parameters(),
                             lr=lr,
                             weight_decay=weight_decay)
is_parallel(optimizer)

# loss
print('create loss......')
# criterion = nn.CrossEntropyLoss(weight=class_weights)
# nn.MSELoss()
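# is_available and is_parallel are project helpers here, not library
# calls. A plausible sketch of the device-placement helper, assuming it
# simply moves the model to the GPU whenever CUDA is present:
import torch


def is_available(model):
    # Move the model to the GPU when CUDA is available; otherwise leave it on the CPU
    if torch.cuda.is_available():
        model.cuda()
    return model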
                          batch_size=batch_size,
                          num_workers=num_workers)
valid_loader = DataLoader(dataset=valid_dataset,
                          shuffle=False,
                          batch_size=valid_batch_size,
                          num_workers=num_workers)
# print('data_loader end......')

# print('model load......')
net = getattr(import_module('torchvision.models'), module_name)
model = net(num_classes=num_classes)
is_adaptive(model,
            fc_num=fc_num,
            num_classes=num_classes,
            num_channels=num_channels)
is_available(model)
model = is_parallel(model)
# load(model, train_mode, pretrained_path=pretrained_path)

# In[28]:

# optimizer
# print('create optimizer......')
optimizer = torch.optim.Adam(model.parameters(),
                             lr=lr,
                             weight_decay=weight_decay)
optimizer = is_parallel(optimizer)

# loss
print('create loss......')
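# is_adaptive appears to rewrite the classifier head (and the input
# convolution) of the torchvision model so it matches the dataset's
# class and channel counts. A rough sketch, assuming a ResNet-style
# architecture with .fc and .conv1 attributes and fc_num as the number
# of input features of the final layer:
import torch.nn as nn


def is_adaptive(model, fc_num, num_classes, num_channels):
    # Replace the final fully connected layer to output num_classes logits
    model.fc = nn.Linear(fc_num, num_classes)
    # Replace the input convolution so it accepts num_channels input planes
    # (kernel/stride/padding copied from the standard ResNet stem)
    model.conv1 = nn.Conv2d(num_channels, 64, kernel_size=7,
                            stride=2, padding=3, bias=False)
    return model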