def run_inference(model):
    '''
    Run inference on tweet embeddings and save probabilities.
    [model] : scikit-learn model object
    '''

    n_batches = len(glob(abspath(join(cf.TWEETS_DIR, '*.npz'))))
    print('{} batches found...'.format(n_batches))

    for i in tqdm(range(n_batches)):  # iterate through batches
        output_file_path = abspath(
            join(cf.TWEETS_DIR, 'probabilities{}.npy'.format(i)))
        if is_available(output_file_path):
            print('\tProbabilities already calculated...')
            continue

        # run inference only for batches that have not been processed yet
        probs = infer_single_batch(i)
        np.save(output_file_path, probs)

    return None
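Note: the snippet skips batches whose output already exists via an `is_available` helper and delegates the actual prediction to `infer_single_batch`, neither of which is shown here. A minimal sketch of such a file-availability check, assuming it only tests whether the file has already been written (name and behaviour inferred from the call sites, not taken from the source):

from os.path import isfile

def is_available(file_path):
    '''
    Hypothetical sketch: report whether [file_path] already exists on disk,
    so previously computed results are not recomputed.
    '''
    return isfile(file_path)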
Example #2
def extract_questions_embeddings(model, tokenizer):
    '''
    Extract Language-agnostic BERT Sentence Embeddings of 'Questions' dataset.
    [model]     : tf.keras.Model
    [tokenizer] : BERT tokenizer
    '''
    # load 'Questions' dataset
    questions = read_questions_dataset()

    # extract LaBSE embeddings
    output_file_path = abspath(join(cf.QUESTIONS_DIR, cf.questions_embeddings))
    if is_available(output_file_path):
        print('\tFile already available...')
        return None

    embeddings = encode(list(questions['text']), model, tokenizer).numpy()
    np.save(output_file_path, embeddings)

    print('Embeddings extracted and saved.')

    return None
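This example and the next rely on an `encode` helper (not shown) that maps raw strings to LaBSE sentence embeddings. A hedged sketch of one plausible implementation, assuming a Hugging Face style tokenizer that returns TensorFlow tensors and a model that exposes a pooled output; the exact field names depend on how the LaBSE model was loaded:

import tensorflow as tf

def encode(sentences, model, tokenizer, max_len=64):
    '''
    Hypothetical sketch: tokenize [sentences] and return L2-normalized
    sentence embeddings as a tf.Tensor.
    [model]     : tf.keras.Model (assumed to expose a pooled output)
    [tokenizer] : BERT tokenizer (assumed Hugging Face interface)
    '''
    inputs = tokenizer(sentences,
                       padding=True,
                       truncation=True,
                       max_length=max_len,
                       return_tensors='tf')
    outputs = model(inputs)
    # LaBSE embeddings are typically L2-normalized before being compared
    return tf.math.l2_normalize(outputs.pooler_output, axis=1)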
Example #3
def extract_tweet_embeddings(model, tokenizer):
    '''
    Extract Language-agnostic BERT Sentence Embeddings of tweets.
    Processes in batches to fit into GPU memory.
    [model]     : tf.keras.Model
    [tokenizer] : BERT tokenizer
    '''

    # read tweets
    tweets = pd.read_csv(abspath(join(cf.TWEETS_DIR, cf.cleaned_tweets_file)),
                         header=None,
                         names=['date', 'tweet'])
    tweets.dropna(inplace=True)
    print(tweets.shape)

    # split the tweets list into chunks to be able to fit into GPU memory
    tweet_chunks = list(chunks(list(tweets['tweet']), cf.batch_size))
    print('Data has been split into {} batches of size {}.'.format(
        len(tweet_chunks), cf.batch_size))

    # iterate over batches and extract sentence embeddings
    for i, batch in tqdm(enumerate(tweet_chunks), total=len(tweet_chunks)):
        print('Processing batch {}...'.format(i))
        output_file_path = abspath(
            join(cf.TWEETS_DIR, 'compressed_batch{}.npz'.format(i)))
        if is_available(output_file_path):
            print('\tBatch already extracted...')
            continue

        # split the batch further so each forward pass fits into GPU memory
        smaller_chunks = chunks(batch, int(cf.batch_size / cf.split_size))
        embeddings = np.dstack([
            encode(smaller_batch, model, tokenizer).numpy()
            for smaller_batch in smaller_chunks
        ])  # 3D numpy array

        np.savez_compressed(output_file_path, embeddings)
    print('Embeddings extracted and saved.')

    return None
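The batching above goes through a `chunks` helper that is not part of the snippet. A common way to write such a generator, assuming it yields consecutive slices of at most [size] elements:

def chunks(items, size):
    '''
    Hypothetical sketch: yield consecutive slices of [items],
    each containing at most [size] elements.
    '''
    for start in range(0, len(items), size):
        yield items[start:start + size]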
Example #4
        for index_file in indexlist:
            if index_file != '':
                download_file(index_file, target_dir, aspera)


if __name__ == '__main__':
    parser = set_parser()
    args = parser.parse_args()

    accession = args.accession
    format = args.format
    dest_dir = args.dest
    fetch_meta = args.meta
    fetch_index = args.index
    aspera = args.aspera

    if not utils.is_run(accession) and not utils.is_experiment(accession):
        print('Error: Invalid accession. An INSDC run or experiment accession must be provided')
        sys.exit(1)

    if not utils.is_available(accession):
        print('Record does not exist or is not available for accession provided')
        sys.exit(1)

    try:
        download_files(accession, format, dest_dir, fetch_index, fetch_meta, aspera)
        print('Completed')
    except Exception:
        utils.print_error()
        sys.exit(1)
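Examples #4 through #6 come from an ENA download tool and call `utils.is_available(accession)` before fetching any files. The implementation is not shown here; the sketch below only illustrates the idea, assuming availability is checked by querying the ENA browser API and inspecting the response (the endpoint, the optional [output_format] argument handling, and the logic are assumptions, not the tool's actual code):

import requests

# Assumed endpoint; the real tool may use a different URL or record format.
ENA_RECORD_URL = 'https://www.ebi.ac.uk/ena/browser/api/xml/{accession}'

def is_available(accession, output_format=None):
    '''
    Hypothetical sketch: treat the accession as available if the ENA browser
    returns a non-empty record for it. [output_format] is accepted only to
    mirror the call sites above and is ignored here.
    '''
    response = requests.get(ENA_RECORD_URL.format(accession=accession))
    return response.status_code == 200 and len(response.content) > 0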
Example #5
def check_availability(accession, output_format):
    if not utils.is_available(accession, output_format):
        sys.stderr.write(
            'ERROR: Record does not exist or is not available for accession provided\n'
        )
        sys.exit(1)
Example #6
    extract_wgs = args.extract_wgs
    expanded = args.expanded
    fetch_meta = args.meta
    fetch_index = args.index
    aspera = args.aspera
    aspera_settings = args.aspera_settings

    if aspera or aspera_settings is not None:
        aspera = utils.set_aspera(aspera_settings)

    try:
        if utils.is_wgs_set(accession):
            if output_format is not None:
                sequenceGet.check_format(output_format)
            sequenceGet.download_wgs(dest_dir, accession, output_format)
        elif not utils.is_available(accession, output_format):
            sys.stderr.write(
                'ERROR: Record does not exist or is not available for accession provided\n'
            )
            sys.exit(1)
        elif utils.is_sequence(accession):
            if output_format is not None:
                sequenceGet.check_format(output_format)
            sequenceGet.download_sequence(dest_dir, accession, output_format,
                                          expanded)
        elif utils.is_analysis(accession):
            if output_format is not None:
                readGet.check_read_format(output_format)
            readGet.download_files(accession, output_format, dest_dir,
                                   fetch_index, fetch_meta, aspera)
        elif utils.is_run(accession) or utils.is_experiment(accession):
Example #7
                          shuffle=True,
                          batch_size=batch_size,
                          num_workers=num_workers)
valid_loader = DataLoader(dataset=valid_dataset,
                          shuffle=False,
                          batch_size=valid_batch_size,
                          num_workers=num_workers)
print('data_loader end......')

# model
print('model load......')
# net = getattr(import_module('torchvision.models'), module_name)
net = getattr(import_module('lightcnn'), module_name)

model = net(num_classes=num_classes, channels=18)
is_available(model)
# is_adaptive(model,fc_num=fc_num,num_classes=num_classes,num_channels = num_channels)
# channels_conv(model,fc_num=fc_num,num_classes=num_classes)
# load(model,train_mode,pretrained_path=pretrained_path)

# In[28]:

# optimizer
print('create optimizer......')
optimizer = torch.optim.Adam(model.parameters(),
                             lr=lr,
                             weight_decay=weight_decay)
is_parallel(optimizer)
# loss
print('create loss......')
# criterion = nn.CrossEntropyLoss(weight=class_weights)  # nn.MSELoss()
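In this snippet and the next, `is_available(model)` and `is_parallel(...)` refer to CUDA availability and multi-GPU wrapping rather than file checks. Neither helper is defined in the excerpts; a hedged sketch of what they plausibly do using standard PyTorch idioms (signatures inferred from the calls, not taken from the source):

import torch
import torch.nn as nn

def is_available(model):
    '''Hypothetical sketch: move [model] to the GPU when CUDA is available.'''
    if torch.cuda.is_available():
        model.cuda()
    return model

def is_parallel(obj):
    '''
    Hypothetical sketch: wrap an nn.Module in DataParallel when several GPUs
    are present; anything else (e.g. an optimizer) is returned unchanged.
    '''
    if isinstance(obj, nn.Module) and torch.cuda.device_count() > 1:
        return nn.DataParallel(obj)
    return obj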
Example #8
                          batch_size=batch_size,
                          num_workers=num_workers)
valid_loader = DataLoader(dataset=valid_dataset,
                          shuffle=False,
                          batch_size=valid_batch_size,
                          num_workers=num_workers)
# print('data_loader end......')
#
# print('model load......')
net = getattr(import_module('torchvision.models'), module_name)
model = net(num_classes=num_classes)
is_adaptive(model,
            fc_num=fc_num,
            num_classes=num_classes,
            num_channels=num_channels)
is_available(model)
model = is_parallel(model)
# load(model,train_mode,pretrained_path=pretrained_path)
#
#
#
# # In[28]:
#
# # optimizer
# print('create optimizer......')
optimizer = torch.optim.Adam(model.parameters(),
                             lr=lr,
                             weight_decay=weight_decay)
optimizer = is_parallel(optimizer)
# loss
print('create loss......')