def process_files(first_file, second_file):
    """Process input files for comparison."""
    tokens_1 = utils.tokenize_text(first_file, True)
    tokens_2 = utils.tokenize_text(second_file, True)
    tc_scr1, tc_scr2, tokens = token_comparison(tokens_1, tokens_2)
    fuz_scr1, fuz_scr2 = fuzzy_comparison(tokens[0], tokens[1])
    # Average the token-overlap score and the fuzzy score for each direction.
    scr_1 = (tc_scr1 + fuz_scr1) / 2
    scr_2 = (tc_scr2 + fuz_scr2) / 2
    print("First-->Second Score: " + str(scr_1) +
          " Second-->First Score: " + str(scr_2))
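# token_comparison and fuzzy_comparison are not shown in the snippet above.
# Purely as a sketch of the interface shape, a difflib-based stand-in for
# fuzzy_comparison might look like this (the original helper may differ):
from difflib import SequenceMatcher

def fuzzy_comparison(tokens_a, tokens_b):
    # ratio() accepts any sequences of hashable items, token lists included.
    ab = SequenceMatcher(None, tokens_a, tokens_b).ratio()
    ba = SequenceMatcher(None, tokens_b, tokens_a).ratio()
    return ab, ba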
def predict_top_p(args, device, net, text, vocabulary,
                  num_return_sequences=5, n_out=1):
    net.eval()
    normalized_text = text  # normalize_text(text)
    if args.use_python_vocabulary:
        tokens = tokenize_text(normalized_text)
        ix = torch.tensor([[vocabulary.to_index(w) for w in tokens]]).to(device)
    else:
        ix = torch.tensor([vocabulary.encode(normalized_text).ids[-100:]]).to(device)
    if any(mn in args.model_name for mn in args.transformers_models):
        output = net.generate(input_ids=ix,
                              max_length=len(ix[0]) + n_out,
                              temperature=1.0,
                              top_k=0,
                              top_p=0.9,
                              do_sample=True,
                              num_return_sequences=num_return_sequences,
                              early_stopping=True)
    else:
        # Truncate the context to a multiple of the Sinkhorn bucket size,
        # capped at 100 tokens.
        cn = len(ix[0]) // args.sinkhorn_bucket_size
        cn = cn * args.sinkhorn_bucket_size
        cn = min(cn, 100)
        ix = ix[:, -cn:]
        output = net.generate(input_ids=ix,
                              max_length=len(ix[0]) + n_out,
                              temperature=1.0,
                              top_k=0,
                              top_p=0.9,
                              do_sample=True,
                              num_return_sequences=num_return_sequences,
                              early_stopping=True,
                              vocab_size=args.vocab_size)
    # result = []
    for choice in output:
        if args.use_python_vocabulary:
            words = [vocabulary.to_word(x) for x in choice.tolist()]
            print(' '.join(words))
        else:
            words = vocabulary.decode(choice.tolist())
            print(words)
        print('-' * 30)
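# Not part of the snippet above: a minimal sketch of the nucleus (top-p)
# filtering that generate(top_p=0.9, do_sample=True) performs internally,
# assuming plain PyTorch and a 1-D logits vector. Logits outside the smallest
# set of tokens whose cumulative probability exceeds top_p are masked out
# before sampling.
import torch
import torch.nn.functional as F

def top_p_filter(logits, top_p=0.9):
    sorted_logits, sorted_idx = torch.sort(logits, descending=True)
    cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    remove = cum_probs > top_p
    # Shift right so the token that first crosses the threshold is kept.
    remove[..., 1:] = remove[..., :-1].clone()
    remove[..., 0] = False
    logits = logits.clone()
    logits[sorted_idx[remove]] = float('-inf')
    return logits

# Usage: next_id = torch.multinomial(F.softmax(top_p_filter(logits), dim=-1), 1)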
def extract_location(text, candidate, verbose=False):
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if verbose:
            print("Device: cuda selected")
    else:
        device = torch.device("cpu")
        if verbose:
            print("Device: cpu selected")
    load = initialize_model(verbose=verbose)
    embedding = load["embedding"]
    if candidate not in embedding.word_to_ix:
        raise ValueError("Candidate not found in vocabulary")
    embed_candidate = embedding.word_to_ix[candidate]
    model = load['model']
    model.to(device)
    model.eval()
    tokenized_text = tokenize_text(text)
    input_ids = tokenized_text['input_ids']
    attention_masks = tokenized_text['attention_masks']
    if verbose:
        print("Text: tokenized")
    texts_count = len(input_ids)
    with torch.no_grad():
        b_input_ids = input_ids.to(device)
        b_attention_masks = attention_masks.to(device)
        b_candidates = torch.tensor([embed_candidate] * texts_count).to(device)
        logits = model(b_input_ids, b_candidates,
                       input_mask=b_attention_masks).squeeze()
    logits = logits.detach().cpu().numpy()
    # A single positive logit anywhere marks the candidate as relevant.
    is_relevant = bool((np.atleast_2d(logits) > 0).any())
    return is_relevant
def extract_location(text, verbose=False):
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if verbose:
            print("Device: cuda selected")
    else:
        device = torch.device("cpu")
        if verbose:
            print("Device: cpu selected")
    model = initialize_model(verbose=verbose)
    model.to(device)
    model.eval()
    tokenized_text = tokenize_text(text)
    input_ids = tokenized_text['input_ids']
    attention_masks = tokenized_text['attention_masks']
    if verbose:
        print("Text: tokenized")
    with torch.no_grad():
        b_input_ids = input_ids.to(device)
        b_attention_masks = attention_masks.to(device)
        logits = model(b_input_ids, b_attention_masks).squeeze()
    logits = np.atleast_2d(logits.detach().cpu().numpy())
    # Binarize: a positive logit marks the token as part of a location.
    logits = (logits > 0).astype(int)
    candidates = set()
    for logit, token_ids in zip(logits, input_ids):
        for i in range(len(logit)):
            if logit[i] == 1:
                candidates.add(token_ids[i])
    candidates = {detokenize_word(c) for c in candidates}
    if verbose:
        print("Locations: extracted")
    return list(candidates)
def buildFeatures(self):
    """Build features using the pre-trained BERT tokenizer."""
    if self.train:
        filename = (self.parameters['processed-data-path'] +
                    self.parameters['processed-train-data-filename'])
    else:
        filename = (self.parameters['processed-data-path'] +
                    self.parameters['processed-test-data-filename'])
    data_df = pd.read_csv(filename)
    tokenizer = utils.get_BERT_Tokenizer()
    data_df['tokenized'] = data_df['text'].apply(
        lambda x: utils.tokenize_text(x, tokenizer))
    if self.train:
        # Record the longest tokenized sequence so the test run can reuse it.
        max_length = data_df['tokenized'].apply(len).max()
        with open('./parameters/maxlength.txt', 'w') as max_length_file:
            max_length_file.write(str(max_length))
    else:
        with open('./parameters/maxlength.txt', 'r') as max_length_file:
            max_length = int(max_length_file.read())
    features_df = pd.DataFrame()
    features_df['tokenized'] = data_df['tokenized'].copy()
    if self.train:
        filename = (self.parameters['features-path'] +
                    self.parameters['features-train-filename'])
        # Binary label: 1 for SARCASM rows, 0 otherwise.
        features_df['label'] = 0
        indexes = data_df[data_df['label'] == 'SARCASM'].index
        features_df.iloc[indexes, -1] = 1
        features_df.to_json(filename)
    else:
        filename = (self.parameters['features-path'] +
                    self.parameters['features-test-filename'])
        features_df.to_json(filename)
def get_name(entities, text):
    nlp = config.en_sm
    try:
        name = entities['Name']
        for n in name:
            if set(n.lower().split()) & set(comm.DESIGNATION):
                continue
            elif set(n.lower().split()) & set(comm.RESUME_SECTIONS):
                continue
            else:
                return n
    except KeyError:
        name = ''
    doc = nlp(text)
    # Extract PERSON entities with at least two words (first and last name).
    doc_entities = doc.ents
    doc_persons = filter(lambda x: x.label_ == 'PERSON', doc_entities)
    doc_persons = filter(lambda x: len(x.text.strip().split()) >= 2, doc_persons)
    doc_persons = list(doc_persons)
    if len(doc_persons) > 0:
        return str(doc_persons[0])
    # Fall back to NLTK named-entity chunking.
    lines = utils.tokenize_text(text)
    for sentence in lines:
        entities = nltk.chunk.ne_chunk(sentence)
        for subtree in entities.subtrees():
            if subtree.label() == 'PERSON':
                for leaf in subtree.leaves():
                    return str(leaf[0])
    return "No Name Found"
def predict(item: Item):
    preprocessed_text = preprocess_text(item.description)
    tokenized_sent = tokenize_text(preprocessed_text)
    X_test = model_dbow.infer_vector(tokenized_sent, steps=20)
    return clf.predict([X_test])[0]
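# The Item type above suggests a web handler. A minimal sketch of how it might
# be wired up, assuming FastAPI and a pydantic Item model (neither framework
# nor the route name is confirmed by the snippet):
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class Item(BaseModel):
    description: str

@app.post("/predict")
def predict_endpoint(item: Item):
    # str() guards against numpy scalar types in the JSON response.
    return {"category": str(predict(item))}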
args = parser.parse_args()

tokenizer = RobertaTokenizer.from_pretrained(args.model_path)
model = RobertaForMaskedLM.from_pretrained(args.model_path)
model.eval()

vocab = utils.make_vocab(args.vocab_file)

FEATURE_COUNT = 768  # Change this value to 1024 for the large RoBERTa model.
MAX_LINES = 2000  # Maximum number of context lines to average per vocabulary embedding.

if __name__ == "__main__":
    # Process vocabulary words in the outer loop.
    for v in vocab:
        with open(args.context_file, 'r') as lines:
            v_sum = torch.zeros([1, FEATURE_COUNT])
            v_tokens = utils.tokenize_text(v, tokenizer)
            utils.print_tokenized_text(v_tokens, tokenizer)
            count_sentence = 0
            count_tensor = 0
            # Process all lines in the context file in the inner loop.
            for line in lines:
                # Check for this vocab word in this line; if found, split the
                # line into individual sentences.
                if v in line.lower().split():
                    for sentence in line.split('.'):
                        if v in sentence.lower():
                            line = sentence
                            count_sentence += 1
                            # Take the first instance of the word and discard
                            # the rest of the line.
                            break
                    # Split the new sentence-based line into tokens.
                    line_tokens = utils.tokenize_text(line, tokenizer)
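# The excerpt above is truncated mid-loop, so the accumulation step is not
# shown. This is not its continuation; it is only a self-contained sketch of
# how a contextual sentence embedding can be pulled out of RoBERTa with the
# transformers library, yielding a [1, FEATURE_COUNT] vector of the kind that
# could be summed into v_sum and averaged over MAX_LINES contexts.
import torch
from transformers import RobertaTokenizer, RobertaModel

tok = RobertaTokenizer.from_pretrained("roberta-base")
enc = RobertaModel.from_pretrained("roberta-base")
enc.eval()

with torch.no_grad():
    inputs = tok("a sentence containing the vocab word", return_tensors="pt")
    hidden = enc(**inputs).last_hidden_state  # [1, seq_len, 768]
    sentence_vec = hidden.mean(dim=1)         # [1, 768]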
import os
import datetime

import tensorflow as tf
import tensorflow_datasets as tfds

from utils import load_dataset, tokenize_text, encode_map_fn, split_dataset

print(os.getcwd())

data_name = 'amazon_us_reviews/Mobile_Electronics_v1_00'
train_dataset = load_dataset(name=data_name)

vocabulary = tokenize_text(train_dataset)
print(len(vocabulary))
vocab_size = len(vocabulary) + 1
print(f"Vocabulary size: {vocab_size}")

# tokenize text
encoder = tfds.features.text.TokenTextEncoder(vocabulary)
encoder.save_to_file('vocab')
print("Saved vocabulary file.")

# apply encoding to dataset
encoded_dataset = train_dataset.map(encode_map_fn)
train_data, test_data = split_dataset(encoded_dataset, test_size=10000)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 128))
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(128, return_sequences=True)))
# model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
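# The script above stops mid-model definition. Since the LSTM returns full
# sequences, a head still has to collapse the time axis. A typical completion
# for a binary review classifier, offered purely as a sketch (the original
# head, optimizer, and epoch count are not shown):
model.add(tf.keras.layers.GlobalMaxPooling1D())  # collapse the sequence axis
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# model.fit(train_data, validation_data=test_data, epochs=...) would follow.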
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

cores = multiprocessing.cpu_count()

df_train = pd.read_csv('../data/train.csv')
df_train['cleaned_desc'] = df_train['Exception (input)'].apply(preprocess_text)
df_test = pd.read_csv('../data/test.csv')
df_test['cleaned_desc'] = df_test['Exception (input)'].apply(preprocess_text)

train_tagged = df_train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['cleaned_desc']),
                             tags=[r['Exception Category (ouput)']]),
    axis=1)
test_tagged = df_test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['cleaned_desc']),
                             tags=[r['Exception Category (ouput)']]),
    axis=1)

model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0,
                     min_count=2, sample=0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

# Train for 30 passes, manually decaying the learning rate after each pass.
for epoch in range(30):
    model_dbow.train([x for x in tqdm(train_tagged.values)],
                     total_examples=len(train_tagged.values),
                     epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

# save the model to disk
filename = '../model/dbow_model.sav'
joblib.dump(model_dbow, filename)
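# The clf used by predict() further up is never defined in these snippets.
# A minimal sketch of one way to fit it, assuming a scikit-learn logistic
# regression over vectors inferred by the DBOW model (the actual classifier
# and its parameters are an assumption):
from sklearn.linear_model import LogisticRegression

y_train = [doc.tags[0] for doc in train_tagged.values]
X_train = [model_dbow.infer_vector(doc.words, steps=20)
           for doc in train_tagged.values]

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)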
def train(args):
    # layout, format of data and label: 'NT' means (batch_size, length)
    # and 'TN' means (length, batch_size).
    layout = 'NT'
    train_sent, vocab, freq = tokenize_text(args.train_data,
                                            start_label=start_label,
                                            invalid_label=invalid_label)
    val_sent, _, _ = tokenize_text(args.valid_data,
                                   vocab=vocab,
                                   start_label=start_label,
                                   invalid_label=invalid_label)

    data_train = LMNceIter(train_sent,
                           args.batch_size,
                           freq,
                           layout=layout,
                           buckets=buckets,
                           invalid_label=invalid_label,
                           num_label=args.num_label)
    data_val = LMNceIter(val_sent,
                         args.batch_size,
                         freq,
                         layout=layout,
                         buckets=buckets,
                         invalid_label=invalid_label,
                         num_label=args.num_label)

    cell = mx.rnn.SequentialRNNCell()
    for i in range(args.num_layers):
        cell.add(mx.rnn.LSTMCell(num_hidden=args.num_hidden,
                                 prefix='lstm_l%d_' % i))

    def sym_gen(seq_len):
        # [batch_size, seq_len]
        data = mx.sym.Variable('data')
        # map the input to an embedding vector
        embedIn = mx.sym.Embedding(data=data,
                                   input_dim=len(vocab),
                                   output_dim=args.num_embed,
                                   name='input_embed')
        # pass the embedding vectors to the LSTM
        # [batch_size, seq_len, num_hidden]
        output, _ = cell.unroll(seq_len,
                                inputs=embedIn,
                                layout='NTC',
                                merge_outputs=True)
        # output = output.reshape(-1, num_embed)

        # map the label to an embedding
        label = mx.sym.Variable('label')
        labwgt = mx.sym.Variable('label_weight')
        # define the output embedding matrix
        # TODO: change to adapter binding
        embedwgt = mx.sym.Variable(name='output_embed_weight',
                                   shape=(len(vocab), args.num_hidden))
        pred = nce_loss(output, label, labwgt, embedwgt, len(vocab),
                        args.num_hidden, args.num_label, seq_len)
        return pred, ('data',), ('label', 'label_weight')

    if args.gpus:
        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
    else:
        contexts = mx.cpu(0)

    model = mx.mod.BucketingModule(
        sym_gen=sym_gen,
        default_bucket_key=data_train.default_bucket_key,
        context=contexts)

    if args.load_epoch:
        _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(
            cell, args.model_prefix, args.load_epoch)
    else:
        arg_params = None
        aux_params = None

    opt_params = {'learning_rate': args.lr, 'wd': args.wd}
    if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']:
        opt_params['momentum'] = args.mom

    model.fit(train_data=data_train,
              eval_data=data_val,
              eval_metric=NceMetric(invalid_label),
              kvstore=args.kv_store,
              optimizer=args.optimizer,
              optimizer_params=opt_params,
              initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
              arg_params=arg_params,
              aux_params=aux_params,
              begin_epoch=args.load_epoch,
              num_epoch=args.num_epochs,
              batch_end_callback=mx.callback.Speedometer(args.batch_size,
                                                         args.disp_batches),
              epoch_end_callback=mx.rnn.do_rnn_checkpoint(
                  cell, args.model_prefix, 1) if args.model_prefix else None)
def test(args):
    assert args.model_prefix, "Must specify path to load from"

    # generate the data iterator
    layout = 'TN'
    train_sent, vocab, _ = tokenize_text(args.train_data,
                                         start_label=start_label,
                                         invalid_label=invalid_label)
    test_sent, _, _ = tokenize_text(args.test_data,
                                    vocab=vocab,
                                    start_label=start_label,
                                    invalid_label=invalid_label)
    data_test = mx.rnn.BucketSentenceIter(test_sent,
                                          args.batch_size,
                                          buckets=buckets,
                                          invalid_label=invalid_label,
                                          label_name='label',
                                          data_name='data',
                                          layout=layout)

    if not args.stack_rnn:
        stack = mx.rnn.FusedRNNCell(args.num_hidden,
                                    num_layers=args.num_layers,
                                    mode='lstm',
                                    bidirectional=args.bidirectional).unfuse()
    else:
        stack = mx.rnn.SequentialRNNCell()
        for i in range(args.num_layers):
            cell = mx.rnn.LSTMCell(num_hidden=args.num_hidden,
                                   prefix='lstm_l%d_' % i)
            stack.add(cell)

    def sym_gen(seq_len):
        data = mx.sym.Variable('data')
        embed = mx.sym.Embedding(data=data,
                                 input_dim=len(vocab),
                                 output_dim=args.num_embed,
                                 name='input_embed')
        stack.reset()
        # [seq_len*batch_size, num_hidden]
        outputs, states = stack.unroll(seq_len,
                                       inputs=embed,
                                       layout='TNC',
                                       merge_outputs=True)
        # [seq_len*batch_size, 1, num_hidden]
        pred = mx.sym.Reshape(
            data=outputs,
            shape=(-1, 1, args.num_hidden * (1 + args.bidirectional)))

        # get the output embedding
        # TODO: this is a constant, initialize it only one time
        # [vocab_size] -> [vocab_size, num_hidden]
        allLab = mx.sym.Variable('alllab', shape=(len(vocab),), dtype='float32')
        labs = mx.sym.Embedding(data=allLab,
                                input_dim=len(vocab),
                                output_dim=args.num_hidden,
                                name='output_embed')

        # pred: [seq_len*batch_size, 1, num_hidden]
        # labs: [vocab_size, num_hidden]
        # output: [seq_len*batch_size, vocab_size, num_hidden]
        pred = mx.sym.broadcast_mul(pred, labs)
        # [seq_len*batch_size, vocab_size]
        pred = mx.sym.sum(data=pred, axis=2)

        label = mx.sym.Variable('label')
        label = mx.sym.Reshape(label, shape=(-1,))
        pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax')
        return pred, ('data',), ('label',)

    if args.gpus:
        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
    else:
        contexts = mx.cpu(0)

    # Define a model that is evaluated with bucketing.
    model = mx.mod.BucketingModule(
        sym_gen=sym_gen,
        default_bucket_key=data_test.default_bucket_key,
        context=contexts)
    datashape = data_test.provide_data
    labelshape = data_test.provide_label
    model.bind(datashape, labelshape, for_training=False)

    # note here we load using SequentialRNNCell instead of FusedRNNCell.
    _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(
        stack, args.model_prefix, args.load_epoch)
    arg_params['alllab'] = mx.ndarray.arange(
        len(vocab), dtype='float32').as_in_context(contexts)
    model.set_params(arg_params, aux_params)

    model.score(data_test,
                mx.metric.Perplexity(invalid_label),
                batch_end_callback=mx.callback.Speedometer(args.batch_size, 5))
def test_tokenize_text(self):
    tokens = tokenize_text('data/test_data/sentences.txt', True)
    self.assertEqual(86, len(tokens))
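# A minimal scaffold for running the test above, assuming the standard
# unittest module and that this tokenize_text is importable from utils
# (the original test class and import path are not shown):
import unittest
from utils import tokenize_text

class TokenizeTextTest(unittest.TestCase):
    def test_tokenize_text(self):
        tokens = tokenize_text('data/test_data/sentences.txt', True)
        self.assertEqual(86, len(tokens))

if __name__ == '__main__':
    unittest.main()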