Example #1
def process_files(first_file, second_file):
    """Process input files for comparison."""
    tokens_1 = utils.tokenize_text(first_file, True)
    tokens_2 = utils.tokenize_text(second_file, True)
    tc_scr1, tc_scr2, tokens = token_comparison(tokens_1, tokens_2)
    fuz_scr1, fuz_scr2 = fuzzy_comparison(tokens[0], tokens[1])
    scr_1 = (tc_scr1 + fuz_scr1) / 2
    scr_2 = (tc_scr2 + fuz_scr2) / 2
    print("First-->Second Score: " + str(scr_1) +
          " Second-->First Score: " + str(scr_2))
Example #2
def process_files(first_file, second_file):
    """process input files for comparison"""
    tokens_1 = utils.tokenize_text(first_file, True)
    tokens_2 = utils.tokenize_text(second_file, True)
    tc_scr1, tc_scr2, tokens = token_comparison(tokens_1, tokens_2)
    fuz_scr1, fuz_scr2 = fuzzy_comparison(tokens[0], tokens[1])
    scr_1 = (tc_scr1 + fuz_scr1) / 2
    scr_2 = (tc_scr2 + fuz_scr2) / 2
    print "First-->Second Score:" + str(
        scr_1) + " Second-->First Score-->" + str(scr_2)
Example #3
def predict_top_p(args,
                  device,
                  net,
                  text,
                  vocabulary,
                  num_return_sequences=5,
                  n_out=1):
    net.eval()

    normalized_text = text  # normalize_text(text)

    if args.use_python_vocabulary:
        tokens = tokenize_text(normalized_text)
        ix = torch.tensor([[vocabulary.to_index(w)
                            for w in tokens]]).to(device)
    else:
        ix = torch.tensor([vocabulary.encode(normalized_text).ids[-100:]
                           ]).to(device)

    if any(mn in args.model_name for mn in args.transformers_models):
        output = net.generate(input_ids=ix,
                              max_length=len(ix[0]) + n_out,
                              temperature=1.0,
                              top_k=0,
                              top_p=0.9,
                              do_sample=True,
                              num_return_sequences=num_return_sequences,
                              early_stopping=True)
    else:
        cn = len(ix[0]) // args.sinkhorn_bucket_size
        cn = cn * args.sinkhorn_bucket_size
        cn = min(cn, 100)
        ix = ix[:, -cn:]
        output = net.generate(input_ids=ix,
                              max_length=len(ix[0]) + n_out,
                              temperature=1.0,
                              top_k=0,
                              top_p=0.9,
                              do_sample=True,
                              num_return_sequences=num_return_sequences,
                              early_stopping=True,
                              vocab_size=args.vocab_size)

    # result = []
    for choice in output:
        if args.use_python_vocabulary:
            words = [vocabulary.to_word(x) for x in choice.tolist()]
            print(' '.join(words))
        else:
            words = vocabulary.decode(choice.tolist())
            print(words)
        print('-' * 30)
Example #4
def extract_location(text, candidate, verbose=False):
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if verbose:
            print("Device: cuda selected")
    else:
        device = torch.device("cpu")
        if verbose:
            print("Device: cpu selected")

    load = initialize_model(verbose=verbose)

    embedding = load["embedding"]

    if candidate not in embedding.word_to_ix:
        raise ValueError("Candidate not found in vocabulary")

    embed_candidate = embedding.word_to_ix[candidate]

    model = load['model']
    model.to(device)

    model.eval()

    tokenized_text = tokenize_text(text)
    input_ids = tokenized_text['input_ids']
    attention_masks = tokenized_text['attention_masks']

    if verbose:
        print("Text: tokenized")

    texts_count = len(input_ids)

    with torch.no_grad():
        b_input_ids = input_ids.to(device)
        b_attention_masks = attention_masks.to(device)
        b_candidates = torch.tensor([embed_candidate] * texts_count).to(device)

        logits = model(b_input_ids, b_candidates,
                       input_mask=b_attention_masks).squeeze()
        logits = logits.detach().cpu().numpy()
        # The candidate is considered relevant if any logit is positive.
        # (Using numpy's any() avoids the ambiguous truth value that `or`
        # raises on multi-element arrays.)
        is_relevant = bool((np.atleast_2d(logits) > 0).any())

        return is_relevant
Example #5
def extract_location(text, verbose=False):

    if torch.cuda.is_available():
        device = torch.device("cuda")
        if verbose:
            print("Device: cuda selected")
    else:
        device = torch.device("cpu")
        if verbose:
            print("Device: cpu selected")

    model = initialize_model(verbose=verbose)
    model.to(device)

    model.eval()

    tokenized_text = tokenize_text(text)
    input_ids = tokenized_text['input_ids']
    attention_masks = tokenized_text['attention_masks']

    if verbose:
        print("Text: tokenized")

    with torch.no_grad():
        b_input_ids = input_ids.to(device)
        b_attention_masks = attention_masks.to(device)

        logits = model(b_input_ids, b_attention_masks).squeeze()
        logits = np.atleast_2d(logits.detach().cpu().numpy())
        # Binarize: 1 where the logit is positive, 0 otherwise.
        predictions = (logits > 0).astype(int)

        candidates = set()
        for prediction, token_ids in zip(predictions, input_ids):
            for i in range(len(prediction)):
                if prediction[i] == 1:
                    candidates.add(token_ids[i])

        candidates = set(detokenize_word(c) for c in candidates)

        if verbose:
            print("Locations: extracted")

        return list(candidates)
Example #6
    def buildFeatures(self):
        """
        The purpose of this method is to build features using 
        the pre-trained BERT tokenizer
        """
        if self.train:
            filename = self.parameters['processed-data-path'] + self.parameters[
                'processed-train-data-filename']
        else:
            filename = self.parameters['processed-data-path'] + self.parameters[
                'processed-test-data-filename']

        data_df = pd.read_csv(filename)

        tokenizer = utils.get_BERT_Tokenizer()
        data_df['tokenized'] = data_df['text'].apply(
            lambda x: utils.tokenize_text(x, tokenizer))

        if self.train:
            max_length = data_df['tokenized'].apply(len).max()

            with open('./parameters/maxlength.txt', "w") as max_length_file:
                max_length_file.write(str(max_length))
        else:
            with open('./parameters/maxlength.txt', 'r') as max_length_file:
                max_length = int(max_length_file.read())

        features_df = pd.DataFrame()
        features_df['tokenized'] = data_df['tokenized'].copy()
        if self.train:
            filename = self.parameters['features-path'] + self.parameters[
                'features-train-filename']

            features_df['label'] = 0
            indexes = data_df[data_df['label'] == 'SARCASM'].index
            features_df.iloc[indexes, -1] = 1

            features_df.to_json(filename)
        else:
            filename = self.parameters['features-path'] + self.parameters[
                'features-test-filename']
            features_df.to_json(filename)
        return
Example #7
def get_name(entities, text):
    nlp = config.en_sm
    try:
        name = entities['Name']

        for n in name:
            if set(n.lower().split()) & set(comm.DESIGNATION):
                continue
            elif set(n.lower().split()) & set(comm.RESUME_SECTIONS):
                continue
            else:
                return n

    except Exception:
        name = ''
        doc = nlp(text)

        # Extract entities
        doc_entities = doc.ents

        doc_persons = filter(lambda x: x.label_ == 'PERSON', doc_entities)
        doc_persons = filter(lambda x: len(x.text.strip().split()) >= 2,
                             doc_persons)
        doc_persons = list(doc_persons)
        if len(doc_persons) > 0:
            name = str(doc_persons[0])
            return name

        else:
            lines = utils.tokenize_text(text)

            for sentence in lines:
                entities = nltk.chunk.ne_chunk(sentence)
                for subtree in entities.subtrees():
                    if subtree.label() == 'PERSON':
                        for leaf in subtree.leaves():
                            name = leaf[0]
                            name = str(name)
                            return name
            return "No Name Found"
Example #8
def predict(item: Item):
    preprocessed_text = preprocess_text(item.description)
    tokenized_sent = tokenize_text(preprocessed_text)
    X_test = model_dbow.infer_vector(tokenized_sent, steps=20)
    return clf.predict([X_test])[0]
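A minimal call sketch (Item is assumed here to be a pydantic-style model with a description field; its definition is not part of the listing):

from pydantic import BaseModel

class Item(BaseModel):  # assumed definition, not shown in the original
    description: str

print(predict(Item(description="Database connection pool exhausted")))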
Example #9
args = parser.parse_args()

tokenizer = RobertaTokenizer.from_pretrained(args.model_path)
model = RobertaForMaskedLM.from_pretrained(args.model_path)
model.eval()

vocab = utils.make_vocab(args.vocab_file)
FEATURE_COUNT = 768  # Change this value to 1024 for the large RoBERTa model.
MAX_LINES = 2000  # Maximum number of context lines to average per vocabulary embedding.

if __name__ == "__main__":
    # Process vocabulary words in the outer loop.
    for v in vocab:
        with open(args.context_file, 'r') as lines:
            v_sum = torch.zeros([1, FEATURE_COUNT])
            v_tokens = utils.tokenize_text(v, tokenizer)
            utils.print_tokenized_text(v_tokens, tokenizer)
            count_sentence = 0
            count_tensor = 0

            # Process all lines in the context file in the inner loop.
            for line in lines:
                # Check for this vocab word in this line; if found, split the line into individual sentences.
                if v in line.lower().split():
                    for sentence in line.split('.'):
                        if v in sentence.lower():
                            line = sentence
                            count_sentence += 1
                            break  # We'll take the first instance of the word and discard the rest of the line.
                    # Split the new sentence-based line into tokens.
                    line_tokens = utils.tokenize_text(line, tokenizer)
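The listing is cut off mid-loop here. A plausible continuation, as a sketch only (it assumes a recent transformers version, that line_tokens is a list of token ids, and that the goal is the averaged context embedding the comments describe):

                    # Assumed continuation: embed the sentence and fold its
                    # mean hidden state into the running sum for this word.
                    with torch.no_grad():
                        outputs = model(torch.tensor([line_tokens]),
                                        output_hidden_states=True)
                        # outputs.hidden_states[-1]: [1, seq_len, FEATURE_COUNT]
                        v_sum += outputs.hidden_states[-1].mean(dim=1)
                    count_tensor += 1
                    if count_sentence >= MAX_LINES:
                        break

            # Average the accumulated sentence embeddings for this word.
            if count_tensor > 0:
                v_embedding = v_sum / count_tensor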
Example #10
import os
import tensorflow as tf
import tensorflow_datasets as tfds
from utils import load_dataset, tokenize_text, encode_map_fn, split_dataset
import datetime

print(os.getcwd())

data_name = 'amazon_us_reviews/Mobile_Electronics_v1_00'

train_dataset = load_dataset(name=data_name)
vocabulary = tokenize_text(train_dataset)
print(len(vocabulary))
vocab_size = len(vocabulary) + 1
print(f"Vocabulary size: {vocab_size}")

# tokenize text
encoder = tfds.features.text.TokenTextEncoder(vocabulary)
encoder.save_to_file('vocab')
print("Saved vocabulary file.")

# apply encoding to dataset
encoded_dataset = train_dataset.map(encode_map_fn)
train_data, test_data = split_dataset(encoded_dataset, test_size=10000)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 128))
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True)))
# model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
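The listing stops mid-model. One plausible way to finish and train it, as a sketch only (the head layers, loss, and epoch count are assumptions, not the original code):

model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# Assumed binary-sentiment head; the original target column is not shown.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(train_data, validation_data=test_data, epochs=3)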
Example #11
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

cores = multiprocessing.cpu_count()

df_train = pd.read_csv('../data/train.csv')
df_train['cleaned_desc'] = df_train['Exception (input)'].apply(preprocess_text)

df_test = pd.read_csv('../data/test.csv')
df_test['cleaned_desc'] = df_test['Exception (input)'].apply(preprocess_text)

train_tagged = df_train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['cleaned_desc']), tags=[r['Exception Category (ouput)']]), axis=1)
test_tagged = df_test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['cleaned_desc']), tags=[r['Exception Category (ouput)']]), axis=1)

model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])
for epoch in range(30):
    model_dbow.train([x for x in tqdm(train_tagged.values)], total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

# save the model to disk
filename = '../model/dbow_model.sav'
joblib.dump(model_dbow, filename)
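Example #8 above passes vectors from this model to a classifier clf that the listing never shows; a plausible follow-up, sketched under the assumption of scikit-learn's LogisticRegression, infers one vector per tagged document and fits the classifier:

from sklearn.linear_model import LogisticRegression

# Pair each document's tag (its category label) with an inferred vector.
y_train, X_train = zip(*[(doc.tags[0],
                          model_dbow.infer_vector(doc.words, steps=20))
                         for doc in train_tagged.values])

clf = LogisticRegression(max_iter=1000)
clf.fit(list(X_train), list(y_train))
joblib.dump(clf, '../model/clf_model.sav')  # assumed path, mirroring the dump above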

Example #12
def train(args):
    layout = 'NT'
    train_sent, vocab, freq = tokenize_text(args.train_data,
                                            start_label=start_label,
                                            invalid_label=invalid_label)
    val_sent, _, _ = tokenize_text(args.valid_data,
                                   vocab=vocab,
                                   start_label=start_label,
                                   invalid_label=invalid_label)

    # layout, format of data and label. 'NT' means (batch_size, length) and 'TN' means (length, batch_size).
    data_train = LMNceIter(train_sent,
                           args.batch_size,
                           freq,
                           layout=layout,
                           buckets=buckets,
                           invalid_label=invalid_label,
                           num_label=args.num_label)

    data_val = LMNceIter(val_sent,
                         args.batch_size,
                         freq,
                         layout=layout,
                         buckets=buckets,
                         invalid_label=invalid_label,
                         num_label=args.num_label)

    cell = mx.rnn.SequentialRNNCell()
    for i in range(args.num_layers):
        cell.add(
            mx.rnn.LSTMCell(num_hidden=args.num_hidden,
                            prefix='lstm_l%d_' % i))

    def sym_gen(seq_len):
        # [batch_size, seq_len]
        data = mx.sym.Variable('data')

        # map the input to an embedding vector
        embedIn = mx.sym.Embedding(data=data,
                                   input_dim=len(vocab),
                                   output_dim=args.num_embed,
                                   name='input_embed')

        # pass the embedding vector to the LSTM
        # [batch_size, seq_len, num_hidden]
        output, _ = cell.unroll(seq_len,
                                inputs=embedIn,
                                layout='NTC',
                                merge_outputs=True)
        #output = output.reshape(-1, num_embed)

        # map the label to an embedding
        label = mx.sym.Variable('label')
        labwgt = mx.sym.Variable('label_weight')

        # define the output embedding matrix
        #
        # TODO: change to adapter binding
        embedwgt = mx.sym.Variable(name='output_embed_weight',
                                   shape=(len(vocab), args.num_hidden))
        pred = nce_loss(output, label, labwgt, embedwgt, len(vocab),
                        args.num_hidden, args.num_label, seq_len)

        return pred, ('data', ), ('label', 'label_weight')

    if args.gpus:
        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
    else:
        contexts = mx.cpu(0)

    model = mx.mod.BucketingModule(
        sym_gen=sym_gen,
        default_bucket_key=data_train.default_bucket_key,
        context=contexts)

    if args.load_epoch:
        _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(
            cell, args.model_prefix, args.load_epoch)
    else:
        arg_params = None
        aux_params = None

    opt_params = {'learning_rate': args.lr, 'wd': args.wd}

    if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']:
        opt_params['momentum'] = args.mom

    model.fit(train_data=data_train,
              eval_data=data_val,
              eval_metric=NceMetric(invalid_label),
              kvstore=args.kv_store,
              optimizer=args.optimizer,
              optimizer_params=opt_params,
              initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
              arg_params=arg_params,
              aux_params=aux_params,
              begin_epoch=args.load_epoch,
              num_epoch=args.num_epochs,
              batch_end_callback=mx.callback.Speedometer(
                  args.batch_size, args.disp_batches),
              epoch_end_callback=mx.rnn.do_rnn_checkpoint(
                  cell, args.model_prefix, 1) if args.model_prefix else None)
Example #13
def test(args):
    assert args.model_prefix, "Must specify a path to load from"

    # generate data iterator
    layout = 'TN'
    train_sent, vocab, _ = tokenize_text(args.train_data,
                                         start_label=start_label,
                                         invalid_label=invalid_label)
    test_sent, _, _ = tokenize_text(args.test_data,
                                    vocab=vocab,
                                    start_label=start_label,
                                    invalid_label=invalid_label)
    data_test = mx.rnn.BucketSentenceIter(test_sent,
                                          args.batch_size,
                                          buckets=buckets,
                                          invalid_label=invalid_label,
                                          label_name='label',
                                          data_name='data',
                                          layout=layout)

    if not args.stack_rnn:
        stack = mx.rnn.FusedRNNCell(args.num_hidden,
                                    num_layers=args.num_layers,
                                    mode='lstm',
                                    bidirectional=args.bidirectional).unfuse()
    else:
        stack = mx.rnn.SequentialRNNCell()
        for i in range(args.num_layers):
            cell = mx.rnn.LSTMCell(num_hidden=args.num_hidden,
                                   prefix='lstm_l%d_' % i)
            stack.add(cell)

    def sym_gen(seq_len):
        data = mx.sym.Variable('data')
        embed = mx.sym.Embedding(data=data,
                                 input_dim=len(vocab),
                                 output_dim=args.num_embed,
                                 name='input_embed')

        stack.reset()

        # [seq_len*batch_size, num_hidden]
        outputs, states = stack.unroll(seq_len,
                                       inputs=embed,
                                       layout='TNC',
                                       merge_outputs=True)

        # [seq_len*batch_size, 1, num_hidden]
        pred = mx.sym.Reshape(data=outputs,
                              shape=(-1, 1, args.num_hidden *
                                     (1 + args.bidirectional)))

        # get output embedding
        # TODO: this is a constant, initialize it only once
        #
        # [vocab_size] -> [vocab_size, num_hidden]
        allLab = mx.sym.Variable('alllab',
                                 shape=(len(vocab), ),
                                 dtype='float32')
        labs = mx.sym.Embedding(data=allLab,
                                input_dim=len(vocab),
                                output_dim=args.num_hidden,
                                name='output_embed')

        # pred: [seq_len*batch_size, 1, num_hidden]
        # labs: [vocab_size, num_hidden]
        # output: [seq_len*batch_size, vocab_size, num_hidden]
        pred = mx.sym.broadcast_mul(pred, labs)

        # [seq_len*batch_size, vocab_size]
        pred = mx.sym.sum(data=pred, axis=2)

        label = mx.sym.Variable('label')
        label = mx.sym.Reshape(label, shape=(-1, ))

        pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax')

        return pred, ('data', ), ('label', )

    if args.gpus:
        contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')]
    else:
        contexts = mx.cpu(0)

    # Define a model that uses the bucketing approach.
    model = mx.mod.BucketingModule(
        sym_gen=sym_gen,
        default_bucket_key=data_test.default_bucket_key,
        context=contexts)

    datashape = data_test.provide_data
    labelshape = data_test.provide_label

    model.bind(datashape, labelshape, for_training=False)

    # note here we load using SequentialRNNCell instead of FusedRNNCell.
    _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(
        stack, args.model_prefix, args.load_epoch)
    # as_in_context expects a single context, so pick the first if a list was built.
    ctx = contexts[0] if isinstance(contexts, list) else contexts
    arg_params['alllab'] = mx.ndarray.arange(
        len(vocab), dtype='float32').as_in_context(ctx)
    model.set_params(arg_params, aux_params)

    model.score(data_test,
                mx.metric.Perplexity(invalid_label),
                batch_end_callback=mx.callback.Speedometer(args.batch_size, 5))
Example #14
    def test_tokenize_text(self):
        tokens = tokenize_text('data/test_data/sentences.txt', True)
        self.assertEqual(86, len(tokens))
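The method above appears without its enclosing class; a minimal harness to run it might look like this (a sketch: the class name and import path are assumptions):

import unittest

from utils import tokenize_text  # assumed import path


class TokenizeTextTest(unittest.TestCase):  # assumed class name
    def test_tokenize_text(self):
        tokens = tokenize_text('data/test_data/sentences.txt', True)
        self.assertEqual(86, len(tokens))


if __name__ == '__main__':
    unittest.main()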