Example #1
def translate_fun(data_point,
                  sess,
                  model,
                  vocabs,
                  FLAGS,
                  slot_filling_classifier=None):
    tg_ids = [data_utils.ROOT_ID]
    decoder_features = [[tg_ids]]
    if isinstance(data_point, str):
        source_str = data_point
        encoder_features = query_to_encoder_features(data_point, vocabs, FLAGS)
    else:
        source_str = data_point[0].sc_txt
        encoder_features = [[data_point[0].sc_ids]]
        if FLAGS.use_copy and FLAGS.copy_fun == 'copynet':
            encoder_features.append([data_point[0].csc_ids])

    if FLAGS.use_copy and FLAGS.copy_fun == 'copynet':
        # append dummy copynet target features
        # (used only for computing the training objectives)
        ctg_ids = [data_utils.ROOT_ID]
        decoder_features.append([ctg_ids])
        # tokenize the source string with minimal changes to the token forms
        copy_tokens = [query_to_copy_tokens(source_str, FLAGS)]
    else:
        copy_tokens = None
    if FLAGS.normalized:
        _, entities = tokenizer.ner_tokenizer(source_str)
        sc_fillers = [entities[0]]
    else:
        sc_fillers = None

    # Which bucket does it belong to?
    bucket_ids = [
        b for b in xrange(len(model.buckets))
        if model.buckets[b][0] > len(encoder_features[0][0])
    ]
    bucket_id = min(bucket_ids) if bucket_ids else (len(model.buckets) - 1)

    # Get a 1-element batch to feed the sentence to the model.
    formatted_example = model.format_batch(encoder_features,
                                           decoder_features,
                                           bucket_id=bucket_id)

    # Compute neural network decoding output
    model_outputs = model.step(sess,
                               formatted_example,
                               bucket_id,
                               forward_only=True)
    sequence_logits = model_outputs.sequence_logits

    decoded_outputs = decode(model_outputs,
                             FLAGS,
                             vocabs,
                             sc_fillers=sc_fillers,
                             slot_filling_classifier=slot_filling_classifier,
                             copy_tokens=copy_tokens)

    return decoded_outputs, sequence_logits
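
A minimal call sketch for this function, assuming sess, model, vocabs, and FLAGS are created by the surrounding decoding setup (none of them are defined in this example) and using a hypothetical query string:

# Hypothetical call site; sess, model, vocabs, and FLAGS are assumed to be
# produced by setup code that is not shown here.
query = "find all text files modified within the last week"
decoded_outputs, sequence_logits = translate_fun(
    query, sess, model, vocabs, FLAGS)
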
Example #2
def extract_rewrites(data):
    """Extract all pairs of rewrites from a parallel corpus."""
    nls, cms = data

    # Step 1: group pairs with the same natural language description.
    group_pairs_by_nl = collections.defaultdict(set)
    for nl, cm in zip(nls, cms):
        nl = nl.strip()
        cm = cm.strip()
        if nl.lower() == "na":
            continue
        if not nl:
            continue
        if not cm:
            continue
        nl_tokens, _ = tokenizer.ner_tokenizer(nl)
        nl_temp = ' '.join(nl_tokens)
        cm_temp = data_tools.cmd2template(cm)
        group_pairs_by_nl[nl_temp].add(cm_temp)

    # Step 2: cluster the commands with the same natural language explanations.
    merged = set()
    nls = list(group_pairs_by_nl.keys())
    for i in xrange(len(nls)):
        nl = nls[i]
        cm_temp_set = group_pairs_by_nl[nl]
        for j in xrange(i + 1, len(nls)):
            nl2 = nls[j]
            cm_temp_set2 = group_pairs_by_nl[nl2]
            if len(cm_temp_set & cm_temp_set2) >= 2:
                for cm_temp in cm_temp_set:
                    group_pairs_by_nl[nl2].add(cm_temp)
                merged.add(i)

    # Step 3: remove redundant clusters after merge.
    rewrites = {}
    for i in xrange(len(nls)):
        if not i in merged:
            rewrites[nls[i]] = group_pairs_by_nl[nls[i]]

    # Step 4: print extracted rewrites and store in database.
    with DBConnection() as db:
        db.create_schema()
        for nl, cm_temps in sorted(rewrites.items(),
                                   key=lambda x: len(x[1]),
                                   reverse=True)[:10]:
            if len(cm_temps) >= 2:
                for cm_temp1 in cm_temps:
                    for cm_temp2 in cm_temps:
                        if cm_temp1 == cm_temp2:
                            continue
                        if not db.exist_rewrite((cm_temp1, cm_temp2)):
                            db.add_rewrite((cm_temp1, cm_temp2))
                            print("* {} --> {}".format(cm_temp1, cm_temp2))
                print()
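
A minimal usage sketch, assuming the parallel corpus is passed as a pair of aligned lists; the sentences and commands below are hypothetical:

# Hypothetical parallel corpus: natural-language descriptions aligned
# one-to-one with bash commands.
nls = ["remove all empty files under the current directory",
       "delete every empty file in this folder"]
cms = ["find . -type f -empty -delete",
       "find . -type f -empty -exec rm {} \\;"]
extract_rewrites((nls, cms))
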
Example #3
def group_parallel_data(dataset,
                        attribute='source',
                        use_temp=False,
                        tokenizer_selector='nl'):
    """
    Group parallel dataset by a certain attribute.

    :param dataset: a dataset object whose data_points attribute holds the
        training data points, either as a flat list or as a list of
        per-bucket lists
    :param attribute: attribute by which the data is grouped ('source' or
        'target')
    :param use_temp: set to True to group the dataset by the natural
        language templates; False to group it by the natural language
        strings
    :param tokenizer_selector: which tokenizer to use for constructing
        templates ('nl' selects the natural language tokenizer; anything
        else selects the bash tokenizer)

    :return: list of (key, data group) tuples sorted by the key value.
    """
    if dataset.data_points and isinstance(dataset.data_points, list):
        if isinstance(dataset.data_points[0], list):
            data_points = functools.reduce(lambda x, y: x + y,
                                           dataset.data_points)
        else:
            data_points = dataset.data_points
    else:
        raise ValueError('dataset.data_points must be a non-empty list')

    grouped_dataset = {}
    for i in xrange(len(data_points)):
        data_point = data_points[i]
        attr = data_point.sc_txt \
            if attribute == 'source' else data_point.tg_txt
        if use_temp:
            if tokenizer_selector == 'nl':
                words, _ = tokenizer.ner_tokenizer(attr)
            else:
                words = data_tools.bash_tokenizer(attr, arg_type_only=True)
            temp = ' '.join(words)
        else:
            if tokenizer_selector == 'nl':
                words, _ = tokenizer.basic_tokenizer(attr)
                temp = ' '.join(words)
            else:
                temp = attr
        if temp in grouped_dataset:
            grouped_dataset[temp].append(data_point)
        else:
            grouped_dataset[temp] = [data_point]

    return sorted(grouped_dataset.items(), key=lambda x: x[0])
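
A minimal usage sketch, assuming dataset exposes a data_points list whose elements carry sc_txt (source text) and tg_txt (target text) attributes, as the function itself requires:

# Hypothetical call; dataset is assumed to come from the data-loading code,
# which is not shown here.
groups = group_parallel_data(dataset, attribute='source', use_temp=True)
for temp, data_group in groups:
    print('{} -> {} parallel pairs'.format(temp, len(data_group)))
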
Example #4
def slot_filler_alignment_induction(nl, cm, verbose=False):
    """Give an oracle translation pair of (nl, cm), align the slot fillers
       extracted from the natural language with the slots in the command.
    """

    # Step 1: extract the token ids of the constants in the English sentence
    # and the slots in the command
    tokens, entities = tokenizer.ner_tokenizer(nl)
    nl_fillers, _, _ = entities
    cm_tokens = data_tools.bash_tokenizer(cm)
    cm_tokens_with_types = data_tools.bash_tokenizer(cm, arg_type_only=True)
    assert len(cm_tokens) == len(cm_tokens_with_types)
    cm_slots = {}
    for i in xrange(len(cm_tokens_with_types)):
        if cm_tokens_with_types[i] in bash.argument_types:
            if i > 0 and format_args.is_min_flag(cm_tokens_with_types[i - 1]):
                cm_token_type = 'Timespan'
            else:
                cm_token_type = cm_tokens_with_types[i]
            cm_slots[i] = (cm_tokens[i], cm_token_type)

    # Step 2: construct one-to-one mappings for the token ids from both sides
    M = collections.defaultdict(dict)  # alignment score matrix
    for i in nl_fillers:
        surface, filler_type = nl_fillers[i]
        filler_value = format_args.extract_value(filler_type, filler_type,
                                                 surface)
        for j in cm_slots:
            slot_value, slot_type = cm_slots[j]
            if (filler_value and format_args.is_parameter(filler_value)) or \
                    slot_filler_type_match(slot_type, filler_type):
                M[i][j] = slot_filler_value_match(slot_value, filler_value,
                                                  slot_type)
            else:
                M[i][j] = -np.inf

    mappings, remained_fillers = stable_marriage_alignment(M)

    if verbose:
        print('nl: {}'.format(nl))
        print('cm: {}'.format(cm))
        for (i, j) in mappings:
            print('[{}] {} <-> [{}] {}'.format(i, nl_fillers[i][0], j,
                                               cm_slots[j][0]))
        for i in remained_fillers:
            print('filler {} is not matched to any slot\n'.format(
                nl_fillers[i][0].encode('utf-8')))

    return mappings
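
A minimal usage sketch with a hypothetical oracle pair; setting verbose=True prints each induced filler-to-slot alignment:

# Hypothetical (nl, cm) pair, for illustration only.
nl = "find files larger than 10 megabytes in the current directory"
cm = "find . -size +10M"
mappings = slot_filler_alignment_induction(nl, cm, verbose=True)
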
Example #5
def translate_fun(data_point,
                  sess,
                  model,
                  vocabs,
                  FLAGS,
                  slot_filling_classifier=None):
    if isinstance(data_point, str):
        sc_ids, sc_full_ids, sc_copy_ids, sc_fillers = \
            vectorize_query(data_point, vocabs, FLAGS)
        tg_ids = [data_utils.ROOT_ID]
        tg_full_ids = [data_utils.ROOT_ID]
        pointer_targets = np.zeros(
            [1, FLAGS.max_tg_length, FLAGS.max_sc_length])
    else:
        sc_ids = data_point[0].sc_ids
        sc_full_ids = data_point[0].sc_full_ids
        sc_copy_ids = data_point[0].sc_copy_ids
        tg_ids = data_point[0].tg_ids
        tg_full_ids = data_point[0].tg_full_ids
        pointer_targets = data_point[0].pointer_targets
        _, entities = tokenizer.ner_tokenizer(data_point[0].sc_txt)
        sc_fillers = entities[0]

    # Which bucket does it belong to?
    bucket_id = min([
        b for b in xrange(len(model.buckets))
        if model.buckets[b][0] > len(sc_ids)
    ])

    # Get a 1-element batch to feed the sentence to the model.
    formatted_example = model.format_example(
        [[sc_ids], [sc_full_ids], [sc_copy_ids]], [[tg_ids], [tg_full_ids]],
        pointer_targets=pointer_targets,
        bucket_id=bucket_id)

    # Compute neural network decoding output
    model_outputs = model.step(sess,
                               formatted_example,
                               bucket_id,
                               forward_only=True)
    output_logits = model_outputs.output_logits

    decoded_outputs = decode(formatted_example.encoder_full_inputs,
                             model_outputs, FLAGS, vocabs, [sc_fillers],
                             slot_filling_classifier)

    return decoded_outputs, output_logits
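
A minimal call sketch for the preprocessed-input path, assuming data_point is a list whose first element carries the vectorized fields (sc_ids, tg_ids, pointer_targets, ...) produced by the data pipeline; sess, model, vocabs, FLAGS, and slot_filling_classifier are assumed to exist:

# Hypothetical call site; all arguments are assumed to be created by the
# surrounding setup code, which is not shown here.
decoded_outputs, output_logits = translate_fun(
    data_point, sess, model, vocabs, FLAGS,
    slot_filling_classifier=slot_filling_classifier)
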
Example #6
def eval_slot_filling(dataset):
    """
    Evaluate the global slot-filling algorithm's F1 using ground-truth templates.
    """
    vocabs = data_utils.load_vocab(FLAGS)
    rev_tg_vocab = vocabs.rev_tg_vocab
    rev_tg_full_vocab = vocabs.rev_tg_full_vocab

    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)) as sess:
        # Create model.
        FLAGS.beam_size = 1
        FLAGS.token_decoding_algorithm = 'beam_search'
        FLAGS.force_reading_input = True
        model = graph_utils.create_model(sess,
                                         FLAGS,
                                         Seq2SeqModel,
                                         buckets=_buckets,
                                         forward_only=True)

        model_param_dir = os.path.join(FLAGS.model_dir,
                                       'train.mappings.X.Y.npz')
        train_X, train_Y = data_utils.load_slot_filling_data(model_param_dir)
        slot_filling_classifier = classifiers.KNearestNeighborModel(
            FLAGS.num_nn_slot_filling, train_X, train_Y)
        print('Slot filling classifier parameters loaded.')

        num_correct_argument = 0.0
        num_argument = 0.0
        num_correct_align = 0.0
        num_predict_align = 0.0
        num_gt_align = 0.0
        for bucket_id in xrange(len(_buckets)):
            for data_id in xrange(len(dataset[bucket_id])):
                dp = dataset[bucket_id][data_id]
                gt_mappings = [tuple(m) for m in dp.mappings]
                outputs = dp.tg_ids[1:-1]
                full_outputs = dp.tg_full_ids[1:-1]
                if gt_mappings:
                    _, entities = tokenizer.ner_tokenizer(dp.sc_txt)
                    nl_fillers = entities[0]
                    encoder_inputs = [dp.sc_ids]
                    encoder_full_inputs = [dp.sc_copy_ids] \
                        if FLAGS.use_copy else [dp.sc_full_ids]
                    decoder_inputs = [dp.tg_ids]
                    decoder_full_inputs = [dp.tg_full_ids] \
                        if FLAGS.use_copy else [dp.tg_copy_ids]
                    pointer_targets = [dp.pointer_targets] \
                        if FLAGS.use_copy else None
                    formatted_example = model.format_example(
                        [encoder_inputs, encoder_full_inputs],
                        [decoder_inputs, decoder_full_inputs],
                        pointer_targets=pointer_targets,
                        bucket_id=bucket_id)
                    model_outputs = model.step(sess,
                                               formatted_example,
                                               bucket_id,
                                               forward_only=True)
                    encoder_outputs = model_outputs.encoder_hidden_states
                    decoder_outputs = model_outputs.decoder_hidden_states
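                    # debug print: inspect the decoder hidden states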
                    print(decoder_outputs[:, 0, :])

                    cm_slots = {}
                    output_tokens = []
                    for ii in xrange(len(outputs)):
                        output = outputs[ii]
                        if output < len(rev_tg_vocab):
                            token = rev_tg_vocab[output]
                            if "@@" in token:
                                token = token.split("@@")[-1]
                            output_tokens.append(token)
                            if token.startswith('__ARG__'):
                                token = token[len('__ARG__'):]
                            if nl_fillers is not None and \
                                    token in constants._ENTITIES:
                                if ii > 0 and slot_filling.is_min_flag(
                                        rev_tg_vocab[outputs[ii - 1]]):
                                    token_type = 'Timespan'
                                else:
                                    token_type = token
                                cm_slots[ii] = (token, token_type)
                        else:
                            output_tokens.append(data_utils._UNK)
                    if FLAGS.use_copy:
                        P = pointer_targets[0][0] > 0
                        pointers = model_outputs.pointers[0]
                        pointers = np.multiply(
                            np.sum(P.astype(float)[:pointers.shape[0],
                                                   -pointers.shape[1]:],
                                   1,
                                   keepdims=True), pointers)
                    else:
                        pointers = None
                    tree, _, mappings = slot_filling.stable_slot_filling(
                        output_tokens,
                        nl_fillers,
                        cm_slots,
                        pointers,
                        encoder_outputs[0],
                        decoder_outputs[0],
                        slot_filling_classifier,
                        verbose=True)

                    if mappings is not None:
                        # print(gt_mappings)
                        for mapping in mappings:
                            # print(mapping)
                            if mapping in gt_mappings:
                                num_correct_align += 1
                        num_predict_align += len(mappings)
                    num_gt_align += len(gt_mappings)

                    tokens = data_tools.ast2tokens(tree)
                    if not tokens:
                        continue
                    for ii in xrange(len(outputs)):
                        output = outputs[ii]
                        token = rev_tg_vocab[output]
                        if token.startswith('__ARG__'):
                            token = token[len('__ARG__'):]
                        if token in constants._ENTITIES:
                            argument = rev_tg_full_vocab[full_outputs[ii]]
                            if argument.startswith('__ARG__'):
                                argument = argument[len('__ARG__'):]
                            pred = tokens[ii]
                            if constants.remove_quotation(argument) == \
                                    constants.remove_quotation(pred):
                                num_correct_argument += 1
                            num_argument += 1
            if gt_mappings:
                break
        precision = num_correct_align / num_predict_align
        recall = num_correct_align / num_gt_align
        print("Argument Alignment Precision: {}".format(precision))
        print("Argument Alignment Recall: {}".format(recall))
        print("Argument Alignment F1: {}".format(2 * precision * recall /
                                                 (precision + recall)))

        print("Argument filling accuracy: {}".format(num_correct_argument /
                                                     num_argument))
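
A minimal call sketch, assuming dev_set is bucketed the same way as the module-level _buckets and that each data point carries gold mappings annotations (points without them are skipped):

# Hypothetical evaluation call; dev_set is assumed to come from the
# project's data-loading code, which is not shown here.
eval_slot_filling(dev_set)
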
Example #7
def decode_set(model, dataset, rev_sc_vocab, rev_tg_vocab, verbose=True):
    grouped_dataset = data_utils.group_data_by_nl(dataset, use_bucket=False,
                                                  use_temp=False)

    with DBConnection() as db:
        db.remove_model(model_name)
        
        num_eval = 0
        for sc_temp in grouped_dataset:
            batch_sc_strs, batch_tg_strs, batch_scs, batch_cmds = \
                grouped_dataset[sc_temp]
            _, entities = tokenizer.ner_tokenizer(sc_temp)
            nl_fillers = entities[-1]
            if nl_fillers is not None:
                cm_slots = {}

            sc_str = batch_sc_strs[0]
            nl = batch_scs[0]
            if verbose:
                print("Example {}".format(num_eval+1))
                print("Original English: " + sc_str.strip())
                print("English: " + sc_temp)
                for j in xrange(len(batch_tg_strs)):
                    print("GT Command {}: {}".format(j+1, batch_tg_strs[j].strip()))
            # retrieve the top-ranked command templates
            top_k_results = model.test(nl, 100)
            count = 0
            for i in xrange(len(top_k_results)):
                nn, output_tokens, score = top_k_results[i]
                nn_str = ' '.join([rev_sc_vocab[j] for j in nn])
                tokens = []
                for j in xrange(1, len(output_tokens)-1):
                    pred_token = rev_tg_vocab[output_tokens[j]]
                    if "@@" in pred_token:
                        pred_token = pred_token.split("@@")[-1]
                    if nl_fillers is not None and \
                            pred_token in constants._ENTITIES:
                        if j > 0 and slot_filling.is_min_flag(
                                rev_tg_vocab[output_tokens[j-1]]):
                            pred_token_type = 'Timespan'
                        else:
                            pred_token_type = pred_token
                        cm_slots[j] = (pred_token, pred_token_type)
                    tokens.append(pred_token)
                pred_cmd = ' '.join(tokens)
                # check if the predicted command template has enough slots to
                # hold the fillers (to rule out templates that are trivially
                # unqualified)
                if FLAGS.dataset.startswith("bash"):
                    pred_cmd = re.sub(r'( ;\s+)|( ;$)', r' \; ', pred_cmd)
                    tree = data_tools.bash_parser(pred_cmd)
                else:
                    tree = data_tools.paren_parser(pred_cmd)
                if nl_fillers is None or len(cm_slots) >= len(nl_fillers):
                    # Step 2: check if the predicted command template is
                    # grammatical; filter out non-grammatical outputs
                    if tree is not None:
                        matched = slot_filling.heuristic_slot_filling(tree, nl_fillers)
                if tree is not None:
                    slot_filling.fill_default_value(tree)
                    pred_cmd = data_tools.ast2command(tree)
                if verbose:
                    print("NN: {}".format(nn_str))
                    print("Prediction {}: {} ({})".format(i, pred_cmd, score))
                db.add_prediction(model_name, sc_str, pred_cmd, float(score),
                                  update_mode=False)
                count += 1
                if count == 10:
                    break
            print("")        
            num_eval += 1
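
A minimal call sketch, assuming model.test(nl, k) returns the top-k nearest-neighbor results and that the reverse vocabularies map token ids back to strings; model_name and FLAGS are module-level globals in the original code:

# Hypothetical call site; model, dev_set, and the reverse vocabularies are
# assumed to be created by code that is not shown here.
decode_set(model, dev_set, rev_sc_vocab, rev_tg_vocab, verbose=True)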