Example #1
    def extract(self, source, paraphrase, position):
        s = set(tokenize(source))
        p = set(tokenize(paraphrase))

        # Tokens unique to each sentence, both taken against the original sets.
        p_only = p.difference(s)
        s_only = s.difference(p)

        return n_similarity(list(p_only), list(s_only))
Example #2
def WordsFF(source, paraphrase, position):
    stokens = tokenize(source)
    ptokens = tokenize(paraphrase)

    # Absolute differences in word and character counts.
    word_num_diff = abs(len(stokens) - len(ptokens))
    letter_num_diff = abs(len(source) - len(paraphrase))

    return [word_num_diff, letter_num_diff]
Example #3
def SemanticSimilarityFF(source, paraphrase, position):
    sts_sim = sts.similarity(source, paraphrase)
    sent2vec = sentence2vec.similarity(source, paraphrase)
    word2vec = n_similarity(tokenize(source), tokenize(paraphrase))
    wm = wm_distance(tokenize(source), tokenize(paraphrase))
    # Word Mover's Distance is infinite when the sentences share no
    # in-vocabulary words; cap it so the feature stays finite.
    if math.isinf(wm):
        wm = 10

    return [sts_sim, sent2vec, word2vec, wm]
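The helpers n_similarity and wm_distance come from the surrounding project. Assuming they wrap gensim word vectors, a minimal stand-in might look like this (a sketch, not the project's actual code; the model path is illustrative):

# Sketch of the embedding helpers, assuming a gensim KeyedVectors model.
from gensim.models import KeyedVectors

_wv = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin",
                                        binary=True)

def n_similarity(tokens_a, tokens_b):
    # Cosine similarity between the mean vectors of two token lists.
    return _wv.n_similarity(tokens_a, tokens_b)

def wm_distance(tokens_a, tokens_b):
    # Word Mover's Distance; inf when the lists share no in-vocabulary words.
    return _wv.wmdistance(tokens_a, tokens_b)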
Example #4
def top_words():
    dictionary = Counter()
    for expression in dataset:
        expr = set(tokenize(expression))
        for instance in dataset[expression]:
            paraphrase = instance[0]
            # Count paraphrase tokens that appear neither in the source
            # expression nor in the stopword list.
            dictionary.update([
                t for t in tokenize(paraphrase)
                if t not in expr and t not in stopwords
            ])

    for token in dictionary.most_common(50):
        print(token)
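These snippets index a global dataset as dataset[expression] -> list of instances, with instance[0] being the paraphrase. A hypothetical toy value matching that shape, for illustration only:

# Hypothetical toy dataset matching the access pattern above.
dataset = {
    "turn on the light": [
        ("switch the lamp on", 0.9),
        ("make it bright in here", 0.7),
    ],
}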
Example #5
 def extract(self, source, paraphrase, position):
     p = tokenize(paraphrase)
     tense = 0
     for _, tag in pos_tag(p):
         if 'VB' in tag:
             tense = max(tense, TenseFF._pos_to_digit(tag))
     return tense
Example #6
def TenseFF(source, paraphrase, position):
    """
    VB/VBP = 1
    VBG = 2
    VBN = 3
    VBZ = 4
    VBD = 5
    """
    def _pos_to_digit(pos):
        if pos == 'VB' or pos == 'VBP':
            return 1
        if pos == 'VBG':
            return 2
        if pos == 'VBN':
            return 3
        if pos == 'VBZ':
            return 4
        if pos == 'VBD':
            return 5

        return -1

    p = tokenize(paraphrase)
    tense = 0
    for _, tag in pos_tag(p):
        if 'VB' in tag:
            tense = max(tense, _pos_to_digit(tag))
    return tense
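A quick check of TenseFF, assuming tokenize is a plain word tokenizer and pos_tag is NLTK's tagger (the snippet's own helpers may differ):

# Illustrative run with NLTK standing in for the project's helpers.
from nltk import pos_tag
from nltk.tokenize import word_tokenize as tokenize

# "was" tags as VBD (5), "running" as VBG (2); the max wins.
print(TenseFF("she runs home", "she was running home", 0))  # -> 5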
Example #7
def PronounFF(source, paraphrase, position):
    s = set(tokenize(source))
    p = set(tokenize(paraphrase))

    pronoun_groups = {
        'i': {'i', 'me', 'my', 'mine', 'myself'},
        'you': {'you', 'yours', 'yourself'},
        'he': {'he', 'his', 'him', 'himself'},
        'she': {'she', 'her', 'herself'},
        'we': {'we', 'us', 'our', 'ours', 'ourselves'},
        'they': {'they', 'their', 'them', 'themselves'},
    }

    # A group is "dangling" when its presence matches on both sides: the
    # pronoun appears in both sentences or in neither. Presence is a
    # non-empty intersection, so a single occurrence counts.
    dangling = {
        name: (len(s & group) > 0) == (len(p & group) > 0)
        for name, group in pronoun_groups.items()
    }

    return [
        int(dangling['i']),
        int(dangling['you']),
        int(dangling['he'] or dangling['she'] or dangling['we']
            or dangling['they'])
    ]
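For instance, swapping the speaker pronoun flips the first two flags off while the third (he/she/we/they, absent on both sides) stays consistent:

# 'i' appears only in the source, 'you' only in the paraphrase;
# he/she/we/they are absent from both, so the third flag stays 1.
print(PronounFF("i like tea", "you like tea", 0))  # -> [0, 0, 1]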
Example #8
def sentence_to_vec(sentence, max_length):
    from ParaVec import word2vec
    ret = np.zeros((max_length, 300))
    # Embed at most max_length tokens; ret has a fixed number of rows.
    for i, t in enumerate(tokenize(sentence)[:max_length]):
        vec = word2vec.vector(t)
        if vec is None:
            # Fall back to the embedding of a generic "unknown" token.
            vec = word2vec.vector("unknown")

        ret[i] = vec

    return ret
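Each sentence becomes a fixed-size (max_length, 300) matrix, so a batch can be stacked directly (a sketch; it relies on the ParaVec model being available):

# Stacking fixed-size sentence matrices into a batch (illustrative).
import numpy as np

batch = np.stack([sentence_to_vec(s, 20)
                  for s in ["a red box", "the blue ball"]])
print(batch.shape)  # (2, 20, 300)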
Example #9
def plural_to_singular_edit(canonical, resources=None):
    """
    Corrects plural-noun grammatical errors.
    """
    if resources is None:
        resources = []

    for resource in resources:
        rname = resource.name
        if resource.resource_type != SINGLETON:
            continue

        canonical = canonical.replace(rname, "a {}".format(singular(rname)))
        canonical = canonical.replace(" the a ", " a ")
        canonical = canonical.replace(" a a ", " a ")
        canonical = canonical.replace(" an a ", " a ")
        canonical = canonical.replace(" a an ", " a ")

    canonical = LanguageChecker().grammar_corector(
        canonical, categories=['MISC']).lower()
    tokens = tokenize(canonical, normilize_text=False)

    ret = []
    seen_article = -20
    for token in tokens:

        if token in {"a", "an"}:
            seen_article = 0

        # Singularize a plural noun appearing within two tokens of an
        # indefinite article.
        if 3 > seen_article > 0:
            if not is_singular(token):
                ret.append(singular(token))
                seen_article = 0
                continue

        ret.append(token)
        seen_article += 1

    # Second pass (right to left): keep the sentence's final plural token
    # as-is and singularize any plural token that precedes it.
    tokens = []
    prev = False
    for token in reversed(ret):
        if not is_singular(token):
            if prev:
                token = singular(token)
            prev = True
        tokens.append(token)

    ret = reversed(tokens)

    return " ".join(ret).replace("< <", "<<").replace("> >", ">>")
Example #10
def process_product_name(
    product_names: Iterable[str],
    nlp,
    token_to_int: Dict,
    max_length: int,
    preprocessing_config: TextPreprocessingConfig,
) -> np.ndarray:
    tokens_all = [
        tokenize(
            preprocess_product_name(text, **dataclasses.asdict(preprocessing_config)),
            nlp,
        )
        for text in product_names
    ]
    tokens_int = [
        [token_to_int[t if t in token_to_int else UNK_TOKEN] for t in tokens]
        for tokens in tokens_all
    ]
    return pad_sequences(tokens_int, max_length)
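pad_sequences here is presumably the usual right-padding utility (e.g., Keras's). A minimal NumPy stand-in with the assumed behavior, for reference:

# Assumed stand-in for pad_sequences: right-pad or truncate each integer
# sequence to max_length, padding with 0.
import numpy as np

def pad_sequences(sequences, max_length, pad_value=0):
    out = np.full((len(sequences), max_length), pad_value, dtype=np.int64)
    for i, seq in enumerate(sequences):
        trunc = seq[:max_length]
        out[i, :len(trunc)] = trunc
    return out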
Example #11
def top_ngrams(ngram):
    dictionary = Counter()
    for expression in dataset:
        for instance in dataset[expression]:
            paraphrase = instance[0]

            for gram in ngrams(tokenize(paraphrase.lower()), ngram):
                phrase = " ".join(gram)
                # Skip n-grams that already occur in the source expression.
                if phrase in expression.lower():
                    continue
                gram_tokens = [gram] if isinstance(gram, str) else list(gram)
                # Count the n-gram only if it contains a verb.
                for _, tag in pos_tag(gram_tokens):
                    if "VB" in tag:
                        dictionary.update([phrase])
                        break

    for token in dictionary.most_common(100):
        print(token)
Example #12
 def extract(self, source, paraphrase, position):
     return len(tokenize(source)) - len(tokenize(paraphrase))
Example #13
 def extract(self, source, paraphrase, position):
     # Normalized Damerau-Levenshtein distance over token sequences
     return normalized_damerau_levenshtein_distance(tokenize(source),
                                                    tokenize(paraphrase))
Example #14
 def _measure(text):
     # SMOG grade, treating the text as a single sentence (the "/ 1").
     words = tokenize(text)
     polysyllables = len([w for w in words if len(syllables(w)) > 2])
     return 1.0430 * math.sqrt(polysyllables * 30 / 1) + 3.1291
Example #15
 def _measure(text):
     # Flesch Reading Ease variant: single sentence assumed, polysyllable
     # count used in place of total syllables, +1 smoothing on word count.
     words = tokenize(text)
     polysyllables = len([w for w in words if len(syllables(w)) > 2])
     n_words = len(words) + 1
     return 206.835 - 1.015 * n_words / 1 - 84.4 * polysyllables / n_words
Example #16
 def _measure(text):
     # Gunning fog index, treating the text as a single sentence.
     words = tokenize(text)
     complex_words = [w for w in words if len(syllables(w)) >= 3]
     return 0.4 * (len(words) / 1 + 100 * len(complex_words) /
                   (len(words) + 1))
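For reference, the standard formulas these three _measure variants approximate, each with the sentence count fixed at 1:

SMOG grade = 1.0430 * sqrt(polysyllables * (30 / sentences)) + 3.1291
Flesch Reading Ease = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
Gunning fog index = 0.4 * [(words / sentences) + 100 * (complex words / words)]

(The Flesch snippet substitutes the polysyllable count for total syllables and uses 84.4 rather than the canonical 84.6.)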
Example #17
def Entropy(source, paraphrase, position):
    s = set(tokenize(source))
    p = set(tokenize(paraphrase))
    # Entropy of the words the paraphrase adds over the source, and of
    # the paraphrase as a whole.
    word_diff_entropy = ent.shannon_entropy(" ".join(p.difference(s)))
    entropy = ent.shannon_entropy(paraphrase)
    return [entropy, word_diff_entropy]
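shannon_entropy here is character-level Shannon entropy: over a string with character frequencies p_i it computes H = -sum_i p_i * log2(p_i). A tiny equivalent, assumed (not verified) to match the ent library's definition:

# Minimal character-level Shannon entropy, for illustration.
import math
from collections import Counter

def shannon_entropy(text):
    if not text:
        return 0.0
    counts = Counter(text)
    n = len(text)
    return -sum((c / n) * math.log2(c / n) for c in counts.values())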
Example #18
 def extract(self, source, paraphrase, position):
     return n_similarity(tokenize(source), tokenize(paraphrase))
Example #19
def main(args):
    """
    Save nx.graph (Gss, Gts,...) and corresponding torch_geometric.data.PairData
    (via clevr_parse embedder api).
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        logger.info(
            'Must give one of --input_vocab_json or --output_vocab_json')
        return
    graph_parser = clevr_parser.Parser(
        backend='spacy',
        model=args.parser_lm,
        has_spatial=True,
        has_matching=True).get_backend(identifier='spacy')
    embedder = clevr_parser.Embedder(
        backend='torch', parser=graph_parser).get_backend(identifier='torch')
    is_directed_graph = args.is_directed_graph  # Parse graphs as nx.MultiDiGraph

    out_dir, out_f_prefix = _get_out_dir_and_file_prefix(args)
    checkpoint_dir = f"{out_dir}/checkpoints"
    utils.mkdirs(checkpoint_dir)

    questions, img_scenes = get_questions_and_parsed_scenes(
        args.input_questions_json, args.input_parsed_img_scenes_json)
    if args.is_debug:
        set_default_level(10)
        # default batch size is 64; 128 ensures enough for batch iteration
        questions = questions[:128]
        logger.debug(
            f"In DEBUG mode, sampling {len(questions)} questions only..")
    # Process Vocab #
    vocab = _process_vocab(args, questions)

    # Encode all questions and programs
    logger.info('Encoding data')
    questions_encoded, programs_encoded, answers, image_idxs = [], [], [], []
    question_families = []
    orig_idxs = []

    # Graphs and Embeddings #
    data_s_list = []  # List [torch_geometric.data.Data]
    data_t_list = []  # List [torch_geometric.data.Data]
    num_samples = 0  # Counter for keeping track of processed samples
    num_skipped = 0  # Counter for tracking num of samples skipped
    for orig_idx, q in enumerate(questions):
        # First see if Gss, Gts are possible to extract.
        # If not (e.g., for some edge cases like plurality), skip the data sample.
        img_idx = q['image_index']
        img_fn = q['image_filename']
        logger.debug(f"\tProcessing Image - {img_idx}: {img_fn} ...")
        # q_idx = q['question_index']
        # q_fam_idx = q['question_family_index']
        ## 1: Ensure both Gs, Gt are parseable for this question sample, otherwise skip
        img_scene = list(
            filter(lambda x: x['image_index'] == img_idx, img_scenes))[0]
        try:
            Gt, t_doc = graph_parser.get_doc_from_img_scene(
                img_scene, is_directed_graph=is_directed_graph)
            X_t, ei_t, e_attr_t = embedder.embed_t(
                img_idx, args.input_parsed_img_scenes_json)
        except AssertionError as ae:
            logger.warning(f"AssertionError Encountered: {ae}")
            logger.warning(f"[{img_fn}] Excluding images with > 10 objects")
            num_skipped += 1
            continue
        if Gt is None and ("SKIP" in t_doc):
            # If the derendering pipeline failed, then just skip the
            # scene; don't process the labels (and text_scenes) for the image
            print(f"Got None img_doc at image_index: {img_idx}")
            print(f"Skipping all text_scenes for image idx: {img_idx}")
            num_skipped += 1
            continue
        s = q['question']
        orig_idx = q['question_index']
        try:
            Gs, s_doc = graph_parser.parse(s,
                                           return_doc=True,
                                           is_directed_graph=is_directed_graph)
            X_s, ei_s, e_attr_s = embedder.embed_s(s)
        except ValueError as ve:
            logger.warning(f"ValueError Encountered: {ve}")
            logger.warning(f"Skipping question: {s} for {img_fn}")
            num_skipped += 1
            continue
        if Gs is None and ("SKIP" in s_doc):
            logger.warning(
                "Got None as Gs and 'SKIP' in Gs_embd. (likely plural with CLEVR_OBJS label) "
            )
            logger.warning(
                f"SKIPPING processing {s} for {img_fn} and at {img_idx}")
            num_skipped += 1
            continue

        # Using ClevrData allows us a debug extension to Data
        data_s = ClevrData(x=X_s, edge_index=ei_s, edge_attr=e_attr_s)
        data_t = ClevrData(x=X_t, edge_index=ei_t, edge_attr=e_attr_t)
        data_s_list.append(data_s)
        data_t_list.append(data_t)

        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(img_idx)
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = preprocess_utils.tokenize(question,
                                                    punct_to_keep=[';', ','],
                                                    punct_to_remove=['?', '.'])
        question_encoded = preprocess_utils.encode(
            question_tokens,
            vocab['question_token_to_idx'],
            allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        has_prog_seq = 'program' in q
        if has_prog_seq:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = preprocess_utils.tokenize(program_str)
            program_encoded = preprocess_utils.encode(
                program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            ans = q['answer']
            answers.append(vocab['answer_token_to_idx'][ans])

        num_samples += 1
        logger.info("-" * 50)
        logger.info(f"Samples processed count = {num_samples}")
        if has_prog_seq:
            logger.info(f"\n[{orig_idx}]: question: {question} \n"
                        f"\tprog_str: {program_str} \n"
                        f"\tanswer: {ans}")
        logger.info("-" * 50)

        # ---- CHECKPOINT ---- #
        if num_samples % args.checkpoint_every == 0:
            logger.info(f"Checkpointing at {num_samples}")
            checkpoint_fn_prefix = f"{out_f_prefix}_{num_samples}"
            _out_dir = f"{checkpoint_dir}/{out_f_prefix}_{num_samples}"
            utils.mkdirs(_out_dir)
            out_fpp = f"{_out_dir}/{checkpoint_fn_prefix}"
            # ------------ Checkpoint .H5 ------------#
            logger.info(
                f"CHECKPOINT: Saving checkpoint files at directory: {out_fpp}")
            save_h5(f"{out_fpp}.h5", vocab, questions_encoded, image_idxs,
                    orig_idxs, programs_encoded, question_families, answers)
            # ------------ Checkpoint GRAPH DATA ------------#
            save_graph_pairdata(out_fpp,
                                data_s_list,
                                data_t_list,
                                is_directed_graph=is_directed_graph)
            logger.info(f"-------------- CHECKPOINT: COMPLETED --------")

        if (args.max_sample > 0) and (num_samples >= args.max_sample):
            logger.info(f"len(questions_encoded = {len(questions_encoded)}")
            logger.info("args.max_sample reached: Completing ... ")
            break

    logger.debug(f"Total samples skipped = {num_skipped}")
    logger.debug(f"Total samples processed = {num_samples}")
    out_fpp = f"{out_dir}/{out_f_prefix}"
    ## SAVE .H5: Baseline {dataset}_h5.h5 file (q,p,ans,img_idx) as usual
    logger.info(f"Saving baseline (processed) data in: {out_fpp}.h5")
    save_h5(f"{out_fpp}.h5", vocab, questions_encoded, image_idxs, orig_idxs,
            programs_encoded, question_families, answers)
    ## ------------  SAVE GRAPH DATA ------------ ##
    ## N.b. Ensure the lengths of these lists are all equal
    save_graph_pairdata(out_fpp,
                        data_s_list,
                        data_t_list,
                        is_directed_graph=is_directed_graph)
    logger.info(f"Saved Graph Data in: {out_fpp}_*.[h5|.gpickle|.npz|.pt] ")
Example #20
def main(args):
    print('Loading captions')
    with open(args.input_captions_json, 'r') as f:
        captions = json.load(f)
    with open(args.input_neg_captions_json, 'r') as f:
        neg_captions = json.load(f)
    with open(args.split_json, 'r') as f:
        splits = json.load(f)
    all_imgs = sorted(os.listdir(args.input_image_dir))
    captioned_imgs = list(captions.keys())
    all_captions = []
    for img, caps in captions.items():
        all_captions.extend(caps)
    all_neg_captions = []
    for img, caps in neg_captions.items():
        all_neg_captions.extend(caps)

    # Extract train data points
    train_split = splits['train']
    train_imgs = [all_imgs[idx] for idx in train_split]
    train_captions = []
    train_neg_captions = []
    for img in train_imgs:
        cap = captions[img]
        neg_cap = neg_captions[img]
        train_captions.extend(cap)
        train_neg_captions.extend(neg_cap)

    N = len(all_imgs)
    N_captioned = len(captions)
    M = len(all_captions)
    M_neg = len(all_neg_captions)
    print('Total images: %d' % N)
    print('Total captioned images: %d' % N_captioned)
    print('Total captions: %d' % M)
    print('Total negative captions: %d' % M_neg)
    print('Total train images: %d' % len(train_imgs))
    print('Total train captions: %d' % len(train_captions))
    print('Total train neg captions: %d' % len(train_neg_captions))

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '':
        print('Building vocab')
        word_to_idx = build_vocab(train_captions + train_neg_captions,
                                  min_token_count=args.word_count_threshold,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
    else:
        print('Loading vocab')
        with open(args.input_vocab_json, 'r') as f:
            word_to_idx = json.load(f)
    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(word_to_idx, f)

    # Encode all captions
    # First, figure out max length of captions
    all_cap_tokens = []
    max_length = -1
    cap_keys = sorted(list(captions.keys()))
    for img in cap_keys:
        caps = captions[img]
        n = len(caps)
        assert n > 0, 'error: some image has no caption'
        tokens_list = []
        for cap in caps:
            cap_tokens = tokenize(cap,
                                  add_start_token=True,
                                  add_end_token=False,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
            tokens_list.append(cap_tokens)
            max_length = max(max_length, len(cap_tokens))
        all_cap_tokens.append((img, tokens_list))

    all_neg_cap_tokens = []
    neg_cap_keys = sorted(neg_captions.keys())
    for img in neg_cap_keys:
        neg_caps = neg_captions[img]
        neg_n = len(neg_caps)
        assert neg_n > 0, 'error: some image has no negative caption'
        neg_tokens_list = []
        for neg_cap in neg_caps:
            neg_cap_tokens = tokenize(neg_cap,
                                      add_start_token=True,
                                      add_end_token=False,
                                      punct_to_keep=[';', ','],
                                      punct_to_remove=['?', '.'])
            neg_tokens_list.append(neg_cap_tokens)
        all_neg_cap_tokens.append((img, neg_tokens_list))

    print('Encoding captions')
    label_arrays = []
    label_start_idx = -np.ones(N, dtype=np.int64)
    label_end_idx = -np.ones(N, dtype=np.int64)
    label_length = np.zeros(M, dtype=np.int64)
    caption_counter = 0
    counter = 0

    # Then encode
    for img, tokens_list in all_cap_tokens:
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=np.int64)
        for j, tokens in enumerate(tokens_list):
            label_length[caption_counter] = len(tokens)
            caption_counter += 1
            tokens_encoded = encode(tokens,
                                    word_to_idx,
                                    allow_unk=args.allow_unk == 1)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w
        # captions are padded with zeros
        label_arrays.append(Li)
        label_start_idx[i] = counter
        label_end_idx[i] = counter + n - 1

        counter += n

    L = np.concatenate(label_arrays, axis=0)  # put all labels together
    assert L.shape[0] == M, "lengths don't match?"
    assert np.all(label_length > 0), 'error: some captions have no word?'

    print('Encoding negative captions')
    neg_label_arrays = []
    neg_label_start_idx = -np.ones(N, dtype=np.int64)
    neg_label_end_idx = -np.ones(N, dtype=np.int64)
    neg_label_length = np.zeros(M_neg, dtype=np.int64)
    neg_caption_counter = 0
    neg_counter = 0

    # Then encode
    for img, tokens_list in all_neg_cap_tokens:
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=np.int64)
        for j, tokens in enumerate(tokens_list):
            neg_label_length[neg_caption_counter] = len(tokens)
            neg_caption_counter += 1
            tokens_encoded = encode(tokens,
                                    word_to_idx,
                                    allow_unk=args.allow_unk == 1)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w
        # captions are padded with zeros
        neg_label_arrays.append(Li)
        neg_label_start_idx[i] = neg_counter
        neg_label_end_idx[i] = neg_counter + n - 1

        neg_counter += n

    neg_L = np.concatenate(neg_label_arrays, axis=0)  # put all labels together
    assert neg_L.shape[0] == M_neg, "lengths don't match?"
    assert np.all(neg_label_length > 0), 'error: some captions have no word?'

    # Create h5 file
    print('Writing output')
    print('Encoded captions array size: ', L.shape)
    print('Encoded negative captions array size: ', neg_L.shape)
    with h5py.File(args.output_h5, 'w') as f:
        f.create_dataset('labels', data=L)
        f.create_dataset('label_start_idx', data=label_start_idx)
        f.create_dataset('label_end_idx', data=label_end_idx)
        f.create_dataset('label_length', data=label_length)
        f.create_dataset('neg_labels', data=neg_L)
        f.create_dataset('neg_label_start_idx', data=neg_label_start_idx)
        f.create_dataset('neg_label_end_idx', data=neg_label_end_idx)
        f.create_dataset('neg_label_length', data=neg_label_length)
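Reading the resulting file back follows the same layout: captions for image i occupy rows label_start_idx[i] through label_end_idx[i] of labels (a sketch; the path is illustrative):

# Illustrative read-back of the caption h5 layout written above.
import h5py

with h5py.File("captions.h5", "r") as f:  # hypothetical path
    labels = f["labels"][...]
    start = f["label_start_idx"][...]
    end = f["label_end_idx"][...]
# Rows start[i] .. end[i] (inclusive) hold the encoded captions of image i.
print(labels.shape)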
Example #21
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        answer_token_to_idx = None
        if 'answer' in questions[0]:
            answer_token_to_idx = preprocess_utils.build_vocab(
                (q['answer'] for q in questions))
        question_token_to_idx = preprocess_utils.build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = preprocess_utils.build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        utils.mkdirs(os.path.dirname(args.output_vocab_json))
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = preprocess_utils.tokenize(question,
                                                    punct_to_keep=[';', ','],
                                                    punct_to_remove=['?', '.'])
        question_encoded = preprocess_utils.encode(
            question_tokens,
            vocab['question_token_to_idx'],
            allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = preprocess_utils.tokenize(program_str)
            program_encoded = preprocess_utils.encode(
                program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    utils.mkdirs(os.path.dirname(args.output_h5_file))
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
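Both preprocessing scripts lean on preprocess_utils.tokenize and preprocess_utils.encode. A minimal sketch of what such helpers conventionally look like in CLEVR-style pipelines (assumed, not the project's actual implementation):

# Assumed sketch of the tokenize/encode helpers used above.
def tokenize(s, delim=' ', punct_to_keep=None, punct_to_remove=None,
             add_start_token=False, add_end_token=False):
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))  # keep as separate tokens
    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')
    tokens = s.lower().split(delim)
    if add_start_token:
        tokens.insert(0, '<START>')
    if add_end_token:
        tokens.append('<END>')
    return tokens

def encode(tokens, token_to_idx, allow_unk=False):
    out = []
    for token in tokens:
        if token not in token_to_idx:
            if not allow_unk:
                raise KeyError('token "%s" not in vocab' % token)
            token = '<UNK>'
        out.append(token_to_idx[token])
    return out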
Example #22
 def extract(self, source, paraphrase, position):
     s = set(tokenize(source))
     p = set(tokenize(paraphrase))
     # Fraction of paraphrase tokens also present in the source (+1 smoothing).
     return 1 - len(p.difference(s)) / (len(p) + 1)
Example #23
 def extract(self, source, paraphrase, position):
     wm = wm_distance(tokenize(source), tokenize(paraphrase))
     # WMD is infinite when the sentences share no in-vocabulary words.
     if math.isinf(wm):
         return 100
     return wm
Example #24
 def extract(self, source, paraphrase, position):
     s = set(tokenize(source))
     p = set(tokenize(paraphrase))
     return ent.shannon_entropy(" ".join(p.difference(s)))
Example #25
def EditDistanceFF(source, paraphrase, position):
    # Raw character-level Levenshtein distance.
    levenshtein = editdistance.eval(source, paraphrase)
    # Normalized Damerau-Levenshtein distance over characters and over tokens.
    normalized_char = normalized_damerau_levenshtein_distance(source, paraphrase)
    normalized_token = normalized_damerau_levenshtein_distance(
        tokenize(source), tokenize(paraphrase))
    return [levenshtein, normalized_char, normalized_token]
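Taken together, these FF functions suggest a simple extraction harness: run each one over a (source, paraphrase) pair and flatten the scalar or list results into a single feature vector (hypothetical glue code; the function list is illustrative):

# Hypothetical harness tying the feature functions together.
FEATURE_FUNCTIONS = [WordsFF, SemanticSimilarityFF, TenseFF, PronounFF,
                     Entropy, EditDistanceFF]

def extract_features(source, paraphrase, position=0):
    features = []
    for ff in FEATURE_FUNCTIONS:
        value = ff(source, paraphrase, position)
        # Feature functions return either a scalar or a list of scalars.
        features.extend(value if isinstance(value, list) else [value])
    return features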