import json
import os
from collections import Counter

import numpy as np
import tqdm

# tokenize, get_objects, extract_image_features, merge_train, and
# preprocess_answer are assumed to be defined elsewhere in this module.


def get_imdb(file_path):
    """Convert one GQA questions JSON file into a VQA-style imdb array."""
    imdb = [{"dataset_name": "gqa"}]
    with open(file_path, "r") as f:
        questions = json.load(f)
    print("Processing file {}".format(file_path))
    for qid, item in tqdm.tqdm(questions.items()):
        entry = {
            # Fixed: was `item["imageId"] + "jpg"`, which dropped the dot
            # and produced names like "2386321jpg".
            "image_name": item["imageId"] + ".jpg",
            "image_id": item["imageId"],
            "question_id": qid,
            "question_str": item["question"],
            "question_tokens": tokenize(item["question"]),
        }
        # Only annotated splits carry answers and semantic-program metadata;
        # test/challenge questions have neither.
        if "answer" in item:
            # Replicate the single GQA answer 10 times to match the
            # 10-annotator answer format VQA imdbs expect.
            entry["all_answers"] = [item["answer"] for _ in range(10)]
            entry["valid_answers"] = [item["answer"] for _ in range(10)]
            entry["semantic_string"] = (item["semanticStr"],)
            entry["gt_object_ids"] = (get_objects(item["semanticStr"]),)
            entry["meta_data"] = item["types"]
        imdb.append(entry)
    return np.array(imdb)
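# A minimal sanity-check helper (a sketch, not part of the original pipeline):
# load a saved imdb .npy and print its first few entries. The function name and
# signature are assumptions for illustration. Entries are plain dicts stored in
# an object array, so NumPy needs allow_pickle=True to deserialize them.
def inspect_imdb(imdb_path, n=3):
    imdb = np.load(imdb_path, allow_pickle=True)
    print("header:", imdb[0])  # first record is the header: {"dataset_name": "gqa"}
    for entry in imdb[1 : n + 1]:
        print(entry["question_id"], entry["question_str"])
        if "all_answers" in entry:
            print("  answer:", entry["all_answers"][0])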
def convert_gqa_to_vqa(gqa_dir, out_dir):
    """
    Takes the GQA dataset and converts it into VQA format.

    Assumes a GQA dir structure as:
    - gqa_dir/
        - images/
            - images/
            - objects/
            - spatial/
        - questions/
        - scenegraphs/
    """
    image_feat_path = os.path.join(gqa_dir, "images")
    extract_image_features(image_feat_path, out_dir)

    questions_dir = os.path.join(gqa_dir, "questions")
    # GQA ships train questions in multiple chunks; merge them once into a
    # single train_all_questions.json unless it already exists.
    if os.path.isfile(os.path.join(questions_dir, "train_all_questions.json")):
        print("Using previously generated train_all_questions.json file")
    else:
        merge_train(os.path.join(gqa_dir, "questions", "train_all_questions"))

    # Build one imdb .npy per split, in both the "all" and "balanced" variants.
    split_mapping = {
        "test": "test_all_questions.json",
        "val": "val_all_questions.json",
        "challenge": "challenge_all_questions.json",
        "train": "train_all_questions.json",
    }
    for split in split_mapping:
        for balance_type in ["balanced", "all"]:
            filename = split_mapping[split]
            csplit = split
            if balance_type == "balanced":
                filename = filename.replace("_all", "_balanced")
                csplit = split + "_balanced"
            file_path = os.path.join(questions_dir, filename)
            imdb = get_imdb(file_path)
            save_path = os.path.join(out_dir, "imdb_{}.npy".format(csplit))
            np.save(save_path, imdb)

    # Collect vocabulary statistics over the annotated splits only
    # (test/challenge questions have no answers).
    splits = ["val", "train"]
    split_type = ["balanced", "all"]
    global_answer = Counter()
    global_q = Counter()
    question_len = Counter()
    for s in splits:
        for st in split_type:
            questions_json = os.path.join(
                questions_dir, "{}_{}_questions.json".format(s, st)
            )
            with open(questions_json, "r") as f:
                questions = json.load(f)
            print("Processing split {}_{}".format(s, st))
            # Per-split counters, tracked alongside the global ones.
            answers = Counter()
            q_tokens = Counter()
            for _, q in tqdm.tqdm(questions.items()):
                tokens = tokenize(q["question"])
                q_tokens.update(tokens)
                global_q.update(tokens)
                answers.update([q["answer"].lower()])
                global_answer.update([q["answer"].lower()])
                question_len.update([len(tokens)])

    print("N unique answers :", len(global_answer))
    print("N unique q tokens:", len(global_q))
    print("Min Q length", min(question_len))
    print("Max Q length", max(question_len))
    print("Q length distribution", question_len)

    # Save question vocabulary, with <unk> as index 0
    q_vocabulary = ["<unk>"] + sorted(global_q)
    vocab_file = os.path.join(out_dir, "vocabulary_gqa.txt")
    with open(vocab_file, "w") as f:
        f.writelines([w + "\n" for w in q_vocabulary])

    # Save answer vocabulary, normalized and de-blanked, with <unk> first
    answer_list = [preprocess_answer(ans) for ans in global_answer]
    answer_list = [t.strip() for t in answer_list if len(t.strip()) > 0]
    answer_list.sort()
    if "<unk>" not in answer_list:
        answer_list = ["<unk>"] + answer_list
    answer_file = os.path.join(out_dir, "answers_gqa.txt")
    with open(answer_file, "w") as fp:
        fp.writelines([w + "\n" for w in answer_list])
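# A hypothetical command-line entry point (a sketch; the original script may
# wire this up differently). Both flag names below are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Convert the GQA dataset into VQA imdb format"
    )
    parser.add_argument("--gqa_dir", required=True, help="Root of the GQA download")
    parser.add_argument("--out_dir", required=True, help="Output dir for imdbs/vocabs")
    args = parser.parse_args()
    convert_gqa_to_vqa(args.gqa_dir, args.out_dir)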