Example #1
    def create_data(question_txt, word2ix, eos=False):
        """
        :param question_txt: list of questions as text
        :param word2ix: dictionary that maps a word to its id
        :param eos: boolean flag for appending the EOS token
        :return: list of questions as lists of word ids, and list of questions as lists of tokens
        """

        sentence_ids = []
        sentence_words = []

        for q in question_txt:
            tokens = TextUtils.preprocess(q)
            ids = [word2ix[token] if token in word2ix else word2ix["UNK"]
                   for token in tokens]
            if eos:
                ids.append(word2ix["EOS"])

            sentence_ids.append(ids)
            sentence_words.append(tokens)

        return sentence_ids, sentence_words
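A minimal usage sketch of create_data (assuming it is a static method of DataCreator, as the later calls to DataCreator.create_dict suggest, and that TextUtils.preprocess lower-cases and tokenises; the toy vocabulary below is made up for illustration):

# hypothetical toy vocabulary with reserved UNK/EOS ids
word2ix = {"who": 1, "wrote": 2, "hamlet": 3, "UNK": 4, "EOS": 5}
questions = ["Who wrote Hamlet", "Who painted Guernica"]

ids, words = DataCreator.create_data(questions, word2ix, eos=True)
# ids   -> e.g. [[1, 2, 3, 5], [1, 4, 4, 5]]  (out-of-vocabulary tokens map to UNK, EOS appended)
# words -> the tokenised questions as returned by TextUtils.preprocess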
Example #2
    def create_dict(questions, eos=False, cut_frq=False, cut_under=5, additional=None):
        tokens = [token for sentence in questions for token in TextUtils.preprocess(sentence)]
        if additional is not None:
            tokens.extend(additional)
        if cut_frq:
            word_count = Counter(tokens)
            words_set = set()
            for k, v in word_count.items():
                if v > cut_under:
                    words_set.add(k)

            words = sorted(list(words_set))

        else:
            words = sorted(list(set(tokens)))

        data_size, vocab_size = len(tokens), len(words)

        print("Initialize dataset with {} characters, {} unique.".format(data_size, vocab_size))

        word_to_ix = {ch: i + 1 for i, ch in enumerate(words)}
        ix_to_word = {i + 1: ch for i, ch in enumerate(words)}

        word_to_ix["UNK"] = len(word_to_ix) + 1
        ix_to_word[len(ix_to_word) + 1] = "UNK"

        if eos:
            word_to_ix["EOS"] = len(word_to_ix) + 1
            ix_to_word[len(ix_to_word) + 1] = "EOS"

        return word_to_ix, ix_to_word
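A hedged sketch of the dictionaries create_dict builds for a toy corpus (again assuming a simple lower-casing tokeniser in TextUtils.preprocess; ids start at 1, leaving 0 free, e.g. for padding):

word2ix, ix2word = DataCreator.create_dict(questions=["who wrote it", "who sang it"], eos=True)
# words are sorted alphabetically and indexed from 1, then UNK and EOS are appended:
# word2ix -> {"it": 1, "sang": 2, "who": 3, "wrote": 4, "UNK": 5, "EOS": 6}
# ix2word -> the inverse mapping {1: "it", ..., 6: "EOS"}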
def preprocess_mid2ent(mid2entity):
    """
    :return: dictionary where key is an MID and value is the entity"s name
    """
    mid2alias = dict()
    for m, l in mid2entity.items():
        tokens = TextUtils.preprocess(l[0])
        mid2alias[m] = " ".join(token.lower() for token in tokens)

    return mid2alias
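For illustration, a hedged sketch of the resulting mapping (only the first alias of each MID is used; the MID and name below are invented):

mid2entity = {"0abc12": ["Douglas Adams", "Douglas Noel Adams"]}  # hypothetical input
mid2alias = preprocess_mid2ent(mid2entity)
# mid2alias -> {"0abc12": "douglas adams"}  (first alias, tokenised, lower-cased and re-joined)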
Example #4
    def create_predicate_dict(predicates):
        predicates_ = []
        for p in predicates:
            predicates_.append(p.replace("www.freebase.com/", ""))

        predicates = sorted(list(set(predicates_)))
        pred2ix = {ch: i for i, ch in enumerate(predicates)}
        ix2pred = {i: ch for i, ch in enumerate(predicates)}

        print("Initialize dataset with {} unique predicates.".format(len(predicates)))

        label_words = []  # word level: football player
        label_relation = []  # relation level: football_player
        for pred in pred2ix.keys():
            pred_words = TextUtils.preprocess(pred.replace("_", " ").replace("/", " "))
            label_words.append(pred_words)

            pred_relation = TextUtils.preprocess(pred.replace("/", " "))
            label_relation.append(pred_relation)

        return pred2ix, ix2pred, label_words, label_relation
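A small hedged example of the two label views built above (assuming the method is exposed on DataCreator like create_dict; the predicate URIs are illustrative, and the exact tokens depend on TextUtils.preprocess):

preds = ["www.freebase.com/film/film/directed_by",
         "www.freebase.com/people/person/profession"]
pred2ix, ix2pred, label_words, label_relation = DataCreator.create_predicate_dict(preds)
# pred2ix        -> {"film/film/directed_by": 0, "people/person/profession": 1}
# label_words    -> [["film", "film", "directed", "by"], ["people", "person", "profession"]]
# label_relation -> [["film", "film", "directed_by"], ["people", "person", "profession"]]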
def preprocess_synthetic_questions(df, mid2entity, predicate_names):
    """ adds placeholders on the synthetic questions """

    add_questions = []
    additional_quest = []
    for i in range(len(df)):

        reference = [
            df["y_label_post_proc"][i].replace("_END_",
                                               "").replace("_START_",
                                                           "").split()
        ]
        candidate = df["y_post_proc"][i].replace("_END_",
                                                 "").replace("_START_",
                                                             "").split()

        new_cand = []
        if "_PLACEHOLDER_SUB_" not in candidate:
            tmp = mid2entity[df["sub"][i].replace("www.freebase.com/m/", "")]

            if len(tmp) != 0:
                annotations_padded, wrong_num_of_ent, wrong_num_of_ent_num = create_annotations(
                    [candidate], tmp, [])
                if 1 in annotations_padded[0]:
                    inds = [
                        index for index, x in enumerate(annotations_padded[0])
                        if x == 1
                    ]

                    candidate = [
                        "_PLACEHOLDER_SUB_" if index in inds else word
                        for index, word in enumerate(candidate)
                    ]

        for word in candidate:
            if word == "_PLACEHOLDER_SUB_" and "sbj" not in new_cand:
                new_cand.append("sbj")
            elif word != "_PLACEHOLDER_SUB_":
                new_cand.append(word)
        new_cand = TextUtils.preprocess(" ".join(new_cand))

        add_questions.extend(new_cand)
        additional_quest.append(new_cand)
    for pp in predicate_names:
        add_questions.extend(pp)

    return add_questions, additional_quest
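The net effect on one made-up row, assuming create_annotations marks with 1 the candidate tokens that match the subject alias:

# candidate before:      ["what", "is", "douglas", "adams", "job"]
# annotations_padded[0]: [0, 0, 1, 1, 0]              (alias "douglas adams" matched)
# after replacement:     ["what", "is", "_PLACEHOLDER_SUB_", "_PLACEHOLDER_SUB_", "job"]
# final new_cand:        ["what", "is", "sbj", "job"]  (only the first placeholder is kept, as "sbj")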
def create_data_placeholder(path, word2ix, pred2ix, name):
    """Takes as input the path for an annotated freebase data file, and returns
      subject mids, predicates, object mids, data(questions as word ids), targets(predicate ids), questions"""
    use_col = [0, 1, 2, 3, 4, 6, 7] if name == "test" else [0, 1, 2, 3, 4, 6]
    df_data = pd.read_csv(path, usecols=use_col)
    sbj_mid = df_data["subject"].to_list()
    obj_mid = df_data["object"].to_list()
    predicate = df_data["relation"].to_list()
    if name != "test":
        annotations = df_data["annotation"].apply(ast.literal_eval).to_list()
    else:
        annotations = df_data["prediction"].apply(ast.literal_eval).to_list()
    question = df_data["question_initial"].to_list()

    question_words = []
    for q in question:
        question_words.append([token for token in TextUtils.preprocess(q)])
    question = replace_plchdr(np.array(question_words), annotations, name)
    data = create_data_pl(question, word2ix)

    targets = DataCreator.create_targets(predicate, pred2ix)

    return sbj_mid, predicate, obj_mid, data, targets, question
def create_vocab_dictionaries(args,
                              placeholder=False,
                              pred_w=None,
                              pred_n=None,
                              annotations=None,
                              none_sbj=None,
                              separately=False,
                              keywords=None,
                              indices=[]):
    """
    :param args: args
    :param placeholder: placeholders exist or not
    :param pred_w: predicate words
    :param pred_n: predicate names
    :param annotations: annotations
    :param none_sbj: indices of training samples without a usable subject mid (these rows are removed)
    :param separately: use different embeddings for questions and predicates
    :param keywords: list of keyword tokens to add to the vocabulary (or None)
    :param indices: indices of target-domain training samples to remove
    :return: dictionaries word2ix, ix2word (and word2ix_predicates, ix2word_predicates when built separately, otherwise None)
    """

    sbj_mid, predicate, obj_mid, question = DataCreator.get_spo_question(
        args.path_load_sq, "annotated_fb_data_train.txt")
    # init
    word2ix = None
    ix2word = None
    word2ix_predicates = None
    ix2word_predicates = None

    if (pred_w is None) and (not placeholder):
        # for the multi-class models
        word2ix, ix2word = DataCreator.create_dict(questions=question,
                                                   eos=False,
                                                   cut_frq=False,
                                                   additional=keywords)

    elif (pred_w is not None) and (keywords is None) and (not placeholder):
        # for relatedness models without placeholders
        print("Pred_w: ", len(pred_w), " Placeholder: ", placeholder)
        tokens = [token for label_w in pred_w for token in label_w]
        word2ix, ix2word = DataCreator.create_dict(questions=question,
                                                   eos=False,
                                                   cut_frq=False,
                                                   additional=tokens)

    elif (pred_w is not None) and (keywords is not None) and (not placeholder):
        # for relatedness models without placeholders
        # with word level predicate labels and keywords
        print("Pred_w: ", len(pred_w), " Placeholder:", placeholder,
              " Keywords:", len(keywords))
        tokens = [token for label_w in pred_w for token in label_w] + keywords
        word2ix, ix2word = DataCreator.create_dict(questions=question,
                                                   eos=False,
                                                   cut_frq=False,
                                                   additional=tokens)

    elif (pred_w is not None) and (keywords is not None) and placeholder:
        # for relatedness models with placeholders
        # with word level predicate labels and keywords
        print("Pred_w: ", len(pred_w), " Placeholder:", placeholder,
              " Keywords:", len(keywords))
        question_words = []
        for q in question:
            question_words.append([token for token in TextUtils.preprocess(q)])
        question = replace_plchdr(
            np.delete(np.array(question_words), none_sbj, axis=0), annotations)
        # comment out the deletion below if there are no training indices to remove
        print("delete unseen domain questions")
        print(len(indices))
        question = list(np.delete(np.array(question), indices, axis=0))

        additional_predicate_txt = set([w for p_w in pred_w
                                        for w in p_w]).union(set(keywords))
        word2ix, ix2word = create_dict_pl(question, additional_predicate_txt)

    elif (pred_w is not None) and placeholder:
        # for relatedness models with placeholders instead of subject entities in question
        question_words = []
        for q in question:
            question_words.append([token for token in TextUtils.preprocess(q)])
        question = replace_plchdr(
            np.delete(np.array(question_words), none_sbj, axis=0), annotations)

        additional_predicate_txt = set(
            [w for p_w in pred_w for w in p_w]) if pred_n is None else set(
                [w for p_w in pred_w
                 for w in p_w]).union(set([n for p_n in pred_n for n in p_n]))

        if separately:
            # create different vocabs for question and predicate labels
            word2ix, ix2word = create_dict_pl_separately(
                [token for sentence in question for token in sentence])
            word2ix_predicates, ix2word_predicates = create_dict_pl_separately(
                additional_predicate_txt)
        else:
            # same vocab for question and predicate labels
            # comment out the deletion below if there are no indices to remove
            print("delete unseen domain questions")
            print(len(indices))
            question = list(np.delete(np.array(question), indices, axis=0))
            word2ix, ix2word = create_dict_pl(question,
                                              additional_predicate_txt)

    print("Dictionary size: ", len(word2ix))

    return word2ix, ix2word, word2ix_predicates, ix2word_predicates
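As a hedged sketch, the simplest configuration (no placeholders, no predicate words or keywords) exercises only the first branch and leaves the separate predicate vocabulary unset:

# hypothetical minimal call; args only needs path_load_sq here
word2ix, ix2word, w2i_pred, i2w_pred = create_vocab_dictionaries(args=args, placeholder=False)
# w2i_pred and i2w_pred stay None; they are filled only when separately=True is used
# together with placeholders and predicate words.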
def main():
    """
    path_load_sq: path of the original SimpleQuestions dataset
    path_load_md_data: path of the folder generated after running MD
    path_load_synthetic: path of the csv file generated by QG
    path_load_mid2ent: path of the folder containing mid2ent
    path_load_keywords: path of the folder containing pred2key (this file is created when extracting keywords)
    path_save: where to save the RP data
    target_domain: the target domain (e.g. book, film, astronomy, etc.)
    placeholders: use placeholders in questions instead of subject names; otherwise keep questions in their original format
    use_relation_words_only: use only words from the original questions; not to be combined with keywords or synthetic questions
    use_keywords: whether keywords w.r.t. the relations will be provided
    use_synthetic_questions: whether synthetic questions of the target domain will be provided
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--path_load_sq",
                        type=str,
                        default=None,
                        required=True)
    parser.add_argument("--path_load_md_data", type=str, default=None)
    parser.add_argument("--path_load_synthetic", type=str, default=None)
    parser.add_argument("--path_load_mid2ent",
                        type=str,
                        default=None,
                        required=True)
    parser.add_argument("--path_load_keywords", type=str, default=None)
    parser.add_argument("--path_save", type=str, default=None, required=True)
    parser.add_argument("--target_domain", type=str, default=None)
    parser.add_argument("--placeholders", action="store_true")
    parser.add_argument("--use_relation_words_only", action="store_true")
    parser.add_argument("--use_keywords", action="store_true")
    parser.add_argument("--use_synthetic_questions", action="store_true")
    args = parser.parse_args()

    # check if args are provided correctly
    check_args(args, parser)

    save_path = args.path_save

    # load mid to entity dictionary
    mid2entity = DataSaverLoader.load_pickle(path=args.path_load_mid2ent,
                                             filename="mid2ent")

    # load train annotations and the indices of samples without a subject mid
    # (needed for the placeholder case, i.e. placeholders in questions)
    df_train = pd.read_csv(args.path_load_md_data + "/train/" + "data.csv",
                           usecols=[4])
    train_annotations = df_train["annotation"].apply(
        ast.literal_eval).to_list()
    train_none_sbj = DataSaverLoader.load_pickle(path=args.path_load_md_data +
                                                 "/train/",
                                                 filename="none_sbj")

    # create predicate label to id, id to predicate dictionaries
    # and word level predicate labels and name level predicate labels list
    pred2ix, ix2pred, predicate_words, predicate_names = create_predicate_dictionaries(
        args.path_load_sq)

    if args.use_keywords:
        # load keywords so they can be included in the vocab
        predicate2keywords = DataSaverLoader.load_pickle(
            path=args.path_load_keywords, filename="pred2key")
        keywords_total = []
        for keywords in predicate2keywords.values():
            keywords_total.extend(keywords)

    # indices to delete, if there is no domain given it will remain empty
    indices = []
    if args.target_domain is not None:
        # find the training samples of the target domain which appear in the initial training set
        # those samples need to be removed for the domain adaptation scenario, otherwise we have
        # information leakage

        train = pd.read_csv(args.path_load_sq + "annotated_fb_data_train.txt",
                            sep="\t",
                            usecols=[1],
                            names=["relation"])

        labels_train = create_targets(train["relation"].to_list(), pred2ix)

        # find the relations types of which are part of the target domain
        rem = find_remove_domain_ids(args.target_domain, ix2pred)

        for indx, v in enumerate(labels_train):
            if v[0] in rem:
                indices.append(indx)
        indices_to_delete = np.zeros(len(labels_train))
        indices_to_delete[indices] = 1
        indices_to_delete = np.delete(indices_to_delete, train_none_sbj, 0)
        indices = np.where(indices_to_delete == 1)[0]

    if args.use_synthetic_questions and args.placeholders:
        # the text of the target domain synthetic questions should be part of the final vocabulary
        path_noisy_q = args.path_load_synthetic
        new_q = pd.read_csv(path_noisy_q)
        add_questions, additional_quest = preprocess_synthetic_questions(
            new_q, mid2entity, predicate_names)

    elif args.use_synthetic_questions and not args.placeholders:
        path_noisy_q = args.path_load_synthetic
        new_q = pd.read_csv(path_noisy_q)

        add_questions = []
        additional_quest = []
        for q in new_q["y_post_proc"]:
            q = TextUtils.preprocess(q)
            add_questions.extend(q)
            additional_quest.append(q)
        for pp in predicate_names:
            add_questions.extend(pp)

    # create vocabulary
    if args.use_keywords:
        print("Vocabulary setup: keywords")
        word2ix, ix2word, _, _ = create_vocab_dictionaries(
            args=args,
            placeholder=args.placeholders,
            pred_w=predicate_words,
            keywords=keywords_total,
            annotations=train_annotations,
            none_sbj=train_none_sbj,
            indices=indices)
    elif args.use_relation_words_only:
        print("Vocabulary setup: relation words only")
        word2ix, ix2word, _, _ = create_vocab_dictionaries(
            args=args,
            placeholder=args.placeholders,
            pred_w=predicate_words,
            annotations=train_annotations,
            none_sbj=train_none_sbj,
            indices=indices)
    elif args.use_synthetic_questions:
        print("Vocabulary setup: synthetic questions")
        word2ix, ix2word, _, _ = create_vocab_dictionaries(
            args=args,
            placeholder=args.placeholders,
            pred_w=predicate_words,
            keywords=add_questions,
            annotations=train_annotations,
            none_sbj=train_none_sbj,
            indices=indices)
    else:
        print("Vocabulary setup: default (no placeholders)")
        word2ix, ix2word, _, _ = create_vocab_dictionaries(
            args=args,
            placeholder=False,
            pred_w=predicate_words,
            indices=indices)

    for i in ["train", "valid", "test"]:
        print("----", i, "----")

        file_name = "data.csv" if i == "train" else "data_new.csv"
        path_tmp = args.path_load_md_data + i + "/" + file_name
        sbj_mid, predicate, obj_mid, data, targets, questions = create_data_placeholder(
            path_tmp, word2ix, pred2ix, i)

        df_out = pd.DataFrame({
            "subject": sbj_mid,
            "relation": predicate,
            "object": obj_mid,
            "data": data,
            "targets": targets,
            "question": questions
        })
        DataSaverLoader.save_csv(save_path + i + "/", "data.csv", df_out)

        print("Number of samples: ", len(data))

    DataSaverLoader.save_pickle(path=save_path,
                                name="word2ix",
                                python_object=word2ix)
    DataSaverLoader.save_pickle(path=save_path,
                                name="ix2word",
                                python_object=ix2word)

    DataSaverLoader.save_pickle(path=save_path,
                                name="pred2ix",
                                python_object=pred2ix)
    DataSaverLoader.save_pickle(path=save_path,
                                name="ix2pred",
                                python_object=ix2pred)
    DataSaverLoader.save_pickle(path=save_path,
                                name="pred_names",
                                python_object=predicate_names)
    DataSaverLoader.save_pickle(path=save_path,
                                name="pred_words",
                                python_object=predicate_words)

    if args.use_synthetic_questions:
        y_noisy = []
        x_noisy = []
        for noisy_predicate in new_q["pred"]:
            y_noisy.append(pred2ix[noisy_predicate.replace(
                "www.freebase.com/", "")])

        for noisy_q in additional_quest:
            x_noisy.append([word2ix[w] for w in noisy_q])
        DataSaverLoader.save_pickle(path=save_path + "train/",
                                    name="y_noisy",
                                    python_object=y_noisy)
        DataSaverLoader.save_pickle(path=save_path + "train/",
                                    name="x_noisy",
                                    python_object=x_noisy)
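Finally, a hypothetical invocation of the script (the filename and all paths are placeholders; which flags may be combined is enforced by check_args):

# python prepare_rp_data.py \
#     --path_load_sq /data/SimpleQuestions_v2/ \
#     --path_load_md_data /output/md/ \
#     --path_load_mid2ent /data/ \
#     --path_load_keywords /output/keywords/ \
#     --path_save /output/rp/ \
#     --target_domain book \
#     --placeholders --use_keywords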