Example #1
import argparse
import os
import re
import xml.etree.ElementTree

import common  # project-local module with shared helpers (not shown here)


def convert():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--source-folder",
                        type=str,
                        default="../data/sources/tiger")
    parser.add_argument("-t",
                        "--target-folder",
                        type=str,
                        default="../data/ready/pos/tiger")
    args = parser.parse_args()

    print("Source folder: {}".format(args.source_folder))
    print("Target folder: {}".format(args.target_folder))
    print()

    sentence_pairs = []

    for file in os.listdir(args.source_folder):
        if re.match(r".*\.xml$", file):
            print("processing data from {}".format(file))

            file_path = os.path.join(args.source_folder, file)
            root = xml.etree.ElementTree.parse(file_path).getroot()
            sentence_tags = root.find("body").findall("s")

            for sentence_tag in sentence_tags:
                word_tags = sentence_tag.find("graph").find("terminals")
                word_pairs = [[w.attrib["word"], w.attrib["pos"]]
                              for w in word_tags]
                sentence_pairs.append(word_pairs)

            # only the first matching XML file is processed; presumably the
            # TIGER corpus ships as a single XML file
            break

    if not os.path.exists(args.target_folder):
        os.makedirs(args.target_folder)

    label_count_pairs = common.get_label_count_pairs(sentence_pairs)
    common.report_statistics(sentence_pairs, label_count_pairs)

    # fixed split bounds: 40,472 train / 5,000 val / remainder test
    # (presumably also 5,000, out of 50,472 TIGER sentences in total)
    train_bound, val_bound = 40472, 40472 + 5000
    split = sentence_pairs[:train_bound], \
        sentence_pairs[train_bound:val_bound], \
        sentence_pairs[val_bound:]

    for target, dataset in zip(["train", "val", "test"], split):
        sentences_written, tokens_written = 0, 0
        out_path = os.path.join(args.target_folder, target + ".txt")

        with open(out_path, "w+", encoding="utf-8") as out:
            for sentence in dataset:
                out.write("{}\t{}\n".format(
                    " ".join([p[0] for p in sentence]),
                    " ".join([p[1] for p in sentence]),
                ))
                tokens_written += len(sentence)
            sentences_written = len(dataset)

        print("{:,} sentences ({:,} tokens) written to {}".format(
            sentences_written, tokens_written, out_path))

    label_path = os.path.join(args.target_folder, "labels.txt")
    with open(label_path, "w+", encoding="utf-8") as out:
        for lb in label_count_pairs:
            out.write("{}\n".format(lb[0]))

    print("{} labels written to {}".format(len(label_count_pairs), label_path))
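All of these converters lean on a project-local common module that this page does not include. A minimal sketch of what common.get_label_count_pairs presumably does, assuming it accepts either a flat sentence list (as here) or a dict of sentence lists keyed by file or folder (as in the examples below), and that the dummy argument names a label to list even when absent:

from collections import Counter


def get_label_count_pairs(sentence_pairs, dummy=None):
    # each sentence is a list of [word, tag] pairs
    if isinstance(sentence_pairs, dict):
        sentences = [s for group in sentence_pairs.values() for s in group]
    else:
        sentences = sentence_pairs

    counts = Counter(tag for sentence in sentences for _, tag in sentence)
    if dummy is not None:
        counts.setdefault(dummy, 0)  # hypothetical role of the dummy label
    # most frequent labels first, ties broken alphabetically
    return sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))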
Example #2
import argparse
import os

import common

# enumerate_files, get_joint_pair and split_by_task are helpers defined
# elsewhere in this module (not shown here)


def convert():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--source-folder",
                        type=str,
                        default="../data/sources/conll2012")
    parser.add_argument("-tp",
                        "--target-folder-pos",
                        type=str,
                        default="../data/ready/pos/conll2012")
    parser.add_argument("-tn",
                        "--target-folder-nerc",
                        type=str,
                        default="../data/ready/nerc/conll2012")
    parser.add_argument("-tpr",
                        "--target-folder-pred",
                        type=str,
                        default="../data/ready/pred/conll2012")
    # argparse's type=bool would treat any non-empty string (even "False")
    # as True, so parse the flag text explicitly
    parser.add_argument("-i",
                        "--iobes",
                        type=lambda s: s.lower() not in ("false", "0", "no"),
                        default=True)
    args = parser.parse_args()

    print("Source folder: {}".format(args.source_folder))
    print("Target folder for POS task: {}".format(args.target_folder_pos))
    print("Target folder for NERC task: {}".format(args.target_folder_nerc))
    print("Target folder for PRED task: {}".format(args.target_folder_pred))
    print("Convert to IOBES: {}".format(args.iobes))
    print()

    args.target_folders = {
        "POS": args.target_folder_pos,
        "NERC": args.target_folder_nerc,
        "PRED": args.target_folder_pred
    }

    sentence_pairs_per_task_and_folder = {
        t: {}
        for t in args.target_folders.keys()
    }

    for folder in ["train", "development", "test"]:
        folder_path = os.path.join(args.source_folder, folder)
        for task in sentence_pairs_per_task_and_folder.keys():
            sentence_pairs_per_task_and_folder[task][folder] = []

        print("processing data from {} folder".format(folder_path))

        for path in enumerate_files(folder_path, r"\.gold_conll$"):
            file_pairs = {t: [] for t in args.target_folders.keys()}
            with open(path, encoding="utf-8") as f:
                running_joint_pairs = []
                for line in [l[:-1] for l in f.readlines()]:
                    if line == "" or line.startswith("#"):
                        if len(running_joint_pairs) > 0:
                            for task, pairs in split_by_task(
                                    running_joint_pairs):
                                # excluding sentences with rare labels from POS data
                                if task == "POS" and any(p[1] in ["*", "AFX"]
                                                         for p in pairs):
                                    continue
                                file_pairs[task].append(
                                    common.convert_to_iobes_tags(pairs) if
                                    task == "NERC" and args.iobes else pairs)
                            running_joint_pairs = []
                        continue
                    running_joint_pairs.append(get_joint_pair(line))

            # excluding files with only "XX" or "VERB" POS labels from POS data
            # (if everything is normal with the dataset, there shouldn't be any)
            if any(
                    any(p[1] not in ["XX", "VERB"] for p in s)
                    for s in file_pairs["POS"]):
                sentence_pairs_per_task_and_folder["POS"][folder].extend(
                    file_pairs["POS"])
            # excluding files without named entity labelling from NERC data
            # (this should correspond to "New Testament" files: /pt/nt/*)
            if any(any(p[1] != "O" for p in s) for s in file_pairs["NERC"]):
                sentence_pairs_per_task_and_folder["NERC"][folder].extend(
                    file_pairs["NERC"])
            # excluding files without predicate labelling from PRED data
            # (if everything is normal with the dataset, there shouldn't be any)
            if any(any(p[1] == "V" for p in s) for s in file_pairs["PRED"]):
                sentence_pairs_per_task_and_folder["PRED"][folder].extend(
                    file_pairs["PRED"])

    for task in ["POS", "NERC", "PRED"]:
        print("\n--------------------------------------\n")
        print("Data for {} task:".format(task))

        target_folder = args.target_folders[task]
        if not os.path.exists(target_folder):
            os.makedirs(target_folder)

        dummy = "-" if task == "PRED" else None
        label_count_pairs = common.get_label_count_pairs(
            sentence_pairs_per_task_and_folder[task], dummy)
        common.report_statistics(sentence_pairs_per_task_and_folder[task],
                                 label_count_pairs)

        for target, source in [["train", "train"], ["val", "development"],
                               ["test", "test"]]:
            sentences_written, tokens_written = 0, 0
            out_path = os.path.join(target_folder, target + ".txt")

            with open(out_path, "w+", encoding="utf-8") as out:
                for sentence in sentence_pairs_per_task_and_folder[task][
                        source]:
                    out.write("{}\t{}\n".format(
                        " ".join([p[0] for p in sentence]),
                        " ".join([p[1] for p in sentence]),
                    ))
                    tokens_written += len(sentence)
                sentences_written += len(
                    sentence_pairs_per_task_and_folder[task][source])

            print(
                "data from {} folder ({:,} sentences, {:,} tokens) written to {}"
                .format(source, sentences_written, tokens_written, out_path))

        label_path = os.path.join(target_folder, "labels.txt")
        with open(label_path, "w+", encoding="utf-8") as out:
            for lb in label_count_pairs:
                out.write("{}\n".format(lb[0]))

        print("{} labels written to {}".format(len(label_count_pairs),
                                               label_path))
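A plausible implementation of the enumerate_files helper used above (the real one is defined elsewhere in the project and not shown here): walk the folder recursively and yield every file whose name matches the given regex.

import os
import re


def enumerate_files(root, pattern):
    # deterministic traversal order keeps the output files reproducible
    for dirpath, _, filenames in sorted(os.walk(root)):
        for name in sorted(filenames):
            if re.search(pattern, name):
                yield os.path.join(dirpath, name)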
Example #3
import argparse
import csv
import os

import common


def convert():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--source-folder",
                        type=str,
                        default="../data/sources/kaggle")
    parser.add_argument("-t",
                        "--target-folder",
                        type=str,
                        default="../data/ready/nerc/kaggle")
    # parse the flag text explicitly (type=bool treats any non-empty string,
    # including "False", as True)
    parser.add_argument("-i",
                        "--iobes",
                        type=lambda s: s.lower() not in ("false", "0", "no"),
                        default=True)
    args = parser.parse_args()

    print("Source folder: {}".format(args.source_folder))
    print("Target folder: {}".format(args.target_folder))
    print("Convert to IOBES: {}".format(args.iobes))
    print()

    sentence_pairs = []

    file_path = os.path.join(args.source_folder, "ner_dataset.csv")
    with open(file_path, encoding="iso-8859-1") as f:
        file_lines = [l[:-1] for l in f.readlines()]

    print("processing data from {}".format(file_path))

    running_pairs = []
    for tokens in csv.reader(file_lines[1:]):
        if tokens[0].startswith("Sentence:") and len(running_pairs) > 0:
            sentence_pairs.append(
                common.convert_to_iobes_tags(running_pairs)
                if args.iobes else running_pairs)
            running_pairs = []
        running_pairs.append(tokens[1::2])
    if len(running_pairs) > 0:
        # apply the IOBES conversion to the trailing sentence as well
        sentence_pairs.append(
            common.convert_to_iobes_tags(running_pairs)
            if args.iobes else running_pairs)

    if not os.path.exists(args.target_folder):
        os.makedirs(args.target_folder)

    label_count_pairs = common.get_label_count_pairs(sentence_pairs)
    common.report_statistics(sentence_pairs, label_count_pairs)

    splits = common.shuffle_and_split(sentence_pairs, split_points=(0.8, 0.9))
    for target, dataset in zip(["train", "val", "test"], splits):
        sentences_written, tokens_written = 0, 0
        out_path = os.path.join(args.target_folder, target + ".txt")

        with open(out_path, "w+", encoding="utf-8") as out:
            for sentence in dataset:
                out.write("{}\t{}\n".format(
                    " ".join([p[0] for p in sentence]),
                    " ".join([p[1] for p in sentence]),
                ))
                tokens_written += len(sentence)
            sentences_written = len(dataset)

        print("{:,} sentences ({:,} tokens) written to {}".format(
            sentences_written, tokens_written, out_path))

    label_path = os.path.join(args.target_folder, "labels.txt")
    with open(label_path, "w+", encoding="utf-8") as out:
        for lb in label_count_pairs:
            out.write("{}\n".format(lb[0]))

    print("{} labels written to {}".format(len(label_count_pairs), label_path))
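The 80/10/10 split above comes from common.shuffle_and_split, which is not shown on this page. A minimal sketch of its likely behavior, assuming split_points are cumulative fractions and that a fixed seed keeps the split reproducible between runs:

import random


def shuffle_and_split(sentences, split_points):
    # e.g. split_points=(0.8, 0.9) -> 80%/10%/10% train/val/test pieces
    shuffled = list(sentences)
    random.Random(42).shuffle(shuffled)  # the seed value is an assumption
    bounds = [int(p * len(shuffled)) for p in split_points]
    pieces, start = [], 0
    for end in bounds + [len(shuffled)]:
        pieces.append(shuffled[start:end])
        start = end
    return pieces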
Example #4
import argparse
import os

import common

# fix_b_tag is a helper defined elsewhere in this module (not shown here)


def convert():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--source-folder",
                        type=str,
                        default="../data/sources/conll2003")
    parser.add_argument("-te",
                        "--target-folder-eng",
                        type=str,
                        default="../data/ready/nerc/conll2003_eng")
    parser.add_argument("-tn",
                        "--target-folder-deu",
                        type=str,
                        default="../data/ready/nerc/conll2003_deu")
    # parse the flag text explicitly (type=bool treats any non-empty string,
    # including "False", as True)
    parser.add_argument("-i",
                        "--iobes",
                        type=lambda s: s.lower() not in ("false", "0", "no"),
                        default=True)
    args = parser.parse_args()

    print("Source folder: {}".format(args.source_folder))
    print("Target folder (English): {}".format(args.target_folder_eng))
    print("Target folder (German): {}".format(args.target_folder_deu))
    print("Convert to IOBES: {}".format(args.iobes))
    print()

    args.target_folders = {
        "eng": args.target_folder_eng,
        "deu": args.target_folder_deu
    }

    for language in ["eng", "deu"]:
        sentence_pairs_per_file = {}

        for file in os.listdir(args.source_folder):
            if file.startswith(language):
                sentence_pairs_per_file[file] = []
                file_path = os.path.join(args.source_folder, file)
                with open(file_path, encoding="iso-8859-1") as f:
                    file_lines = [l[:-1] for l in f.readlines()]

                print("processing data from {}".format(file_path))

                running_pairs = []
                for line in file_lines:
                    if line == "" or line.startswith("-DOCSTART-"):
                        if len(running_pairs) > 0:
                            sentence_pairs_per_file[file].append(
                                common.convert_to_iobes_tags(running_pairs)
                                if args.iobes else running_pairs)
                            running_pairs = []
                        continue
                    tokens = line.split(" ")
                    pair = [tokens[0], tokens[-1]]
                    running_pairs.append(fix_b_tag(pair, running_pairs))

        if len(sentence_pairs_per_file) == 0:
            print("{} files not found\n".format(language))
            continue

        if not os.path.exists(args.target_folders[language]):
            os.makedirs(args.target_folders[language])

        label_count_pairs = common.get_label_count_pairs(
            sentence_pairs_per_file)
        common.report_statistics(sentence_pairs_per_file, label_count_pairs)

        for target, source in [["train", "{}.train".format(language)],
                               ["val", "{}.testa".format(language)],
                               ["test", "{}.testb".format(language)]]:
            sentences_written, tokens_written = 0, 0
            out_path = os.path.join(args.target_folders[language],
                                    target + ".txt")

            with open(out_path, "w+", encoding="utf-8") as out:
                for sentence in sentence_pairs_per_file[source]:
                    out.write("{}\t{}\n".format(
                        " ".join([p[0] for p in sentence]),
                        " ".join([p[1] for p in sentence]),
                    ))
                    tokens_written += len(sentence)
                sentences_written += len(sentence_pairs_per_file[source])

            print("data from {} ({:,} sentences, {:,} tokens) written to {}".
                  format(source, sentences_written, tokens_written, out_path))

        label_path = os.path.join(args.target_folders[language], "labels.txt")
        with open(label_path, "w+", encoding="utf-8") as out:
            for lb in label_count_pairs:
                out.write("{}\n".format(lb[0]))

        print("{} labels written to {}".format(len(label_count_pairs),
                                               label_path))
        print()
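fix_b_tag is another project-local helper that is not shown here. CoNLL-2003 is distributed with IOB1-style tags, where an entity may begin with I- rather than B-, so a reasonable guess (and only a guess) is that the helper rewrites entity-initial I- tags to B- before the IOBES conversion:

def fix_b_tag(pair, running_pairs):
    # hypothetical IOB1 -> IOB2 repair: a token starts a new entity when the
    # previous tag is O or names a different entity type
    word, tag = pair
    if tag.startswith("I-"):
        prev = running_pairs[-1][1] if running_pairs else "O"
        if prev == "O" or prev[2:] != tag[2:]:
            tag = "B-" + tag[2:]
    return [word, tag]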
Example #5
import argparse
import os

import common


def convert():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--source-folder", type=str, default="../data/sources/germeval")
    parser.add_argument("-t", "--target-folder", type=str, default="../data/ready/nerc/germeval")
    # type=bool would treat any non-empty string as True; parse explicitly
    parser.add_argument("-i", "--iobes", type=lambda s: s.lower() not in ("false", "0", "no"), default=True)
    args = parser.parse_args()

    print("Source folder: {}".format(args.source_folder))
    print("Target folder: {}".format(args.target_folder))
    print("Convert to IOBES: {}".format(args.iobes))
    print()

    sentence_pairs_per_file = {}

    for file in os.listdir(args.source_folder):
        sentence_pairs_per_file[file] = []
        file_path = os.path.join(args.source_folder, file)
        with open(file_path, encoding="utf-8") as f:
            file_lines = [l[:-1] for l in f.readlines()]

        print("processing data from {}".format(file_path))

        running_pairs = []
        for line in file_lines + [""]:
            if line == "" or line.startswith("#\t"):
                if len(running_pairs) > 0:
                    sentence_pairs_per_file[file].append(
                        common.convert_to_iobes_tags(running_pairs)
                        if args.iobes else running_pairs
                    )
                    running_pairs = []
                continue
            pair = line.split("\t")[1:3]
            running_pairs.append(pair)

    if not os.path.exists(args.target_folder):
        os.makedirs(args.target_folder)

    label_count_pairs = common.get_label_count_pairs(sentence_pairs_per_file)
    common.report_statistics(sentence_pairs_per_file, label_count_pairs)

    for target, source in [
        ["train", "NER-de-train.tsv"],
        ["val", "NER-de-dev.tsv"],
        ["test", "NER-de-test.tsv"]
    ]:
        sentences_written, tokens_written = 0, 0
        out_path = os.path.join(args.target_folder, target + ".txt")

        with open(out_path, "w+", encoding="utf-8") as out:
            for sentence in sentence_pairs_per_file[source]:
                out.write("{}\t{}\n".format(
                    " ".join([p[0] for p in sentence]),
                    " ".join([p[1] for p in sentence]),
                ))
                tokens_written += len(sentence)
            sentences_written += len(sentence_pairs_per_file[source])

        print("data from {} ({:,} sentences, {:,} tokens) written to {}".format(
            source, sentences_written, tokens_written, out_path
        ))

    label_path = os.path.join(args.target_folder, "labels.txt")
    with open(label_path, "w+", encoding="utf-8") as out:
        for lb in label_count_pairs:
            out.write("{}\n".format(lb[0]))

    print("{} labels written to {}".format(
        len(label_count_pairs), label_path
    ))
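Most of the NERC converters on this page call common.convert_to_iobes_tags. Assuming well-formed IOB2 input, the standard IOB2-to-IOBES mapping it presumably implements marks single-token entities with S- and entity-final tokens with E-:

def convert_to_iobes_tags(pairs):
    converted = []
    for i, (word, tag) in enumerate(pairs):
        next_tag = pairs[i + 1][1] if i + 1 < len(pairs) else "O"
        # the entity ends unless the next tag continues the same type
        ends = not (next_tag.startswith("I-") and next_tag[2:] == tag[2:])
        if tag.startswith("B-") and ends:
            tag = "S-" + tag[2:]
        elif tag.startswith("I-") and ends:
            tag = "E-" + tag[2:]
        converted.append([word, tag])
    return converted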
Example #6
import argparse
import os
import re

import common

# tree_lines_to_pairs is a helper defined elsewhere in this module
# (not shown here)


def convert():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--source-folder",
                        type=str,
                        default="../data/sources/wsj")
    parser.add_argument("-t",
                        "--target-folder",
                        type=str,
                        default="../data/ready/pos/wsj")
    args = parser.parse_args()

    print("Source folder: {}".format(args.source_folder))
    print("Target folder: {}".format(args.target_folder))
    print()

    sentences_pairs_per_section = {}

    for folder in [
            f for f in sorted(os.listdir(args.source_folder))
            if re.match(r"\d{2}", f)
    ]:
        section = int(folder)
        sentences_pairs_per_section[section] = []
        folder_path = os.path.join(args.source_folder, folder)

        print("processing section {} from {} folder".format(
            section, folder_path))

        for file in [
                f for f in sorted(os.listdir(folder_path))
                if re.match(r"wsj_\d{4}\.mrg", f)
        ]:
            file_path = os.path.join(args.source_folder, folder, file)
            with open(file_path) as f:
                file_lines = [l[:-1] for l in f.readlines()]

            running_tree_lines = []
            for line in file_lines:
                if not line.startswith(" ") and len(running_tree_lines) > 0:
                    sentences_pairs_per_section[section].append(
                        tree_lines_to_pairs(running_tree_lines))
                    running_tree_lines = []
                if line != "":
                    running_tree_lines.append(line)
            if len(running_tree_lines) > 0:
                sentences_pairs_per_section[section].append(
                    tree_lines_to_pairs(running_tree_lines))

    if not os.path.exists(args.target_folder):
        os.makedirs(args.target_folder)

    label_count_pairs = common.get_label_count_pairs(
        sentences_pairs_per_section)
    common.report_statistics(sentences_pairs_per_section, label_count_pairs)

    for target, from_, to in [["train", 0, 18], ["val", 19, 21],
                              ["test", 22, 24]]:
        sentences_written, tokens_written = 0, 0
        out_path = os.path.join(args.target_folder, target + ".txt")

        with open(out_path, "w+", encoding="utf-8") as out:
            for section in range(from_, to + 1):
                for sentence in sentences_pairs_per_section[section]:
                    out.write("{}\t{}\n".format(
                        " ".join([p[0] for p in sentence]),
                        " ".join([p[1] for p in sentence]),
                    ))
                    tokens_written += len(sentence)
                sentences_written += len(sentences_pairs_per_section[section])

        print("sections {}-{} ({:,} sentences, {:,} tokens) written to {}".
              format(from_, to, sentences_written, tokens_written, out_path))

    label_path = os.path.join(args.target_folder, "labels.txt")
    with open(label_path, "w+", encoding="utf-8") as out:
        for lb in label_count_pairs:
            out.write("{}\n".format(lb[0]))

    print("{} labels written to {}".format(len(label_count_pairs), label_path))
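The WSJ converter above relies on tree_lines_to_pairs, defined elsewhere in the project. A minimal sketch under the assumption that it flattens a bracketed .mrg parse into its (word, tag) leaves and drops -NONE- trace tokens, which have no surface form:

import re


def tree_lines_to_pairs(tree_lines):
    # leaves of a Penn Treebank parse look like "(DT The)"
    tree = " ".join(tree_lines)
    pairs = []
    for tag, word in re.findall(r"\(([^()\s]+)\s+([^()\s]+)\)", tree):
        if tag != "-NONE-":
            pairs.append([word, tag])
    return pairs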
Example #7
import argparse
import os

import common


def convert():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--source-folder", type=str, default="../data/sources/conll2000")
    parser.add_argument("-t", "--target-folder", type=str, default="../data/ready/chunk/conll2000")
    # type=bool would treat any non-empty string as True; parse explicitly
    parser.add_argument("-i", "--iobes", type=lambda s: s.lower() not in ("false", "0", "no"), default=True)
    args = parser.parse_args()

    print("Source folder: {}".format(args.source_folder))
    print("Target folder: {}".format(args.target_folder))
    print("Convert to IOBES: {}".format(args.iobes))
    print()

    sentence_pairs_per_file = {}

    for file in os.listdir(args.source_folder):
        sentence_pairs_per_file[file] = []
        file_path = os.path.join(args.source_folder, file)
        with open(file_path) as f:
            file_lines = [l[:-1] for l in f.readlines()]

        print("processing data from {}".format(file_path))

        running_pairs = []
        for line in file_lines:
            if line == "":
                if len(running_pairs) > 0:
                    sentence_pairs_per_file[file].append(
                        common.convert_to_iobes_tags(running_pairs)
                        if args.iobes else running_pairs
                    )
                    running_pairs = []
                continue
            pair = line.split(" ")[0::2]
            running_pairs.append(pair)

    if not os.path.exists(args.target_folder):
        os.makedirs(args.target_folder)

    label_count_pairs = common.get_label_count_pairs(sentence_pairs_per_file)
    common.report_statistics(sentence_pairs_per_file, label_count_pairs)

    train, val = common.shuffle_and_split(sentence_pairs_per_file["train.txt"], split_points=(0.9,))
    test = sentence_pairs_per_file["test.txt"]

    for target, dataset in zip(["train", "val", "test"], [train, val, test]):
        sentences_written, tokens_written = 0, 0
        out_path = os.path.join(args.target_folder, target + ".txt")

        with open(out_path, "w+", encoding="utf-8") as out:
            for sentence in dataset:
                out.write("{}\t{}\n".format(
                    " ".join([p[0] for p in sentence]),
                    " ".join([p[1] for p in sentence]),
                ))
                tokens_written += len(sentence)
            sentences_written = len(dataset)

        print("{:,} sentences ({:,} tokens) written to {}".format(
            sentences_written, tokens_written, out_path
        ))

    label_path = os.path.join(args.target_folder, "labels.txt")
    with open(label_path, "w+", encoding="utf-8") as out:
        for lb in label_count_pairs:
            out.write("{}\n".format(lb[0]))

    print("{} labels written to {}".format(
        len(label_count_pairs), label_path
    ))
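Every converter finishes its reporting through common.report_statistics. Its real output format is not shown anywhere on this page; the sketch below only illustrates the kind of summary it plausibly prints, reusing the flat-list-or-dict convention assumed for get_label_count_pairs above:

def report_statistics(sentence_pairs, label_count_pairs):
    if isinstance(sentence_pairs, dict):
        sentences = [s for group in sentence_pairs.values() for s in group]
    else:
        sentences = sentence_pairs

    token_count = sum(len(s) for s in sentences)
    print("{:,} sentences, {:,} tokens, {} distinct labels".format(
        len(sentences), token_count, len(label_count_pairs)))
    for label, count in label_count_pairs:
        print("{:>12}  {:>10,}  {:6.2%}".format(
            label, count, count / token_count))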