Example #1
def load_mr():
    path = io.get_root_path() + "/mr_dataset/mr.txt"
    text_file = io.get_root_path() + "/mr_dataset/mr/text_all.txt"
    files = io.read_txt(path)
    labels = []
    sents = []

    # The with statement closes the file on exit; an explicit close is unnecessary.
    with open(text_file, "r", encoding="iso-8859-1") as f:
        all_sents = f.readlines()

    for index, x in enumerate(files):
        cats = x.split("\t")

        # Flatten newlines and tabs in the raw line, then strip accent marks.
        sentences = all_sents[index].replace("\n", " ").replace("\t", " ")
        sentences = strip_accents(sentences)
        labels.append("\t".join([str(index), cats[1], cats[2]]))
        sents.append(sentences)

    file.save_labels(labels, "mr")
    file.save_sentences(sents, "mr")
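
load_mr relies on a strip_accents helper that is not shown in this example. A minimal sketch of such a helper, assuming the common unicodedata-based approach (not necessarily the project's actual implementation):

import unicodedata

def strip_accents(text):
    # Decompose accented characters (NFD) and drop the combining marks,
    # e.g. "café" -> "cafe".
    return "".join(
        ch for ch in unicodedata.normalize("NFD", text)
        if unicodedata.category(ch) != "Mn"
    )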
Example #2
def load_ohsumed():
    path = io.get_root_path() + "/data/ohsumed.txt"
    files = io.read_txt(path)
    labels = []
    sents = []
    for index, x in enumerate(files):
        cats = x.split("\t")
        # Normalize the split name ("training" -> "train").
        if "training" in cats[1]:
            cats[1] = "train"

        # The with statement closes the file automatically, so no explicit
        # close is needed.
        with open(cats[0], "rt") as f:
            sentence = f.readlines()
            # Collapse newlines and tabs, then join all lines into one string.
            sentence = [sen.replace("\n", " ") for sen in sentence]
            sentence = [sen.replace("\t", " ") for sen in sentence]
            sentences = " ".join(sentence)

        labels.append("\t".join([str(index), cats[1], cats[2]]))
        sents.append(sentences)

    file.save_labels(labels, "ohsumed")
    file.save_sentences(sents, "ohsumed")
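
Every example here reads datasets through io.read_txt, whose implementation is not shown. A plausible minimal version (an assumption, not the project's actual code) returns the file's lines with trailing newlines stripped:

def read_txt(path):
    # Read a text file and return its lines without trailing newlines.
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]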
Example #3
def get_nouns_vocab(dataset=FLAGS.dataset):
    path = io.get_nouns_vocab(dataset)
    exist(path, "Dataset has not been tokenized yet, run `prep_data.py`")
    return io.read_txt(path)
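
get_nouns_vocab and the getters that follow guard their paths with an exist(path, message) helper that is not shown here. A hypothetical sketch of such a guard:

import os

def exist(path, message):
    # Fail fast with a descriptive error if a required file is missing.
    if not os.path.exists(path):
        raise FileNotFoundError(path + ": " + message)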
Example #4
def get_vocab(dataset=FLAGS.dataset):
    path = io.get_vocab_path(dataset)
    exist(path, "Your dataset must include a `_vocab.txt` file")
    return io.read_txt(path)
Example #5
def get_labels(dataset=FLAGS.dataset):
    path = io.get_labels_path(dataset)
    exist(path, "Your dataset must include a `_labels.txt` file")
    return io.read_txt(path)
Example #6
def get_cleaned_sentences(dataset=FLAGS.dataset):
    path = io.get_clean_sentences_path(dataset)
    exist(path, "Your dataset must include a `_cleaned_sentences.txt` file")
    return io.read_txt(path)
Example #7
def get_doc2relations(dataset=FLAGS.dataset):
    return io.read_txt(io.get_doc2relations_path(dataset))
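
Taken together, the loaders in Examples #1 and #2 write the label and sentence files that the getters in Examples #3 through #7 read back. A hypothetical usage, assuming the cleaning and tokenization in prep_data.py has already been run for the "mr" dataset:

vocab = get_vocab("mr")                  # contents of mr's _vocab.txt
labels = get_labels("mr")                # "index\tsplit\tcategory" lines
sentences = get_cleaned_sentences("mr")  # one cleaned document per line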