def load_mr():
    """Load the MR dataset and persist its labels and cleaned sentences.

    Reads the per-document metadata from ``mr_dataset/mr.txt`` (tab-separated:
    id, split, category) and the raw text from ``mr_dataset/mr/text_all.txt``,
    flattens each document onto one line, strips accents, and writes the
    results via ``file.save_labels`` / ``file.save_sentences``.

    Side effects: writes the "mr" labels and sentences files; no return value.
    """
    path = io.get_root_path() + "/mr_dataset/mr.txt"
    text_file = io.get_root_path() + "/mr_dataset/mr/text_all.txt"
    files = io.read_txt(path)
    labels = []
    sents = []
    # The corpus is Latin-1 encoded; the `with` block closes the file, so no
    # explicit close() is needed (the original called f.close() redundantly).
    with open(text_file, "r", encoding="iso-8859-1") as f:
        all_sents = f.readlines()
    for index, x in enumerate(files):
        cats = x.split("\t")
        # Flatten newlines/tabs so each document stays on a single line.
        sentences = all_sents[index].replace("\n", " ").replace("\t", " ")
        sentences = strip_accents(sentences)
        # cats[1] is presumably the train/test split, cats[2] the label —
        # TODO(review): confirm against mr.txt's column layout.
        labels.append("\t".join([str(index), cats[1], cats[2]]))
        sents.append(sentences)
    file.save_labels(labels, "mr")
    file.save_sentences(sents, "mr")
def load_ohsumed():
    """Load the Ohsumed dataset and persist its labels and sentences.

    Reads ``data/ohsumed.txt`` (tab-separated: file path, split, category),
    opens each referenced document, flattens it onto one line, and writes the
    results via ``file.save_labels`` / ``file.save_sentences``.

    Side effects: writes the "ohsumed" labels and sentences files; no return
    value.
    """
    path = io.get_root_path() + "/data/ohsumed.txt"
    files = io.read_txt(path)
    labels = []
    sents = []
    for index, x in enumerate(files):
        cats = x.split("\t")
        # Normalize the split name used downstream ("training" -> "train").
        if "training" in cats[1]:
            cats[1] = "train"
        # `with` closes the file; the original's explicit f.close() was
        # redundant.
        with open(cats[0], "rt") as f:
            lines = f.readlines()
        # One pass replaces the original's two back-to-back comprehensions;
        # flatten newlines/tabs and join the document into a single line.
        sentences = " ".join(
            sen.replace("\n", " ").replace("\t", " ") for sen in lines
        )
        labels.append("\t".join([str(index), cats[1], cats[2]]))
        sents.append(sentences)
    file.save_labels(labels, "ohsumed")
    file.save_sentences(sents, "ohsumed")
def get_nouns_vocab(dataset=None):
    """Return the nouns-vocabulary lines for *dataset*.

    Args:
        dataset: Dataset name; defaults to ``FLAGS.dataset`` resolved at call
            time. (The original default ``dataset=FLAGS.dataset`` was
            evaluated once at import time, which freezes a stale value and
            can raise if flags are parsed after this module is imported.)

    Returns:
        The lines of the nouns vocab file, as returned by ``io.read_txt``.

    Raises:
        Whatever ``exist`` raises when the file is missing.
    """
    if dataset is None:
        dataset = FLAGS.dataset
    path = io.get_nouns_vocab(dataset)
    exist(path, "Dataset has not been tokenized yet, run `prep_data.py`")
    return io.read_txt(path)
def get_vocab(dataset=None):
    """Return the vocabulary lines for *dataset*.

    Args:
        dataset: Dataset name; defaults to ``FLAGS.dataset`` resolved at call
            time. (The original default ``dataset=FLAGS.dataset`` was
            evaluated once at import time — a known pitfall that freezes a
            stale value and can raise if flags are parsed after import.)

    Returns:
        The lines of the ``_vocab.txt`` file, as returned by ``io.read_txt``.

    Raises:
        Whatever ``exist`` raises when the file is missing.
    """
    if dataset is None:
        dataset = FLAGS.dataset
    path = io.get_vocab_path(dataset)
    exist(path, "Your dataset must include a `_vocab.txt` file")
    return io.read_txt(path)
def get_labels(dataset=None):
    """Return the label lines for *dataset*.

    Args:
        dataset: Dataset name; defaults to ``FLAGS.dataset`` resolved at call
            time. (The original default ``dataset=FLAGS.dataset`` was
            evaluated once at import time — a known pitfall that freezes a
            stale value and can raise if flags are parsed after import.)

    Returns:
        The lines of the ``_labels.txt`` file, as returned by ``io.read_txt``.

    Raises:
        Whatever ``exist`` raises when the file is missing.
    """
    if dataset is None:
        dataset = FLAGS.dataset
    path = io.get_labels_path(dataset)
    exist(path, "Your dataset must include a `_labels.txt` file")
    return io.read_txt(path)
def get_cleaned_sentences(dataset=None):
    """Return the cleaned-sentence lines for *dataset*.

    Args:
        dataset: Dataset name; defaults to ``FLAGS.dataset`` resolved at call
            time. (The original default ``dataset=FLAGS.dataset`` was
            evaluated once at import time — a known pitfall that freezes a
            stale value and can raise if flags are parsed after import.)

    Returns:
        The lines of the ``_cleaned_sentences.txt`` file, as returned by
        ``io.read_txt``.

    Raises:
        Whatever ``exist`` raises when the file is missing.
    """
    if dataset is None:
        dataset = FLAGS.dataset
    path = io.get_clean_sentences_path(dataset)
    exist(path, "Your dataset must include a `_cleaned_sentences.txt` file")
    return io.read_txt(path)
def get_doc2relations(dataset=None):
    """Return the doc-to-relations lines for *dataset*.

    Args:
        dataset: Dataset name; defaults to ``FLAGS.dataset`` resolved at call
            time. (The original default ``dataset=FLAGS.dataset`` was
            evaluated once at import time — a known pitfall that freezes a
            stale value and can raise if flags are parsed after import.)

    Returns:
        The lines of the doc2relations file, as returned by ``io.read_txt``.
        Note: unlike the sibling getters, this one does not call ``exist``
        first, so a missing file surfaces as whatever ``io.read_txt`` raises.
    """
    if dataset is None:
        dataset = FLAGS.dataset
    return io.read_txt(io.get_doc2relations_path(dataset))