def load_winer(split="train", shuffle=True, inc_outside=True, merge_entities=True):
    """Load the WiNER dataset.

    Args:
        split(str): Which split of the data to collect, one of
            ["train", "test"].
        shuffle(bool): Should the data be shuffled with random.shuffle?
        inc_outside(bool): Should outside tokens be included?
        merge_entities(bool): Should adjacent entity tokens be merged?

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels.

    Raises:
        ValueError: If split is not a recognised split name.
    """
    path = check_cache_and_download("winer")

    # Map each supported split to its file; unknown splits fail fast.
    split_files = {"train": "train.txt", "test": "test.txt"}
    if split not in split_files:
        raise ValueError(
            f"Split argument {split} is not one of train, test or evaluate")

    data_path = os.path.join(path, split_files[split])
    X, Y = _load_data_spacy(data_path,
                            inc_outside=inc_outside,
                            merge_entities=merge_entities)

    if shuffle:
        paired = list(zip(X, Y))
        random.shuffle(paired)
        X, Y = zip(*paired)

    return X, Y
def load_conll(split="train", shuffle=True, inc_outside=True, dataset: str = "conll"):
    """Load the conll dataset

    Args:
        split(str): Which split of the data to collect, one of
            ["train", "test", "evaluate"].
        shuffle(bool): Should the data be shuffled with random.shuffle?
        inc_outside(bool): Should outside characters be included?
        dataset(str): Which dataset to load. This defaults to "conll" and
            should only be altered for test purposes in which case it
            should be set to "test_conll".

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels.

    Raises:
        KeyError: If split is not a recognised split name.
    """
    path = check_cache_and_download(dataset)

    # Renamed from `map` so the builtin is not shadowed.
    split_files = {"train": "eng.train",
                   "test": "eng.testa",
                   "evaluate": "eng.testb"}
    try:
        # Keep only the split lookup inside the try: a KeyError raised by
        # _load_data_spacy must not be misreported as a bad split argument.
        filename = split_files[split]
    except KeyError:
        raise KeyError(
            f"Split argument {split} is not one of train, test or evaluate")

    data_path = os.path.join(path, filename)
    X, Y = _load_data_spacy(data_path, inc_outside=inc_outside)

    if shuffle:
        data = list(zip(X, Y))
        random.shuffle(data)
        X, Y = zip(*data)

    return X, Y
def load_hoc(split="train", shuffle=True):
    """Load the HoC dataset.

    Args:
        split(str): Which split of the data to collect, one of
            ["train", "test"].
        shuffle(bool): Should the data be shuffled with random.shuffle?

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels.

    Raises:
        ValueError: If split is not a recognised split name.
    """
    path = check_cache_and_download("hoc")

    # Guard clause replaces the if/elif/else chain of the original.
    filenames = {"train": "train.tsv", "test": "test.tsv"}
    if split not in filenames:
        raise ValueError(f"Split argument {split} is not one of train or test")

    X, Y = load_split(os.path.join(path, filenames[split]))

    if shuffle:
        pairs = list(zip(X, Y))
        random.shuffle(pairs)
        X, Y = zip(*pairs)

    return X, Y
def load_conll(split="train", shuffle=True, inc_outside=True):
    """Load the CoNLL dataset.

    Args:
        split(str): Which split of the data to collect, one of
            ["train", "test", "evaluate"].
        shuffle(bool): Should the data be shuffled with random.shuffle?
        inc_outside(bool): Should outside tokens be included?

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels.

    Raises:
        ValueError: If split is not a recognised split name.
    """
    path = check_cache_and_download("conll")

    # Table lookup instead of the original's three-way if/elif chain.
    split_to_file = {
        "train": "eng.train",
        "test": "eng.testa",
        "evaluate": "eng.testb",
    }
    if split not in split_to_file:
        raise ValueError(
            f"Split argument {split} is not one of train, test or evaluate"
        )

    X, Y = _load_data_spacy(os.path.join(path, split_to_file[split]),
                            inc_outside=inc_outside)

    if shuffle:
        shuffled = list(zip(X, Y))
        random.shuffle(shuffled)
        X, Y = zip(*shuffled)

    return X, Y
merge_entities=merge_entities) else: raise ValueError( f"Split argument {split} is not one of train, test or evaluate") if shuffle: data = list(zip(X, Y)) random.shuffle(data) X, Y = zip(*data) return X, Y if __name__ == "__main__": path = check_cache_and_download("winer") train_processed_path = os.path.join(path, "train.txt") test_processed_path = os.path.join(path, "test.txt") if not os.path.exists(train_processed_path): # Since this has been done once it shouldnt need to be done again, # including here for completeness or in th case we want to increase # the sample size logger.info("No {} training data file found, generating ...".format( train_processed_path)) NE_path = os.path.join(path, "CoarseNE.tar.bz2") docs_path = os.path.join(path, "Documents.tar.bz2") vocab_path = os.path.join(path, "document.vocab")