示例#1
0
def main():
    """Preprocess the SQuAD train/dev JSON files and save them as a corpus.

    Command-line options:
        --train_file  path to the training JSON
                      (default: ~/azayats/data/squad/train-v1.1.json)
        --dev_file    path to the dev JSON
                      (default: ~/azayats/data/squad/dev-v1.1.json)

    Raises:
        ValueError: if the target corpus directory already exists and
            contains at least one entry.
    """
    # Pass the text as `description`: the first positional parameter of
    # ArgumentParser is `prog`, which would put this sentence in the usage
    # line in place of the program name.
    parser = argparse.ArgumentParser(description="Preprocess SQuAD data")

    # NOTE(review): the default data folder is user-specific; an earlier,
    # commented-out variant pointed at ~/data/squad instead. Confirm which
    # location is intended before relying on the defaults.
    basedir = join(expanduser("~"), "azayats", "data", "squad")
    parser.add_argument("--train_file",
                        default=join(basedir, "train-v1.1.json"))
    parser.add_argument("--dev_file", default=join(basedir, "dev-v1.1.json"))

    # Parse before touching the filesystem, so `--help` or an invalid option
    # exits without creating directories or hitting the check below.
    args = parser.parse_args()

    if not exists(config.CORPUS_DIR):
        mkdir(config.CORPUS_DIR)

    # Refuse to continue when the target directory already holds files.
    target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME)
    if exists(target_dir) and listdir(target_dir):
        raise ValueError("Files already exist in " + target_dir)

    tokenizer = NltkAndPunctTokenizer()

    print("Parsing train...")
    train = list(parse_squad_data(args.train_file, "train", tokenizer))

    print("Parsing dev...")
    dev = list(parse_squad_data(args.dev_file, "dev", tokenizer))

    print("Saving...")
    # Presumably writes the corpus under target_dir -- verify against
    # SquadCorpus.make_corpus.
    SquadCorpus.make_corpus(train, dev)
    print("Done")
示例#2
0
def main():
    """Preprocess the SQuAD train/dev JSON files and save them as a corpus.

    Command-line options:
        --train_file  path to the training JSON (default: config.SQUAD_TRAIN)
        --dev_file    path to the dev JSON (default: config.SQUAD_DEV)

    Unrecognised command-line entries are ignored rather than rejected.

    Raises:
        ValueError: if the target corpus directory already exists and
            contains at least one entry.
    """
    # Pass the text as `description`: the first positional parameter of
    # ArgumentParser is `prog`, which would put this sentence in the usage
    # line in place of the program name.
    parser = argparse.ArgumentParser(description="Preprocess SQuAD data")

    # NOTE(review): this option is registered but its value is never read in
    # this function; it is kept only to leave the declared option set
    # unchanged.
    parser.add_argument('--document-qa/docqa/squad', type=Path)
    parser.add_argument("--train_file", default=config.SQUAD_TRAIN)
    parser.add_argument("--dev_file", default=config.SQUAD_DEV)

    # Read the real command line instead of parsing an empty string, so
    # --train_file / --dev_file are honoured when given. parse_known_args
    # tolerates extra argv entries it does not know; with no flags supplied
    # the result is the same set of defaults as before. Parsing happens
    # before any filesystem work so `--help` exits cleanly.
    args = parser.parse_known_args()[0]

    if not exists(config.CORPUS_DIR):
        mkdir(config.CORPUS_DIR)

    # Refuse to continue when the target directory already holds files.
    target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME)
    if exists(target_dir) and listdir(target_dir):
        raise ValueError("Files already exist in " + target_dir)

    tokenizer = NltkAndPunctTokenizer()

    print("Parsing train...")
    # The parsed training set is no longer echoed to stdout; the original
    # debug print dumped every parsed item.
    train = list(parse_squad_data(args.train_file, "train", tokenizer))

    print("Parsing dev...")
    dev = list(parse_squad_data(args.dev_file, "dev", tokenizer))

    print("Saving...")
    # Presumably writes the corpus under target_dir -- verify against
    # SquadCorpus.make_corpus.
    SquadCorpus.make_corpus(train, dev)
    print("Done")
示例#3
0
def main():
    """Preprocess the SQuAD train/dev JSON files and save them as a corpus.

    Command-line options:
        --train_file          path to the training JSON
                              (default: config.SQUAD_TRAIN)
        --dev_file            path to the dev JSON
                              (default: config.SQUAD_DEV)
        --weighted-questions  flag forwarded to parse_squad_data as
                              `weighted_samples` for the training split only

    Unrecognised command-line entries are ignored rather than rejected.

    Raises:
        ValueError: if the target corpus directory already exists and
            contains at least one entry.
    """
    # Pass the text as `description`: the first positional parameter of
    # ArgumentParser is `prog`, which would put this sentence in the usage
    # line in place of the program name.
    parser = argparse.ArgumentParser(description="Preprocess SQuAD data")
    parser.add_argument("--train_file", default=config.SQUAD_TRAIN)
    parser.add_argument("--dev_file", default=config.SQUAD_DEV)
    parser.add_argument("--weighted-questions", action='store_true')

    # Parse before touching the filesystem, so `--help` exits without
    # creating directories or hitting the check below. Only the namespace
    # of known options is kept; leftover argv entries are discarded.
    args = parser.parse_known_args()[0]

    if not exists(config.CORPUS_DIR):
        mkdir(config.CORPUS_DIR)

    # Refuse to continue when the target directory already holds files.
    target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME)
    if exists(target_dir) and listdir(target_dir):
        raise ValueError("Files already exist in " + target_dir)

    tokenizer = NltkAndPunctTokenizer()

    print("Parsing train...")
    train = list(
        parse_squad_data(args.train_file,
                         "train",
                         tokenizer,
                         weighted_samples=args.weighted_questions))

    print("Parsing dev...")
    # The dev split is parsed without the weighted_samples argument.
    dev = list(parse_squad_data(args.dev_file, "dev", tokenizer))

    print("Saving...")
    # Presumably writes the corpus under target_dir -- verify against
    # SquadCorpus.make_corpus.
    SquadCorpus.make_corpus(train, dev)
    print("Done")