def raw_to_corpus(sample, output):
    """Build ``train.txt`` and ``test.txt`` from the raw vlsp2013 folders.

    Every file found in the raw ``Trainset-POS-full`` and ``Testset-POS``
    folders is passed to ``preprocess``; the results are concatenated,
    stored on a ``TaggedCorpus`` and saved as ``train.txt`` and ``test.txt``
    respectively.

    Args:
        sample: Maximum number of sentences to keep per output file, or
            ``None`` to keep everything.
        output: Folder to write into. When falsy, the ``tmp/vlsp2013``
            folder next to this file's parent directory is used.

    Raises:
        OSError: If the output folder does not exist and cannot be created.
    """
    from os.path import isdir

    if output:
        output_folder = output
    else:
        output_folder = join(dirname(dirname(__file__)), "tmp", "vlsp2013")
    try:
        makedirs(output_folder)
    except OSError:
        # An already existing folder is fine; any other failure would only
        # resurface later as a confusing error when saving, so report it now.
        if not isdir(output_folder):
            raise

    raw_folders = ["Trainset-POS-full", "Testset-POS"]
    output_names = ["train.txt", "test.txt"]
    data_folder = join(dirname(dirname(__file__)), "data", "vlsp2013", "raw")
    for i, raw_folder in enumerate(raw_folders):
        tagged_corpus = TaggedCorpus()
        sentences = []
        raw_path = join(data_folder, raw_folder)
        # listdir() returns entries in arbitrary order; sorting makes the
        # sampled subset (and the sentence order) reproducible across runs.
        for name in sorted(listdir(raw_path)):
            sentences += preprocess(join(raw_path, name))
            # Stop as soon as the limit is reached so that no further file
            # is preprocessed just to be thrown away by the slice.
            if sample is not None and len(sentences) >= sample:
                sentences = sentences[:sample]
                break
        tagged_corpus.sentences = sentences
        output_file = join(output_folder, output_names[i])
        tagged_corpus.save(output_file)
        print("{} sentences is saved to file {}".format(len(sentences), output_file))
def raw_to_corpus(sample, output):
    """Convert the raw vlsp2013 folders into ``train.txt`` and ``test.txt``.

    For each raw folder (``Trainset-POS-full`` -> ``train.txt``,
    ``Testset-POS`` -> ``test.txt``) every contained file is run through
    ``preprocess``, the results are accumulated into one list, and that list
    is saved through a ``TaggedCorpus``.

    Args:
        sample: Upper bound on the number of sentences written per output
            file; ``None`` disables the limit.
        output: Destination folder. A falsy value selects ``tmp/vlsp2013``
            under the parent of this file's directory.

    Raises:
        OSError: If the destination folder is missing and cannot be created.
    """
    from os.path import isdir

    project_root = dirname(dirname(__file__))
    output_folder = output if output else join(project_root, "tmp", "vlsp2013")
    try:
        makedirs(output_folder)
    except OSError:
        # Only "folder already exists" is expected here; re-raise the rest
        # instead of silently continuing towards a failing save().
        if not isdir(output_folder):
            raise

    raw_folders = ["Trainset-POS-full", "Testset-POS"]
    output_names = ["train.txt", "test.txt"]
    data_folder = join(project_root, "data", "vlsp2013", "raw")
    for raw_folder, output_name in zip(raw_folders, output_names):
        tagged_corpus = TaggedCorpus()
        sentences = []
        folder_path = join(data_folder, raw_folder)
        # Sort because listdir() gives no ordering guarantee, which would
        # otherwise make the sampled sentences differ between machines.
        paths = [join(folder_path, name) for name in sorted(listdir(folder_path))]
        for path in paths:
            sentences += preprocess(path)
            if sample is None:
                continue
            # ">=" rather than ">" avoids preprocessing one extra file when
            # the limit is hit exactly; the saved result is the same.
            if len(sentences) >= sample:
                sentences = sentences[:sample]
                break
        tagged_corpus.sentences = sentences
        output_file = join(output_folder, output_name)
        tagged_corpus.save(output_file)
        print("{} sentences is saved to file {}".format(
            len(sentences), output_file))
def raw_to_corpus():
    """Preprocess the raw train/dev/test files into the ``corpus`` folder.

    Each of ``train.txt``, ``dev.txt`` and ``test.txt`` is loaded from the
    ``raw`` folder next to this file, its sentences are replaced by the
    result of ``preprocess``, and the corpus is saved under the same file
    name in the sibling ``corpus`` folder.
    """
    here = dirname(__file__)
    for filename in ("train.txt", "dev.txt", "test.txt"):
        corpus = TaggedCorpus()
        corpus.load(join(here, "raw", filename))
        corpus.sentences = preprocess(corpus.sentences)
        corpus.save(join(here, "corpus", filename))
def sample_data(n=200):
    """Save the first ``n`` training sentences as a sample corpus.

    Loads ``corpus/vlsp_chunk/train.txt`` (relative to this file's folder),
    takes the leading ``n`` sentences and saves them as
    ``corpus/vlsp_chunk_sample/train.txt``.

    Args:
        n: Number of sentences to take from the start of the corpus.
    """
    full_corpus = TaggedCorpus()
    corpus_root = join(dirname(__file__), "corpus")
    full_corpus.load(join(corpus_root, "vlsp_chunk", "train.txt"))

    subset = TaggedCorpus(full_corpus.sentences[:n])
    subset.save(join(corpus_root, "vlsp_chunk_sample", "train.txt"))
def raw_to_sample_corpus(n=100):
    """Write a small preprocessed sample of the raw vlsp2016 files.

    Each of ``train.txt``, ``dev.txt`` and ``test.txt`` is loaded from
    ``raw/vlsp2016`` under the parent of this file's directory, run through
    ``preprocess``, truncated to the first ``n`` results and saved under
    ``corpus/sample_vlsp_2016`` with the same file name.

    Args:
        n: Number of preprocessed sentences to keep per file. Defaults to
            100, the value that used to be hard-coded.
    """
    root = dirname(dirname(__file__))
    for f in ("train.txt", "dev.txt", "test.txt"):
        tagged_corpus = TaggedCorpus()
        tagged_corpus.load(join(root, "raw", "vlsp2016", f))
        # Slice after preprocessing (not before), exactly as the original
        # did, so the sample holds the first n *preprocessed* items.
        tagged_corpus.sentences = preprocess(tagged_corpus.sentences)[:n]
        tagged_corpus.save(join(root, "corpus", "sample_vlsp_2016", f))
def raw_to_corpus():
    """Preprocess the raw train/dev/test files into the ``corpus`` folder.

    The ``corpus`` folder next to this file is created if needed. Then each
    of ``train.txt``, ``dev.txt`` and ``test.txt`` is loaded from the ``raw``
    folder next to this file, its sentences are replaced by the result of
    ``preprocess``, and the corpus is saved under the same name in
    ``corpus``.

    Raises:
        OSError: If the ``corpus`` folder does not exist and cannot be
            created.
    """
    from os.path import isdir

    base_folder = dirname(__file__)
    corpus_folder = join(base_folder, "corpus")
    # The folder does not depend on the file being processed, so create it
    # once up front instead of on every loop iteration.
    try:
        mkdir(corpus_folder)
    except OSError:
        # "Already exists" is the expected case. A bare except here used to
        # hide everything else as well, including KeyboardInterrupt.
        if not isdir(corpus_folder):
            raise

    for f in ["train.txt", "dev.txt", "test.txt"]:
        tagged_corpus = TaggedCorpus()
        tagged_corpus.load(join(base_folder, "raw", f))
        tagged_corpus.sentences = preprocess(tagged_corpus.sentences)
        tagged_corpus.save(join(corpus_folder, f))
def raw_to_corpus(sample, output):
    """Preprocess the raw vlsp2016 train/dev/test files into a corpus folder.

    Each of ``train.txt``, ``dev.txt`` and ``test.txt`` is loaded from
    ``data/vlsp2016/raw`` under the parent of this file's directory,
    optionally truncated, passed through ``preprocess`` and saved under the
    same name in the output folder.

    Args:
        sample: When truthy, only the first ``sample`` loaded sentences are
            preprocessed. ``None`` and ``0`` both mean "use everything".
        output: Destination folder. A falsy value selects ``tmp/vlsp2016``
            under the parent of this file's directory.

    Raises:
        OSError: If the destination folder is missing and cannot be created.
    """
    from os.path import isdir

    project_root = dirname(dirname(__file__))
    if output:
        output_folder = output
    else:
        output_folder = join(project_root, "tmp", "vlsp2016")

    # Creating the folder is independent of the file being processed, so do
    # it once rather than inside the loop.
    try:
        makedirs(output_folder)
    except OSError:
        # Tolerate an existing folder only; the previous bare except also
        # swallowed permission errors and KeyboardInterrupt.
        if not isdir(output_folder):
            raise

    raw_folder = join(project_root, "data", "vlsp2016", "raw")
    for f in ["train.txt", "dev.txt", "test.txt"]:
        tagged_corpus = TaggedCorpus()
        tagged_corpus.load(join(raw_folder, f))
        sentences = tagged_corpus.sentences
        if sample:
            # Truncate before preprocessing so only the sample is processed.
            sentences = sentences[:sample]
        tagged_corpus.sentences = preprocess(sentences)
        tagged_corpus.save(join(output_folder, f))