def encrypt_archive(archive_name, directory_name):
    """Encrypt `directory_name` into `archive_name` via the encrypt shell script.

    An already-present archive smaller than 100 bytes is treated as corrupt
    and deleted first; encryption only runs when no (valid) archive exists
    and the source directory is present.
    """
    if exists(archive_name) and isfile(archive_name):
        size_bytes = stat(archive_name).st_size
        # a tiny archive cannot be a real encrypted payload — discard it
        if size_bytes < 100:
            print(yellowify(
                "Present archive has wrong size (%d bytes). Removing." % (size_bytes,)))
            execute_bash("rm %s" % (archive_name,))
    if not exists(archive_name) or not isfile(archive_name):
        if exists(directory_name) and isdir(directory_name):
            print(boldify("Encrypting data"))
            execute_bash("sh %s encrypt" % (ENCRYPT_SCRIPT))
            print(greenify("Successfully Encrypted data"))
def encrypt_archive(archive_name, directory_name):
    """Encrypt `directory_name` into `archive_name` unless a valid archive exists.

    A pre-existing archive under 100 bytes is assumed broken and removed
    before re-encrypting.
    """
    if exists(archive_name) and isfile(archive_name):
        filesize = stat(archive_name).st_size
        if filesize < 100:
            # too small to be a real archive — remove and regenerate
            print(yellowify(
                "Present archive has wrong size (%d bytes). Removing." % (filesize,)))
            execute_bash("rm %s" % (archive_name,))
    archive_missing = not exists(archive_name) or not isfile(archive_name)
    if archive_missing and exists(directory_name) and isdir(directory_name):
        print(boldify("Encrypting data"))
        execute_bash("sh %s encrypt" % (ENCRYPT_SCRIPT))
        print(greenify("Successfully Encrypted data"))
from os import listdir
from os.path import isdir, isfile, join, dirname, realpath

# add data to path
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)

from utils import print_progress, execute_bash

THIS_DATA_DIR = dirname(realpath(__file__))

# important
ZIP_URL = 'http://cs224d.stanford.edu/assignment2/assignment2.zip'
ZIP_LOCAL = join(THIS_DATA_DIR, 'assignment2.zip')
UNZIPPED_LOCAL = join(THIS_DATA_DIR, "assignment2")


def delete_paths(paths):
    """Forcefully remove every given path (file or directory)."""
    for p in paths:
        execute_bash('rm -rf %s' % (p,))


if __name__ == '__main__':
    # wipe any stale artifacts, fetch the assignment archive, and move the
    # NER train/dev/test splits into this directory.
    local_files = [join(THIS_DATA_DIR, f)
                   for f in ['train.txt', 'test.txt', 'dev.txt']]
    delete_paths([ZIP_LOCAL, join(THIS_DATA_DIR, "trees")] + local_files)
    execute_bash('wget -O %s %s' % (ZIP_LOCAL, ZIP_URL))
    execute_bash('unzip %s -d %s' % (ZIP_LOCAL, UNZIPPED_LOCAL))
    execute_bash('mv %s %s' % (join(UNZIPPED_LOCAL, "data", "ner", "train"),
                               join(THIS_DATA_DIR, "train.txt")))
    execute_bash('mv %s %s' % (join(UNZIPPED_LOCAL, "data", "ner", "test.masked"),
                               join(THIS_DATA_DIR, "test.txt")))
    execute_bash('mv %s %s' % (join(UNZIPPED_LOCAL, "data", "ner", "dev"),
                               join(THIS_DATA_DIR, "dev.txt")))
    delete_paths([ZIP_LOCAL, UNZIPPED_LOCAL])
from utils import print_progress, execute_bash

THIS_DATA_DIR = dirname(realpath(__file__))

# important
ZIP_URL = 'http://cs224d.stanford.edu/assignment2/assignment2.zip'
ZIP_LOCAL = join(THIS_DATA_DIR, 'assignment2.zip')
UNZIPPED_LOCAL = join(THIS_DATA_DIR, "assignment2")


def delete_paths(paths):
    """rm -rf each path in `paths`."""
    for target in paths:
        execute_bash('rm -rf %s' % (target,))


if __name__ == '__main__':
    # clean slate, then download + extract the NER splits
    local_files = [join(THIS_DATA_DIR, f)
                   for f in ['train.txt', 'test.txt', 'dev.txt']]
    delete_paths([ZIP_LOCAL, join(THIS_DATA_DIR, "trees")] + local_files)
    execute_bash('wget -O %s %s' % (ZIP_LOCAL, ZIP_URL))
    execute_bash('unzip %s -d %s' % (ZIP_LOCAL, UNZIPPED_LOCAL))
    execute_bash('mv %s %s' % (join(UNZIPPED_LOCAL, "data", "ner", "train"),
                               join(THIS_DATA_DIR, "train.txt")))
    execute_bash('mv %s %s' % (join(UNZIPPED_LOCAL, "data", "ner", "test.masked"),
                               join(THIS_DATA_DIR, "test.txt")))
    execute_bash('mv %s %s' % (join(UNZIPPED_LOCAL, "data", "ner", "dev"),
                               join(THIS_DATA_DIR, "dev.txt")))
    delete_paths([ZIP_LOCAL, UNZIPPED_LOCAL])
def decrypt_archive():
    """Decrypt the data archive by delegating to the encrypt/decrypt shell script."""
    print(boldify("Decrypting data"))
    execute_bash("sh %s decrypt" % (ENCRYPT_SCRIPT))
import random
import sys
from os import listdir
from os.path import isdir, isfile, join, dirname, realpath

# add data to path
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)

from utils import print_progress, execute_bash

THIS_DATA_DIR = dirname(realpath(__file__))

# important
ZIP_URL = 'http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
ZIP_LOCAL = join(THIS_DATA_DIR, 'trainDevTestTrees_PTB.zip')


def delete_paths(paths):
    """Forcefully remove every given path."""
    for p in paths:
        execute_bash('rm -rf %s' % (p,))


if __name__ == '__main__':
    # drop stale copies, download the PTB sentiment trees, and flatten the
    # extracted "trees" directory into this one.
    local_files = [join(THIS_DATA_DIR, f)
                   for f in ['train.txt', 'test.txt', 'dev.txt']]
    delete_paths([ZIP_LOCAL, join(THIS_DATA_DIR, "trees")] + local_files)
    execute_bash('wget -O %s %s' % (ZIP_LOCAL, ZIP_URL))
    execute_bash('unzip %s -d %s' % (ZIP_LOCAL, THIS_DATA_DIR))
    execute_bash('mv %s %s' % (join(THIS_DATA_DIR, "trees", "*"), THIS_DATA_DIR))
    delete_paths([ZIP_LOCAL, join(THIS_DATA_DIR, "trees")])
def delete_paths(paths):
    """Forcefully remove every path in `paths` (files or directories)."""
    for target in paths:
        execute_bash('rm -rf %s' % (target,))
# NOTE(review): this chunk begins mid-expression — the leading "]" closes a
# `urls` list literal (of (url, path, out, newdir) tuples, judging by the
# unpacking below) defined above this view. Each dataset's *.txt files are
# staged through a shared tmp dir and moved into `newdir`. Left byte-identical
# because the chunk cannot be safely reformatted without its missing head.
] temp_dir = join(THIS_DATA_DIR, "tmp") def delete_paths(paths): for path in paths: execute_bash('rm -rf %s' % (path,)) if __name__ == '__main__': if not exists(temp_dir): makedirs(temp_dir) # delete existing downloads delete_paths([out for url, path, out, newdir in urls] + [path for url, path, out, newdir in urls] + [newdir for url, path, out, newdir in urls]) # download new ones for url, path, out, newdir in urls: execute_bash("wget -O {path} {url}".format(url=url, path=path)) execute_bash("tar -xf %s -C %s" % (path, THIS_DATA_DIR)) execute_bash("rm %s" % join(out, "00-readme.txt")) execute_bash("mv {dataset_files} {tmp_dir}".format( tmp_dir=temp_dir, dataset_files=join(out, "*.txt")) ) delete_paths([out]) if not exists(newdir): makedirs(newdir) execute_bash("mv {tmp_files} {newdir}".format( tmp_files = join(temp_dir, "*.txt"), newdir = newdir) ) delete_paths([temp_dir] + [path for url, path, out, newdir in urls])
def cleanup():
    """Remove the downloaded tarball and any extracted dataset directories."""
    execute_bash('rm -rf wikianswer.tar.gz %s*' % (DATASET_DIR,))
# NOTE(review): this chunk is cut off mid-statement — it ends at the opening
# of a `with open(dataset_file, ...)` whose body lies outside this view, and
# it references `TARBALL`, defined above the view. It downloads the CMU
# Question_Answer_Dataset tarball and begins iterating its S* subdirectories.
# Left byte-identical because the truncated tail cannot be reformatted safely.
# not so important DATASET_DIR = 'Question_Answer_Dataset' DATASET_FILE = 'question_answer_pairs.txt' MIN_ANSWER_LENGTH = 1 # Who does that? WINDOWS_ENCODING = 'latin-1' def cleanup(): execute_bash('rm -rf wikianswer.tar.gz %s*' % (DATASET_DIR,)) if __name__ == '__main__': cleanup() execute_bash('wget -O wikianswer.tar.gz %s' % (TARBALL,)) execute_bash('tar -xz -f wikianswer.tar.gz') execute_bash('mv Question_Answer_Dataset* %s' % (DATASET_DIR,)) directories = [d for d in listdir(DATASET_DIR) if isdir(join(DATASET_DIR, d)) and d.startswith('S')] assert len(directories) > 0 num_nonascii = 0 num_too_short = 0 output_content = [] for d in [join(DATASET_DIR, d) for d in directories]: dataset_file = join(d, DATASET_FILE) assert isfile(dataset_file) with open(dataset_file, newline='', encoding=WINDOWS_ENCODING) as f:
def delete_paths(paths):
    """Forcefully remove every given path."""
    for p in paths:
        execute_bash('rm -rf %s' % (p,))


if __name__ == '__main__':
    # clear any previous downloads / extracted MCTest files
    delete_paths([ZIP_LOCAL,
                  join(THIS_DATA_DIR, "MCTest"),
                  join(THIS_DATA_DIR, '*.{tsv,ans}')])
    delete_paths([ZIP_LOCAL,
                  join(THIS_DATA_DIR, "MCTestAnswers"),
                  join(THIS_DATA_DIR, '*.{tsv,ans}')])
    # fetch question archive and the separate answer archive
    execute_bash('wget -O %s %s' % (ZIP_LOCAL, ZIP_URL))
    execute_bash('unzip %s -d %s' % (ZIP_LOCAL, THIS_DATA_DIR))
    execute_bash('wget -O %s %s' % (ZIP_TEST_LOCAL, ZIP_URL_TEST))
    execute_bash('unzip %s -d %s' % (ZIP_TEST_LOCAL, THIS_DATA_DIR))
    # hoist the .tsv/.ans files out of the extracted directories
    execute_bash('mv %s %s' % (join(THIS_DATA_DIR, "MCTest*", "*.{tsv,ans}"),
                               THIS_DATA_DIR))
    delete_paths([ZIP_LOCAL, ZIP_TEST_LOCAL, join(THIS_DATA_DIR, "MCTest*")])
    delete_paths([join(THIS_DATA_DIR, TRAIN_FILE),
                  join(THIS_DATA_DIR, TEST_FILE)])
    parse(THIS_DATA_DIR, TRAIN_FILE, TEST_FILE)
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)

from utils import print_progress, execute_bash

THIS_DATA_DIR = dirname(realpath(__file__))
TRAIN_FILE = join(THIS_DATA_DIR, "mc_train.txt")
TEST_FILE = join(THIS_DATA_DIR, "mc_test.txt")

# important
ZIP_URL = 'http://research.microsoft.com/en-us/um/redmond/projects/mctest/data/MCTest.zip'
ZIP_URL_TEST = 'http://research.microsoft.com/en-us/um/redmond/projects/mctest/data/MCTestAnswers.zip'
ZIP_LOCAL = join(THIS_DATA_DIR, 'MCTest.zip')
ZIP_TEST_LOCAL = join(THIS_DATA_DIR, 'MCTestAnswers.zip')


def delete_paths(paths):
    """rm -rf each path in `paths`."""
    for target in paths:
        execute_bash('rm -rf %s' % (target,))


if __name__ == '__main__':
    # remove leftovers from a previous run
    delete_paths([ZIP_LOCAL,
                  join(THIS_DATA_DIR, "MCTest"),
                  join(THIS_DATA_DIR, '*.{tsv,ans}')])
    delete_paths([ZIP_LOCAL,
                  join(THIS_DATA_DIR, "MCTestAnswers"),
                  join(THIS_DATA_DIR, '*.{tsv,ans}')])
    # download + extract questions, then the answers archive
    execute_bash('wget -O %s %s' % (ZIP_LOCAL, ZIP_URL))
    execute_bash('unzip %s -d %s' % (ZIP_LOCAL, THIS_DATA_DIR))
    execute_bash('wget -O %s %s' % (ZIP_TEST_LOCAL, ZIP_URL_TEST))
    execute_bash('unzip %s -d %s' % (ZIP_TEST_LOCAL, THIS_DATA_DIR))
    execute_bash('mv %s %s' % (join(THIS_DATA_DIR, "MCTest*", "*.{tsv,ans}"),
                               THIS_DATA_DIR))
    delete_paths([ZIP_LOCAL, ZIP_TEST_LOCAL, join(THIS_DATA_DIR, "MCTest*")])
    delete_paths([join(THIS_DATA_DIR, TRAIN_FILE),
                  join(THIS_DATA_DIR, TEST_FILE)])
    parse(THIS_DATA_DIR, TRAIN_FILE, TEST_FILE)
import random
import sys
from os import listdir, makedirs, stat
from os.path import isdir, isfile, join, dirname, realpath, exists

# add data to path
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)

from utils import print_progress, execute_bash

THIS_DATA_DIR = dirname(realpath(__file__))
DOWNLOADED_FILE = join(THIS_DATA_DIR, "wikianswers.paraphrases.tsv.gz")
FILE_URL = "https://www.dropbox.com/s/td3ionbuj80hrkb/wikianswers.paraphrases.tsv.gz?dl=0"

if __name__ == '__main__':
    # skip the download when the archive is already present on disk
    if exists(DOWNLOADED_FILE):
        print("Found file.")
    else:
        execute_bash("wget -O {path} {url}".format(url=FILE_URL,
                                                   path=DOWNLOADED_FILE))
        print("Downloaded file")
sys.path.append(DATA_DIR)

from utils import execute_bash, collect_files_with_ext
from midi.utils import midiread
from scipy.io import wavfile
import numpy as np

THIS_DATA_DIR = dirname(realpath(__file__))
DOWNLOADED_ZIP = join(THIS_DATA_DIR, "dataset.zip")
DOWNLOADED_DIR = join(THIS_DATA_DIR, "dataset")
FILE_URL = "http://c4dm.eecs.qmul.ac.uk/rdr/bitstream/handle/123456789/13/Score-informed%20Piano%20Transcription%20Dataset.zip?sequence=1"

if __name__ == '__main__':
    if not exists(DOWNLOADED_ZIP):
        execute_bash("wget -O {path} {url}".format(url=FILE_URL,
                                                   path=DOWNLOADED_ZIP))
    # start from a clean extraction directory and drop stale .npy exports
    if exists(DOWNLOADED_DIR) and isdir(DOWNLOADED_DIR):
        execute_bash("rm -rf %s" % (DOWNLOADED_DIR))
        execute_bash("rm %s " % (join(THIS_DATA_DIR, "*.npy")))
    makedirs(DOWNLOADED_DIR)
    execute_bash("unzip %s -d %s" % (DOWNLOADED_ZIP, DOWNLOADED_DIR))
    files = collect_files_with_ext(DOWNLOADED_DIR, ".wav")
    for subpath, name in files:
        # skip calibration recordings; export audio and its aligned midi roll
        if name.endswith(".wav") and "Chromatic" not in name:
            sampling_rate, music = wavfile.read(subpath)
            np.save(join(THIS_DATA_DIR, name.replace(".wav", ".npy")), music)
            piece = midiread(str(subpath).replace(".wav", "_correct.mid"))
            np.save(join(THIS_DATA_DIR, name.replace(".wav", ".mid.npy")),
                    piece.piano_roll)
temp_dir = join(THIS_DATA_DIR, "tmp")


def delete_paths(paths):
    """Forcefully remove every given path."""
    for p in paths:
        execute_bash('rm -rf %s' % (p,))


if __name__ == '__main__':
    if not exists(temp_dir):
        makedirs(temp_dir)
    # delete existing downloads
    # NOTE(review): `urls` is defined above this chunk — assumed to be
    # (url, path, out, newdir) tuples, per the unpacking below.
    delete_paths([out for url, path, out, newdir in urls] +
                 [path for url, path, out, newdir in urls] +
                 [newdir for url, path, out, newdir in urls])
    # download new ones
    for url, path, out, newdir in urls:
        execute_bash("wget -O {path} {url}".format(url=url, path=path))
        execute_bash("tar -xf %s -C %s" % (path, THIS_DATA_DIR))
        execute_bash("rm %s" % join(out, "00-readme.txt"))
        # stage this dataset's text files through the shared tmp directory
        execute_bash("mv {dataset_files} {tmp_dir}".format(
            tmp_dir=temp_dir, dataset_files=join(out, "*.txt")))
        delete_paths([out])
        if not exists(newdir):
            makedirs(newdir)
        execute_bash("mv {tmp_files} {newdir}".format(
            tmp_files=join(temp_dir, "*.txt"), newdir=newdir))
    delete_paths([temp_dir] + [path for url, path, out, newdir in urls])
import random
import sys
from os import listdir
from os.path import isdir, isfile, join, dirname, realpath

# add data to path
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)

from utils import print_progress, execute_bash

THIS_DATA_DIR = dirname(realpath(__file__))

# important
URL = 'http://norvig.com/big.txt'
LOCAL = join(THIS_DATA_DIR, 'big.txt')


def delete_paths(paths):
    """Forcefully remove every given path."""
    for p in paths:
        execute_bash('rm -rf %s' % (p,))


if __name__ == '__main__':
    # replace any previous copy of big.txt with a fresh download
    delete_paths([LOCAL])
    execute_bash('wget -O %s %s' % (LOCAL, URL))
from os import listdir
from os.path import isdir, isfile, join, dirname, realpath

# add data to path
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)

from utils import print_progress, execute_bash

THIS_DATA_DIR = dirname(realpath(__file__))

# important
ZIP_URL = 'http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
ZIP_LOCAL = join(THIS_DATA_DIR, 'trainDevTestTrees_PTB.zip')


def delete_paths(paths):
    """rm -rf each path in `paths`."""
    for target in paths:
        execute_bash('rm -rf %s' % (target,))


if __name__ == '__main__':
    # remove old splits, download the sentiment treebank, flatten "trees/"
    local_files = [join(THIS_DATA_DIR, f)
                   for f in ['train.txt', 'test.txt', 'dev.txt']]
    delete_paths([ZIP_LOCAL, join(THIS_DATA_DIR, "trees")] + local_files)
    execute_bash('wget -O %s %s' % (ZIP_LOCAL, ZIP_URL))
    execute_bash('unzip %s -d %s' % (ZIP_LOCAL, THIS_DATA_DIR))
    execute_bash('mv %s %s' % (join(THIS_DATA_DIR, "trees", "*"), THIS_DATA_DIR))
    delete_paths([ZIP_LOCAL, join(THIS_DATA_DIR, "trees")])
# NOTE(review): this chunk is cut at both ends — it starts inside a function
# (the `with open(dataset_fname, ...)` belongs to a def above this view) and
# ends mid-loop (`for tar_file, tar_file_name in tar_files:` has no body
# here). It pairs .input./.gs. files line-by-line into TSV and unpacks train
# tarballs. Left byte-identical: a reformat cannot be verified without the
# missing head and tail.
with open(dataset_fname, "rt") as finputs: with open(dataset_fname.replace(".input.", ".gs."), "rt") as flabels: label_lines = (line for line in flabels) input_lines = (line for line in finputs) for label, sentences in zip(label_lines, input_lines): fout.write(sentences.strip() + "\t" + label) if tokenizer_available: ftokenized.write(tokenize_sentences(sentences) + "\t" + label) def delete_paths(paths): for path in paths: execute_bash('rm -rf %s' % (path,)) if __name__ == '__main__': execute_bash("rm -rf %s" % (ZIP_LOCAL,)) execute_bash("rm -rf %s" % (UNZIPPED_LOCAL,)) execute_bash("wget -O {path} {url}".format(url=ZIP_URL, path=ZIP_LOCAL)) execute_bash("unzip {zipfile} -d {target}".format(zipfile=ZIP_LOCAL, target=UNZIPPED_LOCAL)) # create test set: test_input_names = collect_text_files(UNZIPPED_LOCAL) transform_files_into_one(test_input_names, TEST_TOKENIZED_FILE) if not tokenizer_available: execute_bash("rm %s" % (TEST_TOKENIZED_FILE,)) delete_paths([path for path, name in test_input_names]) # untar train files: tar_files = collect_files_with_ext(UNZIPPED_LOCAL, ".tgz") for tar_file, tar_file_name in tar_files:
import random
import sys
from os import listdir
from os.path import isdir, isfile, join, dirname, realpath

# add data to path
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)

from utils import print_progress, execute_bash

THIS_DATA_DIR = dirname(realpath(__file__))

# important
URL = 'http://norvig.com/big.txt'
LOCAL = join(THIS_DATA_DIR, 'big.txt')


def delete_paths(paths):
    """rm -rf each given path."""
    for target in paths:
        execute_bash('rm -rf %s' % (target,))


if __name__ == '__main__':
    delete_paths([LOCAL])
    # fetch Norvig's big.txt corpus into this directory
    execute_bash('wget -O %s %s' % (LOCAL, URL))
import random
import sys
from os import listdir
from os.path import isdir, isfile, join, dirname, realpath

# add data to path
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)

from utils import print_progress, execute_bash

THIS_DATA_DIR = dirname(realpath(__file__))

# important
TEXT_URL = 'https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt'
UNZIPPED_LOCAL = join(THIS_DATA_DIR, "train.txt")


def delete_paths(paths):
    """Forcefully remove every path in `paths` (files or directories)."""
    for path in paths:
        execute_bash('rm -rf %s' % (path,))


if __name__ == '__main__':
    delete_paths([UNZIPPED_LOCAL])
    # BUG FIX: `wget -O` takes the OUTPUT FILE first, then the URL. The
    # original passed (TEXT_URL, UNZIPPED_LOCAL), i.e. it tried to download
    # from the local path and write to a file named after the URL, so
    # train.txt was never created.
    execute_bash('wget -O %s %s' % (UNZIPPED_LOCAL, TEXT_URL))
import random
import sys
from os import listdir, makedirs, stat
from os.path import isdir, isfile, join, dirname, realpath, exists

# add data to path
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)

from utils import print_progress, execute_bash

THIS_DATA_DIR = dirname(realpath(__file__))
DOWNLOADED_FILE = join(THIS_DATA_DIR, "wikianswers.paraphrases.tsv.gz")
FILE_URL = "https://www.dropbox.com/s/td3ionbuj80hrkb/wikianswers.paraphrases.tsv.gz?dl=0"

if __name__ == '__main__':
    # download only when the archive is not already cached locally
    if exists(DOWNLOADED_FILE):
        print("Found file.")
    else:
        execute_bash("wget -O {path} {url}".format(url=FILE_URL,
                                                   path=DOWNLOADED_FILE))
        print("Downloaded file")
DATA_DIR = dirname(dirname(realpath(__file__)))
sys.path.append(DATA_DIR)

from utils import execute_bash, collect_files_with_ext
from midi.utils import midiread
from scipy.io import wavfile
import numpy as np

THIS_DATA_DIR = dirname(realpath(__file__))
DOWNLOADED_ZIP = join(THIS_DATA_DIR, "dataset.zip")
DOWNLOADED_DIR = join(THIS_DATA_DIR, "dataset")
FILE_URL = "http://c4dm.eecs.qmul.ac.uk/rdr/bitstream/handle/123456789/13/Score-informed%20Piano%20Transcription%20Dataset.zip?sequence=1"

if __name__ == '__main__':
    if not exists(DOWNLOADED_ZIP):
        execute_bash("wget -O {path} {url}".format(url=FILE_URL,
                                                   path=DOWNLOADED_ZIP))
    # wipe any previous extraction and exported arrays before unzipping
    if exists(DOWNLOADED_DIR) and isdir(DOWNLOADED_DIR):
        execute_bash("rm -rf %s" % (DOWNLOADED_DIR))
        execute_bash("rm %s " % (join(THIS_DATA_DIR, "*.npy")))
    makedirs(DOWNLOADED_DIR)
    execute_bash("unzip %s -d %s" % (DOWNLOADED_ZIP, DOWNLOADED_DIR))
    files = collect_files_with_ext(DOWNLOADED_DIR, ".wav")
    for subpath, name in files:
        if name.endswith(".wav") and "Chromatic" not in name:
            # convert each non-calibration recording: audio samples and the
            # matching "*_correct.mid" piano roll, both as .npy files
            sampling_rate, music = wavfile.read(subpath)
            np.save(join(THIS_DATA_DIR, name.replace(".wav", ".npy")), music)
            piece = midiread(str(subpath).replace(".wav", "_correct.mid"))
            np.save(join(THIS_DATA_DIR, name.replace(".wav", ".mid.npy")),
                    piece.piano_roll)
def delete_paths(paths):
    """rm -rf every path in `paths`."""
    for p in paths:
        execute_bash('rm -rf %s' % (p,))
# NOTE(review): this chunk starts mid-function — the `with open(fname, "w")`
# belongs to a def (presumably fix_babi_file, called at the end) whose header
# is above this view, and it references TARBALL_LOCAL / OUTPUT_DIR /
# NORMALIZED_OUTPUT_DIR / findfiles defined outside the view. It rewrites
# bAbI qa*.txt lines into "question \t answer \t supporting-facts" format.
# Left byte-identical because the function boundary is not visible.
with open(fname, "w") as f: for line in all_file.split('\n'): line = line.replace('\t', ' ') if '?' in line: no_and_question, rest = line.split('?') rest = rest.strip() rest = rest.split(' ') answer = rest[0].replace(',', ' ') rest = ' '.join(rest[1:]) line = '%s ?\t%s\t%s' % (no_and_question.strip(), answer.strip()+ ' .', rest.strip()) else: line = ' .'.join(line.split('.')) f.write(line + '\n') f.flush() if __name__ == '__main__': delete_paths([TARBALL_LOCAL, OUTPUT_DIR]) execute_bash('rm -rf %s' % (NORMALIZED_OUTPUT_DIR,)) execute_bash('wget -O %s %s' % (TARBALL_LOCAL, TARBALL_URL)) execute_bash('mkdir %s' % (OUTPUT_DIR,)) execute_bash('tar -xz -f %s -C %s' % (TARBALL_LOCAL, THIS_DATA_DIR)) execute_bash('mv %s %s' % (OUTPUT_DIR, NORMALIZED_OUTPUT_DIR)) delete_paths([TARBALL_LOCAL,]) for f in findfiles(NORMALIZED_OUTPUT_DIR, "qa*.txt"): print("Converting %s to common QA format..." % (f,)) fix_babi_file(f)
# NOTE(review): this chunk starts mid-function — the leading `for line in
# all_file.split('\n')` loop sits inside a `with open(...)` / def above this
# view (it uses `f` and `all_file` defined there). It normalizes bAbI qa*.txt
# lines into tab-separated question/answer/support format, then the __main__
# block downloads and converts the tarball. Left byte-identical because the
# enclosing function header is outside this view.
for line in all_file.split('\n'): line = line.replace('\t', ' ') if '?' in line: no_and_question, rest = line.split('?') rest = rest.strip() rest = rest.split(' ') answer = rest[0].replace(',', ' ') rest = ' '.join(rest[1:]) line = '%s ?\t%s\t%s' % (no_and_question.strip(), answer.strip() + ' .', rest.strip()) else: line = ' .'.join(line.split('.')) f.write(line + '\n') f.flush() if __name__ == '__main__': delete_paths([TARBALL_LOCAL, OUTPUT_DIR]) execute_bash('rm -rf %s' % (NORMALIZED_OUTPUT_DIR, )) execute_bash('wget -O %s %s' % (TARBALL_LOCAL, TARBALL_URL)) execute_bash('mkdir %s' % (OUTPUT_DIR, )) execute_bash('tar -xz -f %s -C %s' % (TARBALL_LOCAL, THIS_DATA_DIR)) execute_bash('mv %s %s' % (OUTPUT_DIR, NORMALIZED_OUTPUT_DIR)) delete_paths([ TARBALL_LOCAL, ]) for f in findfiles(NORMALIZED_OUTPUT_DIR, "qa*.txt"): print("Converting %s to common QA format..." % (f, )) fix_babi_file(f)