def download_model(self):
    # Validate contents of data_path folder:
    data_path = self.data_path
    download = False
    for file_name in self.file_name_dict.values():
        if not os.path.exists(os.path.join(data_path, file_name)):
            # prompt
            download = True
            print("The following required file is missing:", file_name)

    if download is True:
        if self.prompt is True:
            license_prompt('mrc_data',
                           'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/mrc'
                           '/mrc_data.zip',
                           self.data_dir)
            license_prompt('mrc_model',
                           'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/mrc'
                           '/mrc_model.zip',
                           self.model_dir)
        data_zipfile = os.path.join(self.data_dir, 'mrc_data.zip')
        model_zipfile = os.path.join(self.model_dir, 'mrc_model.zip')
        makedirs(self.data_dir, exist_ok=True)
        makedirs(self.model_dir, exist_ok=True)
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                 '/models/mrc/', 'mrc_data.zip', data_zipfile)
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                 '/models/mrc/', 'mrc_model.zip', model_zipfile)
        with zipfile.ZipFile(data_zipfile) as data_zip_ref:
            data_zip_ref.extractall(self.data_dir)
        with zipfile.ZipFile(model_zipfile) as model_zip_ref:
            model_zip_ref.extractall(self.model_dir)
def __init__(
    self,
    model="en",
    disable=None,
    display_prompt=True,
    n_jobs=8,
    batch_size=1500,
    spacy_doc=False,
    show_tok=True,
    show_doc=True,
    ptb_pos=False,
):
    if disable is None:
        disable = []
    try:
        self._parser = spacy.load(model, disable=disable)
    except OSError:
        url = "https://spacy.io/models"
        if display_prompt and license_prompt("Spacy {} model".format(model), url) is False:
            sys.exit(0)
        spacy_download(model)
        print("Spacy model installed, please rerun your command.")
        sys.exit(0)
    self.n_jobs = n_jobs
    self.batch_size = batch_size
    self.spacy_doc = spacy_doc
    self.show_tok = show_tok
    self.show_doc = show_doc
    self.ptb_pos = ptb_pos
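For orientation, here is a minimal usage sketch of the constructor above. The wrapper class name (SpacyInstance) and its import path are assumptions made for illustration; the prompt-and-download behavior on a missing model comes directly from the snippet.

from nlp_architect.utils.text import SpacyInstance  # assumed class name and import path

# Build a tokenizer-only pipeline. If the "en" model is not installed, __init__
# prompts the user via license_prompt, downloads it with spacy_download, then asks
# for a rerun of the command.
nlp = SpacyInstance(model="en", disable=["tagger", "parser", "ner"], display_prompt=True)
doc = nlp._parser("NLP Architect wraps spaCy.")  # _parser is the spaCy pipeline set in __init__
print([token.text for token in doc])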
def __init__(self):
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        if license_prompt('WordNet data set', 'http://www.nltk.org/nltk_data/') is False:
            raise Exception("can't continue data prepare process "
                            "without downloading WordNet dataset")
        nltk.download('wordnet')
    self.wordnet = wn
def download_model(self):
    # Validate contents of data_path folder:
    data_path = self.data_path
    download = False
    for file_name in self.file_name_dict.values():
        if not os.path.exists(os.path.join(data_path, file_name)):
            # prompt
            download = True
            print("The following required file is missing:", file_name)

    if download is True:
        if self.prompt is True:
            license_prompt(
                "mrc_data",
                "https://d2zs9tzlek599f.cloudfront.net/models/mrc"
                "/mrc_data.zip",
                self.data_dir,
            )
            license_prompt(
                "mrc_model",
                "https://d2zs9tzlek599f.cloudfront.net/models/mrc"
                "/mrc_model.zip",
                self.model_dir,
            )
        data_zipfile = os.path.join(self.data_dir, "mrc_data.zip")
        model_zipfile = os.path.join(self.model_dir, "mrc_model.zip")
        makedirs(self.data_dir, exist_ok=True)
        makedirs(self.model_dir, exist_ok=True)
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net"
            "/models/mrc/",
            "mrc_data.zip",
            data_zipfile,
        )
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net"
            "/models/mrc/",
            "mrc_model.zip",
            model_zipfile,
        )
        with zipfile.ZipFile(data_zipfile) as data_zip_ref:
            data_zip_ref.extractall(self.data_dir)
        with zipfile.ZipFile(model_zipfile) as model_zip_ref:
            model_zip_ref.extractall(self.model_dir)
def __init__(self, model="en", disable=None, display_prompt=True): if disable is None: disable = [] try: self._parser = spacy.load(model, disable=disable) except OSError: url = "https://spacy.io/models" if display_prompt and license_prompt("Spacy {} model".format(model), url) is False: sys.exit(0) spacy_download(model) self._parser = spacy.load(model, disable=disable)
def __init__(self, model='en', disable=None):
    if disable is None:
        disable = []
    try:
        self._parser = spacy.load(model, disable=disable)
    except OSError:
        url = 'https://spacy.io/models'
        if license_prompt('Spacy {} model'.format(model), url) is False:
            sys.exit(0)
        spacy_download(model)
        self._parser = spacy.load(model, disable=disable)
def _load_data():
    try:
        train_set = conll2000.chunked_sents('train.txt')
        test_set = conll2000.chunked_sents('test.txt')
    except Exception:
        if license_prompt('CONLL2000 data set', 'http://www.nltk.org/nltk_data/') is False:
            sys.exit(0)
        nltk.download('conll2000')
        train_set = conll2000.chunked_sents('train.txt')
        test_set = conll2000.chunked_sents('test.txt')
    train_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in train_set]
    test_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in test_set]
    return train_data, test_data
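Each element returned by _load_data above is a triple of aligned tuples (words, POS tags, IOB chunk tags), because tree2conlltags yields (word, pos, chunk) triples that zip(*...) then transposes. A small sketch, assuming the conll2000 corpus is already installed; the values in the comments are illustrative.

import nltk
from nltk.corpus import conll2000

sent = conll2000.chunked_sents('train.txt')[0]
words, pos_tags, chunk_tags = list(zip(*nltk.chunk.tree2conlltags(sent)))
print(words[:3])       # e.g. ('Confidence', 'in', 'the')
print(pos_tags[:3])    # e.g. ('NN', 'IN', 'DT')
print(chunk_tags[:3])  # e.g. ('B-NP', 'B-PP', 'B-NP')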
def __init__(self):
    try:
        nltk.data.find('corpora/brown')
    except LookupError:
        if license_prompt('brown data set', 'http://www.nltk.org/nltk_data/') is False:
            raise Exception("can't continue data prepare process "
                            "without downloading brown dataset")
        nltk.download('brown')
    self.bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(
        nltk.corpus.brown.words())
    self.bigram_messure = nltk.collocations.BigramAssocMeasures()
    self.likelihood_ration_dict = self.build_bigram_score_dict(
        self.bigram_messure.likelihood_ratio)
    self.chi_sq_dict = self.build_bigram_score_dict(self.bigram_messure.chi_sq)
    self.pmi_dict = self.build_bigram_score_dict(self.bigram_messure.pmi)
def download_unzip(url: str, sourcefile: str, unzipped_path: str or PathLike,
                   license_msg: str = None):
    """Downloads a zip file, extracts it to destination, deletes the zip file.

    If license_msg is supplied, user is prompted for download confirmation."""
    dest_parent = Path(unzipped_path).parent
    if not os.path.exists(unzipped_path):
        if license_msg is None or license_prompt(license_msg, urlparse(url).netloc):
            zip_path = dest_parent / sourcefile
            makedirs(dest_parent, exist_ok=True)
            download_unlicensed_file(url, sourcefile, zip_path)
            print('Unzipping...')
            uncompress_file(zip_path, dest_parent)
            remove(zip_path)
    return unzipped_path
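A hedged usage sketch for download_unzip; the URL, archive name, and target directory below are placeholders, not real resources.

# Hypothetical call: download an archive, extract it next to the target path, and
# delete the zip. Passing license_msg triggers a license_prompt before downloading.
data_dir = download_unzip(url='https://example.com/datasets/',      # placeholder URL
                          sourcefile='sample_corpus.zip',            # placeholder archive
                          unzipped_path='cache/sample_corpus',
                          license_msg='Sample corpus')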
def download_model(self):
    # Validate contents of data_path folder:
    data_path = self.data_path
    download = False
    for file_name in self.file_name_dict.values():
        if not os.path.exists(os.path.join(data_path, file_name)):
            # prompt
            download = True
            print("The following required file is missing:", file_name)

    if download is True:
        if self.prompt is True:
            license_prompt(
                'mrc_data', 'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/mrc_data.zip',
                self.data_dir)
            license_prompt(
                'mrc_model', 'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/mrc_model.zip',
                self.model_dir)
        data_zipfile = os.path.join(self.data_dir, 'mrc_data.zip')
        model_zipfile = os.path.join(self.model_dir, 'mrc_model.zip')
        makedirs(self.data_dir, exist_ok=True)
        makedirs(self.model_dir, exist_ok=True)
        download_unlicensed_file(
            'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/', 'mrc_data.zip', data_zipfile)
        download_unlicensed_file(
            'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/', 'mrc_model.zip', model_zipfile)
        data_zip_ref = zipfile.ZipFile(data_zipfile, 'r')
        data_zip_ref.extractall(self.data_dir)
        data_zip_ref.close()
        model_zip_ref = zipfile.ZipFile(model_zipfile, 'r')
        model_zip_ref.extractall(self.model_dir)
        model_zip_ref.close()
def __init__(self):
    try:
        nltk.data.find('corpora/brown')
    except LookupError:
        if license_prompt('brown data set', 'http://www.nltk.org/nltk_data/') is False:
            raise Exception("can't continue data prepare process "
                            "without downloading brown dataset")
        nltk.download('brown')
    self.bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(
        nltk.corpus.brown.words())
    self.bigram_messure = nltk.collocations.BigramAssocMeasures()
    self.likelihood_ration_dict = self.build_bigram_score_dict(
        self.bigram_messure.likelihood_ratio)
    self.chi_sq_dict = self.build_bigram_score_dict(
        self.bigram_messure.chi_sq)
    self.pmi_dict = self.build_bigram_score_dict(self.bigram_messure.pmi)
def _load_data():
    try:
        train_set = conll2000.chunked_sents('train.txt')
        test_set = conll2000.chunked_sents('test.txt')
    except Exception:
        if license_prompt('CONLL2000 data set', 'http://www.nltk.org/nltk_data/') is False:
            sys.exit(0)
        nltk.download('conll2000')
        train_set = conll2000.chunked_sents('train.txt')
        test_set = conll2000.chunked_sents('test.txt')
    train_data = [
        list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in train_set
    ]
    test_data = [
        list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in test_set
    ]
    return train_data, test_data
def load_data(self, path=".", subset='wiki_entities'): """ Fetch the Facebook WikiMovies dataset and load it to memory. Arguments: path (str, optional): Local directory in which to cache the raw dataset. Defaults to current directory. Returns: tuple: knowledge base, entity list, training and test files are returned """ self.data_dict = {} self.vocab = None workdir, filepath = valid_path_append(path, '', self.filename) babi_dir_name = self.filename.split('.')[0] if subset == 'wiki-entities': subset_folder = 'wiki_entities' else: subset_folder = subset file_base = babi_dir_name + '/questions/' + subset_folder + '/' + subset + '_qa_{}.txt' train_file = os.path.join(workdir, file_base.format('train')) test_file = os.path.join(workdir, file_base.format('test')) entity_file_path = babi_dir_name + '/knowledge_source/entities.txt' entity_file = os.path.join(workdir, entity_file_path) # Check for the existence of the entity file # If it isn't there then we know we need to fetch everything if not os.path.exists(entity_file): if license_prompt('WikiMovies', 'https://research.fb.com/downloads/babi/', self.path) is False: sys.exit(0) fetch_file(self.url, self.filename, filepath, self.size) knowledge_file_path = babi_dir_name + '/knowledge_source/' + subset_folder + '/' \ + subset_folder + '_kb.txt' kb_file = os.path.join(workdir, knowledge_file_path) return entity_file, kb_file, train_file, test_file
def _maybe_download(self):
    """
    Download filename from url unless it's already in directory
    """
    # 1. Check if the file doesn't exist. Download and extract if it doesn't.
    filename = "wiki." + self.language + ".vec"
    filepath = os.path.join(self.path, filename)
    link = "https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md"
    if not os.path.exists(filepath):
        if license_prompt(filepath, link, self.path):
            print("Downloading FastText embeddings for " + self.language + " to " + filepath)
            urllib.request.urlretrieve(self.url, filepath)
            statinfo = os.stat(filepath)
            print("Successfully downloaded", filename, statinfo.st_size, "bytes")
        else:
            exit()
    else:
        print("Found FastText embeddings for " + self.language + " at " + filepath)
    return filepath
def get_eval_data(eval_path, src_lang, tgt_lang):
    """
    Downloads evaluation cross lingual dictionaries to the eval_path

    Arguments:
        eval_path: Path where cross-lingual dictionaries are downloaded
        src_lang: Source language
        tgt_lang: Target language

    Returns:
        Path to where cross lingual dictionaries are downloaded
    """
    eval_url = 'https://s3.amazonaws.com/arrival/dictionaries/'
    link = "https://github.com/facebookresearch/MUSE#ground-truth-bilingual-dictionaries"
    src_path = os.path.join(eval_path, '%s-%s.5000-6500.txt' % (src_lang, tgt_lang))
    filename = src_lang + '-' + tgt_lang + '.5000-6500.txt'
    if not os.path.exists(src_path):
        if license_prompt(src_path, link, src_path):
            os.system("mkdir -p " + eval_path)
            print("Downloading cross-lingual dictionaries for " + src_lang)
            fp, _ = urllib.request.urlretrieve(eval_url + filename, src_path)
            print("Completed downloading to " + eval_path)
        else:
            exit()
    return src_path
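For illustration, get_eval_data might be called as below; the directory name is a placeholder.

# Download (after a license_prompt) the MUSE en-es ground-truth dictionary.
dict_path = get_eval_data('./eval_dictionaries', 'en', 'es')
# dict_path == './eval_dictionaries/en-es.5000-6500.txt'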
def load_data(self):
    """
    Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded.

    Returns:
        tuple: training and test filenames are returned
    """
    if self.task < 5:
        self.candidate_answer_filename = 'dialog-babi-candidates.txt'
        self.kb_filename = 'dialog-babi-kb-all.txt'
        self.cands_mat_filename = 'babi-cands-with-matchtype_{}.npy'
        self.vocab_filename = 'dialog-babi-vocab-task{}'.format(self.task + 1) + \
            '_matchtype{}.pkl'.format(self.use_match_type)
    else:
        self.candidate_answer_filename = 'dialog-babi-task6-dstc2-candidates.txt'
        self.kb_filename = 'dialog-babi-task6-dstc2-kb.txt'
        self.cands_mat_filename = 'dstc2-cands-with-matchtype_{}.npy'
        self.vocab_filename = 'dstc2-vocab-task{}_matchtype{}.pkl'.format(
            self.task + 1, self.use_match_type)

    self.vectorized_filename = 'vectorized_task{}.pkl'.format(self.task + 1)

    self.data_dict = {}
    self.vocab = None
    self.workdir, filepath = valid_path_append(self.path, '', self.filename)

    if not os.path.exists(filepath):
        if license_prompt('bAbI-dialog',
                          'https://research.fb.com/downloads/babi/',
                          self.path) is False:
            sys.exit(0)

        download_unlicensed_file(self.url, self.filename, filepath, self.size)

    self.babi_dir_name = self.filename.split('.')[0]

    self.candidate_answer_filename = self.babi_dir_name + \
        '/' + self.candidate_answer_filename
    self.kb_filename = self.babi_dir_name + '/' + self.kb_filename
    self.cands_mat_filename = os.path.join(
        self.workdir, self.babi_dir_name + '/' + self.cands_mat_filename)
    self.vocab_filename = self.babi_dir_name + '/' + self.vocab_filename
    self.vectorized_filename = self.babi_dir_name + '/' + self.vectorized_filename

    task_name = self.babi_dir_name + '/' + self.tasks[self.task] + '{}.txt'

    train_file = os.path.join(self.workdir, task_name.format('trn'))
    dev_file = os.path.join(self.workdir, task_name.format('dev'))

    test_file_postfix = 'tst-OOV' if self.oov else 'tst'
    test_file = os.path.join(self.workdir, task_name.format(test_file_postfix))

    cand_file = os.path.join(self.workdir, self.candidate_answer_filename)
    kb_file = os.path.join(self.workdir, self.kb_filename)
    vocab_file = os.path.join(self.workdir, self.vocab_filename)
    vectorized_file = os.path.join(self.workdir, self.vectorized_filename)

    if (os.path.exists(train_file) is False
            or os.path.exists(dev_file) is False
            or os.path.exists(test_file) is False
            or os.path.exists(cand_file) is False):
        with tarfile.open(filepath, 'r:gz') as f:
            f.extractall(self.workdir)

    return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
def load_data(self):
    """
    Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded.

    Returns:
        tuple: training and test filenames are returned
    """
    if self.task < 5:
        self.candidate_answer_filename = 'dialog-babi-candidates.txt'
        self.kb_filename = 'dialog-babi-kb-all.txt'
        self.cands_mat_filename = 'babi-cands-with-matchtype_{}.npy'
        self.vocab_filename = 'dialog-babi-vocab-task{}.pkl'.format(self.task + 1)
    else:
        self.candidate_answer_filename = 'dialog-babi-task6-dstc2-candidates.txt'
        self.kb_filename = 'dialog-babi-task6-dstc2-kb.txt'
        self.cands_mat_filename = 'dstc2-cands-with-matchtype_{}.npy'
        self.vocab_filename = 'dstc2-vocab-task{}.pkl'.format(self.task + 1)

    self.vectorized_filename = 'vectorized_task{}.pkl'.format(self.task + 1)

    self.data_dict = {}
    self.vocab = None
    self.workdir, filepath = valid_path_append(
        self.path, '', self.filename)

    if not os.path.exists(filepath):
        if license_prompt('bAbI-dialog',
                          'https://research.fb.com/downloads/babi/',
                          self.path) is False:
            sys.exit(0)

        fetch_file(self.url, self.filename, filepath, self.size)

    self.babi_dir_name = self.filename.split('.')[0]

    self.candidate_answer_filename = self.babi_dir_name + \
        '/' + self.candidate_answer_filename
    self.kb_filename = self.babi_dir_name + '/' + self.kb_filename
    self.cands_mat_filename = os.path.join(
        self.workdir, self.babi_dir_name + '/' + self.cands_mat_filename)
    self.vocab_filename = self.babi_dir_name + '/' + self.vocab_filename
    self.vectorized_filename = self.babi_dir_name + '/' + self.vectorized_filename

    task_name = self.babi_dir_name + '/' + self.tasks[self.task] + '{}.txt'

    train_file = os.path.join(self.workdir, task_name.format('trn'))
    dev_file = os.path.join(self.workdir, task_name.format('dev'))

    test_file_postfix = 'tst-OOV' if self.oov else 'tst'
    test_file = os.path.join(
        self.workdir, task_name.format(test_file_postfix))

    cand_file = os.path.join(self.workdir, self.candidate_answer_filename)
    kb_file = os.path.join(self.workdir, self.kb_filename)
    vocab_file = os.path.join(self.workdir, self.vocab_filename)
    vectorized_file = os.path.join(self.workdir, self.vectorized_filename)

    if (os.path.exists(train_file) is False
            or os.path.exists(dev_file) is False
            or os.path.exists(test_file) is False
            or os.path.exists(cand_file) is False):
        with tarfile.open(filepath, 'r:gz') as f:
            f.extractall(self.workdir)

    return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
def load_data(self):
    """
    Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded.

    Returns:
        tuple: training and test filenames are returned
    """
    if self.task < 5:
        self.candidate_answer_filename = "dialog-babi-candidates.txt"
        self.kb_filename = "dialog-babi-kb-all.txt"
        self.cands_mat_filename = "babi-cands-with-matchtype_{}.npy"
        self.vocab_filename = "dialog-babi-vocab-task{}".format(
            self.task + 1) + "_matchtype{}.pkl".format(self.use_match_type)
    else:
        self.candidate_answer_filename = "dialog-babi-task6-dstc2-candidates.txt"
        self.kb_filename = "dialog-babi-task6-dstc2-kb.txt"
        self.cands_mat_filename = "dstc2-cands-with-matchtype_{}.npy"
        self.vocab_filename = "dstc2-vocab-task{}_matchtype{}.pkl".format(
            self.task + 1, self.use_match_type)

    self.vectorized_filename = "vectorized_task{}.pkl".format(self.task + 1)

    self.data_dict = {}
    self.vocab = None
    self.workdir, filepath = valid_path_append(self.path, "", self.filename)

    if not os.path.exists(filepath):
        if (license_prompt("bAbI-dialog",
                           "https://research.fb.com/downloads/babi/",
                           self.path) is False):
            sys.exit(0)

        download_unlicensed_file(self.url, self.filename, filepath, self.size)

    self.babi_dir_name = self.filename.split(".")[0]

    self.candidate_answer_filename = self.babi_dir_name + "/" + self.candidate_answer_filename
    self.kb_filename = self.babi_dir_name + "/" + self.kb_filename
    self.cands_mat_filename = os.path.join(
        self.workdir, self.babi_dir_name + "/" + self.cands_mat_filename)
    self.vocab_filename = self.babi_dir_name + "/" + self.vocab_filename
    self.vectorized_filename = self.babi_dir_name + "/" + self.vectorized_filename

    task_name = self.babi_dir_name + "/" + self.tasks[self.task] + "{}.txt"

    train_file = os.path.join(self.workdir, task_name.format("trn"))
    dev_file = os.path.join(self.workdir, task_name.format("dev"))

    test_file_postfix = "tst-OOV" if self.oov else "tst"
    test_file = os.path.join(self.workdir, task_name.format(test_file_postfix))

    cand_file = os.path.join(self.workdir, self.candidate_answer_filename)
    kb_file = os.path.join(self.workdir, self.kb_filename)
    vocab_file = os.path.join(self.workdir, self.vocab_filename)
    vectorized_file = os.path.join(self.workdir, self.vectorized_filename)

    if (os.path.exists(train_file) is False
            or os.path.exists(dev_file) is False
            or os.path.exists(test_file) is False
            or os.path.exists(cand_file) is False):
        with tarfile.open(filepath, "r:gz") as f:
            f.extractall(self.workdir)

    return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
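A usage sketch for the load_data variants above. The class name BABI_Dialog and its constructor arguments are assumptions inferred from the attributes the method reads (path, task, oov, use_match_type); only the returned tuple is taken directly from the code.

# Hypothetical: build the dataset wrapper for task 1 and resolve all file paths,
# downloading and extracting the bAbI-dialog archive on first use.
babi = BABI_Dialog(path='data/', task=0, oov=False, use_match_type=False)  # assumed class/ctor
(train_file, dev_file, test_file, cand_file,
 kb_file, vocab_file, vectorized_file) = babi.load_data()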
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import re

import nltk
import numpy
from numpy import dot
from numpy.linalg import norm

from nlp_architect.utils.generic import license_prompt

try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    if license_prompt('Averaged Perceptron Tagger', 'http://www.nltk.org/nltk_data/') is False:
        raise Exception("can't continue data prepare process "
                        "without downloading averaged_perceptron_tagger")
    nltk.download('averaged_perceptron_tagger')

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    if license_prompt('Punkt model', 'http://www.nltk.org/nltk_data/') is False:
        raise Exception("can't continue data prepare process "
                        "without downloading punkt")
    nltk.download('punkt')


def extract_features_envelope(target_word, definition, hyps_vec, model_w2v):
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import re

import nltk
import numpy
from numpy import dot
from numpy.linalg import norm

from nlp_architect.utils.generic import license_prompt

try:
    nltk.data.find("taggers/averaged_perceptron_tagger")
except LookupError:
    if license_prompt("Averaged Perceptron Tagger", "http://www.nltk.org/nltk_data/") is False:
        raise Exception("can't continue data prepare process "
                        "without downloading averaged_perceptron_tagger")
    nltk.download("averaged_perceptron_tagger")

try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    if license_prompt("Punkt model", "http://www.nltk.org/nltk_data/") is False:
        raise Exception("can't continue data prepare process "
                        "without downloading punkt")
    nltk.download("punkt")


def extract_features_envelope(target_word, definition, hyps_vec, model_w2v):
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import re

import nltk
import numpy
from numpy import dot
from numpy.linalg import norm

from nlp_architect.utils.generic import license_prompt

try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    if license_prompt('Averaged Perceptron Tagger', 'http://www.nltk.org/nltk_data/') is False:
        raise Exception("can't continue data prepare process "
                        "without downloading averaged_perceptron_tagger")
    nltk.download('averaged_perceptron_tagger')

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    if license_prompt('Punkt model', 'http://www.nltk.org/nltk_data/') is False:
        raise Exception("can't continue data prepare process "
                        "without downloading punkt")
    nltk.download('punkt')

# -------------------------------------------------------------------------------------#
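The two guards above repeat the same check-prompt-download pattern; below is a small hypothetical helper that factors it out. The function name and its placement are assumptions for illustration, not part of the source.

def ensure_nltk_resource(lookup_path, package, display_name):
    """Find an NLTK resource; prompt via license_prompt and download it if missing."""
    try:
        nltk.data.find(lookup_path)
    except LookupError:
        if license_prompt(display_name, 'http://www.nltk.org/nltk_data/') is False:
            raise Exception("can't continue data prepare process "
                            "without downloading " + package)
        nltk.download(package)


ensure_nltk_resource('taggers/averaged_perceptron_tagger',
                     'averaged_perceptron_tagger', 'Averaged Perceptron Tagger')
ensure_nltk_resource('tokenizers/punkt', 'punkt', 'Punkt model')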