    def download_model(self):
        # Validate contents of data_path folder:
        data_path = self.data_path
        download = False
        for file_name in self.file_name_dict.values():
            if not os.path.exists(os.path.join(data_path, file_name)):
                # prompt
                download = True
                print("The following required file is missing :", file_name)

        if download is True:
            if self.prompt is True:
                license_prompt('mrc_data',
                               'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/mrc'
                               '/mrc_data.zip',
                               self.data_dir)
                license_prompt('mrc_model',
                               'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/mrc'
                               '/mrc_model.zip',
                               self.model_dir)
            data_zipfile = os.path.join(self.data_dir, 'mrc_data.zip')
            model_zipfile = os.path.join(self.model_dir, 'mrc_model.zip')
            makedirs(self.data_dir, exist_ok=True)
            makedirs(self.model_dir, exist_ok=True)
            download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                     '/models/mrc/',
                                     'mrc_data.zip', data_zipfile)
            download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                     '/models/mrc/',
                                     'mrc_model.zip', model_zipfile)
            with zipfile.ZipFile(data_zipfile) as data_zip_ref:
                data_zip_ref.extractall(self.data_dir)
            with zipfile.ZipFile(model_zipfile) as model_zip_ref:
                model_zip_ref.extractall(self.model_dir)
Example #2
 def __init__(
     self,
     model="en",
     disable=None,
     display_prompt=True,
     n_jobs=8,
     batch_size=1500,
     spacy_doc=False,
     show_tok=True,
     show_doc=True,
     ptb_pos=False,
 ):
     if disable is None:
         disable = []
     try:
         self._parser = spacy.load(model, disable=disable)
     except OSError:
         url = "https://spacy.io/models"
         if display_prompt and license_prompt("Spacy {} model".format(model), url) is False:
             sys.exit(0)
         spacy_download(model)
         print("Spacy model installed, please rerun your command.")
         sys.exit(0)
     self.n_jobs = n_jobs
     self.batch_size = batch_size
     self.spacy_doc = spacy_doc
     self.show_tok = show_tok
     self.show_doc = show_doc
     self.ptb_pos = ptb_pos
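The try/except OSError pattern above (and in Examples #6-#8 below) can be factored into a small helper. This is only an illustrative sketch, not part of nlp-architect; the helper name load_spacy_with_prompt is made up:

import sys

import spacy
from spacy.cli import download as spacy_download

from nlp_architect.utils.generic import license_prompt


def load_spacy_with_prompt(model="en", disable=None, display_prompt=True):
    # Try to load the spaCy pipeline; if the model package is missing,
    # ask for confirmation via license_prompt before downloading it.
    disable = disable or []
    try:
        return spacy.load(model, disable=disable)
    except OSError:
        if display_prompt and not license_prompt("Spacy {} model".format(model),
                                                 "https://spacy.io/models"):
            sys.exit(0)
        spacy_download(model)
        return spacy.load(model, disable=disable)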
Example #3
 def __init__(self):
     try:
         nltk.data.find('corpora/wordnet')
     except LookupError:
         if license_prompt('WordNet data set', 'http://www.nltk.org/nltk_data/') is False:
             raise Exception("can't continue data prepare process "
                             "without downloading WordNet dataset")
         nltk.download('wordnet')
     self.wordnet = wn
Example #4
 def __init__(self):
     try:
         nltk.data.find('corpora/wordnet')
     except LookupError:
         if license_prompt('WordNet data set', 'http://www.nltk.org/nltk_data/') is False:
             raise Exception("can't continue data prepare process "
                             "without downloading WordNet dataset")
         nltk.download('wordnet')
     self.wordnet = wn
Example #5
    def download_model(self):
        # Validate contents of data_path folder:
        data_path = self.data_path
        download = False
        for file_name in self.file_name_dict.values():
            if not os.path.exists(os.path.join(data_path, file_name)):
                # prompt
                download = True
                print("The following required file is missing :", file_name)

        if download is True:
            if self.prompt is True:
                license_prompt(
                    "mrc_data",
                    "https://d2zs9tzlek599f.cloudfront.net/models/mrc"
                    "/mrc_data.zip",
                    self.data_dir,
                )
                license_prompt(
                    "mrc_model",
                    "https://d2zs9tzlek599f.cloudfront.net/models/mrc"
                    "/mrc_model.zip",
                    self.model_dir,
                )
            data_zipfile = os.path.join(self.data_dir, "mrc_data.zip")
            model_zipfile = os.path.join(self.model_dir, "mrc_model.zip")
            makedirs(self.data_dir, exist_ok=True)
            makedirs(self.model_dir, exist_ok=True)
            download_unlicensed_file(
                "https://d2zs9tzlek599f.cloudfront.net"
                "/models/mrc/",
                "mrc_data.zip",
                data_zipfile,
            )
            download_unlicensed_file(
                "https://d2zs9tzlek599f.cloudfront.net"
                "/models/mrc/",
                "mrc_model.zip",
                model_zipfile,
            )
            with zipfile.ZipFile(data_zipfile) as data_zip_ref:
                data_zip_ref.extractall(self.data_dir)
            with zipfile.ZipFile(model_zipfile) as model_zip_ref:
                model_zip_ref.extractall(self.model_dir)
Example #6
 def __init__(self, model="en", disable=None, display_prompt=True):
     if disable is None:
         disable = []
     try:
         self._parser = spacy.load(model, disable=disable)
     except OSError:
         url = "https://spacy.io/models"
         if display_prompt and license_prompt("Spacy {} model".format(model), url) is False:
             sys.exit(0)
         spacy_download(model)
         self._parser = spacy.load(model, disable=disable)
Example #7
 def __init__(self, model='en', disable=None):
     if disable is None:
         disable = []
     try:
         self._parser = spacy.load(model, disable=disable)
     except OSError:
         url = 'https://spacy.io/models'
         if license_prompt('Spacy {} model'.format(model), url) is False:
             sys.exit(0)
         spacy_download(model)
         self._parser = spacy.load(model, disable=disable)
Example #8
 def __init__(self, model='en', disable=None):
     if disable is None:
         disable = []
     try:
         self._parser = spacy.load(model, disable=disable)
     except OSError:
         url = 'https://spacy.io/models'
         if license_prompt('Spacy {} model'.format(model), url) is False:
             sys.exit(0)
         spacy_download(model)
         self._parser = spacy.load(model, disable=disable)
Example #9
 def _load_data():
     try:
         train_set = conll2000.chunked_sents('train.txt')
         test_set = conll2000.chunked_sents('test.txt')
     except Exception:
         if license_prompt('CONLL2000 data set', 'http://www.nltk.org/nltk_data/') is False:
             sys.exit(0)
         nltk.download('conll2000')
         train_set = conll2000.chunked_sents('train.txt')
         test_set = conll2000.chunked_sents('test.txt')
     train_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in train_set]
     test_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in test_set]
     return train_data, test_data
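Each element returned by _load_data above is a [tokens, POS tags, chunk tags] triple for one sentence. A quick inspection sketch, assuming _load_data is importable from the surrounding module:

train_data, test_data = _load_data()

# Every sentence is stored as three parallel tuples: words, POS tags, IOB chunk tags.
words, pos_tags, chunk_tags = train_data[0]
print(len(train_data), "training sentences,", len(test_data), "test sentences")
print(list(zip(words, pos_tags, chunk_tags))[:5])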
Example #10
 def __init__(self):
     try:
         nltk.data.find('corpora/brown')
     except LookupError:
         if license_prompt('brown data set', 'http://www.nltk.org/nltk_data/') is False:
             raise Exception("can't continue data prepare process "
                             "without downloading brown dataset")
         nltk.download('brown')
     self.bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(
         nltk.corpus.brown.words())
     self.bigram_messure = nltk.collocations.BigramAssocMeasures()
     self.likelihood_ration_dict = self.build_bigram_score_dict(
         self.bigram_messure.likelihood_ratio)
     self.chi_sq_dict = self.build_bigram_score_dict(self.bigram_messure.chi_sq)
     self.pmi_dict = self.build_bigram_score_dict(self.bigram_messure.pmi)
Example #11
def download_unzip(url: str, sourcefile: str, unzipped_path: str or PathLike,
                   license_msg: str = None):
    """Downloads a zip file, extracts it to destination, deletes the zip file. If license_msg is
    supplied, user is prompted for download confirmation."""
    dest_parent = Path(unzipped_path).parent

    if not os.path.exists(unzipped_path):
        if license_msg is None or license_prompt(license_msg, urlparse(url).netloc):
            zip_path = dest_parent / sourcefile
            makedirs(dest_parent, exist_ok=True)
            download_unlicensed_file(url, sourcefile, zip_path)
            print('Unzipping...')
            uncompress_file(zip_path, dest_parent)
            remove(zip_path)
    return unzipped_path
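A hedged usage sketch for download_unzip; the URL, file name and destination below are placeholders, not real nlp-architect endpoints:

# Fetch example.zip once, extract it next to data/example, and reuse it afterwards.
data_dir = download_unzip(
    url="https://example.com/datasets/",
    sourcefile="example.zip",
    unzipped_path="data/example",
    license_msg="Example data set",  # pass None to skip the confirmation prompt
)
print("Data available under", data_dir)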
Example #12
    def download_model(self):
        # Validate contents of data_path folder:
        data_path = self.data_path
        download = False
        for file_name in self.file_name_dict.values():
            if not os.path.exists(os.path.join(data_path, file_name)):
                # prompt
                download = True
                print("The following required file is missing :", file_name)

        if download is True:
            if self.prompt is True:
                license_prompt(
                    'mrc_data',
                    'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/mrc_data.zip',
                    self.data_dir)
                license_prompt(
                    'mrc_model',
                    'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/mrc_model.zip',
                    self.model_dir)
            data_zipfile = os.path.join(self.data_dir, 'mrc_data.zip')
            model_zipfile = os.path.join(self.model_dir, 'mrc_model.zip')
            makedirs(self.data_dir, exist_ok=True)
            makedirs(self.model_dir, exist_ok=True)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/',
                'mrc_data.zip', data_zipfile)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/',
                'mrc_model.zip', model_zipfile)
            with zipfile.ZipFile(data_zipfile) as data_zip_ref:
                data_zip_ref.extractall(self.data_dir)
            with zipfile.ZipFile(model_zipfile) as model_zip_ref:
                model_zip_ref.extractall(self.model_dir)
Example #13
 def __init__(self):
     try:
         nltk.data.find('corpora/brown')
     except LookupError:
         if license_prompt('brown data set',
                           'http://www.nltk.org/nltk_data/') is False:
             raise Exception("can't continue data prepare process "
                             "without downloading brown dataset")
         nltk.download('brown')
     self.bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(
         nltk.corpus.brown.words())
     self.bigram_messure = nltk.collocations.BigramAssocMeasures()
     self.likelihood_ration_dict = self.build_bigram_score_dict(
         self.bigram_messure.likelihood_ratio)
     self.chi_sq_dict = self.build_bigram_score_dict(
         self.bigram_messure.chi_sq)
     self.pmi_dict = self.build_bigram_score_dict(self.bigram_messure.pmi)
Example #14
 def _load_data():
     try:
         train_set = conll2000.chunked_sents('train.txt')
         test_set = conll2000.chunked_sents('test.txt')
     except Exception:
         if license_prompt('CONLL2000 data set',
                           'http://www.nltk.org/nltk_data/') is False:
             sys.exit(0)
         nltk.download('conll2000')
         train_set = conll2000.chunked_sents('train.txt')
         test_set = conll2000.chunked_sents('test.txt')
     train_data = [
         list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in train_set
     ]
     test_data = [
         list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in test_set
     ]
     return train_data, test_data
Example #15
    def load_data(self, path=".", subset='wiki_entities'):
        """
        Fetch the Facebook WikiMovies dataset and load it to memory.

        Arguments:
            path (str, optional): Local directory in which to cache the raw
                                  dataset.  Defaults to current directory.

        Returns:
            tuple: knowledge base, entity list, training and test files are returned
        """
        self.data_dict = {}
        self.vocab = None
        workdir, filepath = valid_path_append(path, '', self.filename)
        babi_dir_name = self.filename.split('.')[0]

        if subset == 'wiki-entities':
            subset_folder = 'wiki_entities'
        else:
            subset_folder = subset

        file_base = babi_dir_name + '/questions/' + subset_folder + '/' + subset + '_qa_{}.txt'
        train_file = os.path.join(workdir, file_base.format('train'))
        test_file = os.path.join(workdir, file_base.format('test'))

        entity_file_path = babi_dir_name + '/knowledge_source/entities.txt'
        entity_file = os.path.join(workdir, entity_file_path)

        # Check for the existence of the entity file
        # If it isn't there then we know we need to fetch everything
        if not os.path.exists(entity_file):
            if license_prompt('WikiMovies',
                              'https://research.fb.com/downloads/babi/',
                              self.path) is False:
                sys.exit(0)

            fetch_file(self.url, self.filename, filepath, self.size)

        knowledge_file_path = babi_dir_name + '/knowledge_source/' + subset_folder + '/' \
            + subset_folder + '_kb.txt'
        kb_file = os.path.join(workdir, knowledge_file_path)

        return entity_file, kb_file, train_file, test_file
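A hedged call sketch; dataset stands in for an instance of the class that defines load_data above (the variable name is illustrative):

# Paths are returned immediately; the archive is only fetched if entities.txt is missing.
entity_file, kb_file, train_file, test_file = dataset.load_data(
    path=".", subset="wiki_entities"
)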
Example #16
    def load_data(self, path=".", subset='wiki_entities'):
        """
        Fetch the Facebook WikiMovies dataset and load it to memory.

        Arguments:
            path (str, optional): Local directory in which to cache the raw
                                  dataset.  Defaults to current directory.

        Returns:
            tuple: knowledge base, entity list, training and test files are returned
        """
        self.data_dict = {}
        self.vocab = None
        workdir, filepath = valid_path_append(path, '', self.filename)
        babi_dir_name = self.filename.split('.')[0]

        if subset == 'wiki-entities':
            subset_folder = 'wiki_entities'
        else:
            subset_folder = subset

        file_base = babi_dir_name + '/questions/' + subset_folder + '/' + subset + '_qa_{}.txt'
        train_file = os.path.join(workdir, file_base.format('train'))
        test_file = os.path.join(workdir, file_base.format('test'))

        entity_file_path = babi_dir_name + '/knowledge_source/entities.txt'
        entity_file = os.path.join(workdir, entity_file_path)

        # Check for the existence of the entity file
        # If it isn't there then we know we need to fetch everything
        if not os.path.exists(entity_file):
            if license_prompt('WikiMovies',
                              'https://research.fb.com/downloads/babi/',
                              self.path) is False:
                sys.exit(0)

            fetch_file(self.url, self.filename, filepath, self.size)

        knowledge_file_path = babi_dir_name + '/knowledge_source/' + subset_folder + '/' \
            + subset_folder + '_kb.txt'
        kb_file = os.path.join(workdir, knowledge_file_path)

        return entity_file, kb_file, train_file, test_file
Example #17
 def _maybe_download(self):
     """
     Download filename from url unless it's already in directory
     """
     # 1. Check whether the file already exists; download it if it doesn't
     filename = "wiki." + self.language + ".vec"
     filepath = os.path.join(self.path, filename)
     link = "https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md"
     if not os.path.exists(filepath):
         if license_prompt(filepath, link, self.path):
             print("Downloading FastText embeddings for " + self.language +
                   " to " + filepath)
             urllib.request.urlretrieve(self.url, filepath)
             statinfo = os.stat(filepath)
             print("Sucessfully downloaded", filename, statinfo.st_size,
                   "bytes")
         else:
             exit()
     else:
         print("Found FastText embeddings for " + self.language + " at " +
               filepath)
     return filepath
Example #18
def get_eval_data(eval_path, src_lang, tgt_lang):
    """
    Downloads evaluation cross lingual dictionaries to the eval_path
    Arguments:
        eval_path: Path where cross-lingual dictionaries are downloaded
        src_lang : Source Language
        tgt_lang : Target Language
    Returns:
        Path to where cross lingual dictionaries are downloaded
    """
    eval_url = 'https://s3.amazonaws.com/arrival/dictionaries/'
    link = "https://github.com/facebookresearch/MUSE#ground-truth-bilingual-dictionaries"
    src_path = os.path.join(eval_path, '%s-%s.5000-6500.txt' % (src_lang, tgt_lang))
    filename = src_lang + '-' + tgt_lang + '.5000-6500.txt'
    if not os.path.exists(src_path):
        if license_prompt(src_path, link, src_path):
            os.system("mkdir -p " + eval_path)
            print("Downloading cross-lingual dictionaries for " + src_lang)
            fp, _ = urllib.request.urlretrieve(eval_url + filename, src_path)
            print("Completed downloading to " + eval_path)
        else:
            exit()
    return src_path
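A hedged usage sketch for get_eval_data; the directory and language codes are placeholders:

# Downloads the en-de ground-truth dictionary into eval_data/ if it is not already cached.
dict_path = get_eval_data(eval_path="eval_data", src_lang="en", tgt_lang="de")
with open(dict_path, encoding="utf-8") as f:
    print(f.readline().strip())  # one "source_word target_word" pair per line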
Example #19
    def load_data(self):
        """
        Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded.

        Returns:
            tuple: training and test filenames are returned
        """
        if self.task < 5:
            self.candidate_answer_filename = 'dialog-babi-candidates.txt'
            self.kb_filename = 'dialog-babi-kb-all.txt'
            self.cands_mat_filename = 'babi-cands-with-matchtype_{}.npy'
            self.vocab_filename = 'dialog-babi-vocab-task{}'.format(self.task + 1) +\
                                  '_matchtype{}.pkl'.format(self.use_match_type)
        else:
            self.candidate_answer_filename = 'dialog-babi-task6-dstc2-candidates.txt'
            self.kb_filename = 'dialog-babi-task6-dstc2-kb.txt'
            self.cands_mat_filename = 'dstc2-cands-with-matchtype_{}.npy'
            self.vocab_filename = 'dstc2-vocab-task{}_matchtype{}.pkl'.format(
                self.task + 1, self.use_match_type)

        self.vectorized_filename = 'vectorized_task{}.pkl'.format(self.task +
                                                                  1)

        self.data_dict = {}
        self.vocab = None
        self.workdir, filepath = valid_path_append(self.path, '',
                                                   self.filename)
        if not os.path.exists(filepath):
            if license_prompt('bAbI-dialog',
                              'https://research.fb.com/downloads/babi/',
                              self.path) is False:
                sys.exit(0)

            download_unlicensed_file(self.url, self.filename, filepath,
                                     self.size)

        self.babi_dir_name = self.filename.split('.')[0]

        self.candidate_answer_filename = self.babi_dir_name + \
            '/' + self.candidate_answer_filename
        self.kb_filename = self.babi_dir_name + '/' + self.kb_filename
        self.cands_mat_filename = os.path.join(
            self.workdir, self.babi_dir_name + '/' + self.cands_mat_filename)
        self.vocab_filename = self.babi_dir_name + '/' + self.vocab_filename
        self.vectorized_filename = self.babi_dir_name + '/' + self.vectorized_filename

        task_name = self.babi_dir_name + '/' + self.tasks[self.task] + '{}.txt'

        train_file = os.path.join(self.workdir, task_name.format('trn'))
        dev_file = os.path.join(self.workdir, task_name.format('dev'))
        test_file_postfix = 'tst-OOV' if self.oov else 'tst'
        test_file = os.path.join(self.workdir,
                                 task_name.format(test_file_postfix))

        cand_file = os.path.join(self.workdir, self.candidate_answer_filename)
        kb_file = os.path.join(self.workdir, self.kb_filename)
        vocab_file = os.path.join(self.workdir, self.vocab_filename)
        vectorized_file = os.path.join(self.workdir, self.vectorized_filename)

        if (os.path.exists(train_file) is False
                or os.path.exists(dev_file) is False
                or os.path.exists(test_file) is False
                or os.path.exists(cand_file) is False):
            with tarfile.open(filepath, 'r:gz') as f:
                f.extractall(self.workdir)

        return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
Example #20
    def load_data(self):
        """
        Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded.

        Returns:
            tuple: training and test filenames are returned
        """
        if self.task < 5:
            self.candidate_answer_filename = 'dialog-babi-candidates.txt'
            self.kb_filename = 'dialog-babi-kb-all.txt'
            self.cands_mat_filename = 'babi-cands-with-matchtype_{}.npy'
            self.vocab_filename = 'dialog-babi-vocab-task{}.pkl'.format(self.task + 1)
        else:
            self.candidate_answer_filename = 'dialog-babi-task6-dstc2-candidates.txt'
            self.kb_filename = 'dialog-babi-task6-dstc2-kb.txt'
            self.cands_mat_filename = 'dstc2-cands-with-matchtype_{}.npy'
            self.vocab_filename = 'dstc2-vocab-task{}.pkl'.format(self.task + 1)

        self.vectorized_filename = 'vectorized_task{}.pkl'.format(self.task + 1)

        self.data_dict = {}
        self.vocab = None
        self.workdir, filepath = valid_path_append(
            self.path, '', self.filename)
        if not os.path.exists(filepath):
            if license_prompt('bAbI-dialog',
                              'https://research.fb.com/downloads/babi/',
                              self.path) is False:
                sys.exit(0)

            fetch_file(self.url, self.filename, filepath, self.size)

        self.babi_dir_name = self.filename.split('.')[0]

        self.candidate_answer_filename = self.babi_dir_name + \
            '/' + self.candidate_answer_filename
        self.kb_filename = self.babi_dir_name + '/' + self.kb_filename
        self.cands_mat_filename = os.path.join(
            self.workdir, self.babi_dir_name + '/' + self.cands_mat_filename)
        self.vocab_filename = self.babi_dir_name + '/' + self.vocab_filename
        self.vectorized_filename = self.babi_dir_name + '/' + self.vectorized_filename

        task_name = self.babi_dir_name + '/' + self.tasks[self.task] + '{}.txt'

        train_file = os.path.join(self.workdir, task_name.format('trn'))
        dev_file = os.path.join(self.workdir, task_name.format('dev'))
        test_file_postfix = 'tst-OOV' if self.oov else 'tst'
        test_file = os.path.join(
            self.workdir,
            task_name.format(test_file_postfix))

        cand_file = os.path.join(self.workdir, self.candidate_answer_filename)
        kb_file = os.path.join(self.workdir, self.kb_filename)
        vocab_file = os.path.join(self.workdir, self.vocab_filename)
        vectorized_file = os.path.join(self.workdir, self.vectorized_filename)

        if (os.path.exists(train_file) is False
                or os.path.exists(dev_file) is False
                or os.path.exists(test_file) is False
                or os.path.exists(cand_file) is False):
            with tarfile.open(filepath, 'r:gz') as f:
                f.extractall(self.workdir)

        return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
Example #21
    def load_data(self):
        """
        Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded.

        Returns:
            tuple: training and test filenames are returned
        """
        if self.task < 5:
            self.candidate_answer_filename = "dialog-babi-candidates.txt"
            self.kb_filename = "dialog-babi-kb-all.txt"
            self.cands_mat_filename = "babi-cands-with-matchtype_{}.npy"
            self.vocab_filename = "dialog-babi-vocab-task{}".format(
                self.task + 1) + "_matchtype{}.pkl".format(self.use_match_type)
        else:
            self.candidate_answer_filename = "dialog-babi-task6-dstc2-candidates.txt"
            self.kb_filename = "dialog-babi-task6-dstc2-kb.txt"
            self.cands_mat_filename = "dstc2-cands-with-matchtype_{}.npy"
            self.vocab_filename = "dstc2-vocab-task{}_matchtype{}.pkl".format(
                self.task + 1, self.use_match_type)

        self.vectorized_filename = "vectorized_task{}.pkl".format(self.task +
                                                                  1)

        self.data_dict = {}
        self.vocab = None
        self.workdir, filepath = valid_path_append(self.path, "",
                                                   self.filename)
        if not os.path.exists(filepath):
            if (license_prompt("bAbI-dialog",
                               "https://research.fb.com/downloads/babi/",
                               self.path) is False):
                sys.exit(0)

            download_unlicensed_file(self.url, self.filename, filepath,
                                     self.size)

        self.babi_dir_name = self.filename.split(".")[0]

        self.candidate_answer_filename = self.babi_dir_name + "/" + self.candidate_answer_filename
        self.kb_filename = self.babi_dir_name + "/" + self.kb_filename
        self.cands_mat_filename = os.path.join(
            self.workdir, self.babi_dir_name + "/" + self.cands_mat_filename)
        self.vocab_filename = self.babi_dir_name + "/" + self.vocab_filename
        self.vectorized_filename = self.babi_dir_name + "/" + self.vectorized_filename

        task_name = self.babi_dir_name + "/" + self.tasks[self.task] + "{}.txt"

        train_file = os.path.join(self.workdir, task_name.format("trn"))
        dev_file = os.path.join(self.workdir, task_name.format("dev"))
        test_file_postfix = "tst-OOV" if self.oov else "tst"
        test_file = os.path.join(self.workdir,
                                 task_name.format(test_file_postfix))

        cand_file = os.path.join(self.workdir, self.candidate_answer_filename)
        kb_file = os.path.join(self.workdir, self.kb_filename)
        vocab_file = os.path.join(self.workdir, self.vocab_filename)
        vectorized_file = os.path.join(self.workdir, self.vectorized_filename)

        if (os.path.exists(train_file) is False
                or os.path.exists(dev_file) is False
                or os.path.exists(test_file) is False
                or os.path.exists(cand_file) is False):
            with tarfile.open(filepath, "r:gz") as f:
                f.extractall(self.workdir)

        return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
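A hedged call sketch for the bAbI-dialog load_data variants above; dialog_data stands in for an instance of the enclosing dataset class:

# load_data fetches and extracts the archive on first use, then returns the seven paths.
(train_file, dev_file, test_file, cand_file,
 kb_file, vocab_file, vectorized_file) = dialog_data.load_data()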
Example #22
# See the License for the specific language governing permissions and
# limitations under the License.
# ****************************************************************************
import re

import nltk
import numpy
from numpy import dot
from numpy.linalg import norm

from nlp_architect.utils.generic import license_prompt

try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    if license_prompt('Averaged Perceptron Tagger',
                      'http://www.nltk.org/nltk_data/') is False:
        raise Exception("can't continue data prepare process "
                        "without downloading averaged_perceptron_tagger")
    nltk.download('averaged_perceptron_tagger')

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    if license_prompt('Punkt model',
                      'http://www.nltk.org/nltk_data/') is False:
        raise Exception("can't continue data prepare process "
                        "without downloading punkt")
    nltk.download('punkt')


def extract_features_envelope(target_word, definition, hyps_vec, model_w2v):
Example #23
# See the License for the specific language governing permissions and
# limitations under the License.
# ****************************************************************************
import re

import nltk
import numpy
from numpy import dot
from numpy.linalg import norm

from nlp_architect.utils.generic import license_prompt

try:
    nltk.data.find("taggers/averaged_perceptron_tagger")
except LookupError:
    if license_prompt("Averaged Perceptron Tagger",
                      "http://www.nltk.org/nltk_data/") is False:
        raise Exception("can't continue data prepare process "
                        "without downloading averaged_perceptron_tagger")
    nltk.download("averaged_perceptron_tagger")

try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    if license_prompt("Punkt model",
                      "http://www.nltk.org/nltk_data/") is False:
        raise Exception("can't continue data prepare process "
                        "without downloading punkt")
    nltk.download("punkt")


def extract_features_envelope(target_word, definition, hyps_vec, model_w2v):
Example #24
# See the License for the specific language governing permissions and
# limitations under the License.
# ****************************************************************************
import re

import nltk
import numpy
from numpy import dot
from numpy.linalg import norm

from nlp_architect.utils.generic import license_prompt

try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    if license_prompt('Averaged Perceptron Tagger', 'http://www.nltk.org/nltk_data/') is False:
        raise Exception("can't continue data prepare process "
                        "without downloading averaged_perceptron_tagger")
    nltk.download('averaged_perceptron_tagger')

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    if license_prompt('Punkt model', 'http://www.nltk.org/nltk_data/') is False:
        raise Exception("can't continue data prepare process "
                        "without downloading punkt")
    nltk.download('punkt')

# -------------------------------------------------------------------------------------#
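The module-level blocks in Examples #22-#24 repeat the same check-prompt-download pattern; a minimal sketch that factors it into a helper (ensure_nltk_resource is illustrative, not an nlp-architect function):

import nltk

from nlp_architect.utils.generic import license_prompt


def ensure_nltk_resource(find_path, package, display_name):
    # Look for an installed NLTK resource and prompt before downloading it if missing.
    try:
        nltk.data.find(find_path)
    except LookupError:
        if license_prompt(display_name, 'http://www.nltk.org/nltk_data/') is False:
            raise Exception("can't continue data prepare process "
                            "without downloading " + package)
        nltk.download(package)


ensure_nltk_resource('taggers/averaged_perceptron_tagger',
                     'averaged_perceptron_tagger', 'Averaged Perceptron Tagger')
ensure_nltk_resource('tokenizers/punkt', 'punkt', 'Punkt model')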