Example #1
def compute_emb(pages_path_in, pages_path_out, vocab):

    wemb = pkl.load(open(prm.wordemb_path, 'rb'))
    dim_emb = wemb[wemb.keys()[0]].shape[0]
    W = 0.01 * np.random.randn(len(vocab), dim_emb).astype(np.float32)
    for word, pos in vocab.items():
        if word in wemb:
            W[pos,:] = wemb[word]

    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    if os.path.exists(pages_path_out):
        os.remove(pages_path_out)

    # Save to HDF5
    fout = h5py.File(pages_path_out,'a')

    if prm.att_doc:
        shape = (f['text'].shape[0],prm.max_segs_doc,prm.dim_emb)
    else:
        shape=(f['text'].shape[0],prm.dim_emb)

    embs = fout.create_dataset('emb', shape=shape, dtype=np.float32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            if prm.att_segment_type == 'section':
                segs = ['']
                for line in text.split('\n'):
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line + '\n'
            elif prm.att_segment_type == 'sentence':
                segs = tokenizer.tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter.')

            segs = segs[:prm.max_segs_doc]
            emb_ = utils.Word2Vec_encode(segs, wemb)
            embs[i,:len(emb_),:] = emb_
            mask[i] = len(emb_)
        else:
            bow0, bow1 = utils.BOW(wordpunct_tokenize(text.lower()), vocab)
            emb = (W[bow0] * bow1[:,None]).sum(0)
            embs[i,:] = emb
        i += 1
        #if i > 3000:
        #    break

        print 'processing article', i, 'time', time.time()-st

    f.close()
    fout.close()
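
A toy illustration of the '== section ==' splitting step above (the input string is invented for illustration):

# Same splitting logic as in compute_emb, applied to a made-up two-section text.
text = "intro line\n== History ==\nfirst\n== Usage ==\nsecond\n"
segs = ['']
for line in text.split('\n'):
    if line.strip().startswith('==') and line.strip().endswith('=='):
        segs.append('')
    segs[-1] += line + '\n'
print(segs)  # ['intro line\n', '== History ==\nfirst\n', '== Usage ==\nsecond\n\n']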
Example #2
    def _build_wordset(clazz, obscurity_limit):
        # I'm sorry this method is so disgusting.
        # It's all in the cause of fast loading in the main case.

        from nltk import FreqDist

        # Ensure corpora are loaded.
        try:
            from nltk.corpus import cmudict
            cmudict.entries()
        except LookupError:
            print "CMUDict corpus not found. Downloading..."
            from nltk import download
            download('cmudict')
            print "[Done]"
        if obscurity_limit is not None:
            from nltk.corpus import brown
            try:
                brown.words()
            except LookupError:
                print "Brown corpus not found. Downloading...",
                from nltk import download
                download('brown')
                print "[Done]"

        words = cmudict.entries()
        if obscurity_limit is not None:
            freqs = FreqDist([w.lower() for w in brown.words()])
            words = sorted(words,
                           key=lambda x: freqs[x[0].lower()],
                           reverse=True)
            return words[:obscurity_limit]
        else:
            return list(words)
Example #3
File: wordnet.py  Project: ooda/vwordnet
def _download_nltk_data():
    """Install corpus data.
    """
    for directory, data in nltk_data.iteritems():
        for datum in data:
            if not exists(join(NLTK_DATA_DIR, directory, datum)):
                nltk.download(datum, download_dir=NLTK_DATA_DIR)
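
The NLTK_DATA_DIR constant and the nltk_data mapping are defined elsewhere in that project; a hypothetical illustration of their shape (values assumed, not taken from the source file):

# Hypothetical module-level configuration consumed by _download_nltk_data above.
NLTK_DATA_DIR = '/usr/share/nltk_data'       # assumed install location
nltk_data = {
    'corpora': ['wordnet', 'stopwords'],     # directory -> packages to fetch
    'tokenizers': ['punkt'],
}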
Example #4
    def handle(self, *args, **options):
        if args is None or len(args) < 2:
            pages = Page.objects.all()
            for page in pages:
                self._log.info("Page #%s: %s" % (page.id, page.fb_page_name))
            raise CommandError('Invalid arguments. Expected: <page_id> <action>, where action might be: extract, tfidf, webidf')


        page_id = args[0]
        action = args[1]

        if page_id == 'setup':
            self._log.info("invoking nltk download")
            nltk.download()
            exit()

        self._log.info('AnalyticsCommand initializing.')

        self._log.info('Page-Id: %s' % page_id)
        page = Page.objects.get(id=page_id)

        if action == "extract":
            self.processPageExtract(page)
        elif action == "tfidf":
            self.processTfIdf(page)
        elif action == "webidf":
            self.processWebIdf(page)
        else:
            self._log.warn("Unknown action: %s" % action)

        self._log.info("All done for now.")
Example #5
def installNLTKResources():

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/europarl_raw')
    except LookupError:
        nltk.download('europarl_raw')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/gutenberg')
    except LookupError:
        nltk.download('gutenberg')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
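
The five try/except blocks above repeat one pattern; a more compact, purely illustrative sketch with the same resources and the same behavior:

import nltk

# Equivalent loop over (lookup path, download package) pairs.
resources = {
    'corpora/wordnet': 'wordnet',
    'corpora/stopwords': 'stopwords',
    'corpora/europarl_raw': 'europarl_raw',
    'tokenizers/punkt': 'punkt',
    'corpora/gutenberg': 'gutenberg',
}
for path, name in resources.items():
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(name)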
Example #6
 def __init__(self, save_path=None, download=False, tokenizer="wordpunct_tokenize", *args, **kwargs):
     super().__init__(save_path=save_path)
     if download:
         nltk.download()
     self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
     if not callable(self.tokenizer):
         raise AttributeError("Tokenizer {} is not defined in nltk.tokenizer".format(tokenizer))
Example #7
def _post_install(dir):
    import site
    reload(site)

    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')
Example #8
 def __init__(self):
     print("Please Install the brown-corpus and wordnet on your machine : ")
     nltk.download()
     self.pfile = open("pcent_plurals.txt","w")
     self.pfile.write("%s %s \n" % ("Plurals".ljust(20),"Percentages"))
     self.plural_dict = {}
     self.single_dict = {}
Example #9
def search_for_all_strings(line, file_format):
    '''Search for all strings with NLTK'''
    result = []
    for regexp in Config.excluded_lines:
        for match in re.finditer(regexp, line):
            if match:
                return([])

    for regexp in Config.strings_patterns[file_format]:
        for match in re.finditer(regexp, line):
            if not match:
                continue
            group = match.group(1)
            if len(group) > 0 and not contains_forbidden_patterns(group):
                try:
                    tokens = nltk.word_tokenize(group)
                    if len(tokens) > 0:
                        for word in tokens:
                            morf = wn.morphy(word)
                            if morf and len(str(morf)) > 1:
                                if (output_format == "csv") | (group not in global_word_pull):
                                    result.append(group)
                                    global_word_pull.add(group)
                                break
                except Exception:
                    print ("Unexpected error:{0}".format(sys.exc_info()))
                    traceback.print_tb(sys.exc_info()[2])
                    url = os.path.join(os.path.split(os.path.realpath(__file__))[0] + "/nltk_info.html")
                    print("See here for installation instructions:\n" + url)
                    webbrowser.open_new(url)

                    nltk.download()
                    sys.exit(2)

    return result
Example #10
File: test.py  Project: Brbrew/Docker
def main():

    nltk.download('stopwords')
    nltk.download('vader_lexicon')        
        
    print("\n================================================================================\n")
    print("---------------------------------- Platform Information ------------------------")
    print('machine: {}'.format(platform.machine()))
    print('node: {}'.format(platform.node()))    
    print('processor: {}'.format(platform.processor()))    
    print('release: {}'.format(platform.release()))
    print('system: {}'.format(platform.system()))    
    print('version: {}'.format(platform.version()))
    print('uname: {}'.format(platform.uname()))
    
    #mem = virtual_memory()
    #print('memory: {}'.format(mem.total))  # total physical memory available
    
    print('python_build: {}'.format(platform.python_build()))
    print('python_compiler: {}'.format(platform.python_compiler()))
    print('python_branch: {}'.format(platform.python_branch()))
    print('python_implementation: {}'.format(platform.python_implementation()))
    
    print('python_revision: {}'.format(platform.python_revision()))
    print('python_version: {}'.format(platform.python_version()))
    
    print("\n================================================================================\n")
Example #11
def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
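
The download-and-retry pattern above recurs in several examples on this page; a minimal generic helper, purely illustrative (the name with_nltk_resource is invented):

import nltk

def with_nltk_resource(package, func, *args, **kwargs):
    """Call func, downloading the named NLTK package once if it is missing."""
    try:
        return func(*args, **kwargs)
    except LookupError:
        nltk.download(package)
        return func(*args, **kwargs)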
Example #12
    def __init__(self, ngram=False, use_idf=False):
        self.ngram = ngram
        self.use_idf = use_idf

        # Load WordNet synsets and download data if necessary
        try:
            wordnet_path = nltk.data.find("corpora/wordnet")
        except LookupError:
            nltk.download("wordnet")
            wordnet_path = nltk.data.find("corpora/wordnet")
        self.wn = wordnet.WordNetCorpusReader(wordnet_path)

        # Initialize the two types of n-gram generators
        pentagram_vectorizer = CountVectorizer(
            ngram_range=(1, 5), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list
        )
        unigram_vectorizer = CountVectorizer(
            ngram_range=(1, 1), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list
        )

        # Function for generating five-grams through unigrams
        self.pent_analyze = pentagram_vectorizer.build_analyzer()

        # Function for generating just unigrams
        self.uni_analyze = unigram_vectorizer.build_analyzer()

        # Load IDF scores
        self.IDF = self.get_idf_scores()
        self.counts = self.get_counts()
Example #13
File: setup.py  Project: jvlomax/mancify
def main():
    import io

    with io.open(os.path.join(HERE, "README.rst"), "r") as readme:
        setup(
            name=app.__project__,
            version=app.__version__,
            description=app.__doc__,
            long_description=readme.read(),
            classifiers=app.__classifiers__,
            author=app.__author__,
            author_email=app.__author_email__,
            # url                  = app.__url__,
            license=[c.rsplit("::", 1)[1].strip() for c in app.__classifiers__ if c.startswith("License ::")][0],
            keywords=" ".join(app.__keywords__),
            packages=["mancify"],
            package_data={},
            include_package_data=True,
            platforms=app.__platforms__,
            install_requires=app.__requires__,
            extras_require=app.__extra_requires__,
            zip_safe=True,
            entry_points=app.__entry_points__,
            tests_require=["pytest-cov", "pytest", "mock"],
            cmdclass={"test": PyTest},
        )

    # Download the required NLTK packages automatically
    import nltk

    nltk.download("cmudict")
    nltk.download("maxent_treebank_pos_tagger")
Example #14
File: setup.py  Project: sovaa/neuralnet
 def run(self):
     _install.run(self)
     import nltk
     try:
         nltk.data.find('tokenizers/punkt')
     except LookupError:
         nltk.download('punkt')
Example #15
def annotations_to_words(terms, dag, ipr_map, lower):
    """
    Converts a string of accessions into a string of the corresponding English-text representations.
    """
    try:
        sws = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        sws = stopwords.words('english')

    if lower:
        sws = set([x.lower() for x in sws])
        case = string.lower
    else:
        sws = set([x.upper() for x in sws])
        case = string.upper

    go_terms = [t.upper() for t in terms if 'GO' in t.upper()]
    ipr_terms = [t.upper() for t in terms if t.upper() in ipr_map]

    go_descriptions = ' '.join([case(dag[t].name) for t in go_terms]).split(' ')
    ipr_descriptions = ' '.join([case(ipr_map[t]) for t in ipr_terms]).split(' ')

    go_descriptions = [x.translate(None, string.punctuation) for x in go_descriptions]
    ipr_descriptions = [x.translate(None, string.punctuation) for x in ipr_descriptions]

    go_descriptions = [x for x in go_descriptions if case(x) not in sws]
    ipr_descriptions = [x for x in ipr_descriptions if case(x) not in sws]

    line = ' '.join(go_descriptions + ipr_descriptions)
    return line
Example #16
 def __init__(self, tokenizer: str = "wordpunct_tokenize", download: bool = False,
              *args, **kwargs):
     if download:
         nltk.download()
     self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
     if not callable(self.tokenizer):
         raise AttributeError("Tokenizer {} is not defined in nltk.tokenizer".format(tokenizer))
Example #17
def boostrap_nltk_data():
    nltk.data.path.append('./data/')
    nltkdata_exists = Path('./data/tokenizers/punkt/english.pickle')

    if not nltkdata_exists.exists():
        logging.info("Downloading NLTK Data")
        nltk.download('punkt', './data')
Example #18
def _post_install():
    from importlib import reload
    import site
    reload(site)

    import nltk
    nltk.download('punkt')
Example #19
 def __init__(self):
     try:
         from sacremoses import MosesDetokenizer
         self._detokenizer = MosesDetokenizer()
     except (ImportError, TypeError) as err:
         if isinstance(err, TypeError):
             warnings.warn('The instantiation of MosesDetokenizer in sacremoses is'
                           ' currently only supported in python3.'
                           ' Now try NLTKMosesDetokenizer using NLTK ...')
         else:
             warnings.warn('sacremoses is not installed. '
                           'To install sacremoses, use pip install -U sacremoses'
                           ' Now try NLTKMosesDetokenizer using NLTK ...')
         try:
             import nltk
             try:
                 nltk.data.find('perluniprops')
             except LookupError:
                 nltk.download('perluniprops')
             from nltk.tokenize.moses import MosesDetokenizer
             self._detokenizer = MosesDetokenizer()
         except ImportError:
             raise ImportError('NLTK is not installed. '
                               'You must install NLTK <= 3.2.5 in order to use the '
                               'NLTKMosesDetokenizer. You can refer to the official '
                               'installation guide in https://www.nltk.org/install.html .')
Example #20
def nltk_corpus(corpus_name):
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except LookupError:
        nltk.download(corpus_name)
    return corpus
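
Hypothetical usage of the helper above, assuming the Brown corpus:

# Fetches the corpus reader, downloading the data on first use.
brown = nltk_corpus('brown')
print(brown.words()[:10])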
Example #21
def generate(dictionary='/usr/share/dict/british-english', output='../../gb-us-synonyms.txt'):
    nltk.download('wordnet')
    with open(dictionary) as dict_file:
        with open(output, 'w') as output_file:
            for gb, us in gen_synonyms(dict_file):
                output_file.write(gb + ', ' + us + '\n')
                print(gb + ',', us)
Example #22
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the wordnet data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded
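
Hypothetical calls to the helper above; 'sentiment/vader_lexicon' is the case named in the comment, the second path is an assumed example:

nltk_download_corpus('sentiment/vader_lexicon')   # downloads 'vader_lexicon' if absent
nltk_download_corpus('corpora/stopwords')         # downloads 'stopwords' if absent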
Example #23
File: single.py  Project: IsaacHaze/xtas
def morphy(doc):
    """Lemmatize tokens using morphy, WordNet's lemmatizer."""
    # XXX Results will be better if we do POS tagging first, but then we
    # need to map Penn Treebank tags to WordNet tags.
    nltk.download('wordnet', quiet=False)
    return map(nltk.WordNetLemmatizer().lemmatize,
               _tokenize_if_needed(fetch(doc)))
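
A minimal sketch of the Penn-Treebank-to-WordNet tag mapping that the XXX comment above alludes to (function names invented; assumes the averaged_perceptron_tagger and wordnet data are available):

import nltk
from nltk.corpus import wordnet

def _penn_to_wordnet(tag):
    # Map a Penn Treebank tag onto a WordNet POS constant; default to noun.
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

def morphy_with_pos(tokens):
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, _penn_to_wordnet(tag))
            for word, tag in nltk.pos_tag(tokens)]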
Example #24
def download_dataset(data_name):
    if data_name == 'flowers':
        print('== Flowers dataset ==')
        flowers_dir = os.path.join(DATA_DIR, 'flowers')
        flowers_jpg_tgz = os.path.join(flowers_dir, '102flowers.tgz')
        make_sure_path_exists(flowers_dir)

        # The original Google Drive link at https://drive.google.com/file/d/0B0ywwgffWnLLcms2WWJQRFNSWXM/view
        # (from https://github.com/reedscot/icml2016) is problematic to download automatically,
        # so the text_c10 directory from that archive is included as a bzipped file in the repo.
        captions_tbz = os.path.join(DATA_DIR, 'flowers_text_c10.tar.bz2')
        print('Extracting ' + captions_tbz)
        captions_tar = tarfile.open(captions_tbz, 'r:bz2')
        captions_tar.extractall(flowers_dir)

        flowers_url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
        print('Downloading ' + flowers_jpg_tgz + ' from ' + flowers_url)
        urlretrieve(flowers_url, flowers_jpg_tgz,
                    reporthook=dl_progress_hook)
        print('Extracting ' + flowers_jpg_tgz)
        flowers_jpg_tar = tarfile.open(flowers_jpg_tgz, 'r:gz')
        flowers_jpg_tar.extractall(flowers_dir)  # archive contains jpg/ folder

    elif data_name == 'skipthoughts':
        print('== Skipthoughts models ==')
        SKIPTHOUGHTS_DIR = os.path.join(DATA_DIR, 'skipthoughts')
        SKIPTHOUGHTS_BASE_URL = 'http://www.cs.toronto.edu/~rkiros/models/'
        make_sure_path_exists(SKIPTHOUGHTS_DIR)

        # following https://github.com/ryankiros/skip-thoughts#getting-started
        skipthoughts_files = [
            'dictionary.txt', 'utable.npy', 'btable.npy', 'uni_skip.npz', 'uni_skip.npz.pkl', 'bi_skip.npz',
            'bi_skip.npz.pkl',
        ]
        for filename in skipthoughts_files:
            src_url = SKIPTHOUGHTS_BASE_URL + filename
            print('Downloading ' + src_url)
            urlretrieve(src_url, os.path.join(SKIPTHOUGHTS_DIR, filename),
                        reporthook=dl_progress_hook)

    elif data_name == 'nltk_punkt':
        import nltk
        print('== NLTK pre-trained Punkt tokenizer for English ==')
        nltk.download('punkt')

    elif data_name == 'pretrained_model':
        print('== Pretrained model ==')
        MODEL_DIR = os.path.join(DATA_DIR, 'Models')
        pretrained_model_filename = 'latest_model_flowers_temp.ckpt'
        src_url = 'https://bitbucket.org/paarth_neekhara/texttomimagemodel/raw/74a4bbaeee26fe31e148a54c4f495694680e2c31/' + pretrained_model_filename
        print('Downloading ' + src_url)
        urlretrieve(
            src_url,
            os.path.join(MODEL_DIR, pretrained_model_filename),
            reporthook=dl_progress_hook,
        )

    else:
        raise ValueError('Unknown dataset name: ' + data_name)
Example #25
def _post_install():  
    # since nltk may have just been install
    # we need to update our PYTHONPATH
    import site
    reload(site)
    # Now we can import nltk
    import nltk
    nltk.download('stopwords')
Example #26
 def test_notebook_runner_2a_eco_nlp_correction(self):
     fLOG(
         __file__,
         self._testMethodName,
         OutputPrint=__name__ == "__main__")
     import nltk
     nltk.download('stopwords')
     self.common_notebook_runner_2a_eco_nlp_enonce("correction")
Example #27
 def download_packages(self):
     import nltk
     
     for x in [comp for comp in self._missing if "/" in comp]:
         package = x.split("/")[1]
         self.updateLabel.emit(package)
         nltk.download(package, raise_on_error=True)
         self.progressTheBar.emit()
Example #28
 def setup_dependencies(self):
     if not self._nltk_data_downloaded and bool(
             self.section.get('shortlog_imperative_check', True)):
         nltk.download([
             'punkt',
             'averaged_perceptron_tagger',
         ])
         type(self)._nltk_data_downloaded = True
Example #29
def main():
    nltk.download("punkt")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("training_datasets", type=str, nargs="+",
                        help="path to training data sets")
    args = parser.parse_args(sys.argv[1:])

    run(args)
Example #30
    def __init__(self):
        from nltk.data import find
        from nltk import download

        try:
            find('wordnet.zip')
        except LookupError:
            download('wordnet')
Example #31
import nltk          # used below for nltk.download (assumed to be imported above the excerpt)
import pandas as pd  # used below for pd.read_csv (assumed to be imported above the excerpt)
import numpy as np

import time

from textblob import TextBlob
import dill as pickle
# import pickle

from customTransfomers import *

t0 = time.time()
#-------------------------------------
#***********Setting NTLK**************
#-------------------------------------

nltk.download('punkt')
nltk.download('stopwords')

#-------------------------------------
#***********Reading Data**************
#-------------------------------------
print("Reading Data ......")
df = pd.read_csv('fakecorpusWithMeta.csv')

print("Cleaning the Data ......")
# Drop unused columns
df = df.drop(columns=["scraped_at", "index", "Unnamed: 0"])

print("Filtering Data ......")
# If title is missing drop the entry
df = df.dropna(subset=['title'])