Example #1
def install_nltk(download_dir=None):
    """ Download specific collection identifiers """
    if not download_dir:
        download_dir = settings.NLTK_DATA_PATH
    downloader = Downloader(download_dir=download_dir)
    downloader.download('punkt')
    downloader.download('maxent_treebank_pos_tagger')
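This snippet assumes a project-specific `settings.NLTK_DATA_PATH` and an already-imported `Downloader`. A self-contained sketch of the same idea, where the default directory is only an illustrative assumption:

import os
import nltk
from nltk.downloader import Downloader

def install_nltk(download_dir=None):
    """Download the required NLTK packages into download_dir."""
    if not download_dir:
        # Illustrative default; substitute your project's data path.
        download_dir = os.path.expanduser('~/nltk_data')
    downloader = Downloader(download_dir=download_dir)
    for package in ('punkt', 'maxent_treebank_pos_tagger'):
        downloader.download(package)
    # Register the directory so later lookups can find the data.
    if download_dir not in nltk.data.path:
        nltk.data.path.append(download_dir)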
Example #2
    def __init__(self):
        """A method to initialize parameters"""

        DEFAULT_URL = 'https://raw.githubusercontent.com/JosephSefara/AfricanWordNet/master/data/index.xml'
        """The default URL for the NLTK data server's index"""

        try:
            downloader = Downloader(server_index_url=DEFAULT_URL)
            downloader.download('africanwordnet')
        except Exception:
            # Re-raise so the caller can see why the download failed.
            raise
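Here `Downloader` is pointed at a custom `index.xml` (the AfricanWordNet repository) instead of the default NLTK data server. A minimal sketch of the same pattern that checks the boolean return value of `download()` rather than relying on exceptions; the URL and package id are taken from the example above:

from nltk.downloader import Downloader

CUSTOM_INDEX = ('https://raw.githubusercontent.com/JosephSefara/'
                'AfricanWordNet/master/data/index.xml')

# download() returns True on success, so a falsy result means the install failed.
if not Downloader(server_index_url=CUSTOM_INDEX).download('africanwordnet'):
    raise RuntimeError('Could not install africanwordnet from ' + CUSTOM_INDEX)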
Example #3
    def __init__(self):
        super(RssSkill, self).__init__('RssSkill')
        self._is_reading_headlines = False
        self.feeds = {}
        self.cached_items = {}
        self.cache_time = {}
        try:
            pos_tag('advance')
        except LookupError:
            logger.debug('Tagger not installed... Trying to download')
            dler = Downloader()
            if not dler.download('averaged_perceptron_tagger'):
                logger.debug('Trying alternative source...')
                dler = Downloader(ALT_NLTK_DATA)
                dler.download('averaged_perceptron_tagger',
                              raise_on_error=True)
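The try/except LookupError pattern above can be generalised: probe for a resource with `nltk.data.find()` and download it only when it is missing. A small sketch along those lines (the helper name and resource path are assumptions, not part of the example):

import nltk

def ensure_nltk_resource(resource_id, resource_path):
    """Download resource_id only if resource_path is not found locally."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource_id)

ensure_nltk_resource('averaged_perceptron_tagger',
                     'taggers/averaged_perceptron_tagger')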
Example #4
def dl_nltk():
    TO_DL = ['stopwords', 'punkt']

    dler = Downloader('https://pastebin.com/raw/D3TBY4Mj')

    for to_dl in TO_DL:
        if not nltk.download(to_dl):
            print('Downloading NLTK data from alternative source...')
            if not dler.download(to_dl):
                print('Failed to download NLTK data...')
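Both `nltk.download()` and `Downloader.download()` return a boolean, which is what makes the two-stage fallback above work. A cleaned-up sketch with the imports spelled out; the alternative index URL is copied from the example and may no longer be valid:

import nltk
from nltk.downloader import Downloader

ALT_INDEX = 'https://pastebin.com/raw/D3TBY4Mj'  # from the example above; may be stale

def dl_nltk(packages=('stopwords', 'punkt')):
    fallback = Downloader(ALT_INDEX)
    for package in packages:
        if not nltk.download(package):
            print('Downloading NLTK data from alternative source...')
            if not fallback.download(package):
                print('Failed to download {0}.'.format(package))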
Example #5
    def build_list_from_nltk(self, lang):
        downloader = Downloader()

        # Check if NLTK data directory exists.
        if StopwordRemover.nltk_dir is None:
            # Create temporary directory for download
            StopwordRemover.nltk_dir = tempfile.mkdtemp(prefix='cherami')
            nltk.data.path = [StopwordRemover.nltk_dir]
            
            logger.info('NLTK data directory is "{0}"'
                .format(StopwordRemover.nltk_dir))
        
        # Check if the NLTK data has already been downloaded.
        if not downloader.is_installed('stopwords'):
            logger.info('Downloading NLTK stopword data...')
            downloader.download('stopwords', StopwordRemover.nltk_dir, True)
            logger.info('NLTK stopword data downloaded.')

        for word in stopwords.words(lang):
            self.stopword_list.add(word)
Example #6
    def build_list_from_nltk(self, lang):
        downloader = Downloader()
        tempdir = None
        
        # Check if the NLTK data has already been downloaded.
        if not downloader.is_installed('stopwords'):
            # Create temporary directory for download
            tempdir = tempfile.mkdtemp(prefix='cherami')
            logger.info('Downloading NLTK stopword data into "{0}"'
                '...'.format(tempdir))

            downloader.download('stopwords', tempdir, True)
            logger.info('NLTK stopword data downloaded.')

            nltk.data.path = [tempdir]

        for word in stopwords.words(lang):
            self.stopword_list.add(word)

        # Clean up after we're done.
        if tempdir is not None:
            shutil.rmtree(tempdir)
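Both versions of `build_list_from_nltk` fetch the stopwords corpus into a temporary directory when it is not already installed. A standalone sketch of the same idea using `tempfile.TemporaryDirectory` so cleanup is automatic; the function name and prefix are illustrative:

import tempfile
import nltk
from nltk.corpus import stopwords
from nltk.downloader import Downloader

def stopword_set(lang='english'):
    """Return the NLTK stopword list for lang, downloading it if necessary."""
    downloader = Downloader()
    if downloader.is_installed('stopwords'):
        return set(stopwords.words(lang))
    with tempfile.TemporaryDirectory(prefix='cherami') as tempdir:
        downloader.download('stopwords', download_dir=tempdir, quiet=True)
        nltk.data.path.insert(0, tempdir)
        try:
            # The word list is read into memory here, so the temporary
            # directory can be removed afterwards.
            return set(stopwords.words(lang))
        finally:
            nltk.data.path.remove(tempdir)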
Example #7
File: __init__.py  Project: jhpyle/pattern
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet_ic as wn_ic
from nltk.corpus.reader.wordnet import Synset as WordNetSynset

# Make sure the necessary corpora are downloaded to the local drive
for token in ("wordnet", "wordnet_ic", "sentiwordnet"):
    try:
        nltk.data.find("corpora/" + token)
    except LookupError:
        try:
            nltk.download(token, quiet = True, raise_on_error = True)
        except ValueError:
            # Sometimes there are problems with the default index.xml URL. Then we will try this...
            from nltk.downloader import Downloader as NLTKDownloader
            d = NLTKDownloader("http://nltk.github.com/nltk_data/")
            d.download(token, quiet = True, raise_on_error = True)

# Use the Brown corpus for calculating information content (IC)
brown_ic = wn_ic.ic('ic-brown.dat')
IC_CORPUS, IC_MAX = brown_ic, {}
for key in IC_CORPUS:
    IC_MAX[key] = max(IC_CORPUS[key].values())

# This will hold the WordNet version
VERSION = wn.get_version() or "3.0"

#---------------------------------------------------------------------------------------------------

DIACRITICS = {
    "a": ("á", "ä", "â", "à", "å"),
    "e": ("é", "ë", "ê", "è"),
Example #8
import os, nltk
from nltk.downloader import Downloader
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag, map_tag

nltk_packages = [
	'punkt',
	'maxent_treebank_pos_tagger',
	'universal_tagset',
	'wordnet'
]
nltk_path = os.path.dirname(os.path.realpath(__file__)) + '/nltk'
nltk.data.path.append(nltk_path)
nltk_dl = Downloader(download_dir = nltk_path)
for package in nltk_packages:
	nltk_dl.download(package)

primary_tags = set(['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON'])

processes = [
	'parts_all',
	'tokens_adj',
	'tokens_adv',
	'tokens_all',
	'tokens_dense',
	'tokens_noun',
	'tokens_other',
	'tokens_pron',
	'tokens_verb'
	]
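This example pins the download directory to a folder next to the module and registers it on `nltk.data.path` before downloading. The same setup wrapped into a helper; the function name and package list are illustrative:

import os
import nltk
from nltk.downloader import Downloader

def ensure_packages(packages, subdir='nltk'):
    """Download packages into a directory next to this file and register it."""
    nltk_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), subdir)
    if nltk_path not in nltk.data.path:
        nltk.data.path.append(nltk_path)
    downloader = Downloader(download_dir=nltk_path)
    for package in packages:
        # Packages that are already installed are reported as up to date and skipped.
        downloader.download(package)

ensure_packages(['punkt', 'universal_tagset', 'wordnet'])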
Example #9
import csv
import re

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import nltk
from nltk.downloader import Downloader
from nltk.probability import FreqDist
from nltk.corpus import stopwords as stopwords_corpus
from nltk import pos_tag
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

from sklearn.metrics import f1_score

downloader = Downloader()
downloader.download("stopwords")
nltk.download('averaged_perceptron_tagger')

with open("../data/full.csv", "r") as csvfile:
    reader = csv.reader(csvfile, quoting=csv.QUOTE_ALL)
    documents = [t[0] for t in reader]

matplotlib.rcParams.update({'font.size': 22})
stop_words = set(stopwords_corpus.words("english"))

_non_alpha = re.compile("[^a-zA-Z ]")


def normalize(text):
    """Map a token to a canonical form, e.g. lower case it, remove non-alpha characters, etc.