Example #1
    def __init__(self, grammar: Grammar, reduction: Reduction = None, verbose: bool = False):
        """
        Initialize a parser with some global parameters.
        :param grammar: A CFG grammar driving acceptable transitions.
        :param reduction: A mapping of a complex grammar to a simpler one.
        :param verbose: Enables additional output.
        """
        self._grammar = grammar
        self.__parse_stack = []  # A stack of parsed Symbols
        self.__input_stack = []  # A stack of raw input strings and reduced Symbols.
        self._reduction = reduction
        self._needs_prune = False
        self.verbose = verbose

        # Check if necessary NLTK resources are available.
        try:
            nltk.find('tokenizers/punkt')
        except LookupError:
            print('Missing NLTK "punkt" package, downloading...')
            nltk.download('punkt')

        try:
            nltk.find('taggers/averaged_perceptron_tagger')
        except LookupError:
            print('Missing NLTK "Perceptron Tagger" package, downloading...')
            nltk.download('averaged_perceptron_tagger')

        # Create an artificial state frame to serve as the parse tree root.
        root_frame = StateFrame((Grammar.ROOT_SYM, 0, 0))
        root_frame.to_sym = [grammar.start_symbol]
        self.__state = root_frame

        # Generate the initial rule set as all rules accessible from the start point.
        self._set_looking_for(root_frame, create_all=True)
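
The try/find/except/download idiom above recurs throughout these examples. As a side note, it can be factored into a small helper; the sketch below is illustrative and not part of the parser module (the helper name and the resource/package pairing are assumptions):

import nltk


def ensure_nltk_resource(resource_path, package_id):
    """Download an NLTK package only if its data is not already installed (illustrative helper)."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id)


# Same resources the parser above depends on.
ensure_nltk_resource('tokenizers/punkt', 'punkt')
ensure_nltk_resource('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger')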
Example #2
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to',
                           default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try lemmatization filter
    try:
        nltk.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case sensitive
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
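
For context, a minimal sketch of the NLTK stemmers and lemmatizer that WordStemmingFilter wraps in the test above; the sample words are illustrative, not taken from the test fixtures:

from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

print(PorterStemmer().stem('stemming'))                     # 'stem'
print(SnowballStemmer(language='english').stem('testing'))  # 'test'
print(WordNetLemmatizer().lemmatize('are', pos='v'))        # 'be'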
Example #3
    def data_preparation(self):
        """
        Loads one of the Brown, BNC News, or Indian corpora and returns its
        sentences with and without POS tags, ready to be split into a train
        set and a test set.

        Returns:
        --------
            sentences (list):
                Sentences without POS-tags
            tagged_sentences (list):
                Sentences with POS-tags
        """
        if self.corpus == 'brown':
            tagged_sentences = brown.tagged_sents(categories='news')
            sentences = brown.sents(categories='news')
        elif self.corpus == 'bnc':
            root = find('corpora/bnc')
            bncnews = TaggedCorpusReader(root,
                                         'bnc-news-wtp.txt',
                                         tagset='en-claws')
            if self.tagset is None:
                tagged_sentences = bncnews.tagged_sents()
            elif self.tagset == 'universal':
                tagged_sentences = bncnews.tagged_sents(tagset=self.tagset)
            sentences = bncnews.sents()
        elif self.corpus == 'indian':
            if self.lang in ['telugu', 'hindi', 'marathi', 'bangla']:
                tagged_sentences = indian.tagged_sents(f'{self.lang}.pos')
                sentences = indian.sents(f'{self.lang}.pos')
            else:
                raise ValueError('Language not part of Indian Corpus.')
        return sentences, tagged_sentences
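
The split into train and test sets is not performed in the method itself. A rough sketch of one way the returned corpus views might be divided downstream (the 90/10 ratio and variable names are assumptions):

from nltk.corpus import brown

tagged_sentences = brown.tagged_sents(categories='news')
sentences = brown.sents(categories='news')

cutoff = int(0.9 * len(tagged_sentences))  # assumed 90/10 split
train_tagged = tagged_sentences[:cutoff]
test_tagged = tagged_sentences[cutoff:]
test_sentences = sentences[cutoff:]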
Example #4
    def __init__(self):
        try:
            find("corpora/stopwords.zip")
        except LookupError:
            download('stopwords')

        self.link_regex = re.compile(
            r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+'
            r'[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})',
            re.IGNORECASE)
        self.account_regex = re.compile(r"@\w*", re.IGNORECASE)
        self.low_fre_words = defaultdict(int)
        self.model = None
        self.labeled_data = None
        self.get_labeled_data()
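
A hedged sketch of what the two compiled patterns above are presumably used for, stripping links and @-mentions out of tweet text; the sample string is illustrative and not from the original module:

import re

link_regex = re.compile(
    r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+'
    r'[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})',
    re.IGNORECASE)
account_regex = re.compile(r"@\w*", re.IGNORECASE)

sample = "Great analysis https://example.com/article via @someone"
print(account_regex.sub('', link_regex.sub('', sample)))  # link and mention removed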
Example #5
import requests
from bs4 import BeautifulSoup
from operator import itemgetter

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import copy
from nltk.corpus import movie_reviews
import random
from nltk.tokenize import word_tokenize

try:
    nltk.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')


def scraper():
    work = "https://api.nytimes.com/svc/movies/v2/reviews/all.json?api-key=rPRXhYeMN9E6OCRWs7704hENbvHAGmyK"
    res = requests.get(work)
    payload = res.json()
    movie_data = payload["results"]
    myLs = []

    # adds the first 20 reviews to a list of dicts
    for i in range(20):
        link = movie_data[i]["link"]
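
The excerpt ends before the reviews are scored, but the SentimentIntensityAnalyzer imported above suggests VADER is applied to the scraped text later in the module. A minimal, illustrative sketch of that call (not taken from the original file):

sia = SentimentIntensityAnalyzer()
scores = sia.polarity_scores("A moving, beautifully shot film.")
print(scores['compound'])  # compound sentiment score in [-1, 1]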
Example #6
import pickle
import random
import re
from collections import defaultdict

from nltk import word_tokenize, find, download

try:
    find("corpora/stopwords.zip")
except LookupError:
    download('stopwords')
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
import rootpath

rootpath.append()
from backend.data_preparation.connection import Connection


class NLTKTest:
    def __init__(self):
        try:
            find("corpora/stopwords.zip")
        except LookupError:
            download('stopwords')

        self.link_regex = re.compile(
            r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+'
            r'[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})',
            re.IGNORECASE)
        self.account_regex = re.compile(r"@\w*", re.IGNORECASE)
        self.low_fre_words = defaultdict(int)
        self.model = None
Example #7
import os
import sys

import nltk
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from reader import DSReader

sys.path.append("src")

try:
    nltk.find("corpora/wordnet")
except LookupError:
    nltk.download("wordnet")

try:
    nltk.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

try:
    nltk.find("tokenizers/punkt")
except LookupError:
    nltk.download('punkt')

dataset_path = os.path.abspath("tests/datasets/emails.csv")
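
The excerpt stops before the model is assembled, so the following is only a sketch of a text-classification pipeline that the imports above would support (the component choices are assumptions, not the module's actual configuration):

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
# e.g. pipeline.fit(train_texts, train_labels) once the email dataset is loaded and split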
Example #8
import sys
import random
import traceback

import spacy
import nltk

try:
    nlp = spacy.load("en_core_web_md")
except Exception:
    traceback.print_exc()
    print("Error loading Spacy", file=sys.stderr)
    print("Please run the following command:", file=sys.stderr)
    print("python -m spacy download en_core_web_md", file=sys.stderr)
try:
    nltk.data.find("corpora/wordnet")
except Exception:
    nltk.download("wordnet")
try:
    nltk.find("taggers/averaged_perceptron_tagger")
except Exception:
    nltk.download("averaged_perceptron_tagger")
try:
    nltk.find("sentiment/vader_lexicon.zip")
except Exception:
    nltk.download("vader_lexicon")
from emora_stdm.state_transition_dialogue_manager.wordnet import related_synsets, wordnet_knowledge_base  # , lemmas_of
from nltk.corpus import wordnet
import regex
import re


def _process_args_set(args, vars):
    for i, e in enumerate(args):
        if isinstance(e, str) and "$" == e[0]:
Example #9
#!/bin/python
import nltk
try:
    nltk.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    from nltk.corpus import wordnet as wn
except Exception:
    nltk.download('wordnet', force=True)
    from nltk.corpus import wordnet as wn

import json
import segment
import convert
import data
import numpy as np
import re
from collections import defaultdict

# Load dictionary for use in making annotations:
dictionary = np.load("dict/epsd.npz", allow_pickle=True)["dictionary"].item()


def get_noun_hypernyms(word):
    """
    word:       An English word
    returns:    A set of Synset objects representing the given
                word's hypernyms. Only returns noun synsets.
    """
    all_hypernyms = wn.synsets(word)
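
The function body is cut off here. As an illustration only, one way the noun-only hypernym collection described in the docstring could be written, relying on the wn import above (the function name is not from the original file):

def noun_hypernyms_sketch(word):
    """Illustrative variant: collect hypernym synsets for the noun senses of a word."""
    hypernyms = set()
    for synset in wn.synsets(word, pos=wn.NOUN):
        hypernyms.update(synset.hypernyms())
    return hypernyms


print(noun_hypernyms_sketch('dog'))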
Example #10
import spacy
import sys
import random
import traceback

import nltk
try:
    nlp = spacy.load("en_core_web_md")
except Exception as e:
    traceback.print_exc()
    print('Error loading Spacy', file=sys.stderr)
    print('Please run the following command:', file=sys.stderr)
    print('python -m spacy download en_core_web_md', file=sys.stderr)
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')
try:
    nltk.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')
from emora_stdm.state_transition_dialogue_manager.wordnet import \
    related_synsets, wordnet_knowledge_base, lemmas_of
from nltk.corpus import wordnet
import regex
import re


def _process_args_set(args, vars):
    for i, e in enumerate(args):
Example #11
import pronouncing
import nltk

# download tagger
try:
    nltk.find("taggers/averaged_perceptron_tagger")
except LookupError:
    nltk.download('averaged_perceptron_tagger')

# load common nouns
with open("nounlist.txt") as f:
    common_nouns = set(f.read().split("\n"))
with open("bad-words.txt") as f:
    bad_words = set(f.read().split("\n"))
"""
Creates a sentence using the elf on the shelf template
"""


def make_sentence(query):
    global common_nouns
    rhymes = set(pronouncing.rhymes(query))
    choice_nouns = list(rhymes)
    choice_nouns = list(rhymes - bad_words)

    return choice_nouns
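
As shown, make_sentence returns the filtered rhyme candidates rather than a full sentence. A small, illustrative usage (the query word is arbitrary):

if __name__ == '__main__':
    print(make_sentence('shelf'))  # rhymes of "shelf" that are not in bad-words.txt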