import json

import nltk
from textblob import TextBlob
from textblob.np_extractors import ConllExtractor
from textblob.taggers import NLTKTagger, PatternTagger


def main():
    # FILENAME = "CellPhoneReview-1000.json"
    # print('Reading data...')
    # review_data = open(FILENAME).readlines()
    # document = [json.loads(d)['reviewText'] for d in review_data][0]
    document = "These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!"
    print(document)
    nltk_tagger = NLTKTagger()
    extractor = ConllExtractor()
    blob = TextBlob(document, pos_tagger=nltk_tagger, np_extractor=extractor)
    print(blob.tags)
    print(blob.noun_phrases)

    pattern_tagger = PatternTagger()
    blob2 = TextBlob(document, pos_tagger=pattern_tagger, np_extractor=extractor)
    print(blob2.tags)
    print(blob2.noun_phrases)

    tagged = nltk.pos_tag(nltk.word_tokenize(document.lower()))
    print(tagged)
    # Chunk grammar: an NP is an optional determiner followed by either
    # adverb/verb modifiers plus proper and common nouns, or adjectives
    # plus nouns (e.g. an adjective-noun pair such as "great quality").
    grammar = ('''
            NP: {<DT>?(<RB.?>*<VB.?>*<NNPS?>+<NNS?>+ | <JJ>*<NNS?>+)} # NP
            ''')

    chunkParser = nltk.RegexpParser(grammar)
    tree = chunkParser.parse(tagged)
    noun_phrases = []
    for subtree in tree.subtrees():
        if subtree.label() == 'NP':
            noun_phrase = ' '.join([elem[0] for elem in subtree])
            noun_phrases.append(noun_phrase)

    print(noun_phrases)


if __name__ == '__main__':
    main()
Example #2
# Excerpt from scrubadub's detector registry; earlier entries and the detector
# imports are omitted in this example.
detector_configuration = {
    TwitterDetector.name: {
        'detector': TwitterDetector,
        'autoload': True
    },
    UrlDetector.name: {
        'detector': UrlDetector,
        'autoload': True
    },
    # Detectors that are not automatically loaded by scrubadub
    KnownFilthDetector.name: {
        'detector': KnownFilthDetector,
        'autoload': False
    },
}  # type: Dict[str, DetectorConfigurationItem]

# BaseBlob uses NLTKTagger as its default pos_tagger, but it tags this text
# incorrectly, so replace it with PatternTagger.
BaseBlob.pos_tagger = PatternTagger()
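
# After this override, any TextBlob created without an explicit pos_tagger
# picks up PatternTagger from the class attribute, e.g. (illustrative, not
# from the original):
#
#     TextBlob("John smiled").tags  # tagged by PatternTagger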


def register_detector(detector: Type[Detector], autoload: bool = False):
    """Register a detector for use with the ``Scrubber`` class.

    This is used when you don't want a detector to be imported by default.
    It is useful for detectors with large or unusual dependencies that you may not always want to install.
    In that case you can call ``register_detector(NewDetector, autoload=True)`` after your detector definition so that
    the detector is registered automatically whenever its file is imported.
    This means that ``NewDetector`` does not need to be imported in this file, and so its dependencies don't need
    to be installed just to import this package.

    The ``autoload`` argument sets whether a new ``Scrubber()`` instance should load this ``Detector`` by default.
    """
    detector_configuration[detector.name] = {
        'detector': detector,
        'autoload': autoload,
    }
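
# Usage sketch (hypothetical ``EmployeeIdDetector``; the ``iter_filth``
# signature is assumed from the ``Detector`` base class, not shown above):
#
#     class EmployeeIdDetector(Detector):
#         name = 'employee_id'
#
#         def iter_filth(self, text, **kwargs):
#             ...  # yield Filth objects found in ``text``
#
#     register_detector(EmployeeIdDetector, autoload=True)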

Example #3

import helper
import json
import os
import sqlite3
from textblob.en.taggers import PatternTagger
from textblob.tokenizers import WordTokenizer

tk = WordTokenizer()
tagger = PatternTagger()

# Since many words repeat, we store an integer index into ``keys`` rather than
# the token itself.
keys = []
def key_to_int(key):
  try:
    return keys.index(key) 
  except ValueError:
    keys.append(key)
    return len(keys) - 1
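
# Note: ``keys.index`` is a linear scan, so ``key_to_int`` is O(len(keys)) per
# call. A dict-based index (sketch, equivalent mapping) keeps lookups O(1):
#
#     key_index = {}
#     def key_to_int(key):
#         return key_index.setdefault(key, len(key_index))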

ntoken_freq = {}
npos_freq = {}

conn = sqlite3.connect("data.db")
c = conn.cursor()

USAGE_MINIMUM = 15
NTOKENS_PURGE_THRESHOLD = 5E6

# used to track progress
posts_processed = 0
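
# The processing loop is truncated in this example. A minimal sketch of how the
# pieces above could fit together (the ``posts`` table and ``body`` column are
# assumptions, not from the original):
#
#     for (body,) in c.execute("SELECT body FROM posts"):
#         for word, pos in tagger.tag(body):
#             i = key_to_int(word.lower())
#             ntoken_freq[i] = ntoken_freq.get(i, 0) + 1
#             npos_freq[pos] = npos_freq.get(pos, 0) + 1
#         posts_processed += 1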
Example #4
from textblob import TextBlob
from textblob.en.taggers import PatternTagger


def spellCheck(text):
    # getLanguage is a helper defined elsewhere in this example's source;
    # its result is unused in the snippet shown here.
    language = getLanguage(text)
    b = TextBlob(text, pos_tagger=PatternTagger())
    for word in b.words:
        print(word)
        # Word.spellcheck() returns a list of (candidate, confidence) pairs.
        print(word.spellcheck())
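
# Example call (illustrative):
#
#     spellCheck("I havv goood speling")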