Пример #1
0
    def __init__(self,
                 text,
                 tokenizer=None,
                 pos_tagger=None,
                 np_extractor=None,
                 analyzer=None,
                 parser=None,
                 classifier=None,
                 clean_html=False):
        """Validate ``text`` and set up the processing components.

        Every component left as ``None`` (except ``classifier``) is replaced
        by a default instance that shares this object's tokenizer.

        :param text: The string to wrap.
        :param tokenizer: Tokenizer; defaults to ``NLTKPunktTokenizer()``.
        :param pos_tagger: POS tagger; defaults to ``PatternTagger``.
        :param np_extractor: Noun-phrase extractor; defaults to
            ``PatternParserNPExtractor``.
        :param analyzer: Analyzer; defaults to ``PatternAnalyzer``.
        :param parser: Parser; defaults to ``PatternParser``.
        :param classifier: Optional classifier; stored as given.
        :param clean_html: Deprecated; any truthy value raises.
        :raises TypeError: If ``text`` is not a string.
        :raises NotImplementedError: If ``clean_html`` is truthy.
        """
        # Validate the arguments first so that invalid input fails before
        # any default component is instantiated.
        if not isinstance(text, basestring):
            raise TypeError('The `text` argument passed to `__init__(text)` '
                            'must be a string, not {0}'.format(type(text)))
        if clean_html:
            raise NotImplementedError(
                "clean_html has been deprecated. "
                "To remove HTML markup, use BeautifulSoup's "
                "get_text() function")

        # The tokenizer is resolved first because the other defaults reuse it.
        self.tokenizer = (tokenizer if tokenizer is not None
                          else NLTKPunktTokenizer())
        self.pos_tagger = (pos_tagger if pos_tagger is not None
                           else PatternTagger(tokenizer=self.tokenizer))
        self.np_extractor = (
            np_extractor if np_extractor is not None
            else PatternParserNPExtractor(tokenizer=self.tokenizer))
        self.analyzer = (analyzer if analyzer is not None
                         else PatternAnalyzer(tokenizer=self.tokenizer))
        self.parser = (parser if parser is not None
                       else PatternParser(tokenizer=self.tokenizer))
        # No default classifier exists; keep whatever was passed (maybe None).
        self.classifier = classifier

        self.raw = self.string = text
        self.stripped = lowerstrip(self.raw, all=True)
        _initialize_models(self, self.tokenizer, self.pos_tagger,
                           self.np_extractor, self.analyzer, self.parser,
                           self.classifier)
Пример #2
0
 def word_counts(self):
     """Return a mapping from each normalized word to its number of occurrences.

     Each entry of ``self.words`` is passed through ``lowerstrip`` before it
     is counted. The result is a ``defaultdict(int)``.
     """
     frequencies = defaultdict(int)
     for token in self.words:
         frequencies[lowerstrip(token)] += 1
     return frequencies
Пример #3
0
 def word_counts(self):
     """Count how often each word occurs in this text.

     Words come from ``self.words`` and are normalized with ``lowerstrip``
     before counting. Returns a ``defaultdict(int)`` keyed by normalized word.
     """
     tally = defaultdict(int)
     for normalized in map(lowerstrip, self.words):
         tally[normalized] += 1
     return tally
Пример #4
0
def sentiment_overview(df):
    """Render a sentiment selector and a table of the matching comments.

    Reads the ``'comments'`` and ``'sentiment'`` columns of ``df``. The
    chosen option is normalized with ``lowerstrip`` before it is compared
    against the ``'sentiment'`` column; choosing ``'All'`` shows every
    comment. When nothing matches, an info message is shown instead of a
    table.

    :param df: Table-like object indexable by ``'comments'`` and
        ``'sentiment'`` (presumably a pandas DataFrame -- confirm with caller).
    """
    st.subheader('Overview of comments')
    st.write(
        'Sort the comments by sentiment based on polarity and then it displays them in the list below.'
    )
    # 'All' was handled below but never offered, so that branch could not be
    # reached. It is appended last so the first option stays 'Positive'.
    option = st.selectbox('Select the sentiment',
                          ('Positive', 'Negative', 'Neutral', 'All'))

    if option == 'All':
        comments = df['comments']
        empty_message = 'There is no comments on this video.'
    else:
        sentiment = lowerstrip(option)
        # Filter once and reuse the result for both the check and the table.
        comments = df[df['sentiment'] == sentiment]['comments']
        empty_message = f'There is no {sentiment} comments on this video.'

    if len(comments) > 0:
        st.table(comments)
    else:
        st.info(empty_message)
Пример #5
0
 def __init__(self, text, tokenizer=None,
              pos_tagger=None, np_extractor=None, analyzer=None,
              parser=None, classifier=None, clean_html=False):
     """Store ``text`` and hand the components to ``_initialize_models``.

     :raises TypeError: If ``text`` is not a string.
     :raises NotImplementedError: If ``clean_html`` is truthy.
     """
     if not isinstance(text, basestring):
         template = ('The `text` argument passed to `__init__(text)` '
                     'must be a string, not {0}')
         raise TypeError(template.format(type(text)))
     if clean_html:
         deprecation = ("clean_html has been deprecated. "
                        "To remove HTML markup, use BeautifulSoup's "
                        "get_text() function")
         raise NotImplementedError(deprecation)
     # Keep the unmodified input under both attribute names.
     self.raw = text
     self.string = text
     self.stripped = lowerstrip(self.raw, all=True)
     _initialize_models(
         self,
         tokenizer,
         pos_tagger,
         np_extractor,
         analyzer,
         parser,
         classifier,
     )
Пример #6
0
 def __init__(self, text, tokenizer=None,
              pos_tagger=None, np_extractor=None, analyzer=None,
              parser=None, classifier=None, clean_html=False):
     """Check the arguments, keep the text, and initialize the models.

     The text is stored as ``raw`` and ``string``; ``stripped`` holds the
     result of ``lowerstrip(..., all=True)`` on it.

     :raises TypeError: If ``text`` is not a string.
     :raises NotImplementedError: If ``clean_html`` is truthy.
     """
     text_is_string = isinstance(text, basestring)
     if not text_is_string:
         raise TypeError(
             'The `text` argument passed to `__init__(text)` '
             'must be a string, not {0}'.format(type(text))
         )
     if clean_html:
         raise NotImplementedError(
             "clean_html has been deprecated. "
             "To remove HTML markup, use BeautifulSoup's "
             "get_text() function"
         )
     self.raw = text
     self.string = text
     self.stripped = lowerstrip(self.raw, all=True)
     components = (tokenizer, pos_tagger, np_extractor, analyzer,
                   parser, classifier)
     _initialize_models(self, *components)
Пример #7
0
    def __init__(self, text, tokenizer=None,
                 pos_tagger=None,
                 np_extractor=None,
                 analyzer=None,
                 parser=None,
                 classifier=None, clean_html=False):
        """Validate ``text`` and set up the processing components.

        Every component left as ``None`` (except ``classifier``) is replaced
        by a default instance that shares this object's tokenizer.

        :param text: The string to wrap.
        :param tokenizer: Tokenizer; defaults to ``NLTKPunktTokenizer()``.
        :param pos_tagger: POS tagger; defaults to ``PatternTagger``.
        :param np_extractor: Noun-phrase extractor; defaults to
            ``PatternParserNPExtractor``.
        :param analyzer: Analyzer; defaults to ``PatternAnalyzer``.
        :param parser: Parser; defaults to ``PatternParser``.
        :param classifier: Optional classifier; stored as given.
        :param clean_html: Deprecated; any truthy value raises.
        :raises TypeError: If ``text`` is not a string.
        :raises NotImplementedError: If ``clean_html`` is truthy.
        """
        # Reject bad arguments up front so no default component is built
        # for an object that cannot be constructed anyway.
        if not isinstance(text, basestring):
            raise TypeError('The `text` argument passed to `__init__(text)` '
                            'must be a string, not {0}'.format(type(text)))
        if clean_html:
            raise NotImplementedError(
                "clean_html has been deprecated. "
                "To remove HTML markup, use BeautifulSoup's "
                "get_text() function")

        # The tokenizer is resolved first because the other defaults reuse it.
        if tokenizer is None:
            tokenizer = NLTKPunktTokenizer()
        self.tokenizer = tokenizer
        self.pos_tagger = (pos_tagger if pos_tagger is not None
                           else PatternTagger(tokenizer=self.tokenizer))
        self.np_extractor = (
            np_extractor if np_extractor is not None
            else PatternParserNPExtractor(tokenizer=self.tokenizer))
        self.analyzer = (analyzer if analyzer is not None
                         else PatternAnalyzer(tokenizer=self.tokenizer))
        self.parser = (parser if parser is not None
                       else PatternParser(tokenizer=self.tokenizer))
        # No default classifier exists; keep whatever was passed (maybe None).
        self.classifier = classifier

        self.raw = self.string = text
        self.stripped = lowerstrip(self.raw, all=True)
        _initialize_models(
            self,
            self.tokenizer,
            self.pos_tagger,
            self.np_extractor,
            self.analyzer,
            self.parser,
            self.classifier)
Пример #8
0
 def test_lowerstrip(self):
     """Check ``lowerstrip`` on the fixture text against the expected string."""
     actual = lowerstrip(self.text)
     expected = 'this. has. punctuation'
     assert_equal(actual, expected)
Пример #9
0
 def test_lowerstrip(self):
     """``lowerstrip(self.text)`` must equal the expected normalized string."""
     expected = 'this. has. punctuation'
     assert_equal(lowerstrip(self.text), expected)
Пример #10
0
import keys
from operator import itemgetter
from tweetlistener import TweetListener
import preprocessor as p
from textblob.utils import lowerstrip

# Authenticate with Twitter and create an API client.
auth = tweepy.OAuthHandler(keys.consumer_key, keys.consumer_secret)
auth.set_access_token(keys.access_token, keys.access_token_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

search = api.search(q='liverpool', count=100)

# Configure which token kinds the preprocessor handles in p.clean().
p.set_options(p.OPT.URL, p.OPT.RESERVED, p.OPT.NUMBER, p.OPT.SMILEY,
              p.OPT.EMOJI, p.OPT.MENTION, p.OPT.HASHTAG)

clean = []
for tweet in search:
    try:
        text = tweet.extended_tweet.text
    except AttributeError:
        # Fall back to the plain text when the extended form is unavailable.
        # Only AttributeError is caught, so unrelated failures and
        # KeyboardInterrupt are no longer silently swallowed.
        text = tweet.text
    clean.append(p.clean(text))

stripped_text = [lowerstrip(t) for t in clean]

for text in stripped_text:
    print(text)