Example #1
    def test_text_preprocessing(self):
        text = "This is a sample text. # ! . "
        analyser = TextAnalyser()
        processed = analyser._preprocess(text)
        expected = ('This is a sample text. # ! . ',
                    ['sampl', 'text'],
                    [('sampl', 1), ('text', 1)])
        self.assertEqual(expected, processed)
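The expected tuple implies that _preprocess returns the raw text unchanged, a stopword-free list of Porter-stemmed tokens, and per-token frequency pairs. A minimal sketch of a method with that contract, assuming NLTK's English stopword list and PorterStemmer (the actual implementation is not shown in this listing):

import re
from collections import Counter

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def _preprocess(self, text):
    # Hypothetical reconstruction: lowercase, keep alphabetic runs,
    # drop English stopwords, Porter-stem, then count occurrences.
    words = re.findall(r"[a-z]+", text.lower())
    stop = set(stopwords.words('english'))
    tokens = [PorterStemmer().stem(w) for w in words if w not in stop]
    # Returns (raw text, stemmed tokens, (token, count) pairs).
    return text, tokens, sorted(Counter(tokens).items())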
Example #2
    def test_tokenization(self):
        expected, sample_docs, objects = get_test_documents()
        calculated = {}
        analyser = TextAnalyser()
        # Build the result dict keyed by the stringified document id.
        for doc_id, s in enumerate(sample_docs):
            calculated[str(doc_id)] = analyser.add_document(s)

        self.assertEqual(expected, calculated)
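The crawler in Example #3 reads 'raw', 'tokens' and 'word_frequencies' out of the dict that add_document returns, so the method plausibly wraps _preprocess along these lines (a sketch; the key names are taken from Example #3, everything else is an assumption):

def add_document(self, text):
    # Hypothetical wrapper: run the preprocessing pipeline and
    # expose its three results under the keys used in Example #3.
    raw, tokens, frequencies = self._preprocess(text)
    return {'raw': raw,
            'tokens': tokens,
            'word_frequencies': frequencies}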
Example #3
    def crawl(self, only_english=False):
        '''
        Performs the actual crawling over the configured date window.
        '''
        text_analyser = TextAnalyser(ngram=1, only_english=only_english)
        exception_log = []
        kw = otter.loadrc()  # load api key
        count = 0
        while self.maxtime != self.to_date:
            for page in range(PAGE_SIZE):
                try:
                    search = otter.Resource('search', **kw)
                    search(q=self.keywords,
                           mintime=time.mktime(self.from_date.timetuple()),
                           maxtime=time.mktime(self.maxtime.timetuple()),
                           type='tweet',
                           perpage=100,
                           page=page + 1)
                    for item in search.response.list:
                        print "--------------------------------------------------------------------------"
                        print "Storing tweet #", count, "for the period", self.from_date, "until", self.maxtime
                        tt = self.type()
                        tt.url = item.url
                        analysed = text_analyser.add_document(item.content)
                        # Skip very short tweets (three tokens or fewer).
                        if len(analysed['tokens']) <= 3:
                            print "Ignoring this tweet"
                            continue
                        content = Content()
                        content.raw = analysed['raw']
                        content.tokens = analysed['tokens']
                        content.construct_word_freq_list(
                            analysed['word_frequencies'])
                        content.date = self.from_date
                        tt.content = content
                        tt.date = self.from_date
                        tt.retweet_count = item.trackback_total
                        tt.screen_name = item.trackback_author_nick
                        tt.author_screen_name = item.trackback_author_nick
                        tt.author_name = item.trackback_author_name
                        tt.save(safe=True)

                        authors = Author.objects(
                            screen_name=item.trackback_author_nick)
                        if authors.count() == 0:
                            author = Author()
                            author.screen_name = item.trackback_author_nick
                        else:
                            author = authors.get()
                        author.tweets.append(tt)
                        author.save()

                        count += 1
                except Exception, e:
                    # Log the failure and move on to the next page.
                    print e
                    exception_log.append(e)
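Note that nothing in this excerpt advances self.maxtime, so the while condition never changes; the full class presumably slides the window between passes. One hedged way to do that, assuming a fixed-size step (the attribute names come from the example, the one-day step is an assumption):

import datetime

def _advance_window(self):
    # Hypothetical helper: slide the crawl window forward by one
    # day, clamping at the configured end date.
    self.from_date = self.maxtime
    self.maxtime = min(self.maxtime + datetime.timedelta(days=1),
                       self.to_date)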
Example #4
    def test_unicode_doc_translation(self):
        expected, document = get_unicode_document()
        analyser = TextAnalyser()
        document = analyser.add_document(document)
        self.assertEqual(expected, document["raw"])
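Only the 'raw' field is asserted on, which suggests add_document normalises Unicode input before storing it. A possible folding step, assuming NFKD decomposition to ASCII (the real transformation is defined by get_unicode_document, which is not shown here):

import unicodedata

def to_ascii(text):
    # Fold accented characters to their base form and drop
    # anything without an ASCII equivalent.
    folded = unicodedata.normalize('NFKD', text)
    return folded.encode('ascii', 'ignore').decode('ascii')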
Example #5
'''
Created on 1 Feb 2012

@author: george
'''
import unittest
from analysis.social import TwitterSocialAnalyser
from analysis.text import TextAnalyser
from collections import OrderedDict

tweet_with_RT = "RT @monaeltahawy: RT @Gheblawi Beyond belief: religious history &amp; make-up of #Egypt interesting discussion #Copts http://www.bbc.co.uk/podcasts/series/belief"
tweet_with_VIA = "Breaking News - Messi spotted outside the Etihad #transferdeadlineday http://twitpic.com/8dwcum (via @AndrewBloch )"
not_a_retweet = "This is not a retweet #test"
tweet_with_almost_RT = "RT Beyond belief: religious history &amp; make-up of #Egypt interesting discussion #Copts http://www.bbc.co.uk/podcasts/series/belief"
tweets = [tweet_with_RT, tweet_with_VIA, not_a_retweet, tweet_with_almost_RT]

t = TextAnalyser()
dataset = OrderedDict()
for doc_id, tweet in enumerate(tweets):
    dataset[doc_id] = t.add_document(tweet)


class Test(unittest.TestCase):
    def test_retweet_filter(self):
        tsa = TwitterSocialAnalyser(dataset)
        result = tsa.filter_retweets()
        expected = []
        expected.append((0, t.add_document(tweet_with_RT)))
        expected.append((1, t.add_document(tweet_with_VIA)))
        self.assertEqual(expected, result)
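The fixture distinguishes "RT @user", "(via @user)" and a bare "RT" with no handle, so filter_retweets presumably keeps only documents whose text pairs a retweet marker with an @-mention. A sketch of such a filter (the regex and the return shape are assumptions, not the library's actual code):

import re

RETWEET_RE = re.compile(r'\b(?:RT|via)\s+@\w+', re.IGNORECASE)

def filter_retweets(dataset):
    # Keep (id, document) pairs whose raw text contains a
    # retweet marker followed by an @-mention.
    return [(doc_id, doc) for doc_id, doc in dataset.items()
            if RETWEET_RE.search(doc['raw'])]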