Example #1
    def test_text_preprocessing(self):
        text = "This is a sample text. # ! . "
        analyser = TextAnalyser()
        processed = analyser._preprocess(text)
        expected = ('This is a sample text. # ! . ',
                    ['sampl', 'text'],
                    [('sampl', 1), ('text', 1)])
        self.assertEqual(expected, processed)
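The expected tuple implies that _preprocess returns the raw text unchanged, a stopword-free list of Porter-stemmed tokens, and per-token frequency pairs. A minimal sketch of a method with that contract, assuming NLTK's English stopword list and PorterStemmer (the actual implementation is not shown in this listing):

import re
from collections import Counter

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def _preprocess(self, text):
    # Hypothetical reconstruction: lowercase, keep alphabetic runs,
    # drop English stopwords, Porter-stem, then count occurrences.
    words = re.findall(r"[a-z]+", text.lower())
    stop = set(stopwords.words('english'))
    tokens = [PorterStemmer().stem(w) for w in words if w not in stop]
    # Returns (raw text, stemmed tokens, (token, count) pairs).
    return text, tokens, sorted(Counter(tokens).items())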
Example #2
    def test_tokenization(self):
        expected, sample_docs, objects = get_test_documents()
        calculated = {}
        analyser = TextAnalyser()
        # Build the result dict keyed by the stringified document id.
        for doc_id, s in enumerate(sample_docs):
            calculated[str(doc_id)] = analyser.add_document(s)

        self.assertEqual(expected, calculated)
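The crawler in Example #3 reads 'raw', 'tokens' and 'word_frequencies' out of the dict that add_document returns, so the method plausibly wraps _preprocess along these lines (a sketch; the key names are taken from Example #3, everything else is an assumption):

def add_document(self, text):
    # Hypothetical wrapper: run the preprocessing pipeline and
    # expose its three results under the keys used in Example #3.
    raw, tokens, frequencies = self._preprocess(text)
    return {'raw': raw,
            'tokens': tokens,
            'word_frequencies': frequencies}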
Example #3
    def crawl(self, only_english=False):
        '''
        Performs the actual crawling over the configured date window.
        '''
        text_analyser = TextAnalyser(ngram=1, only_english=only_english)
        exception_log = []
        kw = otter.loadrc()  # load api key
        count = 0
        while self.maxtime != self.to_date:
            for page in range(PAGE_SIZE):
                try:
                    search = otter.Resource('search', **kw)
                    search(q=self.keywords,
                           mintime=time.mktime(self.from_date.timetuple()),
                           maxtime=time.mktime(self.maxtime.timetuple()),
                           type='tweet',
                           perpage=100,
                           page=page + 1)
                    for item in search.response.list:
                        print "--------------------------------------------------------------------------"
                        print "Storing tweet #", count, "for the period", self.from_date, "until", self.maxtime
                        tt = self.type()
                        tt.url = item.url
                        analysed = text_analyser.add_document(item.content)
                        # Skip very short tweets (three tokens or fewer).
                        if len(analysed['tokens']) <= 3:
                            print "Ignoring this tweet"
                            continue
                        content = Content()
                        content.raw = analysed['raw']
                        content.tokens = analysed['tokens']
                        content.construct_word_freq_list(
                            analysed['word_frequencies'])
                        content.date = self.from_date
                        tt.content = content
                        tt.date = self.from_date
                        tt.retweet_count = item.trackback_total
                        tt.screen_name = item.trackback_author_nick
                        tt.author_screen_name = item.trackback_author_nick
                        tt.author_name = item.trackback_author_name
                        tt.save(safe=True)

                        authors = Author.objects(
                            screen_name=item.trackback_author_nick)
                        if authors.count() == 0:
                            author = Author()
                            author.screen_name = item.trackback_author_nick
                        else:
                            author = authors.get()
                        author.tweets.append(tt)
                        author.save()

                        count += 1
                except Exception, e:
                    # Log the failure and move on to the next page.
                    print e
                    exception_log.append(e)
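Note that nothing in this excerpt advances self.maxtime, so the while condition never changes; the full class presumably slides the window between passes. One hedged way to do that, assuming a fixed-size step (the attribute names come from the example, the one-day step is an assumption):

import datetime

def _advance_window(self):
    # Hypothetical helper: slide the crawl window forward by one
    # day, clamping at the configured end date.
    self.from_date = self.maxtime
    self.maxtime = min(self.maxtime + datetime.timedelta(days=1),
                       self.to_date)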
Example #4
    def test_unicode_doc_translation(self):
        expected, document = get_unicode_document()
        analyser = TextAnalyser()
        document = analyser.add_document(document)
        self.assertEqual(expected, document["raw"])
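Only the 'raw' field is asserted on, which suggests add_document normalises Unicode input before storing it. A possible folding step, assuming NFKD decomposition to ASCII (the real transformation is defined by get_unicode_document, which is not shown here):

import unicodedata

def to_ascii(text):
    # Fold accented characters to their base form and drop
    # anything without an ASCII equivalent.
    folded = unicodedata.normalize('NFKD', text)
    return folded.encode('ascii', 'ignore').decode('ascii')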
Example #5
'''
Created on 1 Feb 2012

@author: george
'''
import unittest
from analysis.social import TwitterSocialAnalyser
from analysis.text import TextAnalyser
from collections import OrderedDict

tweet_with_RT = "RT @monaeltahawy: RT @Gheblawi Beyond belief: religious history &amp; make-up of #Egypt interesting discussion #Copts http://www.bbc.co.uk/podcasts/series/belief"
tweet_with_VIA = "Breaking News - Messi spotted outside the Etihad #transferdeadlineday http://twitpic.com/8dwcum (via @AndrewBloch )"
not_a_retweet = "This is not a retweet #test"
tweet_with_almost_RT = "RT Beyond belief: religious history &amp; make-up of #Egypt interesting discussion #Copts http://www.bbc.co.uk/podcasts/series/belief"
tweets = [tweet_with_RT, tweet_with_VIA, not_a_retweet, tweet_with_almost_RT]

t = TextAnalyser()
dataset = OrderedDict()
for doc_id, tweet in enumerate(tweets):
    dataset[doc_id] = t.add_document(tweet)


class Test(unittest.TestCase):
    def test_retweet_filter(self):
        tsa = TwitterSocialAnalyser(dataset)
        result = tsa.filter_retweets()
        expected = []
        expected.append((0, t.add_document(tweet_with_RT)))
        expected.append((1, t.add_document(tweet_with_VIA)))
        self.assertEqual(expected, result)
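The fixture distinguishes "RT @user", "(via @user)" and a bare "RT" with no handle, so filter_retweets presumably keeps only documents whose text pairs a retweet marker with an @-mention. A sketch of such a filter (the regex and the return shape are assumptions, not the library's actual code):

import re

RETWEET_RE = re.compile(r'\b(?:RT|via)\s+@\w+', re.IGNORECASE)

def filter_retweets(dataset):
    # Keep (id, document) pairs whose raw text contains a
    # retweet marker followed by an @-mention.
    return [(doc_id, doc) for doc_id, doc in dataset.items()
            if RETWEET_RE.search(doc['raw'])]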