def test_tokenization(self):
    expected, sample_docs, objects = get_test_documents()
    calculated = {}
    analyser = TextAnalyser()
    # Analyse every sample document and key the result by its position.
    for doc_id, s in enumerate(sample_docs):
        calculated[str(doc_id)] = analyser.add_document(s)
    self.assertEqual(expected, calculated)
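# A minimal sketch of the fixture contract the tokenization test assumes:
# get_test_documents() returns the expected analyses keyed by stringified
# document position, the raw sample documents, and any backing objects. The
# sample values and the exact word_frequencies format are hypothetical
# illustrations, not the real TextAnalyser output; only the 'raw', 'tokens'
# and 'word_frequencies' keys are taken from the code that consumes them.
def get_test_documents_sketch():
    sample_docs = ["cats like cats", "dogs chase cats"]
    expected = {
        "0": {"raw": "cats like cats",
              "tokens": ["cats", "like", "cats"],
              "word_frequencies": {"cats": 2, "like": 1}},
        "1": {"raw": "dogs chase cats",
              "tokens": ["dogs", "chase", "cats"],
              "word_frequencies": {"dogs": 1, "chase": 1, "cats": 1}},
    }
    objects = []
    return expected, sample_docs, objects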
def crawl(self, only_english=False):
    '''Performs the actual crawling.'''
    text_analyser = TextAnalyser(ngram=1, only_english=only_english)
    exception_log = []
    kw = otter.loadrc()  # load the Otter API key
    count = 0
    # Walk the [from_date, maxtime] search window until it reaches to_date.
    while self.maxtime != self.to_date:
        for page in range(PAGE_SIZE):
            try:
                search = otter.Resource('search', **kw)
                # Example query: q='#jan25 OR #egypt OR #tahrir'
                search(q=self.keywords,
                       mintime=time.mktime(self.from_date.timetuple()),
                       maxtime=time.mktime(self.maxtime.timetuple()),
                       type='tweet', perpage=100, page=page + 1)
                for item in search.response.list:
                    print "-" * 74
                    print "Storing tweet #", count, "for the period", \
                        self.from_date, "until", self.maxtime
                    tt = self.type()
                    tt.url = item.url
                    analysed = text_analyser.add_document(item.content)
                    # If this tweet is really small, just ignore it.
                    if len(analysed['tokens']) <= 3:
                        print "Ignoring this tweet"
                        continue
                    content = Content()
                    content.raw = analysed['raw']
                    content.tokens = analysed['tokens']
                    content.construct_word_freq_list(analysed['word_frequencies'])
                    content.date = self.from_date
                    tt.content = content
                    tt.date = self.from_date
                    tt.retweet_count = item.trackback_total
                    tt.screen_name = item.trackback_author_nick
                    tt.author_screen_name = item.trackback_author_nick
                    tt.author_name = item.trackback_author_name
                    tt.save(safe=True)
                    # Create the author on first sight, otherwise reuse it.
                    if len(Author.objects(screen_name=item.trackback_author_nick)) == 0:
                        author = Author()
                        author.screen_name = item.trackback_author_nick
                    else:
                        author = Author.objects(
                            screen_name=item.trackback_author_nick).get()
                    author.tweets.append(tt)
                    author.save()
                    count += 1
            except Exception as e:
                print e
                exception_log.append(e)
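# A minimal sketch of how this crawler is assumed to be driven, based on the
# attributes crawl() reads (keywords, from_date, maxtime, to_date, type). The
# TwitterCrawler name, the Tweet document class, and the one-hour window are
# hypothetical stand-ins; the real construction may differ.
from datetime import datetime, timedelta

if __name__ == '__main__':
    crawler = TwitterCrawler()                      # hypothetical subclass
    crawler.keywords = '#jan25 OR #egypt OR #tahrir'
    crawler.from_date = datetime(2011, 1, 25)       # start of first window
    crawler.maxtime = crawler.from_date + timedelta(hours=1)
    crawler.to_date = datetime(2011, 1, 26)         # crawling stops here
    crawler.type = Tweet                            # document class to store
    crawler.crawl(only_english=True)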
def test_unicode_doc_translation(self):
    expected, document = get_unicode_document()
    analyser = TextAnalyser()
    analysed = analyser.add_document(document)
    self.assertEqual(expected, analysed["raw"])
import unittest
from collections import OrderedDict

from analysis.social import TwitterSocialAnalyser
from analysis.text import TextAnalyser

tweet_with_RT = "RT @monaeltahawy: RT @Gheblawi Beyond belief: religious history & make-up of #Egypt interesting discussion #Copts http://www.bbc.co.uk/podcasts/series/belief"
tweet_with_VIA = "Breaking News - Messi spotted outside the Etihad #transferdeadlineday http://twitpic.com/8dwcum (via @AndrewBloch )"
not_a_retweet = "This is not a retweet #test"
tweet_with_almost_RT = "RT Beyond belief: religious history & make-up of #Egypt interesting discussion #Copts http://www.bbc.co.uk/podcasts/series/belief"

tweets = [tweet_with_RT, tweet_with_VIA, not_a_retweet, tweet_with_almost_RT]

# Build an ordered dataset of analysed tweets keyed by their position.
t = TextAnalyser()
dataset = OrderedDict()
for doc_id, tweet in enumerate(tweets):
    dataset[doc_id] = t.add_document(tweet)


class Test(unittest.TestCase):

    def test_retweet_filter(self):
        tsa = TwitterSocialAnalyser(dataset)
        result = tsa.filter_retweets()
        # Only the RT and via tweets should survive; a bare "RT" with no
        # mention does not count as a retweet.
        expected = [(0, t.add_document(tweet_with_RT)),
                    (1, t.add_document(tweet_with_VIA))]
        self.assertEqual(result, OrderedDict(expected))

    def test_mention_filter(self):
        pass
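# A minimal sketch of the retweet-detection rule the test above encodes: a
# document counts as a retweet when its raw text contains "RT @user" or
# "via @user"; a bare "RT" with no mention (tweet_with_almost_RT) does not.
# This illustrates the assumed contract of filter_retweets, not the actual
# TwitterSocialAnalyser implementation.
import re
from collections import OrderedDict

RETWEET_PATTERN = re.compile(r'\bRT @\w+|\bvia @\w+', re.IGNORECASE)

def filter_retweets_sketch(dataset):
    """Keep only the (id, document) pairs whose raw text looks like a retweet."""
    return OrderedDict(
        (doc_id, doc) for doc_id, doc in dataset.items()
        if RETWEET_PATTERN.search(doc['raw'])
    )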