def crawl(self, only_english=False): ''' Performs the actual crawling. ''' text_analyser = TextAnalyser(ngram=1, only_english=only_english) exception_log = [] kw = otter.loadrc() # load api key count = 0 while self.maxtime != self.to_date: for page in range(PAGE_SIZE): try: search = otter.Resource('search', **kw) #search(q='#jan25 OR #egypt OR #tahrir', mintime = time.mktime(mintime.timetuple()), maxtime = time.mktime(maxtime.timetuple()), type='tweet', offset=page*10) search(q=self.keywords, mintime = time.mktime(self.from_date.timetuple()), maxtime = time.mktime(self.maxtime.timetuple()), type='tweet', perpage=100, page=page+1) for item in search.response.list: print "--------------------------------------------------------------------------" print "Storing tweet #",count, "for the period",self.from_date,"until",self.maxtime tt = self.type() tt.url = item.url analysed = text_analyser.add_document(item.content) #if this tweet is really small just ignore it. if len(analysed['tokens']) <= 3: print"Ignoring this tweet" continue content = Content() content.raw = analysed['raw'] content.tokens = analysed['tokens'] content.construct_word_freq_list(analysed['word_frequencies']) content.date = self.from_date tt.content = content tt.date = self.from_date tt.retweet_count = item.trackback_total tt.screen_name = item.trackback_author_nick tt.author_screen_name = item.trackback_author_nick tt.author_name = item.trackback_author_name tt.save(safe=True) if len(Author.objects(screen_name=item.trackback_author_nick)) == 0: author = Author() author.screen_name = item.trackback_author_nick author.tweets.append(tt) else: author = Author.objects(screen_name=item.trackback_author_nick).get() author.tweets.append(tt) author.save() count += 1 except Exception, e: print e exception_log.append(e) finally: pass
def clone_document(self, document):
    """Build and return an unsaved EvaluationTweet mirroring *document*.

    The copy is shallow: the raw text, token list and word-frequency data
    are shared with the source document's content. Nothing is persisted;
    the caller decides if and when to save the clone.
    """
    clone = EvaluationTweet()

    # Top-level tweet fields.
    clone.url = document.url
    clone.date = document.date
    clone.retweet_count = document.retweet_count
    clone.author_screen_name = document.author_screen_name
    clone.author_name = document.author_name

    # Mirror the embedded content document.
    payload = Content()
    payload.raw = document.content.raw
    payload.tokens = document.content.tokens
    payload.word_frequencies = document.content.word_frequencies
    payload.date = document.date
    clone.content = payload

    return clone
def crawl(self, only_english=False):
    """Perform the actual crawling.

    NOTE(review): this method is functionally identical to the crawl()
    defined earlier in this file; if both live in the same class this later
    definition shadows the earlier one -- confirm which copy is intended.
    """
    text_analyser = TextAnalyser(ngram=1, only_english=only_english)
    exception_log = []  # collected errors; built up but never returned or read here
    kw = otter.loadrc()  # load api key
    count = 0
    # NOTE(review): neither self.maxtime nor self.from_date changes inside the
    # loop body, so this condition never becomes false from within -- TODO confirm.
    while self.maxtime != self.to_date:
        for page in range(PAGE_SIZE):
            try:
                search = otter.Resource('search', **kw)
                #search(q='#jan25 OR #egypt OR #tahrir', mintime = time.mktime(mintime.timetuple()), maxtime = time.mktime(maxtime.timetuple()), type='tweet', offset=page*10)
                # Query the otter search API for the configured keywords and
                # time window; pages are 1-based, 100 results per page.
                search(q=self.keywords,
                       mintime=time.mktime(self.from_date.timetuple()),
                       maxtime=time.mktime(self.maxtime.timetuple()),
                       type='tweet',
                       perpage=100,
                       page=page + 1)
                for item in search.response.list:
                    print "--------------------------------------------------------------------------"
                    print "Storing tweet #", count, "for the period", self.from_date, "until", self.maxtime
                    tt = self.type()  # document class to instantiate, supplied by the subclass
                    tt.url = item.url
                    analysed = text_analyser.add_document(item.content)
                    #if this tweet is really small just ignore it.
                    if len(analysed['tokens']) <= 3:
                        print "Ignoring this tweet"
                        continue
                    # Build the embedded content document from the analysis result.
                    content = Content()
                    content.raw = analysed['raw']
                    content.tokens = analysed['tokens']
                    content.construct_word_freq_list(
                        analysed['word_frequencies'])
                    content.date = self.from_date
                    tt.content = content
                    tt.date = self.from_date
                    tt.retweet_count = item.trackback_total
                    tt.screen_name = item.trackback_author_nick
                    tt.author_screen_name = item.trackback_author_nick
                    tt.author_name = item.trackback_author_name
                    tt.save(safe=True)
                    # First-time author: create a new document; otherwise fetch
                    # the single existing match and append this tweet to it.
                    if len(
                            Author.objects(screen_name=item.
                                           trackback_author_nick)) == 0:
                        author = Author()
                        author.screen_name = item.trackback_author_nick
                        author.tweets.append(tt)
                    else:
                        author = Author.objects(
                            screen_name=item.trackback_author_nick).get()
                        author.tweets.append(tt)
                    author.save()
                    count += 1
            except Exception, e:
                # Best-effort: log the failure and continue with the next page.
                print e
                exception_log.append(e)
            finally:
                pass