'''
Created on 22 Jan 2012

@author: george
'''
import datetime

from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import TwoGroupsTweet
from mongoengine import *

# Crawl Topsy for UK/USA-related tweets over a two-day window and store
# them as TwoGroupsTweet documents.
f = CrawlerFactory()
t = f.get_crawler("topsy")
# NOTE(fix): the original query used a lowercase "or" between the UK and
# USA term groups; search query operators must be uppercase, so the two
# halves of the query were not actually OR-ed together.
search_hashtags = "uk OR #uk OR #UK OR #usa OR #USA OR #US OR usa OR us"
t.search_for(search_hashtags)
# NOTE(fix): day literals were written with a leading zero (01, 23 -> 01, 25
# in the original) -- a leading-zero integer is an octal literal in Python 2
# (harmless for 01, but e.g. 08 would crash) and a SyntaxError in Python 3.
t.search_between(from_date=datetime.datetime(2011, 1, 23, 0, 0, 0),
                 to_date=datetime.datetime(2011, 1, 25, 0, 0, 0),
                 granularity_days=1,
                 granularity_hours=0,
                 granularity_mins=0)
t.retrieve_items_of_type(TwoGroupsTweet)
t.crawl()
def test_construction_of_twitter_crawlers(self):
    """Verify the factory builds a usable Twitter crawler.

    Constructs the crawler, logs in, and performs a simple profile
    lookup to exercise the crawler end to end.
    """
    factory = CrawlerFactory()
    t = factory.get_crawler("twitter")
    t.login()
    info = t.getUserInfoByScreenName("GeorgeEracleous")
    # NOTE(fix): the original test fetched the profile but asserted
    # nothing, so it could never fail on a bad response -- `info` was a
    # dead variable.
    self.assertTrue(info is not None)
''' Created on 22 Jan 2012 @author: george ''' import datetime from crawlers.CrawlerFactory import CrawlerFactory from database.model.tweets import * from database.model.agents import * from mongoengine import * import tools.utils from urlparse import urlparse from database.warehouse import WarehouseServer f = CrawlerFactory() twitter = f.get_crawler("twitter") #twitter.login() ws = WarehouseServer() from_date = datetime.datetime(2011, 1, 25, 0, 0, 0) to_date = datetime.datetime(2011, 1, 26, 0, 00, 0) items = ws.get_documents_by_date(from_date, to_date, limit=100) screen_names = [] for tweet in items: screen_names.append(tweet.author_screen_name) screen_names = set(screen_names) print len(screen_names) # A terrible hack to save the screen_names of users which are mentioned in tweets # but they are not yet in the database. They'll be considered after all authors have #been stored. mentions_of_not_stored_users = []
'''
Created on 22 Jan 2012

@author: george
'''
from database.model.agents import TrainingAuthor
from crawlers.CrawlerFactory import CrawlerFactory

# Build the Scrapy-based crawler, point it at the training-author user
# type, and persist everything it collects.
factory = CrawlerFactory()
scrapy_crawler = factory.get_crawler("scrapy")
scrapy_crawler.setup(user_type=TrainingAuthor)
scrapy_crawler.crawl(store=True)