def main():
    """Entry point: set up the HTTP session and the database session.

    The spider-run and ebook-generation steps are not implemented yet
    (placeholder comments below). Returns True on completion.
    """
    # HTTP session (requests), reusing saved cookies or logging in fresh.
    session = load_session_or_login()

    # Database setup: engine plus an ORM session for add/commit of sql_item.
    db_engine = db_connect()
    # NOTE(review): the result of this call is discarded, and passing an
    # engine back into create_engine looks wrong — confirm against db_connect.
    create_engine(db_engine)
    Session = sessionmaker(bind=db_engine)
    db_session = Session()

    # TODO: run actual spider code non-blockingly
    # TODO: generate ebooks and push to subscribers' kindle
    return True
def __init__(self):
    """Initialize spider state: issue/article bookkeeping, event loop, session."""
    # Issue links, e.g. ('http://magazine.caixin.com/2012/cw533/', ...).
    self.old_issues = set()
    self.new_issues = set()
    # Whether to crawl the historical archive as well.
    # NOTE(review): original comment said "By default it won't go back to
    # 1998", which contradicts this default of True — confirm intent.
    self.fetch_old_articles = True
    # Mapping of {date: [link, ...], ...} for discovered articles.
    self.articles = {}
    # Final flat set of article links to crawl: {link, ...}.
    self.articles_to_fetch = set()
    # Date of the latest issue, used when generating the RSS feed.
    self.latest_issue_date = None
    # Event loop for aiohttp, and an authenticated HTTP session.
    # NOTE(review): asyncio.get_event_loop() is deprecated outside a running
    # loop on modern Python — consider asyncio.new_event_loop().
    self.loop = asyncio.get_event_loop()
    self.session = load_session_or_login()