def sync_hashtag(self, name):
    set_root_logger()
    logger.info('Begin sync hashtag %s from remote.', name)
    # Build the FTP path of the dumped hashtag .jl file on the remote host.
    jl_ftp_path = os.path.join(settings.FTP_ROOT_URI, settings.DUMP_DATA_PATH_ROOT,
                               settings.DUMP_DATA_PATH_HASHTAG, '{}.jl'.format(name))
    try:
        with urllib.request.urlopen(jl_ftp_path) as resp:
            hashtag_info = json.load(TextIOWrapper(resp, 'utf-8'))
        logger.info('Retrieved hashtag info: %s', hashtag_info)
    except Exception:
        logger.error('Retrieve hashtag %s info data FAILED. EXIT. %s',
                     name, traceback.format_exc())
        return
    try:
        # Upsert by hashtag name so repeated syncs overwrite the existing document.
        ret = mongodb_coll.update({"name": hashtag_info["name"]}, hashtag_info, upsert=True)
        if ret['updatedExisting']:
            logger.info('Updated hashtag info to MongoDB: %s', hashtag_info['name'])
        else:
            logger.info('Inserted hashtag info to MongoDB: %s', hashtag_info['name'])
    except Exception:
        logger.error('Save hashtag info to MongoDB FAILED. EXIT. %s. Error: %s',
                     hashtag_info['name'], traceback.format_exc())
        return
    logger.info('Finished sync hashtag info from remote: %s', name)
    return

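# Usage sketch (assumption: sync_hashtag is registered as a bound Celery task elsewhere,
# e.g. with @app.task(name='sync_hashtag', bind=True); the decorator is not part of this
# section). A caller would then enqueue a one-off sync for a hashtag with something like:
#
#     sync_hashtag.apply_async(args=('travel',))   # 'travel' is a hypothetical hashtag
#     # or synchronously, mainly for debugging:
#     sync_hashtag.apply(args=('travel',))
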
@app.task(name='poll_instagram', bind=True)
def poll_instagram(self):
    set_root_logger()
    redis_cli = redis.StrictRedis(connection_pool=redis_pool)
    latest_update = redis_cli.zrange(REDIS_LATEST_UPDATE_KEY, 0, -1)
    logger.info('Retrieved latest update info: %s', latest_update)
    target_info = None
    with redis_cli.pipeline() as pipe:
        for t in latest_update:
            t = t.decode('utf8')
            k = REDIS_UPDATING_KEY.format(t)
            try:
                # WATCH the "updating" flag so the SET below fails if another worker
                # claims the same target at the same time.
                pipe.watch(k)
                is_updating = redis_cli.get(k)
                if is_updating:
                    logger.info('%s is updating. Checking next.', t)
                    continue
                pipe.multi()
                pipe.set(k, 1)
                pipe.execute()
                logger.info('%s is not updating. Begin polling web for it.', t)
                target_info = t
                break
            except redis.WatchError:
                logger.info('%s is updating(simultaneously). Checking next.', t)
                continue
    if target_info is None:
        logger.info('No target found. EXIT polling.')
        return
    target, target_type = target_info.split('.')
    spider = spider_cls.get(target_type)
    if spider is None:
        logger.info('No spider found for %s. EXIT.', target_info)
        return
    logger.info('Start crawling %s.', target_info)
    configure_logging(install_root_handler=False)
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider, target=target, target_type=target_type, target_info=target_info)
    process.start()
    logger.info('End crawling %s.', target_info)
    # runner = CrawlerRunner(get_project_settings())
    # d = runner.crawl(spider, target=target, target_type=target_type, target_info=target_info)
    # d.addBoth(lambda _: logger.info('End crawling %s.', target_info))
    # logger.info('Start crawling %s.', target_info)
    # if not reactor.running:
    #     logger.info('reactor is not running. Run it.')
    #     reactor.run()
    # else:
    #     logger.info('reactor is already running.')
    return

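# Scheduling sketch (assumption: poll_instagram is meant to run periodically; the real
# schedule would live in the task.config module loaded via app.config_from_object below
# and is not shown in this section). A minimal Celery beat entry could look like:
#
#     CELERYBEAT_SCHEDULE = {
#         'poll-instagram': {
#             'task': 'poll_instagram',
#             'schedule': 60.0,  # hypothetical interval in seconds
#         },
#     }
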
def sync_publisher(self, username):
    set_root_logger()
    logger.info('Begin sync publisher %s from remote.', username)
    jl_ftp_path = os.path.join(settings.FTP_ROOT_URI, settings.DUMP_DATA_PATH_ROOT,
                               settings.DUMP_DATA_PATH_PUBLISHER, '{}.jl'.format(username))
    try:
        with urllib.request.urlopen(jl_ftp_path) as resp:
            publisher_info = json.load(TextIOWrapper(resp, 'utf-8'))
        logger.info('Retrieved publisher info: %s', publisher_info)
    except Exception:
        logger.error('Retrieve publisher %s info data FAILED. EXIT. %s',
                     username, traceback.format_exc())
        return
    try:
        # Mirror the avatar image from the FTP host only when the dump marks it as updated.
        download_avatar_info = publisher_info['downloaded_avatar_info'][0]
        avatar_updated = download_avatar_info['updated']
        if avatar_updated:
            avatar_ftp_path = os.path.join(settings.FTP_ROOT_URI, settings.IMAGE_PATH_ROOT,
                                           download_avatar_info['path'])
            avatar_local_path = os.path.join(settings.BASE_PATH, settings.IMAGE_PATH_ROOT,
                                             download_avatar_info['path'])
            urllib.request.urlretrieve(avatar_ftp_path, avatar_local_path)
            logger.info('Saved publisher avatar %s to %s',
                        download_avatar_info['path'], avatar_local_path)
    except Exception:
        logger.error('Retrieve publisher %s avatar FAILED. EXIT. %s',
                     username, traceback.format_exc())
        return
    try:
        ret = mongodb_coll.update({"_id": publisher_info["_id"]}, publisher_info, upsert=True)
        if ret['updatedExisting']:
            logger.info('Updated publisher info to MongoDB: %s', publisher_info['username'])
        else:
            logger.info('Inserted publisher info to MongoDB: %s', publisher_info['username'])
    except Exception:
        logger.error('Save publisher info to MongoDB FAILED. EXIT. %s. Error: %s',
                     publisher_info['username'], traceback.format_exc())
        return
    logger.info('Finished sync publisher info from remote: %s', username)
    return

def fetch_image(self, img_id):
    set_root_logger()
    logger.info('Begin sync graph image %s from remote.', img_id)
    jl_ftp_path = os.path.join(settings.FTP_ROOT_URI, settings.DUMP_DATA_PATH_ROOT,
                               settings.DUMP_DATA_PATH_GRAPHIMAGE, '{}.jl'.format(img_id))
    try:
        with urllib.request.urlopen(jl_ftp_path) as resp:
            img_info = json.load(TextIOWrapper(resp, 'utf-8'))
        logger.info('Retrieved graph image info for %s', img_id)
        logger.debug('Retrieved graph image info: %s', img_info)
    except Exception:
        logger.error('Retrieve graph image %s info data FAILED. EXIT. %s',
                     img_id, traceback.format_exc())
        return
    # Mirror every downloaded image file listed in the dump to the local image root.
    downloaded_img_info = img_info[settings.IMAGES_RESULT_FIELD]
    for img in downloaded_img_info:
        img_ftp_path = os.path.join(settings.FTP_ROOT_URI, settings.IMAGE_PATH_ROOT, img['path'])
        img_local_path = os.path.join(settings.BASE_PATH, settings.IMAGE_PATH_ROOT, img['path'])
        try:
            urllib.request.urlretrieve(img_ftp_path, img_local_path)
            logger.info('Saved graph image %s to %s', img['path'], img_local_path)
        except Exception:
            logger.error('Retrieve graph image %s FAILED. EXIT. %s',
                         img, traceback.format_exc())
            return
    try:
        ret = mongodb_coll.update({"_id": img_info["_id"]}, img_info, upsert=True)
        if ret['updatedExisting']:
            logger.info('Updated graph image info to MongoDB: %s', img_info["_id"])
        else:
            logger.info('Inserted graph image info to MongoDB: %s', img_info["_id"])
    except Exception:
        logger.error('Save graph image info to MongoDB FAILED. EXIT. %s. Error: %s',
                     img_info["_id"], traceback.format_exc())
        return
    logger.info('Finished sync graph image from remote: %s', img_id)
    return

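# Data-shape note (assumption: the dump was produced by Scrapy's ImagesPipeline, which
# stores its results in the item field named by the IMAGES_RESULT_FIELD setting). Each
# entry in downloaded_img_info is then expected to look roughly like
#
#     {'url': '<source url>', 'path': 'full/<sha1 hash>.jpg', 'checksum': '<md5 hash>'}
#
# and only the 'path' key is used above to mirror the file from the FTP host.
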
from twisted.internet import reactor
# from instagram.spiders.spider_publisher import PublisherSpider
from instagram.spiders import spider_cls
from utils.logger import set_root_logger
from task import settings

REDIS_UPDATING_KEY = 'updating:{}'
REDIS_LATEST_UPDATE_KEY = 'latest_update'

# Celery application; task-level settings are loaded from the task.config module.
app = Celery('instagram')
app.config_from_object('task.config')

set_root_logger()
logger = get_task_logger('poll_instagram')

# Shared connection pool for the Redis instance that tracks update targets and locks.
redis_pool = redis.ConnectionPool(
    host=settings.REDIS_HOST,
    port=settings.REDIS_PORT,
    db=settings.REDIS_DB,
    password=settings.REDIS_PASSWORD,
)
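
# Run sketch (assumptions: this module is the one defining the Celery app above, and its
# import path is something like 'task.tasks'; the actual path is not shown in this
# section). Workers and the beat scheduler would then be started with the standard CLI:
#
#     celery -A task.tasks worker -l info
#     celery -A task.tasks beat -l info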