Example #1
def sync_hashtag(self, name):
    set_root_logger()
    logger.info('Begin sync hashtag %s from remote.', name)
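    # Build the FTP URL of the crawler's JSON-lines (.jl) dump for this hashtag.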
    jl_ftp_path = os.path.join(settings.FTP_ROOT_URI,
                               settings.DUMP_DATA_PATH_ROOT,
                               settings.DUMP_DATA_PATH_HASHTAG,
                               '{}.jl'.format(name))
    try:
        with urllib.request.urlopen(jl_ftp_path) as resp:
            hashtag_info = json.load(TextIOWrapper(resp, 'utf-8'))
        logger.info('Retrieved hashtag info: %s', hashtag_info)
    except Exception:
        logger.error('Retrieve hashtag %s info data FAILED. EXIT. %s', name,
                     traceback.format_exc())
        return
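    # Upsert keyed on the hashtag name. The legacy pymongo Collection.update()
    # returns a dict whose 'updatedExisting' flag tells update from insert.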
    try:
        ret = mongodb_coll.update({"name": hashtag_info["name"]},
                                  hashtag_info,
                                  upsert=True)
        if ret['updatedExisting']:
            logger.info('Updated hashtag info to MongoDB: %s',
                        hashtag_info['name'])
        else:
            logger.info('Inserted hashtag info to MongoDB: %s',
                        hashtag_info['name'])
    except Exception:
        logger.error(
            'Save hashtag info to MongoDB FAILED. EXIT. %s. Error: %s',
            hashtag_info['name'], traceback.format_exc())
        return
    logger.info('Finished sync hashtag info from remote: %s', name)
    return
Example #2
def poll_instagram(self):
    set_root_logger()
    redis_cli = redis.StrictRedis(connection_pool=redis_pool)
    latest_update = redis_cli.zrange(REDIS_LATEST_UPDATE_KEY, 0, -1)
    logger.info('Retrieved latest update info: %s', latest_update)
    target_info = None
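    # Claim the first idle target with an optimistic lock: WATCH its
    # "updating" flag and let MULTI/EXEC fail if another worker sets it first.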
    with redis_cli.pipeline() as pipe:
        for t in latest_update:
            t = t.decode('utf8')
            k = REDIS_UPDATING_KEY.format(t)
            try:
                pipe.watch(k)
                is_updating = redis_cli.get(k)
                if is_updating:
                    logger.info('%s is updating. Checking next.', t)
                    # Release the watch before moving on; otherwise stale
                    # watches from skipped targets would trip a later EXEC.
                    pipe.unwatch()
                    continue
                pipe.multi()
                pipe.set(k, 1)
                pipe.execute()
                logger.info('%s is not updating. Begin polling web for it.', t)
                target_info = t
                break
            except redis.WatchError:
                logger.info('%s is updating(simultaneously). Checking next.',
                            t)
                continue

    if target_info is None:
        logger.info('No target found. EXIT polling.')
        return

    target, target_type = target_info.split('.')
    spider = spider_cls.get(target_type)
    if spider is None:
        logger.info('No spider found for %s. EXIT.', target_info)
        return

    logger.info('Start crawling %s.', target_info)
    configure_logging(install_root_handler=False)

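    # CrawlerProcess runs its own Twisted reactor; start() blocks until the
    # crawl finishes, so each task invocation performs exactly one crawl.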
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider,
                  target=target,
                  target_type=target_type,
                  target_info=target_info)
    process.start()
    logger.info('End crawling %s.', target_info)
    # runner = CrawlerRunner(get_project_settings())
    # d = runner.crawl(spider, target=target, target_type=target_type, target_info=target_info)
    # d.addBoth(lambda _: logger.info('End crawling %s.', target_info))
    # logger.info('Start crawling %s.', target_info)
    # if not reactor.running:
    #     logger.info('reactor is not running. Run it.')
    #     reactor.run()
    # else:
    #     logger.info('reactor is already running.')
    return
Example #3
def sync_publisher(self, username):
    set_root_logger()
    logger.info('Begin sync publisher %s from remote.', username)
    jl_ftp_path = os.path.join(settings.FTP_ROOT_URI,
                               settings.DUMP_DATA_PATH_ROOT,
                               settings.DUMP_DATA_PATH_PUBLISHER,
                               '{}.jl'.format(username))
    try:
        with urllib.request.urlopen(jl_ftp_path) as resp:
            publisher_info = json.load(TextIOWrapper(resp, 'utf-8'))
        logger.info('Retrieved publisher info: %s', publisher_info)
    except Exception:
        logger.error('Retrieve publisher %s info data FAILED. EXIT. %s',
                     username, traceback.format_exc())
        return
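    # Mirror the avatar from the FTP share into the local image root when the
    # dump marks it as updated.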
    try:
        download_avatar_info = publisher_info['downloaded_avatar_info'][0]
        avatar_updated = download_avatar_info['updated']
        if avatar_updated:
            avatar_ftp_path = os.path.join(settings.FTP_ROOT_URI,
                                           settings.IMAGE_PATH_ROOT,
                                           download_avatar_info['path'])
            avatar_local_path = os.path.join(settings.BASE_PATH,
                                             settings.IMAGE_PATH_ROOT,
                                             download_avatar_info['path'])
            urllib.request.urlretrieve(avatar_ftp_path, avatar_local_path)
            logger.info('Saved publisher avatar %s to %s',
                        download_avatar_info['path'], avatar_local_path)
    except Exception:
        logger.error('Retrieve publisher %s avatar FAILED. EXIT. %s', username,
                     traceback.format_exc())
        return
    try:
        ret = mongodb_coll.update({"_id": publisher_info["_id"]},
                                  publisher_info,
                                  upsert=True)
        if ret['updatedExisting']:
            logger.info('Updated publisher info to MongoDB: %s',
                        publisher_info['username'])
        else:
            logger.info('Inserted publisher info to MongoDB: %s',
                        publisher_info['username'])
    except Exception:
        logger.error(
            'Save publisher info to MongoDB FAILED. EXIT. %s. Error: %s',
            publisher_info['username'], traceback.format_exc())
        return
    logger.info('Finished sync publisher info from remote: %s', username)
    return
Example #4
def fetch_image(self, img_id):
    set_root_logger()
    logger.info('Begin sync graph image %s from remote.', img_id)
    jl_ftp_path = os.path.join(settings.FTP_ROOT_URI,
                               settings.DUMP_DATA_PATH_ROOT,
                               settings.DUMP_DATA_PATH_GRAPHIMAGE,
                               '{}.jl'.format(img_id))
    try:
        with urllib.request.urlopen(jl_ftp_path) as resp:
            img_info = json.load(TextIOWrapper(resp, 'utf-8'))
        logger.info('Retrieved graph image info for %s', img_id)
        logger.debug('Retrieved graph image info: %s', img_info)
    except Exception:
        logger.error('Retrieve graph image %s info data FAILED. EXIT. %s',
                     img_id, traceback.format_exc())
        return
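    # Download every image listed in the dump's results field, keeping the
    # same relative path under the local image root.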
    downloaded_img_info = img_info[settings.IMAGES_RESULT_FIELD]
    for img in downloaded_img_info:
        img_ftp_path = os.path.join(settings.FTP_ROOT_URI,
                                    settings.IMAGE_PATH_ROOT, img['path'])
        img_local_path = os.path.join(settings.BASE_PATH,
                                      settings.IMAGE_PATH_ROOT, img['path'])
        try:
            urllib.request.urlretrieve(img_ftp_path, img_local_path)
            logger.info('Saved graph image %s to %s', img['path'],
                        img_local_path)
        except Exception:
            logger.error('Retrieve graph image %s FAILED. EXIT. %s', img,
                         traceback.format_exc())
            return
    try:
        ret = mongodb_coll.update({"_id": img_info["_id"]},
                                  img_info,
                                  upsert=True)
        if ret['updatedExisting']:
            logger.info('Updated graph image info to MongoDB: %s',
                        img_info["_id"])
        else:
            logger.info('Inserted graph image info to MongoDB: %s',
                        img_info["_id"])
    except Exception:
        logger.error(
            'Save graph image info to MongoDB FAILED. EXIT. %s. Error: %s',
            img_info["_id"], traceback.format_exc())
        return
    logger.info('Finished sync graph image from remote: %s', img_id)
    return
Example #5
import redis

from celery import Celery
from celery.utils.log import get_task_logger
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

# from instagram.spiders.spider_publisher import PublisherSpider
from instagram.spiders import spider_cls
from utils.logger import set_root_logger

from task import settings

REDIS_UPDATING_KEY = 'updating:{}'
REDIS_LATEST_UPDATE_KEY = 'latest_update'
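# Members of 'latest_update' look like '<target>.<type>'; while a poll for one
# is in flight, its 'updating:<target>.<type>' flag key is set.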

app = Celery('instagram')
app.config_from_object('task.config')

set_root_logger()
logger = get_task_logger('poll_instagram')

redis_pool = redis.ConnectionPool(
    host=settings.REDIS_HOST,
    port=settings.REDIS_PORT,
    db=settings.REDIS_DB,
    password=settings.REDIS_PASSWORD,
)


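# bind=True makes the task instance available as 'self' (useful for self.retry).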
@app.task(name='poll_instagram', bind=True)
def poll_instagram(self):
    set_root_logger()
    redis_cli = redis.StrictRedis(connection_pool=redis_pool)
    latest_update = redis_cli.zrange(REDIS_LATEST_UPDATE_KEY, 0, -1)