# Start the Redis server (runs in the foreground; run it in its own
# terminal or under a service manager).
redis-server
# Start ZooKeeper (required before the Kafka broker).
# Quote $KAFKA_HOME and chain with && so the server is never started from
# the wrong directory if the cd fails.
cd "$KAFKA_HOME" && sudo ./bin/zookeeper-server-start.sh ./config/zookeeper.properties
# Start the Kafka broker (after ZooKeeper is up).
# Quote $KAFKA_HOME and use && so a failed cd aborts instead of starting
# the broker from the current directory.
cd "$KAFKA_HOME" && sudo ./bin/kafka-server-start.sh ./config/server.properties
# Start the kafka-monitor consumer loop.
# NOTE(review): unified on the underscore spelling kafka_monitor.py, which is
# what the `feed` example elsewhere in this file uses — confirm the actual
# filename in the repo.
cd "$JAY_CLUSTER_HOME/kafka-monitor" && python kafka_monitor.py run
# Start the redis-monitor.
# NOTE(review): the original cd'd into kafka-monitor but ran redis-monitor.py;
# in the standard scrapy-cluster layout the redis monitor lives in its own
# redis-monitor directory as redis_monitor.py — verify against this repo.
cd "$JAY_CLUSTER_HOME/redis-monitor" && python redis_monitor.py
# Run a spider. <logfile_name.log> and <spider_name> are PLACEHOLDERS —
# replace them before running; literal angle brackets would be interpreted
# by the shell as redirections.
cd "$JAY_CLUSTER_HOME/crawler" && scrapy crawl --logfile=<logfile_name.log> <spider_name>
# Submit a crawl request for the finishline spider via the kafka monitor.
# The single-quoted JSON payload is passed verbatim (no shell expansion).
cd "$JAY_CLUSTER_HOME/kafka-monitor" && python kafka_monitor.py feed '{ "url": "http://www.finishline.com/store/shop/men/shoes/training/_/N-33ida?categoryId=cat301585&mnid=men_shoes_training", "appid":"testapp", "crawlid":"abc123", "spiderid":"finishline", "callback":"parse"}'
# Submit an item-update crawl request for the amazon spider.
# NOTE(review): renamed kafka-monitor.py to kafka_monitor.py to match the
# finishline feed example above — confirm the actual filename in the repo.
cd "$JAY_CLUSTER_HOME/kafka-monitor" && python kafka_monitor.py feed '{"url":"http://www.amazon.com/gp/product/B00GR4KBKC/ref=twister_dp_update?ie=UTF8&psc=1", "appid":"testapp", "crawlid":"abc123", "spiderid":"amazon", "callback":"parse_item_update","attrs":{"asin":"B00GR4KBKC"}}'
# Batch-feed URLs from urls.txt for the amazon5 spider.
# NOTE(review): single-dash long options (-appid=...) only work if
# kafkafeed.py parses them itself; argparse would expect --appid=... — verify.
cd "$JAY_CLUSTER_HOME/kafka-monitor" && python kafkafeed.py -appid=testapp -crawlid=testcrawlid -spiderid=amazon5 -urlsfile=urls.txt -fullurl=true
## 3.4 获取某个任务的信息 ##
TODO（待补充）
- 相关检查工具

4.1 查看抓取结果数据（jay.crawled_firehose）
# Tail the crawled-results topic to verify spiders are producing output.
cd "$JAY_CLUSTER_HOME/kafka-monitor" && python kafkadump.py dump jay.crawled_firehose --host=127.0.0.1:9092
4.2 查看出站结果数据（jay.outbound_firehose）
# Tail the outbound topic to verify the redis monitor is producing responses.
cd "$JAY_CLUSTER_HOME/kafka-monitor" && python kafkadump.py dump jay.outbound_firehose --host=127.0.0.1:9092
4.3 把图片数据写入 MongoDB（jay.crawled_firehose_images）
# Persist messages from the images firehose topic into MongoDB.
cd "$JAY_CLUSTER_HOME/kafka-monitor" && python dump_to_mongodb.py dump jay.crawled_firehose_images --host=127.0.0.1:9092
4.4 分发图片下载任务
# Dispatch image-download jobs to aria2, driven by the crawled firehose topic.
cd "$JAY_CLUSTER_HOME/kafka-monitor" && python aria2_dispatch.py --topic=jay.crawled_firehose --host=127.0.0.1:9092 --s=settings_aria2_dispatch.py
- 注意事项
  删除 scraper_schema.json 中 spiderid 的默认值（即删除其中默认的 enum 和 default 字段）