# -*- Encoding: utf-8 -*- import re import os import json from log4f import debug_logger log = debug_logger('log/parser', 'crawler.parser') class Indexing: def __init__(self, ptn, filename): self.ptn = ptn self.filename = filename self.data = dict() if os.path.exists(filename): with open(self.filename, 'r') as fr: self.data = json.load(fr) def save(self): with open(self.filename, 'wb') as fw: json.dump(self.data, fw, indent=4) def scan(self, path, save_period=2000): total = set(os.listdir(path)) todo = total - set(self.data.keys()) print '{}/{} to parse.'.format(len(todo), len(total)) for i, filename in enumerate(todo): with open(os.path.join(path, filename)) as f:
# NOTE(review): the line above is the start of a Python 2 module (it uses the
# `print` statement) whose line breaks have been lost, so the interpreter
# reads it as one long comment and defines nothing.  Read as code it contains:
#   * `log`, obtained from debug_logger('log/parser', 'crawler.parser').
#   * class Indexing:
#       - __init__(ptn, filename) stores both arguments, starts with an empty
#         dict in self.data and, when `filename` exists, replaces it with the
#         JSON document loaded from that file.
#       - save() dumps self.data to self.filename as JSON with indent=4; the
#         file is opened in 'wb' mode.
#       - scan(path, save_period=2000) lists `path`, drops the names that are
#         already keys of self.data, prints "<todo>/<total> to parse." and
#         starts opening each remaining file.  The method is cut off at that
#         point in this view, so how `ptn` and `save_period` are used cannot
#         be seen here.
# -*- Encoding: utf-8 -*- """builk requests for some kinds of web pages """ import re import os import shutil import redis from req import request, request_pages from log4f import debug_logger log = debug_logger('log/download', 'download') def get_title(content): """demo validator of builk_single""" m = re.compile(r'<title>(.*?)</title>').search(content) if m is None: return 'No Title' else: return m.group(1) def builk_single(job, url_ptn, cache_dir, find_new=None): """builk download. one single corresponding page for one ID. usually, it is used for profile page of a user/book/shop etc. """ key = job.next() print 'downloading...'
# NOTE(review): the line above is the start of a Python 2 download module
# whose line breaks have been lost, so the interpreter reads it as one long
# comment.  Read as code it contains:
#   * a module docstring, "builk requests for some kinds of web pages"
#     ("builk" presumably means "bulk"; it is also part of the public name
#     builk_single, so the spelling cannot be changed casually).
#   * `log`, obtained from debug_logger('log/download', 'download').
#   * get_title(content): searches `content` for r'<title>(.*?)</title>' and
#     returns the captured text, or 'No Title' when nothing matches.  Its
#     docstring describes it as a demo validator for builk_single.
#   * builk_single(job, url_ptn, cache_dir, find_new=None): per its docstring
#     it downloads one page per ID (e.g. a profile page).  The visible code
#     takes a key with job.next() and prints 'downloading...'; the rest of the
#     function lies beyond this view, so the roles of url_ptn, cache_dir and
#     find_new cannot be confirmed here.
# -*- Encoding: utf-8 -*- import json import redis from os.path import join, dirname from wechat_sdk import WechatExt # from wechat_sdk.exceptions import NeedLoginError from log4f import debug_logger import settings LOGIN_TIMEOUT = 4 * 3600 # 4 hours r = redis.StrictRedis(**settings.REDIS_CONN) log = debug_logger(join(dirname(__file__), 'log/notify'), 'root.notify') def login(username, password): d = r.get(username) if d: log.info('lazy login. use cookie, username={}'.format(username)) return WechatExt(username, password, login=False, **json.loads(d)) else: print username, password wechat = WechatExt(username, password, login=False) wechat.login() log.info('login to wechat server. username={}'.format(username)) r.setex(username, LOGIN_TIMEOUT, json.dumps(wechat.get_token_cookies(), indent=4)) return wechat def init_info():
# NOTE(review): the line above is the start of a Python 2 notification module
# whose line breaks have been lost, so the interpreter reads it as one long
# comment.  Read as code it contains:
#   * LOGIN_TIMEOUT = 4 * 3600 (commented in the source as 4 hours), a Redis
#     client `r` built from settings.REDIS_CONN, and a logger `log`.
#   * login(username, password): looks up the Redis key `username`.  When a
#     value is found it is parsed as JSON and passed as keyword arguments to
#     WechatExt(..., login=False) without logging in again.  Otherwise a new
#     WechatExt is created, login() is called, and the result of
#     get_token_cookies() is stored as JSON under `username` via setex with
#     LOGIN_TIMEOUT.
#   * NOTE(review): on the fresh-login path the code runs
#     `print username, password`, i.e. it writes the plaintext password to
#     stdout.  That looks like leftover debugging and should be removed.
#   * init_info(): only the `def` header is visible; the body lies beyond
#     this view.
# -*- Encoding: utf-8 -*- import redis from wechat import send from os.path import dirname, join from log4f import debug_logger BASE_DIR = dirname(__file__) log = debug_logger(join(BASE_DIR, 'log/download'), 'root.download') class JobPool: def __init__(self, job_name, host='localhost', port=6379, db=0, timeout=10): self.timeout = timeout self.db = redis.StrictRedis(host, port, db) self.total_tbl = '{}:total'.format(job_name) self.todo_tbl = '{}:todo'.format(job_name) self.name = job_name def init_data(self, total, done): self.db.delete(self.total_tbl) self.db.delete(self.todo_tbl) todo = set(total) - set(done) self.db.sadd(self.total_tbl, *total)
# NOTE(review): the line above is the start of a job-pool module whose line
# breaks have been lost, so the interpreter reads it as one long comment.
# Read as code it contains:
#   * BASE_DIR (directory of this file) and a logger `log`.
#   * class JobPool:
#       - __init__(job_name, host='localhost', port=6379, db=0, timeout=10)
#         stores `timeout` and `job_name`, opens redis.StrictRedis(host, port,
#         db) as self.db and derives two key names, '<job_name>:total' and
#         '<job_name>:todo'.
#       - init_data(total, done) deletes both keys, computes
#         todo = set(total) - set(done) and adds every item of `total` to the
#         total key with sadd.
#   * NOTE(review): `todo` is computed but not used before the text ends, so
#     init_data appears to be cut off in this view -- confirm against the full
#     file before changing it.
#   * NOTE(review): sadd is called with *total; with an empty `total` it
#     receives no members -- confirm the Redis client accepts that.
# -*- Encoding: utf-8 -*-
"""WeChat login helper that reuses token cookies cached in Redis."""
import json
import redis
from os.path import join, dirname
from wechat_sdk import WechatExt
# from wechat_sdk.exceptions import NeedLoginError
from log4f import debug_logger
import settings

LOGIN_TIMEOUT = 4 * 3600  # 4 hours

r = redis.StrictRedis(**settings.REDIS_CONN)
log = debug_logger(join(dirname(__file__), 'log/notify'), 'root.notify')


def login(username, password):
    """Return a WechatExt client for *username*, logging in only when needed.

    The token cookies of a successful login are stored in Redis as JSON under
    the key *username* and expire after LOGIN_TIMEOUT seconds.  While that
    entry exists, the client is rebuilt from the cached values and no login
    request is made.

    :param username: account name; also used as the Redis cache key.
    :param password: account password, passed to WechatExt.
    :returns: a WechatExt instance.
    """
    d = r.get(username)
    if d:
        # Cache hit: the stored JSON object is expanded into keyword
        # arguments for WechatExt, so no login round-trip is required.
        log.info('lazy login. use cookie, username={}'.format(username))
        return WechatExt(username, password, login=False, **json.loads(d))

    # Cache miss: perform a real login.  The credentials are deliberately not
    # printed or logged here -- the password must never reach stdout or logs.
    wechat = WechatExt(username, password, login=False)
    wechat.login()
    log.info('login to wechat server. username={}'.format(username))
    # Remember the fresh token cookies so later calls can skip the login.
    r.setex(username, LOGIN_TIMEOUT,
            json.dumps(wechat.get_token_cookies(), indent=4))
    return wechat
# -*- Encoding: utf-8 -*-
"""Request throttling based on an expiring lock key in Redis."""
import functools
import re
import redis
import socket
from httplib2 import Http
import time
import random
from os.path import dirname, join
from log4f import debug_logger
import settings

BASE_DIR = dirname(__file__)
log = debug_logger(join(BASE_DIR, 'log/request'), 'root.request')
r = redis.StrictRedis(**settings.REDIS_CONN)


def wait(f):
    """Decorate *f* so that each call first honours the 'http-lock' key.

    Before calling *f* the wrapper reads the remaining time-to-live of the
    Redis key 'http-lock' and sleeps for that many seconds if it is positive.
    It then re-arms the key with a new random expiry drawn from
    [settings.DELAY_BOTTOM, settings.DELAY_TOP] (truncated to whole seconds)
    and finally calls *f* with the original arguments.

    :param f: the callable to throttle.
    :returns: a wrapper with the same call signature and metadata as *f*.
    """
    lock_name = 'http-lock'

    @functools.wraps(f)  # keep f's __name__/__doc__ on the wrapper
    def _wrap_func(*args, **kwargs):
        t = r.ttl(lock_name)
        # ttl() may report None or a negative number when the key is missing
        # or has no expiry, depending on client/server version; only a
        # positive value means there is still time to wait.
        if t is not None and t > 0:
            time.sleep(t)
        n_t = int(random.uniform(settings.DELAY_BOTTOM, settings.DELAY_TOP))
        # Redis rejects SETEX with a non-positive expiry, so a delay that
        # truncates to 0 seconds simply means "no lock this time".
        if n_t > 0:
            r.setex(lock_name, n_t, 'locking')
        return f(*args, **kwargs)

    return _wrap_func
# -*- Encoding: utf-8 -*- import redis from wechat import send from os.path import dirname, join from log4f import debug_logger BASE_DIR = dirname(__file__) log = debug_logger(join(BASE_DIR, 'log/download'), 'root.download') class JobPool: def __init__(self, job_name, host='localhost', port=6379, db=0, timeout=10): self.timeout = timeout self.db = redis.StrictRedis(host, port, db) self.total_tbl = '{}:total'.format(job_name) self.todo_tbl = '{}:todo'.format(job_name) self.name = job_name def init_data(self, total, done): self.db.delete(self.total_tbl) self.db.delete(self.todo_tbl) todo = set(total) - set(done) self.db.sadd(self.total_tbl, *total) self.db.rpush(self.todo_tbl, *todo) def count_todo(self):
# NOTE(review): the line above is the start of a job-pool module whose line
# breaks have been lost, so the interpreter reads it as one long comment.
# Read as code it contains:
#   * BASE_DIR (directory of this file) and a logger `log`.
#   * class JobPool:
#       - __init__(job_name, host='localhost', port=6379, db=0, timeout=10)
#         stores `timeout` and `job_name`, opens redis.StrictRedis(host, port,
#         db) as self.db and derives two key names, '<job_name>:total' and
#         '<job_name>:todo'.
#       - init_data(total, done) deletes both keys, adds every item of
#         `total` to the total key with sadd, and pushes the items of
#         set(total) - set(done) onto the todo key with rpush.  Because the
#         difference is taken through sets, the order in which todo items are
#         pushed is not the order of `total`.
#       - count_todo(): only the `def` header is visible; the body lies
#         beyond this view.
#   * NOTE(review): sadd/rpush are called with *total / *todo; when `total`
#     is empty or nothing is left to do they receive no values -- confirm the
#     Redis client accepts that.
# -*- Encoding: utf-8 -*- import re import socket from httplib2 import Http import time import random from log4f import debug_logger log = debug_logger('log/request', 'root.request') _last_req = None def delay(bottom=2, top=7): global _last_req if _last_req is None: _last_req = time.time() return 0 period = max(0, _last_req+random.uniform(bottom, top)-time.time()) log.debug('...wait {:.2f} sec'.format(period)) time.sleep(period) _last_req = time.time() return period def wait(f): def _wrap_func(*args, **kwargs): delay()
# NOTE(review): the line above is the start of a request module whose line
# breaks have been lost, so the interpreter reads it as one long comment.
# Read as code it contains:
#   * a logger `log` and a module-level timestamp `_last_req`, initially None.
#   * delay(bottom=2, top=7): on the first call it records time.time() in
#     _last_req and returns 0 without sleeping.  On later calls it draws a
#     random gap from uniform(bottom, top), sleeps for whatever part of that
#     gap has not yet elapsed since _last_req (never less than 0), logs the
#     wait, refreshes _last_req and returns the number of seconds slept.
#   * wait(f): defines an inner _wrap_func(*args, **kwargs) that starts by
#     calling delay() with its default bounds.  The rest of the wrapper lies
#     beyond this view.
# -*- Encoding: utf-8 -*- import json import time from os.path import join, dirname, exists from os import makedirs import tornado.ioloop import tornado.web from tornado.options import define, options from wechat_sdk import WechatBasic, WechatExt from wechat_sdk.exceptions import NeedLoginError from log4f import debug_logger log = debug_logger(join(dirname(__file__), 'log'), 'root') define("username", default='username', help="username of wechat", type=str) define("password", default='password', help="password of wechat", type=str) define("token", default='', help="token of wechat", type=str) define("port", default=8000, help="run on the given port", type=int) define("debug", default=False, help="run in Debug mode", type=bool) today = lambda: time.strftime('%Y%m%d', time.localtime()) cookie_dir = join(dirname(__file__), 'cookie') def login_http(username, password): wechat = WechatExt(username, password) wechat.login()
# NOTE(review): the line above is the start of a tornado-based module whose
# line breaks have been lost, so the interpreter reads it as one long comment.
# Read as code it contains:
#   * a logger `log` created by debug_logger(<this dir>/log, 'root').
#   * five tornado options: username (default 'username'), password (default
#     'password'), token (default ''), port (default 8000) and debug (default
#     False).
#   * today(): returns the current local date formatted as '%Y%m%d'.
#   * cookie_dir: the path 'cookie' next to this file.
#   * login_http(username, password): builds WechatExt(username, password)
#     and calls its login().  The rest of the function lies beyond this view,
#     so its return value cannot be seen here.