def __init__(self, profile):
    """Initialise the client from a credential profile.

    ``profile`` must provide ``"secret_access_key"`` and ``"access_id"``.
    A non-empty ``"bucket"`` entry selects the bucket; otherwise the
    bucket is ``"cmdp"``.  Both credentials are also stored on the
    ``config`` object (``ACCESS_KEY`` / ``SECRET_KEY``) after being
    passed through ``safestr``.  Finally the token expiry is set to
    ``now() + 3600`` and ``init_tokent()`` is called.
    """
    secret_key = profile["secret_access_key"]
    self.secret_access_key = secret_key
    access_id = profile["access_id"]
    self.access_id = access_id

    config.ACCESS_KEY = safestr(access_id)
    config.SECRET_KEY = safestr(secret_key)

    # Fall back to the default bucket when none (or an empty one) is given.
    bucket = "cmdp"
    if "bucket" in profile:
        configured = profile["bucket"]
        if configured:
            bucket = configured
    self.bucket = bucket

    self.token_expires = 3600 + now()
    self.init_tokent()
def process_kaice(db, insert_data, data):
    """Prepare a "kaice" record for insertion.

    ``insert_data["insert_time"]`` is always stamped with the current local
    time first.  The record is then rejected (``None`` is returned) when

    * ``db.get_one`` already finds a row with the same ``guid``,
    * ``data["kaice_type"]`` is falsy or its ``.value`` (through
      ``safestr``) is not ``"网页游戏"``, or
    * ``insert_data["game_name"]`` is ``None``, ``"None"`` or ``""``.

    Otherwise any ``"今日"`` inside ``insert_data["test_date"]`` (when that
    key is present) is replaced by today's ``YYYY-MM-DD`` date and the
    mutated ``insert_data`` is returned.
    """
    insert_data["insert_time"] = str(time.strftime("%Y-%m-%d %H:%M:%S"))

    # Skip rows whose guid is already stored.
    if db.get_one(where={"guid": insert_data["guid"]}):
        return None

    kaice_type = data["kaice_type"]
    if not kaice_type or safestr(kaice_type.value) != "网页游戏":
        return None

    if "test_date" in insert_data:
        today = str(time.strftime("%Y-%m-%d"))
        raw_date = safestr(insert_data["test_date"])
        insert_data["test_date"] = raw_date.replace("今日", today)

    game_name = insert_data["game_name"]
    if game_name is None or game_name in ("None", ""):
        return None
    return insert_data
def specialFilter(self):
    """Strip every pattern listed in ``self.filters`` from ``self.content``.

    Each filter is turned into a regular expression by replacing the
    ``(*)`` placeholder with ``(.+)?``.  The pattern is converted with
    ``safeunicode`` when the content is ``unicode`` and with ``safestr``
    otherwise, then every case-insensitive match is removed from
    ``self.content``.  Filters are applied in order, each one seeing the
    result of the previous one.
    """
    if len(self.filters) == 0:
        return

    for raw_rule in self.filters:
        pattern_src = raw_rule.replace('(*)', '(.+)?')
        # Keep the pattern's string type in line with the content's type.
        convert = safeunicode if isinstance(self.content, unicode) else safestr
        matcher = re.compile(convert(pattern_src), re.I)
        self.content = matcher.sub("", self.content)
def getItemGUID(self, data):
    """Return the hexadecimal MD5 digest used as the GUID of ``data``.

    What gets hashed depends on ``self.guid_rule``:

    * a ``list`` of field ids -- every id is resolved with
      ``get_field_from_cache``; for each resolved field with a non-empty
      name and a truthy ``data[name]`` the value is appended (in list
      order) after passing through ``safestr``.  A plain ``str``/``unicode``
      value is used directly, otherwise the object's ``.value`` is used
      when it has a truthy ``"value"`` entry;
    * a single ``str``/``unicode`` key -- ``data[key]`` is hashed;
    * anything else -- the empty string is hashed.

    Raises ``KeyError`` when a referenced field is missing from ``data``.
    """
    guid_rule = self.guid_rule
    s = ""
    if isinstance(guid_rule, list):
        parts = []
        for field_id in guid_rule:
            field = get_field_from_cache(field_id)
            if not field:
                continue
            field_name = field["name"]
            if not field_name:
                continue
            item = data[field_name]
            if not item:
                continue
            # Plain strings are tested first.  The original tested
            # ``isinstance(unicode) and isinstance(str)``, which can never be
            # true, so string fields were silently left out of the GUID (or
            # raised AttributeError when the text contained "value").
            if isinstance(item, (str, unicode)):
                parts.append(safestr(item))
            elif "value" in item and item.value:
                parts.append(safestr(item.value))
        s = "".join(parts)
    elif isinstance(guid_rule, (str, unicode)):
        s = data[guid_rule]
        # Normalise text the same way the list branch does; presumably
        # safestr encodes unicode to a byte string, which md5 needs for
        # non-ASCII input -- confirm against safestr's definition.
        if isinstance(s, (str, unicode)):
            s = safestr(s)
    return md5(s).hexdigest()
def process_kaifu(db, insert_data, data):
    """Prepare a "kaifu" record for insertion.

    ``insert_data["insert_time"]`` is always stamped with the current local
    time first.  ``None`` is returned when ``db.get_one`` already finds a
    row with the same ``guid`` or when ``insert_data["game_name"]`` is
    ``None``, ``"None"`` or ``""``.

    Otherwise ``insert_data["test_date"]`` is normalised: ``"今日"`` is
    replaced by today's ``MM月DD日`` and the result is parsed with each
    format in the module-level ``date_rule`` until one matches.  On a match
    the field is rewritten as ``str(datetime(year, month, day, hour))``;
    a parsed year of 1900 (``strptime``'s default when the format has no
    year) is replaced by the current year.  When no format matches the
    field is left untouched.

    ``data`` is accepted for signature parity with the other processors
    and is not used.  The mutated ``insert_data`` is returned.
    """
    insert_data["insert_time"] = str(time.strftime("%Y-%m-%d %H:%M:%S"))

    # Skip rows whose guid is already stored.
    if db.get_one(where={"guid": insert_data["guid"]}):
        return None

    today = datetime.datetime.today()
    today_string = "%.2d月%.2d日" % (today.month, today.day)
    test_date = safestr(insert_data['test_date']).replace("今日", today_string)

    for rule in date_rule:
        try:
            parsed = time.strptime(safestr(test_date), rule)
            year = parsed[0]
            if year == 1900:
                year = today.year
            # Minutes and seconds are intentionally not carried over.
            moment = datetime.datetime(year, parsed[1], parsed[2], parsed[3])
        except (ValueError, TypeError):
            # ValueError: the text does not match this format (or is not a
            # valid date); TypeError: a malformed rule.  Try the next rule.
            # A bare ``except`` here used to swallow KeyboardInterrupt too.
            continue
        insert_data["test_date"] = str(moment)
        break

    game_name = insert_data["game_name"]
    if game_name is None or game_name == "None" or game_name == "":
        return None
    return insert_data
def fetchListPages(self, listtype="html"): print "Start to fetch and parse List" urls = self.listRule.getListUrls() for url in urls: print "Fetching list page: ", url, "charset:", safestr(self.seed["charset"]), "timeout:", safestr(self.seed["timeout"]) f = Fetch(url, charset = self.seed["charset"], timeout = self.seed["timeout"]) if f.isReady(): doc = f.read() if listtype == "html": self.parseListPage(f, doc, url) elif listtype == "json": self.parseJsonPage(f, doc, url) print "List has finished parsing. It has %s docs." % ansicolor.red(self.__len__())
def getElementData(obj, rule, images=None, fetch_all=0):
    """Extract a value from ``obj`` according to ``rule``.

    ``obj`` may be an existing ``pq`` object or an HTML page; anything that
    is not already a ``pq`` instance is wrapped with ``pq(obj)``.  When a
    list is passed as ``images``, image URLs met while resolving an
    ``attr(...)`` step are appended to it.

    Two rule syntaxes are supported:

    1. DOM selector -- dot-separated, similar to jQuery:
       1.1 URL of a link:                a.attr("href")
       1.2 text content of a tag:        div[id="content"].text()
       1.3 content of one child element: li.eq(1).text()  # text of the 2nd li
    2. Regular expression -- mark the wanted part with ``[arg]``; everything
       else may be filled with ``(*)``.

    A rule is treated as a DOM selector when it contains a dot and no
    ``[arg]``; every other rule is treated as a regular expression.

    With ``fetch_all == 1`` an ``attr(...)`` step returns the list of truthy
    attribute values of all selected elements instead of a single value.

    Returns ``None`` when nothing could be extracted.
    """
    if not isinstance(obj, pq):
        obj = pq(obj)

    steps = rule.split(".")
    # "[arg]" marks a regular expression even when the rule contains dots
    # (for example a URL).  The original sent such rules to neither branch,
    # and its guard ``rule.find('[arg]')`` -- truthy for -1, falsy for 0 --
    # skipped rules that *start* with "[arg]".
    if len(steps) > 1 and "[arg]" not in rule:
        return _get_element_by_selector(obj, steps, images, fetch_all)
    return _get_element_by_regex(obj, rule)


def _get_element_by_selector(obj, steps, images, fetch_all):
    """Resolve a DOM rule, already split on dots into ``steps``, against ``obj``."""
    # The first segment is always the selector itself; parentheses are
    # removed from it.
    selector = steps[0].replace("(", "").replace(")", "")
    actions = steps[1:]
    selected = obj.find(selector)

    for step in actions:
        m = attrParrent.match(step)
        if not m:
            continue
        action, v = m.groups()
        if v:
            v = v.encode("utf-8")
            # Drop the quotes around the argument.
            v = v.strip("\'").strip('\"')

        if action == "attr" and hasattr(selected, "attr") and v:
            if fetch_all == 1:
                values = []
                for i in range(len(selected)):
                    vv = selected.eq(i).attr(v)
                    if vv:
                        values.append(vv)
                        # ``images`` defaults to None; only collect into it
                        # when the caller supplied a list.
                        if images is not None and is_image(vv):
                            images.append(vv)
                return values
            value = selected.attr(v)
            if selected and selected[0].tag == "img" and v == "src" and images is not None:
                images.append(value)
            return value
        elif action == "eq" and hasattr(selected, "eq"):
            if len(actions) > 1:
                selected = selected.eq(int(v))
                sub_rules = step.split(" ")
                if len(sub_rules) > 1:
                    # eq() may be followed by a descendant selector,
                    # e.g. "eq(1) a".
                    selected = selected.find(" ".join(sub_rules[1:]))
            else:
                # eq() is the only step: return the element itself.
                return selected.eq(int(v))
        elif action == "text" and hasattr(selected, "text"):
            return safeunicode(selected.text()).strip()
        elif action == "html" and hasattr(selected, "html"):
            return safeunicode(selected.html()).strip()
    return None


def _get_element_by_regex(obj, rule):
    """Return the first capture group of the regex ``rule`` in ``obj``'s HTML, then its text."""
    content = obj.html()
    content_text = obj.text()

    pattern_src = rule.replace('[arg]', '(.+)?').replace('(*)', '.+?')
    # Keep the pattern's string type in line with the content's type.
    if isinstance(content, unicode):
        pattern_src = safeunicode(pattern_src)
    else:
        pattern_src = safestr(pattern_src)
    pattern = re.compile(pattern_src, re.MULTILINE | re.UNICODE)

    try:
        for haystack in (content, content_text):
            found = pattern.search(haystack)
            if found is not None:
                return safeunicode(found.group(1)).strip()
    except (TypeError, IndexError, UnicodeError):
        # Best effort: e.g. the HTML is None (TypeError), the rule has no
        # capture group (IndexError) or the text cannot be converted.
        return None
    return None