def select_from_xpath(self, xpath, item_rule, item_body, response, item_dict):
    '''
    Extract the value of one item field according to an XPath rule.

    Args:
        xpath: XPath expression applied to ``item_body``.
        item_rule: extraction rule for the field; its ``XMLPathSelectType``
            and ``XMLPathType`` entries are read here.
        item_body: selector over the HTML to extract from; it must provide
            ``select`` (presumably a Scrapy selector -- confirm with caller).
        response: HTTP response; only ``response.encoding`` is used, to
            decode date/time text.
        item_dict: dict-shaped item; ``image_urls`` is written into it when
            the rule type is ``InnerTextWithPic``.

    Returns:
        The extracted data. For the inner-text rule types this is the text
        of the matched nodes with blank lines dropped and every line
        stripped; for ``InnerDateTime`` that text is passed through
        ``Utils.format_datetime``; for any other rule type it is the
        extracted nodes joined with single spaces.
    '''
    item_dom = item_body.select(xpath)
    if item_rule['XMLPathSelectType'] == ConfConstants.XMLPathSelectType.OnlyOne:
        # Keep only the first match (slicing is safe on an empty result).
        item_dom = item_dom[:1]

    path_type = item_rule['XMLPathType']
    inner_text_types = (
        ConfConstants.XMLPathType.InnerText,
        ConfConstants.XMLPathType.InnerTextWithPic,
        ConfConstants.XMLPathType.InnerDateTime,
    )
    if path_type not in inner_text_types:
        # Not an inner-text rule: return the extracted nodes as they are.
        return " ".join(item_dom.extract())

    if path_type == ConfConstants.XMLPathType.InnerTextWithPic:
        # Inner text with pictures: also record the image sources.
        item_dict['image_urls'] = item_dom.select('.//img/@src').extract()

    # Concatenate all text nodes, then strip each line and drop empty ones.
    raw_text = "".join(item_dom.select('.//text()').extract())
    item_value = "\n".join(
        [line.strip() for line in raw_text.split("\n") if line.strip()]
    )

    if path_type == ConfConstants.XMLPathType.InnerDateTime:
        try:
            item_value = Utils.format_datetime(
                item_value.decode(response.encoding))
        except Exception:
            # Best-effort fallback: if decoding (or formatting the decoded
            # text) fails, format the undecoded text instead. Catching
            # Exception rather than using a bare ``except`` lets
            # KeyboardInterrupt / SystemExit propagate.
            item_value = Utils.format_datetime(item_value)
    return item_value
def load_site_conf(site):
    '''
    Load a site-rule XML configuration file and publish it on Conf / Utils.

    Side effects, in order:
      * ``Utils.settings`` is set from ``get_project_settings()``.
      * The parsed configuration is stored on ``Conf.conf_dict`` and
        ``Utils.conf_dict``; the list of site rules on ``Conf.sites_rule``;
        the optional ``UserAgent`` on ``Conf.ua``.
      * ``Utils.get_mongodb_client()`` is called and the task-status table
        is queried by the configured ``Uuid`` to resolve ``Conf.task_id``.
      * ``Conf.uuid``, ``Conf.pid`` and ``Conf.output_files`` are filled in.
      * ``Utils.daemon`` is set to a new ``ServiceDaemon``.

    Args:
        site: path of the XML configuration file to read.

    Raises:
        IOError: if the file cannot be opened or read.
        KeyError: if a required entry (``SiteRule``, ``Sites``/``Site``,
            ``TaskId``, ``XMLFileName``, ``JsonFileName``, ``TableName``,
            ``Hadoop``) is missing from the configuration.
    '''
    Utils.settings = get_project_settings()

    # Use a context manager so the file handle is closed deterministically
    # (the previous ``file(...).read()`` left it to the garbage collector).
    with open(site) as conf_file:
        conf_dict = xmltodict.parse(conf_file.read())
    site_rule = conf_dict['SiteRule']

    # A single <Site> element is not parsed as a list; normalise so that
    # consumers can always iterate.
    sites_rule = site_rule['Sites']['Site']
    if not isinstance(sites_rule, list):
        sites_rule = [sites_rule]

    Conf.conf_dict = conf_dict
    Utils.conf_dict = conf_dict
    Conf.sites_rule = sites_rule
    Conf.ua = site_rule.get('UserAgent', '')

    Utils.get_mongodb_client()
    db = Utils.get_db()
    # Look up the task row by uuid; the value is passed as a bound variable
    # rather than interpolated into the WHERE clause.
    res = list(db.select(
        Utils.settings['MYSQL_TASKSTATUS_TABLE'],
        what="id",
        where="uuid=$uuid",
        vars={"uuid": site_rule.get('Uuid', "")},
    ))

    Conf.uuid = int(site_rule.get('Uuid', 0))
    # Prefer the id stored in the task-status table; fall back to the
    # TaskId from the configuration when no row matches the uuid.
    Conf.task_id = int(res[0]['id']) if res else int(site_rule['TaskId'])
    Conf.pid = int(site_rule['TaskId'])
    Conf.output_files = {
        ConfConstants.Output.XML: site_rule['XMLFileName'],
        ConfConstants.Output.JSON: site_rule['JsonFileName'],
        ConfConstants.Output.MongoDB: site_rule['TableName'],
        ConfConstants.Output.Hadoop: site_rule['Hadoop'],
    }
    Utils.daemon = ServiceDaemon()