def __init__(self):
    """Initialise every value of the item to an empty string."""
    Item.__init__(self)
    # Both fields start out as empty strings.
    for field_name in ('coal', 'non_coal'):
        self[field_name] = ''
def __init__(self):
    """Create the item and fill every field with its empty default value."""
    Item.__init__(self)
    # Built on every call so the list defaults are never shared
    # between instances.
    defaults = (
        ('crawlerid', ''),    # globally unique ID of the crawled record
        ('url', ''),          # page link
        ('html_code', ''),    # HTML source
        ('encoding', ''),     # page encoding
        ('title', ''),        # title
        ('authors', []),      # authors
        ('content', ''),      # body text
        ('time', ''),         # news time
        ('source', ''),       # source
        ('editor', ''),       # editor
        ('ctype', ''),        # channel category
        # NOTE(review): the original comment repeats "channel category"
        # here; presumably a sub-category -- confirm.
        ('subtype', ''),
        ('keywords', []),     # keywords
        ('abstract', ''),     # abstract
        ('copyright', ''),
        ('originality', ''),
        ('type', 'text'),
    )
    for field_name, default_value in defaults:
        self[field_name] = default_value
def __init__(self, *args, **kwargs):
    """Create the item and populate its fields from keyword arguments.

    All positional and keyword arguments are forwarded to
    ``Item.__init__``. Afterwards every field listed below is set from
    the keyword argument of the same name; a keyword that was not
    supplied leaves its field as ``None``. ``crawl_time`` is always set
    to the current Unix time in whole seconds, regardless of ``kwargs``.

    Fixes two copy-paste errors: ``platform`` used to be read from the
    ``keyword`` argument and ``source_url`` from the ``title`` argument.
    Each field now reads its own keyword.
    """
    Item.__init__(self, *args, **kwargs)
    self['platform'] = kwargs.get("platform")
    self['keyword'] = kwargs.get("keyword")
    # Timestamp of item creation, truncated to whole seconds.
    self['crawl_time'] = int(time.time())
    # Remaining fields map one-to-one onto keyword arguments.
    for field_name in (
        "url",
        "real_url",
        "title",
        "source_url",
        "spider",
        "skip_url",
        "snapshot_url",
        "show_url",
        "is_ad",
        "content",
    ):
        self[field_name] = kwargs.get(field_name)
def __init__(self, year):
    """Record ``year`` in the module-level ``this_year`` and create the item.

    ``this_year`` is a module global, not instance state: every
    construction overwrites the value seen by all code in this module.
    """
    # The global is assigned before the base initialiser runs.
    global this_year
    this_year = year

    Item.__init__(self)