# NOTE(review): this physical line is a whitespace-mangled paste — an entire
# multi-line script has been collapsed onto one line.  Everything after the
# first '#' (the "Kafka" remark) is now dead comment text, so as written the
# only statement that actually executes is `content_extractor = pickle.load(f)`.
# The buried code, visible inside the comment, does the following (to be
# restored once the original line breaks are recovered):
#   * builds a KafkaConsumer on topic "test6" (group "group") and a
#     KafkaProducer, both against bootstrap server 172.16.129.43:9092;
#   * loops over consumer messages and json.loads() each message.value;
#   * runs content_extractor.analyze() on jsonValue["html"]
#     ("解析出正文" = extract the article body);
#   * for each parser in ["lxml", "html5lib", "html.parser"] calls
#     extractHtml(jsonValue["html"], content, useParser) and reads
#     .title() / .publishDate() / .mainContent()
#     ("解析标题 发布时间 正文段等" = parse title, publish date, body, etc.);
#   * falls back to `content` when parseContent[0] is empty, and to
#     jsonValue["crawletime"] when no publish date is found
#     ("解析不到发布时间 则将发布时间设置为爬虫时间" = if no publish date can
#     be parsed, use the crawl time instead);
#   * then starts building a dictData payload — the dict literal is cut off
#     mid-expression at `"title": parseTitle,`, so the tail of this block
#     (remaining keys and, presumably, a producer.send) is MISSING from this
#     file.  Do not attempt to revive this line without recovering the rest.
# NOTE(review): `print "start sk parser!"` is Python 2 syntax — this file
# will not parse under Python 3 once un-collapsed.
# SECURITY(review): pickle.load() executes arbitrary code embedded in the
# pickle stream — confirm the file behind `f` is fully trusted.
# NOTE(review): `parsePublishDate == None` should become `is None` when the
# buried code is restored.
content_extractor = pickle.load(f) # Kafka consumer = KafkaConsumer("test6", group_id="group", bootstrap_servers=['172.16.129.43:9092']) producer = KafkaProducer(bootstrap_servers=['172.16.129.43:9092']) print "start sk parser!" for message in consumer: if message is not None: try: jsonValue = json.loads(message.value) # 解析出正文 content = content_extractor.analyze(jsonValue["html"]) for useParser in ["lxml", "html5lib", "html.parser"]: # 解析标题 发布时间 正文段等 parseHtml = extractHtml(jsonValue["html"], content, useParser) parseTitle = parseHtml.title() parsePublishDate = parseHtml.publishDate() parseContent = parseHtml.mainContent() if len(parseContent[0]) == 0: parseContent[0] = content if len(parsePublishDate) == 0 or parsePublishDate == " " or parsePublishDate == None: # 解析不到发布时间 则将发布时间设置为爬虫时间 parsePublishDate = jsonValue["crawletime"] if len(parseContent[0]) > 0: dictData = { "type": "none", "url": jsonValue["url"], "keywords": " ", "description": " ", "title": parseTitle,
# NOTE(review): like the block above, this physical line is a collapsed
# multi-line script.  Because it begins with `# 测试` ("# test"), the WHOLE
# line is one comment and none of it executes.  The buried test script:
#   * prints "start newspaper parser!" and iterates the Kafka consumer,
#     printing "xxx" per non-None message;
#   * json.loads() each message.value, takes jsonValue["html"], and runs
#     fulltext(html, language="zh") to obtain tag-free body text;
#   * feeds that text back into extractHtml(html, contentWithOutTag, "lxml")
#     ("将无标签正文带回html解析" = carry the tag-free body back into the
#     HTML parse) and reads .title() / .publishDate() / .mainContent();
#   * substitutes contentWithOutTag when parseContent[0] is empty, and
#     defaults the publish date to the current local time via
#     time.asctime(time.localtime(time.time())) when none is parsed
#     ("解析不到发布时间 则将发布时间设置为爬虫时间" = if no publish date can
#     be parsed, use the crawl time instead);
#   * prints url / 标题 (title) / 正文 (body) / 发布时间 (publish date)
#     between separator rules.
# NOTE(review): the buried `try:` has NO visible `except`/`finally` clause —
# the error-handling tail appears truncated in this file; it must be
# recovered before this script can be un-collapsed and run.
# NOTE(review): Python 2 `print` statements throughout; `parsePublishDate ==
# None` should become `is None` on restoration.
# 测试 print "start newspaper parser!" for message in consumer: if message is not None: print "xxx" try: jsonValue = json.loads(message.value) html = jsonValue["html"] contentWithOutTag = fulltext(html, language="zh") for useParser in ["lxml"]: # 将无标签正文带回html解析 parseHtml = extractHtml(html, contentWithOutTag, useParser) parseTitle = parseHtml.title() parsePublishDate = parseHtml.publishDate() parseContent = parseHtml.mainContent() if len(parseContent[0]) == 0: parseContent[0] = contentWithOutTag if len(parsePublishDate) == 0 or parsePublishDate == " " or parsePublishDate == None: # 解析不到发布时间 则将发布时间设置为爬虫时间 parsePublishDate = time.asctime(time.localtime(time.time())) print "\n-----------------------------------------------------------------------------\n" print "url:\t", jsonValue["url"] print "标题:\t", parseTitle print "正文:\t", parseContent[0] print "发布时间:\t", parsePublishDate print "\n-----------------------------------------------------------------------------\n"