def crawler(company_id, link):
    """Crawl one news link and store the parsed article in MongoDB.

    Retries the crawl until it succeeds or more than 10 attempts fail.
    Returns the inserted Mongo ``_id`` on success, ``None`` on failure.
    """
    retry_time = 0
    while True:
        result = news_crawler.crawl(link, agent=False)
        if result['get'] == 'success':
            #logger.info(result["content"])
            html = util.html_encode(result["content"])
            #logger.info(html)
            # Pull article body, title and publish date out of the raw HTML.
            contents = extract.extractContents(link, html)
            title = extract.extractTitle(html)
            date = extractArticlePublishedDate.extractArticlePublishedDate(
                link, html)
            # 13001 is this crawler's fixed source id — TODO confirm against
            # the project's source-code table.
            dnews = {
                "companyId": company_id,
                "date": date,
                "title": title,
                "link": link,
                "createTime": datetime.datetime.now(),
                "source": 13001
            }
            # Flatten extracted chunks into ranked text/image records.
            dcontents = []
            rank = 1
            for c in contents:
                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    # Non-text chunks carry an image URL in c["data"].
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents
            logger.info(dnews)
            mongo = db.connect_mongo()
            _id = mongo.article.news.insert_one(dnews).inserted_id
            mongo.close()
            return _id
        # Crawl failed: count the attempt and give up after 10 retries.
        retry_time += 1
        if retry_time > 10:
            break
    return None
def parse_news(news_key):
    """Parse one crawled direct-news record and publish it downstream.

    Looks up the raw record by (source, news_key), extracts the title,
    publish time and contents, replaces any previously parsed copy in
    ``news_collection``, then emits a Kafka message so the next pipeline
    stage picks the record up.  Returns None.
    """
    news = fromdb.direct_news.find_one({
        "source": source,
        "news_key": news_key
    })
    if news == None:
        return
    content = news["content"]
    try:
        article = pq(content)('.article-detail')
        d = pq(article)
        title = d('div.title > h1').text()
        news_time = d('.subtitle > .time').text()
        #content = d('.article-content').text()
        # Fix: use the module logger instead of Python-2-only bare `print`
        # statements, consistent with the rest of this file.
        logger.info(title)
        logger.info(news_time)
        #print content
        contents = extract.extractContents(news["url"], content)
        news_content = {
            "date": datetime.datetime.now(),
            "news_key": news_key,
            "source": source,
            "url": news['url'],
            "title": title,
            "news_time": news_time,
            "contents": contents,
            "company_id": news['company_id'],
            "search_name": news['search_name']
        }
        # Replace any earlier parse of the same record (delete-then-insert).
        if news_collection.find_one({
                "source": source,
                "news_key": news_key
        }) != None:
            news_collection.delete_one({
                "source": source,
                "news_key": news_key
            })
        news_collection.insert_one(news_content)
        msg = {"type": "direct_news_parser", "news_key": news_key}
        logger.info(msg)
        kafka_producer.send_messages("news_parser", json.dumps(msg))
    except Exception:
        # Fix: narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt are no longer swallowed; parse failures are
        # still logged and the record skipped, as before.
        traceback.print_exc()
def process(key, content, url):
    """Extract a job-posting page into ranked content records and save them.

    Parses the job-detail section out of *content*, truncates the listing at
    the recruiter-info block, strips the "view map" link, reads the company
    id from a hidden input, and hands everything to ``save_job_mongo``.
    """
    #logger.info(content)
    dcontents = []
    companyId = None
    if has_content(content):
        d = pq((html.fromstring(content.decode("utf-8"))))
        article = d('dl#job_detail.job_detail').html()
        # # logger.info(article)
        contents = extract.extractContents(url, article, document=False)
        rank = 1
        # `over` marks that the chunk containing "查看地图" was the last
        # useful one; the chunk itself is kept (minus the link text) and the
        # loop stops on the following iteration.
        over = False
        for c in contents:
            # Stop at the recruiter-contact section, or one chunk after the
            # map link was seen.
            if c["data"].find("职位发布者") >= 0 or over is True:
                break
            if c["data"].find("查看地图") >= 0:
                c["data"] = c["data"].replace("查看地图", "")
                over = True
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # Non-text chunks carry an image URL in c["data"].
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": "",
                    "image_src": c["data"],
                }
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        companyId = d('input#companyid').attr("value")
        # logger.info("companyId: %s",companyId)
    # Saved unconditionally: pages without content store an empty list and
    # a None company id.
    save_job_mongo(key, dcontents, companyId)
def insert_source_news(source_news):
    """Persist one crawled source-news record if it is not already stored.

    Skips records whose HTML is the crawler's 404 marker page and records
    that duplicate an existing (source, company_key, news_key) document.
    Any error is logged and swallowed.  Always returns None.
    """
    if source_news is None:
        return None
    try:
        record = source_news
        # Read every field up front; a missing key is reported via the
        # exception handler below, exactly like the rest of this file.
        news_source = record["source"]
        company_key = record["company_key"]
        news_key = record["news_key"]
        title = record["title"]
        url = record["url"]
        domain = record["domain"]
        date = record["date"]
        page_html = record["content"]

        # Reject crawler 404 marker pages outright.
        if page_html.find("404未找到页面") != -1:
            logger.info("404未找到页面")
            return None

        dedup_query = {
            "source": news_source,
            "company_key": company_key,
            "news_key": news_key
        }
        if fromdb.source_news.find_one(dedup_query) == None:
            fromdb.source_news.insert_one({
                "source": news_source,
                "news_key": news_key,
                "company_key": company_key,
                "url": url,
                "title": title,
                "source_domain": domain,
                "date": date,
                "contents": extract.extractContents(url, page_html)
            })
    except Exception as ex:
        logger.exception(ex)
def parse_news(news):
    """Extract contents from a crawled article and mark the document parsed.

    Reads the raw HTML from *news*, extracts the content chunks, and updates
    the same Mongo document in place ($set parsed / parsed_contents).
    Failures are logged and the document is left unparsed.
    """
    if news is None:
        return
    try:
        html = news["content"]
        # `summary` is not used below; the lookup is kept because a missing
        # key deliberately routes the record to the error path.
        summary = news["summary"]
        title = news["title"]
        url = news["share_url"]
        logger.info(title)
        logger.info(url)
        contents = extract.extractContents(url, html)
        #for c in contents:
        #    logger.info(c["data"])
        # Fix: pymongo's deprecated update() replaced with update_one();
        # the original call used the default multi=False, so the semantics
        # are identical.
        collection.update_one(
            {"_id": news["_id"]},
            {"$set": {
                "parsed": True,
                "parsed_contents": contents
            }})
    except Exception:
        # Fix: narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt propagate; parse errors are still logged.
        traceback.print_exc()
def parseNews(item):
    """Store one crawled news item into ``fromdb.source_news`` if new.

    Near-duplicate of ``insert_source_news`` above, but reads its fields
    from a differently-shaped dict (``news_title`` / ``news_source_domain``
    / ``news_date``).  Skips 404 marker pages and records that already
    exist for the same (source, company_key, news_key).  Returns None.
    """
    if item is None:
        return None
    try:
        company_key = item["company_key"]
        news_key = item["news_key"]
        url = item["url"]
        news_source_domain = item["news_source_domain"]
        news_date = item["news_date"]
        html = item["content"]
        # Reject crawler 404 marker pages outright.
        if html.find("404未找到页面") != -1:
            logger.info("404未找到页面")
            return None
        # Insert only when no record exists for this (source, company, news)
        # triple — existing records are never overwritten.
        if fromdb.source_news.find_one({"source": item["source"], "company_key": company_key, "news_key": news_key}) == None:
            contents = extract.extractContents(item["url"], html)
            data = {"source": item["source"],
                    "news_key": news_key,
                    "company_key": company_key,
                    "url": url,
                    "title": item["news_title"],
                    "source_domain": news_source_domain,
                    "date": news_date,
                    "contents": contents}
            fromdb.source_news.insert_one(data)
    except Exception, ex:
        # Broad catch: any malformed record is logged and skipped.
        logger.exception(ex)
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a fromgeek.com-style article page and save it as a news doc.

    Decodes gb2312 HTML, dedupes by title, downloads the poster and inline
    images, classifies the article as funding news (type 60001) or other
    (60010), and persists via ``parser_mongo_util.save_mongo_news``.
    """
    # if has_news_content(content):
    if 1:
        # logger.info('here')
        # Note: the passed-in download_crawler is unconditionally replaced.
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode('gb2312', 'ignore')))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".htm", "")
        title = d('h1.title').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedupe on title: an identical title anywhere means skip.
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        # Download/register the poster image; falls back to None.
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        brief = None
        news_time = d('.timer').text()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M:%S')
        article = d('.content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        # Stored times are shifted -8h (local CST -> UTC, presumably —
        # TODO confirm against the consumers of `date`).
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            # "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Skip site-promotion chunks.
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    # No downloader: keep the remote image URL.
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Download the image and store its internal id; images
                    # that fail to download are dropped entirely.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # Derive brief/poster from the contents when not set above.
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Future-dated articles are clamped to "now" (shifted -8h).
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # Classify funding vs non-funding news; 13866 is the classifier's
        # model/source id — TODO confirm.
        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            TYPE = 60001
        else:
            TYPE = 60010
            logger.info('%s is not fundingNews', title)
        dnews['type'] = TYPE
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
    return
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse a QQ-style (gbk) article page and save it as a news document.

    Dedupes by title, collects keyword-meta tags, extracts the publish time
    and body, downloads inline images, and persists the result via
    ``parser_mongo_util.save_mongo_news``.
    """
    if has_news_content(content):
        logger.info('here')
        # Note: the passed-in download_crawler is unconditionally replaced.
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("gbk")))
        key = newsurl.split("/")[-1].replace(".htm", "")
        type = TYPE
        category = None
        categoryNames = []
        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedupe on title: an identical title anywhere means skip.
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        # post = d('div#post_thumbnail> img').attr("src")
        # postraw = newspost
        # # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None
        post = None
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)
        # logger.info(contents)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return
        flag, domain = url_helper.get_domain(newsurl)
        # Stored times are shifted -8h (local CST -> UTC, presumably —
        # TODO confirm against the consumers of `date`).
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Skip site-promotion chunks.
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    # No downloader: keep the remote image URL.
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Download the image; failed downloads are dropped.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # Derive brief/poster from the contents when not set above.
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Future-dated articles are clamped to "now" (shifted -8h).
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(item, url, content):
    """Parse a gbk article page described by *item* and save it as news.

    Extracts title/time/tags/brief from fixed selectors, downloads the
    poster and inline images, dedupes by link, and persists via
    ``parser_mongo_util.save_mongo_news``.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("gbk")))
        title = d(
            'div.g-main> div> div.m-cont-hd> div.title> h1').text().strip()
        datecontent = d(
            'div.g-main> div> div.m-cont-hd> div.m-info> div> div> div.box> div.origin'
        ).text().strip()
        # The origin line ends with a "YYYY/MM/DD HH:MM:SS" timestamp.
        result = util.re_get_result('(\d{4}\/.*?)$', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y/%m/%d %H:%M:%S")
        else:
            # NOTE(review): news_time stays None here, so the dnews date
            # arithmetic below would raise; presumably the timestamp is
            # always present on these pages — confirm.
            post_time = None
            news_time = None
        key = item["key"]
        column = d('div.g-main> div> div.m-cont-hd> div.tag').text().strip()
        brief = d('div.g-article> div> div.review').text().strip()
        postraw = item["post"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # Download/register the poster image; falls back to None.
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if column is not None:
            tags = column.split()
        else:
            tags = []
        logger.info("%s, %s, %s, %s, %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags))
        article = d('div.g-article> div.m-article').html()
        #logger.info(article)
        contents = extract.extractContents(url, article)
        # Dedupe on link: already-stored URLs are skipped.
        if collection_news.find_one({"link": url}) is not None:
            return
        # collection_news.delete_one({"link": url})
        # # for t in contents:
        #     logger.info(t["data"])
        #     logger.info("")
        flag, domain = url_helper.get_domain(url)
        # Stored times are shifted -8h (local CST -> UTC, presumably —
        # TODO confirm against the consumers of `date`).
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    # No downloader: keep the remote image URL.
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Download the image; failed downloads are dropped.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # Derive brief/poster from the contents when not set above.
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_posterId_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Future-dated articles are clamped to "now" (shifted -8h).
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a vcbeat-style (utf-8) article page and save it as news.

    Dedupes by title early and by link before insert; existing links are
    only logged (the update path is commented out).  Persists via
    ``parser_mongo_util.save_mongo_news``.
    """
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    if 1:
        # Note: the passed-in download_crawler is unconditionally replaced.
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1]
        # type = TYPE
        category = None
        title = d('.article_title p').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedupe on title: an identical title anywhere means skip.
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return
        tags = []
        articletags = d(".labs a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        postraw = newspost
        # post = d('div#post_thumbnail> img').attr("src")
        # if post is not None:
        #     post = "http://vcbeat.com"+ post
        brief = None
        # brief = brief.replace(u'摘要', '').replace(u'摘要:', '').replace(u'摘要:', '').strip()
        # news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        news_time = d('.time').text().strip()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        # dt = datetime.date.today()
        # today = datetime.datetime.now()
        # if news_time is None or news_time > today:
        #     news_time = datetime.datetime.now()
        article = d('.art_text').html()
        contents = extract.extractContents(newsurl, article, document=False)
        # if len(contents)==0:
        #     contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     logger.info( 'title:%s already exists'%title)
        #     return
        flag, domain = url_helper.get_domain(newsurl)
        # Stored times are shifted -8h (local CST -> UTC, presumably —
        # TODO confirm against the consumers of `date`).
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    # No downloader: keep the remote image URL.
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Download the image; failed downloads are dropped.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # Derive brief/poster from the contents when not set above.
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # Download/register the poster image; falls back to None.
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        # Future-dated articles are clamped to "now" (shifted -8h).
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            # Existing link: only logged; the replace-in-place path below
            # is intentionally disabled.
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl},{'$set': dnews})
            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id']=oldId
            # collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a "zb-n" layout article page and save it as a news document.

    Content chunks are windowed: collection starts after the chunk that
    contains both the brief and the title, and stops at a "-END-" marker.
    Dedupes by title; persists via ``parser_mongo_util.save_mongo_news``.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip().replace(".html", "").replace(
            'detail_', '')
        type = TYPE
        category = None
        title = d('div.left.zb-n> h1').text().strip()
        tags = []
        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # Download/register the poster image; falls back to None.
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        # brief = d("meta[name='description']").attr("content").replace(u'一鸣网——让发生的发声|智慧共享新媒体平台|上海TMT媒体开创者、一鸣网ymtmt.com','')
        brief = d('div.left.zb-n> p.gy').text().strip()
        # No reliable publish time on the page; use crawl time.
        news_time = datetime.datetime.now()
        article = d('div.left.zb-n').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedupe on title: an identical title anywhere means skip.
        if collection_news.find_one({"title": title}) is not None:
            logger.info('already exists %s', title)
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        # Stored times are shifted -8h (local CST -> UTC, presumably —
        # TODO confirm against the consumers of `date`).
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        # `start` gates collection: chunks before the brief+title chunk are
        # page chrome and are skipped.
        start = False
        for c in contents:
            if start is False and c["data"].find(
                    brief) >= 0 and c["data"].find(title) >= 0:
                start = True
                continue
            if start is False:
                continue
            # Article footer marker ends the body.
            if c["data"].find("-END-") >= 0:
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    # No downloader: keep the remote image URL.
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Download the image; failed downloads are dropped.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # Derive brief/poster from the contents when not set above.
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Future-dated articles are clamped to "now" (shifted -8h).
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
        # logger.info("*************DONE*************")
    else:
        logger.info('has no news content %s', newsurl)
    return
def process_news(content, news_key, url):
    """Parse an article page, map its page-title section to a category,
    and save it as a news document.

    The second-to-last "-"-separated token of the <title> tag selects the
    category (初页 -> 60102/产品, IPO/并购 -> 60105/大公司, else None).
    Dedupes by (source, key_int) and by title across other sources.
    """
    if has_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content))
        brief = d("meta[name='description']").attr("content").split(",")[-1]
        title = d('div#article> div.single-item> div.article-hd> h1').text().strip()
        pagetitle = d('head> title').text().strip()
        # Section name embedded in the page title, e.g. "... - 初页 - ...".
        temp = pagetitle.split("-")[-2]
        categoryNames = []
        if temp.strip() == "初页":
            category = 60102
            categoryNames.append("产品")
        elif temp.strip() == 'IPO/并购':
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        # data-time holds a unix timestamp (seconds).
        post_time = d('div.author-time> span.date-time').attr("data-time")
        post_date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_date.tm_year, post_date.tm_mon,
                                      post_date.tm_mday, post_date.tm_hour,
                                      post_date.tm_min, post_date.tm_sec)
        key = news_key
        column = d('div.article-tags> a').text()
        tags = column.split()
        logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, post_time,
                    news_time, temp, category, ":".join(tags))
        article = d('div#article> div> div.article-content').html()
        # # logger.info(article)
        contents = extract.extractContents(url, article)
        # Dedupe on this source's numeric key, then on title across other
        # sources; the delete-and-reinsert variants are disabled.
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            return
        # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
        # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        # for t in contents:
        #     logger.info(t["data"])
        #     logger.info("")
        flag, domain = url_helper.get_domain(url)
        # Stored times are shifted -8h (local CST -> UTC, presumably —
        # TODO confirm against the consumers of `date`).
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    # No downloader: keep the remote image URL.
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Download the image; failed downloads are dropped.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # Brief must also pass the quality check; otherwise derive it
        # from the contents.
        if brief is None or brief.strip() == "" or desc_helper.check_desc(brief, 2) is False:
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Future-dated articles are clamped to "now" (shifted -8h).
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newsposttime, download_crawler):
    """Parse an "article-wrap" layout page and save it as a news document.

    Category 60101/融资 is assigned when the 投资并购 keyword tag is
    present.  The publish time comes from *newsposttime* when parseable,
    otherwise from the on-page date.  Sector fields are hard-coded.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip()
        type = TYPE
        title = d('div.article-wrap> div.article-head> h1').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedupe on title: an identical title anywhere means skip.
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        category = None
        categoryNames = []
        if "投资并购" in tags:
            category = 60101
            categoryNames.append("融资")
        # post = d('div#post_thumbnail> img').attr("src")
        post = None
        brief = d("meta[name='description']").attr("content")
        news_time = None
        if newsposttime is not None:
            news_time = extract.extracttime(newsposttime)
        if news_time is None:
            # Fall back to the on-page date; today's date (or none) means
            # "just published", so use the current time.
            dt = datetime.date.today()
            post_time = d(
                'div.article-wrap> div.article-head> p> span.article-time'
            ).text()
            if post_time is None or post_time.strip() == str(dt):
                news_time = datetime.datetime.now()
                # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
            else:
                news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
        article = d('div.article-wrap> div.article-content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return
        flag, domain = url_helper.get_domain(newsurl)
        # Stored times are shifted -8h (local CST -> UTC, presumably —
        # TODO confirm against the consumers of `date`).
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # Images are always downloaded here; failures are dropped.
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # Derive brief/poster from the contents when not set above.
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief
        # Design for sector:
        dnews["sectors"] = [10]
        dnews["sector_confidence"] = [1]
        # Future-dated articles are clamped to "now" (shifted -8h).
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a tmtpost-style article page and save it as a news document.

    English-tagged articles are skipped; the 商业价值杂志 tag switches the
    type/category to 60003/60107.  Dedupes by title; persists via
    ``parser_mongo_util.save_mongo_news`` when the body is non-empty.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        category = None
        title = d('article> h1').text().strip()
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        # English-language articles are out of scope.
        if "English" in tags or "english" in tags:
            logger.info("Englis not needed, get out!")
            return
        if "商业价值杂志" in tags:
            type = 60003
            category = 60107
        # post = d('div#post_thumbnail> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None
        postraw = d("meta[property='og:image']").attr("content")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # Download/register the poster image; falls back to None.
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d("article> p.post-abstract").text().strip().replace(
            '摘要: ', "")
        post_time = d('article> div.post-info> span.time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('article> div.inner').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # Dedupe on title: an identical title anywhere means skip.
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        # Stored times are shifted -8h (local CST -> UTC, presumably —
        # TODO confirm against the consumers of `date`).
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Skip the site's QR-code promo image.
            if c["data"] == "http://www.tmtpost.com/public/css/images/wzny_ewm.jpg":
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    # No downloader: keep the remote image URL.
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # Download the image; failed downloads are dropped.
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # Derive brief/poster from the contents when not set above.
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # Future-dated articles are clamped to "now" (shifted -8h).
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        if title is not None and len(contents) > 0:
            # collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
        # logger.info("*************DONE*************")
    return
def process(crawler):
    # Drain the module-level URLS work queue: crawl each link (up to 6
    # attempts), parse it, and insert the article into mongo article.news
    # unless a document with the same link already exists.
    while True:
        if len(URLS) == 0:
            return
        linkDict = URLS.pop(0)
        retry = 0
        while True:
            retry += 1
            if retry > 6:
                # give up on this URL after 6 failed crawl attempts
                break
            download_crawler = download.DownloadCrawler(use_proxy=False)
            url = linkDict['href']
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(html.fromstring(result['content'].decode("utf-8")))
                title = linkDict['title']
                key = url.split('/')[-1]
                # map breadcrumb category text to type/category ids; default 60001
                category = d('.al-crumbs a:nth-child(2)').text()
                if categoryDict.has_key(category):
                    TYPE = categoryDict[category]['type']
                    category = categoryDict[category]['category']
                else:
                    TYPE = 60001
                    category = None
                brief = linkDict['brief']
                postraw = linkDict['post']
                tags = []
                # for tag in d('.tags').text().split():
                #     if tag.strip() not in tags: tags.append(tag)
                news_time = d('.article__published').eq(0).text()
                # news_time = datetime.datetime.strptime(' '.join(news_time.split(' ')[:2]), '%Y年%m月%d日 %H:%M')
                # news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %p %I:%M')
                # raises ValueError if the page's time format changes — TODO confirm upstream handling
                news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %H:%M')
                flag, domain = url_helper.get_domain(url)
                dnews = {
                    # -8h shift: presumably local (UTC+8) -> UTC — TODO confirm
                    "date": news_time - datetime.timedelta(hours=8),
                    "title": title,
                    "link": url,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": None,
                    "type": TYPE,
                    "original_tags": tags,
                    "processStatus": 0,
                    # "companyId": None,
                    "companyIds": [],
                    "category": category,
                    "domain": domain,
                    "categoryNames": []
                }
                article = d('.article__content').html()
                contents = extract.extractContents(url, article)
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        if download_crawler is None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": "",
                                "image_src": c["data"],
                            }
                        else:
                            # persist the image and keep its id + dimensions
                            (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                                c["data"], download_crawler, SOURCE, key, "news")
                            if imgurl is not None:
                                dc = {
                                    "rank": rank,
                                    "content": "",
                                    "image": str(imgurl),
                                    "image_src": "",
                                    "height": int(height),
                                    "width": int(width)
                                }
                            else:
                                continue
                    dcontents.append(dc)
                    rank += 1
                dnews["contents"] = dcontents
                if brief is None or brief.strip() == "":
                    brief = util.get_brief_from_news(dcontents)
                # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                    postraw, download_crawler, SOURCE, key, "news")
                if posturl is not None:
                    post = str(posturl)
                else:
                    post = None
                if post is None or post.strip() == "":
                    post = util.get_posterId_from_news(dcontents)
                if download_crawler is None:
                    dnews["post"] = post
                else:
                    dnews["postId"] = post
                # brief=brief[:100]
                dnews["brief"] = brief
                mongo = db.connect_mongo()
                collection_news = mongo.article.news
                # update link content with oldId
                item = collection_news.find_one({"link": url})
                # only store articles with more than one content element
                if len(dcontents) > 1:
                    if item is None:
                        # collection_news.insert(dnews)
                        nid = parser_mongo_util.save_mongo_news(dnews)
                        logger.info("Done: %s", nid)
                    else:
                        # existing link: update path is disabled (commented out below)
                        logger.info("update %s", url)
                        # oldId = collection_news.find_one({"link": url})['_id']
                        # collection_news.delete_one({"link": url})
                        # dnews['_id'] = oldId
                        # collection_news.insert(dnews)
                mongo.close()
                logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, news_time,
                            category, " ".join(tags), brief, post)
                logger.info("*************DONE*************")
                break
def process_news(column, j_content, content, download_crawler):
    # Parse an article whose metadata comes from a JSON object (j_content:
    # id/title/image/description/uri/date) and whose body comes from the
    # crawled HTML (content); save via parser_mongo_util.
    if has_news_content(content):
        # NOTE(review): parameter download_crawler is immediately shadowed here
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = j_content['id']
        type = TYPE
        title = j_content['title']
        newspost = j_content.get('image')
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        brief = j_content['description']
        newsurl = j_content['uri']
        try:
            # 'date' looks like a millisecond epoch timestamp: the last 3
            # digits are stripped before time.localtime — TODO confirm
            date = j_content['date']
            post_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(str(date)[:-3])))
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") - datetime.timedelta(days=1)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        article = d('div.article-container').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                # strip social-share boilerplate from text paragraphs
                if c["data"].find("Share to facebookShare to twitterShare to linkedin") >= 0:
                    c['data'] = c['data'].replace('Share to facebookShare to twitterShare to linkedin', '')
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        pass
def process_news(column, newsurl, content, newspost):
    # Parse a ChinaVenture CMS JSON payload ("content" is the decoded JSON,
    # not HTML) and store the article; the canonical URL is rebuilt from the
    # news id.
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]
        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key
        type = TYPE
        category = None
        categoryNames = []
        # channel 52 is the funding channel; tag "人物" marks people/profile pieces
        if content["news"].has_key("newsChannelId"):
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")
        if content["news"].has_key("tagName"):
            if content["news"]["tagName"] == '人物':
                category = 60103
        tags = []
        if content.has_key("keywordList") is True and len(content["keywordList"]) > 0:
            for tag in content["keywordList"]:
                if tag.has_key("keyword") and tag["keyword"] is not None and tag["keyword"].strip() != "" and tag["keyword"] not in tags:
                    tags.append(tag["keyword"])
        # NOTE(review): the replace() argument was garbled in the mangled
        # source; reconstructed as decoding the &quot; HTML entity — confirm
        # against the original file.
        title = content["news"]["title"].replace("&quot;", "\"")
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info("***************************News existed!!!***********************")
            mongo.close()
            return
        # post = d('div#post_thumbnail> img').attr("src")
        postraw = "http://pic.chinaventure.com.cn/" + content["news"]["coverImg"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = content["news"]["introduction"]
        post_time = content["news"]["updateAt"]
        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()
        # the article body is embedded HTML inside the JSON payload
        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        # for c in contents:
        #     logger.info(c["data"])
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        # return
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     logger.info("***************************News existed!!!***********************")
        #     mongo.close()
        #     return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # drop hotlinked sohu CDN images
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        # NOTE(review): duplicated assignment kept from the original; harmless but redundant
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        # dnews["post"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        # logger.info("*************DONE*************")
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    # Parse a weiyangx-style article ('wyt-post-content' layout), filtering
    # site boilerplate paragraphs, and save it via parser_mongo_util.
    if has_news_content(content):
        logger.info('here')
        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        if d.text().find('embed') >= 0:  # skip video articles
            logger.info('not article:%s' % newsurl)
            return
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return
        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            # best-effort poster download; fall through with no poster
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        try:
            # the publish time may live in either of two sibling <p> nodes
            post_time_1 = d("div.wyt-post-content-meta> div> p ").find('span').text().strip()
            post_time_2 = d("div.wyt-post-content-meta> div").find('p').next().text().strip()
            if post_time_1:
                post_time = post_time_1
            else:
                post_time = post_time_2
            if re.match('\d{2}-\d{2}', post_time):  # "03-19"-style date: prepend current year
                post_time = str(time.localtime()[0]) + '-' + post_time
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('article.wyt-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1
        # drop a leading image (presumably the header banner) — TODO confirm;
        # NOTE(review): raises IndexError if contents is empty
        if contents[0]['type'] == 'img':
            del contents[0]
        for c in contents:
            # logger.info("%s-%s",c["type"],c["data"])
            if c['type'] == 'text':
                # filter out site boilerplate paragraphs (page numbers,
                # share/QR-code prompts, copyright notices, etc.)
                if re.match('^\d+$', c['data']) or c['data'].find('收藏') >= 0 or c['data'].find('投融资') >= 0 or c['data'].find('阅读时间') >= 0 \
                        or c['data'].find('违者必究') >= 0 or c['data'].find('微信公众号') >= 0 or c['data'].find('微信扫描') >= 0 \
                        or c['data'].find('点击获取完整版报告') >= 0 or c['data'].find('作者原创,微信号') >= 0:
                    continue
                # if c['data'].find('译者') >= 0:
                #     c['data'] = c['data'].split(' ')[0]
                #
                # if c['data'].find('来源') >= 0:
                #     c['data'] = c['data'].split('|')[0]
                if c['data'].find('| 未央网') >= 0:
                    c['data'] = c['data'].replace('| 未央网', ' ')
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews['contents'] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        logger.info(json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
def process(content, citykey, crawler):
    # Parse an activity-listing page for one city: extract each upcoming
    # event (title, time range, location, sponsors), crawl its detail page,
    # and insert/refresh it in collection_news. Returns the count of
    # activities checked or recorded.
    cnt = 0
    if has_content(content):
        DT = datetime.date.today()
        TODAY = datetime.datetime(DT.year, DT.month, DT.day)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8")))
        lis = d('div.wrap> div> div> ul.ativities> li.item')
        for li in lis:
            c = pq(li)
            # strip the "|130w" thumbnail-resize suffix from the image URL
            img = c('a> img').attr("src").strip().replace("|130w", "")
            # NOTE(review): img cannot be None after .strip(), so this guard
            # always runs; also `key` and `download_crawler` are used here
            # before/without local assignment — presumably module globals or a
            # latent NameError on the first item. Verify before relying on it.
            if img is not None:
                # logger.info("poster: %s", poster)
                # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(img, download_crawler, SOURCE, key, "news")
                if posturl is not None:
                    poster = str(posturl)
                else:
                    poster = None
            title = c('h3.title> a').text()
            link = c('h3.title> a').attr("href")
            if link.find("http") == -1:
                continue
            key = link.split("/")[-1]
            key_int = int(key)
            location = c('div.intro> div.address').text()
            sponors = c('div.intro> div.sponors> span').text().replace(","," ").replace(","," ").split()
            # time spans: 3 spans = same-day "date hh:mm~hh:mm";
            # 5 spans = multi-day "date hh:mm ~ date hh:mm"
            spans = c('div.intro> div.time> span')
            if len(spans) == 3:
                date = c('div.intro> div.time> span').eq(0).text()
                times = c('div.intro> div.time> span').eq(2).text().split("~")
                beginTime = date + " " + times[0]
                endTime = date + " " + times[1]
            elif len(spans) == 5:
                date = c('div.intro> div.time> span').eq(0).text()
                year = date.split("-")[0]
                times = c('div.intro> div.time> span').eq(2).text().split("~")
                beginTime = date + " " + times[0]
                endTime = year + "-" + times[1] + " " + c('div.intro> div.time> span').eq(4).text()
            else:
                continue
            try:
                beginDate = datetime.datetime.strptime(beginTime, "%Y-%m-%d %H:%M")
                endDate = datetime.datetime.strptime(endTime, "%Y-%m-%d %H:%M")
            except:
                beginDate = None
            if beginDate is None or beginDate < TODAY or key_int is None:
                # Not save active activity
                continue
            result = crawler.crawl(link)
            # NOTE(review): retries forever until the crawl succeeds — no
            # upper bound on attempts
            while True:
                if result['get'] == 'success':
                    break
                else:
                    result = crawler.crawl(link)
            if has_content(result['content']):
                contents = extract.extractContents(link, result['content'])
                flag, domain = url_helper.get_domain(link)
                dact = {
                    "beginDate": beginDate - datetime.timedelta(hours=8),
                    "endDate": endDate - datetime.timedelta(hours=8),
                    "date": beginDate - datetime.timedelta(hours=8),
                    "title": title,
                    "link": link,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": key_int,
                    "type": TYPE,
                    "original_tags": [],
                    "processStatus": 0,
                    "companyIds": [],
                    "location": location,
                    "city": citymap[citykey],
                    "sponors": sponors,
                    "post": poster,
                    "domain": domain,
                    "categoryNames": []
                }
                dcontents = []
                rank = 1
                # NOTE(review): this loop variable shadows the outer `c` (pq node)
                for c in contents:
                    if c["type"] == "text":
                        # stop at the sign-up button: everything after is chrome
                        if c["data"].find("我要报名") >= 0:
                            logger.info("************************over")
                            break
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        # dc = {
                        #     "rank": rank,
                        #     "content": "",
                        #     "image": "",
                        #     "image_src": c["data"],
                        # }
                        (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                        if imgurl is not None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": str(imgurl),
                                "image_src": "",
                                "height": int(height),
                                "width": int(width)
                            }
                        else:
                            continue
                    dcontents.append(dc)
                    rank += 1
                dact["contents"] = dcontents
                # simhash fingerprint used for near-duplicate detection
                value = activity_simhash.get_simhash_value(dcontents)
                dact["simhashValue"] = value
                record = collection_news.find_one({"source": SOURCE, "key_int": key_int})
                if record is not None:
                    city = record["city"]
                    # unchanged activity -> count and skip
                    if record["beginDate"] == dact["beginDate"] and record["endDate"] == dact["endDate"] and record["title"] == dact["title"] and record["city"] == citymap[citykey] and record["location"] == dact["location"]:
                        logger.info("%s activity already existed", title)
                        cnt += 1
                        continue
                    else:
                        # changed activity -> replace, unless the city conflicts
                        collection_news.delete_one({"source": SOURCE, "key_int": key_int})
                        if city != citymap[citykey]:
                            logger.info("%s has two city : %s and %s with location %s, something is wrong", title, city, citymap[citykey], location)
                            cnt += 1
                            continue
                        collection_news.insert(dact)
                        logger.info("%s, %s, %s->%s, %s, %s, %s, %s", key, title, beginDate, endDate, ":".join(sponors), location, link, img)
                else:
                    # new key: insert unless a simhash-identical activity exists
                    if activity_simhash.check_same_act(dact) is True:
                        pass
                    else:
                        collection_news.insert(dact)
                        logger.info("%s, %s, %s->%s, %s, %s, %s, %s", key, title, beginDate, endDate, ":".join(sponors), location, link, img)
                    cnt += 1
    logger.info("************Done***************")
    logger.info("*******%s activities has been checked or recorded", cnt)
    return cnt
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    # Parse an article from a site that serves pages in either utf-8 or
    # gb2312 (detected from the raw bytes); selectors differ per encoding.
    # Saves via parser_mongo_util.
    if has_news_content(content):
        logger.info('here')
        # NOTE(review): parameter download_crawler is shadowed here
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        # d = pq(html.fromstring(content.decode("gbk","ignore")))
        utfflag = False
        if content.find("gb2312") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".shtml", "")
        type = TYPE
        post = None
        # title selector depends on which encoding/layout the page uses
        if utfflag is True:
            title = d('div#titsize> strong').text().strip()
        else:
            title = d('div.titmain> h1').text().strip()
        # logger.info("title: %s", title)
        if title is None or title.strip() == "":
            title = d('div.texttitbox> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        # try:
        #     brief = d('div.daodu> p').text().strip().replace("【数据猿导读】","")
        # except:
        #     brief = None
        brief = None
        try:
            if utfflag is True:
                post_time = d("p.time> span.mh-title").text().strip()
            else:
                post_time = d("meta[property='og:release_date']").attr(
                    "content").split("+")[0]
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        # NOTE(review): unlike sibling parsers there is no `if news_time is
        # None` fallback here; extracttime returning None would make the
        # timedelta below raise — confirm whether that can happen.
        if utfflag is True:
            article = d('div.tbox.content').html()
        else:
            article = d('div.texttit_m1').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # NOTE(review): -20h offset, while every sibling parser uses -8h — confirm intent
            "date": news_time - datetime.timedelta(hours=20),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # drop site-promo links/banners
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        pass
def process(g, crawler, url, key, content):
    # Parse a pencilnews article page: publish time is scraped from an
    # inline "var publishTime = new Date(...)" script. Re-crawls/replaces
    # stale records because the site changed its key scheme (see note below).
    if has_content(content):
        # logger.info(content)
        main = pq(content)('div.article_content')
        d = pq(main)
        title = d('h1#article_title').text()
        brief = pq(content)("meta[name='description']").attr("content")
        # post_time = pq(content)("meta[property='article:published_time']").attr("content").split("+")[0]
        # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        result = util.re_get_result("var publishTime = new Date\(\"(.*?)\"\)", content)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
        else:
            # NOTE(review): exit() kills the whole process when the publish
            # time cannot be found; the following return is unreachable.
            logger.info("incorrcet post time")
            logger.info(content)
            exit()
            return
        contents = extract.extractContents(url, content)
        # funding-related headlines get the funding category id
        if title.find("融资") >= 0 or title.find("获投") >= 0:
            category = 60101
        else:
            category = None
        tags = []
        articletags = pq(content)("meta[name='keywords']").attr("content")
        if articletags is None:
            logger.info(content)
        else:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)
        logger.info("%s, %s, %s, %s, %s", key, title, news_time, category, ":".join(tags))
        # logger.info(news_time)
        # logger.info(contents)
        # for t in contents:
        #     logger.info(t["data"])
        # item = collection_news.find_one({"source": g.SOURCE, "key_int": int(key)})
        craw = True
        # 2016-10-01 pencilnews website upgrade, news keys changed!
        # Have to redownload article with new keys
        if collection_news.find_one({
                "source": g.SOURCE,
                "key_int": int(key)
        }) is not None:
            cnews = collection_news.find_one({
                "source": g.SOURCE,
                "key_int": int(key)
            })
            logger.info("%s, %s", url, cnews["link"])
            if url == cnews["link"]:
                # same key, same link: already stored, skip re-crawl
                craw = False
            else:
                # same key but different link: purge the stale record
                collection_news.delete_many({
                    "source": g.SOURCE,
                    "key_int": int(key)
                })
                logger.info("different link!")
        if craw:
            # if another source already stored this title with a typed record,
            # do not duplicate it
            newses = list(
                collection_news.find({
                    "title": title,
                    "source": {
                        "$ne": g.SOURCE
                    }
                }))
            for news in newses:
                if news.has_key("type") and news["type"] > 0:
                    craw = False
                    break
        if craw:
            # untyped duplicates from other sources are replaced by this one
            if collection_news.find_one({
                    "title": title,
                    "source": {
                        "$ne": g.SOURCE
                    }
            }) is not None:
                collection_news.delete_many({
                    "title": title,
                    "source": {
                        "$ne": g.SOURCE
                    }
                })
            flag, domain = url_helper.get_domain(url)
            dnews = {
                "date": news_time - datetime.timedelta(hours=8),
                "title": title,
                "link": url,
                "createTime": datetime.datetime.now(),
                "source": g.SOURCE,
                "key": key,
                "key_int": int(key),
                "type": TYPE,
                "original_tags": tags,
                "processStatus": 0,
                "companyId": None,
                "companyIds": [],
                "category": category,
                "domain": domain
            }
            dcontents = []
            rank = 1
            for c in contents:
                # "/The End/" marks the end of the article body
                if c["data"] == "/The End/":
                    break
                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents
            if brief is None or brief.strip() == "":
                brief = util.get_brief_from_news(dcontents)
            post = util.get_poster_from_news(dcontents)
            dnews["post"] = post
            dnews["brief"] = brief
            if news_time > datetime.datetime.now():
                logger.info("Time: %s is not correct with current time", news_time)
                dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
            collection_news.insert(dnews)
            logger.info("*************DONE**************")
            g.latestIncr()
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    # Parse a WordPress-layout article (DealStreetAsia-style): detects the
    # subscription paywall and marks such records with processStatus = -5
    # instead of 0/1 before saving via parser_mongo_util.
    if has_news_content(content):
        # logger.info('here')
        # NOTE(review): parameter download_crawler is shadowed here
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        # key is the second-to-last URL path segment (trailing-slash URLs)
        key = newsurl.split("/")[-2].replace(".html", "")
        type = TYPE
        title = d('h1.single-title').text().strip()
        newspost = d('header> img.wp-post-image').attr("src")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        # try:
        #     post_time = topic
        #     # logger.info(post_time)
        #     news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        #     logger.info("news-time: %s", news_time)
        # except Exception, e:
        #     logger.info(e)
        # publish-time extraction is disabled; crawl time is used instead
        news_time = datetime.datetime.now()
        article = d('section.post_content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        processStatus = 0
        dcontents = []
        rank = 1
        for c in contents:
            # drop site-promo links/banners
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            # paywalled article: flag it so downstream processing can skip it
            if c["data"].find(
                    "Continue reading this story with a subscription to DealStreetAsia"
            ) >= 0:
                processStatus = -5
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        if processStatus != 0:
            dnews["processStatus"] = processStatus
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s | %s", nid, processStatus)
            pass
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    # Parse an article with the 'div.subject' layout; the poster comes in as
    # the newspost argument. Inserts directly into article.news (no
    # parser_mongo_util wrapper in this variant).
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].strip().replace(".shtml", "")
        type = TYPE
        category = None
        title = d('div.subject> h1').text().strip()
        tags = []
        post = newspost
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.meta> span.meta-date').text().replace("发布", "")
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.subject> div.subject-content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # NOTE(review): uses get_logo_id (no width/height) while
                    # sibling parsers use get_logo_id_new — confirm whether
                    # dimensions are needed for this source.
                    imgurl = parser_mysql_util.get_logo_id(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a page whose article body is embedded in page JavaScript.

    The body lives in a ``content: '...'`` assignment inside the scripts,
    HTML-entity-escaped; it is pulled out by regex, unescaped, and fed to
    the normal content extractor. Saves via parser_mongo_util.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        key = newsurl.split("/")[-1].replace("i", "")
        news_type = TYPE  # renamed from `type` to avoid shadowing the builtin
        category = None
        title = d('head> title').text().strip()
        # NOTE(review): [:-1] drops the trailing "d" of "groupId" from the
        # pattern -- looks deliberate (loosens the anchor) but confirm it is
        # not a paste error.
        r = "content: '(.*?)',.*groupId"
        result = util.re_get_result(r.strip()[:-1], content)
        (b,) = result
        logger.info(b)
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            # normalize full-width commas before splitting
            # NOTE(review): previous code had a no-op replace(",", ",");
            # restored to full-width-comma normalization -- verify.
            for tag in articletags.replace("，", ",").split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        post = None
        brief = None
        news_time = None
        try:
            r1 = "time: '(.*?)'.*},.*tagInfo"
            result = util.re_get_result(r1, content)
            (post_time,) = result
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception:
            # publish time is best-effort; fall back to "now" below
            pass
        if news_time is None:
            news_time = datetime.datetime.now()
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            # already stored (dedup by title)
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # stored time shifted by -8h; source timestamps are UTC+8
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": news_type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": 60101,
            "domain": domain,
            "categoryNames": [],
        }
        dcontents = []
        rank = 1
        # Unescape the HTML entities of the embedded article body.
        # NOTE(review): previous code contained no-op replaces ("<" -> "<"),
        # almost certainly entity references corrupted by a past edit;
        # restored here -- verify against a live payload.
        bb = b.replace("&lt;", "<").replace("&gt;", ">") \
              .replace("&quot;", "\"").replace("&#x3D;", "=")
        logger.info(bb)
        contents = extract.extractContents(newsurl, bb, document=False)
        for c in contents:
            logger.info(c["data"])
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(dcontents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
def process_news(column, newsurl, content, newspost, download_crawler, sort):
    """Parse one news page; funding stories (sort contains "投融资") get type 60001.

    :param sort: source column/sort label used to classify the news type
    Other parameters as in the sibling process_news variants.
    """
    if has_news_content(content):
        logger.info("here")
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1]
        news_type = TYPE  # renamed from `type` to avoid shadowing the builtin
        if sort.find("投融资") >= 0:
            news_type = 60001  # funding news
        category = None
        title = d('div.mod-head> h1').text().strip()
        if title is None or title == "":
            return
        tags = []
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:
            brief = None
        try:
            post_time = d('span.time> time').text()
            logger.info(post_time)
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M")
        except Exception:
            # unparseable timestamp -> fall back to "now"
            news_time = datetime.datetime.now()
        article = d('div.mod-body> div.content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s, %s, %s", key, title, news_time,
                    ":".join(tags), news_type, category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            # already stored (dedup by title)
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # stored time shifted by -8h; source timestamps are UTC+8
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": news_type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
                logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
def process_news(content, news_key, url, news_posttime):
    """Parse a TechCrunch article page and save it via parser_mongo_util.

    :param content: raw HTML bytes of the page (UTF-8)
    :param news_key: numeric article key assigned by the caller
    :param url: article URL
    :param news_posttime: caller-supplied publish time (unused here; the
        "sailthru.date" meta tag is authoritative)
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('utf-8')))
        title = d('header.article-header>h1').text().strip()
        if title is None or title.strip() == "":
            logger.info("wrong title for url: %s", url)
            return
        post_time = pq(content)("meta[name='sailthru.date']").attr("content")
        # shift the source timestamp forward 15h -- presumably US time to
        # local UTC+8; TODO confirm against the feed's timezone
        news_time = datetime.datetime.strptime(
            post_time, "%Y-%m-%d %H:%M:%S") + datetime.timedelta(hours=15)
        key = news_key
        try:
            postraw = pq(content)("meta[property='og:image']").attr("content")
            if postraw.find("techcrunch.opengraph.default.png") >= 0:
                postraw = None  # site placeholder image -> no poster
        except Exception:
            postraw = None
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        divtags = d('div.tags> div.tag-item')
        # FIX: the old filter `.strip() is not None` was always true;
        # drop empty tag texts instead
        tags = [pq(divtag)('a.tag').text().strip() for divtag in divtags
                if pq(divtag)('a.tag').text().strip() != ""]
        category = None
        logger.info("%s, %s, %s, %s, %s -> %s", key, title, post_time,
                    news_time, ":".join(tags), category)
        article = d('div.article-entry.text').html()
        contents = extract.extractContents(url, article)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        # dedup: same source+key, or same title from another source
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            mongo.close()
            return
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            mongo.close()
            return
        mongo.close()
        flag, domain = url_helper.get_domain(url)
        dnews = {
            # stored time shifted by -8h (storage convention is UTC)
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 1,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if len(dcontents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        logger.info("Done")
def process_news(newsurl, content, newspost, download_crawler):
    """Parse a WordPress-style article ("td-post-content") and save it.

    :param newsurl: article URL
    :param content: raw HTML bytes (UTF-8, decode errors ignored)
    :param newspost: poster image candidate
    :param download_crawler: image downloader; None stores raw image URLs
    """
    if has_news_content(content):
        logger.info('here.')
        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        category = None
        categoryNames = []
        news_type = TYPE  # renamed from `Type` for PEP 8 local naming
        tags = []
        brief = None
        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            # already stored (dedup by title)
            mongo.close()
            return
        # key: numeric suffix of the <article id="post-NNN"> attribute
        key = d('article').attr('id').strip().split('-')[-1]
        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except Exception:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            post_time = d("header> div> span> time").text().strip()
            # "2018年7月3日" -> "2018-07-03" (zero-pad month/day)
            res = re.search(u'(\d{4})年(\d+)月(\d+)日', post_time)
            year = res.group(1)
            month = res.group(2)
            if len(month) == 1:
                month = '0' + month
            day = res.group(3)
            if len(day) == 1:
                day = '0' + day
            post_time = '{}-{}-{}'.format(year, month, day)
            news_time = extract.extracttime(post_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.td-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # stored time shifted by -8h; source timestamps are UTC+8
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": news_type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c['type'] == 'text':
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews['contents'] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
def process_news(content, url):
    """Parse a PingWest article page and save it via parser_mongo_util.

    Uses the module-level ``collection_news`` handle for deduplication.
    :param content: raw HTML bytes (UTF-8)
    :param url: article URL
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d(
            'div.post-img-left> div> div.post-head> h1.title').text().strip()
        # publish time is a unix epoch in the "ptime" attribute
        post_time = d('article.post-article').attr("ptime")
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon,
                                      post_Date.tm_mday, post_Date.tm_hour,
                                      post_Date.tm_min, post_Date.tm_sec)
        # dedup: same link, or same title already stored by another source
        if collection_news.find_one({"link": url}) is not None:
            return
        if collection_news.find_one({
                "title": title,
                "source": {"$ne": SOURCE}
        }) is not None:
            return
        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except (TypeError, ValueError):
            # narrowed from a bare except: only conversion failures expected
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()
        if column is not None:
            tags = column.split()
        else:
            tags = []
        categoryNames = []
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip(
                ) not in tags and keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())
        postraw = d("link[rel='image_src']").attr("href")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        contents = extract.extractContents(url, article)
        flag, domain = url_helper.get_domain(url)
        dnews = {
            # NOTE(review): -16h here (other variants use -8h) -- presumably
            # compensating a localtime conversion; confirm intended
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # strip the CDN thumbnail suffix to get the full image
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"].replace("?imageView2/2/w/750/q/90", ""),
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"].replace("?imageView2/2/w/750/q/90", ""),
                        download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse an 8btc-style article page and save it via parser_mongo_util.

    Titles containing "融资" are classified as funding news (type 60001,
    category 60101). Boilerplate paragraphs (copyright, donation address,
    site images) are filtered out of the content.
    """
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        news_type = TYPE  # renamed from `type` to avoid shadowing the builtin
        title = d('div.da-title> h2').text().strip()
        if title.find("融资") >= 0:
            news_type = 60001
            category = 60101
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            # already stored (dedup by title)
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except Exception:
            brief = None
        try:
            post_time = d("span.article-time").eq(0).text().strip()
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:  # was Py2-only `except Exception, e`
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # stored time shifted by -8h; source timestamps are UTC+8
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": news_type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # skip boilerplate: donation address, copyright, site imagery
            if c["data"].find("btm地址") >= 0 or \
                    c["data"].find("版权声明") >= 0:
                continue
            if c["data"].find("8btctest1/custom/images") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
                logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
newstime = publish_time.split('+')[0].replace('T', ' ') news_time = datetime.datetime.strptime(newstime, "%Y-%m-%d %H:%M:%S") except Exception, e: logger.info(e) news_time = datetime.datetime.now() article = d( 'div.pfcng-row-02> div.pfcng-col-2> div.pos-0> div.pane-content').html( ) if article.find('<form') >= 0: form_str = re.search('<form(.*?)</form>', article).group(1) article = article.replace(form_str, '') # elif article.find('<iframe') >= 0: # iframe_str = re.search('<iframe(.*?)</iframe>',article).group(1) # article = article.replace(iframe_str, '') contents = extract.extractContents(newsurl, article, document=False) logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief) flag, domain = url_helper.get_domain(newsurl) dnews = { "date": news_time - datetime.timedelta(hours=8), "title": title, "link": newsurl, "createTime": datetime.datetime.now(), "source": SOURCE, "key": key, "key_int": None, "type": type, "original_tags": tags,