def expand():
    # init crawlers
    beian_links_crawler = beian_links.BeianLinksCrawler()
    icp_chinaz_crawler = icp_chinaz.IcpchinazCrawler()
    screenshot_crawler = screenshot_website.phantomjsScreenshot()
    download_crawler_itjuzi = download.DownloadCrawler(max_crawl=200, timeout=10)
    download_crawler_kr36 = download.DownloadCrawler(use_proxy=False)
    download_crawler_lagou = download.DownloadCrawler(use_proxy=True)
    download_crawler = download.DownloadCrawler()

    while True:
        # gevent -> list of source_companies
        if len(COMPANIES) == 0:
            return
        sc = COMPANIES.pop(0)
        source = sc["source"]
        sourceId = sc["sourceId"]
        # company_info_expand_mongo.expand_source_company(source, sourceId, beian_links_crawler,
        #                                                 icp_chinaz_crawler, screenshot_crawler)
        if source == 13030:
            diff_sourceCompanyId = check_expand_diff.check_diff(source, sourceId, download_crawler_itjuzi)
        elif source == 13020:
            diff_sourceCompanyId = check_expand_diff.check_diff(source, sourceId, download_crawler_kr36)
        elif source == 13050:
            diff_sourceCompanyId = check_expand_diff.check_diff(source, sourceId, download_crawler_lagou)
        else:
            diff_sourceCompanyId = check_expand_diff.check_diff(source, sourceId, download_crawler)
        logger.info("Source: %s, sourceId: %s, Diff: %s", source, sourceId, diff_sourceCompanyId)

        # Set processStatus in mysql and mongo
        mongo = db.connect_mongo()
        collection_source_company = mongo.source.company
        collection_source_company.update_one(
            {"source": source, "sourceId": sourceId},
            {'$set': {"processStatus": 1}})
        mongo.close()

        if diff_sourceCompanyId is not None:
            # # Set recommendIds
            # # insert audit_source_company
            # parser_mysql_util.insert_audit_source_company(diff_sourceCompanyId)
            # parser_mysql_util.update_db_processStatus(source, sourceId, 1)
            pass

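# A table-driven alternative (sketch, not part of the original source): the
# source-id branching in expand() maps three special source ids to dedicated
# crawlers and everything else to a default. Assuming the same
# check_expand_diff.check_diff signature, the dispatch could be a dict lookup;
# CRAWLER_BY_SOURCE and check_diff_for are hypothetical names.
CRAWLER_BY_SOURCE = {
    13030: download.DownloadCrawler(max_crawl=200, timeout=10),  # itjuzi
    13020: download.DownloadCrawler(use_proxy=False),            # 36kr
    13050: download.DownloadCrawler(use_proxy=True),             # lagou
}
DEFAULT_CRAWLER = download.DownloadCrawler()

def check_diff_for(source, sourceId):
    crawler = CRAWLER_BY_SOURCE.get(source, DEFAULT_CRAWLER)
    return check_expand_diff.check_diff(source, sourceId, crawler)
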
def process_news(column, d_map, content, download_crawler):
    download_crawler = download.DownloadCrawler(use_proxy=False)
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))
    category = None
    categoryNames = []
    newsurl = d_map['link']
    key = re.search('.*?(\d+)/.*', newsurl).group(1)
    type = TYPE
    title = d('h1.title').text().strip()
    brief = d('div.field-item> p').text().strip()
    publish_time = d('div.pfcng-row-01> div.pfcng-col-1> div.pos-2> div> div.node-published').attr('content')
    # newspost = d('div.pane-node-field-images> div> div> div> div> img').html()
    # logger.info('%s | %s | %s ' % (title, brief, publish_time))
    # (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
    # if posturl is not None:
    #     post = str(posturl)
    # else:
    post = None
    tags = []
    try:
        newstime = publish_time.split('+')[0].replace('T', ' ')
        news_time = datetime.datetime.strptime(newstime, "%Y-%m-%d %H:%M:%S")
    except Exception, e:
        logger.info(e)
        news_time = datetime.datetime.now()

def start_run(concurrent_num, flag):
    global DATE
    global CURRENT_PAGE
    while True:
        listcrawler = ListCrawler()
        newscrawler = ListCrawler()
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # download_crawler = None
        logger.info("%s news %s start...", NEWSSOURCE, flag)

        # # Re-download news of the last 24 hours
        # dt = datetime.date.today()
        # if DATE != dt:
        #     logger.info("Date changed!!! Back to yesterday")
        #     today = datetime.datetime(dt.year, dt.month, dt.day)
        #     yesterday = datetime.datetime(dt.year, dt.month, dt.day) - datetime.timedelta(days=1)
        #     mongo = db.connect_mongo()
        #     collection_news = mongo.article.news
        #     for nn in list(collection_news.find({"source": SOURCE, "createTime": {"$gt": yesterday, "$lt": today}})):
        #         link = nn["link"]
        #         logger.info("Redownload %s", link)
        #         crawler_news(column={}, crawler=newscrawler, newsurl=link, newspost=None, download_crawler=download_crawler)
        #     DATE = dt

        for column in columns:
            CURRENT_PAGE = 1
            run(flag, column, listcrawler, newscrawler, concurrent_num, download_crawler)
        logger.info("%s news %s end.", NEWSSOURCE, flag)
        if flag == "incr":
            time.sleep(60 * 8)  # 8 minutes
        else:
            return

def start_run(concurrent_num, codes, flag):
    download_crawler = download.DownloadCrawler(use_proxy=1)
    if len(codes) == 0:
        codesMongo = list(collectionUser.find())
        codes = [i['code'] for i in codesMongo]
    while True:
        logger.info("%s start...", SOURCENAME)
        zhihucrawler = Zhihucrawler()
        # download_crawler = download.DownloadCrawler(use_proxy=False)
        run(zhihucrawler, concurrent_num, codes, flag, download_crawler)
        logger.info("%s end.", SOURCENAME)
        # return
        if flag == "incr":
            logger.info('sleeping')
            gevent.sleep(60 * 60)  # 60 minutes
        else:
            return

def process_news(column, content, msg, download_crawler):
    download_crawler = download.DownloadCrawler(use_proxy=False)
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))
    title = msg['title']
    newsurl = msg['link']
    brief = msg['brief']
    newspost = msg['post']
    post_time = msg['newsDate']
    category = None
    categoryNames = []
    key = re.search('https://vulcanpost.com/(\d+)/.*', newsurl).group(1)
    type = TYPE
    (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
    if posturl is not None:
        post = str(posturl)
    else:
        post = None
    tags = []
    try:
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
        logger.info("news-time: %s", news_time)
    except Exception, e:
        logger.info(e)
        news_time = datetime.datetime.now()

def download_captchas():
    # Renamed from download(): a function named "download" would shadow the
    # download module, breaking the DownloadCrawler lookup below.
    crawler = download.DownloadCrawler()
    url = "http://www.sgs.gov.cn/notice/captcha"
    i = 100
    while i < 1000:
        i += 1
        image = crawler.get_image(url, max_retry=2)
        if image is not None:
            f = open("logs/%s.jpg" % i, 'wb')
            f.write(image)
            f.close()

def process(content, wechatcrawler, wechatprocess):
    j = json.loads(content)
    infos = j["value"]["datas"]
    logger.info("Got %s news", len(infos))
    cnt = 0
    download_crawler = download.DownloadCrawler(use_proxy=False)
    if len(infos) == 0:
        return cnt
    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    for info in infos:
        wexinlink = info["url"]
        readNum = int(info["clicks_count"])
        likeNum = int(info["like_count"])
        title = info["title"]
        try:
            publicTime = datetime.datetime.strptime(info["public_time"], "%Y-%m-%d %H:%M:%S.0") - datetime.timedelta(hours=8)
        except:
            publicTime = datetime.datetime.now() - datetime.timedelta(hours=8)
        logger.info("link: %s", wexinlink)
        logger.info("article : %s, read: %s, like: %s", title, readNum, likeNum)
        item = collection_news.find_one({"link": wexinlink})
        # item2 = collection_news.find_one({"title": title})
        if item is None:
            dnews = wechatprocess.crawler_news(wechatcrawler, wexinlink, download_crawler, wechatId="微信公众号")
            # for a in dnews:
            #     logger.info("%s _> %s", a, dnews[a])
            dnews["date"] = publicTime
            dnews["clicksCount"] = readNum
            dnews["likeCount"] = likeNum
            # dnews["wechatId"] = wechatId
            # dnews["wechatName"] = wechatName
            dnews["processStatus"] = 0
            dnews["imgChecked"] = True
            # dnews["sectors"] = [20]
            if dnews["result"] == 'SUCCESS' and len(dnews["contents"]) >= 1:
                dnews.pop('result')
                try:
                    collection_news.insert(dnews)
                    cnt += 1
                except Exception, e:
                    logger.info(e)
                    pass
        else:
            if item["source"] == 13841:
                logger.info("Update click/update: %s/%s", readNum, likeNum)
                collection_news.update_one({"_id": item["_id"]},
                                           {"$set": {"clicksCount": readNum, "likeCount": likeNum}})
    mongo.close()
    return cnt

def start_run():
    download_crawler = download.DownloadCrawler(use_proxy=False)
    while True:
        logger.info("Begin...")
        items = list(collection.find({"source": SOURCE, "parsed": {"$ne": True}}).limit(100))
        for item in items:
            parse(item, download_crawler)
            # break
        logger.info("End.")
        # break
        if len(items) == 0:
            time.sleep(60)

def run_xiniu(crawler=Zhihucrawler()):
    url = 'https://www.zhihu.com/api/v4/members/xi-niu-shu-ju/articles?include=data%5B*%5D.comment_count%2Ccan_comment%2Ccomment_permission%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20&sort_by=created'
    data = {'authorization': "oauth c3cef7c66a1843f8b3a9e6a1e3160e20"}
    while True:
        result = crawler.crawl(url, headers=data, agent=True)
        if result['get'] == 'success':
            process_xiniu(result,
                          code='xi-niu-shu-ju',
                          flag='incr',
                          download_crawler=download.DownloadCrawler(use_proxy=1))
            break

def start_run(flag):
    while True:
        logger.info("%s news %s start...", NEWSSOURCE, flag)
        listcrawler = ListCrawler()
        newscrawler = NewsCrawler()
        download_crawler = download.DownloadCrawler(use_proxy=False)
        run(flag, listcrawler, newscrawler, download_crawler)
        logger.info("%s news %s end.", NEWSSOURCE, flag)
        if flag == "incr":
            time.sleep(60 * 8)
        else:
            return

def start_run(concurrent_num, flag):
    global CURRENT_PAGE
    while True:
        logger.info("%s news %s start...", NEWSSOURCE, flag)
        listcrawler = ListCrawler()
        newscrawler = NewsCrawler()
        download_crawler = download.DownloadCrawler(use_proxy=False)
        for column in columns:
            CURRENT_PAGE = 1
            run(flag, column, listcrawler, newscrawler, concurrent_num, download_crawler)
        logger.info("%s news %s end.", NEWSSOURCE, flag)
        if flag == "incr":
            gevent.sleep(60 * 8)  # 8 minutes
        else:
            return

def start_run(concurrent_num, flag):
    global CURRENT_PAGE
    while True:
        logger.info("%s news %s start...", NEWSSOURCE, flag)
        listcrawler = ListCrawler()
        newscrawler = ListCrawler()
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # download_crawler = None
        forums = get_columns(listcrawler)
        for forumlink in forums:
            column = {"column": forumlink, "max": 1}
            CURRENT_PAGE = 1
            run(flag, column, listcrawler, newscrawler, concurrent_num, download_crawler)
        logger.info("%s news %s end.", NEWSSOURCE, flag)
        if flag == "incr":
            time.sleep(60 * 50)  # 50 minutes
        else:
            return

def start_run(concurrent_num, codes, flag):
    download_crawler = download.DownloadCrawler(use_proxy=1)
    while True:
        logger.info("%s start...", SOURCENAME)
        zhihucrawler = Zhihucrawler()
        # download_crawler = download.DownloadCrawler(use_proxy=False)
        run(zhihucrawler, concurrent_num, codes, flag, download_crawler)
        logger.info("%s end.", SOURCENAME)
        # return
        if flag == "incr":
            logger.info('sleeping')
            gevent.sleep(60 * 60)  # 60 minutes
        else:
            return

def process_news(content, download_crawler):
    download_crawler = download.DownloadCrawler(use_proxy=False)
    category = None
    categoryNames = []
    key = content['id']
    type = TYPE
    title = content['title']
    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return
    newspost = content.get('featured_image').get('source')
    (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
    if posturl is not None:
        post = str(posturl)
    else:
        post = None
    # logger.info(post)
    tags = []
    for tag in content['tags']:
        tags.append(tag['name'])
    brief = content['seo']['description']
    try:
        post_time = content['modified_gmt']
        news_time = None
        # find() returns an index, so test >= 0; a bare find() is falsy when 'T' is at index 0
        if post_time.find('T') >= 0:
            post_time = post_time.replace('T', ' ')
        news_time = extract.extracttime(post_time)
        logger.info("news-time: %s", news_time)
    except Exception, e:
        logger.info(e)
        news_time = datetime.datetime.now()

def crawlerNews(link, pdate=None):
    download_crawler = download.DownloadCrawler(use_proxy=False)
    download_crawler_n = None
    if link.find("pencilnews.cn") >= 0:
        pencil_news_v2.crawler_news({}, pencil_news_v2.NewsCrawler(), link, None, download_crawler)
    elif link.find("lieyunwang.com") >= 0:
        lieyun_news.run_news(lieyun_news.LieyunNewsCrawler(), link)
    elif link.find("iyiou.com") >= 0:
        iyiou_news.crawler_news({}, iyiou_news.NewsCrawler(), link)
    elif link.find("huxiu.com") >= 0:
        huxiu_news.crawler_news({}, huxiu_news.NewsCrawler(), link, None, download_crawler)
    elif link.find("leiphone.com") >= 0:
        leiphone_news.process(leiphone_news.Contentcrawler(), link)
    elif link.find("36kr.com") >= 0:
        kr36_news.run_news(kr36_news.kr36NewsCrawler(), link)
    elif link.find("mp.weixin.qq.com") >= 0:
        wechatcrawler = Wechatcrawler.WechatCrawler()
        wechatprocess = Wechatcrawler.NewsDownloader()
        dnews = wechatprocess.crawler_news(wechatcrawler, link, download_crawler, wechatId="微信公众号")
        # dnews["wechatId"] = "微信公众号"
        # dnews["wechatName"] = "微信公众号"
        # try:
        #     dnews["date"] = datetime.datetime.strptime(pdate, "%Y-%m-%d %H:%M:%S") - datetime.timedelta(hours=8)
        # except:
        #     dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if dnews["result"] == 'SUCCESS' and dnews.has_key("contents") and len(dnews["contents"]) >= 1:
            dnews.pop('result')
            try:
                mongo = db.connect_mongo()
                collection_news = mongo.article.news
                id = collection_news.insert(dnews)
                mongo.close()
                logger.info("Done %s", id)
                # collection_news.insert(dnews)
            except Exception, e:
                logger.info(e)
                pass

def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-2].replace(".html", "")
        type = TYPE
        title = d('h1.single-title').text().strip()
        newspost = d('header> img.wp-post-image').attr("src")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        # try:
        #     post_time = topic
        #     # logger.info(post_time)
        #     news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        #     logger.info("news-time: %s", news_time)
        # except Exception, e:
        #     logger.info(e)
        news_time = datetime.datetime.now()
        article = d('section.post_content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        processStatus = 0
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
               c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["data"].find("Continue reading this story with a subscription to DealStreetAsia") >= 0:
                processStatus = -5
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        if processStatus != 0:
            dnews["processStatus"] = processStatus
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s | %s", nid, processStatus)
            pass
    return

def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("gbk")))
        key = newsurl.split("/")[-1].replace(".htm", "")
        type = TYPE
        category = None
        categoryNames = []
        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        # post = d('div#post_thumbnail> img').attr("src")
        # postraw = newspost
        # (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None
        post = None
        brief = d("meta[name='description']").attr("content")
        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)
        # logger.info(contents)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
               c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        #     dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return

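# Shared helper (sketch, not part of the original source): the text/image loop
# that builds dcontents is repeated almost verbatim across these parsers.
# Assuming the parser_mysql_util.get_logo_id_new call used above, a common
# builder could look like this; build_dcontents and skip_markers are
# hypothetical names.
def build_dcontents(contents, download_crawler, source, key, skip_markers=()):
    dcontents = []
    rank = 1
    for c in contents:
        # drop blocks carrying any caller-supplied blacklist marker
        if any(c["data"].find(m) >= 0 for m in skip_markers):
            continue
        if c["type"] == "text":
            dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
        elif download_crawler is None:
            dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
        else:
            (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                c["data"], download_crawler, source, key, "news")
            if imgurl is None:
                continue  # image could not be mirrored; skip the block
            dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                  "height": int(height), "width": int(width)}
        dcontents.append(dc)
        rank += 1
    return dcontents
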
        for column in columns:
            CURRENT_PAGE = 1
            run(flag, column, listcrawler, newscrawler, concurrent_num, download_crawler)
        logger.info("%s news %s end.", NEWSSOURCE, flag)
        if flag == "incr":
            time.sleep(60 * 38)  # 38 minutes
        else:
            return
            # gevent.sleep(86400 * 3)  # 3 days


if __name__ == "__main__":
    if len(sys.argv) > 1:
        param = sys.argv[1]
        if param == "incr":
            start_run(1, "incr")
        elif param == "all":
            start_run(1, "all")
        else:
            link = param
            download_crawler = download.DownloadCrawler(use_proxy=False)
            # download_crawler = None
            crawler_news({"column": "new", "max": 1}, NewsCrawler(), link, "", download_crawler, "投融资")
    else:
        start_run(1, "incr")

def process(content, wechatcrawler, wechatprocess):
    # j = json.loads(content)
    # infos = j["value"]
    # logger.info("Got %s news", len(infos))
    cnt = 0
    d = pq(html.fromstring(content.decode("utf-8")))
    title = d('head> title').text().strip()
    logger.info("title: %s", title)
    download_crawler = download.DownloadCrawler(use_proxy=False)
    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    for li in d('div.news-box> ul.news-list>li'):
        try:
            title = d(li)('h3> a').text()
            title = "".join(title.split(" "))
            wexinlink = d(li)('h3> a').attr("href").strip()
            # read the timestamp from this list item, not the first match on the page
            post_time = d(li)('div.s-p').attr("t")
            logger.info(post_time)
            try:
                post_time = time.localtime(int(post_time))
                news_time = datetime.datetime(post_time.tm_year, post_time.tm_mon, post_time.tm_mday,
                                              post_time.tm_hour, post_time.tm_min, post_time.tm_sec)
                if news_time is None:
                    news_time = datetime.datetime.now()
            except:
                news_time = datetime.datetime.now()
            logger.info("link: %s", wexinlink)
            logger.info("article : %s,%s", title, news_time)
            item = collection_news.find_one({"link": wexinlink})
            item2 = collection_news.find_one({"title": title})
            # logger.info(item)
            # logger.info(item2)
            if item is None and item2 is None:
                logger.info("here crawler")
                dnews = wechatprocess.crawler_news(wechatcrawler, wexinlink, download_crawler, wechatId="微信公众号")
                # dnews["wechatId"] = wechatId
                # dnews["wechatName"] = wechatName
                dnews["title"] = title
                dnews["date"] = news_time - datetime.timedelta(hours=8)
                dnews["processStatus"] = 0
                dnews["imgChecked"] = True
                dnews["category"] = None
                if dnews["result"] == 'SUCCESS' and len(dnews["contents"]) >= 1:
                    dnews.pop('result')
                    try:
                        id = collection_news.insert(dnews)
                        logger.info("**************: %s", id)
                        cnt += 1
                    except Exception, e:
                        logger.info(e)
                        pass
        except:
            pass
    mongo.close()
    return cnt

def process_news(column, newsurl, content, newspost, download_crawler):
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1]
        # type = TYPE
        category = None
        title = d('.article_title p').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return
        tags = []
        articletags = d(".labs a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        postraw = newspost
        # post = d('div#post_thumbnail> img').attr("src")
        # if post is not None:
        #     post = "http://vcbeat.com" + post
        brief = None
        # brief = brief.replace(u'摘要', '').replace(u'摘要:', '').replace(u'摘要:', '').strip()
        # news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        news_time = d('.time').text().strip()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        # dt = datetime.date.today()
        # today = datetime.datetime.now()
        # if news_time is None or news_time > today:
        #     news_time = datetime.datetime.now()
        article = d('.art_text').html()
        contents = extract.extractContents(newsurl, article, document=False)
        # if len(contents) == 0:
        #     contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time, ":".join(tags), category, brief, postraw)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl}, {'$set': dnews})
            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id'] = oldId
            # collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
    return

import os
import sys

import pymongo

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../support'))
import loghelper
import util, name_helper, url_helper, download, db

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import parser_db_util

# logger
loghelper.init_logger("card_d3", stream=True)
logger = loghelper.get_logger("card_d3")

download_crawler = download.DownloadCrawler(use_proxy=True)

SOURCE = 13121


# parse data from qimingpian directly; bamy called it step 1, to check out the company
def parse_company(item):
    logger.info("parse_company")
    company_key = item["postdata"]["id"]

    # company basic info
    c = item["data"]["basic"]
    tags = c["tags"]
    tags_str = tags.replace("|", ",")

def process_news(content, news_key, url):
    if has_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content))
        brief = d("meta[name='description']").attr("content").split(",")[-1]
        title = d('div#article> div.single-item> div.article-hd> h1').text().strip()
        pagetitle = d('head> title').text().strip()
        temp = pagetitle.split("-")[-2]
        categoryNames = []
        if temp.strip() == "初页":
            category = 60102
            categoryNames.append("产品")
        elif temp.strip() == 'IPO/并购':
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        post_time = d('div.author-time> span.date-time').attr("data-time")
        post_date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_date.tm_year, post_date.tm_mon, post_date.tm_mday,
                                      post_date.tm_hour, post_date.tm_min, post_date.tm_sec)
        key = news_key
        column = d('div.article-tags> a').text()
        tags = column.split()
        logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, post_time, news_time, temp, category, ":".join(tags))
        article = d('div#article> div> div.article-content').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        # for t in contents:
        #     logger.info(t["data"])
        #     logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "" or desc_helper.check_desc(brief, 2) is False:
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)

def process_news(column, newsurl, content, newspost):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]
        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key
        type = TYPE
        category = None
        categoryNames = []
        if content["news"].has_key("newsChannelId"):
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")
        if content["news"].has_key("tagName"):
            if content["news"]["tagName"] == '人物':
                category = 60103
        tags = []
        if content.has_key("keywordList") and len(content["keywordList"]) > 0:
            for tag in content["keywordList"]:
                if tag.has_key("keyword") and tag["keyword"] is not None and \
                   tag["keyword"].strip() != "" and tag["keyword"] not in tags:
                    tags.append(tag["keyword"])
        title = content["news"]["title"].replace("&quot;", "\"")
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info("***************************News existed!!!***********************")
            mongo.close()
            return
        # post = d('div#post_thumbnail> img').attr("src")
        postraw = "http://pic.chinaventure.com.cn/" + content["news"]["coverImg"]
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = content["news"]["introduction"]
        post_time = content["news"]["updateAt"]
        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()
        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        # for c in contents:
        #     logger.info(c["data"])
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, post)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        # logger.info("*************DONE*************")
    return

def process_news(column, j_content, content, download_crawler):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = j_content['id']
        type = TYPE
        title = j_content['title']
        newspost = j_content.get('image')
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        brief = j_content['description']
        newsurl = j_content['uri']
        try:
            date = j_content['date']
            post_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(str(date)[:-3])))
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") - datetime.timedelta(days=1)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        article = d('div.article-container').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                if c["data"].find("Share to facebookShare to twitterShare to linkedin") >= 0:
                    c['data'] = c['data'].replace('Share to facebookShare to twitterShare to linkedin', '')
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # logger.info(json.dumps(dnews, ensure_ascii=False, indent=2, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass

def process_news(content, url):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d('div.post-img-left> div> div.post-head> h1.title').text().strip()
        post_time = d('article.post-article').attr("ptime")
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon, post_Date.tm_mday,
                                      post_Date.tm_hour, post_Date.tm_min, post_Date.tm_sec)
        if collection_news.find_one({"link": url}) is not None:
            return
            # collection_news.delete_one({"link": url})
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except:
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()
        if column is not None:
            tags = column.split()
        else:
            tags = []
        categoryNames = []
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip() not in tags and \
                   keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())
        postraw = d("link[rel='image_src']").attr("href")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time, news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "",
                          "image_src": c["data"].replace("?imageView2/2/w/750/q/90", "")}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"].replace("?imageView2/2/w/750/q/90", ""), download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        #     dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)

def process(crawler):
    while True:
        if len(URLS) == 0:
            return
        linkDict = URLS.pop(0)
        retry = 0
        while True:
            retry += 1
            if retry > 6:
                break
            download_crawler = download.DownloadCrawler(use_proxy=False)
            url = linkDict['href']
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(html.fromstring(result['content'].decode("utf-8")))
                title = linkDict['title']
                key = url.split('/')[-1]
                category = d('.al-crumbs a:nth-child(2)').text()
                if categoryDict.has_key(category):
                    TYPE = categoryDict[category]['type']
                    category = categoryDict[category]['category']
                else:
                    TYPE = 60001
                    category = None
                brief = linkDict['brief']
                postraw = linkDict['post']
                tags = []
                # for tag in d('.tags').text().split():
                #     if tag.strip() not in tags: tags.append(tag)
                news_time = d('.article__published').eq(0).text()
                # news_time = datetime.datetime.strptime(' '.join(news_time.split(' ')[:2]), '%Y年%m月%d日 %H:%M')
                # news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %p %I:%M')
                news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %H:%M')
                flag, domain = url_helper.get_domain(url)
                dnews = {
                    "date": news_time - datetime.timedelta(hours=8),
                    "title": title,
                    "link": url,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": None,
                    "type": TYPE,
                    "original_tags": tags,
                    "processStatus": 0,
                    # "companyId": None,
                    "companyIds": [],
                    "category": category,
                    "domain": domain,
                    "categoryNames": []
                }
                article = d('.article__content').html()
                contents = extract.extractContents(url, article)
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
                    else:
                        if download_crawler is None:
                            dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                        else:
                            (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                            if imgurl is not None:
                                dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                                      "height": int(height), "width": int(width)}
                            else:
                                continue
                    dcontents.append(dc)
                    rank += 1
                dnews["contents"] = dcontents
                if brief is None or brief.strip() == "":
                    brief = util.get_brief_from_news(dcontents)
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
                if posturl is not None:
                    post = str(posturl)
                else:
                    post = None
                if post is None or post.strip() == "":
                    post = util.get_posterId_from_news(dcontents)
                if download_crawler is None:
                    dnews["post"] = post
                else:
                    dnews["postId"] = post
                # brief = brief[:100]
                dnews["brief"] = brief
                mongo = db.connect_mongo()
                collection_news = mongo.article.news
                # update link content with oldId
                item = collection_news.find_one({"link": url})
                if len(dcontents) > 1:
                    if item is None:
                        # collection_news.insert(dnews)
                        nid = parser_mongo_util.save_mongo_news(dnews)
                        logger.info("Done: %s", nid)
                    else:
                        logger.info("update %s", url)
                        # oldId = collection_news.find_one({"link": url})['_id']
                        # collection_news.delete_one({"link": url})
                        # dnews['_id'] = oldId
                        # collection_news.insert(dnews)
                mongo.close()
                logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, news_time, category, " ".join(tags), brief, post)
                logger.info("*************DONE*************")
                break

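# Bounded-retry alternative (sketch, not part of the original source): the
# manual retry counter in process() can be written as a bounded loop so the
# exit condition is explicit; crawl_with_retry is a hypothetical name.
def crawl_with_retry(crawler, url, max_retry=6):
    for attempt in range(1, max_retry + 1):
        result = crawler.crawl(url)
        if result['get'] == 'success':
            return result
        logger.info("retry %s/%s for %s", attempt, max_retry, url)
    return None  # caller treats None as a permanent failure
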
def process_news(column, newsurl, content, newspost, download_crawler):
    # if has_news_content(content):
    if 1:
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode('gb2312', 'ignore')))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".htm", "")
        title = d('h1.title').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        brief = None
        news_time = d('.timer').text()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M:%S')
        article = d('.content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            # "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
               c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            TYPE = 60001
        else:
            TYPE = 60010
            logger.info('%s is not fundingNews', title)
        dnews['type'] = TYPE
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
    return

def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        # d = pq(html.fromstring(content.decode("gbk", "ignore")))
        utfflag = False
        if content.find("gb2312") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".shtml", "")
        type = TYPE
        post = None
        if utfflag is True:
            title = d('div#titsize> strong').text().strip()
        else:
            title = d('div.titmain> h1').text().strip()
        # logger.info("title: %s", title)
        if title is None or title.strip() == "":
            title = d('div.texttitbox> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        # try:
        #     brief = d('div.daodu> p').text().strip().replace("【数据猿导读】", "")
        # except:
        #     brief = None
        brief = None
        try:
            if utfflag is True:
                post_time = d("p.time> span.mh-title").text().strip()
            else:
                post_time = d("meta[property='og:release_date']").attr("content").split("+")[0]
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if utfflag is True:
            article = d('div.tbox.content').html()
        else:
            article = d('div.texttit_m1').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=20),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
               c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass

def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        title = d('div.da-title> h2').text().strip()
        if title.find("融资") >= 0:
            type = 60001
            category = 60101
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        try:
            post_time = d("span.article-time").eq(0).text().strip()
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception, e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("btm地址") >= 0 or \
               c["data"].find("版权声明") >= 0:
                continue
            if c["data"].find("8btctest1/custom/images") >= 0:
                continue
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass

def process_news(content, news_key, url, news_posttime):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('utf-8')))
        title = d('header.article-header>h1').text().strip()
        if title is None or title.strip() == "":
            logger.info("wrong title for url: %s", url)
            return
        post_time = pq(content)("meta[name='sailthru.date']").attr("content")
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") + datetime.timedelta(hours=15)
        key = news_key
        try:
            postraw = pq(content)("meta[property='og:image']").attr("content")
            if postraw.find("techcrunch.opengraph.default.png") >= 0:
                postraw = None
        except:
            postraw = None
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        divtags = d('div.tags> div.tag-item')
        tags = [pq(divtag)('a.tag').text().strip() for divtag in divtags
                if pq(divtag)('a.tag').text().strip() is not None]
        category = None
        logger.info("%s, %s, %s, %s, %s -> %s", key, title, post_time, news_time, ":".join(tags), category)
        article = d('div.article-entry.text').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            mongo.close()
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            mongo.close()
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        mongo.close()
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {"rank": rank, "content": c["data"], "image": "", "image_src": ""}
            else:
                if download_crawler is None:
                    dc = {"rank": rank, "content": "", "image": "", "image_src": c["data"]}
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {"rank": rank, "content": "", "image": str(imgurl), "image_src": "",
                              "height": int(height), "width": int(width)}
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "" or post.find("techcrunch.opengraph.default.png") >= 0:
        #     post = util.get_poster_from_news(dcontents)
        #     dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if len(dcontents) > 0:
            # collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
    logger.info("Done")