def process_news(item, url, content, category_ori):
    """Parse one article page and store it as a news document.

    item: dict from the crawl queue; must contain "key" (source-side article id).
    url: canonical article link.
    content: raw HTML bytes (utf-8).
    category_ori: key into categoryDict mapping to our category codes.
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        # The site has two article templates; pick the title selector that matches.
        if content.find("c-single-normal__title") >= 0:
            title = d('h1.c-single-normal__title').text().strip()
        elif content.find("c-article-header__title") >= 0:
            title = d('h1.c-article-header__title').text().strip()
        else:
            # Unknown template -- nothing we can parse.
            return
        try:
            post_time = pq(content)("meta[property='og:updated_time']").attr(
                "content").split("+")[0]
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        except Exception:  # was a bare except; og:updated_time may be absent/malformed
            datecontent = d(
                'div.c-article-header-meta> span.c-article-header-meta__time'
            ).text().strip()
            logger.info("Date********%s", datecontent)
            result = util.re_get_result(r'(\d{4}\-)', datecontent)
            if result:
                # The visible date already carries a year.
                news_time = datetime.datetime.strptime(datecontent,
                                                       "%Y-%m-%d %H:%M")
            else:
                # Fix: `dt` was referenced here without being defined (NameError).
                # The sibling parser uses today's date to supply the missing year.
                dt = datetime.date.today()
                post_time = str(dt.year) + '-' + datecontent
                news_time = datetime.datetime.strptime(post_time,
                                                       "%Y-%m-%d %H:%M")
        key = item["key"]
        column = d(
            'div.c-article-header-meta> span.c-article-header-meta__category'
        ).text().strip()
        # Fix: attr() returns None when the meta tag is missing; slicing None raised
        # a TypeError before. The brief falls back to body text further down.
        brief = d("meta[name='description']").attr("content")
        if brief is not None:
            brief = brief[:100]
        if column is not None:
            tags = column.split()
        else:
            tags = []
        categoryNames = []
        category = categoryDict[category_ori]
        if category == 60105:
            categoryNames.append("大公司")
        keywords = d('div#article-content> div.c-article-tags').text()
        if keywords is not None:
            for keyword in keywords.split():
                if keyword is not None and keyword.strip() not in tags:
                    tags.append(keyword.strip())
        logger.info("%s, %s, %s, %s, %s, %s", key, title, news_time, category,
                    ":".join(tags), brief)
        article = d('article.s-single-article').html()
        contents = extract.extractContents(url, article)
        # Dedup: same link already stored, or same title stored from another source.
        if collection_news.find_one({"link": url}) is not None:
            return
        if collection_news.find_one({"title": title,
                                     "source": {"$ne": SOURCE}}) is not None:
            return
        postraw = d("meta[property='og:image']").attr("content")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        flag, domain = url_helper.get_domain(url)
        dnews = {
            # Site timestamps are UTC+8; shift to UTC for storage.
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        # Build the ranked content list: text inline, images re-hosted via mysql util.
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop it.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # Fall back to a poster extracted from the body when og:image is unusable.
        if post is None or post.strip() == "" or (
                post.find("http://") == -1 and post.find("https://") == -1):
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief[:100]
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to "now" (stored as UTC).
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a toutiao-style page whose article body is embedded in inline JS,
    and store it as a news document. Dedup is by title only.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        key = newsurl.split("/")[-1].replace("i", "")
        type = TYPE
        category = None
        title = d('head> title').text().strip()
        # The body lives in a JS assignment: content: '...'. NOTE(review): the
        # [:-1] below deliberately(?) trims the trailing "d" from "groupId" --
        # kept as-is, confirm against live pages.
        r = "content: '(.*?)',.*groupId"
        result = util.re_get_result(r.strip()[:-1], content)
        (b,) = result
        logger.info(b)
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            # Normalize full-width commas before splitting the keyword list.
            for tag in articletags.replace("，", ",").split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        post = None
        brief = None
        news_time = None
        try:
            r1 = "time: '(.*?)'.*},.*tagInfo"
            result = util.re_get_result(r1, content)
            (post_time,) = result
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception:
            pass  # best effort: fall through to "now" below
        if news_time is None:
            news_time = datetime.datetime.now()
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": 60101,
            "domain": domain,
            "categoryNames": [],
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        # Fix: this entity-unescape chain had been corrupted into no-op replaces
        # (and was not even valid Python). Restore decoding of the HTML entities
        # the inline JS escapes. TODO(review): confirm "&#x3D;" is the escape the
        # site uses for "=".
        bb = b.replace("&lt;", "<").replace("&gt;", ">").replace(
            "&quot;", "\"").replace("&#x3D;", "=")
        logger.info(bb)
        contents = extract.extractContents(newsurl, bb, document=False)
        for c in contents:
            logger.info(c["data"])
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mid = None
        if title is not None and len(dcontents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
        return
def process_news(content, url):
    """Parse a PingWest-style article page and store it as a news document.

    content: raw HTML bytes (utf-8); url: canonical article link.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d(
            'div.post-img-left> div> div.post-head> h1.title').text().strip()
        # "ptime" attribute is a unix timestamp (seconds); converted via localtime.
        post_time = d('article.post-article').attr("ptime")
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon,
                                      post_Date.tm_mday, post_Date.tm_hour,
                                      post_Date.tm_min, post_Date.tm_sec)
        # Dedup: same link already stored, or same title from another source.
        if collection_news.find_one({"link": url}) is not None:
            return
        # collection_news.delete_one({"link": url})
        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            return
        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except:
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()
        if column is not None:
            tags = column.split()
        else:
            tags = []
        categoryNames = []
        # Map the site column to internal category codes.
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                # Site-brand keywords are excluded from tags.
                if keyword is not None and keyword.strip(
                ) not in tags and keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())
        postraw = d("link[rel='image_src']").attr("href")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)
        # if collection_news.find_one({"link": url}) is not None:
        #     return
        # # collection_news.delete_one({"link": url})
        # #
        # if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
        #     return
        # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        flag, domain = url_helper.get_domain(url)
        dnews = {
            # NOTE(review): hours=16 differs from the hours=8 used by sibling
            # parsers -- presumably compensating for time.localtime(); confirm.
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        # Ranked content list: text paragraphs inline, images re-hosted; the
        # CDN resize suffix is stripped from image URLs.
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"].replace("?imageView2/2/w/750/q/90",""),
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"].replace("?imageView2/2/w/750/q/90",
                                                       ""),
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"].replace("?imageView2/2/w/750/q/90", ""),
                        download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop it.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to "now".
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse an article page that may be utf-8 or gb2312 encoded and save it.

    The page template differs with the encoding, so selectors are switched on
    `utfflag`. Dedup is by title only.
    """
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # gb2312-marked pages use the legacy template; everything else is utf-8.
        utfflag = False
        if content.find("gb2312") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".shtml", "")
        news_type = TYPE  # renamed from `type` to stop shadowing the builtin
        post = None
        if utfflag is True:
            title = d('div#titsize> strong').text().strip()
        else:
            title = d('div.titmain> h1').text().strip()
        # Some pages keep the headline in a third container; fall back to it.
        if title is None or title.strip() == "":
            title = d('div.texttitbox> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        brief = None
        try:
            if utfflag is True:
                post_time = d("p.time> span.mh-title").text().strip()
            else:
                post_time = d("meta[property='og:release_date']").attr(
                    "content").split("+")[0]
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:  # fix: was `except Exception, e` (py2-only syntax)
            logger.info(e)
            news_time = datetime.datetime.now()
        if utfflag is True:
            article = d('div.tbox.content').html()
        else:
            article = d('div.texttit_m1').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            # NOTE(review): hours=20 differs from the hours=8 used elsewhere --
            # confirm the intended timezone correction.
            "date": news_time - datetime.timedelta(hours=20),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": news_type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Skip site ads / fixed banner image.
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop it.
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to "now".
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        mongo.close()
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse a DealStreetAsia-style article page and store it as a news document.

    Paywalled articles are flagged via processStatus = -5 rather than dropped.
    """
    if has_news_content(content):
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        # Article slug (second-to-last URL path segment) is the source key.
        key = newsurl.split("/")[-2].replace(".html", "")
        type = TYPE
        title = d('h1.single-title').text().strip()
        newspost = d('header> img.wp-post-image').attr("src")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        # try:
        #     post_time = topic
        #     # logger.info(post_time)
        #     news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        #     logger.info("news-time: %s", news_time)
        # except Exception, e:
        #     logger.info(e)
        # Publication time is not parsed from the page; use crawl time.
        news_time = datetime.datetime.now()
        article = d('section.post_content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            # NOTE(review): initialized to 1 here (siblings use 0), then possibly
            # overwritten with -5 below -- confirm intended default.
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        processStatus = 0
        dcontents = []
        rank = 1
        for c in contents:
            # Skip known ad / banner fragments.
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            # Paywall marker: flag the document instead of storing a truncated body.
            if c["data"].find(
                    "Continue reading this story with a subscription to DealStreetAsia"
            ) >= 0:
                processStatus = -5
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop it.
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        if processStatus != 0:
            dnews["processStatus"] = processStatus
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to "now".
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s | %s", nid, processStatus)
            pass
        return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a 三文娱 (3wyu) article page and store it as a news document.

    Dedup is by title only; self-promotion boilerplate is filtered from the body.
    """
    if has_news_content(content):
        logger.info("here")
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        category = None
        # title = d('div.des').text().strip()
        title = d('h1.entry-title').text().strip()
        if title is None or title == "":
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            # NOTE(review): replace(",", ",") looks like a no-op -- presumably a
            # full-width comma ("，") was mangled in this source; confirm.
            for tag in articletags.replace(",", ",").split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        # # post = d('div#post_thumbnail> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        # post = d("meta[property='og:image']").attr("content")
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        # brief = None
        try:
            post_time = d('time.entry-date').attr("datetime").split("+")[0]
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y-%m-%dT%H:%M:%S")
        except:
            # Timestamp missing/malformed: fall back to crawl time.
            news_time = datetime.datetime.now()
        # article = d('div.artile_box> div.c').html()
        article = d('div.entry-content').html()
        contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            # End-of-article markers: everything after them is boilerplate.
            if c["data"].find("◆END◆") >= 0 or c["data"].find("…………………") >= 0:
                break
            # Skip the site's self-promotion lines.
            if c["data"].find("ACG 领域最具影响力的产业新媒体") >= 0 or c["data"].find(
                    "访问三文娱网站3wyu.com查看产业必读文章") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop it.
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to "now".
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        mid = None
        if title is not None and len(contents) > 0:
            # mid = collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
        # logger.info("*************DONE************* %s", mid)
        return
def process_news(column, newsurl, content, newsposttime, download_crawler):
    """Parse an article page and store it as a news document.

    newsposttime: publish time string harvested from the listing page (may be
    None); the article page's own time span is the fallback.
    """
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        # NOTE(review): key is the raw last path segment; int(key) below will
        # raise if it is ever non-numeric -- confirm URL format.
        key = newsurl.split("/")[-1].strip()
        type = TYPE
        title = d('div.article-wrap> div.article-head> h1').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        category = None
        categoryNames = []
        # Funding/M&A column maps to the funding category.
        if "投资并购" in tags:
            category = 60101
            categoryNames.append("融资")
        # post = d('div#post_thumbnail> img').attr("src")
        post = None
        brief = d("meta[name='description']").attr("content")
        news_time = None
        if newsposttime is not None:
            news_time = extract.extracttime(newsposttime)
        if news_time is None:
            # Fall back to the on-page date; a bare "today" string (no clock
            # time) means the article was published today.
            dt = datetime.date.today()
            post_time = d(
                'div.article-wrap> div.article-head> p> span.article-time'
            ).text()
            if post_time is None or post_time.strip() == str(dt):
                news_time = datetime.datetime.now()
                # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
            else:
                news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
        article = d('div.article-wrap> div.article-content').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    # Image could not be mirrored; drop it.
                    continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief
        # Design for sector:
        dnews["sectors"] = [10]
        dnews["sector_confidence"] = [1]
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to "now".
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a gbk-encoded article page, classify it as funding/non-funding
    news via news_classify, and store it as a news document.
    """
    if has_news_content(content):
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode('gbk', 'ignore')))
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        title = d('div.article> h1').text().strip()
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(
            newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        brief = None
        try:
            news_time = d('div.info> p> em').eq(0).text()
            news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        except Exception:  # was a bare except; date text may be missing/odd
            news_time = datetime.datetime.now()
        article = d('div.article-t').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        # Fix: key_int previously stored str(key) in a field every sibling parser
        # fills with an int (or None when the key is not numeric).
        try:
            key_int = int(key)
        except Exception:
            key_int = None
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            # "type": type,  # type is decided by the classifier below
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Pagination script / fixed banner mark the end of the usable body.
            if c["data"].find("var currentPage") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop it.
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to "now".
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        # mongo.close()
        # Classify funding vs other news. (Renamed local: the original assigned
        # to TYPE, shadowing the module-level constant.)
        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            news_type = 60001
        else:
            news_type = 60010
            logger.info('%s is not fundingNews', title)
        dnews['type'] = news_type
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        return
def process_news(item, url, content):
    """Parse a GeekPark-style article page and store it as a news document.

    item: dict from the crawl queue; must contain "key" (numeric article id).
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        title = d('div.main-wrap> header> h1').text().strip()
        # "data-time" is a unix timestamp (seconds).
        post_time = d('div.topic-info> span.release-date> span').attr(
            "data-time")
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon,
                                      post_Date.tm_mday, post_Date.tm_hour,
                                      post_Date.tm_min, post_Date.tm_sec)
        key = item["key"]
        column = d('div.main-wrap> div.label').text().strip()
        brief = d("meta[name='description']").attr("content")
        if brief is not None:
            brief = brief.strip()
        if column is not None:
            tags = column.split()
        else:
            tags = []
        category = None
        categoryNames = []
        # The site column decides article type and category.
        if "深度报道" in tags:
            type = 60003
            category = 60107
        else:
            type = 60001
        if "极客早知道" in tags:
            category = 60105
            categoryNames.append("大公司")
        keywords = pq(
            content.decode("utf-8"))("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                # Fix: membership was tested on the raw keyword while the stripped
                # form was appended, letting whitespace variants and empty
                # strings through; test the stripped form (as sibling parsers do).
                if keyword is not None and keyword.strip() != "" \
                        and keyword.strip() not in tags:
                    tags.append(keyword.strip())
        logger.info("%s, %s, %s, %s, %s, %s -> %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category)
        article = d(
            'section.main-content> article> div.article-content').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)
        # Dedup: same source key already stored, or same title from another source.
        if collection_news.find_one({
                "source": SOURCE,
                "key_int": int(key)
        }) is not None:
            return
        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            return
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        # The cover image (when present) leads the content list at rank 1.
        article_img = d(
            'section.main-content> article> div.topic-cover> img').attr("src")
        if article_img is not None:
            dc = {
                "rank": 1,
                "content": "",
                "image": "",
                "image_src": article_img,
            }
            dcontents.append(dc)
        rank = len(dcontents) + 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        # Image could not be mirrored; drop it.
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            # Future-dated article: clamp to "now".
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
def process_news(column, newsurl, content):
    """Parse one flash-news ("亿欧快讯") page and store it.

    column  -- listing column identifier (unused in the body; kept for
               signature compatibility with the other parsers).
    newsurl -- article link; its last path segment is the numeric key.
    content -- raw HTML of the page.
    """
    if not has_news_content(content):
        return
    d = pq(content)
    title = d('div#post_title').text()
    url = newsurl
    key = url.split('/')[-1]
    post_time = d('div#post_date').text()
    logger.info(post_time)
    news_time = extract.extracttime(post_time)
    if news_time is None:
        # Unparseable date: fall back to "now" rather than dropping the item.
        news_time = datetime.datetime.now()
    logger.info("title:%s, date:%s", title, news_time)

    flag, domain = url_helper.get_domain(url)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),  # store as UTC
        "title": title,
        "link": url,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": int(key),
        "type": TYPE,
        "original_tags": [],
        "processStatus": 0,
        # "companyId": companyId,
        "companyIds": [],
        "category": None,
        "domain": domain,
        "categoryNames": []
    }

    dcontents = []
    description = d('div#post_description').text()
    if description is not None:
        # Fixed lead-in line followed by the flash body.
        dcontents.append({
            "rank": 1,
            "content": "亿欧快讯",
            "image": "",
            "image_src": "",
        })
        dcontents.append({
            "rank": 2,
            "content": description.replace("【消息来源】", ""),
            "image": "",
            "image_src": "",
        })
        logger.info(description)
    dnews["contents"] = dcontents

    brief = util.get_brief_from_news(dcontents)
    post = util.get_posterId_from_news(dcontents)
    dnews["postId"] = post
    # dnews["post"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

    nid = parser_mongo_util.save_mongo_news(dnews)
    logger.info("Done: %s", nid)
    # collection_news.insert(dnews)
    return
def process(crawler, outlink=None):
    """Drain the URLS queue (or crawl a single outlink) and store articles.

    crawler -- page fetcher exposing crawl(url, agent=True).
    outlink -- optional single URL; when given, exactly one article is
               processed and the queue is ignored.

    Each URL is retried up to 10 times; existing links are left untouched.
    """
    while True:
        if outlink is None:
            if len(URLS) == 0:
                return
            linkDict = URLS.pop(0)
        else:
            # Synthesize a queue entry for the ad-hoc link.
            linkDict = {
                "href": outlink,
                "post": None,
                "title": None
            }

        retries = 0
        while True:
            if retries > 10:
                break
            retries += 1
            download_crawler = download.DownloadCrawler(use_proxy=False)
            url = linkDict['href']
            result = crawler.crawl(url, agent=True)
            if result['get'] != 'success':
                continue

            d = pq(html.fromstring(result['content'].decode("utf-8", 'ignore')))
            title = linkDict['title']
            if title is None:
                title = d('h1.headTit').text().strip()
            key = url.split('/')[-1].split('.')[0]
            brief = d('.article-lead').text().replace('导语:', '')
            postraw = linkDict['post']

            tags = []
            for tag in d('.related-link a'):
                t = tag.text.strip()
                if t not in tags:
                    tags.append(t)

            news_time = d('.inner .time').text()
            news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
            flag, domain = url_helper.get_domain(url)
            try:
                key_int = int(key)
            except (ValueError, TypeError):
                # Non-numeric slug; keep the string key only.
                key_int = None
            category = None
            dnews = {
                "date": news_time - datetime.timedelta(hours=8),  # store as UTC
                "title": title,
                "link": url,
                "createTime": datetime.datetime.now(),
                "source": SOURCE,
                "key": key,
                "key_int": key_int,
                "type": 60001,
                "original_tags": tags,
                "processStatus": 0,
                # "companyId": None,
                "companyIds": [],
                "category": category,
                "domain": domain,
                "categoryNames": []
            }

            article = d('.lph-article-comView').html()
            contents = extract.extractContents(url, article)
            dcontents = []
            rank = 1
            for c in contents:
                if c["type"] == "text":
                    # Drop the site's reprint-notice boilerplate.
                    if c["data"].find("未经授权禁止转载。详情见转载须知。") > 0:
                        continue
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    if download_crawler is None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": "",
                            "image_src": c["data"],
                        }
                    else:
                        (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                            c["data"], download_crawler, SOURCE, key, "news")
                        if imgurl is None:
                            continue  # image mirror failed; skip it
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents

            if brief is None or brief.strip() == "":
                brief = util.get_brief_from_news(dcontents)
            # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                postraw, download_crawler, SOURCE, key, "news")
            post = str(posturl) if posturl is not None else None
            if post is None or post.strip() == "":
                post = util.get_posterId_from_news(dcontents)
            if download_crawler is None:
                dnews["post"] = post
            else:
                dnews["postId"] = post
            dnews["brief"] = brief

            mongo = db.connect_mongo()
            collection_news = mongo.article.news
            # Only insert brand-new links; existing ones are logged, not replaced.
            item = collection_news.find_one({"link": url})
            if item is None:
                nid = parser_mongo_util.save_mongo_news(dnews)
                logger.info("Done: %s", nid)
            else:
                logger.info("update %s", url)
                # oldId = collection_news.find_one({"link": url})['_id']
                # collection_news.delete_one({"link": url})
                # dnews['_id'] = oldId
                # collection_news.insert(dnews)
            mongo.close()
            logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, news_time,
                        category, " ".join(tags), brief, post)
            break
        if outlink is not None:
            break
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse one report/article page (199IT-style) and store it.

    column           -- listing column identifier (unused; kept for parity).
    newsurl          -- article link; slug before ".html" is the key.
    content          -- raw HTML bytes.
    newspost         -- poster-image URL from the listing page.
    download_crawler -- image mirror helper, or None to keep remote URLs.

    Articles whose title already exists are skipped.
    """
    if not has_news_content(content):
        return
    d = pq(html.fromstring(content.decode('utf-8', "ignore")))
    key = newsurl.split("/")[-1].replace(".html", "")
    news_type = TYPE
    category = None
    title = d('div#content> article> header> h1').text().strip()
    [author, cleanTitle] = clean_title(title)
    tags = []
    # articletags = d("meta[name='keywords']").attr("content")
    # if articletags is not None:
    #     for tag in articletags.split(","):
    #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
    #             tags.append(tag)

    (posturl, width, height) = parser_mysql_util.get_logo_id_new(
        newspost, download_crawler, SOURCE, key, "news")
    post = str(posturl) if posturl is not None else None

    brief = d("meta[name='description']").attr("content")
    news_time = None
    post_time = d('li.post-time> time').text()
    logger.info(post_time)
    # Pages published "today" only carry a date; substitute the current time.
    if post_time.find("月") >= 0:
        dt = datetime.date.today()
        today = datetime.datetime(dt.year, dt.month, dt.day)
        if datetime.datetime.strptime(post_time, "%Y年%m月%d日") == today:
            news_time = datetime.datetime.now()
    if news_time is None:
        if post_time is not None:
            news_time = datetime.datetime.strptime(post_time, "%Y年%m月%d日")
        else:
            news_time = datetime.datetime.now()

    article = d('div#content> article> div.entry-content').html()
    contents = extract.extractContents(newsurl, article, document=False)
    logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                ":".join(tags), category, brief, post)

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return

    flag, domain = url_helper.get_domain(newsurl)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),  # store as UTC
        "title": title,
        "link": newsurl,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": int(key),
        "type": news_type,
        "original_tags": tags,
        "processStatus": 1,
        # "companyId": None,
        "companyIds": [],
        "category": category,
        "domain": domain,
        "author": author,
        "cleanTitle": cleanTitle,
        "categoryNames": []
        # "sectors": [20]
    }

    dcontents = []
    rank = 1
    for c in contents:
        # Boilerplate paragraphs are dropped; trailer markers end the article.
        if c["data"].find("报告下载")>=0 and c["data"].find("回复关键词")>=0 or c["data"].find("原创编译")>=0 or \
                c["data"].find("199IT感谢您的支持!") >= 0:
            continue
        if c["data"].find("其它年份报告,请点击下载")>=0 or c["data"].find("var wum")>=0 or \
                c["data"].find("原创编译自") >=0 or c["data"].find("更多阅读")>=0:
            break
        if c["type"] == "text":
            dc = {
                "rank": rank,
                "content": c["data"],
                "image": "",
                "image_src": "",
            }
        else:
            if download_crawler is None:
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": "",
                    "image_src": c["data"],
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is None:
                    continue
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": str(imgurl),
                    "image_src": "",
                    "height": int(height),
                    "width": int(width)
                }
        logger.info(c["data"])
        dcontents.append(dc)
        rank += 1
    dnews["contents"] = dcontents

    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)
    if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)
    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

    mongo.close()
    nid = parser_mongo_util.save_mongo_news(dnews)
    logger.info("Done: %s", nid)
    return
def process(crawler):
    """Drain the URLS queue and store each article.

    crawler -- page fetcher exposing crawl(url).

    Listing metadata (title, brief, poster, date, category) arrives in the
    queue entry; the page itself supplies tags and the body.  Each URL is
    retried up to 6 times; existing links are logged, not replaced.
    """
    while True:
        if len(URLS) == 0:
            return
        linkDict = URLS.pop(0)
        retries = 0
        while True:
            if retries > 6:
                break
            retries += 1
            download_crawler = download.DownloadCrawler(use_proxy=False)
            url = linkDict['href']
            result = crawler.crawl(url)
            if result['get'] != 'success':
                continue

            d = pq(result['content'])
            title = linkDict['title']
            key = url.split('=')[-1]
            # `in` instead of the py2-only dict.has_key().
            if linkDict['category'] in categoryDict:
                news_type = categoryDict[linkDict['category']]['type']
                category = categoryDict[linkDict['category']]['category']
            else:
                news_type = 60001
                category = None
            brief = linkDict['brief']
            postraw = linkDict['post']

            tags = []
            for tag in d('.txt span em').text().split():
                if tag.strip() not in tags:
                    tags.append(tag.strip())
            for tag in d('.pag span').text().split():
                if tag.strip() not in tags:
                    tags.append(tag.strip())

            news_time = linkDict['date']
            flag, domain = url_helper.get_domain(url)
            dnews = {
                "date": news_time - datetime.timedelta(hours=8),  # store as UTC
                "title": title,
                "link": url,
                "createTime": datetime.datetime.now(),
                "source": SOURCE,
                "key": key,
                "key_int": int(key),
                "type": news_type,
                "original_tags": tags,
                "processStatus": 0,
                # "companyId": None,
                "companyIds": [],
                "category": category,
                "domain": domain,
                "categoryNames": []
            }

            article = d('.article .top').html()
            contents = extract.extractContents(url, article)
            dcontents = []
            rank = 1
            for c in contents:
                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    if download_crawler is None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": "",
                            "image_src": c["data"],
                        }
                    else:
                        (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                            c["data"], download_crawler, SOURCE, key, "news")
                        if imgurl is None:
                            continue  # image mirror failed; skip it
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents

            if brief is None or brief.strip() == "":
                brief = util.get_brief_from_news(dcontents)
            # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                postraw, download_crawler, SOURCE, key, "news")
            post = str(posturl) if posturl is not None else None
            if post is None or post.strip() == "":
                post = util.get_posterId_from_news(dcontents)
            if download_crawler is None:
                dnews["post"] = post
            else:
                dnews["postId"] = post
            dnews["brief"] = brief

            mongo = db.connect_mongo()
            collection_news = mongo.article.news
            # update link content with oldId
            item = collection_news.find_one({"link": url})
            if item is None:
                nid = parser_mongo_util.save_mongo_news(dnews)
                logger.info("Done: %s", nid)
            else:
                logger.info("update %s", url)
                # oldId = collection_news.find_one({"link": url})['_id']
                # collection_news.delete_one({"link": url})
                # dnews['_id'] = oldId
                # collection_news.insert(dnews)
            mongo.close()
            logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, news_time,
                        category, " ".join(tags), brief, post)
            break
def process_news(item):
    """Parse one flash-news ("创业邦快讯") listing fragment and store it.

    item -- HTML fragment of a single listing entry; the title anchor
            supplies both the link (key) and the title, the `.news-time`
            element carries a Unix timestamp.

    NOTE: the original has_news_content() guard was intentionally disabled
    upstream; every item is processed.
    """
    d = pq(item)
    title = d('.item-title').text()
    url = d('.item-title').attr('href')
    key = url.split('/')[-1].split('.')[0]
    date = d('.news-time').attr('data-time')
    news_time = datetime.datetime.fromtimestamp(float(date))

    flag, domain = url_helper.get_domain(url)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),  # store as UTC
        "title": title,
        "link": url,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": int(key),
        "type": TYPE,
        "original_tags": [],
        "processStatus": 0,
        # "companyId": companyId,
        "companyIds": [],
        "category": None,
        "domain": domain,
        "categoryNames": []
    }

    dcontents = []
    description = d('.item-desc').text()
    if description is not None:
        # Fixed lead-in line followed by the flash body.
        dcontents.append({
            "rank": 1,
            "content": "创业邦快讯",
            "image": "",
            "image_src": "",
        })
        dcontents.append({
            "rank": 2,
            "content": description,
            "image": "",
            "image_src": "",
        })
        logger.info(description)
    dnews["contents"] = dcontents

    brief = util.get_brief_from_news(dcontents)
    post = util.get_posterId_from_news(dcontents)
    dnews["postId"] = post
    # dnews["post"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

    nid = parser_mongo_util.save_mongo_news(dnews)
    logger.info("Done: %s", nid)
def process(crawler):
    """Drain the URLS queue and store each article.

    crawler -- page fetcher exposing crawl(url).

    The breadcrumb category maps through categoryDict to a (type, category)
    pair.  Articles are only inserted when they yield more than one content
    block; existing links are logged, not replaced.  Up to 6 retries per URL.
    """
    while True:
        if len(URLS) == 0:
            return
        linkDict = URLS.pop(0)
        retry = 0
        while True:
            retry += 1
            if retry > 6:
                break
            download_crawler = download.DownloadCrawler(use_proxy=False)
            url = linkDict['href']
            result = crawler.crawl(url)
            if result['get'] != 'success':
                continue

            d = pq(html.fromstring(result['content'].decode("utf-8")))
            title = linkDict['title']
            key = url.split('/')[-1]
            category = d('.al-crumbs a:nth-child(2)').text()
            # Local name news_type avoids rebinding the module constant TYPE
            # (the original shadowed it); `in` replaces py2-only has_key().
            if category in categoryDict:
                news_type = categoryDict[category]['type']
                category = categoryDict[category]['category']
            else:
                news_type = 60001
                category = None
            brief = linkDict['brief']
            postraw = linkDict['post']
            tags = []
            # for tag in d('.tags').text().split():
            #     if tag.strip() not in tags: tags.append(tag)

            news_time = d('.article__published').eq(0).text()
            # news_time = datetime.datetime.strptime(' '.join(news_time.split(' ')[:2]), '%Y年%m月%d日 %H:%M')
            # news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %p %I:%M')
            news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %H:%M')

            flag, domain = url_helper.get_domain(url)
            dnews = {
                "date": news_time - datetime.timedelta(hours=8),  # store as UTC
                "title": title,
                "link": url,
                "createTime": datetime.datetime.now(),
                "source": SOURCE,
                "key": key,
                "key_int": None,
                "type": news_type,
                "original_tags": tags,
                "processStatus": 0,
                # "companyId": None,
                "companyIds": [],
                "category": category,
                "domain": domain,
                "categoryNames": []
            }

            article = d('.article__content').html()
            contents = extract.extractContents(url, article)
            dcontents = []
            rank = 1
            for c in contents:
                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    if download_crawler is None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": "",
                            "image_src": c["data"],
                        }
                    else:
                        (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                            c["data"], download_crawler, SOURCE, key, "news")
                        if imgurl is None:
                            continue  # image mirror failed; skip it
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents

            if brief is None or brief.strip() == "":
                brief = util.get_brief_from_news(dcontents)
            # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                postraw, download_crawler, SOURCE, key, "news")
            post = str(posturl) if posturl is not None else None
            if post is None or post.strip() == "":
                post = util.get_posterId_from_news(dcontents)
            if download_crawler is None:
                dnews["post"] = post
            else:
                dnews["postId"] = post
            # brief=brief[:100]
            dnews["brief"] = brief

            mongo = db.connect_mongo()
            collection_news = mongo.article.news
            # update link content with oldId
            item = collection_news.find_one({"link": url})
            # Require a real body (more than one block) before inserting.
            if len(dcontents) > 1:
                if item is None:
                    # collection_news.insert(dnews)
                    nid = parser_mongo_util.save_mongo_news(dnews)
                    logger.info("Done: %s", nid)
                else:
                    logger.info("update %s", url)
                    # oldId = collection_news.find_one({"link": url})['_id']
                    # collection_news.delete_one({"link": url})
                    # dnews['_id'] = oldId
                    # collection_news.insert(dnews)
            mongo.close()
            logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, news_time,
                        category, " ".join(tags), brief, post)
            logger.info("*************DONE*************")
            break
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse one forum thread page (GBK-encoded, Discuz-style) and store it.

    column           -- listing column identifier (unused; kept for parity).
    newsurl          -- thread link; slug before ".html" is the key.
    content          -- raw HTML bytes (GBK).
    newspost         -- poster-image URL from the listing page.
    download_crawler -- image mirror helper, or None to keep remote URLs.

    Threads whose title already exists are skipped.
    """
    if not has_news_content(content):
        return
    d = pq(html.fromstring(content.decode('gbk', "ignore")))
    key = newsurl.split("/")[-1].replace(".html", "")
    news_type = TYPE
    category = None
    title = d('div#postlist> table> tr> td.plc.ptm.pbn> h1').text().strip()
    [author, cleanTitle] = clean_title(title)

    tags = []
    articletags = d("meta[name='keywords']").attr("content")
    if articletags is not None:
        for tag in articletags.split(","):
            if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                tags.append(tag)

    # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
    (posturl, width, height) = parser_mysql_util.get_logo_id_new(
        newspost, download_crawler, SOURCE, key, "news")
    post = str(posturl) if posturl is not None else None

    brief = d("meta[name='description']").attr("content")
    news_time = None
    # First post's timestamp in the thread.
    post_time = d(
        'div#postlist> div> table.plhin> tr> td> div> div> div.authi> em'
    ).eq(0).text()
    logger.info(post_time)
    # if post_time.find("月") >= 0:
    #     dt = datetime.date.today()
    #     today = datetime.datetime(dt.year, dt.month, dt.day)
    #     if post_time is None or datetime.datetime.strptime(post_time, "%Y年%m月%d日") == today:
    #         news_time = datetime.datetime.now()
    if news_time is None:
        news_time = datetime.datetime.strptime(
            post_time.replace("发布时间: ", ""), "%Y-%m-%d %H:%M")

    # Only the first post's body is the article.
    article = d('div.t_fsz> table> tr> td.t_f').eq(0).html()
    contents = extract.extractContents(newsurl, article, document=False)
    logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                ":".join(tags), category, brief, post)

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return

    flag, domain = url_helper.get_domain(newsurl)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),  # store as UTC
        "title": title,
        "link": newsurl,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": None,
        "type": news_type,
        "original_tags": tags,
        "processStatus": 1,
        # "companyId": None,
        "companyIds": [],
        "category": category,
        "domain": domain,
        "author": author,
        "cleanTitle": cleanTitle,
        "categoryNames": []
        # "sectors": [20]
    }

    dcontents = []
    rank = 1
    for c in contents:
        # Skip forum placeholder images and attachment boilerplate.
        if c["data"].find("image/common/none.gif") >= 0 or c["data"].find(
                "下载本地保存到信息图册") >= 0:
            continue
        if c["type"] == "text":
            dc = {
                "rank": rank,
                "content": c["data"],
                "image": "",
                "image_src": "",
            }
        else:
            if download_crawler is None:
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": "",
                    "image_src": c["data"],
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is None:
                    continue
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": str(imgurl),
                    "image_src": "",
                    "height": int(height),
                    "width": int(width)
                }
        dcontents.append(dc)
        rank += 1
    dnews["contents"] = dcontents

    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)
    if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)
    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

    mongo.close()
    nid = parser_mongo_util.save_mongo_news(dnews)
    logger.info("Done: %s", nid)
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse one TMTPost article page and store it.

    column           -- listing column identifier (unused; kept for parity).
    newsurl          -- article link; slug before ".html" is the numeric key.
    content          -- raw HTML bytes.
    newspost         -- unused; the og:image meta tag supplies the poster.
    download_crawler -- image mirror helper, or None to keep remote URLs.

    English-tagged articles and duplicate titles are skipped.
    """
    if not has_news_content(content):
        return
    d = pq(html.fromstring(content.decode("utf-8", "ignore")))
    key = newsurl.split("/")[-1].replace(".html", "")
    news_type = TYPE
    category = None
    title = d('article> h1').text().strip()

    tags = []
    articletags = d("meta[name='keywords']").attr("content")
    if articletags is not None:
        for tag in articletags.split(","):
            if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                tags.append(tag)
    if "English" in tags or "english" in tags:
        logger.info("Englis not needed, get out!")
        return
    if "商业价值杂志" in tags:
        news_type = 60003
        category = 60107

    # Poster comes from the og:image meta tag, not from `newspost`.
    postraw = d("meta[property='og:image']").attr("content")
    # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
    (posturl, width, height) = parser_mysql_util.get_logo_id_new(
        postraw, download_crawler, SOURCE, key, "news")
    post = str(posturl) if posturl is not None else None

    brief = d("article> p.post-abstract").text().strip().replace('摘要: ', "")
    post_time = d('article> div.post-info> span.time').text()
    logger.info(post_time)
    news_time = extract.extracttime(post_time)
    if news_time is None:
        news_time = datetime.datetime.now()

    article = d('article> div.inner').html()
    contents = extract.extractContents(newsurl, article)
    logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                ":".join(tags), category, brief, post)

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return

    flag, domain = url_helper.get_domain(newsurl)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),  # store as UTC
        "title": title,
        "link": newsurl,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": int(key),
        "type": news_type,
        "original_tags": tags,
        "processStatus": 0,
        # "companyId": None,
        "companyIds": [],
        "category": category,
        "domain": domain,
        "categoryNames": []
        # "sectors": [20]
    }

    dcontents = []
    rank = 1
    for c in contents:
        # Skip the site's QR-code footer image.
        if c["data"] == "http://www.tmtpost.com/public/css/images/wzny_ewm.jpg":
            continue
        if c["type"] == "text":
            dc = {
                "rank": rank,
                "content": c["data"],
                "image": "",
                "image_src": "",
            }
        else:
            if download_crawler is None:
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": "",
                    "image_src": c["data"],
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is None:
                    continue
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": str(imgurl),
                    "image_src": "",
                    "height": int(height),
                    "width": int(width)
                }
        dcontents.append(dc)
        rank += 1
    dnews["contents"] = dcontents

    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)
    if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)
    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

    # Only save when the page actually yielded a title and body.
    if title is not None and len(contents) > 0:
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    mongo.close()
    return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse one article page (WordPress-style layout) and store it.

    column           -- listing column identifier (unused; kept for parity).
    newsurl          -- article link; slug before ".html" is the numeric key.
    content          -- raw HTML bytes.
    newspost         -- poster-image URL from the listing page.
    download_crawler -- image mirror helper, or None to keep remote URLs.

    Duplicate titles are skipped.  Chinese date markers (年/月/日) are
    normalized before parsing.
    """
    if not has_news_content(content):
        return
    logger.info('here')
    d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
    category = None
    categoryNames = []
    key = newsurl.split("/")[-1].replace(".html", "")
    news_type = TYPE
    title = d('div.post-inner> h1').text().strip()
    if title is None or title == "":
        return

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({'title': title}) is not None:
        mongo.close()
        return

    (posturl, width, height) = parser_mysql_util.get_logo_id_new(
        newspost, download_crawler, SOURCE, key, "news")
    post = str(posturl) if posturl is not None else None

    tags = []
    articletags = d("meta[name='keywords']").attr('content')
    if articletags is not None:
        for tag in articletags.split(","):
            if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                tags.append(tag)

    try:
        brief = d("meta[name='description']").attr("content")
    except Exception:
        brief = None

    try:
        post_time = d("p.post-byline> time.published").text().strip()
        logger.info('时间:%s' % post_time)
        # Normalize "2018年3月19日" into "2018-3-19" before parsing.
        p = re.compile(u'(年|月)')
        post_time = p.sub('-', post_time).replace('日', '')
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        logger.info("news-time: %s", news_time)
    except Exception as e:  # py3-compatible form of `except Exception, e:`
        logger.info(e)
        news_time = datetime.datetime.now()
    if news_time is None:
        news_time = datetime.datetime.now()

    article = d('div.entry-inner').html()
    contents = extract.extractContents(newsurl, article)
    logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                ":".join(tags), category, brief, post)

    flag, domain = url_helper.get_domain(newsurl)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),  # store as UTC
        "title": title,
        "link": newsurl,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": int(key),
        "type": news_type,
        "original_tags": tags,
        "processStatus": 0,
        "companyIds": [],
        "category": category,
        "domain": domain,
        "categoryNames": categoryNames,
    }

    dcontents = []
    rank = 1
    for c in contents:
        # Drop footer/boilerplate paragraphs.
        if c['data'].find('文章相关引用及参考') >= 0 or c['data'].find(
                '读者QQ群') >= 0:
            continue
        if c['type'] == 'text':
            dc = {
                'rank': rank,
                'content': c['data'],
                'image': '',
                'image_src': '',
            }
        else:
            if download_crawler is None:
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": "",
                    "image_src": c["data"],
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is None:
                    continue
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": str(imgurl),
                    "image_src": "",
                    "height": int(height),
                    "width": int(width)
                }
        dcontents.append(dc)
        rank += 1
    dnews['contents'] = dcontents

    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)
    if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)
    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

    logger.info(json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
    if title is not None and len(contents) > 0:
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    mongo.close()
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse one article page and store it.

    column           -- listing column identifier (unused; kept for parity).
    newsurl          -- article link; slug before ".shtml" is the numeric key.
    content          -- raw HTML bytes.
    newspost         -- poster-image URL from the listing page.
    download_crawler -- image mirror helper, or None to keep remote URLs.

    Duplicate titles are skipped (checked up front, before any parsing work).
    """
    if not has_news_content(content):
        return
    d = pq(html.fromstring(content.decode("utf-8")))
    key = newsurl.split("/")[-1].strip().replace(".shtml", "")
    news_type = TYPE
    category = None
    title = d('div.subject> h1').text().strip()

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return

    tags = []
    postraw = newspost
    # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
    (posturl, width, height) = parser_mysql_util.get_logo_id_new(
        postraw, download_crawler, SOURCE, key, "news")
    post = str(posturl) if posturl is not None else None

    brief = d("meta[name='description']").attr("content")
    post_time = d('div.meta> span.meta-date').text().replace("发布", "")
    logger.info(post_time)
    news_time = extract.extracttime(post_time)
    if news_time is None:
        news_time = datetime.datetime.now()

    article = d('div.subject> div.subject-content').html()
    contents = extract.extractContents(newsurl, article)
    logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                ":".join(tags), category, brief, post)
    # exit()
    # mongo = db.connect_mongo()
    # collection_news = mongo.article.news
    # if collection_news.find_one({"title": title}) is not None:
    #     mongo.close()
    #     return

    flag, domain = url_helper.get_domain(newsurl)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),  # store as UTC
        "title": title,
        "link": newsurl,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": int(key),
        "type": news_type,
        "original_tags": tags,
        "processStatus": 0,
        # "companyId": None,
        "companyIds": [],
        "category": category,
        "domain": domain,
        "categoryNames": []
    }

    dcontents = []
    rank = 1
    for c in contents:
        if c["type"] == "text":
            dc = {
                "rank": rank,
                "content": c["data"],
                "image": "",
                "image_src": "",
            }
        else:
            if download_crawler is None:
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": "",
                    "image_src": c["data"],
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is None:
                    continue
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": str(imgurl),
                    "image_src": "",
                    "height": int(height),
                    "width": int(width)
                }
        dcontents.append(dc)
        rank += 1
    dnews["contents"] = dcontents

    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)
    if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)
    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

    mongo.close()
    nid = parser_mongo_util.save_mongo_news(dnews)
    logger.info("Done: %s", nid)
    return
def process_news(content, url, key, col):
    """Parse one article page and store it.

    content -- raw HTML bytes of the article page.
    url     -- canonical article link.
    key     -- article key (string; may be non-numeric).
    col     -- dict with "column" (listing column) and "category" (mapped id).

    Articles already stored for this link are skipped; same-title articles
    from other sources are deleted before saving.
    """
    if not has_news_content(content):
        return
    download_crawler = download.DownloadCrawler(use_proxy=False)
    d = pq(html.fromstring(content.decode("utf-8")))
    title = d('div.cj_content> div.cj_top> div.cj_tit> h2').text().strip().replace("&quo;", "\"")
    datecontent = d('div.cj_content> div.cj_top> div.cj_tit> p.fa').text()
    result = util.re_get_result('(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', datecontent)
    if result:
        post_time, = result
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
    else:
        logger.info("incorrcet post time")
        return

    try:
        key_int = int(key)
    except (ValueError, TypeError):
        # Non-numeric key; keep the string form only.
        key_int = None

    brief = d("meta[name='description']").attr("content").strip()
    # Opinion pieces get the editorial type.
    if col["column"] == "view":
        news_type = 60003
    else:
        news_type = TYPE

    categoryNames = []
    category = col["category"]
    if category == 60105:
        categoryNames.append("大公司")
    if category == 60101:
        categoryNames.append("融资")

    tags = []
    keywords = d("meta[name='keywords']").attr("content")
    if keywords is not None:
        for keyword in keywords.split(","):
            if keyword is not None and keyword.strip() not in tags:
                tags.append(keyword.strip())

    postraw = d('div.cj_content> div.cj_top> img.gg').attr("src")
    # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
    (posturl, width, height) = parser_mysql_util.get_logo_id_new(
        postraw, download_crawler, SOURCE, key, "news")
    post = str(posturl) if posturl is not None else None

    logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                news_time, brief, ":".join(tags), category, post)

    article = d('div.para_ycont> div.col-xs-12').html()
    # logger.info(article)
    contents = extract.extractContents(url, article)

    if collection_news.find_one({"link": url}) is not None:
        return
        # collection_news.delete_one({"link": url})
    # Same title from another source: ours wins, stale copies are purged.
    if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
        collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})

    flag, domain = url_helper.get_domain(url)
    dnews = {
        "date": news_time - datetime.timedelta(hours=8),  # store as UTC
        "title": title,
        "link": url,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": key_int,
        "type": news_type,
        "original_tags": tags,
        "processStatus": 0,
        # "companyId": None,
        "companyIds": [],
        "category": category,
        "domain": domain,
        "categoryNames": categoryNames
    }

    dcontents = []
    rank = 1
    for c in contents:
        if c["type"] == "text":
            dc = {
                "rank": rank,
                "content": c["data"],
                "image": "",
                "image_src": "",
            }
        else:
            # CDN resize suffix is stripped so the original image is fetched.
            if download_crawler is None:
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": "",
                    "image_src": c["data"].replace("?imageView2/2/w/750/q/90", ""),
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"].replace("?imageView2/2/w/750/q/90", ""),
                    download_crawler, SOURCE, key, "news")
                if imgurl is None:
                    continue
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": str(imgurl),
                    "image_src": "",
                    "height": int(height),
                    "width": int(width)
                }
        dcontents.append(dc)
        rank += 1
    dnews["contents"] = dcontents

    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)
    # if post is None or post.strip() == "":
    #     post = util.get_poster_from_news(dcontents)
    # dnews["post"] = post
    if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)
    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

    # collection_news.insert(dnews)
    # logger.info("*************DONE*************")
    nid = parser_mongo_util.save_mongo_news(dnews)
    logger.info("Done: %s", nid)
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a 未央网 (wyt) article page and persist it to mongo.

    Dedups on title, extracts tags/brief/post time from meta tags and the
    wyt-post-content-meta block, filters boilerplate paragraphs, and saves
    through parser_mongo_util.save_mongo_news. Returns None.
    """
    if has_news_content(content):
        logger.info('here')
        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))
        if d.text().find('embed') >= 0:  # skip video-only articles
            logger.info('not article:%s' % newsurl)
            return
        category = None
        categoryNames = []
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return
        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None
        try:
            post_time_1 = d("div.wyt-post-content-meta> div> p ").find('span').text().strip()
            post_time_2 = d("div.wyt-post-content-meta> div").find('p').next().text().strip()
            if post_time_1:
                post_time = post_time_1
            else:
                post_time = post_time_2
            if re.match('\d{2}-\d{2}', post_time):  # "03-19" style date: prepend the current year
                post_time = str(time.localtime()[0]) + '-' + post_time
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:  # BUGFIX: was Py2-only "except Exception, e:" syntax
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()
        article = d('article.wyt-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time, ":".join(tags), category, brief, post)
        flag, domain = url_helper.get_domain(newsurl)
        # BUGFIX: key is taken from the URL and may not be numeric; int(key)
        # used to raise and abort the item. Fall back to None like sibling parsers.
        try:
            key_int = int(key)
        except:
            key_int = None
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1
        # BUGFIX: guard against an empty contents list before peeking at [0].
        if contents and contents[0]['type'] == 'img':
            del contents[0]
        for c in contents:
            if c['type'] == 'text':
                # Drop boilerplate / promo fragments.
                if re.match('^\d+$', c['data']) or c['data'].find('收藏') >= 0 or c['data'].find('投融资') >= 0 or c['data'].find('阅读时间') >= 0 \
                        or c['data'].find('违者必究') >= 0 or c['data'].find('微信公众号') >= 0 or c['data'].find('微信扫描') >= 0 \
                        or c['data'].find('点击获取完整版报告') >= 0 or c['data'].find('作者原创,微信号') >= 0:
                    continue
                if c['data'].find('| 未央网') >= 0:
                    c['data'] = c['data'].replace('| 未央网', ' ')
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews['contents'] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        logger.info(json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
def process_news(column, d_map, content, download_crawler):
    """Parse a DailySocial.id article page and persist it to mongo.

    d_map -- crawl-queue dict; reads d_map['link'] for the article URL.
    Extracts the article key from the page's data-id attribute, parses
    relative/absolute post times, strips the DailySocial promo block from
    the article HTML, and saves via parser_mongo_util.save_mongo_news.
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        category = None
        categoryNames = []
        type = TYPE
        key = d('div#sb-site> article').attr('data-id')
        title = d('div#sb-site> article> section#article-header> h2> strong> a').text().strip()
        newspost = d('div#sb-site> article> section#article-image> div> figure> img').attr('src')
        logger.info('newspost:%s' % newspost)
        newsurl = d_map['link']
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        tags = []
        brief = d("meta[name='description']").attr('content')
        post_time = d('div#sb-site> article> section#article-meta> span> em').text().strip()
        news_time = None
        is_re = re.search('(\d{2}-\d{2}-\d{4})', post_time)
        # BUGFIX: was '(\d) hours ago', which captured only the final digit of
        # multi-digit counts ("12 hours ago" -> 2 hours).
        is_re2 = re.search('(\d+) hours ago', post_time)
        if is_re:
            news_time = datetime.datetime.strptime(is_re.group(1), "%d-%m-%Y")
        elif is_re2:
            news_time = datetime.datetime.now() - datetime.timedelta(hours=int(is_re2.group(1)))
        elif post_time.find('a moment') >= 0:
            news_time = datetime.datetime.now()
        # BUGFIX: when no pattern matched, news_time stayed None and the
        # "news_time - timedelta" below raised TypeError. Default to now.
        if news_time is None:
            news_time = datetime.datetime.now()
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        article = d('div#sb-site> article> section#article-content> div.post-content> div.row').html()
        is_re3 = re.search("(<strong>DailySocial\.id.*?</p>)", article, re.S)
        if is_re3:
            article = article.replace(is_re3.group(1), '')
        contents = extract.extractContents(newsurl, article, document=False)
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Drop "Also Read" cross-links and stray inline javascript.
            if c["data"].find("Also Read") >= 0 or c['data'].find('function()') >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        return
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse a "single-post" style article page and insert/refresh it in mongo.

    Type/category are decided from breadcrumb text and the title (finance
    keywords map to TYPE 60001 / category 60101). Dedups on title early and
    on link just before saving. Returns None; side effect is a mongo insert
    via parser_mongo_util.save_mongo_news.
    """
    logger.info('starting process_news %s', newsurl)
    # Content check deliberately disabled; every fetched page is processed.
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))
        key = newsurl.split("/")[-1].replace('.html', '')
        # type = TYPE
        category = None
        title = d('.single-post-title').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return
        tags = []
        # NOTE(review): .text() returns a string, so the "is not None" guard
        # below is effectively always true — confirm intent.
        articletags = d(".mb-2 a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        postraw = None
        # post = d('div#post_thumbnail> img').attr("src")
        # if post is not None:
        #     post = "http://luxe.com"+ post
        # brief = d(".intr").text()
        # brief = brief.replace(u'摘要', '').replace(u'摘要:', '').replace(u'摘要:', '').strip()
        brief = None
        # news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        # news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %H:%M:%S')
        # Last whitespace-separated token of .post-meta is presumably the date.
        news_time = d('.post-meta').text().split()[-1]
        news_time = extract.extracttime(news_time)
        # dt = datetime.date.today()
        today = datetime.datetime.now()
        if news_time is None or news_time > today:
            news_time = datetime.datetime.now()
        article = d('.post-body').html()
        contents = extract.extractContents(newsurl, article, document=False)
        # if len(contents)==0:
        #     contents = extract.extractContents(newsurl, article, document=False)
        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     logger.info( 'title:%s already exists'%title)
        #     return
        flag, domain = url_helper.get_domain(newsurl)
        # Finance-related breadcrumbs or title keywords promote the type/category.
        typeNames = d('.breadcrumb-item a').text()
        TYPE = 60001 if typeNames.find(u'金融') >= 0 or typeNames.find(
            u'融资') >= 0 else 60005
        if title.find(u'融资') >= 0 or title.find(u'投资') >= 0:
            TYPE = 60001
            category = 60101
        else:
            category = None
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),  # store shifted by -8h
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),  # assumes key is numeric — TODO confirm for this site
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            # Existing link: update path is disabled, only logged.
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl},{'$set': dnews})
            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id']=oldId
            # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        return
def process_news(column, newsurl, content, newspost):
    """Parse an article page that may be served in either UTF-8 or GBK.

    The charset is sniffed from the raw body ("charset=GBK"); the two
    encodings use different page layouts, so selectors for title, post
    time and article body are branched on utfflag. Dedups on title and
    saves via parser_mongo_util.save_mongo_news. Returns None.
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # d = pq(html.fromstring(content.decode("utf-8","ignore")))
        if content.find("charset=GBK") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
            utfflag = False
        key = newsurl.split("?")[0].split("/")[-1].replace(".shtml", "")
        type = TYPE
        category = None
        categoryNames = []
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)
        # Layout differs between the UTF-8 and GBK variants of the site.
        if utfflag is True:
            title = d('article> div> h1').text().strip()
        else:
            title = d('div.titleH> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        # post = d('div#post_thumbnail> img').attr("src")
        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d("meta[name='description']").attr("content")
        if utfflag is True:
            post_time = d('p.source> span.f-right').eq(0).text()
        else:
            post_time = d('div.titleH> p.zsp> span').eq(2).text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()
        # article = d('div.contdiv').html()
        if utfflag is True:
            article = d('div.post-text').html()
        else:
            article = d('div.contdiv').html()
        contents = extract.extractContents(newsurl, article)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),  # assumes key is numeric — TODO confirm
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Footer marker: everything after it is site boilerplate.
            if c["data"].find("电商资讯第一入口") != -1:
                break
            # Nocontents is presumably a module-level blacklist — verify.
            if c["data"] in Nocontents:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        return
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    """Parse an "article-section" style page (fromgeek variant) into mongo.

    Dedups on title, takes tags/brief from meta tags, parses the date from
    li.date>span (today's date means "use now"), filters known promo images
    from the body, and saves via parser_mongo_util.save_mongo_news.
    """
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))
        key = newsurl.split("/")[-1].replace(".html", "")
        type = TYPE
        category = None
        categoryNames = []
        title = d('div.article-section> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)
        # post = d('div#post_thumbnail> img').attr("src")
        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        brief = d("meta[name='description']").attr("content")
        post_time = d('li.date>span').text()
        logger.info(post_time)
        # Pages published today show only the date; use the current timestamp.
        if post_time == str(datetime.date.today()):
            news_time = datetime.datetime.now()
        else:
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
        article = d('div.article-section> div> article').html()
        contents = extract.extractContents(newsurl, article)
        # logger.info(contents)
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),  # assumes key is numeric — TODO confirm
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            # Skip site awards banner and a known promo image.
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        return
def process_news(column, newsurl, content, newspost, download_crawler, force):
    """Parse an "article_content" style page and persist it to mongo.

    Publish time comes from a "var publishTime = new Date(...)" script
    variable in the raw HTML. A funding-related title maps to category
    60101. force=True first deletes any existing docs with the same
    key/title. Rows of div.proj_table are appended as extra text contents.
    """
    if has_news_content(content):
        main = pq(content)('div.article_content')
        d = pq(main)
        key = newsurl.split("/")[-1].replace(".html", "")
        title = pq(content)('head> title').text().strip()
        logger.info("title: %s", title)
        # title = d('h1#article_title').text()
        brief = pq(content)("meta[name='description']").attr("content")
        # post_time =pq(content)("meta[property='article:published_time']").attr("content").split("+")[0]
        # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        # Publish time is embedded in an inline script, not in the markup.
        result = util.re_get_result("var publishTime = new Date\(\"(.*?)\"\)",
                                    content)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y-%m-%d %H:%M:%S")
        else:
            logger.info("incorrcet post time")
            logger.info(content)
            # exit()
            return
        categoryNames = []
        contents = extract.extractContents(newsurl, d.html())
        # Funding keywords in the title map to the funding category.
        if title.find("融资") >= 0 or title.find("获投") >= 0:
            category = 60101
            categoryNames.append("融资")
        else:
            category = None
        tags = []
        articletags = pq(content)("meta[name='keywords']").attr(
            "content").replace(";", ",")
        if articletags is None:
            logger.info(content)
        else:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)
        logger.info("%s, %s, %s, %s, %s, %s", key, title, news_time, category,
                    ":".join(tags), brief)
        # force=True: purge any previous copies before re-inserting.
        if force is True:
            mongo = db.connect_mongo()
            collection_news = mongo.article.news
            collection_news.delete_many({
                "source": SOURCE,
                "key_int": int(key)
            })
            collection_news.delete_many({"title": title})
            mongo.close()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        # Project-info table rows, flattened to "label:value" text lines.
        pjcontents = []
        trs = pq(content)('div.proj_table> table> tr')
        logger.info("*****len of trs %s", len(trs))
        for tr in trs:
            logger.info(tr)
            co = pq(tr).text()
            logger.info(co)
            if co is not None and co.strip() != "":
                pjcontents.append(co.replace(" ", ":"))
        dcontents = []
        rank = 1
        for c in contents:
            # "/The End/" marks the end of the article body.
            if c["data"] == "/The End/":
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        # Append the project-table rows after the article body.
        for pjc in pjcontents:
            dc = {
                "rank": rank,
                "content": pjc,
                "image": "",
                "image_src": "",
            }
            dcontents.append(dc)
            logger.info(pjc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        # if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # id =collection_news.insert(dnews)
        # logger.info("***********id: %s", id)
        # logger.info("*************DONE**************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
        return
def process_news(content, news_key, url, news_posttime):
    """Parse a TechCrunch article page and persist it to mongo.

    Publish time is read from the sailthru.date meta tag and shifted +15h
    (presumably a timezone adjustment — verify). Dedups on (source, key_int)
    and on title across other sources. processStatus is saved as 1.
    """
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('utf-8')))
        title = d('header.article-header>h1').text().strip()
        if title is None or title.strip() == "":
            logger.info("wrong title for url: %s", url)
            return
        post_time = pq(content)("meta[name='sailthru.date']").attr("content")
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") + datetime.timedelta(hours=15)
        key = news_key
        try:
            postraw = pq(content)("meta[property='og:image']").attr("content")
            # The site's default og:image is not a real poster; discard it.
            if postraw.find("techcrunch.opengraph.default.png") >= 0:
                postraw = None
        except:
            postraw = None
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        divtags = d('div.tags> div.tag-item')
        tags = [pq(divtag)('a.tag').text().strip() for divtag in divtags if pq(divtag)('a.tag').text().strip() is not None]
        category = None
        logger.info("%s, %s, %s, %s, %s -> %s", key, title, post_time, news_time, ":".join(tags), category)
        article = d('div.article-entry.text').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            mongo.close()
            return
        # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            mongo.close()
            return
        # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        mongo.close()
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "" or post.find("techcrunch.opengraph.default.png")>=0:
        #     post = util.get_poster_from_news(dcontents)
        #
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if len(dcontents) > 0:
            # mongo = db.connect_mongo()
            # collection_news = mongo.article.news
            # collection_news.insert(dnews)
            # mongo.close()
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        logger.info("Done")
def process_news(self, newsurl, content, download_crawler):
    """Generic method variant: parse an article page using heuristic extractors.

    Unlike the site-specific parsers, title/body/date are pulled with the
    generic extract/extractArticlePublishedDate helpers. Source, type and
    category come from self.SOURCE / self.TYPE / self.CATEGORY. The doc is
    saved only when it has more than two content blocks.
    """
    if self.has_news_content(content):
        # Some pages are not valid utf-8; fall back to parsing raw bytes.
        try:
            d = pq(html.fromstring(content.decode("utf-8")))
        except:
            d = pq(html.fromstring(content))
        key = newsurl.split("/")[-1].replace(".shtml", "").replace(".html", "")
        try:
            key_int = int(key)
        except:
            key_int = None
        news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        if news_time is None:
            news_time = datetime.datetime.now()
        title = extract.extractTitle(content)
        contents = extract.extractContents(newsurl, content)
        tags = []
        try:
            articletags = d("meta[name='keywords']").attr("content")
            if articletags is not None:
                for tag in articletags.split():
                    if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                        tags.append(tag)
        except:
            pass
        logger.info("News: %s, %s, %s", key, title, news_time)
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"link": newsurl}) is not None:
        #     mongo.close()
        #     return
        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": self.SOURCE,
            "key": key,
            "key_int": key_int,
            "type": self.TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": self.CATEGORY,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    # imgurl = parser_mysql_util.get_logo_id(c["data"], download_crawler, self.SOURCE, key, "news")
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, self.SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        brief = util.get_brief_from_news(dcontents)
        post = util.get_poster_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        # if news_time > datetime.datetime.now() or news_time < datetime.datetime.now() - datetime.timedelta(days=30):
        #     logger.info("Time: %s is not correct with current time", news_time)
        #     dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # Heuristic quality gate: very short extractions are discarded.
        if len(dnews["contents"]) > 2:
            # mongo = db.connect_mongo()
            # collection_news = mongo.article.news
            # collection_news.insert(dnews)
            # mongo.close()
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        logger.info("*************DONE*************")
if post is None or post.strip() == "": post = util.get_posterId_from_news(dcontents) if download_crawler is None: dnews["post"] = post else: dnews["postId"] = post dnews["brief"] = brief if news_time > datetime.datetime.now(): logger.info("Time: %s is not correct with current time", news_time) dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8) # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder)) if title is not None and len(contents) > 0: nid = parser_mongo_util.save_mongo_news(dnews) logger.info("Done: %s", nid) pass return def crawler_news(column, crawler, d_map, download_crawler): global Proxies retry = 0 while True: if Proxies is None: Proxies = get_proxy(http_type='https') logger.info('---->retry:%d<----' % retry) try: newsurl = d_map['link'] logger.info('crawl url:%s' % newsurl)
def process_news(column, newsurl, content, newspost, download_crawler):
    """Parse one news page for this source and persist it to MongoDB.

    Pulls title, tags, poster image, brief, publish time and body from the
    page DOM, de-duplicates by title against ``article.news`` and saves a
    new document via ``parser_mongo_util.save_mongo_news``.

    :param column: column/section identifier (unused in the body; kept for
        the caller's signature).
    :param newsurl: canonical URL of the article page.
    :param content: raw page bytes (decoded as UTF-8, errors ignored).
    :param newspost: poster-image URL passed in by the list crawler.
    :param download_crawler: image-download helper; when ``None`` image
        URLs are stored as ``image_src`` instead of being downloaded.
    :returns: None (side effect: writes one document to storage).
    """
    if not has_news_content(content):
        return

    d = pq(html.fromstring(content.decode("utf-8", "ignore")))
    key = newsurl.split("/")[-1]
    news_type = TYPE  # renamed from `type` to avoid shadowing the builtin
    category = None

    title = d('div.post> div.post-title> h1.title').text().strip()
    if title is None or title == "":
        return

    tags = []
    articletags = d("meta[name='keywords']").attr("content")
    if articletags is not None:
        # Normalize fullwidth commas before splitting.
        for tag in articletags.replace("，", ",").split(","):
            tag = tag.strip()
            # Skip empties, duplicates and a tag equal to the title.
            if tag != "" and tag not in tags and tag != title:
                tags.append(tag)

    (posturl, width, height) = parser_mysql_util.get_logo_id_new(
        newspost, download_crawler, SOURCE, key, "news")
    post = str(posturl) if posturl is not None else None

    brief = d("meta[name='description']").attr("content")

    news_time = None
    try:
        post_time = d('div.post> div.post-title> div> span.postclock').text()
        logger.info("post-time: %s", post_time)
        news_time = extract.extracttime(post_time)
        logger.info("news-time: %s", news_time)
    except Exception:
        # Date node missing or unparsable; fall through to "now" below.
        pass
    if news_time is None:
        news_time = datetime.datetime.now()

    article = d('div.post> div.post-content').html()
    contents = extract.extractContents(newsurl, article)

    logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                ":".join(tags), category, brief, post)

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    # De-duplicate by title: skip articles we already stored.
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return

    # Guard: key may be a non-numeric slug; sibling parsers store None then.
    try:
        key_int = int(key)
    except Exception:
        key_int = None

    flag, domain = url_helper.get_domain(newsurl)
    dnews = {
        # NOTE(review): stored time is shifted by -8h — presumably a
        # CST(UTC+8) -> UTC conversion; confirm against the readers.
        "date": news_time - datetime.timedelta(hours=8),
        "title": title,
        "link": newsurl,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": key_int,
        "type": news_type,
        "original_tags": tags,
        "processStatus": 0,
        "companyIds": [],
        "category": category,
        "domain": domain,
        "categoryNames": [],
    }

    # Flatten extracted content items into ranked text/image entries.
    dcontents = []
    rank = 1
    for c in contents:
        # Drop boilerplate reprint-notice paragraphs.
        if c["data"].find("转载请联系原出处") >= 0 or c["data"].find(
                "网页转载须在文首") >= 0:
            continue
        if c["type"] == "text":
            dc = {
                "rank": rank,
                "content": c["data"],
                "image": "",
                "image_src": "",
            }
        else:
            if download_crawler is None:
                # No downloader: keep the remote image URL only.
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": "",
                    "image_src": c["data"],
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is None:
                    # Image could not be stored — drop this item.
                    continue
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": str(imgurl),
                    "image_src": "",
                    "height": int(height),
                    "width": int(width)
                }
        dcontents.append(dc)
        rank += 1

    dnews["contents"] = dcontents

    # Fall back to derived brief/poster when the page provided none.
    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)
    if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)
    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief

    # Clamp obviously-wrong future timestamps to "now" (same -8h shift).
    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

    mid = None  # legacy: insert id was logged before saving moved to the helper
    if title is not None and len(contents) > 0:
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    mongo.close()
    logger.info("*************DONE*************%s", mid)
    return