Example #1
def process_news(column, d_map, content, download_crawler):
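    # Parses a DailySocial.id article page (see the promo-block strip below):
    # headline, cover image, publish time and body blocks, then saves to Mongo.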
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []
        type = TYPE

        key = d('div#sb-site> article').attr('data-id')
        title = d('div#sb-site> article> section#article-header> h2> strong> a'
                  ).text().strip()
        newspost = d(
            'div#sb-site> article> section#article-image> div> figure> img'
        ).attr('src')
        logger.info('newspost:%s' % newspost)
        newsurl = d_map['link']

        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        brief = d("meta[name='description']").attr('content')
        post_time = d('div#sb-site> article> section#article-meta> span> em'
                      ).text().strip()
        news_time = None
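        # post_time is either an absolute "DD-MM-YYYY" date, a relative
        # "N hours ago", or "a moment ago".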
        is_re = re.search(r'(\d{2}-\d{2}-\d{4})', post_time)
        # \d+ so multi-digit relative times ("12 hours ago") also match
        is_re2 = re.search(r'(\d+) hours ago', post_time)
        if is_re:
            news_time = datetime.datetime.strptime(is_re.group(1), "%d-%m-%Y")
        elif is_re2:
            news_time = datetime.datetime.now() - datetime.timedelta(
                hours=int(is_re2.group(1)))
        elif post_time.find('a moment') >= 0:
            news_time = datetime.datetime.now()
        if news_time is None:
            # No recognizable timestamp: fall back to crawl time (the idiom the
            # other parsers use) so the date arithmetic below never hits a None.
            news_time = datetime.datetime.now()

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        article = d(
            'div#sb-site> article> section#article-content> div.post-content> div.row'
        ).html()
        is_re3 = re.search(r"(<strong>DailySocial\.id.*?</p>)", article, re.S)
        if is_re3:
            article = article.replace(is_re3.group(1), '')
        contents = extract.extractContents(newsurl, article, document=False)

        flag, domain = url_helper.get_domain(newsurl)
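        # "date" is stored shifted back 8 hours (presumably normalizing local
        # UTC+8 time to UTC).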
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
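        # Build ranked content blocks: text passes through as-is; images are
        # re-hosted via get_logo_id_new when a download crawler is available.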
        for c in contents:
            if c["data"].find("Also Read") >= 0 or c['data'].find(
                    'function()') >= 0:
                continue
            # if c['data'].find('caption') >= 0:
            #     c['data'] = c['data'].replace('caption','')

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        return
Example #2
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("gbk")))

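        # Article key: numeric tail of the URL, e.g. ".../12345678.htm" ->
        # "12345678" (stored as key_int below).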
        key = newsurl.split("/")[-1].replace(".htm", "")

        type = TYPE

        category = None
        categoryNames = []

        title = d('div.hd> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        # post = d('div#post_thumbnail> img').attr("src")
        # postraw = newspost
        # # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        # (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None

        post = None
        brief = d("meta[name='description']").attr("content")

        post_time = d('div.a_Info> span.a_time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.bd> div.Cnt-Main-Article-QQ').html()
        contents = extract.extractContents(newsurl, article)
        # logger.info(contents)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Example #3
def process_news(column, newsurl, content, newspost, download_crawler):
    # if has_news_content(content):
    if 1:
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode('gb2312', 'ignore')))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".htm", "")

        title = d('h1.title').text().strip()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)

        brief = None

        news_time = d('.timer').text()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M:%S')

        article = d('.content').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            # "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()

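        # Classify the body text: type 60001 flags funding news, 60010 everything else.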
        if news_classify.get_class(dcontents, 13866) == 1:
            logger.info('%s is fundingNews', title)
            TYPE = 60001
        else:
            TYPE = 60010
            logger.info('%s is not fundingNews', title)

        dnews['type'] = TYPE

        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
    return
Example #4
File: vcbeat_news.py  Project: yujiye/Codes
def process_news(column, newsurl, content, newspost, download_crawler):
    logger.info('starting process_news %s', newsurl)
    # if has_news_content(content):
    if 1:
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1]

        # type = TYPE

        category = None

        title = d('.article_title p').text().strip()

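        # Skip articles whose title is already in mongo.article.news.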
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            logger.info('title:%s already exists' % title)
            return

        tags = []
        articletags = d(".labs a").text().strip()
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        postraw = newspost
        # post = d('div#post_thumbnail> img').attr("src")
        # if post is not None:
        #     post = "http://vcbeat.com"+ post

        brief = None
        # brief = brief.replace(u'摘要', '').replace(u'摘要:', '').replace(u'摘要:', '').strip()

        # news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
        news_time = d('.time').text().strip()
        news_time = datetime.datetime.strptime(news_time, '%Y-%m-%d %H:%M')
        # dt = datetime.date.today()
        # today = datetime.datetime.now()
        # if news_time is None or news_time > today:
        #     news_time = datetime.datetime.now()

        article = d('.art_text').html()
        contents = extract.extractContents(newsurl, article, document=False)
        # if len(contents)==0:
        #     contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, postraw)
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     logger.info( 'title:%s already exists'%title)
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": TYPE,
            "original_tags": None,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        dnews["brief"] = brief

        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        # update link content with oldId
        item = collection_news.find_one({"link": newsurl})
        if item is None:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        else:
            logger.info("update %s", newsurl)
            # collection_news.update_many({'link': newsurl},{'$set': dnews})

            # oldId = collection_news.find_one({"link": newsurl})['_id']
            # collection_news.delete_one({"link": newsurl})
            # dnews['_id']=oldId
            # collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")

    return
Example #5
def process_news(item, url, content):
    if has_news_content(content):
        # download_crawler is not a parameter here; instantiate one as the
        # sibling parsers do so the image re-hosting calls below can run.
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("gbk")))

        title = d(
            'div.g-main> div> div.m-cont-hd> div.title> h1').text().strip()
        datecontent = d(
            'div.g-main> div> div.m-cont-hd> div.m-info> div> div> div.box> div.origin'
        ).text().strip()
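        # The origin line should end with a "YYYY/MM/DD HH:MM:SS" timestamp.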
        result = util.re_get_result(r'(\d{4}\/.*?)$', datecontent)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y/%m/%d %H:%M:%S")
        else:
            post_time = None
            # No timestamp found: fall back to crawl time, as the other parsers
            # do, so the date arithmetic below never operates on None.
            news_time = datetime.datetime.now()

        key = item["key"]
        column = d('div.g-main> div> div.m-cont-hd> div.tag').text().strip()
        brief = d('div.g-article> div> div.review').text().strip()
        postraw = item["post"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        if column is not None:
            tags = column.split()
        else:
            tags = []

        logger.info("%s, %s, %s, %s, %s, %s", key, title, post_time, news_time,
                    brief, ":".join(tags))
        article = d('div.g-article> div.m-article').html()
        #logger.info(article)
        contents = extract.extractContents(url, article)

        # collection_news is not defined locally; open a Mongo connection as
        # the sibling parsers do.
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"link": url}) is not None:
            mongo.close()
            return
            # collection_news.delete_one({"link": url})
        #
        # for t in contents:
        #    logger.info(t["data"])
        #    logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": None,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_posterId_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        # collection_news.insert(dnews)
        # logger.info("*************DONE*************")
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example #6
File: chinav_news.py  Project: yujiye/Codes
def process_news(column, newsurl, content, newspost):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        key = content["news"]["id"]

        newsurl = "https://www.chinaventure.com.cn/cmsmodel/news/detail/%s.shtml" % key

        type = TYPE

        category = None
        categoryNames = []
        if content["news"].has_key("newsChannelId"):
            if content["news"]["newsChannelId"] == 52:
                category = 60101
                categoryNames.append("融资")

        if content["news"].has_key("tagName"):
            if content["news"]["tagName"] == '人物':
                category = 60103

        tags = []
        if "keywordList" in content and len(content["keywordList"]) > 0:
            for tag in content["keywordList"]:
                keyword = tag.get("keyword")
                if keyword is not None and keyword.strip() != "" and keyword not in tags:
                    tags.append(keyword)

        title = content["news"]["title"].replace("&quot;", "\"")

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info(
                "***************************News existed!!!***********************"
            )
            mongo.close()
            return

        # post = d('div#post_thumbnail> img').attr("src")
        postraw = "http://pic.chinaventure.com.cn/" + content["news"][
            "coverImg"]
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = content["news"]["introduction"]

        post_time = content["news"]["updateAt"]

        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()

        article = pq(content["news"]["content"]).html()
        contents = extract.extractContents(newsurl, article)
        # for c in contents:
        #     logger.info(c["data"])
        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, post)
        # return
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     logger.info("***************************News existed!!!***********************")
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("img.mp.itc.cn") >= 0:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        # dnews["post"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        # logger.info("*************DONE*************")
    return
Example #7
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip().replace(".html", "").replace(
            'detail_', '')

        type = TYPE

        category = None

        title = d('div.left.zb-n> h1').text().strip()

        tags = []

        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        # brief = d("meta[name='description']").attr("content").replace(u'一鸣网——让发生的发声|智慧共享新媒体平台|上海TMT媒体开创者、一鸣网ymtmt.com','')
        brief = d('div.left.zb-n> p.gy').text().strip()
        news_time = datetime.datetime.now()

        article = d('div.left.zb-n').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            logger.info('already exists %s', title)
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        start = False
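        # Skip leading blocks until one contains both the title and the brief;
        # then collect blocks until the "-END-" marker.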
        for c in contents:
            if start is False and c["data"].find(
                    brief) >= 0 and c["data"].find(title) >= 0:
                start = True
                continue
            if start is False:
                continue

            if c["data"].find("-END-") >= 0:
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
        # logger.info("*************DONE*************")
    else:
        logger.info('has no news content %s', newsurl)
    return
Example #8
def process(crawler):
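    # Drain the module-level URLS queue, giving each link up to 6 crawl attempts.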
    while True:
        if len(URLS) == 0: return
        linkDict = URLS.pop(0)

        retry = 0

        while True:
            retry += 1
            if retry > 6: break
            download_crawler = download.DownloadCrawler(use_proxy=False)
            url = linkDict['href']
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(html.fromstring(result['content'].decode("utf-8")))

                title = linkDict['title']
                key = url.split('/')[-1]

                category = d('.al-crumbs a:nth-child(2)').text()

                if category in categoryDict:
                    TYPE = categoryDict[category]['type']
                    category = categoryDict[category]['category']
                else:
                    TYPE = 60001
                    category = None

                brief = linkDict['brief']
                postraw = linkDict['post']

                tags = []
                # for tag in d('.tags').text().split():
                #     if tag.strip() not in tags: tags.append(tag)

                news_time = d('.article__published').eq(0).text()
                # news_time = datetime.datetime.strptime(' '.join(news_time.split(' ')[:2]), '%Y年%m月%d日 %H:%M')
                # news_time = datetime.datetime.strptime(news_time, '%Y/%m/%d %p %I:%M')
                news_time = datetime.datetime.strptime(news_time,
                                                       '%Y/%m/%d %H:%M')

                flag, domain = url_helper.get_domain(url)
                dnews = {
                    "date": news_time - datetime.timedelta(hours=8),
                    "title": title,
                    "link": url,
                    "createTime": datetime.datetime.now(),
                    "source": SOURCE,
                    "key": key,
                    "key_int": None,
                    "type": TYPE,
                    "original_tags": tags,
                    "processStatus": 0,
                    # "companyId": None,
                    "companyIds": [],
                    "category": category,
                    "domain": domain,
                    "categoryNames": []
                }

                article = d('.article__content').html()
                contents = extract.extractContents(url, article)
                dcontents = []
                rank = 1
                for c in contents:
                    if c["type"] == "text":
                        dc = {
                            "rank": rank,
                            "content": c["data"],
                            "image": "",
                            "image_src": "",
                        }
                    else:
                        if download_crawler is None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": "",
                                "image_src": c["data"],
                            }
                        else:
                            (imgurl, width,
                             height) = parser_mysql_util.get_logo_id_new(
                                 c["data"], download_crawler, SOURCE, key,
                                 "news")
                            if imgurl is not None:
                                dc = {
                                    "rank": rank,
                                    "content": "",
                                    "image": str(imgurl),
                                    "image_src": "",
                                    "height": int(height),
                                    "width": int(width)
                                }
                            else:
                                continue
                    dcontents.append(dc)
                    rank += 1
                dnews["contents"] = dcontents

                if brief is None or brief.strip() == "":
                    brief = util.get_brief_from_news(dcontents)

                # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
                (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                    postraw, download_crawler, SOURCE, key, "news")
                if posturl is not None:
                    post = str(posturl)
                else:
                    post = None
                if post is None or post.strip() == "":
                    post = util.get_posterId_from_news(dcontents)

                if download_crawler is None:
                    dnews["post"] = post
                else:
                    dnews["postId"] = post

                # brief=brief[:100]
                dnews["brief"] = brief

                mongo = db.connect_mongo()
                collection_news = mongo.article.news
                # update link content with oldId
                item = collection_news.find_one({"link": url})

                if len(dcontents) > 1:
                    if item is None:
                        # collection_news.insert(dnews)
                        nid = parser_mongo_util.save_mongo_news(dnews)
                        logger.info("Done: %s", nid)
                    else:
                        logger.info("update %s", url)
                        #     oldId = collection_news.find_one({"link": url})['_id']
                        #     collection_news.delete_one({"link": url})
                        #     dnews['_id'] = oldId
                        #     collection_news.insert(dnews)
                mongo.close()
                logger.info("%s, %s, %s, %s, %s, %s, %s", key, title,
                            news_time, category, " ".join(tags), brief, post)
                logger.info("*************DONE*************")
                break
Example #9
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        category = None

        title = d('article> h1').text().strip()

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

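        # Tag-based routing: skip English posts; 商业价值杂志 pieces get their
        # own type and category.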
        if "English" in tags or "english" in tags:
            logger.info("Englis not needed, get out!")
            return

        if "商业价值杂志" in tags:
            type = 60003
            category = 60107
        # post = d('div#post_thumbnail> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        # if posturl is not None:
        #     post = str(posturl)
        # else:
        #     post = None

        postraw = d("meta[property='og:image']").attr("content")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("article> p.post-abstract").text().strip().replace(
            '摘要: ', "")

        post_time = d('article> div.post-info> span.time').text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('article> div.inner').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"] == "http://www.tmtpost.com/public/css/images/wzny_ewm.jpg":
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        if title is not None and len(contents) > 0:
            # collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
        # logger.info("*************DONE*************")
    return
Example #10
def process_news(column, newsurl, content, newspost, download_crawler, force):
    if has_news_content(content):
        main = pq(content)('div.article_content')
        d = pq(main)

        key = newsurl.split("/")[-1].replace(".html", "")

        title = pq(content)('head> title').text().strip()
        logger.info("title: %s", title)
        # title = d('h1#article_title').text()

        brief = pq(content)("meta[name='description']").attr("content")
        # post_time =pq(content)("meta[property='article:published_time']").attr("content").split("+")[0]
        # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        result = util.re_get_result(r"var publishTime = new Date\(\"(.*?)\"\)",
                                    content)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y-%m-%d %H:%M:%S")
        else:
            logger.info("incorrcet post time")
            logger.info(content)
            # exit()
            return

        categoryNames = []
        contents = extract.extractContents(newsurl, d.html())
        if title.find("融资") >= 0 or title.find("获投") >= 0:
            category = 60101
            categoryNames.append("融资")
        else:
            category = None
        tags = []

        articletags = pq(content)("meta[name='keywords']").attr(
            "content").replace(";", ",")
        if articletags is None:
            logger.info(content)

        else:
            for tag in articletags.split(","):
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)

        logger.info("%s, %s, %s, %s, %s, %s", key, title, news_time, category,
                    ":".join(tags), brief)

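        # force=True purges any earlier copy (by source/key and by title)
        # before the dedup check.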
        if force is True:
            mongo = db.connect_mongo()
            collection_news = mongo.article.news
            collection_news.delete_many({
                "source": SOURCE,
                "key_int": int(key)
            })
            collection_news.delete_many({"title": title})
            mongo.close()

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }

        # Project table: take the text of each <tr> in div.proj_table,
        # with spaces replaced by ":".
        pjcontents = []
        trs = pq(content)('div.proj_table> table> tr')
        logger.info("*****len of trs %s", len(trs))
        for tr in trs:
            logger.info(tr)
            co = pq(tr).text()
            logger.info(co)
            if co is not None and co.strip() != "":
                pjcontents.append(co.replace(" ", ":"))

        dcontents = []
        rank = 1
        for c in contents:
            if c["data"] == "/The End/":
                break
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1

        for pjc in pjcontents:
            dc = {
                "rank": rank,
                "content": pjc,
                "image": "",
                "image_src": "",
            }
            dcontents.append(dc)
            logger.info(pjc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post

        # if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # id =collection_news.insert(dnews)
        # logger.info("***********id: %s", id)
        # logger.info("*************DONE**************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
        mongo.close()
    return
Example #11
File: Newscrawler.py  Project: yujiye/Codes
    def process_news(self, newsurl, content, download_crawler):
        if self.has_news_content(content):
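            # Decode as UTF-8 when possible; otherwise parse the raw bytes.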
            try:
                d = pq(html.fromstring(content.decode("utf-8")))
            except:
                d = pq(html.fromstring(content))

            key = newsurl.split("/")[-1].replace(".shtml", "").replace(".html", "")
            try:
                key_int = int(key)
            except:
                key_int = None

            news_time = extractArticlePublishedDate.extractArticlePublishedDate(newsurl, content)
            if news_time is None:
                news_time = datetime.datetime.now()

            title = extract.extractTitle(content)

            contents = extract.extractContents(newsurl, content)

            tags = []
            try:
                articletags = d("meta[name='keywords']").attr("content")
                if articletags is not None:
                    for tag in articletags.split():
                        if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                            tags.append(tag)
            except:
                pass

            logger.info("News: %s, %s, %s", key, title, news_time)

            # mongo = db.connect_mongo()
            # collection_news = mongo.article.news
            # if collection_news.find_one({"link": newsurl}) is not None:
            #     mongo.close()
            #     return

            flag, domain = url_helper.get_domain(newsurl)
            dnews = {
                "date": news_time - datetime.timedelta(hours=8),
                "title": title,
                "link": newsurl,
                "createTime": datetime.datetime.now(),
                "source": self.SOURCE,
                "key": key,
                "key_int": key_int,
                "type": self.TYPE,
                "original_tags": tags,
                "processStatus": 0,
                # "companyId": None,
                "companyIds": [],
                "category": self.CATEGORY,
                "domain": domain,
                "categoryNames": []
            }
            dcontents = []
            rank = 1
            for c in contents:

                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    if download_crawler is None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": "",
                            "image_src": c["data"],
                        }
                    else:
                        # imgurl = parser_mysql_util.get_logo_id(c["data"], download_crawler, self.SOURCE, key, "news")
                        (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler,
                                                                                    self.SOURCE,
                                                                                    key, "news")
                        if imgurl is not None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": str(imgurl),
                                "image_src": "",
                                "height": int(height),
                                "width": int(width)
                            }
                        else:
                            continue

                logger.info(c["data"])
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents

            brief = util.get_brief_from_news(dcontents)
            post = util.get_poster_from_news(dcontents)
            if download_crawler is None:
                dnews["post"] = post
            else:
                dnews["postId"] = post
            dnews["brief"] = brief

            # if news_time > datetime.datetime.now() or news_time < datetime.datetime.now() - datetime.timedelta(days=30):
            #     logger.info("Time: %s is not correct with current time", news_time)
            #     dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

            if news_time > datetime.datetime.now():
                logger.info("Time: %s is not correct with current time", news_time)
                dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
            if len(dnews["contents"])> 2:
                # mongo = db.connect_mongo()
                # collection_news = mongo.article.news
                # collection_news.insert(dnews)
                # mongo.close()
                nid = parser_mongo_util.save_mongo_news(dnews)
                logger.info("Done: %s", nid)
            logger.info("*************DONE*************")
Example #12
    def process_news(self, newsurl, content, download_crawler):
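        # WeChat (mp.weixin.qq.com) article parser: title and brief live in
        # inline JS variables (msg_title / msg_desc), so they are pulled with
        # regexes; the assembled dnews dict is returned without being saved.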
        dnews = {}
        if self.has_news_content(content):
            try:
                d = pq(html.fromstring(content.decode("utf-8")))
            except:
                d = pq(html.fromstring(content))
            try:
                key = re.findall('sn=(.*)?&', newsurl)[0]
            except:
                key = newsurl

            try:
                key_int = int(key)
            except:
                key_int = None

            news_time = extractArticlePublishedDate.extractArticlePublishedDate(
                newsurl, content)

            if news_time is None:
                news_time = datetime.datetime.now()

            # title = extract.extractTitle(content)
            # title = d('.rich_media_title').eq(0).text()

            r = "var msg_title = \"(.*?)\".*var msg_desc"
            result = util.re_get_result(r, content)
            if result:
                # logger.info("Found brief")
                title, = result
                logger.info(title)
            else:
                title = None
            logger.info("title: %s", title)

            article = d('#page-content .rich_media_content').html()
            # contents = extract.extractContents(newsurl, article)
            contents = self.extractWechatContents(article)

            r = "var msg_desc = \"(.*?)\".*var msg_cdn_url"
            result = util.re_get_result(r, content)
            if result:
                # logger.info("Found brief")
                brief, = result
                logger.info(brief)
            else:
                brief = None
            # logger.info(b)

            tags = []
            try:
                articletags = d("meta[name='keywords']").attr("content")
                if articletags is not None:
                    for tag in articletags.split():
                        if tag is not None and tag.strip(
                        ) != "" and tag not in tags and tag != title:
                            tags.append(tag)
            except:
                pass

            logger.info("News: %s, %s, %s", key, title, news_time)

            # mongo = db.connect_mongo()
            # collection_news = mongo.article.news
            # if collection_news.find_one({"link": newsurl}) is not None:
            #     mongo.close()
            #     return
            try:
                wechatId = d('span.profile_meta_value').eq(0).text().strip()
                wechatName = d('strong.profile_nickname').text().strip()
            except:
                wechatId = None
                wechatName = None
            logger.info("wechatId: %s, wechatName: %s", wechatId, wechatName)
            flag, domain = url_helper.get_domain(newsurl)
            dnews = {
                "date": news_time - datetime.timedelta(hours=8),
                "title": title,
                "link": newsurl,
                "createTime": datetime.datetime.now(),
                "source": self.SOURCE,
                "key": key,
                "key_int": key_int,
                "type": self.TYPE,
                "original_tags": tags,
                "processStatus": 0,
                # "companyId": None,
                "companyIds": [],
                "category": self.CATEGORY,
                "domain": domain,
                "categoryNames": [],
                "wechatId": wechatId,
                "wechatName": wechatName
            }
            dcontents = []
            rank = 1

            for c in contents:

                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    if download_crawler is None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": "",
                            "image_src": c["data"],
                        }
                    else:
                        (imgurl, width,
                         height) = parser_mysql_util.get_logo_id_new(
                             c["data"], download_crawler, self.SOURCE, key,
                             "news")
                        if imgurl is not None:
                            dc = {
                                "rank": rank,
                                "content": "",
                                "image": str(imgurl),
                                "image_src": "",
                                "height": int(height),
                                "width": int(width)
                            }
                        else:
                            continue
                # logger.info(c["data"])

                logger.info(c["data"])
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents
            if brief is None:
                brief = util.get_brief_from_news(dcontents)
            post = util.get_posterId_from_news(dcontents)
            if download_crawler is None:
                dnews["post"] = post
            else:
                dnews["postId"] = post
            dnews["brief"] = brief.decode("utf-8")[:100]

            # if news_time > datetime.datetime.now() or news_time < datetime.datetime.now() - datetime.timedelta(days=30):
            #     logger.info("Time: %s is not correct with current time", news_time)
            #     dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
            # collection_news.insert(dnews)
            # mongo.close()

            # print dnews

            logger.info("*************DONE*************")
        return dnews
Example #13
def process_news(column, newsurl, content, newspost, download_crawler):
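    # 'content' here is an already-parsed feed item (dict), not raw HTML:
    # id/title/keywords/description/pubdate are read directly, and articles
    # whose title already exists in MongoDB are skipped up front.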
    if has_news_content(content):
        key = content["id"]

        type = TYPE

        category = None

        title = content["title"]

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        try:
            tags = content["keywords"].split(",")
        except:
            tags = []

        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = content["description"]

        post_time = content["pubdate"]
        news_time = extract.extracttime(str(post_time))
        if news_time is None:
            news_time = datetime.datetime.now()

        article = pq(content["content"]).html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Example #14
File: ebrun_news.py Project: yujiye/Codes
def process_news(column, newsurl, content, newspost):
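    # ebrun serves two templates: legacy GBK pages and UTF-8 ones. utfflag
    # records which one was detected and switches the CSS selectors used for
    # the title, publish time and article body accordingly.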
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # d = pq(html.fromstring(content.decode("utf-8","ignore")))
        if content.find("charset=GBK") == -1:
            d = pq(html.fromstring(content.decode("utf-8","ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
            utfflag = False

        key = newsurl.split("?")[0].split("/")[-1].replace(".shtml","")

        type = TYPE

        category = None
        categoryNames = []

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)

        if utfflag is True:
            title = d('article> div> h1').text().strip()
        else:
            title = d('div.titleH> h1').text().strip()
        logger.info("title: %s",title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return


        # post = d('div#post_thumbnail> img').attr("src")
        postraw = newspost
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        brief = d("meta[name='description']").attr("content")

        if utfflag is True:
            post_time = d('p.source> span.f-right').eq(0).text()
        else:
            post_time = d('div.titleH> p.zsp> span').eq(2).text()
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        # article = d('div.contdiv').html()
        if utfflag is True:
            article = d('div.post-text').html()
        else:
            article = d('div.contdiv').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, post)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("电商资讯第一入口") != -1:
                break
            if c["data"] in Nocontents:
                continue
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Example #15
File: scmp_news.py Project: yujiye/Codes
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    continue
        # logger.info(c["data"])
        dcontents.append(dc)
        rank += 1
    dnews["contents"] = dcontents
    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)

    if post is None or post.strip() == "":
        post = util.get_posterId_from_news(dcontents)

    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
    # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder))
Example #16
def process_news(column, newsurl, content, newsposttime, download_crawler):
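    # The publish time may arrive pre-extracted as 'newsposttime'; otherwise
    # it is scraped from the article header, falling back to now. Articles
    # tagged "投资并购" are mapped to category 60101 ("融资"), and a fixed
    # sector (10) is attached before saving.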
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip()

        type = TYPE

        title = d('div.article-wrap> div.article-head> h1').text().strip()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        category = None
        categoryNames = []
        if "投资并购" in tags:
            category = 60101
            categoryNames.append("融资")

        # post = d('div#post_thumbnail> img').attr("src")
        post = None

        brief = d("meta[name='description']").attr("content")

        news_time = None
        if newsposttime is not None:
            news_time = extract.extracttime(newsposttime)
        if news_time is None:
            dt = datetime.date.today()
            post_time = d(
                'div.article-wrap> div.article-head> p> span.article-time'
            ).text()
            if post_time is None or post_time.strip() == str(dt):
                news_time = datetime.datetime.now()
                # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")
            else:
                news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d")

        article = d('div.article-wrap> div.article-content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
        # mongo = db.connect_mongo()
        # collection_news = mongo.article.news
        # if collection_news.find_one({"title": title}) is not None:
        #     mongo.close()
        #     return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                (imgurl, width, height) = parser_mysql_util.get_logo_id_new(
                    c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                        "height": int(height),
                        "width": int(width)
                    }
                else:
                    continue

            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        dnews["postId"] = post
        dnews["brief"] = brief

        # Design for sector:
        dnews["sectors"] = [10]
        dnews["sector_confidence"] = [1]

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        # logger.info("*************DONE*************")
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
    return
Example #17
File: forbes_news.py Project: yujiye/Codes
def process_news(column, j_content, content, download_crawler):
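    # j_content is one item from a JSON feed; the raw HTML in 'content' is
    # only used to pull the article body out of div.article-container, and
    # share-button boilerplate is stripped from text blocks in the loop below.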
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = j_content['id']
        type = TYPE
        title = j_content['title']

        newspost = j_content.get('image')
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(newspost, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        brief = j_content['description']
        newsurl = j_content['uri']

        try:
            date = j_content['date']
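            # 'date' is an epoch timestamp in milliseconds; str(date)[:-3]
            # drops the last three digits to get seconds for time.localtime()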
            post_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(str(date)[:-3])))
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") - datetime.timedelta(days=1)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()

        article = d('div.article-container').html()
        contents = extract.extractContents(newsurl, article,document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time, ":".join(tags), category, brief)
        flag, domain = url_helper.get_domain(newsurl)

        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                if c["data"].find("Share to facebookShare to twitterShare to linkedin") >= 0:
                    c['data'] = c['data'].replace('Share to facebookShare to twitterShare to linkedin', '')
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # logger.info(json.dumps(dnews,ensure_ascii=False,indent=2,cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
Example #18
def process_news(column, newsurl, content, newspost, download_crawler):
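    # Aggressive filtering: video pages (embed), a leading image, and
    # boilerplate text blocks (share prompts, WeChat notices, read-time
    # markers) are all dropped before the remaining blocks are ranked.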
    if has_news_content(content):
        logger.info('here')

        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))

        if d.text().find('embed') >= 0:  # skip video-embed articles
            logger.info('not article:%s' % newsurl)
            return

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        title = d('h1').text().strip()

        if title is None or title == "":
            return

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return

        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        articletags = d("meta[name='keywords']").attr('content')
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time_1 = d("div.wyt-post-content-meta> div> p ").find(
                'span').text().strip()
            post_time_2 = d("div.wyt-post-content-meta> div").find(
                'p').next().text().strip()
            if post_time_1:
                post_time = post_time_1
            else:
                post_time = post_time_2

            if re.match('\d{2}-\d{2}', post_time):  # matches "03-19"-style month-day dates
                post_time = str(time.localtime()[0]) + '-' + post_time  # prepend the current year

            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('article.wyt-post-content').html()

        contents = extract.extractContents(newsurl, article, document=True)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }
        dcontents = []
        rank = 1

        if contents and contents[0]['type'] == 'img':
            del contents[0]

        for c in contents:
            # logger.info("%s-%s",c["type"],c["data"])
            if c['type'] == 'text':

                if re.match('^\d+$', c['data']) or c['data'].find('收藏') >= 0 or c['data'].find('投融资') >= 0 or c['data'].find('阅读时间') >= 0 \
                        or c['data'].find('违者必究') >= 0 or c['data'].find('微信公众号') >= 0 or c['data'].find('微信扫描') >= 0 \
                        or c['data'].find('点击获取完整版报告') >= 0 or c['data'].find('作者原创,微信号') >= 0:
                    continue

                # if c['data'].find('译者') >= 0:
                #     c['data'] = c['data'].split(' ')[0]
                #
                # if c['data'].find('来源') >= 0:
                #     c['data'] = c['data'].split('|')[0]

                if c['data'].find('| 未央网') >= 0:
                    c['data'] = c['data'].replace('| 未央网', ' ')

                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            # logger.info(c["data"])
            dcontents.append(dc)
            rank += 1

        dnews['contents'] = dcontents

        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
Example #19
def process_news(content, news_key, url):
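    # Category is inferred from a segment of the page <title>; the publish
    # time is a Unix epoch (seconds) read from the date-time element's
    # data-time attribute. Duplicate keys and cross-source duplicate titles
    # are skipped before saving.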
    if has_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content))
        brief = d("meta[name='description']").attr("content").split(",")[-1]
        title = d('div#article> div.single-item> div.article-hd> h1').text().strip()
        pagetitle = d('head> title').text().strip()
        temp = pagetitle.split("-")[-2]
        categoryNames = []
        if temp.strip() == "初页":
            category = 60102
            categoryNames.append("产品")
        elif temp.strip() == 'IPO/并购':
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None
        post_time = d('div.author-time> span.date-time').attr("data-time")
        post_date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_date.tm_year, post_date.tm_mon, post_date.tm_mday, post_date.tm_hour,
                                      post_date.tm_min, post_date.tm_sec)
        key = news_key
        column = d('div.article-tags> a').text()
        tags = column.split()
        logger.info("%s, %s, %s, %s, %s, %s, %s", key, title, post_time, news_time, temp,  category, ":".join(tags))
        article = d('div#article> div> div.article-content').html()
        # # logger.info(article)
        contents = extract.extractContents(url, article)

        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})

        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})

        # for t in contents:
        #    logger.info(t["data"])
        #    logger.info("")
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "" or desc_helper.check_desc(brief,2) is False:
            brief = util.get_brief_from_news(dcontents)
        # post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post

        post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        # collection_news.insert(dnews)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example #20
def process_news(column, newsurl, content, newspost, topic, download_crawler):
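    # Handles both gb2312 and utf-8 variants of the site: the declared
    # charset decides which selectors supply the title, publish time and
    # article body.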
    if has_news_content(content):
        logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        # d = pq(html.fromstring(content.decode("gbk","ignore")))
        utfflag = False
        if content.find("gb2312") == -1:
            d = pq(html.fromstring(content.decode("utf-8", "ignore")))
            utfflag = True
        else:
            d = pq(html.fromstring(content.decode("gbk", "ignore")))
        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".shtml", "")

        type = TYPE

        post = None

        if utfflag is True:
            title = d('div#titsize> strong').text().strip()
        else:
            title = d('div.titmain> h1').text().strip()
            # logger.info("title: %s", title)
            if title is None or title.strip() == "":
                title = d('div.texttitbox> h1').text().strip()
        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)

        # try:
        #     brief = d('div.daodu> p').text().strip().replace("【数据猿导读】","")
        # except:
        #     brief = None
        brief = None

        try:
            if utfflag is True:
                post_time = d("p.time> span.mh-title").text().strip()
            else:
                post_time = d("meta[property='og:release_date']").attr(
                    "content").split("+")[0]

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if utfflag is True:
            article = d('div.tbox.content').html()
        else:
            article = d('div.texttit_m1').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=20),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
Example #21
def process(g, crawler, url, key, content):
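    # The publish time is read from an inline "var publishTime = new Date(...)"
    # script. After a 2016-10 site upgrade changed article keys, a stored
    # record with the same key but a different link is deleted and re-crawled.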
    if has_content(content):
        #logger.info(content)
        main = pq(content)('div.article_content')
        d = pq(main)
        title = d('h1#article_title').text()
        brief = pq(content)("meta[name='description']").attr("content")
        # post_time =pq(content)("meta[property='article:published_time']").attr("content").split("+")[0]
        # news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        result = util.re_get_result("var publishTime = new Date\(\"(.*?)\"\)",
                                    content)
        if result:
            post_time, = result
            news_time = datetime.datetime.strptime(post_time,
                                                   "%Y-%m-%d %H:%M:%S")
        else:
            logger.info("incorrcet post time")
            logger.info(content)
            exit()
            return

        contents = extract.extractContents(url, content)
        if title.find("融资") >= 0 or title.find("获投") >= 0:
            category = 60101
        else:
            category = None
        tags = []

        articletags = pq(content)("meta[name='keywords']").attr("content")
        if articletags is None:
            logger.info(content)
        else:
            for tag in articletags.split():
                if tag is not None and tag.strip() != "" and tag not in tags:
                    tags.append(tag)

        logger.info("%s, %s, %s, %s, %s", key, title, news_time, category,
                    ":".join(tags))
        #logger.info(news_time)
        #logger.info(contents)
        # for t in contents:
        #     logger.info(t["data"])

        #item = collection_news.find_one({"source": g.SOURCE, "key_int": int(key)})
        craw = True
        #2016-10-01 pencilnews website upgrade, news keys changed! Have to redownload article with new keys
        if collection_news.find_one({
                "source": g.SOURCE,
                "key_int": int(key)
        }) is not None:
            cnews = collection_news.find_one({
                "source": g.SOURCE,
                "key_int": int(key)
            })
            logger.info("%s, %s", url, cnews["link"])
            if url == cnews["link"]:
                craw = False
            else:
                collection_news.delete_many({
                    "source": g.SOURCE,
                    "key_int": int(key)
                })
                logger.info("different link!")

        if craw:
            newses = list(
                collection_news.find({
                    "title": title,
                    "source": {
                        "$ne": g.SOURCE
                    }
                }))
            for news in newses:
                if news.has_key("type") and news["type"] > 0:
                    craw = False
                    break
        if craw:
            if collection_news.find_one({
                    "title": title,
                    "source": {
                        "$ne": g.SOURCE
                    }
            }) is not None:
                collection_news.delete_many({
                    "title": title,
                    "source": {
                        "$ne": g.SOURCE
                    }
                })
            flag, domain = url_helper.get_domain(url)
            dnews = {
                "date": news_time - datetime.timedelta(hours=8),
                "title": title,
                "link": url,
                "createTime": datetime.datetime.now(),
                "source": g.SOURCE,
                "key": key,
                "key_int": int(key),
                "type": TYPE,
                "original_tags": tags,
                "processStatus": 0,
                "companyId": None,
                "companyIds": [],
                "category": category,
                "domain": domain
            }

            dcontents = []
            rank = 1
            for c in contents:
                if c["data"] == "/The End/":
                    break
                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents
            if brief is None or brief.strip() == "":
                brief = util.get_brief_from_news(dcontents)
            post = util.get_poster_from_news(dcontents)
            dnews["post"] = post
            dnews["brief"] = brief
            if news_time > datetime.datetime.now():
                logger.info("Time: %s is not correct with current time",
                            news_time)
                dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                    hours=8)
            collection_news.insert(dnews)
            logger.info("*************DONE**************")

        g.latestIncr()
Example #22
def process_news(column, newsurl, content, newspost, topic, download_crawler):
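    # Stories behind the "Continue reading ... subscription to DealStreetAsia"
    # wall are still saved, but flagged with processStatus = -5, presumably so
    # they can be handled separately downstream.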
    if has_news_content(content):
        # logger.info('here')
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-2].replace(".html", "")

        type = TYPE

        title = d('h1.single-title').text().strip()

        newspost = d('header> img.wp-post-image').attr("src")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        # try:
        #    post_time = topic
        #
        #    logger.info(post_time)
        #    news_time = datetime.datetime.strptime(post_time, "%Y-%m-%dT%H:%M:%S")
        #    logger.info("news-time: %s", news_time)
        # except Exception, e:
        #     logger.info(e)
        news_time = datetime.datetime.now()

        article = d('section.post_content').html()
        contents = extract.extractContents(newsurl, article, document=False)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        processStatus = 0
        dcontents = []
        rank = 1
        for c in contents:
            if c["data"].find("fromgeek.com/awards/") >= 0 or \
                    c["data"].find("http://www.fromgeek.com/uploadfile/2017/0430/20170430328184.jpg") >= 0:
                continue

            if c["data"].find(
                    "Continue reading this story with a subscription to DealStreetAsia"
            ) >= 0:
                processStatus = -5
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        if processStatus != 0:
            dnews["processStatus"] = processStatus
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s | %s", nid, processStatus)
            pass
    return
Example #23
def process_news(column, newsurl, content, newspost, download_crawler):
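    # Minimal template: dedupe by title, parse the publish time from the meta
    # line (stripping the "发布" label), and insert via collection_news
    # directly instead of the save_mongo_news helper.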
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1].strip().replace(".shtml", "")

        type = TYPE

        category = None

        title = d('div.subject> h1').text().strip()

        tags = []

        post = newspost

        brief = d("meta[name='description']").attr("content")

        post_time = d('div.meta> span.meta-date').text().replace("发布", "")
        logger.info(post_time)
        news_time = extract.extracttime(post_time)
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.subject> div.subject-content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time,
                    ":".join(tags), category, brief, post)
        # exit()
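        # Simple dedup: skip the article if one with the same title is already in MongoDB.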
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    imgurl = parser_mysql_util.get_logo_id(
                        c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)
        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        collection_news.insert(dnews)
        mongo.close()
        logger.info("*************DONE*************")
    return
Example #24
0
def process_news(column, newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

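        # The URL tail appears to be an id of the form "i<digits>"; drop the "i" so int(key) works below.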
        key = newsurl.split("/")[-1].replace("i","")

        type = TYPE

        category = None
        title = d('head> title').text().strip()

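        # The article body is embedded in an inline JS object; pull the HTML-escaped 'content'
        # field out with a regex (note that [:-1] drops the pattern's final character).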
        r = "content: '(.*?)',.*groupId"

        result = util.re_get_result(r.strip()[:-1], content)
        (b,) = result
        logger.info(b)

        # exit()
        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.replace("，", ",").split(","):
                if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
                    tags.append(tag)

        post = None

        brief = None
        news_time = None
        try:
            r1 = "time: '(.*?)'.*},.*tagInfo"

            result = util.re_get_result(r1, content)
            (post_time,) = result
            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except:
            pass
        if news_time is None:
            news_time = datetime.datetime.now()
        # exit()
        # article = d('div.post> div.post-content').html()
        # contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s. %s", key, title, news_time, ":".join(tags), category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": 60101,
            "domain": domain,
            "categoryNames": [],
            # "sectors": [20]
        }
        dcontents = []
        rank = 1
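        # Un-escape the HTML entities so extractContents sees real markup.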
        bb = b.replace('&lt;', "<").replace("&gt;",">").replace("&quot;","\"").replace("&#x3D;","=")
        logger.info(bb)

        contents = extract.extractContents(newsurl, bb, document=False)
        for c in contents:
            logger.info(c["data"])
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if title is not None and len(dcontents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
        mongo.close()
    return
Example #25
0
def process_news(column, newsurl, content, newspost, download_crawler, sort):
    if has_news_content(content):
        logger.info("here")
        d = pq(html.fromstring(content.decode("utf-8")))

        key = newsurl.split("/")[-1]

        type = TYPE
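        # Articles from the 投融资 ("investment & financing") column are typed as funding news.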
        if sort.find("投融资") >= 0:
            type = 60001
        category = None

        title = d('div.mod-head> h1').text().strip()

        if title is None or title == "":
            return
        tags = []
        # articletags = d("meta[name='keywords']").attr("content")
        # if articletags is not None:
        #     for tag in articletags.replace(",", ",").split(","):
        #         if tag is not None and tag.strip() != "" and tag not in tags and tag != title:
        #             tags.append(tag)
        # #

        # newspost1 = d('div.article-main> div> img').attr("src")
        # posturl = parser_mysql_util.get_logo_id(newspost, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        # post = d("meta[property='og:image']").attr("content")
        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time = d('span.time> time').text()
            logger.info(post_time)
            # if post_time == datetime.date.strftime(datetime.date.today(),'%Y-%m-%d'):
            #     news_time = datetime.datetime.now()
            # else:
            news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M")
        except:
            news_time = datetime.datetime.now()
        article = d('div.mod-body> div.content').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s, %s, %s", key, title, news_time,
                    ":".join(tags), type, category, brief, post)
        # exit()
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": [],
            "sectors": [20]
        }
        dcontents = []
        rank = 1
        for c in contents:
            # if c["data"].find("◆END◆")>=0 or c["data"].find("…………………")>=0:
            #     break
            #
            # if c["data"].find("ACG 领域最具影响力的产业新媒体") >= 0 or c["data"].find("访问三文娱网站3wyu.com查看产业必读文章") >=0:
            #     continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue

            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        if title is not None and len(contents) > 0:
            # mid = collection_news.insert(dnews)
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
        # logger.info("*************DONE************* %s", mid)
    return
Example #26
0
def process_news(content, news_key, url, news_posttime):
    if has_news_content(content):
        download_crawler = download.DownloadCrawler(use_proxy=False)
        d = pq(html.fromstring(content.decode('utf-8')))
        title = d('header.article-header>h1').text().strip()
        if title is None or title.strip() == "":
            logger.info("wrong title for url: %s", url)
            return
        post_time = pq(content)("meta[name='sailthru.date']").attr("content")
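        # sailthru.date appears to be US local time; the +15h shift presumably normalizes it to UTC+8.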
        news_time = datetime.datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S") + datetime.timedelta(hours=15)

        key = news_key
        try:
            postraw = pq(content)("meta[property='og:image']").attr("content")
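            # Ignore TechCrunch's generic placeholder og:image.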
            if postraw.find("techcrunch.opengraph.default.png")>=0:
                postraw = None
        except:
            postraw = None
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width, height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler, SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        divtags = d('div.tags> div.tag-item')
        tags = [pq(divtag)('a.tag').text().strip() for divtag in divtags
                if pq(divtag)('a.tag').text().strip() != ""]
        category = None
        logger.info("%s, %s, %s, %s, %s -> %s", key, title, post_time, news_time, ":".join(tags),category)

        article = d('div.article-entry.text').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)

        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"source": SOURCE, "key_int": int(key)}) is not None:
            mongo.close()
            return
            # collection_news.delete_one({"source": SOURCE, "key_int": int(key)})
        if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
            mongo.close()
            return
            # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        mongo.close()

        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 1,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": []
        }
        dcontents = []
        rank = 1
        for c in contents:

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width, height) = parser_mysql_util.get_logo_id_new(c["data"], download_crawler, SOURCE,
                                                                                key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "" or post.find("techcrunch.opengraph.default.png")>=0:
        #     post = util.get_poster_from_news(dcontents)
        #
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post

        dnews["brief"] = brief
        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)
        if len(dcontents) > 0:
            # mongo = db.connect_mongo()
            # collection_news = mongo.article.news
            # collection_news.insert(dnews)
            # mongo.close()
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)

        logger.info("Done")
Example #27
0
def process_news(newsurl, content, newspost, download_crawler):
    if has_news_content(content):
        logger.info('processing %s', newsurl)

        d = pq(html.fromstring(content.decode("utf-8", 'ignore')))

        category = None
        categoryNames = []
        Type = TYPE
        tags = []
        brief = None

        title = d('h1').text().strip()
        if title is None or title == "":
            return
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({'title': title}) is not None:
            mongo.close()
            return

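        # The <article> id attribute (e.g. "post-12345") appears to end in the numeric post id.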
        key = d('article').attr('id').strip().split('-')[-1]

        try:
            (posturl, width, height) = parser_mysql_util.get_logo_id_new(
                newspost, download_crawler, SOURCE, key, "news")
        except:
            posturl = None
        if posturl is not None:
            post = str(posturl)
        else:
            post = None
        try:
            post_time = d("header> div> span> time").text().strip()
            res = re.search(u'(\d{4})年(\d+)月(\d+)日', post_time)
            year = res.group(1)
            month = res.group(2)
            if len(month) == 1:
                month = '0' + month
            day = res.group(3)
            if len(day) == 1:
                day = '0' + day
            post_time = '{}-{}-{}'.format(year, month, day)
            news_time = extract.extracttime(post_time)

        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()
        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.td-post-content').html()
        contents = extract.extractContents(newsurl, article, document=True)

        flag, domain = url_helper.get_domain(newsurl)

        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": int(key),
            "type": Type,
            "original_tags": tags,
            "processStatus": 0,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames,
        }

        dcontents = []
        rank = 1

        for c in contents:
            if c['type'] == 'text':
                dc = {
                    'rank': rank,
                    'content': c['data'],
                    'image': '',
                    'image_src': '',
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1

        dnews['contents'] = dcontents

        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)

        logger.info(
            json.dumps(dnews, ensure_ascii=False, cls=util.CJsonEncoder))

        if title is not None and len(contents) > 0:
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
        mongo.close()
Example #28
0
def process_news(content, url):
    if has_news_content(content):
        d = pq(html.fromstring(content.decode("utf-8")))
        download_crawler = download.DownloadCrawler(use_proxy=False)
        title = d(
            'div.post-img-left> div> div.post-head> h1.title').text().strip()
        post_time = d('article.post-article').attr("ptime")
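        # The ptime attribute is a Unix timestamp; convert it to a local datetime.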
        post_Date = time.localtime(int(post_time))
        news_time = datetime.datetime(post_Date.tm_year, post_Date.tm_mon,
                                      post_Date.tm_mday, post_Date.tm_hour,
                                      post_Date.tm_min, post_Date.tm_sec)

        if collection_news.find_one({"link": url}) is not None:
            return
            # collection_news.delete_one({"link": url})

        if collection_news.find_one({
                "title": title,
                "source": {
                    "$ne": SOURCE
                }
        }) is not None:
            return

        key = d('article.post-article').attr("postid")
        try:
            key_int = int(key)
        except:
            key_int = None
        column = d('span.post-category').text().strip()
        brief = d("meta[name='description']").attr("content").strip()

        if column is not None:
            tags = column.split()
        else:
            tags = []

        categoryNames = []
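        # Map the column tag to a category id: 人物 ("People") → 60103, 公司 ("Companies") → 60105
        # with the category name 大公司 ("major companies").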
        if "人物" in tags:
            category = 60103
        elif "公司" in tags:
            category = 60105
            categoryNames.append("大公司")
        else:
            category = None

        keywords = d("meta[name='keywords']").attr("content")
        if keywords is not None:
            for keyword in keywords.split(","):
                if keyword is not None and keyword.strip(
                ) not in tags and keyword.strip() not in ["PingWest", "品玩"]:
                    tags.append(keyword.strip())

        postraw = d("link[rel='image_src']").attr("href")
        # posturl = parser_mysql_util.get_logo_id(postraw, download_crawler, SOURCE, key, "news")
        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(postraw, download_crawler,
                                                     SOURCE, key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("%s, %s, %s, %s, %s, %s -> %s, %s", key, title, post_time,
                    news_time, brief, ":".join(tags), category, post)
        article = d('div.box-con> div#sc-container').html()
        # logger.info(article)
        contents = extract.extractContents(url, article)

        # if collection_news.find_one({"link": url}) is not None:
        #     return
        #     # collection_news.delete_one({"link": url})
        #
        # if collection_news.find_one({"title": title, "source": {"$ne": SOURCE}}) is not None:
        #     return
        # collection_news.delete_many({"title": title, "source": {"$ne": SOURCE}})
        flag, domain = url_helper.get_domain(url)
        dnews = {
            "date": news_time - datetime.timedelta(hours=16),
            "title": title,
            "link": url,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": key_int,
            "type": TYPE,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []

        rank = 1
        for c in contents:
            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"].replace("?imageView2/2/w/750/q/90", ""),
                    }
                else:
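                    # Strip the imageView2 thumbnail suffix so the full-size image is downloaded.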
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"].replace("?imageView2/2/w/750/q/90", ""),
                         download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)
        # if post is None or post.strip() == "":
        #     post = util.get_poster_from_news(dcontents)
        # dnews["post"] = post
        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        nid = parser_mongo_util.save_mongo_news(dnews)
        logger.info("Done: %s", nid)
Example #29
0
def process_news(column, newsurl, content, newspost, topic, download_crawler):
    if has_news_content(content):
        logger.info('processing %s', newsurl)
        download_crawler = download.DownloadCrawler(use_proxy=False)
        # logger.info(content)
        d = pq(html.fromstring(content.decode("utf-8", "ignore")))

        category = None
        categoryNames = []

        key = newsurl.split("/")[-1].replace(".html", "")

        type = TYPE

        title = d('div.da-title> h2').text().strip()
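        # Titles mentioning 融资 ("fundraising") are reclassified as funding news.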
        if title.find("融资") >= 0:
            type = 60001
            category = 60101

        (posturl, width,
         height) = parser_mysql_util.get_logo_id_new(newspost,
                                                     download_crawler, SOURCE,
                                                     key, "news")
        if posturl is not None:
            post = str(posturl)
        else:
            post = None

        logger.info("title: %s", title)
        mongo = db.connect_mongo()
        collection_news = mongo.article.news
        if collection_news.find_one({"title": title}) is not None:
            mongo.close()
            return

        tags = []
        articletags = d("meta[name='keywords']").attr("content")
        if articletags is not None:
            for tag in articletags.split(","):
                if tag is not None and tag.strip(
                ) != "" and tag not in tags and tag != title:
                    tags.append(tag)

        try:
            brief = d("meta[name='description']").attr("content")
        except:
            brief = None

        try:
            post_time = d("span.article-time").eq(0).text().strip()

            logger.info(post_time)
            news_time = extract.extracttime(post_time)
            logger.info("news-time: %s", news_time)
        except Exception as e:
            logger.info(e)
            news_time = datetime.datetime.now()

        if news_time is None:
            news_time = datetime.datetime.now()

        article = d('div.data-article').html()
        contents = extract.extractContents(newsurl, article)

        logger.info("%s, %s, %s, %s -> %s, %s", key, title, news_time,
                    ":".join(tags), category, brief)

        flag, domain = url_helper.get_domain(newsurl)
        dnews = {
            "date": news_time - datetime.timedelta(hours=8),
            "title": title,
            "link": newsurl,
            "createTime": datetime.datetime.now(),
            "source": SOURCE,
            "key": key,
            "key_int": None,
            "type": type,
            "original_tags": tags,
            "processStatus": 0,
            # "companyId": None,
            "companyIds": [],
            "category": category,
            "domain": domain,
            "categoryNames": categoryNames
        }
        dcontents = []
        rank = 1
        for c in contents:
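            # Skip wallet-address ("btm地址") and copyright-notice ("版权声明") boilerplate.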
            if c["data"].find("btm地址") >= 0 or \
                    c["data"].find("版权声明") >= 0:
                continue

            if c["data"].find("8btctest1/custom/images") >= 0:
                continue

            if c["type"] == "text":
                dc = {
                    "rank": rank,
                    "content": c["data"],
                    "image": "",
                    "image_src": "",
                }
            else:
                # dc = {
                #     "rank": rank,
                #     "content": "",
                #     "image": "",
                #     "image_src": c["data"],
                # }
                if download_crawler is None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                else:
                    (imgurl, width,
                     height) = parser_mysql_util.get_logo_id_new(
                         c["data"], download_crawler, SOURCE, key, "news")
                    if imgurl is not None:
                        dc = {
                            "rank": rank,
                            "content": "",
                            "image": str(imgurl),
                            "image_src": "",
                            "height": int(height),
                            "width": int(width)
                        }
                    else:
                        continue
            logger.info(c["data"])
            dcontents.append(dc)
            rank += 1
        dnews["contents"] = dcontents
        if brief is None or brief.strip() == "":
            brief = util.get_brief_from_news(dcontents)

        if post is None or post.strip() == "":
            post = util.get_posterId_from_news(dcontents)

        if download_crawler is None:
            dnews["post"] = post
        else:
            dnews["postId"] = post
        dnews["brief"] = brief

        if news_time > datetime.datetime.now():
            logger.info("Time: %s is not correct with current time", news_time)
            dnews["date"] = datetime.datetime.now() - datetime.timedelta(
                hours=8)
        # collection_news.insert(dnews)
        mongo.close()
        if title is not None and len(contents) > 0:
            # logger.info("*************DONE*************")
            nid = parser_mongo_util.save_mongo_news(dnews)
            logger.info("Done: %s", nid)
            pass
Example #30
0
def run(url):
    proxy = get_proxy()
    if proxy is None:
        logger.info("Error: no proxy!")
        return
    logger.info(proxy)

    response = request(proxy, url)
    if response is None:
        return
    text = response.text
    # logger.info(html)
    key = url.split("/")[-1].strip()
    download_crawler = None
    d = pq(html.fromstring(text.decode("utf-8")))
    title = d('h1.article-title').text().strip()
    str_time = d('span.time').text().strip()
    str_content = d('div.article-content').html()
    brief = d("meta[name='description']").attr("content")
    logger.info(title)
    logger.info(str_time)
    # logger.info(str_content)
    contents = extract.extractContents(url, str_content)
    # logger.info(contents)

    mongo = db.connect_mongo()
    collection_news = mongo.article.news
    if collection_news.find_one({"title": title}) is not None:
        mongo.close()
        return

    flag, domain = url_helper.get_domain(url)
    news_time = datetime.datetime.strptime(str_time, "%Y-%m-%d %H:%M")
    dnews = {
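        # Dates are stored shifted by -8h, presumably to convert UTC+8 local time to UTC.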
        "date": news_time - datetime.timedelta(hours=8),
        "title": title,
        "link": url,
        "createTime": datetime.datetime.now(),
        "source": SOURCE,
        "key": key,
        "key_int": None,
        "type": 60001,
        "original_tags": [],
        "processStatus": 0,
        # "companyId": None,
        "companyIds": [],
        "domain": domain,
        "category": None,
        "categoryNames": []
    }
    dcontents = []
    rank = 1
    for c in contents:

        if c["type"] == "text":
            dc = {
                "rank": rank,
                "content": c["data"],
                "image": "",
                "image_src": "",
            }
        else:
            if download_crawler is None:
                dc = {
                    "rank": rank,
                    "content": "",
                    "image": "",
                    "image_src": c["data"],
                }
            else:
                imgurl = parser_mysql_util.get_logo_id(c["data"], download_crawler, SOURCE, key, "news")
                if imgurl is not None:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": str(imgurl),
                        "image_src": "",
                    }
                else:
                    continue

        logger.info(c["data"])
        dcontents.append(dc)
        rank += 1
    dnews["contents"] = dcontents
    if brief is None or brief.strip() == "":
        brief = util.get_brief_from_news(dcontents)

    post = util.get_posterId_from_news(dcontents)

    if download_crawler is None:
        dnews["post"] = post
    else:
        dnews["postId"] = post
    dnews["brief"] = brief

    if news_time > datetime.datetime.now():
        logger.info("Time: %s is not correct with current time", news_time)
        dnews["date"] = datetime.datetime.now() - datetime.timedelta(hours=8)

    nid = collection_news.insert(dnews)
    mongo.close()
    logger.info("*************DONE************* %s", nid)