Example #1
# -*- coding: utf-8 -*-
import os, sys

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_dailyeco_news", stream=True)
logger = loghelper.get_logger("crawler_dailyeco_news")

NEWSSOURCE = "dailyeco"
RETRY = 3
TYPE = 60005
SOURCE = 13865
URLS = []
CURRENT_PAGE = 1
linkPattern = "cn.dailyeconomic.com/\w+/\d+/\d+/\d+/\d+.html"
Nocontents = []
columns = [
    # {"column": "jmd", "max": 2},
    {
        "column": "投资",
        "max": 1
Example #2
# -*- coding: utf-8 -*-
import os, sys
import datetime
import xlwt

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, config, util, db

#logger
loghelper.init_logger("audi_export", stream=True)
logger = loghelper.get_logger("audi_export")

mongo = db.connect_mongo()
collection = mongo.demoday.contest_company


def get_topic(topic_id):
    if topic_id == 13:
        return u"人工智能的应用"
    if topic_id == 11:
        return u"数字化"
    if topic_id == 12:
        return u"车·生活"
    return ""


def get_contest_company_stage_status(status):
Example #3
import os, sys
import time

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import db

#logger
loghelper.init_logger("patch_company_round", stream=True)
logger = loghelper.get_logger("patch_company_round")


def process(corporate_id):
    logger.info("corporate id: %s", corporate_id)
    conn = db.connect_torndb()
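    # copy the round of the most recent active funding onto the corporate record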
    funding = conn.get(
        "select * from funding where corporateId=%s and (active is null or active !='N') order by fundingDate desc limit 1",
        corporate_id)
    if funding is not None:
        # corporate = conn.get("select * from corporate where id=%s", corporate_id)
        # if corporate is not None:
        conn.update("update corporate set round=%s, roundDesc=%s where id=%s",
                    funding["round"], funding["roundDesc"], corporate_id)
    else:
Example #4
import os, sys
import time
import gevent
from gevent.event import Event
from gevent import monkey
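# patch blocking stdlib calls (sockets, time.sleep, ...) so they cooperate with gevent greenlets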
monkey.patch_all()
reload(sys)

sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, config
import db
import util

#logger
loghelper.init_logger("domain_2_beian", stream=True)
logger = loghelper.get_logger("domain_2_beian")

#mongo
mongo = db.connect_mongo()
collection = mongo.info.beian

BEIANS = []


def whoisCheck():
    while True:
        if len(BEIANS) == 0:
            return
        beian = BEIANS.pop(0)
Example #5
import os, sys

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, db, util
import extract, extractArticlePublishedDate

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../spider2/crawler'))
import BaseCrawler

#logger
loghelper.init_logger("deal_news_new", stream=True)
logger = loghelper.get_logger("deal_news_new")


class NewsCrawler(BaseCrawler.BaseCrawler):
    def __init__(self):
        BaseCrawler.BaseCrawler.__init__(self)

    def is_crawl_success(self, url, content):
        if content.find("</html>") == -1:
            return False

        return True


news_crawler = NewsCrawler()
Example #6
import os, sys

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_yxtl_news", stream=True)
logger = loghelper.get_logger("crawler_yxtl_news")

NEWSSOURCE = "youxituoluo"
RETRY = 3
TYPE = 60005
SOURCE = 13846
URLS = []
CURRENT_PAGE = 1
linkPattern = "www.youxituoluo.com/\d+.html"
Nocontents = []
columns = [
    {
        "column": "new",
        "max": 2
    },
Example #7
# -*- coding: utf-8 -*-
import os, sys
import pymongo
from pymongo import MongoClient
import datetime, time
import re
import pytz

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import config
import util
import proxy_pool

#logger
loghelper.init_logger("itunes_parser", stream=True)
logger = loghelper.get_logger("itunes_parser")

#mongo
(mongodb_host, mongodb_port) = config.get_mongodb_config()
mongo = MongoClient(mongodb_host, mongodb_port)

itunes_collection = mongo.crawler_v2.market_itunes

if __name__ == "__main__":
    logger.info("Start...")
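    # [一-龥] matches the CJK Unified Ideographs range U+4E00-U+9FA5, i.e. names containing Chinese characters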
    rexExp = re.compile(r"[一-龥]+")
    apps = itunes_collection.find({"name":rexExp, "parsed":{"$ne":True}})
    logger.info("cnt: %s" % apps.count())
    for app in apps:
        html = app.get("html")
Example #8
import os, sys

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, extract, db, util, url_helper, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_Kr36_newfs", stream=True)
logger = loghelper.get_logger("crawler_Kr36_newfs")

#mongo
mongo = db.connect_mongo()
collection_news = mongo.article.news

newsid = []
b_id = ""

TYPE = 60008


def find_companyId(sourceId):
    if sourceId == "0" or sourceId == 0:
        return None
Example #9
# -*- coding: utf-8 -*-
import os, sys

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import db

#logger
loghelper.init_logger("collection_newproduct", stream=True)
logger = loghelper.get_logger("collection_newproduct")


def process():
    logger.info("Process collection new product")
    conn = db.connect_torndb()
    cs = conn.query(
        "select * from company where (active is null or active='Y') and type=41020"
    )
    for c in cs:
        rel = conn.get(
            "select * from collection_company_rel where (active is null or active='Y') and collectionId=21 and companyId=%s",
            c["id"])
        if rel is None:
            conn.insert(
Example #10
import os, sys

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import helper
import loghelper, db  # used below; assumed to live in the same shared util directory

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0],
        '../../parser/company/itjuzi'))
import parser_db_util
import itjuzi_helper

#logger
loghelper.init_logger("patch_36kr", stream=True)
logger = loghelper.get_logger("patch_36kr")


def remove_13030_funding():
    conn = db.connect_torndb()
    fs = conn.query(
        "select * from source_funding where sourceCompanyId in (select id from source_company where source=13030)"
    )
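    # delete dependent investor-relation rows before their parent source_funding rows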
    for f in fs:
        conn.execute(
            "delete from source_funding_investor_rel where sourceFundingId=%s",
            f["id"])
        conn.execute("delete from source_funding where id=%s", f["id"])
    conn.close()
Example #11
# -*- coding: utf-8 -*-
import os, sys
from pymongo import MongoClient
import pymongo
import datetime

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, config
import db
import xlwt

#logger
loghelper.init_logger("export_collection", stream=True)
logger = loghelper.get_logger("export_collection")


def getRound(round):
    if round is None:
        return ""
    if round < 1010:
        return ""
    if round < 1020:
        return "Angel"
    if round < 1030:
        return "Pre-A"
    if round < 1040:
        return "A"
    if round < 1050:
Example #12
import os, sys

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util


#logger
loghelper.init_logger("crawler_xtecher_news", stream=True)
logger = loghelper.get_logger("crawler_xtecher_news")

NEWSSOURCE = "Xtecher"
RETRY = 3
TYPE = 60001
SOURCE = 13821
URLS = []
links = []
CURRENT_PAGE = 1
linkPattern = "Xfeature/view\?aid=\d+"
Nocontents = [
]
columns = [
    {"column": None, "max": 3},
]
Example #13
import os, sys
from pymongo import MongoClient

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import config
import util
import proxy_pool
import db
import aggregator_util

#logger
loghelper.init_logger("appstore_rank", stream=True)
logger = loghelper.get_logger("appstore_rank")

#mongo
(mongodb_host, mongodb_port) = config.get_mongodb_config()
mongo = MongoClient(mongodb_host, mongodb_port)
appstore_rank_collection = mongo.crawler_v2.appstore_rank

total = 0


def request(url, callback):
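    # constraints for picking a proxy from the pool: high-anonymity HTTP, low ping and transfer time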
    proxy = {'type': 'http', 'anonymity': 'high', 'ping': 1, 'transferTime': 5}
    # proxy = {'type': 'http', 'anonymity':'high'}
    proxy_ip = None
    while proxy_ip is None:
Example #14
import os, sys
import json

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, desc_helper

#logger
loghelper.init_logger("news_domain", stream=True)
logger = loghelper.get_logger("news_domain")

#mongo
mongo = db.connect_mongo()
collection_news = mongo.article.news

if __name__ == "__main__":

    results = list(
        collection_news.aggregate([{
            "$group": {
                "_id": "$domain",
                "count": {
                    "$sum": 1
                }
Example #15
import loghelper
import config
import util
import proxy_pool
import db

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import crawler_util

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import market.miapp as miapp_parser

#logger
loghelper.init_logger("miapp_trends", stream=True)
logger = loghelper.get_logger("miapp_trends")

#mongo
mongo = db.connect_mongo()
collection = mongo.trend.android

collection_market = mongo.market.android_market  #TODO

cnt = 0
total = 0
TYPE = 16070
# def handle_search_result(response, app):
#     global total
#     global notfound
#
Example #16
        init_kafka()

    # action: create, delete
    msg = {"source": action, "id": company_id, "detail": source}
    flag = False
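    # retry sending to the task_company topic until it succeeds, sleeping 60s after each failure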
    while flag is False:
        try:
            kafkaProducer.send_messages("task_company", json.dumps(msg))
            flag = True
        except Exception, e:
            logger.exception(e)
            time.sleep(60)


#logger
loghelper.init_logger("sh_import", stream=True)
logger = loghelper.get_logger("sh_import")


def insert(shortname, name, brief, website):
    name = name.replace("(开业)", "")
    sourceId = util.md5str(name)
    sid = parser_db_util.save_company_yitai(shortname, name, 13100, sourceId,
                                            brief)
    logger.info("sid:%s->sourceId:%s", sid, sourceId)
    parser_db_util.save_source_company_name(sid, name, 12010)
    parser_db_util.save_source_company_name(sid, shortname, 12020)
    if website is not None and website.strip() != "":
        website = url_helper.url_normalize(website)
        if website is not None and website != "":
            if website.find("http://") == -1 and website.find("https://") == -1:
Example #17
# -*- coding: utf-8 -*-
__author__ = 'arthur'
# Legacy FA records are flagged via source_company

import os, sys
import time

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, util, db, name_helper

#logger
loghelper.init_logger("set_endorse_tag_patch", stream=True)
logger = loghelper.get_logger("set_endorse_tag_patch")

if __name__ == "__main__":
    logger.info("Start...")
    conn = db.connect_torndb()
    cs = conn.query(
        "select * from source_company where source in "
        "(13100,13101,13102,13103,13104,13300,13301,13800) order by id")
    for c in cs:
        source = c["source"]
        dict = conn.get("select * from dictionary where value=%s", source)
        tag_id = None
        if dict["subTypeValue"] == 1301:
            #FA
            tag_id = 479289
Example #18
import os, sys

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import config
import util
import proxy_pool
import db

#logger
loghelper.init_logger("itunes_trends", stream=True)
logger = loghelper.get_logger("itunes_trends")

#mongo
mongo = db.connect_mongo()
collection = mongo.trend.itunes

collection_itunes = mongo.market.itunes

cnt = 0
total = 0


def request(url, callback):
    # proxy = {'type': 'https', 'anonymity':'high', 'ping':1, 'transferTime':5}
    proxy = {'type': 'http', 'anonymity': 'high'}
Example #19
import os, sys

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_baijia_news", stream=True)
logger = loghelper.get_logger("crawler_baijia_news")

NEWSSOURCE = "Baijia"
RETRY = 3
TYPE = 60001
SOURCE = 13829
URLS = []
CURRENT_PAGE = 1
linkPattern = "/article/\d+"
Nocontents = []
columns = [
    {
        "column": None,
        "max": 2
    },
Example #20
# -*- coding: utf-8 -*-
import os, sys

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate, oss2_helper

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_irui_report", stream=True)
logger = loghelper.get_logger("crawler_irui_report")

NEWSSOURCE = "irui"
RETRY = 3
FILE = "irui_download.pdf"
SOURCE = "艾瑞"
URLS = []
CURRENT_PAGE = 1
Nocontents = []
columns = [
    {
        "column": "business",
        "max": 1
    },
    # {"column": "company", "max": 1},
Example #21
import os, sys

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db, util, extract, url_helper, json, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

# logger
loghelper.init_logger("crawler_sspai_news", stream=True)
logger = loghelper.get_logger("crawler_sspai_news")

# mongo
# mongo = db.connect_mongo()
# collection_news = mongo.article.news

MAX_PAGE_ALL = 50
CURRENT_PAGE = 0

SOURCE = 13814
TYPE = 60001


class SspaiCrawler(BaseCrawler.BaseCrawler):
    def __init__(self):
Example #22
import os, sys
import traceback
from kafka import (KafkaConsumer, KafkaClient, SimpleProducer)
import process_topic_message
import process_topic_company
import process_company_message
import process_investor_message

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, config

#logger
loghelper.init_logger("process_message", stream=True)
logger = loghelper.get_logger("process_message")

# kafka
kafkaConsumer = None
kafkaProducer = None


def init_kafka():
    global kafkaConsumer
    global kafkaProducer
    (url) = config.get_kafka_config()
    # HashedPartitioner is default
    kafkaConsumer = KafkaConsumer("track_message_v2",
                                  group_id="process_topic_message",
                                  bootstrap_servers=[url],
Example #23
# -*- coding: utf-8 -*-
import os, sys
import time

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, util, db, config, name_helper

#logger
loghelper.init_logger("artifact_correct", stream=True)
logger = loghelper.get_logger("artifact_correct")


def weibo():
    id = -1
    conn = db.connect_torndb()
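    # scan unverified weibo artifacts (type=4030) in ascending-id batches of 1000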
    while True:
        ars = conn.query(
            "select * from artifact where type=4030 and verify is null and id>%s order by id limit 1000",
            id)
        if len(ars) == 0:
            break
        for a in ars:
            artifact_id = a["id"]
            # logger.info(a["link"])
            if artifact_id > id:
                id = artifact_id
            link = a["link"]
Example #24
# -*- coding: utf-8 -*-
import os, sys

from gevent import monkey; monkey.patch_all()
from pymongo import MongoClient
import pymongo


reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db, util

#logger
loghelper.init_logger("itunes_aso100", stream=True)
logger = loghelper.get_logger("itunes_aso100")

linkPattern = "/app/rank/appid/\d+"
# Books, games, and newspapers/magazines categories are excluded
cates = [
    {"name":"商务",       "url": "6000"},
    {"name":"商品指南",    "url": "6022"},
    {"name":"教育",       "url": "6017"},
    {"name":"娱乐",       "url": "6016"},
    {"name":"财务",       "url": "6015"},
    {"name":"美食佳饮",    "url": "6023"},
    {"name":"健康健美",    "url": "6013"},
    {"name":"生活",       "url": "6012"},
    {"name":"医疗",       "url": "6020"},
    {"name":"音乐",       "url": "6011"},
Example #25
import os, sys
import pymongo
from distutils.version import LooseVersion
import urllib

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db, util, name_helper
import android

#logger
loghelper.init_logger("baidu", stream=True)
logger = loghelper.get_logger("baidu")

#mongo
mongo = db.connect_mongo()
collection_index = mongo.market.baidu_index
collection_search = mongo.market.baidu_search
collection_market = mongo.market.android_market

NAMES = []

class BaiduSearchCrawler(BaseCrawler.BaseCrawler):
    def __init__(self):
        BaseCrawler.BaseCrawler.__init__(self)

    def is_crawl_success(self,url, content):
Example #26
File: lagou_jd.py  Project: yujiye/Codes
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util'))
import parser_db_util

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import proxy_pool

#logger
loghelper.init_logger("crawler_jd", stream=True)
logger = loghelper.get_logger("crawler_jd")
cnt = 0
SJ = []

#mongo
mongo = db.connect_mongo()
collection_job = mongo.job.job
collection_company = mongo.job.company


class LagouCrawler(BaseCrawler.BaseCrawler):
    def __init__(self, timeout=20):
        BaseCrawler.BaseCrawler.__init__(self, timeout=timeout)

    # Implementation
Example #27
import os, sys

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_jingmeiti_news", stream=True)
logger = loghelper.get_logger("crawler_jingmeiti_news")

NEWSSOURCE = "jingmeiti"
RETRY = 3
TYPE = 60001
SOURCE = 13843
URLS = []
CURRENT_PAGE = 1
linkPattern = "www.jingmeiti.com/archives/\d+"
Nocontents = []
columns = [
    {
        "column": "new",
        "max": 3
    },
Example #28
import os, sys

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util

#logger
loghelper.init_logger("crawler_baidu", stream=True)
logger = loghelper.get_logger("crawler_baidu")


class NewsDownloader:
    def __init__(self,
                 TYPE=60001,
                 SOURCE=13080,
                 RETRY=20,
                 CATEGORY=None,
                 FORCE=False):
        self.TYPE = TYPE
        self.SOURCE = SOURCE
        self.RETRY = RETRY
        self.CATEGORY = CATEGORY
        self.FORCE = FORCE
Example #29
import os, sys
import datetime

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, extract, db, util, url_helper, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_ifanr_news", stream=True)
logger = loghelper.get_logger("crawler_ifanr_news")

#mongo
mongo = db.connect_mongo()
collection_news = mongo.article.news

newsid = []
b_id = ""

SOURCE = 13811
TYPE = 60001
dt = datetime.date.today()

columns = [
    {
Example #30
import os, sys
from lxml import html
from pyquery import PyQuery as pq

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import GlobalValues, crawler_util

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, util, db

#logger
loghelper.init_logger("crawler_lagou_company", stream=True)
logger = loghelper.get_logger("crawler_lagou_company")

cnt = 0


class LagouCrawler(BaseCrawler.BaseCrawler):
    def __init__(self, timeout=15):
        BaseCrawler.BaseCrawler.__init__(self, timeout=timeout)

    # Implementation of the base-class hook
    def is_crawl_success(self, url, content):
        if content.find("</html>") == -1:
            return False
        d = pq(html.fromstring(content.decode("utf-8")))
        title = d('head> title').text().strip()