Exemplo n.º 1
0
def main():
    """Entry point of the baidunews crawler.

    Calls ``usage()`` when ``sys.argv`` does not hold exactly three entries.
    ``sys.argv[2]`` is taken as the directory containing
    ``<MODULENAME>.conf``.  The function loads the system configuration and
    the module configuration, fills the module-level globals
    (``baidunewsConf``, ``Channels``, ``baidunewsCrawlerQueue``,
    ``CurrentTime``, ``TenDaysAgoTime``) and finally calls ``getNewsList``
    once per channel.
    """
    global baidunewsConf, baidunewsCrawlerQueue, Channels, CurrentTime, TenDaysAgoTime

    if len(sys.argv) != 3:
        usage()

    module_conf_path = os.path.join(sys.argv[2], MODULENAME + ".conf")
    stamp_format = "%Y-%m-%d %H:%M:%S"

    # Step 1: system-wide configuration.
    system_conf = appSystemVars.appSystemConf
    system_conf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = system_conf.getCrawlerDB()
    resultManager = system_conf.getResultManager()
    DBPC = system_conf.getDBPC()
    logConfigger = system_conf.getLogger()
    mq = system_conf.getMQ()
    period_days = system_conf.getTimePeriod()

    # Step 2: baidunews configuration; the loaded config object itself is
    # what gets handed to getChannels().
    baidunewsConf = loadConfig(module_conf_path)
    Channels = getChannels(baidunewsConf)
    baidunewsCrawlerQueue = appCrawlerQueue(
        mq.getURL(), mq.getExchange(), mq.getRoutingKey(), mq.getHostQueue())

    # Timestamps: "now" and "now minus the configured number of days".
    CurrentTime = time.strftime(stamp_format, time.localtime())
    window_start = datetime.datetime.now() - datetime.timedelta(
        days=int(period_days))
    TenDaysAgoTime = window_start.strftime(stamp_format)

    # Step 3: crawl every channel.
    print("Start baidunewsCrawler ...")
    for entry in Channels:
        getNewsList(entry)
Exemplo n.º 2
0
def main():
    """Entry point of the netease crawler.

    Calls ``usage()`` when ``sys.argv`` does not hold exactly three entries.
    ``sys.argv[2]`` is the directory containing ``<MODULENAME>.conf``.
    Populates the module-level globals (``neteaseConf``, ``ChannelIds``,
    ``neteaseCrawlerQueue``, ``CurrentTime``, ``TenDaysAgoTime``) and then
    calls ``getChannelNewsList`` for each entry of ``ChannelIds``.
    """
    global neteaseConf, ChannelIds, neteaseCrawlerQueue, CurrentTime, TenDaysAgoTime

    if len(sys.argv) != 3:
        usage()

    module_conf_path = os.path.join(sys.argv[2], MODULENAME + ".conf")
    stamp_format = "%Y-%m-%d %H:%M:%S"

    # Step 1: system-wide configuration.
    system_conf = appSystemVars.appSystemConf
    system_conf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = system_conf.getCrawlerDB()
    resultManager = system_conf.getResultManager()
    DBPC = system_conf.getDBPC()
    logConfigger = system_conf.getLogger()
    mq = system_conf.getMQ()
    period_days = system_conf.getTimePeriod()

    # Step 2: netease configuration and the outgoing crawler queue.
    neteaseConf = loadConfig(module_conf_path)
    ChannelIds = neteaseConf["ChannelIds"]
    neteaseCrawlerQueue = appCrawlerQueue(
        mq.getURL(), mq.getExchange(), mq.getRoutingKey(), mq.getHostQueue())

    # Timestamps: "now" and "now minus the configured number of days".
    CurrentTime = time.strftime(stamp_format, time.localtime())
    window_start = datetime.datetime.now() - datetime.timedelta(
        days=int(period_days))
    TenDaysAgoTime = window_start.strftime(stamp_format)

    # Step 3: crawl each configured channel id.
    # (disabled) ChannelIds = {"video":"T1457068979049"}
    for channel_key in ChannelIds:
        getChannelNewsList(channel_key)
Exemplo n.º 3
0
def main():
    """Entry point of the weibo crawler, restricted to a fixed channel list.

    Calls ``usage()`` when ``sys.argv`` does not hold exactly three entries.
    ``sys.argv[2]`` is the directory containing ``<MODULENAME>.conf``.
    Populates the module-level globals (``config``, ``Channels``,
    ``weiboCrawlerQueue``, ``weibolinkQueue``, ``CurrentTime``,
    ``TenDaysAgoTime``), then calls ``getNewsList`` for every channel whose
    ``'name'`` is exactly one of the whitelisted names, and finally prints
    the elapsed wall-clock seconds.
    """
    global config, weiboCrawlerQueue, Channels, CurrentTime, TenDaysAgoTime, weibolinkQueue
    if len(sys.argv) != 3:
        usage()

    config_dir = sys.argv[2]
    configFile = os.path.join(config_dir, MODULENAME + ".conf")

    #1.load system config
    appConf = appSystemVars.appSystemConf
    appConf.loadConfigBuffer(MAINCONFIGBUFFER)
    # NOTE(review): the next four results are not used in this function; the
    # calls are kept in case the getters initialise shared state - confirm.
    crawlerDB = appConf.getCrawlerDB()
    resultManager = appConf.getResultManager()
    DBPC = appConf.getDBPC()
    logConfigger = appConf.getLogger()

    ampqer = appConf.getMQ()
    timeperiod = appConf.getTimePeriod()

    tracksource = appConf.getTrackSource()
    tracksourceHost = tracksource.getHost()
    tracksourcePort = tracksource.getPort()

    #2. load weibo config
    Channels = getweiboChannels(tracksourceHost, tracksourcePort)
    config = ConfigParser.ConfigParser()
    config.read(configFile)
    # Both queues share URL, exchange and routing key; they differ only in
    # the queue name (host queue vs. link queue).
    weiboCrawlerQueue = appCrawlerQueue(
        ampqer.getURL(), ampqer.getExchange(), ampqer.getRoutingKey(),
        ampqer.getHostQueue())
    weibolinkQueue = appCrawlerQueue(
        ampqer.getURL(), ampqer.getExchange(), ampqer.getRoutingKey(),
        ampqer.getLinkQueue())

    CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    TenDaysAgo = (datetime.datetime.now() -
                  datetime.timedelta(int(timeperiod)))
    TenDaysAgoTime = TenDaysAgo.strftime("%Y-%m-%d %H:%M:%S")

    #start crawler
    appLogger.info("Start weiboCrawler ...")

    # Whitelist of channel names.  The names are split out so the test below
    # is an exact-name match; testing ``name in "<one long string>"`` is a
    # substring test and would also accept an empty name or any fragment of
    # the list.  A tuple (not a set) keeps membership based on == only.
    wanted_names = tuple(
        '新华网 央视新闻  北京日报  新浪视频 中安在线 人民日报 重庆晚报  泰州晚报 北京晨报 南通网'.split())

    start = time.time()
    for channel in Channels:
        if channel['name'] in wanted_names:
            # Single-argument print(...) prints the same under Python 2.
            print(channel['name'])
            getNewsList(channel)
    end = time.time()
    #crawl timeline
    print(end - start)
Exemplo n.º 4
0
def main():
    """Entry point of the tencent crawler.

    Calls ``usage()`` when ``sys.argv`` does not hold exactly three entries.
    ``sys.argv[2]`` is the directory containing ``<MODULENAME>.conf``.
    Populates the module-level globals (``tencentConf``, ``ChannelIds``,
    ``tencentCrawlerQueue``, ``tencentlinkQueue``, ``CurrentTime``,
    ``TenDaysAgoTime``) and then calls ``getChannelNewsList`` for each entry
    of ``ChannelIds``.
    """
    global tencentConf, tencentCrawlerQueue, ChannelIds, CurrentTime, TenDaysAgoTime, tencentlinkQueue

    if len(sys.argv) != 3:
        usage()

    module_conf_path = os.path.join(sys.argv[2], MODULENAME + ".conf")
    stamp_format = "%Y-%m-%d %H:%M:%S"

    # Step 1: system-wide configuration.
    system_conf = appSystemVars.appSystemConf
    system_conf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = system_conf.getCrawlerDB()
    resultManager = system_conf.getResultManager()
    DBPC = system_conf.getDBPC()
    logConfigger = system_conf.getLogger()
    mq = system_conf.getMQ()
    period_days = system_conf.getTimePeriod()

    # Step 2: tencent configuration.
    tencentConf = loadConfig(module_conf_path)
    ChannelIds = tencentConf["ChannelIds"]

    # Earlier construction from the module config, left disabled:
    #tencentCrawlerQueue = appCrawlerQueue (tencentConf["amqpurl"],tencentConf["request_queue"], tencentConf["request_queue"], tencentConf["request_queue"])
    #tencentlinkQueue = appCrawlerQueue (tencentConf["amqpurl"],tencentConf["outerlink_queue"], tencentConf["outerlink_queue"], tencentConf["outerlink_queue"])
    # The two queues differ only in the queue name (host vs. link queue).
    tencentCrawlerQueue = appCrawlerQueue(
        mq.getURL(), mq.getExchange(), mq.getRoutingKey(), mq.getHostQueue())
    tencentlinkQueue = appCrawlerQueue(
        mq.getURL(), mq.getExchange(), mq.getRoutingKey(), mq.getLinkQueue())

    # Timestamps: "now" and "now minus the configured number of days".
    CurrentTime = time.strftime(stamp_format, time.localtime())
    window_start = datetime.datetime.now() - datetime.timedelta(
        days=int(period_days))
    TenDaysAgoTime = window_start.strftime(stamp_format)

    # Step 3: crawl each configured channel id.
    for channel_key in ChannelIds:
        getChannelNewsList(channel_key)
Exemplo n.º 5
0
def main():
    """Entry point of the wechat crawler, restricted to a fixed channel list.

    Calls ``usage()`` when ``sys.argv`` does not hold exactly three entries.
    ``sys.argv[2]`` is the directory containing ``<MODULENAME>.conf``.
    The channel list is fetched via ``getWechatChannels``; if that raises,
    the traceback is logged and the channels are read from the module config
    file instead.  ``getNewsList`` is then called for every channel whose
    ``'name'`` is exactly one of the whitelisted names, sleeping 300 seconds
    after each crawled channel.  The elapsed wall-clock seconds are printed
    at the end.
    """
    global wechatConf, wechatCrawlerQueue, Channels, CurrentTime, TenDaysAgoTime
    if len(sys.argv) != 3:
        usage()

    config_dir = sys.argv[2]
    configFile = os.path.join(config_dir, MODULENAME + ".conf")

    #1.load system config
    appConf = appSystemVars.appSystemConf
    appConf.loadConfigBuffer(MAINCONFIGBUFFER)
    # NOTE(review): the next four results are not used in this function; the
    # calls are kept in case the getters initialise shared state - confirm.
    crawlerDB = appConf.getCrawlerDB()
    resultManager = appConf.getResultManager()
    DBPC = appConf.getDBPC()
    logConfigger = appConf.getLogger()

    ampqer = appConf.getMQ()
    timeperiod = appConf.getTimePeriod()

    tracksource = appConf.getTrackSource()
    tracksourceHost = tracksource.getHost()
    tracksourcePort = tracksource.getPort()

    #2. load wechat config
    try:
        Channels = getWechatChannels(tracksourceHost, tracksourcePort)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer intercepted and turned into a fallback.
        appLogger.error(traceback.format_exc())
        # wechatConf is only assigned on this fallback path.
        wechatConf = loadConfig(configFile)
        Channels = wechatConf['channels']
    wechatCrawlerQueue = appCrawlerQueue(ampqer.getURL(), ampqer.getExchange(),
                                         ampqer.getRoutingKey(),
                                         ampqer.getHostQueue())
    CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    TenDaysAgo = (datetime.datetime.now() -
                  datetime.timedelta(int(timeperiod)))
    TenDaysAgoTime = TenDaysAgo.strftime("%Y-%m-%d %H:%M:%S")

    #start crawler
    appLogger.info("Start wechatCrawler ...")
    # Single-argument print(...) prints the same under Python 2.
    print(len(Channels))
    start = time.time()

    channel_list = '太行日报 新华视点 淮北日报 新华视界 新华国际 微黔江 天津日报 人民日报 上海观察 深圳特区报'
    # Split the whitelist so the test below is an exact-name match; testing
    # ``name in channel_list`` directly is a substring test and would also
    # accept an empty name or any fragment of the list.  A tuple (not a set)
    # keeps membership based on == only.
    wanted_names = tuple(channel_list.split())
    for channel in Channels:
        if channel['name'] in wanted_names:
            #print channel['name']
            getNewsList(channel)
            # Pause 300 seconds after each crawled channel.
            time.sleep(300)
    end = time.time()
    #crawl timeline
    print(end - start)
Exemplo n.º 6
0
def main():
    """Entry point of the sohu crawler.

    Calls ``usage()`` when ``sys.argv`` does not hold exactly three entries.
    ``sys.argv[2]`` is the directory containing ``<MODULENAME>.conf``.
    Populates the module-level globals (``sohuConf``, ``ChannelIds``,
    ``sohuCrawlerQueue``, ``CurrentTime``, ``TenDaysAgoTime``).  For each
    channel id it looks at the integer value stored under that id:
    36 additionally triggers ``getVideoList()``, 47 or 54 additionally
    triggers ``getAlbumList(channelid)``; ``getNewsList(channelid)`` is
    called for every channel regardless.
    """
    global sohuConf, sohuCrawlerQueue, ChannelIds, CurrentTime, TenDaysAgoTime

    if len(sys.argv) != 3:
        usage()

    module_conf_path = os.path.join(sys.argv[2], MODULENAME + ".conf")
    stamp_format = "%Y-%m-%d %H:%M:%S"

    # Step 1: system-wide configuration.
    system_conf = appSystemVars.appSystemConf
    system_conf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = system_conf.getCrawlerDB()
    resultManager = system_conf.getResultManager()
    DBPC = system_conf.getDBPC()
    logConfigger = system_conf.getLogger()
    mq = system_conf.getMQ()
    period_days = system_conf.getTimePeriod()

    # Step 2: sohu configuration.
    sohuConf = loadConfig(module_conf_path)
    ChannelIds = sohuConf["ChannelIds"]
    # Earlier construction from the module config, left disabled:
    #sohuCrawlerQueue = appCrawlerQueue (sohuConf["amqpurl"],sohuConf["request_queue"], sohuConf["request_queue"], sohuConf["request_queue"])
    sohuCrawlerQueue = appCrawlerQueue(
        mq.getURL(), mq.getExchange(), mq.getRoutingKey(), mq.getHostQueue())

    # Timestamps: "now" and "now minus the configured number of days".
    CurrentTime = time.strftime(stamp_format, time.localtime())
    window_start = datetime.datetime.now() - datetime.timedelta(
        days=int(period_days))
    TenDaysAgoTime = window_start.strftime(stamp_format)

    # Step 3: start crawling.
    print("Start sohuCrawler ...")
    for channelid in ChannelIds:
        channel_code = int(ChannelIds[channelid])
        # videos
        if channel_code == 36:
            getVideoList()
        # albums
        if channel_code in (47, 54):
            getAlbumList(channelid)
        # news (always)
        getNewsList(channelid)
Exemplo n.º 7
0
def main():
    """Entry point of the xinhua crawler.

    Calls ``usage()`` when ``sys.argv`` does not hold exactly three entries.
    ``sys.argv[2]`` is the directory containing ``<MODULENAME>.conf`` (the
    path is built but not read in this function).  Populates the
    module-level globals ``xinhuaCrawlerQueue``, ``CurrentTime`` and
    ``TenDaysAgoTime``, then calls ``getNewsList`` three times, with the
    first ten characters of ``DateFormat(now - i days)`` for i = 0, 1, 2.
    """
    global xinhuaConf, xinhuaCrawlerQueue, Channels, CurrentTime, TenDaysAgoTime

    if len(sys.argv) != 3:
        usage()

    module_conf_path = os.path.join(sys.argv[2], MODULENAME + ".conf")
    stamp_format = "%Y-%m-%d %H:%M:%S"

    # Step 1: system-wide configuration.
    system_conf = appSystemVars.appSystemConf
    system_conf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = system_conf.getCrawlerDB()
    resultManager = system_conf.getResultManager()
    DBPC = system_conf.getDBPC()
    logConfigger = system_conf.getLogger()
    mq = system_conf.getMQ()
    period_days = system_conf.getTimePeriod()

    track_source = system_conf.getTrackSource()
    track_host = track_source.getHost()
    track_port = track_source.getPort()

    # Step 2: outgoing crawler queue.
    xinhuaCrawlerQueue = appCrawlerQueue(
        mq.getURL(), mq.getExchange(), mq.getRoutingKey(), mq.getHostQueue())

    # Timestamps: "now" and "now minus the configured number of days".
    CurrentTime = time.strftime(stamp_format, time.localtime())
    window_start = datetime.datetime.now() - datetime.timedelta(
        days=int(period_days))
    TenDaysAgoTime = window_start.strftime(stamp_format)

    # Step 3: crawl today and the two preceding days.
    appLogger.info("Start xinhuaCrawler ...")
    seconds_per_day = 3600 * 24
    for days_back in range(3):
        # presumably a YYYY-MM-DD prefix - confirm against DateFormat
        videodate = DateFormat(
            int(time.time()) - seconds_per_day * days_back)[:10]
        getNewsList(videodate)