# -*- coding: utf-8 -*- import logging, json, time, datetime, requests, re, random, socket from lxml import etree from bs4 import BeautifulSoup from context.context import Context join_path = Context().get("pathutil.join_path") correct_link = Context().get("pathutil.correct_link") fmt_time = Context().get("datetimeutil.fmt_time") local2utc = Context().get("datetimeutil.local2utc") def get_web_data(url, data=None, headers={}, proxies={}, allow_redirects=True, timeout=None): count = 0 html_stream = None while count < 2: try: if data is not None: html_stream = requests.post(url, timeout=timeout, data=data, headers=headers, proxies=proxies, allow_redirects=allow_redirects)
# -*- coding: utf-8 -*- from uuid import uuid1 from django.conf import settings from context.context import Context ContentModel = Context().get("search.ContentModel") CassandraQueryApi = Context().get("CassandraQueryApi") class SearchArticleModel(ContentModel): """docstring for SearchArticleModel""" #TYPE = "zjld.article" TYPE = "zjld.search" FIELDS = { "type": u"元搜索", "author": u"", "publisher": u"", "title": u"", "content": u"", "url": u"", "key": u"", } def __init__(self, dct={}): super(SearchArticleModel, self).__init__(dct) def find_dup(self): dup = []
# -*- coding: utf-8 -*- from uuid import uuid1 from django.conf import settings from context.context import Context ContentModel = Context().get("zjld.ContentModel") CassandraQueryApi = Context().get("CassandraQueryApi") class ZjldArticleModel(ContentModel): """docstring for ZjldArticleModel""" TYPE = "zjld.article" FIELDS = { "type": u"文章", "author": u"", "publisher": u"", "title": u"", "content": u"", "url": u"", } def __init__(self, dct={}): super(ZjldArticleModel, self).__init__(dct) def find_dup(self): dup = [] if self.get('url'): cql = """SELECT * FROM %s WHERE url='%s' LIMIT 1""" \
# -*- coding: utf-8 -*- import re from context.context import Context extract_key = Context().get("utils.extract_key") _CHAR2NUM = { u"0": 0, u"1": 1, u"2": 2, u"3": 3, u"4": 4, u"5": 5, u"6": 6, u"7": 7, u"8": 8, u"9": 9, u"零": 0, u"一": 1, u"壹": 1, u"二": 2, u"贰": 2, u"两": 2, u"三": 3, u"叁": 3, u"四": 4, u"肆": 4, u"五": 5, u"伍": 5, u"六": 6,
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import requests
import datetime
import json
#from datetime import datetime
import time
from bs4 import BeautifulSoup
from lxml import etree
from context.context import Context

fmt_time = Context().get("datetimeutil.fmt_time")


def get_urls_re(homepage, time = 10, cookie=''):
    """Fetch ``homepage`` with at most two attempts.

    :param homepage: URL to GET.
    :param time: per-request timeout in seconds (parameter name shadows
        the ``time`` module inside this function; kept for caller
        compatibility).
    :param cookie: forwarded to ``requests`` as ``cookies`` -- callers
        appear to pass '' or a dict; TODO confirm a dict is expected.
    :returns: the ``requests.Response`` on success, or ``None`` when
        both attempts fail.
    """
    html_stream = None
    count = 0
    while count < 2:
        try:
            html_stream = requests.get(homepage, cookies=cookie,
                                       timeout=time)
        except requests.RequestException:
            # Retry only on network/HTTP failures; the original bare
            # ``except`` also swallowed KeyboardInterrupt/SystemExit.
            count += 1
        else:
            break
    return html_stream
# -*- coding: utf-8 -*- import time import os import signal import logging from django.conf import settings from context.context import Context Daemon = Context().get("utils.Daemon") RedisQueryApi = Context().get("RedisQueryApi") Handler = Context().get("Handler") _CRAWLER_TYPES = {} _TERMINATING = False inject_logger = logging.getLogger("crawler.inject") class CrawlerDaemon(Daemon): """ 注入任务服务的类,继承了Daemon类。 """ def __init__(self, CRAWLER_PID): super(CrawlerDaemon, self).__init__(pidfile=CRAWLER_PID) def run(self): signal.signal(signal.SIGTERM, self.term_handler) #将正常终止信号绑定自定义方法。 print "jobtracker pid=%s start done." % os.getpid() inject_logger.info("jobtracker pid=%s START !" % os.getpid())
# -*- coding: utf-8 -*- import re from context.context import Context join_path = Context().get("pathutil.join_path") Field = Context().get("Field") Url = Context().get("Url") ArticleContentCrawler = Context().get("ArticleContentCrawler") FatherCrawler = Context().get("FatherCrawler") is_url = Context().get("htmlutil.is_url") class AqsiqCrawler(FatherCrawler): type = "aqsiq.news" item = Field(name="item", path=r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')") url = Field(name="key", path=r".*", type=Url) province = Field(name="province", value=u"全国") publisher = Field(name="publisher", value=u"国家质量监督检验检疫总局") xpath = { 'title': "//tr/td[@align='center']/h1", 'pubtime': "//tr/td[@align='center']/h1/../../following-sibling::tr[1]/td/text()", 'content': "//div[@class='TRS_Editor']", } child = ArticleContentCrawler export_fields = [province, publisher]
#!/usr/bin/python # -*- coding: utf-8 -*- import sys reload(sys) sys.setdefaultencoding('utf-8') import time, random, re from bs4 import BeautifulSoup from urllib import quote, unquote from context.context import Context WeiboArticleModel = Context().get("WeiboArticleModel") WeiboHotModel = Context().get("WeiboHotModel") SearchArticleModel = Context().get("SearchArticleModel") Crawler = Context().get("Crawler") export = Context().get("export") from crawlerimpl.weixin.processdata import HandleUrl, new_time, clear_label, \ HandleContent, get_urls_re, get_charset, change_to_json, clear_space def _get_url(url): html_stream = get_urls_re(url, time=6) if True: html_stream.encoding = "utf-8" else: html_stream.encoding = get_charset(html_stream.text) return html_stream class FirstCrawler(Crawler):
# -*- coding: utf-8 -*- import os import signal import time import logging from threading import Timer from django.conf import settings from context.context import Context Crawler = Context().get("Crawler") Handler = Context().get("Handler") get_exception_info = Context().get("get_exception_info") fetch_logger = logging.getLogger("crawler.fetch") _RUNNING_CRAWLER = None _TERMINATING = False def procedure(): """ 一个执行任务服务进程所需要做的事。 """ signal.signal(signal.SIGTERM, service_term_handler) #将正常终止信号与自定义方法绑定。 signal.signal(signal.SIGALRM, task_term_handler) #将闹钟信号与自定义方法绑定。 start_time = time.time() print "tasktracker pid=%s start done." % os.getpid() fetch_logger.info("tasktracker pid=%s START !" % os.getpid()) while (True if settings.PROCESS_TIMEOUT > 0 else
# -*- coding: utf-8 -*- from datetime import datetime from uuid import uuid1 from context.context import Context ContentModel = Context().get("ecommerce.ContentModel") CassandraQueryApi = Context().get("CassandraQueryApi") class EcBasicModel(ContentModel): TYPE = "ecommerce.basic" FIELDS = { "source_id": u"", "title": u"", "adword": u"", "version": u"", "original_price": 0.0, "history_price": {}, "price": 0.0, "score": 0, "summary": {}, "address": u"", "status": 0, } INDEXES = [ { "key": [("source", 1), ("source_id", 1)], "unique": True
# -*- coding: utf-8 -*-
import sys
import os
import signal
from django.conf import settings
from context.context import Context

_create_child = Context().get("processutil._create_child")
procedure = Context().get("procedure")


def start():
    """Fork TASKTRACKER_COUNT worker processes running ``procedure`` and
    record their pids, one per line, in the CRAWLER_TASK_PID file."""
    pid_file = open(settings.CRAWLER_TASK_PID, "w+")
    try:
        for i in range(settings.TASKTRACKER_COUNT):
            # _create_child returns a one-entry {pid: ...} mapping.
            pid = list(_create_child(procedure, [], {}).keys())[0]
            pid_file.write(str(pid) + "\n")
    finally:
        pid_file.close()


def stop():
    """Send SIGTERM to every pid recorded by start().

    A pid whose process has already exited is skipped instead of
    aborting the loop: the original handler referenced the undefined
    names ``self``/``pid`` and raised NameError on any kill failure.
    """
    pid_file = open(settings.CRAWLER_TASK_PID, "r")
    try:
        pids = pid_file.readlines()
    finally:
        pid_file.close()
    for line in pids:
        try:
            os.kill(int(line.strip()), signal.SIGTERM)
        except OSError:
            # Process already gone -- nothing to clean up here.
            pass
import urllib3
import urlparse
import urllib
import time
import random
from scrapy.selector import HtmlXPathSelector
from context.context import Context

unix_time = Context().get("datetimeutil.unix_time")

# Per-host minimum delay between requests, in seconds.
_SITES_RATE_LIMIT = {
    "mp3.easou.com": 1.0,
    'music.douban.com': 2.0,
    'douban.fm': 2.0,
    'music.baidu.com': 2.0,
}
# host -> timestamp of the most recent request; used with the table above.
_SITES_LAST_ACCESS = {}
_NUM_POOLS = 10  # connection-pool count for urllib3
_TIMEOUT = 30    # default request timeout, seconds
_DEFAULT_HEADER = {}
#_DEFAULT_HEADER = {
#    'Accept' :"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    'Accept-Charset' : "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
#    'Accept-Encoding' : "gzip,deflate,sdch",
#    'Accept-Language' : "en-US,en;q=0.8",
#    'Cache-Control' : 'max-age=0',
#    'Connection' :'keep-alive',
#    'User-Agent' :"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.162 Safari/535.19"
# -*- coding: utf-8 -*- import re from lxml import etree, html from datetime import datetime from context.context import Context Crawler = Context().get("Crawler") export = Context().get("export") Handler = Context().get("Handler") SearchArticleModel = Context().get("SearchArticleModel") ZjldArticleModel = Context().get("ZjldArticleModel") Readability = Context().get("Readability") htmlutil = Context().get("htmlutil") clear_space = Context().get("textutil.clear_space") new_time = Context().get("datetimeutil.new_time") fmt_time = Context().get("datetimeutil.fmt_time") local2utc = Context().get("datetimeutil.local2utc") Field = Context().get("Field") Url = Context().get("Url") join_path = Context().get("pathutil.join_path") getTag = Context().get("bosonutil.getTag") PROXIES = { "http": "http://192.168.1.165:8888", "https": "http://192.168.1.191:8888" } def find_field(name, fields): for i in fields:
# -*- coding: utf-8 -*- from uuid import uuid1 import time from datetime import datetime from context.context import Context] ContentModel = Context().get("weibo.ContentModel") CassandraQueryApi = Context().get("CassandraQueryApi") RedisQueryApi = Context().get("RedisQueryApi") class WeiboArticleModel(ContentModel): """docstring for WeiboArticleModel""" TYPE = "zjld.weibo" FIELDS = { "type": u"微博", "id": uuid1(), "author": u"", "title": u"", "subtitle": [], "content": u"", "url": u"", "imgurl":[], "source": u"", "origin_source": u"", "pubtime": datetime.utcfromtimestamp(0), "crtime": datetime.now(), "publisher": u"",
# -*- coding: utf-8 -*- import json import time import logging from datetime import datetime, timedelta from django.conf import settings from django.db import transaction from context.context import Context CrawlerConf = Context().get("CrawlerConf") Task = Context().get("Task") RedisQueryApi = Context().get("RedisQueryApi") time2str = Context().get("datetimeutil.time2str") inject_logger = logging.getLogger("crawler.inject") fetch_logger = logging.getLogger("crawler.fetch") _CRAWLER_CONF = CrawlerConf() class Status: """ 任务状态。 """ NotStart = 0 Running = 1 Succeed = 2 Failed = -1 Canceling = -2
# -*- coding: utf-8 -*- import sys root_mod = '/Users/liujiasheng/workspace/crawler/crawler' sys.path.append(root_mod) import django, os os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.development") django.setup() import re from datetime import datetime from apps.base.models import ScarletOnsell from context.context import Context htmlutil = Context().get("htmlutil") Url = Context().get("Url") SearchContentCrawler = Context().get("SearchContentCrawler") FatherCrawler = Context().get("FatherCrawler") Field = Context().get("Field") Crawler = Context().get("Crawler") class BuffOnsellCrawler(Crawler): type = "buff.onsell" def __init__(self, task): pass # super(BuffOnsellCrawler, self).__init__(task) def crawl(self):
# -*- coding: utf-8 -*- import os import socket import signal import time import logging import json import re from datetime import datetime, timedelta from context.context import Context ModelBase = Context().get("ModelBase") str2time = Context().get("datetimeutil.str2time") class Crawler(object): """ 业务爬虫的超类,所有业务爬虫都是该类的子类。 每个业务爬虫都必须有一个唯一标识符,该标识符为名为type的成员属性。 同时要重写crawl()方法。 """ type = "base.crawler" def __init__(self, task): self.task = task self.key = None self.data = None
'''
@author: Yu
'''
import time
import sys
import traceback
import mailutil
from context.context import Context

Daemon = Context().get("utils.Daemon")


class ServiceDefinition(object):
    """Describes one monitored service: a health-check callable plus
    check interval and retry bookkeeping."""

    def __init__(self, check_func, name="Service", check_interval=180, retries=3):
        # The health check must be callable; fail fast otherwise.
        if not callable(check_func):
            raise TypeError
        self.check_func = check_func
        self.name = name
        self.check_interval = check_interval  # seconds between checks
        self.retries = retries                # failures tolerated before alerting
        self.failures = 0                     # consecutive-failure counter
        self.last_check = None                # timestamp of the last check run

    def check(self):
        # Run the health check; a raised exception signals failure.
        self.check_func()
# -*- coding: utf-8 -*- import sys root_mod = '/home/jshliu/Project/zjld/fix/common/crawler' sys.path.append(root_mod) import django, os os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.development"); django.setup() import re from datetime import datetime from context.context import Context Url = Context().get("Url") SearchContentCrawler = Context().get("SearchContentCrawler") FatherCrawler = Context().get("FatherCrawler") Field = Context().get("Field") class BaiduCrawler(FatherCrawler): """ 百度新闻搜索爬虫,继承了通用一级爬虫类。 """ type = "baidu.news" #该爬虫的唯一标识符。 child = SearchContentCrawler #指定生成的任务由哪一爬虫执行。 item = Field(name="item", path="//div[@id='content_left']/div/div[@class='result']") #需要解析的字段,name为‘item’为特殊含义,不能被占用。 pubtime = Field(name="pubtime", path="div//p[@class='c-author']/text()", type=datetime)
def __init__(self):
    """Create the object and attach a SparkSession."""
    # NOTE(review): ``sc`` is never used after this call -- presumably
    # Context() has required initialisation side effects; confirm before
    # removing.
    sc = Context()
    # Reuse the active SparkSession if one exists, otherwise create one.
    self.__session = SparkSession.builder.getOrCreate()
def load(self):
    """Read the configured source path as an RDD of text lines."""
    spark_context = Context().get_context()
    return spark_context.textFile(self.__config.get_source_path())
# -*- coding: utf-8 -*- import copy from context.context import Context convert = Context().get("typeutil.convert") class ModelMeta(type): def __init__(self, name, bases, dct): fields = dct.get('FIELDS', {}) base = bases[0] while base != object: for k, v in base.__dict__.get('FIELDS', {}).iteritems(): fields[k] = v base = base.__base__ dct['FIELDS'] = fields indexes = dct.get('INDEXES', []) base = bases[0] while base != object: indexes.extend(base.__dict__.get('INDEXES', [])) base = base.__base__ dct['INDEXES'] = indexes type.__init__(self, name, bases, dct) class ModelBase(dict): __metaclass__ = ModelMeta
# -*- coding: utf-8 -*-
import sys
from django.conf import settings
from context.context import Context

CrawlerDaemon = Context().get("CrawlerDaemon")


def run(*args):
    """
    Entry point for the job-injection daemon process.

    ``args[0]`` selects the action: 'start' launches the daemon and
    'stop' shuts it down; any other value is silently ignored.
    """
    jobtracker = CrawlerDaemon(settings.CRAWLER_JOB_PID)
    actions = {
        'start': jobtracker.start,
        'stop': jobtracker.stop,
    }
    handler = actions.get(args[0])
    if handler is not None:
        handler()
# -*- coding: utf-8 -*- import logging import copy import time from uuid import uuid1 from datetime import datetime from context.context import Context unix_time = Context().get("datetimeutil.unix_time") ModelBase = Context().get("ModelBase") CassandraQueryApi = Context().get("CassandraQueryApi") import_logger = logging.getLogger("crawler.import") class ContentModel(ModelBase): TYPE = "base.content" FIELDS = { "id": uuid1(), "source": u"", "origin_source": u"", "pubtime": datetime.utcfromtimestamp(0), "crtime": datetime.now(), "crtime_int": int(time.time() * 1000000), "province": u"", "city": u"", "district": u"", "tag": "", "comment": {},
# -*- coding: utf-8 -*- import logging import copy from uuid import uuid1 from datetime import datetime from context.context import Context unix_time = Context().get("datetimeutil.unix_time") ModelBase = Context().get("ModelBase") _LOGGER = logging.getLogger("ecommerceimport") class ContentModel(ModelBase): TYPE = "base.content" FIELDS = { "id": uuid1(), "source": u"", "source_level": {}, "first_level": u"", "second_level": u"", "third_level": u"", "fourth_level": u"", "fifth_level": u"", "province": u"", "city": u"", "district": u"", "comment": {} }
#coding=utf-8 from django.contrib import admin from context.context import Context Task = Context().get("Task") class TaskAdmin(admin.ModelAdmin): list_display = ('crawler', 'key', 'update_time', 'status', 'interval') list_editable = ('crawler', 'key', 'status', 'interval') list_filter = ('crawler', 'status', 'category', 'application', 'interval', 'timeout') fields = ('key', 'data', 'producer_id', 'category', 'application', 'crawler', \ 'status', 'interval', 'timeout', 'last_run', 'next_run', 'update_time', 'create_time') readonly_fields = ('last_run', 'update_time', 'create_time') ordering = ('update_time', '-key') search_fields = ('key', ) admin.site.register(Task, TaskAdmin)
# -*- coding: utf-8 -*- from uuid import uuid1 from context.context import Context ContentModel = Context().get("weixin.ContentModel") CassandraQueryApi = Context().get("CassandraQueryApi") class WeixinArticleModel(ContentModel): """docstring for WeixinArticleModel""" #TYPE = "zjld.article" TYPE = "zjld.weixin" FIELDS = { "type": u"微信", "author": u"", "publisher": u"", "title": u"", "content": u"", "url": u"", # "province": u"", # "city": u"", # "district": u"" } def __init__(self, dct={}): super(WeixinArticleModel, self).__init__(dct) def find_dup(self):