def __init__(self, thcnt, log='spider.log'):
    Spider.__init__(self, thcnt)
    self.pagestore = None
    self.job_spliter = None
    logging.basicConfig(filename=os.path.join(os.getcwd(), log),
                        level=logging.NOTSET,
                        format='%(levelno)s:%(asctime)s:%(threadName)s:%(message)s',
                        datefmt='%m/%d %I:%M:%S %p')
def __init__(self):
    self.is_debug = False
    self._can_use_proxy_num = 0
    if self.is_debug:
        Spider.__init__(self, 1)
    else:
        self.proxies_dict = []
        self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
        Spider.__init__(self, len(self.proxies_dict))
    self.error_cnt = 0
    self._aes_ = CCIQ_AES()
    # Full info of the company lists found by company-name queries
    self.query_company_list = FileSaver("all_company_list.txt")
    # Company names that have already been crawled
    self.already_cname_list = FileSaver("all_company_list_already.txt")
    # Error types already encountered
    self.already_error_type = FileSaver("all_already_error_type.txt")
    # Initialize the set of companies crawled in earlier runs
    self.init_cname()
    self.extJsons = [
        "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
        "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
        "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="
    ]
    self.user_agents = [
        "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"
    ]
    self.bloom = set()
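# Many constructors in this file call self.read_proxy(path) and then
# Spider.__init__(self, len(self.proxies_dict)), i.e. one worker thread per
# proxy. The helper itself is not shown in this section; a minimal sketch,
# assuming the file holds one "user:pass@host:port" entry per line (the
# exact line format is an assumption):
def read_proxy(self, fname):
    # Hypothetical reconstruction for illustration only.
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            self.proxies_dict.append({
                'http': 'http://' + line,
                'https': 'https://' + line,
            })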
def __init__(self):
    spider.util.use_utf8()
    self.saver = RunGuangdong.Saver()
    self.is_debug = False
    if self.is_debug:
        Spider.__init__(self, 1)
        self.proxies_dict = [{
            'http': 'http://*****:*****@106.75.134.189:18889',
            'https': 'https://*****:*****@106.75.134.189:18889'
        }]
    else:
        self.proxies_dict = []
        self.read_proxy("../../../_ct_proxy/proxy_041810.txt")
        Spider.__init__(self, len(self.proxies_dict))
    self._curltls = threading.local()
    self.gswebs = {}
    # URLs already fetched successfully
    self.success_url = FileSaver("gsinfo_guangdong_success_url.txt")
    # Initialize the set of links crawled in earlier runs
    self.init_spider_url()
    self.cnt = 1
    self.run_time = time.time()
    self.lock = threading.Lock()
    self.not_show_save = FileSaver("not_show_error_out.txt")
def __init__(self, tc):
    Spider.__init__(self, tc)
    self._logport = 5556
    # self.channel = 'gsid'
    # self.job_queue = 'gsid'
    self.savebin = BinSaver("gongshang.bin")
    self.faillog = open("fail_list.txt", "w+b")
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.success_count = 0
    self.fail_count = 0
    self.fail_file = FileSaver("fail2db.txt")
    self.sus_file = FileSaver("SZ2DB.txt")
    self.init_filter()
def __init__(self):
    spider.util.use_utf8()
    self.saver = RunGuangdong.Saver()
    self.is_debug = True
    if self.is_debug:
        Spider.__init__(self, 100)
        #self.proxies_dict = [{'http': 'http://*****:*****@106.75.134.189:18889', 'https': 'https://*****:*****@106.75.134.189:18889'}]
        self.proxies_dict = [{
            'http': 'http://192.168.1.39:3428',
            'https': 'https://192.168.1.39:3428'
        }]
    else:
        self.proxies_dict = []
        self.read_proxy("/home/windy/develop/getjd/_ct_proxy/proxy_all_filter.txt")
        Spider.__init__(self, len(self.proxies_dict))
    self._curltls = threading.local()
    self.gswebs = {}
    # Keywords whose details have been fetched successfully
    # (filename: "keywords already queried")
    self.success_name = FileSaver("已经查询过的关键字.txt")
    # Queried company names whose details have been fetched successfully
    # (filename: "companies whose details were fetched")
    self.succes_query_name = FileSaver("已经拿到详情的公司名.txt")
    # Company-list info (out[]) for pages that report they cannot be displayed
    # (filename: "company lists the page refused to show")
    self.not_show_save = FileSaver("页面提示无法显示的公司列表out.txt")
    # Keywords that return no results
    # (filename: "keywords with empty query results")
    self.query_none_kw = FileSaver("查询内容为空的关键字.txt")
    # Initialize the set of links crawled in earlier runs
    self.init_spider_url()
    self.lock = threading.Lock()
    # Throughput tracking
    self.run_time = time.time()
    self.cnt = 1
    self.proxy_error_cnt = 0
def __init__(self):
    self._can_use_proxy_num = 0
    self.is_debug = False
    if self.is_debug:
        Spider.__init__(self, 1)
    else:
        self.proxies_dict = []
        self.read_proxy("proxy_032512.txt")
        Spider.__init__(self, len(self.proxies_dict))
    self._aes_ = CCIQ_AES()
    # Successful detail queries
    self.query_success = FileSaver("c_query_detail.txt")
    # Failed detail queries
    self.query_failure = FileSaver("c_query_detail_failure.txt")
    # Companies already crawled
    self.already_cname_list = FileSaver("c_already_detail.txt")
    # Initialize the set of companies crawled in earlier runs
    self.init_cname()
    #self.extJson = self._aes_.encrypt(spider.util.utf8str({"cl_screenSize": "640x960", "cl_cookieId": "B200BA9D-A3A0-4140-A293-9A1A671BA5CE", "Org_iOS_Version": "2.0.1"}))
    # self.extJson = "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="
    # self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
    self.bloom = set()
    self.extJsons = [
        "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
        "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
        "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="
    ]
    self.user_agents = [
        "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"
    ]
    self.is_first = True
    self.init_time = 0
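# init_cname() is called right after the "already crawled" FileSaver is
# created but is not shown in this section; it evidently reloads that
# progress file into memory so a restarted run skips finished companies.
# A minimal sketch under that assumption (the set name and file handling
# are hypothetical; note the original also keeps a separate self.bloom
# dedup set):
def init_cname(self):
    self.already = set()
    try:
        with open("c_already_detail.txt") as f:
            for line in f:
                name = line.strip()
                if name:
                    self.already.add(name)
    except IOError:
        pass  # first run: no progress file yet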
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.all_count = 0
    self.yy_count = 0
    self.bin_list = [
        'jobui_job_data1.bin', 'jobui_job_bu.bin', 'jobui_job_data2.bin'
    ]
def __init__(self, thcnt, company):
    Spider.__init__(self, thcnt)
    self.pagestore = PageStoreLP()
    self.list = []
    with open(company) as file_:
        for line in file_:
            self.list.append(line.strip())
def __init__(self):
    spider.util.use_utf8()
    self.saver = RunGuangdong.Saver()
    self.is_debug = True
    if self.is_debug:
        Spider.__init__(self, 1)
        self.proxies_dict = [{
            'http': 'http://*****:*****@106.75.134.190:18889',
            'https': 'https://*****:*****@106.75.134.190:18889'
        }]
    else:
        self.proxies_dict = []
        self.read_proxy("../../_ct_proxy/proxy_041309.txt")
        Spider.__init__(self, len(self.proxies_dict))
    self._curltls = threading.local()
    self.gswebs = {}
    # Company names whose details have been fetched successfully
    self.success_name = FileSaver("guangdong_success_spider_cname.txt")
    # Company names found via keyword search are also written here so they are not lost
    self.un_spider_name = FileSaver("guangdong_temp_un_spider_cname.txt")
    # Company-list info (out[]) for pages that report they cannot be displayed
    self.not_show_save = FileSaver("guangdong_not_show_out.txt")
    # Initialize the set of links crawled in earlier runs
    self.init_spider_url()
    self.lock = threading.Lock()
    # Throughput tracking
    self.run_time = time.time()
    self.cnt = 1
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.sessionReq = YouzyLogin()
    self.sessionReq.do_login()
    self.num_count = 0
    self.savefile = FileSaver("youzy.txt")
    self.__fail_urls = FileSaver("fail_urls.txt")
def __init__(self): self.proxies_dict = [] self.read_proxy("proxy_20160218.txt") Spider.__init__(self, len(self.proxies_dict)) self.num_count = 0 #self.filter_name = [] self._aes_ = CCIQ_AES() #根据公司名字查询到的公司列表全部信息 self.query_company_info = FileSaver("query_company_info.txt") #根据公司名字查询到的公司列表局部信息 self.query_company_info_part = FileSaver("query_company_info_part.txt") #根据公司名字查询到的公司列表信息失败的 self.query_company_info_failure = FileSaver( "query_company_info_failure.txt") #已经爬取过的公司名 self.already_cname = FileSaver("already_cname.txt") #初始化已经爬过的公司 self.init_cname() #查询详情失败的公司名 self.detail_failure = FileSaver("detail_failure1.txt") #APP可以拿到的公司全部信息 包含股东信息 self.detail_company = FileSaver("detail_company.txt") self.extJson = self._aes_.encrypt( spider.util.utf8str({ "cl_screenSize": "640x960", "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02", "Org_iOS_Version": "2.0.1" })) self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
def __init__(self, thcnt, acs, type=1, process_num=0, max_process_cnt=1):
    Spider.__init__(self, thcnt)
    self._name = 'cvlpspider'
    self.lpm = MRLManager(acs, new_LPQYLogin)
    if type == 2:
        self.lpm = MRLManager(acs, new_LPLTLogin)
    self.pagestore = LPCVStore()
    self.hasher = spider.util.LocalHashChecker()
    self.lpm.ensure_login_do(None, lambda n: 1, None)
    self.lpm.release_obj()
    self.imgcnt = 0
    self._type = type
    self._process_num = process_num
    self._max_process_cnt = max_process_cnt
    self._spider_cnt = 0
    self._start_time = datetime.datetime.today()
    self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
    if self._type == 2:
        self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
    self.stat = spider.runtime.StatDict()
    self._limit_cnt = 200
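# _process_num / _max_process_cnt suggest the job space is sharded across
# several independent processes. A plausible sharding test under that
# assumption (the method name and hashing scheme are hypothetical; the real
# partitioning is not shown in this section):
def _belongs_to_me(self, res_id):
    # Stable modulo split: each process takes the ids whose hash lands
    # on its own slot.
    return hash(res_id) % self._max_process_cnt == self._process_num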
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.num_count = 0
    self.page_store = PageStoreJobUI()
    self.page_store.testmode = False
    self.bin_list = [
        'jobui_job_data1.bin', 'jobui_job_bu.bin', 'jobui_job_data2.bin'
    ]
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.sessionReq = YouzyLogin()
    self.sessionReq.do_login(1)
    self.num_count = 0
    self.parse_count = 0
    self.savefile = CsvSaver("spider_url_zhuanke_np.csv", fixed + pfcolumn)
    self.__fail_urls = FileSaver("spider_url_fail_zhuanke_np.txt")
def __init__(self, thcnt, company):
    Spider.__init__(self, thcnt)
    self.default_headers = {'Cookie': 'guide=1'}
    self.pagestore = PageStore51()
    self._name = "jd51"
    self.list = []
    with open(company) as file_:
        for line in file_:
            self.list.append(line.strip())
def __init__(self): self.proxies_dict = [] self.read_proxy("proxy_030814.txt") Spider.__init__(self, len(self.proxies_dict)) self.success_count = 0 self.request_count = 0 self.__fail_urls = FileSaver("fail_urls.txt") self.start_time = time.time() self.page_store = PageStoreJobUI() self.page_store.testmode = False
def __init__(self): self.proxies_dict = [] self.read_proxy("../spider/proxy/proxy.txt") Spider.__init__(self, len(self.proxies_dict)) self.success_count = 0 self.request_count = 0 self.__fail_ids = FileSaver("fail_ids.txt") self.start_time = time.time() self.page_store = PageStoreJobUI() self.page_store.testmode = True
def __init__(self): self.proxies_dict = [] self.read_proxy("../spider/proxy/proxy.txt") Spider.__init__(self, len(self.proxies_dict)) self.success_count = 0 self.request_count = 0 self.__fail_add_url = FileSaver("fail_add_url.txt") self.start_time = time.time() self.domain = self.read_domain() self.domain_file = FileSaver("domains.txt")
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    # self.uc_count = 0
    # self.tc_count = 0
    # self.yy_count = 0
    self.all_count = 0
    self.bin_list = ['jobui_job_data1.bin', 'jobui_job_bu.bin', 'jobui_job_data2.bin']
    #self.bin_list = ['jobui_job.bin','jobui_job2.bin','jobui_job4.bin']
    self.domains = []
    self.file_s = FileSaver('domains.txt')
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.request = SessionRequests()
    self.view_state = None
    self.event_valid = None
    self.rand = None
    self.loc = "浙江"  # Zhejiang
    self.data_file = FileSaver("浙江_data.txt")
    self.have_get_url_file = FileSaver("浙江_get_url.txt")
    self.init_already()
    self.login("38037395", "773950")
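# view_state / event_valid / rand point to an ASP.NET form flow: each page
# embeds __VIEWSTATE and __EVENTVALIDATION tokens that must be echoed back
# on the next POST. A minimal sketch of how such tokens are typically
# harvested (the field names are standard ASP.NET; wiring this into
# self.login is an assumption, and the method name is hypothetical):
import re

def _parse_aspnet_tokens(self, html):
    # Pull the hidden-field values out of the rendered form.
    m = re.search(r'id="__VIEWSTATE" value="([^"]*)"', html)
    self.view_state = m.group(1) if m else None
    m = re.search(r'id="__EVENTVALIDATION" value="([^"]*)"', html)
    self.event_valid = m.group(1) if m else None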
def __init__(self, *proxyfile):
    threadcnt = self.prepare_proxy(*proxyfile)
    Spider.__init__(self, threadcnt)
    if not os.path.exists("data1"):
        os.makedirs("data1")
    # Take the timestamp once so all three output files share the same
    # suffix (the original called time.time() per file, which could
    # straddle a second boundary and produce mismatched names).
    ts = str(time.time()).split(".")[0]
    self.namefile = open("data1/corpname." + ts + ".txt", "w+b")
    self.failfile = open("data1/fail." + ts + ".txt", "w+b")
    self.binsaver = BinSaver("data1/gsinfo" + ts + ".bin")
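# prepare_proxy(*proxyfile) is not shown in this section. A minimal sketch,
# assuming it loads every listed proxy file and returns the proxy count so
# the spider gets one thread per proxy, consistent with the
# read_proxy/len(self.proxies_dict) pattern used elsewhere in this file
# (the line format is an assumption):
def prepare_proxy(self, *proxyfiles):
    self.proxies_dict = []
    for fname in proxyfiles:
        with open(fname) as f:
            for line in f:
                line = line.strip()
                if line:
                    self.proxies_dict.append({
                        'http': 'http://' + line,
                        'https': 'https://' + line,
                    })
    return len(self.proxies_dict)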
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.fail_file = FileSaver("fail2db.txt")
    self.sus_file = FileSaver("SZ2DB.txt")
    #self.invest_detail_url = FileSaver("invest_detail_url.txt")
    self.init_filter()
    self.proxies = {
        'http': 'http://*****:*****@haohr.com:50001',
        'https': 'https://*****:*****@haohr.com:50001'
    }
    #{'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428'}
    self.select_user_agent("=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36")
    self.all_count = 0
    self.update_count = 0
    self.new_count = 0
    self.fail_count = 0
def __init__(self):
    #self.proxies_dict = []
    #self.read_proxy("../spider/proxy/proxy.txt")
    #Spider.__init__(self, len(self.proxies_dict))
    Spider.__init__(self, 1)
    self.num_count = 0
    self._aes_ = CCIQ_AES()
    # Full company info available from the app
    self.save_success = FileSaver("exist_company.txt")
    # Partial company info available from the app
    self.part_success = FileSaver("part_company.txt")
    # Company names whose queries failed
    self.fail_name = FileSaver("fail_name.txt")
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.counter = 0
    self.skipcnt = 0
    self.skip_j = 1
    self.ofile = FileSaver('people_result')
    self.headers = {
        'X-Requested-With': 'XMLHttpRequest',
        #'Referer': 'https://www.baidu.com/',
        'DNT': '1',  # header values should be strings, not ints
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    }
def __init__(self):
    Spider.__init__(self, 20)
    self._aes_ = CCIQ_AES()
    #self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)")
    self.proxy_filter = FileSaver("proxy_filter_030309_detail1.txt")
    # Note: unlike the other spiders above, these entries carry literal
    # double quotes and JSON-escaped slashes.
    self.extJsons = [
        '"Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr\/uapICH92P\/Crryt63u28aP4QP665AzcT\/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="',
        '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a\/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4="',
        '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49\/aDwt3NZNp4TGa5iBFpYLm69F\/6PPFoXIR\/Aw5p48\/\/8OgZFpddDUwQ="'
    ]
    self.user_agents = [
        "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
        "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"
    ]
def __init__(self): self.is_debug = True if self.is_debug: Spider.__init__(self, 1) else: self.proxies_dict = [] self.read_proxy("../spider/proxy/proxy.txt") Spider.__init__(self, len(self.proxies_dict)) self.success_count = 0 self.request_count = 0 self.__fail_ids = FileSaver("fail_url.txt") self.start_time = time.time() self.page_store = PageStoreJobUI() self.page_store.testmode = True self.init_time = time.time() self.already_url = FileSaver("already_url.txt") self.init_already_url()
def __init__(self):
    spider.util.use_utf8()
    self.is_debug = True
    if self.is_debug:
        Spider.__init__(self, 1)
    else:
        self.proxies_dict = []
        self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
        Spider.__init__(self, len(self.proxies_dict))
    self._curltls = threading.local()
    self.success_name = FileSaver("query_success_name.txt")
    self.success_detail = FileSaver("query_success_detail.txt")
    # Initialize the set of links crawled in earlier runs
    self.init_spider_url()
    self.cnt = 1
    self.run_time = time.time()
    self.lock = threading.Lock()
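# Several constructors initialize cnt, run_time, and a Lock together, which
# suggests periodic throughput logging from the worker threads. A minimal
# sketch under that assumption (the method name, interval, and log format
# are hypothetical):
def _count_one(self):
    with self.lock:
        self.cnt += 1
        if self.cnt % 100 == 0:
            elapsed = time.time() - self.run_time
            print("crawled %d items, %.1f items/s" % (self.cnt, self.cnt / elapsed))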
def __init__(self):
    spider.util.use_utf8()
    self.saver = RunGansu.Saver()
    self.is_debug = True
    if self.is_debug:
        Spider.__init__(self, 200)
        # self.proxies_dict = [{'http': 'http://*****:*****@106.75.134.189:18889',
        #                       'https': 'https://*****:*****@106.75.134.189:18889'},
        #                      {'http': 'http://*****:*****@106.75.134.190:18889',
        #                       'https': 'https://*****:*****@106.75.134.190:18889'},
        #                      {'http': 'http://*****:*****@106.75.134.191:18889',
        #                       'https': 'https://*****:*****@106.75.134.191:18889'},
        #                      {'http': 'http://*****:*****@106.75.134.192:18889',
        #                       'https': 'https://*****:*****@106.75.134.192:18889'},
        #                      {'http': 'http://*****:*****@106.75.134.193:18889',
        #                       'https': 'https://*****:*****@106.75.134.193:18889'}]
        self.proxies_dict = [{
            'http': 'http://*****:*****@192.168.1.39:3428',
            'https': 'https://*****:*****@192.168.1.39:3428'
        }, {
            'http': 'http://*****:*****@121.40.186.237:50001',
            'https': 'https://*****:*****@121.40.186.237:50001'
        }]
        #self.proxies_dict = [{}]
        self.gsweb = SearchGSWebGansu(self.saver)
    else:
        self.proxies_dict = []
        self.read_proxy("../../_ct_proxy/proxy_040510.txt")
        Spider.__init__(self, len(self.proxies_dict))
    self._curltls = threading.local()
    self.gswebs = {}
    # Keywords already queried successfully
    self.success_kw = FileSaver("gsinfo_Gansu_success_kw.txt")
    # List entries whose crawl succeeded are written here to avoid re-crawling
    self.success_queries = FileSaver("gsinfo_Gansu_success_queries.txt")
    # Initialize the set of links crawled in earlier runs
    #self.init_spider_url()
    #time.sleep(2)
    self.cnt = 1
    self.run_time = time.time()
    self.cnt_q = 1
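# In debug mode a single SearchGSWebGansu is created up front, while the
# threaded path keeps self._curltls (thread-local storage) plus the
# self.gswebs dict. A plausible helper for lazily binding one client per
# worker thread (the method name and keying scheme are assumptions):
def get_gsweb(self):
    tid = threading.current_thread().name
    if tid not in self.gswebs:
        self.gswebs[tid] = SearchGSWebGansu(self.saver)
    return self.gswebs[tid]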
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self._user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:42.0) Gecko/20100101 Firefox/42.0'
    self.baseurl = 'http://www.ewt360.com/LNLQXX/SearchResult?act=mark'
    self.headers = {'Referer': self.baseurl}
    #scores = range(450, 750 + 1) + range(449, 0, -1) + [0]
    scores = range(750, 0, -1) + [0]  # Python 2: range() returns a list here
    self.possmap = {
        'Years': range(2009, 2014 + 1),
        'WL': ['l', 'w'],
        'BZ': ['b', 'z'],
        'PiCi': 0,
        'Score': scores,
        'ProvinceCode': 0,
        'page': 1
    }
    self.bs = BinSaver("fo.bin")
    self.racer = RaceValueByKey()
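# possmap maps each form field to either a fixed value or a list of
# candidates, which suggests the spider enumerates the Cartesian product of
# the list-valued fields as its job space. A minimal sketch of such an
# expansion (the generator name is hypothetical; how jobs are dispatched is
# not shown in this section):
import itertools

def iter_possmap(possmap):
    # Wrap fixed scalars in single-element lists, then take the product.
    keys = sorted(possmap)
    pools = [possmap[k] if isinstance(possmap[k], list) else [possmap[k]]
             for k in keys]
    for combo in itertools.product(*pools):
        yield dict(zip(keys, combo))

# Each yielded dict would then parameterize one GET/POST against
# self.baseurl, e.g. {'Years': 2009, 'WL': 'l', 'BZ': 'b', 'Score': 750, ...}.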