def main():
    """Report which companies listed in ``conf_name`` exist in the source collection.

    Reads one company name per line from the file ``conf_name``, looks each
    one up in the ``table_name`` collection by its ``company`` field, and logs
    whether it was found. At the end the total, found and missing counts are
    logged.

    Relies on the module-level names ``mongo_db_conf``, ``log``, ``conf_name``
    and ``table_name``.
    """
    source_db = MongDb(mongo_db_conf['host'], mongo_db_conf['port'],
                       mongo_db_conf['db'], mongo_db_conf['username'],
                       mongo_db_conf['password'], log=log)

    count = 0    # companies with no matching document
    total = 0    # non-blank company names processed
    already = 0  # companies that were found

    with open(conf_name) as p_file:
        for line in p_file:
            company = line.strip("\n").strip("\r").strip(" ")
            if not company:
                # A blank line is not a company name; looking it up would
                # query for an empty string and inflate the totals.
                continue

            total += 1
            item = source_db.find_one(table_name, {'company': company})
            if item is None:
                log.error("当前企业没有抓到: {company}".format(company=company))
                count += 1
            else:
                log.info("已抓到企业: {}".format(company))
                already += 1

    log.info("总共企业数目为: {}".format(total))
    log.info("当前已抓到的个数: {}".format(already))
    log.info("当前总共没有抓到企业数目为: {}".format(count))
def __init__(self, page, log):
    """Initialise the crawler state.

    Stores the page count and logger, logs the requested page count, builds
    the ``MongDb`` helper from ``LocalMongoConfig``, prepares the HTTP
    session and creates the ``DouBanInfoHandler``.

    :param page: page count stored on the instance and echoed to the log
    :param log: logger used by this instance and handed to ``MongDb``
    """
    self.log = log
    self.__page = page
    log.info("获得 {} 页之后的数据...".format(page))

    mongo_settings = (
        LocalMongoConfig.HOST,
        LocalMongoConfig.PORT,
        LocalMongoConfig.DB,
        LocalMongoConfig.USER,
        LocalMongoConfig.PASSWD,
    )
    self.mongo = MongDb(*mongo_settings, log=log)
    self.table = "douban"

    self.request = self.__init_reqeust()
    self.douban_handler = DouBanInfoHandler()
def __init__(self, secrite_key, token, user_id, log):
    """Initialise the crawler state.

    Stores the logger and account values, prepares the HTTP session, and
    builds the ``MongDb`` helper from ``LocalMongoConfig``.

    :param secrite_key: key stored on the instance as ``secrite_key``
    :param token: token stored on the instance as ``token``
    :param user_id: user id stored on the instance as ``user_id``
    :param log: logger used by this instance and handed to ``MongDb``
    """
    self.log, self.secrite_key = log, secrite_key
    self.user_id, self.token = user_id, token

    # The session is prepared only after the account values are stored.
    self.request = self.__init_reqeust()

    mongo_settings = (
        LocalMongoConfig.HOST,
        LocalMongoConfig.PORT,
        LocalMongoConfig.DB,
        LocalMongoConfig.USER,
        LocalMongoConfig.PASSWD,
    )
    self.cp_mongo = MongDb(*mongo_settings, log=log)
    self.cp_table = "yizhou_cp"
def search_task():
    """Log companies from ``data_list`` that are missing or lack shareholder data.

    Each company is looked up in the ``enterprise_data_gov`` collection by its
    ``company`` field. A company with no document is logged via ``log.error``;
    a document without a ``shareholder_information`` key is logged via
    ``log.warn``.

    ``data_list`` is not defined in this function; it is presumably a
    module-level iterable of company names -- TODO confirm.

    NOTE(review): only the two checks visible here are documented; confirm
    that the loop body does not continue beyond them in the full file.
    """
    log = Gsxtlogger('hunan.log').get_logger()
    mongo_db_conf = {
        'host': '172.16.215.16',
        'port': 40042,
        'db': 'app_data',
        'username': '******',
        'password': '******'
    }
    # Storage for the search list
    source_db = MongDb(mongo_db_conf['host'], mongo_db_conf['port'], mongo_db_conf['db'], mongo_db_conf['username'], mongo_db_conf['password'], log=log)
    for company in data_list:
        item = source_db.find_one('enterprise_data_gov', {'company': company})
        # No document at all for this company
        if item is None:
            log.error(company)
            continue
        # Document exists but carries no shareholder information
        if 'shareholder_information' not in item:
            log.warn(company)
            continue
'重庆市': 'chongqing', '陕西省': 'shanxi', '总局': 'gsxt' } mongo_db_company_data = { 'host': '172.16.215.2', 'port': 40042, 'db': 'company_data', 'username': '******', 'password': '******' } source_db = MongDb(mongo_db_company_data['host'], mongo_db_company_data['port'], mongo_db_company_data['db'], mongo_db_company_data['username'], mongo_db_company_data['password'], log=log) db_query = pymongo.MongoClient('172.16.215.2', 40042)['schedule_data'] db_query.authenticate('work', 'haizhi') db_query_app_data = pymongo.MongoClient('172.16.215.16', 40042)['app_data'] db_query_app_data.authenticate('work', 'haizhi') # def main(): try: count = 0 log.info('开始读取数据...')
'port': 40042, 'db': 'company_data', 'username': '******', 'password': '******' } mongo_db_target = { 'host': "103.36.136.211", 'port': 40042, 'db': 'company_data', "username": '******', "password": '******', } log = Gsxtlogger('copy_data_to_beihai.log').get_logger() source_db = MongDb(mongo_db_source['host'], mongo_db_source['port'], mongo_db_source['db'], mongo_db_source['username'], mongo_db_source['password'], log=log) target_db = MongDb(mongo_db_target['host'], mongo_db_target['port'], mongo_db_target['db'], mongo_db_target['username'], mongo_db_target['password'], log=log) def main(): collection_table = 'offline_all_list' log.info("开始导入数据..") result_list = [] count = 0 for item in source_db.traverse(collection_table): item['crawl_online'] = 0 result_list.append(item)
class YizhoucpCrawl(object):
    """Walk the recommended moment feed and like posts whose author matches a sex value.

    Every request to the feed API is signed: the request parameters are sent
    to an external signing service (``__CRACK_SIGN_URL``) and the returned
    value is attached as ``sign``. Posts that have been processed are recorded
    in the ``yizhou_cp`` Mongo collection so they are not handled twice.
    """

    __START_URL = "https://api.myrightone.com/api/feed/moment-list"
    __LIKE_PID_URL = "https://api.myrightone.com/api/feed/like"
    # NOTE(review): plain HTTP; ``secret_key`` and ``check_code`` are sent
    # to this endpoint as query parameters.
    __CRACK_SIGN_URL = "http://wx.zxiaoji.com/cp"
    __HOST = "api.myrightone.com"
    # Seconds to wait before retrying when the feed could not be fetched.
    __RETRY_DELAY = 30

    def __init__(self, secret_key, token, user_id, check_code, log):
        """Store the account values, prepare the HTTP session and the Mongo helper.

        :param secret_key: value forwarded to the signing service
        :param token: API token; its part before the first ``_`` is sent as ``App-Id``
        :param user_id: user id sent with every feed / like request
        :param check_code: value forwarded to the signing service
        :param log: logger used by this instance and handed to ``MongDb``
        """
        self.log = log
        self.secret_key = secret_key
        self.user_id = user_id
        self.token = token
        self.check_code = check_code
        self.request = self.__init_reqeust()
        self.cp_mongo = MongDb(LocalMongoConfig.HOST, LocalMongoConfig.PORT,
                               LocalMongoConfig.DB, LocalMongoConfig.USER,
                               LocalMongoConfig.PASSWD, log=self.log)
        self.cp_table = "yizhou_cp"

    def __init_reqeust(self):
        """Create the ``requests`` session with the API headers and return it.

        The session is also stored on ``self.request``.
        """
        headers = {
            "Host": self.__HOST,
            "App-Id": self.token.split("_")[0],
            "Platform": "ios",
            "Token": self.token,
            "User-Agent": "Right-iOS/3.33.2 (com.myrightone.datecha; build:224; iOS 12.1.2) Alamofire/4.8.0",
            "Accept": "*/*",
            "Accept-Encoding": "gzip;q=1.0, compress;q=0.5",
            "Accept-Language": "zh-Hans-CN;q=1.0, en-CN;q=0.9",
        }
        self.request = requests.Session()
        self.request.headers = headers
        return self.request

    def __get_sign(self, params):
        """Ask the signing service for the ``sign`` value of *params*.

        :param params: request parameters; sent JSON-encoded
        :return: the ``data`` field of the service response, or ``None`` when
            the service does not answer with ``status == 1``
        """
        req = requests.get(self.__CRACK_SIGN_URL, params={
            "secret_key": self.secret_key,
            "check_code": self.check_code,
            "params": json.dumps(params)
        }, timeout=30)
        req_json = req.json()
        if req_json.get("status") != 1:
            self.log.error("提取sign发生错误,错误原因是:")
            self.log.error(req_json.get("data"))
            return None
        return req_json.get("data")

    def get_moment_list(self):
        """Fetch one batch of the recommended moment feed.

        :return: the decoded JSON response, or ``None`` when no signature
            could be obtained
        """
        self.log.info("开始采集动态页")
        params = {
            "num": 20,
            "start": 0,
            "timestamp": int(time.time()),
            "type": "recommend",
            "user_id": self.user_id,
            "last_object_id": "",
        }
        sign = self.__get_sign(params)
        if not sign:
            return None
        params["sign"] = sign
        # NOTE(review): TLS certificate verification is disabled here.
        resp = self.request.get(self.__START_URL, params=params,
                                verify=False, timeout=30)
        return resp.json()

    @staticmethod
    def _extract_posts(moment_data):
        """Return the post list of a feed response, or ``None`` if it has none.

        ``None`` is returned when *moment_data* is not a dict, has no dict
        under ``data``, or has no list under ``data["list"]``.
        """
        if not isinstance(moment_data, dict):
            return None
        data = moment_data.get("data")
        if not isinstance(data, dict):
            return None
        posts = data.get("list")
        return posts if isinstance(posts, list) else None

    def like_sex(self, post_data, sex=2, exclude_cp=True):
        """Like a single post if its author's ``sex`` field equals *sex*.

        The post is recorded in Mongo before the sex check, so a post is only
        ever processed once regardless of the outcome.

        :param post_data: one post entry from the feed
        :param sex: value compared with ``post_data["user"]["sex"]``
        :param exclude_cp: when true, posts carrying ``left_user`` are skipped
        :return: ``True`` when the like request reported success, otherwise ``False``
        """
        if exclude_cp and post_data.get('left_user', None):
            self.log.info("过滤掉cp组")
            return False

        if post_data.get("category") == "topic":
            self.log.info("过滤掉话题..")
            return False

        fid = post_data.get("fid")
        nick_name = post_data["user"].get("nickname")
        post_text = post_data["payload"].get("text")

        if self.__update_like_mongo(fid, nick_name, post_text) == -1:
            self.log.info("之前已对这条数据点过赞了,跳过...")
            return False

        if post_data["user"].get('sex') != sex:
            return False

        fid_params = {
            "cancel": "0",
            "fid": fid,
            "timestamp": "0",
            "user_id": self.user_id,
        }
        sign = self.__get_sign(fid_params)
        if not sign:
            return False
        fid_params["sign"] = sign

        # NOTE(review): TLS certificate verification is disabled here.
        resp = self.request.get(self.__LIKE_PID_URL, params=fid_params,
                                verify=False, timeout=30)
        if resp.json().get("message") != "success":
            return False

        self.log.info("给用户({})发布的【{}】点赞成功".format(
            nick_name, post_text))
        return True

    def start(self, *args, **kwargs):
        """Run the crawl loop forever.

        Each round fetches one feed batch and tries to like every post in it,
        then sleeps for a random time proportional to the number of likes in
        the batch. Between 02:00 and 05:59 local time an extra pause of
        roughly an hour is inserted after each round.
        """
        count = 0
        like_count = 0
        while True:
            count += 1
            posts = self._extract_posts(self.get_moment_list())
            if posts is None:
                # No signature or an unexpected response shape: back off and
                # retry instead of crashing on a missing key.
                self.log.error("动态列表获取失败,{} 秒后重试".format(
                    self.__RETRY_DELAY))
                time.sleep(self.__RETRY_DELAY)
                continue

            like_count_batch = 0
            for per_post in posts:
                if self.like_sex(per_post):
                    like_count_batch += 1
                    like_count += 1
                    time.sleep(random.randint(1, 2))
                    if like_count % 100 == 0:
                        self.log.info(
                            "当前已经对 {} 位小姐姐点过赞了...".format(like_count))

            self.log.info("当前已经遍历了第 {} 次动态".format(count))
            time.sleep(
                random.randint(7 * like_count_batch, 10 * like_count_batch))

            now = datetime.datetime.now()
            if now.hour in range(2, 6):
                time.sleep(random.randint(3600, 4000))

    def __update_like_mongo(self, fid, nick_name, post_text):
        """Record that the post *fid* has been processed.

        :return: ``-1`` when the post was already stored (its ``count`` is
            incremented), ``1`` when a new record was inserted
        """
        exist_data = self.cp_mongo.find_one(self.cp_table, {"_id": fid})
        if exist_data:
            self.log.info(">>>找到相同的数据啦...")
            # Tolerate stored rows that have no counter yet.
            exist_data["count"] = exist_data.get("count", 0) + 1
            self.cp_mongo.insert_batch_data(self.cp_table, [exist_data])
            return -1

        new_data = {
            "_id": fid,
            "nick_name": nick_name,
            "post_text": post_text,
            "count": 1
        }
        self.cp_mongo.insert_batch_data(self.cp_table, [new_data],
                                        insert=True)
        return 1
from logger import Gsxtlogger log = Gsxtlogger('find_in_gsxt.log').get_logger() db_conf = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data', 'username': '******', 'password': '******', } source_db = MongDb(db_conf['host'], db_conf['port'], db_conf['db'], db_conf['username'], db_conf['password'], log=log) def classify(): with open("company_invalid.txt", "w") as invalid_file: with open("company_valid.txt", "w") as valid_file: with open("company_list.txt") as p_file: for line in p_file: company = line.strip("\r").strip("\n").strip() # if source_db.find_one("enterprise_data_gov", {"company": company}) is None: # log.warn("当前企业不存在: {}".format(company)) # else: # log.info("找到企业信息: {}".format(company)) if len(company) <= 15 or len(company) > 90:
'db': 'crawl_data', 'username': '******', 'password': '******' } mongo_db_gov = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data_test', 'username': '******', 'password': '******' } crawl_data_db = MongDb(mongo_db_crawl_data['host'], mongo_db_crawl_data['port'], mongo_db_crawl_data['db'], mongo_db_crawl_data['username'], mongo_db_crawl_data['password'], log=log) gov_db = MongDb(mongo_db_gov['host'], mongo_db_gov['port'], mongo_db_gov['db'], mongo_db_gov['username'], mongo_db_gov['password'], log=log) # while True: # data = {'province': 'yunnan', 'company_name': '国网'} # producer.produce(json.dumps(data)) # log.info(count) # # time.sleep()
'password': '******' } app_data_conf = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data', 'username': '******', 'password': '******' } log = Gsxtlogger('count_gansu.log').get_logger() company_data_db = MongDb(company_data_conf['host'], company_data_conf['port'], company_data_conf['db'], company_data_conf['username'], company_data_conf['password'], log=log) app_data_db = MongDb(app_data_conf['host'], app_data_conf['port'], app_data_conf['db'], app_data_conf['username'], app_data_conf['password'], log=log) def get_now_time(): from datetime import datetime return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
class DoubanCrawl(object):
    """Crawl Douban group discussion list pages and store every row in MongoDB."""

    __START_URL = "https://www.douban.com/group/luohuzufang/discussion?start={}"
    __HOST = "www.douban.com"
    # Seconds to wait for a list page before the request is abandoned.
    __REQUEST_TIMEOUT = 30

    def __init__(self, page, log):
        """Store the page count and logger and build the helpers.

        :param page: index of the last list page to crawl (pages ``0..page``)
        :param log: logger used by this instance and handed to ``MongDb``
        """
        self.__page = page
        self.log = log
        self.log.info("获得 {} 页之后的数据...".format(self.__page))
        self.mongo = MongDb(LocalMongoConfig.HOST, LocalMongoConfig.PORT,
                            LocalMongoConfig.DB, LocalMongoConfig.USER,
                            LocalMongoConfig.PASSWD, log=self.log)
        self.table = "douban"
        self.request = self.__init_reqeust()
        self.douban_handler = DouBanInfoHandler()

    def __init_reqeust(self):
        """Create the ``requests`` session with browser-like headers and return it.

        The session is also stored on ``self.request``.
        """
        headers = {
            "Host": self.__HOST,
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "DNT": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7,ja;q=0.6",
        }
        self.request = requests.Session()
        self.request.headers = headers
        return self.request

    @staticmethod
    def _first(values, default=None):
        """Return the first element of an XPath result list, or *default* when it is empty."""
        return values[0] if values else default

    def __get_page_data(self, page_num=0, start_url=None):
        """Download one discussion list page and store its rows.

        :param page_num: value substituted into the URL template
        :param start_url: URL template with one ``{}`` placeholder; the class
            default is used when it is empty
        :return: ``-1`` when the page could not be fetched, otherwise ``None``
        """
        url = (start_url or self.__START_URL).format(page_num)
        try:
            # Without a timeout a stalled connection would block the crawl forever.
            resp = self.request.get(url, timeout=self.__REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            self.log.error("请求列表页出错...")
            self.log.error(exc)
            return -1
        if resp is None or not resp.ok or not resp.text:
            self.log.error("请求列表页出错...")
            return -1

        html_resp = html.fromstring(resp.text)

        # Walk over every discussion row of the list
        discussion_extract = html_resp.xpath(
            '//div[@class="article"]//tr[@class=""]')
        item_list = []
        for per_discussion in discussion_extract:
            title = self._first(
                per_discussion.xpath('./td[@class="title"]/a/@title'))
            detail_url = self._first(
                per_discussion.xpath('./td[@class="title"]/a/@href'))
            if title is None or detail_url is None:
                # ``detail_url`` is the storage key and ``title`` feeds the
                # handler; a row without them cannot be stored, so skip it
                # rather than abort the whole page.
                continue

            item = {
                "title": title,
                "detail_url": detail_url,
                "author": self._first(
                    per_discussion.xpath('./td[2]/a/text()')),
                "author_url": self._first(
                    per_discussion.xpath('./td[2]/a/@href')),
                "comment_count": self._first(
                    per_discussion.xpath('./td[3]/text()'), 0),
                "comment_date": self._first(
                    per_discussion.xpath('./td[4]/text()')),
            }
            extract_info = self.douban_handler.clean_data(title)
            # Scraped fields win over handler fields on a key clash.
            item_list.append({**extract_info, **item})

        self.mongo.insert_batch_data(self.table, item_list, key="detail_url")

    def start(self, *args, **kwargs):
        """Crawl pages ``0..page`` of every URL template in ``init_urls``.

        ``init_urls`` is not defined in this class; it is presumably a
        module-level list of URL templates -- TODO confirm.
        """
        for url in init_urls:
            self.log.info("当前采集小组的链接是:{}".format(url))
            for i in tqdm(range(0, self.__page + 1)):
                self.log.info("当前即将采集第 {} 页".format(i))
                # Each list page holds 25 rows, hence the offset step.
                grab_list_page_status = self.__get_page_data(i * 25, url)
                if grab_list_page_status == -1:
                    self.log.info("当前采集列表页出错, 当前页面是第 {} 页".format(i))
                    continue
                self.log.info("当前页面采集完成: page = {}".format(i))
        self.log.info("成功退出采集程序...")
"db": "crawl_data_new", "username": "******", "password": "******", } mongo_db_webpage_old = { "host": "172.16.215.2", "port": 40042, "db": "crawl_data", "username": "******", "password": "******", } log = Gsxtlogger('find_equity_field.log').get_logger() target_db_new = MongDb(mongo_db_webpage_new['host'], mongo_db_webpage_new['port'], mongo_db_webpage_new['db'], mongo_db_webpage_new['username'], mongo_db_webpage_new['password'], log=log) target_db_old = MongDb(mongo_db_webpage_old['host'], mongo_db_webpage_old['port'], mongo_db_webpage_old['db'], mongo_db_webpage_old['username'], mongo_db_webpage_old['password'], log=log) mail_from_addr = '*****@*****.**' mail_password = '******' mail_to_addrs = ['*****@*****.**'] def send_email(from_addr, password, to_addrs, subject, msg, smtp_host="smtp.weibangong.com", smtp_port=465): email_client = SMTP(smtp_host, smtp_port) email_client.login(from_addr, password) msg['Subject'] = Header(subject, 'utf-8') msg['From'] = from_addr msg['To'] = str(to_addrs)
mongo_db_source = { 'host': '172.16.215.2', 'port': 40042, 'db': 'company_data', 'username': '******', 'password': '******' } global_logger = Gsxtlogger('insert_company.log') global_log = global_logger.get_logger() # 搜索列表存储表 source_db = MongDb(mongo_db_source['host'], mongo_db_source['port'], mongo_db_source['db'], mongo_db_source['username'], mongo_db_source['password'], log=global_log) beanstalk_consumer_conf = {'host': 'cs0.sz-internal.haizhi.com', 'port': 11400} beanstalk = PyBeanstalk(beanstalk_consumer_conf['host'], beanstalk_consumer_conf['port']) def main(search_name, province, unified_social_credit_code, param): item = { "_id": "9c9d8f8b848514f240f54a40b0a0c6f02622b3d87d54d353e525ca58d9dbe312", "province": province, "crawl_online": 0, "error_times": 0,
length = len(sys.argv) if length > 2: search_list = re.findall('config/(.*?)\.conf', sys.argv[1]) if len(search_list) > 0: log_name = search_list[0] + '_' + sys.argv[2] + '.log' return log_name global_logger = Gsxtlogger(get_log_name()) global_log = global_logger.get_logger() # 旧网页库 target_db = MongDb(mongo_db_target['host'], mongo_db_target['port'], mongo_db_target['db'], mongo_db_target['username'], mongo_db_target['password'], log=global_log) # 新网页库 target_db_new = MongDb(mongo_db_target_new['host'], mongo_db_target_new['port'], mongo_db_target_new['db'], mongo_db_target_new['username'], mongo_db_target_new['password'], log=global_log) # 搜索列表存储表 source_db = MongDb(mongo_db_source['host'], mongo_db_source['port'], mongo_db_source['db'],