def fetch_house_info(conf_json, total_num):
    """Fetch house-listing pages and write matched rows to a CSV file.

    conf_json keys used: house_info_file (file_name, column_name),
    list_template_url, timeout, retry_times, house_info_pattern.
    total_num is the number of list pages to fetch (URLs are 1-based).
    """
    # Hoist loop-invariant config lookups and the regex compile out of the loops.
    pattern = re.compile(conf_json["house_info_pattern"])
    timeout = int(conf_json["timeout"])
    retry_times = int(conf_json["retry_times"])
    # open() instead of the deprecated file() builtin; close deterministically.
    house_info_csv = open(conf_json["house_info_file"]["file_name"], 'wb')
    try:
        write = csv.writer(house_info_csv)
        write.writerow(conf_json["house_info_file"]["column_name"].split(','))
        cnt = 0
        for page_no in range(total_num):
            for i in range(retry_times):
                # Page numbers in the listing URL are 1-based.
                url = conf_json["list_template_url"].format(page_no + 1)
                print('Begin fetch : {}'.format(url))
                content = nt.fetchUrlContent(url, timeout)
                if content is not None:
                    # Flatten the page so the regex can match across lines.
                    content = content.replace('\n', '').replace('\t', '')
                    match_data = re.findall(pattern, content)
                    # findall returns a (possibly empty) list, never None;
                    # the original `== None` check could never fire.
                    if not match_data:
                        logger.error('Bad format for url content : {}'.format(url))
                        break
                    print(match_data)
                    # TODO(review): row output is still disabled, as in the
                    # original — only the header is ever written. Confirm.
                    # write.writerows(match_data)
                    cnt = cnt + len(match_data)
                    break
    finally:
        house_info_csv.close()
def fetch_total_page(conf_json):
    """Return the total page count scraped from conf_json["url"], or 0.

    Guards against a failed fetch: the original passed None straight
    into re.search, which raises TypeError.
    """
    content = nt.fetchUrlContent(conf_json["url"])
    if content is None:
        logger.error('Failed to fetch url : {}'.format(conf_json["url"]))
        return 0
    pattern = re.compile(conf_json["tv_total_page_pattern"])
    match_data = re.search(pattern, content)
    if match_data is not None:
        # Group 1 of the pattern is assumed to capture the page count.
        return int(match_data.group(1))
    return 0
def fetch_episode_info(conf_json):
    """Read shows from the tv_show CSV and scrape each show's episode list.

    For every show row, builds a redirect URL from the last URL path
    component, fetches the page (with retries) and appends every regex
    match as a row to the tv_episode CSV.
    """
    with open(conf_json["tv_show_file"]["file_name"], "r") as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        pattern = re.compile(conf_json['tv_episode_pattern'])
        # Skip the header row; next(reader) works on Python 2.6+ and 3,
        # unlike the original reader.next().
        next(reader)
        with open(conf_json["tv_episode_file"]["file_name"], "w") as writefile:
            writer = csv.writer(writefile)
            writer.writerow(
                conf_json["tv_episode_file"]["column_name"].split(','))
            # Hoist loop-invariant conversions out of the row loop.
            timeout = int(conf_json["timeout"])
            retry_times = int(conf_json['retry_times'])
            eps_cnt = 0
            for url, img, title, count, score_int, score_float in reader:
                # The last path component identifies the show on the
                # redirect host ("redirct" spelling matches the config key).
                real_url = conf_json['redirct_base_url'] + url.split('/')[-1]
                for i in range(retry_times):
                    content = nt.fetchUrlContent(real_url, timeout)
                    if content is not None:
                        # Flatten so the pattern can match across lines.
                        content = content.replace('\n', '').replace('\t', '')
                        match = re.findall(pattern, content)
                        if match:
                            writer.writerows(match)
                            eps_cnt = eps_cnt + len(match)
                        else:
                            logger.error(
                                'bad format for : {}'.format(real_url))
                        break
            logger.info('The number of episode is : {}'.format(eps_cnt))
def fetch_episode_info(conf, tv_url_list):
    """Fetch the episode list for every TV show URL in tv_url_list.

    Writes one tab-separated line per episode to conf["eps_info_file"]:
    show id, season number, episode number, episode url, title, image url.

    NOTE(review): this redefines fetch_episode_info from earlier in the
    file; the later definition wins at import time — confirm which one
    is intended.
    """
    logger.info("Begin to fetch_episode_info")
    # Patterns are loop-invariant; compile them once instead of per show.
    tv_episode_pattern = re.compile(conf["tv_episode_pattern"])
    episode_org_img_pattern = re.compile(
        r'a title="(.*?)"\s+href="(.*?)">.*?data-original="([^"]+)')
    episode_src_img_pattern = re.compile(
        r'a title="(.*?)"\s+href="(.*?)">.*?src="([^"]+)')
    tv_home_base_url = conf["home_base_url"]
    retry_times = int(conf["retry_times"])
    # Close the output handle deterministically (original leaked it).
    result_file = codecs.open(conf["eps_info_file"], 'w', 'utf-8')
    try:
        for tv_info in tv_url_list:
            tailer_url = tv_info[0]
            if not tailer_url.startswith('/'):
                tailer_url = '/' + tv_info[0]
            episode_url = tv_home_base_url + tailer_url
            for i in range(retry_times):
                episode_page_content = nt.fetchUrlContent(episode_url)
                if episode_page_content is not None:
                    eps_match = re.findall(tv_episode_pattern,
                                           episode_page_content)
                    for eps_detail in eps_match:
                        # Prefer the lazy-load image attribute; fall back
                        # to the plain src attribute.
                        match = episode_org_img_pattern.search(eps_detail)
                        if match is None:
                            match = episode_src_img_pattern.search(eps_detail)
                        if match is None:
                            logger.error(
                                'Vaild eps_detail : {}'.format(eps_detail))
                            continue
                        eps_title = match.group(1)
                        eps_tailer_url = match.group(2)
                        eps_img_url = match.group(3)
                        if eps_tailer_url is not None and eps_title is not None:
                            # The 4th URL path component encodes season and
                            # episode as dash-separated fields; indices 1
                            # and 3 hold the numbers, so >= 5 parts needed.
                            eps_season_info = eps_tailer_url.split('/')[3].split(
                                '-')
                            if len(eps_season_info) < 5:
                                logger.error(
                                    'Bad url format : {}'.format(eps_tailer_url))
                                continue
                            season_number = int(eps_season_info[1])
                            eps_number = int(eps_season_info[3])
                            try:
                                result_file.write(
                                    '{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                        tv_info[1], season_number, eps_number,
                                        tv_home_base_url + eps_tailer_url,
                                        eps_title, eps_img_url))
                            except Exception as msg:
                                # `except E as msg` replaces the Py2-only
                                # `except E, msg` syntax.
                                logger.error('Write file excetion {} : {}'.format(
                                    msg, eps_detail))
                    break
            # Be polite to the server between shows.
            time.sleep(5)
    finally:
        result_file.close()
def fetch_tv_total_page(conf_json):
    """Probe listing pages until the reported max page stops growing.

    Each listing page reports a total page count; keep jumping to the
    reported maximum until the current page already is the maximum,
    then return it.
    """
    page_num = 1
    # Pattern is loop-invariant; compile it once.
    pattern = re.compile(conf_json["tv_total_page_pattern"])
    while True:
        content = nt.fetchUrlContent(conf_json["url"].format(page_num))
        if content is None:
            # Failed fetch: wait before retrying instead of busy-looping
            # against the server (the original spun with no delay).
            time.sleep(5)
            continue
        # Flatten the page so the regex can match across lines.
        content = content.replace('\n', '').replace('\t', '')
        match_data = re.findall(pattern, content)
        if not match_data:
            # No page-count marker found; the original indexed
            # match_data[-1] unconditionally and would raise IndexError.
            logger.error('No page info found on page {}'.format(page_num))
            return page_num
        reported_max = int(match_data[-1])
        logger.info('page_num : {}, match_data : {}'.format(page_num,
                                                            reported_max))
        if page_num >= reported_max:
            logger.info('find max page num!')
            return page_num
        page_num = max(page_num, reported_max)
def fetch_tv_url_list(conf_json, total_num):
    """Fetch TV-show list pages and write matched show rows to a CSV.

    Returns True when at least one show row was written, else False.
    """
    # Hoist loop-invariant config lookups and the regex compile.
    pattern = re.compile(conf_json["tv_show_pattern"])
    timeout = int(conf_json["timeout"])
    retry_times = int(conf_json["retry_times"])
    # BUG FIX: the original opened the SAME file a second time via
    # codecs.open(..., 'w'), truncating it and leaking the unused
    # handle (result_file was never written or closed) — removed.
    tv_show_csv = open(conf_json["tv_show_file"]["file_name"], 'wb')
    try:
        write = csv.writer(tv_show_csv)
        write.writerow(conf_json["tv_show_file"]["column_name"].split(','))
        cnt = 0
        for page_no in range(total_num):
            for i in range(retry_times):
                # The listing URL is offset-based, 20 shows per page.
                tv_list_url = conf_json["tv_list_template_url"].format(
                    page_no * 20)
                content = nt.fetchUrlContent(tv_list_url, timeout)
                if content is not None:
                    # Flatten the page so the regex can match across lines.
                    content = content.replace('\n', '').replace('\t', '')
                    match_data = re.findall(pattern, content)
                    # findall returns a list, never None; test emptiness.
                    if not match_data:
                        logger.error(
                            'Bad format for url content : {}'.format(tv_list_url))
                        break
                    write.writerows(match_data)
                    cnt = cnt + len(match_data)
                    break
            # Be polite to the server between pages.
            time.sleep(5)
        logger.info('Total tv show is : {}'.format(cnt))
        return cnt != 0
    finally:
        tv_show_csv.close()
def fetch_tv_url_list(conf):
    """Return all tv_list_pattern matches from conf["url"].

    Returns [] when the fetch fails; the original passed None straight
    into re.findall, which raises TypeError.

    NOTE(review): this shadows the two-argument fetch_tv_url_list
    defined earlier in the file — confirm which definition is intended.
    """
    content = nt.fetchUrlContent(conf["url"])
    if content is None:
        logger.error('Failed to fetch url : {}'.format(conf["url"]))
        return []
    pattern = re.compile(conf["tv_list_pattern"])
    return re.findall(pattern, content)