示例#1
0
def fetch_house_info(conf_json, total_num):
    """Fetch each listing page and print the house records matched on it.

    The CSV named by ``conf_json["house_info_file"]["file_name"]`` is
    (re)created and receives the header row built from the comma-separated
    ``column_name`` setting. Matched rows are only printed; persisting them
    to the CSV is disabled (see the commented-out ``writerows`` call).

    Args:
        conf_json: Mapping that provides ``house_info_file`` (with
            ``file_name`` and ``column_name``), ``list_template_url`` (a
            ``str.format`` template taking the 1-based page number),
            ``house_info_pattern`` (regex source), ``timeout`` and
            ``retry_times``.
        total_num: Number of pages to fetch; pages ``1..total_num`` are
            requested.

    Returns:
        None.
    """
    file_conf = conf_json["house_info_file"]
    # These do not change between pages or retries, so build them once.
    pattern = re.compile(conf_json["house_info_pattern"])
    timeout = int(conf_json["timeout"])
    retry_times = int(conf_json["retry_times"])

    # 'wb' is the mode the Python 2 csv module expects. The ``with`` block
    # guarantees the handle is closed, which the original never did.
    with open(file_conf["file_name"], 'wb') as house_info_csv:
        writer = csv.writer(house_info_csv)
        writer.writerow(file_conf["column_name"].split(','))
        for page_no in range(total_num):
            url = conf_json["list_template_url"].format(page_no + 1)
            for _ in range(retry_times):
                print('Begin fetch : {}'.format(url))
                content = nt.fetchUrlContent(url, timeout)
                if content is None:
                    # Fetch failed; use up one retry.
                    continue
                # Flatten the page so the pattern can match across lines.
                content = content.replace('\n', '').replace('\t', '')
                match_data = pattern.findall(content)
                # findall returns an empty list (never None) on no match.
                if not match_data:
                    logger.error('Bad format for url content : {}'.format(url))
                    break
                print(match_data)
                # writer.writerows(match_data)
                break
示例#2
0
def fetch_total_page(conf_json):
    """Return the total page count found on the page at ``conf_json["url"]``.

    Args:
        conf_json: Mapping that provides ``url`` and
            ``tv_total_page_pattern``, a regex whose first group captures
            the page count.

    Returns:
        The captured page count as an int, or 0 when the page could not be
        fetched or the pattern does not match.
    """
    content = nt.fetchUrlContent(conf_json["url"])
    # A None result is treated as a failed fetch; re.search would raise
    # TypeError on it, so report "no pages" instead.
    if content is None:
        return 0
    match_data = re.search(conf_json["tv_total_page_pattern"], content)
    if match_data is None:
        return 0
    return int(match_data.group(1))
示例#3
0
def fetch_episode_info(conf_json):
    """Scrape episode rows for every show listed in the tv-show CSV.

    Reads ``conf_json["tv_show_file"]["file_name"]`` (skipping its header
    row), fetches the redirect page of each show, and writes every tuple
    matched by ``conf_json["tv_episode_pattern"]`` to
    ``conf_json["tv_episode_file"]["file_name"]`` below a header row built
    from the comma-separated ``column_name`` setting. The total number of
    episode rows written is logged at the end.

    Args:
        conf_json: Mapping that provides ``tv_show_file``,
            ``tv_episode_file``, ``tv_episode_pattern``,
            ``redirct_base_url``, ``timeout`` and ``retry_times``.
    """
    with open(conf_json["tv_show_file"]["file_name"], "r") as show_fh:
        show_rows = csv.reader(show_fh, delimiter=",")
        episode_re = re.compile(conf_json['tv_episode_pattern'])
        # Discard the header line of the input CSV.
        next(show_rows)
        with open(conf_json["tv_episode_file"]["file_name"], "w") as out_fh:
            out = csv.writer(out_fh)
            header = conf_json["tv_episode_file"]["column_name"].split(',')
            out.writerow(header)
            timeout = int(conf_json["timeout"])
            eps_cnt = 0
            # Each input row must have exactly six columns; only the first
            # (the show url) is used here.
            for url, _img, _title, _count, _score_int, _score_float in show_rows:
                show_id = url.split('/')[-1]
                real_url = conf_json['redirct_base_url'] + show_id
                attempts = int(conf_json['retry_times'])
                for _ in range(attempts):
                    page = nt.fetchUrlContent(real_url, timeout)
                    if page is None:
                        # Fetch failed; try again until attempts run out.
                        continue
                    page = page.replace('\n', '').replace('\t', '')
                    found = episode_re.findall(page)
                    if not found:
                        logger.error(
                            'bad format for : {}'.format(real_url))
                    else:
                        out.writerows(found)
                        eps_cnt += len(found)
                    # A page was fetched, so no further retries are needed.
                    break
            logger.info('The number of episode is : {}'.format(eps_cnt))
def fetch_episode_info(conf, tv_url_list):
    """Fetch each show's home page and dump its episodes to a TSV file.

    For every entry of ``tv_url_list`` the show page is downloaded, split
    into episode fragments with ``conf["tv_episode_pattern"]``, and each
    fragment is parsed for title, relative url and image url. One
    tab-separated line is written per episode: show field (``tv_info[1]``),
    season number, episode number, absolute episode url, title, image url.

    Args:
        conf: Mapping that provides ``eps_info_file`` (output path),
            ``tv_episode_pattern``, ``home_base_url`` and ``retry_times``.
        tv_url_list: Iterable of indexable items where item ``[0]`` is the
            show's relative url and item ``[1]`` is written as the first
            output column.

    Returns:
        None.
    """
    logger.info("Begin to fetch_episode_info")
    # Compile once instead of once per show.
    tv_episode_pattern = re.compile(conf["tv_episode_pattern"])
    # Prefer the lazily-loaded image attribute and fall back to plain src.
    episode_org_img_pattern = re.compile(
        r'a title="(.*?)"\s+href="(.*?)">.*?data-original="([^"]+)')
    episode_src_img_pattern = re.compile(
        r'a title="(.*?)"\s+href="(.*?)">.*?src="([^"]+)')
    tv_home_base_url = conf["home_base_url"]
    retry_times = int(conf["retry_times"])

    # The context manager closes the output file even if a page blows up,
    # which the original never did.
    with codecs.open(conf["eps_info_file"], 'w', 'utf-8') as result_file:
        for tv_info in tv_url_list:
            tailer_url = tv_info[0]
            if not tailer_url.startswith('/'):
                tailer_url = '/' + tailer_url
            episode_url = tv_home_base_url + tailer_url
            for _ in range(retry_times):
                episode_page_content = nt.fetchUrlContent(episode_url)
                if episode_page_content is None:
                    # Fetch failed: back off before the next attempt.
                    time.sleep(5)
                    continue
                for eps_detail in tv_episode_pattern.findall(
                        episode_page_content):
                    match = episode_org_img_pattern.search(eps_detail)
                    if match is None:
                        match = episode_src_img_pattern.search(eps_detail)
                    if match is None:
                        logger.error(
                            'Vaild eps_detail : {}'.format(eps_detail))
                        continue
                    eps_title, eps_tailer_url, eps_img_url = match.groups()
                    if eps_tailer_url is None or eps_title is None:
                        continue
                    # The 4th path segment is split on '-' and must have at
                    # least five parts; parts [1] and [3] are the season
                    # and episode numbers. Guard against shorter urls and
                    # non-numeric parts instead of raising.
                    url_parts = eps_tailer_url.split('/')
                    eps_season_info = (url_parts[3].split('-')
                                       if len(url_parts) > 3 else [])
                    if len(eps_season_info) < 5:
                        logger.error(
                            'Bad url format : {}'.format(eps_tailer_url))
                        continue
                    try:
                        season_number = int(eps_season_info[1])
                        eps_number = int(eps_season_info[3])
                    except ValueError:
                        logger.error(
                            'Bad url format : {}'.format(eps_tailer_url))
                        continue
                    try:
                        result_file.write(
                            '{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                tv_info[1], season_number, eps_number,
                                tv_home_base_url + eps_tailer_url,
                                eps_title, eps_img_url))
                    except Exception as msg:
                        # Best effort: one bad row must not abort the dump.
                        logger.error('Write file excetion {} : {}'.format(
                            msg, eps_detail))
                break
示例#5
0
def fetch_tv_total_page(conf_json):
    """Find the highest page number by hopping to the last page shown.

    Starting at page 1, the page built from ``conf_json["url"]`` is fetched
    and the last match of ``conf_json["tv_total_page_pattern"]`` is read as
    a page number. When that number is not larger than the current page the
    current page is returned; otherwise the search jumps to that page and
    repeats.

    Args:
        conf_json: Mapping that provides ``url`` (a ``str.format`` template
            taking the page number) and ``tv_total_page_pattern``.

    Returns:
        The largest page number reached. If a page cannot be fetched or
        contains no match, an error is logged and the page number reached
        so far is returned instead of raising.
    """
    # Compile once rather than on every loop iteration.
    pattern = re.compile(conf_json["tv_total_page_pattern"])
    page_num = 1
    while True:
        url = conf_json["url"].format(page_num)
        content = nt.fetchUrlContent(url)
        if content is None:
            # re.findall would raise TypeError on None; stop cleanly instead.
            logger.error('Fetch failed for url : {}'.format(url))
            return page_num
        content = content.replace('\n', '').replace('\t', '')
        match_data = pattern.findall(content)
        if not match_data:
            # Indexing an empty list with [-1] would raise IndexError.
            logger.error('Bad format for url content : {}'.format(url))
            return page_num
        last_page = int(match_data[-1])
        logger.info('page_num : {},  match_data : {}'.format(page_num, last_page))
        if page_num >= last_page:
            logger.info('find max page num!')
            return page_num
        # last_page is strictly greater here, so jump straight to it.
        page_num = last_page
示例#6
0
def fetch_tv_url_list(conf_json, total_num):
    """Fetch every tv-list page and save the matched shows to a CSV file.

    The CSV named by ``conf_json["tv_show_file"]["file_name"]`` is
    (re)created with a header row built from the comma-separated
    ``column_name`` setting, followed by one row per match of
    ``conf_json["tv_show_pattern"]`` on each page.

    Args:
        conf_json: Mapping that provides ``tv_show_file`` (with
            ``file_name`` and ``column_name``), ``tv_list_template_url``
            (a ``str.format`` template taking an offset of 20 per page),
            ``tv_show_pattern``, ``timeout`` and ``retry_times``.
        total_num: Number of list pages to fetch.

    Returns:
        True if at least one show row was written, otherwise False.
    """
    file_conf = conf_json["tv_show_file"]
    # These do not change between pages or retries, so build them once.
    pattern = re.compile(conf_json["tv_show_pattern"])
    timeout = int(conf_json["timeout"])
    retry_times = int(conf_json["retry_times"])
    cnt = 0

    # Open the output exactly once and close it on exit. The original also
    # opened the same path through codecs.open, truncating it a second time
    # through a handle that was never used. 'wb' is the mode the Python 2
    # csv module expects.
    with open(file_conf["file_name"], 'wb') as tv_show_csv:
        writer = csv.writer(tv_show_csv)
        writer.writerow(file_conf["column_name"].split(','))
        for page_no in range(total_num):
            tv_list_url = conf_json["tv_list_template_url"].format(
                page_no * 20)
            for _ in range(retry_times):
                content = nt.fetchUrlContent(tv_list_url, timeout)
                if content is None:
                    # Fetch failed: back off before the next attempt.
                    time.sleep(5)
                    continue
                # Flatten the page so the pattern can match across lines.
                content = content.replace('\n', '').replace('\t', '')
                match_data = pattern.findall(content)
                # findall returns an empty list (never None) on no match.
                if match_data:
                    writer.writerows(match_data)
                    cnt += len(match_data)
                else:
                    logger.error(
                        'Bad format for url content : {}'.format(tv_list_url))
                break
    logger.info('Total tv show is : {}'.format(cnt))
    return cnt != 0
def fetch_tv_url_list(conf):
    """Return every match of ``conf["tv_list_pattern"]`` on ``conf["url"]``.

    Args:
        conf: Mapping that provides ``url`` and ``tv_list_pattern`` (regex
            source).

    Returns:
        The list produced by ``re.findall``; an empty list when the page
        could not be fetched.
    """
    content = nt.fetchUrlContent(conf["url"])
    # A None result is treated as a failed fetch; re.findall would raise
    # TypeError on it, so report "no shows" instead.
    if content is None:
        return []
    return re.findall(conf["tv_list_pattern"], content)