def iqc_dump_sub_category(url, main_cat, sub_cat):
    """Dump every list page of one iqc.com.tw sub category.

    Fetches the first list page at `url`, derives the page count from the
    parsed total product count (24 products per list page), then fetches
    the remaining pages, sending the previous page's URL as Referer.
    """
    printf("\n==> Start to parse category url: %s, main_cat: %s, sub_cat: %s ...", url, main_cat, sub_cat)
    global iqc_working_directory
    printf("List is %d", url.find("List"))
    # The working directory is named after the "List/..." tail of the URL.
    # NOTE(review): if "List" is absent, find() returns -1 and this slice
    # keeps only the last character — confirm all category URLs contain "List".
    iqc_working_directory = url[url.find("List"):]
    html_text = iqc_dump_list_page(url)
    # Parse the first page to learn how many pages this category has.
    if len(html_text) > 0:
        ret = iqc_parse_pages.parse_list_page(html_text, True)
        # 24 products per list page; floor division stays an int on Python 3 too.
        page_count = 0
        if ret['total_product_counts'] > 0:
            page_count = ret['total_product_counts'] // 24 + 1
        # Fetch pages 2 ~ end.  Pass the previous URL as Referer — the
        # original computed `referer` but never forwarded it to
        # iqc_dump_list_page, which accepts it.
        for i in range(1, page_count):
            referer = url
            url = 'http://iqc.com.tw/List/' + str(i) + '/'
            html_text = iqc_dump_list_page(url, referer)
def sixlucky_parse_list_dir(main_cat, sub_cat, list_dir):
    """Parse every list_* page file sitting directly in list_dir into a per-directory product db."""
    printf("\n=> Start to parse list pages in current directory: %s ...", list_dir)
    global sixlucky_working_directory
    global commodity_index
    sixlucky_working_directory = list_dir
    commodity_index = 0
    db_name = os.path.join(list_dir, "products.db")
    printf("db_name = %s" % db_name)
    root = ''
    all_files = []
    # A directory that already has its db was processed earlier — skip it.
    if os.path.exists(db_name):
        printf("db exists: %s , skip it", db_name)
        return
    # Grab only the files directly under list_dir (first matching walk entry).
    for root, dirs, files in os.walk(list_dir):
        printf("root is %s, dirs is %s, files is %s", root, str(dirs), str(files))
        if list_dir == root:
            all_files = files
            break
    for name in all_files:
        printf("parsing file: %s ...", name)
        try:
            # Only the cached list pages are of interest.
            if name.startswith("list_"):
                sixlucky_parse_list_file(main_cat, sub_cat, os.path.join(root, name), db_name)
        except Exception:
            traceback.print_exc()
def iqc_parse_commodity(barcode, ret=None):
    """Fetch and parse both iqc detail pages for one barcode.

    barcode: product barcode string appended to the detail URLs.
    ret: optional dict the parsed fields are merged into; a fresh dict is
        created per call when omitted.  (The original used a mutable default
        argument `ret = {}`, which silently shared one dict across calls.)
    Returns the dict.
    """
    if ret is None:
        ret = {}
    printf("\n===> Start to dump barcode %s ...", barcode)
    # First endpoint variant: http://iqc.com.tw/Commodities/Detail/176725
    url = 'http://iqc.com.tw/Commodities/Detail/' + barcode
    target_file = iqc_url_to_local_file(url)
    html_text = jagabee_pycurl.pycurl_wrapper_fetch(url, target_file)
    if len(html_text) > 0:
        iqc_parse_pages.parse_commodities_page(html_text, ret)
    # Second endpoint variant: http://iqc.com.tw/Commodity/Detail/176725
    url = 'http://iqc.com.tw/Commodity/Detail/' + barcode
    target_file = iqc_url_to_local_file(url)
    html_text = jagabee_pycurl.pycurl_wrapper_fetch(url, target_file)
    if len(html_text) > 0:
        iqc_parse_pages.parse_commodity_page(html_text, ret)
    return ret
def sixlucky_dump_list_page(url, main_cat="main_cat", sub_cat="sub_cat", index=0):
    """Download one sixlucky list page into its cached local file; returns the html text."""
    printf("sixlucky_dump_list_page: url = %s, main_cat = %s, sub_cat = %s " , url, main_cat, sub_cat)
    # Resolve where on disk this page should be cached.
    local_file = sixlucky_url_to_local_file(
        url, main_cat=main_cat, sub_cat=sub_cat, page_type="list", index=index)
    printf('dump %s to target_file: %s ', url, local_file)
    # Fetch the page and hand the text straight back to the caller.
    return jagabee_pycurl.pycurl_wrapper_fetch(url, local_file)
def iqc_dump_list_page(url, referer=''):
    """Download one iqc list page into its cached local file; returns the html text."""
    printf("iqc_dump_list_page: url = %s " ,url)
    # Resolve where on disk this page should be cached.
    local_file = iqc_url_to_local_file(url)
    printf('dump %s to target_file: %s , referer: %s', url, local_file, referer)
    # Fetch the page (optionally with a Referer header) and return the text.
    return jagabee_pycurl.pycurl_wrapper_fetch(url, local_file, referer)
def sixlucky_dump_and_parse_commodity(url, main_cat="main_cat", sub_cat="sub_cat", index=-1, ret=None):
    """Fetch one commodity page, cache it locally, and parse it into `ret`.

    url: commodity page URL.
    ret: optional dict the parsed fields are merged into; a fresh dict is
        created per call when omitted.  (The original used a mutable default
        argument `ret = {}`, which silently shared one dict across calls.)
    Returns the dict.
    """
    if ret is None:
        ret = {}
    target_file = sixlucky_url_to_local_file(url, main_cat, sub_cat, "commodity", index)
    printf("\n===> Start to dump commodity page url: %s to target_file: %s ...", url, target_file)
    html_text = jagabee_pycurl.pycurl_wrapper_fetch(url, target_file)
    if len(html_text) > 0:
        sixlucky_parse_pages.parse_commodity_page(html_text, ret)
    return ret
def sixlucky_url_to_local_file(url, main_cat = "main_cat", sub_cat = "sub_cat", page_type = None, index = -1):
    # Resolve and create the per-category working directory for cached pages.
    # The directory path comes from sixlucky_get_directory(main_cat, sub_cat).
    global sixlucky_working_directory
    sixlucky_working_directory = sixlucky_get_directory(main_cat, sub_cat)
    try:
        os.makedirs(sixlucky_working_directory)
    except Exception, e:
        # "Already exists" is expected and ignored; anything else is logged.
        # NOTE(review): accessing e.errno raises AttributeError for
        # non-OSError exceptions — confirm that is acceptable here.
        if e.errno != errno.EEXIST:
            printf('sixlucky_url_to_local_file mkdir exception: !!')
            traceback.print_exc()
    # NOTE(review): as visible here the function never builds or returns a
    # local file path, and `url`, `page_type`, `index` go unused, yet callers
    # (e.g. sixlucky_dump_list_page) use the return value as a target file —
    # the body appears truncated; verify against the full source.
def iqc_dump_main_category(main_cat_dict):
    """Dump each sub category listed under one iqc main-category dict."""
    try:
        main_cat = main_cat_dict['main_cat']
        for sub in main_cat_dict['sub_cats']:
            # Sub-category URLs are stored relative to the site root.
            full_url = 'http://iqc.com.tw/' + sub['url']
            iqc_dump_sub_category(full_url, main_cat, sub['sub_cat'])
    except Exception:
        printf('iqc_dump_main_category exception: !!')
        traceback.print_exc()
def sixlucky_dump_main_category(main_cat_dict):
    """Dump each sub category under one sixlucky main-category dict, pausing between them."""
    try:
        main_cat = main_cat_dict['main_cat']
        for sub in main_cat_dict['sub_cats']:
            # Sub-category URLs are stored absolute for this site.
            sixlucky_dump_sub_category(sub['url'], main_cat, sub['sub_cat'])
            time.sleep(10)  # pause between sub categories
    except Exception:
        printf('sixlucky_dump_main_category exception: !!')
        traceback.print_exc()
def print_usage(cmd):
    """Print command-line usage for the parse driver.

    cmd: program name (argv[0]) interpolated into the banner line.
    """
    # Options below mirror the getopt spec "m:srf:d:c" used by main():
    # the original text advertised "-l" for --parse-list-file, but main()
    # actually accepts "-f" ("f:"), and the "-r" option was undocumented.
    usage = '''
    -m, --parse-main-category=N    Parse main category (index = N)
    -s, --parse-sub-category       Parse sub category
    -r, --parse-sub-category-dirs  Parse sub category dirs
    -f, --parse-list-file=FILE     Parse a list file
    -d, --parse-list-dir=DIR       Parse list files in a directory
    -c, --collect-all-db           Collect all database into one
    -h, --help                     Print this usage
    '''
    printf(('\n%s usage: ' + usage), cmd)
    return
def print_usage(cmd):
    """Print command-line usage for the dump driver.

    cmd: program name (argv[0]) interpolated into the banner line.
    """
    # Fixed: the "-r" line was missing the leading "--" on its long option.
    usage = '''
    -m, --dump-main-category=N    Dump list html pages from sub categories from a main category (index = N)
    -s, --dump-sub-category       Dump list html pages from a sub category
    -r, --dump-commodity-from-sub-category-dirs    Dump commodity pages from sub category dirs
    -l, --parse-list-file=FILE    Parse a list file
    -d, --parse-list-dir=DIR      Parse list files in a directory
    -a, --collect-all-db          Collect all database into one
    -h, --help                    Print this usage
    '''
    printf(('\n%s usage: ' + usage), cmd)
    return
def iqc_url_to_local_file(url):
    # Flatten a URL into a single cache filename ("host_path_....html") and
    # make sure the working directory exists.
    global iqc_working_directory
    if url[0:7] == 'http://':
        # Replace filename
        url = url[7:]  # Remove 'http://' (7 bytes)
    elif url[0:8] == 'https://':
        url = url[8:]
    # Collapse doubled slashes, then turn path separators into underscores.
    url = string.replace(url, "//", "/")
    target_file = string.replace(url, '/', '_') + '.html'
    try:
        os.makedirs(iqc_working_directory)
    except Exception, e:
        # "Already exists" is expected and ignored; anything else is logged.
        # NOTE(review): accessing e.errno raises AttributeError for
        # non-OSError exceptions — confirm that is acceptable here.
        if e.errno != errno.EEXIST:
            printf('iqc_url_to_local_file mkdir exception: !!')
            traceback.print_exc()
    # NOTE(review): as visible here `target_file` is computed but never
    # returned, yet callers (iqc_dump_list_page, iqc_parse_commodity) use the
    # return value — the body appears truncated; verify against full source.
def sixlucky_parse_list_file(main_cat, sub_cat, list_file, db_name = 'test.db'): printf("\n==> Start to parse list file: %s ...", list_file) global commodity_index if list_file != '': f = open( list_file , 'r' ) html_text = f.read() f.close() if len(html_text) <= 0: return ret = {} ret = sixlucky_parse_pages.parse_list_page(html_text, True, ret) list_dump = {'list_file' : list_file} list_dump['commodities'] = [] for commodity_url in ret['link']: try: printf('commodity_index: %d', commodity_index) m = {'main_cat' : main_cat, 'sub_cat' : sub_cat} m = sixlucky_dump_and_parse_commodity(commodity_url, main_cat, sub_cat, commodity_index, m) list_dump['commodities'].append(m) commodity_index += 1 #if i >= 3: # break # dump leading 3 only because we're still debugging except Exception, e: printf("Parse commodity fails ...") traceback.print_exc() finally:
def sixlucky_dump_sub_category(url, main_cat, sub_cat):
    """Dump all list pages of one sixlucky sub category.

    Fetches the first list page, derives the page count from the parsed
    total product count (18 products per page, capped at 10 pages), then
    fetches the remaining pages with a random delay between requests.
    """
    printf("\n==> Start to parse category url: %s, main_cat: %s, sub_cat: %s ...", url, main_cat, sub_cat)
    global sixlucky_working_directory
    i = 0
    printf("List is %d", url.find("List"))
    # NOTE(review): if "List" is absent from the URL, find() returns -1 and
    # this slice keeps only the last character; sixlucky_url_to_local_file
    # reassigns sixlucky_working_directory anyway — confirm this line is needed.
    sixlucky_working_directory = url[url.find("List"):]
    html_text = sixlucky_dump_list_page(url, main_cat=main_cat, sub_cat=sub_cat, index=i)
    if len(html_text) <= 0:
        print("Can not get list page html_text, len = 0")
        return
    ret = sixlucky_parse_pages.parse_list_page(html_text, True, ret={})
    # Fixed: this line called print() with printf-style arguments, which
    # printed a tuple instead of a formatted message.
    printf("ret['total_product_counts'] = %d", ret['total_product_counts'])
    # 18 products per list page; floor division stays an int on Python 3 too.
    page_count = 0
    if ret['total_product_counts'] > 0:
        page_count = ret['total_product_counts'] // 18 + 1
    if page_count > 10:
        page_count = 10  # cap the crawl depth per sub category
    # Fetch pages 2 ~ end, sleeping 0-10 s between requests to stay polite.
    for i in range(1, page_count):
        time.sleep(random.random() * 10)
        url = "http://www.6lucky.com.tw/showroom/" + ret["list_link"][i]
        html_text = sixlucky_dump_list_page(url, main_cat=main_cat, sub_cat=sub_cat, index=i)
def main(argv):
    """Parse command-line options for the dump driver.

    Returns a 6-tuple: (dump_main_category, dump_sub_category,
    dump_commodity_from_sub_category_dirs, parse_list_file, parse_list_dir,
    collect_all_db).  Prints usage and exits when the options are invalid or
    when no action was selected.
    """
    dump_main_category = None
    dump_sub_category = None
    dump_commodity_from_sub_category_dirs = None
    parse_list_file = None
    parse_list_dir = None
    collect_all_db = None
    long_options = ["dump-main-category=", "dump-sub-category",
                    "dump-commodity-from-sub-category-dirs",
                    "parse-list-file", "parse-list-dir", "collect-all-db"]
    try:
        opts, _rest = getopt.getopt(argv[1:], "m:srlda", long_options)
    except getopt.GetoptError:
        printf("getopt.GetoptError: ")
        traceback.print_exc()
        print_usage(argv[0])
        sys.exit(-1)
    for opt, arg in opts:
        if opt in ("-m", "--dump-main-category"):
            dump_main_category = int(arg)  # index of main category (int)
        elif opt in ("-s", "--dump-sub-category"):
            dump_sub_category = True
        elif opt in ("-r", "--dump-commodity-from-sub-category-dirs"):
            dump_commodity_from_sub_category_dirs = True
        elif opt in ("-l", "--parse-list-file"):
            parse_list_file = True
        elif opt in ("-d", "--parse-list-dir"):
            # A dir containing whole list pages
            parse_list_dir = True
        elif opt in ("-a", "--collect-all-db"):
            collect_all_db = True
    selections = (dump_main_category, dump_sub_category,
                  dump_commodity_from_sub_category_dirs,
                  parse_list_file, parse_list_dir, collect_all_db)
    # Nothing selected at all: show usage and stop.
    if all(v is None for v in selections):
        print_usage(argv[0])
        sys.exit()
    return selections
def main(argv):
    """Parse command-line options for the parse driver.

    Returns a 6-tuple: (parse_main_category, parse_sub_category,
    parse_sub_category_dirs, parse_list_file, parse_list_dir,
    collect_all_db).  Prints usage and exits when the options are invalid or
    when no action was selected.
    """
    parse_main_category = None
    parse_sub_category = None
    parse_sub_category_dirs = None
    parse_list_file = None
    parse_list_dir = None
    collect_all_db = None
    long_options = ["parse-main-category=", "parse-sub-category",
                    "parse-sub-category-dirs", "parse-list-file=",
                    "parse-list-dir=", "collect-all-db"]
    try:
        opts, _rest = getopt.getopt(argv[1:], "m:srf:d:c", long_options)
    except getopt.GetoptError:
        printf("getopt.GetoptError: ")
        traceback.print_exc()
        print_usage(argv[0])
        sys.exit(-1)
    for opt, arg in opts:
        if opt in ("-m", "--parse-main-category"):
            parse_main_category = int(arg)  # index of main category (int)
        elif opt in ("-s", "--parse-sub-category"):
            parse_sub_category = True
        elif opt in ("-r", "--parse-sub-category-dirs"):
            parse_sub_category_dirs = True
        elif opt in ("-f", "--parse-list-file"):
            parse_list_file = arg  # file (string)
        elif opt in ("-d", "--parse-list-dir"):
            parse_list_dir = arg  # dir (string)
        elif opt in ("-c", "--collect-all-db"):
            collect_all_db = True
    selections = (parse_main_category, parse_sub_category,
                  parse_sub_category_dirs, parse_list_file,
                  parse_list_dir, collect_all_db)
    # Nothing selected at all: show usage and stop.
    if all(v is None for v in selections):
        print_usage(argv[0])
        sys.exit()
    return selections
def iqc_parse_list_file(main_cat, sub_cat, list_file, db_name = 'test.db'): printf("\n==> Start to parse list file: %s ...", list_file) if list_file != '': f = open( list_file , 'r' ) html_text = f.read() f.close() if len(html_text) <= 0: return ret = {} ret = iqc_parse_pages.parse_list_page(html_text, False, ret) i = 0 list_dump = {'list_file' : list_file} list_dump['commodities'] = [] list_dump_file = list_file + '.dict' # Dump to a dictionary for c in ret['link']: # /Commodities/Detail/171342 # printf('c=%s, type=%s, rindex of "/" is %d', str(c), str(type(c)), string.rindex(c, '/')) try: barcode = c[string.rindex(c, '/')+1:] printf('barcode: %s', barcode) if (barcode is not None) and (barcode != ''): commodity = {'barcode' : barcode, 'main_cat' : main_cat, 'sub_cat' : sub_cat} commodity = iqc_parse_commodity(barcode, commodity) list_dump['commodities'].append(commodity) i += 1 #if i >= 3: # break # dump leading 3 only because we're still debugging except Exception, e: printf("Parse commodity fails ...") traceback.print_exc() finally:
sixlucky_dump_sub_category(url, main_cat, sub_cat) pass if dump_commodity_from_sub_category_dirs is not None: # Define Sub Category Entry if True: # debug first main category m = sixlucky_categories.all_categories[10] main_cat = m['main_cat'] for s in m['sub_cats']: sub_cat = s['sub_cat'] sub_cat_dir = sixlucky_get_directory(main_cat, sub_cat) printf ("dump_commodity_from_sub_category_dirs, main_cat = %s, sub_cat = %s, dir = %s", main_cat, sub_cat, sub_cat_dir) sixlucky_parse_list_dir(main_cat, sub_cat, sub_cat_dir) else: for m in sixlucky_categories.all_categories: main_cat = m['main_cat'] printf ("parse main catetories ... %s", main_cat) for s in m['sub_cats']: sub_cat = s['sub_cat'] sub_cat_dir = sixlucky_get_directory(main_cat, sub_cat) printf ("dump_commodity_from_sub_category_dirs, main_cat = %s, sub_cat = %s, dir = %s", main_cat, sub_cat, sub_cat_dir) sixlucky_parse_list_dir(main_cat, sub_cat, sub_cat_dir)