Exemplo n.º 1
0
def iqc_dump_sub_category(url, main_cat, sub_cat):
    """Download the list pages of one IQC sub category.

    Fetches the list page at ``url``, parses it to learn the total product
    count, derives a page count from it (24 products per page) and fetches
    the follow-up list pages.

    Args:
        url: URL of the first list page of the sub category.
        main_cat: Main category name; only logged by this function.
        sub_cat: Sub category name; only logged by this function.

    Side effects:
        Rebinds the module-level ``iqc_working_directory`` to the part of
        ``url`` that starts at "List".
    """
    global iqc_working_directory

    printf("\n==> Start to parse category url: %s, main_cat: %s, sub_cat: %s ...", url, main_cat, sub_cat)

    # NOTE(review): when "List" is absent, find() returns -1 and the slice
    # below keeps only the last character of url -- confirm this is intended.
    list_pos = url.find("List")
    printf("List is %d", list_pos)
    iqc_working_directory = url[list_pos:]

    html_text = iqc_dump_list_page(url)

    # Without a first page there is nothing to parse; previously this path
    # fell through and raised NameError on the unbound ``ret``.
    if len(html_text) <= 0:
        printf("Can not get list page html_text, len = 0")
        return

    ret = iqc_parse_pages.parse_list_page(html_text, True)

    # Calculate pages in this category (floor division keeps the result an
    # int on every Python version, which range() below requires).
    page_count = 0
    if ret['total_product_counts'] > 0:
        page_count = ret['total_product_counts'] // 24 + 1

    # Fetch the follow-up list pages
    for i in range(1, page_count):
        # NOTE(review): referer is computed but not forwarded to
        # iqc_dump_list_page in this code -- confirm whether it should be.
        referer = url
        url = 'http://iqc.com.tw/List/' + str(i) + '/'
        html_text = iqc_dump_list_page(url)
Exemplo n.º 2
0
def sixlucky_parse_list_dir(main_cat, sub_cat, list_dir):
    """Parse every ``list_*`` file found directly inside ``list_dir``.

    Args:
        main_cat: Main category name, forwarded to sixlucky_parse_list_file.
        sub_cat: Sub category name, forwarded to sixlucky_parse_list_file.
        list_dir: Directory holding the downloaded list pages.

    Side effects:
        Rebinds the module-level ``sixlucky_working_directory`` to
        ``list_dir`` and resets the module-level ``commodity_index`` to 0.
        Does nothing further when ``<list_dir>/products.db`` already exists.
    """
    global sixlucky_working_directory
    global commodity_index

    printf("\n=> Start to parse list pages in current directory: %s ...", list_dir)

    sixlucky_working_directory = list_dir
    commodity_index = 0

    db_name = os.path.join(list_dir, "products.db")
    # Pass the value as an argument like every other printf call, instead of
    # pre-formatting it into the format string.
    printf("db_name = %s", db_name)

    if os.path.exists(db_name):
        printf("db exists: %s , skip it", db_name)
        return

    # Only the files of the top-level directory are of interest, so stop
    # walking as soon as that entry has been seen.
    all_files = []
    root = ''
    for root, dirs, files in os.walk(list_dir):
        printf("root is %s, dirs is %s, files is %s", root, str(dirs), str(files))
        if list_dir == root:
            all_files = files
            break

    for f in all_files:
        printf("parsing file: %s ...", f)
        if not f.startswith("list_"):
            continue

        # Best effort: a failure in one list file is logged and must not
        # stop the remaining files from being parsed.
        try:
            sixlucky_parse_list_file(main_cat, sub_cat, os.path.join(root, f), db_name)
        except Exception:
            traceback.print_exc()
Exemplo n.º 3
0
def iqc_parse_commodity(barcode, ret = None):
    """Fetch and parse both IQC detail pages of one commodity.

    Args:
        barcode: Commodity identifier (string) appended to the detail URLs.
        ret: Optional dict handed to the page parsers. A fresh dict is
            created per call when omitted, so results of separate calls are
            no longer accumulated in one shared default object.

    Returns:
        The ``ret`` dict that was passed to the page parsers.
    """
    if ret is None:
        ret = {}

    printf("\n===> Start to dump barcode %s ...", barcode)

    # http://iqc.com.tw/Commodities/Detail/176725
    url = 'http://iqc.com.tw/Commodities/Detail/' + barcode
    target_file = iqc_url_to_local_file(url)
    html_text = jagabee_pycurl.pycurl_wrapper_fetch(url, target_file)

    if len(html_text) > 0:
        iqc_parse_pages.parse_commodities_page(html_text, ret)

    # http://iqc.com.tw/Commodity/Detail/176725
    url = 'http://iqc.com.tw/Commodity/Detail/' + barcode
    target_file = iqc_url_to_local_file(url)
    html_text = jagabee_pycurl.pycurl_wrapper_fetch(url, target_file)

    if len(html_text) > 0:
        iqc_parse_pages.parse_commodity_page(html_text, ret)

    return ret
Exemplo n.º 4
0
def sixlucky_dump_list_page(url, main_cat = "main_cat", sub_cat = "sub_cat", index = 0):
    """Fetch one 6lucky list page.

    Args:
        url: URL of the list page.
        main_cat: Main category name used to derive the local file.
        sub_cat: Sub category name used to derive the local file.
        index: Page index used to derive the local file.

    Returns:
        Whatever ``jagabee_pycurl.pycurl_wrapper_fetch`` returns for the
        page (callers treat it as the HTML text).
    """
    printf("sixlucky_dump_list_page: url = %s, main_cat = %s, sub_cat = %s " , url, main_cat, sub_cat)

    # Resolve where this list page belongs locally
    local_path = sixlucky_url_to_local_file(url, main_cat, sub_cat, "list", index)
    printf('dump %s to target_file: %s ', url, local_path)

    # Fetch html page and hand the result straight back
    return jagabee_pycurl.pycurl_wrapper_fetch(url, local_path)
Exemplo n.º 5
0
def iqc_dump_list_page(url, referer = ''):
    """Fetch one IQC list page.

    Args:
        url: URL of the list page.
        referer: Referer value forwarded to the fetch helper (empty string
            by default).

    Returns:
        Whatever ``jagabee_pycurl.pycurl_wrapper_fetch`` returns for the
        page (callers treat it as the HTML text).
    """
    printf("iqc_dump_list_page: url = %s " ,url)

    # Resolve where this list page belongs locally
    local_path = iqc_url_to_local_file(url)
    printf('dump %s to target_file: %s , referer: %s', url, local_path, referer)

    # Fetch html page and hand the result straight back
    return jagabee_pycurl.pycurl_wrapper_fetch(url, local_path, referer)
Exemplo n.º 6
0
def sixlucky_dump_and_parse_commodity(url, main_cat = "main_cat", sub_cat = "sub_cat", index = -1, ret = None):
    """Fetch one 6lucky commodity page and parse it.

    Args:
        url: URL of the commodity page.
        main_cat: Main category name used to derive the local file.
        sub_cat: Sub category name used to derive the local file.
        index: Commodity index used to derive the local file.
        ret: Optional dict handed to the page parser. A fresh dict is
            created per call when omitted, so results of separate calls are
            no longer accumulated in one shared default object.

    Returns:
        The ``ret`` dict that was passed to the page parser.
    """
    if ret is None:
        ret = {}

    target_file = sixlucky_url_to_local_file(url, main_cat, sub_cat, "commodity", index)

    printf("\n===> Start to dump commodity page url: %s to target_file: %s ...", url, target_file)

    html_text = jagabee_pycurl.pycurl_wrapper_fetch(url, target_file)

    # An empty response leaves ret untouched
    if len(html_text) > 0:
        sixlucky_parse_pages.parse_commodity_page(html_text, ret)

    return ret
Exemplo n.º 7
0
def sixlucky_url_to_local_file(url, main_cat = "main_cat", sub_cat = "sub_cat", page_type = None, index = -1):
    """Prepare the working directory for a 6lucky page download.

    Rebinds the module-level ``sixlucky_working_directory`` to the directory
    returned by ``sixlucky_get_directory(main_cat, sub_cat)`` and makes sure
    that directory exists.

    NOTE(review): callers use the return value as a target file path, but
    this code returns nothing and never reads ``url``, ``page_type`` or
    ``index`` -- confirm against the complete function.

    Args:
        url: URL of the page to be stored.
        main_cat: Main category name selecting the directory.
        sub_cat: Sub category name selecting the directory.
        page_type: Kind of page (callers pass "list" or "commodity").
        index: Page/commodity index supplied by the caller.
    """
    global sixlucky_working_directory

    sixlucky_working_directory = sixlucky_get_directory(main_cat, sub_cat)

    try:
        os.makedirs(sixlucky_working_directory)
    except OSError as e:
        # An already existing directory is the normal case; anything else
        # is logged. Catching OSError (what makedirs raises) guarantees the
        # errno attribute is present.
        if e.errno != errno.EEXIST:
            printf('sixlucky_url_to_local_file mkdir exception: !!')
            traceback.print_exc()
Exemplo n.º 8
0
def iqc_dump_main_category(main_cat_dict):
    """Dump every sub category of one IQC main category.

    Args:
        main_cat_dict: Dict with a ``'main_cat'`` entry and a ``'sub_cats'``
            iterable; each sub category dict provides ``'url'`` (appended to
            ``http://iqc.com.tw/``) and ``'sub_cat'``.

    Any exception is logged with its traceback and ends processing of the
    remaining sub categories; nothing is re-raised.
    """
    try:
        main_cat = main_cat_dict['main_cat']

        for sub_cat_dict in main_cat_dict['sub_cats']:
            url = 'http://iqc.com.tw/' + sub_cat_dict['url']
            sub_cat = sub_cat_dict['sub_cat']
            iqc_dump_sub_category(url, main_cat, sub_cat)

    except Exception:
        # Top-level boundary of the dump: log and carry on with the caller
        printf('iqc_dump_main_category exception: !!')
        traceback.print_exc()
Exemplo n.º 9
0
def sixlucky_dump_main_category(main_cat_dict):
    """Dump every sub category of one 6lucky main category.

    Args:
        main_cat_dict: Dict with a ``'main_cat'`` entry and a ``'sub_cats'``
            iterable; each sub category dict provides ``'url'`` and
            ``'sub_cat'``.

    Sleeps 10 seconds after each sub category. Any exception is logged with
    its traceback and ends processing of the remaining sub categories;
    nothing is re-raised.
    """
    try:
        main_cat = main_cat_dict['main_cat']

        for sub_cat_dict in main_cat_dict['sub_cats']:
            url = sub_cat_dict['url']
            sub_cat = sub_cat_dict['sub_cat']
            sixlucky_dump_sub_category(url, main_cat, sub_cat)
            # Pause between sub categories
            time.sleep(10)

    except Exception:
        # Top-level boundary of the dump: log and carry on with the caller
        printf('sixlucky_dump_main_category exception: !!')
        traceback.print_exc()
Exemplo n.º 10
0
def print_usage(cmd):
    """Print the command line usage.

    The option list mirrors the getopt specification of the ``parse-*``
    ``main()`` ("m:srf:d:c"): the list-file option is ``-f`` and ``-r``
    selects ``--parse-sub-category-dirs``.

    Args:
        cmd: Program name shown in front of the usage text.
    """
    usage = '''
        -m, --parse-main-category=N
            Parse main category (index = N)
        -s, --parse-sub-category
            Parse sub category
        -r, --parse-sub-category-dirs
            Parse sub category dirs
        -f, --parse-list-file=FILE
            Parse a list file
        -d, --parse-list-dir=DIR
            Parse list files in a directory
        -c, --collect-all-db
            Collect all database into one
        -h, --help
            Print this usage
    '''

    printf(('\n%s usage: ' + usage), cmd)
    return
Exemplo n.º 11
0
def print_usage(cmd):
    """Print the command line usage.

    Args:
        cmd: Program name shown in front of the usage text.
    """
    # NOTE(review): the dump-* main() declares -l/--parse-list-file and
    # -d/--parse-list-dir without an argument, while the text below
    # advertises FILE/DIR values -- confirm which side is intended.
    usage = '''
        -m, --dump-main-category=N
            Dump list html pages from sub categories from a main category (index = N)
        -s, --dump-sub-category
            Dump list html pages from a sub category
        -r, --dump-commodity-from-sub-category-dirs
            Dump commodity pages from sub category dirs
        -l, --parse-list-file=FILE
            Parse a list file
        -d, --parse-list-dir=DIR
            Parse list files in a directory
        -a, --collect-all-db
            Collect all database into one
        -h, --help
            Print this usage
    '''

    printf(('\n%s usage: ' + usage), cmd)
    return
Exemplo n.º 12
0
def iqc_url_to_local_file(url):
    """Derive a flat local file name from ``url`` and prepare the directory.

    The scheme prefix (``http://`` or ``https://``) is dropped, doubled
    slashes are collapsed, remaining slashes become underscores and
    ``.html`` is appended. The directory named by the module-level
    ``iqc_working_directory`` is created when missing.

    NOTE(review): callers use the return value as a target file path, but
    this code computes ``target_file`` without returning it -- confirm
    against the complete function.

    Args:
        url: URL of the page to be stored.
    """
    global iqc_working_directory

    # Strip the scheme so it does not end up in the file name
    if url.startswith('http://'):
        url = url[7:]
    elif url.startswith('https://'):
        url = url[8:]

    # str methods replace the Python-2-only string.replace() helpers
    url = url.replace("//", "/")
    target_file = url.replace('/', '_') + '.html'

    try:
        os.makedirs(iqc_working_directory)
    except OSError as e:
        # An already existing directory is the normal case; anything else
        # is logged. Catching OSError (what makedirs raises) guarantees the
        # errno attribute is present.
        if e.errno != errno.EEXIST:
            printf('iqc_url_to_local_file mkdir exception: !!')
            traceback.print_exc()
Exemplo n.º 13
0
def sixlucky_parse_list_file(main_cat, sub_cat, list_file, db_name = 'test.db'):
    
    printf("\n==> Start to parse list file: %s ...", list_file)

    global commodity_index

    if list_file != '':
        f = open( list_file , 'r' )
        html_text = f.read()
        f.close()

    if len(html_text) <= 0:
        return

    ret = {}
    ret = sixlucky_parse_pages.parse_list_page(html_text, True, ret)

    list_dump = {'list_file' : list_file}
    list_dump['commodities'] = []

    for commodity_url in ret['link']:
        try:
            printf('commodity_index: %d', commodity_index)
            m = {'main_cat' : main_cat, 'sub_cat' : sub_cat}
            m = sixlucky_dump_and_parse_commodity(commodity_url, main_cat, sub_cat, commodity_index, m)
            list_dump['commodities'].append(m)

            commodity_index += 1
            #if i >= 3:
            #    break # dump leading 3 only because we're still debugging

        except Exception, e:
            printf("Parse commodity fails ...")
            traceback.print_exc()
        finally:
Exemplo n.º 14
0
def sixlucky_dump_sub_category(url, main_cat, sub_cat):
    """Download the list pages of one 6lucky sub category.

    Fetches the list page at ``url``, parses it to learn the total product
    count, derives a page count from it (18 products per page, capped at 10
    pages) and fetches the follow-up pages named in ``ret["list_link"]``,
    sleeping a random time of up to 10 seconds before each fetch.

    Args:
        url: URL of the first list page of the sub category.
        main_cat: Main category name, forwarded to sixlucky_dump_list_page.
        sub_cat: Sub category name, forwarded to sixlucky_dump_list_page.

    Side effects:
        Rebinds the module-level ``sixlucky_working_directory`` to the part
        of ``url`` that starts at "List".
    """
    global sixlucky_working_directory

    printf("\n==> Start to parse category url: %s, main_cat: %s, sub_cat: %s ...", url, main_cat, sub_cat)

    i = 0

    # NOTE(review): when "List" is absent, find() returns -1 and the slice
    # below keeps only the last character of url -- confirm this is intended.
    list_pos = url.find("List")
    printf("List is %d", list_pos)
    sixlucky_working_directory = url[list_pos:]

    html_text = sixlucky_dump_list_page(url, main_cat = main_cat, sub_cat = sub_cat, index = i)

    # Parse html page
    if len(html_text) <= 0:
        print("Can not get list page html_text, len = 0")
        return

    ret = sixlucky_parse_pages.parse_list_page(html_text, True, ret={})

    # printf applies the format; plain print() emitted the raw "%d" pattern
    printf("ret['total_product_counts'] = %d", ret['total_product_counts'])

    # Calculate pages in this category (floor division keeps the result an
    # int on every Python version, which range() below requires).
    page_count = 0
    if ret['total_product_counts'] > 0:
        page_count = ret['total_product_counts'] // 18 + 1

    # Never fetch more than 10 pages per sub category
    page_count = min(page_count, 10)

    # Fetch the follow-up list pages
    for i in range(1, page_count):
        time.sleep(random.random()*10)
        url = "http://www.6lucky.com.tw/showroom/" + ret["list_link"][i]
        html_text = sixlucky_dump_list_page(url, main_cat = main_cat, sub_cat = sub_cat, index = i)
Exemplo n.º 15
0
def main(argv):
    """Parse the command line and report which dump/parse actions were asked for.

    Args:
        argv: Full argument vector; ``argv[0]`` is only used for the usage
            message.

    Returns:
        The 6-tuple ``(dump_main_category, dump_sub_category,
        dump_commodity_from_sub_category_dirs, parse_list_file,
        parse_list_dir, collect_all_db)``. An entry is ``None`` when its
        option was not given; ``dump_main_category`` is the int index given
        with ``-m`` and every other requested entry is ``True``.

    Exits with status -1 after printing the usage when getopt rejects the
    arguments, and exits with the default status after printing the usage
    when no action was selected.
    """
    short_spec = "m:srlda"
    long_spec = [
        "dump-main-category=",
        "dump-sub-category",
        "dump-commodity-from-sub-category-dirs",
        "parse-list-file",
        "parse-list-dir",                 # A dir containing whole list pages
        "collect-all-db",
    ]

    try:
        opts, other_args = getopt.getopt(argv[1:], short_spec, long_spec)

    except getopt.GetoptError:
        printf("getopt.GetoptError: ")
        traceback.print_exc()
        print_usage(argv[0])
        sys.exit(-1)

    # Position in the result tuple for every accepted option spelling
    slot_by_option = {
        "-m": 0, "--dump-main-category": 0,
        "-s": 1, "--dump-sub-category": 1,
        "-r": 2, "--dump-commodity-from-sub-category-dirs": 2,
        "-l": 3, "--parse-list-file": 3,
        "-d": 4, "--parse-list-dir": 4,
        "-a": 5, "--collect-all-db": 5,
    }

    selected = [None] * 6
    for opt, arg in opts:
        slot = slot_by_option.get(opt)
        if slot is None:
            continue
        if slot == 0:
            selected[slot] = int(arg)       # index of main category (int)
        else:
            selected[slot] = True

    # Nothing requested at all: show the usage and stop
    if all(value is None for value in selected):
        print_usage(argv[0])
        sys.exit()

    return tuple(selected)
Exemplo n.º 16
0
def main(argv):
    """Parse the command line and report which parse actions were asked for.

    Args:
        argv: Full argument vector; ``argv[0]`` is only used for the usage
            message.

    Returns:
        The 6-tuple ``(parse_main_category, parse_sub_category,
        parse_sub_category_dirs, parse_list_file, parse_list_dir,
        collect_all_db)``. An entry is ``None`` when its option was not
        given; ``parse_main_category`` is the int index given with ``-m``,
        ``parse_list_file`` / ``parse_list_dir`` carry the option's string
        argument, and every other requested entry is ``True``.

    Exits with status -1 after printing the usage when getopt rejects the
    arguments, and exits with the default status after printing the usage
    when no action was selected.
    """
    short_spec = "m:srf:d:c"
    long_spec = [
        "parse-main-category=",
        "parse-sub-category",
        "parse-sub-category-dirs",
        "parse-list-file=",
        "parse-list-dir=",
        "collect-all-db",
    ]

    try:
        opts, other_args = getopt.getopt(argv[1:], short_spec, long_spec)

    except getopt.GetoptError:
        printf("getopt.GetoptError: ")
        traceback.print_exc()
        print_usage(argv[0])
        sys.exit(-1)

    # Position in the result tuple for every accepted option spelling
    slot_by_option = {
        "-m": 0, "--parse-main-category": 0,
        "-s": 1, "--parse-sub-category": 1,
        "-r": 2, "--parse-sub-category-dirs": 2,
        "-f": 3, "--parse-list-file": 3,
        "-d": 4, "--parse-list-dir": 4,
        "-c": 5, "--collect-all-db": 5,
    }
    # Slots whose value is the option's own string argument
    string_slots = (3, 4)

    selected = [None] * 6
    for opt, arg in opts:
        slot = slot_by_option.get(opt)
        if slot is None:
            continue
        if slot == 0:
            selected[slot] = int(arg)       # index of main category (int)
        elif slot in string_slots:
            selected[slot] = arg            # file / dir (string)
        else:
            selected[slot] = True

    # Nothing requested at all: show the usage and stop
    if all(value is None for value in selected):
        print_usage(argv[0])
        sys.exit()

    return tuple(selected)
Exemplo n.º 17
0
def iqc_parse_list_file(main_cat, sub_cat, list_file, db_name = 'test.db'):
    
    printf("\n==> Start to parse list file: %s ...", list_file)

    if list_file != '':
        f = open( list_file , 'r' )
        html_text = f.read()
        f.close()

    if len(html_text) <= 0:
        return

    ret = {}
    ret = iqc_parse_pages.parse_list_page(html_text, False, ret)

    i = 0

    list_dump = {'list_file' : list_file}
    list_dump['commodities'] = []
    list_dump_file = list_file + '.dict'    # Dump to a dictionary

    for c in ret['link']:
        # /Commodities/Detail/171342
        # printf('c=%s, type=%s, rindex of "/" is %d', str(c), str(type(c)), string.rindex(c, '/'))
        try:
            barcode = c[string.rindex(c, '/')+1:]
            printf('barcode: %s', barcode)
            if (barcode is not None) and (barcode != ''):
                commodity = {'barcode' : barcode, 'main_cat' : main_cat, 'sub_cat' : sub_cat}
                commodity = iqc_parse_commodity(barcode, commodity)
                list_dump['commodities'].append(commodity)

            i += 1
            #if i >= 3:
            #    break # dump leading 3 only because we're still debugging

        except Exception, e:
            printf("Parse commodity fails ...")
            traceback.print_exc()
        finally:
Exemplo n.º 18
0
            sixlucky_dump_sub_category(url, main_cat, sub_cat)
            pass

        if dump_commodity_from_sub_category_dirs is not None:
            # Define Sub Category Entry

            if True:  # debug first main category
                m = sixlucky_categories.all_categories[10]
                main_cat = m['main_cat']

                for s in m['sub_cats']:
                    sub_cat = s['sub_cat']
                    sub_cat_dir = sixlucky_get_directory(main_cat, sub_cat)
        
                    printf ("dump_commodity_from_sub_category_dirs, main_cat = %s, sub_cat = %s, dir = %s", main_cat, sub_cat, sub_cat_dir)
                    sixlucky_parse_list_dir(main_cat, sub_cat, sub_cat_dir)

            else:

                for m in sixlucky_categories.all_categories:
                    
                    main_cat = m['main_cat']
                    printf ("parse main catetories ... %s", main_cat)

                    for s in m['sub_cats']:
                        sub_cat = s['sub_cat']
                        sub_cat_dir = sixlucky_get_directory(main_cat, sub_cat)
        
                        printf ("dump_commodity_from_sub_category_dirs, main_cat = %s, sub_cat = %s, dir = %s", main_cat, sub_cat, sub_cat_dir)
                        sixlucky_parse_list_dir(main_cat, sub_cat, sub_cat_dir)