def scrapeDetails(curUrl):
    """Scrape one device detail page and upsert a row into ``TFiles``.

    The page at ``curUrl`` is loaded with ``pq``; its ``#content`` element is
    converted to text by ``elmToMd`` and scanned line by line for the
    breadcrumb (brand / model), the ``details:`` line (product name), the
    ``hardware type:`` line (category) and the free-text description.  Every
    ``name | value`` row of the ``.tblight`` table is collected into an
    ordered property map; a handful of well-known properties are additionally
    stored in dedicated columns.

    Reads the module-level ``prevTrail`` and stores ``str(prevTrail)`` as the
    row's ``tree_trail``.

    Any exception raised while scraping is caught here: the function drops
    into ``ipdb`` and then prints the traceback.  Nothing is re-raised.

    :param curUrl: URL of the detail page.
    :returns: ``None``.
    """
    global prevTrail
    ulog('curUrl= '+curUrl)
    try:
        d = pq(curUrl)
        md = elmToMd(d('#content')[0], True, True)
        mdl = md.splitlines()

        # The breadcrumb is the first line starting with 'Home'; it is
        # '» '-separated and fields 3 and 4 carry brand and model.
        i = next(idx for idx, line in enumerate(mdl)
                 if line.strip().startswith('Home'))
        brmd = [_.strip() for _ in mdl[i].split('» ')]
        brand = brmd[3]
        model = brmd[4].replace('Details', '').strip()

        # Product name, e.g. "Dual-band wireless-AC3100 gigabit router".
        i = findLineIdxWith(mdl, i+1,
                            lambda _: _.strip().startswith('details:'))
        prodName = mdl[i].split(' |')[1].strip()

        # Category, e.g. "Wireless Router".
        i = findLineIdxWith(mdl, i+1,
                            lambda _: _.strip().startswith('hardware type:'))
        category = mdl[i].split(' | ')[1].strip()

        # The description spans from the first non-empty line after the
        # blank line that follows the details, up to (but excluding) the
        # "All <brand> products" line.
        i = findLineIdxWith(mdl, i+1, lambda _: not _.strip())
        i = findLineIdxWith(mdl, i+1, lambda _: _.strip())
        j = findLineIdxWith(mdl, i+1,
                            lambda _: re.match(r'All .+ products$', _.strip()))
        description = '\n'.join(_.strip() for _ in mdl[i:j] if _.strip())

        # NOTE: the names below are bound *by name* through ``locals()`` in
        # the ``sql``/``uprint`` calls at the end -- do not rename them.
        default_user_name = None
        default_password = None
        wifi_proto = None
        availability = None
        product_page = None
        hw_fla1_amount = None
        hw_ram1_amount = None

        pr = OrderedDict()
        for tr in d('.tblight tr'):
            row_md = elmToMd(tr, False, False)
            if ' | ' not in row_md:
                continue
            n, v = [_.strip() for _ in row_md.split(' | ', 1)]
            v = '\n'.join(_.strip() for _ in v.splitlines())
            # Normalise the key *before* the duplicate check: keys are stored
            # without the trailing colon, so checking the raw name could
            # never detect a repeated property.
            n = n.rstrip(':')
            assert n not in pr, 'duplicate property: %s' % n
            if v:
                v0 = v[0]
                if v0 == '!':
                    # "![yes]..." / "![no]..." (apparently an image whose
                    # bracketed text is yes/no) becomes "true" / "false".
                    yn = re.search(r'!\[(.+?)\]', v).group(1)
                    if yn == 'yes':
                        v = "true"
                    elif yn == 'no':
                        v = "false"
                    else:
                        # Unexpected bracketed text: stop for inspection.
                        ipdb.set_trace()
                        uprint(yn)
                elif v0 == '<':
                    # "<http://router.asus.com>" -> "http://router.asus.com"
                    v = re.search(r'<(.+?)>', v).group(1).strip()
                elif v0 == '[':
                    # "[text](target ...)": keep the first whitespace-separated
                    # token inside the parentheses and drop backslashes.
                    hreftitle = re.search(r'\((.+?)(?<!\\)\)', v).group(1)
                    v = hreftitle.split()[0]
                    v = v.replace('\\', '')
                else:
                    # Plain values seen so far:
                    #   'Transmit Power: |  +30 dBm'
                    #   'Receiver Sensitivity: |  -76 dBm'
                    #   'Street price: |  $52'
                    #   'Default admin password: |  (blank)'
                    # Anything else is unexpected: stop for inspection.
                    looks_plain = (v0.isalnum() or v0 in "+-$"
                                   or (v0 == '(' and v[-1] == ')'))
                    if not looks_plain:
                        ipdb.set_trace()

            # Every property goes into the hstore map; some are also copied
            # (possibly converted) into their own columns.
            pr[n] = v
            if n == 'Default admin username':
                default_user_name = convertUserPassword(v)
            elif n == 'Default admin password':
                default_password = convertUserPassword(v)
            elif n == 'WiFi standards supported':
                wifi_proto = abgnac_format(v)
            elif n == 'Availability':
                availability = v
            elif n == 'Product page':
                product_page = v
            elif n == 'Flash Memory':
                assert re.match(r'[\d]*\.?\d+\s*(Mb|Kb)', v, re.I)
                hw_fla1_amount = v
            elif n == 'RAM':
                assert re.match(r'[\d]*\.?\d+\s*(Mb|Kb)', v, re.I)
                hw_ram1_amount = v

        props_hstore = dict2hstore(pr)
        img = d('a.piframe img')[0]
        image_url = urlChangePath(d.base_url, img.attrib['src'])
        trailStr = str(prevTrail)

        sql("INSERT OR REPLACE INTO TFiles(brand,model,prod_name,category"
            ", default_user_name, default_password, wifi_proto, availability"
            ", description, product_page, hw_fla1_amount, hw_ram1_amount"
            ", image_url, props_hstore, tree_trail) VALUES "
            "(:brand,:model,:prodName,:category"
            ",:default_user_name,:default_password,:wifi_proto,:availability"
            ",:description,:product_page,:hw_fla1_amount,:hw_ram1_amount"
            ",:image_url,:props_hstore,:trailStr)", locals())
        uprint('UPSERT "%(brand)s", "%(model)s", \'%(props_hstore)s\''
                ', %(trailStr)s '% locals())
    except Exception as ex:
        # ``ex`` is kept so it can be inspected from the debugger prompt.
        ipdb.set_trace()
        traceback.print_exc()
# Example no. 2
# 0
def detailScraper(baseUrl):
    """Scrape an Arris product detail page and upsert its files into ``TFiles``.

    URLs that do not match the Arris pattern are logged and skipped.  For a
    matching page the function extracts the device description (second
    ``div.row``), the model name (first line of that description), the
    ``.specTbl`` rows as an hstore, and the product image URL from the
    ``.box.boxProduct`` inline style.  It then walks the file entries under
    ``#panel4`` starting at ``getStartIdx()``:

    * entries announcing "No ... Available", entries without any link, and
      entries without a link whose text starts with ``Download`` only upsert
      the model via ``upsertModel``;
    * otherwise the download link is stored with ``INSERT OR REPLACE``; when
      the link is not a CDN URL (per ``fileUrlIsCdn``) it is additionally
      handed to ``faqScraper``.

    Reads the module-level ``prevTrail`` to build each row's ``tree_trail``.

    Any exception is caught here: the function drops into ``ipdb`` and then
    prints the traceback.  Nothing is re-raised.

    :param baseUrl: URL of the product detail page.
    :returns: ``None``.
    """
    global prevTrail
    try:
        ulog('baseUrl= '+baseUrl)
        # OK:  http://arris.force.com/consumers/ConsumerProductDetail_Ja?p=a0ha000000Rx4I4AAJ&c=Touchstone%20Modems%20and%20Gateways
        # Not: http://shop.surfboard.com/
        if not re.match(r'(http|https)://.*arris\..+\.com/.+', baseUrl):
            ulog('Not arris.force.com')
            return
        d = pq(url=baseUrl)
        try:
            dev_desc = elmToMd(d('div.row')[1])
        except IndexError:
            ulog('no model to harvest')
            return

        # Drop one leading '+' from every line and trim whitespace.
        # ``count`` is passed by keyword: positional use is deprecated.
        dev_desc = '\n'.join(re.sub(r'^\+', '', _, count=1).strip()
                             for _ in dev_desc.splitlines())
        model = dev_desc.splitlines()[0].strip()
        assert model
        ulog('model= '+model)

        # Each spec row's text holds the name on its first line and the
        # value on its second line.
        spec_rows = [_.text_content().strip() for _ in d('.specTbl tr')]
        dev_hstore = dict2hstore(OrderedDict(
            (lines[0].strip(), lines[1].strip())
            for lines in (row.splitlines() for row in spec_rows)))

        image_url = d('.box.boxProduct')[0].attrib['style']
        # "background: url(https://arris--c.na13.content.force.com/servlet/servlet.ImageServer?id=015a0000003NYHt&oid=00D30000000kUAL&lastMod=1442430676000);"
        image_url = re.search(r'url\((.+)(?<!\\)\)', image_url).group(1)
        assert fileUrlIsCdn(image_url)

        files = d('#panel4 .small-12.columns:not(.text-center)')
        numFiles = len(files)
        ulog('numFiles=%s'%numFiles)
        if not numFiles:
            upsertModel(model, image_url, dev_desc, dev_hstore, baseUrl, str(prevTrail))
            return

        startIdx = getStartIdx()
        for idx in range(startIdx, numFiles):
            # The file name is the first non-empty line of the entry's text.
            file_name = '\n'.join(_.strip() for _ in files[idx].text_content().splitlines() if _.strip())
            file_name = file_name.splitlines()[0].strip()
            ulog('file_name="%s"'%file_name)
            if re.match(r'No .+ Available', file_name, re.I):
                upsertModel(model, image_url, dev_desc, dev_hstore, baseUrl, str(prevTrail))
                continue

            # Use the version-looking token when there is one, otherwise the
            # whole file name.
            ver_match = re.search(r"\d\.[\w\.\-]+", file_name)
            fw_ver = ver_match.group(0) if ver_match else file_name

            file_urls = files[idx].cssselect('a')
            if not file_urls:
                ulog('No files')
                upsertModel(model, image_url, dev_desc, dev_hstore, baseUrl, str(prevTrail))
                continue
            # A default keeps a missing "Download" link from raising
            # StopIteration, which would abort the remaining entries.
            file_url = next(
                (_.attrib['href'] for _ in file_urls
                 if _.text_content().strip().startswith('Download')),
                None)
            if file_url is None:
                ulog('No download link')
                upsertModel(model, image_url, dev_desc, dev_hstore, baseUrl, str(prevTrail))
                continue
            if not fileUrlIsCdn(file_url):
                # NOTE(review): the row below is still written with the
                # non-CDN URL after delegating -- confirm this is intended.
                faqScraper(file_url, model, image_url, dev_desc, dev_hstore)
            tree_trail = str(prevTrail+[idx])
            # Local names are bound *by name* through ``locals()`` here.
            sql("INSERT OR REPLACE INTO TFiles (model, image_url, dev_desc, dev_hstore, fw_ver, page_url, file_url, tree_trail) VALUES (:model, :image_url, :dev_desc, :dev_hstore, :fw_ver, :baseUrl, :file_url, :tree_trail)", locals())
            uprint('UPSERT "%(model)s", "%(fw_ver)s", %(tree_trail)s, %(file_url)s '%locals())
    except Exception as ex:
        # ``ex`` is kept so it can be inspected from the debugger prompt.
        ipdb.set_trace()
        traceback.print_exc()