Python filter_html 예제들, python_function.filter_html Python 예제들

예제 #1

0

파일 보기

파일: getdata_digikey.py 프로젝트: romali/ajax_show_stock

 def operate_cxjg_page(self,list_string,similar_partno,hc):
     ''' 
     处理查询结果页面 有精确匹配 精确匹配页面代码附加在list_string中；否则 相似页面url、partno附加在similar_partno中
     参数:   list_string:    保存每个digikey搜索型号对应精确页面的代码,格式[hc0,hc1...]
             similar_partno: 相似页面url 型号信息,格式[[url,partno],...] 
             hc:             当前处理页面的代码
     '''
     res_url_partno = self.p_url_partno.findall(hc)
     for url,partno in res_url_partno:
         if partno == self.mmp:
             url_info = get_url_whole(self.url_search, url)
             if debug: print url,partno
             self.list_url_exact.append(url_info)
             ##################
             #print "in operate_cxjg_page:",url_info
             temp = gethtml(url_info)
             if temp is None:
                 temp = ''
             hc_info = filter_html(temp)
             ##################
             if hc_info != 'timeout':
                 self.list_string.append(hc_info)
         else:
             ''' 型号不精确 相似而已 '''
             similar_partno.append([get_url_whole(self.url_search, url), partno])

예제 #2

0

파일 보기

파일: getdata_olc.py 프로젝트: romali/ajax_show_stock

    def get_list_string(self):
        """  
        self.list_string:
        ['timeout','timeout']
        ['exact','详细信息页面代码']
        ['exact','详细信息页面代码1','详细信息页面代码2',...] 或者 
        ['similar','不能精确匹配的查询页面代码',[(相似型号url,相似型号),()]]
        ['no_result','无查询结果的提示页面']
        [None,页面代码]
        """
        fun = 'function get_list_string of %s' % self.name
        html = filter_html(gethtml(self.url_search))

        res_noresult    = p_noresult.findall(html)
        res_table       = p_table.findall(html)
        res_info        = p_info.findall(html)

        if html == 'timeout':
            list_string = ['timeout','timeout']
        elif res_noresult:
            list_string = ['no_result', html]
        elif res_table:
            ''' 表格页面 多个记录 '''
            list_string_exact       = []
            list_similar            = []
            boolean_jq              = False
            table                   = res_table[0]
            res_tr_hc               = p_tr_hc.findall(table)
            res_url_partno          = p_url_partno.findall(table)

            ''' [(url, partno), ...] 元素个数应该与 res_tr_hc 个数相同 '''
            if len(res_tr_hc) != len(res_url_partno):
                if debug: print 'in %s error: length of res_tr_hc and res_url_partno do not equal' % fun
                ''' 异常的情况 '''
                list_string = [None, html]
                return list_string

            ''' 正常情况 '''
            i       = 0
            for tr_hc in res_tr_hc:
                if res_url_partno[i][1].upper().strip() == self.partno:
                    boolean_jq = True
                    ''' 特殊处理 加上该tr 行对应的详细信息页面url '''
                    tr_hc = '<url_info_page>%s</url_info_page> %s' % (res_url_partno[i][0], tr_hc)
                    list_string_exact.append(tr_hc)
                else:
                    list_similar.append(res_url_partno[i])
                i   += 1

            if boolean_jq:
                list_string = ['exact'] + list_string_exact
            else:
                list_string = ['similar', html, list_similar]
        elif res_info:
            ''' 详细信息页面 '''
            list_string     = ['info', html]
        else:
            ''' 未知的情况 '''
            list_string = [None, html]
        return list_string

예제 #3

0

파일 보기

파일: get_cartprice.py 프로젝트: romali/ajax_show_stock

    def get_carthtml(self, partno,qty,**dict_args):
        ''' 发出请求,获取页面代码 '''
        fun = 'function get_carthtml of get_cartprice.py'
        data = self.postdata
        mouser_partno = dict_args.get('mouser_partno','')
        if mouser_partno:
            ''' 此时得到精确的mouser型号名称,那么就能够快速获取到正确的价格 '''
            data['ctl00$ContentMain$txt1'] = mouser_partno
        else:
            data['ctl00$ContentMain$txt1'] = partno
        data['ctl00$ContentMain$txt3'] = qty

        try:
            req = self.urllib2.Request(url = self.url_post, data = urllib.urlencode(data))
            html = self.urllib2.urlopen(req).read()
            html = filter_html(html)

        except Exception,e:
            exce_info = u'处理[partno: %s, mouser_partno: %s, qty: %s]时发出异常: %s' % (partno,mouser_partno,qty,str(e))
            print fun,exce_info
            html = 'timeout'

예제 #4

0

파일 보기

파일: getdata_mouser.py 프로젝트: romali/ajax_show_stock

    def get_list_string_mmp(self,**dict_args):
        __dc__ = 'find the correct pageurl and get the correct pagestring of this correct pageurl'
        """  
        self.list_string:
        ['timeout','timeout']
        ['exact','详细信息页面代码']
        ['exact','详细信息页面代码1','详细信息页面代码2',...] 或者 ['similar','不能精确匹配的查询页面代码',[(相似型号url,相似型号),()]]
        ['no_result','无查询结果的提示页面']
        """
        if debug: print 113,'language: %s' % dict_args.get('login_language', '')

        self.list_string = []
        url_search = self.url_search
        ###########################################
        self.string = gethtml(self.mmp)
        if self.string is None:
            print 'NoneType coming!!'
            self.string = ''
        self.string = filter_html(self.string)
        ###########################################
        if debug: print '118 end string length: %s' % len(self.string)
            
        if self.string == 'timeout':
            self.list_string = ['timeout','timeout']
            if debug: print '119 enter into the timeout page'
        elif self.p_spec.findall(self.string):
            self.list_string = ['exact',self.string]
            self.list_url_exact.append(url_search)
            if debug: print '123 enter into the information page'
        elif self.p_buy_button.findall(self.string):
            if debug: print '125 enter into the table page'
            """ [('url','mfrpart'),(),...] """
            res_mouserpart_url_mfrpart = self.p_mouserpart_url_mfrpart.findall(self.string)
            final_res_mouserpart_url_mfrpart = []
            for i in res_mouserpart_url_mfrpart:
                if i[0] != "Not Assigned":
                    final_res_mouserpart_url_mfrpart.append(i)
            
            """
            2010年 09月 14日 星期二 15:31:57 CST
            增加功能 如果显示查询结果表格 并且表格每行型号与查寻型号不一致 则返回相似型号的信息
            """
            """ 是否存在精确匹配的型号 """
            boolean_jq = False
            for mouserpart,url,mfrpart in final_res_mouserpart_url_mfrpart:
                ''' 加上域名前缀 去掉../ '''
                ################################
                url_info = url.replace('../','')
                #################################
                if self.boolean_mp:
                    ''' 比较mouser型号名称 '''
                    if mouserpart.upper() == self.mmp.upper():
                        ###############################
                        strs = gethtml(url_info)
                        if strs is None:
                            strs = ''
                        strs = filter_html(strs)
                        self.list_string.append(strs)
                        self.list_url_exact.append(url_info)
                        ##############################
                        boolean_jq = True
                else:
                    ''' 比较厂商型号名称 '''
                    if mfrpart.upper() == self.mmp.upper():
                        #######################################
                        strs = gethtml(url_info)
                        if strs is None:
                            strs = ''
                        strs = filter_html(strs)
                        self.list_string.append(strs)
                        self.list_url_exact.append(url_info)
                        #######################################
                        boolean_jq = True

            if boolean_jq == False:
                if debug:print '152 in the table page not find exact partno'
                count_similar = 6
                """ 没有精确匹配 只获取6个相似型号 """

                ''' [(ur)] '''
                len_rhum = len(final_res_mouserpart_url_mfrpart)
                res_url_mfrpart = [(url,mfrpart) for mouserpart,url,mfrpart in final_res_mouserpart_url_mfrpart]
                if len_rhum > count_similar:
                    """ 大于6个，list_string: ['similar','hc',[[url,p1],...]] """
                    self.list_string = ['similar',self.string,res_url_mfrpart[:count_similar]]
                else:
                    """ 小于或者等于6个，list_string: ['similar','hc',[[url,p1],...]] """
                    self.list_string = ['similar',self.string,res_url_mfrpart]
            else:
                if debug:print '152 in the table page find exact partno'
                self.list_string.insert(0,'exact')
        else:
            if debug:print '170 analyse the partno has no result'
            self.list_string = ['no_result',self.string]
        
        return self.list_string

예제 #5

0

파일 보기

파일: getdata_digikey.py 프로젝트: romali/ajax_show_stock

    def get_list_string_mmp(self):
        """ 不需要进入详细信息页面，只需要在查询结果页面获取即可 """
        __dc__ = 'find the correct pageurl and get the correct pagestring of this correct pageurl'
        fun = 'function get_list_string_mmp of %s' % self.name
        self.list_string        = []
        self.similar_partno     = []
        
        tishi = u'in %s ' % fun
        if self.boolean_iu:
            ''' 用详细信息页面作为搜索条件 '''
            tishi += u'\n查询页面: %s ' % self.info_url
            ###################
            #print "用详细信息页面作为搜索条件",self.info_url
            temp = gethtml(self.info_url)
            if temp is None:
                temp = ''
            self.string = filter_html(temp)
            ###################
            
            self.hc_cxym = self.string
            if self.string == 'timeout':
                tishi += u'\n 页面超时 '
                self.list_string = ['timeout','timeout']
            elif self.p_xxxx.findall(self.string):
                tishi += u'\n 页面正常进入详细信息页面 '
                self.list_string = ['exact',self.string]
                self.list_url_exact.append(self.url_search)
            else:
                tishi += u'\n 页面异常 原本应该正常进入详细信息页面 '
                self.list_string = [None,self.string]

            ''' 直接返回  不执行下述步骤 '''
            if debug: print tishi
            return self.list_string
        ###########################
        #print "in get_list_string_mmp",self.url_search
        temp = gethtml(self.url_search)
        if temp is None:
            temp = ''
        self.string = filter_html(temp)
        ###########################
        self.hc_cxym = self.string

        tishi += u'\n查询页面: %s ' % self.url_search
        if self.string == 'timeout':
            tishi += u'\n 页面超时'
            self.list_string = ['timeout','timeout']
        elif self.p_cxjg.findall(self.string):
            """ 处于 1 查询结果页面；"""
            tishi += u'\n 进入查询结果页面 '
            ''' 处理表格页面中的两种情况: 1 精确 2 相似 '''
            self.operate_cxjg_page(self.list_string,self.similar_partno,self.string)
            if self.list_string:
                count_exact,count_similar = len(self.list_string),0
                self.list_string.insert(0,'exact')
            else:
                count_exact,count_similar = 0,len(self.similar_partno)
                self.list_string = ['similar',self.string,self.similar_partno]

            tishi += u'\n 得到 %s 个精确匹配页面 %s 个相似型号页面 ' % (count_exact,count_similar)
        elif self.p_xxxx.findall(self.string):
            """ 只有一个查询结果，查询结果页面跳转至详细信息页面 """
            tishi += u'\n 进入详细信息页面 '
            self.list_string = ['exact',self.string]
            self.list_url_exact.append(self.url_search)
        elif self.p_wjg.findall(self.string):
            """ 处于 3 提示无结果页面 """
            tishi += u'\n 进入无结果页面 '
            self.list_string = ['no_result',self.string]
        elif self.p_jgfl.findall(self.string):
            ''' 处于结果分类页面  十分麻烦 '''
            tishi += u'\n 进入结果分类页面 '
            res_jgfl_url_count = self.p_jgfl_url_count.findall(self.string)
            for jgfl_url,jgfl_count in res_jgfl_url_count:
                url_whole = get_url_whole(self.url_search,jgfl_url)
                #####################
                #print "i处于结果分类页面",url_whole
                temp = gethtml(url_whole)
                if temp is None:
                    temp = ''
                hc = filter_html(temp)
                #####################
                if int(jgfl_count) == 1:
                    ''' 进入详细信息页面 '''
                    if str('>%s<' % self.mmp) in hc:
                        self.list_url_exact.append(url_whole)
                        self.list_string.append(hc)
                else:
                    ''' 进入表格页面 '''
                    ''' 处理表格页面中的两种情况: 1 精确 2 相似 '''
                    self.operate_cxjg_page(self.list_string,self.similar_partno,hc)
            if self.list_string:
                count_exact,count_similar = len(self.list_string),0
                self.list_string.insert(0,'exact')
            else:
                count_exact,count_similar = 0,len(self.similar_partno)
                self.list_string = ['similar',self.string,self.similar_partno]
            
            tishi += u'\n 得到 %s 个精确匹配页面 %s 个相似型号页面 ' % (count_exact,count_similar)
        else:
            tishi += u'\n 发生异常 进入未知页面 '
            self.list_string = [None,self.string]

        if debug: print tishi
        return self.list_string

예제 #6

0

파일 보기

파일: getdata_olc.py 프로젝트: romali/ajax_show_stock

    def get_dict_info(self,hc,**dict_args):
        """Extract part data from page code into a dict.

        Args:
            hc: page code; it is passed through filter_html first.
            is_info_page (keyword, default False): when true the ``*_iip``
                patterns are used, otherwise the plain patterns (table-row
                mode, judging by the original comments).

        Returns:
            dict keyed by entries of keys_mouser_tt.  Image URL, part number,
            manufacturer, description and stock are set when their pattern
            matches; the price string 'qty:price|||qty:price...' is set when
            non-empty; the search URL (self.url_search) is always set.
        """
        fun         = 'function get_dict_info of %s' % self.name
        dict_info   = {}

        hc          = filter_html(hc)
        iip         = dict_args.get('is_info_page', False)

        # Pick the pattern set that fits the kind of page code.
        if iip:
            pat_imgurl, pat_partno, pat_mfr, pat_desc, pat_stock = (
                p_imgurl_iip, p_partno_iip, p_mfr_iip, p_desc_iip, p_stock_iip)
        else:
            pat_imgurl, pat_partno, pat_mfr, pat_desc, pat_stock = (
                p_imgurl, p_partno, p_mfr, p_desc, p_stock)

        res_imgurl = pat_imgurl.findall(hc)
        simple_fields = (
            (1, pat_partno.findall(hc)),
            (2, pat_mfr.findall(hc)),
            (3, pat_desc.findall(hc)),
            (5, pat_stock.findall(hc)),
        )

        if res_imgurl:
            # The image URL additionally loses its &quot; entities.
            cleaned = p_jkh.sub('',res_imgurl[0])
            dict_info[keys_mouser_tt[28][1]] = cleaned.replace('&quot;','').strip()

        for key_index, hits in simple_fields:
            if hits:
                dict_info[keys_mouser_tt[key_index][1]] = p_jkh.sub('',hits[0]).strip()

        # Price breaks: [('1-24','$82.08'), ('25 +','$61.56')]
        #   -> ['1:$82.08', '25:$61.56'] -> '1:$82.08|||25:$61.56'
        if iip:
            res_priceinfo = p_priceinfo_iip.findall(hc)
        elif p_seemore.findall(hc):
            # The row only hints at more prices; the full list has to be read
            # from the page linked in the row.
            res_priceinfo = []
            res_uip = p_uip.findall(hc)
            if res_uip:
                dict_info[keys_mouser_tt[27][1]] = res_uip[0]
                hc_more = filter_html(get_html_urllib(res_uip[0], 2))
                if hc_more != 'timeout':
                    res_priceinfo = p_priceinfo_iip.findall(hc_more)
        else:
            # The row already carries all prices.
            res_priceinfo = p_priceinfo.findall(hc)

        pc_pp_finall = '|||'.join(
            ['%s:%s' % (p_invalidnum.sub('',pc).strip(), pp) for pc,pp in res_priceinfo])
        if pc_pp_finall:
            dict_info[keys_mouser_tt[10][1]] = pc_pp_finall

        # Always record the search-page URL.
        dict_info[keys_mouser_tt[19][1]] = self.url_search

        return dict_info

예제 #7

0

파일 보기

파일: getdata_mouserback.py 프로젝트: romali/ajax_show_stock

    def get_list_string_mmp(self,**dict_args):
        __dc__ = 'find the correct pageurl and get the correct pagestring of this correct pageurl'
        """  
        self.list_string:
        ['timeout','timeout']
        ['exact','详细信息页面代码']
        ['exact','详细信息页面代码1','详细信息页面代码2',...] 或者 ['similar','不能精确匹配的查询页面代码',[(相似型号url,相似型号),()]]
        ['no_result','无查询结果的提示页面']
        """
        global loginmouser,now_set_language
        if dict_args and dict_args.get('login_language','') != now_set_language and dict_args.get('login_language','') in ['english','chinese']:
            ''' 可以方便切换　中文 英文 '''
            if debug: print 111,'now_set_language: %s  will change to: %s' % (now_set_language, dict_args['login_language'])
            loginmouser     = LoginMouser(**{'language':dict_args['login_language']})
        if debug: print 113,'language: %s' % dict_args.get('login_language', '')

        self.list_string = []
        url_search = self.url_search
        self.string = gethtml(url_search)
        self.string = filter_html(self.string)
        if debug: print '118 end string length: %s' % len(self.string)
            
        if self.string == 'timeout':
            self.list_string = ['timeout','timeout']
            if debug: print '119 enter into the timeout page'
        elif self.p_spec.findall(self.string):
            self.list_string = ['exact',self.string]
            self.list_url_exact.append(url_search)
            if debug: print '123 enter into the information page'
        elif self.p_buy_button.findall(self.string):
            if debug: print '125 enter into the table page'
            """ [('url','mfrpart'),(),...] """
            res_mouserpart_url_mfrpart = self.p_mouserpart_url_mfrpart.findall(self.string)

            """
            2010年 09月 14日 星期二 15:31:57 CST
            增加功能 如果显示查询结果表格 并且表格每行型号与查寻型号不一致 则返回相似型号的信息
            """
            """ 是否存在精确匹配的型号 """
            boolean_jq = False
            for mouserpart,url,mfrpart in res_mouserpart_url_mfrpart:
                ''' 加上域名前缀 去掉../ '''
                url_info = self.url_mouser_qz + url.replace('../','')
                if self.boolean_mp:
                    ''' 比较mouser型号名称 '''
                    if mouserpart.upper() == self.mmp.upper():
                        self.list_string.append(filter_html(gethtml(url_info)))
                        self.list_url_exact.append(url_info)
                        boolean_jq = True
                else:
                    ''' 比较厂商型号名称 '''
                    if mfrpart.upper() == self.mmp.upper():
                        self.list_string.append(filter_html(gethtml(url_info)))
                        self.list_url_exact.append(url_info)
                        boolean_jq = True

            if boolean_jq == False:
                if debug:print '152 in the table page not find exact partno'
                count_similar = 6
                """ 没有精确匹配 只获取6个相似型号 """

                ''' [(ur)] '''
                len_rhum = len(res_mouserpart_url_mfrpart)
                res_url_mfrpart = [(url,mfrpart) for mouserpart,url,mfrpart in res_mouserpart_url_mfrpart]
                if len_rhum > count_similar:
                    """ 大于6个，list_string: ['similar','hc',[[url,p1],...]] """
                    self.list_string = ['similar',self.string,res_url_mfrpart[:count_similar]]
                else:
                    """ 小于或者等于6个，list_string: ['similar','hc',[[url,p1],...]] """
                    self.list_string = ['similar',self.string,res_url_mfrpart]
            else:
                if debug:print '152 in the table page find exact partno'
                self.list_string.insert(0,'exact')
        else:
            if debug:print '170 analyse the partno has no result'
            self.list_string = ['no_result',self.string]
        
        return self.list_string

예제 #8

0

파일 보기

파일: getdata_roch.py 프로젝트: romali/ajax_show_stock

    def get_dict_info(self,hc):
        """Extract part data from detail-page code into a dict.

        ``hc`` is passed through filter_html, then each module-level pattern
        is applied; a field is only set when its pattern matches, and the
        first match is used.  The result is keyed by entries of
        keys_mouser_tt and always contains the search URL (self.url_search).

        Stock is the sum of the third column of the rows found by p_tr_lnq in
        the first p_table_lnq table; the rows themselves are stored as well.
        """
        fun = 'function get_dict_info of %s' % self.name
        dict_info = {}

        hc = filter_html(hc)

        # Fields stored exactly as matched.
        for key_index, pattern in ((2, p_mfr), (1, p_partno), (23, p_pdf)):
            hits = pattern.findall(hc)
            if hits:
                dict_info[keys_mouser_tt[key_index][1]] = hits[0]

        # Description: p_jkh matches are removed, then whitespace is trimmed.
        desc_hits = p_desc.findall(hc)
        if desc_hits:
            dict_info[keys_mouser_tt[3][1]] = p_jkh.sub('', desc_hits[0]).strip()

        # Fields stored with surrounding whitespace trimmed.
        for key_index, pattern in ((14, p_pack), (24, p_rohs), (25, p_yourcost)):
            hits = pattern.findall(hc)
            if hits:
                dict_info[keys_mouser_tt[key_index][1]] = hits[0].strip()

        # Price breaks: [('1-24','$82.08'), ('over 1000','$61.56')]
        #   -> ['1:$82.08', '1000:$61.56'] -> '1:$82.08|||1000:$61.56'
        price_hits = p_pc_pp.findall(hc)
        if price_hits:
            dict_info[keys_mouser_tt[10][1]] = '|||'.join(
                ['%s:%s' % (p_invalidnum.sub('',pc), pp) for pc,pp in price_hits])

        table_hits = p_table_lnq.findall(hc)
        if table_hits:
            res_tr_lnq = p_tr_lnq.findall(table_hits[0])
            # Total quantity over all rows; 0 when the table has no rows.
            dict_info[keys_mouser_tt[5][1]] = sum(int(q) for l,n,q in res_tr_lnq)
            dict_info[keys_mouser_tt[26][1]] = res_tr_lnq

        # Always record the search-page URL.
        dict_info[keys_mouser_tt[19][1]] = self.url_search

        uip_hits = p_uip.findall(hc)
        if uip_hits:
            dict_info[keys_mouser_tt[20][1]] = uip_hits[0]

        return dict_info