def operate_cxjg_page(self, list_string, similar_partno, hc):
    """Sort the rows of a search-result page into exact and similar matches.

    Every ``(url, partno)`` pair that ``self.p_url_partno`` finds in *hc* is
    compared with ``self.mmp``:

    * exact match -- the absolute detail-page URL is appended to
      ``self.list_url_exact``, the detail page is downloaded, and its
      filtered HTML is appended to *list_string* unless the filtered result
      is the string ``'timeout'``;
    * otherwise -- ``[absolute_url, partno]`` is appended to
      *similar_partno*.

    Args:
        list_string: list receiving the filtered HTML of each exact-match
            detail page, i.e. ``[hc0, hc1, ...]``.
        similar_partno: list receiving ``[url, partno]`` entries for rows
            whose part number is not an exact match.
        hc: HTML of the result page being processed.

    Returns:
        None. Both list arguments are modified in place.
    """
    for url, partno in self.p_url_partno.findall(hc):
        url_whole = get_url_whole(self.url_search, url)

        if partno != self.mmp:
            # Part number is only similar, not an exact match.
            similar_partno.append([url_whole, partno])
            continue

        if debug:
            print('%s %s' % (url, partno))
        # NOTE(review): the URL is recorded even when the page fetch below
        # times out, so list_url_exact can end up longer than list_string.
        self.list_url_exact.append(url_whole)

        page = gethtml(url_whole)
        if page is None:
            page = ''
        hc_info = filter_html(page)

        if hc_info != 'timeout':
            # Append to the list the caller passed in. The previous code
            # wrote to self.list_string and silently ignored the argument.
            list_string.append(hc_info)
def get_list_string(self):
    """Download ``self.url_search`` and classify the page that comes back.

    Returns one of the following lists:

    * ``['timeout', 'timeout']`` -- the filtered page equals ``'timeout'``.
    * ``['no_result', html]`` -- ``p_noresult`` matched.
    * ``['exact', row, row, ...]`` -- a table page where at least one row's
      part number (upper-cased and stripped) equals ``self.partno``. Each
      row is the row HTML prefixed with
      ``<url_info_page>URL</url_info_page>`` for that row's link.
    * ``['similar', html, [(url, partno), ...]]`` -- a table page without
      any exact row.
    * ``['info', html]`` -- ``p_info`` matched.
    * ``[None, html]`` -- unrecognised page, or a table whose row count and
      link count disagree.
    """
    fun = 'function get_list_string of %s' % self.name
    html = filter_html(gethtml(self.url_search))
    no_result_hits = p_noresult.findall(html)
    table_hits = p_table.findall(html)
    info_hits = p_info.findall(html)

    if html == 'timeout':
        return ['timeout', 'timeout']
    if no_result_hits:
        return ['no_result', html]
    if not table_hits:
        # Either a detail ("info") page or something unknown.
        if info_hits:
            return ['info', html]
        return [None, html]

    # Table page: several records, one <tr> per part.
    table = table_hits[0]
    rows = p_tr_hc.findall(table)
    links = p_url_partno.findall(table)  # [(url, partno), ...]

    # Rows and links are paired by position, so the counts must agree.
    if len(rows) != len(links):
        if debug:
            print('in %s error: length of res_tr_hc and res_url_partno do not equal' % fun)
        return [None, html]

    exact_rows = []
    similar_links = []
    for row, link in zip(rows, links):
        if link[1].upper().strip() == self.partno:
            # Tag the row with the URL of its detail page.
            exact_rows.append('<url_info_page>%s</url_info_page> %s' % (link[0], row))
        else:
            similar_links.append(link)

    if exact_rows:
        return ['exact'] + exact_rows
    return ['similar', html, similar_links]
def get_carthtml(self, partno, qty, **dict_args):
    """Post a part number and quantity to ``self.url_post`` and return the page.

    Args:
        partno: part number to look up; used when no ``mouser_partno``
            keyword is supplied.
        qty: quantity, sent as form field ``ctl00$ContentMain$txt3``.
        **dict_args: optional ``mouser_partno``. When it is non-empty it is
            sent as form field ``ctl00$ContentMain$txt1`` instead of
            *partno*.

    Returns:
        The filtered HTML of the response, or the string ``'timeout'`` when
        anything in the request or filtering step raises.
    """
    fun = 'function get_carthtml of get_cartprice.py'
    mouser_partno = dict_args.get('mouser_partno', '')

    # The form fields are written straight into self.postdata (no copy),
    # exactly as before, so the dict keeps the values of the latest call.
    data = self.postdata
    data['ctl00$ContentMain$txt1'] = mouser_partno or partno
    data['ctl00$ContentMain$txt3'] = qty

    try:
        req = self.urllib2.Request(url=self.url_post, data=urllib.urlencode(data))
        response = self.urllib2.urlopen(req)
        try:
            raw = response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()
        html = filter_html(raw)
    except Exception as e:
        # Best-effort boundary: every failure is reported and mapped to the
        # 'timeout' marker.
        exce_info = u'处理[partno: %s, mouser_partno: %s, qty: %s]时发出异常: %s' % (partno, mouser_partno, qty, str(e))
        print('%s %s' % (fun, exce_info))
        html = 'timeout'

    # The previous code computed html but never returned it.
    return html
def get_list_string_mmp(self, **dict_args):
    """Look up ``self.mmp`` and classify the page that comes back.

    The result is stored in ``self.list_string`` and returned:

    * ``['timeout', 'timeout']`` -- the filtered page equals ``'timeout'``.
    * ``['exact', page]`` -- ``self.p_spec`` matched (a detail page).
    * ``['exact', page1, page2, ...]`` -- table page with rows whose part
      number equals ``self.mmp`` (case-insensitive); one fetched detail
      page per matching row.
    * ``['similar', page, [(url, mfrpart), ...]]`` -- table page without an
      exact row; at most six similar entries are kept.
    * ``['no_result', page]`` -- none of the above.

    URLs of exact matches are appended to ``self.list_url_exact``.
    ``dict_args`` is only read for the ``login_language`` debug message.
    """
    __dc__ = 'find the correct pageurl and get the correct pagestring of this correct pageurl'

    if debug:
        print('%s %s' % (113, 'language: %s' % dict_args.get('login_language', '')))

    self.list_string = []
    url_search = self.url_search

    # The part number itself is what gets handed to gethtml here.
    self.string = gethtml(self.mmp)
    if self.string is None:
        print('NoneType coming!!')
        self.string = ''
    self.string = filter_html(self.string)

    if debug:
        print('118 end string length: %s' % len(self.string))

    if self.string == 'timeout':
        self.list_string = ['timeout', 'timeout']
        if debug:
            print('119 enter into the timeout page')
        return self.list_string

    if self.p_spec.findall(self.string):
        self.list_string = ['exact', self.string]
        self.list_url_exact.append(url_search)
        if debug:
            print('123 enter into the information page')
        return self.list_string

    if not self.p_buy_button.findall(self.string):
        if debug:
            print('170 analyse the partno has no result')
        self.list_string = ['no_result', self.string]
        return self.list_string

    if debug:
        print('125 enter into the table page')

    # Rows are (mouserpart, url, mfrpart); rows without an assigned
    # first field are dropped.
    rows = [
        row
        for row in self.p_mouserpart_url_mfrpart.findall(self.string)
        if row[0] != "Not Assigned"
    ]

    found_exact = False
    for mouserpart, url, mfrpart in rows:
        # Compare either the Mouser part number or the manufacturer part
        # number, depending on self.boolean_mp.
        candidate = mouserpart if self.boolean_mp else mfrpart
        if candidate.upper() != self.mmp.upper():
            continue

        url_info = url.replace('../', '')
        detail = gethtml(url_info)
        if detail is None:
            detail = ''
        self.list_string.append(filter_html(detail))
        self.list_url_exact.append(url_info)
        found_exact = True

    if found_exact:
        if debug:
            print('152 in the table page find exact partno')
        self.list_string.insert(0, 'exact')
    else:
        if debug:
            print('152 in the table page not find exact partno')
        # No exact match: report only the first six similar part numbers.
        count_similar = 6
        similar = [(url, mfrpart) for _mouserpart, url, mfrpart in rows]
        self.list_string = ['similar', self.string, similar[:count_similar]]

    return self.list_string
def get_list_string_mmp(self):
    """Search for ``self.mmp`` and classify the outcome.

    When ``self.boolean_iu`` is true, ``self.info_url`` is fetched and is
    expected to be a detail page. Otherwise ``self.url_search`` is fetched
    and the page is recognised as a result table, a detail page, a
    no-result page or a result-category page.

    The result is stored in ``self.list_string`` and returned:

    * ``['timeout', 'timeout']``
    * ``['exact', page, ...]``
    * ``['similar', page, self.similar_partno]``
    * ``['no_result', page]``
    * ``[None, page]`` for an unexpected page

    The fetched page is also kept in ``self.string`` and ``self.hc_cxym``.
    A progress message is printed when ``debug`` is set.
    """
    __dc__ = 'find the correct pageurl and get the correct pagestring of this correct pageurl'
    fun = 'function get_list_string_mmp of %s' % self.name
    self.list_string = []
    self.similar_partno = []
    tishi = u'in %s ' % fun

    def fetch(url):
        # Download *url*; a None response is filtered as an empty page.
        raw = gethtml(url)
        return filter_html('' if raw is None else raw)

    def settle_matches():
        # Tag the collected pages as 'exact', or fall back to 'similar'.
        # Returns (exact_count, similar_count) for the progress message.
        if self.list_string:
            counts = (len(self.list_string), 0)
            self.list_string.insert(0, 'exact')
        else:
            counts = (0, len(self.similar_partno))
            self.list_string = ['similar', self.string, self.similar_partno]
        return counts

    if self.boolean_iu:
        # The detail-page URL itself is used as the search condition.
        tishi += u'\n查询页面: %s ' % self.info_url
        self.string = fetch(self.info_url)
        self.hc_cxym = self.string

        if self.string == 'timeout':
            tishi += u'\n 页面超时 '
            self.list_string = ['timeout', 'timeout']
        elif self.p_xxxx.findall(self.string):
            tishi += u'\n 页面正常进入详细信息页面 '
            self.list_string = ['exact', self.string]
            self.list_url_exact.append(self.url_search)
        else:
            tishi += u'\n 页面异常 原本应该正常进入详细信息页面 '
            self.list_string = [None, self.string]

        # Return right away; the regular search below is skipped.
        if debug:
            print(tishi)
        return self.list_string

    self.string = fetch(self.url_search)
    self.hc_cxym = self.string
    tishi += u'\n查询页面: %s ' % self.url_search

    if self.string == 'timeout':
        tishi += u'\n 页面超时'
        self.list_string = ['timeout', 'timeout']

    elif self.p_cxjg.findall(self.string):
        # Search-result table: rows are either exact or similar.
        tishi += u'\n 进入查询结果页面 '
        self.operate_cxjg_page(self.list_string, self.similar_partno, self.string)
        tishi += u'\n 得到 %s 个精确匹配页面 %s 个相似型号页面 ' % settle_matches()

    elif self.p_xxxx.findall(self.string):
        # Single hit: the search went straight to the detail page.
        tishi += u'\n 进入详细信息页面 '
        self.list_string = ['exact', self.string]
        self.list_url_exact.append(self.url_search)

    elif self.p_wjg.findall(self.string):
        # "No results" notice page.
        tishi += u'\n 进入无结果页面 '
        self.list_string = ['no_result', self.string]

    elif self.p_jgfl.findall(self.string):
        # Result-category page: every category link has to be visited.
        tishi += u'\n 进入结果分类页面 '
        for jgfl_url, jgfl_count in self.p_jgfl_url_count.findall(self.string):
            url_whole = get_url_whole(self.url_search, jgfl_url)
            hc = fetch(url_whole)
            if int(jgfl_count) == 1:
                # A category with one entry leads to a detail page.
                if str('>%s<' % self.mmp) in hc:
                    self.list_url_exact.append(url_whole)
                    self.list_string.append(hc)
            else:
                # Otherwise it is another table page.
                self.operate_cxjg_page(self.list_string, self.similar_partno, hc)
        tishi += u'\n 得到 %s 个精确匹配页面 %s 个相似型号页面 ' % settle_matches()

    else:
        tishi += u'\n 发生异常 进入未知页面 '
        self.list_string = [None, self.string]

    if debug:
        print(tishi)
    return self.list_string
def get_dict_info(self, hc, **dict_args):
    """Extract part data from page HTML into a dict.

    Args:
        hc: HTML to parse. It is passed through ``filter_html`` first.
        **dict_args: ``is_info_page`` (default False). When true, *hc* is
            parsed with the detail-page patterns (``*_iip``); otherwise it
            is parsed with the table-row patterns.

    Returns:
        A dict keyed by names taken from ``keys_mouser_tt``. It holds image
        URL, part number, manufacturer, description and stock when found,
        plus the price breaks joined as ``'qty:price|||qty:price'`` and
        always the search URL. In table-row mode, when ``p_seemore``
        matches, the detail page named by ``p_uip`` is fetched to read the
        price breaks and its URL is stored as well.
    """
    fun = 'function get_dict_info of %s' % self.name
    dict_info = {}
    hc = filter_html(hc)

    # Decide whether hc is a detail page or a single table row.
    iip = dict_args.get('is_info_page', False)
    if iip:
        patterns = (p_imgurl_iip, p_partno_iip, p_mfr_iip, p_desc_iip, p_stock_iip)
    else:
        patterns = (p_imgurl, p_partno, p_mfr, p_desc, p_stock)
    img_hits, partno_hits, mfr_hits, desc_hits, stock_hits = [
        pattern.findall(hc) for pattern in patterns
    ]

    if img_hits:
        # The image URL additionally loses its double quotes.
        dict_info[keys_mouser_tt[28][1]] = p_jkh.sub('', img_hits[0]).replace('"', '').strip()
    for key_index, hits in ((1, partno_hits), (2, mfr_hits), (3, desc_hits), (5, stock_hits)):
        if hits:
            dict_info[keys_mouser_tt[key_index][1]] = p_jkh.sub('', hits[0]).strip()

    # Price breaks:
    #   [('1-24', '$82.08'), ('25 +', '$61.56')] -> '1:$82.08|||25:$61.56'
    if iip:
        price_rows = p_priceinfo_iip.findall(hc)
    elif not p_seemore.findall(hc):
        # The table row already carries the complete price list.
        price_rows = p_priceinfo.findall(hc)
    else:
        # The full price list lives on the detail page of this row.
        price_rows = []
        detail_urls = p_uip.findall(hc)
        if detail_urls:
            dict_info[keys_mouser_tt[27][1]] = detail_urls[0]
            hc_more = filter_html(get_html_urllib(detail_urls[0], 2))
            if hc_more != 'timeout':
                price_rows = p_priceinfo_iip.findall(hc_more)

    pc_pp_finall = '|||'.join(
        '%s:%s' % (p_invalidnum.sub('', pc).strip(), pp) for pc, pp in price_rows
    )
    if pc_pp_finall:
        dict_info[keys_mouser_tt[10][1]] = pc_pp_finall

    # Always record the search-page URL.
    dict_info[keys_mouser_tt[19][1]] = self.url_search
    return dict_info
def get_list_string_mmp(self, **dict_args):
    """Fetch ``self.url_search`` and classify the page that comes back.

    If ``dict_args`` carries a ``login_language`` of ``'english'`` or
    ``'chinese'`` that differs from the module-level ``now_set_language``,
    the module-level ``loginmouser`` is replaced by a new ``LoginMouser``
    for that language before the search runs.

    The result is stored in ``self.list_string`` and returned:

    * ``['timeout', 'timeout']`` -- the filtered page equals ``'timeout'``.
    * ``['exact', page]`` -- ``self.p_spec`` matched (a detail page).
    * ``['exact', page1, page2, ...]`` -- table page with rows whose part
      number equals ``self.mmp`` (case-insensitive); one fetched detail
      page per matching row.
    * ``['similar', page, [(url, mfrpart), ...]]`` -- table page without an
      exact row; at most six similar entries are kept.
    * ``['no_result', page]`` -- none of the above.

    URLs of exact matches are appended to ``self.list_url_exact``.
    """
    global loginmouser, now_set_language
    __dc__ = 'find the correct pageurl and get the correct pagestring of this correct pageurl'

    requested = dict_args.get('login_language', '')
    if dict_args and requested != now_set_language and requested in ['english', 'chinese']:
        # Allows switching between the Chinese and English site.
        if debug:
            print('%s %s' % (111, 'now_set_language: %s will change to: %s' % (now_set_language, requested)))
        loginmouser = LoginMouser(language=requested)

    if debug:
        print('%s %s' % (113, 'language: %s' % dict_args.get('login_language', '')))

    self.list_string = []
    url_search = self.url_search
    self.string = gethtml(url_search)
    self.string = filter_html(self.string)

    if debug:
        print('118 end string length: %s' % len(self.string))

    if self.string == 'timeout':
        self.list_string = ['timeout', 'timeout']
        if debug:
            print('119 enter into the timeout page')
        return self.list_string

    if self.p_spec.findall(self.string):
        self.list_string = ['exact', self.string]
        self.list_url_exact.append(url_search)
        if debug:
            print('123 enter into the information page')
        return self.list_string

    if not self.p_buy_button.findall(self.string):
        if debug:
            print('170 analyse the partno has no result')
        self.list_string = ['no_result', self.string]
        return self.list_string

    if debug:
        print('125 enter into the table page')

    # Rows are (mouserpart, url, mfrpart).
    rows = self.p_mouserpart_url_mfrpart.findall(self.string)

    found_exact = False
    for mouserpart, url, mfrpart in rows:
        # Prefix the site root and drop the '../' segments.
        url_info = self.url_mouser_qz + url.replace('../', '')
        # Compare either the Mouser part number or the manufacturer part
        # number, depending on self.boolean_mp.
        candidate = mouserpart if self.boolean_mp else mfrpart
        if candidate.upper() == self.mmp.upper():
            self.list_string.append(filter_html(gethtml(url_info)))
            self.list_url_exact.append(url_info)
            found_exact = True

    if found_exact:
        if debug:
            print('152 in the table page find exact partno')
        self.list_string.insert(0, 'exact')
    else:
        if debug:
            print('152 in the table page not find exact partno')
        # No exact match: report only the first six similar part numbers.
        count_similar = 6
        similar = [(url, mfrpart) for _mouserpart, url, mfrpart in rows]
        self.list_string = ['similar', self.string, similar[:count_similar]]

    return self.list_string
def get_dict_info(self, hc):
    """Extract part data from detail-page HTML into a dict.

    Args:
        hc: HTML to parse. It is passed through ``filter_html`` first.

    Returns:
        A dict keyed by names taken from ``keys_mouser_tt``. Each field is
        present only when its pattern matched: manufacturer, part number,
        description, PDF link, packaging, RoHS, price breaks joined as
        ``'qty:price|||qty:price'``, "your cost", total stock (sum of the
        third column of the stock table) together with the raw stock rows,
        and the detail-page URL. The search URL is always included.
    """
    fun = 'function get_dict_info of %s' % self.name
    dict_info = {}
    hc = filter_html(hc)

    def first_hit(pattern):
        # First match of *pattern* in hc, or None when nothing matched.
        hits = pattern.findall(hc)
        return hits[0] if hits else None

    mfr = first_hit(p_mfr)
    if mfr is not None:
        dict_info[keys_mouser_tt[2][1]] = mfr

    partno = first_hit(p_partno)
    if partno is not None:
        dict_info[keys_mouser_tt[1][1]] = partno

    desc = first_hit(p_desc)
    if desc is not None:
        dict_info[keys_mouser_tt[3][1]] = p_jkh.sub('', desc).strip()

    pdf = first_hit(p_pdf)
    if pdf is not None:
        dict_info[keys_mouser_tt[23][1]] = pdf

    pack = first_hit(p_pack)
    if pack is not None:
        dict_info[keys_mouser_tt[14][1]] = pack.strip()

    rohs = first_hit(p_rohs)
    if rohs is not None:
        dict_info[keys_mouser_tt[24][1]] = rohs.strip()

    price_rows = p_pc_pp.findall(hc)
    if price_rows:
        # Price breaks:
        #   [('1-24', '$82.08'), ('over 1000', '$61.56')]
        #   -> '1:$82.08|||1000:$61.56'
        dict_info[keys_mouser_tt[10][1]] = '|||'.join(
            '%s:%s' % (p_invalidnum.sub('', pc), pp) for pc, pp in price_rows
        )

    yourcost = first_hit(p_yourcost)
    if yourcost is not None:
        dict_info[keys_mouser_tt[25][1]] = yourcost.strip()

    stock_table = first_hit(p_table_lnq)
    if stock_table is not None:
        stock_rows = p_tr_lnq.findall(stock_table)
        # Total stock is the sum of the third column over all rows.
        dict_info[keys_mouser_tt[5][1]] = sum(int(q) for _l, _n, q in stock_rows)
        dict_info[keys_mouser_tt[26][1]] = stock_rows

    # Record the search-page URL and, when present, the detail-page URL.
    dict_info[keys_mouser_tt[19][1]] = self.url_search
    detail_url = first_hit(p_uip)
    if detail_url is not None:
        dict_info[keys_mouser_tt[20][1]] = detail_url

    return dict_info