def get_contributive_info_page(text):
    total_page = PyQuery(text, parser='html').find('#pagescount').attr('value')
    if total_page is None or total_page.strip() == '':
        return 1
    return int(total_page)
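# --- Hedged usage sketch (not in the original source) ---
# Illustrates how get_contributive_info_page could drive a paging loop; the
# session argument and the URL template with a `page` placeholder are
# assumptions made for this example only.
def iter_contributive_pages(session, url_template):
    first_html = session.get(url_template.format(page=1)).text
    total = get_contributive_info_page(first_html)
    for page_no in range(1, total + 1):
        yield session.get(url_template.format(page=page_no)).text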
def __get_company_name(self, text):
    try:
        name = PyQuery(text, parser='html').find('.overview').find('#entName').text()
        if name is not None and name.strip() != '':
            return name.strip()
        name = PyQuery(text, parser='html').find('h1.fullName').text()
        if name is not None and name.strip() != '':
            return name.strip()
        return None
    except Exception as e:
        self.log.exception(e)
        return None
def __init__(self, elem, trims, should_cleanup):
    text = PyQuery(elem).text()
    for trim in (trims or []):
        text = text.replace(trim, '')
    self.rx = re.compile(r'\W+')
    self.text = text.strip()
    # non_trimmed and nonword are module-level compiled patterns used below.
    self.trimmed_text = non_trimmed.sub(' ', self.text)
    self.html = PyQuery(elem).html()
    if should_cleanup:
        self.html = self.cleanup_html()
    self.normalized_text = nonword.sub('', text.lower())
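# --- Hedged sketch (not in the original source) ---
# Possible module-level patterns that the constructor above relies on; the
# exact definitions are assumptions inferred from how they are used
# (collapsing whitespace runs, stripping non-word characters).
import re

non_trimmed = re.compile(r'\s+')   # runs of whitespace -> single space
nonword = re.compile(r'\W+')       # non-word characters -> removed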
def get(key):
    try:
        en_reg = re.compile('[a-zA-Z]')
        jp_reg = re.compile('[ぁ-んァ-ヶ]')
        language = ''
        if re.findall(en_reg, key):
            language = 'en'
        if re.findall(jp_reg, key):
            language = 'ja'
        if not language:
            language = guess_language(key)
        print('来自 ' + language + ' wiki')  # "from <language> wiki"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Referer': 'https://www.sanseido.biz/',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Connection': 'close',
            'authority': f'{language}.wikipedia.org'
        }
        s = requests.session()
        s.keep_alive = False
        page = s.get(f'https://{language}.wikipedia.org/wiki/{key}', headers=headers)
        # print(f'https://{language}.wikipedia.org/wiki/{key}')
        page.encoding = 'UTF-8'
        print(page.text)  # debug: dump the fetched page
        tree = html.fromstring(page.text)
        # print(html.tostring(tree, encoding='UTF-8'))
        p1 = tree.xpath('//*[@id="mw-content-text"]/div/p[1]')
        if not p1:
            return f'wiki里查不到 {key}'  # "cannot find <key> on wiki"
        p1_text = html.tostring(p1[0], encoding='UTF-8')
        de = PyQuery(p1_text.decode('UTF-8')).text()
        if ('refer' in de) or ('可指' in de):
            # Disambiguation page: also list the linked entries.
            refer = tree.xpath('//*[@id="mw-content-text"]/div/ul[1]/li')
            for each in refer:
                refer_text = html.tostring(each, encoding='UTF-8')
                refer_link = PyQuery(refer_text.decode('UTF-8')).text()
                de += '\n' + refer_link
        return de.strip()
    except Exception as e:
        return f'似乎有些问题:\n{e}'  # "something seems to be wrong"
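# --- Hedged usage sketch (not in the original source) ---
# The lookup above assumes module-level imports roughly like the ones listed
# here; the call at the bottom just demonstrates a query with a sample key.
# import re
# import requests
# from lxml import html
# from pyquery import PyQuery
# from guess_language import guess_language

if __name__ == '__main__':
    print(get('Python'))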
def parse_text_page(url):
    """Parse a text content page and return its paragraphs."""
    resp = session.get(url)
    doc = PyQuery(resp.content)
    text_list = []
    log.info("parsing %s", url)
    for item in doc('tr td.ctext').items():
        item_html = item.html()
        if not re.search(r'<div.*</p>', item_html):
            log.warning("undesired text: %s", item_html[:20])
            continue
        item_text = PyQuery(item_html).text()
        text_list.append(item_text.strip())
    return text_list
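# --- Hedged sketch (not in the original source) ---
# parse_text_page relies on module-level `session` and `log` objects; these
# are plausible definitions, and the commented call uses a made-up URL.
import logging
import requests

session = requests.Session()
log = logging.getLogger(__name__)

# text_list = parse_text_page('https://ctext.org/example-chapter')  # hypothetical URL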
def parse_withdraw_tr(tr: PyQuery) -> dict:
    """
    Parse one withdrawal-request <tr> from the platform-2 site.
    :param tr:
    :return:
    """
    init_dict = dict()
    tds = tr.find("td")
    if len(tds) < 16:  # all 16 cells accessed below must be present
        return None
    else:
        domain = domain2
        init_dict['system'] = domain
    # 1st td: MT account and MT group
    first = PyQuery(tds[0])
    texts_1 = first.text().split("\n")
    account = int(re.search(r'\d{4,}', texts_1[0].lower()).group())  # MT account
    group = texts_1[-1].lower()[5:]  # MT group
    init_dict['account'] = account
    init_dict['group'] = group
    # 2nd td: account manager
    second = PyQuery(tds[1])
    init_dict['manager'] = second.text().strip()
    # 3rd td: English nickname
    third = PyQuery(tds[2])
    texts_3 = third.text().split("\n")
    nick_name = texts_3[0][4:].strip()
    init_dict['nick_name'] = nick_name
    # 4th td: withdrawal amount
    fourth = PyQuery(tds[3])
    texts_4 = fourth.text().split("\n")
    amount_usd = float(texts_4[0].split("$")[-1].strip())   # amount in USD
    amount_cny = float(texts_4[-1].split("¥")[-1].strip())  # amount in CNY
    init_dict['amount_usd'] = amount_usd
    init_dict['amount_cny'] = amount_cny
    # 5th td: commission
    fifth = PyQuery(tds[4])
    texts_5 = fifth.text().split("\n")
    commission_usd = float(texts_5[0].split("$")[-1].strip())   # commission in USD
    commission_cny = float(texts_5[-1].split("¥")[-1].strip())  # commission in CNY
    init_dict['commission_usd'] = commission_usd
    init_dict['commission_cny'] = commission_cny
    # 6th td: transfer channel
    sixth = PyQuery(tds[5])
    init_dict['channel'] = sixth.text().strip()
    # 7th td: apply time and close time
    seventh = PyQuery(tds[6])
    seventh = seventh.text().split("\n")
    apply_time = seventh[0][5:].strip()
    apply_time = get_datetime_from_str(apply_time)
    close_time = seventh[-1][5:].strip()
    close_time = get_datetime_from_str(close_time)
    init_dict['apply_time'] = apply_time
    init_dict['close_time'] = close_time
    # 8th td: bank name
    eighth = PyQuery(tds[7]).text()
    init_dict['blank_name'] = eighth.strip()
    # 9th td: bank code
    ninth = PyQuery(tds[8]).text()
    init_dict['blank_code'] = ninth.strip()
    # 10th td: bank (code id)
    tenth = PyQuery(tds[9]).text()
    init_dict['code_id'] = tenth.strip()
    # 11th td: status
    eleventh = PyQuery(tds[10]).text()
    init_dict['status'] = eleventh.strip()
    # 12th td: account balance
    twelfth = PyQuery(tds[11]).text()
    init_dict['account_balance'] = float(twelfth.strip()[1:])
    # 13th td: account equity
    thirteenth = PyQuery(tds[12]).text()
    init_dict['account_value'] = float(thirteenth.strip()[1:])
    # 14th td: open interest
    fourteenth = PyQuery(tds[13]).text()
    init_dict['open_interest'] = float(fourteenth.strip()[0:-1])
    # 15th td: free margin
    fifteenth = PyQuery(tds[14]).text()
    init_dict['account_margin'] = float(fifteenth.strip()[1:])
    # 16th td: ticket number (taken from the link href)
    sixteenth = PyQuery(tds[15].find("a"))
    init_dict['ticket'] = int(sixteenth.attr("href").split("/")[-1])
    # Only keep requests in the specified status
    if init_dict['status'] == "审核中":  # "under review"
        return init_dict
    else:
        return None
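# --- Hedged usage sketch (not in the original source) ---
# Shows how the row parser above could be applied to a whole results table;
# the CSS selector and the helper name are assumptions for this example.
from pyquery import PyQuery


def parse_withdraw_page(page_html):
    doc = PyQuery(page_html)
    results = []
    for tr in doc('table tr').items():  # hypothetical selector for the results table
        parsed = parse_withdraw_tr(tr)
        if parsed is not None:
            results.append(parsed)
    return results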
def contributive_info_list(con_table_list):
    con_table_dict = {}
    if con_table_list is None or len(con_table_list) <= 0:
        return con_table_dict
    for con_item in con_table_list:
        status = con_item.get('status', 'fail')
        if status != 'success':
            break
        text = con_item.get('text')
        if text is None or text == '':
            break
        json_data = util.json_loads(text)
        if json_data is None:
            break
        data_array = json_data.get('data')
        if not isinstance(data_array, list):
            break
        for item in data_array:
            b_lic_no = item.get('bLicNo')
            b_lic_type_cn = item.get('blicType_CN')
            inv = item.get('inv')
            inv_type_cn = item.get('invType_CN')
            inv_id = item.get('invId')
            if inv is None or inv.strip() == '':
                continue
            if inv_id is None or inv_id.strip() == '':
                continue
            inv = inv.strip()
            inv_id = inv_id.strip()
            if b_lic_no is not None and b_lic_no.strip() != '':
                b_lic_no = PyQuery(b_lic_no, parser='html').remove('div').remove('span'). \
                    text().replace(' ', '').strip()
            else:
                b_lic_no = ''
            if b_lic_type_cn is None or b_lic_type_cn.strip() == '':
                b_lic_type_cn = ''
            else:
                b_lic_type_cn = b_lic_type_cn.strip()
            if inv_type_cn is None or inv_type_cn.strip() == '':
                inv_type_cn = ''
            else:
                inv_type_cn = PyQuery(inv_type_cn, parser='html').remove('div').remove('span'). \
                    text().replace(' ', '').strip()
            sub_model = {
                GsModel.ContributorInformation.SHAREHOLDER_NAME: inv,
                GsModel.ContributorInformation.SHAREHOLDER_TYPE: inv_type_cn,
                GsModel.ContributorInformation.CERTIFICATE_TYPE: b_lic_type_cn,
                GsModel.ContributorInformation.CERTIFICATE_NO: b_lic_no
            }
            con_table_dict[inv_id] = sub_model
    return con_table_dict
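# --- Hedged usage sketch (not in the original source) ---
# Illustrates the input shape contributive_info_list expects: a list of fetch
# results whose `text` field holds the JSON payload. The values below are
# made-up sample data, not taken from any real record.
sample_con_table_list = [
    {
        'status': 'success',
        'text': '{"data": [{"inv": "示例股东", "invId": "1", "invType_CN": "自然人",'
                ' "bLicNo": "", "blicType_CN": ""}]}',
    },
]
# con_table_dict = contributive_info_list(sample_con_table_list)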
SinaBlogUrl = 'http://blog.sina.com.cn/s/articlelist_' + SinaBlogID + '_0_1.html'
print(' >> Read Url: ' + SinaBlogUrl)
BlogML = urllib2.urlopen(SinaBlogUrl).read()

# Read the blog category menu
BlogMLHtml = PyQuery(BlogML)('div.menuList').html()
BlogMLHtml = PyQuery(BlogMLHtml)('a')
BlogMLList = {}
for li in BlogMLHtml.items():
    # Skip the "favourite posts" category (博文收藏)
    if li.text() != u'\u535a\u6587\u6536\u85cf':
        BlogMLList[li.text()] = li.attr('href')
BlogMLList = sorted(BlogMLList.items(), key=lambda d: d[0])
BlogLB = BlogML

# Work out the total number of listing pages
BlogLsHtml = PyQuery(BlogLB)('ul.SG_pages').html()
if BlogLsHtml.strip() != '':
    BlogPgHtml = int(
        PyQuery(BlogLsHtml)('span').text().replace(u'共', '').replace(u'页', ''))
else:
    BlogPgHtml = 1
BlogPgHtmlZ = BlogPgHtml

# Work out the total number of posts
BlogLsHtml = PyQuery(BlogLB)('div.SG_colW73').html()
BlogLsHtml = PyQuery(BlogLsHtml)('div.SG_connHead').html()
BlogLsHtml = PyQuery(BlogLsHtml)('span.title').html()
BlogCtHtml = int(
    PyQuery(BlogLsHtml)('em').text().replace(u'(', '').replace(u')', ''))
BlogCtHtmlZ = BlogCtHtml

BlogMLList2 = {}
BlogCounts = 0
# Report the category count and total page count
print(' >> 类别数: ' + str(len(BlogMLList)) + ', 总页数:' + str(BlogPgHtmlZ) +
def parse_detail(title, date, url, content, filename):
    if not content:
        return None
    jq = PyQuery(content)
    res_json = {
        'bread': [u'留学'],  # breadcrumb: "study abroad"
        'title': title,
        'date': date,
        'source': u'liuxue86',
        'url': url,
        'class': 36,
        'subject': u'经验',  # "experience"
        'data_weight': 0,
    }
    methods = []
    content = [each for each in jq('p').items() if each.text().strip() != '']
    # print PyQuery(content[0]).html()
    print len(content)
    if not content:
        return None
    flag_abstract = True
    flag_method = False
    flag_first = True
    steps = []
    _list = []
    img = ''
    step_title = ''
    substeps = []
    for each_json in content:
        each = PyQuery(each_json).text()
        # Stop at promotional / "related reading" boilerplate paragraphs.
        if u'点击查看' in each or u'原文来源' in each or u'点击此处' in each \
                or u'推荐阅读' in each or u'相关推荐' in each:
            break
        if 'ue86.com' in PyQuery(each_json).text():
            if u'】' != PyQuery(each_json).text().strip()[-1] and u'】' in PyQuery(each_json).text():
                each = PyQuery(each_json).text().split(u'】')[1]
                # print each
            elif u'】' == PyQuery(each_json).text().strip()[-1]:
                # print each
                continue
            # print each[1].decode('utf8')
        if each == 'None' or each.strip() == '':
            continue
        if u'相关阅读' in each or u'扫一扫' in each or u'相关链接' in each or u'天道提示' in each:
            break
        # A paragraph counts as a step title if it looks like "一、...",
        # contains <strong>, has a colon near the start, or ends with "】".
        is_step_title = (each.strip()[1] == u'、'
                         or '<strong>' in PyQuery(each_json).html()
                         or (each.strip()[3] == ':' if len(each.strip()) > 3 else False)
                         or each.strip()[-1] == u'】')
        if is_step_title:
            flag_abstract = False
            flag_method = True
        if flag_abstract:
            steps.append(each)
        else:
            if is_step_title:
                if not flag_first:
                    _list.append({
                        'img': img,
                        'title': step_title,
                        'substeps': substeps,
                    })
                    img = ''
                    step_title = '<strong>' + each + '</strong>'
                    substeps = []
                if flag_first:
                    step_title = '<strong>' + each + '</strong>'
                    flag_first = False
            else:
                substeps.append(each)
    _list.append({
        'img': img,
        'title': step_title,
        'substeps': substeps,
    })
    if flag_method:
        methods.append({'title': u'方法/步骤', 'steps': _list})  # "method/steps"
        abstract = {
            'title': '',
            'steps': steps,
            'img': '',
        }
    else:
        _list1 = []
        for v in steps[1:]:
            _list1.append({
                'img': '',
                'title': v,
                'substeps': '',
            })
        methods.append({'title': u'方法/步骤', 'steps': _list1})
        if len(steps) == 0:
            steps = ['']
        abstract = {
            'title': '',
            'steps': [steps[0]],
            'img': '',
        }
    res_json['methods'] = methods
    res_json['abstract'] = abstract
    # print res_json
    # print methods
    good = True
    print json.dumps(res_json)
    return json.dumps(res_json), good
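# --- Hedged usage sketch (not in the original source) ---
# Shows how parse_detail might be driven from a fetched article page; the URL
# and the body selector are assumptions, not part of the original crawler.
import requests
from pyquery import PyQuery


def crawl_one_article(article_url):
    resp = requests.get(article_url)
    body_html = PyQuery(resp.content)('div.content').html()  # hypothetical selector
    return parse_detail(u'some title', u'2017-01-01', article_url, body_html, 'out.json')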