def remove_some_elem(self, elem):
    """Clean MathJax leftovers, script tags and ``id`` attributes out of ``elem``.

    The steps run in a fixed order: literal fragments are deleted, two tag
    patterns are passed through ``remove_tag`` with ``all=True``, every
    `` id="..."`` attribute is dropped, and the result is handed to
    ``self.compress_class``.

    :param elem: HTML fragment as a string.
    :return: whatever ``self.compress_class`` returns for the cleaned fragment.
    """
    # Plain substrings that are simply deleted.
    for fragment in (
        '<span class="MathJax_Preview"></span>',
        ' style=""',
        'MJXc-processed',
    ):
        elem = elem.replace(fragment, '')

    # Opening-tag patterns handled by the project helper ``remove_tag``.
    for opening in ('<script ', '<span class="MathJax_Preview">'):
        elem = remove_tag(opening, elem, all=True)

    # Non-greedy so each id attribute is removed on its own.
    without_ids = re.sub(r' id=".+?"', '', elem)
    return self.compress_class(without_ids)
def get_fenxi(entity):
    """Extract the analysis text from ``entity``.

    Looks up the first ``<li class="Analytical">`` element, passes it through
    ``remove_tag('<XHTML', ...)``, drops the first ``【解析】`` label and
    trims surrounding whitespace.

    :param entity: HTML string of one scraped item.
    :return: the cleaned analysis text, or ``''`` when the first match is falsy.
    """
    # NOTE(review): indexing [0] assumes get_html_element always returns a
    # non-empty sequence (possibly holding an empty value) - confirm.
    matches = get_html_element('<li class="Analytical">', entity,
                               with_tag=False)
    analysis = matches[0]
    if analysis:
        analysis = remove_tag('<XHTML', analysis, all=False).strip()
        # Only the first label occurrence is removed.
        return analysis.replace('【解析】', '', 1).strip()
    return ''
def get_answer_all_html(entity):
    """Extract the answer text from ``entity``.

    Looks up the first ``<li class="Answer">`` element, passes it through
    ``remove_tag('<XHTML', ...)``, drops the first ``【答案】`` label and
    trims surrounding whitespace.

    :param entity: HTML string of one scraped item.
    :return: the cleaned answer text, or ``''`` when the first match is falsy.
    """
    # NOTE(review): indexing [0] assumes get_html_element always returns a
    # non-empty sequence (possibly holding an empty value) - confirm.
    matches = get_html_element('<li class="Answer">', entity, with_tag=False)
    answer = matches[0]
    if answer:
        answer = remove_tag('<XHTML', answer, all=False).strip()
        # Only the first label occurrence is removed.
        return answer.replace('【答案】', '', 1).strip()
    return ''
def __init__(self, span):
    """Record position, text and optional width parsed from ``span``.

    :param span: the raw ``<span ...>`` markup as a string.
    :raises AttributeError: if ``re_left`` or ``re_top`` finds no match,
        because ``.group`` is then called on ``None``.
    """
    self.span = span

    # left and top are mandatory: a missing match fails loudly here.
    left_match = re_left.search(span)
    self.left = int(left_match.group(1))
    top_match = re_top.search(span)
    self.top = int(top_match.group(1))

    self.text = remove_tag('<span ', span)

    # width is optional and falls back to None.
    width_match = re_width.search(span)
    self.width = int(width_match.group(1)) if width_match else None
def format_spans(html_string):
    """Rewrite styled ``<span>`` elements and drop the remaining spans.

    Spans whose opening tag mentions ``text-decoration`` or
    ``vertical-align`` are replaced as follows:

    * underline      -> ``UNDERLINE`` template filled with the inner text
    * none           -> the inner text only
    * line-through   -> ``LINE_THROUGH`` template filled with the inner text
    * ``:sub`` / ``:sup`` -> ``<sub>`` / ``<sup>`` wrappers
    * other vertical-align values -> the inner text only

    Generated markup temporarily uses ``<sspan>`` so the ``remove_tag`` loop
    on ``<span`` near the end of this function does not touch it; the name is
    restored before returning.

    :param html_string: HTML fragment as a string.
    :return: the rewritten HTML string.
    """
    def _shield(template):
        # Rename span -> sspan so the template survives the <span> cleanup.
        return template.replace('<span', '<sspan')\
            .replace('</span>', '</sspan>')

    strike_tpl = _shield(LINE_THROUGH)
    underline_tpl = _shield(UNDERLINE)

    styled = get_html_element(
        '<span [^<>]+(text-decoration|vertical-align)',
        html_string, regex=True, flags=re.I)

    # Unique matches, longest first.
    for original in sorted(set(styled), key=len, reverse=True):
        inner = remove_start_tag(original)
        # Lower-cased opening tag (text before the first '>').
        opening = original[:original.find('>')].lower()

        if 'text-decoration' in opening:
            if 'underline' in opening:
                replacement = underline_tpl.format(inner)
            elif 'none' in opening:
                replacement = inner
            elif 'line-through' in opening:
                replacement = strike_tpl.format(inner)
            else:
                # Unknown decoration: leave for the generic cleanup below.
                continue
        elif 'vertical-align' in opening:
            if ':sub' in opening:
                replacement = '<sub>{}</sub>'.format(inner)
            elif ':sup' in opening:
                replacement = '<sup>{}</sup>'.format(inner)
            else:
                replacement = inner
        else:
            continue

        html_string = html_string.replace(original, replacement)

    # Run remove_tag at least once, then repeat while a '<span' is still found.
    html_string = remove_tag('<span', html_string, all=False, flags=re.I)
    while get_html_element('<span', html_string):
        html_string = remove_tag('<span', html_string, all=False, flags=re.I)

    # Restore the shielded tag name.
    return html_string.replace('<sspan', '<span')\
        .replace('</sspan>', '</span>')
def _discard_mathml_displaystyle_for_subsup(mathml):
    """Drop ``<mstyle displaystyle ...>`` wrappers found inside sub/superscripts.

    Fractions placed in a subscript or superscript do not need display
    style, so each ``<msub>``/``<msup>`` body is passed through
    ``remove_tag('<mstyle displaystyle', ...)`` and substituted back.

    :param mathml: MathML markup as a string.
    :return: the MathML string after the substitutions.
    """
    found = find_valid_elements(mathml, '<msu(b|p)', regex=True,
                                with_tag=False)
    unique = list(set(found))
    # NOTE(review): fragments are de-duplicated but each is substituted only
    # once, so a fragment occurring twice in ``mathml`` keeps its second copy
    # unchanged - confirm this is intended.
    for fragment in sort_by_len(unique, reverse=True):
        cleaned = remove_tag('<mstyle displaystyle', fragment, all=False)
        mathml = mathml.replace(fragment, cleaned, 1)
    return mathml
def fix_any(self, html_string):
    """Normalise an HTML fragment: flatten, simplify tags, drop empty blocks.

    Steps, in order: remove newlines, replace ``re_p_tag`` matches with a
    bare ``<p>``, run ``handle_spans`` and ``remove_tag('<span', ...)``,
    delete a fixed set of empty constructs, substitute ``re_nbsp`` and
    ``re_underline`` matches, then rename ``<sspan`` back to ``<span``.

    :param html_string: HTML fragment as a string.
    :return: the cleaned HTML string.
    """
    flattened = html_string.replace('\n', '')
    flattened = re_p_tag.sub('<p>', flattened)
    flattened = handle_spans(flattened)
    flattened = remove_tag('<span', flattened)

    # Empty constructs and stray breaks, replaced in this exact order.
    for old, new in (
        ('<p></p>', ''),
        ('<p><br></p>', ''),
        ('<div><br></div>', ''),
        ('<o:p></o:p>', ''),
        ('</p><br>', '</p>'),
    ):
        flattened = flattened.replace(old, new)

    # NOTE(review): the filler character is kept exactly as in the original
    # literal; it may be a special whitespace character - confirm.
    gap = ' ' * 6
    flattened = re_nbsp.sub(gap, flattened)
    flattened = re_underline.sub(UNDERLINE.format(gap), flattened)

    # NOTE(review): only the opening '<sspan' marker is renamed here; confirm
    # handle_spans never leaves a '</sspan>' behind.
    return flattened.replace('<sspan', '<span')
def get_question_html(entity):
    """Extract the question (or option) markup from ``entity``.

    Entities starting with ``<li`` are searched for
    ``<li class="IsTopic">``; everything else is searched for
    ``<span class="optionoption">``.  The first match is passed through
    ``remove_tag('<XHTML', ...)`` and, for entities starting with ``<span``,
    additionally through ``make_option``.

    :param entity: HTML string of one scraped item.
    :return: the cleaned markup, or ``''`` when the first match is falsy.
    """
    if entity.startswith('<li'):
        opening = '<li class="IsTopic">'
    else:
        opening = '<span class="optionoption">'

    body = get_html_element(opening, entity, with_tag=False, limit=1)[0]
    if not body:
        return ''

    body = remove_tag('<XHTML', body, all=False).strip()
    if entity.startswith('<span'):
        body = make_option(body)
    return body.strip()
def fix_any(self, html_string):
    """Run the span, font and anchor clean-ups and trim the result.

    Applies ``format_spans``, then ``remove_tag('<font', ...)``, then
    ``remove_a_tag``, in that order.

    :param html_string: HTML fragment as a string.
    :return: the cleaned HTML with surrounding whitespace stripped.
    """
    cleaned = format_spans(html_string)
    cleaned = remove_tag('<font', cleaned)
    return remove_a_tag(cleaned).strip()
def fix_any(html_string):
    """Pass ``html_string`` through ``remove_tag('<div', ...)`` and normalise one character.

    :param html_string: HTML fragment as a string.
    :return: the processed HTML string.
    """
    without_divs = remove_tag('<div', html_string, flags=re.I)
    # NOTE(review): both literals are kept exactly as in the original; the
    # first is presumably a special whitespace character (e.g. a non-breaking
    # space) that looks like a plain space - confirm.
    return without_divs.replace(' ', ' ')
def parse(self, key, qs_json, as_json, aft_subj_id):
    """Build the column dict for one question from its two JSON payloads.

    HTML fields are run through ``self.html_magic.bewitch``, the module-level
    ``fix_any`` and ``center_image``, in that order.

    :param key: source URL of the question; stored as ``spider_url`` and
        forwarded to ``bewitch``.
    :param qs_json: question payload; reads ``test`` (required), ``diff``,
        ``docname`` and ``typesname``.
    :param as_json: answer payload; the record at ``['data'][1][0][0]``
        supplies ``answer``, ``analytic``, ``kllist`` and ``remark``.
    :param aft_subj_id: value stored under ``subject``.
    :return: dict of column name -> value.
    :raises KeyError, IndexError: when ``qs_json['test']`` or the nested
        answer record is missing.
    """
    cols = dict()

    # --- question body -------------------------------------------------
    question_html = self.html_magic.bewitch(qs_json['test'], spider_url=key)
    question_html = fix_any(question_html)
    cols['question_html'] = center_image(question_html)

    # --- difficulty ----------------------------------------------------
    # presumably ``diff`` is a 0-1 fraction - confirm against the API.
    raw_diff = qs_json.get('diff')
    if not raw_diff:
        difficulty = 0
    else:
        # Binary floats make e.g. 0.29 * 100 == 28.999999999999996, which a
        # bare int() would truncate to 28.  Rounding away the representation
        # error first keeps the intended percentage while still truncating
        # genuinely fractional percentages as before.
        difficulty = 100 - int(round(float(raw_diff) * 100, 6))
    cols['difficulty'] = difficulty

    # --- paper name and exam year ---------------------------------------
    paper_name = qs_json.get('docname') or ''
    cols['paper_name'] = paper_name

    # The negative lookahead (instead of requiring a trailing non-digit
    # character) also accepts a year sitting at the very end of the name.
    # NOTE(review): the third-digit class [0189] excludes 2020-2029 -
    # confirm whether those years should be recognised.
    mod = re.search(r'([12][09][0189]\d)(?!\d)', paper_name)
    cols['exam_year'] = int(mod.group(1)) if mod else 0

    # --- question type ---------------------------------------------------
    cols['question_type_str'] = qs_json.get('typesname') or ''

    # --- answer and analysis ----------------------------------------------
    as_js = as_json['data'][1][0][0]

    answer_all_html = self.html_magic.bewitch(as_js.get('answer') or '',
                                              spider_url=key)
    answer_all_html = fix_any(answer_all_html)
    cols['answer_all_html'] = center_image(answer_all_html)

    fenxi = self.html_magic.bewitch(as_js.get('analytic') or '',
                                    spider_url=key)
    fenxi = fix_any(fenxi)
    cols['fenxi'] = center_image(fenxi)

    # --- knowledge points --------------------------------------------------
    # ``kllist`` is split on '<br>' into entries and each entry on ' >> '
    # into a path; the last path element is the knowledge point itself.
    kpstr = remove_tag('<span', as_js.get('kllist') or '', all=True)
    knowledge_paths = [entry.split(' >> ') for entry in kpstr.split('<br>')]
    cols['knowledge_point'] = ';'.join(path[-1] for path in knowledge_paths)
    cols['knowledge_point_json'] = json.dumps(knowledge_paths,
                                              ensure_ascii=False)

    # --- remark ------------------------------------------------------------
    other_info = self.html_magic.bewitch(as_js.get('remark') or '',
                                         spider_url=key)
    other_info = fix_any(other_info)
    cols['other_info'] = center_image(other_info)

    # --- fixed / pass-through columns ----------------------------------------
    cols['spider_url'] = key
    cols['subject'] = aft_subj_id
    cols['exam_city'] = ''
    cols['paper_url'] = ''
    cols['zhuanti'] = ''
    cols['option_html'] = ''
    cols['jieda'] = ''
    cols['dianping'] = ''
    cols['spider_source'] = 52
    cols['question_type'] = 0
    cols['question_quality'] = 0
    return cols