Пример #1
0
def main():
    """Push a sample question fragment through HtmlMagic and print the result.

    The fragment is bewitched by an ``HtmlMagic`` built with
    ``download=True`` and ``beautify=False``, then handed to
    ``center_image`` before being written to stdout.
    """
    sample_html = '''
<TBODY>
    <TR>
        <TD>若
            <IMG style="WIDTH: 18px; HEIGHT: 16px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716662863.png">=3,
            <IMG style="WIDTH: 18px; HEIGHT: 14px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716732789.png">=7,则x﹣y的值为&nbsp;&nbsp;&nbsp;&nbsp;</TD>
    </TR>
    <TR>
        <TD>
            <DIV align=right>[&nbsp;&nbsp;&nbsp;&nbsp; ]</DIV>
        </TD>
    </TR>
    <TR>
        <TD>A.±4&nbsp;&nbsp;&nbsp;&nbsp;
            <BR>B.±10&nbsp;&nbsp;&nbsp;&nbsp;
            <BR>C.﹣4或﹣10&nbsp;&nbsp;&nbsp;&nbsp;
            <BR>D.±4或±10</TD>
    </TR>
</TBODY>
</TABLE>
    '''

    magic = HtmlMagic(8, download=True, beautify=False)
    # Page the fragment was taken from; passed along as spider_url.
    page_url = (
        'http://www.mofangge.com/html/qDetail/02/c1/201208/1kzkc102222121.html'
    )
    bewitched = magic.bewitch(sample_html, spider_url=page_url, spider_source=8)

    print(center_image(bewitched))
Пример #2
0
class ImageCover(object):
    """Re-run HtmlMagic over the image-bearing HTML fields of stored questions.

    For every ``AnoahQuestion`` row, each field listed in ``fields`` that
    contains an ``<img`` tag and does not already reference the
    ``qimg.afanti100.com`` host is bewitched and written back.
    """

    NAME = "image_cover"
    model = AnoahQuestion
    fields = [
        'fenxi',
        'option_html',
        'question_html_origin',
        'option_html_origin',
        'fenxi_origin',
        'answer_all_html_origin',
        'answer_all_html',
    ]

    def set_magic(self):
        """Create ``self.html_magic``; HtmlMagic is imported lazily here."""
        from afanti_tiku_lib.html.magic import HtmlMagic
        self.html_magic = HtmlMagic(75, archive_image=True, download=True)

    def get_objects_id(self):
        """Return the ``question_id`` of every model row as a list."""
        rows = self.model.objects.all().values_list('question_id')
        return [row[0] for row in rows]

    def has_cover(self, html):
        """Return True when *html* already references the qimg host."""
        return 'http://qimg.afanti100.com/data' in html

    def is_image_in(self, question):
        """Build a predicate telling whether a field of *question* needs work."""

        def in_question(field):
            print(question.question_id)
            html = getattr(question, field)
            # Empty fields and fields already pointing at the qimg host are
            # skipped.
            if not html or self.has_cover(html):
                return False
            return '<img' in html

        return in_question

    def bewitch_html(self, question):
        """Build a callable that bewitches one field of *question* in place."""

        def bewitch_question(field):
            original = getattr(question, field)
            rewritten = self.html_magic.bewitch(original,
                                                spider_url=question.spider_url)
            setattr(question, field, rewritten)

        return bewitch_question

    def run_parser(self, _id):
        """Bewitch every qualifying field of one question; save if any changed."""
        q = self.model.objects.get(question_id=_id)
        magic = self.bewitch_html(q)
        has_image = self.is_image_in(q)

        touched = False
        # Check and rewrite field by field, in ``fields`` order.
        for field in self.fields:
            if has_image(field):
                magic(field)
                touched = True
        if touched:
            q.save()

    def start(self):
        """Entry point; delegates to :meth:`run`."""
        self.run()

    def run(self):
        """Set up HtmlMagic and process every question id in turn."""
        self.set_magic()
        for question_id in self.get_objects_id():
            self.run_parser(question_id)
Пример #3
0
def tableToJson(table):
    """Read rows from *table* in ``html_archive`` and convert them to dicts.

    Selects up to 1000 rows with ``source_id > 1`` whose ``topic`` does not
    contain the yitikuimage OSS host, passes ``topic``/``answer``/``analy``
    through ``replace_href`` and ``HtmlMagic.bewitch``, and returns one dict
    per row.

    Args:
        table: Name of the table to read. It is interpolated into the SQL
            text, so it must come from trusted code.

    Returns:
        A list of dicts, one per fetched row.
    """
    # Close the config file handle deterministically.
    with open(CONFIG_FILE) as config_file:
        config = json.load(config_file)
    first_id = 1
    conn = pymysql.connect(host=config['host'], user=config['user'],
                           passwd=config['password'], db='html_archive',
                           port=3306, charset="utf8", use_unicode=True,
                           cursorclass=pymysql.cursors.DictCursor)
    sql = 'select * from {0} where source_id > {1} and topic not like "%yitikuimage.oss-cn-qingdao.aliyuncs.com%"  limit 1000'.format(
        table, first_id)
    # All rows are fetched up front, so the cursor and the connection can be
    # released before the (slow) per-row processing starts.
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            data = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    pattern_item = {
        '单选': '1',
        '填空': '2',
        '多选': '4'
    }
    mapping_dict = {
        'question_id': 'source_id',
        'subject': 'subject',
        'spider_url': 'spider_url',
        'knowledge_point': 'kaodian',
        'difficulty': 'difficulty',
        'source': 'spider_source'
    }
    jsonData = []
    for row in data:
        spider_source = int(row['spider_source'])
        spider_url = row['spider_url']
        image_parse = HtmlMagic(spider_source=spider_source, download=True,
                                archive_image=False)
        result1 = {}

        pattern = row['pattern']
        result1['question_type_name'] = pattern
        # Map the type name to its code; any name that matches none of the
        # known keys (and is two or more characters long) becomes '3'.
        for key, value in pattern_item.items():
            if key in pattern:
                pattern = value
        if len(pattern) >= 2:
            pattern = '3'
        result1['question_type'] = pattern

        topic = row['topic']
        topic = replace_href(topic)
        topic = remove_tags(text=topic, which_ones=('h1', 'div'))
        topic = image_parse.bewitch(html_string=topic, spider_url=spider_url,
                                    spider_source=spider_source)
        result1['question_body'] = topic

        answer = replace_href(row['answer'])
        answer = image_parse.bewitch(html_string=answer, spider_url=spider_url,
                                     spider_source=spider_source)
        result1['answer'] = answer

        analy = replace_href(row['analy'])
        analy = image_parse.bewitch(html_string=analy, spider_url=spider_url,
                                    spider_source=spider_source)
        result1['analy'] = analy

        # 'paper_name' is only present when the source span is found.
        paper_names = re.findall('<span class="colf43">来源:(.+?)</span>',
                                 row['source_shijuan'])
        if paper_names:
            result1['paper_name'] = paper_names[0]

        result2 = {
            key: row.get(value, '')
            for key, value in mapping_dict.items()
        }

        # Column-mapped values win over the parsed ones on key collisions.
        jsonData.append(dict(result1, **result2))
    return jsonData
Пример #4
0
class Dz101QuestionParser(object):
    """Parser for dz101 question pages (spider source 56)."""

    def __init__(self, archive_image=False, download=False):
        self.html_magic = HtmlMagic(
            56,
            archive_image=archive_image,
            download=download,
            beautify=False,
        )

    def _polish(self, fragments, url):
        """Join *fragments* with a ``<br>`` line and run the clean-up chain."""
        html = '<br>\n'.join(fragments)
        html = self.html_magic.bewitch(html, spider_url=url)
        html = center_image(html)
        html = fix_any(html)
        return displaystyle(html, latex=False, mml=True)

    def parse(self, html_string, url, aft_subj_id):
        """Extract question, answer and analysis HTML from one page.

        Args:
            html_string: Raw page HTML.
            url: Page URL; stored as ``spider_url`` and passed to HtmlMagic.
            aft_subj_id: Subject id stored verbatim under ``subject``.

        Returns:
            A dict of column values. The normalized HTML goes into the
            ``*_origin`` columns; the plain HTML columns are left empty.
        """
        exam_year = 0
        paper_name = ''

        question_parts = []
        answer_parts = []
        fenxi_parts = []

        # (class marker, extractor, destination list). Order matters: the
        # first marker found in the element head wins.
        handlers = (
            ('"IsTopic"', get_question_html, question_parts),
            ('"optionoption"', get_question_html, question_parts),
            ('"Answer"', get_answer_all_html, answer_parts),
            ('"Analytical"', get_fenxi, fenxi_parts),
        )

        elems = get_html_element(
            '<(li|span) class="(IsTopic|Answer|Analytical|optionoption)',
            html_string,
            regex=True)

        topic_no = -1
        for elem in elems:
            head = elem[:30]
            for marker, extract, bucket in handlers:
                if marker not in head:
                    continue

                entity = extract(elem)
                # Answers/analyses after the second topic get a "(n). " prefix.
                if topic_no > 0 and marker in ('"Answer"', '"Analytical"'):
                    entity = '({}). {}'.format(topic_no, entity)

                # The very first topic carries the exam info.
                if topic_no == -1 and marker == '"IsTopic"':
                    exam_year, paper_name = get_exam_info(entity)
                    entity = remove_exam_info(entity)

                bucket.append(entity)

                if marker == '"IsTopic"':
                    topic_no += 1
                break

        question_html = self._polish(question_parts, url)
        answer_all_html = self._polish(answer_parts, url)
        fenxi = self._polish(fenxi_parts, url)

        cols = {
            'difficulty': get_difficulty(html_string),
            'question_type_str': get_question_type_str(html_string),

            'question_html': '',
            'option_html': '',
            'answer_all_html': '',
            'jieda': '',
            'fenxi': '',
            'dianping': '',

            'option_html_origin': '',
            'jieda_origin': '',
            'dianping_origin': '',

            'zhuanti': '',
            'paper_name': paper_name,
            'paper_url': '',
            'spider_url': url,
            'subject': aft_subj_id,
            'spider_source': 56,
            'question_type': 0,
            'question_quality': 0,
            'knowledge_point': '',
            'exam_year': exam_year,
            'exam_city': '',
        }

        standard_question = Question(
            question_body=question_html,
            answer=answer_all_html,
            analy=fenxi,
        ).normialize()
        cols['question_html_origin'] = standard_question['question_body']
        cols['answer_all_html_origin'] = standard_question['answer']
        cols['fenxi_origin'] = standard_question['analy']

        return cols
Пример #5
0
class GzywtkParser(object):
    """Parser for gzywtk question pages (spider source 68, subject 21)."""

    def __init__(self, archive_image=False, download=False):
        # image formatting
        self.html_magic = HtmlMagic(68, # XXX, spider_source
                                    archive_image=archive_image,
                                    download=download, beautify=False)

    def parse(self, html_string, url):
        """Build the column dict for one question page.

        Args:
            html_string: Raw page HTML.
            url: Page URL; stored as ``spider_url`` and used by HtmlMagic.

        Returns:
            A dict of column values for the question.
        """
        self.url = url

        cols = dict()

        question_html, jieda = self.get_question_html(html_string)
        cols['question_html'] = question_html
        cols['jieda'] = jieda

        kps = self.get_kps(html_string)
        cols['knowledge_point'] = kps

        paper_url, paper_name = self.get_paper(html_string)

        cols['paper_url'] = paper_url
        cols['paper_name'] = paper_name

        cols['answer_all_html'] = ''
        cols['fenxi'] = ''
        cols['dianping'] = ''

        cols['difficulty'] = 0
        cols['zhuanti'] = ''
        cols['spider_url'] = url
        cols['subject'] = 21
        cols['spider_source'] = 68
        cols['question_type'] = 0
        cols['question_quality'] = 0
        cols['exam_year'] = 0
        cols['exam_city'] = ''
        cols['option_html'] = ''

        return cols

    def get_question_html(self, html_string):
        """Return the processed ``<div class="content">`` blocks as a list.

        ``parse`` unpacks the result into (question, solution), so the page
        is expected to contain exactly two such blocks. The second block is
        additionally cut at its first link and stripped of carriage returns.
        """
        rs = []
        cns = get_html_element('<div class="content">', html_string, with_tag=False)
        for cn in cns:
            cn = abs_url(cn)
            cn = center_image(cn)
            cn = self.html_magic.bewitch(cn, spider_url=self.url)
            rs.append(cn.strip())
        rs[1] = self.fix_any(rs[1]).replace('\r', '').strip()
        return rs

    def get_kps(self, html_string):
        """Return the ';'-joined knowledge points of the page.

        Only the first line containing the knowledge-point label is
        inspected. The ``re_kps`` matches are preferred; otherwise the raw
        text between ``</b>`` and ``</li>`` is used with '-' turned into ';'
        (or '' when that text is missing too).

        Returns:
            The joined string, or ``None`` when no label line exists.
        """
        for line in html_string.split('\n'):
            if '<b>考点详细:</b>' not in line:
                continue
            joined = ';'.join(re_kps.findall(line))
            if joined:
                return joined
            # Only compute the fallback when it is needed, and do not crash
            # when the line has no "</b>...</li>" span.
            raw = re.findall('</b>(.+?)</li>', line)
            return raw[0].replace('-', ';') if raw else ''
        return None

    def get_paper(self, html_string):
        """Return ``(paper_url, paper_name)``, or ``('', '')`` when absent."""
        found = re.search('所属试卷:(.+?)</a>', html_string)
        if found is None:
            # Page has no paper link at all.
            return '', ''
        mod = re_paper.search(found.group())
        if mod:
            paper = 'http://www.gzywtk.com' + mod.group(1)
            paper_name = mod.group(2)
            return paper, paper_name
        return '', ''

    def fix_any(self, html_string):
        """Cut *html_string* at its first ``<a href=``; unchanged if none."""
        i = html_string.find('<a href=')
        if i == -1:
            # str.find returns -1 on a miss; slicing with it would silently
            # drop the last character.
            return html_string
        return html_string[:i]
Пример #6
0
def parse_detail(row):
    """Convert one archive row into a normalized question dict.

    ``topic``/``answer``/``analy`` are passed through ``replace_href`` and
    ``HtmlMagic.bewitch`` and then normalized via ``Question.normialize``.

    Args:
        row: Mapping with at least ``spider_source``, ``spider_url``,
            ``pattern``, ``topic``, ``answer``, ``analy`` and
            ``source_shijuan``.

    Returns:
        A dict combining the parsed fields with the directly mapped columns.
    """
    pattern_item = {'单选': '1', '填空': '2', '多选': '4'}
    spider_source = int(row['spider_source'])
    spider_url = row['spider_url']
    image_parse = HtmlMagic(spider_source=spider_source,
                            download=True,
                            archive_image=False)
    result1 = {}

    pattern = row['pattern']
    result1['question_type_name'] = pattern
    # Map the type name to its code; any name that matches none of the known
    # keys (and is two or more characters long) becomes '3'.
    for key, value in pattern_item.items():
        if key in pattern:
            pattern = value
    if len(pattern) >= 2:
        pattern = '3'
    result1['question_type'] = pattern

    topic = row['topic']
    topic = replace_href(topic)
    topic = remove_tags(text=topic, which_ones=('h1', 'div'))
    topic = image_parse.bewitch(html_string=topic,
                                spider_url=spider_url,
                                spider_source=spider_source)

    answer = replace_href(row['answer'])
    answer = image_parse.bewitch(html_string=answer,
                                 spider_url=spider_url,
                                 spider_source=spider_source)

    analy = replace_href(row['analy'])
    analy = image_parse.bewitch(html_string=analy,
                                spider_url=spider_url,
                                spider_source=spider_source)

    _question = Question(
        question_body=topic,
        answer=answer,
        analy=analy,
    )
    standard_question = _question.normialize()
    result1['question_body'] = standard_question['question_body']
    result1['answer'] = standard_question['answer']
    result1['analy'] = standard_question['analy']

    paper_names = re.findall('<span class="colf43">来源:(.+?)</span>',
                             row['source_shijuan'])
    result1['paper_name'] = paper_names[0] if paper_names else ''

    # output key -> row column
    mapping_dict = {
        'question_id': 'source_id',
        'subject': 'subject',
        'spider_url': 'spider_url',
        'knowledge_point': 'kaodian',
        'difficulty': 'difficulty',
        'source': 'spider_source',
        # Was misspelled 'spdier_source', which made the output value
        # always fall back to ''.
        'spider_source': 'spider_source'
    }
    result2 = {key: row.get(value, '') for key, value in mapping_dict.items()}

    # Column-mapped values win over the parsed ones on key collisions.
    result = dict(result1, **result2)

    return result
Пример #7
0
class Manfen5ZujuanParser(object):
    """Parser for manfen5 zujuan question pages (spider source 80)."""

    def __init__(self, archive_image=False, download=False):
        # image formatting
        self.html_magic = HtmlMagic(
            80,  # XXX, spider_source
            archive_image=archive_image,
            download=download,
            beautify=False,
        )

    def parse(self, html_string, url, info):
        """Build the column dict for one question page.

        Args:
            html_string: Raw page HTML; its ``<td`` elements are read by index.
            url: Page URL; stored as ``spider_url`` and used by HtmlMagic.
            info: Mapping whose ``'subj'`` entry selects the subject.

        Returns:
            A dict of column values for the question.
        """
        self.url = url

        cells = find_valid_elements(html_string, '<td')

        body_html = self.get_question_html(cells[3])
        solution_html = self.get_jieda(cells[4])
        knowledge = self.get_kps(cells[2])
        type_name = self.get_question_type_name(cells[0])

        # Wrap in a Question object and unify the style.
        unified = Question(question_body=body_html,
                           jieda=solution_html).normialize()

        return {
            'question_html': unified['question_body'],
            'jieda': unified['jieda'],

            'knowledge_point': knowledge,
            'question_type_name': type_name,

            'subject': self.get_subject(info),
            'fenxi': '',
            'dianping': '',
            'answer_all_html': '',
            'option_html': '',

            'difficulty': 0,
            'zhuanti': '',
            'spider_url': url,
            'spider_source': 80,
            'question_type': 0,
            'question_quality': 0,
            'exam_year': 0,
            'exam_city': '',
        }

    def get_question_html(self, html_string):
        """Return the cleaned question body taken from a table cell."""
        found = get_html_element('<div', html_string, with_tag=False, limit=1)
        # Fall back to the cell content itself when it holds no <div>.
        body = found[0] if found else remove_start_tag(html_string)
        body = self.fix_any(body)
        body = center_image(body)
        body = self.html_magic.bewitch(body, spider_url=self.url)
        body = self.format_options(body)
        return body.strip()

    def get_jieda(self, html_string):
        """Return the cleaned solution taken from the red ``<font>`` element."""
        matches = get_html_element('<font color=red>', html_string,
                                   with_tag=False, limit=1)
        solution = self.fix_any(matches[0])
        solution = center_image(solution)
        solution = self.html_magic.bewitch(solution, spider_url=self.url)
        if solution.endswith('</div></p>'):
            # Drop the trailing '</p>'.
            solution = solution[:-len('</p>')]
        return solution.strip()

    def get_kps(self, html_string):
        """Return the knowledge points with ',' replaced by ';'."""
        first_bold = get_html_element('<b', html_string,
                                      with_tag=False, limit=1)[0]
        return first_bold.replace(',', ';').strip()

    def get_question_type_name(self, html_string):
        """Return the stripped text following the question-type label."""
        matches = get_html_element('题型:<b>', html_string,
                                   with_tag=False, limit=1)
        return matches[0].strip()

    def get_subject(self, info):
        """Look up ``info['subj']`` in ``SUBJS`` (None when unknown)."""
        return SUBJS.get(info['subj'])

    def fix_any(self, html_string):
        """Apply span formatting and remove ``<font`` and link tags."""
        cleaned = format_spans(html_string)
        cleaned = remove_tag('<font', cleaned)
        cleaned = remove_a_tag(cleaned)
        return cleaned.strip()

    def format_options(self, html_string):
        """Apply the ``re_opts`` substitution to *html_string*."""
        return re_opts.sub(r' <br>\2 ', html_string)
Пример #8
0
class Parser(object):
    """Parse archived question JSON (spider source 78) and store it in MySQL."""

    def __init__(self):
        self.logger = logging.getLogger('iter')
        self.sql_client = MySQLClient(**QUESTION_DICT)
        self.html_magic = HtmlMagic(spider_source=78, download=True, proxy=True)

    @staticmethod
    def _load_subject_info(raw_info):
        """Decode the item's ``info`` payload into a dict.

        Empty/missing info yields ``{}`` instead of being fed to
        ``json.loads`` (which rejects a dict argument). An already decoded
        dict is returned as is, mirroring what :meth:`parse` accepts.
        """
        if not raw_info:
            return {}
        if isinstance(raw_info, dict):
            return raw_info
        return json.loads(raw_info) or {}

    def deal_one_item(self, item):
        """Parse one archived item and (re)insert it into the question table.

        Args:
            item: Mapping with ``html``, ``html_id``, ``key`` (the spider
                URL) and ``info`` (JSON text carrying the subject name).

        Raises:
            Exception: Any error from the SQL client is logged and re-raised.
        """
        html = item['html']
        html_id = item['html_id']
        spider_url = item['key']

        subject_dict = self._load_subject_info(item['info'])
        subject_string = subject_dict.get('name', '')
        subject = convert_str_subject_to_int(subject_string)
        question_item = dict(
            spider_source=78,
            spider_url=spider_url,
            subject=subject
        )
        question_dict = self.parse(html)
        question_item['knowledge_point'] = question_dict['knowledge_point']
        question_item['paper_name_abbr'] = question_dict['paper_name_abbr']
        question_dict = Question(**question_dict).normialize()
        question_item['question_html'] = question_dict['question_body']
        question_item['option_html'] = ''
        question_item['jieda'] = ''
        question_item['zhuanti'] = ''
        question_item['question_type'] = 0
        question_item['answer_all_html'] = question_dict['answer']
        question_item['fenxi'] = question_dict['analy']
        question_item['dianping'] = question_dict['comment']

        for key in ['question_html', 'option_html', 'answer_all_html',
                    'fenxi', 'dianping']:
            question_item[key] = sub_word_tag(question_item[key])
            question_item[key] = self.html_magic.bewitch(
                question_item[key], question_item['spider_url'], spider_source=78, headers=HEADERS
            )

        try:
            # Replace any previously stored row for the same URL.
            if self.sql_client.select('select spider_url from question_db_offline.jtyhjy_question_20171010 where spider_url =%s', spider_url):
                self.sql_client.update('delete from question_db_offline.jtyhjy_question_20171010 where spider_url = %s limit 1', spider_url)
            self.sql_client.insert('question_db_offline.jtyhjy_question_20171010', **question_item)
        except Exception as err:
            self.logger.warning(
                'html_id: %s. error happend when insert question: %s',
                html_id, err
            )
            # Bare raise keeps the original traceback intact.
            raise

    def parse(self, html):
        """Pick the question fields out of the archived JSON payload.

        Args:
            html: JSON text, or an already decoded dict.

        Returns:
            A dict with ``question_body``, ``answer``, ``analy``,
            ``knowledge_point``, ``paper_name_abbr`` and ``difficulty``;
            an empty dict when the payload is empty.
        """
        if isinstance(html, dict):
            question_json = html
        else:
            question_json = json.loads(html)

        question_dict = {}
        if not question_json:
            return question_dict
        question_dict['question_body'] = question_json.get('bodyHtmlText', '')
        question_dict['answer'] = question_json.get('answerHtmlText', '')
        question_dict['analy'] = question_json.get('analysisHtmlText', '')
        question_dict['knowledge_point'] = question_json.get('knowledgeName', '')
        question_dict['paper_name_abbr'] = question_json.get('queSource', '')
        question_dict['difficulty'] = question_json.get('difficult', '')

        return question_dict
Пример #9
0
class Dz101QuestionParser(object):
    """Parser for dz101 question pages (spider source 56), MathML variant."""

    def __init__(self, archive_image=False, download=False):
        self.html_magic = HtmlMagic(
            56,
            archive_image=archive_image,
            download=download,
            beautify=False,
        )
        self.uri2oss = self.html_magic.image_magic.uri2oss

    def _to_mathml_html(self, raw_html, url):
        """Bewitch, center images and run ``handle_mathml`` on *raw_html*.

        Returns whatever ``handle_mathml`` returns; ``parse`` treats a
        ``False`` result as failure.
        """
        html = self.html_magic.bewitch(raw_html, spider_url=url)
        html = center_image(html)
        return handle_mathml(html, self.uri2oss, url)

    def parse(self, html_string, url, aft_subj_id):
        """Extract question, answer and analysis HTML from one page.

        Args:
            html_string: Raw page HTML.
            url: Page URL; stored as ``spider_url`` and passed to HtmlMagic.
            aft_subj_id: Subject id stored verbatim under ``subject``.

        Returns:
            A dict of column values, or ``False`` as soon as
            ``handle_mathml`` returns ``False`` for one of the sections.
        """
        exam_year = 0
        paper_name = ''

        question_parts = []
        answer_parts = []
        fenxi_parts = []

        # (class marker, extractor, destination list). Order matters: the
        # first marker found in the element head wins.
        handlers = (
            ('"IsTopic"', get_question_html, question_parts),
            ('"optionoption"', get_question_html, question_parts),
            ('"Answer"', get_answer_all_html, answer_parts),
            ('"Analytical"', get_fenxi, fenxi_parts),
        )

        elems = get_html_element(
            '<(li|span) class="(IsTopic|Answer|Analytical|optionoption)',
            html_string,
            regex=True)

        topic_no = -1
        for elem in elems:
            head = elem[:30]
            for marker, extract, bucket in handlers:
                if marker not in head:
                    continue

                entity = extract(elem)
                # Answers/analyses after the second topic get a "(n). " prefix.
                if topic_no > 0 and marker in ('"Answer"', '"Analytical"'):
                    entity = '({}). {}'.format(topic_no, entity)

                # The very first topic carries the exam info.
                if topic_no == -1 and marker == '"IsTopic"':
                    exam_year, paper_name = get_exam_info(entity)
                    entity = remove_exam_info(entity)

                bucket.append(entity)

                if marker == '"IsTopic"':
                    topic_no += 1
                break

        cols = {'question_all_html': '<br>\n'.join(question_parts)}

        # Sections are processed in order; the first failure aborts parsing.
        sections = (
            ('question_html', cols['question_all_html']),
            ('answer_all_html', '<br>\n'.join(answer_parts)),
            ('fenxi', '<br>\n'.join(fenxi_parts)),
        )
        for column, raw_html in sections:
            rendered = self._to_mathml_html(raw_html, url)
            if rendered is False:
                return False
            cols[column] = rendered

        cols.update({
            'difficulty': get_difficulty(html_string),
            'question_type_str': get_question_type_str(html_string),

            'dianping': '',
            'zhuanti': '',
            'paper_name': paper_name,
            'paper_url': '',
            'spider_url': url,
            'subject': aft_subj_id,
            'spider_source': 56,
            'question_type': 0,
            'question_quality': 0,
            'knowledge_point': '',
            'knowledge_point_json': json.dumps([]),
            'exam_year': exam_year,
            'exam_city': '',
            'option_html': '',
        })

        return cols
Пример #10
0
def tableToJson(table):
    """Read rows from *table* in ``html_archive`` and convert them to dicts.

    Selects up to 320000 rows, passes ``topic``/``answer``/``analy`` through
    ``replace_href`` and ``HtmlMagic.bewitch``, and returns one dict per row.

    Args:
        table: Name of the table to read. It is interpolated into the SQL
            text, so it must come from trusted code.

    Returns:
        A list of dicts, one per fetched row.

    Raises:
        IndexError: If a row's ``spider_url`` has no ``shiti/<id>.html`` part.
    """
    # Close the config file handle deterministically.
    with open(CONFIG_FILE) as config_file:
        config = json.load(config_file)
    conn = pymysql.connect(host=config['host'],
                           user=config['user'],
                           passwd=config['password'],
                           db='html_archive',
                           port=3306,
                           charset="utf8",
                           use_unicode=True,
                           cursorclass=pymysql.cursors.DictCursor)
    sql = 'select * from %s limit 320000' % table
    # All rows are fetched up front, so the cursor and the connection can be
    # released before the (slow) per-row processing starts.
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            data = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    jsonData = []
    pattern_item = {'单选': '1', '填空': '2', '多选': '4'}
    # output key -> row column
    mapping_dict = {
        'spider_sorce': 'spider_source',
        'subject': 'subject',
        'knowledge_point': 'kaodian',
        'difficulty': 'difficulty',
        'book': 'book',
        'version': 'version',
        'source': 'spider_source'
    }
    for row in data:
        spider_source = int(row['spider_source'])
        image_parse = HtmlMagic(spider_source=spider_source,
                                download=True,
                                archive_image=False)
        result1 = {}
        spider_url = row['spider_url']
        result1['spider_url'] = spider_url
        question_id = re.findall('shiti/(.+).html', spider_url)
        result1['question_id'] = question_id[0]

        pattern = row['pattern']
        result1['question_type_name'] = pattern
        # Map the type name to its code; any name that matches none of the
        # known keys (and is two or more characters long) becomes '3'.
        for key, value in pattern_item.items():
            if key in pattern:
                pattern = value
        if len(pattern) >= 2:
            pattern = '3'
        result1['question_type'] = pattern

        topic = row['topic']
        topic = replace_href(topic)
        topic = remove_tags(text=topic, which_ones=('h1', 'div'))
        topic = image_parse.bewitch(html_string=topic,
                                    spider_url=spider_url,
                                    spider_source=spider_source)
        result1['question_body'] = topic

        answer = replace_href(row['answer'])
        answer = image_parse.bewitch(html_string=answer,
                                     spider_url=spider_url,
                                     spider_source=spider_source)
        result1['answer'] = answer

        analy = replace_href(row['analy'])
        analy = image_parse.bewitch(html_string=analy,
                                    spider_url=spider_url,
                                    spider_source=spider_source)
        result1['analy'] = analy

        # 'paper_name' is only present when the source span is found.
        paper_names = re.findall('<span class="colf43">来源:(.+?)</span>',
                                 row['source_shijuan'])
        if paper_names:
            result1['paper_name'] = paper_names[0]

        result2 = {
            key: row.get(value, '')
            for key, value in mapping_dict.items()
        }

        # Column-mapped values win over the parsed ones on key collisions.
        jsonData.append(dict(result1, **result2))
    return jsonData
Пример #11
0
class Wln100QuestionParser(object):
    """Parser for wln100 question/answer payloads (spider source 52)."""

    def __init__(self, archive_image=False, download=False):
        self.html_magic = HtmlMagic(52,
                                    archive_image=archive_image,
                                    download=download,
                                    beautify=False)
        # Subject name (looked up inside the paper name) -> subject code.
        self.subject_item = {
            '语文': '1',
            '数学': '2',
            '英语': '3',
            '科学': '4',
            '物理': '5',
            '化学': '6',
            '地理': '7',
            '历史': '8',
            '生物': '9',
            '政治': '10'
        }
        # Question-type keyword -> question-type code.
        self.pattern_item = {'单选': '1', '填空': '2', '多选': '4', '选择': '1'}

    @staticmethod
    def _extract_exam_year(paper_name):
        """Return the 4-digit year found in *paper_name*, or 0.

        The year must be followed by a non-digit character. A missing paper
        name (empty string, ``None`` or an empty match list) yields 0.
        """
        mod = re.search(r'([12][09][0189]\d)[^\d]', paper_name or '')
        return int(mod.group(1)) if mod else 0

    def parse(self, key, qs_json, as_json, html_id):
        """Build the column dict for one question.

        Args:
            key: Spider URL of the question; stored as ``spider_url``.
            qs_json: Question HTML to parse.
            as_json: Decoded answer payload; the answer record is read from
                ``as_json['data'][1][0][0]``.
            html_id: Archive id stored verbatim under ``html_id``.

        Returns:
            A dict of column values. ``knowledge_point*``,
            ``paper_name_abbr``/``subject`` and ``question_type*`` are only
            present when the corresponding markup is found.

        Raises:
            IndexError: If the question body element is missing.
        """
        cols = dict()

        html = self.html_magic.bewitch(qs_json, spider_url=key)
        html = fix_any(html)
        html = center_image(html)

        # --- knowledge points ---------------------------------------------
        knowledge_point = re.findall(
            '<div class="answer-context f-roman">(.+?)</div>', html, re.S)
        if knowledge_point:
            knowledge_point_jsons = []
            knowledge_points = ''
            for entry in knowledge_point[0].split('<br/>'):
                node_path = remove_tags(entry).split(' >> ')
                knowledge_points += node_path[-1] + ';'
                knowledge_point_jsons.append(node_path)
            # NOTE(review): [:-2] drops two trailing characters although only
            # one ';' is appended per entry - presumably the markup ends with
            # an empty '<br/>' entry; confirm against real data.
            cols['knowledge_point'] = knowledge_points[:-2]
            cols['knowledge_point_json'] = json.dumps(knowledge_point_jsons,
                                                      ensure_ascii=False)

        # --- paper name / subject -----------------------------------------
        paper_matches = re.findall('id="docname">(.+?)</span>', html)
        # Keep paper_name a string even when nothing matched; it used to stay
        # an empty list and crash the exam-year search below.
        paper_name = paper_matches[0] if paper_matches else ''
        if paper_matches:
            cols['paper_name_abbr'] = paper_name
            subject = 0
            for key1, value1 in self.subject_item.items():
                if key1 in paper_name:
                    subject = value1
            cols['subject'] = subject

        # --- question type ------------------------------------------------
        question_type_str = re.findall(
            '<p class="left">(.+?)</p><p class="right">', html)
        if question_type_str:
            question_type_str = question_type_str[0]
            cols['question_type_str'] = question_type_str
            for keys, values in self.pattern_item.items():
                if keys in question_type_str:
                    question_type_str = values
            # Anything that was not mapped to a one-character code is '3'.
            if len(question_type_str) >= 2:
                question_type_str = '3'
            cols['question_type'] = question_type_str

        # --- question body ------------------------------------------------
        question_html = re.findall(
            '<div class="test-item-body TD-body f-roman">(.+?)</div>', html,
            re.S)
        question_html = question_html[0].strip()

        # --- difficulty: 20 per matched star --------------------------------
        diff = re.findall('class="staryellow">(.+?)<a', html)
        cols['difficulty'] = len(diff) * 20

        # --- exam year ----------------------------------------------------
        cols['exam_year'] = self._extract_exam_year(paper_name)

        # --- answer / analysis --------------------------------------------
        as_js = as_json['data'][1][0][0]
        answer_all_html = self.html_magic.bewitch((as_js.get('answer') or ''),
                                                  spider_url=key)
        answer_all_html = fix_any(answer_all_html)
        answer_all_html = center_image(answer_all_html)

        fenxi = self.html_magic.bewitch((as_js.get('analytic') or ''),
                                        spider_url=key)
        fenxi = fix_any(fenxi)
        fenxi = center_image(fenxi)

        # --- normalization ------------------------------------------------
        _question = Question(
            question_body=question_html,
            answer=answer_all_html,
            analy=fenxi,
        )
        standard_question = _question.normialize()
        cols['question_html'] = standard_question['question_body']
        cols['answer_all_html'] = standard_question['answer']
        cols['fenxi'] = standard_question['analy']

        # --- remark -------------------------------------------------------
        other_info = (as_js.get('remark') or '')
        other_info = self.html_magic.bewitch(other_info, spider_url=key)
        other_info = fix_any(other_info)
        cols['other_info'] = center_image(other_info)

        # --- constants ----------------------------------------------------
        cols['spider_url'] = key
        cols['exam_city'] = ''
        cols['paper_url'] = ''
        cols['zhuanti'] = ''
        cols['option_html'] = ''
        cols['jieda'] = ''
        cols['dianping'] = ''
        cols['spider_source'] = 52
        cols['question_quality'] = 0
        cols['html_id'] = html_id

        return cols
Пример #12
0
def _flatten_answer(answer):
    """Collapse a decoded ``answer`` value into a single string.

    Returns ``''`` for any empty sized value, the value itself for a
    non-empty string, and for a non-empty list the concatenation of its
    string entries, each followed by one space (a nested list contributes
    its first element; entries that are not strings are skipped).  Returns
    ``None`` when the value has no usable form, in which case the caller
    leaves the ``answer`` key out of the result.
    """
    try:
        if len(answer) == 0:
            return ''
    except TypeError:
        # Unsized values (None, numbers, ...) cannot be flattened.
        return None
    if isinstance(answer, str):
        return answer
    if not isinstance(answer, list):
        return None
    pieces = []
    for item in answer:
        if isinstance(item, list):
            # Only the first element of a nested list is kept.
            item = item[0] if item else None
        if isinstance(item, str):
            # The trailing space after every entry is part of the output.
            pieces.append(item + ' ')
    return ''.join(pieces)


def tableToJson(table):
    """Read a slice of *table* and convert each row into a question dict.

    Connection settings are loaded from ``CONFIG_FILE``; the query skips the
    first 100000 rows of *table* in the ``html_archive`` database and reads
    up to the next 100000.  For every row the ``html`` column is decoded,
    passed through ``HtmlMagic.bewitch`` and flattened into a dict.

    Args:
        table: Name of the table to read.  It is interpolated into the SQL
            text, so it must come from a trusted source.

    Returns:
        A list with one dict per row.  Each dict always carries
        ``question_id``, ``spider_sorce``, ``spider_url``, ``subject``,
        ``question_type``, ``difficulty``, ``question_body``, ``comment``
        and ``analy``; ``option_lst``, ``answer``, ``sub_question_lst`` and
        ``flag`` are present only when the row provides them.  A row whose
        ``html`` cannot be decoded yields empty strings for the four
        html-derived fields instead of inheriting the previous row's data.
    """
    import logging

    logger = logging.getLogger(__name__)

    with open(CONFIG_FILE) as config_file:
        config = json.load(config_file)

    conn = pymysql.connect(host=config['host'], user=config['user'],
                           passwd=config['password'], db='html_archive',
                           port=3306, charset="utf8", use_unicode=True,
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        try:
            # NOTE(security): an identifier cannot be bound as a query
            # parameter, so the table name is formatted into the statement.
            # Never pass an untrusted value as `table`.
            sql = 'select * from %s limit 100000,100000' % table
            cur.execute(sql)
            data = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    # The stored payload contains JSON-style literals such as "aorder":false;
    # without these names eval() fails with "name 'false' is not defined".
    literal_names = {'false': False, 'true': True, 'null': None}

    # Output field -> key inside the decoded html payload.
    mapping_dict = {
        'difficulty': 'difficulty',
        'question_body': 'prompt',
        'comment': 'comment',
        'analy': 'parse'
    }

    # One instance is reused for every row, as the parsers in this file do.
    image_parse = HtmlMagic(75, download=True, archive_image=False)

    jsonData = []
    for row in data:
        result = {
            'question_id': row['source_id'],
            # NOTE(review): key is misspelled ("sorce"); kept unchanged
            # because consumers of this output may already read it.
            'spider_sorce': 75,
            'spider_url': row['key2'],
            'subject': row['subject'],
            'question_type': row['question_type'],
        }

        # Reset per row so that a failed decode can never reuse the payload
        # of the previous row (or raise NameError on the very first row).
        html_content = {}
        try:
            if isinstance(row['html'], str):
                # NOTE(security): eval() executes whatever the column holds.
                # It is kept because the column stores Python reprs (bytes
                # literals / dicts) mixed with JSON literals, but it is only
                # safe while the archive contents are trusted.
                decoded = eval(remove_biaoqian(row['html']),
                               globals(), literal_names)
                if isinstance(decoded, bytes):
                    text = decoded.decode()
                elif isinstance(decoded, dict):
                    text = str(decoded)
                else:
                    text = None
                if text is not None:
                    text = image_parse.bewitch(html_string=text,
                                               spider_url=row['key2'],
                                               spider_source='75')
                    html_content = eval(text, globals(), literal_names)
        except Exception as exc:
            # Best effort: a broken row is reported and exported with empty
            # html-derived fields rather than aborting the whole export.
            logger.warning('tableToJson: cannot decode html of row %r: %s',
                           row['source_id'], exc)
        if not isinstance(html_content, dict):
            html_content = {}

        if 'options' in html_content:
            options = html_content['options']
            if not options:
                result['option_lst'] = []
            elif isinstance(options, dict):
                result['option_lst'] = [
                    {'value': label, 'content': content}
                    for label, content in options.items()
                ]

        if 'answer' in html_content:
            answer = _flatten_answer(html_content['answer'])
            if answer is not None:
                result['answer'] = answer

        sub_items = html_content.get('items')
        if sub_items:
            try:
                result['sub_question_lst'] = [
                    parse_sub_question_lst(item) for item in sub_items
                ]
            except Exception as exc:
                # parse_sub_question_lst is opaque here; keep the row and
                # drop only its sub-questions when it fails.
                logger.warning(
                    'tableToJson: cannot parse sub-questions of row %r: %s',
                    row['source_id'], exc)

        if 'flag' in row:
            result['flag'] = row['flag']

        # The html-derived fields go last, after the per-row metadata.
        result.update(
            (field, html_content.get(source_key, ''))
            for field, source_key in mapping_dict.items()
        )
        jsonData.append(result)
    return jsonData
Пример #13
0
class Wln100QuestionParser(object):
    """Turn the question/answer JSON of spider source 52 into a column dict."""

    def __init__(self, archive_image=False, download=False):
        self.html_magic = HtmlMagic(52, beautify=False,
                                    archive_image=archive_image,
                                    download=download)

    def _clean(self, html, key):
        """Apply bewitch, fix_any and center_image to *html*, in that order."""
        html = self.html_magic.bewitch(html, spider_url=key)
        return center_image(fix_any(html))

    def parse(self, key, qs_json, as_json, aft_subj_id):
        """Build the column dict for one question.

        Args:
            key: Stored as ``spider_url`` and handed to ``bewitch``.
            qs_json: Question payload; ``test`` is required, ``diff``,
                ``docname`` and ``typesname`` are optional.
            as_json: Answer payload; the record is read from
                ``as_json['data'][1][0][0]``.
            aft_subj_id: Stored as ``subject``.

        Returns:
            dict of column name -> value.
        """
        cols = {'question_html': self._clean(qs_json['test'], key)}

        # Difficulty is the inverse of `diff` on a 0-100 scale; a missing or
        # zero `diff` maps to 0.
        raw_diff = qs_json.get('diff')
        cols['difficulty'] = (100 - int(raw_diff * 100)) if raw_diff else 0

        paper_name = qs_json.get('docname') or ''
        cols['paper_name'] = paper_name

        # The year must be followed by a non-digit character to count.
        year_match = re.search(r'([12][09][0189]\d)[^\d]', paper_name)
        cols['exam_year'] = int(year_match.group(1)) if year_match else 0

        cols['question_type_str'] = qs_json.get('typesname') or ''

        detail = as_json['data'][1][0][0]
        cols['answer_all_html'] = self._clean(detail.get('answer') or '', key)
        cols['fenxi'] = self._clean(detail.get('analytic') or '', key)

        # Each '<br>'-separated line is a ' >> ' path; the last segment of
        # every path is the knowledge point itself.
        kp_markup = remove_tag('<span', detail.get('kllist') or '', all=True)
        kp_paths = [line.split(' >> ') for line in kp_markup.split('<br>')]
        knowledge_point = ';'.join([path[-1] for path in kp_paths])
        cols['knowledge_point'] = knowledge_point
        cols['knowledge_point_json'] = json.dumps(kp_paths,
                                                  ensure_ascii=False)

        cols['other_info'] = self._clean(detail.get('remark') or '', key)

        # Remaining columns are constants or plain pass-through values.
        cols.update({
            'spider_url': key,
            'subject': aft_subj_id,
            'exam_city': '',
            'paper_url': '',
            'zhuanti': '',
            'option_html': '',
            'jieda': '',
            'dianping': '',
            'spider_source': 52,
            'question_type': 0,
            'question_quality': 0,
        })
        return cols
Пример #14
0
class VkoParser(object):
    """Turn one question JSON of spider source 74 into a column dict."""

    def __init__(self, archive_image=False, download=False):
        # HtmlMagic handles the <img> formatting.
        magic_options = dict(archive_image=archive_image,
                             download=download,
                             beautify=False)
        self.html_magic = HtmlMagic(74, **magic_options)  # XXX, spider_source

    def parse(self, js, url):
        """Return the column dict for question *js* fetched from *url*.

        ``self._paper`` and ``self._year`` are reset here and filled in by
        ``get_question_html`` when the question body carries a paper name.
        """
        self.url = url
        self._paper = ''
        self._year = 0

        # The question must be processed first: it sets _paper and _year.
        question_html = self.get_question_html(js)
        answer_all_html = self.get_answer_all_html(js)
        jieda = self.get_jieda(js)

        return {
            'question_html': question_html,
            'answer_all_html': answer_all_html,
            'jieda': jieda,
            'option_html': '',
            'fenxi': '',
            'dianping': '',
            'paper_name': self._paper,
            'difficulty': 0,
            'zhuanti': '',
            'spider_url': url,
            'subject': 0,
            'spider_source': 74,
            'question_type': 0,
            'question_quality': 0,
            'knowledge_point': '',
            'exam_year': self._year,
            'exam_city': '',
        }

    def get_question_html(self, js):
        """Format ``js['content']`` and pull the paper name/year out of it."""
        html_string = self.format_html(js['content'])

        paper_match = re_paper.search(html_string)
        if not paper_match:
            return html_string

        self._paper = paper_match.group(1)
        html_string = re_paper.sub('<p>', html_string)

        year_match = re_year.search(self._paper)
        if year_match:
            self._year = int(year_match.group(1))
        return html_string

    def get_answer_all_html(self, js):
        """Format the ``answer`` field; a missing or empty one becomes ''."""
        return self.format_html(js.get('answer') or '')

    def get_jieda(self, js):
        """Join the ``content`` of every ``examsResolve`` entry with <br>.

        Returns '' when ``examsResolve`` is missing or empty.
        """
        resolves = js.get('examsResolve')
        if not resolves:
            return ''

        joined = '<br>'.join([entry['content'] for entry in resolves])
        return self.format_html(joined)

    def format_html(self, html_string):
        """Apply fix_any, center_image and bewitch, in that order."""
        cleaned = center_image(self.fix_any(html_string))
        return self.html_magic.bewitch(cleaned, spider_url=self.url)

    def fix_any(self, html_string):
        """Normalise the markup of *html_string* before image handling."""
        html_string = html_string.replace('\n', '')
        html_string = re_p_tag.sub('<p>', html_string)
        html_string = handle_spans(html_string)
        html_string = remove_tag('<span', html_string)

        # Literal clean-ups, applied in this exact order.
        literal_fixes = (
            ('<p></p>', ''),
            ('<p><br></p>', ''),
            ('<div><br></div>', ''),
            ('<o:p></o:p>', ''),
            ('</p><br>', '</p>'),
        )
        for old, new in literal_fixes:
            html_string = html_string.replace(old, new)

        blank = '&nbsp;' * 6
        html_string = re_nbsp.sub(blank, html_string)
        html_string = re_underline.sub(UNDERLINE.format(blank), html_string)
        return html_string.replace('<sspan', '<span')
Пример #15
0
class Zuoye17QuestionParser(object):
    """Turn one question JSON of spider source 53 into a column dict."""

    def __init__(self, archive_image=False, download=False):
        self.html_magic = HtmlMagic(53, beautify=False,
                                    archive_image=archive_image,
                                    download=download)

    def _clean(self, html, url):
        """Apply fix_any, bewitch and center_image to *html*, in that order."""
        html = fix_any(html)
        html = self.html_magic.bewitch(html, spider_url=url)
        return center_image(html)

    def parse(self, url, js, aft_subj_id):
        """Build the column dict for question *js* fetched from *url*.

        Every html field is stored twice: the ``*_origin`` column always
        holds the cleaned markup, while the plain column is left empty when
        that markup contains ``afanti-latex``.

        Args:
            url: Stored as ``spider_url`` and handed to ``bewitch``.
            js: Question payload.
            aft_subj_id: Stored as ``subject``.

        Returns:
            dict of column name -> value.
        """
        cols = {}

        def store(origin_key, plain_key, html):
            cols[origin_key] = html
            cols[plain_key] = '' if 'afanti-latex' in html else html

        # Check whether this is a multi-question item (e.g. a cloze test).
        extract = get_multi_question if is_multi_qs(js) else get_question
        question_html, option_html = extract(js)

        store('question_html_origin', 'question_html',
              self._clean(question_html, url))

        # Empty option markup is stored as-is, without cleaning.
        if option_html:
            option_html = self._clean(option_html, url)
        store('option_html_origin', 'option_html', option_html)

        answer_all_html, fenxi = get_answers(js)
        store('answer_all_html_origin', 'answer_all_html',
              self._clean(answer_all_html, url))
        store('fenxi_origin', 'fenxi', self._clean(fenxi, url))

        cols['difficulty'] = js['difficulty_int'] or 0
        cols['question_type_name'] = get_question_type_name(js)

        # Remaining columns are constants or plain pass-through values.
        cols.update({
            'knowledge_point': '',
            'jieda_origin': '',
            'jieda': '',
            'exam_year': 0,
            'exam_city': '',
            'spider_url': url,
            'subject': aft_subj_id,
            'zhuanti': '',
            'dianping': '',
            'spider_source': 53,
            'question_type': 0,
            'question_quality': 0,
        })
        return cols