Example No. 1
def __init__(self, pageid):
    self.pageid = pageid
    self.newLine_nonBreak_pattern = re.compile(self.newLine_nonBreak_regex)
    self.es_ops = ElasticSearchOperate()
    wiki_data = self.es_ops.get_wiki_article(pageid)
    if wiki_data is not None and __wiki_raw__ in wiki_data:
        self.html_data = wiki_data[__wiki_raw__]
        parser = etree.XMLParser(ns_clean=True, remove_comments=True)
        self.html_tree = etree.fromstring(self.html_data, parser)
Example No. 2
def __init__(self, html_data, isFile):
    self.es_ops = ElasticSearchOperate()
    self.html_data = html_data
    self.newLine_nonBreak_pattern = re.compile(self.newLine_nonBreak_regex)
    parser = etree.XMLParser(ns_clean=True, remove_comments=True)
    if isFile:
        self.html_tree = etree.parse(self.html_data, parser)
    else:
        self.html_tree = etree.fromstring(self.html_data, parser)
Example No. 3
def __init__(self, html_data, is_file):
    self.es_ops = ElasticSearchOperate()
    self.html_data = html_data
    self.new_line_non_break_pattern = re.compile(
        self.new_line_non_break_regex)
    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    if is_file:
        self.html_tree = etree.parse(self.html_data, parser)
    else:
        self.html_tree = etree.fromstring(self.html_data, parser)
Example No. 4
class TestWikiScraper(TestCase):

    es_ops = None

    def __init__(self, *args, **kwargs):
        super(TestWikiScraper, self).__init__(*args, **kwargs)
        self.es_ops = ElasticSearchOperate()

    @classmethod
    def test_query_wiki_pages(cls):
        query_set = [
            "Alan Turing", "Harry Potter and the Deathly Hallows", "Tiger",
            "Melbourne"
        ]
        for query in query_set:
            wikiq = WikiQuery(query)
            page_list = wikiq.fetch_wiki_pages()
            assert len(page_list) == wikiq.wiki_max_results

    def test_fetch_wiki_pages(self):
        query_set = [736]
        wikif = WikiFetch(query_set)
        wikif.parse_wiki_page()

        for pageid in query_set:
            wiki_data = self.es_ops.get_wiki_article(pageid)
            assert wiki_data is not None
            assert wiki_data[__wiki_title__]
            assert wiki_data[__wiki_raw__]
            assert wiki_data[__wiki_revision__]

    def test_parse_wiki_pages(self):
        query_set = [736]

        # raw string so \b and \1 stay regex escapes instead of control characters
        html_tag_expr = r'<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>(.*?)</\1>'

        for page in query_set:
            with self.subTest(page):
                xpe = XPathExtractor(page)
                xpe.strip_tag()
                xpe.strip_headings()
                img = xpe.img_extract()
                info = xpe.extract_info()
                table = xpe.extract_tables()
                text = xpe.extract_text()
                self.assertNotRegex(text, html_tag_expr)
Example No. 5
class WikiFetch:

    base_url = 'https://en.wikipedia.org/w/api.php'
    # noinspection PyDictCreation
    wiki_query_payload = {'action': 'parse', 'format': 'json'}
    wiki_query_payload['prop'] = 'text|links|images|externallinks|sections|revid|displaytitle|iwlinks'
    es_ops = None
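    # NOTE: page_list and wiki_text below are class-level lists; page_list is
    # rebound per instance in __init__, but wiki_text is appended to in
    # parse_wiki_page(), so fetched pages accumulate across WikiFetch instances.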
    page_list = []
    wiki_text = []

    def __init__(self, page_list):
        self.page_list = page_list
        self.es_ops = ElasticSearchOperate()

    def parse_wiki_page(self):

        for page in self.page_list:

            self.wiki_query_payload['pageid'] = page

            wiki_query_req = requests.get(self.base_url, params=self.wiki_query_payload)
            wiki_query_response = wiki_query_req.json()
            wiki_revid = wiki_query_response.get('parse').get('revid')
            wiki_title = wiki_query_response.get('parse').get('title')
            wiki_html_text = wiki_query_response.get('parse').get('text').get('*')

            res = self.es_ops.insert_wiki_article(page, wiki_revid, wiki_title, wiki_html_text)
            if res:
                logger.info("Wiki article {0} inserted.".format(page))
            else:
                logger.error("Wiki article insertion failed")

            self.wiki_text.append(wiki_html_text)
        return self.wiki_text

    @staticmethod
    def save_html(content, page):
        parser = etree.XMLParser(ns_clean=True, remove_comments=True)
        html_tree = etree.fromstring(content, parser)
        html_str = etree.tostring(html_tree, pretty_print=True)
        with open(OUTPUT_DIR + '/wiki_content_'+str(page)+'.html', 'wb') as fp:
            fp.write(html_str)
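
A minimal usage sketch for WikiFetch, assuming a running Elasticsearch instance reachable through ElasticSearchOperate; page id 736 is reused from the test suite above:

fetcher = WikiFetch([736])             # page ids to fetch from the MediaWiki API
raw_pages = fetcher.parse_wiki_page()  # fetch, index into Elasticsearch, return raw HTML
for page_id, raw_html in zip(fetcher.page_list, raw_pages):
    WikiFetch.save_html(raw_html, page_id)  # optionally dump pretty-printed HTML to OUTPUT_DIR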
Example No. 6
def __init__(self, *args, **kwargs):
    super(TestWikiScraper, self).__init__(*args, **kwargs)
    self.es_ops = ElasticSearchOperate()
Example No. 7
class XPathExtractor:

    regex_expressions = "http://exslt.org/regular-expressions"
    non_break_space = u'\xa0'
    new_line_non_break_regex = r'(\n+)|(\xa0)'

    toc_pattern = '''//*[@id="toc"]'''
    non_searchable_pattern = '''/html/body/div/div[starts-with(@class, "hatnote")]'''
    description_list_pattern = '''/html/body/div/dl'''
    references_pattern = '''/html/body/div/div[starts-with(@class, "refbegin")]'''
    references_list_pattern = '''/html/body/div/div[starts-with(@class, "reflist")]'''
    meta_data_box_pattern = '''/html/body/div/div[starts-with(@class, "metadata")]'''
    nav_boxes_pattern = '''/html/body/div/div[@class="navbox"]'''
    vertical_nav_boxes_pattern = '''/html/body/div/table[starts-with(@class, "vertical-navbox")]'''
    no_print_metadata_pattern = '''/html/body/div/div[starts-with(@class, "noprint")]'''
    subscript_pattern = '''//sup[@class="reference"]'''
    edit_pattern = '''//span[@class="mw-editsection"]'''
    meta_data_table = '''/html/body/div/table[contains(@class, "metadata")]'''

    see_also_pattern = '''//*[@id="See_also"]'''
    external_links_pattern = '''//*[@id="External_links"]'''

    img_pattern = '''/html/body/div//div[starts-with(@class, "thumb ")]'''
    img_href = '''./div//a/@href'''
    img_caption = '''.//div[@class="thumbcaption"]/text()'''

    info_box_pattern = '''/html/body/div/table[starts-with(@class, "infobox")]'''
    info_box_item = '''./tr'''
    info_key_pattern = '''./th//text()'''
    info_value_pattern = '''./td//text()'''

    table_pattern = '''/html/body/div/table[@class="wikitable"]'''
    table_row_pattern = '''./tr'''
    table_key_pattern = '''./th'''
    table_value_pattern = '''./td'''
    all_text_pattern = '''.//text()'''

    irrelevant_headlines = [
        '''//*[@id="See_also"]''', '''//*[@id="Notes_and_references"]''',
        '''//*[@id="Explanatory_notes"]''', '''//*[@id="Citations"]''',
        '''//*[@id="Further_reading"]''', '''//*[@id="External_links"]''',
        '''//*[@id="References"]'''
    ]

    html_data = ''
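    # NOTE: extracted_img is a class-level dict that img_extract() mutates in
    # place, so extracted images are shared across XPathExtractor instances.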
    extracted_img = {}
    html_tree = None
    is_file = False
    page_id = None
    es_ops = None
    new_line_non_break_pattern = None

    def __init__(self, html_data, is_file):
        self.es_ops = ElasticSearchOperate()
        self.html_data = html_data
        self.new_line_non_break_pattern = re.compile(
            self.new_line_non_break_regex)
        parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        if is_file:
            self.html_tree = etree.parse(self.html_data, parser)
        else:
            self.html_tree = etree.fromstring(self.html_data, parser)

    # NOTE: this second __init__ replaces the two-argument constructor above;
    # only the page-id form is in effect when both are defined in the same class.
    def __init__(self, pageid):
        self.page_id = pageid
        self.new_line_non_break_pattern = re.compile(
            self.new_line_non_break_regex)
        self.es_ops = ElasticSearchOperate()
        wiki_data = self.es_ops.get_wiki_article(pageid)
        if wiki_data is not None and __wiki_raw__ in wiki_data:
            self.html_data = wiki_data[__wiki_raw__]
            parser = etree.HTMLParser(remove_blank_text=True,
                                      remove_comments=True)
            self.html_tree = etree.fromstring(self.html_data, parser)

    def strip_tag(self):

        toc_list = self.html_tree.xpath(self.toc_pattern)
        for toc in toc_list:
            toc.getparent().remove(toc)

        non_searchable_list = self.html_tree.xpath(self.non_searchable_pattern)
        for non_searchable in non_searchable_list:
            non_searchable.getparent().remove(non_searchable)

        dl_list = self.html_tree.xpath(self.description_list_pattern)
        for dl in dl_list:
            dl.getparent().remove(dl)

        ref_begin_list = self.html_tree.xpath(self.references_pattern)
        for ref_begin in ref_begin_list:
            ref_begin.getparent().remove(ref_begin)

        ref_list = self.html_tree.xpath(self.references_list_pattern)
        for ref in ref_list:
            ref.getparent().remove(ref)

        meta_data_list = self.html_tree.xpath(self.meta_data_box_pattern)
        for meta_data in meta_data_list:
            meta_data.getparent().remove(meta_data)

        meta_data_table = self.html_tree.xpath(self.meta_data_table)
        for meta_data in meta_data_table:
            meta_data.getparent().remove(meta_data)

        nav_box_list = self.html_tree.xpath(self.nav_boxes_pattern)
        for nav_box in nav_box_list:
            nav_box.getparent().remove(nav_box)

        vnav_box_list = self.html_tree.xpath(self.vertical_nav_boxes_pattern)
        for vnav_box in vnav_box_list:
            vnav_box.getparent().remove(vnav_box)

        no_print_list = self.html_tree.xpath(self.no_print_metadata_pattern)
        for no_print in no_print_list:
            no_print.getparent().remove(no_print)

        sub_ref_list = self.html_tree.xpath(self.subscript_pattern)
        for sub_ref in sub_ref_list:
            sub_ref.getparent().remove(sub_ref)

        edit_list = self.html_tree.xpath(self.edit_pattern)
        for edit in edit_list:
            edit.getparent().remove(edit)

        see_also_list = self.html_tree.xpath(self.see_also_pattern)
        for see_also in see_also_list:
            see_also_data = see_also.getparent().getnext()
            see_also_data.getparent().remove(see_also_data)

        external_link_list = self.html_tree.xpath(self.external_links_pattern)
        for external_link in external_link_list:
            external_link_data = external_link.getparent().getnext()
            external_link_data.getparent().remove(external_link_data)

    def strip_headings(self):
        for heading in self.irrelevant_headlines:
            heading_parent_list = self.html_tree.xpath(heading)
            if len(heading_parent_list) > 0:
                heading_parent = heading_parent_list[0].getparent()
                heading_parent.getparent().remove(heading_parent)

    def img_extract(self):
        img_list = self.html_tree.xpath(self.img_pattern)
        for img in img_list:
            img_url, img_caption = "", ""
            img_url_list = img.xpath(self.img_href)
            if len(img_url_list) > 0:
                img_url = str(img_url_list[0])
            img_caption_list = img.xpath(self.img_caption)
            if len(img_caption_list) > 0:
                img_caption = ''.join(img_caption_list).strip()
            img.getparent().remove(img)
            if img_url != "":
                self.extracted_img[img_url] = img_caption
        logger.debug("Extracted Images: %d", len(self.extracted_img))
        return self.extracted_img

    def extract_info(self):
        info_box = self.html_tree.xpath(self.info_box_pattern)
        wiki_info = WikiInfo()
        for info in info_box:
            info_key = info.xpath(self.info_box_item)
            info_list = []
            info_title = ""
            for ikey in info_key:
                info_key = ''.join(ikey.xpath(self.info_key_pattern)).strip(
                )  # issues with &nbsp;    // https://stackoverflow.com/a/33829869/8646414
                info_value = ''.join(ikey.xpath(
                    self.info_value_pattern)).strip()
                info_value = info_value.split('\n')
                info_value = [item.strip() for item in info_value]
                if info_key != "" and len(info_value) >= 1:
                    if info_title == "":
                        info_title = info_key
                    if info_value[0] != '':
                        info_pair = {info_key: info_value}
                        info_list.append(info_pair)
            wiki_info.add_info(info_title, info_list)
            info.getparent().remove(info)
        res = self.es_ops.update_wiki_article(self.page_id,
                                              content_info=json.dumps(
                                                  wiki_info.info_data))
        if res:
            logger.info("Inserted parsed content info for: %d", self.page_id)
        else:
            logger.error("Inserted of parsed content info failed")
        logger.debug("Extracted Bios: %d", len(wiki_info.info_data))
        return wiki_info.info_data

    def extract_tables(self):
        table_list = self.html_tree.xpath(self.table_pattern)
        wikit = WikiTable()
        for table in table_list:
            table_row_list = table.xpath(self.table_row_pattern)
            for table_row in table_row_list:
                table_head_list = table_row.xpath(self.table_key_pattern)
                for table_head in table_head_list:
                    wikit.add_header(''.join(
                        table_head.xpath(self.all_text_pattern)))
                tab_data = []
                table_data_list = table_row.xpath(self.table_value_pattern)
                for table_data in table_data_list:
                    tab_data.append(''.join(
                        table_data.xpath(self.all_text_pattern)))
                wikit.set_values(tab_data)
            table.getparent().remove(table)
        res = self.es_ops.update_wiki_article(self.page_id,
                                              content_table=json.dumps(
                                                  wikit.tab_data))
        if res:
            logger.info("Inserted parsed content table for: %d", self.page_id)
        else:
            logger.error("Inserted of parsed content table failed")
        logger.debug("Extracted Tables: %d", len(wikit.tab_data))
        return wikit.tab_data

    def extract_text(self):
        text_data = ''.join(self.html_tree.xpath(
            self.all_text_pattern)).strip()
        text_data = re.sub(self.new_line_non_break_pattern, ' ', text_data)
        res = self.es_ops.update_wiki_article(self.page_id, content=text_data)
        logger.debug("Parsed content length: %d", len(text_data))
        if res:
            logger.info("Inserted parsed content for: %d", self.page_id)
        else:
            logger.error("Inserted of parsed content failed")
        return text_data

    def save_html(self, page=0):
        html_str = etree.tostring(self.html_tree, pretty_print=True)
        with open(OUTPUT_DIR + '/wiki_content_cleaned_' + str(page) + '.html',
                  'wb') as fp:
            fp.write(html_str)
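
A minimal usage sketch for the page-id constructor of XPathExtractor, mirroring the call order in TestWikiScraper above; it assumes the article for page id 736 has already been indexed by WikiFetch:

xpe = XPathExtractor(736)      # load the stored raw HTML for the page id from Elasticsearch
xpe.strip_tag()                # drop TOC, navboxes, references and other non-content nodes
xpe.strip_headings()           # drop "See also", "References" and similar headlines
images = xpe.img_extract()     # {image URL: caption}
info = xpe.extract_info()      # infobox key/value pairs, written back to Elasticsearch
tables = xpe.extract_tables()  # wikitable rows, written back to Elasticsearch
text = xpe.extract_text()      # cleaned plain text, written back to Elasticsearch
xpe.save_html(736)             # write the stripped HTML tree to OUTPUT_DIR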
Example No. 8
def __init__(self, page_list):
    self.page_list = page_list
    self.es_ops = ElasticSearchOperate()
Example No. 9
def search_rank(query):
    es = ElasticSearchOperate()
    result_all = es.search_wiki_article(query)
    logging.debug("Ranked Wiki Pages Title: {0}".format(
        [result.get_wiki_title() for result in result_all]))
    return result_all
Example No. 10
"""
Created by felix on 25/3/18 at 5:19 PM
"""
import logging
from qas.esstore.es_operate import ElasticSearchOperate

logger = logging.getLogger(__name__)


def search_rank(query):
    es = ElasticSearchOperate()
    result_all = es.search_wiki_article(query)
    logging.debug("Ranked Wiki Pages Title: {0}".format(
        [result.get_wiki_title() for result in result_all]))
    return result_all


if __name__ == "__main__":

    logging.basicConfig(level=logging.DEBUG)

    # mquery = list([[['Cushman', 'known', 'Wakefield', 'are'], [['Cushman', 'Wakefield'], 'or'], [], []]])
    mquery = list([[['Albert', 'Einstein', 'birth'], [], [], []]])

    les = ElasticSearchOperate()
    res_all = les.search_wiki_article(mquery)
    for res in res_all:
        print(res.get_wiki_title())
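
A minimal usage sketch for search_rank, reusing the nested token-list query shape from the __main__ block above (the exact structure expected by search_wiki_article is assumed from that block):

query = [[['Albert', 'Einstein', 'birth'], [], [], []]]
for result in search_rank(query):
    print(result.get_wiki_title())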