Example No. 1
    def parse_page(self, district: str, page: int = 1) -> None:
        """To parse  through each page.

        Parsing through pages for given district. Calls itself if it's not last page. Starts with first place, unless
        another page given.

        :param district: Given district
        :param page: Page
        """
        print("Parse", self.domain, district, page)
        r = self.get(district=district, page=page)
        html = HTML(html=r.content)
        flats = html.xpath("//div[@data-name='LinkArea']")
        for flat in flats:
            self.parse_flat(flat)

        if r.status_code == 302 or not flats:
            print(r.status_code, "Failed page")
            sleep(10)
            return self.parse_page(district=district, page=page)

        page += 1
        if html.xpath(
                f'//div[@data-name="Pagination"]//ul//li//a[text()="{page}"]'
        ):  # noqa: R503, calls itself
            self.save_parsed(district, page)
            return self.parse_page(district=district, page=page)
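
The pagination check above simply asks whether a link carrying the next page number exists. A standalone sketch of just that check, with invented markup (not taken from the original site):

from requests_html import HTML

# Invented pagination markup, only to illustrate the next-page lookup used above.
doc = HTML(html='<div data-name="Pagination"><ul><li><a>1</a></li><li><a>2</a></li></ul></div>')
next_page = 2
has_next = bool(doc.xpath(f'//div[@data-name="Pagination"]//ul//li//a[text()="{next_page}"]'))
print(has_next)  # True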
Example No. 2
async def test_get_poll(http_client, choices_fixtures):
    resp = await http_client.get('/poll/1')
    assert resp.status == 200
    html = HTML(html=await resp.text())
    assert html.xpath("//label[@for='choice1']", first=True).text == 'Not much'
    assert html.xpath("//label[@for='choice2']", first=True).text == 'The sky'
    assert html.xpath("//label[@for='choice3']", first=True).text == 'Just hacking again'
Example No. 3
def html_parsing_chromium(fp=r'utils/commands.py'):
    p = """# Licensed to the White Turing under one or more
    # contributor license agreements.  See the NOTICE file
    # distributed with this work for additional information
    # regarding copyright ownership.  The SFC licenses this file
    # to you under the Apache License, Version 2.0 (the
    # "License"); you may not use this file except in compliance
    # with the License.  You may obtain a copy of the License at
    #
    #   http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing,
    # software distributed under the License is distributed on an
    # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    # KIND, either express or implied.  See the License for the
    # specific language governing permissions and limitations
    # under the License.

    '''List of Chromium Command Line Switches.'''


    class Chromium(object):
        '''Frequently used commands mappings.
        
        There are lots of command lines which can be used with the Google Chrome browser.
        Some change behavior of features, others are for debugging or experimenting.
        This page lists the available switches including their conditions and descriptions.
        Last update occurred on 2018-06-08 from `https://peter.sh/experiments/chromium-command-line-switches/`.
        '''
    """

    remove(fp)
    print(p, file=open(fp, 'a', encoding='utf-8'))

    with open(
            r'html/List of Chromium Command Line Switches « Peter Beverloo.html',
            encoding='utf-8') as f:
        doc = f.read()

    html = HTML(html=doc)
    condition = html.xpath('//tr/@id')
    explanation = html.xpath('//tr/td[2]/text()')

    for i, j in zip(condition, explanation):
        k = i.split('   ')[0].replace("'", '').replace('-', '_').replace('.', '_').upper()
        j = j.replace('\n', '')
        if len(k) < 1 or not k[0].isalpha():
            continue
        print(f'    {k} = {i.strip()!r}   # {j.strip()}',
              file=open(fp, 'a', encoding='utf-8'))
Example No. 4
    def parse_edb_cve(self, url, item, html):
        edb_html = HTML(html=html)

        raw_id, edb_title, edb_author, edb_type, edb_platform, edb_rport, edb_published = item

        edb_id = "EDB-{}".format(raw_id)
        edb_url = url
        edb_verified = get_val(edb_html.xpath(element_xpath['edb_verified']))

        edb_cve = 'N/A'  # default so the record below never sees an unbound name
        try:
            edb_cve_num = [
                i.strip() for i in edb_html.xpath(element_xpath['edb_cve'])
            ]
            if edb_cve_num:
                maped_edb_cve = [
                    "CVE-{}".format(cve_id) for cve_id in edb_cve_num
                ]
                edb_cve = ','.join(maped_edb_cve)
                tqdm.write("Detected {} <--> {}".format(edb_id, edb_cve))
        except Exception:
            edb_cve = 'N/A'

        if 'mdi-close' in edb_verified:
            edb_verified = 'Unverified'
        else:
            edb_verified = 'Verified'

        edb_exploit_raw_url = 'https://www.exploit-db.com/raw/{}'.format(
            raw_id)
        edb_vulnerable_app_url = get_val(
            edb_html.xpath(element_xpath['edb_vulnerable_app_url']))

        if edb_vulnerable_app_url != "":
            edb_vulnerable_app_url = 'https://www.exploit-db.com' + edb_vulnerable_app_url

        edb_collect_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

        exploit_record = EdbRecord(
            edb_id=edb_id,
            edb_title=edb_title,
            edb_url=edb_url,
            edb_author=edb_author,
            edb_cve=edb_cve,
            edb_type=edb_type,
            edb_platform=edb_platform,
            edb_remote_ports=edb_rport,
            edb_verified=edb_verified,
            edb_vulnerable_app_url=edb_vulnerable_app_url,
            edb_exploit_raw_url=edb_exploit_raw_url,
            edb_published=edb_published,
            edb_collect_date=edb_collect_date)
        self.insert_record(exploit_record)
Example No. 5
def html_parsing_ios(fp=r'utils/ios.json'):
    import json

    with open(r'html/iOS version history - Wikipedia.htm',
              encoding='utf-8') as f:
        doc = f.read()

    html = HTML(html=doc)
    nv = html.xpath('//tr[@valign="top"]/th[not(@colspan)]/text()[1]')[64:-1]
    cv = html.xpath('//tr[@valign="top"]/td[1]/text()[1]')[64:-1]

    nv = map(lambda x: x.strip(), nv)
    cv = map(lambda x: x.strip().split('/')[-1], cv)

    json.dump(dict(zip(nv, cv)), fp=open(fp, 'w'))
Example No. 6
    def extract(html, number):
        """
        从搜索页面提取标题和 url, 标题中含有番号则返回 url
        估计这个用xpath提取标题很容易失效
        Args:
            html:
            number:

        Returns:

        """
        html = HTML(html=html)
        link_content = html.xpath("//a")
        # maintain the list of title xpaths directly
        title_xpath = ["//h3/div/text()", "//h3/span/text()"]
        for content in link_content:
            for xpath in title_xpath:
                title = content.xpath(xpath, first=True)
                if not title:
                    continue
                if re.search(
                        "".join(filter(str.isalnum, number)),
                        "".join(filter(str.isalnum, title)),
                        flags=re.I,
                ):
                    link = content.xpath("//@href", first=True)
                    if link:
                        return link
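
The match above compares only the alphanumeric characters of the code number and the title, case-insensitively. A small illustration with invented values:

import re

# Invented values, only to illustrate the normalised comparison used above.
number = "ABC-123"
title = "[Subs] abc123 1080p"
needle = "".join(filter(str.isalnum, number))    # 'ABC123'
haystack = "".join(filter(str.isalnum, title))   # 'Subsabc1231080p'
print(bool(re.search(needle, haystack, flags=re.I)))  # True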
Example No. 7
def worker(domain):
    while True:

        url = LINKS_QUEUE.get()
        SCANNED_LINKS.add(url)

        try:
            with webdriver.Chrome(executable_path='./chromedriver') as browser:
                browser.get(url)
                html_code = browser.page_source

        except Exception as e:
            print(e, type(e))
            continue

        html = HTML(html=html_code)

        try:
            page_title = html.xpath('//title')[0].text
        except IndexError:
            page_title = 'Not Found'

        try:
            page_h1 = html.xpath('//h1')[0].text
        except IndexError:
            page_h1 = 'Not Found'

        Page.create(url=url, title=page_title, h1=page_h1)
        print('[OK]', url)

        for link in html.absolute_links:
            link = link.split('#')[0]
            if domain not in link:
                continue
            if link in SCANNED_LINKS:
                continue
            if any(part in link for part in BAD_PARTS):
                continue

            LINKS_QUEUE.put(link)
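
The crawl frontier above is fed from html.absolute_links, which resolves relative hrefs against the page URL, while html.links keeps them as written. A small illustration with an invented page:

from requests_html import HTML

# Invented page, only to illustrate links vs. absolute_links.
page = HTML(url="https://example.org/section/",
            html='<a href="/about">about</a> <a href="page2">next</a>')
print(page.links)           # a set such as {'/about', 'page2'}
print(page.absolute_links)  # {'https://example.org/about', 'https://example.org/section/page2'}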
Example No. 8
    def find_raw_file_link(self, file_page: HTML):
        raw_link_element = file_page.xpath(self.raw_link_xpath, first=True)
        raw_url = raw_link_element.attrs.get("href")

        if raw_url is None:
            raise ContinueException(
                f"Failed to find the raw file link in the attrs: "
                f"{raw_link_element.attrs}."
            )

        full_raw_url = self.domain_url + raw_url

        return full_raw_url
Example No. 9
def crawler(link):
    driver.get(link)
    base_url = 'https://www.coursera.org'
    r = HTML(html=driver.page_source)
    course_name_xpath = '//div[@class="rc-Welcome"]/div/div//h1/text()'
    course_name = r.xpath(course_name_xpath)[0]
    weeks_hrefs = r.xpath(
        '//*[@id="rendered-content"]/div/div/div/div[2]/nav/div[1]/div/a/@href'
    )
    weeks_hrefs = [base_url + url for url in weeks_hrefs]
    table_of_content = {}
    for week_href in weeks_hrefs:
        driver.get(week_href)
        time.sleep(7)
        r = HTML(html=driver.page_source)
        heading_elements = r.xpath('//div[@class="rc-NamedItemList"]')
        for heading_element in heading_elements:
            heading_text = heading_element.xpath('//h3/text()')[0]
            elements = heading_element.xpath(
                '//div[@class="rc-WeekItemName headline-1-text"]/text()')
            table_of_content[heading_text] = elements
    return course_name, table_of_content
Example No. 10
def parser(page_source):
    r = HTML(html=page_source)
    data = []
    posts = r.xpath('//div[contains(@class, "userContent")]')
    for post in posts:
        try:
            content = post.xpath('//div[contains(@data-testid, "post_message")]')[0]
        except IndexError:
            continue
        text = content.text
        text = cleanup_text(text)
        attached_file = get_attached_file(post)
        data.append((text, attached_file))
    return data
Example No. 11
    def step_5(self):
        script = 'window.scrollTo(0,document.body.scrollHeight)'
        self._browser.execute_script(script)  # scroll to the bottom of the page
        src = self._browser.page_source  # check whether a "load more" button is still there
        html = HTML(html=src)
        buttons = html.xpath('//*[@id="app"]/div/div[2]/div/div[2]/button')
        if len(buttons) == 0:  # no button left, so everything has been loaded
            raise DownloadOver('[step_5] download is over')
        else:
            select = '//*[@id="app"]/div/div[2]/div/div[2]/button'
            try:
                more_button = self._browser.find_element_by_xpath(select)
                more_button.click()
                self.Log('[step_5] <more button> clicked, waiting for more pics to load......')
                time.sleep(10)
            except EX.NoSuchElementException:
                raise DownloadInterrupt('[step_5] [error] can not find <more button>')
Example No. 12
class ECNU_WEBGRAB:
    def __init__(self, uri):
        self.uri = uri
        self.root = None

    def get_fz_data(self):
        html = query_fz_from_ecnu(self.uri)
        self.root = HTML(html=html)

        return self.__get_data()

    def __get_data(self):
        #property_div = self.root.xpath(r'//*[@id="directs"]')[0]
        prefix_uri = self.root.xpath(r'//*[@id="directs"]/label/a/@href')
        prefix = cleanup(self.root.xpath(r'//*[@id="directs"]/label/a/text()'))
        name = self.root.xpath(r'//*[@id="directs"]/label/a/span/text()')

        properties = makeprops(prefix, name)
        #print(properties)

        values = [
            v.strip()
            for v in self.root.xpath(r'//*[@id="directs"]/div/*/*/text()')
        ]
        #print(values)

        fz_brief = {k: v for k, v in zip(properties, values)}
        return self.__unfold_fz_data(fz_brief)

    def __unfold_fz_data(self, brief):
        need2unfold = {k: v for (k, v) in brief.items() if v.startswith(r'_:')}
        #print(need2unfold)

        prefix = cleanup(self.root.xpath(r'//*[@id="bnodes"]/label/a/text()'))
        name = self.root.xpath(r'//*[@id="bnodes"]/label/a/span/text()')

        properties = makeprops(prefix, name)
        #print(properties)

        values = self.root.xpath(
            r'//*[@id="bnodes"]/div[@class="c2 valuecnt"]')
        for value in values:
            print(value.absolute_links)
            print(value.text)
            print(value.links)
            print("-------------")
Example No. 13
def parse_movie(doc):
    html = HTML(html=doc)
    title = html.xpath("head/title")[0]
    title_str = title.text

    # director = html.find('#info > span:nth-child(1) > span.attrs > a')[0]
    director = html.find('#info > span  a[rel="v:directedBy"]')[0]
    director_str = director.text

    type = html.find('#info > span[property="v:genre"]')[0]
    type_str = type.text.split()[0]

    # release_date = html.find('#info > span[property="v:initialReleaseDate"]')[-1]
    # release_date_str = release_date.text

    length_in_minute = html.find('#info > span[property="v:runtime"]')[0]
    length_in_minute_str = length_in_minute.attrs['content']

    comments_str = html.text
    is3D = comments_str.find("3D") != -1

    score = html.find('#interest_sectl strong[property="v:average"]')[0]
    score_str = score.text

    # divs = html.xpath("body/div")
    # print()
    print(director_str)
    print(type_str)
    # print(release_date_str)
    print(length_in_minute_str)
    print(is3D)
    print(score_str)

    actors = html.find('#info > span.actor > span.attrs')[0].find('a')
    actor_list = [(actor.attrs['href'], actor.text.split()[0])
                  for actor in actors]
    print(actor_list)
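
parse_movie mixes the two query styles requests_html offers: .find() takes CSS selectors, .xpath() takes XPath, and both accept first=True to return a single element. A tiny illustration with invented markup:

from requests_html import HTML

# Invented snippet, only to show .find() (CSS) and .xpath() (XPath) side by side.
doc = HTML(html='<div id="info"><span property="v:genre">Drama</span></div>')
print(doc.find('#info > span[property="v:genre"]', first=True).text)  # Drama
print(doc.xpath('//span[@property="v:genre"]', first=True).text)      # Drama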
Example No. 14
 def new_instance(submit_url: str, html: HTML):
     form_definition_script = \
         html.xpath('//script[text()[contains(.,"FB_PUBLIC_LOAD_DATA_")]]', first=True)
     if not form_definition_script:
         raise FormNotFoundException()
     form_definition = \
         GoogleForm.__parse_form_definition_script(form_definition_script)
     title = form_definition[1][8]
     description = form_definition[1][0]
     file_name = form_definition[3]
     if form_definition[1][10] is not None:
         requires_login = bool(form_definition[1][10][1])
         requires_email = bool(form_definition[1][10][4])
     else:
         requires_login = False
         requires_email = False
     questions_data = form_definition[1][1]
     pages = []
     page_title = title
     page_description = description
     page_questions = []
     for question_data in questions_data:
         question = GoogleFormQuestion.new_instance(question_data)
         if question.question_type is GoogleFormQuestion.Type.PAGE_SWITCH:
             pages.append(
                 GoogleFormPage(page_title, page_description,
                                page_questions))
             page_title = question.title
             page_description = question.description
             page_questions = []
         else:
             page_questions.append(question)
     pages.append(
         GoogleFormPage(page_title, page_description, page_questions))
     return GoogleForm(submit_url, title, description, file_name,
                       requires_login, requires_email, pages)
Example No. 15
def change_content(content, xpath, url=None):
    """
    处理内容
    :param content:
    :param xpath:
    :param url:
    :return:
    """
    rule = r'src="(.*?)"'
    img_list = re.compile(rule, re.S).findall(content)
    html_body = content
    # no images, nothing to rewrite
    if img_list != []:
        for img_url in img_list:
            img_src = urljoin(url, img_url)
            html_body = html_body.replace(img_url, img_src)
        img_link = re.findall(r'<img.*?>', html_body)
        if img_link != []:
            for img in img_link:
                url = re.findall(r'src="(.*?)"', img)
                if url == []:
                    logging.error(content)
                img_str = img_str_link.format(url[0])
                html_body = html_body.replace(img, img_str)
    from requests_html import HTML
    html1 = HTML(html=html_body)
    # # print(html1.markdown)
    html_body = html1.xpath(xpath)[0].text
    return html_body


# import hashlib
#
# # message to be hashed
# str = '江苏明月光电科技有限公司'
#
# # create an md5 object
# hl = hashlib.md5()
#
# # Tips
# # encode must be declared here
# # writing hl.update(str) raises: Unicode-objects must be encoded before hashing
# hl.update(str.encode(encoding='utf-8'))
# print(hl.hexdigest())
#
#
#
#
# html="""
# <div class="bmsg job_msg inbox">
# 						岗位职责<br>1、销售管理职位,负责其功能领域内主要目标和计划;<br>2、制定、参与或协助上层执行相关的政策和制度;<br>3、负责区域的销售运作,包括计划、组织、进度控制和检讨;<br>4、分析和开发市场并搞好售后服务;<br><br>任职资格<br>1、大专以上学历;<br>2、有做营销的愿望和激情;<br>2、有销售经验或应届大学毕业生均可;<br>3、出色的市场分析洞察能力、具备全面深刻营销知识和技能;<br>4、具备一定的管理领导能力和沟通协调能力;<br>5、江苏省13个地级市驻地区域经理,各城市本地人。
# 												<div class="mt10">
# 														<p class="fp">
# 								<span class="label">职能类别:</span>
# 																	<span class="el">销售代表</span>
# 																</p>
# 																					<p class="fp">
# 								<span class="label">关键字:</span>
# 																	<span class="el">销售营销业务</span>
# 															</p>
# 													</div>
# 						<div class="share">
# 							<a track-type="jobsButtonClick" event-type="6" class="a" href="javascript:void(0);" id="fenxiang">分享</a>
# 							<div class="shareBox">
# 								<div id="weixinMa_fx" style="display:none;"><img width="198" height="198" alt="二维码" src="https://jobs.51job.com/comm/qrcode.php?url=https%3A%2F%2Fm.51job.com%2Fsearch%2Fjobdetail.php%3Fjobid%3D96516324"></div>
# 								<a class="icon_b i_weixin" href="javascript:;" onclick="weixinMa();">微信</a>
# 								<a class="icon_b i_mail" target="_blank" href="http://my.51job.com/sc/sendjob_tofriend.php?jobid=96516324&amp;coid=3511134&amp;divid=0">邮件</a>
# 							</div>
# 						</div>
# 						<div class="clear"></div>
# 					</div>
#
#
# 					"""
# from requests_html import HTML
# html1 = HTML(html=html)
# # # print(html1.markdown)
# html_body = html1.xpath('//div[@class="bmsg job_msg inbox"]')[0].text
# print(html_body)
# # html_body = change_content(html,'//section[@class="textblock"]')
# # with open('1.txt','w',encoding='utf-8') as f:
# #     f.write(html_body)
# # # print(html_body)
# # from requests_html import HTMLSession
# #
# # session = HTMLSession()
# # r = session.get('https://toutiao.hc360.com/2/29822.html')
# # print(r.html.xpath('//div[@class="textblock"]'))
# # print(r.html.decode('utf-8','ignore').find('#textblock'))
# # print(r)
# # print(r.html.xpath('//div[@class="textblock"'))
# # print(r.html.links)
# # from requests_html import session
# # from requests_html import session
# # soup = BeautifulSoup(html_body,'html.parser',from_encoding='utf-8')
# # info = soup.find('div',class_='text_box1 cl')
# # print(soup.text)
# # html_body = etree.HTML(html_body)
# # info = html_body.xpath('//div[@class="text_box1 cl"]')
# # print(info[0])
# # selector = Selector(text=html_body)
# # bloger = selector.xpath('//section [@class="textblock"]')
# # print(bloger.xpath('string(.)').extract_first())
# # bloger = selector.xpath('//div[@class="art-con article_body"]')
# # info = bloger.text
# # print(info)
# # import html2text
# # print (html2text.html2text(html))
#
#
# # print(html1.links)
# # from tomd import Tomd
# # a = Tomd(html_body).markdown
# # # pattern = '[\\\`\*\_\[\]\#\+\-\!\>]'
# # pattern = '[\\\`\*\_\[\]\#\+\-\!\>]'
# # partter1 = '&nbsp;&nbsp;&nbsp;&nbsp;'
# # content_text3 = re.sub(pattern, ' ', a)
# # content_text4 = re.sub(partter1, '  ', content_text3)
# # partter2 = '(http:.*?.com)'
# # content_text5 = re.sub(partter2, '  ', content_text4)
#
# # print(content_text5)
# # print(a)
#
# # sample_text = '''
# #     The textwrap module can be used to format text for output in
# #     situations where pretty-printing is desired.  It offers
# #     programmatic functionality similar to the paragraph wrapping
# #     or filling features found in many text editors.
# # '''
# # import textwrap
# # print(a[0].text)
# # b = textwrap.fill(a[0].text,initial_indent='',subsequent_indent=' ' * 4,)
# #
# # print(b)
Example No. 16
class ExtractionInteret(object):

    REGEX_HTML_TAGS = re.compile(
        r'<(br|basefont|hr|input|source|frame|param|area|meta|!--|col|link|option|base|img|wbr|!DOCTYPE|html|head).*?>|<(a|abbr|acronym|address|applet|article|aside|audio|b|bdi|bdo|big|blockquote|body|button|canvas|caption|center|cite|code|colgroup|command|datalist|dd|del|details|dfn|dialog|dir|div|dl|dt|em|embed|fieldset|figcaption|figure|font|footer|form|frameset|head|header|hgroup|h1|h2|h3|h4|h5|h6|html|i|iframe|ins|kbd|keygen|label|legend|li|map|mark|menu|meter|nav|noframes|noscript|object|ol|optgroup|output|p|pre|progress|q|rp|rt|ruby|s|samp|script|section|select|small|span|strike|strong|style|sub|summary|sup|table|tbody|td|textarea|tfoot|th|thead|time|title|tr|track|tt|u|ul|var|video).*?</\2>'
    )

    def __init__(self, titre, source):
        """
        :param string source:
        """
        self._titre = titre.replace('\r', '').replace('\n', '')
        self._source = source.lstrip().strip('\r')
        self._interets = {'informations': list(), 'titre': self._titre, 'hyperliens': list(), 'identifiants': list()}
        self._recyles = list()

        self._may_html = re.search(ExtractionInteret.REGEX_HTML_TAGS, self._source) is not None
        self._dom = HTML(html=source.replace('<br>', '<br>\n').replace('<br/>', '<br/>\n')) if self._may_html else None

        nb_interet_pre = len(self._interets.keys())

        if self._may_html:

            for table in self._dom.find('table'):

                if table.attrs.get('class') is not None and 'MsoNormalTable' in table.attrs.get('class'):
                    continue

                try:
                    df = pd.read_html(table.html)
                except ValueError:
                    continue

                if len(df) == 0:
                    continue

                for el in df[0].to_dict(orient='records'):
                    keys = el.keys()

                    if 0 in keys and 1 in keys:
                        possible_key = str(el[0]).lstrip().rstrip()
                        possible_value = str(el[1]).lstrip().rstrip()

                        self[possible_key] = possible_value
                        self._recyles.append(possible_value)

                    elif 1 not in keys and 0 in keys:
                        possible_line = str(el[0])
                        self._recyles.append(possible_line)

            self._interets['hyperliens'] = list(self._dom.links)

            if self._may_html is True and len(self._interets.keys()) == nb_interet_pre:
                self._source = self._dom.full_text.replace('\r', '\n')
                self._may_html = False

        self._sentences = ExtractionInteret.extract_sentences(self._source.replace('\n', '\n '))

        for line in self._source.split('\n') + [self._titre]:

            self._recyles.append(line)

            mes_associations = re.findall(r'(([^\w])|^)([a-zA-Z $\u00C0-\u017F\'_]{3,})(:|→|⟶|-->|->)(.+?(?=[\n\"><\]\[]))', line+'\n')
            mes_hyperliens = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)

            self._interets['hyperliens'] += [el.strip('<>') for el in mes_hyperliens if el.strip('<>') not in self._interets['hyperliens']]

            for association in mes_associations:  # type: tuple[str, str, str, str, str]

                a, b, c, e, d = association

                partie_possible_cle, partie_possible_valeur = c.rstrip().lstrip(), d.rstrip().lstrip()

                if not partie_possible_valeur.startswith('//') and (self[partie_possible_cle] is None or self[partie_possible_cle] != partie_possible_valeur):
                    self[partie_possible_cle] = partie_possible_valeur

        self._interets['informations'] = self.retrive_informations_balisees()
        self._interets['identifiants'] = self.retrieve_identifer(None, multiple=True)

    @property
    def recycles(self):
        return self._recyles

    def __contains__(self, item):
        return slugify(item) in self._interets.keys()

    def __getitem__(self, key):
        key = slugify(key)
        return self._interets[key] if key in self._interets.keys() else None

    def __setitem__(self, key, value):
        key = slugify(key)

        if self[key] is not None:
            self[key+'-0'] = value
            return

        self._interets[key] = value

    def injecter_interet(self, cle, donnee):
        """
        :param str cle:
        :param str donnee:
        :return:
        """
        cle = slugify(cle)
        if cle in self._interets.keys():
            raise KeyError
        self._interets[cle] = donnee

    @property
    def interets(self):
        return self._interets

    @property
    def source(self):
        return self._source

    @property
    def sentences(self):
        return self._sentences

    def retrieve_xpath(self, expression_xpath):
        """
        Evaluate an xpath expression against the parsed document
        :param str expression_xpath:
        :return:
        """
        if self._may_html is False:
            return None

        r = self._dom.xpath(expression_xpath, first=True)

        return r.full_text if r is not None else None

    def retrive_informations_balisees(self, focus=None):

        def extract(my_string):
            mes_informations = [el[1:-1] for el in re.findall(r'\[[a-zA-Z0-9:\-# _\'\u00C0-\u017F]{1,36}\]', my_string)] + \
                               [''.join(el) for el in re.findall(r'(([^\w#])|^)#(\w*[0-9a-zA-Z]+\w*[0-9a-zA-Z])', my_string)]
            return mes_informations

        informations = list()

        if focus is None:
            cts_listes = self._recyles + self._sentences + [self._interets[el] for el in self._interets.keys() if isinstance(self._interets[el], str)]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus] if isinstance(self._interets[focus], str) else str(self._interets[focus])]

        for my_str in cts_listes:
            informations += extract(my_str)

        return list(set(informations))

    def retrieve_expression_reguliere(self, expression_reguliere, focus=None):

        expression_reguliere = re.compile(expression_reguliere)

        if focus is None:
            cts_listes = self._recyles + self._sentences + [self._interets[el] for el in self._interets.keys() if isinstance(self._interets[el], str)]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus] if isinstance(self._interets[focus], str) else str(self._interets[focus])]

        for my_str in cts_listes:

            my_extract = re.findall(expression_reguliere, my_str)

            if len(my_extract) > 0:
                return str(my_extract[0]) if isinstance(my_extract, list) else ''.join(my_extract)

        return None

    def retrieve_date(self, prefix, focus=None, multiple=False):
        """
        :param str prefix:
        :param bool multiple:
        :return:
        """

        def extract(my_string):
            """
            :param str my_string:
            :return:
            :rtype: str
            """

            date_fr_regex = re.compile(
                r'{}'.format(re.escape(prefix+' '))+r'([0-2 ][0-9]|(3)[0-1])([\/-])(((0)[0-9])|((1)[0-2]))([\/-])\d{2,4}'
            )

            date_us_regex = re.compile(
                r'{}'.format(re.escape(prefix+' '))+r'\d{4}([\/-])(((0)[0-9])|((1)[0-2]))([\/-])([0-2][0-9]|(3)[0-1])'
            )

            date_rfc_3339 = re.compile(
                r'{}'.format(re.escape(prefix+' '))+r'((?:(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2}(?:\.\d+)?))(Z|[+-]\d{2}:\d{2})?)'
            )

            date_rfc_2822 = re.compile(
                r'{}'.format(re.escape(prefix+' '))+r'(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s+)?(0[1-9]|[1-2]?[0-9]|3[01])\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(19[0-9]{2}|[2-9][0-9]{3})\s+(2[0-3]|[0-1][0-9]):([0-5][0-9])(?::(60|[0-5][0-9]))?\s+([-\+][0-9]{2}[0-5][0-9]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))(\s+|\(([^\(\)]+|\\\(|\\\))*\))*'
            )

            date_fr_reduite_regex = re.compile(
                r'{}'.format(re.escape(prefix+' '))+r'(((0)[0-9])|((1)[0-2]))([\/-])\d{4}'
            )

            date_us_reduite_regex = re.compile(
                r'{}'.format(re.escape(prefix+' '))+r'\d{4}([\/-])(((0)[0-9])|((1)[0-2]))'
            )

            dates_expressions_regulieres = [
                date_rfc_3339,
                date_rfc_2822,
                date_fr_regex,
                date_us_regex,
                date_fr_reduite_regex,
                date_us_reduite_regex
            ]

            for el in dates_expressions_regulieres:
                mt = re.search(el, my_string)

                if mt is not None:
                    return mt.group().replace(prefix, '')

            return None

        dates = list()

        if focus is None:
            cts_listes = self._interets['informations'] + self._recyles + self._sentences + [self._interets[el] for el in self._interets.keys() if isinstance(self._interets[el], str)]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus] if isinstance(self._interets[focus], str) else str(self._interets[focus])]

        for my_str in cts_listes:
            ma_date = extract(my_str)
            if ma_date:
                if multiple is False:
                    return ma_date
                dates.append(ma_date)

        return None if multiple is False else dates

    def retrieve_inner_expression(self, expr_left, expr_right, focus=None, multiple=False):
        """
        :param str focus:
        :param str expr_left:
        :param str expr_right:
        :param bool multiple:
        :return:
        """
        expr_left = unidecode(expr_left).lower() if expr_left is not None else ''
        expr_right = unidecode(expr_right).lower() if expr_right is not None else ''

        def extract(ma_chaine):
            """
            :param str ma_chaine:
            :return:
            """
            ma_chaine_unidecoded = unidecode(ma_chaine).lower()

            if expr_left is not None and len(expr_left) > 0 and expr_left in ma_chaine_unidecoded:

                if expr_right is None or len(expr_right) == 0:
                    return ma_chaine[ma_chaine_unidecoded.index(expr_left) + len(expr_left):].lstrip().rstrip()

                if expr_right in ma_chaine_unidecoded[ma_chaine_unidecoded.index(expr_left) + len(expr_left) - 1:]:
                    return ma_chaine[ma_chaine_unidecoded.index(expr_left) + len(expr_left):ma_chaine_unidecoded.index(expr_right)].lstrip().rstrip()

            elif (expr_left is None or len(expr_left) == 0) and expr_right is not None and expr_right in ma_chaine_unidecoded:
                return ma_chaine[:ma_chaine_unidecoded.index(expr_right)-1].lstrip().rstrip()

            return None

        expressions = list()

        if focus is None:
            cts_listes = self._interets['informations'] + self._recyles + self._sentences + [self._interets[el] for el in self._interets.keys() if isinstance(self._interets[el], str)]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus] if isinstance(self._interets[focus], str) else str(self._interets[focus])]

        for my_str in cts_listes:
            k = extract(my_str)
            if k is not None:
                if multiple is False:
                    return k
                expressions.append(k)

        return None if multiple is False else expressions

    def retrieve_identifer(self, prefix, focus=None, exclude_prefix=False, cast_integer=False, multiple=False):
        """
        Retrieve an identifier
        :param str prefix:
        :param bool exclude_prefix:
        :param bool cast_integer:
        :param bool multiple:
        :return:
        """

        def extract(ma_chaine):
            matchs = re.search(r'(([^\w-])|^){prefix}([^\W\n]+[\d]+)'.format(prefix=prefix.replace(' ', '\\ ')), ma_chaine)
            if matchs:
                digits = ExtractionInteret.extract_digits(matchs.group())

                if digits is not None:
                    return (int(digits) if cast_integer is True else digits) if exclude_prefix is True else matchs.group().replace(matchs.group(1), '')

            return None

        if prefix is None or len(prefix) == 0:
            prefix = '[A-Za-z-°]+'

        identifiants = list()

        if focus is None:
            cts_listes = self._interets['informations'] + self._recyles + self._sentences + [self._interets[el] for el in self._interets.keys() if isinstance(self._interets[el], str)]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus] if isinstance(self._interets[focus], str) else str(self._interets[focus])]

        for my_str in cts_listes:
            identifiant = extract(my_str)
            if identifiant is not None:
                if multiple is False:
                    return identifiant
                if identifiant not in identifiants:
                    identifiants.append(identifiant)

        return None if multiple is False else identifiants

    def has_expression_cle(self, expression_cle, focus=None):
        """
        :param str expression_cle:
        :return:
        """

        expression_cle = unidecode(expression_cle).lower()

        if focus is None:
            cts_listes = self._recyles+self._sentences+self.interets['informations']+[self.interets['titre']]
        elif focus == 'corpus':
            cts_listes = self._recyles + self._sentences
        else:
            cts_listes = [self._interets[focus] if isinstance(self._interets[focus], str) else str(self._interets[focus])]

        for el in cts_listes:
            if not isinstance(el, str):
                continue
            if expression_cle in unidecode(el).lower():
                return True

        return False

    def has_expression_dans_cle(self, ma_cle, mon_expression):
        mon_expression = unidecode(mon_expression).lower()

        if self.has_interet(ma_cle) is True:
            el = self[ma_cle]
            if isinstance(el, str):
                return mon_expression in unidecode(el).lower()
            elif isinstance(el, list):
                for el_l in el:
                    if isinstance(el_l, str):
                        if mon_expression in unidecode(el_l).lower():
                            return True

        return False

    def has_information(self, information_cible, focus=None):
        """
        :param focus:
        :param str information_cible:
        :return:
        """
        information_cible = unidecode(information_cible).lower()

        for el in self.interets['informations'] if focus is None else self.retrive_informations_balisees(focus):
            if information_cible in unidecode(el).lower():
                return True
        return False

    def has_interet(self, interet_cible):
        """
        :param str interet_cible:
        :return:
        """
        return slugify(interet_cible) in self.interets.keys()

    def get_interet(self, interet_cible):
        return self.interets[slugify(interet_cible)] if self.has_interet(interet_cible) else None

    @staticmethod
    def extract_digits(string):
        """

        :param str string:
        :return:
        """
        final_str = ''
        first_digit_mt = False

        for c in string:
            if c.isdigit():
                first_digit_mt = True
                final_str += c
            elif first_digit_mt is True and c.isdigit() is False:
                break

        return final_str if final_str != '' else None

    @staticmethod
    def alnum_percentage(source):
        """

        :param string source:
        :return:
        """
        o_len = len(source)
        f_len = 0

        for el in source:
            if el.isalnum():
                f_len += 1

        return f_len / o_len

    @staticmethod
    def extract_sentences(source):
        """
        :param str source:
        :return:
        """

        source_splitted = source.split(' ')
        sentences = ['']

        for possible_word in source_splitted:
            if len(possible_word) == 0:
                continue
            if re.fullmatch(r'[\w\'’/.,!?;\-\u00C0-\u017F\n]{1,26}', possible_word):
                sentences[-1] += ' ' + possible_word if len(sentences[-1]) > 0 else possible_word
                if possible_word in ['.', '?', '!', '\n'] or sentences[-1][-1] in ['.', '?', '!', '\n']:
                    sentences.append('')
            elif sentences[-1] != '':
                if len(sentences[-1].split(' ')) > 3:
                    sentences.append('')
                else:
                    sentences[-1] = ''

        return sentences
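
A hypothetical usage sketch for the class above; the message text is invented, and slugify, unidecode and pandas are assumed to be importable exactly as the class itself assumes:

# Invented input, purely to illustrate how "key: value" lines, dates and links are picked up.
extraction = ExtractionInteret(
    "Incident 4521 - disk full",
    "Serveur: srv-app-01\nDate: 12/06/2020\nDetails https://wiki.example.org/disk",
)
print(extraction["serveur"])              # 'srv-app-01' (keys are slugified)
print(extraction.retrieve_date("Date:"))  # ' 12/06/2020' (the matched prefix is removed)
print(extraction.interets["hyperliens"])  # ['https://wiki.example.org/disk']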
Example No. 17
    def parse(self, response):
        follow_urls = set()

        paper_info = {
            'title': response.xpath(path.TITLE).get(),
            'url': response.url,
            'date': response.xpath(path.DATE).get(),
            'DOI': response.xpath(path.DOI).get(),
            'conference': response.xpath(path.CONFERENCE).get(),
            'citation count': transform_number(response.xpath(path.CITATIONS_COUNT).get()),
            'reference count': transform_number(response.xpath(path.REFERENCES_COUNT).get())
        }

        self.file_name = paper_info['title'].replace(" ", "_")

        target_file = open(f'../output/{self.file_name}.json', 'w')
        target_file.write('{"result": [' + json.dumps(paper_info, indent=4) +
                          ',\n')
        target_file.close()

        publication_id = paper_info['url'][
            paper_info['url'].find("publication") + 12:paper_info['url'].find("_")]
        request_token = response.xpath(path.RG_REQUEST_TOKEN).attrib['content']
        offset = 10

        if get_reference(token=request_token,
                         uid=publication_id,
                         offset=offset).status_code != 200:
            self.logger.info(
                f"response status {get_reference(uid=publication_id, offset=offset).status_code} instead of 200, possibly need to update cookies & token"
            )

        while get_reference(token=request_token,
                            uid=publication_id,
                            offset=offset).status_code == 200:
            ref_response = get_reference(token=request_token,
                                         uid=publication_id,
                                         offset=offset)
            if (ref_response.text == ''):
                break
            html = HTML(html=ref_response.text)
            links = html.xpath(path.REFERENCES_LINK)
            if len(links) == 0:
                break
            for link in links:
                follow_urls.add(path.BASE_URL + link)
            offset = offset + 5

        for reference in response.xpath(path.REFERENCES):
            reference_link = path.BASE_URL + reference.xpath(
                path.REFERENCES_LINK).get() if reference.xpath(
                    path.REFERENCES_LINK).get() is not None else ""
            if (reference_link != ''):
                follow_urls.add(reference_link)

        self.logger.info(f"total urls to follow: {len(follow_urls)}")

        for url in follow_urls:
            if url is not None:
                yield response.follow(url, self.reference_parse)
Example No. 18
from requests_html import HTML
import codecs
fp = codecs.open(
    "About this Documentation _ Node.js v8.9.4 Documentation.html", "r",
    "utf-8")
html = HTML(html=fp.read())
# c2=html.find('#column2', first=True)
# print(c2,dir(c2))
h1s = html.xpath("./body/div/div/div/h1/span/a")
for h1 in h1s:
    print(h1.attrs["id"])
print(len(h1s))
h2s = html.xpath("./body/div/div/ul/li/a")
for i in range(len(h1s)):
    print(h2s[i].attrs["href"])
    print("#" + h1s[i].attrs["id"])