Примеры использования ClientParsing.ParseFormulaHTML на Python

Язык программирования: Python

Класс/Тип: ClientParsing

Метод/Функция: ParseFormulaHTML

Примеров на hotexamples.com: 2

Найдено 2 примера использования ClientParsing.ParseFormulaHTML на Python. Это лучшие примеры кода на Python для ClientParsing.ParseFormulaHTML, взятые из проектов с открытым исходным кодом. Вы можете оценивать каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

GetSoup(14)

GetURLsFromParseResults(3)

StringMatch(3)

ParseFormulaHTML(2)

ParseRuleHTML(2)

StringConverter(2)

ContentParser(1)

GetTagsFromContentResults(1)

GetTagsFromParseResults(1)

GetTitleFromAllParseResults(1)

PageParser(1)

Пример #1

Показать файл

 def LoginTumblrGDPR( self ):
     """Perform the tumblr GDPR consent click-through.
     
     Fetches the tumblr front page, pulls the ``content`` attribute of the
     ``<meta id="tumblr_form_key">`` tag out of it, and then POSTs a JSON
     consent body to the privacy consent endpoint with that key in the
     ``X-tumblr-form-key`` header. Both requests are flagged with
     ``SetForLogin( True )`` and run through ``self.engine``.
     
     The response to the consent POST is not inspected; a success message
     is shown as soon as the job finishes.
     
     Raises:
         HydrusExceptions.ParseException: if the front page does not yield
             exactly one form key.
     """
     
     # t-thanks, EU
     # this is cribbed from poking around here https://github.com/johanneszab/TumblThree/commit/3563d6cebf1a467151d6b8d6eee9806ddd6e6364
     
     # step one: load the landing page so the form key can be scraped from it
     
     landing_job = ClientNetworkingJobs.NetworkJob( 'GET', 'http://www.tumblr.com/' )
     landing_job.SetForLogin( True )
     
     self.engine.AddJob( landing_job )
     landing_job.WaitUntilDone()
     
     landing_html = landing_job.GetContent()
     
     form_key_rule = ClientParsing.ParseRuleHTML(
         rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING,
         tag_name = 'meta',
         tag_attributes = { 'id' : 'tumblr_form_key' }
     )
     
     form_key_formula = ClientParsing.ParseFormulaHTML(
         tag_rules = [ form_key_rule ],
         content_to_fetch = ClientParsing.HTML_CONTENT_ATTRIBUTE,
         attribute_to_fetch = "content"
     )
     
     form_keys = form_key_formula.Parse( {}, landing_html )
     
     # zero hits or several hits are both treated as a parse failure
     if len( form_keys ) != 1:
         
         raise HydrusExceptions.ParseException( 'Could not figure out the tumblr form key for the GDPR click-through.' )
         
     
     tumblr_form_key = form_keys[0]
     
     # step two: submit the consent form, imitating the site's own ajax call
     
     body = '{\"eu_resident\":true,\"gdpr_is_acceptable_age\":true,\"gdpr_consent_core\":true,\"gdpr_consent_first_party_ads\":true,\"gdpr_consent_third_party_ads\":true,\"gdpr_consent_search_history\":true,\"redirect_to\":\"\"}'
     referral_url = 'https://www.tumblr.com/privacy/consent?redirect='
     
     consent_job = ClientNetworkingJobs.NetworkJob( 'POST', 'https://www.tumblr.com/svc/privacy/consent', body = body, referral_url = referral_url )
     consent_job.SetForLogin( True )
     
     consent_headers = (
         ( 'Accept', 'application/json, text/javascript, */*; q=0.01' ),
         ( 'Content-Type', 'application/json' ),
         ( 'X-Requested-With', 'XMLHttpRequest' ),
         ( 'X-tumblr-form-key', tumblr_form_key )
     )
     
     for ( header_name, header_value ) in consent_headers:
         
         consent_job.AddAdditionalHeader( header_name, header_value )
         
     
     self.engine.AddJob( consent_job )
     consent_job.WaitUntilDone()
     
     # test cookies here or something
     
     HydrusData.ShowText( 'Looks like tumblr GDPR click-through worked! You should be good for a year, at which point we should have an automatic solution for this!' )

Пример #2

Показать файл

def _descending_html_rule(tag_name=None, tag_attributes=None, **extra_kwargs):
    """Build a ``ParseRuleHTML`` of type ``HTML_RULE_TYPE_DESCENDING``.

    ``tag_index`` is always passed as ``None``. Any ``extra_kwargs`` are
    forwarded to ``ClientParsing.ParseRuleHTML`` unchanged.
    """

    return ClientParsing.ParseRuleHTML(
        rule_type=ClientParsing.HTML_RULE_TYPE_DESCENDING,
        tag_name=tag_name,
        tag_attributes=tag_attributes,
        tag_index=None,
        **extra_kwargs)


def _attribute_url_content_parser(name, tag_rules, attribute_to_fetch,
                                  priority):
    """Build a URL ``ContentParser`` reading one attribute of matched tags.

    The parser's ``additional_info`` is ``(HC.URL_TYPE_DESIRED, priority)``.
    """

    formula = ClientParsing.ParseFormulaHTML(
        tag_rules=tag_rules,
        content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE,
        attribute_to_fetch=attribute_to_fetch)

    return ClientParsing.ContentParser(
        name=name,
        content_type=HC.CONTENT_TYPE_URLS,
        formula=formula,
        additional_info=(HC.URL_TYPE_DESIRED, priority))


def ConvertBooruToNewObjects(booru):
    """Convert a legacy booru definition into the newer downloader objects.

    Args:
        booru: an object exposing ``GetName()`` and ``GetData()``, where
            ``GetData()`` returns the 7-tuple ``(search_url,
            search_separator, advance_by_page_num, thumb_classname,
            image_id, image_data, tag_classnames_to_namespaces)``.

    Returns:
        A ``(gug, gallery_parser, post_parser)`` tuple: a
        ``GalleryURLGenerator`` for tag searches, a ``PageParser`` that pulls
        post URLs out of a gallery page, and a ``PageParser`` that pulls the
        file URL(s) and tags out of a post page.
    """

    name = ('zzz - auto-generated from legacy booru system - ' +
            booru.GetName())

    (search_url, search_separator, advance_by_page_num, thumb_classname,
     image_id, image_data, tag_classnames_to_namespaces) = booru.GetData()

    # The example search always points at the first page: '1' when the
    # legacy booru advanced by page number, '0' otherwise.
    first_index = '1' if advance_by_page_num else '0'

    search_url = search_url.replace('%index%', first_index)

    gug = ClientNetworkingDomain.GalleryURLGenerator(
        name + ' search',
        url_template=search_url,
        replacement_phrase='%tags%',
        search_terms_separator=search_separator,
        initial_search_text='tag search',
        example_search_text='blonde_hair blue_eyes')

    # Gallery page: any tag with the thumb class, then the <a> beneath it.

    thumb_content_parser = _attribute_url_content_parser(
        'get post urls (based on old booru thumb search)',
        [_descending_html_rule(tag_attributes={'class': thumb_classname}),
         _descending_html_rule(tag_name='a')],
        'href',
        50)

    gallery_parser = ClientParsing.PageParser(
        name + ' gallery page parser',
        content_parsers=[thumb_content_parser],
        example_urls=[gug.GetExampleURL()])

    # Post page: file URL(s) first, then one tag parser per namespace.

    content_parsers = []

    if image_id is not None:

        # The id may sit on either an <a> or an <img>, so try both; the link
        # href is given priority 75 and the img src priority 50.
        content_parsers.append(
            _attribute_url_content_parser(
                'get image file link url (based on old booru parser)',
                [_descending_html_rule(tag_name='a',
                                       tag_attributes={'id': image_id})],
                'href',
                75))

        content_parsers.append(
            _attribute_url_content_parser(
                'get image file src url (based on old booru parser)',
                [_descending_html_rule(tag_name='img',
                                       tag_attributes={'id': image_id})],
                'src',
                50))

    elif image_data is not None:

        # No id to go on: pick the <a> whose tag string matches the fixed
        # text stored in image_data.
        string_match = ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FIXED,
            match_value=image_data,
            example_string=image_data)

        content_parsers.append(
            _attribute_url_content_parser(
                'get image file url (based on old booru parser)',
                [_descending_html_rule(
                    tag_name='a',
                    should_test_tag_string=True,
                    tag_string_string_match=string_match)],
                'href',
                50))

    for (classname, namespace) in tag_classnames_to_namespaces.items():

        # Tags are the string content of each <a> beneath a tag carrying
        # the namespace's class.
        formula = ClientParsing.ParseFormulaHTML(
            tag_rules=[
                _descending_html_rule(tag_attributes={'class': classname}),
                _descending_html_rule(tag_name='a'),
            ],
            content_to_fetch=ClientParsing.HTML_CONTENT_STRING)

        content_parsers.append(
            ClientParsing.ContentParser(
                name='get "' + namespace + '" tags',
                content_type=HC.CONTENT_TYPE_MAPPINGS,
                formula=formula,
                additional_info=namespace))

    post_parser = ClientParsing.PageParser(name + ' post page parser',
                                           content_parsers=content_parsers,
                                           example_urls=[])

    return (gug, gallery_parser, post_parser)