Python ClientParsing.StringMatch примеры использования

Язык программирования: Python

Класс/Тип: ClientParsing

Метод/Функция: StringMatch

Примеров на hotexamples.com: 3

Python ClientParsing.StringMatch — найдено 3 примера. Это лучшие примеры кода на Python для ClientParsing.StringMatch, полученные из open source проектов. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

GetSoup(14)

GetURLsFromParseResults(3)

StringMatch(3)

ParseFormulaHTML(2)

ParseRuleHTML(2)

StringConverter(2)

ContentParser(1)

GetTagsFromContentResults(1)

GetTagsFromParseResults(1)

GetTitleFromAllParseResults(1)

PageParser(1)

Пример #1

Показать файл

Файл: ClientNetworkingDomain.py Проект: velemi/hydrus

    def __init__(
            self,
            name,
            url_type=None,
            preferred_scheme='https',
            netloc='hostname.com',
            allow_subdomains=False,
            keep_subdomains=False,
            path_components=None,
            parameters=None,
            example_url='https://hostname.com/post/page.php?id=123456&s=view'):
        """Store the URL-matching settings, filling in defaults where omitted.

        Args:
            name: forwarded to ``SerialisableBaseNamed.__init__``.
            url_type: falls back to ``HC.URL_TYPE_POST`` when None.
            preferred_scheme: stored as given.
            netloc: stored as given.
            allow_subdomains: stored as given.
            keep_subdomains: stored as given.
            path_components: iterable of path matchers; when None, two
                fixed-string matchers ('post', 'page.php') are used.
            parameters: mapping of query-parameter name to matcher; when
                None, 's' is a fixed 'view' match and 'id' is a flexible
                numeric match.
            example_url: stored as given.
        """
        if url_type is None:
            url_type = HC.URL_TYPE_POST

        if path_components is None:
            # The default path segments line up with the default example_url.
            path_components = HydrusSerialisable.SerialisableList()
            for fixed_component in ('post', 'page.php'):
                path_components.append(
                    ClientParsing.StringMatch(
                        match_type=ClientParsing.STRING_MATCH_FIXED,
                        match_value=fixed_component,
                        example_string=fixed_component))

        if parameters is None:
            # The default query parameters line up with the default
            # example_url as well.
            parameters = HydrusSerialisable.SerialisableDictionary()
            view_match = ClientParsing.StringMatch(
                match_type=ClientParsing.STRING_MATCH_FIXED,
                match_value='view',
                example_string='view')
            parameters['s'] = view_match
            numeric_id_match = ClientParsing.StringMatch(
                match_type=ClientParsing.STRING_MATCH_FLEXIBLE,
                match_value=ClientParsing.NUMERIC,
                example_string='123456')
            parameters['id'] = numeric_id_match

        # Callers may hand in plain containers; re-wrap so the stored values
        # are always the serialisable container types.
        path_components = HydrusSerialisable.SerialisableList(path_components)
        parameters = HydrusSerialisable.SerialisableDictionary(parameters)

        HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

        self._url_type = url_type
        self._preferred_scheme = preferred_scheme
        self._netloc = netloc
        self._allow_subdomains = allow_subdomains
        self._keep_subdomains = keep_subdomains
        self._path_components = path_components
        self._parameters = parameters
        self._example_url = example_url

Пример #2

Показать файл

Файл: ClientNetworkingDomain.py Проект: kororok/hydrus

    def __init__(
            self,
            name,
            preferred_scheme='https',
            netloc='hostname.com',
            subdomain_is_important=False,
            path_components=None,
            parameters=None,
            example_url='https://hostname.com/post/page.php?id=123456&s=view'):
        """Store the URL-matching settings, filling in defaults where omitted.

        Args:
            name: forwarded to ``SerialisableBaseNamed.__init__``.
            preferred_scheme: stored as given.
            netloc: stored as given.
            subdomain_is_important: stored as given.
            path_components: path matchers, stored as passed; when None, a
                SerialisableList holding two fixed-string matchers
                ('post', 'page.php') is used.
            parameters: query-parameter matchers, stored as passed; when
                None, a SerialisableDictionary is used in which 's' is a
                fixed 'view' match and 'id' is a flexible numeric match.
            example_url: stored as given.
        """
        if path_components is None:
            # The default path segments line up with the default example_url.
            path_components = HydrusSerialisable.SerialisableList()
            for fixed_component in ('post', 'page.php'):
                path_components.append(
                    ClientParsing.StringMatch(
                        match_type=ClientParsing.STRING_MATCH_FIXED,
                        match_value=fixed_component,
                        example_string=fixed_component))

        if parameters is None:
            # The default query parameters line up with the default
            # example_url as well.
            parameters = HydrusSerialisable.SerialisableDictionary()
            view_match = ClientParsing.StringMatch(
                match_type=ClientParsing.STRING_MATCH_FIXED,
                match_value='view',
                example_string='view')
            parameters['s'] = view_match
            numeric_id_match = ClientParsing.StringMatch(
                match_type=ClientParsing.STRING_MATCH_FLEXIBLE,
                match_value=ClientParsing.NUMERIC,
                example_string='123456')
            parameters['id'] = numeric_id_match

        # TODO: an edit dialog panel for this, showing the example url and
        # allowing the current values to be tested.
        # TODO: a parent panel (or similar) listing every url in the db that
        # currently matches and how it would be clipped, so the user can
        # confirm the result is ok.

        HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

        self._preferred_scheme = preferred_scheme
        self._netloc = netloc
        self._subdomain_is_important = subdomain_is_important
        self._path_components = path_components
        self._parameters = parameters
        self._example_url = example_url

Пример #3

Показать файл

def ConvertBooruToNewObjects(booru):
    """Build a gallery URL generator and two page parsers from a booru object.

    The booru's name is prefixed with an "auto-generated" marker and its data
    tuple (from ``booru.GetData()``) drives three results:

    * a ``GalleryURLGenerator`` whose template is the booru search url with
      ``%index%`` replaced by ``'1'`` or ``'0'``;
    * a gallery ``PageParser`` that pulls ``href`` values from ``a`` tags
      beneath elements carrying the thumbnail class;
    * a post ``PageParser`` that pulls the image url (by element id, or
      otherwise by link text) plus one tag parser per entry of the
      classname-to-namespace mapping.

    Returns:
        A tuple ``(gug, gallery_parser, post_parser)``.
    """

    def descending_rule(tag_name=None, tag_attributes=None, **extra_kwargs):
        # Every HTML rule built here is a "descending" rule with no tag index.
        return ClientParsing.ParseRuleHTML(
            rule_type=ClientParsing.HTML_RULE_TYPE_DESCENDING,
            tag_name=tag_name,
            tag_attributes=tag_attributes,
            tag_index=None,
            **extra_kwargs)

    def url_content_parser(parser_name, rules, attribute, priority):
        # A URL-type content parser that reads one attribute from the tags
        # selected by `rules`, reported as a "desired" url at `priority`.
        attribute_formula = ClientParsing.ParseFormulaHTML(
            tag_rules=rules,
            content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE,
            attribute_to_fetch=attribute)

        return ClientParsing.ContentParser(
            name=parser_name,
            content_type=HC.CONTENT_TYPE_URLS,
            formula=attribute_formula,
            additional_info=(HC.URL_TYPE_DESIRED, priority))

    name = 'zzz - auto-generated from legacy booru system - ' + booru.GetName()

    (search_url, search_separator, advance_by_page_num, thumb_classname,
     image_id, image_data, tag_classnames_to_namespaces) = booru.GetData()

    # The template gets a concrete first-page index: '1' when the booru
    # advances by page number, '0' otherwise.
    first_index = '1' if advance_by_page_num else '0'
    search_url = search_url.replace('%index%', first_index)

    gug = ClientNetworkingDomain.GalleryURLGenerator(
        name + ' search',
        url_template=search_url,
        replacement_phrase='%tags%',
        search_terms_separator=search_separator,
        initial_search_text='tag search',
        example_search_text='blonde_hair blue_eyes')

    # Gallery page: post links are the <a> tags under the thumbnail class.

    thumb_content_parser = url_content_parser(
        'get post urls (based on old booru thumb search)',
        [
            descending_rule(tag_attributes={'class': thumb_classname}),
            descending_rule(tag_name='a'),
        ],
        'href',
        50)

    gallery_parser = ClientParsing.PageParser(
        name + ' gallery page parser',
        content_parsers=[thumb_content_parser],
        example_urls=[gug.GetExampleURL()])

    # Post page: image url first, then one parser per tag class.

    content_parsers = []

    if image_id is not None:

        # The id may sit on a link (href) or on the image itself (src); the
        # link variant is given the higher priority.
        content_parsers.append(
            url_content_parser(
                'get image file link url (based on old booru parser)',
                [descending_rule(tag_name='a',
                                 tag_attributes={'id': image_id})],
                'href',
                75))

        content_parsers.append(
            url_content_parser(
                'get image file src url (based on old booru parser)',
                [descending_rule(tag_name='img',
                                 tag_attributes={'id': image_id})],
                'src',
                50))

    elif image_data is not None:

        # No id available: select <a> tags whose string matches image_data.
        link_text_match = ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FIXED,
            match_value=image_data,
            example_string=image_data)

        content_parsers.append(
            url_content_parser(
                'get image file url (based on old booru parser)',
                [descending_rule(tag_name='a',
                                 should_test_tag_string=True,
                                 tag_string_string_match=link_text_match)],
                'href',
                50))

    for (classname, namespace) in tag_classnames_to_namespaces.items():

        # Tags are the text of <a> elements under the class for a namespace.
        tag_formula = ClientParsing.ParseFormulaHTML(
            tag_rules=[
                descending_rule(tag_attributes={'class': classname}),
                descending_rule(tag_name='a'),
            ],
            content_to_fetch=ClientParsing.HTML_CONTENT_STRING)

        content_parsers.append(
            ClientParsing.ContentParser(
                name='get "' + namespace + '" tags',
                content_type=HC.CONTENT_TYPE_MAPPINGS,
                formula=tag_formula,
                additional_info=namespace))

    post_parser = ClientParsing.PageParser(
        name + ' post page parser',
        content_parsers=content_parsers,
        example_urls=[])

    return (gug, gallery_parser, post_parser)