Example #1
def extract_ipa(word_soup, region):
    ipa_class = 'PronCodes'
    br_ipa_audio_class = 'brefile'
    us_ipa_audio_class = 'amefile'
    inflections_class = 'Inflections'
    audio_url_param_name = 'data-src-mp3'

    # remove inflections (say -> said, says)
    # they contain only IPA without any audio
    inflections = word_soup.find(class_=inflections_class)
    if inflections:
        inflections.decompose()

    if region == 'br':
        audio_class = br_ipa_audio_class
    else:
        audio_class = us_ipa_audio_class

    ipa = Ipa()
    ipa.region = region
    try:
        ipa.ipa = parsing_tools.find_single_class(
            word_soup, ipa_class).text.strip().replace(u'/', '')
    except ClassNotFound:
        ipa.ipa = ''
    try:
        audio_div = parsing_tools.find_single_class(word_soup, audio_class)
    except ClassNotFound:
        return ipa
    audio_url = audio_div[audio_url_param_name]
    ipa.audio = cache.File(audio_url, 'mp3')
    return ipa
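
All of these snippets lean on a parsing_tools module and a ClassNotFound exception that are not shown. A minimal sketch of what those helpers might look like, inferred only from how they are called above (the names and exact behaviour are assumptions, not the project's actual code):

# Assumed helpers, reconstructed from the call sites; the real parsing_tools
# module may behave differently (e.g. it might also reject ambiguous matches).
from bs4 import BeautifulSoup


class ClassNotFound(Exception):
    """Raised when an expected CSS class is missing from the page."""


def html_to_soup(html):
    # Parse raw HTML into a BeautifulSoup tree.
    return BeautifulSoup(html, 'html.parser')


def find_single_class(soup, class_name):
    # Return the first element carrying the class, or raise ClassNotFound.
    element = soup.find(class_=class_name)
    if element is None:
        raise ClassNotFound("Class '{}' not found.".format(class_name))
    return element


def find_all_classes(soup, class_name):
    # Return every element carrying the class; raise ClassNotFound if there are none.
    elements = soup.find_all(class_=class_name)
    if not elements:
        raise ClassNotFound("Class '{}' not found.".format(class_name))
    return elements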
Example #2
def parse_html(html):
    soup = parsing_tools.html_to_soup(html)
    valid(soup)
    word_objects = []

    words = soup.find_all(class_=word_section_class)
    if len(words) == 0:
        raise ParseError(
            "Can't find any '{}' classes.".format(word_section_class))
    for word in words:
        word_head = parsing_tools.find_single_class(word, word_head_class)
        word_object = Word()
        word_object.source = 'Longman'
        if word.find(class_='bussdictEntry'):
            word_object.source = 'Longman Business'
        word_object.word = parsing_tools.find_single_class(
            word_head, name_class).string.replace(u'‧', u'')

        pos = word_head.find_all(class_=pos_class)
        if len(pos) > 0:
            word_object.pos = ', '.join(
                [p.text.replace(',', '').strip() for p in pos])
        else:
            word_object.pos = ''

        try:
            word_object.pos_additional = parsing_tools\
                .find_single_class(word_head, pos_additional_class).text.strip()
        except ClassNotFound:
            word_object.pos_additional = ''

        word_object.ipas.append(extract_ipa(word_head, 'br'))
        word_object.ipas.append(extract_ipa(word_head, 'us'))

        try:
            definitions = parsing_tools.find_all_classes(
                word, definition_parent_class)
        except ClassNotFound:
            pass
        else:
            for def_parent in definitions:
                cross_refs = soup.find_all(class_=crossref_class)
                for cr in cross_refs:
                    cr.decompose()

                subdefinitions = def_parent.find_all(
                    class_=subdefinition_parent_class)
                if subdefinitions:
                    for subdef in subdefinitions:
                        extract_definition(subdef, word_object)
                else:
                    extract_definition(def_parent, word_object)

        word_objects.append(word_object)

    return word_objects
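
parse_html also calls valid(soup) and raises ParseError, neither of which appears in the snippets; the module-level constants such as word_section_class and word_head_class hold Longman-specific CSS class names and are likewise defined elsewhere. A placeholder sketch, assuming valid() only does a basic sanity check (the real check is probably dictionary-specific):

class ParseError(Exception):
    """Raised when the page does not look like a dictionary entry."""


def valid(soup):
    # Hypothetical sanity check: the real valid() likely inspects Longman-specific
    # markup; here we only make sure the document has a body at all.
    if soup.find('body') is None:
        raise ParseError('Empty or malformed page.')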
Example #3
def extract_definition(def_parent, word_object):
    definition = Definition()
    try:
        definition.definition = parsing_tools.find_single_class(
            def_parent, definition_class).text.strip()
    except ClassNotFound:
        # Can't find the definition, it's probably just a link to another page
        return

    registers = []
    register_divs = def_parent.find_all(class_=definition_register_class)
    for register_div in register_divs:
        registers.append('[{}]'.format(register_div.text.strip()))
    if len(registers) > 0:
        definition.definition = ' '.join(
            registers) + ' ' + definition.definition

    geo = def_parent.find(class_=definition_geo_class)
    if geo:
        definition.definition = '[{}] '.format(
            geo.text.strip()) + definition.definition

    try:
        definition.definition_additional = parsing_tools.find_single_class(
            def_parent, definition_additional_class).text
    except ClassNotFound:
        definition.definition_additional = ''

    sentences = def_parent.find_all(class_=sentence_class)
    for s in sentences:
        sentence = Sentence()
        sentence.content = s.text.strip()
        audio = s.find(class_=sentence_audio_class)
        if audio:
            audio_url = audio[audio_url_param_name]
            sentence.audio = cache.File(audio_url, 'mp3')
        definition.sentences.append(sentence)

    word_object.definitions.append(definition)
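
The parsers fill in Word, Ipa, Definition and Sentence objects and wrap audio URLs in cache.File. None of these are shown; below is a minimal sketch of the containers, inferred from the attributes accessed above (cache.File presumably downloads and stores the mp3, so it is left out):

# Assumed data containers; attribute names mirror the accesses in the snippets.
class Word:
    def __init__(self):
        self.source = ''
        self.word = ''
        self.pos = ''
        self.pos_additional = ''
        self.ipas = []
        self.definitions = []


class Ipa:
    def __init__(self):
        self.region = ''
        self.ipa = ''
        self.audio = None
        self.description = ''


class Definition:
    def __init__(self):
        self.definition = ''
        self.definition_additional = ''
        self.sentences = []


class Sentence:
    def __init__(self):
        self.content = ''
        self.audio = None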
Example #4
def parse_html(html):
    word_header_class = 'webtop-g'
    name_class = 'h'
    pos_class = 'pos'
    pron_section_class = 'vp-g'
    pron_top_class = 'pron-g'
    ipa_class = 'phon'
    audio_class = 'sound'
    audio_url_param_name = 'data-src-mp3'
    idioms_parent = 'idm-gs'
    definition_parent_class = 'sn-g'
    definition_class = 'def'
    definition_additional_class = 'gram-g'
    definition_label_class = 'label-g'  # "informal", "especially North American", etc.
    sentence_class = 'x'
    collapse_class = 'collapse'
    synonyms_title = 'Synonyms'
    collocations_title = 'Collocations'
    soup = parsing_tools.html_to_soup(html)

    header = parsing_tools.find_single_class(soup, word_header_class)
    word = Word()
    word.source = 'Oxford'
    word.word = parsing_tools.find_single_class(header, name_class).text
    try:
        word.pos = parsing_tools.find_single_class(header, pos_class).string
    except ClassNotFound:
        word.pos = 'undefined'

    try:
        prons = parsing_tools.find_all_classes(soup, pron_top_class)
    except ClassNotFound:
        pass
    else:
        for pron in prons:
            ipa = Ipa()
            try:
                ipa_content = parsing_tools.find_single_class(pron, ipa_class)
            except ClassNotFound:
                pass
            else:
                ipa.ipa = extract_ipa(ipa_content.text)

            audio_div = parsing_tools.find_single_class(pron, audio_class)
            audio_url = audio_div[audio_url_param_name]
            ipa.audio = cache.File(audio_url, 'mp3')
            try:
                geo = pron['geo']
            except KeyError:
                raise ParseError("Can't find 'geo' attribute in a pronunciation class {}".format(str(pron)))
            if 'br' in geo and 'am' in geo:
                raise ParseError("Can't decide if IPA is UK or US, geo name: '{}'".format(geo))
            if 'br' in geo:
                ipa.region = 'BR'
            elif 'am' in geo:
                ipa.region = 'US'
            else:
                raise ParseError("Can't decide if IPA is UK or US, geo name: '{}'".format(geo))
            pron_section = pron.find_parent(class_=pron_section_class)
            if pron_section:
                description_words = pron_section.find(class_='vp').text.split(' ')
                description_words[-1] = '<b>' + description_words[-1] + '</b>'
                ipa.description = ' '.join(description_words)
            word.ipas.append(ipa)

    # remove idiom div, it also has definitions we don't need
    idiom_div = soup.find(class_=idioms_parent)
    if idiom_div:
        idiom_div.decompose()
    try:
        definitions = parsing_tools.find_all_classes(soup, definition_parent_class)
    except ClassNotFound:
        pass
    else:
        for def_parent in definitions:
            definition = Definition()
            try:
                definition_header = parsing_tools.find_single_class(
                    def_parent, definition_class)
            except ClassNotFound:
                # Probably a link to some other page
                continue
            definition.definition = definition_header.text

            # remove synonyms etc., they can have labels we don't need
            collapsed = soup.find_all(class_=collapse_class)
            for c in collapsed:
                c.decompose()

            label = def_parent.find(class_=definition_label_class)
            if label:
                definition.definition = label.text.replace('(', '[').replace(')', ']') \
                                        + ' ' + definition.definition

            try:
                definition.definition_additional = parsing_tools.find_single_class(
                    def_parent, definition_additional_class).text
            except ClassNotFound:
                definition.definition_additional = ''

            sentences = def_parent.find_all(class_=sentence_class)
            for s in sentences:
                sentence = Sentence()
                sentence.content = s.text
                definition.sentences.append(sentence)

            word.definitions.append(definition)

    return word
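
This Oxford parser calls extract_ipa() with the raw text of the 'phon' element, unlike the two-argument Longman version in Example #1. A plausible one-line sketch, assuming the markup wraps the transcription in slashes:

def extract_ipa(ipa_text):
    # Hypothetical: drop surrounding slashes and whitespace,
    # e.g. "/ɪɡˈzɑːmpl/" -> "ɪɡˈzɑːmpl". The real helper may also
    # strip region prefixes such as "BrE".
    return ipa_text.strip().replace('/', '').strip()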
Example #5
def parse_html(html):
    top_container_class = 'webtop'
    name_class = 'headword'
    pos_class = 'pos'
    pos_additional_classes = ['labels', 'inflections', 'variants']
    verb_form_root_class = 'verb_form'
    verb_form_description_class = 'verb_form'
    pron_top_class = 'phonetics'
    ipa_class = 'phon'
    audio_class = 'sound'
    audio_url_param_name = 'data-src-mp3'
    idioms_parent = 'idioms'
    definition_parent_class = 'sense'
    definition_class = 'def'
    definition_additional_class = 'grammar'  # "uncountable", etc.
    definition_label_class = 'labels'  # "informal", "especially North American", etc.
    sentence_class = 'x'
    collapse_class = 'collapse'
    synonyms_title = 'Synonyms'
    collocations_title = 'Collocations'
    soup = parsing_tools.html_to_soup(html)

    # there are many elements with this class name, take the first one
    top_container = soup.find(class_=top_container_class)
    word = Word()
    word.source = 'Oxford'
    word.word = parsing_tools.find_single_class(top_container, name_class).text

    pos = top_container.find_all(class_=pos_class)
    if len(pos) > 0:
        word.pos = ', '.join([p.string for p in pos])
    else:
        word.pos = ''

    pos_additionals = []
    for c in pos_additional_classes:
        pos_additional = top_container.find(class_=c, recursive=False)
        if pos_additional:
            pos_additionals.append(
                pos_additional.text.replace('(', '[').replace(')', ']'))

    word.pos_additional = ' '.join(pos_additionals)

    try:
        pron_collections = parsing_tools.find_all_classes(
            top_container, pron_top_class)
    except ClassNotFound:
        pass
    else:
        for pron_collection in pron_collections:
            prons = pron_collection.find_all('div', recursive=False)
            for pron in prons:
                ipas = []
                ipa_contents = pron.find_all(class_=ipa_class)
                for i, ipa_content in enumerate(ipa_contents):
                    if len(ipas) == i:
                        ipas.append(Ipa())
                    ipas[i].ipa = extract_ipa(ipa_content.text)

                audio_divs = pron.find_all(class_=audio_class)
                for i, audio_div in enumerate(audio_divs):
                    if len(ipas) == i:
                        ipas.append(Ipa())
                    audio_url = audio_div[audio_url_param_name]
                    ipas[i].audio = cache.File(audio_url, 'mp3')

                if len(ipas) > 1 and (len(ipa_contents) != len(audio_divs)):
                    raise ParseError(
                        "Found multiple pronunciations for a region, "
                        "but audio and ipa length are different.")

                try:
                    geo = pron['geo']
                except KeyError:
                    raise ParseError(
                        "Can't find 'geo' attribute in a pronunciation class {}"
                        .format(str(pron)))
                if 'br' in geo and 'am' in geo:
                    raise ParseError(
                        "Can't decide if IPA is UK or US, geo name: '{}'".
                        format(geo))
                if 'br' in geo:
                    for ipa in ipas:
                        ipa.region = 'BR'
                elif 'am' in geo:
                    for ipa in ipas:
                        ipa.region = 'US'
                else:
                    raise ParseError(
                        "Can't decide if IPA is UK or US, geo name: '{}'".
                        format(geo))

                pron_section = pron.find_parent(class_=verb_form_root_class)
                if pron_section:
                    description_words = pron_section.find(
                        class_=verb_form_description_class).text.split(' ')
                    description_words[
                        -1] = '<b>' + description_words[-1] + '</b>'
                    for ipa in ipas:
                        ipa.description = ' '.join(description_words)

                for ipa in ipas:
                    word.ipas.append(ipa)

    # remove idiom div, it also has definitions we don't need
    idiom_div = soup.find(class_=idioms_parent)
    if idiom_div:
        idiom_div.decompose()
    try:
        definitions = parsing_tools.find_all_classes(soup,
                                                     definition_parent_class)
    except ClassNotFound:
        pass
    else:
        for def_parent in definitions:
            definition = Definition()
            try:
                definition_header = parsing_tools.find_single_class(
                    def_parent, definition_class)
            except ClassNotFound:
                # Probably a link to some other page
                continue
            definition.definition = definition_header.text

            # remove synonyms etc., they can have labels we don't need
            collapsed = soup.find_all(class_=collapse_class)
            for c in collapsed:
                c.decompose()

            label = def_parent.find(class_=definition_label_class)
            if label and len(label.text.strip()) > 0:
                definition.definition = label.text.replace('(', '[').replace(')', ']') \
                                        + ' ' + definition.definition

            try:
                definition.definition_additional = parsing_tools.find_single_class(
                    def_parent, definition_additional_class).text
            except ClassNotFound:
                definition.definition_additional = ''

            sentences = def_parent.find_all(class_=sentence_class)
            for s in sentences:
                sentence = Sentence()
                sentence.content = s.text
                definition.sentences.append(sentence)

            word.definitions.append(definition)

    return word
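
A hypothetical end-to-end usage of this parser; 'example.html' stands for a locally saved Oxford entry page and is not part of the original code:

if __name__ == '__main__':
    # Parse a saved Oxford entry page (hypothetical file name) and print
    # the extracted word, pronunciations and definitions.
    with open('example.html', encoding='utf-8') as f:
        word = parse_html(f.read())

    print(word.word, '-', word.pos)
    for ipa in word.ipas:
        print('  {} {}'.format(ipa.region, ipa.ipa))
    for definition in word.definitions:
        print('* ' + definition.definition)
        for sentence in definition.sentences:
            print('    - ' + sentence.content)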