Пример #1
0
def validateImageHolders(report_dict, xml_root, stylename, para, image_string,
                         sectionnames):
    """Check an image-holder string's filename and extension; log problems.

    ``image_string`` is split with ``os.path.splitext``. Two independent
    checks are made, each logging through ``lxml_utils.logForReport``:

    * 'image_holder_badchar' when the filename part (without extension)
      contains any character not matched by ``[\\w-]``;
    * 'image_holder_ext_error' when there is no extension, or the extension
      (including its leading dot, as returned by ``splitext``) is not in
      ``cfg.imageholder_supported_ext``.

    Args:
        report_dict: report structure handed to ``logForReport``.
        xml_root: root of the xml tree, passed through to ``logForReport``.
        stylename: style name; used as prefix of the logged description.
        para: paragraph element the report entries are attached to.
        image_string: text of the image holder (treated as a file name).
        sectionnames: passed through to ``logForReport``.

    Returns:
        The ``report_dict`` object that was passed in.
    """
    logger.info("* * * commencing validateImageHolders function")
    image_name, image_ext = os.path.splitext(image_string)

    # Only the filename part is tested for unwanted characters; an extension
    # containing odd characters is caught by the supported-extension check.
    if re.search(r"[^\w-]", image_name):
        # note: not using 'format' string interpolation below b/c it threw
        #   an error for unicode chars; plain concatenation lets utf-8
        #   encoding be centralized where the report is generated/built
        lxml_utils.logForReport(report_dict, xml_root, para,
                                'image_holder_badchar',
                                stylename + "_" + image_string,
                                ['section_info'], sectionnames)

    # report separate error for a missing or unsupported file extension
    if not image_ext or image_ext not in cfg.imageholder_supported_ext:
        lxml_utils.logForReport(report_dict, xml_root, para,
                                'image_holder_ext_error',
                                stylename + "_" + image_string,
                                ['section_info'], sectionnames)
    return report_dict
Пример #2
0
def logTextOfRunsWithStyle(report_dict,
                           doc_root,
                           stylename,
                           report_category,
                           sectionnames,
                           scriptname=""):
    """Log the text of every group of consecutive runs styled as ``stylename``.

    Runs are located with ``lxml_utils.findRunsWithStyle``. Neighbouring runs
    that carry the same style are treated as one group: the group's text is
    the concatenation of their texts and it is logged once, against the
    parent element of the group's first run.

    Args:
        report_dict: report structure handed to ``lxml_utils.logForReport``.
        doc_root: root element of the document xml.
        stylename: style name to search for (converted with
            ``lxml_utils.transformStylename`` before searching/comparing).
        report_category: category under which the text is logged.
        sectionnames: passed to ``validateImageHolders`` when that check runs.
        scriptname: when equal to 'rsuitevalidate' and ``stylename`` is in
            ``cfg.imageholder_styles``, the text is also validated as an
            image-holder string.

    Returns:
        The ``report_dict`` object that was passed in.
    """
    logger.info("Logging runs styled as %s to report_dict['%s']" %
                (stylename, report_category))
    # Transform once up front instead of on every comparison in the loops
    # (assumes transformStylename is a pure name conversion - TODO confirm).
    short_stylename = lxml_utils.transformStylename(stylename)
    for run in lxml_utils.findRunsWithStyle(short_stylename, doc_root):
        # skip if the prev runstyle matches this one; that means this run was
        # already folded into the text of an earlier run's group
        rneighbors = lxml_utils.getNeighborRuns(run)
        if rneighbors['prevstyle'] == short_stylename:
            continue
        # aggregate the text of subsequent runs while their style is the same
        runtxt = lxml_utils.getParaTxt(run)
        while rneighbors['nextstyle'] == short_stylename:
            next_run = rneighbors['next']
            runtxt += lxml_utils.getParaTxt(next_run)
            rneighbors = lxml_utils.getNeighborRuns(next_run)
        # the direct parent of the first run is what gets reported on
        para = run.getparent()
        # if we're running this for rsuitevalidate & have an imageholder
        # style, need to do extra checks:
        if stylename in cfg.imageholder_styles and scriptname == 'rsuitevalidate':
            validateImageHolders(report_dict, doc_root, stylename, para,
                                 runtxt, sectionnames)
        lxml_utils.logForReport(report_dict, doc_root, para, report_category,
                                runtxt, ['para_string', 'para_index'])
    return report_dict
Пример #3
0
def checkFirstPara(report_dict, doc_root, sectionnames, report_category):
    """Check that the document's first paragraph uses a section-start style.

    The first ``w:p`` element found under ``doc_root`` is looked up and its
    style (from ``lxml_utils.getParaStyle``) is tested against
    ``sectionnames``. When it is not one of them, a warning is written to the
    log and an entry with the style's long name is added under
    ``report_category``.

    Args:
        report_dict: report structure handed to ``lxml_utils.logForReport``.
        doc_root: root element of the document xml.
        sectionnames: collection of style names accepted for the first para.
        report_category: category to log under when the check fails.

    Returns:
        A ``(report_dict, firstpara)`` tuple, where ``firstpara`` is the
        element returned by ``doc_root.find`` (``None`` if nothing matched).
    """
    logger.info(
        "Checking first para style to make sure it is a SectionStart..")
    # NOTE(review): if the document contains no paragraph, firstpara is None
    # and is handed to getParaStyle as-is - confirm the helper tolerates that.
    firstpara = doc_root.find(".//*w:p", wordnamespaces)
    stylename = lxml_utils.getParaStyle(firstpara)
    if stylename not in sectionnames:
        # 'warning' rather than the deprecated 'warn' alias, which newer
        # Python versions no longer provide on Logger objects
        logger.warning(
            "first para style is not a required style, instead is: " +
            stylename)
        lxml_utils.logForReport(report_dict, doc_root, firstpara,
                                report_category,
                                lxml_utils.getStyleLongname(stylename))
    return report_dict, firstpara
def deleteObjects(report_dict, xml_root, objects_to_delete, object_name):
    """Remove every element matching the given search names from the tree.

    Each entry of ``objects_to_delete`` is turned into a ``.//<entry>``
    search under ``xml_root``; every match is detached from its parent.
    When the removed element had a paragraph ancestor (as reported by
    ``lxml_utils.getParaParentofElement``), the deletion is logged under the
    category ``deleted_objects-<object_name>``.

    Args:
        report_dict: report structure handed to ``lxml_utils.logForReport``.
        xml_root: root element that is searched and modified in place.
        objects_to_delete: iterable of element names to search for.
        object_name: human-readable label used in log and report text.

    Returns:
        A ``(report_dict, xml_root)`` tuple.
    """
    logger.info("* * * commencing deleteObjects function for %s..." %
                object_name)
    report_category = "deleted_objects-%s" % object_name
    for object_type in objects_to_delete:
        matches = xml_root.findall(".//{}".format(object_type),
                                   wordnamespaces)
        for target in matches:
            # the owning para has to be looked up while the element is
            # still attached to the tree
            owner_para = lxml_utils.getParaParentofElement(target)
            target.getparent().remove(target)
            # logging is optional: nothing to report on without a para
            if owner_para is None:
                continue
            lxml_utils.logForReport(
                report_dict, xml_root, owner_para, report_category,
                "deleted %s of type %s" % (object_name, object_type))
    return report_dict, xml_root
Пример #5
0
def logTextOfParasWithStyle(report_dict,
                            doc_root,
                            stylename,
                            report_category,
                            sectionnames,
                            scriptname=""):
    """Log the text of every paragraph styled as ``stylename``.

    Paragraphs are located with ``lxml_utils.findParasWithStyle`` using the
    transformed style name; the text of each is logged under
    ``report_category``.

    Args:
        report_dict: report structure handed to ``lxml_utils.logForReport``.
        doc_root: root element of the document xml.
        stylename: style name to search for.
        report_category: category under which the text is logged.
        sectionnames: passed to ``validateImageHolders`` when that check runs.
        scriptname: when equal to 'rsuitevalidate' and ``stylename`` is in
            ``cfg.imageholder_styles``, each paragraph's text is also
            validated as an image-holder string.

    Returns:
        The ``report_dict`` object that was passed in.
    """
    logger.info("Logging paras styled as '%s' to report_dict['%s']" %
                (stylename, report_category))
    search_name = lxml_utils.transformStylename(stylename)
    log_fields = ['para_string', 'para_index']
    for para in lxml_utils.findParasWithStyle(search_name, doc_root):
        text = lxml_utils.getParaTxt(para)
        # image-holder styles get extra checks, but only for rsuitevalidate
        if stylename in cfg.imageholder_styles and scriptname == 'rsuitevalidate':
            validateImageHolders(
                report_dict, doc_root, stylename, para, text, sectionnames)
        lxml_utils.logForReport(
            report_dict, doc_root, para, report_category, text, log_fields)
    return report_dict
Пример #6
0
def getAllStylesUsed_ProcessParaStyle(
        report_dict, stylename, styles_root, doc_root, macmillanstyles,
        sectionnames, found_para_context, container_styles, container_prefix,
        macmillan_styles_found_dict, macmillan_styles_found, para, call_type,
        bookmakerstyles):
    """Classify one paragraph style and log it to the report.

    The style's full name is read from the ``w:name`` element that belongs
    to the ``w:style`` with id ``stylename`` in ``styles_root``.

    * Full name in ``macmillanstyles``: unless the style is a section or
      container style, it is recorded in both "found" lists and logged as
      'Macmillan_style_first_use' (prefixed with ``container_prefix``).
      When ``container_styles`` is empty, a style missing from
      ``bookmakerstyles`` is also logged as 'non_bookmaker_macmillan_style'.
    * Otherwise: for a "validate" call without container styles the work is
      delegated to ``getAllStylesUsed_RevertToBase``; in every other case
      the style is logged as 'non-Macmillan_style_used_in_table' (parent of
      ``para`` is a ``tc`` element) or 'non-Macmillan_style_used'.

    ``macmillan_styles_found_dict`` and ``macmillan_styles_found`` are
    appended to in place.

    Returns:
        ``report_dict``, or whatever ``getAllStylesUsed_RevertToBase``
        returns when the call is delegated.
    """
    # look the style up in styles.xml to get its full (display) name
    name_element = styles_root.find(
        ".//w:style[@w:styleId='%s']/w:name" % stylename, wordnamespaces)
    stylename_full = name_element.get('{%s}val' % wnamespace)

    # --- not a Macmillan style -------------------------------------------
    if stylename_full not in macmillanstyles:
        # when "validating" a doc without container (rsuite) styles, try to
        # revert custom styles built on a Macmillan style to that base style
        if call_type == "validate" and not container_styles:
            return getAllStylesUsed_RevertToBase(
                name_element, macmillanstyles, report_dict, doc_root,
                stylename_full, para, sectionnames)
        # otherwise just log it; table paras get their own category
        if para.getparent().tag == '{{{}}}tc'.format(wnamespace):
            category = 'non-Macmillan_style_used_in_table'
        else:
            category = 'non-Macmillan_style_used'
        lxml_utils.logForReport(report_dict, doc_root, para, category,
                                stylename_full, ['section_info'],
                                sectionnames)
        return report_dict

    # --- Macmillan style --------------------------------------------------
    is_structural = stylename in sectionnames or stylename in container_styles
    if not is_structural:
        macmillan_styles_found_dict.append(found_para_context)
        macmillan_styles_found.append(stylename)
        lxml_utils.logForReport(report_dict, doc_root, para,
                                'Macmillan_style_first_use',
                                container_prefix + stylename_full,
                                ['section_info', 'para_string'],
                                sectionnames)
    # the bookmaker check is moot for rsuitevalidate, which is detected
    # here by the presence of container styles
    if not container_styles and stylename_full not in bookmakerstyles:
        lxml_utils.logForReport(report_dict, doc_root, para,
                                'non_bookmaker_macmillan_style',
                                stylename_full, ['section_info'],
                                sectionnames)
    return report_dict
Пример #7
0
def getAllStylesUsed_RevertToBase(stylematch,
                                  macmillanstyles,
                                  report_dict,
                                  doc_root,
                                  stylename_full,
                                  para,
                                  sectionnames,
                                  run_style=None):
    """Revert a custom style to its Macmillan base style, or log it.

    The ``w:basedOn`` element is looked up under the parent of
    ``stylematch``. If its value is the short name of one of
    ``macmillanstyles``, the style reference is rewritten to that base style
    (on ``run_style`` when given, otherwise on the ``w:pStyle`` found in
    ``para``) and the change is logged. If the base style is not a Macmillan
    style, the custom style is only logged. A style with no ``w:basedOn``
    element is neither changed nor logged.

    Args:
        stylematch: element from styles.xml whose parent is the style
            definition to inspect.
        macmillanstyles: iterable of Macmillan style names.
        report_dict: report structure handed to ``lxml_utils.logForReport``.
        doc_root: root element of the document xml.
        stylename_full: full name of the custom style, used in report text.
        para: paragraph element the report entry is attached to.
        sectionnames: passed through to ``logForReport`` for para styles.
        run_style: optional ``w:rStyle`` element; when given, the style is
            handled as a character style.

    Returns:
        The ``report_dict`` object that was passed in.
    """
    macmillanstyle_shortnames = [
        lxml_utils.transformStylename(s) for s in macmillanstyles
    ]
    basedon_element = stylematch.getparent().find(".//w:basedOn",
                                                  wordnamespaces)
    if basedon_element is None:
        return report_dict

    # Defined before branching: both the run-style and the para-style paths
    # need it. (It used to be assigned only on the para-style path, so
    # reverting a character style raised UnboundLocalError.)
    attrib_style_key = '{%s}val' % wnamespace
    basedonstyle = basedon_element.get(attrib_style_key)

    if basedonstyle in macmillanstyle_shortnames:
        if run_style is not None:
            run_style.set(attrib_style_key, basedonstyle)
        else:
            para.find(".//*w:pStyle",
                      wordnamespaces).set(attrib_style_key, basedonstyle)
        # optionally, log to json:
        lxml_utils.logForReport(
            report_dict, doc_root, para,
            'changed_custom_style_to_Macmillan_basestyle',
            "'{}', based on '{}'".format(stylename_full, basedonstyle))
    elif run_style is not None:
        # log char styles
        lxml_utils.logForReport(report_dict, doc_root, para,
                                'non-Macmillan_charstyle_used',
                                stylename_full)
    # log para styles not reverted to base; separate category for table-paras
    elif para.getparent().tag == '{{{}}}tc'.format(wnamespace):
        lxml_utils.logForReport(report_dict, doc_root, para,
                                'non-Macmillan_style_used_in_table',
                                stylename_full, ['section_info'],
                                sectionnames)
    # and regular paras:
    else:
        lxml_utils.logForReport(report_dict, doc_root, para,
                                'non-Macmillan_style_used',
                                stylename_full, ['section_info'],
                                sectionnames)
    return report_dict
Пример #8
0
def getAllStylesUsed(report_dict,
                     doc_root,
                     styles_xml,
                     sectionnames,
                     macmillanstyledata,
                     bookmakerstyles,
                     call_type,
                     valid_native_word_styles,
                     container_starts=None,
                     container_ends=None,
                     runs_only=False):
    """Log the paragraph and character styles used in ``doc_root``.

    Paragraph styles (skipped when ``runs_only`` is true): section styles and
    container start/end styles only update the tracked section / container
    context. Every other style that has not yet been noted (per style when
    there are no container styles, per section-and-container context when
    there are) is handed to ``getAllStylesUsed_ProcessParaStyle``.

    Character styles: each ``w:rStyle`` element is examined. Macmillan
    character styles are logged on first use. Non-Macmillan ones are,
    depending on ``call_type`` and on whether container styles were given,
    logged, removed from the run, or passed to
    ``getAllStylesUsed_RevertToBase``. Character styles already present in
    ``report_dict`` under 'Macmillan_charstyle_first_use' or
    'non-Macmillan_charstyle_used' count as already seen.

    Args:
        report_dict: report structure; read for previously logged char
            styles and handed to the logging helpers.
        doc_root: root element of the xml being scanned; may be modified
            (run styles removed or rewritten).
        styles_xml: source parsed with ``etree.parse`` to obtain styles.xml.
        sectionnames: collection of section-start style names.
        macmillanstyledata: iterable of Macmillan style names (iterating a
            dict yields its keys).
        bookmakerstyles: passed to ``getAllStylesUsed_ProcessParaStyle``.
        call_type: "validate" enables removing / reverting of styles.
        valid_native_word_styles: style names treated like Macmillan styles.
        container_starts: container-start style names; ``None`` means none.
        container_ends: container-end style names; ``None`` means none.
        runs_only: when true, only character styles are processed.

    Returns:
        ``report_dict`` (possibly rebound to a helper's return value).
    """
    logger.info("** running function 'getAllStylesUsed'")
    # None (not a shared mutable list) is the default; normalise here
    container_starts = [] if container_starts is None else container_starts
    container_ends = [] if container_ends is None else container_ends

    styles_root = etree.parse(styles_xml).getroot()

    # list of accepted style names: the valid native Word styles (copied, so
    # the caller's list is left alone) plus the names in macmillanstyledata
    macmillanstyles = list(valid_native_word_styles)
    macmillanstyles.extend(macmillanstyledata)

    macmillan_styles_found = []  # <- non-rsuite Macmillan para styles
    macmillan_styles_found_dict = []  # <- for rsuite para styles
    # all char styles handled so far, so they are not reported more than
    # once (we are summarizing); seeded with char styles already logged in
    # report_dict by previous runs over other xml files of the doc
    charstyles_found = []
    for logged_category in ("Macmillan_charstyle_first_use",
                            "non-Macmillan_charstyle_used"):
        if logged_category in report_dict:
            for charstyle_dict in report_dict[logged_category]:
                charstyles_found.append(
                    lxml_utils.transformStylename(
                        charstyle_dict['description']))

    # "runs_only" allows re-use of this function to capture charstyles for
    # footnotes/endnotes
    if runs_only:
        logger.info(
            "runs_only set to: %s, we are probably scanning xml other than doc itself, just for charstyles"
            % runs_only)
    else:
        logger.info(
            "logging 1st use of every Macmillan para style, and any use of other style"
        )
        container_styles = container_starts + container_ends
        this_section = ""
        container_prefix = ""
        for para in doc_root.findall(".//*w:p", wordnamespaces):
            stylename = lxml_utils.getParaStyle(para)

            # track current section & container as we loop through styles
            if stylename in sectionnames:
                this_section = stylename
                container_prefix = ""
                continue
            if stylename in container_starts:
                container_prefix = lxml_utils.getStyleLongname(
                    stylename).split()[0] + " > "
                continue
            if stylename in container_ends:
                container_prefix = ""
                continue

            shortstylename_with_container = container_prefix + stylename
            found_para_context = {this_section: shortstylename_with_container}

            # has this style already been noted? With container styles the
            # check is per section / container context, otherwise per style.
            if container_starts:
                already_noted = any(
                    this_section in d
                    and d[this_section] == shortstylename_with_container
                    for d in macmillan_styles_found_dict)
            else:
                already_noted = stylename in macmillan_styles_found

            if not already_noted:
                report_dict = getAllStylesUsed_ProcessParaStyle(
                    report_dict, stylename, styles_root, doc_root,
                    macmillanstyles, sectionnames, found_para_context,
                    container_styles, container_prefix,
                    macmillan_styles_found_dict, macmillan_styles_found, para,
                    call_type, bookmakerstyles)

    # Now get runstyles!
    logger.info(
        "logging 1st use of every Macmillan char style, and any use of other char-style"
    )
    attrib_style_key = '{%s}val' % wnamespace
    for run_style in doc_root.findall(".//*w:rStyle", wordnamespaces):
        stylename = run_style.get(attrib_style_key)
        already_found = stylename in charstyles_found

        # a char style seen before needs no further work unless we are
        # validating
        if already_found and call_type != "validate":
            continue

        # search styles.xml for the corresponding full stylename so we can
        # determine whether it is a Macmillan style
        stylematch = styles_root.find(
            ".//w:style[@w:styleId='%s']/w:name" % stylename, wordnamespaces)
        stylename_full = stylematch.get(attrib_style_key)

        if already_found:
            # validating, style seen before: for RSuite-styled docs just
            # delete previously encountered non-Macmillan charstyles.
            # Repeat occurrences in non-RSuite docs get no special handling.
            if stylename_full not in macmillanstyles and container_starts:
                run_style.getparent().remove(run_style)
            continue

        # --- first time this stylename is encountered ---
        # the great-grandparent of the rStyle element is reported on
        para = run_style.getparent().getparent().getparent()
        if stylename_full in macmillanstyles:
            # Macmillan charstyle: remember it and log first use
            charstyles_found.append(stylename)
            lxml_utils.logForReport(report_dict, doc_root, para,
                                    'Macmillan_charstyle_first_use',
                                    stylename_full)
        elif call_type != "validate" and container_starts:
            # non-Macmillan style, NOT 'validate' call-type: log it and
            # remember it so it is not reprocessed
            lxml_utils.logForReport(report_dict, doc_root, para,
                                    'non-Macmillan_charstyle_used',
                                    stylename_full)
            charstyles_found.append(stylename)
        elif call_type == "validate" and container_starts:
            # non-Macmillan style in an RSuite-styled doc while validating:
            # log once, remember it, then delete the runstyle
            lxml_utils.logForReport(report_dict, doc_root, para,
                                    'non-Macmillan_charstyle_removed',
                                    stylename_full)
            charstyles_found.append(stylename)
            run_style.getparent().remove(run_style)
        elif call_type == "validate" and not container_starts:
            # non-Macmillan style in a NON-RSuite-styled doc while
            # validating: try to revert it to its Macmillan base style
            report_dict = getAllStylesUsed_RevertToBase(
                stylematch, macmillanstyles, report_dict, doc_root,
                stylename_full, para, sectionnames, run_style)
        # NOTE(review): a non-Macmillan charstyle met outside "validate" in a
        # doc without container styles matches none of the cases above and
        # is not logged - confirm this is intended.

    return report_dict