예제 #1
0
    def initFromStr(self,
                    src,
                    reading_position,
                    srclanguage):
        """
                Text.initFromStr()

                Reset the object, then fill <self.textdata> (.text and .links)
                from the lines stored in <src>.

                src                     : list of string(s); the lines are joined
                                          with "\\n".
                reading_position        : object whose .getShortDescription() result
                                          is appended to the context string given to
                                          the links' extraction.
                srclanguage             : object whose .normalize() method is applied
                                          to the text if "texts" appears in
                                          logotheras.options.OPTIONS["normalize"].

                An error is reported through <self.errors> if <src> is empty; a
                warning is reported if the normalization modifies the text.

                Return <self>.
        """
        self.reset()

        if len(src) == 0:
            self.errors.error(
                "(ERR057) Extract with no text; src={0}; srclanguage={1}; -> {2}".format(
                    src,
                    srclanguage,
                    self.reading_position))

        # "\n" is only the internal separator used to cut the string in lines :
        # it has nothing to do with the corresponding TextDBDict character
        # (confer documentation @001).
        raw_text = "\n".join(src)
        self.textdata.text = raw_text

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # text's normalization (only if the "texts" option is enabled)
        if "texts" in logotheras.options.OPTIONS["normalize"]:

            normalized_text = srclanguage.normalize(raw_text)
            final_text = raw_text

            if raw_text != normalized_text:
                # the normalization modified the text : let's warn the user
                # and keep the normalized version.
                warning = "(WAR007) text's normalization : \n'{0}'\n>\n'{1}'\n* {2}\n* {3};\n-> {4}"
                self.errors.warning(warning.format(raw_text,
                                                   normalized_text,
                                                   getDetailsAboutAString(raw_text),
                                                   getDetailsAboutAString(normalized_text),
                                                   self.reading_position))
                final_text = normalized_text

            self.textdata.text = final_text

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # links stored in <self.textdata.text> :
        links_extractor = Link()
        self.textdata.links = links_extractor.getExtracts(
            source=self.textdata.text,
            context="extract.text.text+" + reading_position.getShortDescription())

        return self
예제 #2
0
    def analyseFirstLine(self,
                         line,
                         srclanguage,
                         do_not_normalize):
        """
                Header.analyseFirstLine

                Analyse the first line of a header : extract the artiname, the
                fullname, the sortingname and the articles' category, check how
                they are laid out in <line> and report every problem through
                <self.errors> (no exception is raised by this method itself).

                line                    : (str)
                srclanguage             : Language object; its .normalize() method
                                          is applied to the extracted names.
                do_not_normalize        : (bool) if True, no name is normalized,
                                          whatever the "normalize" options are.

                Return a tuple made of 4 elements extracted from <line> :
                (artiname, fullname, sortingname, articlescategory)

                o  artiname         : <self.headerdata.artiname_prefix> followed by
                                      the characters read before the first character
                                      belonging to HEADER_ARTINAME_STOPS (stripped,
                                      maybe normalized); None if <line> is empty.
                o  fullname         : stripped (maybe normalized) string, or None if
                                      <line> doesn't contain exactly one fullname.
                o  sortingname      : the value read in <line> if it differs from
                                      Header.defaultSortingName; otherwise the
                                      artiname (without prefix) if there's one;
                                      otherwise Header.defaultSortingName.
                o  articlescategory : stripped string, or None if <line> doesn't
                                      contain exactly one articles' category.
        """
        res_artiname = None
        res_fullname = None
        res_sortingname = Header.defaultSortingName
        res_articlescategory = None

        # positions of the items in <line>; None as long as the item hasn't
        # been found exactly once.
        pos_fullname = None
        pos_sortingname = None
        pos_articlescategory = None

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # fullname's setting :
        fullname = TextBetweenMarkers( HEADER_FULLNAME_START,
                                       HEADER_FULLNAME_END ).getExtracts(line)

        if len(fullname) == 1:
            res_fullname = fullname[0].substring
            pos_fullname = fullname[0].pos0

        elif len(fullname) > 1:
            msg = "(ERR038) More than one 'fullname' in a header's first line; -> {0}"
            self.errors.error(msg.format(self.reading_position))

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # articlescategory's setting :
        articlescategory = TextBetweenMarkers( HEADER_ARTCATEGORY_START,
                                               HEADER_ARTCATEGORY_END ).getExtracts(line)

        if len(articlescategory) == 1:
            res_articlescategory = articlescategory[0].substring
            pos_articlescategory = articlescategory[0].pos0

        elif len(articlescategory) > 1:
            msg = "(ERR037) " \
                  "More than one 'articles' category' in a header's first line; -> {0}"
            self.errors.error(msg.format(self.reading_position))

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # artiname's setting :
        #
        # the artiname is made of every character placed before the first
        # "stop" character; it remains equal to None if <line> is empty.
        if len(line) > 0:
            res_artiname = line

            for index, char in enumerate(line):
                if char in HEADER_ARTINAME_STOPS:
                    res_artiname = line[:index]
                    break

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # sortingname's setting :
        sortingname = TextBetweenMarkers( HEADER_SORTINGNAME_START,
                                          HEADER_SORTINGNAME_END ).getExtracts(line)

        if len(sortingname) == 1:
            res_sortingname = sortingname[0].substring
            pos_sortingname = sortingname[0].pos0

        elif len(sortingname) > 1:
            msg = "(ERR036) " \
                  "More than one 'sortingname' in a header's first line; -> {0}"
            self.errors.error(msg.format(self.reading_position))

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Are fullname and articlescategory mixed up ?
        #
        # NOTE(review): the tests below presumably expect .pos0 to be the index
        # of the first character of .substring in <line> - to be confirmed
        # against TextBetweenMarkers.
        if len(fullname) == 1 and len(articlescategory) == 1:

            if pos_fullname < pos_articlescategory < pos_fullname + \
                                                        len(res_fullname) + \
                                                        len(HEADER_FULLNAME_END):
                msg = "(ERR035) 'articles' category' placed inside a 'fullname' " \
                      "in a header's first line; -> {0}"
                self.errors.error(msg.format(self.reading_position))

            if pos_articlescategory < pos_fullname <  pos_articlescategory + \
                                                         len(res_articlescategory) + \
                                                         len(HEADER_ARTCATEGORY_END):
                msg = "(ERR034) 'fullname' placed inside an 'articles' category' " \
                      "in a header's first line; -> {0}"
                self.errors.error(msg.format(self.reading_position))

        # Are sortingname and articlescategory mixed up ?
        if len(sortingname) == 1 and len(articlescategory) == 1:

            if pos_sortingname < pos_articlescategory < pos_sortingname + \
                                                        len(res_sortingname) + \
                                                        len(HEADER_SORTINGNAME_END):
                msg = "(ERR033) 'articles' category' placed inside a 'sortingname' " \
                      "in a header's first line; -> {0}"
                self.errors.error(msg.format(self.reading_position))

            if pos_articlescategory < pos_sortingname <  pos_articlescategory + \
                                                         len(res_articlescategory) + \
                                                         len(HEADER_ARTCATEGORY_END):
                # (a space was missing between the two parts of this message)
                msg = "(ERR032) 'sortingname' placed inside a 'articles' category' " \
                      "in a header's first line; -> {0}"
                self.errors.error(msg.format(self.reading_position))

        # Are sortingname and fullname mixed up ?
        if len(sortingname) == 1 and len(fullname) == 1:

            if pos_sortingname < pos_fullname < pos_sortingname + \
                                                        len(res_sortingname) + \
                                                        len(HEADER_SORTINGNAME_END):
                msg = "(ERR031) 'fullname' placed inside a 'sortingname' " \
                      "in a header's first line; -> {0}"
                self.errors.error(msg.format(self.reading_position))

            if pos_fullname < pos_sortingname <  pos_fullname + \
                                                         len(res_fullname) + \
                                                         len(HEADER_FULLNAME_END):
                msg = "(ERR030) 'sortingname' placed inside a 'fullname' " \
                      "in a header's first line; -> {0}"
                self.errors.error(msg.format(self.reading_position))

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Is there any pending character ?
        #
        # Every character which isn't a "space" must belong to the artiname or
        # to one of the three items (markers included). At this point the
        # res_* strings haven't been stripped yet, hence their lengths match
        # what has been read in <line>.
        for index, char in enumerate(line):

            if char not in HEADER_SPACE_CHARACTERS:

                index_ok = False

                if res_artiname is not None and \
                   index < len(res_artiname):
                    # ok, the character belongs to a artiname :
                    index_ok = True

                if res_fullname is not None and \
                   pos_fullname - len(HEADER_FULLNAME_START) <= index <= pos_fullname + \
                                                                         len(res_fullname) + \
                                                                         len(HEADER_FULLNAME_END):
                    # ok, the character belongs to a fullname :
                    index_ok = True

                # <pos_sortingname> is tested instead of the value of
                # <res_sortingname> : a sortingname written in the header with
                # the default value is still a sortingname placed in <line>.
                if pos_sortingname is not None and \
                   pos_sortingname - len(HEADER_SORTINGNAME_START) <= index <= pos_sortingname + \
                                                                        len(res_sortingname) + \
                                                                        len(HEADER_SORTINGNAME_END):
                    # ok, the character belongs to a sortingname :
                    index_ok = True

                # the lower bound takes in account the START marker, exactly
                # as for the fullname and for the sortingname.
                if res_articlescategory is not None and \
                   pos_articlescategory - len(HEADER_ARTCATEGORY_START) <= index <= \
                                                       pos_articlescategory + \
                                                       len(res_articlescategory) + \
                                                       len(HEADER_ARTCATEGORY_END):
                    # ok, the character belongs to a articlescategory :
                    index_ok = True

                if not index_ok:
                    msg = "(ERR029) Pending character '{0}' in header's first line; -> {1}"
                    self.errors.error(msg.format(char,
                                                 self.reading_position))

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Is NOTALEXICAL_ARTINAME at the beginning of the artiname ?
        # Is there only one NOTALEXICAL_ARTINAME in the artiname ?
        #
        # (nothing to check if there's no artiname, i.e. if <line> is empty)
        if res_artiname is not None:

            notalexical_count = res_artiname.count(NOTALEXICAL_ARTINAME)

            if notalexical_count == 1:

                if res_artiname.find(NOTALEXICAL_ARTINAME) != 0:
                    msg = "(ERR028) '{0}' symbol authorized only " \
                          "at the beginning of a artiname; " \
                          "current artiname = '{1}'; -> {2}"
                    self.errors.error(msg.format(NOTALEXICAL_ARTINAME,
                                                 res_artiname,
                                                 self.reading_position))

            elif notalexical_count > 1:
                msg = "(ERR027) Only one '{0}' symbol authorized in a artiname; " \
                      "current artiname = '{1}'; -> {2}"
                self.errors.error(msg.format(NOTALEXICAL_ARTINAME,
                                             res_artiname,
                                             self.reading_position))

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Return values : artiname
        if res_artiname is not None:
            res_artiname = res_artiname.strip()

            #~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~
            # artiname's normalization :
            if not do_not_normalize and \
               "artinames" in logotheras.options.OPTIONS["normalize"]:

                if NOTALEXICAL_ARTINAME in res_artiname:

                    msg = "Skipping the normalization of the artiname '{0}' : '{1}' detected"
                    self.errors.info(msg.format(res_artiname,
                                                NOTALEXICAL_ARTINAME))

                else:
                    _res_artiname = srclanguage.normalize(res_artiname)

                    if _res_artiname != res_artiname:

                        msg = "(WAR004) artiname's normalization : " \
                              "'{0}' > '{1}'\n* {2}\n* {3};\n-> {4}"
                        self.errors.warning(msg.format(res_artiname,
                                                       _res_artiname,
                                                       getDetailsAboutAString(res_artiname),
                                                       getDetailsAboutAString(_res_artiname),
                                                       self.reading_position))
                    res_artiname = _res_artiname

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Return values : fullname
        if res_fullname is not None:
            res_fullname = res_fullname.strip()

            #~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~
            # fullname's normalization
            if not do_not_normalize and \
               "fullnames" in logotheras.options.OPTIONS["normalize"]:

                _res_fullname = srclanguage.normalize(res_fullname)

                if _res_fullname != res_fullname:

                    msg = "(WAR003) fullname's normalization : " \
                          "'{0}' > '{1}'\n* {2}\n* {3};\n-> {4}"
                    self.errors.warning(msg.format(res_fullname,
                                                   _res_fullname,
                                                   getDetailsAboutAString(res_fullname),
                                                   getDetailsAboutAString(_res_fullname),
                                                   self.reading_position))

                res_fullname = _res_fullname

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Return values : sortingname
        #
        # By default, res_sortingname is equal to Header.defaultSortingName :
        #
        # - (1) if its value is given in the header, res_sortingname will be equal
        #   to this value
        # - (2) otherwise, it takes the value of the artiname if this last one has been
        #   defined
        # - (3) otherwise, it will be equal to Header.defaultSortingName.
        #
        if res_sortingname != Header.defaultSortingName:
            #~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.
            # (1) value defined in the header :
            #~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.
            res_sortingname = res_sortingname.strip()

            # sortingname's normalization
            if not do_not_normalize and \
               "sortingnames" in logotheras.options.OPTIONS["normalize"]:

                _res_sortingname = srclanguage.normalize(res_sortingname)

                if _res_sortingname != res_sortingname:

                    msg = "(WAR002) sortingname's normalization : " \
                          "'{0}' > '{1}'\n* {2}\n* {3};\n-> {4}"
                    self.errors.warning(msg.format(res_sortingname,
                                                   _res_sortingname,
                                                   getDetailsAboutAString(res_sortingname),
                                                   getDetailsAboutAString(_res_sortingname),
                                                   self.reading_position))

                res_sortingname = _res_sortingname

        elif res_artiname is not None:
            #~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.
            # (2) value equal to the artiname
            #~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.
            res_sortingname = res_artiname
        else:
            #~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.
            # (3) default value
            #~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.
            res_sortingname = Header.defaultSortingName

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # error : the artiname contains NOTALEXICAL_ARTINAME and there's a
        #         fullname.
        if res_artiname is not None and \
           NOTALEXICAL_ARTINAME in res_artiname and \
           res_fullname is not None:

            msg = "(ERR026) " \
                  "artiname, fullname = '{0}', '{1}' : " \
                  "the artiname having the '{2}' symbol, " \
                  "no fullname is allowed; -> {3}."
            self.errors.error(msg.format(res_artiname,
                                         res_fullname,
                                         NOTALEXICAL_ARTINAME,
                                         self.reading_position))

        if res_articlescategory is not None:
            res_articlescategory = res_articlescategory.strip()

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # the prefix is added to the artiname only : the sortingname computed
        # above from the artiname (case 2) doesn't get it.
        # No artiname (empty <line>) : None is returned instead of trying to
        # concatenate the prefix with None.
        prefixed_artiname = None
        if res_artiname is not None:
            prefixed_artiname = self.headerdata.artiname_prefix + res_artiname

        return (prefixed_artiname,
                res_fullname,
                res_sortingname,
                res_articlescategory)