def initFromStr(self, src, reading_position, srclanguage):
    """
        Text.initFromStr()

        Reset the object, then fill self.textdata from the lines of an extract.

        src                     : list of string(s)
        reading_position        : object whose .getShortDescription() is used
                                  to build the context given to the links'
                                  extraction
        srclanguage             : object whose .normalize() is applied to the
                                  text if "texts" belongs to
                                  logotheras.options.OPTIONS["normalize"]

        An error (ERR057) is reported if <src> is empty; a warning (WAR007) is
        reported if the normalization modifies the text.

        Return self.
    """
    self.reset()

    # NOTE(review): the messages below use self.reading_position whereas the
    # links' context uses the <reading_position> parameter; confirm that both
    # are expected to describe the same position.
    if len(src) == 0:
        no_text_msg = "(ERR057) Extract with no text; src={0}; srclanguage={1}; -> {2}"
        self.errors.error(no_text_msg.format(src,
                                             srclanguage,
                                             self.reading_position))

    # \n is the internal character used to cut the string in lines :
    # this character has NOTHING TO DO with the corresponding TextDBDict
    # character. Confer documentation @001
    self.textdata.text = "\n".join(src)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # text's normalization
    raw_text = self.textdata.text
    final_text = raw_text

    if "texts" in logotheras.options.OPTIONS["normalize"]:
        final_text = srclanguage.normalize(raw_text)

        if final_text != raw_text:
            normalization_msg = "(WAR007) " \
                                "text's normalization : \n'{0}'\n>\n'{1}'\n* {2}\n* {3};\n-> {4}"
            self.errors.warning(
                normalization_msg.format(raw_text,
                                         final_text,
                                         getDetailsAboutAString(raw_text),
                                         getDetailsAboutAString(final_text),
                                         self.reading_position))

    self.textdata.text = final_text

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # links in <self.textdata.text> ?
    links_finder = Link()
    links_context = "extract.text.text+" + reading_position.getShortDescription()
    self.textdata.links = links_finder.getExtracts(source=self.textdata.text,
                                                   context=links_context)

    return self
def analyseFirstLine(self, line, srclanguage, do_not_normalize):
    """
        Header.analyseFirstLine

        Split the first line of a header into its four components and report
        every inconsistency through self.errors.

        line                    : (str)
        srclanguage             : Language object; its .normalize() method is
                                  applied to the artiname, the fullname and the
                                  sortingname when the corresponding key belongs
                                  to logotheras.options.OPTIONS["normalize"]
        do_not_normalize        : (bool) if True, nothing is normalized

        Return a tuple made of 4 elements extracted from <line> :
                (artiname, fullname, sortingname, articlescategory)

        o  artiname             : characters read before the first character
                                  belonging to HEADER_ARTINAME_STOPS, stripped,
                                  prefixed with self.headerdata.artiname_prefix;
                                  None if <line> is empty.
        o  fullname             : None if <line> does not contain exactly one
                                  fullname.
        o  sortingname          : see the rules (1), (2), (3) in the code.
        o  articlescategory     : None if <line> does not contain exactly one
                                  articles' category.
    """

    def extract_unique(start_marker, end_marker, too_many_msg):
        """Return (substring, pos0) of the only extract, (None, None) otherwise."""
        extracts = TextBetweenMarkers(start_marker, end_marker).getExtracts(line)

        if len(extracts) == 1:
            return extracts[0].substring, extracts[0].pos0

        if len(extracts) > 1:
            self.errors.error(too_many_msg.format(self.reading_position))

        return None, None

    def is_inside(inner_pos, outer_pos, outer_text, outer_end_marker):
        """Return True if <inner_pos> falls within the outer extract."""
        return outer_pos < inner_pos < \
            outer_pos + len(outer_text) + len(outer_end_marker)

    def normalization_enabled(option):
        """Return True if the strings described by <option> have to be normalized."""
        return not do_not_normalize and \
            option in logotheras.options.OPTIONS["normalize"]

    def normalized(value, option, warning_msg):
        """Return <value>, normalized if need be; warn if the value is modified."""
        if not normalization_enabled(option):
            return value

        new_value = srclanguage.normalize(value)
        if new_value != value:
            self.errors.warning(warning_msg.format(value,
                                                   new_value,
                                                   getDetailsAboutAString(value),
                                                   getDetailsAboutAString(new_value),
                                                   self.reading_position))
        return new_value

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # fullname's setting :
    res_fullname, pos_fullname = extract_unique(
        HEADER_FULLNAME_START,
        HEADER_FULLNAME_END,
        "(ERR038) More than one 'fullname' in a header's first line; -> {0}")

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # articlescategory's setting :
    res_articlescategory, pos_articlescategory = extract_unique(
        HEADER_ARTCATEGORY_START,
        HEADER_ARTCATEGORY_END,
        "(ERR037) "
        "More than one 'articles' category' in a header's first line; -> {0}")

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # artiname's setting : everything placed before the first "stop" character.
    # The artiname remains None if the line is empty.
    res_artiname = None
    if line:
        stop_index = next((index for index, char in enumerate(line)
                           if char in HEADER_ARTINAME_STOPS),
                          len(line))
        res_artiname = line[:stop_index]

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # sortingname's setting :
    header_sortingname, pos_sortingname = extract_unique(
        HEADER_SORTINGNAME_START,
        HEADER_SORTINGNAME_END,
        "(ERR036) "
        "More than one 'sortingname' in a header's first line; -> {0}")

    if pos_sortingname is None:
        res_sortingname = Header.defaultSortingName
    else:
        res_sortingname = header_sortingname

    has_fullname = pos_fullname is not None
    has_articlescategory = pos_articlescategory is not None
    has_sortingname = pos_sortingname is not None

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Are fullname and articlescategory mixed up ?
    if has_fullname and has_articlescategory:
        if is_inside(pos_articlescategory,
                     pos_fullname, res_fullname, HEADER_FULLNAME_END):
            msg = "(ERR035) 'articles' category' placed inside a 'fullname' " \
                  "in a header's first line; -> {0}"
            self.errors.error(msg.format(self.reading_position))

        if is_inside(pos_fullname,
                     pos_articlescategory, res_articlescategory, HEADER_ARTCATEGORY_END):
            msg = "(ERR034) 'fullname' placed inside an 'articles' category' " \
                  "in a header's first line; -> {0}"
            self.errors.error(msg.format(self.reading_position))

    # Are sortingname and articlescategory mixed up ?
    if has_sortingname and has_articlescategory:
        if is_inside(pos_articlescategory,
                     pos_sortingname, header_sortingname, HEADER_SORTINGNAME_END):
            msg = "(ERR033) 'articles' category' placed inside a 'sortingname' " \
                  "in a header's first line; -> {0}"
            self.errors.error(msg.format(self.reading_position))

        if is_inside(pos_sortingname,
                     pos_articlescategory, res_articlescategory, HEADER_ARTCATEGORY_END):
            # (a space was missing between the two parts of this message)
            msg = "(ERR032) 'sortingname' placed inside a 'articles' category' " \
                  "in a header's first line; -> {0}"
            self.errors.error(msg.format(self.reading_position))

    # Are sortingname and fullname mixed up ?
    if has_sortingname and has_fullname:
        if is_inside(pos_fullname,
                     pos_sortingname, header_sortingname, HEADER_SORTINGNAME_END):
            msg = "(ERR031) 'fullname' placed inside a 'sortingname' " \
                  "in a header's first line; -> {0}"
            self.errors.error(msg.format(self.reading_position))

        if is_inside(pos_sortingname,
                     pos_fullname, res_fullname, HEADER_FULLNAME_END):
            msg = "(ERR030) 'sortingname' placed inside a 'fullname' " \
                  "in a header's first line; -> {0}"
            self.errors.error(msg.format(self.reading_position))

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Is there any pending character ?
    #
    # The spans (first index, last index) of the characters belonging to a
    # component are computed once, from the values not yet stripped.
    #
    # NOTE(review): the last index is inclusive, as it always has been; whether
    # this accepts one character placed after the end marker depends on the
    # meaning of .pos0 in TextBetweenMarkers - to be confirmed.
    allowed_spans = []

    if res_artiname is not None:
        allowed_spans.append((0, len(res_artiname) - 1))

    marked_components = (
        (res_fullname, pos_fullname,
         HEADER_FULLNAME_START, HEADER_FULLNAME_END),
        # the presence of a sortingname is deduced from its position, not from
        # its value : a sortingname equal to Header.defaultSortingName is still
        # a sortingname written in the line.
        (header_sortingname, pos_sortingname,
         HEADER_SORTINGNAME_START, HEADER_SORTINGNAME_END),
        # the span begins before the START marker (HEADER_ARTCATEGORY_END was
        # wrongly used here).
        (res_articlescategory, pos_articlescategory,
         HEADER_ARTCATEGORY_START, HEADER_ARTCATEGORY_END),
    )
    for text, pos, start_marker, end_marker in marked_components:
        if pos is not None:
            allowed_spans.append((pos - len(start_marker),
                                  pos + len(text) + len(end_marker)))

    for index, char in enumerate(line):
        if char in HEADER_SPACE_CHARACTERS:
            continue

        if not any(first <= index <= last for first, last in allowed_spans):
            msg = "(ERR029) Pending character '{0}' in header's first line; -> {1}"
            self.errors.error(msg.format(char, self.reading_position))

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Is NOTALEXICAL_ARTINAME at the beginning of the artiname ?
    # Is there only one NOTALEXICAL_ARTINAME in the artiname ?
    # (nothing to check if there's no artiname)
    if res_artiname is not None:
        nbr_of_symbols = res_artiname.count(NOTALEXICAL_ARTINAME)

        if nbr_of_symbols == 1:
            if not res_artiname.startswith(NOTALEXICAL_ARTINAME):
                msg = "(ERR028) '{0}' symbol authorized only " \
                      "at the beginning of a artiname; " \
                      "current artiname = '{1}'; -> {2}"
                self.errors.error(msg.format(NOTALEXICAL_ARTINAME,
                                             res_artiname,
                                             self.reading_position))

        elif nbr_of_symbols > 1:
            msg = "(ERR027) Only one '{0}' symbol authorized in a artiname; " \
                  "current artiname = '{1}'; -> {2}"
            self.errors.error(msg.format(NOTALEXICAL_ARTINAME,
                                         res_artiname,
                                         self.reading_position))

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Return values : artiname
    if res_artiname is not None:
        res_artiname = res_artiname.strip()

        # artiname's normalization : skipped if the artiname contains the
        # NOTALEXICAL_ARTINAME symbol.
        if normalization_enabled("artinames") and \
           NOTALEXICAL_ARTINAME in res_artiname:
            msg = "Skipping the normalization of the artiname '{0}' : '{1}' detected"
            self.errors.info(msg.format(res_artiname, NOTALEXICAL_ARTINAME))
        else:
            res_artiname = normalized(res_artiname,
                                      "artinames",
                                      "(WAR004) artiname's normalization : "
                                      "'{0}' > '{1}'\n* {2}\n* {3};\n-> {4}")

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Return values : fullname
    if res_fullname is not None:
        res_fullname = normalized(res_fullname.strip(),
                                  "fullnames",
                                  "(WAR003) fullname's normalization : "
                                  "'{0}' > '{1}'\n* {2}\n* {3};\n-> {4}")

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Return values : sortingname
    #
    # By default, res_sortingname is equal to Header.defaultSortingName :
    #
    # - (1) if its value is given in the header, res_sortingname will be equal
    #       to this value
    # - (2) otherwise, it takes the value of the artiname if this last one has
    #       been defined
    # - (3) otherwise, it will be equal to Header.defaultSortingName.
    #
    if res_sortingname != Header.defaultSortingName:
        # (1) value defined in the header :
        res_sortingname = normalized(res_sortingname.strip(),
                                     "sortingnames",
                                     "(WAR002) sortingname's normalization : "
                                     "'{0}' > '{1}'\n* {2}\n* {3};\n-> {4}")

    elif res_artiname is not None:
        # (2) value equal to the artiname (without the artiname's prefix) :
        res_sortingname = res_artiname

    else:
        # (3) default value :
        res_sortingname = Header.defaultSortingName

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # error : the artiname contains NOTALEXICAL_ARTINAME and there's a
    # fullname.
    if res_artiname is not None and \
       NOTALEXICAL_ARTINAME in res_artiname and \
       res_fullname is not None:
        msg = "(ERR026) " \
              "artiname, fullname = '{0}', '{1}' : " \
              "the artiname having the '{2}' symbol, " \
              "no fullname is allowed; -> {3}."
        self.errors.error(msg.format(res_artiname,
                                     res_fullname,
                                     NOTALEXICAL_ARTINAME,
                                     self.reading_position))

    if res_articlescategory is not None:
        res_articlescategory = res_articlescategory.strip()

    # an empty line gives no artiname : None is returned instead of trying to
    # add the prefix to None.
    if res_artiname is not None:
        res_artiname = self.headerdata.artiname_prefix + res_artiname

    return (res_artiname,
            res_fullname,
            res_sortingname,
            res_articlescategory)