def getExtracts(self, context, source):
    """
        Link.getExtracts

        Build one LinkInfo object for each extract returned by
        TextBetweenMarkers.getExtracts(self, source).

        context : (str) stored unchanged in every LinkInfo
        source  : (str) text handed to TextBetweenMarkers.getExtracts(); it is
                  also stored unchanged in every LinkInfo

        Return a list of LinkInfo objects
    """
    res = []

    for textbetweenmarkers in TextBetweenMarkers.getExtracts(self, source):
        name = textbetweenmarkers.substring
        pos0 = textbetweenmarkers.pos0

        # The separator has to be looked for in the link's own text (<name>),
        # not in the whole <source> : otherwise every link found in <source>
        # would receive the same names, cut out of the surrounding text.
        before, found_separator, after = name.partition(LINK_SEPARATOR)

        if not found_separator:
            # no separator, artiname only :
            artiname = name
            entryname = None
        else:
            # separator inside the link; the separator itself belongs to
            # neither name.
            # NOTE(review): as in the previous implementation, the text before
            # the separator is the entryname and the text after it is the
            # artiname -- confirm this order against the link syntax.
            entryname = before
            artiname = after

        res.append( LinkInfo( context = context,
                              pos0 = pos0,
                              artiname = artiname,
                              entryname = entryname,
                              source = source ))

    return res
def initFromStr(self,
                informationsdata,
                title_hlevel,
                title_pending_text,
                str_content,
                reading_position,
                srclanguage):
    """
        Entry.initFromStr()

        Reset the entry, then fill self.entrydata (hierarchical level, title,
        links, text and extracts) from the given title and body lines.

        informationsdata    : InformationsData object
        title_hlevel        : HierarchicalLevel object
        title_pending_text  : None or a string.
        str_content         : a list of strings.
        reading_position    : ReadingPosition object
        srclanguage         : passed unchanged to Extract.initFromStr()

        Return self.
    """
    self.reset()
    self.reading_position = reading_position

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # hierarchical level :
    self.entrydata.hlevel = title_hlevel.hleveldata

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # title :
    if title_pending_text is None:
        self.entrydata.title = None
    else:
        # links are forbidden in the entries' title :
        if LINK_START in title_pending_text:
            msg = "(ERR059) Link found in an entry's title; title={0} -> {1}"
            self.errors.error(msg.format(title_pending_text,
                                         self.reading_position))

        # to-be-duplicated entry ? The 'normal' symbol wins if both symbols
        # appear in the title.
        if BODY_ARTICLE_TO_DUPLICATED_NOTIMPORTANT in title_pending_text:
            dup_symbol = BODY_ARTICLE_TO_DUPLICATED_NOTIMPORTANT
        elif BODY_ARTICLE_TO_DUPLICATED_IMPORTANT in title_pending_text:
            dup_symbol = BODY_ARTICLE_TO_DUPLICATED_IMPORTANT
        else:
            dup_symbol = None

        if dup_symbol is not None:
            # only one result expected :
            tbm = TextBetweenMarkers(marker_start=dup_symbol,
                                     marker_end=dup_symbol)
            results = tbm.getExtracts( title_pending_text )

            if len(results) != 1:
                msg = "(ERR058) Wrong format : " \
                      "the to-be-duplicated entry can't be read; -> {0}"
                self.errors.error(msg.format(self.reading_position))
            else:
                self.entrydata.entry_to_be_duplicated = results[0].substring
                self.entrydata.important_entry_to_be_dup = \
                    (dup_symbol == BODY_ARTICLE_TO_DUPLICATED_IMPORTANT)

            # we erase the markers linked to the 'to-be-duplicated' entry :
            # NOTE(review): nesting rebuilt from a flattened source; confirm
            # that the markers must also be erased after an ERR058 error.
            title_pending_text = title_pending_text.replace(dup_symbol, "")

        self.entrydata.title = title_pending_text.strip()

        # links in <self.entrydata.title> ?
        # NOTE(review): kept in the "title is not None" branch, right after
        # the title is stored; confirm against the original layout.
        self.entrydata.links = Link().getExtracts(
            source = self.entrydata.title,
            context = "entry.title+" + \
                      self.reading_position.getShortDescription())

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # text and extract(s) :
    inside_extract = False
    current_extract = []    # list of strings

    for line in str_content:

        if line.strip() == "":
            # an empty line closes the extract being read, if any; outside
            # an extract it is simply dropped.
            if inside_extract:
                self._appendExtractFromLines(
                    informationsdata = informationsdata,
                    src = current_extract,
                    reading_position = reading_position,
                    srclanguage = srclanguage)

                inside_extract = False
                # a new list is created : <src> given above is not modified.
                current_extract = []

        elif not line.startswith(BODY_PREFIX_BEFORE_EXTRACT):
            # plain text line; it does not close an extract being read.
            self.entrydata.text.append(line)

        else:
            # first or next line of an extract : the prefix is removed.
            inside_extract = True
            current_extract.append( line[len(BODY_PREFIX_BEFORE_EXTRACT):] )

    if inside_extract:
        # we add the last extract :
        self._appendExtractFromLines(
            informationsdata = informationsdata,
            src = current_extract,
            reading_position = reading_position,
            srclanguage = srclanguage)

    return self

def _appendExtractFromLines(self,
                            informationsdata,
                            src,
                            reading_position,
                            srclanguage):
    """
        Entry._appendExtractFromLines()

        Private helper of Entry.initFromStr() : build an Extract from <src>
        and append its data to self.entrydata, unless its freeness value is
        lower than OPTIONS["minimal freeness"]; in that case the extract is
        skipped and an info message is emitted.

        informationsdata    : InformationsData object
        src                 : a list of strings (lines of the extract,
                              without the extract prefix)
        reading_position    : ReadingPosition object
        srclanguage         : passed unchanged to Extract.initFromStr()
    """
    new_extract = Extract( errors = self.errors,
                           logotherasdata = self.logotherasdata )

    new_extract.initFromStr( informationsdata = informationsdata,
                             src = src,
                             reading_position = reading_position,
                             srclanguage = srclanguage)

    extract_freeness = \
        informationsdata.getTheMinFreenessOfAnExtract(new_extract.extractdata)

    if extract_freeness >= logotheras.options.OPTIONS["minimal freeness"]:
        self.entrydata.append( new_extract.extractdata )
    else:
        msg = "skipping an extract due to its freeness value; extract={0}; -> {1}"
        self.errors.info(msg.format(new_extract.extractdata,
                                    reading_position))