def ficAoooCommon(self, subject=None):
    """Fill subject, author link, author and story text from the page in self.text.

    The markers searched for ('Work Text:', '/works/', '/users/') suggest an
    Archive of Our Own export -- TODO confirm.

    Args:
        subject: optional extra subject appended to self.subject. The name was
            previously read without being defined in this scope; it is now an
            optional parameter so existing no-argument calls keep working.
    """
    # Subject. Every entry is stored as ', <entry>' so the leading separator
    # can be removed in one slice at the end.
    if self.subject:
        self.subject = ', ' + self.subject
    if subject and subject not in self.subject:
        self.subject = self.subject + ', ' + subject

    # First category link: skip the marker, then the rest of the <a ...> tag.
    start = self.index('Category:<ul><li><a') + 20
    start = self.index('>', start) + 1
    end = self.index('</a>', start)
    if self.text[start:end] in ('F/M', 'F/F') and 'romance' not in self.subject:
        self.subject = self.subject + ', romance'

    # First fandom link, searched after the category link.
    start = self.index('Fandoms:<ul><li><a', end) + 20
    start = self.index('>', start) + 1
    end = self.index('</a>', start)
    fandom = self.text[start:end]
    if fandom not in self.subject:
        self.subject = self.subject + ', ' + fandom
    self.subject = self.subject.replace(' - Fandom', "")
    self.subject = self.subject[2:]  # drop the leading ', '

    # Author: +13 lands on the '/users/...' value of the href attribute.
    start = self.index("<h3><a href='/users/") + 13
    end = self.index('>', start) - 1  # stop before the closing quote
    self.autlink = self.text[start:end]
    start = end + 2  # first character after the '>'
    end = self.index('<', start)
    self.author = self.text[start:end]
    self.author = self.author.replace('-', ' ')
    self.author = self.author.replace('_', ' ')
    self.author = self.author.strip()

    self.replace('<h3>Chapter Text</h3>')

    # Single-chapter work: the text starts right after the 19-character header.
    start = self.index('<h3>Work Text:</h3>') + 19
    # Multi-chapter work: 18 means self.index() gave -1 (header not found;
    # presumably find-like semantics -- TODO confirm), so start at the first
    # chapter heading instead.
    if start == 18:
        start = self.index("<h3><a href='/works/")
    end = self.rindex('<h3>Actions</h3>')
    self.text = self.text[start:end]
    self.replace('<div>')
    self.replace('</div>')

    # Turn linked chapter headings into plain <h2> headings.
    if self.contain("<h3><a href='/works/"):
        chapters = List()
        chapters.fromText("<h3><a href='/works/", self.text)
        for c in chapters.range(1):
            cut = chapters[c].find('>') + 1  # drop the rest of the <a ...> tag
            chapters[c] = chapters[c][cut:]
        self.text = '<h2>'.join(chapters)
        self.replace('</a></h3>', '</h2>')

    # Headings of the form 'Chapter N</a>: title</h3>' are still unclosed here.
    if self.contain('<h2>Chapter ') and not self.contain('</h2>'):
        chapters = List()
        chapters.fromText('<h2>Chapter ', self.text)
        for c in chapters.range(1):
            cut = chapters[c].find('</a>: ') + 6
            chapters[c] = chapters[c][cut:]
            chapters[c] = chapters[c].replace('</h3>', '</h2>', 1)
        self.text = '<h2>'.join(chapters)

    # Clean up: drop a notes section only when it sits in the second half.
    if self.contain('<h3>Notes:</h3>'):
        halfText = self.length() / 2
        notesAt = self.index('<h3>Notes:</h3>')
        if notesAt > halfText:
            self.text = self.text[:notesAt]
    self.usePlaceholders()
def aooo(self, subject=None):
    """Extract title, author, subject and story text from the page in self.text.

    Returns early, after printing a message, when the page is the
    adult-content warning instead of the work itself.

    Args:
        subject: forwarded to self.findSubject().
    """
    if 'This work could have adult content. If you proceed you have agreed that you are willing to see such content' in self.text:
        print('fichier protégé', self.title)
        return

    self.cleanWeb()
    self.replace('<br>', '</p><p>')
    self.text = findTextBetweenTag(self.text, 'body')

    # Title: content of the first <h2>, without surrounding blanks or dots.
    titleStart = self.index('<h2>') + 4
    titleEnd = self.index('</h2>', titleStart)
    self.title = self.text[titleStart:titleEnd].strip().strip('.')

    # Author and author page: +9 lands on the '/users/...' href value.
    linkStart = self.index("<a href='/users/", titleEnd) + 9
    linkEnd = self.index("'", linkStart)
    self.autlink = self.text[linkStart:linkEnd]
    nameStart = self.index('>', linkStart) + 1
    nameEnd = self.index('</a>', nameStart)
    self.author = self.text[nameStart:nameEnd]

    # Subject: fall back on the first additional tag when only the
    # 'histoire' value was found.
    self.findSubject(subject)
    if self.subject == 'histoire':
        tagStart = self.index('Additional Tags:<ul>') + 24
        tagStart = self.index('>', tagStart) + 1
        tagEnd = self.index('<', tagStart)
        self.subject = self.text[tagStart:tagEnd]

    # Story text. A single-chapter work starts after the 19-character
    # 'Work Text' header; when self.index() reports -1 the work has several
    # chapters and starts at the first chapter heading.
    bodyStart = self.index('<h3>Work Text:</h3>')
    if bodyStart == -1:
        bodyStart = self.index("<h3><a href='/works/")
    else:
        bodyStart += 19
    bodyEnd = self.rindex('<h3>Actions</h3>')
    self.text = self.text[bodyStart:bodyEnd]
    for leftover in ('<div>', '</div>', '<h3>Chapter Text</h3>'):
        self.replace(leftover)

    # Linked chapter headings become plain <h2> headings.
    if self.contain("<h3><a href='/works/"):
        chapters = List()
        chapters.fromText("<h3><a href='/works/", self.text)
        for pos in chapters.range(1):
            piece = chapters[pos]
            chapters[pos] = piece[piece.find('>') + 1:]
        self.text = '<h2>'.join(chapters)
        self.replace('</a></h3>', '</h2>')

    # Headings of the form 'Chapter N</a>: title</h3>' are still unclosed.
    if self.contain('<h2>Chapter ') and not self.contain('</h2>'):
        chapters = List()
        chapters.fromText('<h2>Chapter ', self.text)
        for pos in chapters.range(1):
            piece = chapters[pos]
            piece = piece[piece.find('</a>: ') + 6:]
            chapters[pos] = piece.replace('</h3>', '</h2>', 1)
        self.text = '<h2>'.join(chapters)

    # Clean up: a notes section located in the second half is cut off.
    notesHeader = '<h3>Notes:</h3>'
    if self.contain(notesHeader):
        midpoint = self.length() / 2
        notesAt = self.index(notesHeader)
        if notesAt > midpoint:
            self.text = self.text[:notesAt]
    self.usePlaceholders()
def cleanTags(self):
    """Strip tag attributes and remove unwanted tags from self.text.

    Opening tags lose their attributes, except 'a', 'img', 'form' and
    'input', which are rebuilt by self.cleanTagsSpecial(). Tags that are not
    in listTagsKeep are replaced by a space, bare <a>...</a> pairs are
    unwrapped, and empty '<tag></tag>' pairs are removed.
    """
    # Remove useless attributes.
    self.replace('<br/>', '<br>')
    self.replace('<hr/>', '<hr>')
    tagList = List()
    textList = List()
    textList.addList(self.text.split('<'))
    for t in textList.range(1):
        if len(textList[t]) == 0:
            continue
        elif textList[t][0] in '/ !':
            # Closing tag, comment/doctype, or a '<' followed by a blank.
            continue
        elif '>' not in textList[t]:
            # Unclosed tag: close it in place. The previous code sliced with
            # `f` here, before `f` was assigned for this segment.
            textList[t] = textList[t] + '>'
        f = textList[t].find('>')
        tag = textList[t][:f].lower()
        textList[t] = textList[t][f:]
        if ' ' in tag:
            f = tag.find(' ')
            attributes = tag[f:]
            tag = tag[:f]
            if tag in ('a', 'img', 'form', 'input'):
                tag = self.cleanTagsSpecial(tag, attributes)
            elif tag not in tagList:
                tagList.add(tag)
        elif tag not in tagList:
            tagList.add(tag)
        textList[t] = tag + textList[t]
    self.text = '<'.join(textList)
    self.replace(' <', '<')

    # Remove useless tags.
    self.replace('<img>')
    while '<br><br>' in self.text:
        self.replace('<br><br>', '<br>')
    self.replace('><br>', '>')
    self.replace('<br><', '<')
    for tag in tagList:
        if tag not in listTagsKeep:
            self.replace('</' + tag + '>', " ")
            self.replace('<' + tag + '>', " ")
    # Collapse runs of blanks. The pattern must be TWO spaces: replacing a
    # single space by a single space never terminates.
    while self.contain("  "):
        self.replace("  ", " ")

    # Unwrap <a> tags that ended up without attributes.
    if self.contain('<a>'):
        textList = List()
        textList.addList(self.text.split('<a>'))
        for a in textList.range(1):
            d = textList[a].find('</a>')
            textList[a] = textList[a][:d].strip() + ' ' + textList[a][d + 4:].strip()
        self.text = ' '.join(textList)

    # Find and remove empty tags.
    self.clean()
    self.replace('\n')
    for tag in tagList:
        self.replace('<' + tag + '></' + tag + '>', " ")
    while self.contain("  "):  # two spaces, see above
        self.replace("  ", " ")
def fromFile(self):
    """Load the file line by line and add each line, split into columns.

    A temporary FileList is set up from this object with copyFile() and
    read with its own fromFile(). Each of its entries is then turned into a
    List via fromText() using self.sepCol and handed to self.addLine().
    """
    source = FileList()
    source.copyFile(self)
    source.fromFile()
    for lineIndex in source.range():
        columns = List()
        columns.fromText(self.sepCol, source[lineIndex])
        self.addLine(columns)
def delImgLink(self):
    """Remove <div> wrappers, hyperlinks and images from self.text.

    The text inside a link is kept; only the '<a href=...>' opening tag and
    the '</a>' closing tag are dropped. '<img src=...>' tags are dropped
    entirely.
    """
    for divTag in ('</div>', '<div>'):
        self.text = self.text.replace(divTag, "")

    def withoutOpeningTags(marker):
        # Split on the start of the tag and drop everything up to and
        # including the first '>' of each following piece.
        pieces = List()
        pieces.addList(self.text.split(marker))
        for pos in pieces.range(1):
            cut = pieces[pos].find('>') + 1
            pieces[pos] = pieces[pos][cut:]
        return "".join(pieces)

    # Remove the links.
    if self.contain('<a href='):
        self.text = withoutOpeningTags('<a href=')
        self.text = self.text.replace('</a>', "")

    # Remove the images.
    if self.contain('<img src='):
        self.text = withoutOpeningTags('<img src=')
def ficVg(self):
    # Prepare the page held in self.text: set fixed metadata, keep the part
    # between the first '<h2>' and the last '<span>Voir les prochaines',
    # simplify nested <span> markup, then split the result on '<h3>'.
    self.subject = 'sortie'
    self.author = 'vide-grenier'
    # Keep only the region of interest.
    d = self.index('<h2>')
    f = self.rindex('<span>Voir les prochaines')
    self.text = self.text[d:f]
    # (pattern, replacement) pairs that flatten redundant <span> wrappers.
    toReplace = (('<h2><span>', '<h2>'), ('</span></h2>', '</h2>'), ('<span><span>', '<span>'), ('</span></span>', '</span>'))
    for i, j in toReplace:
        self.replace(i, j)
    # One entry per '<h3>' section; the range starts at 1, which presumably
    # skips the text before the first '<h3>' -- TODO confirm.
    textList = List()
    textList.fromText('<h3>', self.text)
    textRange = textList.range(1)
    # NOTE(review): the triple quote below opens a string whose end is not
    # visible in this view; the rest of the function could not be reviewed.
    """
def cleanSpan(self):
    """Remove <span> markup and links embedded in attribute values from self.text.

    Attributes are stripped from every '<span ...>' tag, remaining span tags
    are replaced by a space, '="<a ...</a>"' attribute values are emptied,
    and blanks around tag delimiters are removed.
    """
    # Remove superfluous spans.
    self.replace('<SPAN', '<span')
    self.replace('</SPAN', '</span')
    textList = List()
    textList.addList(self.text.split('<span '))
    textRange = textList.range(1)
    textRange.reverse()
    for t in textRange:
        f = textList[t].find('>')
        textList[t] = textList[t][f:]  # keep the '>' so that '<span>' is rebuilt
    self.text = '<span'.join(textList)
    self.replace('</span>=', '=')
    self.replace('<</span>', '<')
    self.replace('<span>>', '>')
    self.replace('<span>', ' ')
    self.replace('</span>', ' ')

    # Remove superfluous links: an attribute value that is itself an <a> tag.
    textList = List()
    textList.addList(self.text.split('="<a '))
    textRange = textList.range(1)
    textRange.reverse()
    for t in textRange:
        f = textList[t].find('</a>"') + 4  # resume on the closing quote
        textList[t] = textList[t][f:]
    self.text = '="'.join(textList)
    self.replace(' href=""')

    # Clean up. The pattern must be TWO spaces: replacing a single space by
    # a single space never terminates.
    while self.contain('  '):
        self.replace('  ', ' ')
    self.replace(' >', '>')
    self.replace('< ', '<')
    self.replace('> ', '>')
    self.replace(' <', '<')
    self.replace('</ ', '</')
    self.replace(' />', '/>')
    self.text = self.text.strip()
def seLoger(self):
    """Rebuild self.text from the listings of a seloger.com results page.

    Fixed metadata is set, the results list is isolated and cleaned, and
    each 'Appartement<ul>' entry is passed to seLogerUnite(); the non-empty
    results are concatenated into self.text.
    """
    # https://www.seloger.com/list.htm?projects=2,5&types=1&natures=1,2,4&places=[{%22subDivisions%22:[%2275%22]}]&price=NaN/200000&surface=30/NaN&enterprise=0&qsVersion=1.0&m=search_refine
    self.subject = 'immobilier'
    self.author = 'se loger'
    self.autlink = 'https://www.seloger.com/'
    self.title = 'se-loger'
    self.fileFromData()
    self.styles = []
    self.metas = {}

    # Keep the first <ul> after the title block, up to the last </div>.
    titleAt = self.index('<div data-test="sl.title')
    listStart = self.index('<ul', titleAt)
    listEnd = self.rindex('</div>')
    self.text = self.text[listStart:listEnd]
    self.cleanWeb()
    self.text = self.text[4:]

    listings = List()
    listings.fromText('Appartement<ul>', self.text)
    positions = listings.range()
    self.text = ""
    for pos in positions:
        rendered = seLogerUnite(listings[pos])
        if rendered:
            self.text += rendered
def cleanWeb(self):
    """Clean the HTML held in self.text.

    Normalises <br/> and <hr/>, removes HTML comments, runs cleanTags(),
    keeps only the <body> content when a closing body tag is present, drops
    newlines and tabs, and removes empty '<tag></tag>' pairs for every tag
    in listTags.
    """
    self.clean()
    for selfClosing, plain in (('<br/>', '<br>'), ('<hr/>', '<hr>')):
        self.replace(selfClosing, plain)

    # Remove the comments, after normalising spaced-out openers.
    for spacedOpener in ('< ! --', '< !--'):
        self.replace(spacedOpener, '<!--')
    segments = List()
    segments.addList(self.text.split('<!--'))
    for pos in segments.range(1):
        resumeAt = segments[pos].find('-->') + 3  # first character after '-->'
        segments[pos] = segments[pos][resumeAt:]
    self.text = "".join(segments)

    # Erase certain tags.
    # self.cleanSpan()
    self.cleanTags()
    if self.contain('</body>'):
        self.text = findTextBetweenTag(self.text, 'body')
    for blank in ('\n', '\t'):
        self.replace(blank)
    self.clean()

    # Removing one empty pair can expose another, hence the inner loop.
    for tag in listTags:
        while self.contain('<' + tag + '></' + tag + '>'):
            self.replace('<' + tag + '></' + tag + '>', "")
def __init__(self, path='b/'):
    """Initialise the underlying List and store the path.

    Args:
        path: a non-empty value is passed through fs.shortcut() before being
            stored in self.path; an empty value is stored unchanged.
    """
    List.__init__(self)
    self.path = fs.shortcut(path) if path else path
def fromText(self):
    """Reset self.list and refill it from self.text using self.sepLin.

    The work is delegated to List.fromText(), which receives the line
    separator and the text (presumably splitting the text on the separator
    -- TODO confirm against List.fromText).
    """
    # Start from an empty list so earlier content is not kept.
    self.list = []
    List.fromText(self, self.sepLin, self.text)
def __init__(self, sepLin='\n', file=None):
    """Initialise both bases (File, then List) and store the line separator.

    Args:
        sepLin: line separator stored in self.sepLin; defaults to a newline.
        file: forwarded unchanged to File.__init__().
    """
    File.__init__(self, file)
    List.__init__(self)
    self.sepLin = sepLin