def aooo(self, subject=None):
    """Fill title, author, subject and story text from the page held in self.text.

    A page carrying the adult-content warning is only reported with print()
    and left as it is. Otherwise self.title, self.autlink and self.author are
    read from the markup, self.subject is resolved through self.findSubject(),
    and self.text is cut down to the story body with chapter headings
    rewritten as <h2> elements.

    subject is passed unchanged to self.findSubject().
    """
    warning = 'This work could have adult content. If you proceed you have agreed that you are willing to see such content'
    if warning in self.text:
        print('fichier protégé', self.title)
        return
    self.cleanWeb()
    self.replace('<br>', '</p><p>')
    self.text = findTextBetweenTag(self.text, 'body')

    # title: content of the <h2> element, without surrounding blanks or dots
    start = self.index('<h2>') + 4
    end = self.index('</h2>', start)
    self.title = self.text[start:end].strip().strip('.')

    # author: the link target first (9 == len("<a href='")), then the link text
    start = self.index("<a href='/users/", end) + 9
    end = self.index("'", start)
    self.autlink = self.text[start:end]
    start = self.index('>', start) + 1
    end = self.index('</a>', start)
    self.author = self.text[start:end]

    # subject: when it is 'histoire', use the text of the first additional tag
    self.findSubject(subject)
    if self.subject == 'histoire':
        start = self.index('>', self.index('Additional Tags:<ul>') + 24) + 1
        end = self.index('<', start)
        self.subject = self.text[start:end]

    # body of a single-chapter work (19 == len('<h3>Work Text:</h3>'))
    start = self.index('<h3>Work Text:</h3>') + 19
    if start == 18:
        # 18 presumably means self.index() returned -1: multi-chapter work
        start = self.index("<h3><a href='/works/")
    end = self.rindex('<h3>Actions</h3>')
    self.text = self.text[start:end]
    for marker in ('<div>', '</div>', '<h3>Chapter Text</h3>'):
        self.replace(marker)

    # chapter headings that are links: keep the link text as an <h2>
    if self.contain("<h3><a href='/works/"):
        chapters = List()
        chapters.fromText("<h3><a href='/works/", self.text)
        for pos in chapters.range(1):
            piece = chapters[pos]
            chapters[pos] = piece[piece.find('>') + 1:]
        self.text = '<h2>'.join(chapters)
        self.replace('</a></h3>', '</h2>')

    # headings of the form "Chapter N</a>: title</h3>": keep only the title
    if self.contain('<h2>Chapter ') and not self.contain('</h2>'):
        chapters = List()
        chapters.fromText('<h2>Chapter ', self.text)
        for pos in chapters.range(1):
            piece = chapters[pos]
            piece = piece[piece.find('</a>: ') + 6:]
            chapters[pos] = piece.replace('</h3>', '</h2>', 1)
        self.text = '<h2>'.join(chapters)

    # clean-up: drop a notes section that starts in the second half of the text
    if self.contain('<h3>Notes:</h3>'):
        middle = self.length() / 2
        notesAt = self.index('<h3>Notes:</h3>')
        if notesAt > middle:
            self.text = self.text[:notesAt]
    self.usePlaceholders()
Exemplo n.º 2
0
 def ficAoooCommon(self, subject=None):
     """Fill subject, author and story text from the page held in self.text.

     subject: optional extra subject appended to self.subject when it is not
     already part of it. The original body read this name without ever
     defining it, which raised NameError; it is now an optional parameter so
     existing calls without arguments keep working.

     self.subject is extended with 'romance' for the F/M and F/F categories
     and with the first fandom, self.autlink / self.author are read from the
     author link, and self.text is cut down to the story body with chapter
     headings rewritten as <h2> elements.
     """
     # subject: build a ', '-prefixed list, the leading separator is dropped below
     # an unset subject is treated as empty so the 'in' tests below cannot fail
     self.subject = (', ' + self.subject) if self.subject else ''
     if subject and subject not in self.subject:
         self.subject = self.subject + ', ' + subject
     d = self.index('Category:<ul><li><a') + 20
     d = self.index('>', d) + 1
     f = self.index('</a>', d)
     if self.text[d:f] in ('F/M', 'F/F') and 'romance' not in self.subject:
         self.subject = self.subject + ', romance'
     d = self.index('Fandoms:<ul><li><a', f) + 20
     d = self.index('>', d) + 1
     f = self.index('</a>', d)
     if self.text[d:f] not in self.subject:
         self.subject = self.subject + ', ' + self.text[d:f]
     self.subject = self.subject.replace(' - Fandom', "")
     # remove the leading ', '
     self.subject = self.subject[2:]
     # author: 13 == len("<h3><a href='"), the link ends one character before '>'
     d = self.index("<h3><a href='/users/") + 13
     f = self.index('>', d) - 1
     self.autlink = self.text[d:f]
     d = f + 2
     f = self.index('<', d)
     self.author = self.text[d:f]
     self.author = self.author.replace('-', ' ')
     self.author = self.author.replace('_', ' ')
     self.author = self.author.strip()
     self.replace('<h3>Chapter Text</h3>')
     # body of a single-chapter work (19 == len('<h3>Work Text:</h3>'))
     d = self.index('<h3>Work Text:</h3>') + 19
     # 18 presumably means self.index() returned -1: multi-chapter work
     if d == 18: d = self.index("<h3><a href='/works/")
     f = self.rindex('<h3>Actions</h3>')
     self.text = self.text[d:f]
     self.replace('<div>')
     self.replace('</div>')
     # chapter headings that are links: keep the link text as an <h2>
     if self.contain("<h3><a href='/works/"):
         chapters = List()
         chapters.fromText("<h3><a href='/works/", self.text)
         chapterRange = chapters.range(1)
         for c in chapterRange:
             d = chapters[c].find('>') + 1
             chapters[c] = chapters[c][d:]
         self.text = '<h2>'.join(chapters)
         self.replace('</a></h3>', '</h2>')
     # headings of the form "Chapter N</a>: title</h3>": keep only the title
     if self.contain('<h2>Chapter ') and not self.contain('</h2>'):
         chapters = List()
         chapters.fromText('<h2>Chapter ', self.text)
         chapterRange = chapters.range(1)
         for c in chapterRange:
             d = chapters[c].find('</a>: ') + 6
             chapters[c] = chapters[c][d:]
             chapters[c] = chapters[c].replace('</h3>', '</h2>', 1)
         self.text = '<h2>'.join(chapters)
     # clean-up: drop a notes section that starts in the second half of the text
     if self.contain('<h3>Notes:</h3>'):
         halfText = self.length() / 2
         d = self.index('<h3>Notes:</h3>')
         if d > halfText: self.text = self.text[:d]
     self.usePlaceholders()
Exemplo n.º 3
0
	def cleanTags (self):
		"""Strip tag attributes from self.text, then remove the tags that are not in listTagsKeep.

		Opening tags are lower-cased and reduced to their bare name; 'a', 'img',
		'form' and 'input' are delegated to self.cleanTagsSpecial(). Every other
		tag name met is collected, and those missing from listTagsKeep are
		replaced by a blank. Attribute-less <a> elements and empty elements are
		removed at the end.
		"""
		# normalise self-closing tags
		self.replace ('<br/>', '<br>')
		self.replace ('<hr/>', '<hr>')
		# remove useless attributes
		tagList = List()
		textList = List()
		textList.addList (self.text.split ('<'))
		textRange = textList.range (1)
		for t in textRange:
			fragment = textList[t]
			# empty pieces, closing tags, '<!' constructs and '< ' are left untouched
			if not fragment or fragment[0] in '/ !': continue
			# unterminated tag: close it. The original sliced with f here, which was
			# not assigned yet (UnboundLocalError) or left over from a previous tag
			if '>' not in fragment: fragment = fragment +'>'
			f= fragment.find ('>')
			tag = fragment[:f].lower()
			rest = fragment[f:]
			if ' ' in tag:
				f= tag.find (' ')
				attributes = tag[f:]
				tag = tag[:f]
				if tag in ('a', 'img', 'form', 'input'): tag = self.cleanTagsSpecial (tag, attributes)
				elif tag not in tagList: tagList.add (tag)
			elif tag not in tagList: tagList.add (tag)
			textList[t] = tag + rest
		self.text = '<'.join (textList)
		self.replace (' <', '<')
		# remove useless tags
		self.replace ('<img>')
		while '<br><br>' in self.text: self.replace ('<br><br>', '<br>')
		self.replace ('><br>', '>')
		self.replace ('<br><', '<')
		for tag in tagList:
			if tag not in listTagsKeep:
				self.replace ('</'+ tag +'>', " ")
				self.replace ('<'+ tag +'>', " ")
		while self.contain ("  "): self.replace ("  "," ")
		# attribute-less links: keep their text, drop the <a> ... </a> pair
		if self.contain ('<a>'):
			textList = List()
			textList.addList (self.text.split ('<a>'))
			textRange = textList.range (1)
			for a in textRange:
				d= textList[a].find ('</a>')
				# no closing tag: keep the text as it is instead of slicing with -1
				if d <0: textList[a] = textList[a].strip()
				else: textList[a] = textList[a] [:d].strip() +' '+ textList[a] [d+4:].strip()
			self.text = ' '.join (textList)
		# find the empty tags
		self.clean()
		self.replace ('\n')
		for tag in tagList: self.replace ('<'+ tag +'></'+ tag +'>', " ")
		while self.contain ("  "): self.replace ("  "," ")
Exemplo n.º 4
0
 def fromFile(self):
     """Load the file through a temporary FileList and add one List per line.

     Each line read is split with self.sepCol (via List.fromText) and the
     resulting List is handed to self.addLine().
     """
     rawLines = FileList()
     rawLines.copyFile(self)
     rawLines.fromFile()
     for pos in rawLines.range():
         columns = List()
         columns.fromText(self.sepCol, rawLines[pos])
         self.addLine(columns)
Exemplo n.º 5
0
	def delImgLink (self):
		"""Remove <div> wrappers, links and images from self.text.

		Link text is kept (only the '<a href=...>' opening tag and the '</a>'
		closing tag go away); an '<img src=...>' tag is removed up to its '>'.
		"""
		for divTag in ('</div>', '<div>'):
			self.text = self.text.replace (divTag, "")
		# links first, then images; only links have a closing tag to remove
		for opening, closing in (('<a href=', '</a>'), ('<img src=', None)):
			if not self.contain (opening): continue
			pieces = List()
			pieces.addList (self.text.split (opening))
			for pos in pieces.range (1):
				# drop everything up to and including the end of the opening tag
				cut = pieces [pos].find ('>') +1
				pieces [pos] = pieces [pos] [cut:]
			self.text = "".join (pieces)
			if closing is not None: self.text = self.text.replace (closing, "")
Exemplo n.º 6
0
 def ficVg(self):
     """Prepare a 'vide-grenier' page held in self.text.

     Sets fixed subject and author values, crops self.text to the span between
     '<h2>' and '<span>Voir les prochaines', simplifies nested span markup and
     splits the result on '<h3>'.

     NOTE(review): the rest of this method is not visible in this excerpt, so
     how textList / textRange are used afterwards cannot be described here.
     """
     # fixed metadata for this source
     self.subject = 'sortie'
     self.author = 'vide-grenier'
     # keep the text from '<h2>' up to the '<span>Voir les prochaines' marker
     d = self.index('<h2>')
     f = self.rindex('<span>Voir les prochaines')
     self.text = self.text[d:f]
     # (old, new) pairs: unwrap spans inside headings and collapse doubled spans
     toReplace = (('<h2><span>', '<h2>'), ('</span></h2>', '</h2>'),
                  ('<span><span>', '<span>'), ('</span></span>', '</span>'))
     for i, j in toReplace:
         self.replace(i, j)
     # split the text on '<h3>'; the range starts at 1
     textList = List()
     textList.fromText('<h3>', self.text)
     textRange = textList.range(1)
     """
Exemplo n.º 7
0
	def cleanSpan (self):
		"""Remove span elements and links nested in attribute values from self.text, then tidy blanks.

		Spans lose their attributes and are then replaced by a blank; a link
		that sits inside a quoted attribute value ('="<a ...</a>"') is cut out;
		finally blanks around angle brackets are removed and the text stripped.
		"""
		# remove the superfluous spans
		for old, new in (('<SPAN', '<span'), ('</SPAN', '</span')):
			self.replace (old, new)
		fragments = List()
		fragments.addList (self.text.split ('<span '))
		positions = fragments.range (1)
		positions.reverse()
		for pos in positions:
			# keep the fragment from the '>' that closes the opening tag
			fragments[pos] = fragments[pos] [fragments[pos].find ('>'):]
		self.text = '<span'.join (fragments)
		spanFixes = (
			('</span>=', '='),
			('<</span>', '<'),
			('<span>>', '>'),
			('<span>', ' '),
			('</span>', ' '),
		)
		for old, new in spanFixes:
			self.replace (old, new)
		# remove the superfluous links
		fragments = List()
		fragments.addList (self.text.split ('="<a '))
		positions = fragments.range (1)
		positions.reverse()
		for pos in positions:
			# skip past '</a>' so the closing quote is kept
			cut = fragments[pos].find ('</a>"') +4
			fragments[pos] = fragments[pos] [cut:]
		self.text = '="'.join (fragments)
		self.replace (' href=""')
		# tidy up
		while self.contain ('  '):
			self.replace ('  ', ' ')
		blankFixes = (
			(' >', '>'),
			('< ', '<'),
			('> ', '>'),
			(' <', '<'),
			('</ ', '</'),
			(' />', '/>'),
		)
		for old, new in blankFixes:
			self.replace (old, new)
		self.text = self.text.strip()
def seLoger(self):
    """Rebuild self.text from the listings of a seloger.com result page.

    Fixed metadata is set, the page is loaded with self.fileFromData(), the
    listing area is cropped and cleaned, and every piece obtained by splitting
    on 'Appartement<ul>' is passed to seLogerUnite(); the non-empty results
    are concatenated into self.text.
    """
    # https://www.seloger.com/list.htm?projects=2,5&types=1&natures=1,2,4&places=[{%22subDivisions%22:[%2275%22]}]&price=NaN/200000&surface=30/NaN&enterprise=0&qsVersion=1.0&m=search_refine
    self.subject = 'immobilier'
    self.author = 'se loger'
    self.autlink = 'https://www.seloger.com/'
    self.title = 'se-loger'
    self.fileFromData()
    self.styles = []
    self.metas = {}
    # crop from the '<ul' that follows the title block to the last '</div>'
    start = self.index('<ul', self.index('<div data-test="sl.title'))
    end = self.rindex('</div>')
    self.text = self.text[start:end]
    self.cleanWeb()
    self.text = self.text[4:]
    listings = List()
    listings.fromText('Appartement<ul>', self.text)
    positions = listings.range()
    self.text = ""
    for pos in positions:
        rendered = seLogerUnite(listings[pos])
        if not rendered:
            continue
        self.text += rendered
Exemplo n.º 9
0
	def cleanWeb (self):
		"""Clean the HTML held in self.text: drop comments, simplify tags, keep the body.

		Steps, in order: self.clean(), normalisation of <br/> and <hr/>, removal
		of <!-- ... --> comments, self.cleanTags(), reduction to the content of
		<body> when a '</body>' is present, removal of newlines and tabs, and
		removal of empty elements for every tag in listTags.
		"""
		self.clean()
		self.replace ('<br/>', '<br>')
		self.replace ('<hr/>', '<hr>')
		# remove the comments
		self.replace ('< ! --', '<!--')
		self.replace ('< !--', '<!--')
		if self.contain ('<!--'):
			textList = List()
			textList.addList (self.text.split ('<!--'))
			textRange = textList.range (1)
			for t in textRange:
				# NOTE(review): when '-->' is missing, find() gives -1 and only two characters are dropped
				f= textList[t].find ('-->') +3
				textList[t] = textList[t] [f:]
			# join once, after the loop; the original re-joined on every iteration
			self.text = "".join (textList)
		# remove some tags
		# self.cleanSpan()
		self.cleanTags()
		if self.contain ('</body>'): self.text = findTextBetweenTag (self.text, 'body')
		self.replace ('\n')
		self.replace ('\t')
		self.clean()
		# remove empty elements, repeating while removals expose new ones
		for tag in listTags:
			emptyElement = '<'+tag+'></'+tag+'>'
			while self.contain (emptyElement): self.replace (emptyElement,"")