def extractClassFeature(name, liste, section, baseURL):
    """Split a run of page elements into class features appended to ``liste``.

    Every element of ``section`` whose ``name`` is ``'h3'`` starts a new
    feature. The elements that follow it, up to the next ``h3``, are converted
    with ``html2text`` and concatenated to form that feature's description.
    Elements met before the first ``h3`` are still converted, but their text
    is discarded.

    Args:
        name: Prefix of every feature name; the stored value is
            ``name + ": " + cleanSectionName(<h3 text>)``.
        liste: List that receives one dict per feature (mutated in place).
        section: Iterable of parsed elements exposing ``name``, ``text`` and
            ``find`` (presumably BeautifulSoup nodes -- confirm against the
            caller).
        baseURL: String prepended to the ``href`` of the first ``<a>`` found
            in the heading to build the ``'Référence'`` entry.

    Returns:
        None. Nothing is appended when ``section`` contains no ``h3``.
    """

    def finish(feature, parts):
        # The level is extracted from the cleaned description, so the
        # description has to be stored first. The meaning of 150 is defined
        # by extractLevel, which is not visible here.
        feature['Description'] = cleanInlineDescription("".join(parts))
        feature['Niveau'] = extractLevel(feature['Description'], 150)
        liste.append(feature)

    classfeature = None
    parts = []
    for s in section:
        if s.name != 'h3':
            parts.append(html2text(s))
            continue

        # A new heading closes the feature that was being collected.
        if classfeature is not None:
            finish(classfeature, parts)
        parts = []

        # NOTE(review): s.find('a') yields None for a heading without a link,
        # in which case the subscript raises; behaviour kept as in the
        # original.
        classfeature = {
            'Auto': False,
            'Nom': name + ": " + cleanSectionName(s.text),
            'Classe': "Prêtre combattant",
            'Source': 'MJRA',
            'Référence': baseURL + s.find('a')['href'],
        }

    # Flush the last feature, if any heading was seen at all.
    if classfeature is not None:
        finish(classfeature, parts)
# Weapon extraction loop (excerpt: the innermost row loop continues past the
# end of this chunk).
# Visible steps, in statement order:
#   * for each URL, parse the page <body> with BeautifulSoup/lxml -- from the
#     local file MOCK_W when that name is truthy, otherwise from the download;
#   * collect the <h2 class="separator"> headings; headings whose text starts
#     with "Accès rapide" are skipped, the others are cleaned with
#     cleanSectionName, "Les armes" is replaced by "Arme", every "s" character
#     is removed, and the result is appended to type1;
#   * collect the <table class="tablo"> tables and walk their <tr> rows,
#     reading the <td> cells of each row into cols.
# NOTE(review): line breaks and indentation were lost in this file, so the
# nesting (in particular whether the heading/table steps sit inside the
# `for URL` loop) must be restored from the upstream script before this runs.
# NOTE(review): the handle returned by open(MOCK_W) is never closed.
# NOTE(review): .replace("s","") strips every "s", not only a trailing plural
# marker -- confirm this is intended.
for URL in URLS: if MOCK_W: content = BeautifulSoup(open(MOCK_W),features="lxml").body else: content = BeautifulSoup(urllib.request.urlopen(URL).read(),features="lxml").body sections = content.find_all('h2',{'class':'separator'}) type1 = [] # extract the types for s in sections: if s.text.startswith("Accès rapide"): continue section = cleanSectionName(s.text) section = section.replace("Les armes", "Arme").replace("s","") type1.append(section) tables = content.find_all('table',{'class':'tablo'}) weapon = {} type2 = "" print("Extraction des armes...") idx = 0 for t in tables: rows = t.find_all('tr') for r in rows: cols = r.find_all('td')
# Ninja trick ("astuce") extraction (excerpt: the loop body continues past the
# end of this chunk; no description accumulation or final flush is visible).
# Visible steps, in statement order:
#   * jumpTo(...) is asked for the elements located from the
#     <h2 class="separator"> heading "Description des astuces de ninja";
#   * LVL starts at 2 and becomes 10 once an <h2> whose text contains
#     "Description des astuces de maître" is met;
#   * for every <table>, each child of each <td> is inspected; an <h3> yields
#     a cleaned name and a reference (URL + href of the next <a>);
#   * when newObj is set, the trick being collected receives 'Classe' = 'Ninja'
#     and the stripped description, and is appended to liste; a new dict is
#     started with the current LVL;
#   * the trick is then given 'Nom' ("Astuce: " + name), 'Source' and
#     'Référence'.
# NOTE(review): line breaks and indentation were lost in this file; whether the
# `astuce = {...}` / `descr = ""` resets belong to the `if newObj:` body cannot
# be told from here -- restore from the upstream script.
# NOTE(review): `source` is only ever assigned 'AG' in the visible code, so the
# reassignment after use looks redundant -- confirm against the rest of the loop.
section = jumpTo(content, 'h2', {'class': 'separator'}, "Description des astuces de ninja") LVL = 2 astuce = {'Niveau': LVL} newObj = False descr = "" source = 'AG' for s in section: if s.name == 'h2' and "Description des astuces de maître" in s.text: LVL = 10 elif s.name == "table": for td in s.find_all('td'): for el in td.children: if el.name == "h3": nom = cleanSectionName(el.text) reference = URL + el.find_next("a")['href'] if newObj: astuce['Classe'] = 'Ninja' astuce['Description'] = descr.strip() liste.append(astuce) astuce = {'Niveau': LVL} descr = "" astuce['Nom'] = u"Astuce: " + nom astuce['Source'] = source astuce['Référence'] = reference source = "AG" newObj = True
# Collect the wiki URL of every navigation link whose text starts with
# "Bénédiction".
URLS = []
for el in navigation.find_all('a'):
    if el.text.startswith("Bénédiction"):
        URLS.append("https://www.pathfinder-fr.org/Wiki/" + el['href'])

# Scrape each page and let extractClassFeature append its features to liste.
liste = []
for u in URLS:
    # When MOCK_BENE is set, the page is read from that local file instead of
    # being downloaded. Both sources are closed as soon as they are parsed
    # (the original left the file handle and the HTTP response open).
    if MOCK_BENE:
        with open(MOCK_BENE) as page:
            content = BeautifulSoup(page, features="lxml").body
    else:
        with urllib.request.urlopen(u) as response:
            content = BeautifulSoup(response.read(), features="lxml").body

    name = cleanSectionName(content.find('h1', {'class': 'pagetitle'}).text)
    print(" - " + name)

    # The features are the siblings that follow the navigation menu block.
    section = content.find('div', {'class': 'presentation navmenu'}).next_siblings
    extractClassFeature(name, liste, section, u)

    # In mock mode every URL would parse the same file, so stop after one page.
    if MOCK_BENE:
        break

print("Fusion avec fichier YAML existant...")

HEADER = ""
# Arcanist exploit ("exploitation") extraction (excerpt: `level`, `s`, `source`
# and `liste` are bound outside this chunk, and the final flush announced by
# the trailing "last element" marker is not visible).
# Visible steps, in statement order:
#   * level is remapped: 0 becomes 2, any other value becomes 11;
#   * the children of s are scanned; an <h3> closes the exploit being collected
#     (when newObj is set: 'Classe' = 'Arcaniste', stripped description,
#     append to liste) and names the next one ("Exploitation: " + cleaned
#     heading text, reference = URL + href of the next <a>);
#   * <br> elements are counted; on the second one the description collected
#     so far is dropped if the dict has a 'Prérequis' key;
#   * a <div> without a class attribute aborts the script with exit(1);
#   * any other element is appended to the description via html2text(e, False).
# NOTE(review): line breaks and indentation were lost in this file; restore the
# nesting from the upstream script before running.
# NOTE(review): the first dict takes 'Source' from the variable `source`, the
# later ones use the literal 'MJ' -- confirm this difference is intended.
# NOTE(review): nothing visible stores a 'Prérequis' key, so the <br> rule may
# never fire from this excerpt alone.
level = 2 if level == 0 else 11 exploitation = {'Source':source,'Niveau':level} newObj = False brCount = 0 descr = "" for e in s.children: if e.name == 'h3': if newObj: exploitation['Classe'] = 'Arcaniste' exploitation['Description'] = descr.strip() liste.append(exploitation) exploitation = {'Source':'MJ','Niveau':level} brCount = 0 descr = "" exploitation['Nom'] = "Exploitation: " + cleanSectionName(e.text) exploitation['Référence'] = URL + e.find_next("a")['href'] newObj = True elif e.name == 'br': brCount+=1 if(brCount==2 and u'Prérequis' in exploitation): descr = "" elif e.name == 'div' and not e.has_attr('class'): exit(1) else: descr += html2text(e, False) ## last element
# Description merge pass (excerpt: it opens with the body of a guard whose
# condition is outside this chunk and ends on an `if` header whose body is
# outside this chunk).
# Visible steps, in statement order:
#   * report "No descriptions found for <category>" and exit(1) -- presumably
#     only when the enclosing, invisible condition holds;
#   * scan `section`: each <h3> first hands the entry collected so far to
#     addInfos(liste, name, descr, sourceNext) (skipped while newObj is still
#     True, i.e. before the first heading), then starts a new entry with the
#     cleaned heading text; other elements are appended to descr through
#     html2text, and <div>/<img> elements are passed to extractSource, whose
#     truthy result is remembered in `source`;
#   * after the loop the last entry is handed to addInfos;
#   * finally liste is scanned for entries that have a falsy 'Complete' value.
# NOTE(review): line breaks and indentation were lost in this file; in
# particular, whether `sourceNext = source` belongs to the `if not newObj:`
# body cannot be told from here -- restore from the upstream script.
# NOTE(review): `e.name == 'h3'` is tested twice in a row; the two tests look
# mergeable once the nesting is confirmed.
print("No descriptions found for %s" % data["category"]) exit(1) newObj = True name = "" descr = "" source = None sourceNext = None for e in section: if e.name == 'h3': if not newObj: addInfos(liste, name, descr, sourceNext) sourceNext = source if e.name == 'h3': name = cleanSectionName(e.text) descr = "" source = None newObj = False else: descr += html2text(e) if e.name == 'div' or e.name == 'img': src = extractSource(e) if src: source = src addInfos(liste, name, descr, sourceNext) for l in liste: if 'Complete' in l and not l['Complete']:
# Rogue talent extraction (excerpt: `level`, `s` and `liste` are bound outside
# this chunk, and the final flush announced by the trailing "last element"
# marker is not visible).
# Visible steps, in statement order:
#   * level is remapped: 0 becomes 2, any other value becomes 10;
#   * the children of s are scanned; an <h3> closes the talent being collected
#     (when newObj is set: 'Classe' = 'Roublard', stripped description, append
#     to liste) and names the next one ("Talent: " + cleaned heading text,
#     reference = URL + href of the next <a>);
#   * <br> elements are counted; on the second one the description collected
#     so far is dropped if the dict has a 'Prérequis' key;
#   * any other element is appended to the description via html2text(e, False);
#   * <div> and <a> elements are passed to extractSource, and a non-None
#     result replaces the talent's default 'Source' of 'MJ'.
# NOTE(review): line breaks and indentation were lost in this file; whether the
# extractSource check sits inside the final `else:` cannot be told from here --
# restore from the upstream script.
# NOTE(review): nothing visible stores a 'Prérequis' key, so the <br> rule may
# never fire from this excerpt alone.
level = 2 if level == 0 else 10 talent = {'Source': 'MJ', 'Niveau': level} newObj = False brCount = 0 descr = "" for e in s.children: if e.name == 'h3': if newObj: talent['Classe'] = 'Roublard' talent['Description'] = descr.strip() liste.append(talent) talent = {'Source': 'MJ', 'Niveau': level} brCount = 0 descr = "" talent['Nom'] = "Talent: " + cleanSectionName(e.text) talent['Référence'] = URL + e.find_next("a")['href'] newObj = True elif e.name == 'br': brCount += 1 if (brCount == 2 and u'Prérequis' in talent): descr = "" else: descr += html2text(e, False) if e.name == 'div' or e.name == 'a': src = extractSource(e) if not src is None: talent['Source'] = src ## last element
# Archetype class-feature extraction (excerpt: `content`, `newObj`, `descr`,
# `classe`, `nom`, `source`, `pageURL` and `classfeatures` are bound outside
# this chunk, and the final flush after the "last element" marker is cut off
# after its first two assignments).
# Visible steps, in statement order:
#   * the children of `content` are scanned; an <h3> closes the feature being
#     collected (when newObj is set: class, archetype, source, description
#     cleaned with cleanDescription, level from extractLevel(description, 30),
#     append to classfeatures) and a new dict with 'Source' 'MJ', 'Niveau' 1
#     and 'Auto' True is started;
#   * the heading text is cleaned, one trailing "." is removed and the result
#     goes through cleanName to become 'Nom';
#   * each <a> child of the heading sets 'Référence' to pageURL + href, so the
#     last link wins when there are several;
#   * non-heading elements are appended to the description through html2text.
# NOTE(review): line breaks and indentation were lost in this file; restore the
# nesting from the upstream script before running.
# NOTE(review): `featureName[0] + featureName[1:]` rebuilds the same string and
# raises IndexError on an empty name -- possibly a leftover of a
# capitalisation step; confirm.
for s in content.children: if s.name == 'h3': if newObj: classfeature['Classe'] = classe classfeature['Archétype'] = nom classfeature['Source'] = source classfeature['Description'] = cleanDescription(descr) classfeature['Niveau'] = extractLevel( classfeature['Description'], 30) classfeatures.append(classfeature) classfeature = {'Source': 'MJ', 'Niveau': 1, 'Auto': True} brCount = 0 descr = "" featureName = cleanSectionName(s.text) if featureName.endswith('.'): featureName = featureName[:-1] classfeature['Nom'] = cleanName(featureName[0] + featureName[1:]) newObj = True for e in s.children: if e.name == 'a': classfeature['Référence'] = pageURL + e['href'] else: descr += html2text(s) ## last element classfeature['Classe'] = classe classfeature['Archétype'] = nom
# Barbarian rage power extraction (excerpt: `s`, `rage`, `newObj`, `descr`,
# `source`, `sourceNext`, `brCount`, `URL` and `liste` are bound outside this
# chunk; the loop may continue past the end of it).
# Visible steps, in statement order:
#   * an <h3> closes the power being collected (when newObj is set: 'Classe' =
#     'Barbare', stripped description, 'Source' overridden by sourceNext when
#     that is not None, append to liste) and starts a new dict with 'Source'
#     'MJ' and 'Niveau' 1; otherwise sourceNext just takes the value of source;
#     the new power is then named "Rage: " + cleaned heading text and
#     referenced by URL + href of the next <a>;
#   * a <b> element whose text is exactly 'Prérequis' reads the following
#     sibling as text, drops one leading apostrophe, derives 'Niveau' with
#     extractLevel(prerequis, 300) and stores the text, without ":" and
#     surrounding whitespace, under 'Prérequis';
#   * <br> elements are counted; on the second one the description collected
#     so far is dropped if the power has a 'Prérequis' key;
#   * any other element is appended to the description through html2text.
# NOTE(review): line breaks and indentation were lost in this file; restore the
# nesting from the upstream script before running.
# NOTE(review): `m` is assigned from re.search but never read in the visible
# code, and its pattern uses "\d" in a non-raw string literal.
# NOTE(review): nothing visible assigns a non-None value to `source`; that
# presumably happens in the part of the loop outside this chunk.
for e in s.children: if e.name == 'h3': if newObj: rage['Classe'] = 'Barbare' rage['Description'] = descr.strip() if not sourceNext is None: rage['Source'] = sourceNext liste.append(rage) sourceNext = source source = None rage = {'Source': 'MJ', 'Niveau': 1} brCount = 0 descr = "" else: sourceNext = source rage['Nom'] = "Rage: " + cleanSectionName(e.text) rage['Référence'] = URL + e.find_next("a")['href'] newObj = True elif e.name == 'b' and e.text == 'Prérequis': prerequis = str(e.next_sibling) if prerequis.startswith("'"): prerequis = prerequis[1:] m = re.search(',? ?niveau (\d+)', prerequis) rage['Niveau'] = extractLevel(prerequis, 300) rage['Prérequis'] = prerequis.replace(':', '').strip() elif e.name == 'br': brCount += 1 if (brCount == 2 and 'Prérequis' in rage): descr = "" else: descr += html2text(e)