def extractText(list):
    """Convert a sequence of elements into two concatenated renderings.

    Each element is passed to ``html2text`` twice, in this order: first with
    default arguments, then as ``html2text(el, True, 2)``. The results of the
    first calls are concatenated under ``'text'`` and those of the second
    calls under ``'html'``, both in input order.

    Args:
        list: Iterable of elements accepted by ``html2text``. The name shadows
            the builtin; it is kept because it is part of the signature.

    Returns:
        dict: ``{'text': <str>, 'html': <str>}``; both strings are empty when
        the input yields no elements.
    """
    plainParts = []
    markupParts = []
    for node in list:
        # Keep the per-element call order: default call first, then (True, 2).
        plainParts.append(html2text(node))
        markupParts.append(html2text(node, True, 2))
    return {'text': "".join(plainParts), 'html': "".join(markupParts)}
def _finalizeClassFeature(classfeature, descr, liste):
    """Fill in the description and level of *classfeature*, then append it to *liste*."""
    classfeature['Description'] = cleanInlineDescription(descr)
    classfeature['Niveau'] = extractLevel(classfeature['Description'], 150)
    liste.append(classfeature)


def extractClassFeature(name, liste, section, baseURL):
    """Split *section* into class features and append them to *liste*.

    Every element whose ``name`` is ``'h3'`` starts a new feature; all other
    elements are rendered with ``html2text`` and accumulated as the
    description of the feature currently being built. Text encountered before
    the first ``h3`` is discarded when that first ``h3`` is reached.

    Args:
        name: Prefix for each feature name (``name + ": " + <h3 title>``).
        liste: List that receives the finished feature dicts (mutated in place).
        section: Iterable of elements exposing ``name``, ``text`` and
            ``find`` (presumably BeautifulSoup nodes -- confirm with callers).
        baseURL: Prefix prepended to the ``href`` of the first ``<a>`` found
            in each ``h3`` to build the ``'Référence'`` field.

    Returns:
        None. Nothing is appended when *section* contains no ``h3`` element.
    """
    classfeature = None
    descr = ""
    for s in section:
        if s.name == 'h3':
            # A new heading closes the feature that was being collected.
            if classfeature is not None:
                _finalizeClassFeature(classfeature, descr, liste)
            classfeature = {
                'Auto': False,
                'Nom': name + ": " + cleanSectionName(s.text),
                'Classe': "Prêtre combattant",
                'Source': 'MJRA',
                'Référence': baseURL + s.find('a')['href'],
            }
            descr = ""
        else:
            descr += html2text(s)

    if classfeature is None:
        return

    # The last feature has no following heading to close it.
    _finalizeClassFeature(classfeature, descr, liste)
# NOTE(review): fragment of a larger script; `tables`, TABLEDEF and IGNORE are
# defined outside this view, and the original indentation has been lost.
# Walks every table in `tables`, incrementing `tableIdx` before use so that
# TABLEDEF is indexed from 1. The table caption text is read into `caption`.
# Rows whose first CSS class is 'titre' or 'soustitre' are skipped.
# Cells are counted from 1 within a row: the cell at position
# TABLEDEF[tableIdx][0] provides `nom` (html2text of the cell) and `href`
# (the 'href' of the first <a> in the cell when one is found); the cell at
# position TABLEDEF[tableIdx][1] provides `prix`.
# Embedded French notes, translated: "skip entries of the 're-roll the die'
# kind" (names containing "le dé") and "ignore certain entries (reference to
# another table on the page)" (names listed in IGNORE).
# Names that are kept are prefixed with TABLEDEF[tableIdx][2].
tableIdx = 0 for t in tables: tableIdx += 1 caption = t.find('caption').text for tr in t.find_all('tr'): if tr.has_attr('class') and (tr['class'][0] == 'titre' or tr['class'][0] == 'soustitre'): continue columnIdx = 0 for td in tr.find_all('td'): columnIdx += 1 if columnIdx == TABLEDEF[tableIdx][0]: nom = html2text(td) href = td.find('a') if href: href = href['href'] elif columnIdx == TABLEDEF[tableIdx][1]: prix = html2text(td) # sauter les entrées du type "relancer le dé" if u"le dé" in nom: continue # ignorer certaines entrées (référence à un autre tableau dans la page) if nom in IGNORE: continue else: nom = TABLEDEF[tableIdx][2] + nom
# NOTE(review): fragment that starts inside an element loop / if-chain whose
# opening lines are not visible; the original indentation has been lost.
# Visible behaviour: the current `source` is saved in `sourceNext`. An 'h2'
# element sets `newObj` to True; otherwise the element's stripped text, minus
# one trailing '.', becomes `name`, and `descr`, `source` and `newObj` are
# reset. For a 'div' element, a truthy extractSource(e) result replaces
# `source`. Other elements have their html2text rendering appended to `descr`.
# addInfos(liste, name, sourceNext) is then called. Afterwards each entry whose
# 'Complete' value is falsy is reported on stdout and the 'Complete' key is
# deleted (whether for every entry or only the reported ones cannot be told
# from this view -- TODO confirm). Finally the list is merged into
# ../data/armes.yml through mergeYAML with an empty HEADER.
sourceNext = source if e.name == 'h2': newObj = True else: name = e.text.strip() if name.endswith('.'): name = name[:-1].strip() descr = "" source = None newObj = False elif e.name == 'div': sourceFound = extractSource(e) if sourceFound: source = sourceFound else: descr += html2text(e) addInfos(liste, name, sourceNext) for l in liste: if not l['Complete']: print("- aucune description n'existe pour '" + l['Nom'] + "'!"); del l['Complete'] print("Fusion avec fichier YAML existant...") HEADER = "" mergeYAML("../data/armes.yml", MATCH, FIELDS, HEADER, liste)
# NOTE(review): fragment that is cut off mid-expression at its end;
# `tables`, `equipment`, `data` and `parent` come from context not shown.
# Iterates over `tables`, skipping any table whose class list contains
# "ignore", and within each table skips rows whose class list contains 'titre'.
# For rows with at least two cells, 'Nom' is the stripped html2text of the
# first cell. 'Référence' is the pathfinder-fr wiki base URL plus the href of
# the first cell's link when there is one, otherwise data["URL"].
# A row whose second cell is blank, and whose third cell is absent or blank,
# is remembered in `parent` (name and reference only) and then skipped.
for t in tables: if "ignore" in t.attrs['class']: continue rows = t.find_all('tr') for r in rows: # ignore some rows if 'class' in r.attrs and 'titre' in r.attrs['class']: continue cols = r.find_all('td') if len(cols) >= 2: # Name & Reference nameLink = cols[0].find('a') if not nameLink is None: equipment['Nom'] = html2text(cols[0]).strip() equipment[ 'Référence'] = "http://www.pathfinder-fr.org/Wiki/" + nameLink[ 'href'] else: equipment['Nom'] = html2text(cols[0]).strip() equipment['Référence'] = data["URL"] if not cols[1].text.strip() and (len(cols) == 2 or not cols[2].text.strip()): parent = { 'Nom': equipment['Nom'], 'Référence': equipment['Référence'] } continue elif (cols[0].text.startswith(' ')
# NOTE(review): fragment that starts inside an if/elif chain over elements
# `e`; the opening branch and the enclosing loop are not visible here.
# Visible behaviour: the previous `exploitation` is appended to `liste` and a
# new one is started with Source 'MJ' and the current `level`. Its name is
# "Exploitation: " + cleanSectionName(e.text) and its reference is URL plus
# the href of the next <a> located with e.find_next.
# 'br' elements are counted; on the second one `descr` is cleared when the
# key 'Prérequis' is present in `exploitation`.
# A 'div' element without a class attribute terminates the script via exit(1).
# Any other element is appended to `descr` as html2text(e, False).
# The trailing "last element" section (presumably after the loop -- confirm)
# sets Classe 'Arcaniste' and the stripped description, appends the entry and
# merges `liste` into ../data/classfeatures.yml through mergeYAML.
liste.append(exploitation) exploitation = {'Source':'MJ','Niveau':level} brCount = 0 descr = "" exploitation['Nom'] = "Exploitation: " + cleanSectionName(e.text) exploitation['Référence'] = URL + e.find_next("a")['href'] newObj = True elif e.name == 'br': brCount+=1 if(brCount==2 and u'Prérequis' in exploitation): descr = "" elif e.name == 'div' and not e.has_attr('class'): exit(1) else: descr += html2text(e, False) ## last element exploitation['Classe'] = 'Arcaniste' exploitation['Description'] = descr.strip() liste.append(exploitation) print("Fusion avec fichier YAML existant...") HEADER = "" mergeYAML("../data/classfeatures.yml", MATCH, FIELDS, HEADER, liste)
# NOTE(review): fragment; `u` and `liste` come from context not shown
# (presumably a loop over page URLs -- confirm).
# Parses a page with BeautifulSoup (lxml): from the local file named by
# MOCK_REVELATION_SUB when that is truthy, otherwise by downloading `u`.
# The file handle returned by open() is not closed explicitly.
# The page title (h1.pagetitle) is cleaned with cleanSectionName and printed.
# Embedded French note, translated: "extract information about the mystery".
# Children of div#PageContentDiv are rendered with html2text and accumulated
# in `description` until the first 'h2'; for 'a' and 'div' children a truthy
# extractSource result replaces the default source "MJ".
# The resulting `benediction` dict (Classe 'Oracle', Niveau 1, Auto False,
# cleaned description, source, reference `u`) is appended to `liste`.
if MOCK_REVELATION_SUB: content = BeautifulSoup(open(MOCK_REVELATION_SUB),features="lxml").body else: content = BeautifulSoup(urllib.request.urlopen(u).read(),features="lxml").body name = cleanSectionName(content.find('h1', {'class':'pagetitle'}).text) print(" - " + name); # extraire informations sur mystère description = "" source = "MJ" for el in content.find('div', {'id': 'PageContentDiv'}): if el.name == "h2": break else: description += html2text(el) if el.name == "a" or el.name == "div": src = extractSource(el) if src: source = src benediction = {} benediction['Nom'] = name benediction['Classe'] = 'Oracle' benediction['Niveau'] = 1 benediction['Auto'] = False benediction['Description'] = cleanDescription(description) benediction['Source'] = source benediction['Référence'] = u liste.append(benediction)
# NOTE(review): fragment; `s`, `pageURL`, `classe`, `nom`, `source` and
# `classfeatures` come from context not shown, and the indentation is lost.
# Starts a class feature with defaults Source 'MJ', Niveau 1, Auto True.
# The name is cleanSectionName(s.text) without a trailing '.', passed through
# cleanName. NOTE(review): `featureName[0] + featureName[1:]` rebuilds the same
# string for non-empty names and raises IndexError on an empty one.
# While iterating s.children, an 'a' child sets 'Référence' to pageURL + href.
# The else branch appends html2text(s) -- of `s`, not of the child `e` -- to
# `descr`; which `if` it belongs to cannot be told from this view.
# The "last element" section then sets Classe, Archétype and Source from the
# outer variables, stores the cleaned description, derives Niveau with
# extractLevel(..., 30) and overrides it when `descr` matches
# 'Au niveau (\d+)'. NOTE(review): that pattern is not a raw string.
classfeature = {'Source': 'MJ', 'Niveau': 1, 'Auto': True} brCount = 0 descr = "" featureName = cleanSectionName(s.text) if featureName.endswith('.'): featureName = featureName[:-1] classfeature['Nom'] = cleanName(featureName[0] + featureName[1:]) newObj = True for e in s.children: if e.name == 'a': classfeature['Référence'] = pageURL + e['href'] else: descr += html2text(s) ## last element classfeature['Classe'] = classe classfeature['Archétype'] = nom classfeature['Source'] = source classfeature['Description'] = cleanDescription(descr) classfeature['Niveau'] = extractLevel(classfeature['Description'], 30) # extraire niveau lvl = re.search('Au niveau (\d+)', descr) if lvl: classfeature['Niveau'] = int(lvl.group(1)) classfeatures.append(classfeature)
# NOTE(review): fragment that starts inside an if/elif chain over elements
# `el` and ends inside a second loop; surrounding lines are not visible.
# Visible behaviour: a new `ordre` dict is started at Niveau 1 with Nom `nom`,
# Source set to the current `source` (which is then reset to 'MJRA') and
# Référence `reference`. For 'div' elements a non-None extractSource result
# replaces `source`; other elements are appended to `descr` via html2text.
# The "last element" section sets Classe 'Chevalier' and the stripped
# description, then appends the entry to `liste`.
# The script then jumps to the h2.separator section titled
# "Ordres de Samouraï", resets its state and starts a second pass that stops
# at the next 'h2'.
ordre = {'Niveau': 1} descr = "" ordre['Nom'] = nom ordre['Source'] = source source = 'MJRA' ordre['Référence'] = reference newObj = True elif el.name == 'div': src = extractSource(el) if not src is None: source = src else: descr += html2text(el) # last element ordre['Classe'] = 'Chevalier' ordre['Description'] = descr.strip() liste.append(ordre) section = jumpTo(content, 'h2', {'class': 'separator'}, "Ordres de Samouraï") ordre = {'Niveau': 1} newObj = False descr = "" source = 'MJRA' for el in section: if el.name == "h2": break
# NOTE(review): fragment; `domain` and `liste` come from context not shown,
# and the `continue` implies an enclosing loop that is not visible.
# Parses the domain page with BeautifulSoup (lxml): from the local file
# MOCK_DOMAINE_PAGE when MOCK_DOMAINE is truthy, otherwise by downloading
# domain['Référence']. The file handle from open() is not closed explicitly.
# The "Pouvoirs accordés" section is located with jumpTo, first as an
# h2.separator and, if that returns None, as a <b> element. If both return
# None, "NOT FOUND!!" is printed and the domain is skipped.
# Nodes of the section are appended to domain['Description'] via html2text
# until the next 'h2'; the domain is then appended to `liste`.
# Finally `liste` is merged into ../data/classfeatures.yml through mergeYAML.
if MOCK_DOMAINE: domainHTML = BeautifulSoup(open(MOCK_DOMAINE_PAGE), features="lxml").body else: domainHTML = BeautifulSoup(urllib.request.urlopen( domain['Référence']).read(), features="lxml").body pouvoirs = jumpTo(domainHTML, 'h2', {'class': 'separator'}, "Pouvoirs accordés") if pouvoirs is None: pouvoirs = jumpTo(domainHTML, 'b', {}, "Pouvoirs accordés") if pouvoirs is None: print("NOT FOUND!!") continue for p in pouvoirs: if (p.name == 'h2'): break else: domain['Description'] += html2text(p) liste.append(domain) #exit(1) print("Fusion avec fichier YAML existant...") HEADER = "" mergeYAML("../data/classfeatures.yml", MATCH, FIELDS, HEADER, liste)
# NOTE(review): fragment; `attr`, `text`, `html`, `don` and PROPERTIES come
# from context not shown, and the original indentation has been lost.
# `key` is the stripped text of `attr` with a trailing '.' and then a
# trailing ' :' removed. NOTE(review): key[-1] raises IndexError when the
# stripped text is empty.
# The siblings following `attr` are consumed until the next 'b' element:
# hrefs of 'a' siblings are collected in `refs`, and every sibling is
# appended to `text` (html2text(s)) and `html` (html2text(s, True, 2)).
# When `key` is in PROPERTIES, aliases are normalised ("Condition" /
# "Conditions requises" -> "Conditions", "Avantages" -> "Avantage",
# "Catégories" -> "Catégorie", "À noter" -> "Spécial"), don[key] receives
# cleanProperty(text, False), and for "Avantage" the accumulated HTML is also
# stored under "AvantageHTML".
key = attr.text.strip() if key[-1] == '.': key = key[:-1] if key.endswith(' :'): key = key[:-2] refs = [] for s in attr.next_siblings: #print "%s %s" % (key,s.name) if s.name == 'b': break else: # keep links (href) to build dependency tables if s.name == 'a': refs.append(s['href']) text += html2text(s) html += html2text(s, True, 2) if key in PROPERTIES: if key == "Condition" or key == "Conditions requises": key = "Conditions" elif key == "Avantages": key = "Avantage" elif key == "Catégories": key = "Catégorie" elif key == "À noter": key = "Spécial" don[key] = cleanProperty(text, False) if key == "Avantage": don["AvantageHTML"] = html