Python cleanSectionName示例，libhtml.cleanSectionName Python示例

示例#1

0

显示文件

文件： extract-classfeatures-benedictions.py 项目： deurk/pathfinderfr-data

def extractClassFeature(name, liste, section, baseURL):
    """Split a run of page nodes into class features and append them to ``liste``.

    Every ``h3`` node in ``section`` starts a new feature; every other node is
    converted with ``html2text`` and accumulated as that feature's description.
    Text found before the first ``h3`` is discarded, and nothing is appended
    when ``section`` contains no ``h3`` at all.

    Args:
        name: Prefix for each feature name (``name + ": " + heading``).
        liste: List that receives the feature dicts; mutated in place.
        section: Iterable of parsed nodes exposing ``.name`` (``h3`` nodes
            additionally need ``.text`` and ``.find``).
        baseURL: Prefix prepended to the heading link's ``href`` to build the
            ``'Référence'`` value.

    Returns:
        None. Results are delivered through ``liste``.

    Note:
        Assumes every ``h3`` contains an ``<a>`` element with an ``href``;
        otherwise the reference lookup raises -- TODO confirm against the
        scraped pages.
    """

    def finalize(feature, text):
        # Shared by the "next heading reached" and "end of section" paths.
        feature['Description'] = cleanInlineDescription(text)
        # 150 is passed through to extractLevel unchanged; its meaning is
        # defined by that helper (not visible here).
        feature['Niveau'] = extractLevel(feature['Description'], 150)
        liste.append(feature)

    classfeature = None
    descr = ""
    for s in section:
        if s.name != 'h3':
            descr += html2text(s)
            continue

        # A new heading closes the feature that was being accumulated.
        if classfeature is not None:
            finalize(classfeature, descr)

        descr = ""
        classfeature = {
            'Auto': False,
            'Nom': name + ": " + cleanSectionName(s.text),
            'Classe': "Prêtre combattant",
            'Source': 'MJRA',
            'Référence': baseURL + s.find('a')['href'],
        }

    ## last element
    if classfeature is not None:
        finalize(classfeature, descr)

示例#2

0

显示文件

文件： extract-equipment-weapons.py 项目： deurk/pathfinderfr-data

# Excerpt of the weapons extraction script: for each source URL, parse the
# page, derive the list of weapon categories from the section headings, then
# walk the weapon tables row by row (the row handling continues past this
# excerpt).
for URL in URLS:
    # When MOCK_W is set, parse that local file instead of downloading URL.
    # NOTE(review): the handle returned by open(MOCK_W) is never closed explicitly.
    if MOCK_W:
        content = BeautifulSoup(open(MOCK_W),features="lxml").body
    else:
        content = BeautifulSoup(urllib.request.urlopen(URL).read(),features="lxml").body

    sections = content.find_all('h2',{'class':'separator'})

    type1 = []

    # extract the types
    for s in sections:
        # Skip the "Accès rapide" heading; it is not turned into a type.
        if s.text.startswith("Accès rapide"):
            continue
        section = cleanSectionName(s.text)
        # "Les armes" becomes "Arme", then every remaining "s" character is
        # removed from the name (not only trailing plural markers).
        section = section.replace("Les armes", "Arme").replace("s","")
        type1.append(section)

    tables = content.find_all('table',{'class':'tablo'})

    # State initialised here for the table walk below; how weapon, type2 and
    # idx are used is not visible in this excerpt.
    weapon = {}
    type2 = ""

    print("Extraction des armes...")

    idx = 0
    for t in tables:
        rows = t.find_all('tr')
        for r in rows:
            cols = r.find_all('td')

示例#3

0

显示文件

文件： extract-classfeatures-astuces.py 项目： deurk/pathfinderfr-data

# Excerpt of the ninja tricks ("astuces") extraction script. `content`, `URL`
# and `liste` are defined outside this excerpt.
# presumably jumpTo returns the nodes following the matching h2 heading --
# verify against libhtml.
section = jumpTo(content, 'h2', {'class': 'separator'},
                 "Description des astuces de ninja")

# LVL is the level stored in each newly created entry; it starts at 2 and is
# raised to 10 once the "Description des astuces de maître" heading is seen.
LVL = 2
astuce = {'Niveau': LVL}
# newObj is True once at least one h3 has been seen, i.e. `astuce` holds an
# entry that still has to be appended to `liste`.
newObj = False
descr = ""
source = 'AG'
for s in section:
    if s.name == 'h2' and "Description des astuces de maître" in s.text:
        LVL = 10
    elif s.name == "table":
        for td in s.find_all('td'):
            for el in td.children:
                if el.name == "h3":
                    nom = cleanSectionName(el.text)
                    reference = URL + el.find_next("a")['href']

                    # A new heading closes the previous entry before the
                    # current one is filled in.
                    if newObj:
                        astuce['Classe'] = 'Ninja'
                        astuce['Description'] = descr.strip()
                        liste.append(astuce)
                        # The fresh entry captures the value of LVL at this point.
                        astuce = {'Niveau': LVL}

                    descr = ""
                    astuce['Nom'] = u"Astuce: " + nom
                    astuce['Source'] = source
                    astuce['Référence'] = reference
                    # Reset to the default source after each entry.
                    # NOTE(review): nothing in this excerpt assigns another
                    # value to `source`; presumably done further down -- confirm.
                    source = "AG"
                    newObj = True

示例#4

0

显示文件

文件： extract-classfeatures-benedictions.py 项目： deurk/pathfinderfr-data

# Collect the page URL of every navigation link whose text starts with
# "Bénédiction". `navigation` is defined outside this excerpt.
URLS = []
for el in navigation.find_all('a'):
    if el.text.startswith("Bénédiction"):
        URLS.append("https://www.pathfinder-fr.org/Wiki/" + el['href'])

liste = []
for u in URLS:

    # Parse the local mock file when MOCK_BENE is set, otherwise download the
    # page. The context managers close the file handle / HTTP response as soon
    # as the document has been parsed instead of leaving them open.
    if MOCK_BENE:
        with open(MOCK_BENE) as mock_file:
            content = BeautifulSoup(mock_file, features="lxml").body
    else:
        with urllib.request.urlopen(u) as response:
            content = BeautifulSoup(response.read(), features="lxml").body

    name = cleanSectionName(content.find('h1', {'class': 'pagetitle'}).text)
    print(" - " + name)

    # Everything following the "presentation navmenu" block is handed to the
    # feature extractor, which appends its results to `liste`.
    section = content.find('div', {
        'class': 'presentation navmenu'
    }).next_siblings

    extractClassFeature(name, liste, section, u)

    # In mock mode the same file would be parsed for every URL, so a single
    # pass is enough.
    if MOCK_BENE:
        break

print("Fusion avec fichier YAML existant...")

HEADER = ""

示例#5

0

显示文件

        level = 2 if level == 0 else 11

        exploitation = {'Source':source,'Niveau':level}
        newObj = False
        brCount = 0
        descr = ""
        for e in s.children:
            if e.name == 'h3':
                if newObj:
                    exploitation['Classe'] = 'Arcaniste'
                    exploitation['Description'] = descr.strip()
                    liste.append(exploitation)
                    exploitation = {'Source':'MJ','Niveau':level}
                    brCount = 0
                    descr = ""
                exploitation['Nom'] = "Exploitation: " + cleanSectionName(e.text)
                exploitation['Référence'] = URL + e.find_next("a")['href']
                newObj = True
            elif e.name == 'br':
                brCount+=1
                if(brCount==2 and u'Prérequis' in exploitation):
                    descr = ""
                    
            elif e.name == 'div' and not e.has_attr('class'):
                exit(1)
            
            else:
                descr += html2text(e, False)
            
        
        ## last element

示例#6

0

显示文件

        print("No descriptions found for %s" % data["category"])
        exit(1)

    newObj = True
    name = ""
    descr = ""
    source = None
    sourceNext = None
    for e in section:
        if e.name == 'h3':
            if not newObj:
                addInfos(liste, name, descr, sourceNext)

            sourceNext = source
            if e.name == 'h3':
                name = cleanSectionName(e.text)
                descr = ""
                source = None
                newObj = False

        else:
            descr += html2text(e)
            if e.name == 'div' or e.name == 'img':
                src = extractSource(e)
                if src:
                    source = src

    addInfos(liste, name, descr, sourceNext)

for l in liste:
    if 'Complete' in l and not l['Complete']:

示例#7

0

显示文件

        level = 2 if level == 0 else 10

        talent = {'Source': 'MJ', 'Niveau': level}
        newObj = False
        brCount = 0
        descr = ""
        for e in s.children:
            if e.name == 'h3':
                if newObj:
                    talent['Classe'] = 'Roublard'
                    talent['Description'] = descr.strip()
                    liste.append(talent)
                    talent = {'Source': 'MJ', 'Niveau': level}
                    brCount = 0
                    descr = ""
                talent['Nom'] = "Talent: " + cleanSectionName(e.text)
                talent['Référence'] = URL + e.find_next("a")['href']
                newObj = True
            elif e.name == 'br':
                brCount += 1
                if (brCount == 2 and u'Prérequis' in talent):
                    descr = ""

            else:
                descr += html2text(e, False)
                if e.name == 'div' or e.name == 'a':
                    src = extractSource(e)
                    if not src is None:
                        talent['Source'] = src

        ## last element

示例#8

0

显示文件

文件： extract-classes-archetypes.py 项目： deurk/pathfinderfr-data

        for s in content.children:
            if s.name == 'h3':
                if newObj:
                    classfeature['Classe'] = classe
                    classfeature['Archétype'] = nom
                    classfeature['Source'] = source
                    classfeature['Description'] = cleanDescription(descr)
                    classfeature['Niveau'] = extractLevel(
                        classfeature['Description'], 30)
                    classfeatures.append(classfeature)
                    classfeature = {'Source': 'MJ', 'Niveau': 1, 'Auto': True}
                    brCount = 0

                descr = ""
                featureName = cleanSectionName(s.text)
                if featureName.endswith('.'):
                    featureName = featureName[:-1]
                classfeature['Nom'] = cleanName(featureName[0] +
                                                featureName[1:])
                newObj = True

                for e in s.children:
                    if e.name == 'a':
                        classfeature['Référence'] = pageURL + e['href']
            else:
                descr += html2text(s)

        ## last element
        classfeature['Classe'] = classe
        classfeature['Archétype'] = nom

示例#9

0

显示文件

文件： extract-classfeatures-rages.py 项目： deurk/pathfinderfr-data

 for e in s.children:
     if e.name == 'h3':
         if newObj:
             rage['Classe'] = 'Barbare'
             rage['Description'] = descr.strip()
             if not sourceNext is None:
                 rage['Source'] = sourceNext
             liste.append(rage)
             sourceNext = source
             source = None
             rage = {'Source': 'MJ', 'Niveau': 1}
             brCount = 0
             descr = ""
         else:
             sourceNext = source
         rage['Nom'] = "Rage: " + cleanSectionName(e.text)
         rage['Référence'] = URL + e.find_next("a")['href']
         newObj = True
     elif e.name == 'b' and e.text == 'Prérequis':
         prerequis = str(e.next_sibling)
         if prerequis.startswith("'"):
             prerequis = prerequis[1:]
         m = re.search(',? ?niveau (\d+)', prerequis)
         rage['Niveau'] = extractLevel(prerequis, 300)
         rage['Prérequis'] = prerequis.replace(':', '').strip()
     elif e.name == 'br':
         brCount += 1
         if (brCount == 2 and 'Prérequis' in rage):
             descr = ""
     else:
         descr += html2text(e)