Пример #1
0
def getDocFromWeb():
    """Crawl the short write-up index page and process every linked document.

    Downloads ``/goossens/cernlibshort.html`` from ``goossens.web.cern.ch``,
    then walks each ``<h2>`` whose text starts with an upper-case letter.
    When the node two siblings after such a heading is a ``<dl>``, every
    ``<dt>`` child with a link is fetched through ``auxTools.getFixedHTML``,
    loaded into a fresh ``ShortWriteUp`` and handed to
    ``ShortWriteUp.insertShortWriteup``.

    Progress is reported on stdout. Returns ``None``.
    """
    # Fetch the index page; release the socket even if the request fails.
    conn = http.client.HTTPConnection("goossens.web.cern.ch")
    try:
        conn.request("GET", "/goossens/cernlibshort.html")
        index_html = conn.getresponse().read()
    finally:
        conn.close()
    soup = BeautifulSoup(index_html, 'html.parser')

    for cat in soup.find_all('h2'):
        # Only headings that begin with a capital letter are categories.
        if not re.match("[A-Z]", cat.get_text()):
            continue
        print("Got a category:")
        print(cat.get_text())
        print("Getting documents on it:")

        # The document list is expected two siblings after the heading
        # (presumably a whitespace text node sits in between). A heading
        # that is the last node of its parent has no sibling at all.
        separator = cat.next_sibling
        dl = separator.next_sibling if separator is not None else None
        if not dl or dl.name != "dl":
            continue

        for dt in dl.children:
            if dt.name != "dt":
                continue
            link = dt.a
            if link is None or not link.get('href'):
                # Entry without a usable link: nothing to download.
                continue
            # Drop the first two characters of the href (presumably a
            # leading "./" -- confirm against the index page).
            urlDoc = "/goossens/" + link['href'][2:]
            print("Getting link to html doc at:", urlDoc)
            doc = BeautifulSoup(
                auxTools.getFixedHTML('http://goossens.web.cern.ch' + urlDoc),
                'html.parser',
            )
            print("Parsing:", doc.title.get_text())
            proc = ShortWriteUp()
            proc.loadFromHTML(doc)
            ShortWriteUp.insertShortWriteup(proc)
            print(proc)
Пример #2
0
def testSinglePaper():
    """Fetch one fixed short write-up page, load it and print the result.

    The page is retrieved with ``auxTools.getFixedHTML``, parsed with
    BeautifulSoup's ``html.parser`` and loaded into a new ``ShortWriteUp``,
    which is then printed.
    """
    url = "http://goossens.web.cern.ch/goossens/wwwdir/shortwrupsdir/b002/top.html"
    html = auxTools.getFixedHTML(url)
    page = BeautifulSoup(html, 'html.parser')

    writeup = ShortWriteUp()
    writeup.loadFromHTML(page)
    print(writeup)