def getDocFromWeb():
    """Crawl the CERNLIB short write-up index and load every linked document.

    Downloads ``/goossens/cernlibshort.html`` from ``goossens.web.cern.ch``,
    walks each ``<h2>`` whose text starts with an upper-case letter (treated
    as a category heading), and for every ``<dt>`` entry of the ``<dl>`` list
    that follows the heading fetches the linked page, loads it into a
    ``ShortWriteUp`` and passes it to ``ShortWriteUp.insertShortWriteup``.

    Progress is reported on stdout. Returns nothing.
    """
    # Fetch the index page; always release the socket, even if the request
    # or the read fails.
    conn = http.client.HTTPConnection("goossens.web.cern.ch")
    try:
        conn.request("GET", "/goossens/cernlibshort.html")
        soup = BeautifulSoup(conn.getresponse().read(), 'html.parser')
    finally:
        conn.close()

    # Fill the information from each write-up and store it.
    for cat in soup.find_all('h2'):
        if not re.match("[A-Z]", cat.get_text()):
            continue

        print("Got a category:")
        print(cat.get_text())
        print("Getting documents on it:")

        # The document list is expected two siblings after the heading
        # (presumably a whitespace text node sits in between -- confirm
        # against the page markup). A heading that is the last node of its
        # parent has no sibling at all, so guard against None.
        gap = cat.next_sibling
        dl = gap.next_sibling if gap is not None else None
        if not dl or dl.name != "dl":
            continue

        for dt in dl.children:
            if dt.name != "dt":
                continue

            # Skip entries that carry no usable link instead of crashing
            # the whole crawl on one malformed item.
            anchor = dt.a
            if anchor is None or not anchor.get('href'):
                continue

            # Drop the first two characters of the href (presumably a
            # leading "./" -- confirm) to build a server-absolute path.
            urlDoc = "/goossens/" + anchor['href'][2:]
            print("Getting link to html doc at:", urlDoc)

            doc = BeautifulSoup(
                auxTools.getFixedHTML('http://goossens.web.cern.ch' + urlDoc),
                'html.parser',
            )
            print("Parsing:", doc.title.get_text())

            proc = ShortWriteUp()
            proc.loadFromHTML(doc)
            ShortWriteUp.insertShortWriteup(proc)
            print(proc)
def testSinglePaper():
    """Load one fixed short write-up page and print the resulting object.

    Fetches the ``b002`` write-up through ``auxTools.getFixedHTML``, parses
    it with BeautifulSoup, loads it into a fresh ``ShortWriteUp`` and prints
    that instance. Returns nothing.
    """
    page_url = (
        "http://goossens.web.cern.ch/goossens/wwwdir/shortwrupsdir/b002/top.html"
    )
    markup = auxTools.getFixedHTML(page_url)
    parsed = BeautifulSoup(markup, 'html.parser')

    writeup = ShortWriteUp()
    writeup.loadFromHTML(parsed)
    print(writeup)