import utils


def parseDataFromIndiWords(data):
    """Fetch each word's Wiktionary page and extract its declension table,
    noun definitions, and grammatical gender."""
    declensions = []
    for word, url in data:
        domain = "https://en.wiktionary.org%s"
        page = utils.fetchSoup(domain % url)
        gender = None
        try:
            # first declension table on the page
            table = page("table", {"class": "prettytable"})[0]
            # noun definitions are the links inside the first ordered list
            nouns = [str(i.string) for i in page("ol")[0]("a")]
            print(nouns)
            gender = str(page("span", {"class": "gender"})[0].string)
            print(gender)
        except IndexError:
            print("[+] Table for word %s not found!" % word)
            table, nouns = [], []
        # processing begins here
        try:
            # flatten the declension table into a list of rows of cell strings
            tableProcd = [[str(k.string) for k in j("td")] for j in table("tr")]
            print("[+] Word %s has %d nouns" % (word, len(nouns)))
            declensions.append((word, nouns, tableProcd, gender))
        except Exception as e:
            print(e)
            print("[!] No usable declension data for %s, skipping!" % word)
        # be polite to Wiktionary between requests
        utils.sleeper(3)
    return declensions
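
# -----------------------------------------------------------------------
# utils.py (sketch). The code above only assumes that utils.fetchSoup()
# returns a BeautifulSoup object (the page("tag", {...}) calls are BS4's
# find_all shorthand) and that utils.sleeper() pauses between requests.
# This is a minimal, hypothetical version inferred from those call sites;
# the author's actual utils module may differ.

import time
import requests
from bs4 import BeautifulSoup


def fetchSoup(url):
    # download the page and parse it into a soup object
    return BeautifulSoup(requests.get(url).text, "html.parser")


def sleeper(seconds):
    # rate-limit requests so we do not hammer the server
    time.sleep(seconds)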
#!/usr/bin/env python
import utils
import parser

data = []
url = "https://en.wiktionary.org/wiki/Category:Faroese_nouns"
domain = "https://en.wiktionary.org%s"

while True:
    # fetch the category page and parse it into soup
    soup = utils.fetchSoup(url)
    # div housing the word listings
    page = soup("div", {"id": "mw-pages"})
    # per-letter word groups
    groups = page[0]("div", {"class": "mw-category-group"})
    # traverse every group and collect (word, relative URL) pairs
    for g in groups:
        words = [(str(i("a")[0].string), i("a")[0]["href"]) for i in g("li")]
        categ = str(g("h3")[0].string)
        data.extend(words)
        print("[*] Letter %s has %d words" % (categ, len(words)))
    # follow the "next page" link until the category is exhausted
    links = page[0]("a")
    nextL = [x for x in links if x.string == "next page"]
    if len(nextL) == 0:
        print("Done Scraping!")
        break
    url = domain % nextL[0]["href"]
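
# Once pagination ends, the collected (word, href) pairs can be handed to
# the parser module defined above. The original script stops at scraping,
# so this final step is a sketch: the JSON dump and the output filename
# are assumptions, not the author's code.

import json

declensions = parser.parseDataFromIndiWords(data)
with open("faroese_declensions.json", "w") as f:
    json.dump(declensions, f, ensure_ascii=False, indent=2)
print("[*] Saved %d entries" % len(declensions))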