Example #1
import utils

def parseDataFromIndiWords(data):
    """Fetch each word's Wiktionary page and pull out its declension data."""
    declensions = []
    for word, url in data:

        domain = "https://en.wiktionary.org%s"
        page = utils.fetchSoup(domain % url)

        try:
            # first declension table on the page
            table = page("table", {"class": "prettytable"})[0]
            # linked noun forms under the first definition list
            nouns = [str(i.string) for i in page("ol")[0]("a")]
            # grammatical gender marker
            gender = str(page("span", {"class": "gender"})[0].string)
        except IndexError:
            print("[!] Table for word %s not found, skipping!" % word)
            utils.sleeper(3)
            continue

        # processing begins here
        try:
            tableProcd = [[str(k.string) for k in j("td")] for j in table("tr")]
            print("[+] Word %s has %d nouns" % (word, len(nouns)))
            declensions.append((word, nouns, tableProcd, gender))
        except Exception as e:
            print(e)
            print("[!] Required data is missing, skipping!")
        utils.sleeper(3)
    return declensions
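
Both examples call into a utils module that is not shown. Here is a minimal sketch of what it presumably contains, assuming fetchSoup downloads a URL and parses it with BeautifulSoup and sleeper simply wraps time.sleep; these definitions are inferred from the call sites, not taken from the original code.

import time

import requests
from bs4 import BeautifulSoup

# utils.py -- inferred from how the two examples call it; the real
# module is not shown above, so these definitions are assumptions.

def fetchSoup(url):
    # download the page and parse it into a BeautifulSoup tree
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")

def sleeper(seconds):
    # polite delay between requests so we do not hammer Wiktionary
    time.sleep(seconds)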
Example #2
#!/usr/bin/env python

import utils
import parser

data = []
url = "https://en.wiktionary.org/wiki/Category:Faroese_nouns"
while True:

    # fetch the page and build a soup
    soup = utils.fetchSoup(url)
    # div housing the words
    page = soup("div", {"id": "mw-pages"})

    # word groups, one per initial letter
    groups = page[0]("div", {"class": "mw-category-group"})

    # traverse every group and collect (word, href) pairs into data
    for g in groups:

        words = [(str(i("a")[0].string), i("a")[0]["href"]) for i in g("li")]
        categ = str(g("h3")[0].string)

        data.extend(words)
        print("[*] Letter %s has %d words" % (categ, len(words)))

    # extract the URL of the next page, if any
    links = page[0]("a")
    nextL = [x for x in links if x.string == "next page"]
    if len(nextL) == 0:
        print("Done Scraping!")
        break
    url = "https://en.wiktionary.org" + nextL[0]["href"]