def loadData(self):
    from wikitools import wiki
    from wikitools import category
    wikiobj = wiki.Wiki("https://en.wikipedia.org/w/api.php")
    wikicat = category.Category(wikiobj, title="2016_films")
    self.wikipages = wikicat.getAllMembers()
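# A minimal standalone version of the same fetch, runnable outside a class;
# the function name here is ours, not part of wikitools.
from wikitools import wiki, category

def load_category_members(api_url, cat_title):
    site = wiki.Wiki(api_url)
    return category.Category(site, title=cat_title).getAllMembers()

pages = load_category_members("https://en.wikipedia.org/w/api.php", "2016_films")
print(len(pages))  # number of member pages found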
def promote(self, title=False, removePrefix=False, force=False,
            reason=False, watch=False, unwatch=False):
    '''Promote a page to a category

    title - title of the new category
    removePrefix - when using the page title as the category title,
        remove the namespace prefix
    force - if the category already exists, overwrite it
    '''
    if not self.exists:
        print self.getWikiText()
        raise page.NoPage
    if self.title.startswith("Category:"):
        raise NotPromotable  # custom exception, assumed defined elsewhere
    if not title:
        title = self.title
        if removePrefix:
            title = re.sub(".*:", "", title)
    title = "Category:" + title
    cat = category.Category(self.site, title)
    if cat.exists and not force:
        raise AlreadyExists
    cat.edit(text=self.getWikiText())
    self.rewriteReferences(self.getBacklinks(), title)
    self.delete(reason=reason, watch=watch, unwatch=unwatch)
    return cat
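# Hedged usage sketch: PromotablePage is a hypothetical page.Page subclass
# carrying the promote() method above, and the credentials are placeholders;
# editing and deleting require a logged-in account with the right privileges.
from wikitools import wiki

site = wiki.Wiki("https://en.wikipedia.org/w/api.php")
site.login("ExampleBot", "example-password")               # placeholder login
pg = PromotablePage(site, title="List of example topics")  # hypothetical subclass
newcat = pg.promote(reason="Promoted page to category")
print(newcat.title)  # -> "Category:List of example topics"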
def get_category_recursively(self, category_title, max_articles_num=None):
    """
    Iterative depth-first traversal of the category tree (list.pop() takes
    the most recently found subcategory first).
    Returns all articles found in the run, as wiki Page objects.

    :param category_title: title of the needed category
    :param max_articles_num: maximum number of articles to fetch;
        stops after reaching the limit. None means no limit.
    :return: set of article Page objects
    """
    closed_categories = set()
    open_categories = [category_title]
    articles = set()
    while open_categories:
        current_category_name = open_categories.pop()
        if current_category_name in closed_categories:
            continue
        current_category = category.Category(self.site, current_category_name)
        for d in self.get_category_articles(current_category):
            if self.is_category(d.title):
                open_categories.append(d.title)
            else:
                articles.add(self.attach_metadata(d))
            # quit if max_articles_num reached
            if max_articles_num is not None and len(articles) >= max_articles_num:
                return articles
        closed_categories.add(current_category_name)
    return articles
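# Self-contained sketch of the same traversal written directly against
# wikitools, without the helper methods above; a "Category:" title-prefix
# check stands in for is_category, and bare titles are collected instead of
# metadata-attached Page objects.
from wikitools import wiki, category

def crawl_category(api_url, root_title, max_articles=None):
    site = wiki.Wiki(api_url)
    seen, stack, articles = set(), [root_title], set()
    while stack:
        name = stack.pop()  # LIFO pop: depth-first order
        if name in seen:
            continue
        seen.add(name)
        for member in category.Category(site, name).getAllMembersGen():
            if member.title.startswith("Category:"):
                stack.append(member.title)  # descend into subcategory
            else:
                articles.add(member.title)
            if max_articles is not None and len(articles) >= max_articles:
                return articles
    return articles

# e.g. crawl_category("https://en.wikipedia.org/w/api.php", "Category:2016 films", 100)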
def get_category(self, catname):
    cat = category.Category(self.site, catname)
    items = []
    for article in cat.getAllMembersGen(namespaces=[0]):
        items.append(article.title.lower())
        if len(items) % 1000 == 0:
            print 'Downloading item %5d : %20s' % (len(items), items[-1])
    return items
def get_count(district):
    # Tamil category name: "articles started by <district> district teachers"
    cat = category.Category(site, district.strip("\n") + " மாவட்ட ஆசிரியர்கள் தொடங்கிய கட்டுரைகள்")
    counter = 0
    for article in cat.getAllMembersGen():
        counter += 1
    return counter
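# Hedged usage: summing counts over several districts; the district names
# below are placeholders, and `site` must already be a wikitools Wiki object
# for ta.wikipedia, as get_count() assumes.
districts = [u"சேலம்", u"மதுரை"]
total = sum(get_count(d) for d in districts)
print(total)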
def get_snpedia_snp_names():
    site = wiki.Wiki('http://bots.snpedia.com/api.php')
    snps = category.Category(site, 'Is_a_snp')
    snpedia = set()
    for article in snps.getAllMembersGen(namespaces=[0]):
        snpedia.add(article.title.lower())
    return snpedia
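# Typical use of the returned set: a fast membership test; the rsid below
# is only an illustrative identifier.
snp_names = get_snpedia_snp_names()
print('rs53576' in snp_names)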
def test_getAllMembers(self):
    c = category.Category(self.site, "Test pages")
    api.logging = True
    members = c.getAllMembers()
    self.assertIsInstance(members[0], page.Page)
    log = api.querylog.pop()
    self.assertNotIn("cmnamespace", log)
    members = c.getAllMembers(namespaces=[3, 5])
    self.assertEqual(len(members), 0)
    log = api.querylog.pop()
    self.assertIn("cmnamespace", log)
def get_drugs(fname):
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    drugs = category.Category(site, "Is_a_medicine")
    n = 0
    with open(fname, 'w') as f:
        for article in drugs.getAllMembersGen(namespaces=[0]):
            drug = _normalize_str(article.title.strip())
            f.write(drug + '\n')
            n += 1
    print 'drugs extracted:', n
def snpedia_getter():
    site = wiki.Wiki("http://snpedia.com/api.php")  # open snpedia
    snps = category.Category(site, "Is_a_snp")
    snpedia = {}
    for article in snps.getAllMembersGen(namespaces=[0]):  # get all snp names
        snpedia[article.title.lower()] = "in snpedia"
        print article.title
    snpedia_outfile = open("snpedia.data", "wb")  # save all snps to cache
    pickle.dump(snpedia, snpedia_outfile)
    snpedia_outfile.close()
    return snpedia
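# On later runs the pickled cache can be loaded instead of re-crawling the
# wiki; a minimal sketch around the function above.
import os
import pickle

if os.path.exists("snpedia.data"):
    with open("snpedia.data", "rb") as infile:
        snpedia = pickle.load(infile)
else:
    snpedia = snpedia_getter()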
def get_articles(self, lang, update=False):
    if update:
        for r in self.con.execute('select distinct category from ' + lang +
                                  '_wikipages where category not in (select ' + lang + ' from categories)'):
            self.con.execute('delete from ' + lang + '_wikipages where category=?', (str(r[0]),))
            print('Deleting category', r[0], '...')
    else:
        self.con.execute('delete from ' + lang + '_wikipages')
    self.con.commit()
    wikipedia.set_lang(lang)
    wikisite = 'http://' + lang + '.wikipedia.org/w/api.php'
    wikiObject = w.Wiki(wikisite)
    cats = self.con.execute('select ' + lang + ' from categories').fetchall()
    for cat in cats:
        print('Checking category:', cat[0])
        if lang == 'ru':
            wikiCategory = c.Category(wikiObject, title='Категория:' + cat[0])
        elif lang == 'en':
            wikiCategory = c.Category(wikiObject, title='Category:' + cat[0])
        else:
            break
        articles = wikiCategory.getAllMembers(namespaces=[0])
        if len(articles) > 200:
            articles = articles[0:200]
        for article in articles:
            try:
                if self.is_indexed(article.title, lang):
                    continue
                print('Loading article', article.title, '...')
                new_article = wikipedia.page(article.title)
                if len(new_article.content) == 0:
                    continue
                self.con.execute('insert into ' + lang + '_wikipages(name, content, category) values(?, ?, ?)',
                                 (article.title, new_article.content, cat[0]))
            except Exception:
                continue
    self.con.commit()
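# Hedged usage sketch; ArticleStore is a hypothetical owner class providing
# self.con (a sqlite3 connection), is_indexed(), and the categories table.
store = ArticleStore('articles.db')
store.get_articles('en')                # rebuild en_wikipages from scratch
store.get_articles('ru', update=True)   # prune stale categories, then refill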
def run(self):
    cat = category.Category(self.wiki, self.categoryname)
    self.overviewpage = page.Page(self.wiki, u"VEIDs")
    self.veidlist = {}
    for article in cat.getAllMembersGen(namespaces=[0]):
        self.collect_page_detail(article)
    try:
        oldtext = self.overviewpage.getWikiText()
    except page.NoPage:
        oldtext = ""
    newtext = self.build_new_overviewpage_text()
    # only save if something was changed
    if newtext == oldtext:
        return
    self.overviewpage.edit(text=newtext, skipmd5=True, bot=True,
                           summary=u"Regenerated list.")
logging.info("Checking for bot access rights")
bot_flag = check_for_bot(wiki_username)
if bot_flag:
    logging.info("The user " + wiki_username + " has bot access.")
else:
    logging.info("The user " + wiki_username + " does not have bot access")

commons_url = "https://commons.wikimedia.org/w/api.php"
commons = wikitools.wiki.Wiki(commons_url)
counter = 1
cat = category.Category(commons, "PDF files in Tamil with OCR conversion")

# iterate through all the files in namespace 6 (File:)
for pdf in cat.getAllMembersGen(namespaces=[6]):
    print str(counter) + ". " + pdf.title.encode('utf-8')
    pdf_name = pdf.title.encode('utf-8').split("File:")[1]
    #pdf_name = "சிந்தனைப் பந்தாட்டம்.pdf"
    index_page = wikitools.Page(wiki, "Index:" + pdf_name, followRedir=True)
    edit_summary = "Index creation"
    content = " "
    #if index_page.exists:
    #    print index_page.getWikiText()
    #    logger.info("page already there")
    #else:
def wikiupdate(self, title, url):
    cat = category.Category(self.site, "Linklist")
    for article in cat.getAllMembersGen(namespaces=[0]):
        print article.edit(appendtext="\n* {title} - {url} \n".format(
            title=title, url=url))
def get_snps(self):
    """Generator which yields SNP names from SNPedia."""
    snps = category.Category(self.site, "Is_a_snp")
    for article in snps.getAllMembersGen(namespaces=[0]):
        # yield each snp name, lowercased
        yield article.title.lower()
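# Because get_snps() is a generator, it can be consumed lazily: islice stops
# the underlying API paging early. `client` is a hypothetical instance of
# the class that holds get_snps() and self.site.
from itertools import islice

first_ten = list(islice(client.get_snps(), 10))
print(first_ten)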
        else:
            s[compress(k)] = v
            field_lookup[compress(k)] = 1
        print json.dumps(s)
        scores.append(s)
    return (scores, files)

db.execute('''create table if not exists score
              (id text, category text, scoretype text)''')
db.execute('''create table if not exists scorefile (file_n text, score text)''')

catname = "Category:Mozart,_Wolfgang_Amadeus"
cat = category.Category(site, catname)
(scores, files) = getCat(cat)

# Make sure the database has the right columns available.
add_dbfields(u"score", field_lookup.keys())
add_dbfields(u"scorefile", file_field_lookup.keys())

c = db.cursor()
for score in scores:
    c.execute('''select count(*) from score where id = ?''', (score['id'],))
    (n,) = c.fetchone()
    if n == 0:
        columns = map(to_unicode, score.keys())
        values = map((lambda column: to_unicode(score[column])), columns)
        query = ("insert into score (" + ", ".join(columns) + ") values (" +
                 ", ".join(["?"] * len(columns)) + ")")
""" Carmi Rothberg Assignment 2: Building a Corpus """ ### IMPORTS ### import re, json import parsing, extract from wikitools import wiki, category ### GET WIKI PAGES ### print('importing pages...') wikiobj = wiki.Wiki("https://en.wikipedia.org/w/api.php") wikicat = category.Category(wikiobj, title = "2017_films") wikipages = wikicat.getAllMembers() print('pages imported...') ### EXTRACT INFORMATION FROM A PAGE ### def page2dict(page): ### SETUP ### d = {'title': None, 'director': None, 'producer': None, 'starring': None, 'runtime': None, 'country': None, 'language': None, 'time': None, 'location': None, 'text': None} ### PRELIMINARY DATA ### title = re.sub(r'\s*\((2017)?\s*film\)\s*', '', str(page.title.encode('utf8'))) #guess film title from page title #print('\t'+title) d['title'] = title categories = [c[9:] for c in page.getCategories()] d['categories'] = categories ### DATA FROM PAGE TEXT ### pagetext = parsing.remove_html(page.getWikiText()) #clean page text # get as much as possible from infobox # extract.infobox(pagetext, d)
#coding=utf-8
from wikitools import wiki, category
import sys

# category:
#   Is_a_medical_condition
#   Is_a_gene
#   Is_a_genoset
#   Is_a_medicine
#   Topic
c = sys.argv[1]

site = wiki.Wiki("http://bots.snpedia.com/api.php")
snps = category.Category(site, c)
for article in snps.getAllMembersGen(namespaces=[0]):
    print article.title.encode('u8')
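# Example invocation (assumes the script above is saved as dump_category.py):
#   python dump_category.py Is_a_medicine > medicines.txt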
zipfile.ZipFile("snpedia-archive.zip", mode="a",
                compression=zipfile.ZIP_DEFLATED) as ziparchive:
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    snpsfile = csv.DictReader(
        genomefile, delimiter="\t",
        fieldnames=["rsid", "chromosome", "position", "genotype"])
    if "snpedia_rsids" in ziparchive.namelist():
        snpedia_rsids = {
            line.rstrip()
            for line in ziparchive.read("snpedia_rsids").splitlines()
        }
    else:
        puts("Get list of SNPs on SNPedia ... ")
        snps = category.Category(site, "Is_a_snp")
        snpedia_rsids = {
            article.title.lower()
            for article in snps.getAllMembersGen(namespaces=[0])
        }
        ziparchive.writestr("snpedia_rsids", "\n".join(sorted(snpedia_rsids)))
        puts("done\n")
    try:
        with open("snpedia-archive.json", "r") as snpinfofile:
            snpinfo = json.load(snpinfofile)
    except (IOError, ValueError):
        snpinfo = {}
    namelist = set(ziparchive.namelist())
plt.barh(y_pos, z, align='center', alpha=0.4)
plt.yticks(y_pos, hot_tokens)
plt.xlabel('Average number of occurrences per article')
plt.title('Token distribution')
plt.show()

site = wiki.Wiki("https://en.wikipedia.org/w/api.php")

# Select a category with a reasonable number of articles (>100)
#cat = "Culture"
cat = "Games"
print cat
print "Loading category data. This may take a while..."
cat_data = category.Category(site, cat)

corpus_titles = []
corpus_text = []
for n, page in enumerate(cat_data.getAllMembersGen()):
    print "\r Loading article {0}".format(n + 1),
    corpus_titles.append(page.title)
    corpus_text.append(page.getWikiText())

n_art = len(corpus_titles)
print "\nLoaded " + str(n_art) + " articles from category " + cat

corpus_tokens = []
corpus_filtered = []
for n, art in enumerate(corpus_text):
import nltk
from wikitools import wiki
from wikitools import page
from wikitools import category

# List of movie categories to be extracted
categories = [
    "American horror films",
    "American Western (genre) films",
    "American children's films"
]

site = wiki.Wiki("http://en.wikipedia.org/w/api.php")
porter = nltk.stem.porter.PorterStemmer()  # to be used in stemming below

for c in categories:
    i = 0
    cat = category.Category(site, title=c)
    pageList = cat.getAllMembersGen()  # page generator
    for page in pageList:  # note: shadows the wikitools page module
        print i
        text = page.getWikiText()
        # Find the Plot section in the page
        plot = ""
        beg_i = text.find('==Plot')
        if beg_i == -1:
            continue
        plot = text[beg_i + 2:]
        end_i = plot.find('\n==')
        if end_i == -1:
            continue
        plot = plot[:end_i]
        # lowercasing, removing stopwords and stemming
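# Hedged sketch of the preprocessing the last comment above refers to,
# written as a standalone helper rather than the (truncated) loop body;
# assumes the nltk 'punkt' and 'stopwords' data are downloaded
# (nltk.download('punkt'), nltk.download('stopwords')).
from nltk.corpus import stopwords

def preprocess(plot, stemmer):
    stop = set(stopwords.words('english'))
    return [stemmer.stem(w)
            for w in nltk.word_tokenize(plot.lower())
            if w.isalpha() and w not in stop]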