Example #1
 def test_equality(self):
     u1 = page.Page(self.site, "GoodUsername", check=True)
     u2 = page.Page(self.site, "GoodUsername", check=False)
     self.assertEqual(u1, u2)
     site2 = wiki.Wiki("https://en.wikipedia.org/w/api.php")
     u3 = user.User(site2, "GoodUsername")
     self.assertNotEqual(u1, u3)
Example #2
 def test_equality(self):
     p1 = page.Page(self.site, "Page", check=True)
     p2 = page.Page(self.site, "Page", check=False)
     self.assertEqual(p1, p2)
     site2 = wiki.Wiki("https://en.wikipedia.org/w/api.php")
     p3 = page.Page(site2, "Page")
     self.assertNotEqual(p1, p3)
     p4 = page.Page(self.site, "Talk:Page")
     self.assertNotEqual(p1, p4)
Example #3
def setupProject(project, abbrv):
	site = wiki.Wiki()
	site.login(settings.bot, settings.botpass)
	site.setMaxlag(-1)
	date = datetime.datetime.utcnow() + datetime.timedelta(days=5)
	table = date.strftime('pop_%b%y')
	db = MySQLdb.connect(host="sql-s1-user", read_default_file="/home/alexz/.my.cnf")
	cursor = db.cursor()
	projecttitles = set()
	project = project.replace(' ', '_')
	types = ['FA', 'FL', 'A', 'GA', 'B', 'C', 'start', 'stub', 'list', 'image', 'portal', 'category', 'book', 'disambig', 'template', 'unassessed', 'blank', 'non-article']
	insertquery = 'INSERT INTO u_alexz.'+table+' (title, project_assess) VALUES( %s, %s )'
	updatequery = 'UPDATE u_alexz.'+table+' SET project_assess=CONCAT(project_assess,",",%s) WHERE title=%s'
	selectquery = """SELECT page_namespace-1, page_title, SUBSTRING_INDEX(clB.cl_to, '-', 1) FROM enwiki_p.page 
		JOIN enwiki_p.categorylinks AS clA ON page_id=clA.cl_from 
		LEFT JOIN enwiki_p.categorylinks AS clB ON page_id=clB.cl_from AND clB.cl_to LIKE "%%-importance_"""+project+"""_articles"
		WHERE clA.cl_to=%s AND page_is_redirect=0 """
	for type in types:
		if type == "unassessed":
			cat = "Category:Unassessed "+project+" articles"
		elif type == "non-article":
			cat = "Category:Non-article "+project+" pages"
		elif type == "blank":
			cat = "Category:"+project+" pages"
		else:
			cat = "Category:"+type+"-Class "+project+" articles"
		catpage = page.Page(site, cat)
		if not catpage.exists:
			continue
		catpage.setNamespace(0)
		catname = catpage.title.replace(' ', '_').encode('utf-8')
		print catname
		cursor.execute(selectquery, (catname,))
		pagesincat = cursor.fetchall()
		for title in pagesincat:			
			if not title[0]%2 == 0:
				continue
			realtitle = title[1].decode('utf8').encode('utf8')
			if title[0] != 0:
				p = page.Page(site, realtitle, check=False, namespace=title[0])
				realtitle = p.title.encode('utf8').replace(' ', '_')
			if realtitle in projecttitles:
				continue
			if title[2] is None:
				project_assess = "'%s':('%s',None)" % (abbrv, type)
			else:
				project_assess = "'%s':('%s','%s')" % (abbrv, type, title[2])
			projecttitles.add(realtitle)
			# titlelist is a module-level set shared across project runs
			if realtitle in titlelist:
				bits = (project_assess, realtitle)
				cursor.execute(updatequery, bits)
			else:
				titlelist.add(realtitle)
				bits = (realtitle, project_assess)
				cursor.execute(insertquery, bits)	
	del projecttitles
	db.close()
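
setupProject reads and updates a module-level titlelist set and assumes the monthly pop_<Mon><yy> table already exists. A minimal driver might look like the sketch below; the project names and abbreviations are purely illustrative:

titlelist = set()

for name, abbrv in [('Military history', 'MILHIST'), ('Medicine', 'MED')]:
	setupProject(name, abbrv)
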
Example #4
 def test_constructor_section_priority(self):
     p1 = page.Page(self.site,
                    "Page#Section 2",
                    sectionnumber=0,
                    section="Section 1")
     self.assertIs(p1.section, 0)
     p1 = page.Page(self.site, "Page#Section 2", section="Section 1")
     self.assertIs(p1.section, 1)
     p1 = page.Page(self.site, "Page#Section 2")
     self.assertIs(p1.section, 2)
Example #5
 def test_getWikiText(self):
     p1 = page.Page(self.site, "Page")
     p2 = page.Page(self.site, "Page#Section 1")
     api.logging = True
     p1.getWikiText()
     self.assertIs(len(api.querylog), 1)
     log = api.querylog.pop()
     self.assertNotIn("rvsection", log)
     p2.getWikiText()
     self.assertIs(len(api.querylog), 1)
     log = api.querylog.pop()
     self.assertIn("rvsection", log)
     self.assertNotEqual(p1.lastedittime, "")
Example #6
 def get_text(self, name):
     try:
         try:
             normalized_name = (name[0].upper() + name[1:]).replace(
                 ' ', '_')
             # SNP-style ids (i..., rs..., gs...) are special cases and
             # keep the simple first-letter capitalization from above
             if name[0] != 'i' and name[0:2] != 'rs' and name[0:2] != 'gs':
                 splits = name.split(' ')
                 splits[0] = splits[0].upper()
                 normalized_name = '_'.join(splits)
             print 'Using normalized name %s' % normalized_name
             pagehandle = page.Page(self.site, normalized_name, False,
                                    False)
             try:
                 return pagehandle.getWikiText(
                     getrequest=self.config.use_get_requests)
             except TypeError:
                 # Unpatched wikitools versions do not accept the
                 # getrequest keyword
                 return pagehandle.getWikiText()
         except NoPage:
             print 'ERROR : page %s is not found!' % normalized_name
             return None
     except Exception as error:
         print 'ERROR : wikitools exception while getting page %s!' % normalized_name
         print error
         return None
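
The two bare False arguments in the page.Page call above are positional; assuming the usual wikitools signature, they map to check and followRedir. Spelled out with keywords, the equivalent call reads (a sketch, not part of the original):

from wikitools import wiki, page

site = wiki.Wiki('https://en.wikipedia.org/w/api.php')
# same as page.Page(site, title, False, False): skip the existence check
# and do not follow redirects
p = page.Page(site, 'Sandbox', check=False, followRedir=False)
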
Example #7
def getcontent(title):
    site = wiki.Wiki("http://wiki.chinahpo.org/api.php?")
    pagehandle = page.Page(site, title)  #title is the name of each SNP
    snp_page = pagehandle.getWikiText()  #Wiki page parse
    #print snp_page.encode('u8')
    title = title.replace("/", "&")
    with open('./CHPO/%s' % title, 'w+') as f:  # write into file
        f.write(snp_page)
Example #8
    def snp_info(self, snp):
        """get all the data for a snp"""

        pagehandle = page.Page(self.site, snp)
        wikitext = pagehandle.getWikiText()

        return self.snp_info_from_wikitext(snp, wikitext)
Example #9
 def test_protect(self):
     self.site.login("GoodUsername", "goodpassword")
     p1 = page.Page(self.site, "Talk:Page")
     api.logging = True
     r = {"edit": "autoconfirmed", "move": "sysop"}
     e = {"edit": "1 week"}
     res = p1.protect(restrictions=r, expirations=e, reason="test_protect")
     log1 = api.querylog.pop()
     self.assertEqual(log1["action"], "query")
     self.assertEqual(log1["meta"], "tokens")
     log2 = api.querylog.pop()
     ebits = log2["expiry"].split("|")
     pbits = log2["protections"].split("|")
     self.assertIn("edit=autoconfirmed", pbits)
     self.assertIn("move=sysop", pbits)
     i = pbits.index("edit=autoconfirmed")
     self.assertEqual(ebits[i], "1 week")
     i = pbits.index("move=sysop")
     self.assertEqual(ebits[i], "indefinite")
     self.assertNotIn("cascade", log2)
     res = p1.protect(restrictions={"edit": "all", "move": "all"},
                      reason="test_protect")
     self.assertEqual(len(res["protect"]["protections"]), 2)
     for prot in res["protect"]["protections"]:
         if "edit" in prot:
             self.assertEqual(prot["edit"], "")
         else:
             self.assertEqual(prot["move"], "")
Example #10
 def test_getHistory_no_revs(self):
     p1 = page.Page(self.site, "Page")
     api.logging = True
     hist = p1.getHistory(content=False, user="******")
     self.assertIs(len(api.querylog), 1)
     self.assertIs(hist[0], None)
     self.assertIs(len(hist), 1)
Example #11
 def tryedit(self,
             pagetitle,
             oldtext,
             newtext,
             summary,
             watchlist='watch',
             minor=False):
     #print('in tryedit', pagetitle)
     #print('self.wiki', self.wiki)
     #print('self.username', self.username)
     if not self.wiki.isLoggedIn(self.username):
         print('not logged in. trying to login...')
         print("self.username, self.password", self.username, self.password)
         print('self.wiki', self.wiki)
         ok = self.wiki.login(self.username, self.password)
         if not ok:
             print('not logged in')
             return False, CANTLOGIN
     print('in tryedit 2', pagetitle)
     thepage = page.Page(self.wiki, pagetitle)
     wikitext = thepage.getWikiText()
     ts = thepage.lastedittime
     print('ts', ts)
     #minor = basic.fixes.minor
     #print('minor',minor)
     if oldtext == wikitext:
         editlemma = thepage.edit(text=newtext,
                                  summary=summary,
                                  basetimestamp=ts,
                                  watchlist=watchlist,
                                  minor=minor)
         return True, ''
         #print('editlemma',editlemma)
     print('oldtext >< wikitext')
     return False, PAGECHANGEDFROMOLD
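
tryedit returns an (ok, error_code) pair, where the code is one of the module-level constants referenced above (CANTLOGIN, PAGECHANGEDFROMOLD). A caller might use it like this hypothetical sketch (bot stands in for an instance of the surrounding class):

ok, err = bot.tryedit('Sandbox', oldtext, newtext, 'routine fix')
if not ok:
    print('edit failed:', err)
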
Example #12
 def test_getHistory(self):
     p1 = page.Page(self.site, "Page")
     api.logging = True
     p1.getHistory(content=False)
     self.assertIs(len(api.querylog), 1)
     log = api.querylog.pop()
     self.assertNotIn("content", log["rvprop"])
Example #13
def genotype_getter(my_filtered_snps):
    site = wiki.Wiki("http://snpedia.com/api.php")
    genotypes = {}

    for single_snp in my_filtered_snps:
        type_counter = 1
        wikipage = page.Page(site, single_snp.name)
        snp_page = wikipage.getWikiText()

        while snp_page.find("geno" + str(type_counter)) != -1:
            # extract the "(X;Y)" genotype that follows the genoN marker
            type_start = snp_page.find("geno" + str(type_counter))
            type_start = snp_page.find("(", type_start)
            type_stop = snp_page.find(")", type_start)
            genotypes.setdefault(single_snp.name, []).append(
                str(snp_page[type_start:type_stop + 1]))
            type_counter += 1

        print "Got genotypes for " + str(single_snp.name)
    with open("genotypes.data", "wb") as genotype_outfile:
        pickle.dump(genotypes, genotype_outfile)
    return genotypes
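
genotype_getter only relies on each element of my_filtered_snps exposing a .name attribute, so it can be exercised with a minimal stand-in; SnpStub and the rs numbers below are illustrative, not from the original code:

import collections

SnpStub = collections.namedtuple('SnpStub', ['name'])
genotypes = genotype_getter([SnpStub('Rs7412'), SnpStub('Rs53576')])
print genotypes
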
Example #14
def search_snpedia(snp):
    """
    http://snpedia.com/index.php/Bulk
    """
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    pagehandle = page.Page(site, snp)
    snp_page = pagehandle.getWikiText()
    return snp_page
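
A quick smoke test for search_snpedia; Rs7412 is used purely as an example title:

if __name__ == '__main__':
    text = search_snpedia('Rs7412')
    print text[:200]  # first 200 characters of the raw wikitext
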
Example #15
 def test_toggleTalk(self):
     p1 = page.Page(self.site, "Page")
     p2 = p1.toggleTalk()
     self.assertIs(p1.namespace, 0)
     self.assertGreater(p1.pageid, 0)
     self.assertEqual(p2.title, "Talk:Page")
     self.assertEqual(p2.unprefixedtitle, "Page")
     self.assertEqual(p2.urltitle, "Talk%3APage")
Example #16
 def test_getLogsGen(self):
     p1 = page.Page(self.site, "File:Test1.jpg")
     api.logging = True
     for log in p1.getLogsGen(logtype="upload", limit=5):
         pass
     self.assertGreater(len(api.querylog), 1)
     log = api.querylog.pop()
     self.assertNotIn("leuser", log)
Example #17
 def test_getHistoryGen(self):
     p1 = page.Page(self.site, "Page")
     api.logging = True
     for rev in p1.getHistoryGen():
         pass
     self.assertGreater(len(api.querylog), 1)
     log = api.querylog.pop()
     self.assertIn("content", log["rvprop"])
Example #18
 def test_constructor_separate_ns(self):
     p1 = page.Page(self.site, "page", namespace=1)
     self.assertEqual(p1.title, "Talk:Page")
     self.assertEqual(p1.unprefixedtitle, "Page")
     self.assertEqual(p1.urltitle, "Talk%3APage")
     self.assertTrue(p1.exists)
     self.assertIs(p1.namespace, 1)
     self.assertGreater(p1.pageid, 0)
Example #19
    def get_wikitext(self, snp):
        """get the wikitext for a snp"""
        pagehandle = page.Page(self.site, snp)

        try:
            wikitext = pagehandle.getWikiText()
            return {'snp': snp, 'wikitext': wikitext}
        except page.NoPage:
            return {'snp': snp, 'wikitext': ""}
Example #20
 def test_getLogs(self):
     p1 = page.Page(self.site, "File:Test1.jpg")
     api.logging = True
     log = p1.getLogs(logtype="upload", limit=10)
     self.assertIs(len(api.querylog), 1)
     log = api.querylog.pop()
     self.assertEqual(log["letitle"], "File:Test1.jpg")
     self.assertEqual(log["letype"], "upload")
     self.assertNotIn("leuser", log)
Example #21
 def test_constructor_check_false(self):
     api.logging = True
     p1 = page.Page(self.site, "talk:page", check=False)
     self.assertIs(p1.exists, None)
     self.assertIs(p1.pageid, 0)
     self.assertIs(p1.namespace, 1)
     self.assertEqual(p1.unprefixedtitle, "Page")
     self.assertTrue(p1.followRedir)
     self.assertIs(len(api.querylog), 0)
Example #22
 def test_constructor(self):
     api.logging = True
     p1 = page.Page(self.site, "talk:page")
     self.assertEqual(p1.title, "Talk:Page")
     self.assertEqual(p1.unprefixedtitle, "Page")
     self.assertEqual(p1.urltitle, "Talk%3APage")
     self.assertTrue(p1.exists)
     self.assertIs(p1.namespace, 1)
     self.assertGreater(p1.pageid, 0)
     self.assertIs(len(api.querylog), 1)
     log = api.querylog.pop()
     self.assertIn("redirects", log)
Example #23
 def test_delete(self):
     self.site.login("GoodUsername", "goodpassword")
     p1 = page.Page(self.site, "Page to delete")
     p1.edit(text="text")
     api.logging = True
     res = p1.delete(reason="test_delete")
     log1 = api.querylog.pop()
     self.assertEqual(log1["action"], "query")
     self.assertEqual(log1["meta"], "tokens")
     log2 = api.querylog.pop()
     self.assertIn("reason", log2)
     self.assertIn("watchlist", log2)
     self.assertIn("delete", res)
Example #24
 def test_setNamespace(self):
     p1 = page.Page(self.site, "Page")
     p1_id = p1.pageid
     api.logging = True
     p1.setNamespace(1)
     self.assertNotEqual(p1_id, p1.pageid)
     self.assertGreater(p1.pageid, 0)
     self.assertEqual(p1.title, "Talk:Page")
     self.assertEqual(p1.unprefixedtitle, "Page")
     self.assertEqual(p1.urltitle, "Talk%3APage")
     self.assertIs(len(api.querylog), 1)
     log = api.querylog.pop()
     self.assertIn("redirects", log)
Example #25
 def test_move(self):
     self.site.login("GoodUsername", "goodpassword")
     p1 = page.Page(self.site, "Anotherpage")
     api.logging = True
     res = p1.move(mvto="User:Anotherpage", reason="test_move")
     log1 = api.querylog.pop()
     self.assertEqual(log1["action"], "query")
     self.assertEqual(log1["meta"], "tokens")
     log2 = api.querylog.pop()
     self.assertIn("reason", log2)
     self.assertIn("watchlist", log2)
     self.assertEqual(p1.namespace, 2)
     self.assertEqual(p1.title, "User:Anotherpage")
     self.assertEqual(p1.unprefixedtitle, "Anotherpage")
     self.assertEqual(p1.urltitle, "User%3AAnotherpage")
     res = p1.move(mvto="Anotherpage", reason="test_move")
     self.assertIn("move", res)
Example #26
def crawl(folder):
    n = 0
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    for result in query({'cmtitle': 'Category:Is_a_genotype'}):
        for item in result.values()[0]:
            snp = item['title']
            if not (snp.startswith('I') or snp.startswith('R')):
                continue

            pagehandle = page.Page(site, snp)
            snp_page = pagehandle.getWikiText()
            with open(folder + '/' + snp + '.txt', 'w') as f:
                f.write(snp_page)

            print n, snp
            time.sleep(0.5)
            n += 1
Example #27
def crawl(folder, db_session):
    # n = 0
    # for file in os.listdir(folder):
    #   if not file.endswith('.txt'): continue
    #   with open(folder + '/' + file) as f:
    #     snp_name = file[:-4]
    #     wikicode = mwparserfromhell.parse(f.read())
    #     templates = wikicode.filter_templates(recursive=False)

    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    snp_name = "Rs7412(C;C)"
    pagehandle = page.Page(site, snp_name)
    snp_page = pagehandle.getWikiText()

    wikicode = mwparserfromhell.parse(snp_page)
    templates = wikicode.filter_templates(recursive=True)

    # get SNP

    for t in templates:
        tname = t.name.strip()

        if tname == 'Genotype':
            rs_id = _parse_entry(t, 'rsid')
            allele1 = _parse_entry(t, 'allele1')
            allele2 = _parse_entry(t, 'allele2')
            magnitude = _parse_entry(t, 'magnitude', out_type=float)
            repute = _parse_entry(t, 'repute')
            summary = _normalize_str(_parse_entry(t, 'summary'))
            genotype = allele1 + allele2

            # get snp
            snp = db_session.query(SNP).filter(SNP.rs_id == rs_id).first()
            if not snp:
                snp = SNP(rs_id=rs_id)

            # create association
            db_session.add(
                Association(snp=snp,
                            genotype=genotype,
                            magnitude=magnitude,
                            repute=repute,
                            description=summary,
                            source='snpedia'))

    db_session.commit()
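
_parse_entry and _normalize_str are not shown in this example. A plausible sketch of what they might do, inferred from how they are called above (hypothetical helpers, not the original code):

def _parse_entry(template, key, out_type=str):
    # Return the named template parameter coerced to out_type, or None.
    if template.has(key):
        value = str(template.get(key).value).strip()
        if value:
            try:
                return out_type(value)
            except ValueError:
                return None
    return None

def _normalize_str(s):
    # Collapse internal whitespace and trim the ends.
    return ' '.join(s.split()) if s else s
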
Example #28
def description_getter(genotypes):
    site = wiki.Wiki("http://snpedia.com/api.php")
    genotype_descriptions = {}
    for single_type in genotypes:
        for variant in genotypes[single_type]:
            genotype_name = str(single_type) + str(variant)
            print genotype_name
            wikipage = page.Page(site, genotype_name)
            if wikipage.exists:
                genotype_page = wikipage.getWikiText()
                if genotype_page.find("summary=") != -1:
                    summary_start = genotype_page.find("summary=") + 8
                    summary_stop = genotype_page.find("\n", summary_start)
                    print genotype_page[summary_start:summary_stop]
                    genotype_descriptions[genotype_name] = genotype_page[
                        summary_start:summary_stop]
    with open("description.data", "wb") as description_outfile:
        pickle.dump(genotype_descriptions, description_outfile)
    return genotype_descriptions
Example #29
def fill_article_table(num_art_want, multi):
    """
    Write content to file and save paths etc to the database.
    When multi is true we're building the dataset for the multi-class
    classification task. Binary task otherwise.
    """

    site = wiki.Wiki(u"https://en.wikipedia.org/w/api.php")
    MIN_PID = 10  # Page ids start at 10
    MAX_PID = 50401510  # Set this to whatever within range
    NS = 0  # Main article namespace. https://goo.gl/Sa3yBC

    # Get num_art_want number of random articles
    count = 0
    art_dict = {}

    while count < num_art_want:
        rand_pid = random.randint(MIN_PID, MAX_PID)
        # keys are stored as strings below, so compare the string form
        if str(rand_pid) not in art_dict:
            try:
                p = page.Page(site=site, namespace=NS, pageid=rand_pid)

                # Ensure it's not a page from a different namespace.
                # This shouldn't happen according to the API
                # documentation, but it does in practice.
                p.getCategories()
                # Overwrite the page object's list of templates with just
                # the infobox templates; hacky, but convenient here.
                p.templates = getInfoboxes(p.getTemplates())

                if isWanted(p, multi=multi):
                    count += 1
                    print "dl:", p.title
                    rand_pid = unicode(str(rand_pid), 'utf-8')
                    data = save_all_article_data(p, rand_pid)

                    art_dict[rand_pid] = data

            except wiki.WikiError:
                pass
    populate_tables(art_dict)
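
getInfoboxes, isWanted, save_all_article_data and populate_tables are project-specific helpers that are not shown here. For getInfoboxes, a plausible sketch, assuming wikitools' getTemplates returns prefixed template titles (hypothetical, not the original code):

def getInfoboxes(templates):
    # Keep only the titles of infobox templates.
    return [t for t in templates if t.startswith('Template:Infobox')]
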
Example #30
    def run(self):
        cat = category.Category(self.wiki, self.categoryname)
        self.overviewpage = page.Page(self.wiki, u"VEIDs")

        self.veidlist = {}
        for article in cat.getAllMembersGen(namespaces=[0]):
            self.collect_page_detail(article)

        try:
            oldtext = self.overviewpage.getWikiText()
        except page.NoPage:
            oldtext = ""

        newtext = self.build_new_overviewpage_text()
        # only save if something was changed
        if newtext == oldtext: return

        self.overviewpage.edit(text=newtext,
                               skipmd5=True,
                               bot=True,
                               summary=u"Regenerated list.")