def runTests(fileName): redirects = {} articleTitles = {} testCount = 0 failedCount = 0 for test in iterTests(fileName): orig = test.orig expected = test.expected converted = articleconvert.convertArticle(test.name, orig) expected = arsutils.normalizeNewlines(expected) converted = arsutils.normalizeNewlines(converted) if converted != expected: failedCount += 1 test.setConverted(converted) failedList.append(test) sys.stdout.write("-") else: sys.stdout.write(".") noLinks = articleconvert.removeInvalidLinks(converted, redirects, articleTitles) testCount += 1 print print "Total tests: %d" % testCount print "Failed tests: %d" % failedCount dumpFailed() diffFirstFailed()
def runTests(fileName):
    """Run the conversion regression tests found in fileName.

    Each test's original text is converted and compared, after newline
    normalization, with the expected text.  Failing tests are appended to
    the module-level failedList; a summary is printed and dumpFailed()/
    diffFirstFailed() report the details.
    """
    # Deliberately empty: removeInvalidLinks below is only exercised for
    # crash-safety, so no real redirect/title data is needed here.
    redirects = {}
    articleTitles = {}
    testCount = 0
    failedCount = 0
    for test in iterTests(fileName):
        orig = test.orig
        expected = test.expected
        converted = articleconvert.convertArticle(test.name, orig)
        # Normalize both sides so CRLF/LF differences don't count as failures.
        expected = arsutils.normalizeNewlines(expected)
        converted = arsutils.normalizeNewlines(converted)
        if converted != expected:
            failedCount += 1
            test.setConverted(converted)
            # failedList is a module-level accumulator read by dumpFailed()
            # and diffFirstFailed().
            failedList.append(test)
            sys.stdout.write("-")  # progress marker: failure
        else:
            sys.stdout.write(".")  # progress marker: success
        # Result is discarded; presumably this is a smoke test that link
        # removal doesn't raise — TODO confirm.
        noLinks = articleconvert.removeInvalidLinks(converted, redirects, articleTitles)
        testCount += 1
    print
    print "Total tests: %d" % testCount
    print "Failed tests: %d" % failedCount
    dumpFailed()
    diffFirstFailed()
def convertArticles(sqlDump, articleLimit):
    """Convert a Wikipedia SQL dump into the ipedia database in two passes.

    Pass 1 iterates the dump to collect the set of real article titles and
    the redirect map, then resolves redirects (unresolved ones are logged
    via the UnresolvedRedirectWriter).  Pass 2 re-iterates the dump (using
    the on-disk cache built in pass 1), converts each article body, strips
    links that point outside the collected title set, writes the result to
    both the MySQL database and a ConvertedArticleCacheWriter cache, and
    finally dumps a size histogram to a stats file.

    sqlDump      -- path to the Wikipedia SQL dump file
    articleLimit -- stop after this many articles if non-zero/true
    """
    count = 0
    redirects = {}
    articleTitles = {}
    # Flip fTesting to force a cache rebuild on a small run.
    fTesting = False
    if fTesting:
        fUseCache = False
        fRecreateCache = True
    else:
        fUseCache = True
        fRecreateCache = False
    # Pass 1: gather titles and redirects.
    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit, fUseCache, fRecreateCache):
        # we only convert article from the main namespace
        assert article.getNamespace() == wikipediasql.NS_MAIN
        title = article.getTitle()
        if article.fRedirect():
            redirects[title] = article.getRedirect()
        else:
            txt = article.getText()
            #links = articleconvert.articleExtractLinks(txt)
            #articleTitles[title] = links
            # Only membership matters, so store a cheap sentinel value.
            articleTitles[title] = 1
        count += 1
        if 0 == count % 1000:
            # Periodic progress report on stderr so stdout stays clean.
            sys.stderr.write("processed %d rows, last title=%s\n" % (count, title.strip()))
        if articleLimit and count >= articleLimit:
            break
    # verify redirects
    print "Number of real articles: %d" % len(articleTitles)
    print "Number of all redirects: %d (%d in total)" % (len(redirects), len(articleTitles) + len(redirects))
    unresolvedCount = 0
    setUnresolvedRedirectWriter(sqlDump)
    redirectsExisting = {}
    for (title, redirect) in redirects.items():
        # Follows redirect chains; returns None when the chain never
        # reaches a real article.
        redirectResolved = resolveRedirect(title, redirect, redirects, articleTitles)
        if None == redirectResolved:
            unresolvedCount += 1
            #print "redirect '%s' (to '%s') not resolved" % (title,redirect)
        else:
            redirectsExisting[title] = redirectResolved
    closeUnresolvedRedirectWriter()
    print "Number of unresolved redirects: %d" % unresolvedCount
    dbName = getDbNameFromFileName(sqlDump)
    ipedia_write_cur = getNamedCursor(getIpediaConnection(dbName), "ipedia_write_cur")
    # go over articles again (hopefully now using the cache),
    # convert them to a destination format (including removing invalid links)
    # and insert into a database
    sizeStats = {}
    count = 0
    convWriter = wikipediasql.ConvertedArticleCacheWriter(sqlDump)
    convWriter.open()
    # Pass 2: convert and persist.
    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit, True, False):
        title = article.getTitle()
        articleSize = 0  # 0 is for redirects, which we don't log
        if article.fRedirect():
            convertedArticle = ConvertedArticleRedirect(article.getNamespace(), title, article.getRedirect())
        else:
            txt = article.getText()
            converted = articleconvert.convertArticle(title, txt)
            try:
                noLinks = articleconvert.removeInvalidLinks(converted, redirects, articleTitles)
            except:
                # Dump full context before re-raising so the offending
                # article can be turned into a regression test.
                print "exception in articleconvert.removeInvalidLinks"
                print "title: _%s_" % title
                print "txt:\n_%s_" % txt
                print "converted:\n_%s_" % converted
                raise
            if noLinks:
                converted = noLinks
            convertedArticle = ConvertedArticle(article.getNamespace(), article.getTitle(), converted)
            articleSize = len(converted)
        if article.fRedirect():
            # Only redirects that resolved in pass 1 are written out.
            if redirectsExisting.has_key(title):
                redirect = redirectsExisting[title]
                try:
                    # Dump uses underscores; the DB stores spaces.
                    title = title.replace("_", " ")
                    redirect = redirect.replace("_", " ")
                    # NOTE(review): SQL built by string formatting; relies
                    # entirely on dbEscape for quoting — confirm it is safe
                    # for this driver, or switch to parameterized queries.
                    ipedia_write_cur.execute("""INSERT INTO redirects (title, redirect) VALUES ('%s', '%s')""" % (dbEscape(title), dbEscape(redirect)))
                except:
                    # Bare except used as duplicate-key detection — assumes
                    # the only failure mode is a dup; TODO confirm.
                    print "DUP REDERICT '%s' => '%s'" % (title, redirect)
        else:
            title = title.replace("_", " ")
            if g_fVerbose:
                log_txt = "title: %s " % title
            try:
                ipedia_write_cur.execute("""INSERT INTO articles (title, body) VALUES ('%s', '%s')""" % (dbEscape(title), dbEscape(converted)))
                if g_fVerbose:
                    log_txt += "*New record"
            except:
                # assuming that the exception happend because of trying to insert
                # item with a duplicate title (duplication due to lower-case
                # conversion might convert 2 differnt titles into the same,
                # lower-cased title)
                if g_fShowDups:
                    print "dup: " + title
                if g_fVerbose:
                    log_txt += "Update existing record"
                print "DUP ARTICLE: '%s'" % title
                ipedia_write_cur.execute("""UPDATE articles SET body='%s' WHERE title='%s'""" % (dbEscape(converted), dbEscape(title)))
            if g_fVerbose:
                print log_txt
        convWriter.write(convertedArticle)
        if articleSize != 0:
            # Histogram: body size -> number of articles of that size.
            if not sizeStats.has_key(articleSize):
                sizeStats[articleSize] = 1
            else:
                sizeStats[articleSize] = sizeStats[articleSize] + 1
        count += 1
        if count % 1000 == 0:
            sys.stderr.write("phase 2 processed %d, last title=%s\n" % (count, article.getTitle()))
    convWriter.close()
    # dump size stats to a file
    statsFileName = wikipediasql.getSizeStatsFileName(sqlDump)
    statsFo = open(statsFileName, "wb")
    sizes = sizeStats.keys()
    sizes.sort()
    for size in sizes:
        count = sizeStats[size]
        statsFo.write("%d\t\t%d\n" % (size, count))
    statsFo.close()
def convertArticles(sqlDump, articleLimit):
    """Two-pass conversion of a Wikipedia SQL dump into the ipedia database.

    Pass 1 collects real article titles and the redirect map from the dump,
    then resolves redirect chains (unresolved ones go to the
    UnresolvedRedirectWriter).  Pass 2 re-reads the dump via its cache,
    converts each body, removes links to unknown titles, stores results in
    MySQL and in a ConvertedArticleCacheWriter cache, and writes a size
    histogram to a stats file.

    sqlDump      -- path to the Wikipedia SQL dump file
    articleLimit -- stop after this many articles if non-zero/true
    """
    count = 0
    redirects = {}
    articleTitles = {}
    # Flip fTesting to force the cache to be rebuilt for a test run.
    fTesting = False
    if fTesting:
        fUseCache = False
        fRecreateCache = True
    else:
        fUseCache = True
        fRecreateCache = False
    # Pass 1: collect titles and redirects.
    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit, fUseCache, fRecreateCache):
        # we only convert article from the main namespace
        assert article.getNamespace() == wikipediasql.NS_MAIN
        title = article.getTitle()
        if article.fRedirect():
            redirects[title] = article.getRedirect()
        else:
            txt = article.getText()
            #links = articleconvert.articleExtractLinks(txt)
            #articleTitles[title] = links
            # Only membership is needed, so a cheap sentinel is stored.
            articleTitles[title] = 1
        count += 1
        if 0 == count % 1000:
            # Progress goes to stderr so stdout remains a clean report.
            sys.stderr.write("processed %d rows, last title=%s\n" % (count, title.strip()))
        if articleLimit and count >= articleLimit:
            break
    # verify redirects
    print "Number of real articles: %d" % len(articleTitles)
    print "Number of all redirects: %d (%d in total)" % (
        len(redirects), len(articleTitles) + len(redirects))
    unresolvedCount = 0
    setUnresolvedRedirectWriter(sqlDump)
    redirectsExisting = {}
    for (title, redirect) in redirects.items():
        # Follows redirect chains; None means the chain never reaches a
        # real article.
        redirectResolved = resolveRedirect(title, redirect, redirects, articleTitles)
        if None == redirectResolved:
            unresolvedCount += 1
            #print "redirect '%s' (to '%s') not resolved" % (title,redirect)
        else:
            redirectsExisting[title] = redirectResolved
    closeUnresolvedRedirectWriter()
    print "Number of unresolved redirects: %d" % unresolvedCount
    dbName = getDbNameFromFileName(sqlDump)
    ipedia_write_cur = getNamedCursor(getIpediaConnection(dbName), "ipedia_write_cur")
    # go over articles again (hopefully now using the cache),
    # convert them to a destination format (including removing invalid links)
    # and insert into a database
    sizeStats = {}
    count = 0
    convWriter = wikipediasql.ConvertedArticleCacheWriter(sqlDump)
    convWriter.open()
    # Pass 2: convert and persist.
    for article in wikipediasql.iterWikipediaArticles(sqlDump, articleLimit, True, False):
        title = article.getTitle()
        articleSize = 0  # 0 is for redirects, which we don't log
        if article.fRedirect():
            convertedArticle = ConvertedArticleRedirect(
                article.getNamespace(), title, article.getRedirect())
        else:
            txt = article.getText()
            converted = articleconvert.convertArticle(title, txt)
            try:
                noLinks = articleconvert.removeInvalidLinks(
                    converted, redirects, articleTitles)
            except:
                # Dump full context before re-raising so the failing
                # article can become a regression test.
                print "exception in articleconvert.removeInvalidLinks"
                print "title: _%s_" % title
                print "txt:\n_%s_" % txt
                print "converted:\n_%s_" % converted
                raise
            if noLinks:
                converted = noLinks
            convertedArticle = ConvertedArticle(article.getNamespace(),
                                                article.getTitle(), converted)
            articleSize = len(converted)
        if article.fRedirect():
            # Only redirects that resolved in pass 1 are persisted.
            if redirectsExisting.has_key(title):
                redirect = redirectsExisting[title]
                try:
                    # The dump uses underscores; the DB stores spaces.
                    title = title.replace("_", " ")
                    redirect = redirect.replace("_", " ")
                    # NOTE(review): SQL assembled via string formatting;
                    # quoting depends entirely on dbEscape — verify it, or
                    # move to parameterized queries.
                    ipedia_write_cur.execute(
                        """INSERT INTO redirects (title, redirect) VALUES ('%s', '%s')""" % (dbEscape(title), dbEscape(redirect)))
                except:
                    # Bare except serves as duplicate-key detection —
                    # assumes a dup is the only failure mode; TODO confirm.
                    print "DUP REDERICT '%s' => '%s'" % (title, redirect)
        else:
            title = title.replace("_", " ")
            if g_fVerbose:
                log_txt = "title: %s " % title
            try:
                ipedia_write_cur.execute(
                    """INSERT INTO articles (title, body) VALUES ('%s', '%s')""" % (dbEscape(title), dbEscape(converted)))
                if g_fVerbose:
                    log_txt += "*New record"
            except:
                # assuming that the exception happend because of trying to insert
                # item with a duplicate title (duplication due to lower-case
                # conversion might convert 2 differnt titles into the same,
                # lower-cased title)
                if g_fShowDups:
                    print "dup: " + title
                if g_fVerbose:
                    log_txt += "Update existing record"
                print "DUP ARTICLE: '%s'" % title
                ipedia_write_cur.execute(
                    """UPDATE articles SET body='%s' WHERE title='%s'""" % (dbEscape(converted), dbEscape(title)))
            if g_fVerbose:
                print log_txt
        convWriter.write(convertedArticle)
        if articleSize != 0:
            # Histogram: body size -> count of articles of that size.
            if not sizeStats.has_key(articleSize):
                sizeStats[articleSize] = 1
            else:
                sizeStats[articleSize] = sizeStats[articleSize] + 1
        count += 1
        if count % 1000 == 0:
            sys.stderr.write("phase 2 processed %d, last title=%s\n" %
                             (count, article.getTitle()))
    convWriter.close()
    # dump size stats to a file
    statsFileName = wikipediasql.getSizeStatsFileName(sqlDump)
    statsFo = open(statsFileName, "wb")
    sizes = sizeStats.keys()
    sizes.sort()
    for size in sizes:
        count = sizeStats[size]
        statsFo.write("%d\t\t%d\n" % (size, count))
    statsFo.close()