def getSelected(self):
    # Get selected Wikipedia articles
    inHandle1 = open(r'/home/lhy/ESA/wikiprep-esa/selected.txt')
    lines = inHandle1.readlines()
    inHandle1.close()
    inHandle2 = open(r'/home/lhy/ESA/text/20051105_pages_articles.hgw.xml')

    selectedID = set([])
    selectedArticles = []
    self.linkNum = []
    self.idList = []
    self.inLinkDict = {}

    # Read the ids of the selected articles and initialize their in-link counts.
    for line in lines:
        selectedID.add(int(line.strip('\n')))
    for _id in selectedID:
        self.inLinkDict[_id] = 0

    for doc in xmlwikiprep.read(inHandle2):
        page_id = int(doc["_id"])

        # Count links from any page that point to a selected article.
        for link in doc["links"]:
            linkID = int(link)
            if linkID in selectedID:
                self.inLinkDict[linkID] += 1

        if page_id not in selectedID:
            continue

        self.idList.append(page_id)

        # Strip markup, lower-case the title and body, then merge them into one text.
        title = html.fromstring(doc["title"]).text_content().lower()
        text = html.fromstring(doc["text"]).text_content().lower()
        selectedArticles.append(' '.join([title, text]))
        #self.linkNum.append(len(doc["links"]))

    inHandle2.close()

    self.text = selectedArticles
    words = self.getWordSim()
    #counter = wordCounter.WordCounter(words, self.text)
    #counter.tfidf(20)
    print "SelectedArticles ok"
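getSelected strips markup with lxml.html before the title and body are merged. A minimal standalone sketch of just that step, assuming lxml is installed (the sample string is hypothetical):

from lxml import html

raw = '<b>Anarchism</b> is a <a href="/wiki/Political_philosophy">political philosophy</a>.'
plain = html.fromstring(raw).text_content().lower()
# plain == 'anarchism is a political philosophy.'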
""", linkBuffer) linkBuffer = [] linkBuflen = 0 return args = sys.argv[1:] # scanData.py <hgw_file> if len(args) < 1: sys.exit() f = open(args[0], 'r') for doc in xmlwikiprep.read(f): recordArticle(doc) f.close() if nsBuflen > 0: cursor.executemany( """ INSERT INTO namespace (id) VALUES (%s) """, nsBuffer) nsBuffer = [] nsBuflen = 0 if linkBuflen > 0: cursor.executemany(
cursor.executemany("""INSERT INTO pagelinks (source_id,target_id) VALUES (%s,%s)""", linkBuffer) linkBuffer = [] return if __name__ == '__main__': if len(sys.argv) < 2: print "scanLinks.py file1.gz file2.gz ... > links.txt" sys.exit(1) for fname in sys.argv[1:]: print >>sys.stderr, " -> Processing file", fname #f = Popen(['zcat', fname], stdout=PIPE) # much faster than python gzip f = Popen(['pigz', '-d', '-c', fname], stdout=PIPE) # even faster for doc in xmlwikiprep.read(f.stdout, set(['text'])): recordArticle(doc) if nsBuffer: cursor.executemany(""" INSERT INTO namespace (id) VALUES (%s) """, nsBuffer) if linkBuffer: cursor.executemany(""" INSERT INTO pagelinks (source_id,target_id) VALUES (%s,%s) """, linkBuffer) print >>sys.stderr, "Some db mangling.."
    cs = []
    # Record curId as a child of each of its parent categories.
    for c in cats:
        if c in catDict:
            catDict[c].add(curId)
        else:
            catDict[c] = set([curId])

    return


# scanCatHier.py <hgw/gum.xml> --stopcats=<category list file>
f = open(args[0], 'r')
for doc in xmlwikiprep.read(f):
    recordArticle(doc)
f.close()

print 'cat_hier output complete'
print 'traversing category tree..'

# Walk the category tree down from the stop categories.
cats = set(STOP_CATS)
outcats = set(STOP_CATS)
while cats:
    parent = cats.pop()
    childs = []
    if parent in catDict:
            INSERT INTO article (id,title)
            VALUES (%s,%s)
            """, articleBuffer)
        cursor.executemany("""
            INSERT INTO text (old_id,old_text)
            VALUES (%s,%s)
            """, textBuffer)
        articleBuffer = []
        textBuffer = []


for fname in args:
    print >>sys.stderr, " -> Processing file", fname
    #f = Popen(['zcat', fname], stdout=PIPE)             # much faster than python gzip
    f = Popen(['pigz', '-d', '-c', fname], stdout=PIPE)  # even faster
    for doc in xmlwikiprep.read(f.stdout):
        recordArticle(doc)

# f = open(hgwpath, 'r')
# for doc in xmlwikiprep.read(f):
#     recordArticle(doc)
# f.close()

if articleBuffer:
    cursor.executemany("""
        INSERT INTO article (id,title)
        VALUES (%s,%s)
        """, articleBuffer)
    cursor.executemany("""
        INSERT INTO text (old_id,old_text)
        VALUES (%s,%s)