from Frontier import Frontier
from PageRanker import PageRanker
from Indexer import Indexer
from Searcher import Searcher
import re

# NOTE: all output below uses only the single-argument ``print(...)`` form.
# That spelling parses both as a Python 2 print statement and as a Python 3
# function call, so the helpers behave the same on either interpreter without
# a ``from __future__`` import (which would invalidate print statements
# elsewhere in the file).

frontier = Frontier()
pageRanker = PageRanker()
indexer = Indexer()

# Start URLs; presumably the crawl seeds handed to the frontier -- confirm
# against the code that consumes this list.
seedDocuments = [
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
]


def printWebGraph(webGraph):
    """Print every entry of ``webGraph`` as ``key -> v1, v2, ...``.

    Entries are printed in sorted key order under a banner line.

    webGraph -- mapping from a string key to an iterable of strings (the
                values are joined with ', ', so they must be strings).
    """
    print('')
    print('-*( Web Graph )*-')
    print('')
    for entry in sorted(webGraph):
        print(entry + ' -> ' + ', '.join(webGraph[entry]))


def printIndex(index):
    """Print every entry of ``index`` as ``(term, df:N) -> occurrences``.

    Entries are printed in sorted order under a banner line.

    index -- mapping whose keys are indexable pairs: element 0 is a string
             and element 1 is printed after the label 'df:' (presumably the
             document frequency). The value is printed via ``str()`` with
             the quote characters removed.
    """
    # Strips the quotes (and Python 2 ``u`` prefixes) that ``str()`` puts
    # around string elements of the value.
    quoteMarkup = re.compile('(u)?\'')

    print('')
    print('-*( Indices )*-')
    print('')
    # ``items()`` exists on both Python 2 and 3 dicts; keys are unique, so the
    # sort order is decided by the keys alone.
    for term, occurrences in sorted(index.items()):
        label = '(' + term[0] + ', df:' + str(term[1]) + ') ->'
        # The original emitted the label with a trailing-comma print, which
        # inserts exactly one space before the next item on the same line.
        print(label + ' ' + quoteMarkup.sub('', str(occurrences)))
from Frontier import Frontier from PageRanker import PageRanker from Indexer import Indexer from Searcher import Searcher import re frontier = Frontier() pageRanker = PageRanker() indexer = Indexer() seedDocuments = [ 'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html', 'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html', 'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html' ] def printWebGraph(webGraph): print print '-*( Web Graph )*-' print for entry in sorted(webGraph.keys()): print entry + ' -> ' + ', '.join(webGraph[entry]) def printIndex(index): print print '-*( Indices )*-' print for term, occurences in sorted(index.iteritems()): print '(' + term[0] + ', df:' + str(term[1]) + ') ->',
value = pairSplit[1].strip() query[ field] = value quotes = {} quotedBy = {} similars = {} acordaos = {} def mergeDictsSets( h1, h2): for k in h2: if k in h1: h1[k] = h1[k].union( h2[k]) return h1 try: graphMaker = GraphMaker( dbName, collectionInName, collectionOutName) pageRanker = PageRanker() tini = t1 = datetime.now() [acordaos, quotes, quotedBy, similars] = graphMaker.buildDicts( query) with open('graphPageRankingLog', 'a') as f: f.write( "build dicts time %d\n" % (datetime.now() - t1).seconds) #pageRanks = pageRanker.calculatePageRanks( acordaos, quotes, quotedBy, pageRankMode) t1 = datetime.now() [quotes, quotedBy] = graphMaker.removeInvalidAcordaosFromDicts( acordaos, quotes, quotedBy) with open('graphPageRankingLog', 'a') as f: f.write("remove invalid acordaos from dicts %d\n" % (datetime.now() - t1).seconds) t1 = datetime.now() quotesPlusSimilars = mergeDictsSets( quotes, similars) quotedByPlusSimilars = mergeDictsSets( quotedBy, similars) with open('graphPageRankingLog', 'a') as f: f.write("merge quotes with similars %d\n" % (datetime.now() - t1).seconds) t1 = datetime.now()