import random

from crawler import config as conf

# Crawler and _load_adjlist are assumed to be provided elsewhere in this
# module; their definitions are not shown here.


def main():
    csv_file = conf.get('output')
    times = int(conf.get('no_iterations'))
    no_epochs = int(conf.get('no_epochs'))
    graph = _load_adjlist(csv_file)
    # every author starts with the baseline score 1 - d
    x = {auth: 1 - Crawler.p_move for auth in graph.keys()}
    for _ in xrange(times):
        crawlers = []
        # run epochs
        for _ in xrange(no_epochs):
            # generate 1 - d crawlers
            crawlers.extend(
                [Crawler(graph, pos) for pos in graph.keys()
                 if random.random() > Crawler.p_move])
            # advance every crawler one step, then credit the vertices
            # that still host an active crawler
            for crawler in crawlers:
                crawler.next()
            for crawler in crawlers:
                if crawler.active:
                    x[crawler.pos] += 1
    # compute the average and normalize
    x = {auth: float(rank) / times / len(graph)
         for auth, rank in x.iteritems()}
    ranking = x.keys()
    ranking.sort(key=x.get, reverse=True)
    fname = 'stochastic.out'
    with open(fname, 'w') as f:
        for auth in ranking:
            f.write(str(auth) + ': ' + str(x[auth]) + '\n')
    print
    print 'You can find the complete crawling output in {}'.format(fname)
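# For reference, a minimal sketch of the interface that main() above assumes
# from Crawler: a class attribute p_move (the damping factor d), a next()
# step, and the active/pos attributes. This is an illustrative assumption,
# not the project's implementation, and the p_move value is a placeholder.
class CrawlerSketch(object):
    p_move = 0.85  # assumed damping factor; the real value may differ

    def __init__(self, graph, pos):
        self.graph = graph    # adjacency list: author -> list of co-authors
        self.pos = pos        # current vertex
        self.active = True    # False once the crawler has stopped

    def next(self):
        # follow a random outgoing edge with probability p_move,
        # otherwise terminate (the 1 - d stopping step)
        neighbours = self.graph.get(self.pos)
        if self.active and neighbours and random.random() < self.p_move:
            self.pos = random.choice(neighbours)
        else:
            self.active = False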
import urllib2

from lxml import html

from crawler import config as conf

_OUTPUT_FILE = conf.get("output")
_BASE_URL = conf.get("base_url")
_START_URL = conf.get("start_author_url")
_START_LABEL = conf.get("start_author_label")
_NO_VERTICES = int(conf.get("no_vertices"))
_YEAR = conf.get("year")

# For publications from _YEAR on, select every co-author link: all anchors
# after the first one (the author the page belongs to) in the authors cell.
_base_query = (
    "//tr[td[2][text() >= " + _YEAR +
    ']]/td[3]//*[a[1][text() = "{}"]]'
    '/a[position()>1][@class = "authority author"]/'
)
_labels_query = _base_query + "text()"
_hrefs_query = _base_query + "@href"


def main():
    # adjacency matrix of the co-authorship graph, grown one vertex at a time
    adjmatrix = [[0]]
    labels = {_START_LABEL: 0}         # author label -> vertex index
    urls = {_START_LABEL: _START_URL}  # author label -> page URL
    explored = set()
    last_explored_count = -1

    def update_labels(label):
        # assign the next free vertex index to a newly seen author
        new_vertex = max(labels.values()) + 1
        labels[label] = new_vertex

    def update_matrix():
        # grow the matrix by one vertex: a zero column on every existing
        # row, then a fresh all-zero row
        for row in adjmatrix:
            row.append(0)
        adjmatrix.append([0] * len(adjmatrix[0]))
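    # A hedged sketch of how the two queries above would be applied while
    # exploring an author's page; this is not the module's actual crawl
    # loop (which continues below). It assumes urls[label] is an absolute
    # URL and that the name and href result lists line up pairwise.
    def fetch_coauthors(label):
        tree = html.fromstring(urllib2.urlopen(urls[label]).read())
        names = tree.xpath(_labels_query.format(label))
        hrefs = tree.xpath(_hrefs_query.format(label))
        return zip(names, hrefs)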