Example #1
File: crawl.py Project: affo/sand
import random

from crawler import config as conf  # same config module as in Example #2

# Crawler and _load_adjlist are defined elsewhere in the project (not shown here)


def main():
    csv_file = conf.get('output')
    times = int(conf.get('no_iterations'))
    no_epochs = int(conf.get('no_epochs'))

    # load the adjacency list (author -> neighbors) from the CSV file
    graph = _load_adjlist(csv_file)

    # seed each author's score with the spawn probability, 1 - p_move
    x = {auth: 1 - Crawler.p_move for auth in graph.keys()}
    for _ in xrange(times):
        crawlers = []

        # run epochs
        for _ in xrange(no_epochs):
            # spawn a new crawler at each vertex with probability 1 - p_move
            crawlers.extend(
                [Crawler(graph, pos) for pos in graph.keys()
                    if random.random() > Crawler.p_move]
            )
            # advance every crawler (both old and newly spawned) one step
            for crawler in crawlers:
                crawler.next()

        # tally the positions of crawlers that are still active
        for crawler in crawlers:
            if crawler.active:
                x[crawler.pos] += 1

    # compute the average and normalize
    x = {auth: float(rank) / times / len(graph)
            for auth, rank in x.iteritems()}

    ranking = sorted(x, key=x.get, reverse=True)

    fname = 'stochastic.out'
    with open(fname, 'w') as f:
        for auth in ranking:
            f.write(str(auth) + ': ' + str(x[auth]) + '\n')

    print
    print 'You can find the complete crawling output in {}'.format(fname)
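
The snippet treats Crawler as a black box: it is built from the graph and a starting vertex, exposes a class-level move probability p_move, advances one step with next(), and reports pos and active. Below is a minimal sketch of such a class, assuming a random walk that follows a random neighbor with probability p_move and deactivates otherwise; the actual implementation in affo/sand may differ.

import random


class Crawler(object):
    p_move = 0.85  # assumed value; the snippet only shows that the attribute exists

    def __init__(self, graph, pos):
        self.graph = graph
        self.pos = pos
        self.active = True

    def next(self):
        if not self.active:
            return
        neighbors = self.graph.get(self.pos, [])
        if neighbors and random.random() < Crawler.p_move:
            # follow a random outgoing edge
            self.pos = random.choice(neighbors)
        else:
            # dead end, or the walk stops with probability 1 - p_move
            self.active = False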
Example #2
import urllib2
from lxml import html
from crawler import config as conf

_OUTPUT_FILE = conf.get("output")
_BASE_URL = conf.get("base_url")
_START_URL = conf.get("start_author_url")
_START_LABEL = conf.get("start_author_label")
_NO_VERTICES = int(conf.get("no_vertices"))
_YEAR = conf.get("year")

# XPath template: in each table row whose 2nd cell (the year) is >= _YEAR,
# take the 3rd cell; inside the element whose first <a> carries the given
# author label, select every following <a class="authority author">
_base_query = (
    "//tr[td[2][text() >= " + _YEAR + "]]"
    '/td[3]//*[a[1][text() = "{}"]]'
    '/a[position()>1][@class = "authority author"]/'
)
_labels_query = _base_query + "text()"
_hrefs_query = _base_query + "@href"
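

# The two queries above are meant to be formatted with an author label and
# run against a fetched page. A minimal sketch of that step follows; the
# helper name is hypothetical, the real crawler's fetching and error handling
# are not shown, and prefixing relative URLs with _BASE_URL is an assumption.
def _fetch_coauthors(url, author_label):
    # download the author's page and parse it into an element tree
    page = html.fromstring(urllib2.urlopen(_BASE_URL + url).read())
    # substitute the label into the prepared XPath templates
    labels = page.xpath(_labels_query.format(author_label))
    hrefs = page.xpath(_hrefs_query.format(author_label))
    # pair each co-author label with the link to their page
    return zip(labels, hrefs)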


def main():
    # adjacency matrix of the co-authorship graph, starting as 1x1 for the
    # seed author; it grows as new authors are discovered
    adjmatrix = [[0]]
    labels = {_START_LABEL: 0}  # author label -> vertex index
    urls = {_START_LABEL: _START_URL}  # author label -> page URL
    explored = set()
    last_explored_count = -1

    def update_labels(label):
        # assign the next free vertex index to a newly discovered author
        new_vertex = max(labels.values()) + 1
        labels[label] = new_vertex

    def update_matrix():
        for row in adjmatrix: