Example #1
def handle(self, website_url, **kwargs):
    # Django management command handler: for each Wikipedia language
    # edition listed below, print a tab-separated row of
    # (language, source Wikipedia page, target URL) for every page
    # that links to website_url.
    langs = ["ar", "bg", "ca", "cs", "da", "de", "el", "en", "eo", "es",
             "eu", "fa", "fi", "fr", "he", "hu", "id", "it", "ja", "ko",
             "lt", "ms", "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl",
             "sv", "tr", "uk", "vi", "vo", "zh"]
    for lang in langs:
        for src, target in links(website_url, lang=lang):
            print(lang, src, target, sep="\t")
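
The links() helper this handler calls is not shown on this page. As a point of reference, here is a minimal sketch of how such a generator could be written against MediaWiki's list=exturlusage API; the function body, the requests dependency, and the pagination handling are assumptions for illustration, not this project's actual implementation.

import requests

def links(website_url, lang="en"):
    # Hypothetical sketch: yield (wikipedia_page_url, target_url) pairs
    # for pages in the given language edition that link out to
    # website_url, using MediaWiki's list=exturlusage query.
    api = "https://%s.wikipedia.org/w/api.php" % lang
    params = {
        "action": "query",
        "list": "exturlusage",
        "euquery": website_url,
        "eulimit": 500,
        "format": "json",
    }
    while True:
        data = requests.get(api, params=params, timeout=30).json()
        for hit in data.get("query", {}).get("exturlusage", []):
            page = "https://%s.wikipedia.org/wiki/%s" % (
                lang, hit["title"].replace(" ", "_"))
            yield page, hit["url"]
        # follow the API's continuation token until the results run out
        if "continue" not in data:
            break
        params.update(data["continue"])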
Example #2
def crawl(website):
    """
    Execute a crawl, but only if it hasn't been started already.
    """
    logging.info("starting crawl for %s" % website.url)

    crawl = m.Crawl(website=website)
    crawl.started = datetime.datetime.now()
    crawl.save()

    # look at all wikipedia pages that reference a particular website
    count = 0
    for source, target in wikipedia.links(website.url):

        # get the wikipedia page
        page, created = m.WikipediaPage.new_from_wikipedia(url=source)
        if created: 
            logging.info("created wikipedia page for %s" % source)

        # create the link
        link, created = m.Link.objects.get_or_create(
            website=website,
            wikipedia_page=page,
            target=target,
        )

        if created:
            logging.info("created link: %s -> %s" % (source, target))

        link.last_checked = datetime.datetime.now()
        link.save()
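        # django.db.reset_queries() clears the connection's query log,
        # which otherwise grows without bound in a long-running loop
        # when settings.DEBUG is True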
        reset_queries()
        count += 1

        if CRAWL_CUTOFF and count > CRAWL_CUTOFF:
            logging.info("stopping crawl at crawl cutoff: %s" % CRAWL_CUTOFF)
            break

    crawl.finished = datetime.datetime.now()
    crawl.save()

    logging.info("finished crawl for %s" % crawl.website.url)
    return crawl
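
The m.Crawl, m.WikipediaPage and m.Link names above refer to this project's Django models, which are not shown here. A minimal sketch of model definitions consistent with the fields the crawl touches follows; the field types and options, and the Website model, are assumptions, and the custom WikipediaPage.new_from_wikipedia classmethod is left out.

from django.db import models

class Website(models.Model):
    url = models.URLField(unique=True)

class Crawl(models.Model):
    website = models.ForeignKey(Website, on_delete=models.CASCADE)
    started = models.DateTimeField(null=True, blank=True)
    finished = models.DateTimeField(null=True, blank=True)

class WikipediaPage(models.Model):
    url = models.URLField(unique=True)
    # new_from_wikipedia(url=...) would be a custom classmethod returning
    # a (page, created) pair; hypothetical, not sketched here

class Link(models.Model):
    website = models.ForeignKey(Website, on_delete=models.CASCADE)
    wikipedia_page = models.ForeignKey(WikipediaPage, on_delete=models.CASCADE)
    target = models.URLField()
    last_checked = models.DateTimeField(null=True, blank=True)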