Example #1
def bar_data(crawl_id, bar_no):
    """Returns the pages for a bar (interval) inside a crawl"""
    delegate = Delegate()

    # Find the total number of internal full links = T
    no_total = delegate.url_count_internal_full(crawl_id)
    if no_total == 0:
        # Guard against division by zero when the crawl has no internal links.
        return []

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    (lower, upper) = no_2_interval(bar_no)
    selected_pages = []
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        percent = no * 100 / no_total
        # The last bar includes its upper bound (100%); all other bars are half-open.
        in_bar = (lower <= percent <= upper) if upper == 100 else (lower <= percent < upper)
        if in_bar:
            selected_pages.append({'page': page, 'no': no})

    return selected_pages
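
The helper no_2_interval used above is not shown in this example. A minimal sketch of what it presumably does, assuming bars are numbered from 0 and each bar spans STEP percentage points (STEP = 10 matches the intervals built in Example #2):

# Hypothetical sketch; the real no_2_interval is not part of this example.
STEP = 10

def no_2_interval(bar_no):
    """Map a bar number to its (lower, upper) percentage bounds."""
    lower = bar_no * STEP
    return (lower, lower + STEP)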
Example #2
def inner_links_data(crawl_id):
    """
    crawl_id - the id of the crawl to build the chart data for
    """
    delegate = Delegate()

    # Build the percentage intervals: [0-10], [10-20], ..., [90-100]
    intervals = []
    for i in range(0, 100, STEP):
        intervals.append([i, i + STEP])
    print("Intervals %r" % intervals)

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    # For every page, count the number of internal full urls pointing to it = Li
    d = dict()
    check = 0
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        d[page.id] = no
        check = check + no

    for k, v in d.items():
        print("\n%d -> %d" % (k, v))

    # Find the total number of internal full links = T
    no_total = delegate.url_count_internal_full(crawl_id)
    print("Total full internal links: %d " % no_total)

    assert check == no_total, "The total number of internal links does not match"

    # For every page, compute the percentage of internal full urls pointing to it: Pi = Li * 100 / T
    percents = dict()
    for page in pages:
        percents[page.id] = d[page.id] * 100 / no_total

    print("\nPercentages")
    for k, v in percents.items():
        print("\n%d -> %.2f%%" % (k, v))

    # For every interval I1[0-10], I2[10-20], ..., I10[90-100], sum the
    # percentages of the pages that fall into that interval:
    #    I1 ... Ti1 ... Pi1 = Ti1 * 100 / T
    #    I2 ... Ti2 ... Pi2 = Ti2 * 100 / T

    # Compute percentage of every interval

    partials = dict()
    labels = []
    for interval in intervals:
        key = "{}-{}%".format(interval[0], interval[1])
        labels.append(key)
        partials[key] = 0
        for page in pages:
            # The last interval includes its upper bound (100%); all others are half-open.
            if interval[1] == 100:
                in_interval = interval[0] <= percents[page.id] <= interval[1]
            else:
                in_interval = interval[0] <= percents[page.id] < interval[1]
            if in_interval:
                partials[key] += percents[page.id]

    print("\nPartials")
    for k, v in partials.items():
        print("\n{} {} ".format(k, v))

    # Prepare the chart data; a Chart.js sample is shown below
    '''
    {
                labels: ['Red', 'Blue', 'Yellow', 'Green', 'Purple', 'Orange'],
                datasets: [{
                    label: '# of Votes',
                    data: [12, 19, 3, 5, 2, 3],
                    backgroundColor: [
                        'rgba(255, 99, 132, 0.2)',
                        'rgba(54, 162, 235, 0.2)',
                        'rgba(255, 206, 86, 0.2)',
                        'rgba(75, 192, 192, 0.2)',
                        'rgba(153, 102, 255, 0.2)',
                        'rgba(255, 159, 64, 0.2)'
                    ],
                    borderColor: [
                        'rgba(255, 99, 132, 1)',
                        'rgba(54, 162, 235, 1)',
                        'rgba(255, 206, 86, 1)',
                        'rgba(75, 192, 192, 1)',
                        'rgba(153, 102, 255, 1)',
                        'rgba(255, 159, 64, 1)'
                    ],
                    borderWidth: 1
                }]
            }
    '''

    new_data = {
        'labels': list(partials.keys()),
        'datasets': [{
            'label': 'Inner links',
            'data': list(partials.values())
        }]
    }

    return new_data
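
The returned dict mirrors the labels/datasets shape of the Chart.js sample embedded above. A minimal usage sketch, assuming a crawl with id 1 exists and the payload is serialized with the standard json module for the front end:

import json

# Hypothetical usage; the crawl id 1 is an assumption.
chart_data = inner_links_data(1)
print(json.dumps(chart_data, indent=2))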
Example #3
    def test_link(self):
        delegate = XDelegate()

        print("test_page started")
        # Site 1
        site1 = Site()
        site1.name = "Site1"
        site1.url = 'http://foo.com'
        delegate.site_create(site1)

        # Crawl
        crawl = Crawl(site_id=site1.id)
        delegate.crawl_create(crawl)
        assert crawl.id > 0

        # Page
        page = Resource()
        page.crawl_id = crawl.id
        page.content = "Ala bala portocala"
        page.absolute_url = "https://scriptoid.com/index.php"
        delegate.resource_create(page)

        # Link

        # Test url_is_present()
        p1 = delegate.url_is_present('https://scriptoid.com/index.php',
                                     crawl.id)
        assert not p1

        # Test url_count_unvisited()
        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 0, 'n1 is {}'.format(n1)

        # Test url_get_all_by_crawl_id()
        crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
        assert len(crawl_urls) == 0

        # Test url_count_incoming_for_resource()
        uc1 = delegate.url_count_incoming_for_resource(page.id)
        assert uc1 == 0

        # Test url_count_internal_full()
        cif = delegate.url_count_internal_full(crawl.id)
        assert cif == 0

        url1 = Url()
        url1.src_resource_id = page.id
        url1.url = '/team'
        url1.absolute_url = 'https://scriptoid.com/team'
        url1.type = Url.TYPE_INTERNAL
        url1.crawl_id = crawl.id
        url1.job_status = Url.JOB_STATUS_IN_PROGRESS
        lid1 = delegate.url_create(url1)
        assert url1.id > 0
        assert lid1 == url1.id

        url2 = Url()
        url2.src_resource_id = page.id
        url2.dst_resource_id = page.id
        url2.url = '/contact'
        url2.absolute_url = 'https://scriptoid.com/index.php'
        url2.type = Url.TYPE_INTERNAL
        url2.crawl_id = crawl.id
        delegate.url_create(url2)
        assert url2.id > 0

        url3 = Url()
        url3.dst_resource_id = page.id
        url3.url = '/jobs'
        url3.absolute_url = 'https://scriptoid.com/jobs.php'
        url3.type = Url.TYPE_INTERNAL
        url3.crawl_id = crawl.id
        delegate.url_create(url3)
        assert url3.id > 0

        # Test url_count_incoming_for_resource()
        uc1 = delegate.url_count_incoming_for_resource(page.id)
        assert uc1 == 1

        # Test url_get_by_id()
        u1 = delegate.url_get_by_id(url1.id)
        assert u1.id == url1.id

        # Test url_is_present()
        p1 = delegate.url_is_present('https://scriptoid.com/index.php',
                                     crawl.id)
        assert p1

        # Test url_get_all_by_crawl_id()
        crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
        assert len(crawl_urls) == 3

        # Test first unvisited link
        l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
        assert l1.id == url2.id, 'l1.id = {} and url2.id = {}'.format(
            l1.id, url2.id)

        # Test url_get_all_unvisited()
        unvisited1 = delegate.url_get_all_unvisited(crawl.id)
        assert len(unvisited1) == 2

        # Test url_count_unvisited()
        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 2, 'n1 is {}'.format(n1)

        n2 = delegate.url_count_visited(crawl_id=crawl.id)
        assert n2 == 0, 'Actually n2 is {}'.format(n2)

        url1.job_status = Url.JOB_STATUS_VISITED
        delegate.url_update(url1)
        l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
        assert l1.id == url2.id

        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 2, 'n1 is {}'.format(n1)

        n2 = delegate.url_count_visited(crawl_id=crawl.id)
        assert n2 == 1, 'n2 is {}'.format(n2)

        # Test url_count_internal_full()
        cif = delegate.url_count_internal_full(crawl.id)
        assert cif == 1

        # Test url_count_pending()
        ucp = delegate.url_count_pending(crawl.id)
        assert ucp == 2

        # Test url_delete_all()
        delegate.url_delete_all()
        links = delegate.url_get_all()
        assert len(links) == 0, "When actually there are {}".format(len(links))

        # Test url_count_external()
        uce = delegate.url_count_external(crawl.id)
        assert uce == 0

        url4 = Url()
        url4.dst_resource_id = page.id
        url4.url = '/jobs'
        url4.absolute_url = 'https://scriptoid.com/jobs.php'
        url4.type = Url.TYPE_EXTERNAL
        url4.crawl_id = crawl.id
        delegate.url_create(url4)
        assert url4.id > 0

        uce = delegate.url_count_external(crawl.id)
        assert uce == 1

        assert delegate.url_delete_by_id(url4.id)

        # Test that resource_delete_all() cascades from the parent Page to its Links
        url = Url()
        url.src_resource_id = page.id
        url.url = '/contact'
        url.absolute_url = 'https://scriptoid.com/index.php'
        url.type = Url.TYPE_INTERNAL
        url.crawl_id = crawl.id
        delegate.url_create(url)
        assert url.id > 0

        delegate.resource_delete_all()
        links = delegate.url_get_all()
        assert len(links) == 0, "When actually there are {}".format(len(links))

        # Clean up
        # delegate.link_delete_all()
        delegate.resource_delete_all()
        delegate.crawl_delete_all()
        delegate.site_delete_all()

        print("test_page done")