Example No. 1
    def add_links_to_frontier(self):
        print("self.links_to_crawl:", len(self.links_to_crawl))
        for link in self.links_to_crawl:

            current_link_url = link.geturl()
            current_link_domain = link.scheme + "://" + link.netloc

            # Only scrape sites in the gov.si domain
            if not self.check_if_current_domain_is_allowed(
                    current_link_domain):
                continue

            # Check whether the link's domain already has a site record in
            # the db; check_site_exists returns the site id, or False for a
            # domain we have not seen before.
            domain_id_or_false = db.check_site_exists(current_link_domain)

            if not domain_id_or_false:
                # New domain: fetch and store its robots.txt and sitemap,
                # then add the page to the frontier if robots.txt allows it.
                robotstext_content, sitemap_content = Crawler.get_robots_and_sitemap_content(
                    current_link_domain)
                new_site = db.insert_site(current_link_domain,
                                          robotstext_content, sitemap_content)

                if self.check_if_page_is_allowed_by_robots_txt(
                        new_site, current_link_url):
                    new_page = db.insert_page(new_site[0], PAGE_TYPE_CODES[2],
                                              current_link_url, "", "", "200",
                                              "040521")
                    db.insert_link(self.page_currently_crawling[0],
                                   new_page[0])
            else:
                # Existing domain: the robots check reuses the rules of the
                # site currently being crawled instead of re-reading the
                # stored site record by its id.
                if self.check_if_page_is_allowed_by_robots_txt(
                        self.site_currently_crawling, current_link_url):
                    new_page = db.insert_page(domain_id_or_false,
                                              PAGE_TYPE_CODES[2],
                                              current_link_url, "", "", "200",
                                              "040521")
                    # Record the link from the page being crawled, mirroring
                    # the new-domain branch.
                    db.insert_link(self.page_currently_crawling[0],
                                   new_page[0])
Example No. 2
import urllib.parse


def insert_seed_urls_into_db():
    # For every seed URL, store its site (with robots.txt and sitemap
    # content) and add the seed page itself to the frontier.
    for seed_url in SEED_URLS:
        page_obj = urllib.parse.urlparse(seed_url)

        current_url = page_obj.geturl()
        current_site_url = page_obj.scheme + "://" + page_obj.netloc

        robotstext_content, sitemap_content = Crawler.get_robots_and_sitemap_content(current_site_url)

        current_site = db.insert_site(current_site_url, robotstext_content, sitemap_content)
        current_page = db.insert_page(current_site[0], PAGE_TYPE_CODES[2], current_url, "", "", "200", "040521")
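
Crawler.get_robots_and_sitemap_content is not shown on this page. A minimal sketch of what it might do, assuming it downloads /robots.txt and, when that file names a Sitemap, the sitemap as well (written as a plain function here; in the crawler it is called as a static method on Crawler):

import urllib.request
import urllib.robotparser

def get_robots_and_sitemap_content(site_url):
    # Hypothetical sketch, not the author's implementation.
    robots_content, sitemap_content = "", ""
    try:
        with urllib.request.urlopen(site_url + "/robots.txt", timeout=5) as resp:
            robots_content = resp.read().decode("utf-8", errors="replace")
    except OSError:
        return robots_content, sitemap_content  # no reachable robots.txt

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(robots_content.splitlines())
    sitemaps = parser.site_maps()  # Python 3.8+; None if no Sitemap line
    if sitemaps:
        try:
            with urllib.request.urlopen(sitemaps[0], timeout=5) as resp:
                sitemap_content = resp.read().decode("utf-8", errors="replace")
        except OSError:
            pass
    return robots_content, sitemap_content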
Example No. 3
import db_methods as db
PAGE_TYPE_CODES = ["HTML", "DUPLICATE", "FRONTIER", "BINARY"]
DATA_TYPES = ["DOC", "DOCX", "PDF", "PPT", "PPTX"]

# Insert site
site = db.insert_site("test.com123", "robotstext", "sitemaptext")
print("inserted site:", site)

# Insert pages (an empty string is added as a placeholder so these calls
# match the seven-argument insert_page signature used in Examples No. 1 and 2)
page1 = db.insert_page(site[0], PAGE_TYPE_CODES[0], "test.com/index.html", "html_content", "", "300", "040521")
print("inserted page:", page1)

page2 = db.insert_page(site[0], PAGE_TYPE_CODES[0], "test1.com/index.html", "html_content2", "", "303", "040522")
print("inserted page:", page2)

# Insert image
image = db.insert_image(page1[0], "slika.jpg", "image/jpg", "asd", "040521")
print("inserted image:", image)

# Insert page_data
page_data = db.insert_page_data(page2[0], DATA_TYPES[0], "asd")
print("page_data_id:", page_data)

# Insert link
link = db.insert_link(page1[0], page2[0])
print("inserted link:", link)

print("getting all sites:", db.get_all_sites())
print("getting all pages:", db.get_all_pages())
print("getting all images:", db.get_all_images())
print("getting all page data:", db.get_all_page_data())