Example #1
# NOTE: imports assumed from the project layout shown in the later examples
# (src.db / src.db.pages); the Links import path is a guess.
import requests
from bs4 import BeautifulSoup

from src.db import DB
from src.db.pages import Pages
from src.db.links import Links


def web_scraper(page_id):
    """Accepts a page id, checks that it exists in the pages table, and
    scrapes at most 10 links from the page at that id's URL."""
    all_ids = Pages(DB.connect()).select_id()
    new_all_id = [pid[0] for pid in all_ids]

    if page_id not in new_all_id:
        raise TypeError('Id does not exist.')

    else:
        url = Pages(DB.connect()).select_url(page_id)
        DB.pages().update(True, page_id)
        value = requests.get(url)
        soup = BeautifulSoup(value.text, 'html.parser')

        list_urls = []
        for link in soup.find_all('a', href=True):
            if link['href'].startswith('https'):
                list_urls.append(link['href'])

        new_list_urls = list_urls[:10]
        DB.links().delete_by_page_id(page_id)

        for item in new_list_urls:
            Links(DB.connect()).insert(page_id, item)

        DB.pages().update(False, page_id)
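
A minimal usage sketch for this version of the scraper, assuming the database has already been set up and seeded; the page id and the setup()/seed()/links().select() calls mirror the DB usage examples further down.

# hypothetical driver; run once the pages table is seeded
if __name__ == '__main__':
    db = DB()
    db.connect()
    db.setup()
    db.seed()
    web_scraper(1)              # scrape at most 10 links for the page with id 1
    print(DB.links().select())  # inspect the stored links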
Example #2
def scraping_function(id):
    """
  This function implements the web scraper that inserts into the liks table.
  :param
  id(int): The id at which the url to be scraped is retrieved.
  :return:
  None: Returns None
  :raises:
  TypeError: Raises a TypeError
  """
    try:
        # retrieves the url from the pages table
        url = DB.pages().fetch(id)
        DB.pages().update(True, id)

        link_list = []
        r = requests.get(url[0])
        # scrapes the url for hyperlinks
        soup = BeautifulSoup(r.text, features='html.parser')
        for link in soup.find_all('a', href=True):
            if 'https' in link['href']:
                link_list.append(link['href'])
        links = link_list[:10]
        DB.links().delete(id)
        for i in links:
            DB.links().insert(id, i)
        DB.pages().update(False, id)
        return None
    except TypeError:
        raise TypeError('Id not found in Pages Table')
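
A short call sketch for this variant, assuming the pages table may not contain the given id; the id value is illustrative.

# hypothetical call; an unknown id surfaces as the TypeError raised above
try:
    scraping_function(5)
except TypeError as err:
    print(err)  # 'Id not found in Pages Table'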
Example #3
def spider_scrap(page_id):
    '''Function that receives a page_id and inserts links into the links table.'''

    page_ids = [i[0] for i in DB().pages().select()]
    if page_id in page_ids:
        url = DB().pages().fetch_url(page_id)
    else:
        raise ValueError('page_id not valid')

    # update is_scraping to True
    DB().pages().update_id_true(page_id)

    #fetch the html content at the page url
    page = requests.get(url[0])

    # parse the html content and extract at most 10 hyperlinks
    soup = BeautifulSoup(page.text, features='html.parser')
    links_list = []
    for link in soup.find_all('a', href=True):
        links = link['href']
        if re.search("^https", links):
            links_list.append(links)
    link_url = links_list[:10]

    DB.links().delete(page_id)

    # saves the newly extracted links to the database for the page
    for url in link_url:
        DB.links().insert(page_id, url)

    DB().pages().update_id_false(page_id)


# print(spider_scrap(1))
Example #4
def scrape(id):
    '''Scrapes at most 10 links from the page at the given id and stores them in the links table.'''
    DB.pages().update(True, id)
    url = DB().pages().fetch(id)
    page = requests.get(url[0])
    soup = BeautifulSoup(page.text, features='html.parser')
    a_soup = soup.find_all('a', href=True)
    ext_links = [
        link.get("href") for link in a_soup if "http" in link.get("href")
    ]
    new_links = ext_links[:10]
    DB.links().delete(id)
    for i in new_links:
        DB.links().insert(id, i)
    # reset the is_scraping flag, as the other examples do
    DB.pages().update(False, id)
Example #5
def spider(page_id):
    '''Takes a page id, selects the url linked to that page id and runs the scraper.
    The scraper fetches the url and collects the scraped links;
    a maximum of 10 links are inserted into the database.'''

    if type(page_id) != int or page_id == 0:
        raise ValueError('Page Id is not valid')

    get_url = DB.pages().get_url(page_id)

    if get_url is None:
        raise ValueError('Page Id not found')

    else:
        url = get_url[0]
        all_links = []

        # set is_scraping to True where id == page_id
        DB.pages().update_by_id(True, page_id)

        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')

        for link in soup.find_all('a', href=True):

            if link['href'].startswith('http'):
                all_links.append(link['href'])

        # if the page id is already in the links table, delete its existing rows
        DB.links().delete_by_page_id(page_id)

        for link in all_links[:10]:
            # Insert each link into the links table
            Links(DB().connect()).insert(page_id, link)

        # set is_scraping to False where id == page_id
        DB.pages().update_by_id(False, page_id)
Example #6
from unittest import TestCase

from src.db import DB  # import path assumed from the other examples in this collection


class TestDatabase(TestCase):
    '''Class to test the database (db) functions.'''
    def setUp(self):
        self.db = DB()

    def test_connection(self):
        '''tests that the connection function does its work.'''
        connection = self.db.connect()
        self.assertIsNotNone(connection)

    def test_setup(self):
        '''tests that the setup function does what it was designed to do.'''
        self.db.setup()
        self.assertIsNone(self.db.setup())

    def test_seed(self):
        '''tests that the seed function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        self.assertIsNone(self.db.seed())

    def test_pages(self):
        '''tests that the pages function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        selecter = self.db.pages().select()
        self.assertIsNotNone(selecter)

    def test_links(self):
        '''tests that the links function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        select_link = self.db.links().select()
        self.assertIsNotNone(select_link)

    def tearDown(self):
        '''the teardown function for all the tests.'''
        self.db.connect().close()
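
These tests can be run with the standard unittest runner; a minimal sketch, assuming the class lives in a module such as test_db.py (the file name is illustrative).

# run directly, or with: python -m unittest test_db
if __name__ == '__main__':
    import unittest
    unittest.main()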
Example #7
    def test_links(self):
        self.assertIsNotNone(DB.links())
Example #8
from src.db import DB
from src.spider import spider_scrap
from celery import Celery
from decouple import config
#
db = DB()
db.connect()
db.new_connect()
db.setup()
db.seed()
dd = DB.new_connect()
pages = DB.pages()
# pages.fetch_url(2)
print(pages.fetch_url(2))
print(pages.select())
print(pages.find(2))
# print(pages.update_id(1))
links = DB.links()
print(links.insert(1, 'www.goggle.com'))
print(links.delete(1))
print(links.select(1))
# #
# app = Celery('main', broker=config('CELERY_BROKER'), backend=config('CELERY_BACKEND'))
#
#
# @app.task
# def scrap_url():
#   return spider_scrap(1)

# spider_scrap(1)
Example #9
from celery import Celery
from decouple import config
from src.spider import spider
from src.db.pages import Pages
from src.db import DB

# Celery Task
app = Celery('main',
             broker=config('CELERY_BROKER'),
             backend=config('CELERY_BACKEND'))


@app.task
def test():
    return spider(1)
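
A sketch of how the task above could be queued asynchronously, assuming a Celery worker has been started for this module (the module name 'main' is assumed to match the app name above); delay() and get() are standard Celery calls.

# start a worker in another shell:  celery -A main worker --loglevel=info
# then queue the task and wait for its result
result = test.delay()          # returns an AsyncResult
print(result.get(timeout=60))  # the spider returns None on success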


# some tests with pages()
DB.pages().get_url(2)
DB.pages().find_by_id(1)
DB.pages().update_by_id(True, 1)  # update_by_id takes (is_scraping, id) in the spider example above

# some tests with links
DB.links().select()
DB.links().insert(3, 'https://google.com')
DB.links().delete_by_page_id(2)
Example #10
app = Celery('spider', broker=config('CELERY_BROKER'), backend=config('CELERY_BACKEND'))


@app.task()
def cel_spider():
    """
    Implements the celery task for the spider.
    :return: None, the value returned by the scraping function.
    """
    return scraping_function(2)
#
#  ---- Usage of DB class ----
#
# DB.serv_conn()
# # # DB.connect()
# DB.setup()
# DB.seed()
#
# # ----Usage of pages.py----
# DB.pages().select()
# DB.pages().fetch(2)
# DB.pages().update(True, 2)
#
# # ----Usage of links.py----
# DB.links().select()
# DB.links().fetch()
# DB.links().insert(1, 'https://www.facebook.com')
DB.links().delete(1)

# scraping_function(3)
Example #11
    def test_links(self):
        ''' Test links interface '''
        self.assertIsNotNone(DB.links())