Example #1
# NOTE: imports assumed from the project layout shown in the later examples
# (src.db / src.db.pages); the Links import path is a guess.
import requests
from bs4 import BeautifulSoup

from src.db import DB
from src.db.pages import Pages
from src.db.links import Links


def web_scraper(page_id):
    """Accepts a page id, checks that it exists in the pages table, and
    scrapes at most 10 links from the page at that id's URL."""
    all_ids = Pages(DB.connect()).select_id()
    new_all_id = [pid[0] for pid in all_ids]

    if page_id not in new_all_id:
        raise TypeError('Id does not exist.')

    else:
        url = Pages(DB.connect()).select_url(page_id)
        DB.pages().update(True, page_id)
        value = requests.get(url)
        soup = BeautifulSoup(value.text, 'html.parser')

        list_urls = []
        for link in soup.find_all('a', href=True):
            if link['href'].startswith('https'):
                list_urls.append(link['href'])

        new_list_urls = list_urls[:10]
        DB.links().delete_by_page_id(page_id)

        for item in new_list_urls:
            Links(DB.connect()).insert(page_id, item)

        DB.pages().update(False, page_id)
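
A minimal usage sketch for this version of the scraper, assuming the database has already been set up and seeded; the page id and the setup()/seed()/links().select() calls mirror the DB usage examples further down.

# hypothetical driver; run once the pages table is seeded
if __name__ == '__main__':
    db = DB()
    db.connect()
    db.setup()
    db.seed()
    web_scraper(1)              # scrape at most 10 links for the page with id 1
    print(DB.links().select())  # inspect the stored links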
Example #2
def scraping_function(id):
    """
  This function implements the web scraper that inserts into the liks table.
  :param
  id(int): The id at which the url to be scraped is retrieved.
  :return:
  None: Returns None
  :raises:
  TypeError: Raises a TypeError
  """
    try:
        # retrieves the url from the pages table
        url = DB.pages().fetch(id)
        DB.pages().update(True, id)

        link_list = []
        r = requests.get(url[0])
        # scrapes the url for hyperlinks
        soup = BeautifulSoup(r.text, features='html.parser')
        for link in soup.find_all('a', href=True):
            if 'https' in link['href']:
                link_list.append(link['href'])
        links = link_list[:10]
        DB.links().delete(id)
        for i in links:
            DB.links().insert(id, i)
        DB.pages().update(False, id)
        return None
    except TypeError:
        raise TypeError('Id not found in Pages Table')
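
A short call sketch for this variant, assuming the pages table may not contain the given id; the id value is illustrative.

# hypothetical call; an unknown id surfaces as the TypeError raised above
try:
    scraping_function(5)
except TypeError as err:
    print(err)  # 'Id not found in Pages Table'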
Example #3
def spider_scrap(page_id):
    '''Function that receives a page_id and inserts links into the links table.'''

    page_ids = [i[0] for i in DB().pages().select()]
    if page_id in page_ids:
        url = DB().pages().fetch_url(page_id)
    else:
        raise ValueError('page_id not valid')

    # update is_scraping to True
    DB().pages().update_id_true(page_id)

    #fetch the html content at the page url
    page = requests.get(url[0])

    # parse the html content and extract at most 10 hyperlinks
    soup = BeautifulSoup(page.text, features='html.parser')
    links_list = []
    for link in soup.find_all('a', href=True):
        links = link['href']
        if re.search("^https", links):
            links_list.append(links)
    link_url = links_list[:10]

    DB.links().delete(page_id)

    # saves the newly extracted links to the database for the page
    for url in link_url:
        DB.links().insert(page_id, url)

    DB().pages().update_id_false(page_id)


# print(spider_scrap(1))
Example #4
def scrape(id):
    '''Scrapes at most 10 links from the page at the given id and stores them in the links table.'''
    DB.pages().update(True, id)
    url = DB().pages().fetch(id)
    page = requests.get(url[0])
    soup = BeautifulSoup(page.text, features='html.parser')
    a_soup = soup.find_all('a', href=True)
    ext_links = [
        link.get("href") for link in a_soup if "http" in link.get("href")
    ]
    new_links = ext_links[:10]
    DB.links().delete(id)
    for i in new_links:
        DB.links().insert(id, i)
    # reset the is_scraping flag, as the other examples do
    DB.pages().update(False, id)
Example #5
def spider(page_id):
    '''Takes a page id, selects the url linked to that page id and runs the scraper.
    The scraper fetches the url and collects the scraped links;
    a maximum of 10 links are inserted into the database.'''

    if type(page_id) != int or page_id == 0:
        raise ValueError('Page Id is not valid')

    get_url = DB.pages().get_url(page_id)

    if get_url is None:
        raise ValueError('Page Id not found')

    else:
        url = get_url[0]
        all_links = []

        # set is_scraping to True where id == page_id
        DB.pages().update_by_id(True, page_id)

        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')

        for link in soup.find_all('a', href=True):

            if link['href'].startswith('http'):
                all_links.append(link['href'])

        # if the page id is already in the links table, delete its existing rows
        DB.links().delete_by_page_id(page_id)

        for link in all_links[:10]:
            # Insert each link into the links table
            Links(DB().connect()).insert(page_id, link)

        # set is_scraping to False where id == page_id
        DB.pages().update_by_id(False, page_id)
Example #6
from unittest import TestCase

from src.db import DB  # import path assumed from the other examples in this collection


class TestDatabase(TestCase):
    '''Class to test the database (db) functions.'''
    def setUp(self):
        self.db = DB()

    def test_connection(self):
        '''tests that the connection function does its work.'''
        connection = self.db.connect()
        self.assertIsNotNone(connection)

    def test_setup(self):
        '''tests that the setup function does what it was designed to do.'''
        self.db.setup()
        self.assertIsNone(self.db.setup())

    def test_seed(self):
        '''tests that the seed function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        self.assertIsNone(self.db.seed())

    def test_pages(self):
        '''tests that the pages function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        self.db.seed()
        selecter = self.db.pages().select()
        self.assertIsNotNone(selecter)

    def test_links(self):
        '''tests that the links function does what it was designed to do.'''
        self.db.connect()
        self.db.setup()
        select_link = self.db.links().select()
        self.assertIsNotNone(select_link)

    def tearDown(self):
        '''the teardown function for all the tests.'''
        self.db.connect().close()
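
These tests can be run with the standard unittest runner; a minimal sketch, assuming the class lives in a module such as test_db.py (the file name is illustrative).

# run directly, or with: python -m unittest test_db
if __name__ == '__main__':
    import unittest
    unittest.main()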
Example #7
    def test_links(self):
        self.assertIsNotNone(DB.links())
Example #8
from src.db import DB
from src.spider import spider_scrap
from celery import Celery
from decouple import config
#
db = DB()
db.connect()
db.new_connect()
db.setup()
db.seed()
dd = DB.new_connect()
pages = DB.pages()
# pages.fetch_url(2)
print(pages.fetch_url(2))
print(pages.select())
print(pages.find(2))
# print(pages.update_id(1))
links = DB.links()
print(links.insert(1, 'www.goggle.com'))
print(links.delete(1))
print(links.select(1))
# #
# app = Celery('main', broker=config('CELERY_BROKER'), backend=config('CELERY_BACKEND'))
#
#
# @app.task
# def scrap_url():
#   return spider_scrap(1)

# spider_scrap(1)
Example #9
from celery import Celery
from decouple import config
from src.spider import spider
from src.db.pages import Pages
from src.db import DB

# Celery Task
app = Celery('main',
             broker=config('CELERY_BROKER'),
             backend=config('CELERY_BACKEND'))


@app.task
def test():
    return spider(1)
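
A sketch of how the task above could be queued asynchronously, assuming a Celery worker has been started for this module (the module name 'main' is assumed to match the app name above); delay() and get() are standard Celery calls.

# start a worker in another shell:  celery -A main worker --loglevel=info
# then queue the task and wait for its result
result = test.delay()          # returns an AsyncResult
print(result.get(timeout=60))  # the spider returns None on success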


# some tests with pages()
DB.pages().get_url(2)
DB.pages().find_by_id(1)
DB.pages().update_by_id(True, 1)  # update_by_id takes (is_scraping, id) in the spider example above

# some tests with links
DB.links().select()
DB.links().insert(3, 'https://google.com')
DB.links().delete_by_page_id(2)
Example #10
app = Celery('spider', broker=config('CELERY_BROKER'), backend=config('CELERY_BACKEND'))


@app.task()
def cel_spider():
    """
    Implements the celery task for the spider.
    :return: None, the value returned by the scraping function.
    """
    return scraping_function(2)
#
#  ---- Usage of DB class ----
#
# DB.serv_conn()
# # # DB.connect()
# DB.setup()
# DB.seed()
#
# # ----Usage of pages.py----
# DB.pages().select()
# DB.pages().fetch(2)
# DB.pages().update(True, 2)
#
# # ----Usage of links.py----
# DB.links().select()
# DB.links().fetch()
# DB.links().insert(1, 'https://www.facebook.com')
DB.links().delete(1)

# scraping_function(3)
Example #11
    def test_links(self):
        ''' Test links interface '''
        self.assertIsNotNone(DB.links())