Example #1
def main():  # pragma: no cover
    parser = argparse.ArgumentParser()
    parser.add_argument("--platform",
                        "-p",
                        help="scrape platform. (eg: doctolib,keldoc or all)")
    parser.add_argument("--url",
                        "-u",
                        action="append",
                        help="scrape one url, can be repeated")
    parser.add_argument("--merge",
                        "-m",
                        help="merge platform results",
                        action="store_true")
    args = parser.parse_args()

    if args.merge:
        merge_platforms()
        return
    if args.url:
        scrape_debug(args.url)
        return
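    # Otherwise scrape by platform; the list stays empty when --platform is unset or set to 'all'.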
    platforms = []
    if args.platform and args.platform != 'all':
        platforms = args.platform.split(',')
    scrape(platforms=platforms)
Example #2
def main():  # pragma: no cover
    parser = argparse.ArgumentParser()
    parser.add_argument("--platform",
                        "-p",
                        help="scrape platform. (eg: doctolib,keldoc or all)")
    parser.add_argument("--url",
                        "-u",
                        action="append",
                        help="scrape one url, can be repeated")
    parser.add_argument("--url-file",
                        type=argparse.FileType("r"),
                        help="scrape urls listed in file (one per line)")
    parser.add_argument("--merge",
                        "-m",
                        help="merge platform results",
                        action="store_true")
    args = parser.parse_args()

    if args.merge:
        merge_platforms()
        return
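    # Lines read from --url-file are fed to scrape_debug just like URLs passed with --url.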
    if args.url_file:
        args.url = [line.rstrip() for line in args.url_file]
    if args.url:
        scrape_debug(args.url)
        return
    platforms = []
    if args.platform and args.platform != "all":
        platforms = args.platform.split(",")
    scrape(platforms=platforms)
Example #3
def scrape(url):
    """
    Scrape the given URL and return the scraped content converted to a
    JSON-compatible structure.
    """
    return make_json_compat(scraper.scrape(url))
Example #4
def run():
    sources = scraper.scrape()
    for source in sources:
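        # Publish any locations with new availability for this source and record them.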
        locations_to_publish = get_locations_to_publish(source)
        if locations_to_publish:
            publisher.publish_locations(source, locations_to_publish)
            record_availability(locations_to_publish, source)
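        # Refresh aggregate availability counts when the source is flagged for it.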
        if source.should_update_availability:
            update_availability_counts(source)
        print("Found new availability at: {} / {} {} sites...".format(
            len(locations_to_publish), len(source.get_locations()),
            source.get_name()))
Example #5
    def test_bad_status_code(self):
        url = "http://www.bad_url.com"
        responses.add(
            responses.GET,
            url,
            status=404,
        )

        articles = scrape([url])
        expected_articles = []

        self.assertEqual(articles, expected_articles)
Example #6
    def test_bad_url(self):
        url = "http://www.bad_url.com"
        responses.add(
            responses.GET,
            url,
            body=Exception(),
        )

        articles = scrape([url])
        expected_articles = []

        self.assertEqual(articles, expected_articles)
Example #7
    def test_unsupported_content_type(self):
        url = "https://www.url.com"
        responses.add(
            responses.GET,
            url,
            headers={'Content-Type': 'application/json'},
        )

        articles = scrape([url])
        expected_articles = []

        self.assertEqual(articles, expected_articles)
Example #8
def main():
    """Main entrypoint for scraper. Creates needed tables and extracts
    data from website and stores in DB.
    """
    try:
        db_user = os.environ["DB_USER"]
        db_pw = os.environ["DB_PW"]
        db_host = os.environ["DB_HOST"]
        db_port = os.environ["DB_PORT"]
        db_name = os.environ["DB_NAME"]
    except KeyError:
        print(
            "connection env vars not all set - all of DB_USER, DB_PW, DB_HOST, DB_PORT and DB_NAME are needed."
        )
        return

    conn_str = f"postgres://{db_user}:{db_pw}@{db_host}:{db_port}/{db_name}"
    engine = create_model(Advert, conn_str)
    session = get_session(engine)

    try:
        url = os.environ["URL_TO_SCRAPE"]
    except KeyError:
        print("Need to set URL_TO_SCRAPE env var")
        return

    # We try to scrape the data here - if it fails because the structure of
    # the page has changed or the page is not reachable we will get an exception
    # and exit.
    try:
        html = get_html(url)
    except ConnectionError as e:
        raise e

    parsed_html = parse_html(html)

    try:
        adverts = scrape(parsed_html, CONTAINER_SELECTOR, TITLE_SELECTOR)
    except HTMLStructureChanged as e:
        raise e

    for advert in adverts:
        # For each advert we have scraped, try to add it to the DB. We will
        # get an IntegrityError if we try to add a duplicate, and then move
        # on to the next advert. This allows running this same script
        # multiple times while only adding new data to the DB.
        try:
            Advert.commit_new(**advert, session=session)
        except IntegrityError:
            # need a new session
            session = get_session(engine)
            print("duplicate")
Example #9
    def click_insert_form_result(self):
        self.url = self.url_line_edit.text()

        from scraper.scraper import scrape
        self.result = scrape(self.url)

        self.forms = find_all_form(self.result)

        self.tblForm = PyQt5.QtWidgets.QTableWidget()
        self.tblForm.setRowCount(len(self.forms))

        self.tblForm.setColumnCount(3)

        header = ("Method", "Action", "Event")
        self.tblForm.setHorizontalHeaderLabels(header)
        self.tblForm.horizontalHeader().setSectionResizeMode(3)

        self.rowcount = 0

        self.main_scroll_widged = QWidget()
        self.main_scroll_vbox = QVBoxLayout()

        header = ("Method", "Action", "Event")
        self.tblForm.setHorizontalHeaderLabels(header)

        rowcount = 0

        for f in self.forms:
            header = getheader(f)
            methoditem = PyQt5.QtWidgets.QTableWidgetItem(header["method"])

            self.tblForm.setItem(self.rowcount, 0, methoditem)
            self.tblForm.setItem(
                self.rowcount, 1,
                PyQt5.QtWidgets.QTableWidgetItem(header["action"]))
            button = PyQt5.QtWidgets.QPushButton("Input", self)

            curr = self.rowcount
            button.clicked.connect(
                partial(self.click_insert_input_result, curr))
            self.tblForm.setCellWidget(self.rowcount, 2, button)
            self.rowcount += 1

        self.main_scroll_vbox.addWidget(self.tblForm)
        self.main_scroll_widged.setLayout(self.main_scroll_vbox)

        self.right_v_layout.addWidget(self.main_scroll_widged)
Example #10
    def test_html_scrape(self):
        url = "http://www.crainscleveland.com/node/688181"
        with open(os.path.join(TEST_DATA_PATH, 'article.html')) as f:
            body = f.read()
        responses.add(
            responses.GET,
            url,
            body=body,
            status=200,
            content_type='text/html; charset=utf-8',
        )

        expected_title = "Cleveland Clinic sets opening for new Lakewood Family Health Center"
        with open(os.path.join(TEST_DATA_PATH, 'html_article_body.txt')) as f:
            expected_body = f.read()

        articles = scrape([url])

        self.assertEqual(articles[0].title, expected_title)
        self.assertEqual(articles[0].body, expected_body)
Example #11
def scrape_data():
    search_json = validator.clean_response(request.get_json())

    if isinstance(search_json, ErrorResponse):
        return app.response_class(response=json.dumps(search_json.serialize()),
                                  status=search_json.code,
                                  mimetype='application/json')
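    # Store the raw query document, run the scrape job, then stamp the document with a timestamp.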
    did = db.create_new_document("query", search_json)
    search_query = Query.SearchQuery(search_json['query'],
                                     search_json['startDate'],
                                     search_json['endDate'],
                                     search_json['stepCount'],
                                     search_json['tweetFrequency'])
    job_list = scraper.scrape(search_query, did, MONGO_URI)

    # processed_tweets = preprocessor.analyze(tweets)
    # response = preprocessor.compile_result(processed_tweets)
    # return render_template('index.html', response='')

    db.update(did, "timestamp", str(time.time()))
    return app.response_class(response=json.dumps({"id": did}),
                              status=200,
                              mimetype='application/json')
Example #12
def instagram_scrape():
    instagram_data = scrape(days=0.5)
Example #13
async def on_message(message):
    if message.content.startswith('!shitpost'):
        # pdb.set_trace()
        channel = client.get_channel(int(ch_id))
        img = scraper.scrape(boards)
        await channel.send("Ecco la tua daily dose di shitpost: \n" + img)
Example #14
#!/usr/bin/env python3
from scraper import scraper

scraper.scrape()
Example #15
    def get(self, request, format=None):
        problems = []
        # Scrape each difficulty bucket and store any problems not already in the DB.
        for type_question in ('school', 'easy', 'medium', 'hard'):
            data = scrape(type_question)
            for i in data:
                problems.append(i)
                if not Problem.objects.filter(code=i['code']).exists():
                    Problem.objects.create(code=i['code'],
                                           name=i['name'],
                                           question_type=i['type'],
                                           url=i['url'],
                                           submissions=i['submissions'],
                                           accuracy=i['accuracy'],
                                           submit_url=i['submit_url'],
                                           status=i['status'])
        try:
            return Response({"success": True, "data": problems},
                            status=status.HTTP_200_OK)
        except Exception:
            return Response({"success": False, "message": "No Problems Available"},
                            status=status.HTTP_400_BAD_REQUEST)
Example #16
from scraper import scraper
from utils import write

print('Running init.py...')

programs_info, courses_studyplan, courses_coursebook = scraper.scrape()
write.write_object_raw('programs', programs_info)
write.write_object_raw('courses-studyplan', courses_studyplan)
write.write_object_raw('courses-coursebook', courses_coursebook)

from postprocess import postprocess
from db import store
Example #17
    def test_invalid_url(self):
        """Make sure that the correct exception is raised when an invalid URL is passed in."""
        self.args.url = "notavalidurl"
        with self.assertRaises(scraper.UserInputException):
            scraper.scrape(self.args)