Example #1
def box_office_scraper():
    # Save log file
    trigger_log_save()
    # Run scraper for latest data refresh
    scraper()
    # return "Scrape complete (Flask)!" # returns HTML/text (r.text)
    return {"data": "Flask data key"}  # returns JSON (r.json())
Example #2
def box_office_scraper_view():
    # Save the log file
    trigger_log_save()
    # Run the scraper for the latest data refresh
    scraper()
    # return "Scrape complete! (FastAPI)" # Sending HTML (r.text)
    return {"data": "FastAPI data key"}  # Sending JSON (r.json())
Example #3
    def onClick(self, event):
        # Open a new window to show the search results
        window = Toplevel(self.root)
        window.geometry("670x500")
        # Run the scraper's search with the two query fields and the result limit
        x = scraper()
        res = x.search(self.entry.get(), self.entry2.get(),
                       int(self.limitentry.get()))
        self.text = Text(window, borderwidth=0, relief=SUNKEN)
        self.text.insert(INSERT, res)
        self.text.place(x=0, y=0)
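The onClick handler above assumes a surrounding Tkinter class that provides self.root, the three entry widgets, and a scraper class exposing a search(query1, query2, limit) method (the original presumably also imports Toplevel, Text, SUNKEN and INSERT, e.g. via a star import). A minimal sketch of that scaffolding, with a hypothetical class name, purely to show the assumed widget layout:

from tkinter import Tk, Entry, Button

class SearchApp:  # hypothetical name; the original class is not shown
    def __init__(self, root):
        # The onClick method above expects these exact attribute names
        self.root = root
        self.entry = Entry(root)        # first search field
        self.entry2 = Entry(root)       # second search field
        self.limitentry = Entry(root)   # result limit, parsed with int()
        for widget in (self.entry, self.entry2, self.limitentry):
            widget.pack()
        self.button = Button(root, text="Search")
        self.button.pack()
        # With onClick attached to this class, wire it up via:
        # self.button.bind("<Button-1>", self.onClick)

if __name__ == "__main__":
    root = Tk()
    SearchApp(root)
    root.mainloop()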
Example #4
def scrapeDataOnePage(pageNum, pageSize, statusId, organizationId, outputFile):
    target_url = (
        'https://bidsandtenders.ic9.esolg.ca/Modules/BidsAndTenders/services/bidsSearch.ashx'
        f'?pageNum={pageNum}&pageSize={pageSize}&statusId={statusId}'
        f'&organizationId={organizationId}'
        '&sortColumn=UtcPublishDate&sortDir=DESC')

    # Request the URL and parse the JSON; bail out early on failure so jsonRes
    # is never referenced before assignment
    try:
        response = requests.get(target_url)
        response.raise_for_status()
        jsonRes = response.json()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
        return
    except Exception as err:
        print(f'Other error occurred: {err}')
        return

    # Extract viewUrl and run the scraper on each view site
    data = jsonRes["data"]["tenders"]

    # Scrape data
    for each in data:
        scraper(each["viewUrl"], outputFile)
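A quick driver for the function above; the ID values, page count, and output filename here are placeholders, not real parameters of the bidsandtenders service:

OUTPUT_FILE = "tenders.csv"  # hypothetical output name

for page in range(1, 6):  # walk the first five result pages
    scrapeDataOnePage(pageNum=page,
                      pageSize=10,
                      statusId=2,
                      organizationId=123,
                      outputFile=OUTPUT_FILE)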
Example #5
def main():
    # Create an empty dict (one list per subreddit) to fill with post information
    posts = {
        "hearthstone": [],
        "blizzard": [],
        "HongKong": [],
        "overwatch": [],
        "gaming": []
    }

    # Pushshift API endpoint for searching Reddit submissions
    submission_endpoint = "https://api.pushshift.io/reddit/search/submission/"

    # Set the scraper start time to the beginning of the day two days before the Blizzard announcement
    start_time = datetime.datetime(2019, 10, 7, 0, 0,
                                   0).timestamp()  # converts to epoch time

    # The most recent time to get posts for
    # end_time = 1570824407  # Timestamp of the last post we got when we first pulled the data
    end_time = datetime.datetime(
        2019, 10, 12, 0, 0,
        0).timestamp()  # Get the posts through Friday night (this is a new run)

    # Call the scraper function to continuously build up a JSON-serializable dict of the posts
    # returned from each subreddit; the full object is returned by the final scraper call into all_posts
    hearthstone_posts = scraper(submission_endpoint, subreddits[0], start_time,
                                end_time, posts)
    blizzard_posts = scraper(submission_endpoint, subreddits[1], start_time,
                             end_time, hearthstone_posts)
    HongKong_posts = scraper(submission_endpoint, subreddits[2], start_time,
                             end_time, blizzard_posts)
    overwatch_posts = scraper(submission_endpoint, subreddits[3], start_time,
                              end_time, HongKong_posts)
    all_posts = scraper(submission_endpoint, subreddits[4], start_time,
                        end_time, overwatch_posts)

    #Save all_posts
    with open("posts/all_posts_new.txt", 'w') as output:
        json.dump(all_posts, output)
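Since each call feeds the previous call's return value into the next, the five chained calls above can be collapsed into a loop. A minimal equivalent sketch, assuming (as the snippet implies) that scraper returns the dict it accumulates into and that subreddits lists the five names in order:

    # Equivalent accumulation loop inside main(), replacing the chained calls above
    all_posts = posts
    for subreddit in subreddits:
        all_posts = scraper(submission_endpoint, subreddit, start_time,
                            end_time, all_posts)

    with open("posts/all_posts_new.txt", 'w') as output:
        json.dump(all_posts, output)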
Example #6
def main():
    cases = None
    casualties = None
    recoveries = None

    while True:
        if DEBUG is True:
            print("Going for another run")

        wp = world_pop(WP_URL, DEBUG)
        results = scraper(URL, DEBUG)
        if results is False:
            logger.error(
                "An error occurred retrieving stats from {}. Trying again in {}"
                .format(URL, WAIT_TIME))
        else:
            try:
                r_keys = list(results.keys())
                r_cases = int(results[r_keys[0]].replace(',', ''))
                r_casualties = int(results[r_keys[1]].replace(',', ''))
                r_recoveries = int(results[r_keys[2]].replace(',', ''))

                clean_fatal_rate = r_casualties / r_cases * 100
                fatal_rate = "{0:.2f}%".format(clean_fatal_rate)
                clean_recov_rate = r_recoveries / r_cases * 100
                recover_rate = "{0:.2f}%".format(clean_recov_rate)
                clean_active_cases = r_cases - r_casualties - r_recoveries
                active_rate = "{0:.2f}%".format(clean_active_cases / r_cases *
                                                100)
                active_cases = f'{clean_active_cases:,}'
                clean_closed_cases = r_cases - clean_active_cases
                closed_rate = "{0:.2f}%".format(clean_closed_cases / r_cases *
                                                100)
                closed_cases = f'{clean_closed_cases:,}'
                closed_fatal_rate = "{0:.2f}%".format(
                    r_casualties / clean_closed_cases * 100)
                closed_recov_rate = "{0:.2f}%".format(
                    r_recoveries / clean_closed_cases * 100)

                results["Fatality Rate"] = fatal_rate
                results["Recovered Rate"] = recover_rate
                results["Active Cases"] = active_cases
                results["Active Cases %"] = active_rate
                results["Closed Cases"] = closed_cases
                results["Closed Cases %"] = closed_rate
                results["Closed Fatality Rate"] = closed_fatal_rate
                results["Closed Recovered Rate"] = closed_recov_rate

                results.update(wp)

                wp_keys = list(wp.keys())
                wp_count = int(wp[wp_keys[0]].replace(',', ''))

                clean_infect_rate = r_cases / wp_count * 100
                infect_rate = "{0:.2f}%".format(clean_infect_rate)

                results["Total Population Infected"] = infect_rate

                logger.info(results)

                if cases is None:
                    cases = r_cases
                    casualties = r_casualties
                    recoveries = r_recoveries

                if cases != r_cases:
                    if cases < r_cases:
                        diff = r_cases - cases
                        cases = r_cases
                        logger.info(
                            "Confirmed cases have risen by: {}. Count now stands at: {}"
                            .format(diff, cases))
                    elif cases > r_cases:
                        diff = cases - r_cases
                        cases = r_cases
                        logger.info(
                            "Confirmed cases have decreased by: {}. Count now stands at: {}"
                            .format(diff, cases))

                if casualties != r_casualties:
                    if casualties < r_casualties:
                        diff = r_casualties - casualties
                        casualties = r_casualties
                        logger.info(
                            "Fatal cases have risen by: {}. Count now stands at: {}. Rate: {}"
                            .format(diff, casualties, fatal_rate))
                    elif casualties > r_casualties:
                        diff = casualties - r_casualties
                        casualties = r_casualties
                        logger.info(
                            "Fatal cases have decreased by: {}. Count now stands at: {}. Rate: {}"
                            .format(diff, casualties, fatal_rate))

                if recoveries != r_recoveries:
                    if recoveries < r_recoveries:
                        diff = r_recoveries - recoveries
                        recoveries = r_recoveries
                        logger.info(
                            "Recovery cases have risen by: {}. Count now stands at: {}. Rate: {}"
                            .format(diff, recoveries, recover_rate))
                    elif recoveries > r_recoveries:
                        diff = recoveries - r_recoveries
                        recoveries = r_recoveries
                        logger.info(
                            "Recovery cases have decreased by: {}. Count now stands at: {}. Rate: {}"
                            .format(diff, recoveries, recover_rate))
            except KeyError:
                logger.error(
                    "KeyError in data. Trying again in {}. Data: {}".format(
                        WAIT_TIME, results))
            except ValueError:
                logger.error(
                    "ValueError in data. Trying again in {}. Data: {}".format(
                        WAIT_TIME, wp))

        time.sleep(WAIT_TIME)
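The three rise/fall logging blocks for cases, casualties, and recoveries above follow the same pattern, so they could be factored into a small helper. A refactoring sketch under the assumption that the logger and the message wording stay as in the example:

def log_delta(previous, current, label, rate=None):
    # Log a rise or fall between two counts and return the new count;
    # label would be "Confirmed", "Fatal" or "Recovery", rate the matching rate string
    if previous is None or previous == current:
        return current
    direction = "risen" if current > previous else "decreased"
    message = "{} cases have {} by: {}. Count now stands at: {}".format(
        label, direction, abs(current - previous), current)
    if rate is not None:
        message += ". Rate: {}".format(rate)
    logger.info(message)
    return current

# Usage inside the loop, mirroring the original assignments:
# cases = log_delta(cases, r_cases, "Confirmed")
# casualties = log_delta(casualties, r_casualties, "Fatal", fatal_rate)
# recoveries = log_delta(recoveries, r_recoveries, "Recovery", recover_rate)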
Example #7
if __name__ == "__main__":
    # Get posts from each of the subreddits defined below over the past 2 days
    submission_endpoint = "https://api.pushshift.io/reddit/search/submission/"
    start_time = datetime.datetime(2019, 10, 30, 0, 0, 0).timestamp()
    end_time = datetime.datetime(2019, 11, 1, 0, 0, 0).timestamp()
    posts = {
        "hearthstone": [],
        "blizzard": [],
        "HongKong": [],
        "overwatch": [],
        "gaming": []
    }
    subreddits = ["hearthstone", "blizzard", "HongKong", "overwatch", "gaming"]
    sample = []
    hearthstone_posts = scraper(submission_endpoint, subreddits[0], start_time,
                                end_time, posts)
    # Randomly select 10 posts from the subreddit
    hearthstone_sample = random.sample(hearthstone_posts["hearthstone"], 10)
    sample.extend(hearthstone_sample)
    blizzard_posts = scraper(submission_endpoint, subreddits[1], start_time,
                             end_time, hearthstone_posts)
    blizzard_sample = random.sample(blizzard_posts["blizzard"], 10)
    sample.extend(blizzard_sample)
    hongkong_posts = scraper(submission_endpoint, subreddits[2], start_time,
                             end_time, blizzard_posts)
    hongkong_samples = random.sample(hongkong_posts["HongKong"], 10)
    sample.extend(hongkong_samples)
    overwatch_posts = scraper(submission_endpoint, subreddits[3], start_time,
                              end_time, hongkong_posts)
    overwatch_samples = random.sample(overwatch_posts["overwatch"], 10)
    sample.extend(overwatch_samples)
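As in the previous example, the per-subreddit calls repeat one pattern, so scraping and sampling can be folded into a loop. A sketch that mirrors the original exactly, including the fact that the chain stops before the "gaming" subreddit, and assumes scraper returns the accumulated dict:

    # Equivalent loop over the first four subreddits (the original never scrapes "gaming")
    accumulated = posts
    for subreddit in subreddits[:4]:
        accumulated = scraper(submission_endpoint, subreddit, start_time,
                              end_time, accumulated)
        sample.extend(random.sample(accumulated[subreddit], 10))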
Example #8
# This script scrapes data from a single view site
from scrape import scraper
URL='https://lkdsb.bidsandtenders.ca/Module/Tenders/en/Tender/Detail/be1fba85-2eb6-4094-b7ad-f7fdd0c3a1f2#'
filename = "single-view"

scraper(URL, filename)
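The two-argument call implies a scraper(url, filename) helper that fetches the view page and writes whatever it extracts to the named file. A hypothetical stand-in built on requests and BeautifulSoup, not the project's actual implementation:

import requests
from bs4 import BeautifulSoup

def scraper(url, filename):
    # Hypothetical stand-in for scrape.scraper; the real parsing logic is not shown in the example
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Dump the page's visible text; the real scraper presumably extracts specific tender fields
    with open(f"{filename}.txt", "w", encoding="utf-8") as handle:
        handle.write(soup.get_text(separator="\n", strip=True))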

Example #9
from scrape import scraper
from file_utils import read_csv, write_csv, read_file, write_file
from generate import _render_template, preprocess
from image import pass_gen
from mail import sendmail
import json

# Scraping the webpage and storing the data in a csv
data = scraper('http://scrape.kjscecodecell.com/')
write_csv(data)

# Reading the scraped data from the csv and preprocessing the data
participants = read_csv()
participants = preprocess(participants)

# Getting the list of mails to whom mails have already been sent
sent_mails = read_file()

# Looping over all participants
for participant in participants:
    # Checking if the participant was sent a mail previously
    if participant['email'] not in sent_mails:
        name = participant['name']
        email = participant['email']
        phone = participant['phone']
        payment_status = participant['payment']

        # Generating a message from the template
        message = _render_template(name, payment_status)

        # Generating a custom image
Example #10
from app import db, Bitcoin
from scrape import scraper

url = "https://coinmarketcap.com/2/"
scraper_data = scraper(url)


def put_together():
    # Make sure the database is empty
    db.drop_all()
    # Create the database tables and columns
    db.create_all()
    # Iterate over the scraped table rows and fill the columns with the info
    # (fills the columns with the data and the column names)
    for coin in scraper_data:
        new_row = Bitcoin(Name=coin[0],
                          Price=coin[1],
                          _24h=coin[2],
                          _7d=coin[3],
                          Market_Cap=coin[4],
                          Volume=coin[5],
                          Circulating_Supply=coin[6])
        db.session.add(new_row)
        db.session.commit()


if __name__ == '__main__':
    put_together()
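The row construction above implies a Flask-SQLAlchemy Bitcoin model with one column per keyword argument. A sketch of what that model might look like; the column types and lengths are assumptions, and the real definitions live in app.py:

from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class Bitcoin(db.Model):
    # Hypothetical column types; the project's actual definitions may differ
    id = db.Column(db.Integer, primary_key=True)
    Name = db.Column(db.String(64))
    Price = db.Column(db.String(32))
    _24h = db.Column(db.String(16))
    _7d = db.Column(db.String(16))
    Market_Cap = db.Column(db.String(32))
    Volume = db.Column(db.String(32))
    Circulating_Supply = db.Column(db.String(32))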
Example #11
from scrape import scraper
from csv_utils import read_csv, write_csv, get_unpaid_participants
from generate import _render_template
from mail import sendmail
import json

data = scraper('http://scrape.surge.sh/')
write_csv(data, "studentdetails.csv")

unpaid_participants, paid_count = get_unpaid_participants("studentdetails.csv")
total_seats = 500

for participant in unpaid_participants:
    html = _render_template(participant[0], total_seats - paid_count)
    sendmail(to_email=participant[0], html=html)
Example #12
import os
import pickle

from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from scrape import scraper

# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# The ID and range of the spreadsheet you are adding to.
SAMPLE_SPREADSHEET_ID = '1GR5X2Ryk-S3cs1CE7fl60bpXw0TI-lLv7EMAkzAtUj0'
#SAMPLE_RANGE_NAME = 'Sheet1!A2:C'
range_name = 'Sheet1!A2:V'
# FirstTime = True

# The stats of all players on the Toronto Raptors for each game, as a list
stats = scraper()


def main():
    """Shows basic usage of the Sheets API.
    Prints values from a sample spreadsheet.
    """
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
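The example breaks off inside the credential check. A sketch of how the standard Sheets API quickstart flow typically continues from that point and then appends the scraped rows; the 'credentials.json' filename and the append call are assumptions based on the usual pattern, not code from this project:

    # ...continuing inside main() from the truncated credential check above
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('sheets', 'v4', credentials=creds)
    # Append the scraped player stats to the configured range
    service.spreadsheets().values().append(
        spreadsheetId=SAMPLE_SPREADSHEET_ID,
        range=range_name,
        valueInputOption='USER_ENTERED',
        body={'values': stats}).execute()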
Example #13
def scrape_data():
    print('scraping...')
    # Clear out the previous scrape, then insert the freshly scraped documents
    myColl.delete_many({})
    resp = scraper()
    myColl.insert_many(resp)
    return 'Scraped'
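myColl above is presumably a pymongo collection; a sketch of the assumed setup, with placeholder database and collection names and a local connection URI:

from pymongo import MongoClient

# Hypothetical names and URI; the example does not show the real configuration
client = MongoClient("mongodb://localhost:27017/")
myColl = client["scraper_db"]["scraped_data"]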
Example #14
def main():
    parser = argparse.ArgumentParser()

    # Positional arguments
    parser.add_argument('url',
                        help='URL to the 4chan thread you want to scrape',
                        type=str)
    parser.add_argument('destination',
                        help='Destination folder in your home folder',
                        type=str)

    # Optional arguments
    parser.add_argument('-q',
                        '--quiet',
                        help='Run the script in quiet mode, no outputs',
                        action='store_true',
                        default=False)
    parser.add_argument('-v',
                        '--verbose',
                        help='Run with increased verbosity',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-w',
        '--watch',
        help=
        'Watch the thread, will check thread every 5 minutes for new posts until thread 404s',
        action='store_true',
        default=False)
    parser.add_argument(
        '-i',
        '--interval',
        help='Specify the wait-time when watching a thread in seconds',
        type=int)

    args = parser.parse_args()

    # Set behaviour:
    verbose = args.verbose
    quiet = args.quiet
    watch = args.watch
    interval = 300 if not args.interval else args.interval

    # Set variables
    link = args.url
    destination = args.destination
    board = link.split('/')[3]  # 'tv', 'wg', or similar
    thread_id = link.split('/')[5]  # '114804039' or similar
    url = f'https://a.4cdn.org/{board}/thread/{thread_id}.json'
    content_url = f'https://i.4cdn.org/{board}/'

    # Determine platform/OS and set appropriate path
    system = platform.system()
    if system == 'Linux':
        home = os.environ['HOME']
        destination = f'{home}/{destination}/'
    elif system == 'Windows':
        home = os.environ['HOMEPATH']
        destination = f'{home}\\{destination}\\'
    else:
        if not quiet:
            print('Unsupported system, exiting')
        sys.exit(2)

    if verbose:
        print('Will scrape using the following information:')
        print(f'\tLink: \t\t{link}')
        print(f'\tBoard: \t\t{board}')
        print(f'\tThread ID: \t{thread_id}')
        print(f'\tURL: \t\t{url}')
        print(f'\tContent URL: \t{content_url}')
        print(f'\tDestination: \t{destination}\n')
        if watch:
            print(f'Will watch the thread. Interval: {interval}')

    # Create the destination folder
    if verbose:
        print(f'--> creating folder: {destination}')
    try:
        os.makedirs(destination, exist_ok=True)
    except Exception as e:
        if not quiet:
            print(f'Could not create destination folder: {e}')
        sys.exit(3)

    # Get the thread in JSON-representation:
    try:
        if verbose:
            print(
                f'--> getting the thread metadata from thread id: {board}/{thread_id}'
            )
        posts = refresh_post_list(url, quiet, verbose)
    except Exception as e:
        if not quiet:
            print(f'Could not get thread metadata, reason: {e}')
        sys.exit(4)

    # Set timestamp
    start_time = posts[0]['time']

    # Provide more verbose information about the thread:
    if verbose:
        first_post = posts[0]
        if first_post.get('sub'):
            title = first_post['sub']
        else:
            title = None
        no_of_images = first_post['images']
        no_of_replies = first_post['replies']
        time_of_first_post = datetime.utcfromtimestamp(start_time).strftime(
            '%Y-%m-%d %H:%M:%S')

        print('--> metainformation about the thread:')
        if title:
            print(f'\tTitle: {title}')
        print(f'\tNumber of images: {no_of_images}')
        print(f'\tNumber of replies: {no_of_replies}')
        print(f'\tTime of first post: {time_of_first_post} UTC')

    new_time = scraper(posts, start_time, content_url, destination, quiet,
                       verbose)
    if verbose:
        print(f'--> timestamp of last post: {new_time}')

    if watch:
        if not quiet:
            print('--- watching thread ---')
        while True:
            if verbose:
                print(
                    f'--> waiting {timedelta(seconds=interval)} before refreshing thread'
                )
            time.sleep(interval)

            if verbose:
                print('--> refreshing list of posts')
            posts = refresh_post_list(url, quiet, verbose)

            # Check if thread is closed:
            if posts[0].get('closed'):
                if posts[0]['closed']:
                    if not quiet:
                        print('Thread is closed, exiting')
                    break
            if verbose:
                print('--> attempting to download new images')
            new_time = scraper(posts, new_time, content_url, destination,
                               quiet, verbose)
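Given the argument parser above, a typical invocation might look like this (the script filename and destination folder are placeholders; the board and thread ID follow the format noted in the example's comments):

# Hypothetical usage from a shell:
#   python thread_scraper.py https://boards.4chan.org/wg/thread/114804039 wallpapers --watch --interval 600
# This downloads the thread's images into ~/wallpapers/ and re-checks every ten
# minutes until the thread closes or 404s.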