Example #1
def scrape_latest(scrape_target: Text):
    # This header is added to cron requests by GAE, and stripped from any external
    # requests. See
    # https://cloud.google.com/appengine/docs/standard/python3/scheduling-jobs-with-cron-yaml#validating_cron_requests
    if not flask.request.headers.get('X-Appengine-Cron'):
        return 'Attempted to access internal endpoint.', status.HTTP_403_FORBIDDEN
    scrapers.scrape(scrape_target)
    return 'Successfully scraped latest %s.' % scrape_target, status.HTTP_200_OK
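A minimal sketch (assumed wiring) of how a handler like the one above is typically registered and exercised; the route path, the flask_api status module, and the test call are assumptions, not part of the original snippet.

import flask
from flask_api import status  # assumed source of the HTTP_* constants

app = flask.Flask(__name__)


@app.route('/tasks/scrape/<scrape_target>')
def scrape_latest_sketch(scrape_target):
    # GAE strips X-Appengine-Cron from external requests, so its presence
    # proves the call really came from the Cron service.
    if not flask.request.headers.get('X-Appengine-Cron'):
        return 'Attempted to access internal endpoint.', status.HTTP_403_FORBIDDEN
    return 'Would scrape %s here.' % scrape_target, status.HTTP_200_OK


# Without the cron header the endpoint refuses to run:
with app.test_request_context('/tasks/scrape/latest'):
    _, code = scrape_latest_sketch('latest')
    assert code == status.HTTP_403_FORBIDDEN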
Example #2
def scrape_dep_sites():
    """Scrapes two of the websites associated with deputy-sessions, the CV and overview sites, and dumps the htmls in folder."""

    scrape_counter = 0

    # load urls into a list
    db_name_user = '******'
    table_command = 'SELECT dep_name, dep_url FROM dep_name_url'
    urls = ftt.get_list_from_table(db_name_user, table_command)

    cv_fldr = "/media/radu/romparl/CD/htmls/politicians/cv_pages/"
    smry_fldr = "/media/radu/romparl/CD/htmls/politicians/summary_pages/"
    err_fldr = "/media/radu/romparl/CD/errors/"

    # go through urls and scrape; skip the 2016 session, which has no CVs yet
    for u in urls:
        if '2016' not in u[1]:
            session = u[1][-4:]
            scrape_counter += 1
            print(u[0], session, scrape_counter)
            cv_u = u[1] + '&pag=0'
            cv_file = str(u[0]) + '-' + session + '-cv.txt'
            smry_u = u[1] + '&pag=1'
            smry_file = str(u[0]) + '-' + session + '-summary.txt'

            try:
                cv_uht = sc.scrape(cv_u)
                cv_dictio = {
                    'url': cv_uht[0],
                    'html': cv_uht[1],
                    'scrape_time_utc': cv_uht[2]
                }
                ftt.json_file_dump(cv_fldr, cv_file, cv_dictio)

                smry_uht = sc.scrape(smry_u)
                smry_dictio = {
                    'url': smry_uht[0],
                    'html': smry_uht[1],
                    'scrape_time_utc': smry_uht[2]
                }
                ftt.json_file_dump(smry_fldr, smry_file, smry_dictio)

            except Exception:
                err_file = str(u[0]) + '.txt'
                tb = traceback.format_exc()  # capture the traceback of the raised exception
                error = {
                    'message': tb,
                    'error_index': scrape_counter,
                    'list_element': u[0]
                }
                ftt.json_file_dump(err_fldr, err_file, error)
                continue
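A minimal sketch of the helper contracts the function above assumes; the real sc and ftt modules are not shown, so these bodies (and the use of requests) are illustrative assumptions only.

import datetime
import json
import os

import requests


def scrape(url):
    """Fetch a page and return the (url, html, scrape_time_utc) tuple the caller unpacks."""
    response = requests.get(url, timeout=30)
    return url, response.text, datetime.datetime.utcnow().isoformat()


def json_file_dump(folder, filename, dictio):
    """Serialize a dict as JSON to <folder>/<filename>, creating the folder if needed."""
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, filename), 'w', encoding='utf-8') as f:
        json.dump(dictio, f, ensure_ascii=False)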
Example #3
        print 'Got {0} out of {1} total'.format(str(contained_report),
                                                str(total_report))
        raise dke.MissingPlayersException(
            'Total missing players at price point: ' + str(miss_len))


if __name__ == "__main__":
    args = get_args()
    uploader = nba_upload if args.l == 'NBA' else nfl_upload
    if not args.keep_pids:
        uploader.create_upload_file()
    if args.pids:
        player_map = uploader.map_pids(args.pids)
    if args.s == _YES:
        try:
            scrapers.scrape(args.source)
        except KeyError:
            raise dke.InvalidProjectionSourceException(
                'You must choose from the following data sources {}.'.format(
                    scrapers.scrape_dict.keys()))

    rosters, remove = [], []
    for x in xrange(0, int(args.i)):
        rosters.append(run(args.l, remove, args))
        if args.pids:
            uploader.update_upload_csv(player_map,
                                       rosters[x].sorted_players()[:])
        if None not in rosters:
            for roster in rosters:
                for player in roster.players:
                    remove.append(player.name)
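A sketch (assumed argparse wiring) of the get_args() helper implied by the attribute accesses above (args.l, args.keep_pids, args.pids, args.s, args.source, args.i); flag names, defaults, and help strings are guesses.

import argparse


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', help='league, e.g. NBA or NFL')
    parser.add_argument('-keep_pids', action='store_true',
                        help='reuse the existing upload file')
    parser.add_argument('-pids', help='path to a player-id CSV')
    parser.add_argument('-s', help='scrape projections first (y/n)')
    parser.add_argument('-source', help='projection data source')
    parser.add_argument('-i', default=1, help='number of rosters to generate')
    return parser.parse_args()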
Example #4
    missing = filter(lambda x: x.marked != "Y" and x.cost > min_cost, all_players)
    miss_len = len(missing)
    if e_raise < miss_len:
        print "Got {0} out of {1} total".format(str(contained_report), str(total_report))
        raise dke.MissingPlayersException("Total missing players at price point: " + str(miss_len))


if __name__ == "__main__":
    args = get_args()
    if not args.keep_pids:
        upload.create_upload_file()
    if args.pids:
        player_map = upload.map_pids(args.pids)
    if args.s == _YES:
        try:
            scrapers.scrape(args.source)
        except KeyError:
            raise dke.InvalidProjectionSourceException(
                "You must choose from the following data sources {}.".format(scrapers.scrape_dict.keys())
            )

    rosters, remove = [], []
    for x in xrange(0, int(args.i)):
        rosters.append(run(cons.POSITIONS[args.l], args.l, remove, args))
        if args.pids:
            upload.update_upload_csv(player_map, rosters[x].sorted_players()[:])
        if None not in rosters:
            for roster in rosters:
                for player in roster.players:
                    remove.append(player.name)
Example #5
import time

from utils import intput_output
from utils.http import get_page
from utils.validation import get_errors
import scrapers

urls = intput_output.import_urls()
output = []
for url in urls:
    html = get_page(url)
    result = scrapers.scrape(url, html)
    errors = get_errors(result)
    output.append({'url': url, 'errors': errors, 'result': result})
    print('Scraped {} with {} errors'.format(url, len(errors)))
    time.sleep(1)  # pause between requests to avoid hammering the site

intput_output.export_results(output)
print('Scrape complete! See: ' + intput_output.OUTPUT_FILE)
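A minimal sketch of what the helper modules above might provide; the real utils package is not shown, so the function bodies, file names, and the use of requests are assumptions.

import json

import requests

OUTPUT_FILE = 'results.json'  # assumed output path


def import_urls(path='urls.txt'):
    """Read one URL per line from a plain-text file."""
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def get_page(url):
    """Fetch the raw HTML for a URL."""
    return requests.get(url, timeout=30).text


def export_results(results, path=OUTPUT_FILE):
    """Write the accumulated scrape results out as JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)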
Example #6
def jobs_route_get():
    if session.get('signedIn') is None:
        return redirect('/login')
    sites = '-site:yelp.com/* -site:dice.com/* -site:indeed.com/* -site:monster.com/* -site:glassdoor.com/ -site:jobs.climber.com/* -site:ziprecruiter.com/* site:jobs.*.com/* OR site:careers.*.com/* OR site:*.com/careers/* OR site:*.com/jobs/* OR site:*.org/careers/* OR site:*.org/jobs/* OR site:jobs.lever.co/* OR site:boards.greenhouse.io/* OR site:linkedin.com/jobs/view/* '

    results = pref_sql(
        "SELECT skills, exclusions, postype, field, explevel FROM user WHERE uid = '{0}'",
        (session['uid'], ))

    if len(results) and None not in results[0][:5]:  # if we have something in the database
        skills = results[0][0].split(";")
        exclusions = results[0][1].split(";")
        postype = results[0][2].split(";")
        fields = results[0][3].split(";")
        experience_level = results[0][4].split(";")

    experience_level = experience_level[0]

    all_fields = fields[0]
    for field in range(1, len(fields)):
        if fields[field] != '':
            all_fields += ' OR ' + fields[field]
    all_positions = postype[0]
    for pos in range(1, len(postype)):
        if postype[pos] != '':
            all_positions += ' ' + postype[pos]
    all_exclusions = exclusions[0]
    for exclusion in range(1, len(exclusions)):
        if exclusions[exclusion] != '':
            all_exclusions += ' -' + exclusions[exclusion]
    # junior candidates: also exclude senior and lead postings
    if experience_level in ('New Grad', 'Intern', 'Entry Level'):
        all_exclusions += " -senior -lead"

    all_skills = '"' + skills[0] + '"'
    for skill in range(1, len(skills)):
        if skills[skill] != '':
            all_skills += ' OR ' + '"' + skills[skill] + '"'

    query = sites + all_positions + ' ' + experience_level + ' ' + all_fields + ' ' + all_skills + ' -' + all_exclusions

    jobs, summaries, num, full_desc = scraper.scrape(query)

    match_skills = []
    match_pos = []
    match_fields = []

    for desc in full_desc:
        skilz = findAllMatches(skills, desc)
        positionz = findAllMatches(postype, desc)
        fieldz = findAllMatches(fields, desc)
        match_skills.append(skilz)
        match_pos.append(positionz)
        match_fields.append(fieldz)

    print(match_skills)

    return render_template("jobs.html",
                           jobs=jobs,
                           summaries=summaries,
                           num=num,
                           match_skills=match_skills,
                           match_pos=match_pos,
                           match_fields=match_fields,
                           signedIn=True)
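A sketch of the findAllMatches helper the route relies on; its real implementation is not shown, so this case-insensitive version is an assumption.

def findAllMatches(terms, text):
    """Return the terms (skills, position types, or fields) that appear in a job description."""
    lowered = text.lower()
    return [term for term in terms if term and term.lower() in lowered]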