def timeout_runner():
    time_start = datetime.utcnow()
    print('----[ timeout_runner, begin. time: {}'.format(str(time_start)))
    try:
        run_scraper()
    except Exception:
        print("====[ Exception in scraper_sched.timeout_runner: {}".format(sys.exc_info()[0]))
    time_end = datetime.utcnow()
    print('----[ timeout_runner, end. duration: {} time: {}'.format(str(time_end - time_start), time_end))
def main(): """Runs the project.""" print("Welcome to the NHL Statistics Projection Tool:") input_dec = input("* Do you want to scrape data? Y/N: ") if input_dec.upper() == "Y": scraper.run_scraper() data_cleaner.run_data_cleaner() else: data_cleaner.run_data_cleaner()
def main():
    global username
    if not path.exists("config.ini"):
        setup.initialize()
    else:
        print("Configuration File Found!\n")
    if load_file():
        password = prompt_password()
        scraper.run_scraper(username, password)
        scraper.get_announcements()
        scraper.open_subjects(retrieve_subjects())
    else:
        print("Something went wrong... try deleting 'config.ini' and running setup again!")
def fetch_new():
    """
    TODO: Run this task in celery or async thread.
    """
    next_url = request.args.get('next', url_for('home'))
    if session.get('user_type') == 5:
        scraper.run_scraper()
        flash("New posts have been fetched.", category='success')
    else:
        flash("You are not allowed to perform that action.", category='warning')
    return redirect(next_url)
def scrape_profiles():
    login_email = request.form['username']
    keyring.set_password(service_id, login_email, request.form['password'])
    profile = request.form['fb_profile'].split("/")[-1]
    if request.form['twitter_profile']:  # optional twitter link
        twitter_profile = request.form['twitter_profile'].split("?")[0].split("/")[-1]
    else:
        twitter_profile = None
    session['profile'] = profile
    if not os.path.isdir("data/" + profile):
        scraper.run_scraper(login_email, profile, twitter_profile, sys.argv[1])
    return redirect(url_for('recommendations'))
def main():
    parser = argparse.ArgumentParser(description='Crawl a website.')
    parser.add_argument("-u", "--url", help='the root URL of the website to crawl')
    parser.add_argument("-ll", "--loglevel", default=0, help='the log level to use')
    args = parser.parse_args()
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(int(args.loglevel), len(levels) - 1)])
    url = args.url
    if is_valid_url(url):
        print('Scraping website: {url}'.format(url=url))
        run_scraper(url)
    else:
        print('Error, URL does not appear to be valid, '
              'please check and try again')
def main(input_title, subreddit):
    with open('auth.json', 'r') as file:
        auth = json.load(file)
    # reddit = praw.Reddit(client_id=auth['client_id'],
    #                      client_secret=auth['client_secret'],
    #                      username=auth['username'],
    #                      password=auth['password'],
    #                      user_agent='This is a test.')
    if not Path('subreddits/{0}'.format(subreddit)).exists():
        run_scraper(subreddit)
    # populate_freq_dist_stop_list('this is a title', 'AskReddit')
    # prediction = clf.predict(input_title, subreddit)
    # print('Prediction for "{0}": {1}'.format(input_title, prediction))
    prediction_guess = clf.predict_guess(input_title, subreddit)
    print('Prediction_Guess for "{0}": {1}'.format(input_title, prediction_guess))
import sys
import time
import traceback
import logging

from scraper import run_scraper
import settings

logging.basicConfig(filename="scraper.log", level=logging.INFO)

if __name__ == '__main__':
    while True:
        print("{}: Starting scrape cycle".format(time.ctime()))
        try:
            run_scraper()
        except KeyboardInterrupt:
            print("Exiting...")
            logging.info("Exiting...")
            sys.exit(1)
        except Exception as exc:
            print("Error with scraping:", sys.exc_info()[0])
            logging.error("Error with scraping: %s", sys.exc_info()[0])
            traceback.print_exc()
        else:
            print("{}: Successfully finished scraping".format(time.ctime()))
            logging.info(
                "{}: Successfully finished scraping".format(time.ctime()))
        time.sleep(settings.SLEEP_INTERVAL)
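# A minimal sketch of the companion settings module the loop above imports,
# assuming only that it exposes SLEEP_INTERVAL as used there; the value below
# is an illustrative assumption, not taken from the original project.

# settings.py
SLEEP_INTERVAL = 300  # seconds to wait between scrape cycles (assumed value)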
# Developed by Gyu Lim. Jan 2017
#
# Overview
# This program is developed to perform data mining from a web forum.
# For the purpose of demonstrating python web data mining, the following
# particular web forum is used:
# http://www.f150ecoboost.net/forum/42-2015-ford-f150-ecoboost-chat
# Output: Web-scraped data is exported as a csv file.
# It describes the attributes of the 100 most viewed threads.

# STEP1. Read data from the scraper function
from scraper import run_scraper

(sTitle, sThreadLink, sNumViews,
 sLastPostDate, sLastPostTime) = run_scraper()

# STEP2. Save the imported data into a Pandas DataFrame
import pandas as pd

threads = pd.DataFrame({
    "Title": sTitle,
    "Thread Link": sThreadLink,
    "Number of Views": sNumViews,
    "Last Post Date": sLastPostDate,
    "Last Post Time": sLastPostTime,
})
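# STEP3. Export the data frame to a csv file, as stated in the overview above.
# A minimal sketch of that export step; the output file name "threads.csv"
# and the index=False option are assumptions, not taken from the original.
threads.to_csv("threads.csv", index=False)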
def timeout_runner():
    run_scraper()
import praw
import config
import time
import os
import re

import scraper

scraper.run_scraper()
scraper.removeWebTags()


def bot_login():
    print("Logging in...")
    reddit = praw.Reddit(username=config.username,
                         password=config.password,
                         client_id=config.client_id,
                         client_secret=config.client_secret,
                         user_agent="basedcraft's test bot comment responder")
    print("Logged in!")
    return reddit


def run_bot(reddit, comments_replied_to):
    print("Obtaining 10 comments...")
    for submission in reddit.subreddit('test').new(limit=20):
        if (submission.id not in comments_replied_to
                and submission.author != reddit.user.me()):
            if re.search("Community Update", submission.title, re.IGNORECASE):
from time import time

import models
from scraper import run_scraper

if __name__ == '__main__':
    t0 = time()
    models.init_db()
    run_scraper()
    t1 = time()
    models.export_csv()
    models.export_json()
    print('Completed in %f' % (t1 - t0))
from scraper import run_scraper
from rparser import run_parser
from processing import run_processor
from viz_processing import run_viz_processing
from settings import DATA_FOLDER, LIST_MODES
from settings import UPLOAD_TO_CLOUD

# create data folder
DATA_FOLDER.mkdir(exist_ok=True)

p_phq, p_hc, p_org = run_scraper()
f_parsed = run_parser(path_to_phq=p_phq, path_to_hc=p_hc, path_to_org=p_org)
run_processor()
df = run_viz_processing()

# upload the stock data to cloud storage for app to access
if UPLOAD_TO_CLOUD is True:
    from commonfunc import upload_to_gcs
    from settings import GCP_PROJECT
    from settings import CLOUD_STORAGE_BUCKET

    upload_to_gcs(project=GCP_PROJECT,
                  src_file=str((DATA_FOLDER / 'df.gz').resolve()),
                  dst_bucket=CLOUD_STORAGE_BUCKET,
                  dst_blob_name='df.gz')
    upload_to_gcs(project=GCP_PROJECT,
                  src_file=str((DATA_FOLDER / 'df_full.gz').resolve()),
                  dst_bucket=CLOUD_STORAGE_BUCKET,
                  dst_blob_name='df_full.gz')
    for mode in LIST_MODES:
        upload_to_gcs(project=GCP_PROJECT,
                      src_file=str((DATA_FOLDER /
def scrape(id=None):
    if id is None:
        return "<html>No id provided</html>"
    else:
        chats = scraper.run_scraper(id)
        return Response(json.dumps(chats), mimetype='application/json')
#!/usr/bin/env python
import scraper
import time
from itertools import count

iteration = count()
while True:
    scraper.run_scraper("crypto.db")
    print(f"data fetch #{next(iteration)}")
    time.sleep(10)