def timeout_runner():
    time_start = datetime.utcnow()
    print('----[ timeout_runner, begin. time: {}'.format(str(time_start)))
    try:
        run_scraper()
    except Exception:
        print("====[ Exception in scraper_sched.timeout_runner: {}".format(sys.exc_info()[0]))
    time_end = datetime.utcnow()
    print('----[ timeout_runner, end. duration: {} time: {}'.format(str(time_end - time_start), time_end))
def main(): """Runs the project.""" print("Welcome to the NHL Statistics Projection Tool:") input_dec = input("* Do you want to scrape data? Y/N: ") if input_dec.upper() == "Y": scraper.run_scraper() data_cleaner.run_data_cleaner() else: data_cleaner.run_data_cleaner()
def main():
    global username
    if not path.exists("config.ini"):
        setup.initialize()
    else:
        print("Configuration File Found!\n")
    if load_file():
        password = prompt_password()
        scraper.run_scraper(username, password)
        scraper.get_announcements()
        scraper.open_subjects(retrieve_subjects())
    else:
        print("Something went wrong... try deleting 'config.ini' and running setup again!")
def fetch_new():
    """
    TODO: Run this task in celery or async thread.
    """
    next_url = request.args.get('next', url_for('home'))
    if session.get('user_type') == 5:
        scraper.run_scraper()
        flash("New posts have been fetched.", category='success')
    else:
        flash("You are not allowed to perform that action.", category='warning')
    return redirect(next_url)
def scrape_profiles():
    login_email = request.form['username']
    keyring.set_password(service_id, login_email, request.form['password'])
    profile = request.form['fb_profile'].split("/")[-1]
    if request.form['twitter_profile']:  # optional twitter link
        twitter_profile = request.form['twitter_profile'].split("?")[0].split("/")[-1]
    else:
        twitter_profile = None
    session['profile'] = profile
    if not os.path.isdir("data/" + profile):
        scraper.run_scraper(login_email, profile, twitter_profile, sys.argv[1])
    return redirect(url_for('recommendations'))
def main():
    parser = argparse.ArgumentParser(description='Crawl a website.')
    parser.add_argument("-u", "--url", help='the root URL of the website to crawl')
    parser.add_argument("-ll", "--loglevel", default=0, help='the log level to use')
    args = parser.parse_args()
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(int(args.loglevel), len(levels) - 1)])
    url = args.url
    if is_valid_url(url):
        print('Scraping website: {url}'.format(url=url))
        run_scraper(url)
    else:
        print('Error, URL does not appear to be valid, '
              'please check and try again')
def main(input_title, subreddit):
    with open('auth.json', 'r') as file:
        auth = json.load(file)
    # reddit = praw.Reddit(client_id=auth['client_id'],
    #                      client_secret=auth['client_secret'],
    #                      username=auth['username'],
    #                      password=auth['password'],
    #                      user_agent='This is a test.')
    if not Path('subreddits/{0}'.format(subreddit)).exists():
        run_scraper(subreddit)
    # populate_freq_dist_stop_list('this is a title', 'AskReddit')
    # prediction = clf.predict(input_title, subreddit)
    # print('Prediction for "{0}": {1}'.format(input_title, prediction))
    prediction_guess = clf.predict_guess(input_title, subreddit)
    print('Prediction_Guess for "{0}": {1}'.format(input_title, prediction_guess))
import sys
import time
import traceback
import logging

from scraper import run_scraper
import settings

logging.basicConfig(filename="scraper.log", level=logging.INFO)

if __name__ == '__main__':
    while True:
        print("{}: Starting scrape cycle".format(time.ctime()))
        try:
            run_scraper()
        except KeyboardInterrupt:
            print("Exiting...")
            logging.info("Exiting...")
            sys.exit(1)
        except Exception as exc:
            print("Error with scraping:", sys.exc_info()[0])
            logging.error("Error with scraping: %s", sys.exc_info()[0])
            traceback.print_exc()
        else:
            print("{}: Successfully finished scraping".format(time.ctime()))
            logging.info(
                "{}: Successfully finished scraping".format(time.ctime()))
        time.sleep(settings.SLEEP_INTERVAL)
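# A minimal sketch of the companion settings module the loop above imports,
# assuming only that it exposes SLEEP_INTERVAL as used there; the value below
# is an illustrative assumption, not taken from the original project.

# settings.py
SLEEP_INTERVAL = 300  # seconds to wait between scrape cycles (assumed value)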
# Developed by Gyu Lim. Jan 2017
#
# Overview
# This program is developed to perform data mining from a web forum.
# For the purpose of demonstrating python web data mining, the following
# particular web forum is used:
# http://www.f150ecoboost.net/forum/42-2015-ford-f150-ecoboost-chat
# Output: Web-scraped data is exported as a csv file.
# It describes the attributes of the 100 most viewed threads.

# STEP1. Read data from the scraper function
from scraper import run_scraper

(sTitle, sThreadLink, sNumViews,
 sLastPostDate, sLastPostTime) = run_scraper()

# STEP2. Save the imported data into a Pandas DataFrame
import pandas as pd

threads = pd.DataFrame({
    "Title": sTitle,
    "Thread Link": sThreadLink,
    "Number of Views": sNumViews,
    "Last Post Date": sLastPostDate,
    "Last Post Time": sLastPostTime,
})
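# STEP3. Export the data frame to a csv file, as stated in the overview above.
# A minimal sketch of that export step; the output file name "threads.csv"
# and the index=False option are assumptions, not taken from the original.
threads.to_csv("threads.csv", index=False)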
def timeout_runner():
    run_scraper()
import praw
import config
import time
import os
import re

import scraper

scraper.run_scraper()
scraper.removeWebTags()


def bot_login():
    print("Logging in...")
    reddit = praw.Reddit(username=config.username,
                         password=config.password,
                         client_id=config.client_id,
                         client_secret=config.client_secret,
                         user_agent="basedcraft's test bot comment responder")
    print("Logged in!")
    return reddit


def run_bot(reddit, comments_replied_to):
    print("Obtaining 10 comments...")
    for submission in reddit.subreddit('test').new(limit=20):
        if (submission.id not in comments_replied_to
                and submission.author != reddit.user.me()):
            if re.search("Community Update", submission.title, re.IGNORECASE):
from time import time

import models
from scraper import run_scraper

if __name__ == '__main__':
    t0 = time()
    models.init_db()
    run_scraper()
    t1 = time()
    models.export_csv()
    models.export_json()
    print('Completed in %f' % (t1 - t0))
from scraper import run_scraper
from rparser import run_parser
from processing import run_processor
from viz_processing import run_viz_processing
from settings import DATA_FOLDER, LIST_MODES
from settings import UPLOAD_TO_CLOUD

# create data folder
DATA_FOLDER.mkdir(exist_ok=True)

p_phq, p_hc, p_org = run_scraper()
f_parsed = run_parser(path_to_phq=p_phq, path_to_hc=p_hc, path_to_org=p_org)
run_processor()
df = run_viz_processing()

# upload the stock data to cloud storage for app to access
if UPLOAD_TO_CLOUD is True:
    from commonfunc import upload_to_gcs
    from settings import GCP_PROJECT
    from settings import CLOUD_STORAGE_BUCKET

    upload_to_gcs(project=GCP_PROJECT,
                  src_file=str((DATA_FOLDER / 'df.gz').resolve()),
                  dst_bucket=CLOUD_STORAGE_BUCKET,
                  dst_blob_name='df.gz')
    upload_to_gcs(project=GCP_PROJECT,
                  src_file=str((DATA_FOLDER / 'df_full.gz').resolve()),
                  dst_bucket=CLOUD_STORAGE_BUCKET,
                  dst_blob_name='df_full.gz')
    for mode in LIST_MODES:
        upload_to_gcs(project=GCP_PROJECT,
                      src_file=str((DATA_FOLDER /
def scrape(id=None):
    if id is None:
        return "<html>No id provided</html>"
    else:
        chats = scraper.run_scraper(id)
        return Response(json.dumps(chats), mimetype='application/json')
#!/usr/bin/env python
import scraper
import time
from itertools import count

iteration = count()
while True:
    scraper.run_scraper("crypto.db")
    print(f"data fetch #{next(iteration)}")
    time.sleep(10)