def scrape_article_data():
    target = request.get_json()
    url = target['url']
    source_name = target['name']
    response = requests.get(url)
    article = scrape_data(response, source_name)
    return jsonify(data=article)
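# Hypothetical client call for the route above, assuming it is registered at
# POST /scrape on a local dev server (both the path and the port are assumptions):
import requests

resp = requests.post(
    'http://localhost:5000/scrape',
    json={'url': 'https://example.com/story', 'name': 'Example News'},
)
print(resp.json()['data'])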
def downloader_daemon(self):
    logger.error_msg("downloader_daemon: Start of function.", None)
    for page in range(1, 126):
        time.sleep(0.1)
        # Block until new maps are available, polling every 2 seconds.
        while True:
            if self.check_for_new_maps():
                break
            else:
                time.sleep(2)
        # Clear out any beatmaps left over from the previous page.
        while self.beatmaps:
            self.beatmaps.pop()
        scraper.scrape_data(self.beatmaps, page)
        evaluator.filter_maps(self.beatmaps)
        scraper.scrape_data_after_filtering(self.beatmaps)
        self.emit(SIGNAL("add_widgets_to_copy()"))
def index(username):
    result = scrape_data(username)
    if result is None:
        return jsonify({'error': 'Account Not Found!'}), 404
    return jsonify(result), 200
def chart_data():
    print("Serving chart data...")
    global last_updated_macro, macro_data_obj, one_day
    print("Fetching chart data", time.time(), last_updated_macro, time.time() - last_updated_macro)
    # Re-scrape the macro data at most once per day; otherwise serve the cached object.
    if time.time() - last_updated_macro > one_day:
        print("Updating Macro Data")
        macro_data_obj = sc.scrape_data()
        last_updated_macro = time.time()
    return jsonify(macro_data_obj)
def index(username):
    response_data = scrape_data(username)
    if response_data is None:
        return jsonify({'response': {'error': "Can't find data."}}), 404
    return jsonify({'response': response_data}), 200
def main():
    logger.configure()
    logger.log(logging.INFO, msg="User started interacting with the scraper", destination=logger.FILE)

    parser = argparse.ArgumentParser(description="scrape instagram by keyword (hashtag)")
    # used only within cli mode
    parser.add_argument("-k", "--keyword", help="the keyword to find in instagram (by hashtag or username)")
    parser.add_argument("-l", "--limit", type=int, default=1000, help="limit of instagram posts to scrape")
    parser.add_argument("-f", "--filename", help="option for logging in through a file\n"
                        "username must be in the first line and password in the second one")
    args = parser.parse_args()

    username, password, keyword = "", "", ""
    # cli mode
    if args.keyword:
        keyword = args.keyword
        filename = args.filename if args.filename else "auth.txt"
        try:
            username, password = get_auth_by_file(filename)
        except FileNotFoundError:
            logger.log(logging.ERROR, msg="No credentials file was provided and the default auth.txt was not found")
            quit(0)
    # interactive mode (default)
    else:
        username, password, keyword = interactive_credentials()

    try:
        # We initialize the DB
        db.initialize()
        # If all good we go scraping
        scrape_data(username=username, password=password, keyword=keyword, limit=args.limit)
    except FileNotFoundError:
        print(f"You must have a file called {config.AUTH_DB_FILE} with your MySQL credentials: "
              f"host, username and password each in a separate line.")
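# get_auth_by_file is referenced above but not defined here; a minimal sketch,
# assuming the two-line format the --filename help text describes:
def get_auth_by_file(filename):
    # Reads credentials from a text file: username on the first line,
    # password on the second. Raises FileNotFoundError if the file is
    # missing, which the caller above catches.
    with open(filename) as f:
        username = f.readline().strip()
        password = f.readline().strip()
    return username, password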
# Here's the SQL to create our database table
TBL_CREATE_STMT = """
CREATE TABLE IF NOT EXISTS failed_banks (
    bank VARCHAR(54) NOT NULL,
    city VARCHAR(17) NOT NULL,
    state VARCHAR(4) NOT NULL,
    cert_num INTEGER NOT NULL,
    acq_inst VARCHAR(65) NOT NULL,
    closed DATE NOT NULL,
    updated DATE NOT NULL,
    url VARCHAR(100) NOT NULL
)
"""

# Execute the create table sql
cur.execute(TBL_CREATE_STMT)

# Commit our change
conn.commit()

# Get results data (recall that it's a list of two elements [headers, data])
results = scrape_data()
data = results[1]

cur.executemany('INSERT INTO failed_banks (bank, city, state, cert_num, acq_inst, '
                'closed, updated, url) VALUES (?, ?, ?, ?, ?, ?, ?, ?);', data)

# Commit our inserts
conn.commit()

# Close db connection
conn.close()
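# For context: the cur and conn used above presumably come from a sqlite3
# setup like this minimal sketch (the database filename is an assumption).
import sqlite3

conn = sqlite3.connect('failed_banks.db')
cur = conn.cursor()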
import scraper
import sa_data
import sys

if __name__ == '__main__':
    print(sys.argv)
    scraper.scrape_data(1)
    sa_data.preprocess_all()
    if len(sys.argv) > 1 and sys.argv[1] == "pythonanywhere":
        pass
    else:
        sa_data.copy_data_local()
""" data = [ { 'bank': 'First Alliance', 'city': 'Manchester', 'state': 'NH', 'cert_num': '34264', 'acq_inst': 'Southern New Hampshire Bank & Trust', 'closed': 'February 15, 2013', 'updated': 'February 20, 2013', 'url': 'http://www.fdic.gov/bank/individual/failed/firstalliance.html' } ] """ data = scrape_data() # Let's mess up one row to demo try/except: # data[0]['closed'] = 'Jnauary 15, 2013' # Each dictionary has these keys # bank, city, state, cert_num, acq_inst, closed, updated, url # Iterate through each row of our data and verify data types valid for row in data: # First, we'll verify cert_num is an integer try: row['cert_num'] = int(row['cert_num']) except: row['cert_num'] = 0
import os
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# os.chdir('local\\directory')

import scraper
import preprocessor
import loader

data = scraper.scrape_data()
data = preprocessor.preprocess_data(data)
loader.gsheets_upload(data)

### saving local copy of data
# data.to_excel('./data/processed_data_extract.xlsx')
import scraper

# Test 1: test get_form_data(). Should return dict of 3 key:val pairs
###############################################################################
url = "https://www.fangraphs.com/dailyprojections.aspx?pos=all&stats=bat&type=sabersim"
test1 = scraper.get_form_data(url)
if len(test1.values()) == 3 and type(test1) == dict:
    test1_result = True

# Test 2: test scrape_data(url, form_data). Should return a non-zero length string.
###############################################################################
test2 = scraper.scrape_data(url, form_data=test1)  # use test1 as 2nd arg input
if len(test2) > 0 and type(test2) == str:
    test2_result = True

# Test 3: test parse_text_to_df(txt). Should yield dataframe with >50 rows, 20 cols
###############################################################################
test3 = scraper.parse_text_to_df(test2)
rows, cols = test3.shape
if rows > 50 and cols == 20:
    test3_result = True

# Test 4: test series_today_date(df). Should return pandas Series of same
# length as test3 rows count, containing today's date.
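# Test 4 is described above but not implemented; a sketch in the same style,
# assuming scraper.series_today_date(df) behaves as the comment says:
###############################################################################
from datetime import date

test4 = scraper.series_today_date(test3)
# Should be one entry of today's date per row of test3 (per the comment above).
if len(test4) == rows and all(d == date.today() for d in test4):
    test4_result = True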
def api(searchTerm):
    return render_template("jobs.html", query=searchTerm, jobs=scrape_data(searchTerm))
import scraper
import time
import os
import sys

print("welcome to web scraper")
query = input("enter a google search: ")
result = scraper.search(query)
print("Loading websites with info on search.......................")
time.sleep(3)
print(result)
time.sleep(3)
print("scraping data................................")
content = scraper.scrape_data(result)
print("scraping complete")
time.sleep(3)
print("parsing data")
data = scraper.parse_data(content)
time.sleep(3)
print("data parsed")
file_name = input("enter .txt filename: ")
scraper.write_data(file_name, data)
urls_list = [
    'https://www.czc.cz/operacni-pameti/produkty',
    'https://www.czc.cz/disky/produkty',
    'https://www.czc.cz/skrine/produkty',
    'https://www.czc.cz/zdroje/produkty',
    'https://www.czc.cz/rozsirujici-karty/produkty',
    'https://www.czc.cz/chladice/produkty',
    'https://www.czc.cz/monitory/produkty',
    'https://www.czc.cz/mysi/produkty',
    'https://www.czc.cz/tiskarny-a-naplne/produkty',
    'https://www.czc.cz/klavesnice/produkty',
    'https://www.czc.cz/sluchatka-a-mikrofony/produkty',
    'https://www.czc.cz/reproduktory/produkty',
    'https://www.czc.cz/pametove-karty/produkty',
    'https://www.czc.cz/flash-disky/produkty'
]

for url in urls_list:
    scraped_list, table_name = scrape_data(create_page_urls(url))
    conn = sqlite3.connect('database.db')
    c = conn.cursor()
    try:
        c.execute(f'''
            CREATE TABLE {table_name}(
                product_name TEXT,
                price INTEGER
            )''')
        conn.commit()  # commit lives on the connection, not the cursor
    except sqlite3.OperationalError:
        # Table already exists; nothing to create.
        pass
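    # The insert step isn't shown above; a sketch of how the scraped rows might
    # be stored, assuming scraped_list holds (product_name, price) tuples
    # matching the table schema (that shape is an assumption):
    c.executemany(f'INSERT INTO {table_name} (product_name, price) VALUES (?, ?)',
                  scraped_list)
    conn.commit()
    conn.close()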