def gather_links(page_url):
    html_string = ''
    url_info = urllib.parse.urlparse(page_url)
    # Only crawl pages that belong to the target domain
    if url_info.netloc != Crawler.domain_name:
        return set()
    try:
        response = urllib.request.urlopen(page_url)
    except Exception:
        print("Cannot open page: " + page_url)
        return set()
    if 'text/html' in (response.getheader('Content-Type') or ''):
        html_bytes = response.read()
        soup = bs4.BeautifulSoup(html_bytes, "html.parser")
        if config.GENERATE_SITE_MAP:
            path = url_info.path
            Crawler.xml_writer.write(path)
        if config.DOWNLOAD_HTML:
            file_path = file_writing.get_file_path(Crawler.project_name, page_url)
            file_writing.create_dir_from_file_path(file_path)
            try:
                html_string = html_bytes.decode("utf-8")
                file_writing.write_file(file_path, html_string)
            except Exception:
                print("Cannot write to file: " + page_url)
        scraping.scrape(soup, Crawler.domain_name, Crawler.xml_writer)
        return scraping.get_links(soup, Crawler.base_url)
    # Non-HTML responses yield no links
    return set()
def bing_the_query_field(query_field, bing_api_key, n_results):
    # Bings a query, returns the first n_results items
    # Forbidden list: result URLs containing any of these strings are skipped
    forbidden_list = ["wikipedia", "bloomberg", "companiesintheuk", "duedil",
                      "companycheck", "prnewswire", "google", "companieslist",
                      "linkedin", "endole.co.uk", "tuugo", "companiesireland",
                      "top1000", "directorsintheuk", "companydirectorcheck",
                      "yell", "192.com", "facebook", "solocheck", "reuters.com",
                      "idevon.co.uk", "slideshare"]
    bing_search_url = ('https://api.datamarket.azure.com/Data.ashx/Bing/Search/Web?Query='
                       + query_field + '&$format=json')
    # print bing_search_url
    bing_response = scrape(bing_search_url, bing_api_key)
    list_of_url = []
    if bing_response:
        if 'd' in bing_response:
            if 'results' in bing_response['d']:
                for result in bing_response['d']['results']:
                    flag = True
                    for element in forbidden_list:
                        if element in result['Url']:
                            flag = False
                    if flag:
                        list_of_url.append(result['Url'])
    return list_of_url[:min(len(list_of_url), n_results)]
def getScore(link):
    # Download the comments for the video and normalise the likes column
    vidName = scrape(link)
    process_likes(vidName)
    score = 0
    video_comm = pd.read_csv(vidName)
    video_comm = video_comm.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
    video_comm.rename(columns={'0': 'comment', '1': 'likes'}, inplace=True)
    print('likes:')
    print(video_comm.get('likes'))
    num_likes = video_comm.get('likes').sum()
    # num_comments = video_comm['comment'].count()
    # Add 1 so that comments with zero likes still contribute to the score
    video_comm['likes'] = video_comm['likes'] + 1
    # Score = sum(comment polarity * like weight) / total likes
    for index, comm in video_comm.iterrows():
        try:
            pol = TextBlob(comm['comment']).sentiment
            score_comm = pol.polarity * comm['likes']
            score += score_comm
        except Exception:
            continue
    return score / num_likes


# Add links here
# print(getScore('https://www.youtube.com/watch?v=-niuhBmUPLU'))
# print(getScore('https://www.youtube.com/watch?v=fP17mIEv8lo'))
# print(getScore('https://www.youtube.com/watch?v=QD0IM5tfnVQ'))
# print(getScore('https://www.youtube.com/watch?v=rmQMKowvYeo'))
# print(getScore('https://www.youtube.com/watch?v=QPkXJvULrN8'))
def search_result():
    if request.method == 'POST':
        productname = request.form.get('username')
        global data
        data = scrape(productname)
        return render_template('search_result.html', message=data)
    if request.method == 'GET':
        data = []
        return render_template('search_result.html', message=data)
def index():
    data = str(request.args.get('query'))
    data = scrape(data)
    # response = app.response_class(
    #     response=json.dumps(data),
    #     status=200,
    #     mimetype='application/json'
    # )
    # return response
    return render_template("index.html", datas=data)
def test_scrape(self):
    # Makes sure empty dicts are returned when neither BSE nor NSE data is
    # found for a company, e.g. SBI Magnum Express.
    url = 'http://www.moneycontrol.com/india/stockpricequote/finance-investments/sbimagnumexpress/SBI06'
    company_url = requests.get(url)
    soup = BS(company_url.text, "html.parser")
    b, n = scrape(soup)
    self.assertEqual(b, {})
    self.assertEqual(n, {})
def main():
    if request.method == 'POST':
        company_name = request.form['cname']
        frequency = request.form['freq']
        start_time = request.form['stime']
        end_time = request.form['etime']
        company_url = search_url(company_name)
        if company_url:
            c_url = requests.get(company_url)
            soup = BS(c_url.text, "html.parser")
            # Returns BSE and NSE contents if present
            b, n = scrape(soup)
            # Adding info to the database
            bse_db = mongo.db.bse
            nse_db = mongo.db.nse
            if b:
                bse_entry = bse_db.insert({
                    'BSE Date': b['bse_date'],
                    'BSE Time': b['bse_time'],
                    'BSE Current Price': b['bse_current_price'],
                    'BSE absolute price': b['bse_abs_price'],
                    'BSE percentage': b['bse_per'],
                    'BSE Volume': b['bse_volume'],
                    'BSE Prev close': b['bse_prev_close'],
                    'BSE Open price': b['bse_open_price'],
                    'BSE bid price': b['bse_bid_price'],
                    'BSE offer price': b['bse_offer_price'],
                })
            if n:
                nse_entry = nse_db.insert({
                    'NSE Date': n['nse_date'],
                    'NSE Time': n['nse_time'],
                    'NSE Current Price': n['nse_current_price'],
                    'NSE absolute price': n['nse_abs_price'],
                    'NSE percentage': n['nse_per'],
                    'NSE Volume': n['nse_volume'],
                    'NSE Prev close': n['nse_prev_close'],
                    'NSE Open price': n['nse_open_price'],
                    'NSE bid price': n['nse_bid_price'],
                    'NSE offer price': n['nse_offer_price'],
                })
            # Job scheduling
            if frequency and start_time and end_time:
                # Check to ensure start time is before end time
                if start_time < end_time:
                    trigger = OrTrigger([
                        CronTrigger(hour=start_time + '-' + end_time, minute=frequency)
                    ])
                    scheduler.add_job(main, trigger)
                else:
                    error = "End time should be after start time"
                    return render_template('index.html', error=error)
            # Redirect to the info page if either exchange returned data
            # (check the scraped dicts; PyMongo collection objects do not
            # support truth testing)
            if b or n:
                return redirect(url_for('info'))
        else:
            error = "Sorry! Company not found."
            return render_template('index.html', error=error)
    return render_template('index.html')
def duedil_company_search(company_name, duedil_api_key):
    """Searches for a company by name via the DueDil API.
    Requires the scrape() helper."""
    # Clean the company name for use in the query string
    clean_company_name = company_name.lower().replace(' ', '%20')

    # Company search
    search_url = ('http://duedil.io/v3/companies?filters={"name":"'
                  + clean_company_name + '"}&api_key=' + duedil_api_key)
    search_response = scrape(search_url)
    if search_response:
        company_url_root = search_response["response"]["data"][0]["company_url"]
        company_url = company_url_root + '?api_key=' + duedil_api_key + '&format=json'
        director_url = company_url_root + '/directors' + '?api_key=' + duedil_api_key + '&format=json'
    else:
        return False

    # Company profile
    profile_response = scrape(company_url)
    if profile_response and 'response' in profile_response:
        company_profile = profile_response['response']
    else:
        return False

    # Directors
    director_response = scrape(director_url)
    if director_response and 'response' in director_response:
        company_profile['directors'] = director_response['response']['data']
    else:
        return False

    return company_profile
def books():
    if request.method == 'GET':
        return "hello"
    if request.method == 'POST':
        content2 = request.json
        name = content2['name']
        spage = content2['spage']
        epage = content2['epage']
        url = content2['url']
        data = {
            'pid': 0,
            'name': name,
            'url': 0,
            'start_page': 0,
            'end_page': 0,
            'goodreviews': 0,
            'badreviews': 0,
            'no_of_comm': 0,
            'avg': 0,
            'exit': 0
        }
        result = firebase.post(y, data)
        print(result)
        subPart = result.get('name')
        path = y + subPart
        url = str(url)
        good, bad, no_of_comm, avg = scrape(url, spage, epage)
        # whole code
        # a1 = ['car', 'bike', 'bhavik', 'truck', 'quality555', 'little', 'shabby',
        #       'side', 'money', 'expecting', 'dollar', 'snap', 'jumper', 'cable',
        #       'chinese', 'knock', 'shop', 'harbor', 'freight', 'buck']
        send(url, spage, epage, good, bad, path, no_of_comm, avg)
        # new_obj2 = {
        #     'response': result
        # }
        # os.remove("temp.png")
        # return Response(response=image_url)
        try:
            return jsonify(result), 201
        except FileNotFoundError:
            abort(404)
def bing_companies(name, bing_api_key, blacklist=website_blacklist):
    """Bings a company name and returns the result URLs,
    omitting URLs containing blacklisted words."""
    query_field = "'%s'" % urllib.quote(name)
    bing_search_url = \
        'https://api.datamarket.azure.com/Data.ashx/Bing/Search/Web?Query=' + \
        query_field + '&$format=json'
    response = scrape(bing_search_url, bing_api_key)
    list_of_urls = []
    if response and 'd' in response and 'results' in response['d']:
        for result in response['d']['results']:
            if not any(b in result['Url'] for b in blacklist):
                list_of_urls.append(result['Url'])
    return list_of_urls
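# Hypothetical usage sketch for bing_companies(); the company name and API key
# below are placeholders, and the blacklist is passed explicitly so the
# module-level website_blacklist default is not required for the example.
example_urls = bing_companies("Acme Widgets Ltd", "YOUR_BING_API_KEY",
                              blacklist=["wikipedia", "linkedin", "facebook"])
for example_url in example_urls:
    print(example_url)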
def confirm():
    if request.method == 'POST':
        global ingList
        try:
            # A remove button was clicked: drop that ingredient from the list
            ingName = request.form['submitButton']
            ingList.remove(ingName)
        except Exception:
            try:
                # Otherwise rebuild the list by scraping the submitted recipe name
                ingList = nlp_parser.ingredient_getter(scraping.scrape(request.form['Name']))
            except Exception:
                try:
                    # Otherwise add a manually entered ingredient
                    ingName = request.form['ing']
                    ingList.append(ingName)
                except Exception:
                    pass
    return render_template("confirm.html", ingList=ingList)
def jobs():
    return jsonify(Jobs=scrape())
def index():
    # return jsonify(scrape())
    return render_template("index.html", articles=scrape(), word=scrapeWord())
def scrape():
    mars = mongo.db.mars_db
    mars_info = scraping.scrape()
    mars.update({}, mars_info, upsert=True)
    return "I think it worked"
import scraping

# Selects each muscle group (ids 1 through 18)
for i in range(1, 19):
    scraping.scrape(i)
def index():
    return jsonify(Diet=scrape())
def parse_args():
    # Parser setup assumed from the parse_args() call below; the original
    # snippet begins inside the add_argument call for the "command" argument.
    parser = argparse.ArgumentParser()
    parser.add_argument("command",
                        choices=["create_db", "scrape", "scrape_once"])
    args = parser.parse_args()
    command = args.command
    configpath = os.getenv('DAC_CONFIG_PATH')
    with open(configpath, 'r') as config_file:
        config = json.load(config_file)
    return args.command, config, configpath


if __name__ == "__main__":
    logformat = "%(asctime)-15s %(name)-12s %(levelname)-8s %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=logformat)
    log = logging.getLogger("dac")
    logging.getLogger("urllib3.connectionpool").setLevel(logging.INFO)
    command, config, configpath = parse_args()
    log.info(f"Command: {command}")
    log.debug(f"Config loaded from: {configpath}")
    if command == "scrape":
        interval = float(config["scraper"].get("interval"))
        thread = start_scheduled_scraping(interval, configpath)
    elif command == "create_db":
        create_db()
    elif command == "scrape_once":
        scrape(configpath)
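# A minimal example of the JSON config the script above expects at the path in
# DAC_CONFIG_PATH; only scraper.interval is read here, and its unit (minutes is
# assumed) and any other keys depend on the rest of the project.
#
# {
#     "scraper": {
#         "interval": 60
#     }
# }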
def index():
    return jsonify(Elements=scrape())
def scrape():
    mars = mongo.db.mars
    mars_data = scraping.scrape()
    # Upsert the single mars document with the freshly scraped data
    mars.replace_one({}, mars_data, upsert=True)
    return "Complete"
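# A minimal sketch (assumptions, not from the original snippets) of the Flask +
# Flask-PyMongo wiring that the two Mars scrape() routes above rely on: an
# `app`, a `mongo` handle, and the `scraping` helper module. The Mongo URI and
# database name are placeholders.
from flask import Flask
from flask_pymongo import PyMongo

import scraping

app = Flask(__name__)
app.config["MONGO_URI"] = "mongodb://localhost:27017/mars_app"
mongo = PyMongo(app)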
def index():
    return jsonify(Cars=scrape())
def index():
    return jsonify(Products=scrape())
import glob
import json
import urllib.parse

import scraping
import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    values = []
    for filename in glob.glob('./data/wikipedia/*.html'):
        with open(filename) as fin:
            html = fin.read()
        text, title = scraping.scrape(html)
        print('scraped:', title)
        url = 'https://ja.wikipedia.org/wiki/{}'.format(
            urllib.parse.quote(title))
        values.append((text, json.dumps({'url': url, 'title': title})))
    datastore.load(values)
    print(list(datastore.get_all_ids(limit=-1)))
    datastore.close()
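# One possible shape for scraping.scrape() as it is used above (an assumption,
# not the project's actual implementation): extract the article text and the
# page title from the saved Wikipedia HTML with BeautifulSoup.
from bs4 import BeautifulSoup


def scrape(html):
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('title').get_text().replace(' - Wikipedia', '')
    # Wikipedia keeps the article body in a div with id "mw-content-text"
    body = soup.find('div', id='mw-content-text') or soup
    text = body.get_text(separator='\n')
    return text, title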
import sys

import pymongo

import scraping
import config

if __name__ == "__main__":
    uri = ("mongodb://" + config.user + ":" +
           config.password + "@ds243441.mlab.com:43441/gofundme")
    client = pymongo.MongoClient(uri)
    db = client.get_default_database()
    campaigns = db['campaigns']
    scraped_data = scraping.scrape()
    campaigns.insert_many(scraped_data)
    client.close()
import math
import sys

import scraping
import spotify_api
import spotipy

if len(sys.argv) > 3:
    username = sys.argv[1]
    playlist_name = sys.argv[2]
    apple_url = sys.argv[3]
else:
    print("Usage: %s username playlist_name apple_url" % (sys.argv[0],))
    sys.exit()

song_list = scraping.scrape(apple_url)
# for song in song_list:
#     print song

token = spotify_api.authenticate(username)
tracks = []
failed = []

if token:
    sp = spotipy.Spotify(auth=token)
    sp.trace = False
    new_playlist = sp.user_playlist_create(username, playlist_name, public=False)
    if new_playlist:
def index():
    return jsonify(Jobs=scrape())
def graphs():
    initial_time = datetime.now()

    # 1. Get info for scraping
    # Mandatory argument: category
    category = request.args.get('category')

    # Optional arguments
    # Number of sites to scrape per category, default 5 (0 for max)
    if request.args.get('num_of_site'):
        num_of_site = int(request.args.get('num_of_site'))
    else:
        num_of_site = 5
    # Number of pages to scrape per site, default 2 (0 for max)
    if request.args.get('num_page'):
        num_page = int(request.args.get('num_page'))
    else:
        num_page = 2
    # City where the scraping is desired (better with department number)
    if request.args.get('location'):
        location = request.args.get('location')
    else:
        location = 'no city'
    # Model to use for prediction (one option 'camembert', else default model)
    model_to_test = ''
    if request.args.get('model'):
        model_to_test = 'camembert'

    print('\n', '#' * 50)
    print(f' Start Analyse on {category} '.center(50, '#'))
    print('#' * 50, '\n')

    # 2. Scrape Trustpilot to get a dataframe
    init_time = datetime.now()
    print(' Start scraping '.center(30, '#'))
    refs, df = scraping.scrape(category, location, num_of_site, num_page)
    time_elapsed = datetime.now() - init_time
    print(f'Scraping time : {time_elapsed}')

    if len(df) > 0:
        # 3. Preprocess the dataframe before prediction
        init_time = datetime.now()
        print(' Start preprocess '.center(30, '#'))
        df = process.preprocess_df(df)
        time_elapsed = datetime.now() - init_time
        print(f'Preprocess time : {time_elapsed}')

        # 4. Predict sentiment and add it to the dataframe
        init_time = datetime.now()
        print(' Start prediction '.center(30, '#'))
        if model_to_test == 'camembert':
            df = model.predict_camembert(df)
        else:
            df = model.predict(df)
        time_elapsed = datetime.now() - init_time
        print(f'Prediction time : {time_elapsed}')

        # 5. Apply postprocessing to transform the data into JSON
        init_time = datetime.now()
        print(' Start postprocess '.center(30, '#'))
        json_review = process.postprocess(df, refs)
        time_elapsed = datetime.now() - init_time
        print(f'Postprocess time : {time_elapsed}')
    else:
        print("No data found")
        json_review = "<h1>Pas de données</h1>"  # French for "No data"

    time_elapsed = datetime.now() - initial_time
    print(f'Total time elapsed : {time_elapsed}')
    return json_review
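# Hypothetical request against the route above (the exact path depends on the
# app's route decorator, which is not part of this snippet):
#
#   GET /graphs?category=restaurants&num_of_site=5&num_page=2&location=Paris&model=camembert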
def index():
    return jsonify(Dogs=scrape())
def topGainLose():
    return jsonify(Stocks=scrape())
def index():
    data = request.json
    print(data)
    return jsonify(Links=scrape(data))