def getDataFromLastYears(symbol, historic=False):
    # Get the year from which scraping has to start.
    year = int(props.get("startYear"))
    # Check for the company's db table, or create it if it does not exist yet.
    isCreated = dbop.createTable(symbol, historic)
    # Form dates and scrape data in two-month windows from startYear (say 2000)
    # up to December of last year (say 2017).
    while year < currentYear:
        startMonth = 1
        endMonth = startMonth + 1
        while endMonth < 13:
            if not historic:
                result = formDateAndCallScrapper(startMonth, endMonth, year)
            else:
                result = formDateAndCallScrapper(startMonth, endMonth, year, historic=True)
            startMonth = endMonth + 1
            endMonth = startMonth + 1
        year += 1
    startDay = 1
    startMonth = 1
    endMonth = startMonth + 1
    limitMonth = int(datetime.datetime.now().strftime("%m"))  # current month
    # Handle the current year: scrape the complete two-month slots first,
    # then the last slot of one or two months that couldn't form a full pack.
    while endMonth < limitMonth:
        if not historic:
            result = formDateAndCallScrapper(startMonth, endMonth, year)
        else:
            result = formDateAndCallScrapper(startMonth, endMonth, year, historic=True)
        startMonth = endMonth + 1
        endMonth = startMonth + 1
    # Remaining slot of one or two months, scraped up to today's date.
    if limitMonth - startMonth == 0 or limitMonth - startMonth == 1:
        startDate = "0" + str(startDay) + "-0" + str(startMonth) + "-" + str(year)
        endDate = str(datetime.datetime.now().strftime("%d-%m-%Y"))
        print("start - ", startDate, " to end - ", endDate)
        msg = "start - " + startDate + " to end - " + endDate
        Log(msg)
        if not historic:
            sc = Scrapper()
            result = sc.equityScrapper(symbol, startDate, endDate, selected=True, timeout=100)
        else:
            sc = Scrapper(historic=True)
            result = sc.historicScrapper(startDate, endDate)
def initialise_stats(self):
    if self.has_scrapper_links and self.has_valid_predictions:
        self.predictions = Prediction().initialise_prediction().get_all_prediction()
        self.driver_standings = Scrapper().initialise_links().scrape_driver()
        self.team_standings = Scrapper().initialise_links().scrape_constructor()
        return self
    else:
        print("Links and predictions not initialised properly")
        return self
def formDateAndCallScrapper(startMonth, endMonth, year, historic=False):
    dates = dt.dateCreator(startMonth, endMonth, year)
    print("start - ", dates[0], " to end - ", dates[1])
    msg = "start - " + dates[0] + " to end - " + dates[1]
    Log(msg)
    if not historic:
        sc = Scrapper()
        return sc.equityScrapper(symbol, dates[0], dates[1], selected=True, timeout=100)
    else:
        sc = Scrapper(historic=True)
        return sc.historicScrapper(dates[0], dates[1])
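# Both functions above rely on a dt.dateCreator helper that turns a two-month
# window into a [startDate, endDate] pair of dd-mm-YYYY strings (the same format
# the final slot in getDataFromLastYears builds by hand). The helper is not part
# of this snippet; the following is only a minimal sketch of what it could look
# like, assuming the window runs from day 1 of startMonth to the last day of
# endMonth.
import calendar


def dateCreator(startMonth, endMonth, year):
    """Hypothetical sketch: build dd-mm-YYYY start/end strings for a month window."""
    lastDay = calendar.monthrange(year, endMonth)[1]  # last day of endMonth
    startDate = "{:02d}-{:02d}-{}".format(1, startMonth, year)
    endDate = "{:02d}-{:02d}-{}".format(lastDay, endMonth, year)
    return [startDate, endDate]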
def aule():
    aula = request.args.get('aula')
    settimanaDopo = request.args.get('settimanaDopo')
    # Convert from string to boolean
    if settimanaDopo == 'True':
        settimanaDopo = True
    else:
        settimanaDopo = False
    scrapper = Scrapper()
    dati = scrapper.cerca_orario_aule(aula, settimanaDopo)
    if dati is None:
        return "SETTIMANA DI VACANZA"
    ris = "Aula " + aula + "<br>"
    for giorni in dati:
        for giorno in giorni.values():
            if isinstance(giorno, str):
                ris += giorno + " "
            else:
                for materie in giorno:
                    for materia in materie.values():
                        if isinstance(materia, str):
                            ris += materia + " "
                        else:
                            for classe in materia:
                                ris += classe + " "
                    ris += "<br>"
        ris += "<br>"
    return ris
def create_app():
    app = Flask(__name__)
    CORS(app)

    from blueprints import npcs_blueprint
    from blueprints import gears_blueprint
    from blueprints import runes_blueprint
    from blueprints import biomes_blueprint
    from blueprints import bosses_blueprint
    from blueprints import outfits_blueprint
    from blueprints import pickups_blueprint
    from blueprints import enemies_blueprint
    from blueprints import mutations_blueprint
    from blueprints import achievements_blueprint

    app.register_blueprint(npcs_blueprint.bp)
    app.register_blueprint(gears_blueprint.bp)
    app.register_blueprint(runes_blueprint.bp)
    app.register_blueprint(biomes_blueprint.bp)
    app.register_blueprint(bosses_blueprint.bp)
    app.register_blueprint(outfits_blueprint.bp)
    app.register_blueprint(pickups_blueprint.bp)
    app.register_blueprint(enemies_blueprint.bp)
    app.register_blueprint(mutations_blueprint.bp)
    app.register_blueprint(achievements_blueprint.bp)

    app.scrapper_manager = Scrapper()

    @app.errorhandler(404)
    def route_not_found(error):
        app.logger.error(error)
        return 'Route not found.', 404

    return app
def count(id):
    logger.info(f'Adding task for id: {id}')
    session = Session()
    task = session.query(Tasks).filter_by(id=id).first()
    res = Results(address=task.address, words_count=0, http_status_code=0)
    try:
        scrpr = Scrapper(task.address)
    except Exception:
        scrpr = None
    if scrpr:
        err = scrpr.get_page()
        if not err:
            task.http_status_code, matches = scrpr.count_matches()
            task.task_status = 'FINISHED'
            res = Results(address=task.address,
                          words_count=matches,
                          http_status_code=task.http_status_code)
        else:
            print(err)
    session.add(res)
    session.commit()
    logger.info(task)
    logger.info(res)
def extract_acts():
    scrapper = Scrapper(constants.base_url)
    # When the url is requested without data, the search form is retrieved.
    home_page = scrapper.request({})
    acts_scrapper = ActsParser(home_page)
    acts_scrapper.parse()
    scrapper.save_data(acts_scrapper.acts, "acts.json")
def scrap_and_upload(vehicle_category):
    """Scrape listings for the given vehicle category and upload the CSV to S3."""
    if vehicle_category is None:
        sys.exit("vehicle category cannot be null")
    vehicles = load_scrapping_links(vehicle_category)
    start_time = datetime.utcnow().strftime("%Y-%m-%d")
    create_directory("tmp")
    create_directory(f"tmp/{vehicle_category}")
    file_path = f"{DIR_NAME}/tmp/{vehicle_category}/{start_time}.csv"
    if os.path.exists(file_path):
        header = None
    else:
        header = ["Make", "Model", "Trim", "Year", "Mileage", "Price"]
    for make, model, urls in vehicles:
        for website_name, link in urls.items():
            if website_name == 'cg':
                urlsuffix = "#resultsPage="
            elif website_name == 'ed':
                urlsuffix = "?pagenumber="
            site_scrapper = Scrapper(website_name, link, urlsuffix, make, model, vehicle_category)
            site_scrapper.fetch_batch(NUM_OF_PAGES)
            if site_scrapper.listings:
                with open(file_path, "a") as csvfile:
                    write(csvfile, site_scrapper.listings, header)
                header = None  # only write the header once
    if os.path.exists(file_path):
        s3_client = boto3.client('s3')
        s3_client.upload_file(file_path, DESTINATION_BUCKET, f"{vehicle_category}/{start_time}.csv")
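# scrap_and_upload appends rows through a write(csvfile, listings, header)
# helper whose definition is not part of this snippet. A minimal sketch under
# the assumption that each listing is an iterable matching the six header
# columns and that the header row is only emitted when header is not None.
import csv


def write(csvfile, listings, header=None):
    """Hypothetical sketch: append listing rows (and an optional header) to an open CSV file."""
    writer = csv.writer(csvfile)
    if header is not None:
        writer.writerow(header)  # written only for the first batch
    writer.writerows(listings)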
def check_prices():
    users = session.query(User).all()
    scrapper = Scrapper()
    items = session.query(Item).all()
    for item in items:
        scrapper.go_to(item.link)
        price = scrapper.get_price()
        title = scrapper.get_title()
        if not item.title:
            item.title = title
            session.commit()
        if item.price:
            change_percentage = (abs(price - item.price) / item.price) * 100.0
            if change_percentage >= 3:
                item.price = price
                session.commit()
                markup = InlineKeyboardMarkup(
                    [InlineKeyboardButton('Check', url=item.link)])
                for u in users:
                    try:
                        bot.send_message(
                            u.tg_id,
                            '<code>{}</code> price changed'.format(title),
                            parse_mode=ParseMode.HTML,
                            reply_markup=markup)
                    except Exception as e:
                        config.logger.error(
                            'Error sending a message: {}'.format(e))
        else:
            item.price = price
            session.commit()
def getDataFromLast7Dayz(symbol):
    isCreated = dbop.createTable(symbol)
    print("getting data from last 7 days for ", symbol)
    msg = "getting data from last 7 days for " + symbol
    Log(msg)
    sc = Scrapper()
    result = sc.equityScrapper(symbol, selected=False, timeout=100)
def scrapeURL():
    data = request.json
    url = data['url']
    response = dict()
    scrapper = None
    if urlExists(url, timeout=20, check_is_image=False):
        if isInCustomSites(url):
            scrapper = CustomScrapper()
            response['custom'] = True
        else:
            scrapper = Scrapper()
            response['custom'] = False
        image_or_data_urls = scrapper.scrape(url)
        if len(image_or_data_urls) > 0:
            response['success'] = True
            response['output'] = image_or_data_urls
            response['stats'] = scrapper.stats
        else:
            response['success'] = False
            response['output'] = "NO_IMAGES_FOUND"
    else:
        response['success'] = False
        response['output'] = "INVALID_URL"
    return response
def olx_bot():
    scrapper = Scrapper()
    if Scrapper.isExecution:
        return render_template('running.html')
    else:
        scrapper.start()
        return render_template('sucess.html')
def olx_bot():
    scrapper = Scrapper()
    if Scrapper.isExecution:
        print('O Programa já está sendo executado')
    else:
        scrapper.start()
        print('O Programa está sendo iniciado')
def run(self):
    scrapper = Scrapper()
    linklist = scrapper.loadLink(self.rooturl)
    dbr = DB()
    dbr.rawlinks_save(linklist)
def run(self):
    scrapper = Scrapper()
    global folder_path
    global test_df
    test_df = generate_test_data(self.username, self.threshold)
    folder_path = scrapper.dowload_data(self.username, self.threshold)
    # user_account = "skyemcalpine"
    folder_path = folder_path.replace("\\", "/")
    print(folder_path)
    self.signals.result.emit(True)
def check_and_scrap_reviews(self, hotel_name, platforms):
    for platform in platforms:
        if platform == 'TA':
            data = self.read_csv_to_list(
                "C:/Users/acfelk/Documents/IIT_Files/final year/FYP/fyp_workfiles/final_project/backend/drops/"
                + hotel_name + "-tripadvisor.csv")
            if data is None:
                # Call the scrapper to scrape the reviews into drops
                scrapper = Scrapper()
                scrapper.scrap_reviews(hotel_name, platform)
        if platform == 'BC':
            data = self.read_csv_to_list(
                "C:/Users/acfelk/Documents/IIT_Files/final year/FYP/fyp_workfiles/final_project/backend/drops/"
                + hotel_name + "-bookingscom.csv")
            if data is None:
                # Call the scrapper to scrape the reviews into drops
                scrapper = Scrapper()
                scrapper.scrap_reviews(hotel_name, platform)
def main(): """ Instancie mes classes Requester et Scrapper, effectue une première requete puis transmet la réponse au scrapper """ requester = Requester() scrapper = Scrapper(requester) requested_response = requester.html_requester(constants.URL) category_list = scrapper.get_category_list(requested_response) scrapper.scrap_books_in_category(category_list, scrapper)
def get_table_info(self):
    details_movie = None
    try:
        if self.download_url is not None:
            self.sc = Scrapper(self.download_url)
            details_movie = self.sc.get_movie_details()
    except Exception as e:
        print("Error initializing the Scrapper: " + str(e))
    if details_movie is not None:
        return details_movie
def book_download(query, book_name):
    data = Scrapper(query).parse_data()
    # Find book_name in the scraped data and get the direct download link for the book.
    try:
        book = list(
            filter(lambda book: book['Book']['title'] == book_name, data))[0]
        direct_dl = DownloadFetcher(book).get_direct_download()
        return jsonify({'book': book, 'download': direct_dl}), 200
    except Exception as e:
        print(e)
        print(book_name)
        return f"Error specified book name not found for query = {query}", 404
def create_recipe():
    json_data = request.get_json()
    url = json_data.get('url')
    type_recipe = json_data.get('typeRecipe')
    print(f'Creating entry \'{type_recipe}\' for url: \'{url}\'')
    if type_recipe is None:
        raise ValueError("typeRecipe is empty")
    if url is None:
        raise ValueError("URL is empty")
    recipe = mongo.add_recipe(
        Scrapper(url=url, type_recipe=type_recipe).scrap())
    return {'success': True, 'recipe': recipe}
def __init__(self):
    ap = argparse.ArgumentParser()
    ap.add_argument("-train", "--train", required=True,
                    help="whether to train a model or not")
    self.args = vars(ap.parse_args())
    self.scrapper = Scrapper()
    # self.dataManager = DataManager()
    self.filterImage = FilterImage()
    self.faceRecognition = FaceRecognition()
    self.emotionDetection = EmotionDetection()
    self.model = Model()
async def main():
    with open('settings.yml') as file:
        settings = load(file)
    scrapper = Scrapper()
    war = scrapper.scape('https://en.wikipedia.org/wiki/War')
    charity = scrapper.scape(
        'https://en.wikipedia.org/wiki/Charity_(practice)')
    beer = scrapper.scape('https://en.wikipedia.org/wiki/Beer')
    death = scrapper.scape('https://en.wikipedia.org/wiki/Death')
    witai_settings = settings.get('witai', {})
    witai = Witai(witai_settings.get('entity'), witai_settings.get('token'))
    await witai.put_words((await war)[0], Value.NEGATIVE)
    await witai.put_words((await charity)[0], Value.POSITIVE)
    await witai.put_words((await beer)[0], Value.POSITIVE)
    await witai.put_words((await death)[0], Value.NEGATIVE)
def test(self, username, threshold):
    scrapper = Scrapper()
    folder_path = scrapper.dowload_data(username, threshold)
    dataProcessor = DataProcessor(folder_path)
    data = dataProcessor.create_dataframe_input()
    # print(data)
    class_names = ['food and drink', 'entertainment', 'business and industry',
                   'family and relationships', 'fitness and wellness',
                   'hobbies and activities', 'shopping and fashion',
                   'sports and outdoors', 'technology']
    model_path = "./last_cnn_model.h5"
    cnnModel = CnnModel(class_names, model_path, data)
    model = cnnModel.load_model()
    test_generator = cnnModel.create_generator()
    prediction = cnnModel.getPrediction(model, test_generator)
    result = np.sum(prediction, axis=0)
    result *= (1 / len(prediction))
    return result
def start_scraper():
    global SELENIUM
    global FILE_NAME
    global TEST
    kwargs = {
        'selenium': SELENIUM,
        'url': FILE_NAME,
        'test': TEST,
        'skip_after': 0,
        'skip_before': 0,
        'export': 'json'
    }
    if not TEST:
        print('test not enabled...')
        Scrapper(**kwargs).crawl()
    else:
        Scrapper.test()
def main(args):
    username = args.username
    cid = os.environ['SPOTIPY_CLIENT_ID']
    secret = os.environ['SPOTIPY_CLIENT_SECRET']
    redirect_uri = os.environ['SPOTIPY_REDIRECT_URI']
    content = args.content
    spoti = TrackExtractor(username, cid, secret, redirect_uri)
    sc = Scrapper()
    if content == 'all':
        ret_tracks, _, _ = spoti.all_tracks()
    elif content == 'playlists':
        ret_tracks, _, _ = spoti.tracks_in_all_playlists()
    elif content == 'saved_tracks':
        ret_tracks, _, _ = spoti.saved_tracks()
    else:
        print(
            'Wrong set of filter! Please enter one of [\'all\', \'playlists\',\'saved_tracks\']'
        )
def main():
    scrapper = Scrapper()
    merger = Merger()
    parser = Parser()
    client = MongoClient('localhost', 27017)
    db = client['Data']
    collection_socialmedia = db['socialmedia']
    # Begin real-time collecting
    while True:
        scrapper.scrap()
        merger.main()
        parser.main()
        sleep(3600)
        # Store the collected data in MongoDB
        f = open('/home/sartharion/Bureau/stage/POO/data.json', 'r')
        file_data = json.load(f)
        collection_socialmedia.delete_many({})
        collection_socialmedia.insert_many(file_data)
    client.close()
def run_bot():
    """ Load config, connect to database, initialize and launch bot """
    loop = asyncio.get_event_loop()
    logger = logging.getLogger(__name__)
    with open('../config/config.yml', 'r') as config_file:
        config = yaml.load(config_file, Loader=yaml.BaseLoader)
    try:
        db = Db(config)
        loop.run_until_complete(db.connect_db())
    except asyncpg.PostgresError as err:
        logger.error('Cannot connect to database: %s', err)
        return
    bot = Bot(command_prefix=config['bot']['prefix'])
    bot.db = db
    bot.add_cog(Scrapper(bot))
    bot.add_cog(QuiADit(bot))
    bot.run(config['bot']['token'])
def Scrap():
    product = request.args.get('product')
    maxpages = request.args.get('max')
    website = request.args.get('website')
    if not maxpages:
        maxpages = 2
    print(product, maxpages)
    scrap = Scrapper()
    scrapped_data, csvfile = scrap.start(product, max=maxpages, website=website)
    record = Record(product=product,
                    created=datetime.today().strftime('%d_%m_%Y'),
                    pages=maxpages,
                    data=csvfile.split('/')[-1],
                    user=session.get('user'))
    db.session.add(record)
    db.session.commit()
    return jsonify(scrapped_data)
from scrapper import Scrapper

q = Scrapper("https://www.cpttrevano.ti.ch/orario/invite?invite=true")
q.cercaOrarioAule("417 (A-413)")
from scrapper import Scrapper
import sys
import time

try:
    scrapper_obj = Scrapper()
    print("Created OBJ")
    scrapper_obj.go_to_page(1)
    driver = scrapper_obj.get_driver()
except Exception as errmsg:
    print("Error: {}".format(errmsg))
    sys.exit(1)

company_data_list = []
count = 0

# =================================================
# Custom code to extract data
# =================================================
table_data = driver.find_elements_by_class_name(
    "zp_3UsOq")  # for the entire row data

for row_data in table_data:
    count += 1
    print("Count : {}".format(count))
    company_data_dict = {
        "Company_Name": "",
        "Employee_Headcount": "",
        "Industry_Sector": "",
        "Linkedin_URL": "",
        "FB": "",
        "Twitter": "",
        "Website": ""
    }