def dashboard(request): if request.user.is_authenticated(): r = requests.get( 'https://newsapi.org/v1/articles?source=mirror&apiKey=6df0769e0d6244aaa00768c02f123fb2' ) response_dictionary = {} response_dictionary["source"] = "mirror" data = json.loads(r.text) list_of_urls = [] article_data = [] main_dict = {} i = 0 for item in data["articles"]: mid_dictionary = {} if (type(item["title"]) == "unicode"): ti = unicodedata.normalize('NFKD', item["title"]).encode( 'ascii', 'ignore') else: ti = item["title"] n = len(ti) if (n > 10): mid_dictionary["title"] = ti sc = Scraper() print item mid_dictionary["data"] = sc.scrape_mirror(item["url"]) article_data.append(mid_dictionary) mid_dictionary["image"] = (item["urlToImage"]) print mid_dictionary["image"] if (type(mid_dictionary["data"]) == "unicode"): st = unicodedata.normalize('NFKD', mid_dictionary["data"]).encode( 'ascii', 'ignore') else: st = mid_dictionary["data"] try: print "********Summary******" summary = summarizer.summarize(st, words=50) print summary.encode('ascii', 'ignore') print "---------Summary---------" mid_dictionary["summary"] = summarizer.summarize(st, words=50) except ZeroDivisionError: mid_dictionary["summary"] = st print mid_dictionary main_dict["article_" + str(i)] = mid_dictionary i = i + 1 response_dictionary["articles"] = article_data print main_dict for key, value in main_dict.iteritems(): print "***************" return render(request, 'accounts/dashboard.html', {'main_dict': main_dict}) else: return HttpResponseRedirect("/login")
def dashboard(request): if request.user.is_authenticated(): r = requests.get('https://newsapi.org/v1/articles?source=mirror&apiKey=6df0769e0d6244aaa00768c02f123fb2') response_dictionary = {} # response_dictionary["status"] = r response_dictionary["source"] = "mirror" data = json.loads(r.text) # print data["articles"] list_of_urls = [] article_data = [] main_dict = {} i = 0; for item in data["articles"]: mid_dictionary = {} mid_dictionary["title"] = item["title"] #mid_dictionary["url"] = item["url"] #mid_dictionary["urlToImage"] = item["urlToImage"] sc = Scraper() mid_dictionary["data"] = sc.scrape_mirror(item["url"]) article_data.append(mid_dictionary) print type(mid_dictionary["data"]) if(type(mid_dictionary["data"])=="unicode"): st = unicodedata.normalize('NFKD', mid_dictionary["data"]).encode('ascii','ignore') else: st = mid_dictionary["data"] if(len(st)>250): print "********Summary******" print summarizer.summarize(st,words=50) print "---------Summary---------" mid_dictionary["summary"] = summarizer.summarize(st,words=50) else : mid_dictionary["summary"] = st mid_dictionary["article_"+str(i)] = mid_dictionary i = i+1 response_dictionary["articles"] = article_data # news_dictionary = {} #print main_dict for key,value in main_dict.iteritems(): print "***************" # print key print value["title"] # # print news_dictionary # list_of_urls = [] # return render(request, 'newslist.html', {'newslist':response_dictionary}) return render(request, 'accounts/dashboard.html', {'main_dict': main_dict} ) else: return HttpResponseRedirect("/login")
def on_ok(self):
    """Search EmuParadise for the entered ROM when OK is pressed."""
    npyscreen.notify("Please wait", "Searching...")
    scraper = Scraper(self.rom.value, parent=self)
    outcome = clean_results_list(scraper.fill_in_form())
    self.search = scraper
    self.results = outcome
    self.clean_results = outcome[0]
    # Hand the scraper and both result views to the parent application
    # so the results form can pick them up.
    self.parentApp.SCRAPER_OBJ = scraper
    self.parentApp.CLEAN_RESULTS = outcome[0]
    self.parentApp.RESULTS = outcome[1]
def handle(self, *args, **options):
    """Management-command entry point.

    Scrapes Russian followers once per configured cookie, threading the
    running maximum follower count through every pass.
    """
    print("Task running!")
    running_max = 22064
    for session_cookie in config.COOKIES:
        print(session_cookie)
        country_scraper = Scraper(country_name="Russia",
                                  cookie=session_cookie)
        running_max = country_scraper.parse_all(
            max_followers_count=running_max)
def search_rom(self):
    """Search for the ROM title currently entered in the search box."""
    title = str(self.editSearchTitle.text())
    scraper = Scraper(title)
    self.search = scraper
    self.set_results(scraper.fill_in_form())
    # Briefly show completion before returning to the idle state.
    self.search_signal.emit('Completed')
    time.sleep(3)
    self.search_signal.emit('Idle')
def checkTweets(self):
    """Fetch a recent tweet for each top artist and draw it in win4.

    Follows the same algorithm as the news pane but pulls from Twitter
    via the scraper; entries are cached as they are drawn.
    """
    logging.info("Retrieving tweets about artists...")
    try:
        artists = json.load(open('spotify/top_artists.json'))
        cached = []
        for index, artist in enumerate(artists):
            logging.info("Checking Tweets for : " + artist['name'])
            # Each entry occupies 4 rows; stop once the window is full.
            capacity = int(self.win4.getmaxyx()[0] / 4)
            if index >= capacity:
                break
            logging.info("Tweet entries : " + str(index) + "/" + str(capacity))
            tweet = Scraper.scrapeTweets(artist['name'])
            logging.info("Tuplo retrieved : ")
            text = ("Artist: " + tweet[0] + "\n@" + tweet[1]
                    + " said:\n" + tweet[2])
            cached.append({"url": tweet[3], "stringo": text})
            anchor = (index * 4 + 2, 3)
            limits = self.win4.getmaxyx()
            self.addMessage(self.win4, text,
                            (anchor[0] + 3, limits[1] - 2), anchor, index)
            self.saveInCache(4, cached)
    except Exception as e:
        logging.error(str(e))
def getNews(self):
    """Fetch a news item for each top artist and draw it in win2.

    Uses the scraper (BeautifulSoup + urllib), which returns a tuple of
    (artist, headline, url); the output layout mirrors the Spotify pane.
    """
    logging.info("Retrieving news about artists...")
    try:
        data = json.load(open('spotify/top_artists.json'))
        stringoList = []
        i = 0
        for datum in data:
            # FIX: cast to int — true division yields a float, letting
            # one extra entry overflow the window; also break instead of
            # return, consistent with checkTweets().
            entries = int(self.win2.getmaxyx()[0] / 6)
            if i >= entries:
                break
            tuplo = Scraper.scrapeNews(datum['name'])
            stringo = "Artist : " + tuplo[0] + "\nNews : \n" + tuplo[1]
            maxes = self.win2.getmaxyx()
            stringoList.append({"url": tuplo[2], "stringo": stringo})
            self.saveInCache(2, stringoList)
            currpos = (i * 6 + 2, 3)
            logging.debug("NEWSO : \n" + stringo)
            self.addMessage(self.win2, stringo,
                            (currpos[0] + 5, maxes[1] - 2), currpos, i)
            i = i + 1
    except Exception as e:
        logging.error(str(e))
def main(args: argparse.Namespace):
    """
    Scrape a pro-football-reference URL and save the data from each
    table to its own CSV in the specified output directory.

    :param args: The parsed command line arguments (``year``, ``stat``
        and ``output_dir`` attributes are used)
    """
    url = f"http://www.pro-football-reference.com/years/{args.year}/{args.stat}.htm"
    scraper = Scraper(url=url)
    table_ids = scraper.find_table_ids()
    output_dir_path = Path(args.output_dir)
    # FIX: exist_ok avoids the exists()/mkdir() check-then-create race;
    # parents=True lets the caller pass a nested path.
    output_dir_path.mkdir(parents=True, exist_ok=True)
    for table_id in table_ids:
        df = scraper.scrape(table_id)
        output_file_path = output_dir_path / f"{table_id}.csv"
        print(output_file_path)
        df.to_csv(output_file_path)
class SearchForm(npyscreen.ActionForm):
    """
    Search form from where the user can look up a ROM or other game
    from EmuParadise.
    """

    def create(self):
        """Build the form widgets when the main app initializes it."""
        self.rom = self.add(npyscreen.TitleText, name='Game: ')

    def on_ok(self):
        """Run the search when the OK button is pressed."""
        npyscreen.notify("Please wait", "Searching...")
        scraper = Scraper(self.rom.value, parent=self)
        outcome = clean_results_list(scraper.fill_in_form())
        self.search = scraper
        self.results = outcome
        self.clean_results = outcome[0]
        # Hand the scraper and both result views to the parent app so
        # the results form can pick them up.
        self.parentApp.SCRAPER_OBJ = scraper
        self.parentApp.CLEAN_RESULTS = outcome[0]
        self.parentApp.RESULTS = outcome[1]

    def on_cancel(self):
        """Exit the application when Cancel is pressed."""
        sys.exit()

    def afterEditing(self):
        """Runs after on_ok completes.

        Forms registered in the parentApp are loaded with their data
        before the app formally begins, so the Results form is declared
        here — not in the parentApp — to guarantee it is created AFTER
        the results exist.
        """
        self.parentApp.addForm('RESULTS', ResultsForm, name="Results")
        self.parentApp.setNextForm('RESULTS')
def do_scrape(self, line: str) -> None:
    """
    Defines what happens on the scrape command: validates the parsed
    arguments and, if valid, runs the Scraper behind a spinner and a
    progress bar.

    :param line: inputline
    """
    args_dict: Dict[str, Any] = self._get_args(line)
    if not self._scrape_check_args(args_dict):
        return
    print("Starting to scrape. This may take a while...\n")

    def report(progress: float) -> None:
        # Scale the scraper's 0..1 progress to a percentage for the bar.
        self._print_progress_bar(progress * 100, 100)

    try:
        with Spinner():
            self._print_progress_bar(0, 100)
            Scraper(email=args_dict['email'],
                    password=args_dict['password'],
                    headless=args_dict['headless'],
                    start=args_dict['start'],
                    end=args_dict['end'],
                    extensive=True,
                    progress_observer_callback=report)
    except (LoginError, PasswordFileNotFound, AssertionError):
        pass
    self.refresh_cli = False
def __init__(self, *args, **kwargs):
    """Set up the Download window with a directory helper and scraper."""
    super(Download, self).__init__()
    # No URL is selected until the user picks a result.
    self.url = None
    # Helper objects; construction order preserved in case either has
    # side effects.
    self.dirs_obj = Directories()
    self.search = Scraper()
from scraping import fake_requests
from scraping.scraper import Scraper

# Module-level scraper instance — presumably shared by the consumers
# below; TODO confirm it is safe to reuse across calls.
scraper = Scraper()
from common.engine import Engine
from scraping.honeypot.processor import add_search_results_to_logs_buffer, honeypot_keyword
from scraping.scraper import Scraper

if __name__ == '__main__':
    # Load the existing scraping log; results are appended to its
    # 'scraping' buffer and the whole structure is rewritten at the end.
    with open(scraping_log) as f:
        logs = json.load(f)
    buffer = logs['scraping']

    user_agent = UserAgent().random
    firefox_options = Options()
    firefox_options.add_argument('-headless')
    # NOTE(review): Firefox does not honour a user-agent command-line
    # flag (that is a Chrome option); a profile preference
    # ("general.useragent.override") is likely needed — confirm.
    firefox_options.add_argument(f'user-agent={user_agent}')
    driver = webdriver.Firefox(executable_path=gecko_path,
                               options=firefox_options)
    # FIX: ensure the browser is closed even if scraping raises —
    # the original leaked the driver process on any exception.
    try:
        for engine_item in Engine:
            engine = engine_item.value
            scraper = Scraper(user_agent, driver, honeypot_keyword,
                              engine, with_omitted_results=False)
            search_results = scraper.obtain_first_page_search_results()
            # NOTE(review): relies on the helper returning the (possibly
            # same) buffer that logs['scraping'] references — confirm a
            # new list is not returned, or the dump below misses data.
            buffer = add_search_results_to_logs_buffer(search_results,
                                                       buffer)
    finally:
        driver.quit()

    with open(scraping_log, 'w') as f:
        json.dump(logs, f, indent=4)
def get_test_scraper(requester=None):
    """Build a Scraper for TEST_TICKER.

    :param requester: optional Requester to use; a fresh one is created
        per call when omitted.

    FIX: the original used ``requester=Requester()`` as the default,
    which is evaluated once at import time and shared (with any
    accumulated state) across every call.
    """
    if requester is None:
        requester = Requester()
    return Scraper(TEST_TICKER, requester)
class MainWindow(QtGui.QMainWindow, Ui_MainWindow):
    """Main application window: search, select and download ROMs."""

    # Emitted with a status string when a background search changes state.
    search_signal = pyqtSignal(str)

    def __init__(self, parent=None):
        """ Main window controller """
        self.session = session()
        # Seed the database with defaults on first run.
        if self.session.query(Settings).count() <= 0:
            InitialData()
        self.search = None  # Scraper instance, created per search
        self.settings_obj = self.session.query(Settings).first()
        self.product_obj = self.session.query(Product).first()
        QtGui.QWidget.__init__(self, parent)
        self.setupUi(self)
        self._set_defaults()
        self.live_url = None  # URL of the currently selected ROM
        self.settings_window = None
        self.sync_window = None
        self.results = None  # last search results, indexed by table row
        self.rasp_ip = self.session.query(RetropieSettings).first().last_known_ip
        # Triggers
        self.actionQuit.triggered.connect(self._quit_romulus)
        self.actionSettings.triggered.connect(self._settings)
        self.btnSearch.clicked.connect(self.search_thread)
        self.search_signal.connect(self.set_status)
        self.tableSearchResults.cellClicked.connect(self.selected_rom)
        self.btnDownloadSelected.clicked.connect(self.download_rom)

    def download_rom(self):
        """ Downloads selected ROM """
        url = self.live_url
        self.search.download(url)

    def selected_rom(self):
        """ Fetches selected ROM details """
        row_selected = self.tableSearchResults.currentRow()
        # self.results is indexed in the same order as the table rows.
        rom = self.results[row_selected]
        result = self.search.get_link(rom)
        self.live_url = result
        description = self.search.get_description(result)
        self.lblSearchDescriptionSelected.setText(description)
        self.btnDownloadSelected.setEnabled(True)

    def set_status(self, text):
        """ Sets status due to signal """
        status = '<h2>Status: {0}</h2>'.format(text)
        self.lblStatus.setText(status)

    def search_thread(self):
        """ Sets a search thread """
        self.lblStatus.setText('<h2>Status: Searching...</h2>')
        # Daemon thread so a hung search cannot block application exit.
        th = Thread(target=self.search_rom)
        th.setDaemon(True)
        th.start()

    def search_rom(self):
        """ Searches for the entered ROM """
        rom = str(self.editSearchTitle.text())
        self.search = Scraper(rom)
        result = self.search.fill_in_form()
        self.set_results(result)
        # Briefly show completion before returning to idle.
        self.search_signal.emit('Completed')
        time.sleep(3)
        self.search_signal.emit('Idle')

    def set_results(self, results):
        """ Sets the results from a search query """
        self.tableSearchResults.clear()
        search_headers = ['Title']
        self.tableSearchResults.setColumnCount(1)
        self.tableSearchResults.setHorizontalHeaderLabels(search_headers)
        self.tableSearchResults.setRowCount(len(results))
        self.results = results
        row = 0
        for item in results:
            self.tableSearchResults.setItem(row, 0,
                                            QtGui.QTableWidgetItem(item.text))
            row += 1

    def _quit_romulus(self):
        """ Closes Romulus """
        self.close()

    def _settings(self):
        """ Initialize Settings window """
        # Lazily create the settings window, then re-show on each use.
        if self.settings_window is None:
            self.settings_window = SettingsWindow(self.rasp_ip)
        self.settings_window.show()

    def _set_defaults(self):
        """ Set default visuals """
        search_headers = ['Title']
        download_headers = ['Title', 'Status']
        self.tableSearchResults.setColumnCount(1)
        self.tableDownloadProgress.setColumnCount(2)
        self.tableSearchResults.setHorizontalHeaderLabels(search_headers)
        self.tableDownloadProgress.setHorizontalHeaderLabels(download_headers)
        search_header = self.tableSearchResults.horizontalHeader()
        download_header = self.tableDownloadProgress.horizontalHeader()
        search_header.setStretchLastSection(True)
        download_header.setStretchLastSection(True)
        self.comboPlatformSearch.addItems(SUPPORTED_PLATFORMS)
        self.btnDownloadSelected.setEnabled(False)
# Wrap the query in quotes — presumably to force exact-phrase matching
# in the search engines; TODO confirm against add_keyword_quotes.
query_to_scrap = add_keyword_quotes(query)
for engine_item in Engine:
    engine = engine_item.value
    # Only Google and Yandex are scraped in this pass.
    if engine != Engine.GOOGLE.value and engine != Engine.YANDEX.value:
        continue
    # Each engine keeps its own orders file, loaded and rewritten whole.
    engine_orders_filename = f'{orders_dir}{engine}.json'
    with open(engine_orders_filename) as f:
        orders_buffer = json.load(f)
    scraper = Scraper(user_agent, driver, query_to_scrap, engine,
                      with_omitted_results=True)
    search_results = scraper.obtain_all_pages_search_results()
    # Map every result to the index of the chunk it came from, so the
    # stored order records which chunks ranked where for this query.
    chunks_order = []
    for search_result_item in search_results.items:
        chunk_index = find_chunk_index(search_result_item)
        chunks_order.append(chunk_index)
    order_item = {'query': query, 'order': chunks_order}
    orders_buffer.append(order_item)
    pretty_dump_json(obj=orders_buffer, filename=engine_orders_filename)
driver.quit()
def main():
    """Run every scraper job in sequence: lyrics, Spotify features, genres."""
    jobs = (
        ('lyrics', Lyrics, Lyrics.find_lyrics),
        ('spotify', Spotify, Spotify.get_features),
        ('genres', Spotify, Spotify.get_genres),
    )
    for label, source, fetch in jobs:
        Scraper(label, source, fetch).main()
class MainWindow(QtGui.QMainWindow, Ui_MainWindow):
    """Main application window: search, download, local library and
    Retropie sync."""

    # Emitted with a status string when a background search changes state.
    search_signal = pyqtSignal(str)
    # Emitted with a status string during Retropie sync operations.
    status_signal = pyqtSignal(str)

    def __init__(self, parent=None):
        """ Main window controller """
        QtGui.QWidget.__init__(self, parent)
        self.setupUi(self)
        self.session = session()
        # Seed the database with defaults on first run.
        if self.session.query(Settings).count() <= 0:
            InitialData()
        self.search = None  # Scraper instance, created per search
        self.settings_obj = self.session.query(Settings).first()
        self.product_obj = self.session.query(Product).first()
        self._set_defaults()
        self.live_url = None  # URL of the currently selected ROM
        self.dirs_obj = Directories()
        self.settings_window = None
        self.sync_window = None
        self.results = None  # last search results, indexed by table row
        self.rasp_ip = self.session.query(
            RetropieSettings).first().last_known_ip
        self.retro_settings = self.session.query(RetropieSettings).first()
        self.sync_obj = None
        self.local_library = None
        self.games_dict = GAMES_CLEAN
        # Triggers
        self.actionQuit.triggered.connect(self._quit_romulus)
        self.actionSettings.triggered.connect(self._settings)
        self.btnSearch.clicked.connect(self.search_thread)
        self.search_signal.connect(self.set_status)
        self.status_signal.connect(self.set_status)
        self.tableSearchResults.cellClicked.connect(self.selected_rom)
        self.btnDownloadSelected.clicked.connect(self._download_thread)
        self.actionSync_Library.triggered.connect(self._pi_window)
        self.comboLocalFilter.activated.connect(self.filter_local_library)

    def _download_thread(self):
        """ Starts a thread for the download """
        # Daemon thread so a hung download cannot block application exit.
        th = Thread(target=self.download_rom)
        th.setDaemon(True)
        th.start()

    def download_rom(self):
        """ Downloads selected ROM """
        url = self.live_url
        # Derive the platform name from the URL path segment
        # (underscores to spaces, trailing token dropped).
        platform = " ".join(url.split('/')[-3].replace('_', ' ').split()[:-1])
        target = self.dirs_obj.target_directory(
            self.settings_obj.download_location, platform)
        self.search.download(url, target)

    def selected_rom(self):
        """ Fetches selected ROM details """
        row_selected = self.tableSearchResults.currentRow()
        # self.results is indexed in the same order as the table rows.
        rom = self.results[row_selected]
        result = self.search.get_link(rom)
        self.live_url = result
        description = self.search.get_description(result)
        self.lblSearchDescriptionSelected.setText(description)
        self.btnDownloadSelected.setEnabled(True)

    def fetch_local_collection(self):
        """ Returns local games collection as a dictionary """
        games_loc = self.settings_obj.download_location
        library = {}
        if not os.path.exists(games_loc):
            os.makedirs(games_loc)
        # First os.walk yield gives the immediate platform directories.
        dirs = [dirs for root, dirs, files in os.walk(games_loc)][0]
        for rom in dirs:
            # Map the directory name to its clean platform title and
            # list the ROM files directly inside it.
            library[GAMES_CLEAN[rom]] = [
                games
                for root, dirs, games in os.walk(os.path.join(games_loc, rom))
            ][0]
        return library

    def set_status(self, text):
        """ Sets status due to signal """
        status = '<h2>Status: {0}</h2>'.format(text)
        self.lblStatus.setText(status)

    def search_thread(self):
        """ Sets a search thread """
        self.lblStatus.setText('<h2>Status: Searching...</h2>')
        # Daemon thread so a hung search cannot block application exit.
        th = Thread(target=self.search_rom)
        th.setDaemon(True)
        th.start()

    def search_rom(self):
        """ Searches for the entered ROM """
        rom = str(self.editSearchTitle.text())
        self.search = Scraper(rom, parent=self)
        result = self.search.fill_in_form()
        self.set_results(result)
        # Briefly show completion before returning to idle.
        self.search_signal.emit('Completed')
        time.sleep(3)
        self.search_signal.emit('Idle')

    def set_results(self, results):
        """ Sets the results from a search query """
        self.tableSearchResults.clear()
        search_headers = ['Title']
        self.tableSearchResults.setColumnCount(1)
        self.tableSearchResults.setHorizontalHeaderLabels(search_headers)
        self.tableSearchResults.setRowCount(len(results))
        self.results = results
        row = 0
        for item in results:
            self.tableSearchResults.setItem(row, 0,
                                            QtGui.QTableWidgetItem(item.text))
            row += 1

    def _quit_romulus(self):
        """ Closes Romulus """
        self.close()

    def _settings(self):
        """ Initialize Settings window """
        # Lazily create the settings window, then re-show on each use.
        if self.settings_window is None:
            self.settings_window = SettingsWindow(self.rasp_ip)
        self.settings_window.show()

    def _pi_window(self):
        """ Starts Pi Controller in separate thread """
        self.set_status('Connecting to Retropie')
        self._sync()

    def _sync(self):
        """ Initialize Pi Control Centre window """
        self.status_signal.emit('Connecting to Retropie')
        # Lazily create the sync connection and window on first use.
        if self.sync_window is None:
            if self.sync_obj is None:
                self.sync_obj = Sync(self.retro_settings)
            self.sync_window = PiWindow(self.sync_obj, self.settings_obj,
                                        self.games_dict,
                                        self.fetch_local_collection())
        self.status_signal.emit('Idle')
        self.sync_window.show()

    def _set_defaults(self):
        """ Set default visuals """
        search_headers = ['Title']
        self.tableSearchResults.setColumnCount(1)
        self.tableSearchResults.setHorizontalHeaderLabels(search_headers)
        search_header = self.tableSearchResults.horizontalHeader()
        search_header.setStretchLastSection(True)
        self.btnDownloadSelected.setEnabled(False)
        self._set_default_local()

    def _set_default_local(self):
        """Populate the local-collection table with every platform's ROMs
        and rebuild the platform filter combo box."""
        local_headers = ['Title', 'Platform']
        library = self.fetch_local_collection()
        # Total rows needed across all platforms.
        total_rows = 0
        for platform, roms in library.iteritems():
            total_rows += len(roms)
        self.tableLocalCollection.clear()
        local_header = self.tableLocalCollection.horizontalHeader()
        self.tableLocalCollection.setColumnCount(2)
        self.tableLocalCollection.setRowCount(total_rows)
        local_header.setStretchLastSection(True)
        local_header.setResizeMode(0, QHeaderView.Stretch)
        self.tableLocalCollection.setHorizontalHeaderLabels(local_headers)
        row = 0
        local_platforms = []
        for platform, roms in library.iteritems():
            local_platforms.append(platform)
            for rom in roms:
                self.tableLocalCollection.setItem(row, 0,
                                                  QtGui.QTableWidgetItem(rom))
                self.tableLocalCollection.setItem(
                    row, 1, QtGui.QTableWidgetItem(platform))
                row += 1
        self.comboLocalFilter.clear()
        self.comboLocalFilter.addItem('All')
        self.comboLocalFilter.addItems(local_platforms)

    def filter_local_library(self):
        """ Sets a filtered list of local ROMS """
        selected = str(self.comboLocalFilter.currentText())
        if selected != 'All':
            local_headers = ['Title', 'Platform']
            library = self.fetch_local_collection()
            filtered_roms = library[selected]
            total_rows = len(filtered_roms)
            local_header = self.tableLocalCollection.horizontalHeader()
            self.tableLocalCollection.clear()
            self.tableLocalCollection.setColumnCount(2)
            self.tableLocalCollection.setRowCount(total_rows)
            local_header.setStretchLastSection(True)
            local_header.setResizeMode(0, QHeaderView.Stretch)
            self.tableLocalCollection.setHorizontalHeaderLabels(local_headers)
            row = 0
            for rom in filtered_roms:
                self.tableLocalCollection.setItem(row, 0,
                                                  QtGui.QTableWidgetItem(rom))
                self.tableLocalCollection.setItem(
                    row, 1, QtGui.QTableWidgetItem(selected))
                row += 1
        else:
            # 'All' restores the unfiltered view.
            self._set_default_local()