def dashboard(request): if request.user.is_authenticated(): r = requests.get( 'https://newsapi.org/v1/articles?source=mirror&apiKey=6df0769e0d6244aaa00768c02f123fb2' ) response_dictionary = {} response_dictionary["source"] = "mirror" data = json.loads(r.text) list_of_urls = [] article_data = [] main_dict = {} i = 0 for item in data["articles"]: mid_dictionary = {} if (type(item["title"]) == "unicode"): ti = unicodedata.normalize('NFKD', item["title"]).encode( 'ascii', 'ignore') else: ti = item["title"] n = len(ti) if (n > 10): mid_dictionary["title"] = ti sc = Scraper() print item mid_dictionary["data"] = sc.scrape_mirror(item["url"]) article_data.append(mid_dictionary) mid_dictionary["image"] = (item["urlToImage"]) print mid_dictionary["image"] if (type(mid_dictionary["data"]) == "unicode"): st = unicodedata.normalize('NFKD', mid_dictionary["data"]).encode( 'ascii', 'ignore') else: st = mid_dictionary["data"] try: print "********Summary******" summary = summarizer.summarize(st, words=50) print summary.encode('ascii', 'ignore') print "---------Summary---------" mid_dictionary["summary"] = summarizer.summarize(st, words=50) except ZeroDivisionError: mid_dictionary["summary"] = st print mid_dictionary main_dict["article_" + str(i)] = mid_dictionary i = i + 1 response_dictionary["articles"] = article_data print main_dict for key, value in main_dict.iteritems(): print "***************" return render(request, 'accounts/dashboard.html', {'main_dict': main_dict}) else: return HttpResponseRedirect("/login")
def dashboard(request): if request.user.is_authenticated(): r = requests.get('https://newsapi.org/v1/articles?source=mirror&apiKey=6df0769e0d6244aaa00768c02f123fb2') response_dictionary = {} # response_dictionary["status"] = r response_dictionary["source"] = "mirror" data = json.loads(r.text) # print data["articles"] list_of_urls = [] article_data = [] main_dict = {} i = 0; for item in data["articles"]: mid_dictionary = {} mid_dictionary["title"] = item["title"] #mid_dictionary["url"] = item["url"] #mid_dictionary["urlToImage"] = item["urlToImage"] sc = Scraper() mid_dictionary["data"] = sc.scrape_mirror(item["url"]) article_data.append(mid_dictionary) print type(mid_dictionary["data"]) if(type(mid_dictionary["data"])=="unicode"): st = unicodedata.normalize('NFKD', mid_dictionary["data"]).encode('ascii','ignore') else: st = mid_dictionary["data"] if(len(st)>250): print "********Summary******" print summarizer.summarize(st,words=50) print "---------Summary---------" mid_dictionary["summary"] = summarizer.summarize(st,words=50) else : mid_dictionary["summary"] = st mid_dictionary["article_"+str(i)] = mid_dictionary i = i+1 response_dictionary["articles"] = article_data # news_dictionary = {} #print main_dict for key,value in main_dict.iteritems(): print "***************" # print key print value["title"] # # print news_dictionary # list_of_urls = [] # return render(request, 'newslist.html', {'newslist':response_dictionary}) return render(request, 'accounts/dashboard.html', {'main_dict': main_dict} ) else: return HttpResponseRedirect("/login")
def on_ok(self):
    """Search EmuParadise for the entered ROM when OK is pressed."""
    npyscreen.notify("Please wait", "Searching...")
    scraper = Scraper(self.rom.value, parent=self)
    outcome = clean_results_list(scraper.fill_in_form())
    self.search = scraper
    self.results = outcome
    self.clean_results = outcome[0]
    # Hand the scraper and both result views to the parent application
    # so the results form can pick them up.
    self.parentApp.SCRAPER_OBJ = scraper
    self.parentApp.CLEAN_RESULTS = outcome[0]
    self.parentApp.RESULTS = outcome[1]
def handle(self, *args, **options):
    """Management-command entry point.

    Scrapes Russian followers once per configured cookie, threading the
    running maximum follower count through every pass.
    """
    print("Task running!")
    running_max = 22064
    for session_cookie in config.COOKIES:
        print(session_cookie)
        country_scraper = Scraper(country_name="Russia",
                                  cookie=session_cookie)
        running_max = country_scraper.parse_all(
            max_followers_count=running_max)
def search_rom(self):
    """Search for the ROM title currently entered in the search box."""
    title = str(self.editSearchTitle.text())
    scraper = Scraper(title)
    self.search = scraper
    self.set_results(scraper.fill_in_form())
    # Briefly show completion before returning to the idle state.
    self.search_signal.emit('Completed')
    time.sleep(3)
    self.search_signal.emit('Idle')
def checkTweets(self):
    """Fetch a recent tweet for each top artist and draw it in win4.

    Follows the same algorithm as the news pane but pulls from Twitter
    via the scraper; entries are cached as they are drawn.
    """
    logging.info("Retrieving tweets about artists...")
    try:
        artists = json.load(open('spotify/top_artists.json'))
        cached = []
        for index, artist in enumerate(artists):
            logging.info("Checking Tweets for : " + artist['name'])
            # Each entry occupies 4 rows; stop once the window is full.
            capacity = int(self.win4.getmaxyx()[0] / 4)
            if index >= capacity:
                break
            logging.info("Tweet entries : " + str(index) + "/" + str(capacity))
            tweet = Scraper.scrapeTweets(artist['name'])
            logging.info("Tuplo retrieved : ")
            text = ("Artist: " + tweet[0] + "\n@" + tweet[1]
                    + " said:\n" + tweet[2])
            cached.append({"url": tweet[3], "stringo": text})
            anchor = (index * 4 + 2, 3)
            limits = self.win4.getmaxyx()
            self.addMessage(self.win4, text,
                            (anchor[0] + 3, limits[1] - 2), anchor, index)
            self.saveInCache(4, cached)
    except Exception as e:
        logging.error(str(e))
def getNews(self):
    """Fetch a news item for each top artist and draw it in win2.

    Uses the scraper (BeautifulSoup + urllib), which returns a tuple of
    (artist, headline, url); the output layout mirrors the Spotify pane.
    """
    logging.info("Retrieving news about artists...")
    try:
        data = json.load(open('spotify/top_artists.json'))
        stringoList = []
        i = 0
        for datum in data:
            # FIX: cast to int — true division yields a float, letting
            # one extra entry overflow the window; also break instead of
            # return, consistent with checkTweets().
            entries = int(self.win2.getmaxyx()[0] / 6)
            if i >= entries:
                break
            tuplo = Scraper.scrapeNews(datum['name'])
            stringo = "Artist : " + tuplo[0] + "\nNews : \n" + tuplo[1]
            maxes = self.win2.getmaxyx()
            stringoList.append({"url": tuplo[2], "stringo": stringo})
            self.saveInCache(2, stringoList)
            currpos = (i * 6 + 2, 3)
            logging.debug("NEWSO : \n" + stringo)
            self.addMessage(self.win2, stringo,
                            (currpos[0] + 5, maxes[1] - 2), currpos, i)
            i = i + 1
    except Exception as e:
        logging.error(str(e))
def main(args: argparse.Namespace):
    """
    Scrape a pro-football-reference URL and save the data from each
    table to its own CSV in the specified output directory.

    :param args: The parsed command line arguments (``year``, ``stat``
        and ``output_dir`` attributes are used)
    """
    url = f"http://www.pro-football-reference.com/years/{args.year}/{args.stat}.htm"
    scraper = Scraper(url=url)
    table_ids = scraper.find_table_ids()
    output_dir_path = Path(args.output_dir)
    # FIX: exist_ok avoids the exists()/mkdir() check-then-create race;
    # parents=True lets the caller pass a nested path.
    output_dir_path.mkdir(parents=True, exist_ok=True)
    for table_id in table_ids:
        df = scraper.scrape(table_id)
        output_file_path = output_dir_path / f"{table_id}.csv"
        print(output_file_path)
        df.to_csv(output_file_path)
class SearchForm(npyscreen.ActionForm):
    """
    Search form from where the user can look up a ROM or other game
    from EmuParadise.
    """

    def create(self):
        """Build the form widgets when the main app initializes it."""
        self.rom = self.add(npyscreen.TitleText, name='Game: ')

    def on_ok(self):
        """Run the search when the OK button is pressed."""
        npyscreen.notify("Please wait", "Searching...")
        scraper = Scraper(self.rom.value, parent=self)
        outcome = clean_results_list(scraper.fill_in_form())
        self.search = scraper
        self.results = outcome
        self.clean_results = outcome[0]
        # Hand the scraper and both result views to the parent app so
        # the results form can pick them up.
        self.parentApp.SCRAPER_OBJ = scraper
        self.parentApp.CLEAN_RESULTS = outcome[0]
        self.parentApp.RESULTS = outcome[1]

    def on_cancel(self):
        """Exit the application when Cancel is pressed."""
        sys.exit()

    def afterEditing(self):
        """Runs after on_ok completes.

        Forms registered in the parentApp are loaded with their data
        before the app formally begins, so the Results form is declared
        here — not in the parentApp — to guarantee it is created AFTER
        the results exist.
        """
        self.parentApp.addForm('RESULTS', ResultsForm, name="Results")
        self.parentApp.setNextForm('RESULTS')
def do_scrape(self, line: str) -> None:
    """
    Defines what happens on the scrape command: validates the parsed
    arguments and, if valid, runs the Scraper behind a spinner and a
    progress bar.

    :param line: inputline
    """
    args_dict: Dict[str, Any] = self._get_args(line)
    if not self._scrape_check_args(args_dict):
        return
    print("Starting to scrape. This may take a while...\n")

    def report(progress: float) -> None:
        # Scale the scraper's 0..1 progress to a percentage for the bar.
        self._print_progress_bar(progress * 100, 100)

    try:
        with Spinner():
            self._print_progress_bar(0, 100)
            Scraper(email=args_dict['email'],
                    password=args_dict['password'],
                    headless=args_dict['headless'],
                    start=args_dict['start'],
                    end=args_dict['end'],
                    extensive=True,
                    progress_observer_callback=report)
    except (LoginError, PasswordFileNotFound, AssertionError):
        pass
    self.refresh_cli = False
def __init__(self, *args, **kwargs):
    """Set up the Download window with a directory helper and scraper."""
    super(Download, self).__init__()
    # No URL is selected until the user picks a result.
    self.url = None
    # Helper objects; construction order preserved in case either has
    # side effects.
    self.dirs_obj = Directories()
    self.search = Scraper()
from scraping import fake_requests
from scraping.scraper import Scraper

# Module-level scraper instance — presumably shared by the consumers
# below; TODO confirm it is safe to reuse across calls.
scraper = Scraper()
from common.engine import Engine
from scraping.honeypot.processor import add_search_results_to_logs_buffer, honeypot_keyword
from scraping.scraper import Scraper

if __name__ == '__main__':
    # Load the existing scraping log; results are appended to its
    # 'scraping' buffer and the whole structure is rewritten at the end.
    with open(scraping_log) as f:
        logs = json.load(f)
    buffer = logs['scraping']

    user_agent = UserAgent().random
    firefox_options = Options()
    firefox_options.add_argument('-headless')
    # NOTE(review): Firefox does not honour a user-agent command-line
    # flag (that is a Chrome option); a profile preference
    # ("general.useragent.override") is likely needed — confirm.
    firefox_options.add_argument(f'user-agent={user_agent}')
    driver = webdriver.Firefox(executable_path=gecko_path,
                               options=firefox_options)
    # FIX: ensure the browser is closed even if scraping raises —
    # the original leaked the driver process on any exception.
    try:
        for engine_item in Engine:
            engine = engine_item.value
            scraper = Scraper(user_agent, driver, honeypot_keyword,
                              engine, with_omitted_results=False)
            search_results = scraper.obtain_first_page_search_results()
            # NOTE(review): relies on the helper returning the (possibly
            # same) buffer that logs['scraping'] references — confirm a
            # new list is not returned, or the dump below misses data.
            buffer = add_search_results_to_logs_buffer(search_results,
                                                       buffer)
    finally:
        driver.quit()

    with open(scraping_log, 'w') as f:
        json.dump(logs, f, indent=4)
def get_test_scraper(requester=None):
    """Build a Scraper for TEST_TICKER.

    :param requester: optional Requester to use; a fresh one is created
        per call when omitted.

    FIX: the original used ``requester=Requester()`` as the default,
    which is evaluated once at import time and shared (with any
    accumulated state) across every call.
    """
    if requester is None:
        requester = Requester()
    return Scraper(TEST_TICKER, requester)
class MainWindow(QtGui.QMainWindow, Ui_MainWindow):
    """Main application window: search, select and download ROMs."""

    # Emitted with a status string when a background search changes state.
    search_signal = pyqtSignal(str)

    def __init__(self, parent=None):
        """ Main window controller """
        self.session = session()
        # Seed the database with defaults on first run.
        if self.session.query(Settings).count() <= 0:
            InitialData()
        self.search = None  # Scraper instance, created per search
        self.settings_obj = self.session.query(Settings).first()
        self.product_obj = self.session.query(Product).first()
        QtGui.QWidget.__init__(self, parent)
        self.setupUi(self)
        self._set_defaults()
        self.live_url = None  # URL of the currently selected ROM
        self.settings_window = None
        self.sync_window = None
        self.results = None  # last search results, indexed by table row
        self.rasp_ip = self.session.query(RetropieSettings).first().last_known_ip
        # Triggers
        self.actionQuit.triggered.connect(self._quit_romulus)
        self.actionSettings.triggered.connect(self._settings)
        self.btnSearch.clicked.connect(self.search_thread)
        self.search_signal.connect(self.set_status)
        self.tableSearchResults.cellClicked.connect(self.selected_rom)
        self.btnDownloadSelected.clicked.connect(self.download_rom)

    def download_rom(self):
        """ Downloads selected ROM """
        url = self.live_url
        self.search.download(url)

    def selected_rom(self):
        """ Fetches selected ROM details """
        row_selected = self.tableSearchResults.currentRow()
        # self.results is indexed in the same order as the table rows.
        rom = self.results[row_selected]
        result = self.search.get_link(rom)
        self.live_url = result
        description = self.search.get_description(result)
        self.lblSearchDescriptionSelected.setText(description)
        self.btnDownloadSelected.setEnabled(True)

    def set_status(self, text):
        """ Sets status due to signal """
        status = '<h2>Status: {0}</h2>'.format(text)
        self.lblStatus.setText(status)

    def search_thread(self):
        """ Sets a search thread """
        self.lblStatus.setText('<h2>Status: Searching...</h2>')
        # Daemon thread so a hung search cannot block application exit.
        th = Thread(target=self.search_rom)
        th.setDaemon(True)
        th.start()

    def search_rom(self):
        """ Searches for the entered ROM """
        rom = str(self.editSearchTitle.text())
        self.search = Scraper(rom)
        result = self.search.fill_in_form()
        self.set_results(result)
        # Briefly show completion before returning to idle.
        self.search_signal.emit('Completed')
        time.sleep(3)
        self.search_signal.emit('Idle')

    def set_results(self, results):
        """ Sets the results from a search query """
        self.tableSearchResults.clear()
        search_headers = ['Title']
        self.tableSearchResults.setColumnCount(1)
        self.tableSearchResults.setHorizontalHeaderLabels(search_headers)
        self.tableSearchResults.setRowCount(len(results))
        self.results = results
        row = 0
        for item in results:
            self.tableSearchResults.setItem(row, 0,
                                            QtGui.QTableWidgetItem(item.text))
            row += 1

    def _quit_romulus(self):
        """ Closes Romulus """
        self.close()

    def _settings(self):
        """ Initialize Settings window """
        # Lazily create the settings window, then re-show on each use.
        if self.settings_window is None:
            self.settings_window = SettingsWindow(self.rasp_ip)
        self.settings_window.show()

    def _set_defaults(self):
        """ Set default visuals """
        search_headers = ['Title']
        download_headers = ['Title', 'Status']
        self.tableSearchResults.setColumnCount(1)
        self.tableDownloadProgress.setColumnCount(2)
        self.tableSearchResults.setHorizontalHeaderLabels(search_headers)
        self.tableDownloadProgress.setHorizontalHeaderLabels(download_headers)
        search_header = self.tableSearchResults.horizontalHeader()
        download_header = self.tableDownloadProgress.horizontalHeader()
        search_header.setStretchLastSection(True)
        download_header.setStretchLastSection(True)
        self.comboPlatformSearch.addItems(SUPPORTED_PLATFORMS)
        self.btnDownloadSelected.setEnabled(False)
# Wrap the query in quotes — presumably to force exact-phrase matching
# in the search engines; TODO confirm against add_keyword_quotes.
query_to_scrap = add_keyword_quotes(query)
for engine_item in Engine:
    engine = engine_item.value
    # Only Google and Yandex are scraped in this pass.
    if engine != Engine.GOOGLE.value and engine != Engine.YANDEX.value:
        continue
    # Each engine keeps its own orders file, loaded and rewritten whole.
    engine_orders_filename = f'{orders_dir}{engine}.json'
    with open(engine_orders_filename) as f:
        orders_buffer = json.load(f)
    scraper = Scraper(user_agent, driver, query_to_scrap, engine,
                      with_omitted_results=True)
    search_results = scraper.obtain_all_pages_search_results()
    # Map every result to the index of the chunk it came from, so the
    # stored order records which chunks ranked where for this query.
    chunks_order = []
    for search_result_item in search_results.items:
        chunk_index = find_chunk_index(search_result_item)
        chunks_order.append(chunk_index)
    order_item = {'query': query, 'order': chunks_order}
    orders_buffer.append(order_item)
    pretty_dump_json(obj=orders_buffer, filename=engine_orders_filename)
driver.quit()
def main():
    """Run every scraper job in sequence: lyrics, Spotify features, genres."""
    jobs = (
        ('lyrics', Lyrics, Lyrics.find_lyrics),
        ('spotify', Spotify, Spotify.get_features),
        ('genres', Spotify, Spotify.get_genres),
    )
    for label, source, fetch in jobs:
        Scraper(label, source, fetch).main()
class MainWindow(QtGui.QMainWindow, Ui_MainWindow):
    """Main application window: search, download, local library and
    Retropie sync."""

    # Emitted with a status string when a background search changes state.
    search_signal = pyqtSignal(str)
    # Emitted with a status string during Retropie sync operations.
    status_signal = pyqtSignal(str)

    def __init__(self, parent=None):
        """ Main window controller """
        QtGui.QWidget.__init__(self, parent)
        self.setupUi(self)
        self.session = session()
        # Seed the database with defaults on first run.
        if self.session.query(Settings).count() <= 0:
            InitialData()
        self.search = None  # Scraper instance, created per search
        self.settings_obj = self.session.query(Settings).first()
        self.product_obj = self.session.query(Product).first()
        self._set_defaults()
        self.live_url = None  # URL of the currently selected ROM
        self.dirs_obj = Directories()
        self.settings_window = None
        self.sync_window = None
        self.results = None  # last search results, indexed by table row
        self.rasp_ip = self.session.query(
            RetropieSettings).first().last_known_ip
        self.retro_settings = self.session.query(RetropieSettings).first()
        self.sync_obj = None
        self.local_library = None
        self.games_dict = GAMES_CLEAN
        # Triggers
        self.actionQuit.triggered.connect(self._quit_romulus)
        self.actionSettings.triggered.connect(self._settings)
        self.btnSearch.clicked.connect(self.search_thread)
        self.search_signal.connect(self.set_status)
        self.status_signal.connect(self.set_status)
        self.tableSearchResults.cellClicked.connect(self.selected_rom)
        self.btnDownloadSelected.clicked.connect(self._download_thread)
        self.actionSync_Library.triggered.connect(self._pi_window)
        self.comboLocalFilter.activated.connect(self.filter_local_library)

    def _download_thread(self):
        """ Starts a thread for the download """
        # Daemon thread so a hung download cannot block application exit.
        th = Thread(target=self.download_rom)
        th.setDaemon(True)
        th.start()

    def download_rom(self):
        """ Downloads selected ROM """
        url = self.live_url
        # Derive the platform name from the URL path segment
        # (underscores to spaces, trailing token dropped).
        platform = " ".join(url.split('/')[-3].replace('_', ' ').split()[:-1])
        target = self.dirs_obj.target_directory(
            self.settings_obj.download_location, platform)
        self.search.download(url, target)

    def selected_rom(self):
        """ Fetches selected ROM details """
        row_selected = self.tableSearchResults.currentRow()
        # self.results is indexed in the same order as the table rows.
        rom = self.results[row_selected]
        result = self.search.get_link(rom)
        self.live_url = result
        description = self.search.get_description(result)
        self.lblSearchDescriptionSelected.setText(description)
        self.btnDownloadSelected.setEnabled(True)

    def fetch_local_collection(self):
        """ Returns local games collection as a dictionary """
        games_loc = self.settings_obj.download_location
        library = {}
        if not os.path.exists(games_loc):
            os.makedirs(games_loc)
        # First os.walk yield gives the immediate platform directories.
        dirs = [dirs for root, dirs, files in os.walk(games_loc)][0]
        for rom in dirs:
            # Map the directory name to its clean platform title and
            # list the ROM files directly inside it.
            library[GAMES_CLEAN[rom]] = [
                games
                for root, dirs, games in os.walk(os.path.join(games_loc, rom))
            ][0]
        return library

    def set_status(self, text):
        """ Sets status due to signal """
        status = '<h2>Status: {0}</h2>'.format(text)
        self.lblStatus.setText(status)

    def search_thread(self):
        """ Sets a search thread """
        self.lblStatus.setText('<h2>Status: Searching...</h2>')
        # Daemon thread so a hung search cannot block application exit.
        th = Thread(target=self.search_rom)
        th.setDaemon(True)
        th.start()

    def search_rom(self):
        """ Searches for the entered ROM """
        rom = str(self.editSearchTitle.text())
        self.search = Scraper(rom, parent=self)
        result = self.search.fill_in_form()
        self.set_results(result)
        # Briefly show completion before returning to idle.
        self.search_signal.emit('Completed')
        time.sleep(3)
        self.search_signal.emit('Idle')

    def set_results(self, results):
        """ Sets the results from a search query """
        self.tableSearchResults.clear()
        search_headers = ['Title']
        self.tableSearchResults.setColumnCount(1)
        self.tableSearchResults.setHorizontalHeaderLabels(search_headers)
        self.tableSearchResults.setRowCount(len(results))
        self.results = results
        row = 0
        for item in results:
            self.tableSearchResults.setItem(row, 0,
                                            QtGui.QTableWidgetItem(item.text))
            row += 1

    def _quit_romulus(self):
        """ Closes Romulus """
        self.close()

    def _settings(self):
        """ Initialize Settings window """
        # Lazily create the settings window, then re-show on each use.
        if self.settings_window is None:
            self.settings_window = SettingsWindow(self.rasp_ip)
        self.settings_window.show()

    def _pi_window(self):
        """ Starts Pi Controller in separate thread """
        self.set_status('Connecting to Retropie')
        self._sync()

    def _sync(self):
        """ Initialize Pi Control Centre window """
        self.status_signal.emit('Connecting to Retropie')
        # Lazily create the sync connection and window on first use.
        if self.sync_window is None:
            if self.sync_obj is None:
                self.sync_obj = Sync(self.retro_settings)
            self.sync_window = PiWindow(self.sync_obj, self.settings_obj,
                                        self.games_dict,
                                        self.fetch_local_collection())
        self.status_signal.emit('Idle')
        self.sync_window.show()

    def _set_defaults(self):
        """ Set default visuals """
        search_headers = ['Title']
        self.tableSearchResults.setColumnCount(1)
        self.tableSearchResults.setHorizontalHeaderLabels(search_headers)
        search_header = self.tableSearchResults.horizontalHeader()
        search_header.setStretchLastSection(True)
        self.btnDownloadSelected.setEnabled(False)
        self._set_default_local()

    def _set_default_local(self):
        """Populate the local-collection table with every platform's ROMs
        and rebuild the platform filter combo box."""
        local_headers = ['Title', 'Platform']
        library = self.fetch_local_collection()
        # Total rows needed across all platforms.
        total_rows = 0
        for platform, roms in library.iteritems():
            total_rows += len(roms)
        self.tableLocalCollection.clear()
        local_header = self.tableLocalCollection.horizontalHeader()
        self.tableLocalCollection.setColumnCount(2)
        self.tableLocalCollection.setRowCount(total_rows)
        local_header.setStretchLastSection(True)
        local_header.setResizeMode(0, QHeaderView.Stretch)
        self.tableLocalCollection.setHorizontalHeaderLabels(local_headers)
        row = 0
        local_platforms = []
        for platform, roms in library.iteritems():
            local_platforms.append(platform)
            for rom in roms:
                self.tableLocalCollection.setItem(row, 0,
                                                  QtGui.QTableWidgetItem(rom))
                self.tableLocalCollection.setItem(
                    row, 1, QtGui.QTableWidgetItem(platform))
                row += 1
        self.comboLocalFilter.clear()
        self.comboLocalFilter.addItem('All')
        self.comboLocalFilter.addItems(local_platforms)

    def filter_local_library(self):
        """ Sets a filtered list of local ROMS """
        selected = str(self.comboLocalFilter.currentText())
        if selected != 'All':
            local_headers = ['Title', 'Platform']
            library = self.fetch_local_collection()
            filtered_roms = library[selected]
            total_rows = len(filtered_roms)
            local_header = self.tableLocalCollection.horizontalHeader()
            self.tableLocalCollection.clear()
            self.tableLocalCollection.setColumnCount(2)
            self.tableLocalCollection.setRowCount(total_rows)
            local_header.setStretchLastSection(True)
            local_header.setResizeMode(0, QHeaderView.Stretch)
            self.tableLocalCollection.setHorizontalHeaderLabels(local_headers)
            row = 0
            for rom in filtered_roms:
                self.tableLocalCollection.setItem(row, 0,
                                                  QtGui.QTableWidgetItem(rom))
                self.tableLocalCollection.setItem(
                    row, 1, QtGui.QTableWidgetItem(selected))
                row += 1
        else:
            # 'All' restores the unfiltered view.
            self._set_default_local()