Example #1
def test_pretty_val_Equal(tester, expected_type, v, expected_v):
    pretty_v = Scraper.pretty_val_st(v, expected_type)
    tester.assertEqual(pretty_v, expected_v)
    if pretty_v is None:
        pass
    elif expected_type == date:
        tester.assertIsInstance(pretty_v, str)
    else:
        tester.assertIsInstance(pretty_v, expected_type)
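A minimal sketch of how a unittest.TestCase could drive this helper (the helper receives the test case as `tester`); it assumes the sketch sits in the same module as the helper above, and the value/expected pairs are purely hypothetical placeholders for whatever Scraper.pretty_val_st actually returns.

import unittest


class PrettyValTest(unittest.TestCase):
    def test_pretty_val(self):
        # Hypothetical input/expected pairs; replace them with pairs that
        # match the real behaviour of Scraper.pretty_val_st.
        cases = [
            (str, "Nokia Oyj", "Nokia Oyj"),
            (float, "1,5", 1.5),
        ]
        for expected_type, v, expected_v in cases:
            test_pretty_val_Equal(self, expected_type, v, expected_v)


if __name__ == '__main__':
    unittest.main()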
Example #2
def test_get_perustiedot_Controll(tester, company_id,
                                  part_of_expected_perustiedot):
    type_dict = {
        "porssi": str,
        "listattu": str,
        "kaupankayntitunnus": str,
        "isin-koodi": str,
        "toimialaluokka": str,
        "nimellisarvo": str,
        "kaupankaynti_valuutta": str,
        "toimiala": str,
        "markkina-arvo": float,
        "osakkeet_kpl": int
    }
    scraper = Scraper(company_id)
    perustiedot = scraper.get_perustiedot()
    tester.assertEqual(len(perustiedot), 10)
    for key in perustiedot:
        if perustiedot[key] is not None:
            tester.assertIsInstance(perustiedot[key], type_dict[key])
        if type_dict[key] == str:
            tester.assertEqual(perustiedot[key],
                               part_of_expected_perustiedot[key])
Example #3
def test_get_osinko_Controll(tester, company_id, one_expected_osinko):
    type_dict = {
        "vuosi": int,
        "irtoaminen": str,
        "oikaistu_euroina": float,
        "maara": float,
        "valuutta": str,
        "tuotto_%": float,
        "lisatieto": str
    }
    scraper = Scraper(company_id)
    osingot = scraper.get_osingot()
    matches = 0
    for top_key in osingot:
        tester.assertIsInstance(top_key, str)
        tester.assertEqual(len(osingot[top_key]), 7)
        if osingot[top_key]["irtoaminen"] == one_expected_osinko["irtoaminen"] \
        and osingot[top_key]["maara"] == one_expected_osinko["maara"]:
            tester.assertDictEqual(osingot[top_key], one_expected_osinko)
            matches += 1
        for key in osingot[top_key]:
            if osingot[top_key][key] is not None:
                tester.assertIsInstance(osingot[top_key][key], type_dict[key])
    tester.assertEqual(matches, 1)
Example #4
def urun_bilgileri():
    if request.method == "POST":
        global category
        category = request.form.get("category_selected")

        # urunler_df = pd.read_csv("Tüm_Ürünler.csv")
        # urunler_df.fillna("-", inplace=True)

        result_list = Scraper.getAll(category=category)
        urunler_df = result_list[1]
        urunler_df.fillna("-", inplace=True)
        global urunler_df_list
        urunler_df_list = result_list[0]

        return render_template("urun-bilgileri.html",
                               category=category,
                               urunler_df=urunler_df)

    else:
        return redirect(url_for("home"))
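A hedged sketch of exercising this view with Flask's test client; it assumes it runs in the same module as the `app` and route definitions shown later in these examples, that the templates exist, and that Scraper.getAll can reach the network, since it scrapes live data.

with app.test_client() as client:
    # POST the form field the view reads; "Tablet" is one of the categories
    # defined in the app module.
    resp = client.post("/urun-bilgileri", data={"category_selected": "Tablet"})
    print(resp.status_code)   # 200 once scraping and template rendering succeed

    # A plain GET falls into the else branch and is redirected back to home.
    resp = client.get("/urun-bilgileri")
    print(resp.status_code)   # 302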
Example #5
class Application:
    """
    Main window of the program.
    """
    window_width = 1000
    window_height = 600

    border_width = 5
    border_color = 'black'

    __slots__ = (
        'root',
        'left_frame',
        'url_label',
        'url_entry',
        'check_button',
        'url_check_label',
        'start_dl_button',
        'mid_frame',
        'url_tracking_label',
        'url_tracking_text',
        'right_frame',
        'log_text',
        'bottom_frame',
        'download_tracking_label',
        'download_tracking_bar',
        'scraper',
        'driver',
        'login',
        'ig_url_re',
        'ig_profile_url_re',
        'general_img_re',
        'imgur_re',
        'youtube_re',
        'yt_re',
        'reddit_re',
        'reddit_fallback_re',
        'gfycat_re',
        'tumblr_re',
        'twitter_re',
        'exprs',
    )

    def __init__(self, root):
        self.root = root

        self.left_frame = tk.Frame()
        self.url_label = tk.Label()
        self.url_entry = tk.Entry()
        self.check_button = tk.Button()
        self.url_check_label = tk.Label()
        self.start_dl_button = tk.Button()
        self.setup_left_frame()

        self.mid_frame = tk.Frame()
        self.url_tracking_label = tk.Label()
        self.url_tracking_text = ScrollText()
        self.setup_mid_frame()

        self.right_frame = tk.Frame()
        self.log_text = ScrollText()
        self.setup_right_frame()

        self.bottom_frame = tk.Frame()
        self.download_tracking_label = tk.Label()
        self.download_tracking_bar = ttk.Progressbar()
        self.setup_bottom_frame()

        # Initialise classes here so we can pass the logging widget
        self.scraper = Scraper(self.log_text, self.download_tracking_label)
        self.driver = Driver(self.log_text)
        self.driver.start_driver()  # Start webdriver to be used for scraping
        self.login = None

        # Lots of regexes to check the validity of wanted URLs
        # Make sure only IG posts are specified, not user's pages
        self.ig_url_re = re.compile(r'^https://www\.instagram\.com/p/.+/')
        self.ig_profile_url_re = re.compile(
            r'^https://www\.instagram\.com/(\w+|\d+)/$')
        self.general_img_re = re.compile(
            r'^https?://.+\..+\..+\.(?:jpg|png|gif)')
        self.imgur_re = re.compile(
            r'^https?://imgur\.com/(?:.)+$(?<!(png|gif|jpg))')
        self.youtube_re = re.compile(
            r'https://(?:www\.)?youtube\.com/watch\?v=.+')
        self.yt_re = re.compile(r'https://youtu\.be/.+')
        self.reddit_re = re.compile(
            r'https?://(?:www|old)\.reddit\.com/(?:r|u|user)/(\w+)/.+')
        self.reddit_fallback_re = re.compile(
            r'https://v\.redd\.it/.+\?source=fallback')
        self.gfycat_re = re.compile(r'https://gfycat\.com/\w+$(?<!-)')
        self.tumblr_re = re.compile(
            r'https://(.+)\.tumblr\.com/post/(\d+)(?:/.+)?')
        self.twitter_re = re.compile(r'https://twitter.com/.+/status/(\d+)')

        # Map URLs to the methods needed to extract the images in them
        # All of these methods take a single argument, the URL/text
        self.exprs = {
            self.ig_url_re: self.process_ig_url,
            self.ig_profile_url_re: self.process_ig_profile_url,
            self.general_img_re: self.process_general_url,
            self.imgur_re: self.process_imgur_url,
            self.youtube_re: self.process_yt_url,
            self.yt_re: self.process_yt_url,
            self.reddit_re: self.process_reddit_url,
            self.reddit_fallback_re: self.process_general_url,
            self.gfycat_re: self.process_gfycat_url,
            self.tumblr_re: self.process_tumblr_url,
            self.twitter_re: self.process_twitter_url,
        }

    def setup_left_frame(self):
        """
        Set up the left frame of the application's window.
        """
        self.left_frame = tk.Frame(
            self.root,
            bg=MID_GREY,
            width=self.window_width / 3,
            height=self.window_height - self.border_width * 40,
            highlightbackground=self.border_color,
            highlightcolor=self.border_color,
            highlightthickness=self.border_width,
        )
        self.left_frame.grid(row=0, column=0)
        self.left_frame.grid_propagate(
            False)  # Keep the frame from automatically resizing

        self.url_label = tk.Label(self.left_frame,
                                  bg=MID_GREY,
                                  text='Enter URLs:',
                                  font=('Arial', 15, 'bold'))
        self.url_label.place(relx=0.5, rely=0.3, anchor='center')

        self.url_entry = tk.Entry(
            self.left_frame,
            width=int(self.left_frame.winfo_reqwidth() * 0.1),
            borderwidth=3,
        )
        self.url_entry.bind(
            '<Return>',
            lambda e: threading.Thread(target=self.process_input).start())
        self.url_entry.place(relx=0.5, rely=0.4, anchor='center')

        self.check_button = tk.Button(self.left_frame,
                                      text='OK',
                                      bg='black',
                                      fg='white',
                                      activebackground=DARK_GREY,
                                      font=('Arial', 12),
                                      cursor='hand2')
        self.check_button.bind(
            '<ButtonRelease-1>',
            lambda e: threading.Thread(target=self.process_input).start())
        self.check_button.place(relx=0.5, rely=0.5, anchor='center')

        self.url_check_label = tk.Label(self.left_frame,
                                        bg=MID_GREY,
                                        font=('Arial', 12))
        self.url_check_label.place(relx=0.5, rely=0.6, anchor='center')

        self.start_dl_button = tk.Button(self.left_frame,
                                         text='Start Downloading',
                                         bg='black',
                                         fg='white',
                                         activebackground=DARK_GREY,
                                         font=('Arial', 12),
                                         cursor='hand2')
        self.start_dl_button.bind(
            '<ButtonRelease-1>',
            lambda e: threading.Thread(target=self.download_files).start())
        self.start_dl_button.place(relx=0.5, rely=0.7, anchor='center')

    def setup_mid_frame(self):
        """
        Set up the middle frame of the application's window.
        """
        self.mid_frame = tk.Frame(
            self.root,
            bg=MID_GREY,
            width=self.window_width / 3,
            height=self.window_height - self.border_width * 40,
            highlightbackground=self.border_color,
            highlightcolor=self.border_color,
            highlightthickness=self.border_width,
        )
        self.mid_frame.grid(row=0, column=1)
        self.mid_frame.grid_propagate(False)
        # Keep the frame border from disappearing by adding weights
        self.mid_frame.rowconfigure(1, weight=1)
        self.mid_frame.columnconfigure(0, weight=1)

        self.url_tracking_label = tk.Label(
            self.mid_frame,
            bg=MID_GREY,
            text='Saved URLs:',
            font=('Arial', 15, 'bold'),
        )
        self.url_tracking_label.place(relx=0.5, rely=0.04, anchor='center')
        # Retroactively scale the mid_frame's first row to make space for the tracking label
        self.mid_frame.rowconfigure(
            0, weight=1, minsize=self.url_tracking_label.winfo_reqheight())

        self.url_tracking_text = ScrollText(
            self.mid_frame,
            bg=MID_GREY,
            font=('Arial', 10),
            borderwidth=0,
        )
        self.url_tracking_text.grid(row=1, column=0, sticky='ew')

    def setup_right_frame(self):
        """
        Set up the right frame of the application's window.
        """
        self.right_frame = tk.Frame(
            self.root,
            bg=MID_GREY,
            width=self.window_width / 3,
            height=self.window_height - self.border_width * 40,
            highlightbackground=self.border_color,
            highlightcolor=self.border_color,
            highlightthickness=self.border_width,
        )
        self.right_frame.grid(row=0, column=2)
        self.right_frame.grid_propagate(False)
        self.right_frame.columnconfigure(0, weight=1)
        self.right_frame.rowconfigure(0, weight=1)

        self.log_text = ScrollText(
            self.right_frame,
            bg=MID_GREY,
            font=('Arial', 10),
            borderwidth=0,
        )
        self.log_text.grid(sticky='nsew')

    def setup_bottom_frame(self):
        """
        Set up the bottom frame of the application's window.
        """
        self.bottom_frame = tk.Frame(
            self.root,
            bg=MID_GREY,
            width=self.window_width,
            height=self.border_width * 40,
            highlightbackground=self.border_color,
            highlightcolor=self.border_color,
            highlightthickness=self.border_width,
        )
        self.bottom_frame.grid(row=1, column=0, columnspan=3)
        self.bottom_frame.grid_propagate(False)

        self.download_tracking_label = tk.Label(self.bottom_frame,
                                                bg=MID_GREY,
                                                text='Downloaded 0 / 0 files',
                                                font=('Arial', 15, 'bold'))
        self.download_tracking_label.place(relx=0.5, rely=0.2, anchor='center')

        style = ttk.Style()
        # ('winnative', 'clam', 'alt', 'default', 'classic', 'vista', 'xpnative')
        style.theme_use('alt')
        style.configure('black.Horizontal.TProgressbar',
                        foreground='red',
                        background='black')
        self.download_tracking_bar = ttk.Progressbar(
            self.bottom_frame,
            style='black.Horizontal.TProgressbar',
            orient='horizontal',
            length=self.window_width * 0.8,
            mode='determinate',
        )
        self.download_tracking_bar.place(relx=0.5, rely=0.5, anchor='center')

    def disable_input_widgets(self):
        """
        Disable the interactive widgets around input and downloading.
        """
        self.url_entry.configure(state='disabled')
        self.check_button.configure(state='disabled')
        self.start_dl_button.configure(state='disabled')

    def enable_input_widgets(self):
        """
        Enable the interactive widgets around input and downloading.
        """
        self.url_entry.configure(state='normal')
        self.check_button.configure(state='normal')
        self.start_dl_button.configure(state='normal')

    def process_input(self):
        """
        Disable the input widgets and check/process the input,
        then enable the widgets again.
        """
        text = self.url_entry.get().strip()
        if not text:
            return

        # Don't allow more input while current input is being processed
        self.disable_input_widgets()

        # Allow pasting multiple links at once, separated by spaces
        if len(text.split()) > 1:
            is_input_accepted = False

            for url in text.split():
                is_input_accepted = self.check_url(text=url)
                # Sleep to not spam APIs
                time.sleep(0.5)
        else:
            is_input_accepted = self.check_url(text=text)

        self.enable_input_widgets()
        # Cannot delete text while widget is disabled
        if is_input_accepted is True:
            self.url_entry.delete(0, tk.END)

    def check_url(self, text=None):
        """
        Check the text to see if it fits one of the specified URL regexes.
        Then process the URL as needed.
        """
        if not text:
            return False

        # We only need to track Reddit URLs in JSON format
        if self.reddit_re.match(text) and not text.endswith('.json'):
            text += '.json'

        if not any(regex.match(text) for regex in self.exprs.keys()):
            self.url_check_label.configure(text='ERR: URL not accepted',
                                           fg='red')
            return False

        if text in self.scraper.tracking_links + self.scraper.display_links:
            self.url_check_label.configure(text='WARN: URL already added.',
                                           fg='brown')
            return False

        # In case a URL gets ctrl+v'd into the entry multiple times
        if any(link in text for link in self.scraper.tracking_links +
               self.scraper.display_links):
            self.url_check_label.configure(text='WARN: URL already added.',
                                           fg='brown')
            return False

        self.url_check_label.configure(text='OK: URL accepted', fg='black')
        self.process_url(text)

        # Signify that the method completed
        return True

    def process_url(self, url):
        """
        Get the corresponding extraction method of a URL by matching a regex,
        then execute the method and update the tracking label.
        """
        for regex in self.exprs.keys():
            # Guaranteed to happen for at least one regex
            if regex.match(url):
                extraction_method = self.exprs[regex]
                extraction_method(url)
                break

        self.scraper.display_links.append(url)
        self.url_tracking_text.display_these_lines(self.scraper.display_links)

        self.log_text.newline('URL processing complete')
        self.log_text.newline('.')

    def process_general_url(self, url):
        """
        Append a link directly pointing to an image to the lists
        as no further actions are needed.
        """
        type_ = 'image'
        if url.startswith('https://v.redd.it/'):
            type_ = 'video'

        self.scraper.append_link(url, type_=type_)

    def process_ig_url(self, url):
        """
        Prepare data and handle extraction of images of Instagram posts.
        """
        self.driver.webdriver.get(url)
        self.log_text.newline(f'Got URL - {url}')
        soup = BeautifulSoup(self.driver.webdriver.page_source,
                             features='html.parser')
        data = self.scraper.get_ig_data(soup)
        self.log_text.newline('Extracted JSON data')

        if self.scraper.is_private(data) and self.driver.is_logged_in is False:

            def show_root(_):
                """
                Needed for the pos arg getting passed with tkinter bindings.
                """
                self.root.deiconify()
                # self.process_url(url)
                self.process_ig_url(url)
                # Not unbinding here would lead to an infinite loop
                # of calling the above function again and again
                self.login.unbind('<Destroy>')

            self.log_text.newline('Login initiated')
            self.create_login_window()
            self.root.withdraw()
            self.login.bind('<Destroy>', show_root)
            return

        # Logging for IG links is done inside of this function already
        self.scraper.extract_ig_images(data)
        self.scraper.tracking_links.append(url)

    def process_ig_profile_url(self, url):
        """
        Extract an Instagram user's profile name and get their
        avatar's URL from instadp.com.
        """
        profile_name = self.ig_profile_url_re.match(url).group(1)
        instadp_url = f'https://www.instadp.com/fullsize/{profile_name}'
        self.driver.webdriver.get(instadp_url)
        self.log_text.newline(f'Got URL - {url}')

        soup = BeautifulSoup(self.driver.webdriver.page_source,
                             features='html.parser')
        self.scraper.extract_ig_avatar(soup)

    def process_imgur_url(self, url):
        """
        Prepare data needed for extracting images from an Imgur link
        and then actually extract them.
        """
        self.driver.webdriver.get(url)
        self.log_text.newline(f'Got URL - {url}')

        soup = BeautifulSoup(self.driver.webdriver.page_source,
                             features='html.parser')
        self.scraper.extract_imgur_images(soup)

    def process_yt_url(self, url):
        """
        Simply call the scraper's method so the URL-handler methods stay uniform here.
        """
        self.scraper.extract_yt_thumbnail(url)

    def process_reddit_url(self, url):
        """
        Get the JSON data of a Reddit post and extract the video link.
        NOTE: Video and audio are separated on Reddit, so the audio will be missing.
        """
        self.driver.webdriver.get(url)
        self.log_text.newline(f'Got URL - {url}')

        soup = BeautifulSoup(self.driver.webdriver.page_source,
                             features='html.parser')
        data_str = soup.find_all('pre')[0].text
        data = json.loads(data_str)

        post_url = self.scraper.extract_reddit_link(data)
        # Need to process the URL which a Reddit post points to
        # ... if it's not a self-post
        if url.replace('/.json', '/') == post_url:
            self.log_text.newline('Reddit post is a self-post, aborting')
            return
        self.check_url(text=post_url)

    def process_gfycat_url(self, url):
        """
        Check to see if the entered Gfycat URL is valid.
        """
        # Usually I would insist on doing everything with Selenium
        # But it's so f*****g slow with Gfycat (~5s to .get the URL)
        # that it's better to use requests -.-
        # With that being said, the commented out Selenium code does work

        # self.driver.webdriver.get(url)
        # self.log_text.newline(f'Got URL - {url}')
        #
        # logs = self.driver.webdriver.get_log('browser')
        # messages = [log['message'] for log in logs]
        # request_failed = ('Failed to load resource:'
        #                   ' the server responded with a status of 404')
        #
        # if any(request_failed in message for message in messages):
        #     self.log_text.newline('Invalid response 404 for Gfycat URL')
        #     return

        res = requests.get(url)
        self.log_text.newline(f'Got URL - {url}')
        if res.status_code != 200:
            self.log_text.newline(f'Unexpected response code'
                                  f' ({res.status_code}) for Gfycat URL')
            return

        self.scraper.extract_gfycat_video(url)

    def process_tumblr_url(self, url):
        """
        Complete extra navigation step if necessary.
        Prep BeautifulSoup to be used in extraction.
        """
        self.driver.webdriver.get(url)
        self.driver.log_text.newline(f'Got URL - {url}')
        self.driver.confirm_tumblr_gdpr()

        # Wait for page to reload
        while True:
            soup = BeautifulSoup(self.driver.webdriver.page_source,
                                 features='html.parser')
            if config.tumblr_ascii_logo not in str(soup):
                break
            time.sleep(0.2)

        self.scraper.extract_tumblr_links(soup)

    def process_twitter_url(self, url):
        """
        Navigate to the Twitter URL and prep BeautifulSoup object.
        """
        self.driver.webdriver.get(url)
        self.driver.log_text.newline(f'Got URL - {url}')

        soup = BeautifulSoup(self.driver.webdriver.page_source,
                             features='html.parser')
        self.scraper.extract_twitter_images(soup)

    def download_files(self):
        """
        Wrapper to call the scraper's download method,
        to avoid arg weirdness with tkinter widget bindings.
        """
        if not self.scraper.download_links:
            return

        # Disable some widgets to not mess with running downloads
        self.disable_input_widgets()

        threading.Thread(target=self.scraper.download_files).start()
        # Intentionally block here to re-enable widgets only after this returns
        self.update_widgets()

        self.enable_input_widgets()

    def update_widgets(self):
        """
        Update the download tracking widgets while downloading
        and reset them when the downloads are finished.
        """
        self.download_tracking_bar['maximum'] = len(
            self.scraper.download_links)
        last_download = self.scraper.last_download
        finished_dls = 0

        while finished_dls < len(self.scraper.download_links):
            if self.scraper.last_download != last_download:
                last_download = self.scraper.last_download
                finished_dls += 1

                self.download_tracking_bar['value'] += 1
                self.download_tracking_label.configure(
                    text=f'Downloaded {finished_dls}'
                    f' / {len(self.scraper.download_links)} files')

        self.scraper.download_links = []
        self.scraper.display_links = []
        self.download_tracking_bar['value'] = 0

        self.download_tracking_label.configure(text='Downloaded 0 / 0 files')
        self.url_tracking_text.clear_text()

        self.log_text.newline('Reset tracking widgets')
        self.log_text.newline('.')

    def create_login_window(self):
        """
        Create a login window.
        """
        self.login = LoginWindow(self.driver)

        screen_width = self.root.winfo_screenwidth()
        screen_height = self.root.winfo_screenheight()
        window_x = int(screen_width / 2 - self.login.window_width * 0.5)
        window_y = int(screen_height * 0.25)

        self.login.title('Login')
        self.login.geometry(
            f'{self.login.window_width}x{self.login.window_height}'
            f'+{window_x}+{window_y}')
        self.login.resizable(width=False, height=False)
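The URL-acceptance logic in check_url hinges on the regex table built in __init__; a quick standalone sanity check (sample URLs are made up) shows that a post URL and a profile URL are routed differently, as the "only IG posts, not user's pages" comment intends, and that the profile name lands in group 1 as process_ig_profile_url relies on.

import re

ig_url_re = re.compile(r'^https://www\.instagram\.com/p/.+/')
ig_profile_url_re = re.compile(r'^https://www\.instagram\.com/(\w+|\d+)/$')

# An IG post URL matches the post regex but not the profile regex (made-up IDs).
assert ig_url_re.match('https://www.instagram.com/p/AbCdEf123/')
assert not ig_profile_url_re.match('https://www.instagram.com/p/AbCdEf123/')

# A profile URL is only picked up by the profile regex.
assert ig_profile_url_re.match('https://www.instagram.com/some_user/')
assert not ig_url_re.match('https://www.instagram.com/some_user/')

# The profile name is captured in group 1, which process_ig_profile_url uses
# to build the instadp.com URL.
m = ig_profile_url_re.match('https://www.instagram.com/some_user/')
assert m and m.group(1) == 'some_user'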
Example #6
    def scrape_main(self, jb):

        #job = business-analyst
        #city = omaha-ne
        #local = raw_input("Local or Web (L/W): ")
        #job = raw_input("Job title with + as spaces: ")

        #-----need a method to scrub job title --------

        job = jb.split()
        local = "W"
        #city = ""
        #pages = 1
        path1 = settings.MEDIA_ROOT
        #path1 = ''
        path2 = '/jobs_' + jb.replace(" ","_")
        path3 = '.txt'
        filename = path1 + path2 + path3

        #===== CHECK IF FILE EXISTS ========

        if not os.path.exists(filename):
            #===== CREATE FILE IF IT DOESN'T EXIST ========

            file = open(filename, 'w+')
            header = ';'.join(['Res', 'URL', 'Job_num', 'Job_Title', 'Company',
                               'Location', 'End_Date', 'Duration', 'Avg_Sal',
                               'Company_Prestige', 'Work_Description']) + '\n'
            file.write(header)


            #===== GRAB LIST OF RESUMES FOR INPUTS ABOVE ======

            # if job[1]:
            #     file.write(job[0] + "---" + job[(len(job)-1)] + jb )
            #     resume_links = Links.objects.filter(job_name__icontains=job[1])  #.filter(job_name__contains=job[1])
            #     file.write(resume_links[0])
            # else:
            #file.write('hello')

            resume_links = Links.objects.filter(job_name__icontains=jb.replace(" ","-"))

            #file.write(resume_links)
            #======= GRAB SPECIFIC RESUME BY HYPERLINK AND SCRAPE DATA ============

            pers = 0
            threadlist = []
            s2 = Scraper()

            for res in resume_links:
                print('Resume ' + res.job_url)
                pers = pers + 1

        #---------- THREADING CODE -----------------
            # --- so that we don't kick off more than 10 threads ------
                if pers % 10 == 0:
                    time.sleep(1)

                try:
                    t = Thread(target=s2.person, args=(res.job_url,pers,file))
                    t.start()
                    threadlist.append(t)

                except:
                    time.sleep(1)
                    try:
                        t = Thread(target=s2.person, args=(res.job_url,pers,file))
                        t.start()
                        threadlist.append(t)
                    except:
                        print("this person didn't work")

            # -- rejoining the threads -----

            for b in threadlist:
                b.join()

            file.close()

        #-------- DO PLOTTING -----------

        job_cluster = []

        if local == "L":
            print("local")
            #plots(filename)  -  ADD FOR LOCAL  - https://bitbucket.org/njs/rnumpy/wiki/API
        elif local == "W":
            r = Rcode()
            r_data = r.rots(filename, jb)
            med = r_data['median']
            for i in range(1,(med+2)):
                print("-----------JOB " + str(i) + " ----------------")
                plots = r_data['plot_files'][i-1]
                #plot_listing.append(plots)
                job_list = []
                for j in range(1,4):
                    print('--cluster ' + str(j) + "--")
                    job = r_data['jobz'][(i*j)-1]
                    job1 = tuple(job)
                    title = job1[3][0]
                    company = job1[4][0]
                    sal = job1[8][0]
                    #{'title' 'company''sal'}
                    tup = [title, company, sal, plots]
                    job_list.append(tup)

                job_cluster.append(job_list)


        return {'stat':"done", 'jobs_cluster':job_cluster, 'med':range((med+2))}
    # when doing multithreading you need to define a function and give separate memory allocation for each thread


#sc = ScrapeMain()
#sc.scrape_main('architect')
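The threading in scrape_main follows a throttled fan-out/join pattern: start a worker thread per resume link, pause every tenth thread, then join them all before closing the file. A small self-contained sketch of the same pattern (dummy worker and URLs, no Django or scraping involved):

import time
from threading import Thread


def worker(url, index):
    # Stand-in for Scraper.person(url, index, file): each thread runs the
    # target function with its own arguments.
    print("processing #%d %s" % (index, url))


urls = ["https://example.com/resume/%d" % n for n in range(25)]  # dummy data
threads = []
for i, url in enumerate(urls, start=1):
    if i % 10 == 0:
        time.sleep(1)  # crude throttle, as in scrape_main
    t = Thread(target=worker, args=(url, i))
    t.start()
    threads.append(t)

for t in threads:
    t.join()  # wait for all workers before using their combined output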
Example #7
def test_get_sijoittajan_tunnuslukuja_Controll(tester, company_id):
    scraper = Scraper(company_id)
    sijoittajan_tunnuslukuja = scraper.get_sijoittajan_tunnuslukuja()
    assert_tulostietoja(tester, company_id, sijoittajan_tunnuslukuja, 12)
Example #8
def test_get_maksuvalmius_Controll(tester, company_id):
    scraper = Scraper(company_id)
    maksuvalmius = scraper.get_maksuvalmius()
    assert_tulostietoja(tester, company_id, maksuvalmius, 3)
Example #9
def test_get_vakavaraisuus_Controll(tester, company_id):
    scraper = Scraper(company_id)
    vakavaraisuus = scraper.get_vakavaraisuus()
    assert_tulostietoja(tester, company_id, vakavaraisuus, 6)
Example #10
    def test_get_name(self):
        for company_id in some_company_ids:
            scraper = Scraper(company_id)
            name = scraper.get_name()
            self.assertIsInstance(name, str)
            self.assertGreater(len(name), 2)
Example #11
# -*- coding: utf-8 -*-
"""
@author: Muhammed
"""

from scraping import Scraper
from text_similarity import TextSimilarity

from flask import Flask, render_template, redirect, url_for, request
import pandas as pd

Scraper = Scraper()
TextSimilarity = TextSimilarity()

app = Flask(__name__)

categories = ["Cep Telefonu", "Dizüstü Bilgisayar", "Tablet"]
###
import time

###


@app.route("/", methods=["GET", "POST"])
def home():
    return render_template("index.html", categories=categories)


@app.route("/urun-bilgileri", methods=["POST", "GET"])
def urun_bilgileri():
    if request.method == "POST":
Example #12
def test_get_tunnuslukuja_Controll(tester, company_id):
    scraper = Scraper(company_id)
    tunnuslukuja = scraper.get_tunnuslukuja()
    tester.assertEqual(len(tunnuslukuja), 6)
    for key in tunnuslukuja:
        tester.assertIsInstance(tunnuslukuja[key], float)
Example #13
    def test_get_kuvaus(self):
        for company_id in some_company_ids:
            scraper = Scraper(company_id)
            kuvaus = scraper.get_kuvaus()
            self.assertIsInstance(kuvaus, str)
Example #14
    def test_get_kurssi(self):
        for company_id in some_company_ids:
            scraper = Scraper(company_id)
            kurssi = scraper.get_kurssi()
            self.assertIsInstance(kurssi, float)
Example #15
    def __init__(self, root):
        self.root = root

        self.left_frame = tk.Frame()
        self.url_label = tk.Label()
        self.url_entry = tk.Entry()
        self.check_button = tk.Button()
        self.url_check_label = tk.Label()
        self.start_dl_button = tk.Button()
        self.setup_left_frame()

        self.mid_frame = tk.Frame()
        self.url_tracking_label = tk.Label()
        self.url_tracking_text = ScrollText()
        self.setup_mid_frame()

        self.right_frame = tk.Frame()
        self.log_text = ScrollText()
        self.setup_right_frame()

        self.bottom_frame = tk.Frame()
        self.download_tracking_label = tk.Label()
        self.download_tracking_bar = ttk.Progressbar()
        self.setup_bottom_frame()

        # Initialise classes here so we can pass the logging widget
        self.scraper = Scraper(self.log_text, self.download_tracking_label)
        self.driver = Driver(self.log_text)
        self.driver.start_driver()  # Start webdriver to be used for scraping
        self.login = None

        # Lots of regexes to check the validity of wanted URLs
        # Make sure only IG posts are specified, not user's pages
        self.ig_url_re = re.compile(r'^https://www\.instagram\.com/p/.+/')
        self.ig_profile_url_re = re.compile(
            r'^https://www\.instagram\.com/(\w+|\d+)/$')
        self.general_img_re = re.compile(
            r'^https?://.+\..+\..+\.(?:jpg|png|gif)')
        self.imgur_re = re.compile(
            r'^https?://imgur\.com/(?:.)+$(?<!(png|gif|jpg))')
        self.youtube_re = re.compile(
            r'https://(?:www\.)?youtube\.com/watch\?v=.+')
        self.yt_re = re.compile(r'https://youtu\.be/.+')
        self.reddit_re = re.compile(
            r'https?://(?:www|old)\.reddit\.com/(?:r|u|user)/(\w+)/.+')
        self.reddit_fallback_re = re.compile(
            r'https://v\.redd\.it/.+\?source=fallback')
        self.gfycat_re = re.compile(r'https://gfycat\.com/\w+$(?<!-)')
        self.tumblr_re = re.compile(
            r'https://(.+)\.tumblr\.com/post/(\d+)(?:/.+)?')
        self.twitter_re = re.compile(r'https://twitter.com/.+/status/(\d+)')

        # Map URLs to the methods needed to extract the images in them
        # All of these methods take a single argument, the URL/text
        self.exprs = {
            self.ig_url_re: self.process_ig_url,
            self.ig_profile_url_re: self.process_ig_profile_url,
            self.general_img_re: self.process_general_url,
            self.imgur_re: self.process_imgur_url,
            self.youtube_re: self.process_yt_url,
            self.yt_re: self.process_yt_url,
            self.reddit_re: self.process_reddit_url,
            self.reddit_fallback_re: self.process_general_url,
            self.gfycat_re: self.process_gfycat_url,
            self.tumblr_re: self.process_tumblr_url,
            self.twitter_re: self.process_twitter_url,
        }
Example #16
import logging

import colorama
import yaml
# Local.
from scraping import Scraper

# Load config.
cp = "conf.yaml"  # Config Path string
with open(cp, encoding='utf-8') as cf:  # Config File object
    cd = yaml.load(cf, Loader=yaml.FullLoader)  # Config Dict

# Configure logging.
logging.basicConfig(filename=cd["TitleUrlLog"],
                    filemode="w",
                    level=logging.INFO,
                    format=f"\n {'-'*23} \n %(asctime)s %(message)s")
"""Overwrite log file on every interpreter (not script) launch."""

# Console.
colorama.init()
print(colorama.Fore.CYAN, end="")  # Set text color.
# print(colorama.Style.BRIGHT, end="")  # Set text brightness. Default: colorama.Style.NORMAL
s = Scraper(cd)
s.start()
print(f"Type {cd['QuitKw']} to quit.")
while s.is_alive():
    if input().strip().lower() == cd["QuitKw"]:
        s.quit()
        break
print(colorama.Style.RESET_ALL, end="")
colorama.deinit()
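The snippet only reads two keys from conf.yaml directly (Scraper(cd) presumably consumes more), so a minimal config would load into something like the dict below; the values are hypothetical placeholders.

# What yaml.load is expected to return for a minimal conf.yaml;
# only the keys read directly above are listed, and the values are made up.
cd_example = {
    "TitleUrlLog": "title_url.log",  # hypothetical log file path
    "QuitKw": "quit",                # hypothetical keyword that stops the scraper
}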
Example #17
def test_get_toiminnan_laajuus_Controll(tester, company_id):
    scraper = Scraper(company_id)
    toiminnan_laajuus = scraper.get_toiminnan_laajuus()
    assert_tulostietoja(tester, company_id, toiminnan_laajuus, 7)
Example #18
def test_get_kannattavuus_Controll(tester, company_id):
    scraper = Scraper(company_id)
    kannattavuus = scraper.get_kannattavuus()
    assert_tulostietoja(tester, company_id, kannattavuus, 7)
Example #19
        if not page.has_attrs():
            raise RuntimeError("Can't load: url = %s" % (page.url,))

        self._add_page(page)

    def _add_page(self, page):
        if not page.is_valid():
            return

        for link_page in page.create_similar_pages():
            code = link_page.get_code()
            if code in self.codes:
                continue
            self.page_stack.append(link_page)
            self.codes.add(code)


if __name__ == '__main__':
    init_logger()

    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--lang', required=True)  # ja,us
    args = parser.parse_args()
    args = vars(args)
    lang = args['lang']

    scrape_target = ArtistScrapeTarget(lang)
    scraper = Scraper(scrape_target)
    scraper.run()
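_add_page implements the usual crawl-frontier pattern: expand a page into similar pages, skip codes that are already known, and push the rest onto a stack. A stripped-down, self-contained illustration of the same idea, using a dummy link graph instead of real pages:

# Dedup-on-push frontier, mirroring page_stack/codes in _add_page above.
graph = {  # dummy link structure: code -> neighbouring codes
    'a': ['b', 'c'],
    'b': ['a', 'c', 'd'],
    'c': ['d'],
    'd': [],
}

page_stack = ['a']
codes = {'a'}
visited_order = []

while page_stack:
    code = page_stack.pop()
    visited_order.append(code)
    for neighbour in graph[code]:
        if neighbour in codes:
            continue  # already queued or visited, like the `code in self.codes` check
        page_stack.append(neighbour)
        codes.add(neighbour)

print(visited_order)  # ['a', 'c', 'd', 'b'] for this dummy graph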