def get_info():
    companyName = request.args.get("companyName")
    scrape = Scrape(companyName)
    number, driver = scrape.login()
    driver = scrape.select_company(driver, companyName)
    results = scrape.scrapy(driver)
    return jsonify(results)
def reset(self, array):
    self.p_filter = ''
    self.track_mode = 0
    self.Tracked = PokeList()
    self.Pokedex = PokeList()
    self.Scrape = Scrape()
    self.Pokedex.add_all(array)
def __init__(self, headers=None):
    if headers is None:
        # default headers
        headers = [
            'Origin: http://buscador.compras.imss.gob.mx',
            'Accept-Encoding: gzip, deflate',
            'Accept-Language: en-US,en;q=0.8,es;q=0.6',
            'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
            'Content-Type: application/x-www-form-urlencoded',
            'Accept: */*',
            'Referer: http://buscador.compras.imss.gob.mx/wrap/index.html',
            'X-Requested-With: WAJAF::Ajax - WebAbility(r) v5',
            'Connection: keep-alive',
            'DNT: 1'
        ]
    # initialize the object that handles the curl requests
    Scrape.__init__(self, headers)
    # use the mongo_client instantiated in the Scrape class;
    # getattr works like getattr(x, 'value') ==> x.value
    self.db = getattr(self.mongo_client, 'imss')
    self.compras = getattr(self.db, 'compras')
    self.raw = getattr(self.db, 'raw')
    # these are the page's default values; they can be changed,
    # but I won't, in order to keep a low profile
    self.numperpage = 20
    self.page = 1
    self.url = 'http://buscador.compras.imss.gob.mx/index.php'
    self.datos_default = '&type=compras&message=X&filtered=1&descripcion=&proveedor=&numcompra=&delegacion=values%3D&fecha=min%3D%3Bmax%3D&procedimiento=values%3D&exact=false&numperpage={}&page={}&order=fecha%20desc'
def testScrape(self):
    url = "https://www.bbc.com"
    s = Scrape()
    s.setUrl(url)
    keywords, links = s.scrape()
    self.assertTrue(keywords, msg='No Keywords found')
    self.assertTrue(links, msg='No links found')
def get_list():
    keyWord = request.args.get("keyWord")
    scrape = Scrape(keyWord)
    number, driver = scrape.login()
    tags, names = scrape.get_company_list(driver)
    name_list = list(names.keys())
    return jsonify({"num": number, "list": name_list})
def key_word():
    keyForm = KeyForm(request.form)
    companyForm = CompanyForm(request.form)
    key_word = None
    names = None
    company = None
    number = 0
    if keyForm.validate_on_submit():
        key_word = keyForm.key_word.data
        scrape = Scrape(key_word)
        number, driver = scrape.login()
        tags, names = scrape.get_company_list(driver)
    if companyForm.validate_on_submit():
        company = companyForm.company.data
        scrape = Scrape(company)
        number, driver = scrape.login()
        driver = scrape.select_company(driver, company)
        lists = scrape.scrapy(driver)
        return render_template('company.html', lists=lists)
    return render_template('index.html', keyForm=keyForm, companyForm=companyForm,
                           key_word=key_word, names=names, number=number, company=company)
def main():
    page = ('https://www.canyon.com/en-us/outlet'
            '?--wysiwyg_canyon_products-factoryoutlet%5B%40package%5D=wysiwyg.canyon.products'
            '&--wysiwyg_canyon_products-factoryoutlet%5B%40controller%5D=factoryoutlet'
            '&--wysiwyg_canyon_products-factoryoutlet%5B%40action%5D=road'
            '&--wysiwyg_canyon_products-factoryoutlet%5B%40format%5D=html')
    links = Scrape().scrape(page)
    if links:
        message = Message().format(links)
        Message().send(message)
def main():
    distance = '30'
    zip_ = '80223'
    min_price = '300'
    max_price = '1500'
    has_pic = '1'  # 0 to disable
    bundle = '1'   # 0 to disable
    main_search = ('https://denver.craigslist.org/search/bia'
                   '?hasPic={}&bundleDuplicates={}&search_distance={}'
                   '&postal={}&min_price={}&max_price={}').format(
                       has_pic, bundle, distance, zip_, min_price, max_price)
    post_list = Scrape.scrape_search_pg(main_search)
    results = []
    for post in post_list:
        post_Obj = Filter(post[0])
        result = Filter.quick_filter(post_Obj)
        if result:
            result = Filter.size_filter(post_Obj)
            if result:
                results.append(Keywords.score(post_Obj, Keywords.find(post_Obj)))
    if len(results) > 0:
        results = sorted(results, key=lambda x: x[2], reverse=True)
        Message.send(Message.format(results))
def home():
    if 'subreddit' in request.args and 'post_count' in request.args:
        subreddit = str(request.args['subreddit'])
        post_count = int(request.args['post_count'])
        # months_old = int(request.args['months_old'])
        months_old = 1
    else:
        return jsonify({"message": "Invalid request", 'code': 400})
    scrape_instance = Scrape(subreddit, months_old, post_count)
    subreddit_exists = scrape_instance.sub_exists()
    if subreddit_exists:
        return_data = scrape_instance.get_data()
        return jsonify({"data": return_data, 'code': 200})
    else:
        return jsonify({"message": "Subreddit does not exist", 'code': 400})
def scrape_and_index():
    prefix = 'WEBPAGES_RAW/'
    with open(prefix + 'bookkeeping.json', 'r') as file_handle:
        urls = json.load(file_handle)
    # index = Index()
    count = 0
    start_time = time.time()
    terms = []
    documents = []
    for key in urls:
        file_name = prefix + key
        # print 'Processing ', file_name, ' ', format(count/374.97, '.2f'), ' % done'
        s = Scrape(file_name, ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title', 'p'])
        # document = Document(doc=key, url=urls[key])
        document = {'doc': key, 'url': urls[key]}
        document_terms = s.parse_content(document)
        terms.extend(document_terms)
        documents.append(document)
        # s.parse_content()
        # tags = s.get_token_tags()
        # tokens = s.get_token_frequencies()
        # for token in tokens:
        #     index.update_index(token, key, tokens[token], list(tags[token]))
        count += 1
        if count % 500 == 0:
            write_to_disk(documents, terms)
            print '=' * 30
            print 'Documents parsed = ', format(count / 374.97, '.2f'), ' % done'
            print 'Write to Disk Successful'
            print 'Time taken (seconds)\t:\t:', str((time.time() - start_time))
            print '=' * 30
            documents = []
            terms = []
    print 'Time taken (seconds)\t:\t:', str((time.time() - start_time))
    write_to_disk(documents, terms)
def store_latest_URLs_db(latest_URLs):
    # Iterate through each of the URLs
    for url in latest_URLs:
        s = Scrape(url).pipeline()
        for info in s.get_main_info():
            new_article = Articles(section=info['article'],
                                   headline=info['headline'],
                                   main_text=info['main_text'],
                                   date_published=...,
                                   word_count=info['word_count'],
                                   hyper_link=...)
            # new_author = Authors(name=...,
            #                      position=...,
            #                      description=...,
            #                      section=...,
            #                      hyper_link=...)
            db.session.add(new_article)
            db.session.commit()
            print('New Article Added')
class Client:
    def __init__(self, apikey):
        # Initialize client with the Elsevier API key
        self._apikey = apikey

    @property
    def apikey(self):
        return self._apikey

    @apikey.setter
    def apikey(self, key):
        # Validator for API key
        self._apikey = key

    def geolocate(self, data):
        self._operation = Geolocate(data)
        self._operation.execute()

    def scrape(self, data):
        self._operation = Scrape(self._apikey, data)
        self._operation.execute()
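# A minimal usage sketch for the Client class above. The key and the shape of
# `data` are illustrative assumptions, not part of the source:
#
#   client = Client("YOUR-ELSEVIER-API-KEY")
#   client.scrape({"query": "machine learning"})    # runs a Scrape operation
#   client.geolocate({"address": "St. Louis, MO"})  # runs a Geolocate operation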
def populate(self):
    scrape_details = Scrape_Info.select(Scrape_Info, Site).join(Site)
    columns = Job._meta.sorted_field_names[1:]
    for details in scrape_details:
        scrape = Scrape(details.url)
        site = details.site_id
        links = scrape.get_selector(site.link_selector, all=True)
        selectors = [
            site.date_selector, site.city_selector, site.title_selector,
            site.location_selector, site.company_selector,
            site.content_selector, site.image_selector
        ]
        foreign_keys = [details.site_id.id, details.category_id.id]
        data = []
        updated = False
        for link in links:
            if Job.select().where(Job.link == link).exists():
                updated = True
                break
            scrape_link = Scrape(link)
            offer = [link]
            for selector in selectors:
                offer.append(scrape_link.get_selector(selector))
            offer = foreign_keys + offer
            data.append(dict(zip(columns, offer)))
            time.sleep(0.5)
        if updated is False:
            with db.atomic():
                Job.insert_many(list(reversed(data))).execute()
def quick_filter(self):
    title = word_tokenize(self.title)
    if set(title) & set(Filter.all_) and \
            not set(title) & set(Filter.bad):
        item_info = Scrape.scrape_post_pg(self.url)
        self.price = item_info[1]
        self.post = self.post + list(item_info)
        self.size = item_info[2]
        return self
def getPages(url, maxPages):
    # this list will be used to store the reviews
    reviews_list = []
    # this driver is what controls google chrome
    driver = webdriver.Chrome(ChromeDriverManager().install())
    # tracks the page numbers
    pageNumber = 0
    # this counter is used to check if the page is empty because of a glitch that sometimes occurs;
    # if three empty pages are found the code assumes that no reviews are left
    emptyPages = 0
    while pageNumber >= 0 and pageNumber < int(maxPages) and emptyPages < 3:
        # increments the page number
        pageNumber += 1
        # goes to the page using the url and page number
        driver.get(url + str(pageNumber) + ".htm")
        # sets up the page to be parsed
        html = driver.page_source
        soup = bs4.BeautifulSoup(html, 'html.parser')
        # finds every review on the page
        reviews = soup.find_all("li", "empReview")
        # checks if the page has reviews
        if not reviews:
            emptyPages += 1  # increments the empty page count since there are no reviews on the page
        else:
            emptyPages = 0  # resets the empty page counter in case the last page was empty
        # this for loop goes through the list of reviews
        for review in reviews:
            # calls scrape_review from the Scrape class for each review and adds
            # the result to the list declared at the top of this function
            reviews_list.append(Scrape.scrape_review(review))
    driver.close()  # closes the chrome tab
    return reviews_list  # returns the list of reviews
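# A hypothetical call to getPages above. The function appends the page number
# and ".htm" itself, so `url` should be the review-list prefix; the exact
# Glassdoor-style URL here is an illustrative assumption:
#
#   reviews = getPages("https://www.glassdoor.com/Reviews/Example-Reviews-E12345_P", "5")
#   print(len(reviews))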
def runBot():
    loop = asyncio.get_event_loop()
    theClient = bot.BotClient()
    commands = Commands(theClient)
    # levels = UserLevels(theClient)
    scrape = Scrape(theClient)
    saved = SaveLinks(theClient)
    messages = Messages(theClient)
    coins = Coins(theClient)
    try:
        loop.run_until_complete(theClient.start(Token))
    except KeyboardInterrupt:
        loop.run_until_complete(theClient.close())
    finally:
        loop.close()
    if theClient.restart == True:
        if asyncio.get_event_loop().is_closed():
            asyncio.set_event_loop(asyncio.new_event_loop())
        reload(bot)
        runBot()
class Pokedex(wx.Frame):
    def __init__(self, parent, id, title):
        wx.Frame.__init__(self, parent,
                          style=wx.DEFAULT_FRAME_STYLE ^ wx.RESIZE_BORDER ^ wx.MAXIMIZE_BOX)
        self.list_pokeweak = []
        file = open('pokedata.cfg', 'r')
        all_data_str = file.read()
        file.close()
        if all_data_str == '':
            file = open('pokedata.default.cfg', 'r')
            all_data_str = file.read()
            file.close()
            file = open('pokedata.cfg', 'w')
            file.write(all_data_str)
            file.close()
        self.all_data = ast.literal_eval(all_data_str)
        self.reset(self.all_data)
        self.typestrings = ["Quad", "Double", "Normal", "Half", "Fourth", "Immune"]
        self.colors = ['#E8E8E8', '#D0D0D0', '#B8B8B8', '#A0A0A0', '#888888', '#707070']
        self.icon = wx.Icon('Images/pokeball.ico', wx.BITMAP_TYPE_ICO)
        self.SetIcon(self.icon)
        self.SB1 = wx.StaticBox(self, -1, 'Stats')
        self.SB1.SetBackgroundColour('white')
        self.CB = wx.ComboBox(self, -1, value='', choices=['Name', 'Type'],
                              style=wx.CB_READONLY, size=(103, -1))
        self.CB.SetSelection(0)
        # create menubar
        self.menubar = wx.MenuBar()
        # create file menu
        self.filemenu = wx.Menu()
        self.m_stayontop = self.filemenu.Append(wx.ID_ANY, 'Stay on Top [Off]\tCtrl-S', '')
        self.m_resize = self.filemenu.Append(wx.ID_ANY, 'Resizable [Off]\tCtrl-E', '')
        self.m_track = self.filemenu.Append(wx.ID_ANY, 'Show Tracked\tCtrl-T', '')
        self.m_rescrape = self.filemenu.Append(wx.ID_ANY, 'Rescrape Html\tCtrl-R', '')
        self.m_close = self.filemenu.Append(wx.ID_ANY, 'Close\tCtrl-Q', '')
        self.menubar.Append(self.filemenu, 'File')
        # bind events to menu options
        self.Bind(wx.EVT_MENU, self.stay_on_top, self.m_stayontop)
        self.Bind(wx.EVT_MENU, self.resizable, self.m_resize)
        self.Bind(wx.EVT_MENU, self.toggle_track_mode, self.m_track)
        self.Bind(wx.EVT_MENU, self.rescrape, self.m_rescrape)
        self.Bind(wx.EVT_MENU, self.on_close, self.m_close)
        # Creating list that will store all pokemon
        self.LC = wx.ListCtrl(self, -1, style=wx.LC_REPORT | wx.LC_SINGLE_SEL)
        # Creating list for weaknesses
        self.LC2 = AutoWidthListCtrl(self)
        # Inserting columns into that list of pokemon
        self.LC.InsertColumn(0, 'Pokemon', format=wx.LIST_FORMAT_LEFT, width=90)
        self.LC.InsertColumn(1, 'Type', format=wx.LIST_FORMAT_CENTER, width=100)
        self.LC.InsertColumn(2, 'Total', format=wx.LIST_FORMAT_CENTER, width=45)
        self.LC.InsertColumn(3, 'HP', format=wx.LIST_FORMAT_CENTER, width=30)
        self.LC.InsertColumn(4, 'Atk', format=wx.LIST_FORMAT_CENTER, width=30)
        self.LC.InsertColumn(5, 'Def', format=wx.LIST_FORMAT_CENTER, width=30)
        self.LC.InsertColumn(6, 'SpAtk', format=wx.LIST_FORMAT_CENTER, width=44)
        self.LC.InsertColumn(7, 'SpDef', format=wx.LIST_FORMAT_CENTER, width=44)
        self.LC.InsertColumn(8, 'Spd', format=wx.LIST_FORMAT_CENTER, width=35)
        self.LC2.InsertColumn(0, 'Damage Taken [(2 ^ Row) * 1/4]',
                              format=wx.LIST_FORMAT_RIGHT, width=355)
        # creating boxsizers
        mainbox = wx.BoxSizer(wx.VERTICAL)
        statsbox = wx.BoxSizer(wx.HORIZONTAL)
        imagebox = wx.StaticBoxSizer(self.SB1, wx.HORIZONTAL)
        defensebox = wx.BoxSizer(wx.HORIZONTAL)
        box = wx.BoxSizer(wx.VERTICAL)
        hbox = wx.BoxSizer(wx.HORIZONTAL)
        # creating initial image for imagebox
        self.image = wx.StaticBitmap(self, -1, wx.Bitmap('Images/0.png', wx.BITMAP_TYPE_PNG))
        self.input = wx.TextCtrl(self, value='', size=(270, -1))
        self.SetBackgroundColour('pink')
        # Adding text entry and combobox to a hbox, then adding it to the box
        hbox.Add(self.input, wx.EXPAND | wx.ALIGN_LEFT)
        hbox.Add(self.CB, wx.ALIGN_CENTER)
        box.Add(hbox, flag=wx.ALL)
        # Adding list that will store all pokemon
        box.Add(self.LC, flag=wx.ALL | wx.EXPAND)
        # Adding image area to imagebox
        imagebox.Add(self.image, flag=wx.ALIGN_CENTER | wx.ALL)
        defensebox.Add(self.LC2, flag=wx.ALIGN_CENTER | wx.EXPAND)
        # Adding image to boxsizer
        statsbox.Add(imagebox, flag=wx.ALIGN_CENTER | wx.ALL)
        # Adding boxsizer to mainbox
        box.Add(statsbox, flag=wx.ALIGN_CENTER | wx.EXPAND)
        self.SetMenuBar(self.menubar)
        statsbox.Add(defensebox, flag=wx.ALIGN_CENTER | wx.EXPAND)
        mainbox.Add(box, flag=wx.ALL | wx.EXPAND)
        # Bind Events to search bar
        self.Bind(wx.EVT_TEXT, self.search, self.input)
        self.Bind(wx.EVT_CHAR_HOOK, self.on_keyboard_search, self.input)
        # Bind Events to Search Combobox
        self.Bind(wx.EVT_COMBOBOX, self.search, self.CB)
        self.Bind(wx.EVT_CHAR_HOOK, self.on_keyboard_combobox, self.CB)
        # Bind Events to Pokemon ListCtrl
        self.Bind(wx.EVT_LIST_ITEM_FOCUSED, self.set_information, self.LC)
        self.Bind(wx.EVT_LIST_KEY_DOWN, self.on_keyboard_list, self.LC)
        self.Bind(wx.EVT_LIST_ITEM_RIGHT_CLICK, self.track, self.LC)
        self.Bind(wx.EVT_LIST_ITEM_ACTIVATED, self.track, self.LC)
        self.Bind(wx.EVT_TEXT_ENTER, self.track, self.input)
        # Resize and Refresh the list
        self.SetSizer(mainbox)
        mainbox.Fit(self)
        self.refresh('')

    def on_keyboard_search(self, event):
        if event.GetKeyCode() == wx.WXK_DOWN:
            if self.LC.GetItemCount():
                self.LC.Focus(0)
                self.LC.SetFocus()
        elif event.GetKeyCode() == wx.WXK_UP:
            # Do Nothing
            pass
        elif event.GetKeyCode() == wx.WXK_TAB:
            self.CB.SetFocus()
        else:
            # Handle event default
            event.Skip()

    def on_keyboard_list(self, event):
        if event.GetKeyCode() == wx.WXK_UP:
            index = self.LC.GetFocusedItem()
            if index == 0:
                self.input.SetFocus()
        elif event.GetKeyCode() == wx.WXK_TAB:
            self.input.SetFocus()
        else:
            # Handle event default
            event.Skip()

    def on_keyboard_combobox(self, event):
        if event.GetKeyCode() == wx.WXK_TAB:
            self.input.SetFocus()
        else:
            # Handle event default
            event.Skip()

    def on_close(self, event):
        self.Destroy()

    def reset(self, array):
        self.p_filter = ''
        self.track_mode = 0
        self.Tracked = PokeList()
        self.Pokedex = PokeList()
        self.Scrape = Scrape()
        self.Pokedex.add_all(array)

    def stay_on_top(self, event):
        if self.ToggleWindowStyle(flag=wx.STAY_ON_TOP):
            self.m_stayontop.SetItemLabel('Stay on Top [On]\tCtrl-S')
        else:
            self.m_stayontop.SetItemLabel('Stay on Top [Off]\tCtrl-S')

    def resizable(self, event):
        if self.ToggleWindowStyle(flag=wx.RESIZE_BORDER):
            self.m_resize.SetItemLabel('Resizable [On]\tCtrl-E')
        else:
            self.m_resize.SetItemLabel('Resizable [Off]\tCtrl-E')

    def track(self, event):
        if self.LC.GetItemCount():
            index = self.LC.GetFocusedItem()
            if index == -1:
                index = 0
            ListItem = self.LC.GetItem(index, 0)
            Name = ListItem.GetText()
            if self.track_mode:
                self.Tracked.remove(Name)
                filter = self.input.GetValue().strip().lower()
                self.refresh(filter)
            else:
                pokemon = self.Pokedex.get(Name)
                self.Tracked.add(pokemon)
            if self.LC.GetItemCount() == 0:
                self.input.SetFocus()

    def toggle_track_mode(self, event):
        if self.track_mode:
            self.m_track.SetItemLabel('Show Tracked\tCtrl-T')
            self.track_mode = 0
        else:
            self.m_track.SetItemLabel('Show All\tCtrl-T')
            self.track_mode = 1
        self.input.SetValue('')
        self.refresh('')

    def set_item_color(self, index, color1, color2):
        if index % 2:
            self.LC.SetItemBackgroundColour(index, color1)
        else:
            self.LC.SetItemBackgroundColour(index, color2)

    def set_items(self, set):
        self.LC.DeleteAllItems()
        for index, pokemon in enumerate(set):
            self.append_item(pokemon)
            self.set_item_color(index, 'pink', 'white')

    def add_items(self, set):
        column = self.CB.GetSelection()
        i, max_i = (0, self.LC.GetItemCount())
        j, max_j = (0, len(set))
        while True:
            if j == max_j:
                break
            elif i == max_i:
                self.insert_item(i, set[j])
                max_i += 1
            else:
                ListItem = self.LC.GetItem(i, column)
                ItemText = ListItem.GetText()
                if not set[j].get_name() == ItemText:
                    self.insert_item(i, set[j])
                    max_i += 1
            self.set_item_color(i, 'pink', 'white')
            i, j = (i + 1, j + 1)

    def refresh(self, filter):
        if self.track_mode:
            if filter == '':
                subset = self.Tracked.get_all()
            elif self.CB.GetSelection():
                subset = self.Tracked.type_filter(filter)
            else:
                subset = self.Tracked.name_filter(filter)
        else:
            if filter == '':
                subset = self.Pokedex.get_all()
            elif self.CB.GetSelection():
                subset = self.Pokedex.type_filter(filter)
            else:
                subset = self.Pokedex.name_filter(filter)
        if subset[0].get_name() == '???':
            self.LC.DeleteAllItems()
        elif self.p_filter in filter:
            self.set_items(subset)
        else:
            self.add_items(subset)
        self.p_filter = filter
        self.SB1.SetLabel(subset[0].get_name())
        self.set_image(subset[0].get_image())
        self.set_weaknesses(subset[0])
        self.LC.Select(0, on=1)

    def insert_item(self, index, pokemon):
        self.LC.InsertItem(index, pokemon.get_name())
        self.LC.SetItem(index, 1, pokemon.get_type())
        self.LC.SetItem(index, 2, pokemon.get_total())
        self.LC.SetItem(index, 3, pokemon.get_hp())
        self.LC.SetItem(index, 4, pokemon.get_atk())
        self.LC.SetItem(index, 5, pokemon.get_def())
        self.LC.SetItem(index, 6, pokemon.get_spatk())
        self.LC.SetItem(index, 7, pokemon.get_spdef())
        self.LC.SetItem(index, 8, pokemon.get_spd())

    def append_item(self, pokemon):
        self.LC.Append(pokemon.get())

    def set_information(self, event):
        self.LC2.DeleteAllItems()
        ListItem = self.LC.GetItem(self.LC.GetFocusedItem(), 0)
        Name = ListItem.GetText()
        pokemon = self.Pokedex.get(Name)
        self.set_weaknesses(pokemon)
        self.SB1.SetLabel(pokemon.get_name())
        self.set_image(pokemon.get_image())

    def set_weaknesses(self, pokemon):
        self.LC2.DeleteAllItems()
        self.LC2.InsertItem(0, pokemon.get_quad())
        self.LC2.SetItemBackgroundColour(0, self.colors[0])
        self.LC2.InsertItem(1, pokemon.get_double())
        self.LC2.SetItemBackgroundColour(1, self.colors[1])
        self.LC2.InsertItem(2, pokemon.get_normal())
        self.LC2.SetItemBackgroundColour(2, self.colors[2])
        self.LC2.InsertItem(3, pokemon.get_half())
        self.LC2.SetItemBackgroundColour(3, self.colors[3])
        self.LC2.InsertItem(4, pokemon.get_fourth())
        self.LC2.SetItemBackgroundColour(4, self.colors[4])
        self.LC2.InsertItem(5, pokemon.get_immune())
        self.LC2.SetItemBackgroundColour(5, self.colors[5])
        self.LC2.resizeLastColumn(-1)

    def set_image(self, image):
        self.image.SetBitmap(image)

    def search(self, event):
        filter = self.input.GetValue().strip().lower()
        self.refresh(filter)

    def rescrape(self, event):
        self.Hide()
        self.mySplash = wx.adv.SplashScreen(
            app.myBitmap,
            wx.adv.SPLASH_NO_TIMEOUT | wx.adv.SPLASH_CENTER_ON_SCREEN,
            -1, None)
        self.mySplash.Show()
        self.all_data = self.Scrape.scrape()
        self.reset(self.all_data)
        self.refresh('')
        self.mySplash.Destroy()
        self.Show()
if len(sys.argv) < 3:
    sys.exit("please provide page start and end numbers \n " + info)
# currently we are counting down to 1..
page_min = int(sys.argv[1])
page_max = int(sys.argv[2])
posts_limit = 10  # only publish this many to WP at a time
base_url = 'http://hirise.lpl.arizona.edu/releases/all_captions.php'
# base_url_wallpapers = 'http://hirise.lpl.arizona.edu/'
base_url_wallpapers = 'http://static.uahirise.org/images/wallpaper/'
local_img_dir = '/app/tmp/'

# set up some tools
scrape = Scrape(base_url=base_url,
                local_img_dir=local_img_dir,
                base_url_wallpapers=base_url_wallpapers)
wp_publish = WPPublish()
if not debug:
    previously_published = wp_publish.get_all_published()
else:
    previously_published = []

# grab links to all the detail pages we need
all_detail_page_urls, urls_by_page = scrape.grab_all_page_urls(page_min, page_max)

# set to False if you don't want to publish to WordPress;
# this will also cause it to ignore the previously published list

# grab content from each page and publish to the API, and perhaps to WP too
post_count = 0
last_page = 0
def get_latest_URLs():
    scrape_latest = Scrape(Defaults.main_url)
    scrape_latest_soup = scrape_latest.get_soup()
    latest_URLs = find_latest_URLs(scrape_latest_soup['soup'])
    return latest_URLs
def db_check(self):
    outdated = True
    while outdated:
        # Try table read, otherwise create
        try:
            connection = sqlite3.connect("posts.db")
            cursor = connection.cursor()
            sql = f"""SELECT * FROM "{self.handle}" """
            cursor.execute(sql)
            result = cursor.fetchall()
            connection.close()
            # Try table read, otherwise populate
            try:
                # print(result)
                # print(result[-1])
                last_location_data = result[-1][0]
                print(f"Database last recorded location: {last_location_data}")
                last_location = ScrapeLast(self.handle).get_last()[0]
                print(f"Instagram last posted location: {last_location}")
                # Check up to date
                if last_location_data == last_location:
                    print("Database is up to date!")
                    outdated = False
                else:
                    print("Uh oh! we need to update table")
                    # scrape latest data
                    print(f"Starting Scrape: @{self.handle}")
                    data = Scrape(self.handle).get_locations()
                    locations = data[0]
                    links = data[1]
                    latitudes = data[2]
                    longitudes = data[3]
                    # print(locations[0])
                    connection = sqlite3.connect("posts.db")
                    for i in range(0, len(locations), 1):
                        try:
                            sql = f"""INSERT INTO `{self.handle}` (`location`,`link`,`latitude`,`longitude`) VALUES ("{locations[i]}","{links[i]}","{latitudes[i]}","{longitudes[i]}")"""
                            # print(sql)
                            connection.execute(sql)
                            connection.commit()
                            # print("Table populated")
                            # connection.close()
                        except:
                            pass
                    print("Table updated")
                    outdated = False
                    connection.close()
                    print("connection closed")
            except:
                print("Uh oh! we need to populate the table for the first time")
                # scrape latest data
                print(f"Starting Scrape: @{self.handle}")
                data = Scrape(self.handle).get_locations()
                locations = data[0]
                links = data[1]
                latitudes = data[2]
                longitudes = data[3]
                # print(locations[0])
                connection = sqlite3.connect("posts.db")
                for i in range(0, len(locations), 1):
                    try:
                        sql = f"""INSERT INTO `{self.handle}` (`location`,`link`,`latitude`,`longitude`) VALUES ("{locations[i]}","{links[i]}","{latitudes[i]}","{longitudes[i]}")"""
                        # print(sql)
                        connection.execute(sql)
                        connection.commit()
                        # print("Table populated")
                        # connection.close()
                    except:
                        pass
        # create table
        except:
            print("table needs to be created")
            # create table
            sql = f"""CREATE TABLE "{self.handle}" ("location" REAL UNIQUE, "link" REAL UNIQUE, "latitude" REAL UNIQUE, "longitude" REAL UNIQUE);"""
            # print(sql)
            connection = sqlite3.connect("posts.db")
            connection.execute(sql)
            connection.commit()
            connection.close()
            print("Table created")
class Verify(object):
    # Initialize the logpath and server name obtained from the config file
    def __init__(self, logpath, server):
        self.logpath = logpath
        self.timestamp = ''  # Time stamp will be updated when the file is created
        self.scrape = Scrape(server)
        self.rule = Rule()

    # Send an email by referring to the dictionary that includes the list of users
    # for which a discrepancy was found for a particular rule type
    def sendEmail(self, UserRuleDict):
        # Send an email if at least one discrepancy was found
        msgbody = ''
        if len(UserRuleDict) > 0:
            for rule, userlist in UserRuleDict.items():
                msgbody += ' \n\n*** Dashboard data does not match GNATS for rule type %(1)s for following users: *** \n%(2)s' \
                    % {"1": rule, "2": ', '.join(user for user in userlist)}
            msgbody += '\n\nRefer to the following log file for details: \n%s/DAM_%s.log' % (
                self.logpath, self.timestamp)
            EmailAlert().send('*****@*****.**', ['*****@*****.**'], 'DAM Alert', msgbody)

    '''
    Run the logic to compare the list of PRs found in Dashboard and the one generated from Gnats.
    Log all the details and, if a discrepancy is found, send an email alert to the concerned users.
    '''

    def run(self, usernamelist, rulelist):
        self.timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        logfile = '%s/DAM_%s.log' % (self.logpath, self.timestamp)
        logging.basicConfig(filename=logfile, filemode='w', format='%(message)s', level=logging.WARNING)
        hdlr = logging.FileHandler(logfile)
        try:
            discrepancyUserdict = {}
            for ruletype in rulelist:
                for user in usernamelist:
                    logline = '-----------------------------------------------------------------------------------'
                    print logline
                    logging.critical(logline)
                    logline = 'Verifying for user ||%(1)s|| for ||%(2)s|| type of PRs: ' % {"1": user, "2": ruletype}
                    print logline
                    logging.critical(logline)
                    gnatsPRlist = self.rule.getPRlist(user, ruletype)
                    dashPRlist = self.scrape.getPRList(user, ruletype)
                    discrepancylist1 = list(set(gnatsPRlist) - set(dashPRlist))
                    discrepancylist2 = list(set(dashPRlist) - set(gnatsPRlist))
                    logging.critical('%s:' % datetime.datetime.now().strftime("%A, %d - %B %Y %I:%M%p"))
                    logline = 'Count of PRs from GNATS: %d' % len(gnatsPRlist)
                    print logline
                    logging.critical(logline)
                    logline = 'Count of PRs from Dashboard: %d' % len(dashPRlist)
                    print logline
                    logging.critical(logline)
                    logline = 'List of PRs missing in dashboard: [%s] ' % ','.join(d for d in discrepancylist1)
                    print logline
                    logging.critical(logline)
                    logline = 'List of PRs additionally found in dashboard: [%s]' % ','.join(d for d in discrepancylist2)
                    print logline
                    logging.critical(logline)
                    if len(discrepancylist1 + discrepancylist2) > 0:
                        print 'Checking for recently updated PRs and discarding false negatives from the list...'
                        finalDescList = self.removeRecentPRs(discrepancylist1 + discrepancylist2)
                        if len(finalDescList) > 0:
                            logline = '\nDiscrepancies found after removing recently updated PRs: [%s] ' % ','.join(f for f in finalDescList)
                            discrepancyUserdict.setdefault(ruletype, []).append(user)
                        else:
                            logline = '\nDiscrepancies found after removing recently updated PRs: None'
                        print logline
                        logging.critical(logline)
                    else:
                        logline = '\nNo discrepancies found at all'
                        print logline
                        logging.critical(logline)
            # alert admin by sending an autogenerated email:
            if len(discrepancyUserdict) > 0:
                self.sendEmail(discrepancyUserdict)
            # close the log file
            hdlr.close()
        except Exception, e:
            logging.exception(e)
def scrape(self, data):
    self._operation = Scrape(self._apikey, data)
    self._operation.execute()
dates.next_day()
dates.driver.close()
total, counter = len(set(games)), 0
games = [
    i for i in games
    if i not in [
        j.replace('\n', '')
        for j in open('games_inserted_already.txt', 'r').readlines()
    ]
]
with open('queries.txt', 'w') as queries:
    for game in set(games):
        a = Scrape(game)
        # The below is to create tables gameData.datetime, gameData.flags,
        # gameData.game, gameData.officialScorer, gameData.primaryDatacaster,
        # gameData.status, gameData.weather
        try:
            b = jsonParse(a.get_raw_data()['gameData']['datetime'])
            b.parse()
            c = Table('datetime', 'gameData')
            c.add_data(b.keys, b.values)
            queries.write(c.insert(a.id))
        except:
            print((game, "Issue with datetime"))
        try:
            b = jsonParse(a.get_raw_data()['gameData']['flags'])
def __init__(self, logpath, server):
    self.logpath = logpath
    self.timestamp = ''  # Time stamp will be updated when the file is created
    self.scrape = Scrape(server)
    self.rule = Rule()
def read_message(self):
    # queries websites on a topic, puts the output in the message file and returns it for email structuring
    Scrape(self.technology).upwork()
    with open("message.txt", 'r', encoding='utf-8') as template_file:
        message_template = template_file.read()
    return message_template
import requests  # used below to download each page
from bs4 import BeautifulSoup  # turns raw web pages into an object hierarchy and provides selectors
from urlparse import urljoin
# from urlunshort import resolve
import re  # regular expression module for matching, parsing
import csv  # simplifies the process of writing data to csv
import nltk  # natural language toolkit module for quick analysis
import cStringIO  # string buffer

from scrape import Scrape

# a list of URLs about venture capital, investing, data stores
allowedDomains = ["bizjournals.com", "stltoday.com", "stlpublicradio.org", "alivemag.com",
                  "stlamerican.com", "techli.com", "stlregionalchamber.com", "cbslocal.com",
                  "ktrs.com", "ksdk.com", "kmov.com", "fox2now.com", "kplr11.com"]
pagesToScrape = ["http://www.bizjournals.com/stlouis/blog/biznext/2015/07/10-biggest-funding-rounds-for-startups-so-far-this.html",
                 "http://www.bizjournals.com/stlouis/news/",
                 "http://www.stltoday.com/",
                 "http://news.stlpublicradio.org/#stream/0",
                 "http://www.alivemag.com/",
                 "http://www.stlamerican.com/",
                 "http://techli.com/#.",
                 "http://www.stlregionalchamber.com/who-we-are/chamber-blog",
                 "http://stlouis.cbslocal.com/station/kmox/",
                 "http://www.ktrs.com/",
                 "http://www.ksdk.com/",
                 "http://www.kmov.com/",
                 "http://fox2now.com/",
                 "http://kplr11.com/",
                 "http://www.biospace.com/",
                 "http://medcitynews.com/",
                 "http://www.fiercebiotech.com/",
                 "http://blogs.wsj.com/venturecapital/page/2/",
                 "http://techcrunch.com/",
                 "http://venturebeat.com/",
                 "http://www.bloomberg.com/",
                 "http://www.americanentrepreneurship.com/",
                 "http://siteselection.com/",
                 "http://businessfacilities.com/",
                 "http://www.tradeandindustrydev.com/",
                 "http://www.sec.gov/edgar.shtml",
                 "https://www.sec.gov/about/forms/formd.pdf",
                 "http://www.edgar-online.com/"]

firstScrape = Scrape(pagesToScrape)
for index, URL in enumerate(firstScrape.getDictionary()):
    output = cStringIO.StringIO()
    webpage = requests.get(URL)
    content = webpage.content
    soup = BeautifulSoup(content, 'html.parser')
    # print soup.body.text    # good line, gives good text info
    # print soup.title.text   # also a decent line here for a quick line-up of titles
    # print soup.select("td") # good, produces a lot of information but it is all in html as <td ...>, need to parse further
    # print soup.body
    output.write("<scrape id=\"" + firstScrape.getAddress() + ":" + str(index) + "\" baseURI=\"" + URL + "\">\n")
    for link in soup.find_all("a", href=True):
        output.write(link['href'])
        output.write('\n')
    output.write("</scrape>\n")
    firstScrape.writeScrape(output)
def geolocate(self, data):
    self._operation = Geolocate(data)
    self._operation.execute()
    # tail of the symbol-lookup helper (the enclosing def is not shown in this excerpt)
    df_final = pd.read_sql(sql_query, engine)
    df_final.to_csv(file_path + 'SQL_generated.csv')
    if source == 'Quandl':
        return [str(item) for item in list(df_final['Code'])]
    elif source == 'Yahoo':
        return [str(item) for item in list(df_final['Ticker'])]


if __name__ == '__main__':
    current_time = datetime.datetime.now().time()
    print 'Start time:' + str(current_time)
    scrape = Scrape()
    sym = SymbolDb()
    # Refresh symbol files from Quandl link
    # scrape.scrape_quandl_codes_us()
    # scrape.scrape_quandl_cboe_data()
    # change total pages to scrape in the function above
    # scrape.scrape_finviz_codes_overview(7141, 20)
    # scrape.scrape_finviz_codes_overview()
    # Merge all the symbol files from finviz and quandl into the SQLite database
    # sym.merge_symbol_files_to_db()
    # Returns the final table from the database
def get_collapse_content(self):
    html = super(Collapse, self).get_html()
    strain = super(Collapse, self).strain_by_id(self.strain_id)
    soup = Scrape.get_soup(html, strain)
    return soup
if __name__ == '__main__':
    # Command line arguments
    parser = argparse.ArgumentParser(description='Scrape')
    parser.add_argument('url', type=str, help='URL')
    parser.add_argument('--debug', action='store_true', help='include debug output')
    args = parser.parse_args()

    # Setup logging
    sh = logging.StreamHandler(sys.stdout)
    if args.debug:
        sh.setLevel(logging.DEBUG)
    else:
        sh.setLevel(logging.INFO)
    sh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    root_log.addHandler(sh)

    # Init scraper
    scrape = Scrape(ZivityScraper(root_log), root_log)

    # Download page
    if os.path.exists(args.url):
        scrape.scrape_file(args.url)
    else:
        scraped = set()
        to_scrape = set([args.url])
        while len(to_scrape):
            current_url = to_scrape.pop()
            to_scrape.update(scrape.scrape_url(current_url))
            scraped.add(current_url)
            to_scrape -= scraped
def get_summary_content(self):
    html = Scrape.get_html(self)
    strain = Scrape.strain_by_id(self.summary_id)
    soup = Scrape.get_soup(html, strain)
    return soup
# MONGO_HOST = os.environ["MONGO_HOST"]
# MONGO_PORT = os.environ["MONGO_PORT"]
# MONGO_USER = os.environ["MONGO_USER"]
# MONGO_PASS = os.environ["MONGO_PASS"]
# MONGO_NAME = os.environ["MONGO_NAME"]
# MONGO_COLLECTION = os.environ["MONGO_COLLECTION"]

# Initialize database connection
# client = MongoClient(MONGO_HOST, int(MONGO_PORT))
# db = client[MONGO_NAME]
# db.authenticate(MONGO_USER, MONGO_PASS)
# urls_collection = db[MONGO_COLLECTION]

# Initialize scraper
s = Scrape()

# Initialize mpi cluster variables
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

# Initialize root set of sources
sources_file = open(SOURCES_LOC, "r")
sources = sources_file.readlines()
explored = set()
uniqueKeywords = set()

# Initialize stopTime
if len(sys.argv) == 2:
    stopTime = time.time() + int(sys.argv[1])
def __init__(self, url):
    self.summary_id = "tableSummaryHeader"
    self.filter_class = "strongRow"
    self.table_class = "summaryTable"
    Scrape.__init__(self, url)