def get_info():
    companyName = request.args.get("companyName")
    scrape = Scrape(companyName)
    number, driver = scrape.login()
    driver = scrape.select_company(driver, companyName)
    results = scrape.scrapy(driver)
    return jsonify(results)
def reset(self, array):
    self.p_filter = ''
    self.track_mode = 0
    self.Tracked = PokeList()
    self.Pokedex = PokeList()
    self.Scrape = Scrape()
    self.Pokedex.add_all(array)
def __init__(self, headers=None):
    if headers is None:
        # default headers
        headers = [
            'Origin: http://buscador.compras.imss.gob.mx',
            'Accept-Encoding: gzip, deflate',
            'Accept-Language: en-US,en;q=0.8,es;q=0.6',
            'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
            'Content-Type: application/x-www-form-urlencoded',
            'Accept: */*',
            'Referer: http://buscador.compras.imss.gob.mx/wrap/index.html',
            'X-Requested-With: WAJAF::Ajax - WebAbility(r) v5',
            'Connection: keep-alive',
            'DNT: 1'
        ]
    # initialize the object that handles the curl requests
    Scrape.__init__(self, headers)
    # use the mongo_client instantiated in the Scrape class;
    # getattr works like getattr(x, 'value') ==> x.value
    self.db = getattr(self.mongo_client, 'imss')
    self.compras = getattr(self.db, 'compras')
    self.raw = getattr(self.db, 'raw')
    # these are the page's default values; they can be changed,
    # but I won't, in order to keep a low profile
    self.numperpage = 20
    self.page = 1
    self.url = 'http://buscador.compras.imss.gob.mx/index.php'
    self.datos_default = '&type=compras&message=X&filtered=1&descripcion=&proveedor=&numcompra=&delegacion=values%3D&fecha=min%3D%3Bmax%3D&procedimiento=values%3D&exact=false&numperpage={}&page={}&order=fecha%20desc'
def testScrape(self):
    url = "https://www.bbc.com"
    s = Scrape()
    s.setUrl(url)
    keywords, links = s.scrape()
    self.assertTrue(keywords, msg='No Keywords found')
    self.assertTrue(links, msg='No links found')
def get_list():
    keyWord = request.args.get("keyWord")
    scrape = Scrape(keyWord)
    number, driver = scrape.login()
    tags, names = scrape.get_company_list(driver)
    name_list = list(names.keys())
    return jsonify({"num": number, "list": name_list})
def key_word():
    keyForm = KeyForm(request.form)
    companyForm = CompanyForm(request.form)
    key_word = None
    names = None
    company = None
    number = 0
    if keyForm.validate_on_submit():
        key_word = keyForm.key_word.data
        scrape = Scrape(key_word)
        number, driver = scrape.login()
        tags, names = scrape.get_company_list(driver)
    if companyForm.validate_on_submit():
        company = companyForm.company.data
        scrape = Scrape(company)
        number, driver = scrape.login()
        driver = scrape.select_company(driver, company)
        lists = scrape.scrapy(driver)
        return render_template('company.html', lists=lists)
    return render_template('index.html', keyForm=keyForm, companyForm=companyForm,
                           key_word=key_word, names=names, number=number, company=company)
def main():
    page = ('https://www.canyon.com/en-us/outlet'
            '?--wysiwyg_canyon_products-factoryoutlet%5B%40package%5D=wysiwyg.canyon.products'
            '&--wysiwyg_canyon_products-factoryoutlet%5B%40controller%5D=factoryoutlet'
            '&--wysiwyg_canyon_products-factoryoutlet%5B%40action%5D=road'
            '&--wysiwyg_canyon_products-factoryoutlet%5B%40format%5D=html')
    links = Scrape().scrape(page)
    if links:
        message = Message().format(links)
        Message().send(message)
def main():
    distance = '30'
    zip_ = '80223'
    min_price = '300'
    max_price = '1500'
    has_pic = '1'  # 0 to disable
    bundle = '1'   # 0 to disable
    main_search = ('https://denver.craigslist.org/search/bia'
                   '?hasPic={}&bundleDuplicates={}&search_distance={}'
                   '&postal={}&min_price={}&max_price={}').format(
                       has_pic, bundle, distance, zip_, min_price, max_price)
    post_list = Scrape.scrape_search_pg(main_search)
    results = []
    for post in post_list:
        post_Obj = Filter(post[0])
        result = Filter.quick_filter(post_Obj)
        if result:
            result = Filter.size_filter(post_Obj)
            if result:
                results.append(Keywords.score(post_Obj, Keywords.find(post_Obj)))
    if len(results) > 0:
        results = sorted(results, key=lambda x: x[2], reverse=True)
        Message.send(Message.format(results))
def home():
    if 'subreddit' in request.args and 'post_count' in request.args:
        subreddit = str(request.args['subreddit'])
        post_count = int(request.args['post_count'])
        # months_old = int(request.args['months_old'])
        months_old = 1
    else:
        return jsonify({"message": "Invalid request", 'code': 400})
    scrape_instance = Scrape(subreddit, months_old, post_count)
    subreddit_exists = scrape_instance.sub_exists()
    if subreddit_exists:
        return_data = scrape_instance.get_data()
        return jsonify({"data": return_data, 'code': 200})
    else:
        return jsonify({"message": "Subreddit does not exist", 'code': 400})
def scrape_and_index():
    prefix = 'WEBPAGES_RAW/'
    with open(prefix + 'bookkeeping.json', 'r') as file_handle:
        urls = json.load(file_handle)
    # index = Index()
    count = 0
    start_time = time.time()
    terms = []
    documents = []
    for key in urls:
        file_name = prefix + key
        # print 'Processing ', file_name, ' ', format(count/374.97, '.2f'), ' % done'
        s = Scrape(file_name, ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title', 'p'])
        # document = Document(doc=key, url=urls[key])
        document = {'doc': key, 'url': urls[key]}
        document_terms = s.parse_content(document)
        terms.extend(document_terms)
        documents.append(document)
        # s.parse_content()
        # tags = s.get_token_tags()
        # tokens = s.get_token_frequencies()
        # for token in tokens:
        #     index.update_index(token, key, tokens[token], list(tags[token]))
        count += 1
        if count % 500 == 0:
            write_to_disk(documents, terms)
            print '=' * 30
            print 'Documents parsed = ', format(count / 374.97, '.2f'), ' % done'
            print 'Write to Disk Successful'
            print 'Time taken (seconds)\t:\t:', str((time.time() - start_time))
            print '=' * 30
            documents = []
            terms = []
    print 'Time taken (seconds)\t:\t:', str((time.time() - start_time))
    write_to_disk(documents, terms)
def store_latest_URLs_db(latest_URLs):
    # Iterate through each of the URLs
    for url in latest_URLs:
        s = Scrape(url).pipeline()
        for info in s.get_main_info():
            new_article = Articles(section=info['article'],
                                   headline=info['headline'],
                                   main_text=info['main_text'],
                                   date_published=...,
                                   word_count=info['word_count'],
                                   hyper_link=...)
            # new_author = Authors(name=...,
            #                      position=...,
            #                      description=...,
            #                      section=...,
            #                      hyper_link=...)
            db.session.add(new_article)
            db.session.commit()
            print('New Article Added')
class Client:
    def __init__(self, apikey):
        # Initialize client with the Elsevier API key
        self._apikey = apikey

    @property
    def apikey(self):
        return self._apikey

    @apikey.setter
    def apikey(self, key):
        # Validator for API key
        self._apikey = key

    def geolocate(self, data):
        self._operation = Geolocate(data)
        self._operation.execute()

    def scrape(self, data):
        self._operation = Scrape(self._apikey, data)
        self._operation.execute()
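# A minimal usage sketch for the Client class above. The key and the shape of
# `data` are illustrative assumptions, not part of the source:
#
#   client = Client("YOUR-ELSEVIER-API-KEY")
#   client.scrape({"query": "machine learning"})    # runs a Scrape operation
#   client.geolocate({"address": "St. Louis, MO"})  # runs a Geolocate operation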
def populate(self):
    scrape_details = Scrape_Info.select(Scrape_Info, Site).join(Site)
    columns = Job._meta.sorted_field_names[1:]
    for details in scrape_details:
        scrape = Scrape(details.url)
        site = details.site_id
        links = scrape.get_selector(site.link_selector, all=True)
        selectors = [
            site.date_selector, site.city_selector, site.title_selector,
            site.location_selector, site.company_selector,
            site.content_selector, site.image_selector
        ]
        foreign_keys = [details.site_id.id, details.category_id.id]
        data = []
        updated = False
        for link in links:
            if Job.select().where(Job.link == link).exists():
                updated = True
                break
            scrape_link = Scrape(link)
            offer = [link]
            for selector in selectors:
                offer.append(scrape_link.get_selector(selector))
            offer = foreign_keys + offer
            data.append(dict(zip(columns, offer)))
            time.sleep(0.5)
        if updated is False:
            with db.atomic():
                Job.insert_many(list(reversed(data))).execute()
def quick_filter(self):
    title = word_tokenize(self.title)
    if set(title) & set(Filter.all_) and \
            not set(title) & set(Filter.bad):
        item_info = Scrape.scrape_post_pg(self.url)
        self.price = item_info[1]
        self.post = self.post + list(item_info)
        self.size = item_info[2]
        return self
def getPages(url, maxPages):
    # this list will be used to store the reviews
    reviews_list = []
    # this driver is what controls google chrome
    driver = webdriver.Chrome(ChromeDriverManager().install())
    # tracks the page numbers
    pageNumber = 0
    # this counter is used to check if the page is empty because of a glitch that sometimes occurs;
    # if three empty pages are found the code assumes that no reviews are left
    emptyPages = 0
    while pageNumber >= 0 and pageNumber < int(maxPages) and emptyPages < 3:
        # increments the page number
        pageNumber += 1
        # goes to the page using the url and page number
        driver.get(url + str(pageNumber) + ".htm")
        # sets up the page to be parsed
        html = driver.page_source
        soup = bs4.BeautifulSoup(html, 'html.parser')
        # finds every review on the page
        reviews = soup.find_all("li", "empReview")
        # checks if the page has reviews
        if not reviews:
            emptyPages += 1  # increments the empty page count since there are no reviews on the page
        else:
            emptyPages = 0  # resets the empty page counter in case the last page was empty
        # this for loop goes through the list of reviews
        for review in reviews:
            # calls scrape_review from the Scrape class for each review and adds
            # the result to the list declared at the top of this function
            reviews_list.append(Scrape.scrape_review(review))
    driver.close()  # closes the chrome tab
    return reviews_list  # returns the list of reviews
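# A hypothetical call to getPages above. The function appends the page number
# and ".htm" itself, so `url` should be the review-list prefix; the exact
# Glassdoor-style URL here is an illustrative assumption:
#
#   reviews = getPages("https://www.glassdoor.com/Reviews/Example-Reviews-E12345_P", "5")
#   print(len(reviews))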
def runBot():
    loop = asyncio.get_event_loop()
    theClient = bot.BotClient()
    commands = Commands(theClient)
    # levels = UserLevels(theClient)
    scrape = Scrape(theClient)
    saved = SaveLinks(theClient)
    messages = Messages(theClient)
    coins = Coins(theClient)
    try:
        loop.run_until_complete(theClient.start(Token))
    except KeyboardInterrupt:
        loop.run_until_complete(theClient.close())
    finally:
        loop.close()
    if theClient.restart == True:
        if asyncio.get_event_loop().is_closed():
            asyncio.set_event_loop(asyncio.new_event_loop())
        reload(bot)
        runBot()
class Pokedex(wx.Frame):
    def __init__(self, parent, id, title):
        wx.Frame.__init__(self, parent,
                          style=wx.DEFAULT_FRAME_STYLE ^ wx.RESIZE_BORDER ^ wx.MAXIMIZE_BOX)
        self.list_pokeweak = []
        file = open('pokedata.cfg', 'r')
        all_data_str = file.read()
        file.close()
        if all_data_str == '':
            file = open('pokedata.default.cfg', 'r')
            all_data_str = file.read()
            file.close()
            file = open('pokedata.cfg', 'w')
            file.write(all_data_str)
            file.close()
        self.all_data = ast.literal_eval(all_data_str)
        self.reset(self.all_data)
        self.typestrings = ["Quad", "Double", "Normal", "Half", "Fourth", "Immune"]
        self.colors = ['#E8E8E8', '#D0D0D0', '#B8B8B8', '#A0A0A0', '#888888', '#707070']
        self.icon = wx.Icon('Images/pokeball.ico', wx.BITMAP_TYPE_ICO)
        self.SetIcon(self.icon)
        self.SB1 = wx.StaticBox(self, -1, 'Stats')
        self.SB1.SetBackgroundColour('white')
        self.CB = wx.ComboBox(self, -1, value='', choices=['Name', 'Type'],
                              style=wx.CB_READONLY, size=(103, -1))
        self.CB.SetSelection(0)
        # create menubar
        self.menubar = wx.MenuBar()
        # create file menu
        self.filemenu = wx.Menu()
        self.m_stayontop = self.filemenu.Append(wx.ID_ANY, 'Stay on Top [Off]\tCtrl-S', '')
        self.m_resize = self.filemenu.Append(wx.ID_ANY, 'Resizable [Off]\tCtrl-E', '')
        self.m_track = self.filemenu.Append(wx.ID_ANY, 'Show Tracked\tCtrl-T', '')
        self.m_rescrape = self.filemenu.Append(wx.ID_ANY, 'Rescrape Html\tCtrl-R', '')
        self.m_close = self.filemenu.Append(wx.ID_ANY, 'Close\tCtrl-Q', '')
        self.menubar.Append(self.filemenu, 'File')
        # bind events to menu options
        self.Bind(wx.EVT_MENU, self.stay_on_top, self.m_stayontop)
        self.Bind(wx.EVT_MENU, self.resizable, self.m_resize)
        self.Bind(wx.EVT_MENU, self.toggle_track_mode, self.m_track)
        self.Bind(wx.EVT_MENU, self.rescrape, self.m_rescrape)
        self.Bind(wx.EVT_MENU, self.on_close, self.m_close)
        # Creating list that will store all pokemon
        self.LC = wx.ListCtrl(self, -1, style=wx.LC_REPORT | wx.LC_SINGLE_SEL)
        # Creating list for weaknesses
        self.LC2 = AutoWidthListCtrl(self)
        # Inserting columns into that list of pokemon
        self.LC.InsertColumn(0, 'Pokemon', format=wx.LIST_FORMAT_LEFT, width=90)
        self.LC.InsertColumn(1, 'Type', format=wx.LIST_FORMAT_CENTER, width=100)
        self.LC.InsertColumn(2, 'Total', format=wx.LIST_FORMAT_CENTER, width=45)
        self.LC.InsertColumn(3, 'HP', format=wx.LIST_FORMAT_CENTER, width=30)
        self.LC.InsertColumn(4, 'Atk', format=wx.LIST_FORMAT_CENTER, width=30)
        self.LC.InsertColumn(5, 'Def', format=wx.LIST_FORMAT_CENTER, width=30)
        self.LC.InsertColumn(6, 'SpAtk', format=wx.LIST_FORMAT_CENTER, width=44)
        self.LC.InsertColumn(7, 'SpDef', format=wx.LIST_FORMAT_CENTER, width=44)
        self.LC.InsertColumn(8, 'Spd', format=wx.LIST_FORMAT_CENTER, width=35)
        self.LC2.InsertColumn(0, 'Damage Taken [(2 ^ Row) * 1/4]',
                              format=wx.LIST_FORMAT_RIGHT, width=355)
        # creating boxsizers
        mainbox = wx.BoxSizer(wx.VERTICAL)
        statsbox = wx.BoxSizer(wx.HORIZONTAL)
        imagebox = wx.StaticBoxSizer(self.SB1, wx.HORIZONTAL)
        defensebox = wx.BoxSizer(wx.HORIZONTAL)
        box = wx.BoxSizer(wx.VERTICAL)
        hbox = wx.BoxSizer(wx.HORIZONTAL)
        # creating initial image for imagebox
        self.image = wx.StaticBitmap(self, -1, wx.Bitmap('Images/0.png', wx.BITMAP_TYPE_PNG))
        self.input = wx.TextCtrl(self, value='', size=(270, -1))
        self.SetBackgroundColour('pink')
        # Adding text entry and combobox to a hbox, then adding it to the box
        hbox.Add(self.input, wx.EXPAND | wx.ALIGN_LEFT)
        hbox.Add(self.CB, wx.ALIGN_CENTER)
        box.Add(hbox, flag=wx.ALL)
        # Adding list that will store all pokemon
        box.Add(self.LC, flag=wx.ALL | wx.EXPAND)
        # Adding image area to imagebox
        imagebox.Add(self.image, flag=wx.ALIGN_CENTER | wx.ALL)
        defensebox.Add(self.LC2, flag=wx.ALIGN_CENTER | wx.EXPAND)
        # Adding image to boxsizer
        statsbox.Add(imagebox, flag=wx.ALIGN_CENTER | wx.ALL)
        # Adding boxsizer to mainbox
        box.Add(statsbox, flag=wx.ALIGN_CENTER | wx.EXPAND)
        self.SetMenuBar(self.menubar)
        statsbox.Add(defensebox, flag=wx.ALIGN_CENTER | wx.EXPAND)
        mainbox.Add(box, flag=wx.ALL | wx.EXPAND)
        # Bind Events to search bar
        self.Bind(wx.EVT_TEXT, self.search, self.input)
        self.Bind(wx.EVT_CHAR_HOOK, self.on_keyboard_search, self.input)
        # Bind Events to Search Combobox
        self.Bind(wx.EVT_COMBOBOX, self.search, self.CB)
        self.Bind(wx.EVT_CHAR_HOOK, self.on_keyboard_combobox, self.CB)
        # Bind Events to Pokemon ListCtrl
        self.Bind(wx.EVT_LIST_ITEM_FOCUSED, self.set_information, self.LC)
        self.Bind(wx.EVT_LIST_KEY_DOWN, self.on_keyboard_list, self.LC)
        self.Bind(wx.EVT_LIST_ITEM_RIGHT_CLICK, self.track, self.LC)
        self.Bind(wx.EVT_LIST_ITEM_ACTIVATED, self.track, self.LC)
        self.Bind(wx.EVT_TEXT_ENTER, self.track, self.input)
        # Resize and Refresh the list
        self.SetSizer(mainbox)
        mainbox.Fit(self)
        self.refresh('')

    def on_keyboard_search(self, event):
        if event.GetKeyCode() == wx.WXK_DOWN:
            if self.LC.GetItemCount():
                self.LC.Focus(0)
                self.LC.SetFocus()
        elif event.GetKeyCode() == wx.WXK_UP:
            # Do Nothing
            pass
        elif event.GetKeyCode() == wx.WXK_TAB:
            self.CB.SetFocus()
        else:
            # Handle event default
            event.Skip()

    def on_keyboard_list(self, event):
        if event.GetKeyCode() == wx.WXK_UP:
            index = self.LC.GetFocusedItem()
            if index == 0:
                self.input.SetFocus()
        elif event.GetKeyCode() == wx.WXK_TAB:
            self.input.SetFocus()
        else:
            # Handle event default
            event.Skip()

    def on_keyboard_combobox(self, event):
        if event.GetKeyCode() == wx.WXK_TAB:
            self.input.SetFocus()
        else:
            # Handle event default
            event.Skip()

    def on_close(self, event):
        self.Destroy()

    def reset(self, array):
        self.p_filter = ''
        self.track_mode = 0
        self.Tracked = PokeList()
        self.Pokedex = PokeList()
        self.Scrape = Scrape()
        self.Pokedex.add_all(array)

    def stay_on_top(self, event):
        if self.ToggleWindowStyle(flag=wx.STAY_ON_TOP):
            self.m_stayontop.SetItemLabel('Stay on Top [On]\tCtrl-S')
        else:
            self.m_stayontop.SetItemLabel('Stay on Top [Off]\tCtrl-S')

    def resizable(self, event):
        if self.ToggleWindowStyle(flag=wx.RESIZE_BORDER):
            self.m_resize.SetItemLabel('Resizable [On]\tCtrl-E')
        else:
            self.m_resize.SetItemLabel('Resizable [Off]\tCtrl-E')

    def track(self, event):
        if self.LC.GetItemCount():
            index = self.LC.GetFocusedItem()
            if index == -1:
                index = 0
            ListItem = self.LC.GetItem(index, 0)
            Name = ListItem.GetText()
            if self.track_mode:
                self.Tracked.remove(Name)
                filter = self.input.GetValue().strip().lower()
                self.refresh(filter)
            else:
                pokemon = self.Pokedex.get(Name)
                self.Tracked.add(pokemon)
            if self.LC.GetItemCount() == 0:
                self.input.SetFocus()

    def toggle_track_mode(self, event):
        if self.track_mode:
            self.m_track.SetItemLabel('Show Tracked\tCtrl-T')
            self.track_mode = 0
        else:
            self.m_track.SetItemLabel('Show All\tCtrl-T')
            self.track_mode = 1
        self.input.SetValue('')
        self.refresh('')

    def set_item_color(self, index, color1, color2):
        if index % 2:
            self.LC.SetItemBackgroundColour(index, color1)
        else:
            self.LC.SetItemBackgroundColour(index, color2)

    def set_items(self, set):
        self.LC.DeleteAllItems()
        for index, pokemon in enumerate(set):
            self.append_item(pokemon)
            self.set_item_color(index, 'pink', 'white')

    def add_items(self, set):
        column = self.CB.GetSelection()
        i, max_i = (0, self.LC.GetItemCount())
        j, max_j = (0, len(set))
        while True:
            if j == max_j:
                break
            elif i == max_i:
                self.insert_item(i, set[j])
                max_i += 1
            else:
                ListItem = self.LC.GetItem(i, column)
                ItemText = ListItem.GetText()
                if not set[j].get_name() == ItemText:
                    self.insert_item(i, set[j])
                    max_i += 1
            self.set_item_color(i, 'pink', 'white')
            i, j = (i + 1, j + 1)

    def refresh(self, filter):
        if self.track_mode:
            if filter == '':
                subset = self.Tracked.get_all()
            elif self.CB.GetSelection():
                subset = self.Tracked.type_filter(filter)
            else:
                subset = self.Tracked.name_filter(filter)
        else:
            if filter == '':
                subset = self.Pokedex.get_all()
            elif self.CB.GetSelection():
                subset = self.Pokedex.type_filter(filter)
            else:
                subset = self.Pokedex.name_filter(filter)
        if subset[0].get_name() == '???':
            self.LC.DeleteAllItems()
        elif self.p_filter in filter:
            self.set_items(subset)
        else:
            self.add_items(subset)
        self.p_filter = filter
        self.SB1.SetLabel(subset[0].get_name())
        self.set_image(subset[0].get_image())
        self.set_weaknesses(subset[0])
        self.LC.Select(0, on=1)

    def insert_item(self, index, pokemon):
        self.LC.InsertItem(index, pokemon.get_name())
        self.LC.SetItem(index, 1, pokemon.get_type())
        self.LC.SetItem(index, 2, pokemon.get_total())
        self.LC.SetItem(index, 3, pokemon.get_hp())
        self.LC.SetItem(index, 4, pokemon.get_atk())
        self.LC.SetItem(index, 5, pokemon.get_def())
        self.LC.SetItem(index, 6, pokemon.get_spatk())
        self.LC.SetItem(index, 7, pokemon.get_spdef())
        self.LC.SetItem(index, 8, pokemon.get_spd())

    def append_item(self, pokemon):
        self.LC.Append(pokemon.get())

    def set_information(self, event):
        self.LC2.DeleteAllItems()
        ListItem = self.LC.GetItem(self.LC.GetFocusedItem(), 0)
        Name = ListItem.GetText()
        pokemon = self.Pokedex.get(Name)
        self.set_weaknesses(pokemon)
        self.SB1.SetLabel(pokemon.get_name())
        self.set_image(pokemon.get_image())

    def set_weaknesses(self, pokemon):
        self.LC2.DeleteAllItems()
        self.LC2.InsertItem(0, pokemon.get_quad())
        self.LC2.SetItemBackgroundColour(0, self.colors[0])
        self.LC2.InsertItem(1, pokemon.get_double())
        self.LC2.SetItemBackgroundColour(1, self.colors[1])
        self.LC2.InsertItem(2, pokemon.get_normal())
        self.LC2.SetItemBackgroundColour(2, self.colors[2])
        self.LC2.InsertItem(3, pokemon.get_half())
        self.LC2.SetItemBackgroundColour(3, self.colors[3])
        self.LC2.InsertItem(4, pokemon.get_fourth())
        self.LC2.SetItemBackgroundColour(4, self.colors[4])
        self.LC2.InsertItem(5, pokemon.get_immune())
        self.LC2.SetItemBackgroundColour(5, self.colors[5])
        self.LC2.resizeLastColumn(-1)

    def set_image(self, image):
        self.image.SetBitmap(image)

    def search(self, event):
        filter = self.input.GetValue().strip().lower()
        self.refresh(filter)

    def rescrape(self, event):
        self.Hide()
        self.mySplash = wx.adv.SplashScreen(
            app.myBitmap,
            wx.adv.SPLASH_NO_TIMEOUT | wx.adv.SPLASH_CENTER_ON_SCREEN,
            -1, None)
        self.mySplash.Show()
        self.all_data = self.Scrape.scrape()
        self.reset(self.all_data)
        self.refresh('')
        self.mySplash.Destroy()
        self.Show()
if len(sys.argv) < 3:
    sys.exit("please provide page start and end numbers \n " + info)
# currently we are counting down to 1..
page_min = int(sys.argv[1])
page_max = int(sys.argv[2])
posts_limit = 10  # only publish this many to WP at a time
base_url = 'http://hirise.lpl.arizona.edu/releases/all_captions.php'
# base_url_wallpapers = 'http://hirise.lpl.arizona.edu/'
base_url_wallpapers = 'http://static.uahirise.org/images/wallpaper/'
local_img_dir = '/app/tmp/'

# set up some tools
scrape = Scrape(base_url=base_url,
                local_img_dir=local_img_dir,
                base_url_wallpapers=base_url_wallpapers)
wp_publish = WPPublish()
if not debug:
    previously_published = wp_publish.get_all_published()
else:
    previously_published = []

# grab links to all the detail pages we need
all_detail_page_urls, urls_by_page = scrape.grab_all_page_urls(page_min, page_max)

# set to False if you don't want to publish to WordPress;
# this will also cause it to ignore the previously published list

# grab content from each page and publish to the API, and perhaps to WP too
post_count = 0
last_page = 0
def get_latest_URLs():
    scrape_latest = Scrape(Defaults.main_url)
    scrape_latest_soup = scrape_latest.get_soup()
    latest_URLs = find_latest_URLs(scrape_latest_soup['soup'])
    return latest_URLs
def db_check(self):
    outdated = True
    while outdated:
        # Try table read, otherwise create
        try:
            connection = sqlite3.connect("posts.db")
            cursor = connection.cursor()
            sql = f"""SELECT * FROM "{self.handle}" """
            cursor.execute(sql)
            result = cursor.fetchall()
            connection.close()
            # Try table read, otherwise populate
            try:
                # print(result)
                # print(result[-1])
                last_location_data = result[-1][0]
                print(f"Database last recorded location: {last_location_data}")
                last_location = ScrapeLast(self.handle).get_last()[0]
                print(f"Instagram last posted location: {last_location}")
                # Check up to date
                if last_location_data == last_location:
                    print("Database is up to date!")
                    outdated = False
                else:
                    print("Uh oh! we need to update table")
                    # scrape latest data
                    print(f"Starting Scrape: @{self.handle}")
                    data = Scrape(self.handle).get_locations()
                    locations = data[0]
                    links = data[1]
                    latitudes = data[2]
                    longitudes = data[3]
                    # print(locations[0])
                    connection = sqlite3.connect("posts.db")
                    for i in range(0, len(locations), 1):
                        try:
                            sql = f"""INSERT INTO `{self.handle}` (`location`,`link`,`latitude`,`longitude`) VALUES ("{locations[i]}","{links[i]}","{latitudes[i]}","{longitudes[i]}")"""
                            # print(sql)
                            connection.execute(sql)
                            connection.commit()
                            # print("Table populated")
                            # connection.close()
                        except:
                            pass
                    print("Table updated")
                    outdated = False
                    connection.close()
                    print("connection closed")
            except:
                print("Uh oh! we need to populate the table for the first time")
                # scrape latest data
                print(f"Starting Scrape: @{self.handle}")
                data = Scrape(self.handle).get_locations()
                locations = data[0]
                links = data[1]
                latitudes = data[2]
                longitudes = data[3]
                # print(locations[0])
                connection = sqlite3.connect("posts.db")
                for i in range(0, len(locations), 1):
                    try:
                        sql = f"""INSERT INTO `{self.handle}` (`location`,`link`,`latitude`,`longitude`) VALUES ("{locations[i]}","{links[i]}","{latitudes[i]}","{longitudes[i]}")"""
                        # print(sql)
                        connection.execute(sql)
                        connection.commit()
                        # print("Table populated")
                        # connection.close()
                    except:
                        pass
        # create table
        except:
            print("table needs to be created")
            # create table
            sql = f"""CREATE TABLE "{self.handle}" ("location" REAL UNIQUE, "link" REAL UNIQUE, "latitude" REAL UNIQUE, "longitude" REAL UNIQUE);"""
            # print(sql)
            connection = sqlite3.connect("posts.db")
            connection.execute(sql)
            connection.commit()
            connection.close()
            print("Table created")
class Verify(object):
    # Initialize the logpath and server name obtained from the config file
    def __init__(self, logpath, server):
        self.logpath = logpath
        self.timestamp = ''  # Time stamp will be updated when the file is created
        self.scrape = Scrape(server)
        self.rule = Rule()

    # Send an email by referring to the dictionary that includes the list of users
    # for which a discrepancy was found for a particular rule type
    def sendEmail(self, UserRuleDict):
        # Send an email if at least one discrepancy was found
        msgbody = ''
        if len(UserRuleDict) > 0:
            for rule, userlist in UserRuleDict.items():
                msgbody += ' \n\n*** Dashboard data does not match GNATS for rule type %(1)s for following users: *** \n%(2)s' \
                    % {"1": rule, "2": ', '.join(user for user in userlist)}
            msgbody += '\n\nRefer to the following log file for details: \n%s/DAM_%s.log' % (
                self.logpath, self.timestamp)
            EmailAlert().send('*****@*****.**', ['*****@*****.**'], 'DAM Alert', msgbody)

    '''
    Run the logic to compare the list of PRs found in Dashboard and the one generated from Gnats.
    Log all the details and, if a discrepancy is found, send an email alert to the concerned users.
    '''

    def run(self, usernamelist, rulelist):
        self.timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        logfile = '%s/DAM_%s.log' % (self.logpath, self.timestamp)
        logging.basicConfig(filename=logfile, filemode='w', format='%(message)s', level=logging.WARNING)
        hdlr = logging.FileHandler(logfile)
        try:
            discrepancyUserdict = {}
            for ruletype in rulelist:
                for user in usernamelist:
                    logline = '-----------------------------------------------------------------------------------'
                    print logline
                    logging.critical(logline)
                    logline = 'Verifying for user ||%(1)s|| for ||%(2)s|| type of PRs: ' % {"1": user, "2": ruletype}
                    print logline
                    logging.critical(logline)
                    gnatsPRlist = self.rule.getPRlist(user, ruletype)
                    dashPRlist = self.scrape.getPRList(user, ruletype)
                    discrepancylist1 = list(set(gnatsPRlist) - set(dashPRlist))
                    discrepancylist2 = list(set(dashPRlist) - set(gnatsPRlist))
                    logging.critical('%s:' % datetime.datetime.now().strftime("%A, %d - %B %Y %I:%M%p"))
                    logline = 'Count of PRs from GNATS: %d' % len(gnatsPRlist)
                    print logline
                    logging.critical(logline)
                    logline = 'Count of PRs from Dashboard: %d' % len(dashPRlist)
                    print logline
                    logging.critical(logline)
                    logline = 'List of PRs missing in dashboard: [%s] ' % ','.join(d for d in discrepancylist1)
                    print logline
                    logging.critical(logline)
                    logline = 'List of PRs additionally found in dashboard: [%s]' % ','.join(d for d in discrepancylist2)
                    print logline
                    logging.critical(logline)
                    if len(discrepancylist1 + discrepancylist2) > 0:
                        print 'Checking for recently updated PRs and discarding false negatives from the list...'
                        finalDescList = self.removeRecentPRs(discrepancylist1 + discrepancylist2)
                        if len(finalDescList) > 0:
                            logline = '\nDiscrepancies found after removing recently updated PRs: [%s] ' % ','.join(f for f in finalDescList)
                            discrepancyUserdict.setdefault(ruletype, []).append(user)
                        else:
                            logline = '\nDiscrepancies found after removing recently updated PRs: None'
                        print logline
                        logging.critical(logline)
                    else:
                        logline = '\nNo discrepancies found at all'
                        print logline
                        logging.critical(logline)
            # alert admin by sending an autogenerated email:
            if len(discrepancyUserdict) > 0:
                self.sendEmail(discrepancyUserdict)
            # close the log file
            hdlr.close()
        except Exception, e:
            logging.exception(e)
def scrape(self, data):
    self._operation = Scrape(self._apikey, data)
    self._operation.execute()
dates.next_day()
dates.driver.close()
total, counter = len(set(games)), 0
games = [
    i for i in games
    if i not in [
        j.replace('\n', '')
        for j in open('games_inserted_already.txt', 'r').readlines()
    ]
]
with open('queries.txt', 'w') as queries:
    for game in set(games):
        a = Scrape(game)
        # The below is to create tables gameData.datetime, gameData.flags,
        # gameData.game, gameData.officialScorer, gameData.primaryDatacaster,
        # gameData.status, gameData.weather
        try:
            b = jsonParse(a.get_raw_data()['gameData']['datetime'])
            b.parse()
            c = Table('datetime', 'gameData')
            c.add_data(b.keys, b.values)
            queries.write(c.insert(a.id))
        except:
            print((game, "Issue with datetime"))
        try:
            b = jsonParse(a.get_raw_data()['gameData']['flags'])
def __init__(self, logpath, server):
    self.logpath = logpath
    self.timestamp = ''  # Time stamp will be updated when the file is created
    self.scrape = Scrape(server)
    self.rule = Rule()
def read_message(self):
    # queries websites on a topic, puts the output in the message file and returns it for email structuring
    Scrape(self.technology).upwork()
    with open("message.txt", 'r', encoding='utf-8') as template_file:
        message_template = template_file.read()
    return message_template
import requests  # used below to download each page
from bs4 import BeautifulSoup  # turns raw web pages into an object hierarchy and provides selectors
from urlparse import urljoin
# from urlunshort import resolve
import re  # regular expression module for matching, parsing
import csv  # simplifies the process of writing data to csv
import nltk  # natural language toolkit module for quick analysis
import cStringIO  # string buffer

from scrape import Scrape

# a list of URLs about venture capital, investing, data stores
allowedDomains = ["bizjournals.com", "stltoday.com", "stlpublicradio.org", "alivemag.com",
                  "stlamerican.com", "techli.com", "stlregionalchamber.com", "cbslocal.com",
                  "ktrs.com", "ksdk.com", "kmov.com", "fox2now.com", "kplr11.com"]
pagesToScrape = ["http://www.bizjournals.com/stlouis/blog/biznext/2015/07/10-biggest-funding-rounds-for-startups-so-far-this.html",
                 "http://www.bizjournals.com/stlouis/news/",
                 "http://www.stltoday.com/",
                 "http://news.stlpublicradio.org/#stream/0",
                 "http://www.alivemag.com/",
                 "http://www.stlamerican.com/",
                 "http://techli.com/#.",
                 "http://www.stlregionalchamber.com/who-we-are/chamber-blog",
                 "http://stlouis.cbslocal.com/station/kmox/",
                 "http://www.ktrs.com/",
                 "http://www.ksdk.com/",
                 "http://www.kmov.com/",
                 "http://fox2now.com/",
                 "http://kplr11.com/",
                 "http://www.biospace.com/",
                 "http://medcitynews.com/",
                 "http://www.fiercebiotech.com/",
                 "http://blogs.wsj.com/venturecapital/page/2/",
                 "http://techcrunch.com/",
                 "http://venturebeat.com/",
                 "http://www.bloomberg.com/",
                 "http://www.americanentrepreneurship.com/",
                 "http://siteselection.com/",
                 "http://businessfacilities.com/",
                 "http://www.tradeandindustrydev.com/",
                 "http://www.sec.gov/edgar.shtml",
                 "https://www.sec.gov/about/forms/formd.pdf",
                 "http://www.edgar-online.com/"]

firstScrape = Scrape(pagesToScrape)
for index, URL in enumerate(firstScrape.getDictionary()):
    output = cStringIO.StringIO()
    webpage = requests.get(URL)
    content = webpage.content
    soup = BeautifulSoup(content, 'html.parser')
    # print soup.body.text    # good line, gives good text info
    # print soup.title.text   # also a decent line here for a quick line-up of titles
    # print soup.select("td") # good, produces a lot of information but it is all in html as <td ...>, need to parse further
    # print soup.body
    output.write("<scrape id=\"" + firstScrape.getAddress() + ":" + str(index) + "\" baseURI=\"" + URL + "\">\n")
    for link in soup.find_all("a", href=True):
        output.write(link['href'])
        output.write('\n')
    output.write("</scrape>\n")
    firstScrape.writeScrape(output)
def geolocate(self, data):
    self._operation = Geolocate(data)
    self._operation.execute()
    # tail of the symbol-lookup helper (the enclosing def is not shown in this excerpt)
    df_final = pd.read_sql(sql_query, engine)
    df_final.to_csv(file_path + 'SQL_generated.csv')
    if source == 'Quandl':
        return [str(item) for item in list(df_final['Code'])]
    elif source == 'Yahoo':
        return [str(item) for item in list(df_final['Ticker'])]


if __name__ == '__main__':
    current_time = datetime.datetime.now().time()
    print 'Start time:' + str(current_time)
    scrape = Scrape()
    sym = SymbolDb()
    # Refresh symbol files from Quandl link
    # scrape.scrape_quandl_codes_us()
    # scrape.scrape_quandl_cboe_data()
    # change total pages to scrape in the function above
    # scrape.scrape_finviz_codes_overview(7141, 20)
    # scrape.scrape_finviz_codes_overview()
    # Merge all the symbol files from finviz and quandl into the SQLite database
    # sym.merge_symbol_files_to_db()
    # Returns the final table from the database
def get_collapse_content(self):
    html = super(Collapse, self).get_html()
    strain = super(Collapse, self).strain_by_id(self.strain_id)
    soup = Scrape.get_soup(html, strain)
    return soup
if __name__ == '__main__':
    # Command line arguments
    parser = argparse.ArgumentParser(description='Scrape')
    parser.add_argument('url', type=str, help='URL')
    parser.add_argument('--debug', action='store_true', help='include debug output')
    args = parser.parse_args()

    # Setup logging
    sh = logging.StreamHandler(sys.stdout)
    if args.debug:
        sh.setLevel(logging.DEBUG)
    else:
        sh.setLevel(logging.INFO)
    sh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    root_log.addHandler(sh)

    # Init scraper
    scrape = Scrape(ZivityScraper(root_log), root_log)

    # Download page
    if os.path.exists(args.url):
        scrape.scrape_file(args.url)
    else:
        scraped = set()
        to_scrape = set([args.url])
        while len(to_scrape):
            current_url = to_scrape.pop()
            to_scrape.update(scrape.scrape_url(current_url))
            scraped.add(current_url)
            to_scrape -= scraped
def get_summary_content(self):
    html = Scrape.get_html(self)
    strain = Scrape.strain_by_id(self.summary_id)
    soup = Scrape.get_soup(html, strain)
    return soup
# MONGO_HOST = os.environ["MONGO_HOST"]
# MONGO_PORT = os.environ["MONGO_PORT"]
# MONGO_USER = os.environ["MONGO_USER"]
# MONGO_PASS = os.environ["MONGO_PASS"]
# MONGO_NAME = os.environ["MONGO_NAME"]
# MONGO_COLLECTION = os.environ["MONGO_COLLECTION"]

# Initialize database connection
# client = MongoClient(MONGO_HOST, int(MONGO_PORT))
# db = client[MONGO_NAME]
# db.authenticate(MONGO_USER, MONGO_PASS)
# urls_collection = db[MONGO_COLLECTION]

# Initialize scraper
s = Scrape()

# Initialize mpi cluster variables
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

# Initialize root set of sources
sources_file = open(SOURCES_LOC, "r")
sources = sources_file.readlines()
explored = set()
uniqueKeywords = set()

# Initialize stopTime
if len(sys.argv) == 2:
    stopTime = time.time() + int(sys.argv[1])
def __init__(self, url):
    self.summary_id = "tableSummaryHeader"
    self.filter_class = "strongRow"
    self.table_class = "summaryTable"
    Scrape.__init__(self, url)