def grab(channel, timespan): tz = pytz.timezone("Europe/Athens") now = datetime.datetime.now(tz) charset = "windows-1253" shows = [] a = 0 if now.time().hour < 7: a = -1 for i in range(a, 14): date = now + datetime.timedelta(days=i) text = helper.download("https://program.ert.gr/Ert1/index.asp?id=" + channel + "&pdate=" + date.strftime("%d/%m/%Y"), encoding=charset) if text is None: continue sections = helper.split( text, "<td width=\"50\" align=\"center\" class=\"table\">", "</tr></table>") laststart = datetime.datetime.min.replace(tzinfo=tz) for section in sections: show = {} temp = re.search("(\d\d):(\d\d)", section) show["start"] = date.replace(hour=int(temp.group(1)), minute=int(temp.group(2)), second=0, microsecond=0) if show["start"] < laststart: date += datetime.timedelta(days=1) show["start"] += datetime.timedelta(days=1) if (show["start"] - now).total_seconds() / 3600 > timespan: lastshow = True else: lastshow = False laststart = show["start"] temp = re.search("<a class=\"black\".*href=\"(.*)\">(.*)</a>", section) show["title"] = temp.group(2) subtitle = helper.cut( section, "<td width=\"3\"></td><td><font color=\"#6e6868\">", "</font>") if subtitle is not None and subtitle: show["sub-title"] = subtitle link = temp.group(1) if link[0] == "/": link = "https://program.ert.gr" + link if link: show["details-url"] = link shows.append(show) if lastshow: return shows return shows
def grabdetails(url): charset = "windows-1253" text = helper.download(url, encoding=charset) if text is None: return None show = {} temp = helper.split(text, "<div align=\"justify\" class=\"black\">", "</div>") description = "" for d in temp: description += d if description: show["desc"] = helper.cleanup(description) director = re.search("Σκηνοθεσία</b>: (.*?)(?:\n|<br>)", text) if director is not None: show["director"] = helper.cleanup(director.group(1)) presenter = re.search("Παρουσίαση</b>: (.*?)(?:\n|<br>)", text) if presenter is not None: show["presenter"] = helper.cleanup(presenter.group(1)) producer = re.search("Οργάνωση παραγωγής: (.*?)(?:\n|<br>)", text) if producer is not None: show["producer"] = helper.cleanup(producer.group(1)) writer = re.search("Αρχισυνταξία: (.*?)(?:\n|<br>)", text) if writer is not None: show["writer"] = helper.cleanup(writer.group(1)) return show
def grab(channel, timespan): tz = pytz.timezone("UTC") now = datetime.datetime.now(tz) shows = [] for i in range(9): text = helper.download("https://www.freesat.co.uk/tv-guide/api/" + str(i) + "/?channel=" + channel) if text is None: continue events = json.loads(text)[0]["event"] for event in events: show = {} show["start"] = datetime.datetime.fromtimestamp( event["startTime"], tz) if (show["start"] - now).total_seconds() / 3600 > timespan: return shows show["stop"] = show["start"] + datetime.timedelta( seconds=event["duration"]) show["title"] = event["name"] show["desc"] = event["description"] if "episodeNo" in event: show["episode-num"] = (event["episodeNo"], "onscreen") shows.append(show) return shows
def channellist(): text = helper.download("http://programm.ard.de/") channels = helper.split(text, "Tagesprogramm::", "</a>") result = [] for channel in channels: temp = re.search("Tagesprogramm::(.*?)\".*\?sender\=-?(.*?)\&", channel) result.append((temp.group(2), temp.group(1), temp.group(1))) return result
def channellist(): text = helper.download("https://www.freesat.co.uk/tv-guide/api/") channels = json.loads(text) result = [] for channel in channels: result.append((channel["channelid"], channel["channelname"], channel["channelname"])) return result
def channellist(): text = helper.download("http://www.ishow.gr/tvNow.asp") channels = helper.split(text, "<b><a style=\"color:#E1D8BE\"", "</a>") result = [] for channel in channels: temp = re.search("\?cid=(.*?)\">(.*)</a>", channel) result.append((temp.group(1), temp.group(2), temp.group(2))) result.sort(key=lambda r: int(r[0])) return result
def grabdetails(url): text = helper.download(url) if text is None: return None show = {} description = helper.cut(text, "<meta name=\"description\" content=\"", "\" />") if description is not None: show["desc"] = helper.cleanup(description) return show
def grab(channel, timespan): tz = pytz.timezone("Europe/Athens") now = datetime.datetime.now(tz) shows = [] a = 0 if now.time().hour < 4: a = -1 for i in range(a, 6): date = now + datetime.timedelta(days=i) text = helper.download( "http://ishow.gr/showTodayChannelProgramm.asp?cid=" + channel + "&gotoDay=" + str(i)) if text is None: continue sections = helper.split(text, "<tr id=\"progTr", "</tr>") laststart = datetime.datetime.min.replace(tzinfo=tz) for section in sections: show = {} temp = re.search( "<td class=\"progTd progTdTime\".*?>(\d\d):(\d\d)", section) show["start"] = date.replace(hour=int(temp.group(1)), minute=int(temp.group(2)), second=0, microsecond=0) if show["start"] < laststart: date += datetime.timedelta(days=1) show["start"] += datetime.timedelta(days=1) if (show["start"] - now).total_seconds() / 3600 > timespan: lastshow = True else: lastshow = False laststart = show["start"] title = re.search("<div class=\"grandTitle\".*>(.+)\s*?</div>", section) show["title"] = helper.cleanup(title.group(1)) subtitle = helper.cut(section, "<div class=\"subTitle\">", "</div>") if subtitle is not None and subtitle: show["sub-title"] = helper.cleanup(subtitle) temp = re.search("<div class=\"grandTitle\">.*?href=\"(.*?)\"", section) if temp is not None: show["details-url"] = "http://ishow.gr" + temp.group(1) shows.append(show) if lastshow: return shows return shows
def grabdetails(url): text = helper.download(url) if text is None: return None show = {} description = helper.cut(text, "<meta property=\"og:description\" content=\"", "/>") temp = re.search( "<meta property=\"og:description\" content=\"(.*?)(?:\"/>|<)", text) if temp is not None: description = temp.group(1) if description: show["desc"] = helper.cleanup(description) return show
def update(self):
    '''update(self) - Fill Queue with new Pastie IDs'''
    print '[*] Retrieving Pastie ID\'s'
    results = [tag for tag in BeautifulSoup(
        helper.download(self.BASE_URL + '/pastes')).find_all('p', 'link')
        if tag.a]
    new_pastes = []
    if not self.ref_id:
        results = results[:60]
    for entry in results:
        paste = PastiePaste(entry.a['href'].replace(
            self.BASE_URL + '/pastes', ''))
        # Check to see if we found our last checked URL
        if paste.id == self.ref_id:
            break
        new_pastes.append(paste)
    for entry in new_pastes[::-1]:
        print '[+] Adding URL: ' + entry.url
        self.put(entry)
def update(self):
    '''update(self) - Fill Queue with new Slexy IDs'''
    print '[*] Retrieving Slexy ID\'s'
    results = BeautifulSoup(helper.download(self.BASE_URL + '/recent')).find_all(
        lambda tag: tag.name == 'td' and tag.a and '/view/' in tag.a['href'])
    new_pastes = []
    if not self.ref_id:
        results = results[:60]
    for entry in results:
        paste = SlexyPaste(entry.a['href'].replace('/view/', ''))
        # Check to see if we found our last checked URL
        if paste.id == self.ref_id:
            break
        new_pastes.append(paste)
    for entry in new_pastes[::-1]:
        print '[+] Adding URL: ' + entry.url
        self.put(entry)
def grabdetails(url): text = helper.download(url) if text is None: return None show = {} subtitle = helper.cut(text, "<h3 class=\"overlay-subtitle\">", "</h3>") if subtitle is not None and subtitle: show["sub-title"] = helper.cleanup(subtitle) description = helper.cut(text, "<p class=\"overlay-text\">", "</p>") if description is not None and description: show["desc"] = helper.cleanup(description) if text.find("Untertitel für Hörgeschädigte") != -1: show["subtitles"] = True return show
def grab(channel, timespan): tz = pytz.timezone("Europe/Berlin") now = datetime.datetime.now(tz) shows = [] a = 0 if now.time().hour < 7: a = -1 for i in range(a, 14): date = now + datetime.timedelta(days=i) text = helper.download("http://www.zdf.de/live-tv?airtimeDate=" + date.strftime("%Y-%m-%d")) if text is None: continue text = helper.cut(text, "<section class=\"b-epg-timeline timeline-" + channel, "</section>") sections = helper.split(text, "<li", "</li>") laststart = datetime.datetime.min.replace(tzinfo=tz) for section in sections: show = {} temp = helper.cut(section, "<span class=\"time\">", "</span>") temp = re.search("(\d\d):(\d\d) - (\d\d):(\d\d)", temp) show["start"] = date.replace(hour=int(temp.group(1)), minute=int(temp.group(2)), second=0, microsecond=0) if show["start"] < laststart: date += datetime.timedelta(days=1) show["start"] += datetime.timedelta(days=1) if (show["start"] - now).total_seconds() / 3600 > timespan: return shows laststart = show["start"] show["stop"] = date.replace(hour=int(temp.group(3)), minute=int(temp.group(4)), second=0, microsecond=0) if show["stop"] < show["start"]: show["stop"] += datetime.timedelta(days=1) temp = re.search("<span class=\"overlay-link-category\">(.*?)<span class=\"visuallyhidden\">:</span></span>\s*(?:<.*>)*\s*(.*?)\s*?</a>", section) if temp.group(1): show["title"] = helper.cleanup(temp.group(1) + " - " + temp.group(2)) else: show["title"] = helper.cleanup(temp.group(2)) temp = re.search("contentUrl\": \"(.*)\"", section) if temp is not None: show["details-url"] = "http://www.zdf.de" + temp.group(1) shows.append(show) return shows
def update(self):
    '''update(self) - Fill Queue with new Slexy IDs'''
    logging.info('[*] Retrieving Slexy ID\'s')
    results = BeautifulSoup(helper.download(self.BASE_URL + '/recent')).find_all(
        lambda tag: tag.name == 'td' and tag.a and '/view/' in tag.a['href'])
    new_pastes = []
    if not self.ref_id:
        results = results[:60]
    for entry in results:
        paste = SlexyPaste(entry.a['href'].replace('/view/', ''))
        # Check to see if we found our last checked URL
        if paste.id == self.ref_id:
            break
        new_pastes.append(paste)
    for entry in new_pastes[::-1]:
        logging.info('[+] Adding URL: ' + entry.url)
        self.put(entry)
def grab(channel, timespan): tz = pytz.timezone("Europe/Berlin") now = datetime.datetime.now(tz) shows = [] a = 0 if now.time().hour < 7: a = -1 for i in range(a, 14): date = now + datetime.timedelta(days=i) datestring = "%s.%s.%s" % (date.day, date.month, date.year) text = helper.download("http://programm.ard.de/TV/Programm/Sender?datum=" + date.strftime("%d.%m.%Y") + "&hour=0&sender=" + channel) if text is None: continue sections = helper.split(text, "<li class=\"eid", "</li>") laststart = datetime.datetime.min.replace(tzinfo=tz) for section in sections: show = {} temp = re.search("<span class=\"date[\s\S]*?(\d\d):(\d\d)", section) show["start"] = date.replace(hour=int(temp.group(1)), minute=int(temp.group(2)), second=0, microsecond=0) if show["start"] < laststart: date += datetime.timedelta(days=1) show["start"] += datetime.timedelta(days=1) if (show["start"] - now).total_seconds() / 3600 > timespan: lastshow = True else: lastshow = False laststart = show["start"] show["title"] = helper.cleanup(re.search("<span class=\"title[\s\S]*?>\s*([^<]*?)[\t\n]", section).group(1)) temp = re.search("<span class=\"subtitle[\s\S]*?>\s*([^<]*?)[\t\n]", section) if temp is not None: subtitle = temp.group(1) if subtitle: show["sub-title"] = helper.cleanup(subtitle) temp = re.search("<a class=\"sendungslink[\s\S]*?href=\"(.*?)\"", section) if temp is not None: show["details-url"] = "http://programm.ard.de" + temp.group(1) shows.append(show) if lastshow: return shows return shows
def update(self):
    '''update(self) - Fill Queue with new Pastie IDs'''
    logging.info('Retrieving Pastie ID\'s')
    results = [
        tag for tag in BeautifulSoup(
            helper.download(self.BASE_URL + '/pastes'),
            'lxml').find_all('p', 'link') if tag.a
    ]
    new_pastes = []
    if not self.ref_id:
        results = results[:60]
    for entry in results:
        paste = PastiePaste(entry.a['href'].replace(
            self.BASE_URL + '/pastes/', ''))
        # Check to see if we found our last checked URL
        if paste.id == self.ref_id:
            break
        new_pastes.append(paste)
    for entry in new_pastes[::-1]:
        if self.put(entry):
            logging.debug('Adding URL: ' + entry.url)
def monitor(self, bot, l_lock, t_lock):
    self.update()
    while(1):
        while not self.empty():
            paste = self.get()
            self.ref_id = paste.id
            with l_lock:
                helper.log('[*] Checking ' + paste.url)
            paste.text = helper.download(paste.url)
            with l_lock:
                tweet = helper.build_tweet(paste)
            if tweet:
                print tweet
                with t_lock:
                    helper.record(tweet)
                    bot.PostUpdate(tweet)
        self.update()
        # If no new results... sleep for 5 sec
        while self.empty():
            with l_lock:
                helper.log('[*] No results... sleeping')
            sleep(SLEEP_SLEXY)
            self.update()
def monitor(self, bot, l_lock, t_lock):
    self.update()
    while(1):
        while not self.empty():
            paste = self.get()
            self.ref_id = paste.id
            with l_lock:
                helper.log('[*] Checking ' + paste.url)
            # goober pastie - Not actually showing *raw* text.. Still need to parse it out
            paste.text = BeautifulSoup(helper.download(paste.url)).pre.text
            with l_lock:
                tweet = helper.build_tweet(paste)
            if tweet:
                print tweet
                with t_lock:
                    helper.record(tweet)
                    bot.PostUpdate(tweet)
        self.update()
        # If no new results... sleep for 5 sec
        while self.empty():
            with l_lock:
                helper.log('[*] No results... sleeping')
            sleep(SLEEP_PASTIE)
            self.update()
def grab(channel, timespan):
    """Read the ZDF event EPG feeds (one per event day), up to `timespan` hours ahead."""
    # for olympia this is https://olympia.zdf.de
    eventurl = "https://european-championships.zdf.de"
    # length of the event in days
    eventduration = 11
    tz = pytz.timezone("UTC")
    now = datetime.datetime.now(tz)
    shows = []
    for i in range(eventduration + 1):
        text = helper.download(eventurl + "/feeds/epg-" + str(i))
        if text is None:
            continue
        events = json.loads(text)["epg-" + str(i)]["data"][int(channel)]["shows"]
        for event in events:
            show = {}
            show["start"] = datetime.datetime.fromtimestamp(event["start"], tz)
            if (show["start"] - now).total_seconds() / 3600 > timespan:
                return shows
            show["stop"] = datetime.datetime.fromtimestamp(event["end"], tz)
            title = event["title"]
            category = event["category"]["name"]
            if category in title:
                show["title"] = title
            else:
                show["title"] = category + ": " + title
            show["desc"] = event["text"]
            show["presenter"] = event["presenter"]
            show["url"] = eventurl + event["url"]
            show["icon"] = "https:" + event["image"]
            shows.append(show)
    return shows
def grab(channel, timespan): tz = pytz.timezone("UTC") now = datetime.datetime.now(tz) shows = [] laststart = datetime.datetime.min.replace(tzinfo=tz) for i in range(1 + timespan // 4): timestamp = int(time.time()) + i * 14400 text = helper.download("https://www.dw.com/epg/data/4765/1/" + str(timestamp) + "000") if text is None: continue channeldata = helper.cut(text, "data-channel-id=\"" + channel + "\"", "data-channel-id") if not channeldata: try: channeldata = text.split("data-channel-id=\"" + channel + "\"")[1] except IndexError: continue sections = helper.split(channeldata, "<div class=\"epgProgram\"", "<div class=\"broadcastlinks\">") for section in sections: show = {} day = helper.cut(section, "data-day=\"", "\"") begintime = helper.cut(section, "data-begin-time=\"", "\"") endtime = helper.cut(section, "data-end-time=\"", "\"") show["start"] = pytz.utc.localize( datetime.datetime.strptime(day + begintime, "%Y-%m-%d%H:%M")) if show["start"] <= laststart: continue if (show["start"] - now).total_seconds() / 3600 > timespan: return shows laststart = show["start"] show["stop"] = pytz.utc.localize( datetime.datetime.strptime(day + endtime, "%Y-%m-%d%H:%M")) if show["stop"] < show["start"]: show["stop"] += datetime.timedelta(days=1) show["title"] = helper.cleanup( helper.cut(section, "<h2 class=\"title\">", "</h2>")) url = helper.cut(section, "<a href=\"", "\">") if url is not None and url: show["url"] = "https://www.dw.com" + url description = helper.cleanup( helper.cut(section, "<ul class=\"topics\">", "</ul>")) if description is not None and description: show["desc"] = description try: icon = re.search("<img[\s\S]*?/>", section).group(0) width = helper.cut(icon, "width=\"", "\"") height = helper.cut(icon, "height=\"", "\"") src = "https://www.dw.com" + helper.cut(icon, "src=\"", "\"") show["icon"] = (src, {"width": width, "height": height}) except (AttributeError, IndexError): pass shows.append(show) return shows
import requests
from helper import download
from bs4 import BeautifulSoup

# Grab the first thumbnail image from the Wikipedia article and save it locally.
url = "https://en.wikipedia.org/wiki/List_of_Dragon_Ball_Z_episodes"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html5lib')

outer_div = soup.find('div', {'class': 'thumb tright'})
img_tag = outer_div.find('img')
img_url = 'https:' + img_tag['src']
print(img_url)

download(img_url, "Goku.jpg")
import os

import numpy as np
import cntk as C

import helper

# Check for an environment variable defined in CNTK's test infrastructure
envvar = 'CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'


def is_test():
    return envvar in os.environ


path = './models/vgg16_weights.bin'
url = 'https://cntk.ai/jup/models/vgg16_weights.bin'

# We check for the model locally
if not os.path.exists(path):
    # If not there we might be running in CNTK's test infrastructure
    if is_test():
        path = os.path.join(os.environ[envvar], 'PreTrainedModels', 'Vgg16', 'v0', path)
    else:
        # If neither is true we download the file from the web
        print('downloading VGG model (~0.5GB)')
        helper.download(url, path)

layers = helper.load_vgg(path)
print('loaded VGG model')


# A convolutional layer in the VGG network
def vggblock(x, arrays, layer_map, name):
    f = arrays[0]
    b = arrays[1]
    k = C.constant(value=f)
    t = C.constant(value=np.reshape(b, (-1, 1, 1)))
    y = C.relu(C.convolution(k, x, auto_padding=[False, True, True]) + t)
    layer_map[name] = y
    return y
def final():
    helper.download(session['title'])
    return send_file('{}.mp3'.format(session['title']),
                     as_attachment=True,
                     attachment_filename='{}.mp3'.format(session['title']))
def get_paste_text(self, paste):
    return BeautifulSoup(helper.download(paste.url)).pre.text
def get_paste_text(self, paste):
    return helper.download(paste.scraping_url)
def test_download():
    url = 'https://www.google.com/logos/doodles/2020/wear-a-mask-save-lives-copy-6753651837108810-s.png'
    img1 = os.path.join('sample_data', 'download.png')
    helper.download(url, img1)
    helper.show(img1)
# ----------------------------------------------------
import json
import pandas as pd
import os
from helper import url_paths, download

# read user inputs
with open('nam_download_inputs.json', 'r') as f:
    inputs = json.load(f)

# format list of dates to download NAM data for
dates_to_download = pd.date_range(inputs['start_date'], inputs['end_date'])
months_to_download = [1, 2, 3, 4, 5, 11, 12]  # only scrape months during ski season
dates_to_download = dates_to_download[dates_to_download.month.isin(months_to_download)]

# function to create list of url paths
url_paths = url_paths(dates_to_download)  # comment out once list of paths generated

# create dataframe of url paths
df_url_paths = pd.DataFrame(url_paths)
df_url_paths.columns = ['url_paths']

# export url paths to csv
df_url_paths.to_csv(os.path.join(inputs['relative_data_path'], 'nam_data_url_paths.csv'),
                    index=False)

# download model wind speed and direction data
df_output = download(os.path.join(inputs['relative_data_path'], 'nam_data_url_paths.csv'),
                     inputs['requested_lat'],
                     inputs['requested_lon'],
                     inputs['pressure_levels'])

# export data to gzip compressed csv
df_output.to_csv(os.path.join(inputs['relative_data_path'], 'nam_data.csv.gz'),
                 index=False, compression='gzip')
# Walk the first two arXiv advanced-search result pages and download the
# first two PDFs listed on each page.
for i in range(2):
    url = "https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=&terms-0-field=title&classification-computer_science=y&classification-physics_archives=all&classification-include_cross_list=include&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=announced_date_first&abstracts=hide&size=50&order=-announced_date_first&start={page_num}"
    url = url.format(page_num=i * 50)
    response = requests.get(url)
    tree = html.fromstring(response.content)

    range_of_files_per_page = "position()<=2"
    filename_xpath = '/html/body/main/div[2]/ol/li[{range_files}]/div/p/a'
    element_xpath = '/html/body/main/div[2]/ol/li[{range_files}]/div/p/span/a[1]'
    filename_xpath = filename_xpath.format(range_files=range_of_files_per_page)
    element_xpath = element_xpath.format(range_files=range_of_files_per_page)
    filenames = tree.xpath(filename_xpath)
    elements = tree.xpath(element_xpath)

    for filename, element in zip(filenames, elements):
        pdf_file_name = filename.text
        pdf_file_name = pdf_file_name.replace(':', '_')
        pdf_file_name = pdf_file_name.replace('.', '_')
        pdf_file_name += ".pdf"
        pdf_file_link = element.attrib['href']
        pdf_file_link += ".pdf"
        download(pdf_file_link, pdf_file_name)
                            dest='tags',
                            help='Tags to Search for',
                            nargs='+',
                            required=True)
parserDownload.add_argument('-f', '--folder',
                            dest='downloadFolder',
                            help='Folder to Download Images to',
                            required=True)
parserDownload.add_argument('-b', '--booru',
                            dest='booru',
                            help='Booru Site to Search',
                            required=True)
parserDownload.add_argument('-p', '--page',
                            dest='page',
                            help='Page to start downloading images from',
                            nargs='?',
                            default=1)
#parserLoad = subParsers.add_parser('load', help='Load New Folders/Images')
#parserLoad.add_argument('-f', '--folder', dest='imageFolder', help='Location of Folder to Load', required=True)

args = parser.parse_args()
config = startup(args.configLoc)

if args.downloadFolder:
    helper.download(config['banned_tags'], config['ratings'], args.tags,
                    args.downloadFolder, args.booru, args.page)
Vector representation of Words
Neural Probabilistic Approach
Skip-Gram Model
Theano Implementation

Github : peymanbey
"""
from __future__ import division
from helper import download, read_data, build_dataset, gen_batch
from math import sqrt
import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
#%%
name = download('text8.zip')
#%%
# store the data into a list
# the list contains the text as sequential words
words = read_data(name)
print 'Data size', len(words)
#%%
# Build a Dictionary and replace rare words with UNK token
# translate the input text in terms of unique numerical IDs
vocab_size = 50000
data, count, dictionary, reverse_dictionary = build_dataset(words, vocab_size)
#%%
# you can delete the 'words' to reduce memory usage
del words
print 'Most common words: ', count[:5]
print 'Sample data:', '\n', data[:10], '\n', [
import os

import helper

project_name = "project"

# Create the project's directory if it doesn't exist already.
helper.softcreate(project_name)

# Download the tutorial's repository
tutorial_path = os.path.join(project_name, "tutorial.zip")
helper.download(
    "https://github.com/EdjeElectronics/TensorFlow-Object-Detection-API-Tutorial-Train-Multiple-Objects-Windows-10/archive/master.zip",
    tutorial_path)
helper.unzip(
    tutorial_path, project_name,
    "TensorFlow-Object-Detection-API-Tutorial-Train-Multiple-Objects-Windows-10-master"
)

# Download the model's directory
model_path = os.path.join(project_name, "model.zip")
helper.download("https://github.com/tensorflow/models/archive/master.zip",
                model_path)
helper.unzip(model_path, project_name, "models-master")
def get_paste_text(self, paste):
    return helper.download(paste.url, paste.headers)