def main():
    """Download every album listed in ALBUM_URLS into the local 'Downloads' folder.

    For each album URL the page is fetched and scraped for song entries, and
    each resolved .mp3 link is downloaded into Downloads/<album name>/.
    Albums that fail to scrape are skipped; a failed page fetch aborts the run.
    """
    session = Session()
    session.headers.update({'User-Agent': USER_AGENT})
    for url in ALBUM_URLS:
        try:
            album_name = decode_url(url.split('/')[-2])
        except Exception:
            # Fall back to a usable directory name; os.path.join(None, ...)
            # would raise TypeError further down.
            album_name = 'unknown_album'
        print("Downloading music from album '{}'\n".format(album_name))
        resp = session.get(url, timeout=30)
        if not resp.ok:
            print('Response code: {}'.format(resp.status_code))
            return
        soup = Soup(resp.text)
        try:
            music_list = soup.scrape(scrape_config)
        except Exception:
            print("Can't download music from album {}. check URL and content selector.\n"
                  .format(album_name))
            continue
        # Hoisted out of the song loop: the target directory is loop-invariant.
        album_dir = os.path.join('Downloads', album_name)
        create_dir(album_dir)
        for song in music_list:
            music_title = remove_whitespace(song.get('music_title', None)) \
                .replace(" (music.com.bd).mp3", "") \
                .replace("{} - ".format(decode_url(url.split('/')[-3])), "")
            music_url = 'https:{}'.format(encode_to_url(
                song.get('music_url'))).replace(".html", "")
            if not music_url.endswith('.mp3'):
                print("Skipping '{}' : not a music file".format(music_title))
                continue
            # Guard the regex: the original called .group(1) unconditionally
            # and crashed with AttributeError on a non-matching title.
            title_match = re.search(MUSIC_TITLE_REGEX, music_title)
            if not title_match:
                print("Skipping '{}' : not a music file".format(music_title))
                continue
            music_title = '{}.mp3'.format(title_match.group(1))
            print('>>> Downloading {}'.format(music_title))
            resp_music = session.get(music_url, timeout=30)
            if not resp_music.ok:
                print(' >>> Download failed : {}'.format(
                    resp_music.status_code))
                continue
            # BUG FIX: the original wrote to <album>/<title>, outside the
            # 'Downloads/<album>' directory it had just created.
            music_path = os.path.join(album_dir, music_title)
            write_binary_file(music_path, resp_music)
            print(' >>> {} downloaded'.format(music_title))
        print("\n>>> Album '{}' downloaded".format(album_name))
        print('----------------------------\n\n')
    print('- Done -')
def add_page(request, category_name_url): context = RequestContext(request) category_name = decode_url(category_name_url) if request.method == 'POST': form = PageForm(request.POST) if form.is_valid(): page = form.save(commit=False) try: cat = Category.objects.get(name=category_name) page.category = cat except Category.DoesNotExist: print category_name return render_to_response('rango/add_category.html',{},context) page.views = 0 page.save() print 'here' return category(request, category_name_url) else: print form.errors else: page = PageForm() return render_to_response('rango/add_page.html',{'category_name_url':category_name_url,'page':page},context)
def worker(index, event, base_url):
    # Long-lived HTTP worker thread (Python 2). Each iteration: block on
    # `event`, pick up the connection registered for this thread index via
    # check_data_for_thread(), serve one request, then clear the slot and
    # the event so the dispatcher can hand over the next connection.
    while True:
        print "thread-{} start".format(index)
        event.wait()
        _time = time.time()
        conn = check_data_for_thread(index)
        data = ""
        new_data = True
        # Read until the first CRLF (end of the request line) or until the
        # peer closes. NOTE(review): only the request line is awaited, so any
        # headers beyond the first CRLF may be left unread in the buffer.
        while b"\r\n" not in data and new_data:
            new_data = conn.recv(1024)
            if new_data:
                data += new_data
            else:
                break
        data = decode_url(data)
        method, path, http_version = http_parser(data)
        if not method or not path or not http_version:
            # Malformed request line: answer with an error header, drop the
            # connection, and recycle this thread.
            # NOTE(review): "405 Bad Gateway" mixes status code 405 (Method
            # Not Allowed) with the 502 reason phrase — confirm intent. The
            # helper name make_40X_resopnse_header is misspelled at its
            # definition site, so it is kept as-is here.
            try:
                conn.send(make_40X_resopnse_header("405 Bad Gateway"))
            except BaseException as e:
                conn.close()
            set_data_for_thread(index, None)
            event.clear()
            continue
        else:
            is_data_type_determinate, data_type = determinate_content_type(path)
            try:
                # No recognisable extension: treat the path as a directory
                # and serve its index.html.
                if not is_data_type_determinate:
                    path += 'index.html'
                data, length = read_file(path, base_url)
            except IOError:
                # 404 for a missing named file, 403 for a missing index.
                if is_data_type_determinate:
                    conn.send(make_40X_resopnse_header("404 Not Found"))
                else:
                    conn.send(make_40X_resopnse_header("403 Forbidden"))
                conn.close()
                set_data_for_thread(index, None)
                event.clear()
                continue
            try:
                header = make_response_header(data_type, length, http_version)
                # GET: header + body; HEAD: header only.
                if method == "GET":
                    data = header + data
                    conn.send(data)
                if method == "HEAD":
                    data = header
                    conn.send(data)
            except BaseException as e:
                # Best-effort: log the failed send and still fall through to
                # the cleanup below.
                print "404 BASE in thread-{}".format(index)
                print path
                print e
        # Per-request cleanup: close the socket (ignoring close errors),
        # free this thread's slot, and re-arm the event gate.
        try:
            conn.close()
        except Exception as e:
            print e
        set_data_for_thread(index, None)
        event.clear()
        print "thread-{} stop".format(index)
def category(request, category_name_url):
    """Django view: render a single category together with its pages.

    Unknown categories render the same template without the 'category'
    and 'pages' context entries.
    """
    context = RequestContext(request)
    category_name = decode_url(category_name_url)
    context_dict = {
        'category_name': category_name,
        'category_name_url': category_name_url,
    }
    try:
        found_category = Category.objects.get(name=category_name)
    except Category.DoesNotExist:
        # Template handles the missing keys itself.
        pass
    else:
        context_dict['pages'] = Page.objects.filter(category=found_category)
        context_dict['category'] = found_category
    return render_to_response('rango/category.html', context_dict, context)
def download_attachment(download_url, download_folder, attachment_id, attachment_duplicate_file_names,
                        attachment_file_matching, depth=0):
    """ Downloads a Confluence attachment, plus its thumbnail and generated preview where applicable.

    (Docstring fixed: the original summary was copy-pasted from the link-repair helper.)

    :param download_url: Confluence download URL.
    :param download_folder: Folder to place downloaded files in.
    :param attachment_id: ID of the attachment to download.
    :param attachment_duplicate_file_names: A dict in the structure {'<sanitized attachment filename>': amount of \
                                            duplicates}
    :param attachment_file_matching: A dict in the structure {'<attachment title>': '<used offline filename>'}
    :param depth: (optional) Hierarchy depth of the handled Confluence page.
    :returns: Path and name of the downloaded file as dict.
    """
    clean_url = utils.decode_url(download_url)
    downloaded_file_name = derive_downloaded_file_name(clean_url)
    downloaded_file_name = provide_unique_file_name(attachment_duplicate_file_names, attachment_file_matching,
                                                    downloaded_file_name)
    downloaded_file_path = download_file(download_url, download_folder, downloaded_file_name, depth=depth)

    # Download the thumbnail as well if the attachment is an image
    clean_thumbnail_url = clean_url.replace('/attachments/', '/thumbnails/', 1)
    downloaded_thumbnail_file_name = derive_downloaded_file_name(clean_thumbnail_url)
    downloaded_thumbnail_file_name = provide_unique_file_name(attachment_duplicate_file_names,
                                                              attachment_file_matching,
                                                              downloaded_thumbnail_file_name)
    if utils.is_file_format(downloaded_thumbnail_file_name, settings.CONFLUENCE_THUMBNAIL_FORMATS):
        # TODO: Confluence creates thumbnails always as PNGs but does not change the file extension to .png.
        download_file(clean_thumbnail_url, download_folder, downloaded_thumbnail_file_name, depth=depth,
                      error_output=False)

    # Download the image preview as well if Confluence generated one for the attachment
    if utils.is_file_format(downloaded_file_name, settings.CONFLUENCE_GENERATED_PREVIEW_FORMATS):
        clean_preview_url = '/rest/documentConversion/latest/conversion/thumbnail/%s/1' % attachment_id
        downloaded_preview_file_name = derive_downloaded_file_name(clean_preview_url)
        downloaded_preview_file_name = provide_unique_file_name(attachment_duplicate_file_names,
                                                                attachment_file_matching,
                                                                downloaded_preview_file_name)
        download_file(clean_preview_url, download_folder, downloaded_preview_file_name, depth=depth,
                      error_output=False)

    return {'file_name': downloaded_file_name, 'file_path': downloaded_file_path}
def paper_details(purl):
    """Flask view: render one paper's details with its authors and institutions.

    Recording the visit in the history table is best-effort and must never
    prevent the page from rendering.
    """
    purl = utils.decode_url(purl)
    try:
        store_history(purl)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. History logging stays best-effort.
        pass
    cursor = g.conn.execute(text("""
        SELECT P.title, P.purl, P.model, PO.url
        FROM Papers P LEFT OUTER JOIN Published_On PO ON P.purl = PO.purl
        WHERE P.purl = :purl;
        """), purl=purl)
    paper = cursor.fetchone()
    # BUG FIX: the first cursor was never closed (only the second one was).
    cursor.close()
    cursor = g.conn.execute(text("""
        SELECT PB.aid, A.first_name, A.last_name, I.iid, I.name
        FROM Papers P RIGHT OUTER JOIN Published_by PB ON P.purl = PB.purl
        INNER JOIN Authors A ON PB.aid = A.aid
        INNER JOIN Works_At WA ON WA.aid = A.aid
        INNER JOIN Institutions I ON I.iid = WA.iid
        WHERE P.purl = :purl;
        """), purl=purl)
    authors = list(cursor.fetchall())
    cursor.close()
    return render_template('paper_details.html', paper=paper, authors=authors)
def get(self):
    # Facebook OAuth callback handler (Python 2, webapp2-style).
    # Three cases: FB returned an error, FB returned an auth code, or this is
    # the initial hit and the user is sent to the FB authorization dialog.
    # NOTE(review): `next` comes straight from the query string and is later
    # used for the final redirect without validation — open-redirect risk;
    # confirm against the callers.
    next = self.request.get('next')
    redirect_uri = settings.APP_DOMAIN + self.uri_for('onfb') + '?next=' + next
    error = self.request.get('error')
    code = self.request.get('code')
    # If error on login
    if error:
        self.response.out.write('Anda gagal login ke Bokerface.com.')
    # If code received
    elif code:
        # Exchange the code for an access token; on failure show the error.
        try:
            token = facebook.get_access_token_from_code(code, redirect_uri,
                                                        settings.FACEBOOK_APP_ID,
                                                        settings.FACEBOOK_APP_SECRET)
        except facebook.GraphAPIError as e:
            self.response.out.write(e)
        else:
            access_token = token['access_token']
            # Get user profile
            graph = facebook.GraphAPI(access_token)
            profile = graph.get_object('me')
            uid = profile.get('id')
            user = User.get_by_key_name(uid)
            # Update already user access-token
            if user:
                if user.access_token != access_token:
                    user.access_token = access_token
                    user.put()
            # Create new user
            else:
                # NOTE(review): '******' contains no % placeholder, so the
                # `%` formatting below raises TypeError at runtime — looks
                # like a redacted format string; confirm the intended value.
                user = User(
                    key_name=str(profile['id']),
                    id=str(profile['id']),
                    username='******' % str(profile['id'])[-4:],
                    name=profile['name'],
                    profile_url=profile['link'],
                    access_token=access_token
                )
                user.put()
            # Save user to session
            self.session['user'] = dict(
                username=user.username,
                name=user.name,
                profile_url=user.profile_url,
                id=user.id,
                access_token=user.access_token,
                is_admin=user.is_admin,
            )
            self.redirect(decode_url(next))
    # Default action, authorize app
    else:
        fbauth_url = u'https://www.facebook.com/dialog/oauth?client_id=%s&scope=publish_actions&redirect_uri=%s' % (
            settings.FACEBOOK_APP_ID,
            redirect_uri
        )
        self.redirect(str(fbauth_url))
def post(self):
    """Redirect the client back to the URL given in the 'to' parameter.

    Falls back to the encoded root path when the parameter is absent.
    """
    destination = self.request.get('to', encode_url('/'))
    self.redirect(decode_url(destination))
def show_results(search_word):
    """Flask view: decode the URL-encoded term and render its search results."""
    decoded_term = utils.decode_url(search_word)
    return render_template('results.html', results=search_term(decoded_term))
def handle_html_references(html_content, page_duplicate_file_names, page_file_matching, depth=0):
    """ Repairs links in the page contents with local links.

    :param html_content: Confluence HTML content.
    :param page_duplicate_file_names: A dict in the structure {'<sanitized filename>': amount of duplicates}
    :param page_file_matching: A dict in the structure {'<page title>': '<used offline filename>'}
    :param depth: (optional) Hierarchy depth of the handled Confluence page.
    :returns: Fixed HTML content.
    """
    try:
        html_tree = html.fromstring(html_content)
    except ParserError:
        print('page is empty')
        return html_content
    except XMLSyntaxError:
        print('%sWARNING: Could not parse HTML content of last page. Original content will be downloaded as it is.'
              % ('\t' * (depth + 1)))
        return html_content

    # Fix links to other Confluence pages
    # Example: /display/TES/pictest1
    #       => pictest1.html
    # TODO: This code does not work for "Recent space activity" areas in space pages because of a different url format.
    xpath_expr = '//a[contains(@href, "/display/")]'
    for link_element in html_tree.xpath(xpath_expr):
        if not link_element.get('class'):
            page_title = link_element.attrib['href'].split('/')[3]
            page_title = page_title.replace('+', ' ')
            decoded_page_title = utils.decode_url(page_title)
            offline_link = provide_unique_file_name(
                page_duplicate_file_names, page_file_matching, decoded_page_title,
                explicit_file_extension='html')
            link_element.attrib['href'] = utils.encode_url(offline_link)

    # Fix links to other Confluence pages when page ids are used
    xpath_expr = '//a[contains(@href, "/pages/viewpage.action?pageId=")]'
    for link_element in html_tree.xpath(xpath_expr):
        if not link_element.get('class'):
            page_id = link_element.attrib['href'].split(
                '/pages/viewpage.action?pageId=')[1]
            offline_link = '%s.html' % utils.sanitize_for_filename(page_id)
            link_element.attrib['href'] = utils.encode_url(offline_link)

    # Fix attachment links
    xpath_expr = '//a[contains(@class, "confluence-embedded-file")]'
    for link_element in html_tree.xpath(xpath_expr):
        file_url = link_element.attrib['href']
        file_name = derive_downloaded_file_name(file_url)
        relative_file_path = '%s/%s' % (settings.DOWNLOAD_SUB_FOLDER, file_name)
        link_element.attrib['href'] = relative_file_path

    # Fix file paths for img tags
    # TODO: Handle non-<img> tags as well if necessary.
    # TODO: Support files with different versions as well if necessary.
    possible_image_xpaths = [
        '//img[contains(@src, "/download/")]',
        '//img[contains(@src, "/rest/documentConversion/latest/conversion/thumbnail/")]'
    ]
    xpath_expr = '|'.join(possible_image_xpaths)
    for img_element in html_tree.xpath(xpath_expr):
        # Replace file path
        file_url = img_element.attrib['src']
        file_name = derive_downloaded_file_name(file_url)
        relative_file_path = '%s/%s' % (settings.DOWNLOAD_SUB_FOLDER, file_name)
        img_element.attrib['src'] = relative_file_path

        # Add alt attribute if it does not exist yet
        # (idiom fix: was `if not 'alt' in img_element.attrib.keys()`)
        if 'alt' not in img_element.attrib:
            img_element.attrib['alt'] = relative_file_path

    return html.tostring(html_tree)