def on_message(self, *args):
    message = args[1].split(':', 3)
    key = int(message[0])
    namespace = message[2]
    if len(message) >= 4:
        data = message[3]
    else:
        data = ''
    if key == 1 and args[1] == '1::':
        self.send_packet_helper(1)
    elif key == 1 and args[1] == '1::{}'.format(self.namespace):
        self.send_packet_helper(5, data={'name': 'initialize'})
        data = {'name': 'join', 'args': ['{}'.format(self._streamer_name)]}
        self.send_packet_helper(5, data=data)
    elif key == 2:
        self.send_packet_helper(2)
    elif key == 5:
        data = json.loads(data)
        if data['name'] == 'message':
            message = data['args'][0]
            sender = html.unescape(message['sender'])
            message = html.unescape(message['text'])
            self._messager.recieve_chat_data(
                sender, message, gui.StatusBarSelector.WatchPeopleCode.value)
def parse_urban(refresh=False):
    try:
        with open('words.json', 'r') as f:
            definitions = json.load(f)
        if refresh:
            raise FileNotFoundError()
    except FileNotFoundError:
        definitions = {}
    links = []
    for char in chars:
        link = popular_link.format(char=char)
        data = urllib.request.urlopen(link).read().decode("utf8")
        found = map(lambda x: (x[0], unescape(x[1])),
                    sample(re.findall(popular_regex, data), items_to_sample))
        links += found
        print("{char} done!".format(char=char))
    for link, word in links:
        data = urllib.request.urlopen(base_link + link).read().decode("utf8")
        definition = re.findall(meaning_regex, data)[0]
        definition = re.sub(to_delete_regex, '', definition)
        definition = re.sub(open_tag_regex, '', definition)
        definition = re.sub(close_tag_regex, '', definition)
        definitions[word] = unescape(definition)
        print("{word} done!".format(word=word))
    print("Done!")
    with open('words.json', 'w+') as f:
        f.write(json.dumps(definitions, ensure_ascii=False))
    return definitions
def getepisodelist(html, url):
    if '"loggedIn":true' not in html:
        raise Exception("you didn't log in!")
    base = re.search("(https?://[^/]+)", url).group(1)
    s = []
    while True:
        safeprint(url)
        startpos = html.index('id="gmi-ResourceStream"')
        ex = re.compile('<a class="thumb[^"]*?" href="({}/art/.+?)" title="(.+?)"'.format(base))
        r = ex.findall(html, startpos)
        for m in r:
            id = re.search(r"\d+$", m[0]).group()
            title = m[1].rpartition(" by ")[0]
            # deviantArt escapes titles twice, so unescape twice
            title = unescape(unescape(title))
            e = comiccrawler.Episode()
            e.firstpageurl = m[0]
            e.title = "{} - {}".format(id, title)
            s.append(e)
        next = re.search('id="gmi-GPageButton"[^>]+?href="([^"]+?)"><span>Next</span>', html)
        if not next:
            break
        url = base + next.group(1).replace("&amp;", "&")
        html = comiccrawler.grabhtml(url, header)
    return s[::-1]
def get_title(html, url):
    if is_pool(url):
        title = unescape(re.search("<h3>Now Viewing: ([^<]+)", html).group(1))
        pool_id = re.search(r"id=(\d+)", url).group(1)
        return "[Gelbooru] {title} ({pool_id})".format(title=title, pool_id=pool_id)
    title = unescape(re.search(r"<title>([^<]+)\| Gelbooru", html).group(1).strip())
    return "[Gelbooru] {title}".format(title=title)
def get_lectures_urllist(url, login_cookie):
    with requests.session() as s:
        s.cookies = login_cookie
        r = s.get(url)
        # Different html structure for videolessons on elearning.polito.it
        # and didattica.polito.it
        if "didattica.polito.it" in url:
            lectures_urllist = re.findall(
                r'href="(sviluppo\.videolezioni\.vis.*lez=\w*)">', r.text)
            for i in range(len(lectures_urllist)):
                lectures_urllist[i] = ('https://didattica.polito.it/pls/portal30/'
                                       + html.unescape(lectures_urllist[i]))
        elif "elearning.polito.it" in url:
            lectures_urllist = re.findall(
                r"href='(template_video\.php\?[^']*)", r.text)
            for i in range(len(lectures_urllist)):
                lectures_urllist[i] = ('https://elearning.polito.it/gadgets/video/'
                                       + html.unescape(lectures_urllist[i]))
        else:
            # Still under development
            new_domain_message()
            exit(1)
            lectures_urllist = ""
    return lectures_urllist
def main():
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('calendar', 'v3', http=http)

    now = datetime.datetime.utcnow().isoformat() + 'Z'  # 'Z' indicates UTC time
    print('Getting the upcoming 50 events')
    eventsResult = service.events().list(
        calendarId='*****@*****.**', timeMin=now, maxResults=50,
        singleEvents=True, orderBy='startTime').execute()
    events = eventsResult.get('items', [])

    username, password = "******", "BOT_PASSWORD"
    subreddit = "mscalstest"
    r = praw.Reddit("MSCals Ticker 0.1")
    print("Logging in")
    r.login(username, password)
    subreddit = r.get_subreddit(subreddit)

    # create string, add title
    sl = list()
    sl.append("Today (GMT):")
    import time
    if not events:
        print('No upcoming events found.')
    for event in events:
        # Pull required info from event, then filter on events that match today's date
        start = event['start'].get('dateTime', event['start'].get('date'))
        fullEv = start[11:16] + ' - ' + event['summary']
        if time.strftime("%Y-%m-%d") == start[0:10]:
            sl.append(fullEv)
            print(sl)

    # Account for no events existing on a given day
    if len(sl) == 1:
        sl.append("No events")

    # Very possibly redundant, from old code
    try:
        config = html.unescape(r.get_wiki_page(subreddit, "sidebar_bot_config").content_md)
    except requests.exceptions.HTTPError:
        print("Couldn't access format wiki page, reddit may be down.")
        raise

    sidebar_string = ' | '.join(sl)

    # Updating sidebar section
    print("Updating sidebar")
    sidebar = r.get_settings(subreddit)
    submit_text = html.unescape(sidebar["submit_text"])
    desc = html.unescape(sidebar['description'])
    startmarker = desc.index("[](#StartMarker)")
    endmarker = desc.index("[](#MarkerEnd)") + len("[](#MarkerEnd)")
    updated_desc = desc.replace(desc[startmarker:endmarker],
                                "[](#StartMarker)" + sidebar_string + "[](#MarkerEnd)")
    if updated_desc != desc:
        subreddit.update_settings(description=updated_desc.encode('utf8'),
                                  submit_text=submit_text)
def write_unitn(cls, out_path, unitn_path, download_path, is_train):
    with open(unitn_path) as unitn_sr, open(download_path) as download_sr, \
            open(out_path, 'a+') as out_sr:
        for unitn_line, download_line in zip(unitn_sr, download_sr):
            doc_id_unitn, label_unitn, text_unitn = \
                re.match(r'\d+\t(\d+)\t(negative|neutral|positive)\t(.+)',
                         unitn_line).groups()
            doc_id_download, label_download, text_download = \
                re.match(r'\d+\t(\d+)\t(negative|neutral|positive)\t(.+)',
                         download_line).groups()
            text_unitn = text_unitn.encode().decode('unicode-escape')
            text_unitn = text_unitn.replace('’', '\'')
            if is_train:
                text_unitn = html.unescape(text_unitn)
                text_unitn = text_unitn.replace('""', '"')
            text_download = html.unescape(html.unescape(text_download))
            assert doc_id_unitn == doc_id_download
            assert label_unitn == label_download
            text = text_unitn
            if text_download != 'Not Available':
                # some differences are impossible to reconcile,
                # some unitn data have the wrong order
                # if re.sub(r'\s+', ' ', text_unitn) != re.sub(r'\s+', ' ', text_download):
                #     logging.error(out_path)
                #     logging.error(text_unitn)
                #     logging.error(text_download)
                # assert re.sub(r'\s+', ' ', text_unitn) == re.sub(r'\s+', ' ', text_download)
                text = text_download
            out_sr.write(json.dumps({'id': doc_id_unitn, 'text': text,
                                     'label': cls.class_map[label_unitn]}) + '\n')
def get_login_cookie(user, passw):
    if user is None:
        user = input("Username: ")
    # The password prompt and the initial login request were masked with
    # "******" in the source; the code below assumes a session `s` whose
    # login response is `r` (getpass for the prompt is an assumption).
    # passw = getpass("Password: ")
    with requests.session() as s:
        if r.url == "https://idp.polito.it:443/idp/profile/SAML2/Redirect/SSO":
            # Login successful, we just need to follow some redirects
            relaystate = html.unescape(
                re.findall('name="RelayState".*value="(.*)"', r.text)[0])
            samlresponse = html.unescape(
                re.findall('name="SAMLResponse".*value="(.*)"', r.text)[0])
            r = s.post(
                'https://www.polito.it/Shibboleth.sso/SAML2/POST',
                data={'RelayState': relaystate, 'SAMLResponse': samlresponse})
            r = s.post('https://login.didattica.polito.it/secure/ShibLogin.php')
            relaystate = html.unescape(
                re.findall('name="RelayState".*value="(.*)"', r.text)[0])
            samlresponse = html.unescape(
                re.findall('name="SAMLResponse".*value="(.*)"', r.text)[0])
            r = s.post(
                'https://login.didattica.polito.it/Shibboleth.sso/SAML2/POST',
                data={'RelayState': relaystate, 'SAMLResponse': samlresponse}
            )
            login_cookie = s.cookies
        else:
            login_cookie = ""
    return login_cookie
def __init__(self, uid, access_token, output_folder, album):
    self.uid = uid
    self.access_token = access_token
    self.output_folder = output_folder
    self.album = album
    self.cpu_count = os.cpu_count()
    if not os.path.exists(self.output_folder):
        os.makedirs(self.output_folder)
    # filenames look like '{:04}_{aid}.mp3'
    self.folder_aids = {x[5:-4] for x in os.listdir(self.output_folder)}
    url = (
        "https://api.vkontakte.ru/method/audio.get.json?"
        "uid={uid}&access_token={access_token}"
    ).format(uid=self.uid, access_token=self.access_token)
    response = urllib.request.urlopen(url)
    content = response.read()
    self._content = json.loads(content.decode('utf-8'))
    self.music_list = self._content['response']
    self.tracks_map = {}
    for ind, track in enumerate(reversed(self.music_list)):
        self.tracks_map[str(track['aid'])] = {
            'index': ind,
            'artist': unescape(track['artist']),
            'title': unescape(track['title']),
            'url': track['url'],
            'output_path': os.path.join(
                output_folder,
                '{}_{}.mp3'.format(format(ind, '04'), track['aid'])),
        }
def spider_xitu_gold(proxy=None):
    source_name = '稀土掘金'
    time1 = int(time.time())
    level = 3
    headers1 = headers.copy()
    headers1.update({
        "x-avoscloud-request-sign": "dd36c74cb860e12f7e12ac1c9c14917f,2139632477696",
        "X-avoscloud-Application-Id": "mhke0kuv33myn4t4ghuid4oq2hjj12li374hvcif202y5bm6"})
    r = trequests.get(
        'https://api.leancloud.cn/1.1/classes/Entry?include=user,user.installation&limit=15&order=-createdAt&where={"tags":{"__type":"Pointer","className":"Tag","objectId":"559a7227e4b08a686d25744f"}}',
        timeout=20, headers=headers1)
    # print(r.text)
    items = r.json().get('results', [])
    if not items:
        logit('%s 解析失败.' % source_name)
        return
    titles = [unescape(i.get('title', '').strip()) for i in items]
    if '' in titles:
        logit('%s 出现空Title字段。' % source_name)
    covers = [re.sub('.*user-gold-cdn.xitu.io.*', '',
                     getlist1(jp(i, '$.screenshot.url'))) for i in items]
    urls = [i.get('originalUrl', 'http://gold.xitu.io/#/tag/Python') for i in items]
    descriptions = [unescape(i.get('createdAt', '').split('T')[0]) for i in items]
    result = [{'title': i[0], '_id': cleanid(i[0]), 'level': level, 'cover': i[1],
               'description': i[2], 'toptime': 0, 'urls': {source_name: i[3]},
               'time': time1}
              for i in zip(titles, covers, descriptions, urls)]
    print('%s finished: %s gotten.' % (source_name, len(result)))
    assert len(result) > 0, '%s 抓取结果为 0。' % source_name
    return result
def spider_bole_article(proxy=None):
    source_name = '伯乐文章'
    time1 = int(time.time())
    level = 3
    r = trequests.get('http://python.jobbole.com/all-posts/',
                      proxies=proxy, **default_args)
    items = fromstring(r.text).xpath(
        '//div[@id="archive"]/div[@class="post floated-thumb"]')
    titles = [unescape(getlist1(i.xpath(
        './div[@class="post-meta"]/p/a[@class="archive-title"]/text()')).strip())
        for i in items]
    if '' in titles:
        logit('%s 出现空Title字段。' % source_name)
    # covers = [getlist1(i.xpath('./div[@class="post-thumb"]//img/@src')) for i in items]
    covers = [''] * len(titles)
    descriptions1 = [unescape(getlist1(i.xpath(
        './div[@class="post-meta"]/span[@class="excerpt"]/p/text()'))[:50])
        for i in items]
    urls = [getlist1(i.xpath('./div[@class="post-thumb"]/a/@href')) for i in items]
    times = [getlist1(i.xpath('./div[@class="post-meta"]/p[1]')).text_content()
             for i in items]
    times = [getlist1(re.findall(r'20\d\d/\d\d/\d\d', i)) for i in times]
    descriptions = ['...'.join(i) for i in zip(descriptions1, times)]
    result = [{'title': i[0], '_id': cleanid(i[0]), 'level': level, 'cover': i[1],
               'description': i[2], 'toptime': 0, 'urls': {source_name: i[3]},
               'time': time1}
              for i in zip(titles, covers, descriptions, urls)]
    print('%s finished: %s gotten.' % (source_name, len(result)))
    assert len(result) > 0, '%s 抓取结果为 0。' % source_name
    return result
def getepisodelist(html, url):
    if '"loggedIn":true' not in html:
        raise PauseDownloadError("you didn't log in!")
    base = search("(https?://[^/]+)", url).group(1)
    s = []
    while True:
        safeprint(url)
        startpos = html.index('id="gmi-ResourceStream"')
        ex = compile('<a class="thumb[^"]*?" href="({}/art/.+?)" title="(.+?)"'.format(base))
        for match in ex.finditer(html, startpos):
            id = search(r"\d+$", match.group(1)).group()
            title = match.group(2).rpartition(" by ")[0]
            # deviantArt escapes titles twice, so unescape twice
            title = unescape(unescape(title))
            s.append(
                Episode(
                    "{} - {}".format(id, title),
                    match.group(1)
                )
            )
        next = search('id="gmi-GPageButton"[^>]+?href="([^"]+?)"><span>Next</span>', html)
        if not next:
            break
        url = base + unescape(next.group(1))
        html = grabhtml(url)
    return s[::-1]
def print_decision_table_row(elem):
    headers = [elem.decision_table.hit_policy[0]]
    for input in elem.decision_table.input_list:
        inp = input.label if input.label is not None else input.expression
        headers.append(inp)
    headers.append('')
    headers.append(elem.decision_table.output.name)
    table = []
    for rule in elem.decision_table.rule_list:
        table_row = [elem.decision_table.rule_list.index(rule) + 1]
        for input in rule.input_list:
            table_row.append(html.unescape(input.value))
        table_row.append('')
        table_row.append(html.unescape(rule.output))
        table.append(table_row)
    print('\n\n ' + elem.name)
    print(tabulate(table, headers, tablefmt='fancy_grid'))
    # table_row = [elem.decision_table.output]
    # table_row.append(elem.decision_table.output)
    # for rule in elem.decision_table.rule_list:
    #     table_row.append(elem.decision_table.input_list[0])
    # table = [table_row]
    # table_row = [elem.decision_table.output]
    # table_row.append(elem.decision_table.output)
    # for rule in elem.decision_table.rule_list:
    #     table_row.append(html.unescape(rule.input_list[0].value))
    # table.append(table_row)
    # for rule in elem.decision_table.rule_list:
    #     table_row = [elem.decision_table.input_list[1]]
    #     table_row.append(html.unescape(rule.input_list[1].value))
    #     table_row.append()
def print_results(options, items):
    query_regexp = options.query_regexp
    try:
        query_regexp = re.compile('({0})'.format(query_regexp))
    except re.error:
        query_regexp = re.compile(r'\Zx')  # never match anything
    for item in items:
        colors.print('{path}:{line}:', pkg=item['package'],
                     path=item['path'], line=item['line'])
        for line in item['ctxp2'], item['ctxp1']:
            line = html.unescape(line)
            colors.print('{t.dim}|{t.off} {line}', line=line)
        line = html.unescape(item['context'])
        template = '{t.dim}>{t.off} '
        chunkdict = {}
        for i, (chunk, matched) in enumerate(xsplit(query_regexp, line)):
            chunkdict['l{0}'.format(i)] = chunk
            template += '{t.bold}'
            if matched:
                template += '{t.yellow}'
            template += '{l' + str(i) + '}{t.off}'
        colors.print(template, **chunkdict)
        for line in item['ctxn1'], item['ctxn2']:
            line = html.unescape(line)
            colors.print('{t.dim}|{t.off} {line}', line=line)
        colors.print('{t.dim}(pathrank {pathrank:.4f}, rank {rank:.4f}){t.off}',
                     pathrank=item['pathrank'],
                     rank=item['ranking'],
                     )
        print()
    sys.stdout.flush()
def get_http_homepage(xmlfile: str, host_ip: str) -> list:
    lines = get_lines_between(xmlfile, host_ip, '<script id="http-homepage"', '</script>')
    if not lines:
        return []
    lines = [re.sub(r'<.+?>', '', _) for _ in lines]
    lines = ''.join(_ for _ in lines)
    header, body, *_ = lines.split('\n\n')
    header = [_.split(':', 1) for _ in header.splitlines() if ':' in _]
    header = flatten_listoflist(
        [k.strip(), v.strip()] for k, v in header
        if k not in ['Date', 'Expires', 'Pragma', 'Connection', 'Content-Type',
                     'ETag', 'Last-Modified', 'Accept-Ranges', 'Content-Length',
                     'Cache-Control'])
    from html import unescape
    header = [unescape(_) for _ in header]
    header = [remove_uuid(_) for _ in header]
    header = flatten_listoflist(re.split(r""",|\ |=|"|;|:|\(|\)""", _) for _ in header)
    body = unescape(unescape(body))
    body = re.sub(r'\\x([0-9A-Fa-f]{2})', lambda m: chr(int(m.group(1), 16)), body)
    body = remove_uuid(body)
    body = remove_html_tags(body)
    body = [_ for _ in re.split(r"""</|/>|<|>|=|"|\ |\t|\n|\r|,|;""", body) if _]
    body = [_ for _ in (strip_punc(_) for _ in body) if _]
    body = [_ for _ in (strip_punc(_) for _ in body) if _]
    return [_.strip() for _ in header + body if _.strip()]
def extract_bilibili_by_api(url):
    """Use the API to get the title and page index."""
    title, page_titles = get_title_by_url(url)
    titles = []
    index = 0  # bug fix: index must exist even when there are no page titles
    if title:
        # unquote html entities
        title = unescape(title.strip())
        # replace / with _
        title = escape_seps(title)
    else:
        title = ""
    titles.append(title)
    if page_titles:
        #debug(text_match.groups())
        index = len(page_titles)
        if index == 1 and page_titles[0] == '':
            index = 0
        debug(index)
        # has multiple pages
        if index > 0:
            for page_title in page_titles:
                # unquote html entities
                page_title = unescape(page_title.strip())
                # replace / with _
                page_title = escape_seps(page_title)
                # delete bilibili page prefix
                titles.append(page_title)
    debug(titles)
    return titles, index
def getTrackInfo(self, context=""): trackInfo = TrackInfo() trackInfo.TrackAuthor = html.unescape(re.search(r"target=\"_blank\">(.*?)</a></TD>", context).group(1)) trackInfo.TrackName = html.unescape(re.search(r"id=\"ctl03_ShowTrackName\">(.*?)</span>", context).group(1)) trackInfo.TrackLength = html.unescape(re.search(r"id=\"ctl03_ShowLength\">(.*?)</span>", context).group(1)) trackInfo.TrackStyle = html.unescape(re.search(r"id=\"ctl03_ShowStyle\">(.*?)</span>", context).group(1)) return trackInfo
def getTmxRecordsData(self, data=""): recordsData = [] context = data while True: record = RecordInfo() record.RecordType = "Tmx" record.Server = "Offline" replay_link = re.search("get.aspx\?action=recordgbx&id=(.*?)\"", context) if replay_link is not None: record.ReplayUrl = html.unescape("https://tmnforever.tm-exchange.com/get.aspx?action=recordgbx&id=" + replay_link.group(1)) else: break index = context.index("</a></td><td>") time = context[index - 13:index - 6] record.Time = time context = context[index + 7:] author = re.search("target=\"_blank\">(.*?)</a></td><td>", context).group(1) record.Player = html.unescape(author) index = context.index("</a></td><td>") context = context[index + 7:] recordsData.append(record) return recordsData
def test_complete_task_input(self, section, area, goal_number, task_number):
    generators = {
        'lower128': {
            'all': self.generate_all_lower_128_unicode_string,
        },
        'high': {
            'limit': self.generate_500_high_unicode_string,
        },
    }
    comment = generators[section][area]()
    self.webapp.click('goal-%s-task-%s_incomplete' % (goal_number, task_number))
    self.fill_form(task_comment=comment)
    self.webapp.click('complete-task')
    element = self.driver.find_elements_by_id(
        'goal-%s-task-%s_complete' % (goal_number, task_number))[0]
    self.webapp.click('task-%s_history' % task_number)
    today = urllib.parse.quote_plus(datetime.datetime.strftime(
        datetime.datetime.today(), '%Y %b %d'))
    comment_id = '{date}_comment'.format(date=today)
    comment_element = self.driver.find_elements_by_id(comment_id)[0]
    comment_text = html.unescape(comment_element.get_attribute('innerHTML'))
    comment_expected = html.unescape(comment)
    assert comment_text == comment_expected, \
        '%s does not match comment %s' % (comment_text, comment_expected)
    self.webapp.click('home-link')
def on_message(self, *args):
    message = args[1].split(':', 3)
    key = int(message[0])
    # namespace = message[2]
    if len(message) >= 4:
        data = message[3]
    else:
        data = ''
    if key == 1 and args[1] == '1::':
        self.send_packet_helper(1)
    elif key == 1 and args[1] == '1::{}'.format(self.namespace):
        self.send_packet_helper(5, data={'name': 'initialize'})
        data = {'name': 'join', 'args': ['{}'.format(self._streamer_name)]}
        self.send_packet_helper(5, data=data)
        self.log.info('Connected to channel with socket io!')
        self.messaging.send_status('CONNECTED')
    elif key == 2:
        self.send_packet_helper(2)
    elif key == 5:
        data = json.loads(data)
        if data['name'] == 'message':
            message = data['args'][0]
            sender = html.unescape(message['sender'])
            message = html.unescape(message['text'])
            self.messaging.send_message(author=sender, message=message)
        elif data['name'] == 'join':
            self.nick = data['args'][1]
def get_dlurl(lecture_url, login_cookie, dl_format='video'):
    with requests.session() as s:
        s.cookies = login_cookie
        r = s.get(lecture_url)
        if "didattica.polito.it" in lecture_url:
            if dl_format == 'video':
                dlurl = re.findall('href="(.*)".*Video', r.text)[0]
            if dl_format == 'iphone':
                dlurl = re.findall('href="(.*)".*iPhone', r.text)[0]
            if dl_format == 'audio':
                dlurl = re.findall('href="(.*)".*Audio', r.text)[0]
            r = s.get('https://didattica.polito.it' + html.unescape(dlurl),
                      allow_redirects=False)
            dlurl = r.headers['location']
        elif "elearning.polito.it" in lecture_url:
            if dl_format == 'video':
                dlurl = re.findall(r'href="(download.php[^\"]*).*video1', r.text)[0]
            if dl_format == 'iphone':
                dlurl = re.findall(r'href="(download.php[^\"]*).*video2', r.text)[0]
            if dl_format == 'audio':
                dlurl = re.findall(r'href="(download.php[^\"]*).*video3', r.text)[0]
            r = s.get('https://elearning.polito.it/gadgets/video/' + html.unescape(dlurl),
                      allow_redirects=False)
            dlurl = r.headers['location']
        else:
            # Still under development
            new_domain_message()
            exit(1)
            dlurl = ""
    return dlurl
def api_get_post(post_url):
    GlobalVars.api_request_lock.acquire()
    # Respect backoff, if we were given one
    if GlobalVars.api_backoff_time > time.time():
        time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
    d = parsing.fetch_post_id_and_site_from_url(post_url)
    if d is None:
        GlobalVars.api_request_lock.release()
        return None
    post_id, site, post_type = d
    if post_type == "answer":
        api_filter = r"!FdmhxNRjn0vYtGOu3FfS5xSwvL"
    else:
        assert post_type == "question"
        api_filter = r"!DEPw4-PqDduRmCwMBNAxrCdSZl81364qitC3TebCzqyF4-y*r2L"
    request_url = "https://api.stackexchange.com/2.2/{}s/{}".format(post_type, post_id)
    params = {
        'filter': api_filter,
        'key': 'IAkbitmze4B8KpacUfLqkw((',
        'site': site
    }
    response = requests.get(request_url, params=params).json()
    if "backoff" in response:
        if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
            GlobalVars.api_backoff_time = time.time() + response["backoff"]
    if 'items' not in response or len(response['items']) == 0:
        GlobalVars.api_request_lock.release()
        return False
    GlobalVars.api_request_lock.release()

    item = response['items'][0]
    post_data = PostData()
    post_data.post_id = post_id
    post_data.post_url = parsing.url_to_shortlink(item['link'])
    post_data.post_type = post_type
    post_data.title = html.unescape(item['title'])
    if 'owner' in item and 'link' in item['owner']:
        post_data.owner_name = html.unescape(item['owner']['display_name'])
        post_data.owner_url = item['owner']['link']
        post_data.owner_rep = item['owner']['reputation']
    else:
        post_data.owner_name = ""
        post_data.owner_url = ""
        post_data.owner_rep = 1
    post_data.site = site
    post_data.body = item['body']
    post_data.score = item['score']
    post_data.up_vote_count = item['up_vote_count']
    post_data.down_vote_count = item['down_vote_count']
    post_data.creation_date = item['creation_date']
    try:
        post_data.last_edit_date = item['last_edit_date']
    except KeyError:
        post_data.last_edit_date = post_data.creation_date  # Key not present = not edited
    if post_type == "answer":
        post_data.question_id = item['question_id']
    return post_data
def generate_user(config, user):
    url = "https://www.soundcloud.com/" + user
    data = rssit.util.download(url)
    soup = bs4.BeautifulSoup(data, 'lxml')

    author = html.unescape(soup.find("meta", attrs={"property": "og:title"})["content"])
    if config["author_username"]:
        author = user

    description = html.unescape(soup.find("p", attrs={"itemprop": "description"}).text).strip()
    if len(description) <= 0:
        description = "%s's soundcloud" % user

    feed = {
        "title": author,
        "description": description,
        "url": url,
        "author": user,
        "entries": []
    }

    tracks = soup.findAll("article", attrs={"itemprop": "track"})
    for track in tracks:
        tracka = track.find("a", attrs={"itemprop": "url"})
        trackname = tracka.text
        trackurl = urllib.parse.urljoin(url, tracka["href"])
        date = parse(track.find("time").text)
        title = trackname

        duration_delta = isodate.parse_duration(
            track.find("meta", attrs={"itemprop": "duration"})["content"])
        duration_seconds = duration_delta.total_seconds()
        duration_text = "[%s:%s:%s]" % (
            str(int(duration_seconds / 3600)).zfill(2),
            str(int((duration_seconds % 3600) / 60)).zfill(2),
            str(int(duration_seconds % 60)).zfill(2)
        )

        content = "<p>%s <a href='%s'>%s</a> by <a href='%s'>%s</a></p>" % (
            duration_text, trackurl, trackname, url, author
        )

        feed["entries"].append({
            "url": trackurl,
            "title": title,
            "content": content,
            "author": user,
            "date": date,
        })

    return ("feed", feed)
def write_download(cls, out_path, download_path):
    with open(download_path) as download_sr, open(out_path, 'a+') as out_sr:
        for line in download_sr:
            doc_id, label, text = re.match(
                r'(?:\d+\t)?(\d+)\t(negative|neutral|positive)\t(.+)', line).groups()
            text = html.unescape(html.unescape(text))
            if text == 'Not Available':
                continue
            out_sr.write(json.dumps({'id': doc_id, 'text': text,
                                     'label': cls.class_map[label]}) + '\n')
def write_test_2015(cls, out_path, input_path, label_path):
    with open(input_path) as in_sr, open(label_path) as labels_sr, \
            open(out_path, 'a+') as out_sr:
        for line, label_line in zip(in_sr, labels_sr):
            doc_id, text = re.match(r'NA\t(T\d+)\tunknwn\t(.+)', line).groups()
            text = html.unescape(html.unescape(text))
            doc_id_label, label = re.match(
                r'\d+\t(T\d+)\t(negative|neutral|positive)', label_line).groups()
            assert doc_id == doc_id_label
            out_sr.write(json.dumps({'id': doc_id, 'text': text,
                                     'label': cls.class_map[label]}) + '\n')
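# Note on the double html.unescape used in write_download/write_test_2015 above:
# the downloaded tweets are entity-encoded twice, so a single pass only gets
# halfway. A minimal illustrative check (the input is an assumption, not from
# the source data):
import html
assert html.unescape('&amp;amp;') == '&amp;'
assert html.unescape(html.unescape('&amp;amp;')) == '&'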
def _path2href(self, match):
    if match.group(0) not in core.listoutdir():
        return match.group(0)
    filename = html.unescape(match.group(0))
    ext = html.unescape(match.group(1))
    whitelist = ['.jpg', '.png', '.svg', '.txt', '.mp4', '.webm'] \
        + list(getattr(config, 'plot_extensions', []))
    fmt = '<a href="{href}"' + (' class="plot"' if ext in whitelist else '') + '>{name}</a>'
    return fmt.format(href=urllib.parse.quote(filename), name=html.escape(filename))
def insert_hyphenation(self, s):
    hs = ''
    if s:
        if self.hyphenator and self.hyphenate and not (self.header or self.subheader):
            hs = ' '.join([self.hyphenate_word(html.unescape(w)) for w in s.split(' ')])
        else:
            hs = html.unescape(s)
    return hs
def checkArtistTypes(types):
    whatArtists = [unescape(x['name'])
                   for x in albumGroup['group']['musicInfo'][types[0]]]
    types.pop(0)
    if len(whatArtists) == 1:
        return whatArtists
    elif len(whatArtists) > 1:
        return whatArtists + [unescape(x['name'])
                              for y in types
                              for x in albumGroup['group']['musicInfo'][y]]
    return checkArtistTypes(types) if len(types) > 0 else []
def represent_question(question):
    return {
        'title': html.unescape(question['title']),
        'owner': html.unescape(question['owner']['display_name']),
        'date': question['creation_date'],
        'link': question['link'],
        'is_answered': question['is_answered'],
    }
def _path2href(self, match):
    if not os.path.exists(os.path.join(self._logdir, match.group(0))):
        return match.group(0)
    filename = html.unescape(match.group(0))
    ext = html.unescape(match.group(1))
    whitelist = ['.jpg', '.png', '.svg', '.txt', '.mp4', '.webm'] \
        + list(core.getprop('plot_extensions', []))
    fmt = '<a href="{href}"' + (' class="plot"' if ext in whitelist else '') + '>{name}</a>'
    return fmt.format(href=urllib.parse.quote(filename), name=html.escape(filename))
# Challenge 4 (tail): print the first event whose categoryCalendar contains
# 'Planetarium' (the loop header was truncated in the source; restored to match
# the identical loop in challenge 6)
for event in events:
    if 'Planetarium' in event['categoryCalendar']:
        print(f"{event['location']}: {event['title']}: {event['time_start']}: {event['categoryCalendar']}")
        break

# Challenge 5: Repeat challenge 4, also printing Cost
print("\nChallenge 5: First event with categoryCalendar ending in 'Planetarium' including 'Cost'")
import html
for event in events:
    if 'Planetarium' in event['categoryCalendar']:
        print(f"{event['location']}: {event['title']}: {event['time_start']}: {event['categoryCalendar']}")
        for field in event['customFields']:
            if field['label'] == 'Cost':
                cost_with_entities = field['value']
                print(cost_with_entities)
                cost = html.unescape(cost_with_entities)
                print(cost)
                break
        break

# Challenge 6: Repeat challenge 4 using a regular expression search for 'Planetarium'
print("\nChallenge 6: First event with categoryCalendar ending in 'Planetarium' including 'Cost' using regular expressions")
import re
for event in events:
    if re.search('Planetarium', event['categoryCalendar']):
        print(f"{event['location']}: {event['title']}: {event['time_start']}: {event['categoryCalendar']}")
        for field in event['customFields']:
            if field['label'] == 'Cost':
                print(html.unescape(field['value']))
                break
        break
def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)
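# A small usage sketch for html_to_plain_text above; the sample input and the
# expected result are illustrative assumptions, not taken from the source.
sample = '<head><title>t</title></head><a href="http://x">link</a><p>5 &gt; 3</p>'
# head block dropped, <a ...> tags become ' HYPERLINK ', remaining tags
# stripped, entities decoded:
assert html_to_plain_text(sample) == ' HYPERLINK link5 > 3'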
def uni_norm(text):
    # Map curly quotes and non-breaking spaces to ASCII equivalents,
    # then decode any HTML entities.
    text = text.translate(
        {0x2018: 0x27, 0x2019: 0x27, 0x201C: 0x22, 0x201D: 0x22, 0xA0: 0x20}
    )
    return unescape(text)
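# Illustrative behavior of uni_norm above (the example input is an assumption):
# U+2018/U+2019 become ', U+201C/U+201D become ", NBSP becomes a plain space,
# and entities such as &amp; are decoded.
assert uni_norm('\u201cit\u2019s\u201d\xa0&amp; more') == '"it\'s" & more'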
"div", {"class": "card__content"}) # find all the cards on the page print(len(days), " number of cards found") for d in days: try: subhead = d.find_all( "div", {"class": "date_day"})[0].decode_contents() event = d.find_all( "h3", {"class": "card__title heading" })[0].find_all("a")[0].decode_contents( ) # all h3 card__title headings have an a element inside if month in subhead and str( int(day) ) in subhead: # events related to other days appear on each day page #print(subhead, event) rows.append((int(m), int(day), unescape(event))) else: pass #print("wrong date") except: pass sleep( 5 ) # sleep for 5 seconds between requests to prevent spamming webserver # takes about 30 minutes to scrape a whole year with open("fun_holidays_4.txt", 'a', encoding="UTF-8") as file: for row in rows: r = '\t'.join([str(x) for x in row]) file.write(f"{r}\n")
def extract_title(entry):
    if entry.get('title'):
        return html.unescape(entry['title'])
def messages(data, user, recipient, report_html, local, time_start, time_end,
             timeformat, operating_system):
    """Function that shows database messages"""
    rep_med = ""  # Saves the complete chat
    rows = len(data.index)
    for i in data.index:
        try:
            report_msj = ""   # Saves each message
            report_name = ""  # Saves the chat sender
            message = ""      # Saves each msg
            sys.stdout.write("\rMessage {}/{}".format(str(i + 1), str(rows)))
            sys.stdout.flush()
            # transform chat time into epoch local time
            time_parse = str(data['Date'][i]) + " " + str(data['Time'][i])
            utc_time = time.strptime(time_parse, timeformat)
            dt = time.mktime(utc_time)
            if time_start <= dt <= time_end:
                sender = str(data['Author'][i])
                if operating_system == "ios":
                    text = getAttachediOS(str(data['Message'][i]))
                else:
                    text = getAttachedAndroid(str(data['Message'][i]))
                if (("Los mensajes y las llamadas están cifrados de extremo a"
                     " extremo. Nadie fuera de este chat, ni siquiera WhatsApp,"
                     " puede leerlos ni escucharlos") in text) or \
                   (("Messages and calls are end-to-end encrypted. No one"
                     " outside of this chat, not even WhatsApp, can read or"
                     " listen to them") in text):
                    sender = "None"
                if sender == user:
                    # The owner posted a message
                    if (report_var == 'EN') or (report_var == 'ES'):
                        report_name = user
                    else:
                        message = Fore.RED + "\n--------------------------------------------------------------------------------" + Fore.RESET + "\n"
                        message += Fore.GREEN + "From " + Fore.RESET + user + Fore.GREEN + " to " + Fore.RESET + recipient + "\n"
                elif sender == "None":
                    # The system posted a message
                    if report_var == 'EN':
                        report_name = "System Message"
                    elif report_var == 'ES':
                        report_name = "Mensaje de Sistema"
                    else:
                        message = Fore.RED + "\n--------------------------------------------------------------------------------" + Fore.RESET + "\n"
                        message += Fore.GREEN + "From " + Fore.RESET + "System\n"
                else:
                    # Another user posted a message
                    if (report_var == 'EN') or (report_var == 'ES'):
                        report_name = "<font color='{}'> {} </font>".format(color.get(sender), sender)
                    else:
                        message = Fore.RED + "\n--------------------------------------------------------------------------------" + Fore.RESET + "\n"
                        message += Fore.GREEN + "From " + Fore.RESET + sender + Fore.GREEN + " to" + Fore.RESET + " Me\n"
                if (report_var == 'EN') or (report_var == 'ES'):
                    report_msj += text
                else:
                    message += Fore.GREEN + "Message: " + Fore.RESET + html.unescape(text) + "\n"
                report_time = "{} - {}".format(str(data['Date'][i]), str(data['Time'][i]))
                if (report_var == 'EN') or (report_var == 'ES'):
                    if report_name == user:
                        rep_med += """
                        <li>
                            <div class="bubble2">
                                <span class="personName2">""" + report_name + """</span><br>
                                <span class="personSay2">""" + report_msj + """</span><br>
                                <span class="time round2">""" + report_time + " " + """</span><br>
                            </div>
                        </li>"""
                    elif (report_name == "System Message") or (report_name == "Mensaje de Sistema"):
                        rep_med += """
                        <li>
                            <div class="bubble-system">
                                <span class="time-system round">""" + report_time + " " + """</span><br>
                                <span class="person-System">""" + report_msj + """</span><br>
                            </div>
                        </li>"""
                    else:
                        rep_med += """
                        <li>
                            <div class="bubble">
                                <span class="personName">""" + report_name + """</span><br>
                                <span class="personSay">""" + report_msj + """</span><br>
                                <span class="time round">""" + report_time + " " + """</span><br>
                            </div>
                        </li>"""
                elif report_var == 'None':
                    message += Fore.GREEN + "Timestamp: " + Fore.RESET + report_time + "\n"
                    print(message)
        except Exception as e:
            print("\nError showing message details: {}, Message ID {}, Timestamp {}".format(
                e, str(i), data['Date'][i] + ", " + data['Time'][i]))
    if report_var != "None":
        report(rep_med, report_html, local)
def extract_link(self, parse_url, link):
    # HTML-decode the link
    link = unescape(link)
    # Check whether the file extension is blacklisted
    filename = os.path.basename(link)
    file_extend = self.get_file_extend(filename)
    is_link = False
    if link.startswith(('http://', 'https://')) and file_extend not in self.black_extend_list:
        full_url = link
    elif link.startswith('javascript:'):
        return False
    elif link.startswith('////') and len(link) > 4:
        full_url = 'http://' + link[2:]
    elif link.startswith('//') and len(link) > 2:
        full_url = 'http:' + link
    elif link.startswith('/'):
        full_url = parse_url.scheme + '://' + parse_url.netloc + link
    elif link.startswith('./'):
        full_url = parse_url.scheme + '://' + parse_url.netloc + parse_url.path + link[1:]
    else:
        full_url = parse_url.scheme + '://' + parse_url.netloc + parse_url.path + '/' + link

    # Parse the domain and root domain of the crawled link
    extract_full_url_domain = extract(full_url)
    root_domain = extract_full_url_domain.domain + '.' + extract_full_url_domain.suffix
    sub_domain = urlparse(full_url).netloc

    # Check whether the crawled link matches one of the keywords
    in_keyword = False
    for keyword in self.keywords:
        if keyword in root_domain:
            in_keyword = True
    if not in_keyword:
        return False

    # Record the root domain
    try:
        self._value_lock.acquire()
        if root_domain not in self.root_domains:
            self.root_domains.append(root_domain)
            logger.info('[+]Find a new root domain ==> {}'.format(root_domain))
            if root_domain not in self.extract_urls:
                self.extract_urls.append(root_domain)
                self.queue.put('http://' + root_domain)
    finally:
        self._value_lock.release()

    # Record the subdomain
    try:
        self._value_lock.acquire()
        if sub_domain not in self.sub_domains and sub_domain != root_domain:
            self.sub_domains.append(sub_domain)
            logger.info('[+]Find a new subdomain ==> {}'.format(sub_domain))
            if sub_domain not in self.extract_urls:
                self.extract_urls.append(sub_domain)
                self.queue.put('http://' + sub_domain)
    finally:
        self._value_lock.release()

    if file_extend in self.black_extend_list:
        return False
    if is_link is True:
        return link

    try:
        self._value_lock.acquire()
        if full_url not in self.apis and file_extend != 'html' and file_extend != 'js':
            self.apis.append(full_url)
            # logger.info('[+]Find a new api in {}'.format(parse_url.netloc))
    finally:
        self._value_lock.release()

    format_url = self.get_format_url(urlparse(full_url), filename, file_extend)
    try:
        self._value_lock.acquire()
        if format_url not in self.extract_urls:
            self.extract_urls.append(format_url)
            self.queue.put(full_url)
    finally:
        self._value_lock.release()
def invoice_validate(self):
    detalles = []
    subtotal = 0
    for factura in self:
        if factura.journal_id.usuario_fel and not factura.firma_fel:
            attr_qname = etree.QName(
                "http://www.w3.org/2001/XMLSchema-instance", "schemaLocation")
            NSMAP = {
                "ds": "http://www.w3.org/2000/09/xmldsig#",
                "dte": "http://www.sat.gob.gt/dte/fel/0.2.0",
            }
            NSMAP_REF = {
                "cno": "http://www.sat.gob.gt/face2/ComplementoReferenciaNota/0.1.0",
            }
            NSMAP_ABONO = {
                "cfc": "http://www.sat.gob.gt/dte/fel/CompCambiaria/0.1.0",
            }
            NSMAP_EXP = {
                "cex": "http://www.sat.gob.gt/face2/ComplementoExportaciones/0.1.0",
            }
            NSMAP_FE = {
                "cfe": "http://www.sat.gob.gt/face2/ComplementoFacturaEspecial/0.1.0",
            }
            DTE_NS = "{http://www.sat.gob.gt/dte/fel/0.2.0}"
            DS_NS = "{http://www.w3.org/2000/09/xmldsig#}"
            CNO_NS = "{http://www.sat.gob.gt/face2/ComplementoReferenciaNota/0.1.0}"
            CFE_NS = "{http://www.sat.gob.gt/face2/ComplementoFacturaEspecial/0.1.0}"
            CEX_NS = "{http://www.sat.gob.gt/face2/ComplementoExportaciones/0.1.0}"
            CFC_NS = "{http://www.sat.gob.gt/dte/fel/CompCambiaria/0.1.0}"

            # GTDocumento = etree.Element(DTE_NS+"GTDocumento", {attr_qname: "http://www.sat.gob.gt/dte/fel/0.2.0"}, Version="0.4", nsmap=NSMAP)
            GTDocumento = etree.Element(DTE_NS + "GTDocumento", {},
                                        Version="0.1", nsmap=NSMAP)
            SAT = etree.SubElement(GTDocumento, DTE_NS + "SAT", ClaseDocumento="dte")
            DTE = etree.SubElement(SAT, DTE_NS + "DTE", ID="DatosCertificados")
            DatosEmision = etree.SubElement(DTE, DTE_NS + "DatosEmision",
                                            ID="DatosEmision")

            tipo_documento_fel = factura.journal_id.tipo_documento_fel
            if tipo_documento_fel in ['FACT', 'FACM'] and factura.type == 'out_refund':
                tipo_documento_fel = 'NCRE'

            moneda = "GTQ"
            if factura.currency_id.id != factura.company_id.currency_id.id:
                moneda = "USD"

            fecha = fields.Date.from_string(factura.date_invoice).strftime('%Y-%m-%d')
            hora = fields.Datetime.context_timestamp(
                factura, timestamp=datetime.now()).strftime('%H:%M:%S')
            fecha_hora = fecha + 'T' + hora

            DatosGenerales = etree.SubElement(
                DatosEmision, DTE_NS + "DatosGenerales",
                CodigoMoneda=moneda,
                FechaHoraEmision=fecha_hora,
                Tipo=tipo_documento_fel)
            if factura.tipo_gasto == 'importacion':
                DatosGenerales.attrib['Exp'] = "SI"

            Emisor = etree.SubElement(
                DatosEmision, DTE_NS + "Emisor",
                AfiliacionIVA="GEN",
                CodigoEstablecimiento=factura.journal_id.codigo_establecimiento_fel,
                CorreoEmisor="",
                NITEmisor=factura.company_id.vat.replace('-', ''),
                NombreComercial=factura.journal_id.direccion.name,
                NombreEmisor=factura.company_id.name)
            DireccionEmisor = etree.SubElement(Emisor, DTE_NS + "DireccionEmisor")
            Direccion = etree.SubElement(DireccionEmisor, DTE_NS + "Direccion")
            Direccion.text = factura.journal_id.direccion.street or 'Ciudad'
            CodigoPostal = etree.SubElement(DireccionEmisor, DTE_NS + "CodigoPostal")
            CodigoPostal.text = factura.journal_id.direccion.zip or '01001'
            Municipio = etree.SubElement(DireccionEmisor, DTE_NS + "Municipio")
            Municipio.text = factura.journal_id.direccion.city or 'Guatemala'
            Departamento = etree.SubElement(DireccionEmisor, DTE_NS + "Departamento")
            Departamento.text = factura.journal_id.direccion.state_id.name \
                if factura.journal_id.direccion.state_id else ''
            Pais = etree.SubElement(DireccionEmisor, DTE_NS + "Pais")
            Pais.text = factura.journal_id.direccion.country_id.code or 'GT'

            nit_receptor = 'CF'
            if factura.partner_id.vat:
                nit_receptor = factura.partner_id.vat.replace('-', '')
            if tipo_documento_fel == "FESP" and factura.partner_id.cui:
                nit_receptor = factura.partner_id.cui

            Receptor = etree.SubElement(
                DatosEmision, DTE_NS + "Receptor",
                IDReceptor=nit_receptor,
                NombreReceptor=factura.partner_id.name)
            if factura.partner_id.nombre_facturacion_fel:
                Receptor.attrib['NombreReceptor'] = factura.partner_id.nombre_facturacion_fel
            if factura.partner_id.email:
                Receptor.attrib['CorreoReceptor'] = factura.partner_id.email
            if tipo_documento_fel == "FESP" and factura.partner_id.cui:
                Receptor.attrib['TipoEspecial'] = "CUI"
            DireccionReceptor = etree.SubElement(Receptor, DTE_NS + "DireccionReceptor")
            Direccion = etree.SubElement(DireccionReceptor, DTE_NS + "Direccion")
            Direccion.text = (factura.partner_id.street or '') + ' ' + \
                (factura.partner_id.street2 or '')
            # Direccion.text = " "
            CodigoPostal = etree.SubElement(DireccionReceptor, DTE_NS + "CodigoPostal")
            CodigoPostal.text = factura.partner_id.zip or '01001'
            Municipio = etree.SubElement(DireccionReceptor, DTE_NS + "Municipio")
            Municipio.text = factura.partner_id.city or 'Guatemala'
            Departamento = etree.SubElement(DireccionReceptor, DTE_NS + "Departamento")
            Departamento.text = factura.partner_id.state_id.name \
                if factura.partner_id.state_id else ''
            Pais = etree.SubElement(DireccionReceptor, DTE_NS + "Pais")
            Pais.text = factura.partner_id.country_id.code or 'GT'

            if tipo_documento_fel not in ['NDEB', 'NCRE', 'RECI', 'NABN', 'FESP']:
                ElementoFrases = etree.fromstring(factura.company_id.frases_fel)
                if factura.tipo_gasto == 'importacion':
                    Frase = etree.SubElement(ElementoFrases, DTE_NS + "Frase",
                                             CodigoEscenario="1", TipoFrase="4")
                DatosEmision.append(ElementoFrases)

            Items = etree.SubElement(DatosEmision, DTE_NS + "Items")
            linea_num = 0
            gran_subtotal = 0
            gran_total = 0
            gran_total_impuestos = 0
            cantidad_impuestos = 0
            for linea in factura.invoice_line_ids:
                if linea.quantity * linea.price_unit == 0:
                    continue
                linea_num += 1
                tipo_producto = "B"
                if linea.product_id.type == 'service':
                    tipo_producto = "S"
                precio_unitario = linea.price_unit * (100 - linea.discount) / 100
                precio_sin_descuento = linea.price_unit
                descuento = precio_sin_descuento * linea.quantity - precio_unitario * linea.quantity
                precio_unitario_base = linea.price_subtotal / linea.quantity
                total_linea = precio_unitario * linea.quantity
                total_linea_base = precio_unitario_base * linea.quantity
                total_impuestos = total_linea - total_linea_base
                cantidad_impuestos += len(linea.invoice_line_tax_ids)

                Item = etree.SubElement(Items, DTE_NS + "Item",
                                        BienOServicio=tipo_producto,
                                        NumeroLinea=str(linea_num))
                Cantidad = etree.SubElement(Item, DTE_NS + "Cantidad")
                Cantidad.text = str(linea.quantity)
                UnidadMedida = etree.SubElement(Item, DTE_NS + "UnidadMedida")
                UnidadMedida.text = "UNI"
                Descripcion = etree.SubElement(Item, DTE_NS + "Descripcion")
                Descripcion.text = linea.name
                PrecioUnitario = etree.SubElement(Item, DTE_NS + "PrecioUnitario")
                PrecioUnitario.text = '{:.6f}'.format(precio_sin_descuento)
                Precio = etree.SubElement(Item, DTE_NS + "Precio")
                Precio.text = '{:.6f}'.format(precio_sin_descuento * linea.quantity)
                Descuento = etree.SubElement(Item, DTE_NS + "Descuento")
                Descuento.text = '{:.6f}'.format(descuento)
                if len(linea.invoice_line_tax_ids) > 0:
                    Impuestos = etree.SubElement(Item, DTE_NS + "Impuestos")
                    Impuesto = etree.SubElement(Impuestos, DTE_NS + "Impuesto")
                    NombreCorto = etree.SubElement(Impuesto, DTE_NS + "NombreCorto")
                    NombreCorto.text = "IVA"
                    CodigoUnidadGravable = etree.SubElement(
                        Impuesto, DTE_NS + "CodigoUnidadGravable")
                    CodigoUnidadGravable.text = "1"
                    if factura.tipo_gasto == 'importacion':
                        CodigoUnidadGravable.text = "2"
                    MontoGravable = etree.SubElement(Impuesto, DTE_NS + "MontoGravable")
                    MontoGravable.text = '{:.2f}'.format(
                        factura.currency_id.round(total_linea_base))
                    MontoImpuesto = etree.SubElement(Impuesto, DTE_NS + "MontoImpuesto")
                    MontoImpuesto.text = '{:.2f}'.format(
                        factura.currency_id.round(total_impuestos))
                Total = etree.SubElement(Item, DTE_NS + "Total")
                Total.text = '{:.2f}'.format(factura.currency_id.round(total_linea))
                gran_total += factura.currency_id.round(total_linea)
                gran_subtotal += factura.currency_id.round(total_linea_base)
                gran_total_impuestos += factura.currency_id.round(total_impuestos)

            Totales = etree.SubElement(DatosEmision, DTE_NS + "Totales")
            if cantidad_impuestos > 0:
                TotalImpuestos = etree.SubElement(Totales, DTE_NS + "TotalImpuestos")
                TotalImpuesto = etree.SubElement(
                    TotalImpuestos, DTE_NS + "TotalImpuesto", NombreCorto="IVA",
                    TotalMontoImpuesto='{:.2f}'.format(
                        factura.currency_id.round(gran_total_impuestos)))
            GranTotal = etree.SubElement(Totales, DTE_NS + "GranTotal")
            GranTotal.text = '{:.2f}'.format(factura.currency_id.round(gran_total))

            if factura.company_id.adenda_fel:
                Adenda = etree.SubElement(SAT, DTE_NS + "Adenda")
                exec(factura.company_id.adenda_fel, {
                    'etree': etree,
                    'Adenda': Adenda,
                    'factura': factura
                })

            # In all of these cases, complements must be sent
            if tipo_documento_fel in ['NDEB', 'NCRE'] or \
                    tipo_documento_fel in ['FCAM'] or \
                    (tipo_documento_fel in ['FACT', 'FCAM'] and
                     factura.tipo_gasto == 'importacion') or \
                    tipo_documento_fel in ['FESP']:
                Complementos = etree.SubElement(DatosEmision, DTE_NS + "Complementos")

                if tipo_documento_fel in ['NDEB', 'NCRE']:
                    Complemento = etree.SubElement(
                        Complementos, DTE_NS + "Complemento",
                        IDComplemento="ReferenciasNota",
                        NombreComplemento="Nota de Credito"
                        if tipo_documento_fel == 'NCRE' else "Nota de Debito",
                        URIComplemento="text")
                    if factura.factura_original_id.numero_fel:
                        ReferenciasNota = etree.SubElement(
                            Complemento, CNO_NS + "ReferenciasNota",
                            FechaEmisionDocumentoOrigen=str(
                                factura.factura_original_id.date_invoice),
                            MotivoAjuste="-",
                            NumeroAutorizacionDocumentoOrigen=factura.factura_original_id.firma_fel,
                            NumeroDocumentoOrigen=factura.factura_original_id.numero_fel,
                            SerieDocumentoOrigen=factura.factura_original_id.serie_fel,
                            Version="0.0", nsmap=NSMAP_REF)
                    else:
                        ReferenciasNota = etree.SubElement(
                            Complemento, CNO_NS + "ReferenciasNota",
                            RegimenAntiguo="Antiguo",
                            FechaEmisionDocumentoOrigen=str(
                                factura.factura_original_id.date_invoice),
                            MotivoAjuste="-",
                            NumeroAutorizacionDocumentoOrigen=factura.factura_original_id.firma_fel,
                            NumeroDocumentoOrigen=factura.factura_original_id.name.split("-")[1],
                            SerieDocumentoOrigen=factura.factura_original_id.name.split("-")[0],
                            Version="0.0", nsmap=NSMAP_REF)

                if tipo_documento_fel in ['FCAM']:
                    Complemento = etree.SubElement(
                        Complementos, DTE_NS + "Complemento",
                        IDComplemento="FCAM",
                        NombreComplemento="AbonosFacturaCambiaria",
                        URIComplemento="#AbonosFacturaCambiaria")
                    AbonosFacturaCambiaria = etree.SubElement(
                        Complemento, CFC_NS + "AbonosFacturaCambiaria",
                        Version="1", nsmap=NSMAP_ABONO)
                    Abono = etree.SubElement(AbonosFacturaCambiaria, CFC_NS + "Abono")
                    NumeroAbono = etree.SubElement(Abono, CFC_NS + "NumeroAbono")
                    NumeroAbono.text = "1"
                    FechaVencimiento = etree.SubElement(Abono, CFC_NS + "FechaVencimiento")
                    FechaVencimiento.text = str(factura.date_due)
                    MontoAbono = etree.SubElement(Abono, CFC_NS + "MontoAbono")
                    MontoAbono.text = '{:.2f}'.format(
                        factura.currency_id.round(gran_total))

                if tipo_documento_fel in ['FACT', 'FCAM'] and \
                        factura.tipo_gasto == 'importacion':
                    Complemento = etree.SubElement(
                        Complementos, DTE_NS + "Complemento",
                        IDComplemento="text",
                        NombreComplemento="text",
                        URIComplemento="text")
                    Exportacion = etree.SubElement(Complemento, CEX_NS + "Exportacion",
                                                   Version="1", nsmap=NSMAP_EXP)
                    NombreConsignatarioODestinatario = etree.SubElement(
                        Exportacion, CEX_NS + "NombreConsignatarioODestinatario")
                    NombreConsignatarioODestinatario.text = \
                        factura.consignatario_fel.name if factura.consignatario_fel else "-"
                    DireccionConsignatarioODestinatario = etree.SubElement(
                        Exportacion, CEX_NS + "DireccionConsignatarioODestinatario")
                    DireccionConsignatarioODestinatario.text = \
                        factura.consignatario_fel.street or "-" if factura.consignatario_fel else "-"
                    NombreComprador = etree.SubElement(Exportacion, CEX_NS + "NombreComprador")
                    NombreComprador.text = \
                        factura.comprador_fel.name if factura.comprador_fel else "-"
                    DireccionComprador = etree.SubElement(
                        Exportacion, CEX_NS + "DireccionComprador")
                    DireccionComprador.text = \
                        factura.comprador_fel.street or "-" if factura.comprador_fel else "-"
                    INCOTERM = etree.SubElement(Exportacion, CEX_NS + "INCOTERM")
                    INCOTERM.text = factura.incoterm_fel or "-"
                    NombreExportador = etree.SubElement(Exportacion, CEX_NS + "NombreExportador")
                    NombreExportador.text = \
                        factura.exportador_fel.name if factura.exportador_fel else "-"
                    CodigoExportador = etree.SubElement(Exportacion, CEX_NS + "CodigoExportador")
                    CodigoExportador.text = \
                        factura.exportador_fel.ref or "-" if factura.exportador_fel else "-"

                if tipo_documento_fel in ['FESP']:
                    total_isr = abs(factura.amount_tax)
                    total_iva_retencion = 0
                    for impuesto in factura.tax_line_ids:
                        if impuesto.amount > 0:
                            total_iva_retencion += impuesto.amount
                    Complemento = etree.SubElement(
                        Complementos, DTE_NS + "Complemento",
                        IDComplemento="text",
                        NombreComplemento="text",
                        URIComplemento="text")
                    RetencionesFacturaEspecial = etree.SubElement(
                        Complemento, CFE_NS + "RetencionesFacturaEspecial",
                        Version="1", nsmap=NSMAP_FE)
                    RetencionISR = etree.SubElement(
                        RetencionesFacturaEspecial, CFE_NS + "RetencionISR")
                    RetencionISR.text = str(total_isr)
                    RetencionIVA = etree.SubElement(
                        RetencionesFacturaEspecial, CFE_NS + "RetencionIVA")
                    RetencionIVA.text = str(total_iva_retencion)
                    TotalMenosRetenciones = etree.SubElement(
                        RetencionesFacturaEspecial, CFE_NS + "TotalMenosRetenciones")
                    TotalMenosRetenciones.text = str(factura.amount_total)

            xml_sin_firma = etree.tostring(GTDocumento, encoding="UTF-8").decode("utf-8")
            logging.warn(xml_sin_firma)

            # signature = xmlsig.template.create(
            #     xmlsig.constants.TransformInclC14N,
            #     xmlsig.constants.TransformRsaSha256,
            #     "Signature"
            # )
            # signature_id = utils.get_unique_id()
            # ref_datos = xmlsig.template.add_reference(
            #     signature, xmlsig.constants.TransformSha256, uri="#DatosEmision"
            # )
            # xmlsig.template.add_transform(ref_datos, xmlsig.constants.TransformEnveloped)
            # ref_prop = xmlsig.template.add_reference(
            #     signature, xmlsig.constants.TransformSha256,
            #     uri_type="http://uri.etsi.org/01903#SignedProperties",
            #     uri="#" + signature_id
            # )
            # xmlsig.template.add_transform(ref_prop, xmlsig.constants.TransformInclC14N)
            # ki = xmlsig.template.ensure_key_info(signature)
            # data = xmlsig.template.add_x509_data(ki)
            # xmlsig.template.x509_data_add_certificate(data)
            # xmlsig.template.x509_data_add_subject_name(data)
            # serial = xmlsig.template.x509_data_add_issuer_serial(data)
            # xmlsig.template.x509_issuer_serial_add_issuer_name(serial)
            # xmlsig.template.x509_issuer_serial_add_serial_number(serial)
            # qualifying = template.create_qualifying_properties(
            #     signature, name=utils.get_unique_id()
            # )
            # props = template.create_signed_properties(
            #     qualifying, name=signature_id, datetime=fecha_hora
            # )
            #
            # GTDocumento.append(signature)
            # ctx = XAdESContext()
            # with open(path.join("/home/odoo/megaprint_leplan", "51043491-6747a80bb6a554ae.pfx"), "rb") as key_file:
            #     ctx.load_pkcs12(crypto.load_pkcs12(key_file.read(), "Planeta123$"))
            # ctx.sign(signature)
            # ctx.verify(signature)
            # DatosEmision.remove(SingatureTemp)
            # xml_con_firma = etree.tostring(GTDocumento, encoding="utf-8").decode("utf-8")

            request_url = "apiv2"
            request_path = ""
            if factura.company_id.pruebas_fel:
                request_url = "dev2.api"
                request_path = ""

            headers = {"Content-Type": "application/xml"}
            data = '<?xml version="1.0" encoding="UTF-8"?><SolicitaTokenRequest><usuario>{}</usuario><apikey>{}</apikey></SolicitaTokenRequest>'.format(
                factura.journal_id.usuario_fel, factura.journal_id.clave_fel)
            r = requests.post(
                'https://' + request_url + '.ifacere-fel.com/' + request_path + 'api/solicitarToken',
                data=data, headers=headers)
            resultadoXML = etree.XML(bytes(r.text, encoding='utf-8'))
            if len(resultadoXML.xpath("//token")) > 0:
                token = resultadoXML.xpath("//token")[0].text
                uuid_factura = str(uuid.uuid5(uuid.NAMESPACE_OID, str(factura.id))).upper()

                headers = {
                    "Content-Type": "application/xml",
                    "authorization": "Bearer " + token
                }
                data = '<?xml version="1.0" encoding="UTF-8"?><FirmaDocumentoRequest id="{}"><xml_dte><![CDATA[{}]]></xml_dte></FirmaDocumentoRequest>'.format(
                    uuid_factura, xml_sin_firma)
                r = requests.post(
                    'https://api.soluciones-mega.com/api/solicitaFirma',
                    data=data.encode('utf-8'), headers=headers)
                logging.warn(r.text)
                resultadoXML = etree.XML(bytes(r.text, encoding='utf-8'))
                if len(resultadoXML.xpath("//xml_dte")) > 0:
                    xml_con_firma = html.unescape(
                        resultadoXML.xpath("//xml_dte")[0].text)

                    headers = {
                        "Content-Type": "application/xml",
                        "authorization": "Bearer " + token
                    }
                    data = '<?xml version="1.0" encoding="UTF-8"?><RegistraDocumentoXMLRequest id="{}"><xml_dte><![CDATA[{}]]></xml_dte></RegistraDocumentoXMLRequest>'.format(
                        uuid_factura, xml_con_firma)
                    logging.warn(data)
                    r = requests.post(
                        'https://' + request_url + '.ifacere-fel.com/' + request_path + 'api/registrarDocumentoXML',
                        data=data.encode('utf-8'), headers=headers)
                    resultadoXML = etree.XML(bytes(r.text, encoding='utf-8'))
                    if len(resultadoXML.xpath("//listado_errores")) == 0:
                        xml_certificado = html.unescape(
                            resultadoXML.xpath("//xml_dte")[0].text)
                        xml_certificado_root = etree.XML(
                            bytes(xml_certificado, encoding='utf-8'))
                        numero_autorizacion = xml_certificado_root.find(
                            ".//{http://www.sat.gob.gt/dte/fel/0.2.0}NumeroAutorizacion")
                        factura.firma_fel = numero_autorizacion.text
                        factura.name = numero_autorizacion.get("Serie") + "-" + \
                            numero_autorizacion.get("Numero")
                        factura.serie_fel = numero_autorizacion.get("Serie")
                        factura.numero_fel = numero_autorizacion.get("Numero")

                        headers = {
                            "Content-Type": "application/xml",
                            "authorization": "Bearer " + token
                        }
                        data = '<?xml version="1.0" encoding="UTF-8"?><RetornaPDFRequest><uuid>{}</uuid></RetornaPDFRequest>'.format(
                            factura.firma_fel)
                        r = requests.post(
                            'https://' + request_url + '.ifacere-fel.com/' + request_path + 'api/retornarPDF',
                            data=data, headers=headers)
                        resultadoXML = etree.XML(bytes(r.text, encoding='utf-8'))
                        if len(resultadoXML.xpath("//listado_errores")) == 0:
                            pdf = resultadoXML.xpath("//pdf")[0].text
                            factura.pdf_fel = pdf
                        else:
                            raise UserError(r.text)
                    else:
                        raise UserError(r.text)
                else:
                    raise UserError(r.text)

    return super(AccountInvoice, self).invoice_validate()
def get_title(html, url):
    return unescape(re.search("<h6[^>]*>([^<]+)", html).group(1)).strip()
print("Path: " + path) print("Start From: " + str(startFrom)) print("Starting test data generation...") print("------------") p = re.compile('[\S]+') f = open("data/test.txt", 'w+') i = 1 for mReview in parse(path): if i > startFrom: if i % 100 == 0: print(i) if p.search(str(mReview['reviewText'])): try: f.write( getMark(int(mReview['overall'])) + "\t" + html.unescape(mReview['reviewText']).lower() + "\n") except UnicodeEncodeError: f.write( getMark(int(mReview['overall'])) + "\t" + mReview['reviewText'] + "\n") i += 1 print("------------") print("End parsing test data!") f.close()
course_credit_description = course_info_table.find_next("p")
try:
    course_credit_type = course_credit_description.string.split(") ")[0][1:]
except AttributeError:
    course_credit_type = course_credit_description.contents[0].split(") ")[0][1:]
if len(course_credit_type) > 30:
    course_credit_type = "N.A"

course_prereqs_raw = course_credit_description.find_next("p")
course_prereqs = " ".join([
    tag.text if type(tag) == bs4.Tag else tag
    for tag in course_prereqs_raw.contents
])
course_prereqs = html.unescape(course_prereqs)
#if type(course_prereqs_raw.contents) == list:
#    course_prereqs = " ".join([tag.text for tag in course_prereqs_raw.contents])
#elif type(course_prereqs_raw.contents) == bs4.Tag:
#    course_prereqs = " ".join(course_prereqs_raw.text)
#else:
#    raise Exception

course_details = course_info_table.find_next("tbody")
course_id = course_details.find_next("td")
course_section = course_id.find_next("td")
course_credits = course_section.find_next("td")
course_capacity = course_credits.find_next("td")
course_enrolled = course_capacity.find_next("td")
course_instructors = course_enrolled.find_next("td")
def test_unescape_method(self):
    from html import unescape
    p = self.get_collector()
    with self.assertWarns(DeprecationWarning):
        # the entity references below were themselves unescaped during
        # extraction; restored so that the test exercises all the variants
        s = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;'
        self.assertEqual(p.unescape(s), unescape(s))
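# For context (an aside, not part of the original test): html.unescape resolves
# known named references even without the trailing semicolon, while a malformed
# numeric reference is left untouched.
from html import unescape
assert unescape('&quot&#bad;') == '"&#bad;'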
def generate_question(self):
    self.current = self.list[self.num]
    self.num += 1
    question = html.unescape(self.current.text)
    return f'Q.{self.num}: {question} '
def _preprocess_v2(self, text: str) -> str:
    text = str(text)
    text = html.unescape(text)
    if self.strip_tashkeel:
        text = araby.strip_tashkeel(text)
    if self.strip_tatweel:
        text = araby.strip_tatweel(text)

    if self.replace_urls_emails_mentions:
        # replace all possible URLs with [رابط]
        for reg in url_regexes:
            text = re.sub(reg, " [رابط] ", text)
        # replace emails with [بريد]
        for reg in email_regexes:
            text = re.sub(reg, " [بريد] ", text)
        # replace mentions with [مستخدم]
        text = re.sub(user_mention_regex, " [مستخدم] ", text)

    if self.remove_html_markup:
        # remove html line breaks
        text = re.sub("<br />", " ", text)
        # remove html markup
        text = re.sub("</?[^>]+>", " ", text)

    if self.map_hindi_numbers_to_arabic:
        text = text.translate(hindi_to_arabic_map)

    # remove characters repeated more than twice
    if self.remove_non_digit_repetition:
        text = self._remove_non_digit_repetition(text)

    # insert whitespace around any character that is not an Arabic letter,
    # an Arabic/English digit, a Latin letter, or one of the two brackets
    if self.insert_white_spaces:
        text = re.sub(
            "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
            r" \1 ",
            text,
        )
        # insert whitespace between words and numbers or numbers and words
        text = re.sub(
            "(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text)
        text = re.sub(
            "([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text)

    if self.replace_slash_with_dash:
        text = text.replace("/", "-")

    # remove unwanted characters
    if self.keep_emojis:
        emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
        rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
        text = re.sub(rejected_chars_regex2, " ", text)
    else:
        text = re.sub(rejected_chars_regex, " ", text)

    # remove extra spaces
    text = " ".join(text.replace("\uFE0F", "").split())

    if (self.model_name == "bert-base-arabertv2"
            or self.model_name == "bert-large-arabertv2"):
        if self.keep_emojis:
            new_text = []
            for word in text.split():
                if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                    new_text.append(word)
                else:
                    new_text.append(self.farasa_segmenter.segment(word))
            text = " ".join(new_text)
        else:
            text = self.farasa_segmenter.segment(text)
        return self._farasa_segment(text)

    # All the other models don't require Farasa segmentation
    return text
def amend_html_symbols(string: str) -> str: return html.unescape(string)
def init_oca(fname, message_stream=sys.stdout):
    if fname == '_':
        fname = fnameDefault

    if message_stream == sys.stdout or message_stream == sys.stderr:
        def message_stream_write(s):
            message_stream.write(removeDiacritic(s))
    else:
        def message_stream_write(s):
            message_stream.write(u'{}'.format(s))

    tstart = datetime.datetime.now()

    fix_bad_license_codes()

    discipline_id = dict(
        (discipline, Discipline.objects.get(name=discipline))
        for discipline in ['Road', 'Track', 'Cyclocross', 'MTB', 'Para']
    )
    discipline_cols = {
        'Road': ['national_road', 'provincial_road'],
        'Cyclocross': ['national_cyclocross', 'provincial_cyclocross'],
        'Track': ['track'],
        'MTB': ['cross_country', 'provincial_cross_country', 'downhill', 'fourx'],
        'Para': ['para_cycling'],
    }
    effective_date = datetime.date.today()

    html_parser = HTMLParser()

    # Process the records in larger transactions for performance.
    @transaction.atomic
    def process_ur_records(ur_records):
        for i, ur in ur_records:
            try:
                date_of_birth = date_from_str(ur.dob)
            except Exception as e:
                message_stream_write(u'Line {}: Invalid birthdate "{}" ({}) {}\n'.format(
                    i, ur.dob, ur, e))
                continue

            attributes = {
                'license_code': ur.license,
                'last_name': ur.last_name,
                'first_name': ur.first_name,
                'gender': gender_from_str(ur.sex),
                'date_of_birth': date_of_birth,
                'state_prov': 'Ontario',
                'nationality': 'Canada',
                'uci_code': ur.uci_code,
            }
            if attributes['uci_code'][:3] != 'CAN':
                attributes['nationality'] = ''
            try:
                attributes['city'] = ur.city
            except AttributeError:
                pass

            try:
                lh = LicenseHolder.objects.get(license_code=ur.license)
                if set_attributes(lh, attributes):
                    lh.save()
            except LicenseHolder.DoesNotExist:
                lh = LicenseHolder(**attributes)
                lh.save()

            message_stream_write(u'{:>6}: {:>8} {:>9} {:>10} {}, {}, ({})\n'.format(
                i, lh.license_code, lh.uci_code,
                lh.date_of_birth.strftime('%Y/%m/%d'),
                lh.last_name, lh.first_name, lh.city))

            team_name = ur.club or ur.trade_team
            TeamHint.objects.filter(license_holder=lh).delete()
            if team_name:
                team_names = [t.strip() for t in team_name.split(',') if t.strip()]
                for count, team_name in enumerate(team_names):
                    team = Team.objects.get_or_create(name=team_name)[0]
                    if count == len(team_names) - 1:
                        for discipline_name, discipline in discipline_id.items():
                            for col_name in discipline_cols[discipline_name]:
                                if getattr(ur, col_name, None):
                                    TeamHint(
                                        license_holder=lh,
                                        team=team,
                                        discipline=discipline,
                                        effective_date=effective_date,
                                    ).save()
                                    break

    ur_records = []
    with io.open(fname, 'r', encoding='utf-8', errors='replace') as fp:
        oca_reader = csv.reader(fp)
        for i, row in enumerate(oca_reader):
            if i == 0:
                # Get the header fields from the first row.
                fields = utils.getHeaderFields([unescape(v.strip()) for v in row])
                message_stream_write(u'Recognized Header Fields:\n')
                message_stream_write(u'----------------------------\n')
                message_stream_write(u'\n'.join(fields) + u'\n')
                message_stream_write(u'----------------------------\n')
                oca_record = namedtuple('oca_record', fields)
                continue
            ur = oca_record(*[unescape(v.strip()) for v in row])
            ur_records.append((i, ur))
            if len(ur_records) == 3000:
                process_ur_records(ur_records)
                ur_records = []

    process_ur_records(ur_records)

    message_stream_write('Initialization in: {}\n'.format(datetime.datetime.now() - tstart))
async def run(self, bot: Bot, event: Event):
    if not isinstance(event, Message):
        return True

    root_args = ''
    root_call = ''
    args = ''
    handler = None

    if event.text:
        try:
            root_call, root_args = SPACE_RE.split(event.text, 1)
        except ValueError:
            root_call = event.text
    elif event.message and event.message.text:
        try:
            root_call, root_args = SPACE_RE.split(event.message.text, 1)
        except ValueError:
            root_call = event.message.text

    if root_call == bot.config.PREFIX + self.name:
        for c in self.route_list:
            if c.subtype == event.subtype or c.subtype == '*':
                # root_args stays empty when the message had no arguments,
                # so test for emptiness rather than None.
                if not root_args:
                    if c.name is None:
                        handler = c.handler
                        break
                else:
                    try:
                        call, args = SPACE_RE.split(root_args, 1)
                    except ValueError:
                        call = root_args
                    if c.name == call:
                        handler = c.handler
                        break
        else:
            handler = Handler(self.fallback)

        if handler:
            raw = html.unescape(args)
            func_params = handler.params
            try:
                chunks = split_chunks(raw, self.use_shlex)
            except ValueError:
                await bot.say(event.channel, '*Error*: Can not parse this command')
                return False

            try:
                kw, remain_chunks = parse_option_and_arguments(handler, chunks)
            except SyntaxError as e:
                await bot.say(event.channel, '*Error*\n{}'.format(e))
                return False

            with self.prepare_kwargs(
                bot=bot,
                event=event,
                func_params=func_params,
                **kw,
            ) as kwargs:
                return await handler(**kwargs)

    return True
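# The try/except ValueError idiom above splits a command from its arguments;
# a standalone sketch, assuming SPACE_RE is a plain whitespace pattern
# (the real pattern lives elsewhere in the bot's source).
import re

SPACE_RE = re.compile(r'\s+')

def split_call(text):
    try:
        call, args = SPACE_RE.split(text, 1)
    except ValueError:  # no whitespace: split() returns a single element
        call, args = text, ''
    return call, args

print(split_call('weather Seoul'))  # ('weather', 'Seoul')
print(split_call('help'))           # ('help', '')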
def basic_clean(text):
    text = ftfy.fix_text(text)
    # Unescape twice to also handle doubly-escaped entities such as "&amp;amp;".
    text = html.unescape(html.unescape(text))
    return text.strip()
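# Why basic_clean unescapes twice: text that was HTML-escaped two times needs
# two passes. A quick check with an invented sample:
import html

s = "Tom &amp;amp; Jerry"
print(html.unescape(s))                 # 'Tom &amp; Jerry' (still escaped once)
print(html.unescape(html.unescape(s)))  # 'Tom & Jerry'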
def next_question(self):
    self.current_question = self.question_list[self.question_number]
    self.question_number += 1
    q_text = html.unescape(self.current_question.text)
    return f"Q.{self.question_number}: {q_text} "
def test_views_lti_development_post_bypass_lti_instructor_no_video(self):
    """When bypassing LTI, the "example.com" consumer site is automatically created."""
    data = {
        "resource_link_id": "example.com-123",
        "context_id": "course-v1:ufr+mathematics+00001",
        "roles": "instructor",
        "tool_consumer_instance_guid": "example.com",
        "user_id": "56255f3807599c377bf0e5bf072359fd",
    }
    response = self.client.post(
        "/lti/videos/{!s}".format(uuid.uuid4()),
        data,
        HTTP_REFERER="https://example.com",
    )
    self.assertEqual(response.status_code, 200)
    self.assertContains(response, "<html>")
    content = response.content.decode("utf-8")

    match = re.search(
        '<div id="marsha-frontend-data" data-context="(.*)">', content
    )

    context = json.loads(unescape(match.group(1)))
    jwt_token = AccessToken(context.get("jwt"))
    video = Video.objects.get()
    self.assertEqual(jwt_token.payload["resource_id"], str(video.id))
    self.assertEqual(jwt_token.payload["user_id"], data["user_id"])
    self.assertEqual(jwt_token.payload["context_id"], data["context_id"])
    self.assertEqual(jwt_token.payload["roles"], [data["roles"]])
    self.assertEqual(jwt_token.payload["locale"], "en_US")
    self.assertDictEqual(
        jwt_token.payload["course"],
        {"school_name": "ufr", "course_name": "mathematics", "course_run": "00001"},
    )

    self.assertEqual(context.get("state"), "success")
    self.assertEqual(
        context.get("resource"),
        {
            "active_stamp": None,
            "is_ready_to_show": False,
            "show_download": True,
            "description": video.description,
            "id": str(video.id),
            "upload_state": "pending",
            "timed_text_tracks": [],
            "thumbnail": None,
            "title": video.title,
            "urls": None,
            "should_use_subtitle_as_transcript": False,
            "has_transcript": False,
            "playlist": {
                "title": "course-v1:ufr+mathematics+00001",
                "lti_id": "course-v1:ufr+mathematics+00001",
            },
            "live_state": None,
            "live_info": {},
        },
    )
    self.assertEqual(context.get("modelName"), "videos")
    # The consumer site was created with a name and a domain name
    ConsumerSite.objects.get(name="example.com", domain="example.com")
def get_article(article_url, article_i, total):
    article_url = normurl(article_url)
    myjson = {"entries": []}

    basetext = "(%i/%i) " % (article_i, total)
    article_i += 1

    sys.stderr.write(basetext + "Downloading %s... " % article_url)
    sys.stderr.flush()

    try:
        data = download(article_url)
    except Exception:
        sys.stderr.write("failed!\n")
        sys.stderr.flush()
        return

    sys.stderr.write("\r" + basetext + "Processing %s... " % article_url)
    sys.stderr.flush()

    soup = bs4.BeautifulSoup(data, 'lxml')

    jsondata = soup.find(attrs={"type": "application/ld+json"}).text
    jsondecode = rssit.util.json_loads(jsondata)

    sitetitle = html.unescape(
        soup.find("meta", attrs={"property": "og:site_name"})["content"])
    myjson["title"] = sitetitle
    myjson["author"] = sitetitle
    author = sitetitle

    title = html.unescape(jsondecode["headline"])
    date = parse(jsondecode["datePublished"])
    album = "[" + str(date.year)[-2:] + str(date.month).zfill(2) + str(
        date.day).zfill(2) + "] " + title

    article_selectors = [
        ".entry .article",
        ".article_post",
        "#content",
        "#mArticle",
        "#article",
        "#entire > #contents > .post",
        "#main .article-desc",
        "section > article",
    ]

    for selector in article_selectors:
        articletag = soup.select(selector)
        if articletag:
            articletag = articletag[0]
            break

    if not articletag:
        sys.stderr.write("failed!\n")
        sys.stderr.flush()
        return

    images = []
    videos = []

    articlestr = str(articletag)
    re_images = re.findall(
        r"(https?://cfile\d+\.uf\.tistory\.com/(image|attach|original)/\w+)",
        articlestr)
    for image in re_images:
        url = get_full_image(article_url, image[0])
        if url not in images:
            images.append(url)

    lightboxes = articletag.findAll(attrs={"data-lightbox": True})
    for lightbox in lightboxes:
        image = get_full_image(article_url, lightbox["data-url"])
        if image not in images:
            images.append(image)

    imageblocks = articletag.select("p img, .imageblock img")
    for image in imageblocks:
        # Tag.__contains__ checks children, not attributes; use has_attr,
        # and only fall back to src when there is no onclick handler.
        if image.has_attr("onclick"):
            url = re.sub(r"^open_img\(['\"](.*)['\"]\)$", r"\1", image["onclick"])
        else:
            url = image["src"]
        url = get_full_image(article_url, url)
        if url not in images:
            images.append(url)

    videotags = articletag.select("video")
    for video in videotags:
        if video.has_attr("src"):
            url = video["src"]
        else:
            sources = video.select("source")
            if len(sources) > 0:
                url = sources[0]["src"]
            else:
                continue
        videos.append({
            "image": None,
            "video": get_full_image(article_url, url)
        })

    myjson["entries"].append({
        "caption": title,
        "url": article_url,
        "album": album,
        "date": date,
        "author": author,
        "images": images,
        "videos": videos
    })

    sys.stderr.write("done\n")
    sys.stderr.flush()

    return myjson
def test_views_lti_development_post_bypass_lti_instructor(self):
    """In development, passport creation and LTI verif can be bypassed for an instructor."""
    video = VideoFactory(
        playlist__consumer_site__domain="example.com",
        playlist__title="foo bar",
        playlist__lti_id="course-v1:ufr+mathematics+00001",
    )
    data = {
        "resource_link_id": video.lti_id,
        "context_id": video.playlist.lti_id,
        "roles": "instructor",
        "tool_consumer_instance_guid": "example.com",
        "context_title": "mathematics",
        "tool_consumer_instance_name": "ufr",
        "user_id": "56255f3807599c377bf0e5bf072359fd",
    }
    response = self.client.post(
        "/lti/videos/{!s}".format(video.pk),
        data,
        HTTP_REFERER="https://example.com",
    )
    self.assertEqual(response.status_code, 200)
    self.assertContains(response, "<html>")
    content = response.content.decode("utf-8")

    match = re.search(
        '<div id="marsha-frontend-data" data-context="(.*)">', content
    )

    context = json.loads(unescape(match.group(1)))
    jwt_token = AccessToken(context.get("jwt"))
    self.assertEqual(jwt_token.payload["resource_id"], str(video.id))
    self.assertEqual(jwt_token.payload["user_id"], data["user_id"])
    self.assertEqual(jwt_token.payload["context_id"], data["context_id"])
    self.assertEqual(jwt_token.payload["roles"], [data["roles"]])
    self.assertEqual(jwt_token.payload["locale"], "en_US")
    self.assertEqual(
        jwt_token.payload["permissions"],
        {"can_access_dashboard": True, "can_update": True},
    )
    self.assertDictEqual(
        jwt_token.payload["course"],
        {"school_name": "ufr", "course_name": "mathematics", "course_run": "00001"},
    )

    self.assertEqual(context.get("state"), "success")
    self.assertEqual(
        context.get("resource"),
        {
            "active_stamp": None,
            "is_ready_to_show": False,
            "show_download": True,
            "description": video.description,
            "id": str(video.id),
            "upload_state": "pending",
            "timed_text_tracks": [],
            "thumbnail": None,
            "title": video.title,
            "urls": None,
            "should_use_subtitle_as_transcript": False,
            "has_transcript": False,
            "playlist": {
                "title": "foo bar",
                "lti_id": "course-v1:ufr+mathematics+00001",
            },
            "live_state": None,
            "live_info": {},
        },
    )
    self.assertEqual(context.get("modelName"), "videos")
re.sub(r" *\n *", "\n", re.sub(r" +", " ", re.sub(r"\r", "", plaintext)))) if len(plaintext) > 0: # Guessing MIME of the file (checked on original content) mime = magic.from_buffer(text, mime=True) mimeFile.write(mime.encode() + b"\n") urlFile.write(url.encode() + b"\n") langFile.write(lang.encode() + b"\n") encodingFile.write(orig_encoding.encode() + b"\n") b64norm = base64.b64encode(cleantree.encode()) normHtmlFile.write(b64norm + b"\n") if options.boilerpipe: b64deboil = base64.b64encode(deboiled.encode()) deboilFile.write(b64deboil + b"\n") b64text = base64.b64encode(html.unescape(plaintext).encode()) plainTextFile.write(b64text + b"\n") urlFile.close() langFile.close() encodingFile.close() mimeFile.close() normHtmlFile.close() plainTextFile.close() # Boilerpipe cleaning is optional if options.boilerpipe: deboilFile.close()
def sanitize(self):
    super(Cataclysmic, self).sanitize()

    # Calculate some columns based on imported data, sanitize some fields
    name = self[self._KEYS.NAME]
    aliases = self.get_aliases()

    if (name.startswith('SN') and is_number(name[2:6]) and
            self._KEYS.DISCOVER_DATE in self and
            int(self[self._KEYS.DISCOVER_DATE][0][QUANTITY.VALUE]
                .split('/')[0]) >= 2016 and
            not any(['AT' in x for x in aliases])):
        source = self.add_self_source()
        self.add_quantity(self._KEYS.ALIAS, 'AT' + name[2:], source)

    if self._KEYS.CLAIMED_TYPE in self:
        # FIX: this is something that should be done completely internally
        # i.e. add it to `clean` or something??
        self[self._KEYS.CLAIMED_TYPE] = self.ct_list_prioritized()
    if self._KEYS.CLAIMED_TYPE in self:
        self[self._KEYS.CLAIMED_TYPE][:] = [
            ct for ct in self[self._KEYS.CLAIMED_TYPE]
            if (ct[QUANTITY.VALUE] != '?' and ct[QUANTITY.VALUE] != '-')
        ]
        if not len(self[self._KEYS.CLAIMED_TYPE]):
            del self[self._KEYS.CLAIMED_TYPE]
    if self._KEYS.CLAIMED_TYPE not in self and name.startswith('AT'):
        source = self.add_self_source()
        self.add_quantity(self._KEYS.CLAIMED_TYPE, 'Candidate', source)

    if self._KEYS.SOURCES in self:
        for source in self[self._KEYS.SOURCES]:
            if SOURCE.BIBCODE in source:
                import urllib
                from html import unescape
                # First sanitize the bibcode
                if len(source[SOURCE.BIBCODE]) != 19:
                    source[SOURCE.BIBCODE] = urllib.parse.unquote(
                        unescape(source[SOURCE.BIBCODE])).replace('A.A.', 'A&A')
                if source[SOURCE.BIBCODE] in self.catalog.biberror_dict:
                    source[SOURCE.BIBCODE] = \
                        self.catalog.biberror_dict[source[SOURCE.BIBCODE]]

                if source[SOURCE.BIBCODE] not in self.catalog.bibauthor_dict:
                    bibcode = source[SOURCE.BIBCODE]
                    adsquery = (self.catalog.ADS_BIB_URL +
                                urllib.parse.quote(bibcode) +
                                '&data_type=Custom&format=%253m%20%25(y)')
                    bibcodeauthor = ''
                    try:
                        response = urllib.request.urlopen(adsquery)
                        # Use a distinct name so the `html` module isn't shadowed.
                        page = response.read().decode('utf-8')
                        hsplit = page.split("\n")
                        if len(hsplit) > 5:
                            bibcodeauthor = hsplit[5]
                    except Exception:
                        pass
                    if not bibcodeauthor:
                        warnings.warn(
                            "Bibcode didn't return authors; not converting "
                            "this bibcode.")
                    self.catalog.bibauthor_dict[bibcode] = unescape(
                        bibcodeauthor).strip()

        for source in self[self._KEYS.SOURCES]:
            if (SOURCE.BIBCODE in source and
                    source[SOURCE.BIBCODE] in self.catalog.bibauthor_dict and
                    self.catalog.bibauthor_dict[source[SOURCE.BIBCODE]]):
                source[SOURCE.REFERENCE] = self.catalog.bibauthor_dict[
                    source[SOURCE.BIBCODE]]
            if (SOURCE.NAME not in source and SOURCE.BIBCODE in source and
                    source[SOURCE.BIBCODE]):
                source[SOURCE.NAME] = source[SOURCE.BIBCODE]

    if self._KEYS.REDSHIFT in self:
        self[self._KEYS.REDSHIFT] = list(sorted(
            self[self._KEYS.REDSHIFT],
            key=lambda q: frame_priority(q, self._KEYS.REDSHIFT)))
    if self._KEYS.VELOCITY in self:
        self[self._KEYS.VELOCITY] = list(sorted(
            self[self._KEYS.VELOCITY],
            key=lambda q: frame_priority(q, self._KEYS.VELOCITY)))
    if self._KEYS.CLAIMED_TYPE in self:
        self[self._KEYS.CLAIMED_TYPE] = self.ct_list_prioritized()

    # Renumber and reorder sources
    if self._KEYS.SOURCES in self:
        # Sort sources reverse-chronologically
        self[self._KEYS.SOURCES] = sorted(
            self[self._KEYS.SOURCES], key=lambda x: bib_priority(x))

        # Assign new aliases to match new order
        source_reps = OrderedDict(
            [[x[SOURCE.ALIAS], str(i + 1)]
             for i, x in enumerate(self[self._KEYS.SOURCES])])
        for i, source in enumerate(self[self._KEYS.SOURCES]):
            self[self._KEYS.SOURCES][i][SOURCE.ALIAS] = source_reps[
                source[SOURCE.ALIAS]]

        # Change sources to match new aliases
        for key in self.keys():
            if self._KEYS.get_key_by_name(key).no_source:
                continue
            for item in self[key]:
                aliases = [
                    str(y) for y in sorted(
                        int(source_reps[x])
                        for x in item[item._KEYS.SOURCE].split(','))
                ]
                item[item._KEYS.SOURCE] = ','.join(aliases)
def get_presentes(pk, response, materia):
    if type(materia) == OrdemDia:
        presentes = PresencaOrdemDia.objects.filter(sessao_plenaria_id=pk)
    else:
        presentes = SessaoPlenariaPresenca.objects.filter(
            sessao_plenaria_id=pk)

    sessao = SessaoPlenaria.objects.get(id=pk)
    num_presentes = len(presentes)
    data_sessao = sessao.data_inicio

    oradores = OradorExpediente.objects.filter(
        sessao_plenaria_id=pk).order_by('numero_ordem')
    oradores_list = []
    for o in oradores:
        oradores_list.append({
            'nome': o.parlamentar.nome_parlamentar,
            'numero': o.numero_ordem
        })

    presentes_list = []
    for p in presentes:
        legislatura = sessao.legislatura
        # Fetch that parliamentarian's mandates for the legislature.
        mandatos = p.parlamentar.mandato_set.filter(legislatura=legislatura)

        if p.parlamentar.ativo and mandatos:
            filiacao = filiacao_data(p.parlamentar, data_sessao, data_sessao)
            if not filiacao:
                partido = 'Sem Registro'
            else:
                partido = filiacao
            presentes_list.append({
                'id': p.id,
                'parlamentar_id': p.parlamentar.id,
                'nome': p.parlamentar.nome_parlamentar,
                'partido': partido,
                'voto': ''
            })
        else:
            # Inactive or mandate-less parliamentarians don't count as present.
            num_presentes -= 1

    if materia:
        tipo_votacao = ''  # default when the vote type is unknown
        if materia.tipo_votacao == 1:
            tipo_votacao = 'Simbólica'
        elif materia.tipo_votacao == 2:
            tipo_votacao = 'Nominal'
        elif materia.tipo_votacao == 3:
            tipo_votacao = 'Secreta'
        response.update({
            'tipo_resultado': materia.resultado,
            'observacao_materia': html.unescape(materia.observacao),
            'tipo_votacao': tipo_votacao,
            'materia_legislativa_texto': str(materia.materia)
        })

    presentes_list = sort_lista_chave(presentes_list, 'nome')

    response.update({
        'presentes': presentes_list,
        'num_presentes': num_presentes,
        'oradores': oradores_list,
        'msg_painel': str(_('Votação aberta!')),
    })

    return response
    else:
        data['uselang'] = 'zh-cn'
    r = requests.post('https://zh.wikipedia.org/w/api.php', data=data, headers=headers)
    try:
        result = r.json()
    except Exception as e:
        print(e)
        print(r.text)
        continue
    result = result['parse']['text']['*']
    matches = re.findall(r'<div id="text(\d+)">(.+?)</div>', result)
    for match in matches:
        idx = int(match[0])
        newtext = html.unescape(match[1]).replace('\\n', '\\\\n')
        if args.mode == 1:
            newregex = r'\g<1>\g<2>\g<3>{}\g<5>'.format(newtext)
        else:
            newregex = r'\g<1>{}\g<3>\g<4>\g<5>'.format(newtext)
        jstext = re.sub(
            r"(wgULS\(')({})(',[\s\n]*?')({})('\))".format(
                re.escape(messages[idx][0]),
                re.escape(messages[idx][1])),
            newregex,
            jstext,
        )
    jstext = re.sub(r"wgULS\('(.+?)',[\s\n]*?'\1'\)", r"'\1'", jstext)
    with open(full_filename, 'w', encoding='utf8') as f:
def print_info(self, req, req_body, res, res_body):
    def _parse_qsl(s):
        return '\n'.join(
            "%-20s %s" % (k, v)
            for k, v in parse_qsl(s, keep_blank_values=True))

    if not options['debug']:
        return

    req_header_text = "%s %s %s\n%s" % (
        req.method, req.uri, self.request_version, req.headers)

    if res is not None:
        reshdrs = res.headers
        if type(reshdrs) == dict or 'CaseInsensitiveDict' in str(type(reshdrs)):
            reshdrs = ''
            for k, v in res.headers.items():
                if k in plugins.IProxyPlugin.proxy2_metadata_headers.values():
                    continue
                if k.lower().startswith('x-proxy2-'):
                    continue
                reshdrs += '{}: {}\n'.format(k, v)
        res_header_text = "%s %d %s\n%s" % (
            res.response_version, res.status, res.reason, reshdrs)

    self.logger.trace("==== REQUEST ====\n%s" % req_header_text,
                      color=ProxyLogger.colors_map['yellow'])

    u = urlparse(req.uri)
    if u.query:
        query_text = _parse_qsl(u.query)
        self.logger.trace("==== QUERY PARAMETERS ====\n%s\n" % query_text,
                          color=ProxyLogger.colors_map['green'])

    cookie = req.headers.get('Cookie', '')
    if cookie:
        cookie = _parse_qsl(re.sub(r';\s*', '&', cookie))
        self.logger.trace("==== COOKIES ====\n%s\n" % cookie,
                          color=ProxyLogger.colors_map['green'])

    auth = req.headers.get('Authorization', '')
    if auth.lower().startswith('basic'):
        import base64  # str.decode('base64') is Python 2 only; use the base64 module
        token = base64.b64decode(auth.split()[1]).decode(errors='replace')
        self.logger.trace("==== BASIC AUTH ====\n%s\n" % token,
                          color=ProxyLogger.colors_map['red'])

    if req_body is not None:
        req_body_text = None
        content_type = req.headers.get('Content-Type', '')

        if content_type.startswith('application/x-www-form-urlencoded'):
            req_body_text = _parse_qsl(req_body)
        elif content_type.startswith('application/json'):
            try:
                json_obj = json.loads(req_body)
                json_str = json.dumps(json_obj, indent=2)
                if json_str.count('\n') < 50:
                    req_body_text = json_str
                else:
                    lines = json_str.splitlines()
                    req_body_text = "%s\n(%d lines)" % (
                        '\n'.join(lines[:50]), len(lines))
            except ValueError:
                req_body_text = req_body
        elif len(req_body) < 1024:
            req_body_text = req_body

        if req_body_text:
            self.logger.trace("==== REQUEST BODY ====\n%s\n" % req_body_text.strip(),
                              color=ProxyLogger.colors_map['white'])

    if res is not None:
        self.logger.trace("\n==== RESPONSE ====\n%s" % res_header_text,
                          color=ProxyLogger.colors_map['cyan'])

        cookies = res.headers.get('Set-Cookie')
        if cookies:
            if type(cookies) == list or type(cookies) == tuple:
                cookies = '\n'.join(cookies)
            self.logger.trace("==== SET-COOKIE ====\n%s\n" % cookies,
                              color=ProxyLogger.colors_map['yellow'])

    if res_body is not None:
        res_body_text = res_body
        content_type = res.headers.get('Content-Type', '')

        if content_type.startswith('application/json'):
            try:
                json_obj = json.loads(res_body)
                json_str = json.dumps(json_obj, indent=2)
                if json_str.count('\n') < 50:
                    res_body_text = json_str
                else:
                    lines = json_str.splitlines()
                    res_body_text = "%s\n(%d lines)" % (
                        '\n'.join(lines[:50]), len(lines))
            except ValueError:
                res_body_text = res_body
        elif content_type.startswith('text/html'):
            if type(res_body) == str:
                res_body = str.encode(res_body)
            m = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>',
                          res_body.decode(errors='ignore'), re.I)
            if m:
                self.logger.trace("==== HTML TITLE ====\n%s\n" % html.unescape(m.group(1)),
                                  color=ProxyLogger.colors_map['cyan'])
        elif content_type.startswith('text/') and len(res_body) < 1024:
            res_body_text = res_body

        if res_body_text:
            res_body_text2 = ''
            maxchars = 4096
            halfmax = int(maxchars / 2)
            try:
                if type(res_body_text) == bytes:
                    dec = res_body_text.decode()
                else:
                    dec = res_body_text
                if dec is not None and len(dec) > maxchars:
                    res_body_text2 = dec[:halfmax] + ' <<< ... >>> ' + dec[-halfmax:]
                else:
                    res_body_text2 = dec
            except UnicodeDecodeError:
                if len(res_body_text) > maxchars:
                    res_body_text2 = hexdump(list(res_body_text[:halfmax]))
                    res_body_text2 += '\n\t................\n'
                    res_body_text2 += hexdump(list(res_body_text[-halfmax:]))
                else:
                    res_body_text2 = hexdump(list(res_body_text))
            self.logger.trace("==== RESPONSE BODY ====\n%s\n" % res_body_text2,
                              color=ProxyLogger.colors_map['green'])
        self.close()

    def handle_starttag(self, tag, attrs):
        if tag.lower() == 'script':
            attrs = dict(attrs)
            if attrs.get('id', '').lower() == 'last-search-results':
                self.json = True

    def handle_data(self, data):
        if self.json is True:
            self.json = data

try:
    from html import unescape
    PasskeyParser.unescape = lambda self, text: unescape(text)
except ImportError:
    pass

def type_day(arg):
    try:
        d = datetime.strptime(arg, '%Y-%m-%d')
    except ValueError:
        raise ArgumentTypeError("%s is not a date in the form YYYY-MM-DD" % arg)
    if not firstDay <= d <= lastDay:
        raise ArgumentTypeError("%s is outside the Gencon housing block window" % arg)
    return arg
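# The try/except above monkey-patches the Python 3.4+ html.unescape() onto the
# parser; the same compatibility-shim pattern in isolation (a sketch, not the
# script's actual fallback path):
try:
    from html import unescape  # Python 3.4+
except ImportError:
    from HTMLParser import HTMLParser  # Python 2 fallback
    unescape = HTMLParser().unescape

print(unescape('Fish &amp; Chips'))  # 'Fish & Chips'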
file_success = False
with open(fname_in, 'r', encoding='utf8') as fin, open(fname_out, 'w') as fout:
    # csv.DictReader consumes the header row itself, so an extra
    # next(reader, None) here would silently drop the first data row.
    reader = csv.DictReader(fin)
    writer = csv.writer(fout)
    try:
        logger.info(
            "File successfully pulled from Acalog remote service. "
            "Generating CSV file of outcomes.")
        for row in reader:
            writer.writerow((
                row["Prefix"] + html.unescape(row["Common Course Identifier"]),
                row["Code"],
                row["Catalog Name"],
                row["Course Outcomes"],
            ))
        file_success = True
    except csv.Error as e:
        logger.exception("Error reading or writing courses file. " + str(e))

# If we successfully generated the file, move it to its remote destination.
# The size threshold is a crude sanity check on the generated file.
if file_success and os.path.getsize(fname_out) > 9000:
    logger.info(
        "Course outcomes file successfully generated, so let's move it "
        "to the remote location.")
    put_file_smb(fname_out)
else:
def loadLocalsFromWebsite(url):
    print('Loading', url, '...')
    with urllib.request.urlopen(url) as response:
        content = response.read()
    d = pq(content)
    entries = []
    categories = d('#content > div.grid-container')
    availableDistricts = {}

    os.makedirs(imagesFolder, exist_ok=True)
    imagesCached = [
        f for f in os.listdir(imagesFolder)
        if os.path.isfile(os.path.join(imagesFolder, f))
    ]

    print('Found', len(categories), 'categories.\nAlready fetched',
          len(imagesCached), 'images.')

    for category in categories:
        elements = d('.cat-items .post.item', category)
        availableSubCategories = {}
        categoryTitle = d('h2:first', category).text()

        for subCategoryItem in d('.options a.option.category', category):
            subCategoryItem = d(subCategoryItem)
            availableSubCategories[subCategoryItem.attr('data-filter-value')] = \
                subCategoryItem.text()

        for tagFilterItem in d('.options a.option.tag', category):
            tagFilterItem = d(tagFilterItem)
            availableDistricts[tagFilterItem.attr('data-filter-value')] = \
                tagFilterItem.text()

        print('Found', len(elements), 'elements in category', categoryTitle,
              '\nSub.categories:\n -',
              '\n - '.join(availableSubCategories.values()), '\n')

        for element in elements:
            classes = set()
            for cssClass in d(element).attr('class').split(' '):
                cssClass = cssClass.strip()
                if cssClass != '':
                    classes.add(cssClass)

            image = d('img', element)
            imagePath = image.attr('src')
            imageFilename = ntpath.basename(imagePath)
            # Fix for multi-char umlaut.
            imageFilename = imageFilename.replace('ö', 'oe')

            if imageFilename not in imagesCached:
                saveImage(imagePath, imageFilename)
                imagesCached.append(imageFilename)

            link = d('a:last', element)

            subCategories = []
            for subCategoryKey in classes & availableSubCategories.keys():
                subCategories.append(availableSubCategories[subCategoryKey])

            districts = []
            for districtKey in classes & availableDistricts.keys():
                districts.append(availableDistricts[districtKey])

            entry = {
                'image': imageFilename,
                'title': link.text(),
                'link': link.attr('href'),
                'sub-categories': sorted(subCategories),
                'districts': sorted(districts),
                'category': categoryTitle,
                'cleanTitle': link.text().split('/ ')[0].strip(),
                'description': html.unescape(d('p', element).text())
            }
            entries.append(entry)

    print('Parsed', len(entries), 'entries in', len(categories),
          'categories\nDistricts:\n -',
          '\n - '.join(availableDistricts.values()))
    return entries