def whiteScrape(webaddress):
    url = ('file:///Users/user/Documents/programming/1-978-251-1362.html')
    ourUrl = opener.open(url).read()
    soup = BeautifulSoup(ourUrl)
    soup = soup.find("div", {"class": "address-card"})
    # below code will delete tags except /br
    soup = str(soup)
    soup = soup.replace('<br/>', '^')
    soup = BeautifulSoup(soup)
    soup = (soup.get_text())
    soup = str(soup)
    soup = soup.replace('^', '<br/>')
    return soup
def cleanUpText(review):
    # this is by no means exhaustive
    punctuation = """.,?!:;(){}[]"""
    # remove html tags
    review_text = BeautifulSoup(review).get_text()
    # replace '\n' with ''
    review_text = review_text.replace('\n', '')
    # treat each punctuation mark as an individual word
    for c in punctuation:
        review_text = review_text.replace(c, " %s " % c)
    return review_text.split()
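# A minimal usage sketch for cleanUpText above (hypothetical input; assumes
# `from bs4 import BeautifulSoup` is in scope). Because each punctuation mark
# is padded with spaces before the final split(), it comes back as its own token:
#
#     cleanUpText("<p>Great movie, really!</p>")
#     # -> ['Great', 'movie', ',', 'really', '!']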
def pages():
    url = "https://kat.cr/usearch/%s%s%s" % (search, category, filetype)
    print "\n The url is: ", url, "\n"
    page = requests.get(url).content
    textblock = BeautifulSoup(page)
    textblock = textblock.find("div").h2
    textblock = textblock.find("span")
    textblock = str(textblock)
    textblock = textblock.replace("<span> results 1-25 from ", "")
    textblock = textblock.replace("</span>", "")
    torrentamount = int(textblock)
    pageamount = (torrentamount / 25) + 1
    return pageamount
def extract_form_declaration(source):
    text = "PORTLET:YAML:"
    if text in source:
        soup = BeautifulSoup(source)
        soup = soup.find(text=re.compile(text))
        soup = soup.replace(text, "").strip()
        return yaml.load(soup)
    text = "PORTLET:HELPER:"
    if text in source:
        soup = BeautifulSoup(source)
        soup = soup.find(text=re.compile(text))
        soup = soup.replace(text, "").strip()
        return get_helper_declaration(soup)
    return {}
def preprocessor(tweet):
    emo_repl_order = const.emo_repl_order
    emo_repl = const.emo_repl
    re_repl = const.re_repl
    tweet = BeautifulSoup(tweet).get_text()
    tweet = tweet.lower()
    for k in emo_repl_order:
        tweet = tweet.replace(k, emo_repl[k])
    tweet = tweet.replace('\'s ', '').replace("-", " ").replace("_", " ") \
                 .replace('"', '').replace(".", '').replace(',', '') \
                 .replace(';', '').strip()
    for r, repl in re_repl.items():
        tweet = re.sub(r, repl, tweet)
    return tweet
def preprocessor(row_review):
    global emoticons_replaced
    data = BeautifulSoup(row_review).get_text()
    data = data.lower()
    for k in count.emo_repl_order:
        data = data.replace(k, count.emo_repl[k])
    for r, repl in count.re_repl.iteritems():
        data = re.sub(r, repl, data)
    data = data.replace('\'s ', '')
    data = re.sub("[^a-z]", " ", data)
    newdata = " ".join(data.split())
    # english_stemmer = nltk.stem.SnowballStemmer('english')
    # newdata = " ".join([english_stemmer.stem(w) for w in data.split()])
    return newdata
def sanitize_detail(detail):
    replacements = [
        ('\t', ''),
        ('<br/>', '\n'),
        ('\n', '<br/>'),
        (':', '\b'),
        ('\b', ':'),
        ('\r', ''),
        ('////', '<br/>')
    ]
    reg_replacements = [
        (r'^:', ''),
        (r']$', ''),
        (r'(<br\/>)*$', ''),
        (r'^(<br\/>)*', ''),
        (r'\s{2,}', ''),
        (r'(<br\/>)*$', '')
    ]
    detail_text = detail['details'].replace('<br/>', '////')
    detail_text = BeautifulSoup(detail_text, "html.parser").text
    detail['title'] = detail['title'].replace(':', '').strip()
    for r in replacements:
        detail_text = detail_text.replace(r[0], r[1]).strip()
    for r in reg_replacements:
        detail_text = re.sub(r[0], r[1], detail_text).strip()
    detail['details'] = detail_text
    return detail
def format(self, article, subscriber):
    """
    Constructs a dictionary that represents the parameters passed to the SMS InsertAlerts stored procedure
    :return: returns the sequence number of the subscriber and the constructed parameter dictionary
    """
    try:
        pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
        odbc_item = {'Sequence': pub_seq_num,
                     'Category': article.get('anpa_category', [{}])[0].get('qcode'),
                     'Headline': article.get('headline', '').replace('\'', '\'\''),
                     'Priority': map_priority(article.get('priority'))}
        body = self.append_body_footer(article)
        if article.get(EMBARGO):
            embargo = '{}{}'.format('Embargo Content. Timestamp: ', article.get(EMBARGO).isoformat())
            body = embargo + body
        if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
            body = BeautifulSoup(body, "html.parser").text
        odbc_item['StoryText'] = body.replace('\'', '\'\'')  # @article_text
        odbc_item['ident'] = '0'
        return [(pub_seq_num, odbc_item)]
    except Exception as ex:
        raise FormatterError.AAPSMSFormatterError(ex, subscriber)
def get_solution(url):
    #url = 'https://community.topcoder.com/stat?c=problem_solution&cr=40440099&rd=16747&pm=14278'
    #url = 'https://community.topcoder.com/stat?c=problem_solution&rm=329103&rd=16775&pm=14340&cr=23089515'
    #url = 'https://community.topcoder.com/stat?c=problem_solution&cr=40364957&rd=16747&pm=14278'
    print url
    #tcsso = 'b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'
    #cookies = dict()
    #cookies['tcsso'] = '40451530|b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'
    #'40451530|b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'
    #cookies['JSESSIONID'] = 'UYKd7Rv1-OY-6bmewBWJDw**.tomcat_tc01'
    # NOTE: cookies must already be defined in the enclosing scope; the
    # assignments above are commented out.
    print cookies
    page = requests.get(url, cookies=cookies)
    #print page
    if str(page) == "<Response [503]>":
        while str(page) == "<Response [503]>":
            time.sleep(1)
            page = requests.get(url, cookies=cookies)
    html_content = page.text
    #print html_content[0:100000]
    #soup = BeautifulSoup(html_content, "html.parser")
    #text = soup.select("body > table > tbody > tr > td.bodyText > table.paddingTable > tbody > tr:nth-child(1) > td > table:nth-child(4) > tbody > tr:nth-child(13) > td")
    body = re.findall('<TD CLASS="problemText" COLSPAN="8" VALIGN="middle" class="alignMiddle" ALIGN="left">\n (.+?)<BR>\n </TD>', html_content, flags=re.S)
    text = body[0]
    text = text.replace("<BR>", "\n")
    #print w
    #print repr(text)
    print text
    failed_to_download = None
    solution = None
    if len(text) == 0:
        failed_to_download = solution_id
    else:
        body = BeautifulSoup(str(text), "html.parser").get_text()
        body = body.replace("\\", "\\\\")
        solution = body.encode('utf-8').decode('string-escape')
        #print repr(solution)
        #print solution
    return solution
def html_prettify(html):
    """Prettify HTML main function."""
    log.info("Prettify HTML...")
    html = BeautifulSoup(html).prettify()
    html = html.replace("\t", " ").strip() + "\n"
    log.info("Finished prettifying HTML!")
    return html
def format(self, article, subscriber, codes=None):
    """
    Constructs a dictionary that represents the parameters passed to the SMS InsertAlerts stored procedure
    :return: returns the sequence number of the subscriber and the constructed parameter dictionary
    """
    try:
        pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
        sms_message = article.get('sms_message', article.get('abstract', '')).replace('\'', '\'\'')
        # category = 1 is used to indicate a test message
        category = '1' if superdesk.app.config.get('TEST_SMS_OUTPUT', True) is True \
            else article.get('anpa_category', [{}])[0].get('qcode').upper()
        odbc_item = {'Sequence': pub_seq_num, 'Category': category,
                     'Headline': BeautifulSoup(sms_message, 'html.parser').text,
                     'Priority': map_priority(article.get('priority'))}
        body = self.append_body_footer(article)
        if article.get(EMBARGO):
            embargo = '{}{}'.format('Embargo Content. Timestamp: ',
                                    get_utc_schedule(article, EMBARGO).isoformat())
            body = embargo + body
        if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
            body = BeautifulSoup(body, "html.parser").text
        odbc_item['StoryText'] = body.replace('\'', '\'\'')  # @article_text
        odbc_item['ident'] = '0'
        return [(pub_seq_num, json.dumps(odbc_item))]
    except Exception as ex:
        raise FormatterError.AAPSMSFormatterError(ex, subscriber)
def debug(self, message):
    '''
    Utility method for debugging. Make sure settings.TEST_DEBUG is defined and set to True.
    When used, self.debug_buffer will contain concatenated debug messages.
    '''
    if (not hasattr(settings, 'TEST_DEBUG')) or (not settings.TEST_DEBUG):
        return
    if not hasattr(self, 'debug_buffer'):
        self.debug_buffer = ''
    try:
        message = BeautifulSoup(message).body.get_text()
    except:
        pass
    while '\n\n' in message:
        message = message.replace('\n\n', '\n')
    self.debug_buffer += (message + '\n------------------------------\n')
def Encode(self, text):
    text = BeautifulSoup(text).get_text()
    try:
        l = re.findall(r'&#(.*?);', text)
        for sub in l:
            try:
                a = '&#' + sub + ';'
                bc = int(sub)
                text = text.replace(a, unichr(bc))
            except:
                pass
    except:
        text = text
    # tag = False
    # quote = False
    # out = ""
    # for c in text:
    #     if c == '<' and not quote:
    #         tag = True
    #     elif c == '>' and not quote:
    #         tag = False
    #     elif (c == '"' or c == "'") and tag:
    #         quote = not quote
    #     elif not tag:
    #         out = out + c
    return text
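# A minimal usage sketch for Encode above (Python 2, since it relies on unichr;
# `enc` is a hypothetical instance of the enclosing class). get_text() already
# unescapes one level of entities, so the manual loop matters for double-escaped
# numeric references that survive it:
#
#     enc.Encode('caf&amp;#233;')
#     # get_text() yields 'caf&#233;', which the loop then decodes to u'caf\xe9'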
def get_solution(contest, solution_id):
    url = 'http://codeforces.com/contest/' + str(contest[0]) + '/submission/' + str(solution_id)
    print url
    page = requests.get(url)
    if str(page) == "<Response [503]>":
        while str(page) == "<Response [503]>":
            time.sleep(1)
            page = requests.get(url)
    html_content = page.text
    #print html_content
    soup = BeautifulSoup(html_content, "html.parser")
    text = soup.select("body > div > div > div > div > pre")
    failed_to_download = None
    solution = None
    if len(text) == 0:
        failed_to_download = solution_id
    else:
        body = BeautifulSoup(str(text[0]), "html.parser").get_text()
        body = body.replace("\\", "\\\\")
        solution = body.encode('utf-8').decode('string-escape')
    return solution_id, solution, failed_to_download
def play_video(video_url=_common.args.url):
    stack_url = 'stack://'
    hbitrate = -1
    sbitrate = int(_addoncompat.get_setting('quality')) * 1024
    closedcaption = None
    video_data = _connection.getURL(video_url)
    video_tree = BeautifulSoup(video_data, 'html.parser')
    video_segments = video_tree.find_all('segment')
    for video_segment in video_segments:
        seg_url = VIDEOINFO % video_segment['id']
        seg_data = _connection.getURL(seg_url)
        seg_menu = BeautifulSoup(seg_data).find_all('file')
        hbitrate = -1
        file_url = None
        for video_index in seg_menu:
            try:
                bitrate = int(video_index['bitrate'])
                type = video_index['type']
                if bitrate > hbitrate and bitrate <= sbitrate:
                    hbitrate = bitrate
                    file_url = video_index.string
                elif bitrate == hbitrate and bitrate <= sbitrate and type == 'hd':
                    file_url = video_index.string
            except:
                pass
        if file_url is None:
            file_url = BeautifulSoup(seg_data).find_all('file', type='hd')[0].string
        stack_url += file_url.replace(',', ',,') + ' , '
    finalurl = stack_url[:-3]
    xbmcplugin.setResolvedUrl(pluginHandle, True, xbmcgui.ListItem(path=finalurl))
def get_solution(solution_id):
    #solutions = []
    #failed_to_download_s = []
    #for i in solution_ids:
    url = "https://www.codechef.com/viewplaintext/" + str(solution_id)
    page = requests.get(url)
    if str(page) == "<Response [503]>":
        while str(page) == "<Response [503]>":
            time.sleep(1)
            page = requests.get(url)
    html_content = page.text
    if html_content == None:
        failed_to_download_s.append(i)
    text = BeautifulSoup(html_content, "html.parser").get_text()
    #'''figure out if escape_lt needs to go here'''
    print len(text)
    #print text
    failed_to_download = None
    solution = None
    #print text
    if len(text) == 0 or re.search('var _sf_startpt = (new Date()).getTime()', text) != None:
        failed_to_download = solution_id
    else:
        text = text.replace("\\", "\\\\")
        solution = text.encode('utf-8').decode('string-escape')
    return solution_id, solution, failed_to_download
async def translate(self, ctx, to_language, *, msg):
    """Translates words from one language to another. Do [p]help translate for more information.
    Usage:
    [p]translate <new language> <words> - Translate words from one language to another. Full language names must be used. The original language will be assumed automatically.
    """
    await ctx.message.delete()
    if to_language == "rot13":  # little easter egg
        embed = discord.Embed(color=discord.Color.blue())
        embed.add_field(name="Original", value=msg, inline=False)
        embed.add_field(name="ROT13", value=codecs.encode(msg, "rot_13"), inline=False)
        return await ctx.send("", embed=embed)
    async with self.bot.session.get("https://gist.githubusercontent.com/astronautlevel2/93a19379bd52b351dbc6eef269efa0bc/raw/18d55123bc85e2ef8f54e09007489ceff9b3ba51/langs.json") as resp:
        lang_codes = await resp.json(content_type='text/plain')
    real_language = False
    to_language = to_language.lower()
    for entry in lang_codes:
        if to_language in lang_codes[entry]["name"].replace(";", "").replace(",", "").lower().split():
            language = lang_codes[entry]["name"].replace(";", "").replace(",", "").split()[0]
            to_language = entry
            real_language = True
    if real_language:
        async with self.bot.session.get("https://translate.google.com/m", params={"hl": to_language, "sl": "auto", "q": msg}) as resp:
            translate = await resp.text()
        result = str(translate).split('class="t0">')[1].split("</div>")[0]
        result = BeautifulSoup(result, "lxml").text
        embed = discord.Embed(color=discord.Color.blue())
        embed.add_field(name="Original", value=msg, inline=False)
        embed.add_field(name=language, value=result.replace("&amp;", "&"), inline=False)
        if result == msg:
            embed.add_field(name="Warning", value="This language may not be supported by Google Translate.")
        await ctx.send("", embed=embed)
    else:
        await ctx.send(self.bot.bot_prefix + "That's not a real language.")
def authorFilter(self, pageSoup):
    author = BeautifulSoup(str(pageSoup.find_all(
        'a', {'class': 'trb_ar_by_nm_au_a'}))).get_text().encode('ascii', errors='ignore')
    if author != '[]':
        # when the author is given with an href
        return author.replace("\n", '')
    else:
        # when the author is given without an href
        authorSoup = BeautifulSoup(str(pageSoup.find_all(
            'span', {'class': 'trb_ar_by_nm_au'}))).get_text().encode('ascii', errors='ignore')
        authorFilter = BeautifulSoup(str(pageSoup.find_all(
            'span', {'itemprop': 'author'}))).get_text().encode('ascii', errors='ignore')
        if authorFilter == '[]':
            return "NULL"  # No Author Present in Article
        else:
            return authorFilter.replace('\n', '')
def list_qualities(BASE, video_url=_common.args.url, media_base=VIDEOURL):
    if media_base not in video_url:
        video_url = media_base + video_url
    bitrates = []
    if 'feed' not in video_url:
        swf_url = _connection.getRedirect(video_url, header={'Referer': BASE})
        params = dict(item.split("=") for item in swf_url.split('?')[1].split("&"))
        uri = urllib.unquote_plus(params['uri'])
        config_url = urllib.unquote_plus(params['CONFIG_URL'])
        config_data = _connection.getURL(config_url, header={'Referer': video_url, 'X-Forwarded-For': '12.13.14.15'})
        feed_url = BeautifulSoup(config_data, 'html.parser', parse_only=SoupStrainer('feed')).feed.string
        feed_url = feed_url.replace('{uri}', uri).replace('&amp;', '&').replace('{device}', DEVICE).replace('{ref}', 'None').strip()
    else:
        feed_url = video_url
    feed_data = _connection.getURL(feed_url)
    video_tree = BeautifulSoup(feed_data, 'html.parser', parse_only=SoupStrainer('media:group'))
    video_segments = video_tree.find_all('media:content')
    srates = []
    for video_segment in video_segments:
        video_url3 = video_segment['url'].replace('{device}', DEVICE)
        video_data3 = _connection.getURL(video_url3, header={'X-Forwarded-For': '12.13.14.15'})
        video_menu = BeautifulSoup(video_data3).findAll('rendition')
        orates = srates
        srates = []
        for video_index in video_menu:
            bitrate = int(video_index['bitrate'])
            srates.append((bitrate, bitrate))
        if orates != []:
            srates = list(set(srates).intersection(orates))
    bitrates = srates
    return bitrates
def OTVVideos(params):
    onlinetv_cookie = HTML('http://onlinetv.kg/Auth/Login', {'UserName': onlinetvkg_login, 'Password': onlinetvkg_password, 'RememberMe': 'true'})
    if not onlinetv_cookie or onlinetv_cookie == 'false':
        Noty('Online TV', 'Ошибка авторизации / Cервер недоступен', '', 5000)
    else:
        onlinetv_token = HTML('http://onlinetv.kg/TV/GetNewTransmissionUID?')
        html = BeautifulSoup(re.sub('\s+', ' ', HTML(params['url']).replace('<br/>', ' ')))
        try:
            page = int(params['page'])
        except:
            page = 0
        current_page = page if page > 0 else 1
        try:
            pages_all = html.find('ul', attrs={'class': 'pages'}).findAll('a')[-1].string
            XBMCItemAdd({'title': Colored('[ Перейти на страницу ]', 'opendialog') + ' ' + str(current_page) + ' из ' + str(pages_all) + ' страниц',
                         'thumb': ImagePath('findpage.png')},
                        {'func': 'OTVSearchPage', 'page': current_page, 'url': params['url']})
            if current_page < int(pages_all):
                current_page = (page + 1) if page > 0 else 2
                url = (params['url'] + '/' + str(current_page)) if page < 1 else params['url'][:(len(str(page)) * -1)] + str(current_page)
                XBMCItemAdd({'title': Colored('[ Следующая страница ]', 'nextpage'), 'thumb': ImagePath('next.png')},
                            {'func': 'OTVVideos', 'page': current_page, 'url': url})
        except:
            pass
        video_list = html.find('div', attrs={'class': 'results'}).findAll('a')
        if len(video_list) > 0:
            for a in video_list:
                url = str(a['href'])
                time = Colored(str(a.div.find('div', {'class': 'time'}).string).decode('utf-8'), 'FF268789').encode('utf-8')
                description = Colored(str(a.div.find('div', {'class': 'description'}).string).decode('utf-8'), 'FF61a061').encode('utf-8').replace('-', '').strip()
                if url.find('GenreGroupTransmissions') >= 0 or url.find('GroupedSearch') >= 0:
                    name1 = str(a.div.find('div', {'class': 'name'}))
                    name1 = BeautifulSoup(name1.replace('<span>', '[COLOR FF00AA00] ').replace('</span>', '[/COLOR] ').strip())
                    name = Colored(name1.div.string.encode('utf-8'), 'bold')
                    XBMCItemAdd({'title': time + ' | ' + name + ' | ' + description},
                                {'func': 'OTVVideos', 'page': current_page, 'url': 'http://onlinetv.kg' + url})
                else:
                    name = Colored(a.div.find('div', {'class': 'name'}).string.title(), 'bold').encode('utf-8')
                    url_re = re.compile('TV/VOD/(.+[0-9])').findall(url)[0]
                    XBMCItemAdd({'title': time + ' | ' + name + ' | ' + description},
                                {'url': 'http://vod.onlinetv.kg/FileUpload/' + onlinetv_token + '/rus/' + url_re + '.ts'}, False)
            XBMCEnd()
        else:
            Noty('Online TV', 'Видео не найдено')
def cleaning(original_text):
    text = BeautifulSoup(original_text, "lxml").get_text()
    # Remove Encodings
    text = re.sub(r'\\\\', r'\\', text)
    text = re.sub(r'\\x\w{2,2}', ' ', text)
    text = re.sub(r'\\u\w{4,4}', ' ', text)
    text = re.sub(r'\\n', '.', text)
    # Whitespace Formatting
    text = text.replace('"', ' ')
    text = text.replace('\\', ' ')
    text = text.replace('_', ' ')
    text = text.replace('-', ' ')
    text = re.sub(' +', ' ', text)
    # Remove Unicode characters
    text = codecs.decode(text, 'unicode-escape')
    text = ''.join([i if ord(i) < 128 else '' for i in text])
    # Remove email addresses
    text = re.sub(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}', ' ', text)
    # Remove Twitter usernames
    text = re.sub(r"(\A|\s)@(\w+)+[a-zA-Z0-9_\.]", ' ', text)
    # Remove urls
    text = re.sub(r'\w+:\/\/\S+', ' ', text)
    # Word standardizing (e.g. "Looooolll" should become "Looll")
    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
    # Convert words to lower case
    text = text.lower().split()
    # Remove contractions by expansion of words
    text = [contractions[word] if word in contractions else word for word in text]
    # Rejoin words
    text = " ".join(text)
    # Remove non-alphabetic characters
    text = re.sub("[^a-z\s]", " ", text)
    return " ".join(text.split())
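# A standalone sketch of the "word standardizing" step used in cleaning()
# above: itertools.groupby yields runs of identical characters, and keeping at
# most two per run squeezes "Looooolll" down to "Looll".
import itertools

def squeeze_repeats(text):
    # keep the first two characters of every run of repeated characters
    return ''.join(''.join(run)[:2] for _, run in itertools.groupby(text))

# squeeze_repeats("Looooolll")  ->  "Looll"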
def __iter__(self):
    for path, dirs, files in os.walk(self.dirname):
        for d in dirs:
            dir_path = os.path.join(self.dirname, d)
            for fname in os.listdir(dir_path):
                for line in open(os.path.join(dir_path, fname)):
                    # 1. Remove HTML
                    line = BeautifulSoup(line).get_text()
                    # 2. Remove non-letters
                    line = re.sub("[^a-zA-Z]", " ", line)
                    # 3. Remove numbers from text (str.replace returns a new
                    # string, so the result must be assigned back)
                    for i in range(10):
                        line = line.replace(str(i), '')
                    # 4. Convert words to lower case and split them
                    words = line.lower().split()
                    yield words
def format(self, article, output_channel, selector_codes):
    """
    Constructs a dictionary that represents the parameters passed to the IPNews InsertNews stored procedure
    :param article:
    :param output_channel:
    :param selector_codes:
    :return: returns the sequence number of the output channel and the constructed parameter dictionary
    """
    try:
        pub_seq_num = superdesk.get_resource_service('output_channels').generate_sequence_number(output_channel)
        odbc_item = {}
        odbc_item['originator'] = article.get('originator', None)
        odbc_item['sequence'] = pub_seq_num
        odbc_item['category'] = article.get('anpa-category', {}).get('qcode')  # @category
        odbc_item['headline'] = article.get('headline', '')  # @headline
        odbc_item['author'] = article.get('byline', '')  # @author
        odbc_item['keyword'] = article.get('slugline', None)
        if article['subject'][0]:
            odbc_item['subject_reference'] = article['subject'][0].get('qcode', None)
        if odbc_item['subject_reference']:
            odbc_item['subject'] = subject_codes[odbc_item['subject_reference'][:2] + '000000']
            odbc_item['subject_matter'] = subject_codes[odbc_item['subject_reference'][:5] + '000']
            odbc_item['subject_detail'] = subject_codes[odbc_item['subject_reference']]
        odbc_item['take_key'] = article.get('anpa_take_key', None)  # @take_key
        odbc_item['usn'] = article.get('unique_id', None)  # @usn
        if article['type'] == 'preformatted':
            odbc_item['article_text'] = article.get('body_html', '').replace('\'', '\'\'')  # @article_text
        elif article['type'] == 'text':
            text = BeautifulSoup(article.get('body_html', '')).text
            text = text.replace('\'', '\'\'')
            odbc_item['article_text'] = text
        if 'genre' in article:
            odbc_item['genre'] = article['genre'][0].get('name', None)
        else:
            odbc_item['genre'] = 'Current'  # @genre
        if article.get('type', 'text') == 'text':
            odbc_item['texttab'] = 'x'
        elif article.get('type', None) == 'preformatted':
            odbc_item['texttab'] = 't'
        odbc_item['wordcount'] = article.get('word_count', None)  # @wordcount
        odbc_item['news_item_type'] = 'News'
        odbc_item['priority'] = article.get('priority', None)  # @priority
        odbc_item['service_level'] = 'a'  # @service_level
        sel_codes = selector_codes[output_channel['_id']]
        odbc_item['selector_codes'] = ' '.join(sel_codes)
        odbc_item['fullStory'] = 1
        odbc_item['ident'] = '0'  # @ident
        return pub_seq_num, odbc_item
    except Exception as ex:
        raise FormatterError.AAPIpNewsFormatterError(ex, output_channel)
def page2text(pagename):
    page = wekeypedia.WikipediaPage(pagename)
    content = page.get_revision()
    txt = BeautifulSoup(content, "html.parser")
    txt = txt.get_text()
    txt = txt.replace("[edit]", "")
    return txt
def txt2words(self, txt, remove_stopwords=True):
    txt = BeautifulSoup(txt).get_text()
    txt = ftfy.fix_text(txt)
    txt = txt.replace("\\n", '')
    txt = re.sub("[^0-9a-zA-Z]", " ", txt)
    if remove_stopwords:
        words = [self.save_stem(w) for w in txt.lower().split()
                 if (w not in self.stopwords) & (len(w) > 2) & (not w.isdigit())]
    else:
        words = [self.save_stem(w) for w in txt.lower().split()
                 if (len(w) > 2) & (not w.isdigit())]
    return words
class TVShow:
    def __init__(self, tvshow):
        self.tvshow = tvshow
        self.series = 0
        self.season = 0
        self.episode = 0
        self.title = ""

    def __str__(self):
        return "{0} - {1}x{2} - {3}".format(self.tvshow,
                                            self.season.zfill(SEASON_PADDED_ZEROS),
                                            self.episode.zfill(EPISODE_PADDED_ZEROS),
                                            self.title)

    def process(self):
        release = re.findall("S?(\d+)x?E?(\d+)", self.tvshow, flags=2)
        if len(release) > 0:
            self.season = release[0][0].lstrip("0") if len(re.findall("^0+$", release[0][0])) == 0 else "0"
            self.episode = release[0][1].lstrip("0") if len(re.findall("^0+$", release[0][1])) == 0 else "0"
        # Replace any fluff from the file name
        self.tvshow = re.sub(FILTER, " ", self.tvshow, flags=2).strip()

    def fetch(self):
        r = requests.get("http://thetvdb.com/api/GetSeries.php?seriesname={0}&language={1}".format(self.tvshow, LANGUAGE))
        soup = BeautifulSoup(r.content)
        self.series = soup.find("seriesid").text
        self.tvshow = soup.find("seriesname").text

    def get_episode(self):
        r = requests.get("http://thetvdb.com/api/{0}/series/{1}/default/{2}/{3}/{4}.xml"
                         .format(API_KEY, self.series, str(self.season), str(self.episode), LANGUAGE))
        if r.status_code == 404:
            print("Error! 404: Not found")
        else:
            self.title = BeautifulSoup(r.content).find("episodename").text

    def replace_illegal_characters(self):
        for illegal_character in ILLEGAL_CHARACTERS:
            self.title = self.title.replace(illegal_character, REPLACE_CHAR)
        # Replace the ellipsis with three periods to prevent UnicodeError
        self.title = self.title.replace("…", "...")
def _get_model(self, dbms):
    print(dbms.extract())
    model = dbms.css('tr th:nth-child(5)')
    link = dbms.css('tr th:nth-child(5) a')
    span = dbms.css('tr th:nth-child(5) span span')
    if span:
        html = span.extract()[0]
    elif link:
        html = link.extract()[0]
    else:
        html = model.extract()[0]
    models = BeautifulSoup(html, 'html.parser').text
    return models.replace(',', '|')
def img_save(self, page_url):
    img_response = self.hit(page_url)
    img_url = BeautifulSoup(img_response.text, 'lxml').find('div', class_='main-image').find('img')['src']
    name = img_url.replace('/', '_')
    try:
        img = self.hit(img_url)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()
        print ">>>>>>>>>>>>>>>> write succeeded~"
    except:
        print 'could not write image data: ' + img_url
        return False
def _render_levelings(self, html: BeautifulSoup, nvalues: int) -> List[Leveling]:
    # Do some pre-processing on the html
    if not isinstance(html, str):
        html = str(html)
    html = html.replace("</dt>", "\n</dt>")
    html = html.replace("</dd>", "\n</dd>")
    html = BeautifulSoup(html, "lxml")
    html = html.text.strip()
    while "\n\n" in html:
        html = html.replace("\n\n", "\n")
    while "  " in html:
        html = html.replace("  ", " ")
    levelings = html.replace("\xa0", " ")
    # Get ready
    results = []
    # Let's parse!
    initial_split = levelings.split("\n")
    initial_split = [
        lvling.strip()
        for lvling in initial_split
        if lvling.strip() not in (
            "Takedown scales with Aspect of the Cougar's rank",
            "Swipe scales with Aspect of the Cougar's rank",
            "Pounce scales with Aspect of the Cougar's rank",
            "Cougar form's abilities rank up when Aspect of the Cougar does",
        )
    ]
    initial_split = list(grouper(initial_split, 2))
    for attribute, data in initial_split:
        if attribute.endswith(":"):
            attribute = attribute[:-1]
        result = self._render_leveling(attribute, data, nvalues)
        results.append(result)
    return results
def clean_text(string, remove_spaces=False):
    matches = ["\n", "<br>"]
    for m in matches:
        string = string.replace(m, " ").strip()
    string = ' '.join(string.split())
    string = BeautifulSoup(string, "lxml").get_text()
    SAFE_PTN = r"[|\^&+\-%*/=!>]"
    string = re.sub(SAFE_PTN, ' ', string.strip()).strip()
    if remove_spaces:
        string = string.replace(' ', '_')
    return string
def format(self, comment):
    found_difficult = ""
    # digits = re.findall("\d+\.", comment)
    # for digit in set(digits):
    #     comment = comment.replace(digit, "<b>" + digit + " </b>")
    all_a_links = re.findall("(<a href.*?>(.*?)</a>)", comment)
    for a_link_and_text in all_a_links:
        a_link, text = a_link_and_text
        comment = comment.replace(a_link, text)
    if "pages/images/hard.gif" in comment:
        found_difficult += "*"
    if "pages/images/harder.gif" in comment:
        found_difficult += "*"
    # we need to specifically keep these tags because the "text" property will
    # remove them, so we "hide" them with nonsense characters
    tags_to_keep = ["u", "b"]
    comment = comment.replace("<u>", "$!u$").replace("</u>", "$/!u$")
    comment = comment.replace("<b>", "$!b$").replace("</b>", "$/!b$")
    text = BeautifulSoup(comment, "lxml").text
    text = text.strip()
    while "  " in text:
        text = text.replace("  ", " ")
    # the following code makes sure "3.\nhello" becomes "3. hello"
    digit = re.match(u"^.{1,2}[\)|\.]", text)
    if digit:
        text = text.replace(digit.group(0), u"")
        text = text.strip()
        text = digit.group(0) + u" " + text
    # now get the tags back and remove the nonsense chars
    text = text.replace("$!u$", "<u>").replace("$/!u$", "</u>")
    text = text.replace("$!b$", "<b>").replace("$/!b$", "</b>")
    text = text.replace("\n", "<br/>")
    return (found_difficult + text).strip()
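# A standalone sketch of the tag-hiding trick used in format() above
# (hypothetical helper; assumes bs4 and lxml are installed): <u>/<b> are swapped
# for sentinel strings that BeautifulSoup's .text cannot strip, then restored.
from bs4 import BeautifulSoup

def strip_tags_keeping_u(comment):
    comment = comment.replace("<u>", "$!u$").replace("</u>", "$/!u$")
    text = BeautifulSoup(comment, "lxml").text  # drops every real tag
    return text.replace("$!u$", "<u>").replace("$/!u$", "</u>")

# strip_tags_keeping_u("<i>a <u>key</u> point</i>")  ->  "a <u>key</u> point"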
def clean_text(string, remove_spaces=False):
    matches = ["\n", "<br>"]
    for m in matches:
        string = string.replace(m, " ").strip()
    string = ' '.join(string.split())
    string = BeautifulSoup(string, 'lxml').get_text()
    SAFE_PTN = "[^0-9a-zA-Z-_.'()]+"
    string = re.sub(SAFE_PTN, ' ', string.strip()).strip()
    if remove_spaces:
        string = string.replace(' ', '_')
    return string
def cleanHTMLtext(self, raw_html):
    """ Function to clean the Description Col in Indeed Dataset """
    if type(raw_html) == str:
        cleantext = BeautifulSoup(raw_html).get_text(" ")  # BeautifulSoup(raw_html, "html.parser").text
        cleantext = cleantext.replace('\r', ' ').replace('\n', ' ')[1:-1]
        # re.sub returns a new string, so assign the result back
        cleantext = re.sub('\W+', ' ', cleantext)
        cleantext = re.sub(',', ' ', cleantext)
        return cleantext
    else:
        return None
def internet_search(indebug, inquery, inkey):
    debug = str(indebug)
    query = str(inquery)
    key = str(inkey)
    try:
        # create credential for authentication
        user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; FDM; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 1.1.4322)'
        creds = (':%s' % key).encode('utf-8')[:-1]
        auth = 'Basic %s' % creds
        # Search query for Bing, to obtain definitions
        url = 'http://www.bing.com/search?q=define+"%s"' % query
        url = str(url)
        request = urllib.request.Request(url)
        request.add_header('Authorization', auth)
        request.add_header('User-Agent', user_agent)
        requestor = urllib.request.build_opener()
        result = requestor.open(request)
        soup = BeautifulSoup(result, 'lxml')
        out = soup.findAll('ol', {'class': 'b_dList'})  # Obtains the information of the 'ol' tag whose class name is 'b_dList'
        out = str(out)  # Converting class type to string type
        if (out == '[]'):
            if (debug == '--debug'):
                print(("No results found for '%s' using Bing Search" % (query)))
            return 0
        else:
            out = BeautifulSoup(out, 'lxml')
            out = out.findAll('li')
            out = str(out)
            out = str((re.compile(r'<li>(.*)</li>').search(out)).group())  # Extracting <li>..</li> tag information
            out = out.replace('>, <', '>\n<')  # Breaks the list items / definitions into one per line
            out = out.split('\n')  # outputs a list
            definitions = []  # A list to store the definitions
            for i in out:
                i = BeautifulSoup(i, 'lxml')
                i = i.text  # Extracts text only, leaving out the html tags
                definitions.append(i)  # Appends definitions to the list
            if (debug == '--debug'):
                # print "------------------------------------------------------"
                print(("Definitions of '%s' from Internet search:" % (query)))
                for x in range(0, len(definitions)):
                    print(('%d : %s' % (x + 1, definitions[x])))
                print("------------------------------------------------------")
            return definitions
    except:
        print("Exception raised!! in obtaining definitions using Bing Search API")
        return 0
def pdf_to_html(fonts, jumpiness, word_rotation, width_shift, height_shift, rotace):
    # change all the html files
    for filee in os.listdir("data\\converted\\pdf"):
        filee_converted = "data\\converted\\pdf\\" + filee
        filee_dest = "data\\done\\" + filee
        # find text and change it
        with open(filee_converted, "r", encoding="utf-8") as f:
            result = f.read()
        whole_file = BeautifulSoup(result, "html.parser")
        schulubung = whole_file.find("div", attrs={"id": "page-container"})
        # find the pages
        for data in schulubung.children:
            # page number
            if (data.name == "div"):
                # find only the page
                this_page = False
                for page in data.children:
                    if (page.name == "div"):
                        # only the right text
                        # loop over the divs
                        for bad_div in page.children:
                            for divs in bad_div.children:
                                if (divs.name == "div"):
                                    divs["style"] = "margin:0px 0px {1}px {0}px;transform:rotate({2}deg);".format(randrange(width_shift[0], width_shift[1]), randrange(height_shift[0], height_shift[1]), randrange(rotace[0], rotace[1]))
                                    # loop through the div and randomize the font
                                    line = divs.decode_contents()
                                    res = ""
                                    i = 0
                                    while i < len(line):
                                        if (line[i:i + 1] == " "):
                                            res += line[i:i + 1]
                                        elif (unidecode(line[i:i + 1]) == unidecode("")):
                                            # characters that unidecode maps to nothing become spaces
                                            res += " "
                                        elif (line[i:i + 5] == "<span" or line[i:i + 6] == "</span"):
                                            while line[i:i + 1] != ">":
                                                res += line[i:i + 1]
                                                i += 1
                                            res += ">"
                                        else:
                                            word = ["<span style='margin-top:10px;font-family:{0};color:#000F55;position:relative;top:{1}px;font-size:40%;transform:skewY({2}deg)'>".format(choice(fonts), randrange(jumpiness[0], jumpiness[1]), randrange(word_rotation[0], word_rotation[1])), "</span>"]
                                            res += word[0] + line[i:i + 1] + word[1]
                                        i += 1
                                    divs.string = res
                        this_page = not this_page
        # write new file
        with open(filee_dest, "w", encoding="utf-8") as f:
            # divs.string escapes the injected spans, so unescape them on the way out
            whole_file = str(whole_file).replace("&lt;", "<")
            whole_file = whole_file.replace("&gt;", ">")
            f.write(str(whole_file))
        print("done")
def __populate_db_contexts_for_opinion(
    self,
    session: Session,
    opinion: Opinion,
    reporter_resource_dict: dict,
    context_slice=slice(-128, 128),
) -> None:
    unstructured_html = opinion.html_text
    if not unstructured_html:
        raise ValueError(f"No HTML for case {opinion.resource_id}")
    unstructured_text = BeautifulSoup(unstructured_html, features="lxml").text
    clean_text = unstructured_text.replace("U. S.", "U.S.")
    tokenizer = OneTimeTokenizer(self.eyecite_tokenizer)
    citations = list(eyecite.get_citations(clean_text, tokenizer=tokenizer))
    cited_resources = eyecite.resolve_citations(citations)
    for resource, citation_list in cited_resources.items():
        cited_opinion_res_id = reporter_resource_dict.get(
            format_reporter(
                resource.citation.groups.get("volume"),
                resource.citation.groups.get("reporter"),
                resource.citation.groups.get("page"),
            ))
        if cited_opinion_res_id is None:
            continue
        for citation in citation_list:
            if not isinstance(citation, CaseCitation):
                continue
            if (citation.metadata.parenthetical is not None
                    and ParentheticalProcessor.is_descriptive(
                        citation.metadata.parenthetical)):
                session.add(
                    OpinionParenthetical(
                        citing_opinion_id=opinion.resource_id,
                        cited_opinion_id=cited_opinion_res_id,
                        text=ParentheticalProcessor.prepare_text(
                            citation.metadata.parenthetical),
                    ))
            start = max(0, citation.index + context_slice.start)
            stop = min(len(tokenizer.words), citation.index + context_slice.stop)
            session.add(
                CitationContext(
                    citing_opinion_id=opinion.resource_id,
                    cited_opinion_id=cited_opinion_res_id,
                    text=" ".join([
                        s for s in tokenizer.words[start:stop]
                        if isinstance(s, str)
                    ]),
                ))
def tweet_cleaning_for_sentiment_analysis(tweet):
    # Escaping HTML characters
    tweet = BeautifulSoup(tweet).get_text()
    # Special case not handled previously.
    tweet = tweet.replace('\x92', "'")
    # Removal of hashtags/account mentions
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", tweet).split())
    # Removal of addresses (URLs)
    tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
    # Removal of punctuation
    tweet = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", tweet).split())
    # Lower case
    tweet = tweet.lower()
    # CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    CONTRACTIONS = load_dict_contractions()
    tweet = tweet.replace("’", "'")
    words = tweet.split()
    reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words]
    tweet = " ".join(reformed)
    # Standardizing words
    tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))
    # Deal with smileys
    # source: https://en.wikipedia.org/wiki/List_of_emoticons
    SMILEY = load_dict_smileys()
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)
    # Deal with emojis
    tweet = emoji.demojize(tweet)
    # Strip accents
    tweet = strip_accents(tweet)
    tweet = tweet.replace(":", " ")
    tweet = ' '.join(tweet.split())
    # DO NOT REMOVE STOP WORDS FOR SENTIMENT ANALYSIS - OR AT LEAST NOT NEGATIVE ONES
    return tweet
def get_fifa_data():
    url = "https://www.fifaindex.com/teams/fifa20_358/?league=13&order=desc"
    r = requests.get(url)
    # check that the site sends a 200 status code, verifying it can be web-scraped.
    print(r.status_code)
    # break down site html and siphon out team data.
    soup = BeautifulSoup(r.text, "html.parser")
    soup = soup.findAll('tbody')
    soup = soup[0].text
    soup = soup.replace('\n', '')
    # remove the Premier League string from the results
    delete_league = "Premier League"
    soup = soup.replace(delete_league, " ")
    # separate the names and numbers from the data
    list_of_names = re.findall('\D+', soup)
    list_of_nums = re.findall("\d{8}", soup)
    # dictionary to hold our new data
    data = {"Squads": []}
    # organize names and averages into the list of data
    i = 0
    for n in list_of_names:
        _att = list_of_nums[i][:2]
        _mid = list_of_nums[i][2:4]
        _def = list_of_nums[i][4:6]
        _ovr = list_of_nums[i][6:8]
        data["Squads"].append({"Squad": n, "ATT": _att, "MID": _mid, "DEF": _def, "OVR": _ovr})
        i += 1
    # normalize our team data into a json dataframe so we can make an easy-to-read csv file
    df = pd.json_normalize(data["Squads"])
    path = "../Data/team_data_fifa.csv"
    list_of_files.append(path)
    df.to_csv(path, index=False)
    # avoid being flagged as a spammer by the site
    time.sleep(1)
    return df
def rtn_chapter_txt(chapterHtml):
    soup = BeautifulSoup(chapterHtml, 'html.parser')
    try:
        txtContent = soup.find_all(name="div", attrs={"class": SUB_DOWN_FLAG})[0]
        txtContent = str(txtContent).replace('<br/>', "\n")
        txtContent = BeautifulSoup(txtContent, 'html.parser')
        txtContent = txtContent.find_all(name="div", attrs={"id": "ChapterContents"})[0].text
        # time.sleep(5000)
    except:
        time.sleep(2)
        print(chapterHtml)
    # txtContent = txtContent.split("最新章节!")[1]
    txtContent = txtContent.strip()
    txtContent = txtContent.replace(' ', "")
    # txtContent = txtContent.replace("一秒记住【顶点小说网 www.23wx.so】,精彩小说无弹窗免费阅读!", "")
    # txtContent = txtContent.replace(" ", "")
    # txtContent = txtContent.replace(" ", "")
    # txtContent = txtContent.replace(" ", "")
    # txtContent = txtContent.replace(" ", "")
    # txtContent = txtContent.replace("~", "")
    # txtContent = txtContent.replace("\r\n", "")
    # txtContent = txtContent.replace("\n\n", "")
    txtContent = txtContent.replace('\xa0', '')
    # txtContent = txtContent.replace('\u016f', '')
    # txtContent = txtContent.replace('\u027c', '')
    # txtContent = txtContent.replace('\u025b', '')
    # txtContent = txtContent.replace('\u0c4a', '')
    # txtContent = txtContent.replace('\u0154', '')
    # txtContent = txtContent.replace('\u0189', '')
    return txtContent
def clean_texts(x):
    if x:
        x = strip_non_ascii(x)
        x = BeautifulSoup(x, "lxml")
        x = x.get_text()
        x = x.replace('\n', ' ').replace('\r', '').replace('\t', ' ')
        # remove between-word dashes
        x = x.replace('- ', ' ').replace(' -', ' ').replace('-', ' ')
        # replace parentheses
        x = x.replace("(", "").replace(")", "").replace("[", "").replace("]", "")
        # remove punctuation but keep commas, semicolons, periods, exclamation marks, question marks, intra-word dashes and apostrophes (e.g., "I'd like")
        # NOTE: str.replace treats the first pattern below as a literal string
        # (it is an R/POSIX-style character class), so that call is a no-op.
        x = x.replace(r"[^[:alnum:][:space:].'-:]", " ").replace('+', ' ').replace('*', ' ').replace(
            "' ", "").replace(" '", "").replace("'", "").replace(
            ",", " ").replace(";", " ").replace(":", " ")
        # remove numbers (integers and floats)
        x = re.sub('\d+', '', x)
        # remove extra white space, trim and lower
        x = re.sub('\\s+', ' ', x).strip()
        return x
    else:
        return ""
def d2rucrawl(url):
    pages = 1
    activity = []
    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml')
    type(soup)
    rows = soup.find_all('div', class_='dropdown')
    str_cells = str(rows)
    username = BeautifulSoup(str_cells, "lxml").get_text()
    rows2 = soup.find_all('span', id="user-posts-count")
    str_cells2 = str(rows2)
    messagesn = BeautifulSoup(str_cells2, "lxml").get_text()
    rows3 = soup.find_all('span', class_="points")
    str_cells3 = str(rows3)
    likes = BeautifulSoup(str_cells3, "lxml").get_text()
    img = soup.find_all('img', class_='my')
    img_a = (str(img).split('/')[1:7])
    img_b = [i + '/' for i in img_a]
    img_c = [''.join(img_b)]
    while pages < 11 and pages != 0:  # block that tallies the number of posts
        html = urlopen(url + 'activity/page-' + str(pages))
        soup = BeautifulSoup(html, 'lxml')
        type(soup)
        rows4 = soup.find_all('div', class_='text-medium')
        str_cells4 = str(rows4)
        activ = BeautifulSoup(str_cells4, "lxml").get_text()
        activity.append(activ)
        pages += 1
    data = activity
    text_string2 = str(data).lower()
    match_pattern2 = re.findall(r'\b[а-я]{3,15}\b', text_string2)
    frequency2 = {}
    for word in match_pattern2:
        count = frequency2.get(word, 0)
        frequency2[word] = count + 1
    frequency_list2 = frequency2.keys()
    activity_mess = []
    for words in frequency_list2:
        if frequency2[words] > 5:
            activity_mess.append(str(words) + ': ' + str(frequency2[words]))
    activity_end = ', '.join(activity_mess)
    return 'Никнейм: ' + username.replace(' ', '')[2:-1] + \
        ', Сообщения: ' + messagesn[1:-1] + \
        ', Симпатии: ' + likes[1:-1] + \
        ' Часто используемые слова - за последние 100 сообщений ' + activity_end + \
        ' Аватар: ' + 'https://dota2.ru/' + str(img_c)[2:-4]
def GetQuote():
    url = 'http://quotesondesign.com/wp-json/posts?filter[orderby]=rand&filter[posts_per_page]=1'
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    connection = urllib.request.urlopen(request)
    response = connection.read()
    data = json.loads(response)
    quote = BeautifulSoup(data[0]["content"], 'html.parser').get_text()
    quote = quote.replace('\n', '')
    quote = quote.strip()
    author = BeautifulSoup(data[0]["title"], 'html.parser').get_text()
    author = author.replace('\n', '')
    author = author.strip()
    final_text = "{} ~{}".format(quote, author)
    return final_text
def processTweet(self, tweet):
    # Cleansing and tokenizing tweet
    tweet = BeautifulSoup(tweet).get_text()  # Extracts text from HTML (just in case!)
    tweet = tweet.lower()  # Converts text to lower-case
    tweet = re.sub("((www\.[^\s]+)|(https?://[^\s]+))", "URL", tweet)  # Replaces URLs with the URL constant
    tweet = re.sub("@[^\s]+", "USERTAGGING", tweet)  # Replaces usernames with the USERTAGGING constant
    tweet = re.sub(r"#([^\s]+)", r"\1", tweet)  # Removes the # in #hashtag
    for p in punctuation:
        tweet = tweet.replace(p, "")  # Removes punctuation
    tweet = word_tokenize(tweet)  # Creates a list of words
    return [word for word in tweet if word not in self._stopwords]
def about_love(message):
    n = randint(13564, 13687)
    r = requests.get(f'http://ruspoeti.ru/aut/tushnova/{n}/')
    soup = BeautifulSoup(r.text, 'lxml')
    soup = str(soup)
    start = soup.find('class="pcont"')
    finish = soup.find('class="pfoot"')
    soup = soup[start:finish]
    st = 'Тушнова</em><br/><br/>'
    start = soup.find('Тушнова</em><br/><br/>')
    finish = soup.find('</p><div')
    soup = soup[start + len(st):finish]
    soup = soup.replace('<br/>', '')
    bot.send_message(message.chat.id, soup)
def scrape(episode_id):
    response = requests.get('https://open.spotify.com/embed-podcast/episode/{}'.format(episode_id))
    page_data = BeautifulSoup(response.text, 'html.parser').find_all('script')[-1].string
    data_json = page_data.replace('window.__PRELOADED_STATE__ = ', '')
    data = json.loads(data_json)
    result = {
        'filename': data['data']['name'],
        'url': data['data']['unencryptedAudioUrl']
    }
    return result
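# A minimal usage sketch for scrape above (hypothetical episode id; assumes the
# embed page still inlines window.__PRELOADED_STATE__ in its last <script> tag):
#
#     info = scrape('0123456789abcdefghijkl')
#     # info['filename'] -> episode title, info['url'] -> direct audio URL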
def post_require(self):
    """
    Crawl the job-description text for each company posting
    """
    for c in self.company:
        r = requests.get(c.get('href'), headers=self.headers).content.decode('gbk')
        bs = BeautifulSoup(r, 'lxml').find('div', class_="bmsg job_msg inbox").text
        s = bs.replace("举报", "").replace("分享", "").replace("\t", "").strip()
        self.text += s
    # print(self.text)
    with open(os.path.join("data", "post_require.txt"), "w+", encoding="utf-8") as f:
        f.write(self.text)
def clean_texts(tweet):
    if tweet:
        tweet = strip_non_ascii(tweet)
        tweet = BeautifulSoup(tweet, "lxml")
        tweet = tweet.get_text()
        tweet = tweet.replace('\n', '').replace('\r', ' ').replace('\t', ' ')
        tweet = tweet.replace('!', '')  # remove !
        tweet = tweet.replace('"', '')  # remove "
        tweet = re.sub(r'^http?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
        tweet = re.sub(r"http\S+", "", tweet, flags=re.MULTILINE)
        # remove between-word dashes
        tweet = tweet.replace('- ', '').replace(' -', '').replace('-', '')
        # replace parentheses
        tweet = tweet.replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace("RT", "")
        # remove punctuation but keep commas, semicolons, periods, exclamation marks, question marks, intra-word dashes and apostrophes (e.g., "I'd like")
        # NOTE: str.replace treats the first pattern below as a literal string
        # (it is an R/POSIX-style character class), so that call is a no-op.
        tweet = tweet.replace(r"[^[:alnum:][:space:].'-:]", "").replace('+', '').replace('*', '').replace(
            "' ", "").replace(" '", "").replace("'", "").replace(
            ",", " ").replace(";", " ").replace(":", " ").replace(".", " ")
        # remove numbers (integers and floats)
        tweet = re.sub('\d+', '', tweet)
        # remove extra white space, trim and lower
        tweet = re.sub('\\s+', ' ', tweet).strip()
        return tweet
    else:
        return ""
def parse(self, project, line_list):
    string = utils.make_string(line_list)
    soup = BeautifulSoup(string, 'html.parser')
    soup = soup.find('body')
    soup = soup.find('main')
    soup = soup.find('article')
    soup = str(soup)
    string = soup.replace('\n', '!@#$')
    string = self._string_arrange(string)
    string = string.split('!@#$')
    string = utils.make_string(string, conj='\n')
    return string
async def translate(self, ctx, to_language, *, msg):
    """Translates words from one language to another. Do [p]help translate for more information.
    Usage:
    [p]translate <new language> <words> - Translate words from one language to another. Full language names must be used. The original language will be assumed automatically.
    """
    await bot.message.delete()
    if to_language == "rot13":  # little easter egg
        embed = discord.Embed(color=discord.Color.blue())
        embed.add_field(name="Original", value=msg, inline=False)
        embed.add_field(name="ROT13", value=codecs.encode(msg, "rot_13"), inline=False)
        return await bot.send("", embed=embed)
    async with self.bot.session.get(
            "https://gist.githubusercontent.com/astronautlevel2/93a19379bd52b351dbc6eef269efa0bc/raw/18d55123bc85e2ef8f54e09007489ceff9b3ba51/langs.json"
    ) as resp:
        lang_codes = await resp.json(content_type='text/plain')
    real_language = False
    to_language = to_language.lower()
    for entry in lang_codes:
        if to_language in lang_codes[entry]["name"].replace(";", "").replace(",", "").lower().split():
            language = lang_codes[entry]["name"].replace(";", "").replace(",", "").split()[0]
            to_language = entry
            real_language = True
    if real_language:
        async with self.bot.session.get("https://translate.google.com/m",
                                        params={
                                            "hl": to_language,
                                            "sl": "auto",
                                            "q": msg
                                        }) as resp:
            translate = await resp.text()
        result = str(translate).split('class="t0">')[1].split("</div>")[0]
        result = BeautifulSoup(result, "lxml").text
        embed = discord.Embed(color=discord.Color.blue())
        embed.add_field(name="Original", value=msg, inline=False)
        embed.add_field(name=language, value=result.replace("&amp;", "&"), inline=False)
        if result == msg:
            embed.add_field(
                name="Warning",
                value="This language may not be supported by Google Translate.")
        await bot.send("", embed=embed)
    else:
        await bot.send(self.bot.bot_prefix + "That's not a real language.")
def detweet(text):
    # remove html
    text = text.replace("#", "")
    text = text.replace('\u2044', ' or ')
    text = BeautifulSoup(text, features="html.parser").get_text()
    text = text.replace('\x92', "'")
    # remove hashtags (but just the hash marks; hashtag words can be quite informative about sentiment)
    # remove mentions
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)", "", text).split(" "))
    # remove web addresses
    text = ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())
    return text
def cleanText(text):
    stemmer = WordNetLemmatizer()
    en_stop = set(nltk.corpus.stopwords.words('english'))
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    text = re.sub(r'^b\s+', '', text)
    text = text.lower()
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    text = text.replace(',', ' ')
    text = re.sub('\n', ' ', text)
    text = re.sub('[n|N]o\.', 'number', text)
    tokens = text.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 3]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
def preprocessing_english(x):
    x = BeautifulSoup(x)
    x = EmailReplyParser.parse_reply(x.get_text())
    x = re.sub(r'<.*?>', '', x)
    x = x.replace("\n", " ").strip()
    x = re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', repl='', string=x)
    x = x.replace("\n", " ").strip()
    x = x.strip()
    x = re.sub(r"(^|\W)\d+", "", x)
    x = x.lower()
    x = re.sub(r'[^a-zA-Z]', ' ', x)
    x = re.sub("\s\s+", " ", x)
    stopwords = {
        'forwarded', 'message', 'lz', 'logitech', 'dear', 'my', 'date', 'i',
        'recently', 'hi', 'hello', 'product', 'serial', 'number', '1', '2',
        '3', '4', '5', '6', '7', '8', '9', '0', 'purchased', 'purchase',
        'support', 'http', 'com', 'logitech', 'www', 'https', 'logi',
        'customercare', 'contact', 'terms', 'blvd', 'gateway', 'newark',
        'usa', 'logo', 'care', 'ca', 'footer', 'use', 'customer', 'owned',
        'us', 'survey', 'americas', 'copyright', 'headquarters', 'owners',
        'respective', 'the', 'rights', 'trademarks', 'reserved', 'property',
        'dear', 'regards', 'thanks', 'mail', 'email', 'lz', 'g', 'x', 'k',
        'date', 'like', 'get', 'one', 'set', 'thank', 'also', 'two', 'see',
        'able', 'n', 'could', 'since', 'last', 'know', 'still', 'got', 'pm',
        'p', 'n', 's', 'operating', 'system', 'platform', 'ce', 's', 'hs',
        'y', 'mr', 'de', 'lfcm', 'sy', 'm', 'kh', 'w', 'ks', 'hs',
        'afternoon', 'morning', 'regards', 'thx', 'thanks', 'fri', 'mon',
        'tue', 'wed', 'thu', 'sat', 'sun', 'jan', 'feb', 'mar', 'apr', 'may',
        'jun', 'jul', 'sep', 'oct', 'nov', 'dec'
    }
    x = x.split()
    x = [word for word in x if word.lower() not in stopwords]
    x = ' '.join(x)
    return x
def parse_details(self, response):
    items = HouzzItem()
    # make sure only un-cached / new records are saved in the spreadsheet
    if not "cached" in response.flags:
        try:
            PhoneNumber = response.xpath(
                "//div[@compid='Profile_Phone']/span[@class='pro-contact-text']/text()")[0].extract()
        except:
            PhoneNumber = "-"
        try:
            ContactPersonRAW = response.xpath(
                "normalize-space(//div[@class='info-list-text']/b[text()='Contact']/../text())")[0].extract()
            ContactPerson = ContactPersonRAW.split(": ")[1]
        except:
            ContactPerson = "-"
        try:
            LocationRAW = response.xpath("//div[@class='info-list-text']/b[text()='Location']/..")
            Street = LocationRAW.xpath("./span[@itemprop='streetAddress']/text()")[0].extract()
            AddressLocality = LocationRAW.xpath("./span[@itemprop='addressLocality']/text()")[0].extract()
            AddressRegion = LocationRAW.xpath("./span[@itemprop='addressRegion']/text()")[0].extract()
            PostalCode = LocationRAW.xpath("./span[@itemprop='postalCode']/text()")[0].extract()
            AddressCountry = LocationRAW.xpath("./span[@itemprop='addressCountry']/text()")[0].extract()
            Location = Street + ", " + AddressLocality + ", " + AddressRegion + ", " + PostalCode + ", " + AddressCountry
        except:
            Location = BeautifulSoup(
                response.xpath("//div[@class='info-list-text']/b[text()='Location']/..")[0].extract(),
                'lxml').get_text()
            Location = Location.replace("Location: ", "")
        items["category"] = response.meta['category'],
        items["posttitle"] = response.meta['posttitle'],
        items["posthref"] = response.meta['posthref'],
        items["location"] = Location,
        items["contact"] = ContactPerson,
        items["phone"] = PhoneNumber
        yield items
        self.logger.info("Item processed!")
        #yield scrapy.FormRequest(GoogleURL, formdata=DataObject, callback=self.dummy, method="POST", dont_filter=True, meta={"refresh_cache":True})
    else:
        # self.logger.info("Page is cached!")
        pass
def HTMLparser(page, blog, url):
    title = None
    content = None
    author = None
    datePublished = None
    dateModified = None
    soup = BeautifulSoup(page, 'lxml')
    doc = Document(page)
    title = doc.short_title()
    content = BeautifulSoup(doc.summary(), 'lxml').get_text()
    try:
        application_json_ld = json.loads(
            soup.find('script', {'type': 'application/ld+json'}).get_text())
    except:
        application_json_ld = None
    if application_json_ld is not None:
        if 'author' in application_json_ld:
            if isinstance(application_json_ld['author'], list):
                author = application_json_ld['author'][0]['name']
            else:
                author = application_json_ld['author']['name']
        if 'datePublished' in application_json_ld:
            datestring = application_json_ld['datePublished']
            datePublished = parse(datestring)
        if 'dateModified' in application_json_ld:
            datestring = application_json_ld['dateModified']
            dateModified = parse(datestring)
    if blog == 'steemit':
        author = soup.find('a', {'class': 'ptc'}).get_text().split(" ")[0]
        datestring = soup.find('span', {'class': 'updated'})['title'].split()[0]
        datePublished = parse(datestring)
    if len(content) < 500:
        return None
    content = content.replace('\n', '')
    return Post(meta={'id': url}, title=title, content=content, rawContent=content,
                author=author, datePublished=datePublished, dateModified=dateModified, url=url)
def obtenerTextoCompleto(response):
    try:
        TextoCompleto = BeautifulSoup(response, "html.parser").find(
            "div", {"class": "fullpost__cuerpo"})
        TextoCompleto = str(TextoCompleto.contents)
        TextoCompleto = re.sub('</p>.*?<p>', ' ', TextoCompleto)
        TextoCompleto = TextoCompleto.replace("]", "").replace("[", "").replace(", '\n',", ' ')
        # TextoCompleto = "<br />".join(TextoCompleto.split("\n"))
        return TextoCompleto
    except Exception as e:
        print("Could not get the full text ", e)
def csv_formater(title, abstract, pmid):
    print("Formatting CSV entry from url number", count)
    title = BeautifulSoup(title).text  # remove html tags
    abstract = BeautifulSoup(abstract).text
    pmid = BeautifulSoup(pmid).text
    # handle characters that cause errors for the ML-reader
    title = title.replace("[", "").replace("]", "")
    abstract = abstract.replace("=", " equals ")
    #abstract = abstract.replace("\n", " ")  # new-line in the paragraph is problematic
    # This regex statement (?<=[A-Za-z0-9()[\]%])\.(?=[A-Za-z()[\]]{2})|(?<=[A-Za-z()[\]%]{2})\.(?=[A-Za-z0-9()[\]])
    # finds all periods without a space after them, as in: (---best of times.It was the worst---),
    # but ignores decimals (12.3, 0.21), abbreviations (N.Y., D.C.), and titles (Dr., Mr.)
    # the replacement value is ". "
    # build the regex as a string in order to loop over the list of punctuation marks:
    # find all punctuation without a trailing space, but skip numbers and abbreviations.
    punctuation = [".", "!", "?"]
    for punk in punctuation:
        repunk = re.escape(punk)
        punkregex = r"(?<=[A-Za-z0-9()[\]%])" + repunk + r"(?=[A-Za-z()[\]]{2})|(?<=[A-Za-z()[\]%]{2})" + repunk + r"(?=[A-Za-z0-9()[\]])"
        title = re.sub(punkregex, punk + " ", title)
        abstract = re.sub(punkregex, punk + " ", abstract)
    # join-split combo reduces all white space to a single space, and eliminates trailing/leading space and \escape-chars
    title = ' '.join(title.split())
    abstract = ' '.join(abstract.split())
    # remove escape markers from '"Error: Failed to Scrape " + esc_url'; added in 'def parse_url(url):'
    title = title.replace("\\", "")
    abstract = abstract.replace("\\", "")
    pmid = pmid.replace("\\", "")
    entry = [[title, abstract, pmid]]
    return entry
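# A quick check of the punctuation-spacing regex built in csv_formater above
# (sketch, assuming the same pattern construction): a sentence-ending mark glued
# to the next word gains a space, while decimals and abbreviations are left alone.
import re

punk = "."
repunk = re.escape(punk)
punkregex = (r"(?<=[A-Za-z0-9()[\]%])" + repunk + r"(?=[A-Za-z()[\]]{2})"
             + r"|(?<=[A-Za-z()[\]%]{2})" + repunk + r"(?=[A-Za-z0-9()[\]])")
print(re.sub(punkregex, punk + " ", "best of times.It was the worst"))
# -> best of times. It was the worst
print(re.sub(punkregex, punk + " ", "a value of 12.3 and N.Y. today"))
# -> a value of 12.3 and N.Y. today   (decimals and abbreviations untouched)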
def parseJSON(self):
    """Parse JSON VUIDs into data struct"""
    # Format of JSON file is:
    #   "API": { "core|EXT": [ {"vuid": "<id>", "text": "<VU txt>"}]},
    # "VK_KHX_external_memory" & "VK_KHX_device_group" - extension case (vs. "core")
    for top_level in sorted(self.json_data):
        if "validation" == top_level:
            for api in sorted(self.json_data[top_level]):
                for ext in sorted(self.json_data[top_level][api]):
                    for vu_txt_dict in self.json_data[top_level][api][ext]:
                        print("Looking at dict for api:ext entry %s:%s" % (api, ext))
                        vuid = vu_txt_dict['vuid']
                        vutxt = vu_txt_dict['text']
                        # strip asciidoc xref from vu text
                        vutxt = re.sub('&amp;lt;&amp;lt;([^&]*,\\s*|)(.*?)&amp;gt;&amp;gt;', '\\2', vutxt)
                        #print ("%s:%s:%s:%s" % (api, ext, vuid, vutxt))
                        #print ("VUTXT orig:%s" % (vutxt))
                        just_txt = BeautifulSoup(vutxt, 'html.parser')
                        #print ("VUTXT only:%s" % (just_txt.get_text()))
                        num_vuid = vuid_mapping.convertVUID(vuid)
                        self.json_db[vuid] = {}
                        self.json_db[vuid]['ext'] = ext
                        self.json_db[vuid]['number_vuid'] = num_vuid
                        self.json_db[vuid]['struct_func'] = api
                        just_txt = just_txt.get_text().strip()
                        unicode_map = {
                            u"\u2019": "'",
                            u"\u201c": "\"",
                            u"\u201d": "\"",
                            u"\u2192": "->",
                        }
                        for um in unicode_map:
                            just_txt = just_txt.replace(um, unicode_map[um])
                        self.json_db[vuid]['vu_txt'] = just_txt.replace("\\", "")
                        print("Spec vu txt:%s" % (self.json_db[vuid]['vu_txt']))
def injectid(obj):
    z = LoggedMessage.objects.get(pk=obj.pk)
    p = sanitise_case(z.site, z.text)
    if not p['status']:
        soup = Soup(z.text, 'xml')
        # GET HID
        k = IssuedIdentifier.objects.filter(site=z.site)
        _all = Identifier.objects.exclude(pk__in=k.values('identifier_id'))
        hid = _all[0]
        print p
        case_ = "household_head_health_id" if p['household'] else "health_id"
        case_type = p['form_type']
        c = soup.find(case_)
        mm = "<%s>%s</%s>" % (case_, hid.identifier, case_)
        c = str(c)
        soup = str(soup)
        soup = soup.replace(c, mm)
        soup = soup.replace("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n", "")
        y = "<%s> %s </%s>" % (case_type, soup, case_type)
        COMMCARE_URL = COMMCARE_LINK % z.site
        print "HID: %s \n" % hid.identifier
        print "COMMCARE_URL: %s \n" % COMMCARE_URL
        print y
        print "=========================================================="
        form = {'data': y, 'SUBMIT_TO_COMMCARE': SUBMIT_TO_COMMCARE, 'COMMCARE_URL': COMMCARE_URL}
        if transmit_form(form):
            s = LoggedMessage()
            s.text = y
            s.direction = s.DIRECTION_OUTGOING
            s.response_to = z
            s.site = z.site
            s.save()
            z.status = s.STATUS_SUCCESS
            z.save()
            p = IssuedIdentifier()
            p.status = IssuedIdentifier.STATUS_ISSUED
            p.identifier = hid
            p.site = z.site
            p.save()
        else:
            s = LoggedMessage()
            s.text = y
            s.direction = s.DIRECTION_OUTGOING
            s.response_to = z
            s.site = z.site
            s.save()
            z.status = s.STATUS_ERROR
            z.save()
def titleFilter(self, pageSoup):
    title = BeautifulSoup(str(pageSoup.find_all(
        'h1', {'class': 'trb_ar_hl_t'}))).get_text().encode('ascii', errors='ignore')
    title = title.replace("\n", "")
    title = title.replace("  ", "")
    return title
def parsePage(url):
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data)
    invalid_tags = ['b', 'i', 'u', 'ul', 'li', 'p', 'em']
    soup = soup.find(id='primary')
    for tag in invalid_tags:
        for match in soup.findAll(tag):
            match.replaceWithChildren()
    for match in soup.findAll('span'):
        match.replaceWith('')
    for match in soup.findAll('div'):
        match.replaceWith('')
    soup = str(soup)
    soup = soup.replace('<strong>', "%")
    soup = soup.replace('</strong>', "%")
    finalOutput = soup.split('%')
    for n in range(0, 4):
        finalOutput[n] = ""
    return finalOutput