def parse_quotes(htmlitem):
    '''
    Parse quotes in an HTML item.
    Convert all i-tags into quote-tags. Delete all other tags.
    '''
    quoteMatch_with_comma = re.match('(?u)(.*?)<i.*?>(.{5,}?),(.{5,}?)</i>'
                                     '(.*)', htmlitem)
    quoteMatch = re.match('(?u)(.*?)(<i.*?>.{5,}?</i>)(.*)', htmlitem)
    if quoteMatch_with_comma:
        before_q = BeautifulSoup(quoteMatch_with_comma.group(1)).get_text()
        quote1 = BeautifulSoup(quoteMatch_with_comma.group(2)).get_text()
        quote2 = BeautifulSoup(quoteMatch_with_comma.group(3)).get_text()
        if quote1.strip() != '' and quote2.strip() != '':
            parsedItem = before_q + '<quote>' + quote1 + '</quote>,' \
                + '<quote>' + quote2 + '</quote>' \
                + parse_quotes(quoteMatch_with_comma.group(4))
        else:
            return unicode(BeautifulSoup(htmlitem).get_text())
        return unicode(parsedItem)
    elif quoteMatch:
        quote = BeautifulSoup(quoteMatch.group(2)).get_text()
        if 'siehe' not in quote and 'Siehe' not in quote \
                and quote.strip() != '':
            parsedItem = BeautifulSoup(quoteMatch.group(1)).get_text() \
                + '<quote>' + quote + '</quote>' \
                + parse_quotes(quoteMatch.group(3))
            return unicode(parsedItem)
        else:
            return unicode(BeautifulSoup(htmlitem).get_text())
    else:
        return unicode(BeautifulSoup(htmlitem).get_text())
def crude_parsing(self):
    crude_list = []
    standard_fields = [
        "from:", "to:", "cc:", "bcc:", "mime-version:", "content-type:",
        "x-from:", "x-to:", "x-cc:", "content-transfer-encoding:",
        "x-bcc:", "x-filename", "subject:", "message-id:", "x-origin:"
    ]
    with open(self.origin_file) as f:
        for line in f:
            line = line.decode("utf-8", "ignore").encode("utf-8").lower()
            try:
                line = BeautifulSoup(line, "html.parser").getText()
            except Exception as e:
                line = ""
            line = line.lower()
            if line in ['\n', '\r\n']:
                crude_list.append("content: " + line.strip())
            else:
                content = False
                for field in standard_fields:
                    if line.startswith(field):
                        content = True
                        crude_list.append(line.strip())
                if not content:
                    if len(crude_list) > 0:
                        crude_list[len(crude_list) - 1] += " " + line.strip()
                    else:
                        crude_list.append("content: " + line.strip())
    return crude_list
def GetAllResInPage(tgtUrl):
    # Target URL
    r = requests.get(tgtUrl)
    # First argument: content to parse; second argument: parser (here, HTML)
    soup = BeautifulSoup(r.content, "html.parser")
    # Collect the specific classes under the dt / dd tags.
    resheads = soup.find_all("dt", class_="st-bbs_reshead")
    resbodys = soup.find_all("dd", class_="st-bbs_resbody")
    formattedHead = []
    formattedBody = []
    resCount = 0

    # Extract and format the header part of each post
    for rhead in resheads:
        h = rhead
        # Re-stringify the retrieved dt element and run it through BeautifulSoup
        # again, so the tags inside the dt can be picked up the same way
        hObj = BeautifulSoup(str(h), 'html.parser')
        # Fetch the individual tags (and classes) inside the dt tag
        bbs_resNo = hObj.find('span', class_='st-bbs_resNo').getText()
        bbs_name = hObj.find('span', class_='st-bbs_name').getText()
        bbs_resInfo = hObj.find('div', class_='st-bbs_resInfo').getText()
        # The resInfo text needs some cleanup: trim both ends and normalise newlines
        bbs_resInfo = bbs_resInfo.strip()
        bbs_resInfo = bbs_resInfo.strip('\n')
        bbs_resInfo = bbs_resInfo.replace('\n', ' ')
        # The text contains long runs of spaces, so collapse multiple spaces
        # into a single space with a regex
        pattern = r' +'
        bbs_resInfo = re.sub(pattern, ' ', bbs_resInfo)
        # print(bbs_resNo, bbs_name, bbs_resInfo)
        # Join the extracted strings into one space-separated line and append it
        # to the formatted header list
        resHeaders = [bbs_resNo, bbs_name, bbs_resInfo]
        h = ' '.join(resHeaders)
        formattedHead.append(h)

    # Extract and format the body part of each post
    for rbody in resbodys:
        # Cast the body to str, replace <br> tags with newline characters, then
        # turn it back into a bs4 object. Without this, every line break that
        # existed on the web page disappears and each post collapses to one line.
        b = str(rbody)
        b = b.replace("<br>", "\n")
        b = b.replace("<br/>", "\n")
        b = BeautifulSoup(b, "html.parser").getText()
        b = b.strip()      # trim surrounding spaces
        b = b.strip('\n')  # trim surrounding newlines
        formattedBody.append(b)
        # Counting either heads or bodies works; this is the number of posts on
        # this page (normally 30, but the last page may have fewer)
        resCount += 1

    return resCount, formattedHead, formattedBody
def chapter(self):
    chapter = BeautifulSoup(self.__kontenHalaman, features='html.parser').title.text.lower()
    chapter = re.sub(r'.+?\s[chapter]{6,7} ([\d\-\_\s\.]+).+', r'\g<1>', chapter)
    chapter = re.sub('-', '.', chapter)
    if chapter.find('.') < 0:
        chapter = re.sub(r'\s+', '.', chapter.strip())
    else:
        chapter = re.sub(r'\s', '', chapter.strip())
    return chapter.strip('.')
def clean(texts):
    processed_texts = []
    for text in texts:
        text = BeautifulSoup(text, "html.parser")
        text = text.get_text().encode('ascii', 'ignore').decode('utf-8')
        text = re.sub(r"\\", "", text)
        text = re.sub(r"\'", "", text)
        text = re.sub(r"\"", "", text)
        # strip()/lower() return new strings, so the result must be reassigned
        text = text.strip().lower()
        processed_texts.append(text)
    return processed_texts
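# A minimal usage sketch for clean() above; the sample strings are
# illustrative and not part of the original source:
# >>> clean(["<p>Hello <b>World</b>!</p>", 'It\'s a "test"'])
# ['hello world!', 'its a test']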
def description(self) -> [str]:
    """Return the description as a list of str where each str is a bullet
    point.
    """
    html_text = self._data['description'].split('<li>')
    parsed_text = []
    for html_line in html_text:
        line = BeautifulSoup(html_line, features='html.parser').get_text()
        line = unicodedata.normalize('NFKD', line)
        if line.strip() != '':
            parsed_text.append(line.strip())
    return parsed_text
def upload():
    count = 0
    with open("v1/data.txt", "r") as fr:
        lines = fr.readlines()
        while len(lines) > 0:
            count += 1
            curr_line = lines.pop(0)
            line_buf = []
            while curr_line != '\n' or len(line_buf) < 5:
                line_buf.append(curr_line)
                if len(lines) > 0:
                    curr_line = lines.pop(0)
                else:
                    break
            # print(line_buf[0])
            # print(line_buf[-1])
            # print()
            """
            if len(line_buf[0]) > max_name[0]:
                max_name[0] = len(line_buf[0])
                max_name[1] = line_buf[0]
            for rev in line_buf[-3:]:
                if len(rev) > max_rev[0]:
                    max_rev[0] = len(rev)
                    max_rev[1] = rev
            """
            # print(line_buf)
            desc = line_buf[1]
            for piece in line_buf[2:-3]:
                desc = desc + piece
            desc = BeautifulSoup(desc.strip()).text
            """
            if len(desc) > max_desc[0]:
                max_desc[0] = len(desc)
                max_desc[1] = desc
            """
            init_piece = line_buf[0].split("#")
            Product(name=BeautifulSoup(init_piece[0].strip()).text,
                    amazon_id=init_piece[1].strip(),
                    description=desc.strip(),
                    review1=line_buf[-3].strip(),
                    review2=line_buf[-2].strip(),
                    review3=line_buf[-1].strip()).save()
            print(count)
async def get_page_title(url: str, allow_hostname=True, allow_path: bool = False,
                         allow_filename: bool = True) -> Optional[str]:
    r = None
    # noinspection PyBroadException
    try:
        r = await get(url=url, timeout=2, decode=True, intended_content_type='text/html')
        if r.status != 200 or not r.content:
            raise ValueError('not an HTML page')
        if len(r.content) <= 27:  # len of `<html><head><title></title>`
            raise ValueError('invalid HTML')
        title = BeautifulSoup(r.content, 'lxml').title.text
        return title.strip()
    except Exception:
        content_disposition = r.headers.get('Content-Disposition') if r else None
        filename_match = contentDispositionFilenameParser(content_disposition) \
            if content_disposition else None
        if filename_match and allow_filename:
            return filename_match.group()
        url_parsed = urlparse(url)
        if allow_path:
            path = url_parsed.path
            return path.rsplit('/', 1)[-1] if path else None
        if allow_hostname:
            return url_parsed.hostname
def table_content_list(output_file):
    html = mammoth.convert_to_html(output_file).value
    soup = BeautifulSoup(html, "html.parser")
    # print("soup------->", soup)
    table_content_list_all = []
    for tables in soup.find_all('table'):
        for row in tables.find_all('tr'):
            column_list = []
            for column in row.find_all('td'):
                # column_list.append(str(column).replace('<td>', '').replace('</td>', '').replace('</p>', '').replace('<p>', '').replace('<td colspan="2">', '').strip())
                raw_html = str(column).replace(
                    '<strong>', 'start_bold').replace('</strong>', 'end_bold').replace('</p>', '\n').strip()
                cleantext = BeautifulSoup(raw_html, "lxml").text
                cleantext = cleantext.replace('start_bold', '<b>').replace(
                    'end_bold', '</b>')
                # unescape any remaining angle-bracket entities and drop newlines
                cleantext = cleantext.replace('&lt;', '<').replace(
                    '&gt;', '>').replace('\n', '')
                column_list.append(cleantext.strip())
            column_list = [i for i in column_list if i]
            # print(column_list)
            table_content_list_all.append(column_list)
    table_content_list_all = [x for x in table_content_list_all if x != []]
    return table_content_list_all
def text(url):
    if 'http' not in url:
        url = 'http://' + url
    page = get(url).text
    doc = Document(page).summary()
    text = BeautifulSoup(doc).get_text()
    return text.strip()
def clean_comment(self, description):
    try:
        sent = BeautifulSoup(description, "lxml").get_text()
        return sent.strip().strip("/*").strip("//").strip("*").strip()
    except Exception:
        traceback.print_exc()
        return ""
def getNews():
    try:
        # List of RSS feeds that we will fetch and combine
        newsurls = {
            'dailynews': 'http://www.dailymirror.lk/RSS_Feeds/business-main'
        }

        # Iterate over the feed urls
        for key, url in newsurls.items():
            all_headlines.extend(getHeadlines(url))

        # Iterate over the allheadlines list and print each headline
        for hl in all_headlines:
            try:
                html_text = urllib.urlopen(hl).read()
                parsed_text = BeautifulSoup(html_text, "html.parser")
                desc = parsed_text.findAll(attrs={"class": "row inner-text"})
                # print desc
                content_news = desc[0].encode('utf-8')
                para = str(content_news).split("<p>")
                final_string = ""
                for a in para:
                    if "img" in a or "iframe" in a or "!--" in a:
                        pass
                    else:
                        final_string = final_string + a
                append_string = BeautifulSoup(final_string, "html.parser").text
                printLog('info', "News item: " + append_string.strip())
                cleantext.append(append_string)
            except Exception as e:
                print e
        printLog('output', "Parsed text: " + str(cleantext))
        return cleantext
    except Exception as e:
        printLog('error', e)
def _parser(self, content, parse, add_new_line=False):
    content = str(content).replace('<br/>', '\n').replace(parse, '')
    content = BeautifulSoup(content, 'html.parser').text
    content = '\n'.join([s.strip() for s in content.strip().split('\n')])
    if add_new_line:
        content += '\n'
    return content
def process_line(self, line):
    line = BeautifulSoup(line).text
    line = line.replace(',', ' , ')
    line = line.replace('.', ' . ')
    line = line.strip().split()
    line = [word.strip() for word in line]
    return line
def text_clean(text, url_removal, tag_removal, stem_stop_punc, punc_removal):
    """
    cleaning a text
    :param text: input text of any length
    :param url_removal: flag for removing urls from text
    :param tag_removal: flag for removing tags from text
    :param stem_stop_punc: flag for removing stop words, stemming tokens, and removing punctuations
    :param punc_removal: flag for removing punctuations
    :return: cleaned text
    """
    # removing urls from text
    if url_removal is True:
        text = remove_url(text)
    # removing HTML tags
    if tag_removal is True:
        text = BeautifulSoup(text, "lxml").text
    # stop word removal, stemming the tokens, and punctuations removal
    if stem_stop_punc is True:
        text = stop_word_removal(text, 1, punc_removal)
    # removing new line characters
    text = text.replace('\n', ' ')
    # filtering non-printable characters
    text = ''.join([i if ord(i) < 128 else ' ' for i in text])
    # removing more than one space
    text = ' '.join(text.split())
    return text.strip()
def save_html(filename_without_path, page_source, message, prettify=True):
    """Save html dump of the current page. Path is determined by LOCAL_SERVER setting

    :param str filename_without_path: The base filename without the absolute path
    :param str page_source: The html source to save
    :param str message: The message to display
    :param bool prettify: if True, prettifies html
    """
    if getattr(settings, 'TAKE_TEST_HTML', False):
        if settings.VERBOSE_OCOM_TEST_CLASSES:
            console(message)

        # check extension
        extension = os.path.splitext(filename_without_path)[1].lower()
        if extension != '.html':
            raise KeyError("Unknown extension for file: {}".format(filename_without_path))

        save_path = html_path()
        if not os.path.exists(save_path):
            pathlib.Path(save_path).mkdir(parents=True, exist_ok=True)

        with open(html_path() + filename_without_path, 'w') as html_file:
            the_html = BeautifulSoup(str(page_source), 'html.parser').prettify() if prettify else str(page_source)
            # noinspection PyArgumentEqualDefault
            html_file.write(the_html.strip())
def clean_str(raw: Optional[str], strip_trailing_period: bool = False) -> Optional[str]:
    """
    Takes a str and "cleans" it. Intended to be usable with short strings
    (names, titles) in any language. See scrub_text(), which extends this
    function for paragraph length and longer text fields.
    """
    if not raw:
        return None
    text = ftfy.fix_text(raw)

    # remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # TODO: for performance, compile these as globals?
    # replaces whitespace with single space
    text = re.sub(r"\s+", " ", text).strip()
    # TODO: shouldn't HTML be parsing these out?
    text = text.replace("<em>", "").replace("</em>", "")

    text = text.strip()
    if strip_trailing_period and text.endswith("."):
        text = text[:-1]
    if text.lower() in UNWANTED_SHORT_STRINGS:
        return None
    if not text:
        return None
    return text
def parse_content(self, response):
    item = PoemsSpiderItem()
    title = response.css('div.sons h1').xpath('text()').extract()[0]
    print(title)
    item['title'] = response.css('div.sons h1').xpath('text()').extract()[0]
    try:
        author = response.css('div.cont p.source a').xpath('text()').extract()[1]
    except:
        author = ''
    content = response.xpath('//div[@class="contson"]')[0].extract()
    # strip out the HTML tags
    content = BeautifulSoup(content, 'xml').get_text()
    # strip out whitespace
    content = content.strip().replace("\n", "").replace(' ', '')
    # remove any parenthesised annotations (half-width and full-width parentheses)
    content = re.sub(r'\([^)]*\)', '', content)
    content = re.sub('（[^）]*）', '', content)
    # add a line break after each full stop
    content = re.sub("。", "。\n", content)
    content = content.rstrip("\n")
    item['author'] = author
    item['content'] = content
    yield item
def _clean_str(self, string):
    """
    desc:
        This function cleans a string
        adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    args:
        string: the string to be cleaned
    returns:
        a cleaned string
    """
    string = BeautifulSoup(string, "lxml").text
    string = re.sub(r"[^A-Za-z0-9(),!?\"\`]", " ", string)
    string = re.sub(r"\"s", " \"s", string)
    string = re.sub(r"\"ve", " \"ve", string)
    string = re.sub(r"n\"t", " n\"t", string)
    string = re.sub(r"\"re", " \"re", string)
    string = re.sub(r"\"d", " \"d", string)
    string = re.sub(r"\"ll", " \"ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower().split(" ")
def reformat(directory, media_id, file_name, text, ext, date, username,
             format_path, date_format, text_length, maximum_length):
    media_id = "" if media_id is None else str(media_id)
    has_text = False
    if "{text}" in format_path:
        has_text = True
    path = format_path.replace("{username}", username)
    text = BeautifulSoup(text, 'lxml').get_text().replace("\n", " ").strip()
    SAFE_PTN = '[^0-9a-zA-Z-_.()]+'
    filtered_text = re.sub(SAFE_PTN, ' ', text.strip()
                           ).strip().replace(' ', '_')[:text_length]
    path = path.replace("{text}", filtered_text)
    date = date.strftime(date_format)
    path = path.replace("{date}", date)
    path = path.replace("{id}", media_id)
    path = path.replace("{file_name}", file_name)
    path = path.replace("{ext}", ext)
    directory2 = directory + path
    if has_text:
        count_string = len(path)
        text_count = len(filtered_text)
        if count_string > maximum_length:
            text_limit = count_string - text_count
            path = path.replace(
                filtered_text, filtered_text[:-text_limit])
            directory2 = directory + path
    return directory2
def reviews_to_words(raw_reviews, skip_stop_words):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    num_reviews = raw_reviews.size
    letter_only_re = re.compile('[^a-zA-Z]')
    final_words_list = []
    print("======== tokenizing review into words ========")
    for i in range(num_reviews):
        print('encoding: %.1f%%\r' % (float(i) / raw_reviews.size * 100))
        review = raw_reviews[i]
        review_text = BeautifulSoup(review).get_text()
        sentences_list = tokenizer.tokenize(review_text.strip())
        words = []
        for j in range(len(sentences_list)):
            sents = sentences_list[j]
            sents = letter_only_re.sub(" ", sents)
            ws = sents.split(' ')
            ws = filter(lambda x: len(x) > 0, ws)
            if skip_stop_words:
                meaningful_words = [w for w in ws if w not in stop_words]
                words += meaningful_words
            else:
                words += ws
        final_words_list.append(words)
    return final_words_list
def _html_to_text(self, html):
    # Hack to prevent Beautiful Soup from collapsing space-keeping tags
    # until no whitespace remains at all
    html = re.sub("<(br|p|li)", " \\g<0>", html, flags=re.IGNORECASE)
    text = BeautifulSoup(html, "html.parser").get_text()
    # Idea from http://stackoverflow.com/a/1546251
    return " ".join(text.strip().split())
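# A self-contained sketch of the same space-preserving idea as _html_to_text
# above (a hypothetical standalone helper, not part of the original class).
import re
from bs4 import BeautifulSoup

def html_to_text(html):
    # Pad block-level tags with a space so stripping the markup keeps a
    # separator: "a<br>b" becomes "a b" instead of collapsing to "ab".
    html = re.sub(r"<(br|p|li)", r" \g<0>", html, flags=re.IGNORECASE)
    text = BeautifulSoup(html, "html.parser").get_text()
    return " ".join(text.strip().split())

# Example: html_to_text("one<br>two<p>three</p>") -> "one two three"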
def clean_text(text):
    try:
        text = re.sub(r"http\S+", "", text)
    except:
        print(text)
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", text)
    text = re.sub(r"\'s", " \'s", text)
    text = re.sub(r"\'ve", " \'ve", text)
    text = re.sub(r"n\'t", " n\'t", text)
    text = re.sub(r"\'re", " \'re", text)
    text = re.sub(r"\'d", " \'d", text)
    text = re.sub(r"\'ll", " \'ll", text)
    text = re.sub(r",", " , ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\(", " \( ", text)
    text = re.sub(r"\)", " \) ", text)
    text = re.sub(r"\?", " \? ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = BeautifulSoup(text).text  # HTML decoding
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text)  # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", text).split())
    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
    text = emoji.demojize(text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # delete stopwords from text
    text = text.replace(":", " ")
    text = ' '.join(text.split())
    return text.strip().lower()
def get_proxy():
    response = requests.get(
        "http://qsrdk.daili666api.com/ip/?tid=" + keyinfo["tid"] + "&num=1")
    # parse the returned page with lxml and take the first <p> text node
    proxys = BeautifulSoup(response.text, "lxml").p.contents[0]
    print(proxys)
    return proxys.strip()
def clean_str(review_docs, method=2):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    output_docs = []
    if method == 1:
        for string in review_docs:
            string = BeautifulSoup(string, "lxml").get_text()
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            string = string.strip().lower()
            string = string.split(" ")
            output_docs.append(string)
    elif method == 2:
        for string in review_docs:
            words = gensim.utils.to_unicode(string).split()
            output_docs.append(words)
    return output_docs
def preprocessdata(tweets):
    tweets = tweets.lower()
    # ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweets).split())
    # Remove URLs.
    tweets = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^/s]+))', '', tweets)
    # Collapse letters repeated more than twice.
    tweets = re.compile(r'(.)\1{2,}', re.IGNORECASE).sub(r'\1', tweets)
    # Remove usernames.
    tweets = re.sub('@[^\s]+', ' ', tweets)
    tweets = BeautifulSoup(tweets, features='lxml').get_text()
    # Remove punctuation.
    tweets = re.sub('[^\w\s]', "", tweets)
    # Remove the '#' sign.
    tweets = re.sub(r'#([^\s]+)', r'\1', tweets)
    # Collapse multiple spaces into a single space.
    tweets = re.sub('[\s]+', ' ', tweets)
    tweets = re.sub('<.*?>', " ", tweets)
    # Remove '&' tags.
    tweets = re.sub('&[\s]+', ' ', tweets)
    tweets = re.sub(r'[^a-zA-Z\s]', '', tweets, flags=re.I | re.A)
    tweets = tweets.strip()
    return tweets
def txt2words(txt, lower=True, is_html=False, remove_none_english_chars=True,
              remove_stop_words=True):
    """
    Split text into a words list
    :param txt: the input text
    :param lower: whether to lowercase the text.
    :param is_html: if True then remove HTML tags using BeautifulSoup
    :param remove_none_english_chars: if True then remove non-English chars from text
    :param remove_stop_words: if True then remove stop words from text
    :return: words list created from the input text according to the input parameters.
    :rtype: list
    """
    if is_html:
        txt = BeautifulSoup(txt).get_text()
    if lower:
        txt = txt.lower()
    if remove_none_english_chars:
        txt = re.sub("[^a-zA-Z]", " ", txt)
    words = TrainSentences.RE_WIHTE_SPACES.split(txt.strip().lower())
    if remove_stop_words:
        # remove stop words from text
        words = [w for w in words if w not in TrainSentences.STOP_WORDS]
    return words
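# Hedged usage sketch: TrainSentences is assumed to be a class defined
# elsewhere that exposes a RE_WIHTE_SPACES regex and a STOP_WORDS set, so this
# only illustrates the intended shape of the output.
# >>> txt2words("<p>The Quick Brown Fox!</p>", is_html=True)
# ['quick', 'brown', 'fox']   # assuming "the" is in TrainSentences.STOP_WORDS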
def preprocessing_french(x):
    x = BeautifulSoup(x)
    x = EmailReplyParser.parse_reply(x.get_text())
    x = re.sub(r'<.*?>', '', x)
    x = x.replace("\n", " ").strip()
    x = re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', repl='', string=x)
    x = x.replace("\n", " ").strip()
    x = x.strip()
    x = re.sub(r"(^|\W)\d+", "", x)
    x = x.lower()
    stopwords = {
        'merci', 'de', 'nous', 'aider', 'au', 'plus', 'vite', 'bonjour', 'la',
        'le', 'en', 'message', 'cordialement', 'logitech', 'cher', 'mon',
        'date', 'je', 'récemment', 'salut', 'produit', 'en série', 'nombre',
        'achat', 'soutien', 'http', 'com', 'vous', 'logitech', 'www', 'https',
        'logi', 'service à la clientèle', 'contact', 'termes', 'passerelle',
        'newark', 'usa', 'logo', 'care', 'ca', 'footer', 'use', 'customer',
        'owned', 'us', 'survey', 'americas', 'copyright', 'headquarters',
        'owners', 'number', 'respective', 'the', 'rights', 'trademarks',
        'reserved', 'property', 'dear', 'regards', 'thanks', 'mail', 'email',
        'date', 'like', 'get', 'one', 'set', 'thank', 'also', 'two', 'see',
        'able', 'could', 'since', 'last', 'know', 'still', 'got', 'pm', 'p',
        'puisque', 'operating', 'system', 'platform', 'ce', 'mr', 'de',
        'lfcm', 'sy', 'm', 'kh', 'w', 'ks', 'hs', 'afternoon', 'morning',
        'regards', 'thx', 'thanks',
        'fri', 'mon', 'tue', 'wed', 'thu', 'sat', 'sun',
        'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'sep', 'oct', 'nov', 'dec'
    }
    x = x.split()
    x = [word for word in x if word.lower() not in stopwords]
    x = ' '.join(x)
    return x
class MovieReview(object):
    def __init__(self, mreview):
        self.mreview = mreview
        self.mreview_clean = None
        self.mreview_word_list = []
        self.mreview_sentence_list = []

    def clean_review(self):
        # function to clean the review by stripping html from review text body
        self.mreview_clean = BeautifulSoup(self.mreview).get_text()

    def remove_punctuation_and_nums(self):
        self.mreview_clean = re.sub("[^a-zA-Z]", " ", self.mreview_clean)

    def split_review_into_words(self):
        # function to split the review text to a list of words
        self.mreview_word_list = self.mreview_clean.lower().split()

    def remove_stop_words(self):
        self.mreview_word_list = [word for word in self.mreview_word_list
                                  if word not in set(stopwords.words("english"))]
        self.mreview_clean = " ".join(self.mreview_word_list)

    def split_review_into_sentences(self):
        # function to split review into list of sentences
        # where each sentence is a list of words
        extracted_sentences = TOKENIZER.tokenize(self.mreview_clean.strip())
        for extracted_sentence in extracted_sentences:
            if len(extracted_sentence) > 0:
                # extracted_sentence needs to be operated on if stopword or punctuation
                # removal is required eventually (not required for word2Vec)
                self.mreview_sentence_list.append(extracted_sentence.lower().split())
def redis_write():
    redis_cli = getRedisClient(db=15)
    fw = open("/hdd/crawl_result/daypop.json", "w")
    for key in redis_cli.scan_iter():
        label = key.split(":")[0]
        value = redis_cli.get(key)
        d = json.loads(value)
        text = BeautifulSoup(d['html'], 'html.parser').get_text()
        # text = re.sub("\n+", "\n", text)
        text = '\n'.join(
            [t.strip() for t in text.split("\n") if t.strip() != ''])
        if text.strip() == "":
            continue
        print("*" * 50 + d['article_id'] + '*' * 50 + d['url'] + "*" * 50)
        print(text)
        save_str = json.dumps(dict(id=d['article_id'],
                                   url=unquote(d['url']),
                                   title=d['title'],
                                   daypop_label=label,
                                   text=text),
                              ensure_ascii=False)
        fw.write(save_str + '\n')
def get_blog():
    url = "http://127.0.0.1:5000/blog"
    f = urllib.request.urlopen(url)
    the_html = f.read()
    quotes = BeautifulSoup(the_html, "lxml").text
    quotes = quotes.replace("[", "")
    quotes = quotes.replace("]", "")
    quotes = quotes.replace("\n", "")
    quotes = quotes.strip()
    quotes = quotes.split(',')
    useful_var = 0
    useful_var2 = 0
    counter = 0
    obj_array = []
    for stuff in quotes:
        if counter == 0:
            useful_var = stuff
            useful_var = useful_var.replace('"', "")
            counter += 1
        if counter == 1:
            useful_var2 = stuff
            useful_var2 = useful_var2.replace('"', "")
            counter += 1
        else:
            item = Posts(useful_var, useful_var2, stuff)
            obj_array.append(item)
            useful_var = 0
            counter = 0
    print(obj_array)
    return render_template("RandomPosts.html", quotes=obj_array)
def preProcess(comment):
    # To Do
    # https://stackoverflow.com/a/47091490/4084039
    def decontracted(phrase):
        # specific
        phrase = re.sub(r"won't", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)
        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        return phrase

    sentance = re.sub(r"http\S+", "", comment)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    sentance = ' '.join(e.lower() for e in sentance.split()
                        if e.lower() not in config.STOP_WORDS)
    return sentance.strip()
def tweet_to_sentences(tweet):
    tmpList = []
    raw_tweet = BeautifulSoup(tweet).get_text()
    raw_sentences = tokenizer.tokenize(raw_tweet.strip())
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            tmpList.append(raw_sentence)
    return tmpList
def Detail_left_fc(htmltext):
    Detail_left = str(htmltext.find('div', {"class": "detail_left"}).contents)
    a = Detail_left.split('<h4>')
    Description = a[1].split('</h4>')
    Description = Description[1]
    Description = BeautifulSoup(Description).text
    Description = Description.strip(', ')
    Description = suittext(Description)
    return Description
def connectionLost(self, reason):
    title = BeautifulSoup(self.data).title
    if title:
        title = title.string
    else:
        title = "No title provided."
    self.finished.callback("%s (%s)" % (title.strip(), self.url))
def get_text_from_td(td):
    """If we have html in the td then extract the text portion"""
    # just so we don't waste time parsing non html
    if "<" not in td:
        ret = td
    else:
        ret = BeautifulSoup(td).text
    # We change the td into ascii as a way to remove characters
    # such as \xa0 (non break space) which mess with the ordering
    # comparisons
    ret = ret.strip().encode('ascii', errors='replace').replace('?', ' ')
    # Case where the td is empty
    if not len(ret):
        return "0"
    else:
        return ret
def get_restaurants(url):
    try:
        urls = get_page_urls(url)
        for url in urls:
            data = get_text_from_url(url)
            search_div = BeautifulSoup(str(data)).find('div', class_='search-results-content')
            uls = BeautifulSoup(str(search_div)).findAll('ul', class_='ylist ylist-bordered search-results')
            for restaurant in BeautifulSoup(str(uls[1])).findAll('li', class_='regular-search-result'):
                main_attrs = BeautifulSoup(str(restaurant)).find('div', class_='main-attributes')
                rating = BeautifulSoup(str(main_attrs)).find('div', class_='rating-large')
                rating_data = str(BeautifulSoup(str(rating)).find('i').attrs['title'])
                rating_data = rating_data.replace('star rating', '')
                review_count = str(BeautifulSoup(str(main_attrs)).find('span', class_='review-count rating-qualifier').text.strip())
                review_count = review_count.replace(' reviews', '')
                sub_url = BeautifulSoup(str(main_attrs)).find('a').attrs['href']
                url = 'http://www.yelp.com' + sub_url
                category_data = BeautifulSoup(str(main_attrs)).find('div', class_='price-category')
                category_str_list = BeautifulSoup(str(category_data)).findAll('span', class_='category-str-list')
                categories = ''
                for a in BeautifulSoup(str(category_str_list)).findAll('a'):
                    categories = categories + a.text.strip() + ','
                expensive_level = BeautifulSoup(str(category_data)).find('span', 'business-attribute price-range').text
                h3 = BeautifulSoup(str(restaurant)).find('h3', class_='search-result-title')
                h3_a = BeautifulSoup(str(h3)).find('a').text
                name = h3_a.strip()
                sec_attrs = BeautifulSoup(str(restaurant)).find('div', class_='secondary-attributes')
                address = BeautifulSoup(str(sec_attrs)).find('address')
                if '<br/>' in str(address):
                    address = str(address).replace('<br/>', ' ')
                address = BeautifulSoup(str(address)).find('address').text.strip()
                city = get_city_from_address(address)
                if not str(city).lower() in address.lower():
                    print 'Invalid city detected'
                RestaurantModel.objects.create(
                    name=name,
                    expensivelevel=expensive_level,
                    city=city,
                    current_rating=float(rating_data),
                    url=url,
                    category=categories,
                    address=address,
                    reviewcount=review_count
                )
        set_db_status(False)
    except Exception, e:
        print str(e) + ' get restaurants'
        set_db_status(False)
def Description_awarded(htmltext):
    Detail_left = str(htmltext.find('div', {"class": "detail_left"}).contents)
    a = Detail_left.split('<h4>')
    Des = a[1].split('</h4>')
    Description = Des[1].split('<table class="additional_data">')
    Description = Description[0]
    Description = BeautifulSoup(Description).text
    Description = Description.strip(', ')
    Description = suittext(Description)
    return Description
def get_unsort_position(self, bm_body):
    array_bm_body = bm_body.split("\n")
    # print(array_bm_body)
    init = 0
    for html_line in array_bm_body:
        # print(init)
        html_line_text = BeautifulSoup(html_line, self.FLAG_BS_PARSER).text
        if html_line_text.strip() == self.unsorted_bookmarks_title:
            self.unsorted_bookmarks_line = init
        init += 1
def __login(self):
    if self.__session is None:
        print('Logging "{0}" into site'.format(self.__username))
        self.__session = requests.Session()
        payload = {'username': self.__username,
                   'password': self.__password,
                   'login-form-type': 'pwd'}
        r = self.__session.post(self.__loginUrl, payload, verify=False)
        data = BeautifulSoup(r.content).getText().strip()
        if data.strip() == "login_success":
            print("Login Succeeded")
        else:
            raise Exception("Login Failure")
def normalize_tag(tag):
    """
    converts things like "-noise-" to "noise"
    and "- noise -" to "noise"
    """
    if tag.startswith("-"):
        tag = tag[1:]
    if tag.endswith("-"):
        tag = tag[:-1]
    # fix for HTML entities
    tag = BeautifulSoup(tag).prettify(formatter="html")
    tag = tag.strip().lower()
    return tag
def clean_sentence(self, sentence):
    if self.html_clean:
        sentence = BeautifulSoup(sentence).get_text()  # removing html markup
    sentence = sentence.lower()  # everything to lowercase
    # sentence = ''.join(x for x in sentence if x.isalnum() or x == " ")
    for ch_rep in self.clean_list:
        sentence = re.sub(ch_rep[0], ch_rep[1], sentence)
    sentence = ' '.join(filter(lambda x: x not in self.stopwords_eng, sentence.split()))
    sentence = ' '.join(filter(lambda x: len(x) > 1, sentence.split()))
    sentence = sentence.strip(" ")  # Remove possible extra spaces
    if self.split_words:
        sentence = sentence.split()
    return sentence
def htmlToTxt(dirPath, nameFile):
    '''Converting a html file into a plain text file and writing it
    with '.txt' extension in the same directory
    '''
    print 'Converting ' + nameFile
    # extracting text from the html file
    html = urlopen(os.path.join(dirPath, nameFile))
    rawTxt = BeautifulSoup(html).get_text()
    # writing the text file
    to_file = open(os.path.join(dirPath, nameFile) + '.txt', 'w')
    print >> to_file, rawTxt.strip()
    to_file.close()
    return rawTxt
def getname():
    names = []
    names.append("クロムクロ ブルーレイ")
    url = "http://www.amazon.co.jp/b/ref=s9_acss_bw_fb_junglest_b4?_encoding=UTF8&node=4367309051&pf_rd_m=AN1VRQENFRJN5&pf_rd_s=merchandised-search-4&pf_rd_r=0EEMDS8WZ0YP48RWG59H&pf_rd_t=101&pf_rd_p=311392929&pf_rd_i=562020"
    r = requests.get(url)
    while r.status_code != requests.codes.ok:
        r = requests.get(url)
    content = r.content
    soup = BeautifulSoup(r.content, "html.parser")
    targets = soup.find_all("a", class_="acs-feature-header")
    for target in targets:
        name = BeautifulSoup(str(target), "html.parser").find("a").text
        names.append(name.strip().replace('\n', '').replace('\t', ''))
        # print(name)
    names.remove("逆転裁判~その「真実」、異議あり! ~")
    names.append("逆転裁判 その")
    return names
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    review_text = BeautifulSoup(review).get_text()
    raw_sentences = tokenizer.tokenize(review_text.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    '''
    Function to split a review into parsed sentences. Returns a
    list of sentences, where each sentence is a list of words
    '''
    # 0. Get rid of non-tokenizable characters
    review = BeautifulSoup(review, "lxml").get_text()
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, get a list of words
            letters_only = re.sub("[^a-zA-Z]", " ", raw_sentence)
            words = letters_only.lower().split()
            sentences.append(words)
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
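# A minimal usage sketch for review_to_sentences above (assumes the NLTK punkt
# model has already been downloaded, e.g. via nltk.download('punkt'); the
# review string is illustrative):
# import nltk.data
# punkt = nltk.data.load('tokenizers/punkt/english.pickle')
# review_to_sentences("<p>Great movie. Would watch again!</p>", punkt)
# -> [['great', 'movie'], ['would', 'watch', 'again']]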
def getBaseballData():
    # Setting up a dynamic url
    baseball_url = 'http://dailybaseballdata.com/cgi-bin/getstats.pl?date=' + yesterday.strftime("%m%d") + '&out=csv'
    print(baseball_url)

    # Setting up the GET request to retrieve the HTML markup
    req = urllib.request.Request(baseball_url)
    response = urllib.request.urlopen(req)
    html = response.read()

    # Using BeautifulSoup to parse the markup for info, removing script and other unnecessary tags
    clean_html = BeautifulSoup(html)
    to_extract = clean_html.findAll('script')
    for item in to_extract:
        item.extract()
    clean_html = clean_html.get_text()
    clean_html = clean_html.strip()  # strip removes all the whitespace
    # The slice removes the first 6 lines, then the rest is joined back together by '\n'
    clean_html = '\n'.join(clean_html.split('\n')[6:])
    clean_html_with_date = clean_html.replace('\n', ',' + yesterday.strftime("%y-%m-%d") + '\n')
    clean_html_with_date = clean_html_with_date + ',' + yesterday.strftime("%y-%m-%d")
    print("Fetching raw data from the website")
    print("...")
    print(clean_html_with_date)
    # print(clean_html)

    # I am creating a test .csv file and then writing to it using open(), write(), close()
    f = open("baseball_rawdata.csv", "w")
    f.write(clean_html_with_date)
    f.close()
def get_newest_rss(self, url):
    ## Retrieve an RSS feed and get the newest item
    ## Then, nicely format the title and description, and add a shortened URL
    dom = xml.dom.minidom.parse(urllib.request.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(newest_news.getElementsByTagName('description')[0].childNodes[0].data)
    updated = dom.getElementsByTagName('pubDate')[0].childNodes[0].data
    updated = datetime.datetime.fromtimestamp(time.mktime(parsedate(updated)))
    ago = round((datetime.datetime.utcnow() - updated).seconds / 60)
    links = description.findAll('a')
    for link in links:
        link.extract()
    links = description.findAll(color='#6f6f6f')
    for link in links:
        link.extract()
    title = title.strip()
    description = str(description)
    description = description.replace("\n", "")
    description = self.tools['remove_html_tags'](description)
    # description = description[0:len(description) - 9]
    description = description.strip()
    if description.rfind(".") != -1:
        description = description[0:description.rfind(".") + 1]
    link = self.tools['shorten_url'](newest_news.getElementsByTagName('link')[0].childNodes[0].data)
    description = "%s - %s [ %s ]" % (title, description, link)
    return description, updated, ago
def google_news(self, e):
    query = urllib.parse.quote(e.input)
    url = ""
    if not query:
        url = "http://news.google.com/news?ned=us&topic=h&output=rss"
    else:
        url = "http://news.google.com/news?q=%s&output=rss" % query
    dom = xml.dom.minidom.parse(urllib.request.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(newest_news.getElementsByTagName('description')[0].childNodes[0].data)
    links = description.findAll('a')
    for link in links:
        link.extract()
    links = description.findAll(color='#6f6f6f')
    for link in links:
        link.extract()
    title = title.strip()
    description = str(description)
    description = description.replace("\n", "")
    description = self.tools['remove_html_tags'](description)
    # description = tools.decode_htmlentities(description)
    description = description[0:len(description) - 9]
    description = description.strip()
    if description.rfind(".") != -1:
        description = description[0:description.rfind(".") + 1]
    link = self.tools['shorten_url'](newest_news.getElementsByTagName('link')[0].childNodes[0].data)
    e.output = "%s - %s [ %s ]" % (title, description, link)
    return e
def strip_text(self, exclude_tags=[]):
    # exclude_tags = list of element tag strings
    # to ignore (ie schemaLocation, etc)
    def _extract_tag(t):
        if not t:
            return
        return t.split('}')[-1]

    def _taggify(e):
        tags = [e.tag] + [m.tag for m in e.iterancestors()]
        tags.reverse()
        try:
            return [_extract_tag(t) for t in tags]
        except:
            return []

    for elem in self.parser.xml.iter():
        t = elem.text.strip() if elem.text else ''
        tags = _taggify(elem)
        if [e for e in exclude_tags if e in tags]:
            continue
        if t:
            if self.handle_html and (
                    (t.startswith('<') and t.endswith('>'))
                    or ('<' in t or '>' in t)):
                t = self._un_htmlify(t)
            if t:
                yield ('/'.join(tags), t)
        for k, v in elem.attrib.iteritems():
            if v.strip():
                v = BeautifulSoup(v.strip())
                yield ('/'.join(tags + ['@' + _extract_tag(k)]), v.text)
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])
            df = df.append(rowValue, ignore_index=True)

for url in urls2:
    index = urls2.index(url)
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page)
    paraTitle = list(soup.find_all('h3'))
    for paragraph in paraTitle:
        para = str(paragraph.nextSibling.nextSibling)
        paraText = BeautifulSoup(para).get_text()
        paraText = paraText.strip()
        if paraText != "":
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies2[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])
            df = df.append(rowValue, ignore_index=True)

for url in urls3:
    index = urls3.index(url)
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page)
    para = list(soup.find_all('li'))
    for paragraph in para:
        paraText = paragraph.get_text()
def post_paragraphs(self):
    post = BeautifulSoup(self.post, 'html.parser').text.encode('utf-8').strip()
    post = re.sub(r'(\n+\s*)+', '\n', post)
    post_list = post.strip().split('\n')
    return post_list
def cleaner(self, text):
    # Clean HTML and set to lowercase
    clean_ = BeautifulSoup(text, 'lxml').get_text().lower()
    # Clear newlines
    clean_ = clean_.strip().replace("\n", " ").replace("\r", " ")
    return clean_
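# Hedged usage sketch for cleaner() above (hypothetical input; the enclosing
# class is not shown in this snippet):
# self.cleaner("<div>Hello\nWorld</div>")  ->  "hello world"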
processed = set()

if args.file == "":
    f = sys.stdin
else:
    f = open(args.file)

if args.output == "":
    out = sys.stdout
else:
    out = open(args.output, 'w')

RE_WHITE_SPACES = re.compile("\s+")

if args.html:
    txt = BeautifulSoup(f, "html5lib").get_text()
    txt = re.sub(u"[^a-zA-Záàãéíóõúç]", " ", txt)
    tokens = RE_WHITE_SPACES.split(txt.strip())
else:
    tokens = f

for token in tokens:
    t = token.lower().strip()
    if args.code_page != 'none':
        t = unicode(t, args.code_page)
    if t not in stop:
        t = porter.stem(t)
        if len(t) > 1 and not t.isdigit():
            if t not in processed:
                if args.allow_duplicates:
                    processed[t] = 1
                else:
                    processed.add(t)
# print html
soup = BeautifulSoup(str(html), 'html.parser', from_encoding='utf-8')
# print soup
From = soup.title.get_text()
Div = soup.find_all('div', class_='carousel-caption')
# clear out the existing rows
sql = "TRUNCATE `dj_bbs_news`;"
Mysql.MysqlHelper().In_sql(sql)
x = 1
for line in Div:
    soup2 = BeautifulSoup(str(line), 'html.parser', from_encoding='utf-8')
    Url = soup2.find('a')['href']
    html2 = getHtml(soup2.find('a')['href'])
    Tltle = BeautifulSoup(html2, 'html.parser', from_encoding='utf-8').find('h1', class_='ph').get_text()
    Img = soup.find('img', alt='%s' % Tltle.strip())['src']
    # print Img
    try:
        sql = '''INSERT INTO `dj_bbs_news`(title,summary,url,favor_count,reply_count,create_date,image_urls)
                 VALUES("%s",'%s','%s','0','0',NOW(),'news/%s.jpg');''' % (Tltle, From, Url, x)
        # print sql
        Mysql.MysqlHelper().In_sql(sql)
    except Exception, e:
        print e
    local = os.path.join('D:/test/bbs/static/news/', '%s.jpg' % x)
    urllib.urlretrieve(Img, local)
    # resize/process the image
    Picture.timage("D:/test/bbs/static/news/%s.jpg" % x, 'D:/test/bbs/static/news/')
    print 'Finished processing %s.jpg' % x
    x += 1