def load_html(file_path: str) -> List[List]:
    """Split html content to pages.

    Args:
        file_path: path to file

    Returns:
        list of pages
    """
    if file_path.startswith(("http", "https")):
        if "wikipedia.org" in file_path:
            parsed_url = urlparse(unquote(file_path))
            lang = parsed_url.hostname.split(".", 1)[0]
            article_name = parsed_url.path.rsplit("/", 1)[-1]
            wikipedia.set_lang(lang)
            page = wikipedia.page(article_name)
            text = page.content
            return paginate(text.split("\n"))
        else:
            file = requests.get(file_path)
            raw_html = file.content
    else:
        with open(file_path, "r") as file:
            raw_html = file.read()

    soup = BeautifulSoup(raw_html, features="lxml")
    [s.extract() for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]'])]
    # replace non-breaking space
    soup = soup.get_text(strip=False).replace("\xa0", " ")
    lines = [line.strip() for line in soup.splitlines() if line.strip()]
    return paginate(lines)
def page_to_words(raw_page):
    # Swap <br> tags with newline character
    raw_page = re.sub(r'<br\s*?>', '\n', raw_page)
    page_text = BeautifulSoup(raw_page).get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in page_text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", text, 0, re.UNICODE)
    # 3. Remove phrase 'OCR Output'
    cleaner_text = re.sub('OCR Output', " ", letters_only)
    # 4. Convert to lower case, split into individual words
    words = cleaner_text.lower().split()
    # 5. In Python, searching a set is much faster than searching a list,
    #    so convert the stop words to a set
    stops = set(nltk.corpus.stopwords.words("english"))
    # 6. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    # 7. Join the words back into one string separated by space and tokenize/stem
    final_book_text = " ".join(meaningful_words)
    return final_book_text
def clean_text(s):
    raw = BeautifulSoup(s, 'html.parser').get_text()
    raw = remove_control_characters(raw)
    lines = [line.strip() for line in raw.splitlines()]
    text = "".join(lines)
    return text
def get_readable_text(raw_html):
    """
    Arguments:
    - `raw_html`:
    """
    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')
    _cleantext = BeautifulSoup(raw_html).text
    # paragraphs = _cleantext.split("\n+")
    paragraphs = [s.strip() for s in _cleantext.splitlines()]
    cleaned_paragraphs = []
    for para in paragraphs:
        cleantext = " ".join(para.split())
        cleantext = ''.join(x for x in cleantext if x in string.printable)
        cleaned_paragraphs.append(cleantext)
    cleantext = "\n".join(cleaned_paragraphs)
    strs = re.sub('\\n+', '. ', cleantext)
    cleantext = re.sub(r'\.+', ".", strs)
    return cleantext
def get_readable_text(raw_html):
    """
    Arguments:
    - `raw_html`:
    """
    '''
    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')
    _cleantext = BeautifulSoup(raw_html, 'lxml').text
    '''
    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')
    _cleantext = BeautifulSoup(raw_html, 'lxml')
    for e in _cleantext.findAll('br'):
        e.replace_with(" ")
    _cleantext = _cleantext.getText(separator=u' ')
    # paragraphs = _cleantext.split("\n+")
    paragraphs = [s.strip() for s in _cleantext.splitlines()]
    cleaned_paragraphs = []
    for para in paragraphs:
        cleantext = " ".join(para.split())
        cleantext = ''.join(x for x in cleantext if x in string.printable)
        cleaned_paragraphs.append(cleantext)
    cleantext = "\n".join(cleaned_paragraphs)
    strs = re.sub('\\n+', '. ', cleantext)
    cleantext = re.sub(r'\.+', ".", strs)
    return cleantext
def cleanComments(comments):
    clean_comments = []
    for comment in comments:
        # Extract again (JUST IN CASE)
        comment = BeautifulSoup(comment, "lxml").get_text()
        # Some comments are edited and the datestamp of the edited message is extracted
        comment = re.sub(r"Message edited by author .*", '', comment)
        # Some comments have urls
        comment = re.sub(r'^https?:\/\/.*[\r\n]*', '', comment, flags=re.MULTILINE)
        # Word standardizing (e.g. "Looooolll" should become "Looll")
        comment = ''.join(''.join(s)[:2] for _, s in itertools.groupby(comment))
        # Remove encodings
        comment = re.sub(r'\\\\', r'\\', comment)
        comment = re.sub(r'\\', ' ', comment)
        comment = re.sub(r'\\x\w{2,2}', ' ', comment)
        comment = re.sub(r'\\u\w{4,4}', ' ', comment)
        comment = re.sub(r'\\n', '.', comment)
        # Remove carriage return characters
        comment = ' '.join(comment.splitlines())
        # Remove non-ASCII characters
        comment = codecs.decode(comment, 'unicode-escape')
        comment = ''.join([i if ord(i) < 128 else '' for i in comment])
        clean_comments.append(comment)
    return clean_comments
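# A small, self-contained illustration (not part of the original corpus) of the
# "word standardizing" step used in cleanComments above: itertools.groupby caps
# every run of a repeated character at two occurrences.
import itertools

sample = "Looooolll"
standardized = ''.join(''.join(group)[:2] for _, group in itertools.groupby(sample))
print(standardized)  # -> "Looll"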
def get_mail(id, redact=False):
    M = imaplib.IMAP4()
    M.login("*****@*****.**", passwords.mitgliedsantrag)
    M.select()
    typ, data = M.search(None, 'Subject', str(id))
    typ, data = M.fetch(data[0].split()[-1], '(RFC822)')
    msg = email.message_from_string(data[0][1].decode('utf-8'))
    for part in msg.walk():
        if part.get_content_type() == "text/html":
            body = part.get_payload(decode=False)
        else:
            continue
    M.close()
    M.logout()
    body = BeautifulSoup(body, features="html.parser").get_text(separator="\n")
    if redact:
        body = body.replace("E-Mail: \n", "E-Mail: ")
        body_redacted = ""
        to_be_redacted = ["E-Mail", "Straße", "Telefonnummer"]
        for line in body.splitlines(True):
            if line[:line.find(':')] not in to_be_redacted:
                body_redacted += line
        body = body_redacted
    return body
def response_to_readable(response):
    firsthead = httpvers + " " + str(response.status_code) + " " + response.reason + "\n"
    soup = BeautifulSoup(str(response.text), 'html.parser').prettify()
    # ToDo: would it be better if the indent were settable to 2 or 3 instead of 1?
    nlines = min(template.maxbodylines, len(soup.splitlines()))
    # sometimes everything is in one line
    if nlines <= 3:
        if len(soup) > template.maxbodycharacters:
            # keep only the first maxbodycharacters characters
            soup = soup[:template.maxbodycharacters] + "[...]"
    else:
        if len(soup.splitlines()) > nlines:
            soup = os.linesep.join(soup.split(os.linesep)[:nlines]) + "\n" + "[...]"
    return firsthead + readable_headers(response) + "\n" + soup
def upload_qso(self, qso, username: str = "", password: str = "", qth_nickname: str = ""):
    if not username or not password or not qth_nickname:
        _logger.error(
            "Invalid login data. Username: %s - Password: %s - QTH Nickname: %s"
            % (username, password, qth_nickname))
        raise ValidationError(_("Invalid login data"))

    if not qso:
        _logger.error("Invalid QSO")
        raise ValidationError(_("Invalid QSO"))

    adif_utility = self.env["ham.utility.adif"]

    extra_fields = {
        "EQSL_USER": username,
        "EQSL_PSWD": password,
    }
    adif_content = adif_utility.generate_adif_header(extra_fields=extra_fields)

    extra_fields = {"APP_EQSL_QTH_NICKNAME": qth_nickname}
    adif_content += adif_utility.generate_adif_qso(qso, extra_fields=extra_fields)

    data = {
        "EQSL_USER": username,
        "EQSL_PSWD": password,
    }
    files = {"Filename": ("file.adi", adif_content)}

    url = EQSL_API_UPLOAD_QSO
    response = requests.post(url=url, files=files)

    if response.status_code != 200:
        _logger.error("Unable to upload QSO to eQSL. Status code %d: %s"
                      % (response.status_code, response.content))
        _logger.error("Data: %s" % data)
        raise ValidationError(_("Unable to upload QSO to eQSL"))

    if b"error" in response.content.lower():
        root = etree.fromstring(
            response.content.decode(),
            parser=etree.HTMLParser(remove_comments=True))
        html_content = etree.tostring(root)
        raw_message = BeautifulSoup(html_content, "lxml").text.strip()
        message = "\n".join(
            [x.strip() for x in raw_message.splitlines() if x.strip()])

        _logger.error("Error publishing QSO: %s" % message)
        _logger.error("Data: %s" % data)
        raise ValidationError("%s.\n%s" % (_("Error publishing QSO"), message))
def filterbiography(data):
    data = json.loads(data['d'])
    properdata = {}
    try:
        textx = BeautifulSoup(data['BIOGRAPHY_HTML'], 'html.parser').get_text()
        properdata['biographyAll'] = textx
        textx = textx.splitlines()
        properdata['Biography'] = textx
        properdata['MediaCaption'] = data['MEDIA_CAPTION']
    except:
        return properdata
    return properdata
def get_img_url(raw_url):
    # Get text from page
    bs4_text = BeautifulSoup(get_page(raw_url), 'lxml').get_text()
    # 'Image URL' is what we are searching for in the text
    img_url_location = bs4_text.index('Image URL')
    bs4_text = bs4_text[img_url_location:]
    bs4_text_list = bs4_text.splitlines()
    # Get line that has the image URL and return it
    img_url_line = bs4_text_list[0]
    img_url = img_url_line.split(' ')[4]
    return img_url
class PDFHandler():

    ROBOT_LIBRARY_SCOPE = 'TEST SUITE'

    def open_pdf(self, filepath, xml=True):
        pdf = parser.from_file(filepath, xmlContent=xml)
        for key, value in pdf.items():
            setattr(self, key, value)
        if xml:
            self._treat_xml()
        else:
            self._treat_plaintext()

    def _treat_xml(self):
        self.content = BeautifulSoup(self.content, "html.parser")
        self.content = [el.text for el in self.content if len(el.text.strip())]
        filtered = []
        for el in self.content:
            el = el.split("\n")
            el = [x.strip() for x in el]
            filtered += el
        self.content = [el.strip() for el in filtered if len(el.strip())]

    def _treat_plaintext(self):
        self.content = self.content.splitlines()
        self.content = [el.strip() for el in self.content if len(el.strip())]

    def get_values_by_pattern(self, regexp):
        pattern = re.compile(regexp)
        matches = [pattern.search(el) for el in self.content if pattern.match(el)]
        matches = [match.group(1) for match in matches]
        return matches

    def get_pdf_content(self):
        return self.content

    def get_pdf_status(self):
        return self.status

    def get_pdf_metadata(self):
        return self.metadata
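# A hedged usage sketch for PDFHandler above. It assumes `parser` is
# tika.parser (as the open_pdf call suggests) and that "report.pdf" and the
# "Invoice" pattern are purely hypothetical examples.
handler = PDFHandler()
handler.open_pdf("report.pdf", xml=False)  # plain-text extraction path
print(handler.get_pdf_status())            # status value returned by the parser
# Collect capture group 1 from content lines that start with the pattern:
print(handler.get_values_by_pattern(r"Invoice\s+(\d+)"))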
def get_update():
    """
    Attempt to reach the PhoenixMiner web server.
    Returns a dict with some useful info.
    If the HTML code is not 200, returns an error message.
    """
    # Get HTML data
    resp = requests.get(MONITOR_URL)

    # Prepare our final returned object (dict)
    # TODO: Convert numerics into int from str
    summary = {'html_code': resp.status_code,
               'status_text': '',
               # 'full_text': '',
               'hash rate': '',
               'time': '',
               'power': '',
               'uptime': ''
               }

    # Check for HTTP code, error out if necessary
    # TODO: Handle discrete cases
    if summary['html_code'] == 200:
        # Convert to lines of text
        clean_text = BeautifulSoup(resp.text, "html.parser").text
        text_lines = clean_text.splitlines()

        # Remove lines that are empty or have only whitespace
        empties = ('', ' ', '\t', '\n', '  ')
        text_lines = [line for line in text_lines if line not in empties]

        # Extract insights
        # TODO: Replace these with some more elegant regex
        summary['hash rate'] = text_lines[-2].split()[-2]
        summary['power'] = text_lines[-7].split()[-2]
        timestamp = [line for line in text_lines if line[0] == "*"][-1]
        # Looks like ['***', '63:20', '***', '2/19', '10:16', '**************************************']
        summary['uptime'] = timestamp.split()[1]
        summary['time'] = timestamp.split()[4]

        # TODO: Extract individual GPU speeds, temps
        idx_of_last_ts = len(text_lines) - 1 - text_lines[::-1].index(timestamp)
        idx_of_gpu1 = len(text_lines) - 1 - text_lines[::-1].index(text_lines[idx_of_last_ts + 3])
        summary['gpu list'] = [text_lines[i].split(',')[0]
                               for i in range(idx_of_gpu1, len(text_lines) - 8)]
        summary['gpu stats'] = text_lines[-8].split(',')
        summary['gpu speeds'] = [item.split('(')[0]
                                 for item in text_lines[idx_of_last_ts - 2].split(':')[2:]]
    else:
        print(f"Warning: Got status code {summary['html_code']}")

    return summary
def post(self):
    if self.arguments:
        api_logger.info("HEADERS: " + str(self.request))
        # Parse each param
        data = self.arguments
        if 'name' not in list(data.keys()) and 'lines' not in list(data.keys()):
            api_logger.error("Error requests params.")
            self.finish('{"result":"error","description":"Error requests params"}')
        else:
            try:
                t_name = data['name']
                t_lines = data['lines']
            except Exception as e:
                api_logger.error("Error requests params " + str(e))
                self.finish('{"result":"error","description":"Error requests params","debug":"' + str(e) + '"}')
            try:
                filename = stormuiapi.getWorkersByTopologyName(t_name)[0]
                if filename:
                    api_logger.info("LogFilename: " + filename)
                    # get log file from storm cluster
                    n_lines = int(t_lines) * 200
                    url = filename + "&tail=" + str(n_lines)
                    api_logger.debug("URL to fetch " + url)
                    content = ""
                    try:
                        content = requests.get(url).content
                    except Exception as e:
                        api_logger.error("Error getting log from Storm UI : " + str(e))
                        self.finish('{"result":"error","description":"Error getting log from Storm UI: ", "detail":"' + str(e) + '"}')
                    try:
                        # Remove HTML tags from Storm Log 8000 port
                        lines = BeautifulSoup(content).text
                        api_logger.debug("Getting " + str(len(lines.splitlines())) + " lines.")
                        self.set_header('Content-Type', 'text/plain')
                        self.set_header('Content-Disposition', 'attachment; filename=' + filename + '')
                        self.finish(lines)
                    except Exception as e:
                        api_logger.error("Error parsing data from Storm UI" + str(e))
                        self.finish('{"result":"error","description":"Error parsing data from Storm UI: ", "detail":"' + str(e) + '"}')
                else:
                    api_logger.error("Error getting worker from Storm UI API")
                    self.finish('{"result":"error","description":"Error getting worker from Storm UI API", "detail":""}')
            except Exception as e:
                api_logger.error("Unknown error" + str(e))
                self.finish('{"result":"error","description":"Error getting topology log: ", "detail":"' + str(e) + '"}')
    else:
        api_logger.error("Content-Type:application/json missing")
        self.finish('{"result":"error","description":"Content-Type:application/json missing"}')
def _convert_html_to_plain_text(html_string, remove_line_breaks=False):
    """Returns string after removing all HTML markup.

    Args:
        html_string (str): string with HTML markup
        remove_line_breaks (bool): whether to also remove line breaks and extra
            white space from string
    """
    if not html_string:
        return ''

    text = BeautifulSoup(html_string, "html.parser").get_text()

    # remove empty lines as well as leading and trailing space on non-empty lines
    if remove_line_breaks:
        text = ' '.join(line.strip() for line in text.splitlines() if line.strip())

    return text
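# A minimal usage sketch (hypothetical inputs, not part of the original corpus)
# for _convert_html_to_plain_text above:
html = "<p>First line</p>\n<p>  Second   line </p>"
print(_convert_html_to_plain_text(html))
# markup removed, line break and surrounding whitespace kept:
# "First line\n  Second   line "
print(_convert_html_to_plain_text(html, remove_line_breaks=True))
# lines stripped and joined with single spaces: "First line Second   line"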
def txt_from_HTML(self):
    fin_sentences = []
    url = input("Enter the url here: ")
    http = urllib3.PoolManager()
    response = http.request('POST', url)
    raw = BeautifulSoup(response.data, "lxml").get_text()
    raw = os.linesep.join([s for s in raw.splitlines() if s])
    txt = sent_tokenize(raw)
    new_sentences = self.filter_results(txt)
    for line in new_sentences:
        if line.count("\n") <= 1 and len(word_tokenize(line)) > 3 and "<" not in line and "subscrib" not in line:
            fin_sentences.append(line)
        elif line.find("\n") == 1:
            fin_sentences.append(line)
    fin_txt = ' '.join(fin_sentences)
    fin_txt = fin_txt.replace(".", ". ")
    return fin_txt
def get_page(url):
    print('trying to fetch page: ', url)
    r = requests.get(url)
    if re.search(
            r'(Ocorreu um erro ao buscar o documento\. Tente novamente)|(O documento não foi liberado para publicação)',
            r.text):
        print('No document available.')
    else:
        text = BeautifulSoup(r.text, 'html5lib').text
        text_list = text.splitlines()
        text_list = [i for i in text_list if i]
        try:
            with open('html_data/' + str(i) + '.html', 'wb') as f:
                f.write(r.content)
        except IndexError as e:
            print(e)
        except UnicodeError as e:
            print(e)
def simple_page(raw_page):
    raw_page = re.sub(r'<br\s*?>', '\n', raw_page)
    page_text = BeautifulSoup(raw_page).get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in page_text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    letters_only = re.sub("[^a-zA-Z]", " ", text, 0, re.UNICODE)
    # 3. Remove phrase 'OCR Output'
    cleaner_text = re.sub('OCR Output', " ", letters_only)
    words = cleaner_text.lower().split()
    final_book_text = " ".join(words)
    return final_book_text
def Phrase_ETF_Components():
    ETF_List_URL = "https://www.twse.com.tw/zh/page/ETF/list.html"
    try:
        page = requests.get(ETF_List_URL)
        text = BeautifulSoup(page.text, 'html.parser').find('tbody').text
        text = list(filter(None, text.splitlines()))
        table = []
        comp = []
        cnt = 0
        for i in text:
            if cnt < 4:
                comp.append(i)
                cnt = cnt + 1
            else:
                table.append(comp)
                comp = []
                cnt = 0
        print(table)
    except:
        print("Failed to get list from twse website.")
def get_counts():
    only_class = SoupStrainer(class_="stats-wrapper")
    only_br = SoupStrainer("br")
    data = BeautifulSoup(open('main.html'), "html.parser", parse_only=only_class).prettify()
    posts = ""
    topics = ""
    post_count = 0
    topics_count = 0
    for line in data.splitlines():
        if "Příspěvky" in line:
            posts += line
        elif "Témata" in line:
            topics += line
    post_numbers = map(int, re.findall(r'\d+', posts))
    topics_numbers = map(int, re.findall(r'\d+', topics))
    for num in post_numbers:
        post_count += num
    for num in topics_numbers:
        topics_count += num
    return (post_count, topics_count)
async def update_spots(application):
    """ Coroutine that updates spots """
    lasthash = ""
    try:
        while True:
            r = requests.get(
                'http://www.dxsummit.fi/DxSpots.aspx?count=50&include_modes=PHONE')
            cleantext = BeautifulSoup(r.text, "lxml").get_text()
            currenthash = hashlib.md5(cleantext.encode('utf-8')).hexdigest()
            clusterdata = []
            i = 0
            for line in cleantext.splitlines():
                line = line[:73] + ':' + line[73:]
                # line = line[:76] + line[84:]
                cleanline = ' '.join(line.split())
                splitstring = cleanline.split(sep=" ", maxsplit=3)
                clusterdata.append(
                    (hashlib.md5(line.encode('utf-8')).hexdigest() + " "
                     + splitstring[1] + " " + splitstring[2], " " + line))
                data = cleanline.split(sep=" ", maxsplit=3)
                if ((auto_tune.checked is True) and (i == 0)
                        and (data[2] != globalvars['lastcall'])):
                    globalvars['lastcall'] = data[2]
                    frequency.content = FormattedTextControl(
                        HTML('<b fg="#884444">Freq.:</b> ' + (data[1] + " Khz").rjust(15)))
                    dx.content = FormattedTextControl(
                        HTML('<b fg="#884444">Call:</b> ' + data[2].rjust(12)))
                    if qrz.checked is True:
                        redis.rpush('qrzLookupQueue', data[2])
                i += 1
            radios.values = clusterdata
            if currenthash != lasthash:
                application.invalidate()
                await asyncio.sleep(15)
            else:
                await asyncio.sleep(30)
    except asyncio.CancelledError:
        # TODO save config here ?
        print()
def split_html(raw_html: str) -> List[List]:
    """Split html content to pages.

    Args:
        raw_html: html content

    Returns:
        list of pages
    """
    soup = BeautifulSoup(raw_html, features="lxml")
    [s.extract() for s in soup(["style", "script", "head", "title", "meta", "[document]"])]
    # replace non-breaking space
    soup = soup.get_text(strip=False).replace("\xa0", " ")
    lines = [line.strip() for line in soup.splitlines() if line.strip()]
    return paginate(lines)
class Text:
    def __init__(self, text_id: int) -> None:
        normalization = {
            132: '"',
            147: '"',
            0x96: "--",
            0x91: "'",
            0x92: "'",
            0x97: "---",
        }
        text_url = "https://www.keinverlag.de/{}.text".format(text_id)
        soup = soup_from(text_url)
        try:
            self.title = soup.select("h1 > span")[0].text.translate(normalization)
            self.content = BeautifulSoup(
                re.sub(
                    r'<span style="font-style: italic;">(([\n\r]|.)*?)</span>',
                    r"_\1_",
                    str(soup.select(".fliesstext > span")[0]),
                ),
                "lxml",
            ).text.translate(normalization)
            self.author = soup.select("h3 > a")[2].text
            self.type = soup.select("h1 ~ h3")[0].text
        except IndexError:
            raise ValueError("Text {} not available.".format(text_id))

    def markdown(self, *, with_author: bool = True, with_type: bool = False) -> str:
        return "#### {maybe_author}{title}{maybe_type}\n\n{content}".format(
            title=self.title,
            maybe_author=self.author + ": " if with_author else "",
            maybe_type=" (" + self.type + ")" if with_type else "",
            content="\n".join(line + "\\" if line else ""
                              for line in self.content.splitlines()),
        )
def parse_details(self, response):
    def extract_with_css(query):
        return response.css(query)

    html = extract_with_css('body').extract_first()
    full_text = BeautifulSoup(html, 'html.parser').get_text()
    lines = full_text.splitlines()  # List of HTML text lines
    norms = utils.split_list_by_sep(lines, '__')
    # A list of separated norms from the same source
    norms = list(map(lambda l: [' '.join(l)], norms))

    for norm in norms:
        yield Norm({
            'published_at': response.meta['date'],
            'text': norm[0],
            'type': dict(simple=response.meta['type'])
        })
def check_changes(self):
    response = requests.get(self.url, headers=self.headers)
    r = response.text
    self.course.teacher_id.html = r
    self.course.teacher_id.save()
    soup = BeautifulSoup(r, "html.parser")
    for link in soup.select("a[href$='.pdf']"):
        url = urljoin(self.url, link['href'])
        filename = link['href'].split('/')[-1]
        if not Files.objects.check_if_exists(self.course, filename, self.teacher_id):
            self.send_push_pdf(url)
            course = Course.objects.get_record_by_id(self.course_id)
            Files.objects.add_record(course, filename, str(self.teacher_id))
    for script in soup(["script", "style"]):
        script.extract()
    soup = soup.get_text()
    self.prev_version = BeautifulSoup(self.prev_version, "html.parser")
    for script in self.prev_version(["script", "style"]):
        script.extract()
    self.prev_version = self.prev_version.get_text()
    if self.prev_version != soup:
        self.old_page = self.prev_version.splitlines()
        self.new_page = soup.splitlines()
        d = difflib.Differ()
        diff = d.compare(self.old_page, self.new_page)
        out_text = "\n".join([
            ll.rstrip() for ll in '\n'.join(diff).splitlines() if ll.strip()
        ])
        msg = get_diff(out_text)
        self.send_email(msg)
        self.send_push()
def load_html(file_path: Path) -> List[List]:
    """Split html content to pages.

    Args:
        file_path: path to file

    Returns:
        list of pages
    """
    try:
        raw_html = file_path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        raw_html = file_path.read_text(encoding="windows-1252")

    soup = BeautifulSoup(raw_html, features="lxml")
    [s.extract() for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]'])]
    # replace non-breaking space
    soup = soup.get_text(strip=False).replace("\xa0", " ")
    lines = [line.strip() for line in soup.splitlines() if line.strip()]
    return paginate(lines)
def preproc_text1(text):
    """Initial text cleaning.

    Convert text to all lowercase and strip it of unwanted HTML tags and
    content, unwanted REGEX patterns, natural line breaks and non-unicode
    characters.

    Args:
        text: a string.

    Returns:
        A cleaned text string.
    """
    bad_tags = ['i', 'h4', 'b']
    bad_regex_list = [r'translated[^\.]+\.',
                      r'previous (profile|loan)[^\.]+',
                      r'http\S+',
                      r'www\S+',
                      r'mifex offers its clients[^\.]+\.',
                      r'for more information[^\<]+']
    bad_regex = re.compile('|'.join(bad_regex_list))

    # remove unwanted html content contained in BAD_TAGS
    soup = BeautifulSoup(text, 'lxml')
    content_to_remove = [s.get_text() for s in soup.find_all(bad_tags)]
    for c in content_to_remove:
        text = text.replace(c, "")

    text = text.lower()
    text = BeautifulSoup(text, 'lxml').text  # remove html tags
    text = bad_regex.sub("", text)           # remove unwanted REGEX patterns
    text = ' '.join(text.splitlines())       # remove natural line breaks
    text = unidecode.unidecode(text)         # remove non-English characters
    return text
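# A hedged usage sketch for preproc_text1 above (hypothetical input; the exact
# surrounding whitespace in the output may differ slightly):
sample = "<b>Translated from Spanish.</b> Ana sells fruit. See http://example.org for details."
print(preproc_text1(sample))
# roughly: " ana sells fruit. see  for details."
# The <b>...</b> content, the URL and the HTML tags are stripped, and the
# result is lower-cased and joined onto a single line.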
####################################################################
## start for loops to iterate through a list of loans and parse HTML
####################################################################
counter = 0
list = []
for lead in df.iterrows():
    driver.implicitly_wait(5)
    elem = driver.find_element_by_name("")
    elem.send_keys(df.iloc[counter])
    elem.send_keys(Keys.RETURN)
    driver.implicitly_wait(2)
    html = driver.page_source
    soup = BeautifulSoup(html)
    soup = str(soup)
    soup = soup.splitlines()
    for poop in soup:
        list.append(
            re.findall(r'<td><font color="blue">(.*)</font></td>', poop.strip()))
    list.append(df.iloc[counter])
    counter = counter + 1
    time.sleep(2)

list = filter(len, list)
df2 = pd.DataFrame.from_records(list)
df2.to_excel(writer, sheet_name='Text')
writer.save()
# # TF-IDF without suffix
tf = pandas.DataFrame()
for url in urls:
    try:
        # Getting HTML and cleaning it
        request = req.Request(url, None, headers)
        html = req.urlopen(request).read()
        soup = BeautifulSoup(html, "lxml")
        for script in soup(["script", "style", "link", "meta", "head"]):
            script.extract()
        soup = soup.get_text(separator=" ")
        lines = (line.strip() for line in soup.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = (line.translate(translator) for line in chunks)

        # Raw term frequency counts
        counts = dict.fromkeys(terms, 0)
        w = 0
        for line in text:
            new_line = (line.split())
            for word in new_line:
                word = re.sub(regex, "", word.lower())
                w += 1
                if word in terms:
                    counts[word] += 1

        # Term frequency as a percentage
class BodyExtractor():
    def __init__(self, html, encoding='utf-8'):
        if type(html) == bytes:
            self.html = html.decode(encoding)
        else:
            self.html = html
        self.pureText = ''    # text with tags removed
        self.THRESHOLD = 50   # threshold for the sudden-rise point in text density
        self.K = 3            # number of lines per line block
        self.wordCount = []   # number of characters in each line block
        self.lines = []
        self.content = ''     # extracted body text
        self.title = ''
        self.maxIndex = -1    # index of the line block with the most characters
        self.start = -1
        self.end = -1
        self._preprocess()
        self._start()
        self._end()
        if self.end != -1:
            self.content = ''.join(self.lines[self.start:self.end + self.K - 1])

    def _preprocess(self):
        regex = re.compile(
            r'(?:<!DOCTYPE.*?>)|'                        # doctype
            r'(?:<head[\S\s]*?>[\S\s]*?</head>)|'
            r'(?:<!--[\S\s]*?-->)|'                      # comment
            r'(?:<img[\s\S]*?>)|'                        # images
            r'(?:<br[\s\S]*?>\s*[\n])|'
            r'(?:<script[\S\s]*?>[\S\s]*?</script>)|'    # js...
            r'(?:<style[\S\s]*?>[\S\s]*?</style>)',      # css
            re.IGNORECASE)
        regTitle = re.search('<title>[\s\S]*?</title>', self.html)
        if regTitle is not None:
            titleTag = regTitle.group()
            self.title = titleTag[7:len(titleTag) - 8]
        filteredHtml = self.html_escape(regex.sub('', self.html))
        self.pureText = BeautifulSoup(filteredHtml, 'lxml').get_text()
        self.lines = list(map(lambda s: re.sub(r'\s+', '', s), self.pureText.splitlines()))
        count = list(map(lambda s: len(s), self.lines))
        for i in range(len(count) - self.K + 1):
            self.wordCount.append(count[i] + count[i + 1] + count[i + 2])
        self.maxIndex = self.wordCount.index(max(self.wordCount))

    def html_escape(self, text):
        """Unescape common HTML entities."""
        text = (text.replace("&quot;", "\"").replace("&ldquo;", "“").replace("&rdquo;", "”")
                .replace("&middot;", "·").replace("&#8217;", "’").replace("&#8220;", "“")
                .replace("&#8221;", "\”").replace("&#8212;", "——").replace("&hellip;", "…")
                .replace("&#8226;", "·").replace("&#40;", "(").replace("&#41;", ")")
                .replace("&#183;", "·").replace("&amp;", "&").replace("&bull;", "·")
                .replace("&lt;", "<").replace("&#60;", "<").replace("&gt;", ">")
                .replace("&#62;", ">").replace("&nbsp;", " ").replace("&#160;", " ")
                .replace("&tilde;", "~").replace("&mdash;", "—").replace("&copy;", "@")
                .replace("&#169;", "@").replace("♂", "").replace("\r\n|\r", "\n"))
        return text

    def _start(self):
        for i in [-x - 1 + self.maxIndex for x in range(self.maxIndex)]:
            gap = min(self.maxIndex - i, self.K)
            if sum(self.wordCount[i + 1:i + 1 + gap]) > 0:
                if self.wordCount[i] > self.THRESHOLD:
                    continue
                else:
                    break
        self.start = i + 1

    def _end(self):
        for i in [x + self.maxIndex for x in range(len(self.wordCount) - self.maxIndex - 2)]:
            if self.wordCount[i] == 0 and self.wordCount[i + 1] == 0:
                self.end = i
                break
def get_cleaned_body_text(self, width: int = 70) -> str:
    body_text = BeautifulSoup(self.body, features='lxml').get_text()
    return '\n\n'.join(
        '\n'.join(textwrap.wrap(line.replace('\xa0', ' ').rstrip(), width=width))
        for line in body_text.splitlines())
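# A small, self-contained illustration (hypothetical input) of the wrapping
# used in get_cleaned_body_text above: each text line is wrapped to the given
# width and the wrapped pieces are re-joined with newlines.
import textwrap

line = "alpha beta gamma delta epsilon zeta eta theta"
print('\n'.join(textwrap.wrap(line.rstrip(), width=20)))
# alpha beta gamma
# delta epsilon zeta
# eta theta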
dict = {}
with open('/home/rishav/Desktop/hi-en_dict.csv') as f_obj:
    reader = csv.DictReader(f_obj, delimiter=',')
    for line in reader:
        if line['hword'] in dict.keys():
            dict[line['hword']].append(line['eword'])
        else:
            dict.update({line['hword']: list([line['eword']])})

# print(dict['निराला'][0])
_setup()
# print(transliterate("निराला", 'devanagari', 'iast'))

eng_queries = ""
for query in queries.splitlines():
    eng_query = ""
    phrase = pharse_translate(query)
    word_list = query.split(' ')
    for p in phrase:
        if p in dict.keys():
            # dict values are lists of translations; take the first one
            eng_query += dict[p][0] + ' '
    for word in word_list:
        if (word in trans_dict) or (word in dict):
            if word in trans_dict:
                eng_query = eng_query + trans_dict[word] + ' '
            if word in dict:
                if len(dict[word]) == 1:
                    eng_query = eng_query + dict[word][0] + ' '
                else:
while True:
    page = input('Page link: ')
    if page in links:
        break
    else:
        print('The page is not on the database.')
        continue

# take html T and T-1 from the SQL database (from the company's table of links)
# the code is split using the split_html function
code1 = c.execute("select HTML from [%s] where LINK=?" % (company),
                  [page]).fetchall()[0][0]  # html code in T
soup1 = BeautifulSoup(code1, 'html.parser').get_text()  # apply BeautifulSoup and get text from code1
soup1_lines = soup1.splitlines()  # split soup1 in lines
code2 = c.execute("select [HTML T-1] from [%s] where LINK=?" % (company),
                  [page]).fetchall()[0][0]  # html code in T-1
soup2 = BeautifulSoup(code2, 'html.parser').get_text()  # apply BeautifulSoup and get text from code2
soup2_lines = soup2.splitlines()  # split soup2 in lines

# run the compare_html function
compare_table = compare_html(soup1_lines, soup2_lines)
print(compare_table)

# write the compare table in an html file
f = open('compare_pages_text.html', 'w', encoding='utf-8')
f.write(compare_table)
f.close()
def main():
    warningDelimeter = re.compile("_{100,200}")
    warningDateRegex = re.compile("^.*(\b\d{1,2}\D{0,3})?\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(Nov|Dec)(?:ember)?)\D?(\d{1,2}\D?)?\D?((19[7-9]\d|20\d{2})|\d{2}).*")
    warningNumberRegex = re.compile("^.*\d{2}-\d{1,3},? .*$")
    warningNumberSplitRegex = re.compile("\d{2}-\d{1,3},? ")
    warningReportedCrime = re.compile(".*REPORTED CRIME.*(:|-).*")
    warningIncident = re.compile(".*INCIDENT.*(:|-).*")
    warningLocation = re.compile("(LOCATION):{1}")
    warningLocationRegexMain = re.compile(":\s.*([A-Z0-9]).*\s")
    warningLocationRegex = re.compile(".*\d{3,5}.*BLOCK OF")
    warningLocationRegexStrict = re.compile("\d{3,5}\s*BLOCK OF.*")
    warningLocationDirection = re.compile("(\(NORTH)|(\(SOUTH)|(\(WEST)|(\(EAST)|(AT APPROXIMATELY)|(NEXT)|(,\s)|(THE YPSI)|(YPSILANTI POLICE)|(IN THE CITY)|(.THE VICTIM)")

    timelyWarningsPage = requests.get("http://www.emich.edu/police/alerts/safetynotices/index.php")
    timelyWarnings = BeautifulSoup(timelyWarningsPage.text).findAll("div", attrs={'id': 'textcontainer'})[0].text

    thisWarning = []
    allWarnings = []
    thisWarningDict = {}
    allWarningDicts = []
    debug = False
    inBody = False
    loc_dir = 0
    global loc_temp

    timelyLines = iter(timelyWarnings.splitlines())
    for line in timelyLines:
        line = line.decode('utf-8')
        if warningDelimeter.match(line):
            allWarnings.append(thisWarning)
            if 'crime' not in thisWarningDict.keys():
                #print "\n".join(thisWarning)
                pass
            allWarningDicts.append(thisWarningDict)
            thisWarning = []
            thisWarningDict = {}
            debug = False
            inBody = False
        else:
            thisWarning.append(line)
            if warningNumberRegex.match(line) or '15-6' in line or '15-15' in line:
                if "update".upper() not in line.upper():
                    if "15-03" in line:
                        thisDate = "March 9, 2015"
                    elif "15-9" in line:
                        thisDate = "July 21, 2015"
                    elif "15-8" in line:
                        thisDate = "July 18, 2015"
                    elif "15-15" in line:
                        thisDate = "October 15, 2015"
                    else:
                        thisDate = line.partition(",")[-1]
                        thisDate = thisDate.strip()
                    thisWarningDict['date'] = thisDate
                    #warningDates.append(thisDate)
            if warningReportedCrime.match(line.upper()) or warningIncident.match(line.upper()):
                tokens = line.replace(u'\xa0', "-").replace("-", ":")
                tokens = tokens.replace(u'\u2013', "-").replace("-", ":")
                tokens = tokens.replace(u'Date and time of incident', "")
                tokens = tokens.split(":")
                tokens = [x for x in tokens if x]
                upper_tokens = [token.upper().rstrip() for token in tokens]
                your_token = [token for token in tokens if 'OFF CAMPUS' in token.upper()]
                if your_token:
                    thisWarningDict['onCampus'] = False
                    your_token = None
                else:
                    thisWarningDict['onCampus'] = True
                your_token = [token for token in tokens if 'STREET' in token.upper()]
                if your_token:
                    thisWarningDict['location'] = your_token[0]
                    your_token = None
                your_token = [token for token in tokens if 'GREEN LOT 1' in token.upper()]
                if your_token:
                    thisWarningDict['location'] = your_token[0]
                    your_token = None
                your_token = [token for token in tokens if warningLocationRegex.match(token.upper())]
                if your_token:
                    thisWarningDict['location'] = your_token[0]
                    your_token = None
                if "REPORTED CRIME" in upper_tokens:
                    crime_int = upper_tokens.index("REPORTED CRIME") + 1
                    thisWarningDict['crime'] = tokens[crime_int]
                    while thisWarningDict['crime'] == ' ':
                        crime_int = crime_int + 1
                        thisWarningDict['crime'] = tokens[crime_int]
                    if "City of Ypsilanti, Off Campus" in tokens[crime_int]:
                        loc_index = tokens[crime_int].index(", City of Ypsilanti, Off Campus")
                        thisWarningDict['crime'] = tokens[crime_int][:loc_index]
                    inBody = True
                    crime_int = None
                    loc_index = None
                if "INCIDENT" in upper_tokens:
                    crime_int = upper_tokens.index("INCIDENT") + 1
                    thisWarningDict['crime'] = tokens[crime_int]
                    while thisWarningDict['crime'] == ' ':
                        crime_int += 1
                        thisWarningDict['crime'] = tokens[crime_int]
                    if "Location" in tokens[crime_int]:
                        loc_index = tokens[crime_int].index("Location")
                        thisWarningDict['crime'] = tokens[crime_int][:loc_index]
                    inBody = True
                    crime_int = None
                    loc_index = None
                upper_tokens = None
                #print tokens
            if warningLocation.match(line.upper()):
                line = line.replace("Location:", "")
                if 'location' not in thisWarningDict.keys():
                    thisWarningDict['location'] = line
                    #print line
            if inBody:
                if warningLocationRegex.match(line.upper()):
                    string_hold = line + next(timelyLines)
                    loc_hold = warningLocationRegexStrict.search(string_hold.upper()).group(0)
                else:
                    loc_hold = None
                if loc_hold:
                    loc_dir = warningLocationDirection.search(loc_hold.upper())
                    if loc_dir:
                        loc_ind = loc_dir.start()
                        loc_hold = loc_hold[:loc_ind]
                    else:
                        loc_ind = 0
                    thisWarningDict['location'] = loc_hold
            if debug:
                print line

    for warning in allWarningDicts:
        print "____________________________________________________"
        print warning