Example #1
def load_html(file_path: str) -> List[List]:
    """Split html content to pages.

    Args:
        file_path: path to file

    Returns:
        list of pages
    """
    if file_path.startswith(("http://", "https://")):
        if "wikipedia.org" in file_path:
            parsed_url = urlparse(unquote(file_path))
            lang = parsed_url.hostname.split(".", 1)[0]
            article_name = parsed_url.path.rsplit("/", 1)[-1]
            wikipedia.set_lang(lang)
            page = wikipedia.page(article_name)
            text = page.content
            return paginate(text.split("\n"))
        else:
            file = requests.get(file_path)
            raw_html = file.content
    else:
        with open(file_path, "r") as file:
            raw_html = file.read()

    soup = BeautifulSoup(raw_html, features="lxml")
    [
        s.extract() for s in soup(
            ['style', 'script', 'head', 'title', 'meta', '[document]'])
    ]
    # replace non-breaking space
    soup = soup.get_text(strip=False).replace("\xa0", " ")
    lines = [line.strip() for line in soup.splitlines() if line.strip()]
    return paginate(lines)
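Several examples on this page end with return paginate(lines) without showing that helper. A minimal sketch, assuming a page is simply a fixed-size group of cleaned lines (the real helper and its page size may differ):

from typing import List

def paginate(lines: List[str], lines_per_page: int = 50) -> List[List[str]]:
    # Hypothetical stand-in for the paginate() helper used by the load_html/split_html
    # examples on this page: group cleaned lines into pages of at most lines_per_page lines.
    return [lines[i:i + lines_per_page] for i in range(0, len(lines), lines_per_page)]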
Example #2
def page_to_words(raw_page):
    # Swap <br> tags with newline character
    raw_page = re.sub('<br\s*?>', '\n', raw_page)

    page_text = BeautifulSoup(raw_page).get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in page_text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", text, 0, re.UNICODE)

    # 3. Remove phrase 'OCR Output'
    cleaner_text = re.sub('OCR Output', " ", letters_only)

    # 4. Convert to lower case, split into individual words
    words = cleaner_text.lower().split()

    # 5. In Python, searching a set is much faster than searching a list, so convert the stop words to a set
    stops = set(nltk.corpus.stopwords.words("english"))

    # 6. Remove stop words
    meaningful_words = [w for w in words if w not in stops]

    # 7. Join the words back into one string separated by space and tokenize/stem
    final_book_text = " ".join(meaningful_words)

    return final_book_text
Example #3
def clean_text(s):
    raw = BeautifulSoup(s, 'html.parser').get_text()
    raw = remove_control_characters(raw)
    lines = [line.strip() for line in raw.splitlines()]
    text = "".join(lines)

    return text
Example #4
def get_readable_text(raw_html):
    """

    Arguments:
    - `x`:
    """
    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')
    _cleantext = BeautifulSoup(raw_html).text

    #    paragraphs = _cleantext.split("\n+")

    paragraphs = [s.strip() for s in _cleantext.splitlines()]

    cleaned_paragraphs = []

    for para in paragraphs:
        cleantext = " ".join(para.split())
        cleantext = ''.join(x for x in cleantext if x in string.printable)
        cleaned_paragraphs.append(cleantext)

    cleantext = "\n".join(cleaned_paragraphs)

    strs = re.sub('\\n+', '. ', cleantext)
    cleantext = re.sub(r'\.+', ".", strs)

    return cleantext
Example #5
def get_readable_text(raw_html):
    """
    Arguments:
    - `x`:
    """
    '''
    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')
    _cleantext = BeautifulSoup(raw_html, 'lxml').text
    '''
    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')
    _cleantext = BeautifulSoup(raw_html, 'lxml')
    for e in _cleantext.findAll('br'):
        e.replace_with(" ")
    _cleantext = _cleantext.getText(separator=u' ')

    #    paragraphs = _cleantext.split("\n+")
    paragraphs = [s.strip() for s in _cleantext.splitlines()]
    cleaned_paragraphs = []
    for para in paragraphs:
        cleantext = " ".join(para.split())
        cleantext = ''.join(x for x in cleantext if x in string.printable)
        cleaned_paragraphs.append(cleantext)

    cleantext = "\n".join(cleaned_paragraphs)
    strs = re.sub('\\n+', '. ', cleantext)
    cleantext = re.sub(r'\.+', ".", strs)
    return cleantext
Example #6
def cleanComments(comments):

	clean_comments = []

	for comment in comments:

		# Extract again (JUST IN CASE)
		comment = BeautifulSoup(comment, "lxml").get_text()

		# Some comments are edited and the datastamp of edited message is extracted
		comment = re.sub(r"Message edited by author .*", '', comment)

		# Some comments have urls
		comment = re.sub(r'^https?:\/\/.*[\r\n]*', '', comment, flags=re.MULTILINE)

		# Word Standardizing (Ex. Looooolll should be Looll)
		comment = ''.join(''.join(s)[:2] for _, s in itertools.groupby(comment))

		# Remove Encodings
		comment = re.sub(r'\\\\', r'\\', comment)
		comment = re.sub(r'\\', ' ', comment)
		comment = re.sub(r'\\x\w{2,2}',' ', comment)
		comment = re.sub(r'\\u\w{4,4}', ' ', comment)
		comment = re.sub(r'\\n', '.', comment)

		# Remove carriage return characters
		comment = ' '.join(comment.splitlines())

		#Remove Unicode characters
		comment = codecs.decode(comment, 'unicode-escape')
		comment = ''.join([i if ord(i) < 128 else '' for i in comment])

		clean_comments.append(comment)

	return clean_comments
Example #7
def get_mail(id, redact=False):
    M = imaplib.IMAP4()
    M.login("*****@*****.**", passwords.mitgliedsantrag)
    M.select()
    typ, data = M.search(None, 'Subject', str(id))
    typ, data = M.fetch(data[0].split()[-1], '(RFC822)')
    msg = email.message_from_string(data[0][1].decode('utf-8'))

    for part in msg.walk():
        if part.get_content_type() == "text/html":
            body = part.get_payload(decode=False)
        else:
            continue

    M.close()
    M.logout()

    body = BeautifulSoup(body, features="html.parser").get_text(separator="\n")

    if redact:
        body = body.replace("E-Mail: \n", "E-Mail: ")
        body_redacted = ""
        to_be_redacted = ["E-Mail", "Straße", "Telefonnummer"]
        for line in body.splitlines(True):
            if line[:line.find(':')] not in to_be_redacted:
                body_redacted += line
        body = body_redacted
    return body
Example #8
def response_to_readable(response):
    firsthead = httpvers + " " + str(
        response.status_code) + " " + response.reason + "\n"
    soup = BeautifulSoup(str(response.text), 'html.parser').prettify()
    # TODO: would it be better if the indent were settable to 2 or 3 instead of 1?
    nlines = min(template.maxbodylines, len(soup.splitlines()))
    # sometimes everything is in one line
    if (nlines <= 3):
        if (len(soup) > template.maxbodycharacters):
            soup = soup[:template.maxbodycharacters] + "[...]"
    else:
        if (len(soup.splitlines()) > nlines):
            soup = os.linesep.join(soup.split(
                os.linesep)[:nlines]) + "\n" + "[...]"

    return firsthead + readable_headers(response) + "\n" + soup
Example #9
    def upload_qso(self,
                   qso,
                   username: str = "",
                   password: str = "",
                   qth_nickname: str = ""):
        if not username or not password or not qth_nickname:
            _logger.error(
                "Invalid login data. Username: %s - Password: %s - QTH Nickname: %s"
                % (username, password, qth_nickname))
            raise ValidationError(_("Invalid login data"))

        if not qso:
            _logger.error("Invalid QSO")
            raise ValidationError(_("Invalid QSO"))

        adif_utility = self.env["ham.utility.adif"]

        extra_fields = {
            "EQSL_USER": username,
            "EQSL_PSWD": password,
        }
        adif_content = adif_utility.generate_adif_header(
            extra_fields=extra_fields)

        extra_fields = {"APP_EQSL_QTH_NICKNAME": qth_nickname}
        adif_content += adif_utility.generate_adif_qso(
            qso, extra_fields=extra_fields)

        data = {
            "EQSL_USER": username,
            "EQSL_PSWD": password,
        }

        files = {"Filename": ("file.adi", adif_content)}

        url = EQSL_API_UPLOAD_QSO

        response = requests.post(url=url, files=files)

        if response.status_code != 200:
            _logger.error("Unable to upload QSO to eQSL. Status code %d: %s" %
                          (response.status_code, response.content))
            _logger.error("Data: %s" % data)
            raise ValidationError(_("Unable to upload QSO to eQSL"))

        if b"error" in response.content.lower():
            root = etree.fromstring(
                response.content.decode(),
                parser=etree.HTMLParser(remove_comments=True))
            html_content = etree.tostring(root)
            raw_message = BeautifulSoup(html_content, "lxml").text.strip()
            message = "\n".join(
                [x.strip() for x in raw_message.splitlines() if x.strip()])

            _logger.error("Error publishing QSO: %s" % message)
            _logger.error("Data: %s" % data)
            raise ValidationError("%s.\n%s" %
                                  (_("Error publishing QSO"), message))
Example #10
def filterbiography(data):
    data = json.loads(data['d'])
    properdata = {}
    try:
        textx = BeautifulSoup(data['BIOGRAPHY_HTML'], 'html.parser').get_text()
        properdata['biographyAll'] = textx
        textx = textx.splitlines()
        properdata['Biography'] = textx
        properdata['MediaCaption'] = data['MEDIA_CAPTION']
    except:
        return properdata
    return properdata
Example #11
def get_img_url(raw_url):
    # Get text from page
    bs4_text = BeautifulSoup(get_page(raw_url), 'lxml').get_text()
    img_url_location = bs4_text.index(
        'Image URL')  # 'Image URL is what we are searching for in the text
    bs4_text = bs4_text[img_url_location:]
    bs4_text_list = bs4_text.splitlines()

    # Get line that has the image URL and return it
    img_url_line = bs4_text_list[0]
    img_url = img_url_line.split(' ')[4]

    return img_url
Example #12
class PDFHandler():

    ROBOT_LIBRARY_SCOPE = 'TEST SUITE'

    def open_pdf(self, filepath, xml=True):
        pdf = parser.from_file(filepath, xmlContent=xml)

        for key, value in pdf.items():
            setattr(self, key, value)

        if xml:
            self._treat_xml()
        else:
            self._treat_plaintext()

    def _treat_xml(self):
        self.content = BeautifulSoup(self.content, "html.parser")
        self.content = [el.text for el in self.content if len(el.text.strip())]

        filtered = []

        for el in self.content:
            el = el.split("\n")
            el = [x.strip() for x in el]

            filtered += el

        self.content = [el.strip() for el in filtered if len(el.strip())]

    def _treat_plaintext(self):
        self.content = self.content.splitlines()
        self.content = [el.strip() for el in self.content if len(el.strip())]

    def get_values_by_pattern(self, regexp):
        pattern = re.compile(regexp)

        matches = [
            pattern.search(el) for el in self.content if pattern.match(el)
        ]
        matches = [match.group(1) for match in matches]

        return matches

    def get_pdf_content(self):
        return self.content

    def get_pdf_status(self):
        return self.status

    def get_pdf_metadata(self):
        return self.metadata
Example #13
def get_update():
    """
    Attempt to reach the PhoenixMiner web server. Returns a dict with some useful info. If the HTTP status code
    is not 200, a warning is printed and the summary is returned mostly empty.
    """
    # Get HTML data
    resp = requests.get(MONITOR_URL)

    # Prepare our final returned object (dict)
    # TODO: Convert numerics into int from str
    summary = {'html_code': resp.status_code,
               'status_text': '',
               # 'full_text': '',
               'hash rate': '',
               'time': '',
               'power': '',
               'uptime': ''
               }

    # Check for HTTP code, error out if necessary
    # TODO: Handle discrete cases
    if summary['html_code'] == 200:
        # Convert to lines of text
        clean_text = BeautifulSoup(resp.text, "html.parser").text
        text_lines = clean_text.splitlines()

        # Remove lines that are empty or have only whitespace
        empties = ('', ' ', '\t', '\n', '&nbsp;')
        text_lines = [line for line in text_lines if line not in empties]

        # Extract insights
        # TODO: Replace these with some more elegant regex
        summary['hash rate'] = text_lines[-2].split()[-2]
        summary['power'] = text_lines[-7].split()[-2]
        timestamp = [line for line in text_lines if line[0] == "*"][-1]
        # Looks like ['***', '63:20', '***', '2/19', '10:16', '**************************************']
        summary['uptime'] = timestamp.split()[1]
        summary['time'] = timestamp.split()[4]

        # TODO: Extract individual GPU speeds, temps
        idx_of_last_ts = len(text_lines) - 1 - text_lines[::-1].index(timestamp)
        idx_of_gpu1 = len(text_lines) - 1 - text_lines[::-1].index(text_lines[idx_of_last_ts+3])
        summary['gpu list'] = [text_lines[i].split(',')[0] for i in range(idx_of_gpu1, len(text_lines)-8)]
        summary['gpu stats'] = text_lines[-8].split(',')
        summary['gpu speeds'] = [item.split('(')[0] for item in text_lines[idx_of_last_ts-2].split(':')[2:]]

    else:
        print(f"Warning: Got status code {summary['html_code']}")

    return summary
Example #14
	def post(self):
		if self.arguments:
			api_logger.info("HEADERS: "+str(self.request))
			# Parse each param
			data = self.arguments
			if 'name' not in list(data.keys()) and 'lines' not in list(data.keys()):
				api_logger.error("Error requests params.")
				self.finish('{"result":"error","description":"Error requests params"}')
			else:
				try:
					t_name = data['name']
					t_lines = data['lines']
				except Exception as e:
					api_logger.error("Error requests params "+str(e))
					self.finish('{"result":"error","description":"Error requests params","debug":"'+str(e)+'"}')
				try:
					filename = stormuiapi.getWorkersByTopologyName(t_name)[0]
					if filename:
						api_logger.info("LogFilename: "+filename)
						#get log file from storm cluster
						n_lines = int(t_lines)*200
						url = filename+"&tail="+str(n_lines)
						api_logger.debug("URL to fecth"+url)
						content = ""
						try:
							content = requests.get(url).content
						except Exception as e:
							api_logger.error("Error getting log from Storm UI : "+str(e))
							self.finish('{"result":"error","description":"Error getting log from Storm UI: ", "detail":"'+str(e)+'"}')
						try:
							# Remove HTML tags from Storm Log 8000 port
							lines = BeautifulSoup(content).text
							api_logger.debug("Getting "+str(len(lines.splitlines()))+" lines.")
							self.set_header ('Content-Type', 'text/plain')
							self.set_header ('Content-Disposition', 'attachment; filename='+filename+'')
							self.finish(lines)
						except Exception as e:
							api_logger.error("Error parsing data from Storm UI"+str(e))
							self.finish('{"result":"error","description":"Error parsing data from Storm UI: ", "detail":"'+str(e)+'"}')
					else:
						api_logger.error("Error getting worker from Storm UI API")
						self.finish('{"result":"error","description":"Error getting worker from Storm UI API", "detail":""}')
				except Exception as e:
					api_logger.error("Unknown error"+str(e))
					self.finish('{"result":"error","description":"Error getting topology log: ", "detail":"'+str(e)+'"}')
		else:
			api_logger.error("Content-Type:application/json missing")
			self.finish('{"result":"error","description":"Content-Type:application/json missing"}')
Example #15
def _convert_html_to_plain_text(html_string, remove_line_breaks=False):
    """Returns string after removing all HTML markup.

    Args:
        html_string (str): string with HTML markup
        remove_line_breaks (bool): whether to also remove line breaks and extra white space from string
    """
    if not html_string:
        return ''

    text = BeautifulSoup(html_string, "html.parser").get_text()

    # remove empty lines as well leading and trailing space on non-empty lines
    if remove_line_breaks:
        text = ' '.join(line.strip() for line in text.splitlines() if line.strip())

    return text
Example #16
def convert_html_to_plain_text(html_string, remove_line_breaks=False):
    """Returns string after removing all HTML markup.

    Args:
        html_string (str): string with HTML markup
        remove_line_breaks (bool): whether to also remove line breaks and extra white space from string
    """
    if not html_string:
        return ''

    text = BeautifulSoup(html_string, "html.parser").get_text()

    # remove empty lines as well leading and trailing space on non-empty lines
    if remove_line_breaks:
        text = ' '.join(line.strip() for line in text.splitlines() if line.strip())

    return text
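A quick usage sketch for the function above, assuming bs4 is installed and the function is imported; the sample HTML string is made up for illustration:

sample_html = "<div><p>  Hello <b>world</b>  </p>\n<p></p><p>second line</p></div>"
print(convert_html_to_plain_text(sample_html))                           # keeps the line break between paragraphs
print(convert_html_to_plain_text(sample_html, remove_line_breaks=True))  # "Hello world second line"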
Example #17
 def txt_from_HTML(self):
     fin_sentences = []
     url = input("Enter the url here: ")
     http = urllib3.PoolManager()
     response = http.request('POST', url)
     raw = BeautifulSoup(response.data, "lxml").get_text()
     raw = os.linesep.join([s for s in raw.splitlines() if s])
     txt = sent_tokenize(raw)
     new_sentences = self.filter_results(txt)
     for line in new_sentences:
         if line.count("\n") <= 1 and len(word_tokenize(line)) > 3 and "<" not in line and "subscrib" not in line:
             fin_sentences.append(line)
         elif line.find("\n") == 1:
             fin_sentences.append(line)
     fin_txt = ' '.join(fin_sentences)
     fin_txt = fin_txt.replace(".", ". ")
     return fin_txt
Example #18
def get_page(url):
    print('trying to fetch page: ', url)
    r = requests.get(url)
    if re.search(
            '(Ocorreu um erro ao buscar o documento\. Tente novamente)|(O documento não foi liberado para publicação)',
            r.text):
        print('No document available.')
    else:
        text = BeautifulSoup(r.text, 'html5lib').text
        text_list = text.splitlines()
        text_list = [i for i in text_list if i]
        try:
            with open('html_data/' + str(i) + '.html', 'wb') as f:
                f.write(r.content)
        except IndexError as e:
            print(e)
        except UnicodeError as e:
            print(e)
Example #19
def simple_page(raw_page):
    raw_page = re.sub('<br\s*?>', '\n', raw_page)
    page_text = BeautifulSoup(raw_page).get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in page_text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    letters_only = re.sub("[^a-zA-Z]", " ", text, 0, re.UNICODE)
    # 3. Remove phrase 'OCR Output'
    cleaner_text = re.sub('OCR Output', " ", letters_only)
    words = cleaner_text.lower().split()
    final_book_text = " ".join(words)

    return final_book_text
Example #20
def Phrase_ETF_Components():
    ETF_List_URL = "https://www.twse.com.tw/zh/page/ETF/list.html"
    try:
        page = requests.get(ETF_List_URL)
        text = BeautifulSoup(page.text, 'html.parser').find('tbody').text
        text = list(filter(None, text.splitlines()))
        table = []
        comp = []
        cnt = 0
        for i in text:
            if cnt < 4:
                comp.append(i)
                cnt = cnt + 1
            else:
                # flush the completed 4-item group and start the next one with the current item
                table.append(comp)
                comp = [i]
                cnt = 1
        if comp:
            table.append(comp)
        print(table)
    except:
        print("Failed to get list from twse website.")
Example #21
def get_counts():
	only_class = SoupStrainer(class_="stats-wrapper")
	only_br = SoupStrainer("br")
	data = BeautifulSoup(open('main.html'),"html.parser", parse_only=only_class).prettify()
	posts=""
	topics=""
	post_count = 0
	topics_count = 0
	for line in data.splitlines():
		if "Příspěvky" in line:
			posts += line
		elif "Témata" in line:
			topics += line
	post_numbers = map(int, re.findall('\d+', posts))
	topics_numbers = map(int, re.findall('\d+', topics))
	for num in post_numbers:
		post_count += num
	for num in topics_numbers:
		topics_count += num
	return (post_count,topics_count)
Example #22
async def update_spots(application):
    """
    Coroutine that updates spots
    """
    lasthash = ""
    try:
        while True:
            r = requests.get(
                'http://www.dxsummit.fi/DxSpots.aspx?count=50&include_modes=PHONE')
            cleantext = BeautifulSoup(r.text, "lxml").get_text()
            currenthash = hashlib.md5(cleantext.encode('utf-8')).hexdigest()

            clusterdata = []

            i = 0
            for line in cleantext.splitlines():
                line = line[:73] + ':' + line[73:]
                #line = line[:76] + line[84:]
                cleanline = ' '.join(line.split())
                splitstring = cleanline.split(sep=" ", maxsplit=3)
                clusterdata.append(
                    (hashlib.md5(line.encode('utf-8')).hexdigest() + " " + splitstring[1]+" "+splitstring[2], " " + line))
                data = cleanline.split(sep=" ", maxsplit=3)
                if ((auto_tune.checked is True) and (i == 0) and (data[2] != globalvars['lastcall'])):
                    globalvars['lastcall'] = data[2]
                    frequency.content=FormattedTextControl(HTML('<b fg="#884444">Freq.:</b> ' + (data[1] + " Khz").rjust(15)))
                    dx.content=FormattedTextControl(HTML('<b fg="#884444">Call:</b> ' + data[2].rjust(12)))
                    if qrz.checked is True:
                        redis.rpush('qrzLookupQueue',data[2])
                i+=1

            radios.values = clusterdata

            if currenthash != lasthash:
                application.invalidate()
                await asyncio.sleep(15)
            else:
                await asyncio.sleep(30)
    except asyncio.CancelledError:
        # TODO: save config here?
        print()
Example #23
def split_html(raw_html: str) -> List[List]:
    """Split html content to pages.
    Args:
        raw_html: html content
    Returns:
        list of pages
    """

    soup = BeautifulSoup(raw_html, features="lxml")
    [s.extract() for s in soup(["style", "script", "head", "title", "meta", "[document]"])]
    # replace non-breaking space
    soup = soup.get_text(strip=False).replace("\xa0", " ")
    lines = [line.strip() for line in soup.splitlines() if line.strip()]
    return paginate(lines)
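For illustration, split_html can be exercised on a small inline document (using the hypothetical paginate() sketch from Example #1 and assuming lxml is installed):

raw = """<html><head><title>t</title><style>p {color: red}</style></head>
<body><p>First&nbsp;line</p>
<p> </p>
<p>Second line</p></body></html>"""
pages = split_html(raw)
print(pages)  # e.g. [['First line', 'Second line']] with the paginate() sketch above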
Example #24
def get_counts():
    only_class = SoupStrainer(class_="stats-wrapper")
    only_br = SoupStrainer("br")
    data = BeautifulSoup(open('main.html'),
                         "html.parser",
                         parse_only=only_class).prettify()
    posts = ""
    topics = ""
    post_count = 0
    topics_count = 0
    for line in data.splitlines():
        if "Příspěvky" in line:
            posts += line
        elif "Témata" in line:
            topics += line
    post_numbers = map(int, re.findall('\d+', posts))
    topics_numbers = map(int, re.findall('\d+', topics))
    for num in post_numbers:
        post_count += num
    for num in topics_numbers:
        topics_count += num
    return (post_count, topics_count)
Example #25
class Text:
    def __init__(self, text_id: int) -> None:
        normalization = {
            132: '"',
            147: '"',
            0x96: "--",
            0x91: "'",
            0x92: "'",
            0x97: "---",
        }
        text_url = "https://www.keinverlag.de/{}.text".format(text_id)
        soup = soup_from(text_url)
        try:
            self.title = soup.select("h1 > span")[0].text.translate(
                normalization)
            self.content = BeautifulSoup(
                re.sub(
                    r'<span style="font-style: italic;">(([\n\r]|.)*?)</span>',
                    r"_\1_",
                    str(soup.select(".fliesstext > span")[0]),
                ),
                "lxml",
            ).text.translate(normalization)
            self.author = soup.select("h3 > a")[2].text
            self.type = soup.select("h1 ~ h3")[0].text
        except IndexError:
            raise ValueError("Text {} not available.".format(text_id))

    def markdown(self,
                 *,
                 with_author: bool = True,
                 with_type: bool = False) -> str:
        return "#### {maybe_author}{title}{maybe_type}\n\n{content}".format(
            title=self.title,
            maybe_author=self.author + ": " if with_author else "",
            maybe_type=" (" + self.type + ")" if with_type else "",
            content="\n".join(line + "\\" if line else ""
                              for line in self.content.splitlines()),
        )
Example #26
    def parse_details(self, response):
        def extract_with_css(query):
            return response.css(query)

        html = extract_with_css('body').extract_first()

        full_text = BeautifulSoup(html, 'html.parser').get_text()

        lines = full_text.splitlines()  # List of HTML text lines

        norms = utils.split_list_by_sep(lines, '__')

        norms = list(
            map(lambda l: [' '.join(l)],
                norms))  # A list of separated norms from the same source

        for norm in norms:
            yield Norm({
                'published_at': response.meta['date'],
                'text': norm[0],
                'type': dict(simple=response.meta['type'])
            })
Example #27
    def check_changes(self):
        response = requests.get(self.url, headers=self.headers)
        r = response.text
        self.course.teacher_id.html = r
        self.course.teacher_id.save()
        soup = BeautifulSoup(r, "html.parser")

        for link in soup.select("a[href$='.pdf']"):
            url = urljoin(self.url, link['href'])
            filename = link['href'].split('/')[-1]
            if not Files.objects.check_if_exists(self.course, filename,
                                                 self.teacher_id):
                self.send_push_pdf(url)
                course = Course.objects.get_record_by_id(self.course_id)
                Files.objects.add_record(course, filename,
                                         str(self.teacher_id))

        for script in soup(["script", "style"]):
            script.extract()
        soup = soup.get_text()

        self.prev_version = BeautifulSoup(self.prev_version, "html.parser")
        for script in self.prev_version(["script", "style"]):
            script.extract()
        self.prev_version = self.prev_version.get_text()

        if self.prev_version != soup:
            self.old_page = self.prev_version.splitlines()
            self.new_page = soup.splitlines()
            d = difflib.Differ()
            diff = d.compare(self.old_page, self.new_page)
            out_text = "\n".join([
                ll.rstrip() for ll in '\n'.join(diff).splitlines()
                if ll.strip()
            ])
            msg = get_diff(out_text)
            self.send_email(msg)
            self.send_push()
Example #28
def load_html(file_path: Path) -> List[List]:
    """Split html content to pages.

    Args:
        file_path: path to file

    Returns:
        list of pages
    """
    try:
        raw_html = file_path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        raw_html = file_path.read_text(encoding="windows-1252")

    soup = BeautifulSoup(raw_html, features="lxml")
    [
        s.extract() for s in soup(
            ['style', 'script', 'head', 'title', 'meta', '[document]'])
    ]
    # replace non-breaking space
    soup = soup.get_text(strip=False).replace("\xa0", " ")
    lines = [line.strip() for line in soup.splitlines() if line.strip()]
    return paginate(lines)
Example #29
def preproc_text1(text):

    """Initial text cleaning.

    Convert text to all lowercase and strip it of unwanted HTML tags and content, unwanted regex patterns, natural line breaks and non-ASCII characters.

    Args:
    text: a string.

    Returns:
    A cleaned text string.
    """

    bad_tags = ['i', 'h4', 'b']
    bad_regex_list = ['translated[^\.]+\.',
                      'previous (profile|loan)[^\.]+',
                      'http\S+',
                      'www\S+',
                      'mifex offers its clients[^\.]+\.',
                      'for more information[^\<]+']
    bad_regex = re.compile('|'.join(bad_regex_list))

    # remove unwanted html content contained in BAD_TAGS
    soup = BeautifulSoup(text, 'lxml')
    content_to_remove = [s.get_text() for s in soup.find_all(bad_tags)]
    for c in content_to_remove:
        text = text.replace(c, "")

    text = text.lower()
    text = BeautifulSoup(text, 'lxml').text  # remove html tags
    text = bad_regex.sub("", text) # remove unwanted REGEX patterns
    text = ' '.join(text.splitlines())  # remove natural line breaks
    text = unidecode.unidecode(text)  # remove non-English characters

    return text
Example #30
####################################################################
##start for loops to iterate through a list of loans and parse HTML
####################################################################
counter = 0
records = []
for lead in df.iterrows():
	driver.implicitly_wait(5)
	elem = driver.find_element_by_name("")
	elem.send_keys(df.iloc[counter])
	elem.send_keys(Keys.RETURN)

	driver.implicitly_wait(2)

	html = driver.page_source

	soup = BeautifulSoup(html)
	soup = str(soup)
	soup = soup.splitlines()
	
	for line in soup:
		records.append(re.findall(r'<td><font color="blue">(.*)</font></td>', line.strip()))

	records.append(df.iloc[counter])
	counter = counter + 1
	time.sleep(2)

records = list(filter(len, records))
df2 = pd.DataFrame.from_records(records)
df2.to_excel(writer,sheet_name='Text')

writer.save()
Example #31
# TF-IDF without suffix
tf = pandas.DataFrame()  

for url in urls:
    try:        
        #Getting HTML and cleaning it
        request = req.Request(url,None,headers) 
        html    = req.urlopen(request).read()
        soup    = BeautifulSoup(html, "lxml")
        
        for script in soup(["script", "style", "link", "meta", "head"]):
            script.extract()
        
        soup   = soup.get_text(separator = " ")
        lines  = (line.strip() for line in soup.splitlines())   
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text   = (line.translate(translator) for line in chunks)
        
        #Raw term frequency counts
        counts = dict.fromkeys(terms, 0) 
        w      = 0
        for line in text:
            new_line = (line.split())
            for word in new_line:
                word = re.sub(regex,"",word.lower())
                w += 1
                if word in terms:
                    counts[word] += 1  
        
        #Term frequency as a percentage
Example #32
 def post(self):
     if self.arguments:
         api_logger.info("HEADERS: " + str(self.request))
         # Parse each param
         data = self.arguments
         if 'name' not in list(data.keys()) and 'lines' not in list(
                 data.keys()):
             api_logger.error("Error requests params.")
             self.finish(
                 '{"result":"error","description":"Error requests params"}')
         else:
             try:
                 t_name = data['name']
                 t_lines = data['lines']
             except Exception as e:
                 api_logger.error("Error requests params " + str(e))
                 self.finish(
                     '{"result":"error","description":"Error requests params","debug":"'
                     + str(e) + '"}')
             try:
                 filename = stormuiapi.getWorkersByTopologyName(t_name)[0]
                 if filename:
                     api_logger.info("LogFilename: " + filename)
                     #get log file from storm cluster
                     n_lines = int(t_lines) * 200
                     url = filename + "&tail=" + str(n_lines)
                     api_logger.debug("URL to fecth" + url)
                     content = ""
                     try:
                         content = requests.get(url).content
                     except Exception as e:
                         api_logger.error(
                             "Error getting log from Storm UI : " + str(e))
                         self.finish(
                             '{"result":"error","description":"Error getting log from Storm UI: ", "detail":"'
                             + str(e) + '"}')
                     try:
                         # Remove HTML tags from Storm Log 8000 port
                         lines = BeautifulSoup(content).text
                         api_logger.debug("Getting " +
                                          str(len(lines.splitlines())) +
                                          " lines.")
                         self.set_header('Content-Type', 'text/plain')
                         self.set_header(
                             'Content-Disposition',
                             'attachment; filename=' + filename + '')
                         self.finish(lines)
                     except Exception as e:
                         api_logger.error(
                             "Error parsing data from Storm UI" + str(e))
                         self.finish(
                             '{"result":"error","description":"Error parsing data from Storm UI: ", "detail":"'
                             + str(e) + '"}')
                 else:
                     api_logger.error(
                         "Error getting worker from Storm UI API")
                     self.finish(
                         '{"result":"error","description":"Error getting worker from Storm UI API", "detail":""}'
                     )
             except Exception as e:
                 api_logger.error("Unknown error" + str(e))
                 self.finish(
                     '{"result":"error","description":"Error getting topology log: ", "detail":"'
                     + str(e) + '"}')
     else:
         api_logger.error("Content-Type:application/json missing")
         self.finish(
             '{"result":"error","description":"Content-Type:application/json missing"}'
         )
Example #33
class BodyExtractor():
    def __init__(self, html, encoding='utf-8'):
        if type(html) == bytes:
            self.html = html.decode(encoding)
        else:
            self.html = html
        self.pureText = ''  # text with HTML tags removed
        self.THRESHOLD = 50  # threshold for the sharp-rise point
        self.K = 3  # number of lines per line block
        self.wordCount = []  # character count of each line block
        self.lines = []
        self.content = ''  # extracted main text
        self.title = ''
        self.maxIndex = -1  # index of the line block with the most characters
        self.start = -1
        self.end = -1
        self._preprocess()
        self._start()
        self._end()

        if self.end != -1:
            self.content = ''.join(self.lines[self.start:self.end + self.K - 1])

    def _preprocess(self):
        regex = re.compile(
            r'(?:<!DOCTYPE.*?>)|'  # doctype
            r'(?:<head[\S\s]*?>[\S\s]*?</head>)|'
            r'(?:<!--[\S\s]*?-->)|'  # comment
            r'(?:<img[\s\S]*?>)|'  # images
            r'(?:<br[\s\S]*?>\s*[\n])|'
            r'(?:<script[\S\s]*?>[\S\s]*?</script>)|'  # js...
            r'(?:<style[\S\s]*?>[\S\s]*?</style>)', re.IGNORECASE)  # css
        regTitle = re.search('<title>[\s\S]*?</title>',self.html)
        if regTitle is not None:
            titleTag = regTitle.group()
            self.title = titleTag[7:len(titleTag)-8]

        filteredHtml = self.html_escape(regex.sub('', self.html))
        self.pureText = BeautifulSoup(filteredHtml, 'lxml').get_text()
        self.lines = list(map(lambda s: re.sub(r'\s+', '', s), self.pureText.splitlines()))
        count = list(map(lambda s: len(s), self.lines))
        for i in range(len(count) - self.K + 1):
            self.wordCount.append(count[i] + count[i + 1] + count[i + 2])
        self.maxIndex = self.wordCount.index(max(self.wordCount))

    def html_escape(self,text):
        """
        html转义
        """
        text = (text.replace("&quot;", "\"").replace("&ldquo;", "“").replace("&rdquo;", "”")
                .replace("&middot;", "·").replace("&#8217;", "’").replace("&#8220;", "“")
                .replace("&#8221;", "\”").replace("&#8212;", "——").replace("&hellip;", "…")
                .replace("&#8226;", "·").replace("&#40;", "(").replace("&#41;", ")")
                .replace("&#183;", "·").replace("&amp;", "&").replace("&bull;", "·")
                .replace("&lt;", "<").replace("&#60;", "<").replace("&gt;", ">")
                .replace("&#62;", ">").replace("&nbsp;", " ").replace("&#160;", " ")
                .replace("&tilde;", "~").replace("&mdash;", "—").replace("&copy;", "@")
                .replace("&#169;", "@").replace("♂", "").replace("\r\n|\r", "\n"))
        return text

    def _start(self):
        for i in [-x - 1 + self.maxIndex for x in range(self.maxIndex)]:
            gap = min(self.maxIndex - i, self.K)
            if sum(self.wordCount[i + 1:i + 1 + gap]) > 0:
                if self.wordCount[i] > self.THRESHOLD:
                    continue
                else:
                    break

        self.start = i + 1

    def _end(self):
        for i in [x + self.maxIndex for x in range(len(self.wordCount) - self.maxIndex - 2)]:
            if self.wordCount[i] == 0 and self.wordCount[i + 1] == 0:
                self.end = i
                break
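A brief usage sketch for the extractor above (the file name is made up; BodyExtractor only needs an HTML string or bytes):

with open('article.html', encoding='utf-8') as f:  # hypothetical input file
    extractor = BodyExtractor(f.read())
print(extractor.title)    # text of the <title> tag, if any
print(extractor.content)  # main body text picked by the line-block character counts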
Example #34
 def get_cleaned_body_text(self, width: int = 70) -> str:
     body_text = BeautifulSoup(self.body, features='lxml').get_text()
     return '\n\n'.join('\n'.join(
         textwrap.wrap(line.replace('\xa0', ' ').rstrip(), width=width))
                        for line in body_text.splitlines())
Example #35
dict = {}
with open('/home/rishav/Desktop/hi-en_dict.csv') as f_obj:
    reader = csv.DictReader(f_obj, delimiter=',')
    for line in reader:
        if line['hword'] in dict.keys():
            dict[line['hword']].append(line['eword'])
        else:
            dict.update({line['hword']: list([line['eword']])})

# print(dict['निराला'][0])
_setup()
# print(transliterate("निराला", 'devanagari', 'iast'))

eng_queries = ""
for query in queries.splitlines():
    eng_query = ""
    phrase = pharse_translate(query)
    word_list = query.split(' ')
    for p in phrase:
        if p in dict.keys():
            eng_query += dict[p][0] + ' '

    for word in word_list:
        if (word in trans_dict) or (word in dict):
            if word in trans_dict:
                eng_query = eng_query + trans_dict[word] + ' '
            if word in dict:
                if len(dict[word]) == 1:
                    eng_query = eng_query + dict[word][0] + ' '
                else:
Example #36
while True:
    page = input('Page link: ')
    if page in links:
        break
    else:
        print('The page is not on the database.')
        continue

#take html T and T-1 from the SQL database (from the company's table of links)
#the code is split using the split_html function
code1 = c.execute("select HTML from [%s] where LINK=?" % (company),
                  [page]).fetchall()[0][0]  #html code in T
soup1 = BeautifulSoup(
    code1,
    'html.parser').get_text()  #apply BeautifulSoup and get text from the code1
soup1_lines = soup1.splitlines()  #split soup1 in lines
code2 = c.execute("select [HTML T-1] from [%s] where LINK=?" % (company),
                  [page]).fetchall()[0][0]  #html code in T-1
soup2 = BeautifulSoup(
    code2,
    'html.parser').get_text()  #apply BeautifulSoup and get text from code2
soup2_lines = soup2.splitlines()  #split soup2 in lines

#run the compare_html function
compare_table = compare_html(soup1_lines, soup2_lines)
print(compare_table)

#write the compare table in an html file
f = open('compare_pages_text.html', 'w', encoding='utf-8')
f.write(compare_table)
f.close()
Example #37
def main():
	warningDelimeter = re.compile(r"_{100,200}")
	warningDateRegex = re.compile(r"^.*(\b\d{1,2}\D{0,3})?\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(Nov|Dec)(?:ember)?)\D?(\d{1,2}\D?)?\D?((19[7-9]\d|20\d{2})|\d{2}).*")
	warningNumberRegex = re.compile(r"^.*\d{2}-\d{1,3},? .*$")
	warningNumberSplitRegex = re.compile(r"\d{2}-\d{1,3},? ")
	warningReportedCrime = re.compile(r".*REPORTED CRIME.*(:|-).*")
	warningIncident = re.compile(r".*INCIDENT.*(:|-).*")
	warningLocation = re.compile(r"(LOCATION):{1}")
	warningLocationRegexMain = re.compile(r":\s.*([A-Z0-9]).*\s")
	warningLocationRegex = re.compile(r".*\d{3,5}.*BLOCK OF")
	warningLocationRegexStrict = re.compile(r"\d{3,5}\s*BLOCK OF.*")
	warningLocationDirection = re.compile(r"(\(NORTH)|(\(SOUTH)|(\(WEST)|(\(EAST)|(AT APPROXIMATELY)|(NEXT)|(,\s)|(THE YPSI)|(YPSILANTI POLICE)|(IN THE CITY)|(.THE VICTIM)")


	timelyWarningsPage = requests.get("http://www.emich.edu/police/alerts/safetynotices/index.php")
	timelyWarnings = BeautifulSoup(timelyWarningsPage.text).findAll("div",attrs={'id':'textcontainer'})[0].text
	thisWarning = []
	allWarnings = []
	thisWarningDict = {}
	allWarningDicts = []
	debug = False
	inBody = False
	loc_dir = 0
	global loc_temp
	timelyLines = iter(timelyWarnings.splitlines())
	for line in timelyLines:
		if warningDelimeter.match(line):
			allWarnings.append(thisWarning)
			if 'crime' not in thisWarningDict.keys():
				#print "\n".join(thisWarning)
				pass
			allWarningDicts.append(thisWarningDict)
			thisWarning = []
			thisWarningDict = {}
			debug = False
			inBody = False
		else:
			thisWarning.append(line)
		if warningNumberRegex.match(line) or '15-6' in line or '15-15' in line:
			if "update".upper() not in line.upper():
				if "15-03" in line:
					thisDate = "March 9, 2015"
				elif "15-9" in line:
					thisDate = "July 21, 2015"
				elif "15-8" in line:
					thisDate = "July 18, 2015"
				elif "15-15" in line:
					thisDate = "October 15, 2015"
				else:
					thisDate = line.partition(",")[-1]
				thisDate = thisDate.strip()
				thisWarningDict['date'] = thisDate
				#warningDates.append(thisDate)
		if warningReportedCrime.match(line.upper()) or warningIncident.match(line.upper()):
			tokens = line.replace(u'\xa0', "-").replace("-",":")
			tokens = tokens.replace(u'\u2013',"-").replace("-",":")
			tokens = tokens.replace(u'Date and time of incident',"")
			tokens = tokens.split(":")
			tokens = [x for x in tokens if x]
			upper_tokens = [token.upper().rstrip() for token in tokens]
			your_token = [token for token in tokens if 'OFF CAMPUS' in token.upper()]
			if your_token:
				thisWarningDict['onCampus'] = False
				your_token = None
			else:
				thisWarningDict['onCampus'] = True
			your_token = [token for token in tokens if 'STREET' in token.upper()]
			if your_token:
				thisWarningDict['location'] = your_token[0]
				your_token = None
			your_token = [token for token in tokens if 'GREEN LOT 1' in token.upper()]
			if your_token:
				thisWarningDict['location'] = your_token[0]
				your_token = None
			your_token = [token for token in tokens if warningLocationRegex.match(token.upper())]
			if your_token:
				thisWarningDict['location'] = your_token[0]
				your_token = None
			if "REPORTED CRIME" in upper_tokens:
				crime_int = upper_tokens.index("REPORTED CRIME") + 1
				thisWarningDict['crime'] = tokens[crime_int]
				while thisWarningDict['crime'] == ' ':
					crime_int = crime_int + 1
					thisWarningDict['crime'] = tokens[crime_int]
				if "City of Ypsilanti, Off Campus" in tokens[crime_int]:
					loc_index = tokens[crime_int].index(", City of Ypsilanti, Off Campus")
					thisWarningDict['crime'] = tokens[crime_int][:loc_index]
				inBody = True
				crime_int = None
				loc_index = None
			if "INCIDENT" in upper_tokens:
				crime_int = upper_tokens.index("INCIDENT") + 1
				thisWarningDict['crime'] = tokens[crime_int]
				while thisWarningDict['crime'] == ' ':
					crime_int +=1
					thisWarningDict['crime'] = tokens[crime_int]
				if "Location" in tokens[crime_int]:
					loc_index = tokens[crime_int].index("Location")
					thisWarningDict['crime'] = tokens[crime_int][:loc_index]
				inBody = True
				crime_int = None
				loc_index = None
				upper_tokens = None
			#print tokens
		if warningLocation.match(line.upper()):
			line = line.replace("Location:","")
			if 'location' not in thisWarningDict.keys():
				thisWarningDict['location'] = line
				#print line
		if inBody:
			if warningLocationRegex.match(line.upper()):
				string_hold = line + next(timelyLines)
				loc_hold = warningLocationRegexStrict.search(string_hold.upper()).group(0)
			else:
				loc_hold = None
			if loc_hold:
				loc_dir = warningLocationDirection.search(loc_hold.upper())
				if loc_dir:
					loc_ind = loc_dir.start()
					loc_hold = loc_hold[:loc_ind]
				else:
					loc_ind = 0;
				thisWarningDict['location'] = loc_hold
		if debug:
			print(line)
	for warning in allWarningDicts:
		print "____________________________________________________"
		print warning