import requests as rq                      # module-level imports assumed by this snippet;
from bs4 import BeautifulSoup as BSHTML    # the original may alias the legacy BeautifulSoup package instead


def get_seq(chr_no, start, size):
    # 'hits' is of the form "3:3676292:3678292"
    if (size < 0):
        size = -size
        # excludes the base at reg_seq_start
        hits = str(chr_no) + ":" + str(start - size) + ":" + str(start - 1)
    else:
        # includes the base at reg_seq_start
        hits = str(chr_no) + ":" + str(start) + ":" + str(start + size - 1)
    params = {
        'program': 'returnFASTA',
        'db': 'GENOME',
        'dbid': '4',
        "hits": hits,
        "DBpath": "/DATA/PlantGDB/Index/Blast/OsGDB/OSgenome",
        "xGDB": "OsGDB"
    }
    r = rq.post('http://www.plantgdb.org/OsGDB/cgi-bin/formatReader.pl',
                data=params)
    html_resp = r.content
    bs = BSHTML(html_resp)
    fasta_string = bs.pre.contents[0].strip()
    return fasta_string
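# A minimal usage sketch (hypothetical coordinates, and it needs network access to
# plantgdb.org): a negative size fetches `size` bases upstream of `start`, excluding
# the base at `start`; a positive size fetches `size` bases starting at `start`.
upstream_seq = get_seq(3, 3678292, -2000)    # queries region "3:3676292:3678291"
downstream_seq = get_seq(3, 3678292, 2000)   # queries region "3:3678292:3680291"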
def _doc_read_file(self, relpath, op=False):
    abspath = os.path.join(self.path, relpath)
    with open(abspath, 'r') as source:
        page = source.read()
    # Wrap the raw HTML in a lightweight, page-like object built on the fly.
    page = type(
        'Documentation', (object, ), {
            'get_relative_source_path': (lambda x: x.save_as),
            'content': page,
            'title': BSHTML(page).find('title').getText(),
            'url': relpath if op else os.path.dirname(relpath),
            'save_as': relpath,
            'template': 'documentation'
        })()
    self.add_source_path(page)
    return page
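# A self-contained sketch of the same `type()` trick used above: build a class and a
# page-like instance on the fly (all names and values here are illustrative only).
Doc = type(
    'Documentation', (object, ), {
        'content': '<title>Hello</title>',
        'save_as': 'docs/hello.html',
        'get_relative_source_path': (lambda self: self.save_as),
    })
doc = Doc()
print(doc.get_relative_source_path())  # -> docs/hello.html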
# Python 2 code; assumes the surrounding module provides: os, re, tempfile, BSHTML,
# WEB_URL_REGEX, HTML_URL_REGEX, striprtf, log, and walk (e.g. email.iterators.walk).
def get_decoded_email_body(def_self, incident_id, eml_filename, mail):
    attachments = []
    urls = []
    text = ''  # Stores the email body text, HTML formatted, for the return value of this function.
    if mail.is_multipart():
        for part in list(walk(mail)):
            try:
                if part is None:
                    continue
                charset = part.get_content_charset()
                if (part.get('Content-Disposition') is not None) and part.get_filename() is not None:
                    # Likely a file attachment
                    if "attachment" in part.get('Content-Disposition').lower():
                        # File attachment
                        try:
                            filename = part.get_filename()  # The name of the file
                            content = part.get_payload(decode=True)  # The content of the file
                            text += '<br />[attachment: ' + filename + ']'  # Insert found attachment into the body text
                            # Here we temporarily store the attachment, and then post it to the incident as an attachment and artifact
                            with tempfile.NamedTemporaryFile(delete=False) as temp_file:
                                try:
                                    temp_file.write(content)
                                    temp_file.close()
                                    artifact_type_id = 16  # "Other File" artifact ID
                                    def_self.rest_client().post_artifact_file(
                                        '/incidents/{0}/artifacts/files'.format(incident_id),
                                        artifact_type_id,
                                        temp_file.name,
                                        description='Attachment from {0}'.format(eml_filename),
                                        value=filename)
                                    def_self.rest_client().post_attachment(
                                        '/incidents/{0}/attachments'.format(incident_id),
                                        temp_file.name,
                                        '[MALICIOUS] {0}'.format(filename))
                                    attachments.append(filename)
                                finally:
                                    os.unlink(temp_file.name)
                        except Exception:
                            pass  # Attachment handling is best-effort; skip parts that fail to post.
                elif part.get_payload(decode=True) is None:
                    continue
                elif part.get_content_charset() is None:
                    # We cannot know the character set, so return decoded "something" ...
                    text += unicode(
                        part.get_payload(decode=True), errors='replace'
                    ).encode('UTF-8', 'replace').strip()  # Trying this - may decide to remove later. -JJF, 2/23/2019
                    #continue
                elif part.get_content_type() == 'text/plain':
                    t = unicode(part.get_payload(decode=True), str(charset),
                                'replace').encode('UTF-8', 'replace').strip()
                    text += '<br />'.join(t.splitlines())  # To HTML
                    urls_temp = re.findall(WEB_URL_REGEX, text.strip())  # Find all URLs in body text
                    for u in urls_temp:
                        if u not in urls:
                            urls.append(u)  # If not already in urls list, add it
                elif part.get_content_type() == 'text/html':
                    t = unicode(part.get_payload(decode=True), str(charset),
                                'replace').encode('UTF-8', 'replace').strip()
                    text += str(t)
                    skip_image_urls = []
                    urls_html_temp = re.findall(HTML_URL_REGEX, text.strip())
                    # Could also try: [a.get('href') for a in soup.find_all('a', href=True)]
                    soup = BSHTML(text)
                    images = soup.findAll('img')  # Find img tag urls, to ensure we don't put image URLs into urls list
                    for image in images:
                        skip_image_urls.append(image['src'])
                    for u in urls_html_temp:
                        if (u not in urls) and (u not in skip_image_urls):
                            urls.append(u)  # If not already in urls list and not an image, add it
                elif part.get_content_type() == 'text/enriched':
                    # This has not been tested yet, no test cases available.
                    t = unicode(part.get_payload(decode=True), str(charset),
                                'replace').encode('UTF-8', 'replace').strip()
                    text += '<br />'.join(striprtf(t).splitlines())  # To HTML
                    urls_temp = re.findall(WEB_URL_REGEX, text.strip())  # Find all URLs in body text
                    for u in urls_temp:
                        if u not in urls:
                            urls.append(u)  # If not already in urls list, add it
            except Exception as err:
                log.info('[ERROR] Message body decoding failed at a part! Encountered: ' + str(err))  # For debugging unexpected situations, function is robust as-is though
        if text is not None and text != '':
            return [text.strip(), attachments, urls]
        else:
            return ['Unable to parse email body. Was it empty?', attachments, urls]
    else:
        t = unicode(mail.get_payload(decode=True), mail.get_content_charset(),
                    'replace').encode('UTF-8', 'replace')
        text = '<br />'.join(t.splitlines())  # To HTML
        urls_temp = re.findall(WEB_URL_REGEX, text.strip())  # Find all URLs in body text
        for u in urls_temp:
            if u not in urls:
                urls.append(u)  # If not already in urls list, add it
        return [text.strip(), attachments, urls]
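# A minimal, self-contained Python 3 sketch of the walk-and-decode pattern that the
# function above relies on (the function itself is Python 2 - note the `unicode` calls).
# The message built here is purely illustrative.
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

msg = MIMEMultipart('alternative')
msg.attach(MIMEText('plain body with a link http://example.com', 'plain', 'utf-8'))
msg.attach(MIMEText('<p>html body</p>', 'html', 'utf-8'))

for part in msg.walk():
    if part.is_multipart():
        continue  # container parts carry no decodable payload of their own
    charset = part.get_content_charset() or 'utf-8'
    body = part.get_payload(decode=True).decode(charset, 'replace')
    print(part.get_content_type(), repr(body))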
def get_code(desc):
    bs = BSHTML(desc)
    return bs.code  # first <code> tag in the parsed HTML
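# Quick usage sketch (hypothetical HTML snippet; assumes BSHTML is a BeautifulSoup alias):
desc = '<p>Install it with:</p><code>pip install requests</code>'
code_tag = get_code(desc)
print(code_tag.string)  # -> pip install requests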
# Modified on 2 July 2019
# This simple script reads all emoji counts from emojitracker and saves
# them into a csv file. Simply save emojitracker.com as an html file and
# pass the file name to this parser.
#
# How to run: python parse.py
#
# Requirements: pip install BeautifulSoup
from BeautifulSoup import BeautifulSoup as BSHTML

INPUT_FILE = 'emojitracker-sample.html'
OUTPUT_FILE = 'output-sample.csv'
INPUT_FILE = 'emojitracker-2-july-2019.html'   # overrides the sample files above
OUTPUT_FILE = 'output.csv'

f = open(OUTPUT_FILE, 'w')
f.write('unicode\tname\tcount\n')  # write headers

with open(INPUT_FILE) as texts:
    soup = BSHTML(texts)
    lis = soup.findAll('li', attrs={'class': 'emoji_char'})
    for li in lis:
        emoji = li['id'].lower()
        name = li['data-title'].lower()
        count = li.find('span', attrs={'class': 'score'}).text
        f.write(emoji + '\t"' + name + '"\t' + count + '\n')  # write to file

f.close()