Python strip_html示例，util.misc.strip_html Python示例

示例#1

0

显示文件

文件： fmylife.py 项目： frozenMC/CloudBot

def fml(inp):
    ".fml [id] -- Gets a random quote from fmylife.com. Optionally gets [id]."
    # The one-line string above is kept in its command-usage form
    # (presumably it is surfaced as help text by the bot -- confirm).
    #
    # inp: raw argument text; every '#' character is removed before use.
    #      Empty input requests a random quote, digits request that id.
    # Returns "(#<id>) <text>" on success, otherwise a short error message.
    # A page that lacks the expected elements yields the "Could not fetch"
    # message instead of an uncaught AttributeError/KeyError/ValueError.

    inp = inp.replace("#", "")

    if inp:
        if not inp.isdigit():
            return "Invalid ID!"
        target = inp
        fetch_error = "Could not fetch #%s. FML" % inp
    else:
        target = 'random'
        fetch_error = "I tried to use .fml, but it was broken. FML"

    try:
        page = http.get(urljoin(base_url, target))
    except (HTTPError, IOError):
        return fetch_error

    parse_error = "Could not fetch #%s. FML" % inp
    soup = BeautifulSoup(page)

    # Remove the 'submit' div before looking for the post, but only when
    # the page actually contains one.
    submit_box = soup.find('div', id='submit')
    if submit_box is not None:
        submit_box.extract()

    page_body = soup.body
    post = page_body.find('div', 'post') if page_body is not None else None
    if post is None:
        return parse_error

    try:
        # The numeric id is the last path segment of the first 'fmllink' href.
        post_id = int(post.find('a', 'fmllink')['href'].split('/')[-1])
    except (TypeError, KeyError, ValueError):
        # TypeError: no such anchor (find returned None);
        # KeyError: anchor without href; ValueError: non-numeric segment.
        return parse_error

    body = misc.strip_html(' '.join(link.renderContents() for link in post('a', 'fmllink')))
    return '(#%d) %s' % (post_id, body)

示例#2

0

显示文件

文件： fact.py 项目： ShadowDev/CloudBot

def get_fact():
    """Fetch a random fact page and return ``(fact_text, link)``.

    The fact text is the HTML-stripped content of the first
    ``<a class="surprise">`` element on the page, and the link is that
    element's ``href``.  Raises ``nofact`` when the stripped text is empty.
    """
    html = http.get('http://www.omg-facts.com/random')
    anchor = BeautifulSoup(html).find('a', {'class': 'surprise'})

    # Read the href first, then render and strip the anchor's contents.
    href = anchor['href']
    text = misc.strip_html(anchor.renderContents())

    if not text:
        raise nofact
    return (text, href)

示例#3

0

显示文件

文件： slogan.py 项目： frozenMC/CloudBot

def sloganizr(inp, nick=None, say=None, input=None):
    ".slogan <word> -- Makes a slogan for <word>."
    # Generate the slogan and strip any HTML markup from it.
    text = misc.strip_html(sloganize(inp))

    if not inp.islower():
        return text

    # When inp.islower() holds, capitalize the first word; the split/join
    # round-trip also collapses whitespace runs into single spaces.
    words = text.split()
    words[0] = words[0].capitalize()
    return " ".join(words)

示例#4

0

显示文件

文件： nltk_ex25.py 项目： clp/learn_python

def convert_text_to_words(raw_qa_s):
    """Reduce a raw question/answer text string to its meaningful words.

    The single input string is cleaned in stages: HTML is stripped,
    every character that is not an ASCII letter becomes a space, the
    text is lower-cased and split on whitespace, and stop words are
    dropped.

    Return the surviving words joined by single spaces.
    """
    # Stage 1: strip HTML markup using the "lxml" parser argument.
    plain_text = ut.strip_html(raw_qa_s, "lxml")

    # Stage 2: blank out everything except ASCII letters.
    alpha_text = re.sub("[^a-zA-Z]", " ", plain_text)

    # Stage 3: lower-case and tokenize on whitespace.
    tokens = alpha_text.lower().split()

    # Stage 4: English stop words plus the extra noise term 'th', held
    # in a set so each membership test is a hash lookup.
    noise = set(stopwords.words("english"))
    noise.add('th')

    # Stage 5: keep only non-stop words and re-join them.
    return " ".join(token for token in tokens if token not in noise)

示例#5

0

显示文件

文件： gcalc.py 项目： ShadowDev/CloudBot

def calc(inp):
    ".calc <term> -- Calculate <term> with Google Calc."
    # Query the search page and look for the <h2 class="r"> result element.
    html = http.get('http://www.google.com/search', q=inp)
    result_tag = BeautifulSoup(html).find('h2', {'class': 'r'})

    if result_tag is None:
        return "Could not calculate " + inp

    # Clean-up order matters: flatten lines, turn \xa0 into a comma,
    # trim and collapse whitespace -- all before decoding.
    raw = result_tag.renderContents()
    raw = ' '.join(raw.splitlines())
    raw = raw.replace("\xa0", ",")
    raw = re.sub(r'\s+', ' ', raw.strip())

    # Decode as UTF-8 (undecodable bytes are ignored), then strip markup.
    return misc.strip_html(raw.decode('utf-8', 'ignore'))

示例#6

0

显示文件

def write_df_to_otl(in_df, wdir, wfile, columns_l):
    """Write the full contents of selected data frame columns to an otl file.

    Open that file w/ Vim + VimOutliner for easy overview of all questions,
    and quick navigation.

    Args:
        in_df: Data frame to dump.
        wdir: Output directory prefix.  It is joined to ``wfile`` by plain
            string concatenation, so include any trailing path separator.
        wfile: Output file name.
        columns_l: Column names to write.  When empty/falsy, the default
            column list defined in this function is used.

    Returns:
        None.  For an empty data frame a warning is printed and nothing
        is written.

    The pandas option ``display.max_colwidth`` is changed while the text
    is produced and is always reset to ``MAX_COL_WID`` afterwards, even
    when an error occurs part-way through.
    """
    if in_df.empty:
        print('WARN: write*otl(): Input dataframe empty or not found.')
        return

    # -1=no limit, so long cells are not truncated by to_string().
    # NOTE(review): newer pandas releases deprecate -1 in favor of None;
    # confirm against the pandas version this project pins.
    pd.set_option('display.max_colwidth', -1)
    try:
        outfile = wdir + wfile
        save_prior_file(wdir, wfile)

        # Specify default output columns to use.
        if not columns_l:
            columns_l = [
                'Id', 'Title', 'Body', 'Score', 'HSTCount', 'HiScoreTerms',
                'OwnerUserId', 'ParentId'
            ]

        # Save o/p to a string and do not specify an output file in
        # calling to_string().
        # Use 'index=False' to prevent showing index in column 1.
        in_s = in_df[columns_l].to_string(header=False, index=False)
        print('#D-write1, len in_s: ', len(in_s))

        # Delete long strings of spaces at end of each line: replace the
        # trailing blanks w/ only the newline char, for all matches at once.
        out_s = re.sub('  +\n', '\n', in_s)

        # Convert html line breaks to newlines before stripping html.
        out_s = re.sub(r'<br>', '\n    ', out_s)
        out_s = re.sub(r'<br/>', '\n    ', out_s)

        # Clean the newlines in the string so each line has proper indent.
        out_s = ut.strip_html(out_s, "lxml")
        out_s = replace_line_breaks_for_otl(out_s)
        print('#D-write2, len out_s: ', len(out_s))

        # Replace empty lines w/ INDENT+##
        out_s = re.sub(r'\n\s*\n', r'\n        ##\n', out_s)
        print('#D-write4, len out_s: ', len(out_s))

        with open(outfile, 'w') as f:
            cf.logger.info('NOTE: Writing data to otl outfile: ' + outfile)
            f.write(out_s)
    finally:
        # Always put the display width back, on success or failure.
        pd.set_option('display.max_colwidth', MAX_COL_WID)
    return