Example #1
def parse_quotes(htmlitem):
    '''
    Parse quotes in an HTML item. Convert all i-tags into quote-tags.
    Delete all other tags.
    '''
    quoteMatch_with_comma = re.match('(?u)(.*?)<i.*?>(.{5,}?),(.{5,}?)</i>'\
                                     '(.*)', htmlitem)
    quoteMatch = re.match('(?u)(.*?)(<i.*?>.{5,}?</i>)(.*)', htmlitem)
    
    if quoteMatch_with_comma:
        before_q = BeautifulSoup(quoteMatch_with_comma.group(1)).get_text()
        quote1 = BeautifulSoup(quoteMatch_with_comma.group(2)).get_text()
        quote2 = BeautifulSoup(quoteMatch_with_comma.group(3)).get_text()
        
        if quote1.strip() != '' and quote2.strip() != '':
            parsedItem = before_q + '<quote>' + quote1 + '</quote>,'\
                         + '<quote>' + quote2 + '</quote>'\
                         + parse_quotes(quoteMatch_with_comma.group(4))
        else:
            return unicode(BeautifulSoup(htmlitem).get_text())
        return unicode(parsedItem)
    
    elif quoteMatch:
        quote = BeautifulSoup(quoteMatch.group(2)).get_text()
        if not 'siehe' in quote and not 'Siehe' in quote\
                and quote.strip() != '':
            parsedItem = BeautifulSoup(quoteMatch.group(1)).get_text()\
                         + '<quote>' + quote + '</quote>'\
                         + parse_quotes(quoteMatch.group(3))
            return unicode(parsedItem)
        else:
            return unicode(BeautifulSoup(htmlitem).get_text())
    else:
        return unicode(BeautifulSoup(htmlitem).get_text())
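A minimal, hypothetical Python 3 sketch of the same idea (not the author's code): a regex isolates each i-tag span, BeautifulSoup strips any leftover markup, and the match is rewrapped in quote-tags. The function name and sample input are invented.

import re
from bs4 import BeautifulSoup

def itag_to_quote(htmlitem):
    # Recursively rewrite <i>...</i> spans (5+ chars) as <quote>...</quote>.
    m = re.match(r'(?s)(.*?)<i.*?>(.{5,}?)</i>(.*)', htmlitem)
    if not m:
        return BeautifulSoup(htmlitem, 'html.parser').get_text()
    before = BeautifulSoup(m.group(1), 'html.parser').get_text()
    quote = BeautifulSoup(m.group(2), 'html.parser').get_text()
    return before + '<quote>' + quote + '</quote>' + itag_to_quote(m.group(3))

print(itag_to_quote('<p>Lemma <i>exempli gratia</i> usage</p>'))
# Lemma <quote>exempli gratia</quote> usage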
Example #2
 def crude_parsing(self):
     crude_list = []
     standard_fields = [
         "from:", "to:", "cc:", "bcc:", "mime-version:", "content-type:",
         "x-from:", "x-to:", "x-cc:", "content-transfer-encoding:",
         "x-bcc:", "x-filename", "subject:", "message-id:", "x-origin:"
     ]
     with open(self.origin_file) as f:
         for line in f:
             line = line.decode("utf-8", "ignore").encode("utf-8").lower()
             try:
                 line = BeautifulSoup(line, "html.parser").getText()
             except Exception as e:
                 line = ""
             line = line.lower()
             if line in ['\n', '\r\n']:
                 crude_list.append("content: " + line.strip())
             else:
                 content = False
                 for field in standard_fields:
                     if line.startswith(field):
                         content = True
                         crude_list.append(line.strip())
                 if not content:
                     if len(crude_list) > 0:
                         crude_list[len(crude_list) -
                                    1] += " " + line.strip()
                     else:
                         crude_list.append("content: " + line.strip())
     return crude_list
Example #3
def GetAllResInPage(tgtUrl):
    # Target URL
    r = requests.get(tgtUrl)

    # 1st arg: the content to parse; 2nd arg: the parser (what to base parsing on; in this case HTML)
    soup = BeautifulSoup(r.content, "html.parser")

    # Gather the elements with specific classes under the dt and dd tags.
    resheads = soup.find_all("dt", class_="st-bbs_reshead")
    resbodys = soup.find_all("dd", class_="st-bbs_resbody")

    formattedHead = []
    formattedBody = []
    resCount = 0

    # Extract the formatted response headers
    for rhead in resheads:
        h = rhead

        # Stringify the extracted dt element and run it through BeautifulSoup again so the tags inside the dt can be extracted the same way
        hObj = BeautifulSoup(str(h), 'html.parser')

        # Get each tag (and class) inside the dt tag
        bbs_resNo   = hObj.find('span', class_='st-bbs_resNo').getText()
        bbs_name    = hObj.find('span', class_='st-bbs_name').getText()
        bbs_resInfo = hObj.find('div', class_='st-bbs_resInfo').getText()
        # The resInfo text needs cleanup: trim it and replace the newline characters
        bbs_resInfo = bbs_resInfo.strip()
        bbs_resInfo = bbs_resInfo.strip('\n')
        bbs_resInfo = bbs_resInfo.replace('\n', ' ')

        # The text contains runs of whitespace, so collapse multiple spaces into one with a regex
        pattern = r' +'
        bbs_resInfo = re.sub(pattern, ' ', bbs_resInfo)
        # print(bbs_resNo, bbs_name, bbs_resInfo)
        # Any approach would do, but here the extracted strings are joined into one space-separated line (h) and appended to the formatted header list
        resHeaders = [bbs_resNo, bbs_name, bbs_resInfo]
        h = ' '.join(resHeaders)

        formattedHead.append(h)

    # Extract the formatted response bodies
    for rbody in resbodys:
        # Cast the response body to str, convert <br> tags into newline characters via string replacement, then turn it back into a bs4 object
        # Without this, the line breaks that existed on the web page all disappear and every response collapses into a single line
        b = str(rbody)
        b = b.replace("<br>", "\n")
        b = b.replace("<br/>", "\n")
        b = BeautifulSoup(b, "html.parser").getText()

        b = b.strip()       # Trim surrounding whitespace
        b = b.strip('\n')   # Trim surrounding newlines
        formattedBody.append(b)
        # Counting either heads or bodies works; this total is the number of responses on this page (usually 30, but possibly fewer on the last page)
        resCount += 1

    return resCount, formattedHead, formattedBody
Example #4
 def chapter(self):
     chapter = BeautifulSoup(self.__kontenHalaman,features='html.parser').title.text.lower()
     chapter = re.sub('.+?\s[chapter]{6,7} ([\d\-\_\s\.]+).+','\g<1>',chapter)
     chapter = re.sub('-','.',chapter)
     if(chapter.find('.') < 0):
         chapter = re.sub('\s+','.',chapter.strip())
     else:
         chapter = re.sub('\s','',chapter.strip())
     return chapter.strip('.')
Example #5
def clean(texts):
    processed_texts = []
    for text in texts:
        text = BeautifulSoup(text, "html.parser")
        text = text.get_text().encode('ascii', 'ignore').decode('utf-8')
        text = re.sub(r"\\", "", text)
        text = re.sub(r"\'", "", text)
        text = re.sub(r"\"", "", text)
        text = text.strip().lower()
        processed_texts.append(text)
    return processed_texts
Example #6
 def description(self) -> [str]:
     """Return the description as a list of str 
     where each str is a bullet point.
     """
     html_text = self._data['description'].split('<li>')
     parsed_text = []
     for html_line in html_text:
         line = BeautifulSoup(html_line, features='html.parser').get_text()
         line = unicodedata.normalize('NFKD', line)
         if line.strip() != '':
             parsed_text.append(line.strip())
     return parsed_text
Example #7
def upload():
    count = 0
    with open("v1/data.txt", "r") as fr:
        lines = fr.readlines()

    while len(lines) > 0:
        count += 1
        curr_line = lines.pop(0)
        line_buf = []
        while curr_line != '\n' or len(line_buf) < 5:
            line_buf.append(curr_line)
            if len(lines) > 0:
                curr_line = lines.pop(0)
            else:
                break

        #print(line_buf[0])
        #print(line_buf[-1])
        #print()
        """
        if len(line_buf[0]) > max_name[0]:
            max_name[0] = len(line_buf[0])
            max_name[1] = line_buf[0]
        
        for rev in line_buf[-3:]:
            if len(rev) > max_rev[0]:
                max_rev[0] = len(rev)
                max_rev[1] = rev
        """
        #print(line_buf)
        desc = line_buf[1]
        for piece in line_buf[2:-3]:
            desc = desc + piece
        desc = BeautifulSoup(desc.strip()).text
        """
        if len(desc) > max_desc[0]:
            max_desc[0] = len(desc)
            max_desc[1] = desc
        """
        init_piece = line_buf[0].split("#")
        Product(name=BeautifulSoup(init_piece[0].strip()).text,
                amazon_id=init_piece[1].strip(),
                description=desc.strip(),
                review1=line_buf[-3].strip(),
                review2=line_buf[-2].strip(),
                review3=line_buf[-1].strip()).save()

    print(count)
Example #8
async def get_page_title(url: str, allow_hostname=True, allow_path: bool = False, allow_filename: bool = True) \
        -> Optional[str]:
    r = None
    # noinspection PyBroadException
    try:
        r = await get(url=url,
                      timeout=2,
                      decode=True,
                      intended_content_type='text/html')
        if r.status != 200 or not r.content:
            raise ValueError('not an HTML page')
        if len(r.content) <= 27:  # len of `<html><head><title></title>`
            raise ValueError('invalid HTML')
        title = BeautifulSoup(r.content, 'lxml').title.text
        return title.strip()
    except Exception:
        content_disposition = r.headers.get(
            'Content-Disposition') if r else None
        filename_match = contentDispositionFilenameParser(
            content_disposition) if content_disposition else None
        if filename_match and allow_filename:
            return filename_match.group()
        url_parsed = urlparse(url)
        if allow_path:
            path = url_parsed.path
            return path.rsplit('/', 1)[-1] if path else None
        if allow_hostname:
            return url_parsed.hostname
Example #9
def table_content_list(output_file):
    html = mammoth.convert_to_html(output_file).value
    soup = BeautifulSoup(html, "html.parser")
    # print("soup------->",soup)
    table_content_list_all = []
    for tables in soup.find_all('table'):
        for row in tables.find_all('tr'):
            column_list = []
            for column in row.find_all('td'):
                #             column_list.append(str(column).replace('<td>','').replace('</td>','').replace('</p>','').replace('<p>','').replace('<td colspan="2">','').strip())
                raw_html = str(column).replace(
                    '<strong>',
                    'start_bold').replace('</strong>',
                                          'end_bold').replace('</p>',
                                                              '\n').strip()
                cleantext = BeautifulSoup(raw_html, "lxml").text
                cleantext = cleantext.replace('start_bold', '<b>').replace(
                    'end_bold', '</b>')
                cleantext = cleantext.replace('<', '&lt;').replace(
                    '>', '&gt;').replace('\n', '')
                column_list.append(cleantext.strip())
            column_list = [i for i in column_list if i]
            #         print(column_list)
            table_content_list_all.append(column_list)

    table_content_list_all = [x for x in table_content_list_all if x != []]
    return table_content_list_all
Example #10
def text(url):
    if 'http' not in url:
        url = 'http://' + url
    page = get(url).text
    doc = Document(page).summary()
    text = BeautifulSoup(doc).get_text()
    return text.strip()
Example #11
 def clean_comment(self, description):
     try:
         sent = BeautifulSoup(description, "lxml").get_text()
         return sent.strip().strip("/*").strip("//").strip("*").strip()
     except Exception:
         traceback.print_exc()
         return ""
Example #12
def getNews():
    try:
        # List of RSS feeds that we will fetch and combine
        newsurls = {
            'dailynews': 'http://www.dailymirror.lk/RSS_Feeds/business-main'
        }

        # Iterate over the feed urls
        for key, url in newsurls.items():
            all_headlines.extend(getHeadlines(url))

        # Iterate over the allheadlines list and print each headline
        for hl in all_headlines:
            try:
                html_text = urllib.urlopen(hl).read()
                parsed_text = BeautifulSoup(html_text, "html.parser")
                desc = parsed_text.findAll(attrs={"class": "row inner-text"})
                #print desc
                content_news = desc[0].encode('utf-8')
                para = str(content_news).split("<p>")
                final_string = ""
                for a in para:
                    if "img" in a or "iframe" in a or "!--" in a:
                        pass
                    else:
                        final_string = final_string + a
                append_string = BeautifulSoup(final_string, "html.parser").text
                printLog('info', "News item: " + append_string.strip())
                cleantext.append(append_string)
            except Exception as e:
                print e
        printLog('output', "Parsed text: " + str(cleantext))
        return cleantext
    except Exception as e:
        printLog('error', e)
Example #13
 def _parser(self, content, parse, add_new_line=False):
     content = str(content).replace('<br/>', '\n').replace(parse, '')
     content = BeautifulSoup(content, 'html.parser').text
     content = '\n'.join([s.strip() for s in content.strip().split('\n')])
     if add_new_line:
         content += '\n'
     return content
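A hedged, standalone variant of the helper above, just to show the <br/>-to-newline trick on a concrete input; the sample markup, the 'Input:' marker, and the function name parse_block are invented.

from bs4 import BeautifulSoup

def parse_block(content, parse, add_new_line=False):
    # Turn <br/> into newlines before BeautifulSoup flattens the markup,
    # drop the marker given in `parse`, then trim every remaining line.
    content = str(content).replace('<br/>', '\n').replace(parse, '')
    content = BeautifulSoup(content, 'html.parser').text
    content = '\n'.join(s.strip() for s in content.strip().split('\n'))
    if add_new_line:
        content += '\n'
    return content

print(parse_block('<div>Input:<br/>1 2 3<br/>4 5</div>', 'Input:'))
# 1 2 3
# 4 5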
Example #14
 def process_line(self, line):
     line = BeautifulSoup(line).text
     line = line.replace(',', ' , ')
     line = line.replace('.', ' . ')
     line = line.strip().split()
     line = [word.strip() for word in line]
     return line
Example #15
def text_clean(text, url_removal, tag_removal, stem_stop_punc, punc_removal):
    """
    cleaning a text
    :param text: input text of any length
    :param url_removal: flag for removing url from text
    :param tag_removal: flag for removing tags from text
    :param stem_stop_punc: flag for removing stop words, stemming tokens, and removing punctuations
    :param punc_removal: flag for removing punctuations
    :return: cleaned text
    """

    # removing urls from text
    if url_removal is True:
        text = remove_url(text)

    # removing HTML tags
    if tag_removal is True:
        text = BeautifulSoup(text, "lxml").text

    # stop word removal, stemming the tokens, and punctuations removal
    if stem_stop_punc is True:
        text = stop_word_removal(text, 1, punc_removal)

    # removing new line characters
    text = text.replace('\n', ' ')

    # filtering non-printable characters
    text = ''.join([i if ord(i) < 128 else ' ' for i in text])

    # removing more than one space
    text = ' '.join(text.split())

    return text.strip()
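A small standalone illustration of the last three cleanup steps above (newline removal, the ord(i) < 128 filter, and whitespace collapsing); the sample string is invented.

text = "Caf\u00e9\nmenu  \u2013  soup   and\nbread"
text = text.replace('\n', ' ')                              # drop newlines
text = ''.join([i if ord(i) < 128 else ' ' for i in text])  # non-ASCII -> space
text = ' '.join(text.split())                               # collapse whitespace
print(text)  # Caf menu soup and bread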
Example #16
def save_html(filename_without_path, page_source, message, prettify=True):
    """Save html dump of the current page. Path is determined by LOCAL_SERVER setting

    :param str filename_without_path: The base filename without the absolute path
    :param str page_source: The html source to save
    :param str message: The message to display
    :param bool prettify: if True, prettifies html
    """

    if getattr(settings, 'TAKE_TEST_HTML', False):
        if settings.VERBOSE_OCOM_TEST_CLASSES:
            console(message)

        # check extension
        extension = os.path.splitext(filename_without_path)[1].lower()

        if extension != '.html':
            raise KeyError("Unknown extension for file: {}".format(filename_without_path))

        save_path = html_path()

        if not os.path.exists(save_path):
            pathlib.Path(save_path).mkdir(parents=True, exist_ok=True)

        with open(html_path() + filename_without_path, 'w') as html_file:
            the_html = BeautifulSoup(str(page_source), 'html.parser').prettify() if prettify else str(page_source)
            # noinspection PyArgumentEqualDefault
            html_file.write(the_html.strip())
Example #17
def clean_str(raw: Optional[str],
              strip_trailing_period: bool = False) -> Optional[str]:
    """
    Takes a str and "cleans" it. Intended to be usable with short strings
    (names, titles) in any language. See scrub_text(), which extends this
    function for paragraph length and longer text fields.
    """
    if not raw:
        return None

    text = ftfy.fix_text(raw)

    # remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # TODO: for performance, compile these as globals?
    # replaces whitespace with single space
    text = re.sub(r"\s+", " ", text).strip()

    # TODO: shouldn't HTML be parsing these out?
    text = text.replace("<em>", "").replace("</em>", "")

    text = text.strip()

    if strip_trailing_period and text.endswith("."):
        text = text[:-1]

    if text.lower() in UNWANTED_SHORT_STRINGS:
        return None

    if not text:
        return None
    return text
Example #18
def text(url):
    if 'http' not in url:
        url = 'http://' + url
    page = get(url).text
    doc = Document(page).summary()
    text = BeautifulSoup(doc).get_text()
    return text.strip()
Example #19
    def parse_content(self, response):
        item = PoemsSpiderItem()

        title = response.css('div.sons h1').xpath('text()').extract()[0]
        print(title)

        item['title'] = response.css('div.sons h1').xpath('text()').extract()[0]

        try:
            author = response.css('div.cont p.source a').xpath('text()').extract()[1]
        except:
            author = ''

        content = response.xpath('//div[@class="contson"]')[0].extract()

        "过滤掉html标签"
        content = BeautifulSoup(content, 'xml').get_text()
        "过滤掉空格"
        content = content.strip().replace("\n", "").replace(' ', '')
        "去掉空格里的内容"
        content = re.sub('\([^)]*\)', '', content)
        content = re.sub('\([^)]*\)', '', content)

        # Insert line breaks
        content = re.sub("。", "。\n", content)
        content = content.rstrip("\n")

        item['author'] = author
        item['content'] = content

        yield item
Example #20
  def _clean_str(self, string):
    """
    desc: This function cleans a string
          adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    args:
      string: the string to be cleaned
    returns:
      a cleaned string
    """

    string = BeautifulSoup(string, "lxml").text
    string = re.sub(r"[^A-Za-z0-9(),!?\"\`]", " ", string)
    string = re.sub(r"\"s", " \"s", string)
    string = re.sub(r"\"ve", " \"ve", string)
    string = re.sub(r"n\"t", " n\"t", string)
    string = re.sub(r"\"re", " \"re", string)
    string = re.sub(r"\"d", " \"d", string)
    string = re.sub(r"\"ll", " \"ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower().split(" ")
Example #21
def reformat(directory, media_id, file_name, text, ext, date, username, format_path, date_format, text_length, maximum_length):
    media_id = "" if media_id is None else str(media_id)
    has_text = False
    if "{text}" in format_path:
        has_text = True
    path = format_path.replace("{username}", username)
    text = BeautifulSoup(text, 'lxml').get_text().replace(
        "\n", " ").strip()
    SAFE_PTN = '[^0-9a-zA-Z-_.()]+'
    filtered_text = re.sub(SAFE_PTN, ' ',  text.strip()
                           ).strip().replace(' ', '_')[:text_length]
    path = path.replace("{text}", filtered_text)
    date = date.strftime(date_format)
    path = path.replace("{date}", date)
    path = path.replace("{id}", media_id)
    path = path.replace("{file_name}", file_name)
    path = path.replace("{ext}", ext)
    directory2 = directory + path

    if has_text:
        count_string = len(path)
        text_count = len(filtered_text)
        if count_string > maximum_length:
            text_limit = count_string - text_count
            path = path.replace(
                filtered_text, filtered_text[:-text_limit])
            directory2 = directory + path
    return directory2
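A hedged sketch of the placeholder substitution used above, with an invented format_path template and field values; only the {username}/{date}/{id}/{file_name}/{ext} tokens come from the snippet.

import datetime

format_path = "/{username}/{date}/{id}_{file_name}.{ext}"   # illustrative template
path = (format_path
        .replace("{username}", "alice")
        .replace("{date}", datetime.date(2020, 1, 31).strftime("%Y-%m-%d"))
        .replace("{id}", "12345")
        .replace("{file_name}", "photo")
        .replace("{ext}", "jpg"))
print(path)  # /alice/2020-01-31/12345_photo.jpg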
Example #22
def reviews_to_words(raw_reviews, skip_stop_words):
    stop_words = set(nltk.corpus.stopwords.words('english'))

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    num_reviews = raw_reviews.size
    letter_only_re = re.compile('[^a-zA-Z]')
    final_words_list = []
    print("======== tokenizing review into words ========")
    for i in range(num_reviews):
        print('encoding: %.1f%%\r' % (float(i) / raw_reviews.size * 100))

        review = raw_reviews[i]
        review_text = BeautifulSoup(review).get_text()
        sentences_list = tokenizer.tokenize(review_text.strip())

        words = []
        for j in range(len(sentences_list)):
            sents = sentences_list[j]
            sents = letter_only_re.sub(" ", sents)

            ws = sents.split(' ')
            ws = filter(lambda x: len(x) > 0, ws)
            if skip_stop_words:
                meaningful_words = [w for w in ws if not w in stop_words]
                words += meaningful_words
            else:
                words += ws

        final_words_list.append(words)

    return final_words_list
Example #23
 def _html_to_text(self, html):
     # Hack to prevent Beautiful Soup from collapsing space-keeping tags
     # until no whitespace remains at all
     html = re.sub("<(br|p|li)", " \\g<0>", html, flags=re.IGNORECASE)
     text = BeautifulSoup(html, "html.parser").get_text()
     # Idea from http://stackoverflow.com/a/1546251
     return " ".join(text.strip().split())
Example #24
def clean_text(text):
    try:
        text = re.sub(r"http\S+", "", text)
    except:
        print(text)
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", text)
    text = re.sub(r"\'s", " \'s", text)
    text = re.sub(r"\'ve", " \'ve", text)
    text = re.sub(r"n\'t", " n\'t", text)
    text = re.sub(r"\'re", " \'re", text)
    text = re.sub(r"\'d", " \'d", text)
    text = re.sub(r"\'ll", " \'ll", text)
    text = re.sub(r",", " , ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\(", " \( ", text)
    text = re.sub(r"\)", " \) ", text)
    text = re.sub(r"\?", " \? ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = BeautifulSoup(text).text  # HTML decoding
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(
        ' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub(
        '', text)  # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", text).split())
    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
    text = emoji.demojize(text)
    text = ' '.join(word for word in text.split()
                    if word not in STOPWORDS)  # delete stopwors from text
    text = text.replace(":", " ")
    text = ' '.join(text.split())
    return text.strip().lower()
Example #25
def get_proxy():
    proxys = BeautifulSoup(
        requests.get(
            "http://qsrdk.daili666api.com/ip/?tid=" + keyinfo["tid"] +
            "&num=1").text, "lxml").p.contents[0]
    print(proxys)
    return proxys.strip()
Example #26
def clean_str(review_docs, method=2):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    output_docs = []
    if(method == 1):
        for string in review_docs:
            string = BeautifulSoup(string, "lxml").get_text()
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            string = string.strip().lower()
            string = string.split(" ")
            output_docs.append(string)
    elif(method==2):
        for string in review_docs:
            words = gensim.utils.to_unicode(string).split()
            output_docs.append(words)
    return output_docs
Example #27
def clean_str(review_docs, method=2):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    output_docs = []
    if (method == 1):
        for string in review_docs:
            string = BeautifulSoup(string, "lxml").get_text()
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            string = string.strip().lower()
            string = string.split(" ")
            output_docs.append(string)
    elif (method == 2):
        for string in review_docs:
            words = gensim.utils.to_unicode(string).split()
            output_docs.append(words)
    return output_docs
Example #28
 def _html_to_text(self, html):
     # Hack to prevent Beautiful Soup from collapsing space-keeping tags
     # until no whitespace remains at all
     html = re.sub("<(br|p|li)", " \\g<0>", html, flags=re.IGNORECASE)
     text = BeautifulSoup(html, "html.parser").get_text()
     # Idea from http://stackoverflow.com/a/1546251
     return " ".join(text.strip().split())
Example #29
def preprocessdata(tweets):
    tweets = tweets.lower()
    # ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",tweets).split())
    # Converting to URL.
    tweets = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^/s]+))', '',
                    tweets)
    # Removing repeating letters; more than 2.
    tweets = re.compile(r'(.)\1{2,}', re.IGNORECASE).sub(r'\1', tweets)
    # Remove Username.
    tweets = re.sub('@[^\s]+', ' ', tweets)

    tweets = BeautifulSoup(tweets, features='lxml').get_text()

    # Remove Punctuation.
    tweets = re.sub('[^\w\s]', "", tweets)
    # remove '#' sign.
    tweets = re.sub(r'#([^\s]+)', r'\1', tweets)
    # Make Multiple spaces into a single space.
    tweets = re.sub('[\s]+', ' ', tweets)
    tweets = re.sub('<.*?>', " ", tweets)
    # remove '&' tags.
    tweets = re.sub('&[\s]+', ' ', tweets)
    tweets = re.sub(r'[^a-zA-Z\s]', '', tweets, flags=re.I | re.A)

    tweets = tweets.strip()
    return tweets
Example #30
def txt2words(txt,
              lower=True,
              is_html=False,
              remove_none_english_chars=True,
              remove_stop_words=True):
    """
    Split text into words list
    :param txt: the input text
    :param lower: if True, lowercase the text.
    :param is_html: if True, remove HTML tags using BeautifulSoup
    :param remove_none_english_chars: if True, remove non-English chars from the text
    :param remove_stop_words: if True, remove stop words from the text
    :return: word list created from the input text according to the input parameters.
    :rtype: list
    """
    if is_html:
        txt = BeautifulSoup(txt).get_text()
    if lower:
        txt = txt.lower()
    if remove_none_english_chars:
        txt = re.sub("[^a-zA-Z]", " ", txt)

    words = TrainSentences.RE_WIHTE_SPACES.split(txt.strip().lower())
    if remove_stop_words:
        #remove stop words from text
        words = [w for w in words if w not in TrainSentences.STOP_WORDS]
    return words
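The function above relies on TrainSentences.RE_WIHTE_SPACES and TrainSentences.STOP_WORDS, which are not shown; below is a guessed minimal stand-in plus a usage check, assuming the txt2words definition above is in scope.

import re

class TrainSentences:
    RE_WIHTE_SPACES = re.compile(r"\s+")          # assumed definition
    STOP_WORDS = {"a", "an", "and", "of", "the"}  # assumed definition

print(txt2words("<p>The Cat &amp; the Hat!</p>", is_html=True))
# ['cat', 'hat']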
Example #31
def preprocessing_french(x):
    x = BeautifulSoup(x)
    x = EmailReplyParser.parse_reply(x.get_text())
    x = re.sub(r'<.*?>', '', x)
    x = x.replace("\n", " ").strip()
    x = re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', repl='', string=x)
    x = x.replace("\n", " ").strip()
    x = x.strip()
    x = re.sub(r"(^|\W)\d+", "", x)
    x = x.lower()

    stopwords = {
        'merci', 'de', 'nous', 'aider', 'au', 'plus', 'vite', 'bonjour', 'la',
        'le', 'en', 'message', 'cordialement', 'logitech', 'cher', 'mon',
        'date', 'je', 'récemment', 'salut', 'produit', 'en série', 'nombre',
        'achat', 'soutien', 'http', 'com', 'vous', 'logitech', 'www', 'https',
        'logi', 'service à la clientèle', 'contact', 'termes', 'passerelle',
        'newark', 'usa', 'logo', 'care', 'ca', 'footer', 'use', 'customer',
        'owned', 'us', 'survey', 'americas', 'copyright', 'headquarters',
        'owners', 'number', 'respective', 'the', 'rights', 'trademarks',
        'reserved', 'property', 'dear', 'regards', 'thanks', 'mail', 'email',
        'date', 'like', 'get', 'one', 'set', 'thank', 'also', 'two', 'see',
        'able', 'could', 'since', 'last', 'know', 'still', 'got', 'pm', 'p',
        'puisque', 'operating', 'system', 'platform', 'ce', 'mr', 'de', 'lfcm',
        'sy', 'm', 'kh', 'w', 'ks', 'hs', 'afternoon', 'morning', 'regards',
        'thx',
        'thanks', 'fri', 'mon', 'tue', 'wed', 'thu', 'sat', 'sun', 'jan',
        'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'sep', 'oct', 'nov', 'dec'
    }

    x = x.split()
    x = [word for word in x if word.lower() not in stopwords]
    x = ' '.join(x)
    return x
Example #32
class MovieReview(object):
	def __init__(self, mreview):
		self.mreview = mreview
		self.mreview_clean = None
		self.mreview_word_list = []
		self.mreview_sentence_list = []

	def clean_review(self):
		# function to clean the review by stripping html from review text body
		self.mreview_clean = BeautifulSoup(self.mreview).get_text()

	def remove_punctuation_and_nums(self):
		self.mreview_clean = re.sub("[^a-zA-Z]", " ", self.mreview_clean)

	def split_review_into_words(self):
		# function to split the review text to list of words
		self.mreview_word_list = self.mreview_clean.lower().split()

	def remove_stop_words(self):
		self.mreview_word_list = [word for word in self.mreview_word_list if not word in set(stopwords.words("english"))]
		self.mreview_clean = " ".join(self.mreview_word_list)

	def split_review_into_sentences(self):
		# function to split review into list of sentences
		# where each setence is a list of words
		extracted_sentences = TOKENIZER.tokenize(self.mreview_clean.strip())
		for extracted_sentence in extracted_sentences:
			if len(extracted_sentence) > 0:
				# extracted_sentence needs to be operated on if stopword or punctuation
				# removal is required eventually(not required for word2Vec)
				self.mreview_sentence_list.append(extracted_sentence.lower().split())
Example #33
def redis_write():
    redis_cli = getRedisClient(db=15)

    fw = open("/hdd/crawl_result/daypop.json", "w")
    for key in redis_cli.scan_iter():
        label = key.split(":")[0]
        value = redis_cli.get(key)
        d = json.loads(value)
        text = BeautifulSoup(d['html'], 'html.parser').get_text()
        # text = re.sub("\n+","\n",text)
        text = '\n'.join(
            [t.strip() for t in text.split("\n") if t.strip() != ''])
        if text.strip() == "":
            continue
        print("*" * 50 + d['article_id'] + '*' * 50 + d['url'] + "*" * 50)
        print(text)

        save_str = json.dumps(dict(id=d['article_id'],
                                   url=unquote(d['url']),
                                   title=d['title'],
                                   daypop_label=label,
                                   text=text),
                              ensure_ascii=False)

        fw.write(save_str + '\n')
Example #34
def get_blog():
    url = "http://127.0.0.1:5000/blog"
    f = urllib.request.urlopen(url)
    the_html = f.read()
    quotes = BeautifulSoup(the_html, "lxml").text
    quotes = quotes.replace("[", "")
    quotes = quotes.replace("]", "")
    quotes = quotes.replace("\n", "")
    quotes = quotes.strip()
    quotes = quotes.split(',')
    useful_var = 0
    useful_var2 = 0
    counter = 0
    obj_array = []
    for stuff in quotes:
        if counter == 0:
            useful_var = stuff
            useful_var = useful_var.replace('"', "")
            counter += 1
        if counter == 1:
            useful_var2 = stuff
            useful_var2 = useful_var2.replace('"', "")
            counter += 1
        else:
            item = Posts(useful_var, useful_var2, stuff)
            obj_array.append(item)
            useful_var = 0
            counter = 0
    print(obj_array)
    return render_template("RandomPosts.html", quotes=obj_array)
Example #35
def preProcess(comment):  #To Do
    # https://stackoverflow.com/a/47091490/4084039
    def decontracted(phrase):
        # specific
        phrase = re.sub(r"won't", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        return phrase

    sentance = re.sub(r"http\S+", "", comment)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    sentance = ' '.join(e.lower() for e in sentance.split()
                        if e.lower() not in config.STOP_WORDS)
    return sentance.strip()
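A standalone copy of the inner decontracted() helper above, with a quick check on an invented sentence to show what the substitutions do.

import re

def decontracted(phrase):
    # expand common English contractions
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

print(decontracted("I won't say it's broken, but we can't ship what we've got."))
# I will not say it is broken, but we can not ship what we have got.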
Example #36
def tweet_to_sentences(tweet):
    tmpList = []
    raw_tweet = BeautifulSoup(tweet).get_text()
    raw_sentences = tokenizer.tokenize(raw_tweet.strip())

    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            tmpList.append(raw_sentence)
    return tmpList
Example #37
def Detail_left_fc(htmltext):
    Detail_left=str(htmltext.find('div',{"class":"detail_left"}).contents)
    a=Detail_left.split('<h4>')
    Description=a[1].split('</h4>')
    Description=Description[1]
    Description=BeautifulSoup(Description).text
    Description=Description.strip(', ')
    Description=suittext(Description)
    return Description
Example #38
 def connectionLost(self, reason):
     title = BeautifulSoup(self.data).title
     
     if title:
         title = title.string
     else:
         title = "No title provided."
         
     self.finished.callback("%s (%s)" % (title.strip(), self.url))
Example #39
        def get_text_from_td(td):
            """If we have html in the td then extract the text portion"""
            # just so we don't waste time parsing non html
            if "<" not in td:
                ret = td
            else:
                ret = BeautifulSoup(td).text

            # We change the td into ascii as a way to remove characters
            # such as \xa0 (non break space) which mess with the ordering
            # comparisons
            ret = ret.strip().encode('ascii',
                                     errors='replace').decode('ascii').replace('?', ' ')
            # Case where the td is empty
            if len(ret):
                return ret
            else:
                return "0"
Example #40
def get_restaurants(url):
    try:
        urls = get_page_urls(url)
        for url in urls:
            data = get_text_from_url(url)
            search_div = BeautifulSoup(str(data)).find('div', class_='search-results-content')
            uls = BeautifulSoup(str(search_div)).findAll('ul', class_='ylist ylist-bordered search-results')
            for restaurant in BeautifulSoup(str(uls[1])).findAll('li', class_='regular-search-result'):
                main_attrs = BeautifulSoup(str(restaurant)).find('div', class_='main-attributes')
    
                rating = BeautifulSoup(str(main_attrs)).find('div', class_='rating-large')
                rating_data = str(BeautifulSoup(str(rating)).find('i').attrs['title'])
                rating_data = rating_data.replace('star rating', '')
                
                review_count = str(BeautifulSoup(str(main_attrs)).find('span', class_='review-count rating-qualifier').text.strip())
                review_count = review_count.replace(' reviews', '')
    
                sub_url = BeautifulSoup(str(main_attrs)).find('a').attrs['href']
                url = 'http://www.yelp.com' + sub_url
    
                category_data = BeautifulSoup(str(main_attrs)).find('div', class_='price-category')
                category_str_list = BeautifulSoup(str(category_data)).findAll('span', class_='category-str-list')
                categories = ''
                for a in BeautifulSoup(str(category_str_list)).findAll('a'):
                    categories = categories +  a.text.strip() + ','
    
                expensive_level = BeautifulSoup(str(category_data)).find('span', 'business-attribute price-range').text
                
    
                h3 = BeautifulSoup(str(restaurant)).find('h3', class_='search-result-title')
                h3_a = BeautifulSoup(str(h3)).find('a').text
                name = h3_a.strip()
    
                sec_attrs = BeautifulSoup(str(restaurant)).find('div', class_='secondary-attributes')
                address = BeautifulSoup(str(sec_attrs)).find('address')
                if '<br/>' in str(address):
                    address = str(address).replace('<br/>', ' ')
                address = BeautifulSoup(str(address)).find('address').text.strip()
    
                city = get_city_from_address(address)
    
                if not str(city).lower() in address.lower():
                    print 'Invalid city detected'
                RestaurantModel.objects.create(
                    name=name,
                    expensivelevel=expensive_level,
                    city=city,
                    current_rating=float(rating_data),
                    url=url,
                    category=categories,
                    address=address,
                    reviewcount=review_count
                        )
        set_db_status(False)
    except Exception, e:
        print str(e) + 'get restaurants'
        set_db_status(False)
Example #41
def parse_quotes(htmlitem):
    """
    Parse quotes in an HTML item. Convert all i-tags into quote-tags.
    Delete all other tags.
    """
    quoteMatch_with_comma = re.match("(?u)(.*?)<i.*?>(.{5,}?),(.{5,}?)</i>" "(.*)", htmlitem)
    quoteMatch = re.match("(?u)(.*?)(<i.*?>.{5,}?</i>)(.*)", htmlitem)

    if quoteMatch_with_comma:
        before_q = BeautifulSoup(quoteMatch_with_comma.group(1)).get_text()
        quote1 = BeautifulSoup(quoteMatch_with_comma.group(2)).get_text()
        quote2 = BeautifulSoup(quoteMatch_with_comma.group(3)).get_text()

        if quote1.strip() != "" and quote2.strip() != "":
            parsedItem = (
                before_q
                + "<quote>"
                + quote1
                + "</quote>,"
                + "<quote>"
                + quote2
                + "</quote>"
                + parse_quotes(quoteMatch_with_comma.group(4))
            )

        else:
            return unicode(BeautifulSoup(htmlitem).get_text())
        return unicode(parsedItem)

    elif quoteMatch:
        quote = BeautifulSoup(quoteMatch.group(2)).get_text()
        if not "siehe" in quote and not "Siehe" in quote and quote.strip() != "":
            parsedItem = (
                BeautifulSoup(quoteMatch.group(1)).get_text()
                + "<quote>"
                + quote
                + "</quote>"
                + parse_quotes(quoteMatch.group(3))
            )
            return unicode(parsedItem)
        else:
            return unicode(BeautifulSoup(htmlitem).get_text())
    else:
        return unicode(BeautifulSoup(htmlitem).get_text())
Example #42
def Description_awarded(htmltext):
    Detail_left=str(htmltext.find('div',{"class":"detail_left"}).contents)
    a=Detail_left.split('<h4>')
    Des=a[1].split('</h4>')
    Description=Des[1].split('<table class="additional_data">')
    Description=Description[0]
    Description=BeautifulSoup(Description).text
    Description=Description.strip(', ')
    Description=suittext(Description)
    return Description
Example #43
    def get_unsort_position(self, bm_body):
        array_bm_body = bm_body.split("\n")
        # print(array_bm_body)

        init = 0
        for html_line in array_bm_body:
            # print(init)
            html_line_text = BeautifulSoup(html_line, self.FLAG_BS_PARSER).text
            if html_line_text.strip() == self.unsorted_bookmarks_title:
                self.unsorted_bookmarks_line = init
            init += 1
Example #44
 def __login(self):
     if self.__session is None:
         print('Logging "{0}" into site'.format(self.__username))
         self.__session = requests.Session()
         payload = {'username': self.__username, 'password': self.__password, 'login-form-type': 'pwd'}
         r = self.__session.post(self.__loginUrl, payload, verify=False)
         data = BeautifulSoup(r.content).getText().strip()
         if data.strip() == "login_success":
             print("Login Succeeded")
         else:
             raise Exception("Login Failure")
Example #45
def normalize_tag(tag):
    """
    converts things like "-noise-" to "noise" and "- noise -" to "noise"
    """
    if tag.startswith("-"):
        tag = tag[1:]
    if tag.endswith("-"):
        tag = tag[:-1]

    # fix for HTML entities
    tag = BeautifulSoup(tag).prettify(formatter="html")
    tag = tag.strip().lower()
    return tag
Example #46
 def clean_sentence(self,sentence):
     if self.html_clean:
         sentence = BeautifulSoup(sentence).get_text()   #   removing html markup
     sentence = sentence.lower() #   everything to lowercase
     # sentence = ''.join(x for x in sentence if x.isalnum() or x==" ")
     for ch_rep in self.clean_list:
         sentence = re.sub(ch_rep[0],ch_rep[1],sentence)
     sentence = ' '.join(filter(lambda x:x not in self.stopwords_eng,sentence.split()))
     sentence = ' '.join(filter(lambda x:len(x) > 1,sentence.split()))
     sentence = sentence.strip(" ") # Remove possible extra spaces
     if self.split_words:
         sentence = sentence.split()
     return sentence
Example #47
def htmlToTxt(dirPath,nameFile):
    '''Convert an HTML file into a plain text file and
    write it with a '.txt' extension in the same directory
    '''
    print 'Converting '+ nameFile
    # extracting text from the html file
    html = urlopen(os.path.join(dirPath,nameFile))
    rawTxt = BeautifulSoup(html).get_text()
    #writing text file
    to_file = open(os.path.join(dirPath,nameFile) + '.txt','w')
    print>>to_file, rawTxt.strip()
    to_file.close()

    return rawTxt
Example #48
def getname():
	names=[]
	names.append("クロムクロ ブルーレイ")
	url="http://www.amazon.co.jp/b/ref=s9_acss_bw_fb_junglest_b4?_encoding=UTF8&node=4367309051&pf_rd_m=AN1VRQENFRJN5&pf_rd_s=merchandised-search-4&pf_rd_r=0EEMDS8WZ0YP48RWG59H&pf_rd_t=101&pf_rd_p=311392929&pf_rd_i=562020"
	r=requests.get(url)
	while r.status_code != requests.codes.ok:
		r=requests.get(url)
	content=r.content
	soup = BeautifulSoup(r.content, "html.parser")
	targets=soup.find_all("a",class_="acs-feature-header")
	for target in targets:
		name=BeautifulSoup(str(target),"html.parser").find("a").text
		names.append(name.strip().replace('\n','').replace('\t',''))
		# print(name)
	names.remove("逆転裁判~その「真実」、異議あり! ~")
	names.append("逆転裁判 その")
	return names
Example #49
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    review_text = BeautifulSoup(review).get_text()
    raw_sentences = tokenizer.tokenize(review_text.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
Example #50
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    '''
    Function to split a review into parsed sentences. Returns a
    list of sentences, where each sentence is a list of words
    '''
    # 0. Get rid of non-tokenizable characters
    review = BeautifulSoup(review, "lxml").get_text()
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, get a list of words
            letters_only = re.sub("[^a-zA-Z]", " ", raw_sentence)
            words = letters_only.lower().split()
            sentences.append(words)
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
Example #51
def getBaseballData():

	#Setting up a dynamic url

	baseball_url = 'http://dailybaseballdata.com/cgi-bin/getstats.pl?date=' + yesterday.strftime("%m%d") + '&out=csv'
	
	print(baseball_url)

	#Setting up the GET request to retrieve the HTML markup

	req = urllib.request.Request(baseball_url)
	response = urllib.request.urlopen(req)
	html = response.read()

	#Using BeautifulSoup to parse the markup for info, removing script and other unnecessary tags

	clean_html = BeautifulSoup(html)
	to_extract = clean_html.findAll('script')

	for item in to_extract:
		item.extract()

	clean_html = clean_html.get_text()
	clean_html = clean_html.strip() #strip removes all the whitespace
	clean_html = '\n'.join(clean_html.split('\n')[6:]) #Slice at the end removes the first 5 lines and then joining together by '\n'
	clean_html_with_date = clean_html.replace('\n',','+yesterday.strftime("%y-%m-%d")+'\n')
	clean_html_with_date = clean_html_with_date + ','+yesterday.strftime("%y-%m-%d")
	
	print("Fetching raw data from the website")
	print("...")
	print(clean_html_with_date)

	#print(clean_html)

	#I am creating a test .csv file and then writing to it using open(),write(),close()

	f = open("baseball_rawdata.csv", "w")
	f.write(clean_html_with_date)
	f.close()
Example #52
def get_newest_rss(self, url):
## Retrieve an RSS feed and get the newest item
## Then, nicely format the title and description, and add a shortened URL

    dom = xml.dom.minidom.parse(urllib.request.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(newest_news.getElementsByTagName('description')[0].childNodes[0].data)

    updated = dom.getElementsByTagName('pubDate')[0].childNodes[0].data
    updated = datetime.datetime.fromtimestamp(time.mktime(parsedate(updated)))
    ago = round((datetime.datetime.utcnow() - updated).seconds/60)



    links = description.findAll('a')
    for link in links:
        link.extract()
    links = description.findAll(color='#6f6f6f')
    for link in links:
        link.extract()

    title = title.strip()

    description = str(description)
    description = description.replace("\n", "")

    description = self.tools['remove_html_tags'](description)
    #description = description[0:len(description) - 9]
    description = description.strip()
    if description.rfind(".") != -1:
        description = description[0:description.rfind(".") + 1]

    link = self.tools['shorten_url'](newest_news.getElementsByTagName('link')[0].childNodes[0].data)

    description = "%s - %s [ %s ]" % (title, description, link)

    return description, updated, ago
Example #53
def txt2words(txt, lower=True, is_html=False, remove_none_english_chars=True, remove_stop_words=True):
    """
    Split text into words list
    :param txt: the input text
    :param lower: if True, lowercase the text.
    :param is_html: if True, remove HTML tags using BeautifulSoup
    :param remove_none_english_chars: if True, remove non-English chars from the text
    :param remove_stop_words: if True, remove stop words from the text
    :return: word list created from the input text according to the input parameters.
    :rtype: list
    """
    if is_html:
        txt = BeautifulSoup(txt).get_text()
    if lower:
        txt = txt.lower()
    if remove_none_english_chars:
        txt = re.sub("[^a-zA-Z]", " ", txt)

    words = TrainSentences.RE_WIHTE_SPACES.split(txt.strip().lower())
    if remove_stop_words:
        # remove stop words from text
        words = [w for w in words if w not in TrainSentences.STOP_WORDS]
    return words
Example #54
def google_news(self, e):
    query = urllib.parse.quote(e.input)
    url = ""
    if not query:
        url = "http://news.google.com/news?ned=us&topic=h&output=rss"
    else:
        url = "http://news.google.com/news?q=%s&output=rss" % query

    dom = xml.dom.minidom.parse(urllib.request.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(newest_news.getElementsByTagName('description')[0].childNodes[0].data)

    links = description.findAll('a')
    for link in links:
        link.extract()
    links = description.findAll(color='#6f6f6f')
    for link in links:
        link.extract()

    title = title.strip()

    description = str(description)
    description = description.replace("\n", "")

    description = self.tools['remove_html_tags'](description)
#    description = tools.decode_htmlentities(description)
    description = description[0:len(description) - 9]
    description = description.strip()
    if description.rfind(".") != -1:
        description = description[0:description.rfind(".") + 1]

    link = self.tools['shorten_url'](newest_news.getElementsByTagName('link')[0].childNodes[0].data)

    e.output = "%s - %s [ %s ]" % (title, description, link)

    return e
Example #55
    def strip_text(self, exclude_tags=[]):
        # exclude_patterns = list of element tag strings
        # to ignore (ie schemaLocation, etc)

        def _extract_tag(t):
            if not t:
                return
            return t.split('}')[-1]

        def _taggify(e):
            tags = [e.tag] + [m.tag for m in e.iterancestors()]
            tags.reverse()

            try:
                return [_extract_tag(t) for t in tags]
            except:
                return []

        for elem in self.parser.xml.iter():
            t = elem.text.strip() if elem.text else ''
            tags = _taggify(elem)

            if [e for e in exclude_tags if e in tags]:
                continue

            if t:
                if self.handle_html and (
                        (t.startswith('<') and t.endswith('>'))
                        or ('<' in t or '>' in t)):
                    t = self._un_htmlify(t)
                if t:
                    yield ('/'.join(tags), t)

            for k, v in elem.attrib.iteritems():
                if v.strip():
                    v = BeautifulSoup(v.strip())
                    yield ('/'.join(tags + ['@' + _extract_tag(k)]), v.text)
Example #56
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies[index], paraText, len(paraText), 
                                  paraText.count(' '), capsCount])
            df = df.append(rowValue, ignore_index=True)

for url in urls2:
    index = urls2.index(url)
    
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page)
    paraTitle = list(soup.find_all('h3'))
    
    for paragraph in paraTitle:
        para = str(paragraph.nextSibling.nextSibling)
        paraText = BeautifulSoup(para).get_text()
        paraText = paraText.strip()
        if (paraText != ""):
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies2[index], paraText, len(paraText), 
                                  paraText.count(' '), capsCount])
            df = df.append(rowValue, ignore_index=True)
        
for url in urls3:
    index = urls3.index(url)
    
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page)
    para = list(soup.find_all('li'))
    
    for paragraph in para:
        paraText = paragraph.get_text()
Example #57
 def post_paragraphs(self):
     post = BeautifulSoup(self.post, 'html.parser').text.encode('utf-8').strip()
     post = re.sub(r'(\n+\s*)+', '\n', post)
     post_list = post.strip().split('\n')
     return post_list
Example #58
 def cleaner(self, text):
     # Clean HTML and set to lowercase
     clean_ = BeautifulSoup(text, 'lxml').get_text().lower() 
     # Clear newlines
     clean_ = clean_.strip().replace("\n", " ").replace("\r", " ")
     return clean_
Example #59
	processed = set()

if args.file == "":
    f = sys.stdin
else:
    f = open(args.file)
if args.output == "":
    out = sys.stdout
else:
    out = open(args.output, 'w')

RE_WHITE_SPACES = re.compile("\s+")
if args.html:
    txt = BeautifulSoup(f, "html5lib").get_text()
    txt = re.sub(u"[^a-zA-Záàãéíóõúç]", " ", txt)
    tokens = RE_WHITE_SPACES.split(txt.strip())
else:
    tokens = f

for token in tokens:
    t = token.lower().strip()
    if args.code_page != 'none':
        t = unicode(t, args.code_page)
    if t not in stop:
        t = porter.stem(t)
        if len(t) > 1 and not t.isdigit():
            if t not in processed:
                if args.allow_duplicates:
                    processed[t] = 1
                else:
                    processed.add(t)
Example #60
File: demo.py Project: weihxa/bbs
#print html
soup = BeautifulSoup(str(html),'html.parser',from_encoding='utf-8')
#print soup
From = soup.title.get_text()
Div = soup.find_all('div',class_='carousel-caption')
# Clear the old data
sql="TRUNCATE `dj_bbs_news`;"
Mysql.MysqlHelper().In_sql(sql)

x = 1
for line in Div:
    soup2 = BeautifulSoup(str(line),'html.parser',from_encoding='utf-8')
    Url = soup2.find('a')['href']
    html2 = getHtml(soup2.find('a')['href'])
    Tltle = BeautifulSoup(html2,'html.parser',from_encoding='utf-8').find('h1',class_='ph').get_text()
    Img = soup.find('img',alt='%s' % Tltle.strip())['src']
    #print Img
    try:
        sql='''INSERT INTO `dj_bbs_news`(title,summary,url,favor_count,reply_count,create_date,image_urls) VALUES("%s",'%s','%s','0','0',NOW(),'news/%s.jpg');''' %(Tltle,From,Url,x)
        #print sql
        Mysql.MysqlHelper().In_sql(sql)
    except Exception,e:
        print e
    local = os.path.join('D:/test/bbs/static/news/','%s.jpg' % x)
    urllib.urlretrieve(Img,local)
    # Process the image
    Picture.timage("D:/test/bbs/static/news/%s.jpg" % x, 'D:/test/bbs/static/news/')
    print 'Finished processing %s.jpg' % x
    x+=1