Example #1
    @classmethod
    def create(cls, course_id, sender, to_option, subject, html_message, text_message=None, template_name=None, from_addr=None):
        """
        Create an instance of CourseEmail.

        The CourseEmail.save_now method makes sure the CourseEmail entry is committed.
        When called from any view that is wrapped by TransactionMiddleware,
        and thus in a "commit-on-success" transaction, an autocommit buried within here
        will cause any pending transaction to be committed by a successful
        save here.  Any future database operations will take place in a
        separate transaction.
        """
        # automatically generate the stripped version of the text from the HTML markup:
        if text_message is None:
            text_message = html_to_text(html_message)

        # perform some validation here:
        if to_option not in TO_OPTIONS:
            fmt = 'Course email being sent to unrecognized to_option: "{to_option}" for "{course}", subject "{subject}"'
            msg = fmt.format(to_option=to_option, course=course_id, subject=subject)
            raise ValueError(msg)

        # create the task, then save it immediately:
        course_email = cls(
            course_id=course_id,
            sender=sender,
            to_option=to_option,
            subject=subject,
            html_message=html_message,
            text_message=text_message,
            template_name=template_name,
            from_addr=from_addr,
        )
        course_email.save_now()

        return course_email
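
The docstring above relies on `save_now` to commit the row immediately, even when the caller runs under the old `TransactionMiddleware`. That method is not part of this excerpt; the sketch below shows one way it could look under the pre-Django-1.6 transaction API the docstring refers to, and is an illustrative assumption rather than the project's confirmed implementation.

from django.db import models, transaction

class CourseEmail(models.Model):
    # ... field definitions omitted ...

    @transaction.autocommit  # old Django API: run in autocommit mode, outside the managed transaction
    def save_now(self):
        """Write this CourseEmail immediately so a later failure cannot roll it back."""
        self.save()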
Example #2
    def crawler(self, url, title, depth):
        target = datetime.datetime.now() - datetime.timedelta(days=30)
        target = target.strftime('%Y/%m/%d %H:%M:%S')
        query = "SELECT * FROM documents WHERE last_index IS NOT NULL AND last_index > %s"
        self.server, self.conn = mysql_connect()
        cur = self.conn.cursor()
        cur.execute(query, (target,))
        # read the rows before closing the cursor; row[1] is matched against url below,
        # so recently indexed pages get skipped
        for row in cur:
            self.memory.append(row[1])
        cur.close()

        if "chorkleines.com/member/" not in url:
            return
        elif "chorkleines.com/member/bbs/" in url:
            return
        elif "chorkleines.com/member/download/18/pdf_search/" in url:
            return
        elif "chorkleines.com/member/download/18/scoredb/" in url:
            return
        elif "chorkleines.com/member/download/18/past_exam/" in url:
            return
        elif "chorkleines.com/member/wiki/" in url:
            return
        elif "chorkleines.com/member/kleines_search/" in url:
            return

        if url.endswith((".pdf", ".doc", ".docx")):
            if url not in self.memory:
                text = document_to_text(url)
                if text is None:
                    print("404: " + url)
                    return
                doc_id = self.insert_document(url, title)
                lines = text.splitlines()
                for line in lines:
                    if line != "":
                        line_words = mecab(line)
                        for line_word in line_words:
                            self.insert_word(line_word['text'], doc_id)
                line_words = mecab(title)
                for line_word in line_words:
                    self.insert_word(line_word['text'], doc_id)
                self.insert_done(doc_id)
                print("done: " + url)
            else:
                print("pass: "******".csv", ".txt")):
            if url not in self.memory:
                text = file_to_text(url)
                if text is None:
                    print("404: " + url)
                    return
                doc_id = self.insert_document(url, title)
                lines = text.splitlines()
                for line in lines:
                    if line != "":
                        line_words = mecab(line)
                        for line_word in line_words:
                            self.insert_word(line_word['text'], doc_id)
                line_words = mecab(title)
                for line_word in line_words:
                    self.insert_word(line_word['text'], doc_id)
                self.insert_done(doc_id)
                print("done: " + url)
            else:
                print("pass: "******".mp3", ".mp4", ".midi", ".mid", ".wav", ".zip", ".tar", ".gz", ".tgz", ".jpeg", ".jpg", ".png", ".xlsx", ".xls", ".pptx", ".ppt", ".mscz")):
            if url not in self.memory:
                if get_header(url) is None:
                    print("404: " + url)
                    return
                doc_id = self.insert_document(url, title)
                line_words = mecab(title)
                for line_word in line_words:
                    self.insert_word(line_word['text'], doc_id)
                self.insert_done(doc_id)
                print("done: " + url)
            else:
                print("pass: "******"css", "js")):
            return

        data = get_html(url)
        if data is None:
            return
        url = data[0]
        html = data[1]

        if url not in self.memory:
            text = html_to_text(html)
            title_tmp = html_title(html)
            if title_tmp != "":
                title = title_tmp
            doc_id = self.insert_document(url, title)
            lines = text.splitlines()
            for line in lines:
                if line != "":
                    line_words = mecab(line)
                    for line_word in line_words:
                        self.insert_word(line_word['text'], doc_id)
            line_words = mecab(title)
            for line_word in line_words:
                self.insert_word(line_word['text'], doc_id)
            self.insert_done(doc_id)
            print("done: " + url)
        else:
            print("pass: "******"href"], link["text"], depth + 1)

        return
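
The loop at the end of `crawler` recurses over dictionaries carrying "href" and "text" keys, but the helper that produces them lies outside this excerpt. Below is a minimal sketch of such a helper; the name `html_link` and its return shape are assumptions made only so the repaired recursion above reads cleanly.

from bs4 import BeautifulSoup

def html_link(html):
    # Collect each anchor's target and its link text from the page markup.
    soup = BeautifulSoup(html, "lxml")
    return [
        {"href": a["href"], "text": a.get_text(strip=True)}
        for a in soup.find_all("a", href=True)
    ]

In a real crawler the collected hrefs would still need to be resolved against the page URL (for example with urllib.parse.urljoin) before being passed back into crawler; that step is omitted here.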
Example #3
                # key_ID stores the WebpageID; text will store the plain text converted from the HTML
                key_ID = record.rec_headers.get_header(KEYNAME)

                # for testing on just a few pages:
                #if key_ID == "clueweb12-0000tw-00-00017":
                #    break

                htmlcontent = record.content_stream().read()
                
                # parse the HTML; if the parser returns nothing, drop the current webpage
                soup = BeautifulSoup(htmlcontent, "lxml")
                if soup is None:
                    continue

                # if no raw text is returned, drop the current webpage
                text = html_to_text(soup)
                if text == "" or text == " XML RPC server accepts POST requests only ":
                    continue

                # NER_mentions is a list of ("string", "type") tuples
                NER_mentions = NLProcess(text)
                # drop duplicates in NER_mentions while preserving order
                NER_mentions = list(dict.fromkeys(NER_mentions))
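                # e.g. list(dict.fromkeys([("Paris", "LOC"), ("UK", "LOC"), ("Paris", "LOC")]))
                # returns [("Paris", "LOC"), ("UK", "LOC")]: first occurrences, original order kept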

                
                final_entities = []
                for mention in NER_mentions:
                    # candidates is a dictionary with 10 results
                    candidates = generate_candidates(mention[0])

                    max_score = 0