Example #1
    @classmethod
    def create(cls, course_id, sender, to_option, subject, html_message, text_message=None, template_name=None, from_addr=None):
        """
        Create an instance of CourseEmail.

        The CourseEmail.save_now method makes sure the CourseEmail entry is committed.
        When called from any view that is wrapped by TransactionMiddleware,
        and thus in a "commit-on-success" transaction, an autocommit buried within here
        will cause any pending transaction to be committed by a successful
        save here.  Any future database operations will take place in a
        separate transaction.
        """
        # automatically generate the stripped version of the text from the HTML markup:
        if text_message is None:
            text_message = html_to_text(html_message)

        # perform some validation here:
        if to_option not in TO_OPTIONS:
            fmt = 'Course email being sent to unrecognized to_option: "{to_option}" for "{course}", subject "{subject}"'
            msg = fmt.format(to_option=to_option, course=course_id, subject=subject)
            raise ValueError(msg)

        # create the task, then save it immediately:
        course_email = cls(
            course_id=course_id,
            sender=sender,
            to_option=to_option,
            subject=subject,
            html_message=html_message,
            text_message=text_message,
            template_name=template_name,
            from_addr=from_addr,
        )
        course_email.save_now()

        return course_email
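
The docstring above relies on `save_now` to commit the row immediately, even when the caller runs under the old `TransactionMiddleware`. That method is not part of this excerpt; the sketch below shows one way it could look under the pre-Django-1.6 transaction API the docstring refers to, and is an illustrative assumption rather than the project's confirmed implementation.

from django.db import models, transaction

class CourseEmail(models.Model):
    # ... field definitions omitted ...

    @transaction.autocommit  # old Django API: run in autocommit mode, outside the managed transaction
    def save_now(self):
        """Write this CourseEmail immediately so a later failure cannot roll it back."""
        self.save()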
Example #2
    def crawler(self, url, title, depth):
        target = datetime.datetime.now() - datetime.timedelta(days=30)
        target = target.strftime('%Y/%m/%d %H:%M:%S')
        query = "SELECT * FROM documents WHERE last_index IS NOT NULL AND last_index > %s"
        self.server, self.conn = mysql_connect()
        cur = self.conn.cursor()
        cur.execute(query, (target,))
        # read the rows before closing the cursor; row[1] is matched against url below,
        # so recently indexed pages get skipped
        for row in cur:
            self.memory.append(row[1])
        cur.close()

        if "chorkleines.com/member/" not in url:
            return
        elif "chorkleines.com/member/bbs/" in url:
            return
        elif "chorkleines.com/member/download/18/pdf_search/" in url:
            return
        elif "chorkleines.com/member/download/18/scoredb/" in url:
            return
        elif "chorkleines.com/member/download/18/past_exam/" in url:
            return
        elif "chorkleines.com/member/wiki/" in url:
            return
        elif "chorkleines.com/member/kleines_search/" in url:
            return

        if url.endswith((".pdf", ".doc", ".docx")):
            if url not in self.memory:
                text = document_to_text(url)
                if text is None:
                    print("404: " + url)
                    return
                doc_id = self.insert_document(url, title)
                lines = text.splitlines()
                for line in lines:
                    if line != "":
                        line_words = mecab(line)
                        for line_word in line_words:
                            self.insert_word(line_word['text'], doc_id)
                line_words = mecab(title)
                for line_word in line_words:
                    self.insert_word(line_word['text'], doc_id)
                self.insert_done(doc_id)
                print("done: " + url)
            else:
                print("pass: "******".csv", ".txt")):
            if url not in self.memory:
                text = file_to_text(url)
                if text is None:
                    print("404: " + url)
                    return
                doc_id = self.insert_document(url, title)
                lines = text.splitlines()
                for line in lines:
                    if line != "":
                        line_words = mecab(line)
                        for line_word in line_words:
                            self.insert_word(line_word['text'], doc_id)
                line_words = mecab(title)
                for line_word in line_words:
                    self.insert_word(line_word['text'], doc_id)
                self.insert_done(doc_id)
                print("done: " + url)
            else:
                print("pass: "******".mp3", ".mp4", ".midi", ".mid", ".wav", ".zip", ".tar", ".gz", ".tgz", ".jpeg", ".jpg", ".png", ".xlsx", ".xls", ".pptx", ".ppt", ".mscz")):
            if url not in self.memory:
                if get_header(url) is None:
                    print("404: " + url)
                    return
                doc_id = self.insert_document(url, title)
                line_words = mecab(title)
                for line_word in line_words:
                    self.insert_word(line_word['text'], doc_id)
                self.insert_done(doc_id)
                print("done: " + url)
            else:
                print("pass: "******"css", "js")):
            return

        data = get_html(url)
        if data is None:
            return
        url = data[0]
        html = data[1]

        if url not in self.memory:
            text = html_to_text(html)
            title_tmp = html_title(html)
            if title_tmp != "":
                title = title_tmp
            doc_id = self.insert_document(url, title)
            lines = text.splitlines()
            for line in lines:
                if line != "":
                    line_words = mecab(line)
                    for line_word in line_words:
                        self.insert_word(line_word['text'], doc_id)
            line_words = mecab(title)
            for line_word in line_words:
                self.insert_word(line_word['text'], doc_id)
            self.insert_done(doc_id)
            print("done: " + url)
        else:
            print("pass: "******"href"], link["text"], depth + 1)

        return
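
The loop at the end of `crawler` recurses over dictionaries carrying "href" and "text" keys, but the helper that produces them lies outside this excerpt. Below is a minimal sketch of such a helper; the name `html_link` and its return shape are assumptions made only so the repaired recursion above reads cleanly.

from bs4 import BeautifulSoup

def html_link(html):
    # Collect each anchor's target and its link text from the page markup.
    soup = BeautifulSoup(html, "lxml")
    return [
        {"href": a["href"], "text": a.get_text(strip=True)}
        for a in soup.find_all("a", href=True)
    ]

In a real crawler the collected hrefs would still need to be resolved against the page URL (for example with urllib.parse.urljoin) before being passed back into crawler; that step is omitted here.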
Example #3
                # key_ID stores the WebpageID; text will store the plain text converted from the HTML
                key_ID = record.rec_headers.get_header(KEYNAME)

                # for testing on just a few pages:
                #if key_ID == "clueweb12-0000tw-00-00017":
                #    break

                htmlcontent = record.content_stream().read()
                
                # parse the HTML; if the parser returns nothing, drop the current webpage
                soup = BeautifulSoup(htmlcontent, "lxml")
                if soup is None:
                    continue

                # if no raw text is returned, drop the current webpage
                text = html_to_text(soup)
                if text == "" or text == " XML RPC server accepts POST requests only ":
                    continue

                # NER_mentions is a list of ("string", "type") tuples
                NER_mentions = NLProcess(text)
                # drop duplicates in NER_mentions while preserving order
                NER_mentions = list(dict.fromkeys(NER_mentions))
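                # e.g. list(dict.fromkeys([("Paris", "LOC"), ("UK", "LOC"), ("Paris", "LOC")]))
                # returns [("Paris", "LOC"), ("UK", "LOC")]: first occurrences, original order kept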

                
                final_entities = []
                for mention in NER_mentions:
                    # candidates is a dictionary with 10 results
                    candidates = generate_candidates(mention[0])

                    max_score = 0