@classmethod
def create(cls, course_id, sender, to_option, subject, html_message,
           text_message=None, template_name=None, from_addr=None):
    """
    Create an instance of CourseEmail.

    The CourseEmail.save_now method makes sure the CourseEmail entry is
    committed.  When called from any view that is wrapped by
    TransactionMiddleware, and thus in a "commit-on-success" transaction, an
    autocommit buried within here will cause any pending transaction to be
    committed by a successful save here.  Any future database operations will
    take place in a separate transaction.
    """
    # automatically generate the stripped version of the text from the HTML markup:
    if text_message is None:
        text_message = html_to_text(html_message)

    # perform some validation here:
    if to_option not in TO_OPTIONS:
        fmt = 'Course email being sent to unrecognized to_option: "{to_option}" for "{course}", subject "{subject}"'
        msg = fmt.format(to_option=to_option, course=course_id, subject=subject)
        raise ValueError(msg)

    # create the task, then save it immediately:
    course_email = cls(
        course_id=course_id,
        sender=sender,
        to_option=to_option,
        subject=subject,
        html_message=html_message,
        text_message=text_message,
        template_name=template_name,
        from_addr=from_addr,
    )
    course_email.save_now()
    return course_email
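# A minimal usage sketch (not from the source): calling CourseEmail.create
# from view code.  The 'all' to_option value and the field values below are
# assumptions for illustration; to_option must be one of the module's
# TO_OPTIONS, and queue_course_email is a hypothetical caller.
def queue_course_email(request, course_id):
    course_email = CourseEmail.create(
        course_id=course_id,
        sender=request.user,
        to_option='all',
        subject='Weekly update',
        html_message='<p>Hello everyone!</p>',
    )
    # save_now() has already committed the row, so its id can safely be
    # handed to an async worker running in a separate transaction
    return course_email.id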
def crawler(self, url, title, depth):
    # load URLs indexed within the last 30 days into self.memory,
    # so they are skipped ("pass") instead of being re-indexed
    target = datetime.datetime.now() - datetime.timedelta(days=30)
    target = target.strftime('%Y/%m/%d %H:%M:%S')
    query = "SELECT * FROM documents WHERE last_index IS NOT NULL AND last_index > %s"
    res = mysql_connect()
    self.server = res[0]
    self.conn = res[1]
    cur = self.conn.cursor()
    cur.execute(query, (target,))
    # iterate over the rows *before* closing the cursor
    # (the original closed the cursor first, which yields no rows)
    for row in cur:
        self.memory.append(row[1])
    cur.close()

    # only crawl the member area, skipping sections that should not be indexed
    if "chorkleines.com/member/" not in url:
        return
    elif "chorkleines.com/member/bbs/" in url:
        return
    elif "chorkleines.com/member/download/18/pdf_search/" in url:
        return
    elif "chorkleines.com/member/download/18/scoredb/" in url:
        return
    elif "chorkleines.com/member/download/18/past_exam/" in url:
        return
    elif "chorkleines.com/member/wiki/" in url:
        return
    elif "chorkleines.com/member/kleines_search/" in url:
        return

    if url.endswith((".pdf", ".doc", ".docx")):
        if url not in self.memory:
            text = document_to_text(url)
            if text is None:
                print("404: " + url)
                return
            doc_id = self.insert_document(url, title)
            lines = text.splitlines()
            for line in lines:
                if line != "":
                    line_words = mecab(line)
                    for line_word in line_words:
                        self.insert_word(line_word['text'], doc_id)
            line_words = mecab(title)
            for line_word in line_words:
                self.insert_word(line_word['text'], doc_id)
            self.insert_done(doc_id)
            print("done: " + url)
        else:
            print("pass: " + url)
    elif url.endswith((".csv", ".txt")):
        if url not in self.memory:
            text = file_to_text(url)
            if text is None:
                print("404: " + url)
                return
            doc_id = self.insert_document(url, title)
            lines = text.splitlines()
            for line in lines:
                if line != "":
                    line_words = mecab(line)
                    for line_word in line_words:
                        self.insert_word(line_word['text'], doc_id)
            line_words = mecab(title)
            for line_word in line_words:
                self.insert_word(line_word['text'], doc_id)
            self.insert_done(doc_id)
            print("done: " + url)
        else:
            print("pass: " + url)
    elif url.endswith((".mp3", ".mp4", ".midi", ".mid", ".wav", ".zip", ".tar",
                       ".gz", ".tgz", ".jpeg", ".jpg", ".png", ".xlsx", ".xls",
                       ".pptx", ".ppt", ".mscz")):
        # binary media and archives: index the title only
        if url not in self.memory:
            if get_header(url) is None:
                print("404: " + url)
                return
            doc_id = self.insert_document(url, title)
            line_words = mecab(title)
            for line_word in line_words:
                self.insert_word(line_word['text'], doc_id)
            self.insert_done(doc_id)
            print("done: " + url)
        else:
            print("pass: " + url)
    elif url.endswith(("css", "js")):
        return

    # everything else is treated as an HTML page
    data = get_html(url)
    if data is None:
        return
    url = data[0]
    html = data[1]
    if url not in self.memory:
        text = html_to_text(html)
        title_tmp = html_title(html)
        if title_tmp != "":
            title = title_tmp
        doc_id = self.insert_document(url, title)
        lines = text.splitlines()
        for line in lines:
            if line != "":
                line_words = mecab(line)
                for line_word in line_words:
                    self.insert_word(line_word['text'], doc_id)
        line_words = mecab(title)
        for line_word in line_words:
            self.insert_word(line_word['text'], doc_id)
        self.insert_done(doc_id)
        print("done: " + url)
    else:
        print("pass: " + url)

    # recurse into the links on this page; the link-extraction helper's name
    # was masked in the source, so html_links(html) is an assumption here --
    # it is expected to return dicts with "href" and "text" keys
    for link in html_links(html):
        self.crawler(link["href"], link["text"], depth + 1)
    return
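# crawler() above relies on a mecab(line) helper that returns dicts with a
# 'text' key.  Below is a minimal sketch of such a tokenizer built on
# mecab-python3; this is an assumption for illustration, and the real helper
# may differ (e.g. it might filter tokens by part of speech).
import MeCab

def mecab(line):
    tagger = MeCab.Tagger()
    words = []
    node = tagger.parseToNode(line)
    while node:
        if node.surface:  # skip BOS/EOS nodes, whose surface is empty
            words.append({'text': node.surface})
        node = node.next
    return words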
# key_ID stores the WebpageID; text stores the text converted from the HTML
key_ID = record.rec_headers.get_header(KEYNAME)
# try for a few pages:
# if key_ID == "clueweb12-0000tw-00-00017":
#     break
htmlcontent = record.content_stream().read()

# parse the HTML; if the parse yields nothing, drop the current webpage
soup = BeautifulSoup(htmlcontent, "lxml")
if soup is None:
    continue

# if there is no raw text returned, drop the current webpage
text = html_to_text(soup)
if text == "" or text == " XML RPC server accepts POST requests only ":
    continue

# NER_mentions is a list of ("string", "type") pairs
NER_mentions = NLProcess(text)
# drop duplicates in NER_mentions, preserving order
NER_mentions = list(dict.fromkeys(NER_mentions))

final_entities = []
for mention in NER_mentions:
    # candidates is a dictionary with 10 results
    candidates = generate_candidates(mention[0])
    max_score = 0
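    # The snippet ends here; what follows is one plausible continuation, not
    # the source's code: keep the best-scoring candidate per mention.  It
    # assumes candidates maps each candidate entity to a numeric score --
    # that shape is an assumption, not confirmed above.
    best_candidate = None
    for candidate, score in candidates.items():
        if score > max_score:
            max_score = score
            best_candidate = candidate
    if best_candidate is not None:
        final_entities.append((mention[0], mention[1], best_candidate))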