def create_index(self, url):
    """Store the word index for the page identified by ``url``.

    The URL is handed to ``HtmlToTextConverter`` to obtain the page text,
    which ``BuildIndex`` turns into a mapping from word to an iterable of
    values. Those values are written, space-separated, into the
    ``positions`` field of one ``Match`` row per word.

    If a ``Page`` with this URL already exists, its current ``Match`` rows
    are deleted first, and every ``Word`` that no longer has any pages is
    deleted with them. Otherwise a new ``Page`` is created and saved.

    Args:
        url: Address of the page to index, e.g. "http://fapl.ru/".
    """
    converter = HtmlToTextConverter()
    page_text = converter.transform_html_into_text(url)
    index = BuildIndex(page_text).getIndex()

    try:
        page = Page.objects.get(url=url)
        # Known page: drop the matches from the previous indexing run.
        for stale_match in Match.objects.filter(page=page):
            orphan_candidate = stale_match.word
            stale_match.delete()
            if orphan_candidate.pages.all():
                continue
            # No page refers to this word any more.
            orphan_candidate.delete()
    except ObjectDoesNotExist:
        page = Page(url=url)
        page.save()

    for term in index.keys():
        joined_positions = " ".join(map(str, index[term]))
        try:
            word_row = Word.objects.get(value=term)
        except ObjectDoesNotExist:
            word_row = Word(value=term)
            word_row.save()
        Match(word=word_row, page=page, positions=joined_positions).save()
def create_index(self, url):
    """Build or rebuild the stored word index for the page at ``url``.

    ``HtmlToTextConverter`` is given the URL and returns the page title and
    text content. ``BuildIndex`` turns the content into a mapping from word
    to an iterable of values, plus a total word count. One ``Match`` row is
    saved per word, with the values joined by spaces in ``positions``.

    If a ``Page`` with this URL already exists, its old ``Match`` rows are
    deleted (together with any ``Word`` left without pages) and its title,
    content and word count are refreshed from the new extraction. Otherwise
    a new ``Page`` is created with those values.

    Args:
        url: Address of the page to index, e.g. "http://fapl.ru/".
    """
    converter = HtmlToTextConverter()
    title, content = converter.transform_html_into_text(url)
    indexer = BuildIndex(content)
    index = indexer.get_index()
    number_of_words = indexer.get_number_of_words()

    # Only the lookup is guarded: an ObjectDoesNotExist raised while purging
    # old matches must not be mistaken for "page missing", which would
    # create a second Page for the same URL.
    try:
        document = Page.objects.get(url=url)
    except ObjectDoesNotExist:
        document = Page(
            url=url,
            number_of_words=number_of_words,
            title=title,
            content=content,
        )
        document.save()
    else:
        # Re-indexing a known page: remove matches from the previous run.
        for match in Match.objects.filter(page=document):
            word = match.word
            match.delete()
            if not word.pages.all():
                # No page refers to this word any more.
                word.delete()
        # Keep the stored metadata in step with the freshly fetched page;
        # without this the page retains the values from its first indexing.
        document.number_of_words = number_of_words
        document.title = title
        document.content = content
        document.save()

    for value in index.keys():
        positions = " ".join(str(x) for x in index[value])
        try:
            word = Word.objects.get(value=value)
        except ObjectDoesNotExist:
            word = Word(value=value)
            word.save()
        Match(word=word, page=document, positions=positions).save()
def show_id(request, key):
    """Return the ``object_detail`` response for ``key`` over all pages.

    Args:
        request: The incoming request, passed through to ``object_detail``.
        key: Identifier passed through to ``object_detail``.

    Returns:
        Whatever ``object_detail`` returns for ``Page.all()`` and ``key``.
    """
    all_pages = Page.all()
    return object_detail(request, all_pages, key)
def update_last_updated(request):
    """Re-save up to 100 pages, taken in ``update_time`` order.

    Each fetched page is written back with ``put()`` and its
    ``searchter_id`` is logged.
    NOTE(review): presumably ``put()`` refreshes an auto-updated timestamp,
    so repeated calls walk through the least recently updated pages.
    Confirm against the ``Page`` model.

    Args:
        request: The incoming request (unused).

    Returns:
        A ``TextResponse`` carrying a fixed success message.
    """
    # The locals must not be named ``Page``: assigning to that name anywhere
    # in the function makes it local, so ``Page.all()`` would raise
    # UnboundLocalError before the query ever runs.
    pages = Page.all().order('update_time').fetch(100)
    for page in pages:
        page.put()
        logging.info("Updated %s", page.searchter_id)
    return TextResponse("Success, Finished!")