def normalize(text, lang):
    # Normalize raw text, split it into words, then stem and strip accents.
    return stemAndRemoveAccents(towords(normalize_text(text)), lang)
def toIndex(documents, stopwords, keylen, lang, elapsed=nothing):
    htmlrem = HTMLRemover()
    compiledDocuments = []
    docID = 0
    allRealWords = set()

    for doc in documents:
        try:
            elapsed('parsing: ' + doc['url'])

            if doc['type'] in ['html', 'txt']:
                if doc['type'] == 'html':
                    content = unescapeHTMLEntities(doc['content'])
                    # Prefer the full HTML parser; fall back to plain
                    # tag stripping if it fails on malformed markup.
                    try:
                        content = htmlrem.getText(content)
                    except Exception:
                        content = strip_html(content)
                    title = htmlrem.title
                    description = htmlrem.description
                    if not title:
                        title = os.path.basename(doc['url'])

                if doc['type'] == 'txt':
                    content = doc['content']
                    title = doc.get('title', os.path.basename(doc['url']))
                    description = doc.get('description', '')

                words = getWordsWithoutStopWords(normalize_text(content), stopwords)
                allRealWords |= stripAccents(words)

                # Only documents that still contain words after stopword
                # removal are indexed; IDs match their list positions.
                if words:
                    compiledDocuments.append({
                        'pureContent': words,
                        'content': stemAndRemoveAccents(words, lang),
                        'title': title,
                        'url': doc['url'],
                        'id': docID,
                        'description': description,
                    })
                    docID += 1
        except Exception as err:
            print('Cannot parse ' + str(doc['url']))
            print(str(err))

    if not compiledDocuments:
        raise Exception('No document parsed')

    elapsed('Collecting documents...')
    sitesStats = getDocsStats([x['content'] for x in compiledDocuments])

    for doc, wordscount in zip(compiledDocuments, sitesStats['wordscount']):
        doc['words'] = wordscount

    index = groupByKeylen(sitesStats['occurences'], keylen)

    return {
        'index': index,
        'allwords': sitesStats['allwords'],
        'documents': compiledDocuments,
        'allRealWords': allRealWords,
    }
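# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of the document dicts toIndex() expects. The URLs,
# stopword set, keylen value and 'cz' language code are hypothetical;
# only the 'type'/'url'/'content' keys are required by the code above.
def _example_toIndex_usage():
    sample_docs = [
        {'type': 'html', 'url': 'docs/intro.html',
         'content': '<html><head><title>Intro</title></head>'
                    '<body>Háčky čárky, to je věda!</body></html>'},
        {'type': 'txt', 'url': 'docs/notes.txt', 'title': 'Notes',
         'content': 'Dva plus tři je pět.'},
    ]
    result = toIndex(sample_docs, stopwords={'je', 'to'}, keylen=3, lang='cz')
    print(len(result['documents']), 'documents indexed')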
def parseQuery(self, query):
    # Clean the query, split it into sorted terms, and reduce each term
    # to its accent-free stem.
    pureQuery = normalize_text(query)
    listQuery = sorted(pureQuery.split())
    return tuple(map(self.normalizeQuery, listQuery))
def normalizeQuery(self, query):
    # Stem the normalized term and strip accents; note that, unlike
    # getstem() below, no explicit language is passed to createStem().
    return strip_accents(createStem(normalize_text(query)))
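# --- Illustrative only ---
# Assuming `searcher` is an instance of the class defining parseQuery()
# (the class itself is not shown here), a query is normalized, split,
# sorted, and each term stemmed and de-accented:
#
#     searcher.parseQuery('Háčky čárky')   # -> e.g. ('cark', 'hack')
#
# The exact stems depend on createStem() and the configured language.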
def getstem(word, lang):
    # Normalize the word, stem it for the given language, strip accents.
    word = normalize_text(word)
    stem = createStem(word, lang)
    stem = strip_accents(stem)
    return stem
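# --- Illustrative only ---
# getstem() is the standalone counterpart of normalizeQuery(): it takes
# an explicit language code instead of instance state. A hypothetical
# call and result (the concrete stem depends on createStem()):
#
#     getstem('Čárky', 'cz')   # -> an accent-free stem such as 'cark'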
def test_normalize_text(self):
    self.assertEqual(
        'háčky čárky to je věda dva tři',
        normalize_text('Háčky čárky, to je věda! Dva + Tři = __?'))