def processEntries(db: couchdb.Database):
    """Index every stored sentence into per-word documents.

    Iterates the ``sentences/sentences`` view, splits each sentence into
    words, and calls updateWordDocument() for each non-empty word.
    Progress is printed every 1000 sentences.

    :param db: CouchDB database holding the sentence documents.
    """
    # Use the shared helper (consistent with the other processEntries
    # variant) instead of `[...][0].value`, which raised IndexError when
    # the count view was empty.
    totalSentences = getTotalSentences(db, 'sentences/sentences_count')
    sentenceCount = 1
    for entry in db.iterview('sentences/sentences', 100):
        if sentenceCount % 1000 == 0:
            printProgress(sentenceCount, totalSentences)
        for word in sentences.splitInWords(entry.value['sentence']):
            # `word and word != ""` was redundant: a non-empty string is
            # already truthy, so a plain truthiness test is equivalent.
            if word:
                updateWordDocument(db, word, entry.value)
        sentenceCount = sentenceCount + 1
    print("")  # to clear printProgress
def getMinimumSentenceThreshold(db: couchdb.Database, sentences_length_view: str, threshold: int):
    """Return the sentence-count cutoff keeping the top *threshold* percent.

    Collects the distinct per-word sentence counts from
    *sentences_length_view* (grouped at level 1), sorts them descending,
    and returns the count found *threshold* percent of the way down the
    list.

    :param db: CouchDB database to query.
    :param sentences_length_view: name of the grouped length view.
    :param threshold: percentage (0-100) of lengths to keep.
    """
    availableLengths = [
        x.key
        for x in db.iterview(sentences_length_view, 2000000, group_level=1)
    ]
    availableLengths.sort(reverse=True)
    logging.info(f'the available lengths per word are {availableLengths}')
    # Off-by-one fix: round(len * 1.0) == len when threshold is 100, which
    # made availableLengths[lastElement] raise IndexError; clamp to the
    # last valid index. (An empty view still raises IndexError, as before.)
    lastElement = min(
        round(len(availableLengths) * (threshold / 100)),
        len(availableLengths) - 1,
    )
    return availableLengths[lastElement]
def processEntries(db: couchdb.Database, sleepHourList: list):
    """Process every not-yet-visited sentence into word documents.

    For each row of the ``sentences/sentences_not_processed`` view, every
    distinct non-empty word in the row's ``word_list`` is passed to
    updateWordDocument(), after which the sentence is marked visited.
    Processing pauses during the configured sleep hours, and progress is
    printed every 1000 sentences.

    :param db: CouchDB database holding the sentence documents.
    :param sleepHourList: hours during which processing must pause.
    """
    total = getTotalSentences(db, 'sentences/sentences_not_processed_count')
    processed = 0
    for row in db.iterview('sentences/sentences_not_processed', 100):
        waitWhileSleepHour(sleepHourList)
        processed += 1
        if processed % 1000 == 0:
            printProgress(processed, total)
        uniqueWords = set(row.value["word_list"])
        uniqueWords.discard("")
        for token in uniqueWords:
            updateWordDocument(db, token, row.value)
        setSentenceAsVisited(db, row.id)
    if processed > 0:
        print("")  # to clear printProgress
def processEntries(db: couchdb.Database, corpus_result_dir: str, sentenceThreshold):
    """Export the word database to text files under *corpus_result_dir*.

    Words backed by at least *sentenceThreshold* sentences are written to
    WORDS_TXT and their corpus information to CORPUS_TXT; all other words
    are listed in DISCARDED_WORDS_TXT. Each file starts with its header.

    :param db: CouchDB database holding the word documents.
    :param corpus_result_dir: directory receiving the output files.
    :param sentenceThreshold: minimum sentence count for a word to be kept.
    """
    words_path = os.path.join(corpus_result_dir, WORDS_TXT)
    discarded_words_path = os.path.join(corpus_result_dir, DISCARDED_WORDS_TXT)
    corpus_path = os.path.join(corpus_result_dir, CORPUS_TXT)
    deleteFileIfExists(words_path)
    deleteFileIfExists(corpus_path)
    # Consistency fix: the discarded-words file was the only output not
    # deleted beforehand, unlike its two siblings.
    deleteFileIfExists(discarded_words_path)
    # One with-statement instead of three nested ones.
    with open(words_path, "wt", encoding="utf-8") as words, \
         open(corpus_path, "wt", encoding="utf-8") as corpus, \
         open(discarded_words_path, "wt", encoding="utf-8") as discarded:
        words.write(WORDS_TXT_HEADER)
        corpus.write(CORPUS_TXT_HEADER)
        discarded.write(DISCARDED_WORDS_TXT_HEADER)
        for row in db.iterview('all_words/all_words', 100):
            if len(row.value['sentences']) >= sentenceThreshold:
                words.write(f"{row.value['_id']}\n")
                writeCorpusInfo(db, corpus, row.value)
            else:
                discarded.write(f"{row.value['_id']}\n")
def deleteAllWords(db: couchdb.Database):
    """Delete every word document listed by the all_words/all_words view.

    :param db: CouchDB database to purge word documents from.
    """
    word_rows = db.iterview('all_words/all_words', 100)
    for entry in word_rows:
        db.delete(entry.value)
def getTotalSentences(db: couchdb.Database, totalSentencesView):
    """Return the count emitted by *totalSentencesView*, or 0 when empty.

    The view is expected to reduce to (at most) a single row whose value
    is the total. Reads only the first row instead of materializing the
    whole view into a list, as the original did.

    :param db: CouchDB database to query.
    :param totalSentencesView: name of the reducing count view.
    """
    for row in db.iterview(totalSentencesView, 10):
        return row.value
    return 0
def set_urls_as_not_visited(db: couchdb.Database, not_visited_view):
    """Clear the ``visited`` flag on every URL document in the given view.

    Fetches each document by the view row's id, sets ``visited`` to False,
    and saves it back.

    :param db: CouchDB database holding the URL documents.
    :param not_visited_view: name of the view listing the URLs to reset.
    """
    for row in db.iterview(not_visited_view, 100):
        document = db[row.id]
        document['visited'] = False
        db.save(document)