Exemplo n.º 1
0
def _getSources():
    """Record per-document word occurrences for every file in ``docs_list``.

    For each path in the module-level ``docs_list`` the file is read,
    lower-cased, passed through ``removeNonAscii`` and split into sentences
    with ``nltk.sent_tokenize``. The words produced by
    ``_generate_word_list`` are de-duplicated, and ``DB.incr_occurrence`` is
    called once for every distinct word that is longer than one character
    and for which ``isNumeric`` is false. ``DB.incr_total_doc_number`` is
    then called once for the document.

    A fresh connection is obtained from ``DB.get_con()`` for each document
    and is always closed, even if one of the DB calls raises.

    Returns:
        The list ``all_word_lists``. Nothing is ever appended to it, so the
        result is always an empty list.
        NOTE(review): this looks unintentional - confirm with callers
        before changing what is returned.
    """
    all_word_lists = []
    for doc_path in docs_list:
        # The file is only needed for the read; release it before the
        # tokenizing and DB work start.
        with open(doc_path, 'r') as doc_file:
            text = removeNonAscii(doc_file.read().lower())

        sentences = nltk.sent_tokenize(text)
        # De-duplicate so that each word is counted at most once per document.
        unique_words = set(_generate_word_list(sentences))

        con = DB.get_con()
        try:
            for word in unique_words:
                if not isNumeric(word) and len(word) > 1:
                    DB.incr_occurrence(word, con)
            DB.incr_total_doc_number(con)
        finally:
            # Guarantee the connection is released even when a DB call fails.
            con.close()

    return all_word_lists