class Seeds:
    """Set of seed terms loaded from ``seeds.txt`` in the working directory.

    Each non-empty line of the file is passed through
    ``Accents.buildCodes`` (a project helper; its exact transformation is
    not visible here) and stored in ``dic_seeds`` as both key and value,
    so duplicated lines collapse into a single entry.
    """

    def __init__(self):
        """Create the seed dictionary and fill it from ``seeds.txt``."""
        self.dic_seeds = {}
        self.accents = Accents()
        self.__buildDic__()

    def __buildDic__(self):
        """Read ``seeds.txt`` (UTF-8) and populate ``self.dic_seeds``.

        Prints an error message and terminates through ``sys.exit()`` when
        the file cannot be opened.
        """
        try:
            file_seeds = codecs.open('seeds.txt', 'r', 'utf-8')
        except IOError:
            # Single-argument parenthesised print behaves the same under the
            # Python 2 print statement and the Python 3 function.
            print('ERROR: System cannot open the seeds.txt file')
            sys.exit()

        # The context manager closes the file even if buildCodes raises.
        with file_seeds:
            for raw_line in file_seeds:
                # Lines yielded by iteration still carry their terminator, so
                # the emptiness check is only meaningful after stripping it.
                # Stripping '\r' as well keeps CRLF files from leaving a
                # stray carriage return inside every seed.
                seed = raw_line.rstrip('\r\n')
                if not seed:
                    continue
                seed = self.accents.buildCodes(seed)
                self.dic_seeds[seed] = seed

    def getQtySeeds(self):
        """Return the number of distinct seeds loaded."""
        return len(self.dic_seeds)

    def getSeeds(self):
        """Return the seeds as a sorted list."""
        return sorted(self.dic_seeds)

    def printSeeds(self):
        """Print the raw seed dictionary to standard output."""
        print(self.dic_seeds)

    def printQtySeeds(self):
        """Print the number of distinct seeds to standard output."""
        print(len(self.dic_seeds))
def __buildStatisticalCorpus__(self):
    """Build the tagged-lemma corpus for every XML file in the corpus folder.

    Only the top level of ``self.corpus_folder`` is scanned. For each file
    whose name matches ``.*xml$`` the terms returned by ``ParseXml`` are
    visited in id order (ids have the form ``s<sentence>_<word>``, both
    counters starting at 1). A sentence ends at the first missing word id
    and the file ends at the first sentence whose first word is missing.

    A term is kept when its ``pos`` does not start with one of
    pu/num/conj/art/prp/spec, its lemma contains no ``$`` and the lemma is
    at least ``self.parameters.getMinWordSize()`` characters long. Kept
    lemmas go through ``Accents.buildCodes``, have ``-`` replaced by ``_``
    and are suffixed ``__N `` (noun), ``__V `` (verb) or ``__O `` (other).
    The "full" string holds every kept term; the "nouns" string holds only
    the noun and verb entries. Both are handed to
    ``self.__writeCorpusFile__`` together with the file name up to its
    first dot.

    Prints an error and terminates through ``sys.exit()`` when the corpus
    folder cannot be listed.
    """
    try:
        # os.walk yields nothing for a missing/unreadable folder, so next()
        # raises StopIteration in that case. Catching only the failures that
        # mean "folder not available" keeps KeyboardInterrupt and genuine
        # programming errors from being reported as a folder problem.
        root, _, files = next(os.walk(self.corpus_folder))[:3]
    except (StopIteration, OSError):
        print('ERROR: It was not possible to open the ../Data/Corpus/Raw/ folder')
        sys.exit()

    accents = Accents()
    ignored_pos = re.compile('^(pu|num|conj|art|prp|spec)')

    def term_id(sentence, word):
        return 's' + str(sentence) + '_' + str(word)

    for corpus_file in files:
        if not re.match('.*xml$', corpus_file):
            continue

        corpus_filename = corpus_file.split('.')[0]
        # os.path.join gives the same path as plain concatenation when the
        # folder already ends with a separator, and a correct one when it
        # does not.
        xmlfile = ParseXml(os.path.join(root, corpus_file))
        # NOTE(review): the three containers are assumed to be plain dicts
        # keyed by term id - confirm against ParseXml.
        dic_terms = xmlfile.getDicTerms()
        dic_nouns = xmlfile.getNouns()
        dic_verbs = xmlfile.getVerbs()

        # Pieces are collected in lists and joined once at the end instead
        # of growing (and re-scanning) two strings on every word.
        full_parts = []
        noun_verb_parts = []

        id_sentence = 1
        id_word = 1
        id_t = term_id(id_sentence, id_word)
        while id_t in dic_terms:
            while id_t in dic_terms:
                term = dic_terms[id_t]
                raw_lemma = term['lemma']
                if (not ignored_pos.match(term['pos'])
                        and '$' not in raw_lemma
                        and len(raw_lemma) >= self.parameters.getMinWordSize()):
                    # The tag suffixes contain no '-', so replacing inside
                    # the lemma alone yields the same text as replacing in
                    # the whole accumulated string.
                    lemma = accents.buildCodes(raw_lemma).replace('-', '_')
                    if id_t in dic_nouns:
                        noun_verb_parts.append(lemma + '__N ')
                        full_parts.append(lemma + '__N ')
                    elif id_t in dic_verbs:
                        noun_verb_parts.append(lemma + '__V ')
                        full_parts.append(lemma + '__V ')
                    else:
                        full_parts.append(lemma + '__O ')
                id_word += 1
                id_t = term_id(id_sentence, id_word)
            # Move on to the first word of the next sentence.
            id_word = 1
            id_sentence += 1
            id_t = term_id(id_sentence, id_word)

        string_full = ''.join(full_parts)
        string_nouns = ''.join(noun_verb_parts)
        self.__writeCorpusFile__(corpus_filename, string_full, string_nouns)