langLines = f.readlines() f.close() print "Read file" #CLEAN AND FACTORIZE THE LINES cleanLines = [] for line, langLine, soundLine in zip(lines, langLines, soundexLines): cleanedLine = cleanLine(line) cleanedLangLine = cleanLine(langLine) cleanedSoundLine = cleanLine(soundLine) cleanLines.append(factorize(cleanedLine, cleanedLangLine, cleanedSoundLine)) print "cleaned lines" #GET THE COUNTS FROM THE SENTENCES mc = Counts() mcLang = Counts() mcSoundex = Counts() for line in cleanLines: for token in line: mc.incrementWord(token.getFactor(WORD_FACTOR)) #mcLang.incrementWord(token.getFactor(LANGAUGE_FACTOR)) #mcSoundex.incrementWord(token.getFactor(SOUNDEX_FACTOR)) mc.fixRanks() mcLang.fixRanks() mcSoundex.fixRanks() print "Counted the words" print "Number of words in the dataset: ", len(mc) print "Number of languages in the dataset: ", len(mcLang) print "Number of sounds in the dataset: ", len(mcSoundex)