# Read every remaining line from the handle `f` into langLines, then close it.
# `f` is opened earlier in the script (not shown in this section); presumably
# it is the language-annotation file that parallels `lines` -- TODO confirm.
langLines = f.readlines()
f.close()
print "Read file"

#CLEAN AND FACTORIZE THE LINES
cleanLines = []
for line, langLine, soundLine in zip(lines, langLines, soundexLines):
    cleanedLine = cleanLine(line)
    cleanedLangLine = cleanLine(langLine)
    cleanedSoundLine = cleanLine(soundLine)
    cleanLines.append(factorize(cleanedLine, cleanedLangLine,
                                cleanedSoundLine))
print "cleaned lines"

#GET THE COUNTS FROM THE SENTENCES
mc = Counts()
mcLang = Counts()
mcSoundex = Counts()
for line in cleanLines:
    for token in line:
        mc.incrementWord(token.getFactor(WORD_FACTOR))
        #mcLang.incrementWord(token.getFactor(LANGAUGE_FACTOR))
        #mcSoundex.incrementWord(token.getFactor(SOUNDEX_FACTOR))
mc.fixRanks()
mcLang.fixRanks()
mcSoundex.fixRanks()
print "Counted the words"
print "Number of words in the dataset: ", len(mc)
print "Number of languages in the dataset: ", len(mcLang)
print "Number of sounds in the dataset: ", len(mcSoundex)