Пример #1
0
class Lid:
    """Language identifier that scores text against stored trigram models.

    Every ``<LANGUAGE_NAME>.dat`` file found in ``STAT_DIR`` is loaded as one
    language model when an instance is created.  ``checkText`` builds the
    trigram profile of a text and measures how far it deviates from each
    loaded model.
    """

    def __init__(self):
        """Create the identifier and load all available language models.

        The models are read from ``STAT_DIR``.  Each model is stored in a
        file named after its language followed by ``.dat``.
        """
        self.trigrams = Trigrams()  # profile filled in by checkText()
        self.languages = []  # names of the loaded language models
        self.models = []  # trigram model for each name, in the same order
        self.load_language_models()

    def load_language_models(self):
        """Load every ``*.dat`` model file found in ``STAT_DIR``.

        A line that splits into exactly two whitespace-separated tokens is
        read as ``<trigram> <probability>``: the trigram is decoded as UTF-8
        and lower-cased, the probability is parsed as a float.  Lines with
        any other number of tokens are ignored.

        Raises:
            ValueError: if a probability token is not a valid number.
        """
        for filename in listdir(STAT_DIR):
            if not filename.endswith(".dat"):
                continue
            model = Trigrams()
            modelfile = open(os.path.join(STAT_DIR, filename))
            try:
                for line in modelfile:
                    tokens = line.split()
                    if len(tokens) == 2:
                        trigram = unicode(tokens[0], 'utf-8').lower()
                        model.add_trigram(trigram, float(tokens[1]))
            finally:
                # Release the handle even when a malformed line raises.
                modelfile.close()
            # Register name and model together, only after a successful read,
            # so the two lists can never get out of step.
            self.languages.append(filename[:-4])
            self.models.append(model)

    def checkText(self, text):
        """Check which language a text is.

        Args:
            text: The text to identify; it is passed to
                ``Trigrams.create_trigrams`` unchanged.

        Returns:
            A ``(language, answer)`` tuple.  ``language`` and the value of
            ``answer['confidence']`` are whatever ``find_best_language``
            returns for the computed scores.  ``answer['stat']`` maps each
            loaded language name to its deviation score.
        """
        self.trigrams.create_trigrams(text)
        self.trigrams.calculate_probabilities()
        result = self.count_deviation()
        language, confidence = find_best_language(self.languages, result)
        answer = {
            'confidence': confidence,
            'stat': dict(zip(self.languages, result)),
        }
        return language, answer

    @staticmethod
    def count_result_in_percents(results, num):
        """Divide every entry of *results* by *num*.

        Despite the name, the values are plain ratios; they are not
        multiplied by 100.  The method reads no instance state, so it is a
        static method and can be called on the class or on an instance.

        Args:
            results: Iterable of numbers (or values accepted by ``float``).
            num: The divisor.

        Returns:
            A new list of floats, one per entry of *results*.

        Raises:
            ZeroDivisionError: if *num* is zero and *results* is not empty.
        """
        divisor = float(num)
        return [float(result) / divisor for result in results]

    def count_deviation(self):
        """Score the current text profile against every loaded model.

        For each trigram in ``self.trigrams.trigrams`` a model contributes
        the absolute difference between its stored value and the text's
        value, or 1 when the model does not contain the trigram.

        Returns:
            A list with one accumulated score per entry in
            ``self.languages``, in the same order.
        """
        result = [0] * len(self.languages)
        observed = self.trigrams.trigrams
        for index, model in enumerate(self.models):
            known = model.trigrams
            for trigram in observed.keys():
                if trigram in known:
                    result[index] += abs(known[trigram] - observed[trigram])
                else:
                    # Unknown trigram: count it as 1 = max. deviation.
                    result[index] += 1
        return result

    def is_results_equal(self, results):
        """Return True when all entries of *results* are equal.

        An empty or single-element sequence counts as equal.
        """
        for current, following in zip(results, results[1:]):
            if current != following:
                return False
        return True
Пример #2
0
if __name__ == "__main__":
    # Collect trigram statistics from the documents named on the command line
    # (glob patterns are expanded) and print one "<trigram>   <value>" line
    # per trigram, UTF-8 encoded, in descending (trigram, value) order.
    myTrigrams = Trigrams()
    if len(sys.argv) > 1:
        for pattern in sys.argv[1:]:
            for path in glob.glob(os.path.normcase(pattern)):
                # Only reading the file is guarded; an unreadable file is
                # reported and skipped instead of being ignored silently.
                try:
                    source = open(path)
                    try:
                        raw = source.read()
                    finally:
                        # Always release the handle, even if read() fails.
                        source.close()
                except IOError:
                    sys.stderr.write("skipping %s: %s\n"
                                     % (path, sys.exc_info()[1]))
                    continue

                encoding = chardet.detect(raw)['encoding']
                if not encoding:
                    # chardet reports no encoding when it cannot make a
                    # guess; decoding with None would raise, so skip the file.
                    sys.stderr.write("skipping %s: unknown encoding\n" % path)
                    continue
                cleaned = clean_punctuation_from_text(raw.decode(encoding))
                myTrigrams.add_trigrams_from_text(cleaned)

        myTrigrams.eliminate_frequences(2)
        myTrigrams.calculate_probabilities()
        pairs = sorted(myTrigrams.trigrams.items(), reverse=True)
        for trigram, value in pairs:
            line = "%s   %s" % (trigram, value)
            print(line.encode('utf-8'))
    else:
        print("Usage:")
        print("python lidtrainer.py [document1] ...")
        print("outputs stats by trigrams")