Exemplo n.º 1
0
    def detect(self, text):
        '''
        Detect the language of ``text`` from its trigrams.

        The lower-cased text is tokenized, the trigrams of every token are
        counted, and every language in ``self.language_trigrams`` is scored
        by summing, over the observed trigrams,
        ``frequencies[trigram] / frequencies.N()`` multiplied by the
        trigram's share of all trigrams found in the text.

        Returns the ``l_map`` entry of the best-scoring language, or None
        when the scores add up to 0.0 (for instance when no trigram could
        be extracted from the text).
        '''
        words = nltk_word_tokenize(text.lower())
        scores = dict((lang, 0) for lang in self.language_trigrams)

        # Count every trigram; dict.get avoids probing a freshly built key
        # list for each trigram.
        trigrams = {}
        for word in words:
            for trigram in self.get_word_trigrams(word):
                trigrams[trigram] = trigrams.get(trigram, 0) + 1

        total = sum(trigrams.values())

        for trigram, count in trigrams.items():
            # Share of this trigram in the text; the same for every language.
            share = float(count) / float(total)
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) /
                                 float(frequencies.N())) * share

        # Special case: if all scores are 0.0 there is nothing to rank.
        # values() works on both Python 2 and 3, unlike itervalues().
        if sum(scores.values()) == 0.0:
            return None

        # max() keeps the first language among ties, like the stable
        # descending sort it replaces.
        best_lang = max(scores.items(), key=lambda item: item[1])[0]
        return l_map[best_lang]
Exemplo n.º 2
0
    def detect(self, text):
        '''
        Detect the language of ``text`` from its trigrams.

        Every token of the lower-cased text contributes its trigrams to a
        count table. Each language in ``self.language_trigrams`` then gets
        a score: the sum, over the counted trigrams, of
        ``frequencies[trigram] / frequencies.N()`` times the trigram's
        share of all trigrams in the text.

        Returns the ``l_map`` entry of the highest-scoring language, or
        None when the scores sum to 0.0 (e.g. no trigram was found).
        '''
        tokens = nltk_word_tokenize(text.lower())
        scores = dict((lang, 0) for lang in self.language_trigrams)

        # Tally trigram occurrences without scanning a key list each time.
        trigram_counts = {}
        for token in tokens:
            for trigram in self.get_word_trigrams(token):
                trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1

        total = sum(trigram_counts.values())

        for trigram, count in trigram_counts.items():
            # Relative weight of this trigram within the text.
            weight = float(count) / float(total)
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                normalized = float(frequencies[trigram]) / float(frequencies.N())
                scores[lang] += normalized * weight

        # Special case: all scores are 0.0, so no language can be chosen.
        # values() is portable across Python 2 and 3; itervalues() is not.
        if sum(scores.values()) == 0.0:
            return None

        # First maximum wins, matching the stable descending sort used before.
        winner, _ = max(scores.items(), key=lambda item: item[1])
        return l_map[winner]
 def preprocess_text_nltk(self, text):
     """Split ``text`` into sentences and return each one cleaned.

     For every sentence produced by ``self.sent_tokenize``: optionally run
     it through ``self.stopword_remover`` (when ``self.stopwords_remove``
     is truthy), tokenize it with ``nltk_word_tokenize`` for
     ``self.language``, drop punctuation tokens and the tokens listed in
     ``self.extra_stopwords``, lower-case what is left and join it with
     single spaces.

     Returns:
         list of str: one cleaned string per sentence, in input order.
     """
     sentences_cleaned = []
     for sent in self.sent_tokenize(text):
         if self.stopwords_remove:
             # Strings are immutable: the replaced text is the return
             # value, which was previously thrown away.
             sent = self.stopword_remover.replace_keywords(sent)
         words = nltk_word_tokenize(sent, self.language)
         # ``in string.punctuation`` is a substring test, so it also drops
         # multi-character runs that occur inside that constant.
         words = [
             w.lower() for w in words
             if w not in string.punctuation and w not in self.extra_stopwords
         ]
         sentences_cleaned.append(" ".join(words))
     return sentences_cleaned
Exemplo n.º 4
0
def word_tokenize(sentence, pt_tokenizer=True):
    """Tokenize ``sentence`` into words.

    Args:
        sentence: The text to split into tokens.
        pt_tokenizer: When truthy (the default) the sentence is handed to
            ``nltk_word_tokenize``; otherwise to ``nltk_WP_tokenize``.

    Returns:
        Whatever the selected tokenizer returns for ``sentence``.

    NOTE (carried over from the original author): the default tokenizer
    does not work well for contractions such as "can't".
    """
    tokenizer = nltk_word_tokenize if pt_tokenizer else nltk_WP_tokenize
    return tokenizer(sentence)
def word_tokenize(sentence, tokenizer="nltk"):
    """
    Tokenize the input string.

    Args:
        sentence (string):      The input string
        tokenizer (string):     The tokenizer to use. ``"nltk"`` (the default)
                                selects the nltk word tokenizer; any other
                                value splits on ``SENTENCE_SPLIT_REGEX``.

    Returns:
        List[string]: The tokens from the input string
    """
    # Exact comparison: ``tokenizer in "nltk"`` was a substring test, so
    # values such as "" or "n" silently selected the nltk tokenizer.
    if tokenizer == "nltk":
        return nltk_word_tokenize(sentence)
    return re.split(SENTENCE_SPLIT_REGEX, sentence)
Exemplo n.º 6
0
    def detect(self, text):
        '''
        Detect the text's language while holding the instance mutex.

        The text is NFC-normalized, lower-cased and tokenized; the trigrams
        of all tokens are counted and every language in
        ``self.language_trigrams`` is scored by summing
        ``frequencies[trigram] / frequencies.N()`` times the trigram's
        share of all trigrams in the text.

        Returns the key of the highest-scoring language whose score is
        above 0.0001, or None when no language reaches that threshold.

        Raises ValueError when ``text`` is empty. A ZeroDivisionError
        raised while scoring is logged and re-raised.
        '''
        # Acquire before entering the try block so the finally clause never
        # releases a lock that was not obtained.
        self.__mutex.acquire()
        try:
            if not text:
                raise ValueError(u"Text: " + unicode(text))
            text = unicodedata.normalize("NFC", text)
            words = nltk_word_tokenize(text.lower())
            scores = dict((lang, 0) for lang in self.language_trigrams)

            # Count every trigram of every token.
            trigrams = {}
            for word in words:
                for trigram in self.__get_word_trigrams(word):
                    trigrams[trigram] = trigrams.get(trigram, 0) + 1

            total = sum(trigrams.values())

            for trigram, count in trigrams.items():
                for lang, frequencies in self.language_trigrams.items():
                    # normalize and add to the total score
                    try:
                        scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * (float(count) / float(total))
                    except ZeroDivisionError:
                        logger.error(u"Div: " + unicode(float(frequencies.N())) + u" " + unicode(float(total)))
                        # Bare raise keeps the original traceback.
                        raise

            # Best score first; only scores above the noise threshold count.
            ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
            for lang, score in ranked:
                if score > 0.0001:
                    return lang
            return None
        finally:
            self.__mutex.release()
Exemplo n.º 7
0
def preprocess_text_nltk(text):
    """Split ``text`` into sentences and return each one cleaned.

    Each sentence from ``sent_tokenize`` is passed through a flashtext
    ``KeywordProcessor`` loaded with the English stop words, tokenized with
    ``nltk_word_tokenize``, stripped of punctuation tokens and of the extra
    tokens ``''``, `` `` `` and ``'s``, lower-cased and re-joined with
    single spaces.

    Returns:
        list of str: one cleaned string per sentence, in input order.
    """
    stopwords_remove = True
    language = 'english'
    extra_stopwords = ["''", "``", "'s"]

    stopword_remover = None
    if stopwords_remove:
        stopword_remover = flashtext.KeywordProcessor()
        # NOTE(review): an empty replacement is registered for each stop
        # word; confirm that the installed flashtext version treats '' as
        # "delete" rather than falling back to the keyword itself.
        for stopword in stopwords.words(language):
            stopword_remover.add_keyword(stopword, '')

    sentences_cleaned = []
    for sent in sent_tokenize(text):
        if stopwords_remove:
            # Strings are immutable: the processed text is the return
            # value, which was previously discarded.
            sent = stopword_remover.replace_keywords(sent)
        words = nltk_word_tokenize(sent, language)
        # ``in string.punctuation`` is a substring test, so it also drops
        # multi-character runs that occur inside that constant.
        words = [
            w.lower() for w in words
            if w not in string.punctuation and w not in extra_stopwords
        ]
        sentences_cleaned.append(" ".join(words))
    return sentences_cleaned
Exemplo n.º 8
0
		def detect(self, text):
			"""Score the languages in ``self.language_dicts`` against ``text``.

			Trigrams of the lower-cased, tokenized text are counted with a
			pandas group-by. For every counted trigram, each language whose
			dictionary contains it gains ``frequencies[trigram]`` times the
			trigram's share of all trigrams in the text. The best language
			seen so far is tracked in ``maxid`` / ``maxscore``.

			NOTE(review): the visible code ends without a return statement,
			so the method returns None as written; presumably the original
			went on to return ``maxid`` -- confirm against the full source.
			"""
			words = nltk_word_tokenize(text.lower())
			scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

			trigcount = [(trigram, 1.0) for match in words for trigram in self.get_word_trigrams(match)]
			if trigcount:
				trigdf = pandas.DataFrame(trigcount, columns=["key", "value"])
				trigrams = trigdf.groupby("key")["value"].sum().to_dict()
			else:
				trigrams = {}

			total = sum(trigrams.values())
			maxscore, maxid = 0, ""
			for trigram, count in trigrams.items():
				trishare = float(count) / float(total)
				# Plain membership check instead of filter() with a
				# tuple-unpacking lambda, which is a syntax error on Python 3.
				for lang, frequencies in self.language_dicts.items():
					if trigram not in frequencies:
						continue
					scores[lang] += frequencies[trigram] * trishare
					if scores[lang] > maxscore:
						maxid, maxscore = lang, scores[lang]
Exemplo n.º 9
0
	def mapper(self, key, tweet):
		'''
		Tag a serialized tweet with its detected language.

		``tweet`` is decoded with ``cjson.decode``; the text under its
		'tx' key is lower-cased, tokenized and broken into trigrams. Each
		language in ``self.language_trigrams`` is scored by summing
		``frequencies[trigram] / frequencies.N()`` times the trigram's
		share of all trigrams in the text.

		Yields a single ``(key, obj)`` pair where ``obj`` is the decoded
		tweet with the best-scoring language stored under 'lang'.
		'''
		obj = cjson.decode(tweet)
		text = obj['tx']
		words = nltk_word_tokenize(text.lower())
		scores = dict((lang, 0) for lang in self.language_trigrams)

		# Count every trigram; dict.get avoids probing a key list each time.
		trigrams = {}
		for word in words:
			for trigram in self.get_word_trigrams(word):
				trigrams[trigram] = trigrams.get(trigram, 0) + 1
		total = sum(trigrams.values())

		for trigram, count in trigrams.items():
			# Share of this trigram in the text; the same for every language.
			share = float(count) / float(total)
			for lang, frequencies in self.language_trigrams.items():
				# normalize and add to the total score
				scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * share

		# Highest score first; the stable sort keeps the first language on ties.
		ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
		obj['lang'] = ranked[0][0]
		yield key, obj
Exemplo n.º 10
0
    def detect(self, text):
        '''
        Return the key of the language that best matches ``text``.

        The lower-cased text is tokenized and its trigrams are counted.
        Every language in ``self.language_trigrams`` is scored by summing
        ``frequencies[trigram] / frequencies.N()`` times the trigram's
        share of all trigrams in the text, and the language with the
        highest score is returned.

        When no trigram is found all scores stay 0 and the first language
        in iteration order is returned. Raises IndexError when
        ``self.language_trigrams`` is empty.
        '''
        # Tokenize the words.
        words = nltk_word_tokenize(text.lower())
        scores = dict((lang, 0) for lang in self.language_trigrams)

        # Count how often each trigram occurs across all tokens.
        trigrams = {}
        for word in words:
            for trigram in self.get_word_trigrams(word):
                trigrams[trigram] = trigrams.get(trigram, 0) + 1

        total = sum(trigrams.values())

        # Normalize the frequencies and accumulate them per language.
        for trigram, count in trigrams.items():
            # Share of this trigram in the text; the same for every language.
            share = float(count) / float(total)
            for lang, frequencies in self.language_trigrams.items():
                scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * share

        # Highest score first; the stable sort keeps the first language on ties.
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return ranked[0][0]
Exemplo n.º 11
0
    def mapper(self, key, tweet):
        '''
        Tag a serialized tweet with its detected language.

        ``tweet`` is decoded with ``cjson.decode``; the text under its
        'tx' key is lower-cased, tokenized and broken into trigrams. Each
        language in ``self.language_trigrams`` is scored by summing
        ``frequencies[trigram] / frequencies.N()`` times the trigram's
        share of all trigrams in the text.

        Yields a single ``(key, obj)`` pair where ``obj`` is the decoded
        tweet with the best-scoring language stored under 'lang'.
        '''
        obj = cjson.decode(tweet)
        text = obj['tx']
        tokens = nltk_word_tokenize(text.lower())
        scores = dict((lang, 0) for lang in self.language_trigrams)

        # Tally trigram occurrences without scanning a key list each time.
        trigram_counts = {}
        for token in tokens:
            for trigram in self.get_word_trigrams(token):
                trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1
        total = sum(trigram_counts.values())

        for trigram, count in trigram_counts.items():
            # Relative weight of this trigram within the text.
            weight = float(count) / float(total)
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                normalized = float(frequencies[trigram]) / float(frequencies.N())
                scores[lang] += normalized * weight

        # Highest score first; the stable sort keeps the first language on ties.
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        obj['lang'] = ranked[0][0]
        yield key, obj