示例#1
0
def featureExtractor(spoken, target, words, sentences, return_feats=False):
    """Build binary "other language" features and fit a DictVectorizer on them.

    One feature named ``other_langs(<prefix>)`` is created for every key of
    the mapping returned by ``makePrefixLangMapping()`` and initialised to 0.
    Every prefix returned by ``scanForMultipleLanguages(target, words)`` then
    has its feature set to 1.

    Args:
        spoken: Unused by this function; kept for interface compatibility.
        target: Forwarded to ``scanForMultipleLanguages``.
        words: Forwarded to ``scanForMultipleLanguages``.
        sentences: Unused by this function; kept for interface compatibility.
        return_feats: When true, also return the raw feature dict.

    Returns:
        The fitted ``DictVectorizer``, or ``(feats, vec)`` when
        ``return_feats`` is true.
    """
    prefmap = makePrefixLangMapping()
    prefixes = scanForMultipleLanguages(target, words)

    feats = {"other_langs({0})".format(pref): 0 for pref in prefmap}
    for prefix in prefixes:
        # The key must match the format used for the defaults above
        # (including the closing parenthesis) so the 0 is overwritten rather
        # than a second, malformed feature name being added.
        feats["other_langs({0})".format(prefix)] = 1

    vec = DictVectorizer()
    # Fitting is the only effect callers rely on; the transformed matrix
    # itself was never used, so it is not densified or kept.
    vec.fit_transform(feats)

    if return_feats:
        return feats, vec
    return vec
示例#2
0
def featureExtractor(spoken, target, words, sentences, return_feats=False):
    """Build binary "other language" features and fit a DictVectorizer on them.

    Every key of the mapping returned by ``makePrefixLangMapping()`` gets a
    feature ``other_langs(<prefix>)`` with value 0; the prefixes reported by
    ``scanForMultipleLanguages(target, words)`` are then flagged with 1.

    Args:
        spoken: Unused by this function; kept for interface compatibility.
        target: Forwarded to ``scanForMultipleLanguages``.
        words: Forwarded to ``scanForMultipleLanguages``.
        sentences: Unused by this function; kept for interface compatibility.
        return_feats: When true, also return the raw feature dict.

    Returns:
        The fitted ``DictVectorizer``, or ``(feats, vec)`` when
        ``return_feats`` is true.
    """
    key_template = "other_langs({0})"

    feats = {}
    for pref in makePrefixLangMapping():
        feats[key_template.format(pref)] = 0
    for prefix in scanForMultipleLanguages(target, words):
        # Use the same template as the defaults: the previous key lacked the
        # closing parenthesis, so detected prefixes never replaced the 0
        # entries and showed up as separate malformed features instead.
        feats[key_template.format(prefix)] = 1

    vec = DictVectorizer()
    # Only the fitted vectorizer is returned; the transformed matrix was
    # previously densified and discarded, so it is not kept here.
    vec.fit_transform(feats)

    if return_feats:
        return feats, vec
    return vec
示例#3
0
	def returnEntryVersusTarget(self, datalist):
		'''Count and print how many entries are written in a language other than
		the ones the user listed as studying.

		Some users write in a language that is different from their target language
		(i.e. if they are practicing a language that they didn't specify that they were
		learning, or if they are writing an entry in their native language asking someone
		to translate something for them).

		Each record in datalist is indexed with self.ENTRY (the entry text) and
		self.STUDYING (a whitespace-separated string of studied languages). The
		detected language of the entry is translated through makePrefixLangMapping()
		and compared against the studied languages.

		Entries whose detected language is missing from the mapping, or that list
		no studied language at all, are not counted. Nothing is returned; the
		elapsed time and the count are printed.'''
		t0 = time()
		prefmap = makePrefixLangMapping()
		not_orig_lang = 0
		for data in datalist:
			# NOTE(review): detect_language() is reportedly deprecated in newer
			# TextBlob releases -- confirm against the version this project pins.
			entrylang = TextBlob(data[self.ENTRY]).detect_language()
			# A detected code the mapping does not know cannot be compared.
			if entrylang not in prefmap:
				continue
			studied = data[self.STUDYING].split()
			# Count each entry at most once, and only when it matches NONE of
			# the studied languages. Counting per studied language inflated the
			# total for users studying several languages.
			if studied and prefmap[entrylang] not in studied:
				not_orig_lang += 1
		print("Took %s seconds" % (time() - t0))
		print("Of %s entries, there are %s entries written in a different language than specified" %
			(len(datalist), not_orig_lang))
示例#4
0
    def returnEntryVersusTarget(self, datalist):
        '''Count and print how many entries are written in a language other than
        the ones the user listed as studying.

        Some users write in a language that is different from their target language
        (i.e. if they are practicing a language that they didn't specify that they were
        learning, or if they are writing an entry in their native language asking someone
        to translate something for them).

        Each record in datalist is indexed with self.ENTRY (the entry text) and
        self.STUDYING (a whitespace-separated string of studied languages). The
        detected language of the entry is translated through makePrefixLangMapping()
        and compared against the studied languages.

        Entries whose detected language is missing from the mapping, or that list
        no studied language at all, are not counted. Nothing is returned; the
        elapsed time and the count are printed.'''
        t0 = time()
        prefmap = makePrefixLangMapping()
        not_orig_lang = 0
        for data in datalist:
            # NOTE(review): detect_language() is reportedly deprecated in newer
            # TextBlob releases -- confirm against the version this project pins.
            entrylang = TextBlob(data[self.ENTRY]).detect_language()
            # A detected code the mapping does not know cannot be compared.
            if entrylang not in prefmap:
                continue
            studied = data[self.STUDYING].split()
            # Count each entry at most once, and only when it matches NONE of
            # the studied languages. Counting per studied language inflated the
            # total for users studying several languages.
            if studied and prefmap[entrylang] not in studied:
                not_orig_lang += 1
        print("Took %s seconds" % (time() - t0))
        print(
            "Of %s entries, there are %s entries written in a different language than specified"
            % (len(datalist), not_orig_lang))