Example #1
    def __init__(self, corrections=None, spoken=None, target=None):
        # Load the dataset paths and splits, then set up counters for the
        # languages users already know and the ones they are learning.
        self.train_path, self.dev_path, self.test_path = filepaths()
        self.trainset, self.devset, self.testset = returnDatasets()
        self.languages_known = Counter()
        self.languages_learning = Counter()
        self.languages = languages()
        self.ch = CorrectionHelper(corrections, spoken, target)
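
The two counter fields are plain collections.Counter tallies; a standalone illustration of the behavior they rely on (the language values here are made up):

from collections import Counter

languages_known = Counter()
languages_known.update(['English', 'French', 'English'])
print(languages_known.most_common(1))  # [('English', 2)]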
Example #2
    def __init__(self):
        self.train, self.dev, self.test = returnDatasets()
        # Column indices into each dataset record.
        self.SPEAKING = 0
        self.STUDYING = 1
        self.ENTRY = 2
        self.INCORRECT = 3
        self.CORRECTIONS = 4
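
These constants act as column indices into a dataset record; since the later code accesses them through a SetProcessing instance, a minimal sketch of the pattern (the record contents are hypothetical, not taken from the dataset):

sp = SetProcessing()
record = ['English', 'Japanese', 'I studied kanji today.', 1, 0]  # hypothetical record
print(record[sp.SPEAKING])  # native language -> 'English'
print(record[sp.STUDYING])  # language being studied -> 'Japanese'
print(record[sp.ENTRY])     # the journal entry text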
Example #3
import numpy as np

    def fit(self, x, y=None):
        # Stateless transformer: there is nothing to fit.
        return self

    def transform(self, pairs):
        # Pack each (spoken, edit-distance) pair into a record array with
        # one named field per feature.
        features = np.recarray(shape=(len(pairs),),
                               dtype=[('spoken', object),
                                      ('edit_dist', object)])
        for i, pair in enumerate(pairs):
            spoken, count = pair
            features['spoken'][i] = spoken
            features['edit_dist'][i] = count
        return features
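
The named fields let downstream code (for example, scikit-learn FeatureUnion branches) pick out one feature at a time. A minimal, self-contained sketch of such a selector; FieldSelector and the sample records are illustrative assumptions, not part of this codebase:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class FieldSelector(BaseEstimator, TransformerMixin):
    # Picks a single named field out of a record array.
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, records):
        return records[self.key]

records = np.recarray(shape=(2,), dtype=[('spoken', object), ('edit_dist', object)])
records['spoken'] = ['Japanese', 'French']
records['edit_dist'] = [3, 1]
print(FieldSelector('spoken').transform(records))  # ['Japanese' 'French']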


if __name__ == '__main__':
    train, dev, test = returnDatasets()
    # Set up the classes.
    #gfe = GiveawayFeatureExtraction()
    #sfe = SocialFeatureExtraction()
    sse = SyntacticStructExtraction()
    ce = CorrectionExtraction()
    sp = SetProcessing()

    datalist = sp.convertDataToList(train)
    #dev = sp.convertDataToList(dev)
    #test = sp.convertDataToList(test)
    #merged = sp.mergeLists(train, dev, test)
    #english, french, spanish, japanese, korean, mandarin = sp.returnSplitDatasets(train, 5, False)
    # Return the individual sets by native language; takes approx. 1 second.
    print("Collecting test sets...")
Example #4
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier

def bagOfWords(dataset):
    # Reconstructed opening (the excerpt begins mid-loop): convert the raw
    # records, then collect each entry's text and native-language label.
    sp = SetProcessing()
    entries = []
    langs = []
    for data in sp.convertDataToList(dataset):
        entries.append(data[sp.ENTRY])
        langs.append(data[sp.SPEAKING])

    print(langs)

    vect = CountVectorizer()
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)
    X_train_tfidf = X_train_tfidf.toarray()

    # Despite the name, this is a linear classifier trained with SGD.
    tree = SGDClassifier()
    tree.fit(X_train_tfidf, langs)
    result = tree.predict(X_train_tfidf)  # predicting on the training data itself
    print(np.mean(result == langs))
    # Passing the full label list as target_names would raise an error, since
    # its length must match the number of classes; the report can derive the
    # class names from the labels themselves.
    print(metrics.classification_report(langs, result))
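    # A fairer score would come from the held-out dev split rather than the
    # training data; a sketch, with hypothetical dev_entries/dev_langs lists
    # built the same way as entries/langs above:
    # X_dev = tfidf.transform(vect.transform(dev_entries)).toarray()
    # print(np.mean(tree.predict(X_dev) == np.array(dev_langs)))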

def findMeanWordVector(words, model, num_feats):
    # Averages the vectors of all words that appear in the model vocabulary
    # (pre-4.0 gensim API: model.index2word and model[word]).
    featureVec = np.zeros((num_feats,), dtype="float32")
    num_words = 0
    index2word_set = set(model.index2word)
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec, model[word])
    if num_words > 0:  # avoid dividing by zero when no words are in vocabulary
        featureVec = np.divide(featureVec, num_words)
    return featureVec
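
A minimal usage sketch with a toy gensim word2vec model (pre-4.0 gensim API assumed, to match model.index2word above; the sentences are made up):

from gensim.models import Word2Vec

sentences = [['i', 'studied', 'kanji'], ['je', 'parle', 'francais']]
model = Word2Vec(sentences, size=10, min_count=1)  # 'size' became 'vector_size' in gensim 4.0
vec = findMeanWordVector(['studied', 'kanji'], model, 10)
print(vec.shape)  # -> (10,)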

if __name__ == '__main__':
    train, dev, test = returnDatasets()
    bagOfWords(dev[:2])