def __init__(self, corrections=None, spoken=None, target=None):
    self.train_path, self.dev_path, self.test_path = filepaths()
    self.trainset, self.devset, self.testset = returnDatasets()
    self.languages_known = Counter()
    self.languages_learning = Counter()
    self.languages = languages()
    self.ch = CorrectionHelper(corrections, spoken, target)
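# A minimal usage sketch. The enclosing class name is not shown above, so
# "EntryAnalyzer" is a hypothetical stand-in; the Counter attributes behave
# as in the standard library's collections module:
#
#   analyzer = EntryAnalyzer(corrections=corrs, spoken='Japanese', target='English')
#   analyzer.languages_known.update(['English', 'English', 'French'])
#   analyzer.languages_known.most_common(1)  # [('English', 2)]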
def __init__(self):
    self.train, self.dev, self.test = returnDatasets()
    # Column indices into each processed data row.
    self.SPEAKING = 0
    self.STUDYING = 1
    self.ENTRY = 2
    self.INCORRECT = 3
    self.CORRECTIONS = 4
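# How the index constants are used: a sketch with a hypothetical row laid
# out as [speaking, studying, entry, incorrect, corrections], matching the
# data[sp.SPEAKING] access pattern in the main block further below:
#
#   sp = SetProcessing()
#   row = ['English', 'Japanese', 'I am student.', 'I am student.',
#          ['I am a student.']]
#   row[sp.SPEAKING]     # 'English'
#   row[sp.CORRECTIONS]  # ['I am a student.']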
def fit(self, x, y=None):
    return self

def transform(self, pairs):
    features = np.recarray(shape=(len(pairs),),
                           dtype=[('spoken', object), ('edit_dist', object)])
    for i, pair in enumerate(pairs):
        spoken, count = pair
        features['spoken'][i] = spoken
        features['edit_dist'][i] = count
    return features


if __name__ == '__main__':
    train, dev, test = returnDatasets()

    '''Set up the classes.'''
    #gfe = GiveawayFeatureExtraction()
    #sfe = SocialFeatureExtraction()
    sse = SyntacticStructExtraction()
    ce = CorrectionExtraction()
    sp = SetProcessing()

    datalist = sp.convertDataToList(train)
    #dev = sp.convertDataToList(dev)
    #test = sp.convertDataToList(test)
    #merged = sp.mergeLists(train, dev, test)
    #english, french, spanish, japanese, korean, mandarin = sp.returnSplitDatasets(train, 5, False)

    '''Return the individual sets by native language.'''
    '''Takes approx. 1 second.'''
    print("Collecting test sets...")
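# Why transform() returns a recarray with named fields: downstream code can
# pull out one field at a time, e.g. inside a scikit-learn FeatureUnion.
# A minimal sketch; this ItemSelector helper is an assumption (it is not
# defined in the code above), modeled on scikit-learn's standard
# heterogeneous-data pattern:

from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    '''Selects a single named field from a recarray like the one above.'''

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data):
        return data[self.key]

# Usage: ItemSelector('edit_dist').fit_transform(features) yields just the
# edit-distance column, ready for its own vectorizer in a FeatureUnion.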
        # (continuation of bagOfWords: the enclosing loop presumably collects
        # each row's text into `entries` and its native language into `langs`)
        langs.append(data[sp.SPEAKING])
    print(langs)

    vect = CountVectorizer()
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)
    X_train_tfidf = X_train_tfidf.toarray()

    # A linear classifier trained with stochastic gradient descent.
    clf = SGDClassifier()
    clf.fit(X_train_tfidf, langs)
    # Note: predictions are made on the training data itself, so this is a
    # training-set accuracy, not a held-out estimate.
    result = clf.predict(X_train_tfidf)
    print(np.mean(result == langs))
    # target_names must name each class exactly once, in sorted order.
    print(metrics.classification_report(langs, result,
                                        target_names=sorted(set(langs))))

def findMeanWordVector(words, model, num_feats):
    # Finds the average of all the word vectors.
    featureVec = np.zeros((num_feats,), dtype="float32")
    num_words = 0
    index2word_set = set(model.index2word)
    for word in words:
        # Only count words that are in the model's vocabulary.
        if word in index2word_set:
            num_words = num_words + 1
            featureVec = np.add(featureVec, model[word])
    # Guard against an empty vocabulary overlap to avoid dividing by zero.
    if num_words > 0:
        featureVec = np.divide(featureVec, num_words)
    return featureVec


if __name__ == '__main__':
    train, dev, test = returnDatasets()
    bagOfWords(dev[:2])
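# A minimal usage sketch for findMeanWordVector, assuming an older gensim
# Word2Vec model -- one that still exposes model.index2word and model[word]
# (newer gensim versions moved both under model.wv):
#
#   from gensim.models import Word2Vec
#   sentences = [['i', 'am', 'a', 'student'], ['she', 'is', 'a', 'teacher']]
#   model = Word2Vec(sentences, size=50, min_count=1)
#   vec = findMeanWordVector(['i', 'am', 'new'], model, 50)  # 'new' is skipped
#   vec.shape  # (50,)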