def load_data(dirFolder, testRatio, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}  # index -> class name
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        splitPoint = int(testRatio * len(documents))
        trainDocs, testDocs = documents[splitPoint:], documents[:splitPoint]
        allDocs.append([trainDocs, testDocs])
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), trainDocs, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X_train, Y_train = [], []
    X_test, Y_test = [], []
    for i, dclass in enumerate(classes):
        for j in range(len(allDocs[i])):
            for doc in allDocs[i][j]:
                processedFile = preprocess.readFile(
                    os.path.join(dirFolder, dclass, doc))
                words = Counter(processedFile)
                features = [words.get(w, 0) for w in vocabulary]
                if j == 0:  # index 0 holds the training split
                    X_train.append(features)
                    Y_train.append(i)
                else:
                    X_test.append(features)
                    Y_test.append(i)
    return (np.stack(X_train), Y_train), (np.stack(X_test), Y_test)
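# A minimal usage sketch for load_data (hypothetical path and ratio):
# dirFolder is expected to contain one sub-directory per class, each
# holding that class's documents.
if __name__ == '__main__':
    (X_train, Y_train), (X_test, Y_test) = load_data('data', testRatio=0.2)
    # One bag-of-words row per document over the selected vocabulary
    print(X_train.shape, len(Y_train), X_test.shape, len(Y_test))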
def evaluateMEMM(train, test):
    """
    Evaluates the MEMM model on a given validation set; returns the
    number of correctly predicted tags and the total number of tags.
    """
    model = MEMM(train)
    lines = preprocess.readFile(test)
    lineNum = 1
    correct = 0
    total = 0
    for line in lines:
        if (lineNum % 3) == 1:    # line with tokens
            sentence = line
        elif (lineNum % 3) == 2:  # second line, passed to the tagger
            tags = model.assignTags(sentence, line)
            addBio(tags)
        elif (lineNum % 3) == 0:  # line with gold tags
            answers = line.strip().split()
            # Following line just for testing
            # assert len(tags) == len(answers)
            for i in range(len(tags)):
                if tags[i] == answers[i]:
                    correct += 1
                total += 1
        lineNum += 1
    return correct, total
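# Hypothetical driver for evaluateMEMM ('train.txt' and 'val.txt' are
# placeholder paths): the function returns raw counts, so the caller
# computes the accuracy itself.
#
#   correct, total = evaluateMEMM('train.txt', 'val.txt')
#   print('MEMM validation accuracy: %.4f' % (correct / float(total)))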
def load_data_word2vec(dirFolder, featureKeepRatio=1.0):
    model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(
        './GoogleNews-vectors-negative300.bin', binary=True)
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}  # index -> class name
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X = []

    def getVector(word):
        # Zero vector for words missing from the pretrained embeddings
        try:
            return model[word]
        except KeyError:
            return np.zeros((300,))

    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(os.path.join(dirFolder, dclass, doc))
            words = set(processedFile)
            # Embed the vocabulary words that occur in this document; the
            # original embedded the full vocabulary for every document,
            # which produced identical feature rows for all documents.
            features = [getVector(w) if w in words else np.zeros((300,))
                        for w in vocabulary]
            X.append(features)
    return np.stack(X)
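# Shape sketch for load_data_word2vec (hypothetical call, 'data' is a
# placeholder path): each document contributes one 300-dimensional
# pretrained vector per vocabulary word, so the stacked result is
# (numDocs, |vocabulary|, 300).
#
#   X = load_data_word2vec('data')
#   print(X.shape)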
def scoreBaseLine(self, filepath):
    """
    Grades the model's accuracy on the test file at [filepath]; returns
    the number of correctly predicted tags and the total number of tags.
    """
    lines = preprocess.readFile(filepath)
    lineNum = 1
    correct = 0
    total = 0
    for line in lines:
        if (lineNum % 3) == 1:    # line with tokens
            tokens = line.strip().split()
            tags = self.assignTags(tokens)
        elif (lineNum % 3) == 0:  # line with gold tags
            answers = line.strip().split()
            # Following section just for testing
            if len(tags) != len(answers):
                print("Lengths don't match!\n")
                print(tags)
                print(answers)
                break
            for i in range(len(tags)):
                if tags[i] == answers[i]:
                    correct += 1
                total += 1
        lineNum += 1
    return correct, total
def MEMMClassify(train, test):
    """
    Returns the tagging prediction of the MEMM system as a dictionary
    mapping each entity class to a list of "first-last" index spans.
    """
    model = MEMM.MEMM(train)
    lines = preprocess.readFile(test)
    prediction = {'PER': [], 'LOC': [], 'ORG': [], 'MISC': []}
    lineNum = 1
    for line in lines:
        if (lineNum % 3) == 1:    # line with tokens
            sentence = line
        elif (lineNum % 3) == 2:
            tags = model.assignTags(sentence, line)
        else:                     # line with indexes
            indexes = line.strip().split()
            preClass = None
            firstIdx = None
            lastIdx = None
            NEcontinues = False
            for i in range(len(tags)):
                tag = tags[i]
                if tag == 'O':
                    if NEcontinues:
                        # Previous entity ends
                        prediction[preClass].append(firstIdx + '-' + lastIdx)
                        preClass = None
                        firstIdx = None
                        lastIdx = None
                        NEcontinues = False
                else:
                    if NEcontinues:
                        if tag != preClass:
                            # Previous entity ends, a new one begins
                            prediction[preClass].append(firstIdx + '-' + lastIdx)
                            preClass = tag
                            firstIdx = indexes[i]
                            lastIdx = indexes[i]
                        else:
                            # Previous entity continues
                            lastIdx = indexes[i]
                    else:
                        # New entity begins
                        preClass = tag
                        firstIdx = indexes[i]
                        lastIdx = indexes[i]
                        NEcontinues = True
            if NEcontinues:
                # Flush an entity that runs to the end of the sentence;
                # the original dropped such trailing spans.
                prediction[preClass].append(firstIdx + '-' + lastIdx)
        lineNum += 1
    return prediction
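# Illustrative shape of the returned prediction (values are made up):
# each entity class maps to "firstIndex-lastIndex" spans, e.g.
# {'PER': ['0-1', '17-17'], 'LOC': ['5-5'], 'ORG': [], 'MISC': ['9-10']}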
def construct_index(self, directory):
    documents = []
    documentNames = []
    for root, dirs, files in os.walk(directory):
        for file in tqdm(files):
            dataRecovered = preprocess.readFile(os.path.join(root, file))
            if dataRecovered:
                documentNames.append(os.path.join(root, file))
                documents.append(dataRecovered)
    self.allDocuments = set(documentNames)
    self.construct(documents, documentNames)
def preprocessData(self):
    for doc in self.docs:
        processedFile = preprocess.readFile(os.path.join(self.baseDir, doc))
        words = Counter(processedFile)
        self.vocabulary = list(set(self.vocabulary) | set(words.keys()))
        self.documents[doc] = words
        # Document frequency: each unique word counts once per document
        for uniqueWord in words.keys():
            self.wordFrequencies[uniqueWord] = 1 + self.wordFrequencies.get(uniqueWord, 0)
        self.N += 1
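# The statistics collected above (per-document counts, document
# frequencies, and N) are exactly what a classic tf-idf score needs.
# A minimal sketch, assuming selectWords uses the standard formulation
# (the actual scoring in this class may differ):
#
#   import math
#
#   def tfidfScore(self, word, doc):
#       tf = self.documents[doc].get(word, 0)
#       idf = math.log(self.N / float(self.wordFrequencies[word]))
#       return tf * idf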
def baselineClassify(train, test):
    """
    Returns the tagging prediction of the baseline system as a dictionary
    mapping each entity class to a list of "first-last" index spans.
    """
    model = baseline.Baseline(train)
    lines = preprocess.readFile(test)
    prediction = {'PER': [], 'LOC': [], 'ORG': [], 'MISC': []}
    lineNum = 1
    for line in lines:
        if (lineNum % 3) == 1:    # line with tokens
            tokens = line.strip().split()
            tags = BLdebug(model.assignTags(tokens))
        elif (lineNum % 3) == 0:  # line with indexes
            indexes = line.strip().split()
            preClass = None
            firstIdx = None
            lastIdx = None
            NEcontinues = False
            for i in range(len(tags)):
                bioTag = tags[i][:1]
                if bioTag == 'B':
                    if NEcontinues:
                        # Previous entity ends
                        prediction[preClass].append(firstIdx + '-' + lastIdx)
                    preClass = tags[i][2:]
                    firstIdx = indexes[i]
                    lastIdx = indexes[i]
                    NEcontinues = True
                elif bioTag == 'I':
                    curClass = tags[i][2:]
                    assert NEcontinues and curClass == preClass
                    lastIdx = indexes[i]
                else:  # bioTag == 'O'
                    if NEcontinues:
                        prediction[preClass].append(firstIdx + '-' + lastIdx)
                        preClass = None
                        firstIdx = None
                        lastIdx = None
                        NEcontinues = False
            if NEcontinues:
                # Flush an entity that runs to the end of the sentence;
                # the original dropped such trailing spans.
                prediction[preClass].append(firstIdx + '-' + lastIdx)
        lineNum += 1
    return prediction
def buildDB():
    # Clean the data file at private/filename
    noisy_file = os.path.join(request.folder, 'private', '027__BTECH__6TH SEM.txt')
    clean_file = os.path.join(request.folder, 'private', 'cleanFile.txt')
    preprocess.cleanData(noisy_file, clean_file)
    # Extract college information
    lines = preprocess.readFile(clean_file)
    colleges = extract.extract_data(lines)
    # Insert into table colleges
    for name in colleges.keys():
        db.colleges.insert(name=name)
    # Insert into table students
    credits = (4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1)
    total_credits = sum(credits)
    for name in colleges.keys():
        for student in colleges[name].students:
            # Credit-weighted percentage across the 11 subjects
            weighted = sum(c * int(m) for c, m in zip(credits, student.marks))
            percentage = (weighted * 1.0) / total_credits
            if percentage < 50.0:
                continue
            collegeid = db(db.colleges.name == name).select()[0]['id']
            db.students.insert(colleges_id=collegeid,
                               rollNo=student.rollNo[1:],
                               name=student.name,
                               percentage=percentage,
                               **{'subj%d' % (k + 1): int(student.marks[k])
                                  for k in range(11)})
    return 'db built'
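# Worked example of the credit-weighted percentage (illustrative marks):
# with six 4-credit subjects at 70 and five 1-credit subjects at 80,
# percentage = (6*4*70 + 5*1*80) / 29 = 2080 / 29 ≈ 71.72, so the
# student clears the 50.0 cutoff and is inserted.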
def load_data(dirFolder, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X = []
    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(os.path.join(dirFolder, dclass, doc))
            words = Counter(processedFile)
            features = [words.get(w, 0) for w in vocabulary]
            X.append(features)
    return np.stack(X)
import preprocess as pp
import decisionTree as dt
import numpy as np
import random

random.seed()

file = 'data.csv'
rows = pp.readFile(file=file)
X_train, Y_train, X_test, Y_test = pp.splitData(rows)

# Train a decision tree and report the per-sample test predictions
tree = dt.DecisionTree(X_train, Y_train, 9)
tree.train()
Y_pred, error = tree.test(X_test, Y_test)
print("error in decision tree testing =", error)
for i in range(len(Y_test)):
    print(Y_pred[i], Y_test[i])

# vignesh's prediction: the same sample with only the gender feature flipped
Xmale = [['3', '9.6', 'M', '28.3', '7', '1']]
Xfemale = [['3', '9.6', 'F', '28.3', '7', '1']]
Y = [10]
yp, e = tree.test(Xmale, Y)
print("vignesh's grade (M):", yp[0])
yp, e = tree.test(Xfemale, Y)
print("vignesh's grade (F):", yp[0])
import preprocess, deal, logistic, submission
import numpy as np

if __name__ == "__main__":
    path = "../train"
    preprocess.readFile(path)
    dict_train = preprocess.getDict(path)
    # print(dict_train)
    # Similarity dict: the key is a pair of file names, the value is
    # [feature vector, whether the pair is a clone]
    similarityDict_train = deal.getSimilarity(dict_train)
    # print(similarityDict_train)
    # # Get the cosine similarity of all clone pairs
    # cloneSimilarityList = deal.getCloneSimilarity(similarityDict_train)
    # # print(cloneSimilarityList)
    # # Get the cosine similarity of all non-clone pairs
    # notCloneSimilarityList = deal.getNotCloneSimilarity(similarityDict_train)
    # # print(notCloneSimilarityList)
    # Build x and y for the training set
    x_train, y_train = deal.getArray(similarityDict_train)

    path = "../test"
    preprocess.readFile(path)
    dict_test = preprocess.getDict(path)
    # print(dict_test)
    # Test set similarity dict: the key is a pair of file names,
    # the value is [feature vector]