Example #1
def load_data(dirFolder, testRatio, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        splitPoint = int(testRatio * len(documents))
        trainDocs, testDocs = documents[splitPoint:], documents[:splitPoint]
        allDocs.append([trainDocs, testDocs])
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), trainDocs,
                      featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X_train, Y_train = [], []
    X_test, Y_test = [], []
    for i, dclass in enumerate(classes):
        for j in range(len(allDocs[i])):
            for doc in allDocs[i][j]:
                processedFile = preprocess.readFile(
                    os.path.join(dirFolder, dclass, doc))
                words = Counter(processedFile)
                features = [words.get(w, 0) for w in vocabulary]
                if j == 0:
                    X_train.append(features)
                    Y_train.append(i)
                else:
                    X_test.append(features)
                    Y_test.append(i)
    return (np.stack(X_train), Y_train), (np.stack(X_test), Y_test)
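A minimal usage sketch for the loader above; the directory layout (one subfolder of documents per class) and the concrete path are assumptions, not part of the original example:

# Hypothetical call: 'data/' holds one subfolder per class; 20% of each class is held out
(X_train, Y_train), (X_test, Y_test) = load_data('data/', testRatio=0.2)
print(X_train.shape, len(Y_train), X_test.shape, len(Y_test))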
Example #2
def evaluateMEMM(train, test):
    """
    a function that returns the accuracy of the MEMM model using a given
    validation set.
    """
    model = MEMM(train)
    lines = preprocess.readFile(test)
    lineNum = 1
    correct = 0
    total = 0

    for line in lines:
        if (lineNum % 3) == 1:
            sentence = line

        elif (lineNum % 3) == 2:
            tags = model.assignTags(sentence, line)
            addBio(tags)

        elif (lineNum % 3) == 0:
            answers = line.strip().split()
            #Following line just for testing
            #assert len(tags) == len(answers)
            for i in range(len(tags)):
                if tags[i] == answers[i]:
                    correct += 1
                total += 1
        lineNum += 1

    return correct, total
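Since the function returns raw counts rather than a ratio, a hedged sketch of deriving the accuracy from them (the file paths are placeholders):

correct, total = evaluateMEMM('train.txt', 'validation.txt')
accuracy = correct / float(total) if total else 0.0  # guard against an empty file
print('MEMM accuracy: %.2f%%' % (100.0 * accuracy))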
Example #3
def load_data_word2vec(dirFolder, featureKeepRatio=1.0):
    model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(
        './GoogleNews-vectors-negative300.bin', binary=True)
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i:classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents,
                      featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X = []
    def getIt(word):
        # Fall back to a zero vector for out-of-vocabulary words
        try:
            return model[word]
        except KeyError:
            return np.zeros((300,))

    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(
                os.path.join(dirFolder, dclass, doc))
            wordsInDoc = set(processedFile)
            # Keep vectors only for vocabulary words present in this document
            features = [getIt(w) if w in wordsInDoc
                        else np.zeros((300,)) for w in vocabulary]
            X.append(features)
    return np.stack(X)
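The loader stacks one 300-dimensional vector per vocabulary word for each document; a hedged sketch of flattening that tensor for a conventional classifier (the call and path are hypothetical):

X = load_data_word2vec('data/')     # shape: (numDocs, len(vocabulary), 300)
X_flat = X.reshape(X.shape[0], -1)  # one flat feature row per document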
Example #4
    def scoreBaseLine(self, filepath):
        """
		A function that grades the models accuracy using a given test
		file specified by [filepath].
		"""
        lines = preprocess.readFile(filepath)
        lineNum = 1
        correct = 0
        total = 0
        for line in lines:
            if (lineNum % 3) == 1:
                tokens = line.strip().split()
                tags = self.assignTags(tokens)
            elif (lineNum % 3) == 0:
                answers = line.strip().split()

                #Following section just for testing
                if len(tags) != len(answers):
                    print("Lengths don't match!\n")
                    print(tags)
                    print(answers)
                    break

                for i in range(len(tags)):
                    if tags[i] == answers[i]:
                        correct += 1
                    total += 1
            lineNum += 1

        return correct, total
Example #5
def MEMMClassify(train, test):
    """
    The function returns the tagging prediction by the MEMM system as
    a dictionary.
    """
    model = MEMM.MEMM(train)
    lines = preprocess.readFile(test)
    prediction = {'PER': [], 'LOC': [], 'ORG': [], 'MISC': []}
    lineNum = 1

    for line in lines:
        if (lineNum % 3) == 1:
            #Line with Tokens
            sentence = line

        elif (lineNum % 3) == 2:
            tags = model.assignTags(sentence, line)

        else:
            #Line with indexes
            indexes = line.strip().split()
            preClass = None
            firstIdx = None
            lastIdx = None
            NEcontinues = False

            for i in range(len(tags)):
                tag = tags[i]
                if tag == 'O':
                    if NEcontinues:
                        #Previous tag ends
                        prediction[preClass].append(firstIdx + '-' + lastIdx)
                    preClass = None
                    firstIdx = None
                    lastIdx = None
                    NEcontinues = False

                else:
                    if NEcontinues:
                        if tag != preClass:
                            #Previous tag ends, new Tag begins
                            prediction[preClass].append(firstIdx + '-' +
                                                        lastIdx)
                            preClass = tag
                            firstIdx = indexes[i]
                            lastIdx = indexes[i]
                        else:
                            #Previous tag continues
                            lastIdx = indexes[i]
                    else:
                        #New tag begins
                        preClass = tag
                        firstIdx = indexes[i]
                        lastIdx = indexes[i]
                        NEcontinues = True

            if NEcontinues:
                #Entity runs to the end of the sentence
                prediction[preClass].append(firstIdx + '-' + lastIdx)
        lineNum += 1
    return prediction
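The returned dictionary maps each entity class to 'first-last' token-index spans; a small sketch of inspecting it (the train/test paths are placeholders):

prediction = MEMMClassify('train.txt', 'test.txt')
for entityClass, spans in prediction.items():
    print(entityClass, len(spans), 'entities, e.g.', spans[:3])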
Example #6
    def construct_index(self, directory):
        documents = []
        documentNames = []
        # Walk the directory tree, keeping every file that yields content
        for root, dirs, files in os.walk(directory):
            for file in tqdm(files):
                dataRecovered = preprocess.readFile(os.path.join(root, file))
                if dataRecovered:
                    documentNames.append(os.path.join(root, file))
                    documents.append(dataRecovered)
        self.allDocuments = set(documentNames)
        self.construct(documents, documentNames)
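A hedged usage sketch for the method above; the engine instance and the corpus directory are assumptions, since only this one method of the enclosing class is shown:

engine.construct_index('corpus/')  # hypothetical instance of the enclosing class
print(len(engine.allDocuments), 'documents indexed')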
Example #7
    def preprocessData(self):
        for doc in self.docs:
            processedFile = preprocess.readFile(os.path.join(
                self.baseDir, doc))
            words = Counter(processedFile)
            self.vocabulary = list(set(self.vocabulary) | set(words.keys()))
            self.documents[doc] = words
            # Document frequency: in how many documents each word appears
            for uniqueWord in words.keys():
                self.wordFrequencies[uniqueWord] = 1 + \
                    self.wordFrequencies.get(uniqueWord, 0)
            self.N += 1
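preprocessData only accumulates document frequencies (wordFrequencies) and the corpus size (N); a minimal sketch of the IDF these counts support, assuming the standard log(N/df) form, since selectWords itself is not shown here:

    def idf(self, word):
        # Hypothetical helper: inverse document frequency from the gathered counts
        import math
        df = self.wordFrequencies.get(word, 0)
        return math.log(float(self.N) / df) if df else 0.0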
Example #8
def baselineClassify(train, test):
    """
    The function returns the tagging prediction by the baseline system as
    a dictionary.
    """
    model = baseline.Baseline(train)
    lines = preprocess.readFile(test)
    prediction = {'PER': [], 'LOC': [], 'ORG': [], 'MISC': []}
    lineNum = 1

    for line in lines:
        if (lineNum % 3) == 1:
            #Line with Tokens
            tokens = line.strip().split()
            tags = BLdebug(model.assignTags(tokens))
        elif (lineNum % 3) == 0:
            #Line with indexes
            indexes = line.strip().split()
            preClass = None
            firstIdx = None
            lastIdx = None
            NEcontinues = False

            for i in range(len(tags)):
                bioTag = tags[i][:1]

                if bioTag == 'B':
                    if NEcontinues:
                        #Previous tag ends
                        prediction[preClass].append(firstIdx + '-' + lastIdx)
                    preClass = tags[i][2:]
                    firstIdx = indexes[i]
                    lastIdx = indexes[i]
                    NEcontinues = True

                elif bioTag == 'I':
                    curClass = tags[i][2:]
                    assert NEcontinues and curClass == preClass
                    lastIdx = indexes[i]

                else:  # bioTag == 'O'
                    if NEcontinues:
                        prediction[preClass].append(firstIdx + '-' + lastIdx)
                    preClass = None
                    firstIdx = None
                    lastIdx = None
                    NEcontinues = False

            if NEcontinues:
                #Entity runs to the end of the sentence
                prediction[preClass].append(firstIdx + '-' + lastIdx)

        lineNum += 1

    return prediction
Example #9
def buildDB():

    # clean the data file at private/filename
    noisy_file = os.path.join(request.folder, 'private', '027__BTECH__6TH SEM.txt')
    clean_file = os.path.join(request.folder, 'private', 'cleanFile.txt')
    preprocess.cleanData(noisy_file, clean_file)


    # extract college information
    lines = preprocess.readFile(clean_file)
    colleges = extract.extract_data(lines)

    # insert into table colleges
    for name in colleges.keys():
        db.colleges.insert(name=name)

    # insert into table students
    for name in colleges.keys():
        for student in colleges[name].students:

            # six 4-credit subjects followed by five 1-credit subjects
            credits = (4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1)
            total_credits = sum(credits)

            # credit-weighted average of the eleven subject marks
            weighted = sum(c * int(m) for c, m in zip(credits, student.marks))
            percentage = (weighted * 1.0) / total_credits

            if percentage < 50.0:
                continue

            collegeid = db(db.colleges.name == name).select()[0]['id']
            db.students.insert(colleges_id=collegeid, rollNo=student.rollNo[1:], name=student.name,
                               subj1=int(student.marks[0]), subj2=int(student.marks[1]), subj3=int(student.marks[2]),
                               subj4=int(student.marks[3]), subj5=int(student.marks[4]), subj6=int(student.marks[5]),
                               subj7=int(student.marks[6]), subj8=int(student.marks[7]), subj9=int(student.marks[8]),
                               subj10=int(student.marks[9]), subj11=int(student.marks[10]), percentage=percentage)

    return 'db built'
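A quick worked check of the weighting above, outside the web2py controller: six 4-credit plus five 1-credit subjects give 29 total credits, so uniform marks pass through unchanged and the 50.0 cutoff behaves like a pass mark:

credits = (4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1)
marks = [60] * 11
assert sum(credits) == 29
assert sum(c * m for c, m in zip(credits, marks)) * 1.0 / sum(credits) == 60.0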
Example #10
def load_data(dirFolder, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i:classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents,
                      featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X = []
    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(
                os.path.join(dirFolder, dclass, doc))
            words = Counter(processedFile)
            features = [words.get(w, 0) for w in vocabulary]
            X.append(features)
    return np.stack(X)
Example #11
import preprocess as pp
import decisionTree as dt
import numpy as np
import random

random.seed()
file = 'data.csv'

rows = pp.readFile(file=file)

X_train, Y_train, X_test, Y_test = pp.splitData(rows)

tree = dt.DecisionTree(X_train, Y_train, 9)
tree.train()
Y_pred, error = tree.test(X_test, Y_test)

print("error in decision tree testing =", error)
for i in range(len(Y_test)):
    print(Y_pred[i], Y_test[i])

print("vignesh's prediction")

Xmale = [['3', '9.6', 'M', '28.3', '7', '1']]
Xfemale = [['3', '9.6', 'F', '28.3', '7', '1']]
Y = [10]

yp, e = tree.test(Xmale, Y)
print("vignesh's grade", yp[0])
yp, e = tree.test(Xfemale, Y)
print("chhakka vignesh's grade", yp[0])
Example #12
import preprocess, deal, logistic, submission
import numpy as np

if __name__ == "__main__":
    path = "../train"
    preprocess.readFile(path)
    dict_train = preprocess.getDict(path)
    # print(dict)

    # similarity dict: key is a pair of file names, value is [feature vector, is-clone flag]
    similarityDict_train = deal.getSimilarity(dict_train)
    # print(similarityDict)

    # # get the cosine similarity of every clone pair
    # cloneSimilarityList=deal.getCloneSimilarity(similarityDict)
    # # print(cloneSimilarityList)
    # # get the cosine similarity of every non-clone pair
    # notCloneSimilarityList=deal.getNotCloneSimilarity(similarityDict)
    # # print(notCloneSimilarityList)

    # get x and y for the training set
    x_train, y_train = deal.getArray(similarityDict_train)

    path = "../test"
    preprocess.readFile(path)
    dict_test = preprocess.getDict(path)
    # print(dict)

    # test set

    # similarity dict: key is a pair of file names, value is [feature vector]