Exemplo n.º 1
0
def main(trainMode, binarized, testDir, resultFile,
         posTrainDir=r'R:\masters\fall2013\COMP579A\project\aclImdb\train\pos',
         negTrainDir=r'R:\masters\fall2013\COMP579A\project\aclImdb\train\neg'):
    """Optionally build the corpus, then classify every entry of ``testDir``.

    Args:
        trainMode: When truthy, the corpus builder is run over
            ``posTrainDir`` and then ``negTrainDir`` before classifying.
        binarized: When truthy, use the binarized corpus file, builder and
            test-text reader; otherwise use the plain-count variants.
        testDir: Directory whose entries (as returned by ``os.listdir``) are
            classified one by one.
        resultFile: Path handed to ``FileIO.wrtieToFile`` (mode ``'a'``) for
            one result line per classified entry.
        posTrainDir: First training directory. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.
        negTrainDir: Second training directory. Same default behaviour.
    """
    # Raw strings are required here: a plain '\b' would be a backspace.
    corpusName = r'corpus\binCorpus.txt' if binarized else r'corpus\corpus.txt'

    if trainMode:
        # NOTE(review): the builders pass mode 'a' when writing the corpus
        # files, so training more than once presumably duplicates entries --
        # confirm whether the files should be cleared first.
        build = buildBinarizedCorpus if binarized else buildCorpus
        for trainDir in (posTrainDir, negTrainDir):
            build(trainDir, corpusName)

    dictionary = loadCorpus(corpusName)
    print('test for dir', testDir)

    # The reader only depends on the mode, so pick it once.
    readTestText = getBinTestText if binarized else getTestText
    for dir_entry in os.listdir(testDir):
        path = os.path.join(testDir, dir_entry)
        result = test(readTestText(path), dictionary)
        FileIO.wrtieToFile(
            resultFile, 'a',
            "The file " + path + " is classified as " + result + "\n")
    print("Done....")
Exemplo n.º 2
0
def buildCorpus(dirPath, corpusName):
    """Append per-token occurrence counts for one labelled directory.

    Every entry of ``dirPath`` is read with ``FileIO.readFile`` and passed
    through ``Tokenizer.tokenizer``; the result is split on newlines and each
    piece is counted as one token occurrence.  Afterwards one
    ``token<TAB>count<TAB>label`` line per distinct token is written to
    ``corpusName``, where ``label`` is whatever ``fetchLabel(dirPath)``
    returns.  Before counting, a ``label<TAB>fileCount`` line is written to
    the class-count file under ``corpus``.

    Args:
        dirPath: Directory containing the training files for one class.
        corpusName: Path of the corpus file the token lines are written to
            (passed to ``FileIO.wrtieToFile`` with mode ``'a'``).
    """
    print('creating corpus!!')
    tagClass = fetchLabel(dirPath)
    classCnt = FileIO.countFiles(dirPath)

    # Raw string: "\c" is not a valid escape sequence, and a non-raw literal
    # containing it triggers a SyntaxWarning on recent Python versions.  The
    # runtime value is unchanged.
    FileIO.wrtieToFile(r"corpus\classCount.txt", 'a',
                       tagClass + '\t' + str(classCnt) + '\n')

    # Only one label exists per call, so a flat token -> count map suffices.
    tokenCounts = {}
    for dir_entry in os.listdir(dirPath):
        text = FileIO.readFile(os.path.join(dirPath, dir_entry))
        text = Tokenizer.tokenizer(text)
        # NOTE(review): if the tokenizer output contains blank lines or a
        # trailing newline, the empty string is counted as a token too --
        # confirm whether that is intended.
        for token in text.split('\n'):
            tokenCounts[token] = tokenCounts.get(token, 0) + 1

    for token, count in tokenCounts.items():
        FileIO.wrtieToFile(corpusName, 'a',
                           token + '\t' + str(count) + '\t' + tagClass + '\n')
    print('Corpus creation : Done..')
Exemplo n.º 3
0
def buildBinarizedCorpus(dirPath, corpusName):
    """Append per-token document counts for one labelled directory.

    Unlike a plain occurrence count, each token is counted at most once per
    file, so the number written is how many files in ``dirPath`` contain the
    token.  Every entry of ``dirPath`` is read with ``FileIO.readFile`` and
    passed through ``Tokenizer.tokenizer``; the result is split on newlines
    to obtain tokens.  One ``token<TAB>count<TAB>label`` line per distinct
    token is written to ``corpusName``, where ``label`` is whatever
    ``fetchLabel(dirPath)`` returns.  Before counting, a
    ``label<TAB>fileCount`` line is written to the class-count file under
    ``corpus``.

    Args:
        dirPath: Directory containing the training files for one class.
        corpusName: Path of the corpus file the token lines are written to
            (passed to ``FileIO.wrtieToFile`` with mode ``'a'``).
    """
    print('creating binarized corpus!!')
    tagClass = fetchLabel(dirPath)
    classCnt = FileIO.countFiles(dirPath)

    # Raw string: "\c" is not a valid escape sequence, and a non-raw literal
    # containing it triggers a SyntaxWarning on recent Python versions.  The
    # runtime value is unchanged.
    FileIO.wrtieToFile(r"corpus\classCount.txt", 'a',
                       tagClass + '\t' + str(classCnt) + '\n')

    # Only one label exists per call, so a flat token -> file-count map
    # suffices.
    docCounts = {}
    for dir_entry in os.listdir(dirPath):
        text = FileIO.readFile(os.path.join(dirPath, dir_entry))
        text = Tokenizer.tokenizer(text)
        seenInFile = set()
        # Walk tokens in order (rather than iterating a set) so tokens enter
        # docCounts in first-seen order, which fixes the output line order.
        for token in text.split('\n'):
            if token in seenInFile:
                continue
            seenInFile.add(token)
            docCounts[token] = docCounts.get(token, 0) + 1

    for token, count in docCounts.items():
        FileIO.wrtieToFile(corpusName, 'a',
                           token + '\t' + str(count) + '\t' + tagClass + '\n')
    print('binarized corpus creation done!!')