# -*- coding: utf-8 -*-

import __init__

import argparse

import cpLib.conceptDB as db
import cpLib.conceptExtraction as cpe

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='shuffle a wordPair file')
    parser.add_argument("vocFilePath", help='voc file')
    parser.add_argument("inputConceptPairPath", help='concept pair file')
    args = parser.parse_args()

    vocFilePath = args.vocFilePath
    inputConceptPairPath = args.inputConceptPairPath
    conceptPairStrList = [
        l.split('\t')
        for l in open(args.inputConceptPairPath).read().splitlines()
    ]
    strict = args.compose

    d = db.DB('../data/voc/npy/wikiEn-skipgram.npy', False)
    conceptPairList = cpe.buildConceptPairList(d, conceptPairStrList, True)

    shuffledConceptPairList = cpe.shuffledConceptPairList(conceptPairList)

    for conceptPair in shuffledConceptPairList:
        print '\t'.join([str(s) for s in conceptPair])
예제 #2
0
 def setUp(self):
     # Shared fixture: load the miniature googleNews vocabulary
     # (numpy format) before each test of this class.
     self.d = db.DB('../data/voc/npy/googleNews_mini.npy')
def printPredictedConceptClass(d, clf, cpStrList, strict):
    cpList = cpe.buildConceptList(d, cpStrList, strict)

    yPred = clf.predict(cpList)
    yProba = clf.predict_proba(cpList)
    for x, y, yp in zip(cpStrList, yPred, yProba):
        print '\t'.join([str(i) for i in [x, y, yp]])


if __name__ == "__main__":
    # Command-line front-end: classify every concept of a file with a
    # previously trained (dill-pickled) classifier.
    parser = argparse.ArgumentParser(
        description='Predict concept class according to a trained classifier')
    parser.add_argument("vocFilePath", help='voc file')
    parser.add_argument("trainedClfPath", help='trained classifier file')
    parser.add_argument("inputConceptPath", help='concept file')
    parser.add_argument("--compose",
                        help='try to compose concept',
                        action='store_false')
    args = parser.parse_args()

    # one concept string per line of the input file
    conceptStrList = open(args.inputConceptPath).read().splitlines()

    printPredictedConceptClass(db.DB(args.vocFilePath),
                               dill.load(open(args.trainedClfPath)),
                               conceptStrList, args.compose)
예제 #4
0
import cpLib.conceptDB as db

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=
        'Convert a word2vec vocabulary/vector file from text to numpy format')
    parser.add_argument('txtFile', help='voc and vectors file in text format')
    parser.add_argument('npFile', help='path to store the voc in numpy format')
    args = parser.parse_args()

    inputTxtFilePath = args.txtFile
    npFilePath = args.npFile
    dictFilePath = npFilePath + 'dict'

    if inputTxtFilePath.endswith('.txt') and npFilePath.endswith('.npy'):
        d = db.DB(inputTxtFilePath)

        vocIndexDict = {}

        with open(inputTxtFilePath, 'r') as inputTxtFile:
            print
            print 'vector dim: ' + inputTxtFile.readline()
            for i, line in enumerate(inputTxtFile):
                vocIndexDict[line.split()[0]] = i

        np.save(npFilePath, d.vect)

        with open(dictFilePath, 'w') as dictFile:
            json.dump(vocIndexDict, dictFile)
    else:
        print 'input file error'
    return list(conceptStrSet)


if __name__ == "__main__":
    # Score every candidate concept as a pair partner for a source
    # concept, using a trained pair classifier.
    parser = argparse.ArgumentParser(
        description='Find best pair match given a trained pair classifier')
    parser.add_argument("vocFilePath", help='voc file')
    parser.add_argument("trainedClfPath", help='trained classifier file')
    parser.add_argument("sourceConcept", help='concept source')
    parser.add_argument("targetClass",
                        help='class to search the best match for')
    parser.add_argument("--domain", help='restrict target domain')
    args = parser.parse_args()

    d = db.DB(args.vocFilePath)
    # NOTE(review): 'dill' is used but its import is not visible in this
    # chunk -- confirm it is imported at the top of the full file.
    clf = dill.load(open(args.trainedClfPath))
    # column index of the requested class in the classifier's proba output
    classIndex = clf.classes_.tolist().index(args.targetClass)

    conceptSource = d.get(args.sourceConcept)

    # candidate targets: every other vocabulary word, or the subset from
    # the --domain file (extractSubDomain is defined elsewhere).
    otherConceptStrList = [c for c in d.voc.keys() if c != conceptSource.word
                           ] if args.domain is None else extractSubDomain(
                               open(args.domain), conceptSource)
    # (source, targetClass, candidate) triples fed to the feature builder
    conceptPairList = zip([conceptSource.word] * len(otherConceptStrList),
                          [args.targetClass] * len(otherConceptStrList),
                          otherConceptStrList)

    X = cpe.buildConceptPairList(d, conceptPairList, True)
    # NOTE(review): yProba is computed but the code that consumes it
    # appears to be truncated in this chunk.
    yProba = clf.predict_proba(X)
        l.split('\t') for l in open(inputConceptPath[0]).read().splitlines()
    ], inputConceptPath[1]


if __name__ == "__main__":
    # Evaluate a trained pair classifier against one or more annotated
    # concept-pair files (each file path followed by its class name).
    parser = argparse.ArgumentParser(
        description=
        'Predict concept pair class according to a trained classifier')
    parser.add_argument("vocFilePath", help='voc file')
    parser.add_argument("trainedClfPath", help='trained classifier file')
    parser.add_argument("inputConceptPairPathAndClassList",
                        nargs='+',
                        help='concept pair file list followed by class name')
    parser.add_argument("--compose",
                        help='try to compose concept',
                        action='store_false')
    args = parser.parse_args()

    # consume the positional list two by two: (filePath, className)
    annotedConceptPairStrList = [
        extractAnnotedConceptPairStr(pathAndClass)
        for pathAndClass in pairwise(args.inputConceptPairPathAndClassList)
    ]

    detailConceptPairClfError(db.DB(args.vocFilePath),
                              dill.load(open(args.trainedClfPath)),
                              annotedConceptPairStrList, args.compose)
    return pc.carthToPolar(vectorLine)[1:]


if __name__ == '__main__':
    # Convert a cartesian word2vec vector matrix to polar coordinates
    # (or angular, i.e. the norm component dropped) and derive the
    # output file paths from the input names.
    parser = argparse.ArgumentParser(
        description=
        'Convert a word2vec vocabulary/vector from carthesian to polar')
    parser.add_argument('inputFilePath', help='input database')
    parser.add_argument('outputFolderPath',
                        help='folder path to store the output database')
    parser.add_argument("--angular",
                        help='drop norm of vectors (angular)',
                        action='store_true')
    args = parser.parse_args()

    d = db.DB(args.inputFilePath)
    newVectFile, transformName = [], ''

    # lineToPolar / lineToAngular are row-wise transforms defined
    # elsewhere in the project; apply one of them to every vector.
    if args.angular:
        newVectFile = np.apply_along_axis(lineToAngular, 1, d.vect)
        transformName = '_angular'
    else:
        newVectFile = np.apply_along_axis(lineToPolar, 1, d.vect)
        transformName = '_polar'

    # output names mirror the input names plus the transform suffix
    # ('path' here is presumably path.py's class -- confirm).
    vectInPath, dictInPath = path(
        args.inputFilePath), path(args.inputFilePath + 'dict')
    outParentPath = path(args.outputFolderPath)
    vectOutPath = path(outParentPath / vectInPath.namebase + transformName +
                       vectInPath.ext)
    # NOTE(review): the remainder of this script appears truncated in
    # this chunk (dictOutPath assignment and the actual save are cut).
    dictOutPath = path(outParentPath / dictInPath.namebase + transformName +
    sku.detailClassificationError(clf, cpList, cpList, yTrue, True)


def extractAnnotedConceptStr(inputConceptPath):
    """Return (conceptStrList, className) for one annotated concept file.

    inputConceptPath -- a (filePath, className) pair: the file holds one
                        concept string per line; className is the label
                        shared by all of them.
    """
    filePath, className = inputConceptPath
    # use a context manager so the file handle is closed (the previous
    # version leaked it)
    with open(filePath) as conceptFile:
        return conceptFile.read().splitlines(), className


if __name__ == "__main__":
    # Evaluate a trained concept classifier against one or more
    # annotated concept files (each file path followed by a class name).
    parser = argparse.ArgumentParser(
        description='detail concept classifier error')
    parser.add_argument("vocFilePath", help='voc file')
    parser.add_argument("trainedClfPath", help='trained classifier file')
    parser.add_argument("inputConceptPathandClassList",
                        nargs='+',
                        help='concept file list followed by class name')
    parser.add_argument("--compose",
                        help='try to compose concept',
                        action='store_false')
    args = parser.parse_args()

    # consume the positional list two by two: (filePath, className)
    annotedConceptStrList = [
        extractAnnotedConceptStr(pathAndClass)
        for pathAndClass in pairwise(args.inputConceptPathandClassList)
    ]

    detailConceptClfError(db.DB(args.vocFilePath),
                          dill.load(open(args.trainedClfPath)),
                          annotedConceptStrList, args.compose)
예제 #9
0
 def test_buildFromNpyFile(self):
     """A DB built from a .npy file exposes its raw matrix through get()."""
     d = db.DB('../data/voc/npy/googleNews_mini.npy')
     # assertEquals is a deprecated alias of assertEqual
     self.assertEqual(d.get('</s>').vect[0], d.vect[0][0])
예제 #10
0
 def test_buildFromTxtFile(self):
     """A DB built from a .txt file exposes its raw matrix through get()."""
     d = db.DB('../data/voc/txt/googleNews_mini.txt')
     # assertEquals is a deprecated alias of assertEqual
     self.assertEqual(d.get('</s>').vect[0], d.vect[0][0])