Пример #1
0
def main(inputFiles, outputPath, n):
    """Build n-gram lists for every ``.json`` file found in ``inputFiles``.

    Each input file is loaded as a JSON object.  Its keys are treated as
    sub-file names and each value is iterated as a sequence of text lines.
    For every input file a JSON file with the same name is written to
    ``outputPath``; it maps each sub-file name to the list of n-gram strings
    collected from that sub-file's lines.

    Args:
        inputFiles: directory that is scanned for names containing ".json".
        outputPath: directory receiving the results (created if missing).
        n: n-gram size.  ``n == 2`` uses a built-in whitespace tokenizer,
            any other value goes through ``ngrams``/``word_tokenize``.
    """
    rule = ".json"
    files = Queue.Queue(0)

    # Create the target directory when it does not exist yet.
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    def workerFunction(jobObj, tid, args):
        """Convert the input file named ``jobObj``; ``tid`` is only logged.

        ``args`` is accepted because the worker is registered with
        ``Manager`` below, but it is not used here.
        """
        # Context managers guarantee the handles are closed even on errors.
        with open(os.path.join(inputFiles, jobObj), "r") as src:
            content = json.load(src)
        print("worker #%02d read file %s" % (tid, jobObj))
        dealL = 0
        rawGram = {}
        for subFilename in content:
            ngl = []
            for line in content[subFilename]:
                dealL += 1
                if n == 2:
                    # FIXME: ad-hoc bigram tokenizer - lower-case, blank out
                    # a fixed set of punctuation, split on whitespace.
                    text = line.lower()
                    for ch in "[]!?,)(":
                        text = text.replace(ch, " ")
                    tokens = text.split()
                    # Pair each token with its successor.  A two-token line
                    # yields exactly one bigram (the former ``> 2`` guard
                    # dropped it); shorter lines yield nothing.
                    for left, right in zip(tokens, tokens[1:]):
                        ngl.append("%s\t%s" % (left, right))
                else:
                    # n-grams via nltk.  Iterating directly works whether
                    # ``ngrams`` returns a list or a generator, and an empty
                    # result simply adds nothing.  The previous
                    # ``len(ngs) > 1: continue`` guard was inverted and
                    # skipped every line that actually produced n-grams.
                    for ng in ngrams(word_tokenize(line.lower()), n):
                        ngl.append(toStringForm(ng))

                if dealL % 10000 == 0:
                    print("worker #%02d deal with %d lines" % (tid, dealL))

            rawGram[subFilename] = ngl

        with open(os.path.join(outputPath, jobObj), "w") as dst:
            json.dump(rawGram, dst)

    # Queue the jobs in sorted file-name order.
    fileNameList = sorted(
        name for name in os.listdir(inputFiles) if rule in name)
    for filename in fileNameList:
        files.put(filename)

    manager = Manager(workerNumber=15)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()
Пример #2
0
def main(inputFiles,outputPath,n):
    """Extract n-grams from each ``.json`` file in ``inputFiles``.

    An input file is a JSON object: iterating it gives sub-file names, and
    the value stored under each name is iterated as lines of text.  The
    result for one input file is written to ``outputPath`` under the same
    file name as a JSON object ``{sub-file name: [n-gram string, ...]}``.

    Args:
        inputFiles: directory scanned for file names containing ".json".
        outputPath: output directory; created when it is missing.
        n: size of the n-grams.  Bigrams (``n == 2``) are produced by a
            simple built-in tokenizer; other sizes use nltk helpers.
    """
    rule = ".json"
    punctuation = "[]!?,)("
    files = Queue.Queue(0)

    # if target dir not exist, create it.
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    def bigramsOf(line):
        """Return the tab-joined bigrams of ``line`` (FIXME: naive tokenizer)."""
        cleaned = line.lower()
        for mark in punctuation:
            cleaned = cleaned.replace(mark, " ")
        words = cleaned.split()
        # zip() stops at the shorter sequence, so lines with fewer than two
        # words give an empty list and a two-word line gives one bigram
        # (the old ``len(ngs) > 2`` test lost that bigram).
        return ["%s\t%s" % pair for pair in zip(words, words[1:])]

    def workerFunction(jobObj,tid,args):
        """Process one queued file name; ``args`` is unused."""
        # ``with`` closes the input file as soon as it has been parsed.
        with open(os.path.join(inputFiles, jobObj), "r") as fin:
            content = json.load(fin)
        print("worker #%02d read file %s" % (tid, jobObj))
        dealL = 0
        rawGram = {}
        for subFilename in content:
            ngl = []
            for line in content[subFilename]:
                dealL += 1
                if n == 2:
                    ngl.extend(bigramsOf(line))
                else:
                    # do the n-gram using nltk.  The result is consumed by
                    # iteration only: the earlier ``if len(ngs) > 1:
                    # continue`` was inverted (it threw away every line with
                    # more than one n-gram) and len() breaks on generators.
                    ngl.extend(
                        toStringForm(ng)
                        for ng in ngrams(word_tokenize(line.lower()), n))

                if dealL % 10000 == 0:
                    print("worker #%02d deal with %d lines" % (tid, dealL))

            rawGram[subFilename] = ngl

        # ``with`` flushes and closes the output before the worker returns.
        with open(os.path.join(outputPath, jobObj), "w") as fout:
            json.dump(rawGram, fout)

    fileNameList = []
    for filename in os.listdir(inputFiles):
        if rule in filename:
            fileNameList.append(filename)

    # Jobs are queued in sorted order.
    fileNameList.sort()
    for filename in fileNameList:
        files.put(filename)

    manager = Manager(workerNumber=15)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()
Пример #3
0
def main(modelPath,inputPath,outputPath):
    """Count, per sub-file, how many n-grams hit each model.

    Every file in ``inputPath`` whose name contains ".json" is loaded as a
    JSON object mapping a sub-file name to an iterable of n-grams.  For each
    sub-file a dict ``{model name: hit count}`` is built, where a hit is an
    n-gram found in ``models[model]``; models without any hit are left out.
    The per-file results are written as JSON to ``outputPath`` under the
    same file name.

    Args:
        modelPath: passed to ``readModel`` to obtain ``(ngram, models)``.
        inputPath: directory holding the n-gram JSON files.
        outputPath: directory for the result files (created if missing).
    """
    ngram,models = readModel(modelPath)
    files = Queue.Queue(0)
    print("Read %d %d-gram models" % (len(models), ngram))

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    def workerFunction(jobObj,tid,args):
        """Scan one input file named ``jobObj``; ``args`` is unused."""
        started = datetime.now()
        print("worker #%02d start read file %s" % (tid, jobObj))
        # ``with`` closes the file even if the JSON is malformed.
        with open(os.path.join(inputPath, jobObj), "r") as f:
            content = json.load(f)
        diff = datetime.now() - started
        # %06d keeps the microseconds zero-padded so that e.g. 1 s + 5000 us
        # prints as "1.005000" instead of the misleading "1.5000".
        print("worker #%02d read file %s, use [%d.%06d s]" % (
            tid, jobObj, diff.seconds, diff.microseconds))

        results = {}
        count = 0
        for subFilename in content:
            ngs = content[subFilename]
            count += 1
            result = {}
            for model in models:
                mn = models[model]
                hits = sum(1 for ng in ngs if ng in mn)
                # Only models with at least one hit get an entry.
                if hits:
                    result[model] = hits

            if count % 100 == 0:
                print("worker #%02d scan %d files" % (tid, count))

            results[subFilename] = result

        started = datetime.now()
        print("worker #%02d start write file %s" % (tid, jobObj))
        with open(os.path.join(outputPath, jobObj), "w") as f:
            json.dump(results, f)
        diff = datetime.now() - started
        print("worker #%02d  write file %s, use [%d.%06d s]" % (
            tid, jobObj, diff.seconds, diff.microseconds))


    for filename in os.listdir(inputPath):
        if ".json" in filename:
            files.put(filename)

    manager = Manager(25)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()
Пример #4
0
def main(modelPath, inputFiles, outputPath):
    """Extract bigrams from raw text files and count model hits per sub-file.

    Every file in ``inputFiles`` whose name contains ".json" is loaded as a
    JSON object mapping a sub-file name to an iterable of text lines.  The
    lines are turned into tab-joined bigrams, and for each sub-file a dict
    ``{model name: hit count}`` is stored, where a hit is a bigram found in
    ``models[model]``.  Models without hits are omitted.  Results are written
    as JSON to ``outputPath`` under the input file's name.

    Args:
        modelPath: passed to ``readModel`` to obtain ``(ngram, models)``.
        inputFiles: directory holding the raw-text JSON files.
        outputPath: directory for the result files (created if missing).
    """
    ngram, models = readModel(modelPath)
    rule = ".json"
    files = Queue.Queue(0)

    start_time = datetime.now()

    # Create the target directory when it does not exist yet.
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    def workerFunction(jobObj, tid, args):
        """Process the input file named ``jobObj``; ``args`` is unused."""
        with open(os.path.join(inputFiles, jobObj), "r") as src:
            content = json.load(src)
        print("worker #%02d read file %s" % (tid, jobObj))
        dealL = 0
        count = 0
        results = {}
        for subFilename in content:
            count += 1
            ngl = []
            for line in content[subFilename]:
                dealL += 1
                # FIXME - only bigrams are implemented
                text = line.lower()
                for ch in "[]!?,)(":
                    text = text.replace(ch, " ")
                tokens = text.split()
                for left, right in zip(tokens, tokens[1:]):
                    ngl.append("%s\t%s" % (left, right))

                if dealL % 10000 == 0:
                    print("worker #%02d deal with %d lines" % (tid, dealL))

            # Count hits once per sub-file over all of its bigrams.
            # Previously ``result`` was rebuilt for every line (so only the
            # last line survived, and an empty sub-file reused a stale or
            # undefined value) and the lookup used the single tokens rather
            # than the bigrams collected in ``ngl``.
            # NOTE(review): assumes model entries are tab-joined bigrams,
            # as produced above - confirm against readModel.
            result = {}
            for model in models:
                mn = models[model]
                hits = sum(1 for ng in ngl if ng in mn)
                if hits:
                    result[model] = hits

            if count % 100 == 0:
                print("worker #%02d scan %d files" % (tid, count))
            results[subFilename] = result

        # ``with`` makes sure the output is flushed and closed.
        with open(os.path.join(outputPath, jobObj), "w") as dst:
            json.dump(results, dst)

    # Jobs are queued in directory-listing order (no sorting).
    for filename in os.listdir(inputFiles):
        if rule in filename:
            files.put(filename)

    manager = Manager(workerNumber=25)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()

    diff = datetime.now() - start_time
    # %06d zero-pads the microseconds so the value reads as real seconds.
    print("All job done, use %d.%06d secs" % (diff.seconds, diff.microseconds))
Пример #5
0
def main(modelPath, inputPath, outputFileName):
    """Evaluate model predictions against stored answers and dump the totals.

    For every model name returned by ``readModel`` a confusion-matrix counter
    (``tp``/``tn``/``fp``/``fn``) is kept.  Each ".json" file in
    ``inputPath`` maps "<revid>.txt" to a dict of per-model hit counts; the
    matching answer documents are fetched from the ``projizz`` MongoDB
    database and compared relationship by relationship.  The summed counters
    are written as JSON to ``outputFileName``.

    Args:
        modelPath: passed to ``readModel`` to obtain ``(ngram, models)``.
        inputPath: directory holding the prediction JSON files.
        outputFileName: path of the JSON file receiving the totals.
    """
    ngram, models = readModel(modelPath)
    resultQueue = Queue.Queue(0)
    counterKeys = ("tp", "tn", "fp", "fn")
    # Confusion-matrix counters per model:
    #                     actual True   actual False
    #   model says yes        tp            fp
    #   model says no         fn            tn
    for model in models:
        models[model] = dict((key, 0) for key in counterKeys)
    connect = pymongo.Connection()
    db = connect.projizz
    ansCol = db.result.data.instance

    def workerFunction(jobObj, tid, args):
        """Score one prediction file and queue its partial counters."""
        with open(os.path.join(inputPath, jobObj), "r") as src:
            resultJson = json.load(src)
        print("worker #%02d read file %s" % (tid, jobObj))

        # Keys look like "<revid>.txt"; drop the 4-character suffix.
        queries = [name[:-4] for name in resultJson]
        itr = ansCol.find({"revid": {"$in": queries}})
        print("worker #%02d query=%d, result=%d" % (tid, len(queries),
                                                    itr.count()))

        # Private copy of the all-zero counters for this file.
        partAns = copy.deepcopy(models)

        count = 0
        for ans in itr:
            count += 1
            features = ans["features"]
            resultInstance = resultJson["%s.txt" % (ans["revid"])]

            for relationship in partAns:
                predicted = (relationship in resultInstance
                             and resultInstance[relationship] > 0)
                actual = relationship in features

                if actual:
                    key = "tp" if predicted else "fn"
                else:
                    key = "fp" if predicted else "tn"
                partAns[relationship][key] += 1

            if count % 100 == 0:
                print("worker #%02d done %d." % (tid, count))

        resultQueue.put(partAns)

    files = Queue.Queue(0)
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            files.put(filename)

    manager = Manager(8)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()

    print("Result Queue size %d" % resultQueue.qsize())

    # Drain the queue without blocking.  Only "queue is empty" ends the
    # loop; any other error (e.g. an unexpected model name) now propagates
    # instead of being swallowed by a bare ``except`` and silently producing
    # partial totals.
    while True:
        try:
            r = resultQueue.get_nowait()
        except Queue.Empty:
            break
        for m in r:
            for key in counterKeys:
                models[m][key] += r[m][key]

    print("start write out to %s" % (outputFileName))
    # ``with`` guarantees the file is flushed before "done" is reported.
    with open(outputFileName, "w") as dst:
        json.dump(models, dst)
    print("done")
Пример #6
0
def main(modelPath,inputFiles,outputPath):
    """Tokenize raw text into bigrams and tally model hits for each sub-file.

    Input: every file under ``inputFiles`` whose name contains ".json",
    loaded as a JSON object ``{sub-file name: iterable of text lines}``.
    Output: one JSON file per input file (same name, in ``outputPath``)
    mapping each sub-file name to ``{model name: number of its bigrams found
    in that model}``; models with zero hits are not listed.

    Args:
        modelPath: handed to ``readModel``, which returns ``(ngram, models)``.
        inputFiles: directory with the raw-text JSON files.
        outputPath: result directory; created when missing.
    """
    ngram,models = readModel(modelPath)
    rule = ".json"
    punctuation = "[]!?,)("
    files = Queue.Queue(0)

    start_time = datetime.now()

    # if target dir not exist, create it.
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    def bigramsOf(line):
        """Return the tab-joined bigrams of one line (FIXME: bigram only)."""
        cleaned = line.lower()
        for mark in punctuation:
            cleaned = cleaned.replace(mark, " ")
        words = cleaned.split()
        return ["%s\t%s" % pair for pair in zip(words, words[1:])]

    def countHits(grams):
        """Map each model name to its number of hits among ``grams``."""
        # NOTE(review): assumes model entries use the same tab-joined form
        # as ``bigramsOf`` - confirm against readModel.
        tally = {}
        for model in models:
            mn = models[model]
            for ng in grams:
                if ng in mn:
                    tally[model] = tally.get(model, 0) + 1
        return tally

    def workerFunction(jobObj,tid,args):
        """Handle one queued file name; ``args`` is unused."""
        with open(os.path.join(inputFiles, jobObj), "r") as fin:
            content = json.load(fin)
        print("worker #%02d read file %s" % (tid, jobObj))
        dealL = 0
        count = 0
        results = {}
        for subFilename in content:
            count += 1
            ngl = []
            for line in content[subFilename]:
                dealL += 1
                ngl.extend(bigramsOf(line))

                if dealL % 10000 == 0:
                    print("worker #%02d deal with %d lines" % (tid, dealL))

            if count % 100 == 0:
                print("worker #%02d scan %d files" % (tid, count))
            # Hits are counted once per sub-file over its bigrams.  The old
            # code reset the tally on every line (keeping only the last
            # line, and leaving it stale/undefined for an empty sub-file)
            # and looked up single tokens instead of the bigrams.
            results[subFilename] = countHits(ngl)

        # Close the output explicitly so it is flushed before returning.
        with open(os.path.join(outputPath, jobObj), "w") as fout:
            json.dump(results, fout)

    # Files are queued unsorted, in os.listdir() order.
    for filename in os.listdir(inputFiles):
        if rule in filename:
            files.put(filename)

    manager = Manager(workerNumber=25)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()

    diff = datetime.now() - start_time
    # Zero-pad microseconds; "%d.%d" rendered 1 s + 5000 us as "1.5000".
    print("All job done, use %d.%06d secs" % (diff.seconds, diff.microseconds))
Пример #7
0
def main(modelPath,inputPath,outputFileName):
    """Compare per-document model hits with stored answers; write the totals.

    A ``tp``/``tn``/``fp``/``fn`` counter is kept for every model name from
    ``readModel``.  Each ".json" file in ``inputPath`` maps "<revid>.txt" to
    a dict of hit counts per model.  The answer documents with those revids
    are read from the ``projizz`` MongoDB database, and for every
    relationship the prediction (hit count > 0) is compared with the answer
    (relationship listed in the document's ``features``).  The accumulated
    counters are dumped as JSON to ``outputFileName``.

    Args:
        modelPath: handed to ``readModel``, which returns ``(ngram, models)``.
        inputPath: directory with the prediction JSON files.
        outputFileName: destination path of the JSON summary.
    """
    ngram, models = readModel(modelPath)
    resultQueue = Queue.Queue(0)
    for model in models:
        models[model] = {
                # Rows: what the model said.  Columns: the real answer.
                #              real True   real False
                #   said yes      tp          fp
                #   said no       fn          tn
                "tp":0, # true positive
                "tn":0, # true negative
                "fp":0, # false positive
                "fn":0  # false negative
                }
    connect = pymongo.Connection()
    db = connect.projizz
    ansCol = db.result.data.instance

    def workerFunction(jobObj,tid,args):
        """Score the prediction file ``jobObj``; ``args`` is unused."""
        # ``with`` releases the file handle right after parsing.
        with open(os.path.join(inputPath, jobObj), "r") as fin:
            resultJson = json.load(fin)
        print("worker #%02d read file %s" % (tid, jobObj))

        # Strip the trailing ".txt" (4 characters) to get the revids.
        queries = [key[:-4] for key in resultJson]
        itr = ansCol.find({"revid":{"$in":queries}})
        print("worker #%02d query=%d, result=%d" % (
            tid, len(queries), itr.count()))

        # Each worker call fills its own zeroed copy of the counters.
        partAns = copy.deepcopy(models)

        count = 0
        for ans in itr:
            count += 1
            features = ans["features"]
            resultInstance = resultJson["%s.txt" % (ans["revid"])]

            for relationship in partAns:
                isPositive = False
                isTrue = False

                if relationship in resultInstance and resultInstance[relationship] > 0:
                    isPositive = True

                if relationship in features:
                    isTrue = True

                counters = partAns[relationship]
                if isTrue and isPositive:
                    counters["tp"] += 1
                elif isTrue:
                    counters["fn"] += 1
                elif isPositive:
                    counters["fp"] += 1
                else:
                    counters["tn"] += 1

            if count % 100 == 0:
                print("worker #%02d done %d." % (tid, count))

        resultQueue.put(partAns)

    files = Queue.Queue(0)
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            files.put(filename)

    manager = Manager(8)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()

    print("Result Queue size %d" % resultQueue.qsize())

    # Merge all partial counters.  get_nowait() never blocks, and only
    # Queue.Empty stops the loop: the former bare ``except: break`` also hid
    # real errors while merging and let incomplete totals be written out.
    while True:
        try:
            r = resultQueue.get_nowait()
        except Queue.Empty:
            break
        for m in r:
            models[m]["tp"] += r[m]["tp"]
            models[m]["tn"] += r[m]["tn"]
            models[m]["fp"] += r[m]["fp"]
            models[m]["fn"] += r[m]["fn"]

    print("start write out to %s" % (outputFileName))
    # Close (and thereby flush) the summary before announcing completion.
    with open(outputFileName, "w") as fout:
        json.dump(models, fout)
    print("done")