def main(treeModelPath, dataInputPath, resultOutPath, debug):
    """Run ``filterFiles`` over the JSON files of a directory in a process pool.

    Args:
        treeModelPath: Path handed to ``readModel`` to load the tree model.
        dataInputPath: Directory scanned for input files; only names that
            contain ".json" are submitted.
        resultOutPath: Output directory; created (including missing parents)
            when it does not exist yet.
        debug: When truthy, a single worker process is used and only the
            first matching file is submitted.

    ``readModel`` and ``filterFiles`` are defined elsewhere in the project;
    their contracts are not visible from this function.
    """
    # read model
    treeModel = readModel(treeModelPath)

    # create output dir
    if not os.path.isdir(resultOutPath):
        os.makedirs(resultOutPath)

    core_count = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=1 if debug else core_count)
    # A single pre-formatted argument keeps print() identical on Python 2 and 3.
    print("Number of core: %d" % (core_count))

    start_time = datetime.now()
    jobN = 0
    for filename in os.listdir(dataInputPath):
        if ".json" not in filename:
            continue
        # NOTE(review): the AsyncResult is discarded, so an exception raised
        # inside filterFiles is never re-raised in this process.
        pool.apply_async(filterFiles, (jobN, filename, dataInputPath,
                                       resultOutPath, treeModel, debug))
        if debug:
            # debug mode just tests 1 file in 1 process
            break
        jobN += 1

    pool.close()
    pool.join()

    # total_seconds() keeps the fractional part zero-padded and includes days;
    # the previous "%d.%d" of (seconds, microseconds) printed 1s + 5000us as
    # "1.5000".
    elapsed = datetime.now() - start_time
    print("Spend %.6f seconds" % elapsed.total_seconds())
def main(inputModel, inputPath, outputFileName):
    """Reduce every multi-relation leaf of the pattern tree to one relation.

    Relation scores start from ``buildProperties("../naive_model/PbR/")`` and
    are increased by the per-file results of ``findAnwser``, which runs in a
    process pool over every file in ``inputPath`` whose name contains ".json".
    The pruned tree is then written to ``outputFileName`` as JSON.

    Args:
        inputModel: Path handed to ``readModel`` to load the pattern tree.
        inputPath: Directory scanned for ".json" input files.
        outputFileName: Destination file for the pruned tree (overwritten).

    ``buildProperties``, ``readModel`` and ``findAnwser`` are defined
    elsewhere in the project; their contracts are not visible here.
    """
    properties = buildProperties("../naive_model/PbR/")
    treeModel = readModel(inputModel)

    def trace(root):
        """Walk the tree and keep only the best-scoring entry of each "_rls_"."""
        for key in root:
            if key == "_ptn_":
                continue
            if key != "_rls_":
                # Any other key leads to a child node.
                trace(root[key])
                continue
            candidates = root[key]
            if len(candidates) <= 1:
                continue
            # Strict ">" keeps the first candidate on ties; when no candidate
            # scores above 0 the list collapses to [""], as before.
            best_score = 0
            best_relation = ""
            for relation in candidates:
                if properties[relation] > best_score:
                    best_score = properties[relation]
                    best_relation = relation
            # Rebinding an existing key does not resize the dict, so it is
            # safe while iterating over it.
            root[key] = [best_relation]
            print("%s -> %s" % (root["_ptn_"], root[key]))

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    pending = []
    job_id = 0
    for filename in os.listdir(inputPath):
        if ".json" not in filename:
            continue
        # Every job receives its own copy of the baseline scores.
        baseline = copy.deepcopy(properties)
        pending.append(
            pool.apply_async(findAnwser,
                             (job_id, filename, inputPath, baseline)))
        job_id += 1
    pool.close()
    pool.join()

    # Fold every worker's result into the shared scores.
    for job in pending:
        partial = job.get()
        for relation in partial:
            properties[relation] += partial[relation]

    trace(treeModel)

    print("start write out to %s" % (outputFileName))
    # Close the handle deterministically so the JSON is fully flushed.
    with open(outputFileName, "w") as out:
        json.dump(treeModel, out)

    # total_seconds() keeps the fractional part zero-padded; the previous
    # "%d.%d" of (seconds, microseconds) printed 1s + 5000us as "1.5000".
    elapsed = datetime.now() - start_time
    print("Spend %.6f seconds" % elapsed.total_seconds())
def overlapDetect():
    """Print how the relation pattern files overlap inside the pattern tree.

    Every line of every file in "../naive_model/PbR" is lower-cased, split on
    whitespace and followed word by word through the tree loaded from
    "./patternTree.json". The node reached contributes its "_rls_" entries to
    the per-relation overlap counts and its "_ptn_" value to the
    pattern -> relations index.

    Output depends on the command line:
      * with at least one argument: for each relation, every overlapping
        relation with its count and two percentages;
      * without arguments: every pattern with the relations that contain it,
        most shared patterns first.

    ``readModel`` is defined elsewhere in the project. A word sequence that is
    missing from the tree raises ``KeyError``.
    """
    pattern_dir = "../naive_model/PbR"
    treeModel = readModel("./patternTree.json")
    overlap = {}
    patterns = {}

    for filename in os.listdir(pattern_dir):
        # Drops the last four characters of the name (presumably ".txt" --
        # TODO confirm against the directory contents).
        relation = filename[:-4]
        counts = overlap[relation] = {}
        # "with" closes the file even when a tree lookup below raises.
        with open(os.path.join(pattern_dir, filename), "r") as f:
            for line in f:
                # The last two characters of each line are discarded
                # (presumably a two-character terminator -- TODO confirm).
                words = line[:-2].lower().split()
                node = treeModel
                for word in words:
                    node = node[word]
                for rls in node["_rls_"]:
                    counts[rls] = counts.get(rls, 0) + 1
                owners = patterns.setdefault(node["_ptn_"], [])
                if relation not in owners:
                    owners.append(relation)

    if len(sys.argv) > 1:
        for relation in overlap:
            over = sorted(overlap[relation].items(),
                          key=lambda x: x[1], reverse=True)
            print(relation)
            if not over:
                # An empty pattern file has nothing to compare; previously
                # over[0] raised IndexError here.
                continue
            total = over[0][1]
            for rela, count in over:
                # p: share relative to the largest count of this relation.
                p = 100 * float(count) / float(total)
                # p2: the reverse overlap relative to rela's own count.
                p2 = 100 * float(overlap[rela][relation]) / float(
                    overlap[rela][rela])
                if p > .0:
                    # The Python 2 print statement writes no space after an
                    # item ending in "\t", hence the tab is glued to the name.
                    print("\t%s %s (%.2f%% / %.2f%% (%d))" % (
                        rela, count, p, p2, overlap[rela][rela]))
                else:
                    break
    else:
        sortedPattern = sorted(patterns.items(),
                               key=lambda x: len(x[1]), reverse=True)
        for ptn, rls in sortedPattern:
            print("%s %s %s" % (ptn, len(rls), rls))
def overlapDetect():
    """Print how the relation pattern files overlap inside the pattern tree.

    Every line of every file in "../naive_model/PbR" is lower-cased, split on
    whitespace and followed word by word through the tree loaded from
    "./patternTree.json". The node reached contributes its "_rls_" entries to
    the per-relation overlap counts and its "_ptn_" value to the
    pattern -> relations index.

    Output depends on the command line:
      * with at least one argument: for each relation, every overlapping
        relation with its count and two percentages;
      * without arguments: every pattern with the relations that contain it,
        most shared patterns first.

    ``readModel`` is defined elsewhere in the project. A word sequence that is
    missing from the tree raises ``KeyError``.
    """
    pattern_dir = "../naive_model/PbR"
    treeModel = readModel("./patternTree.json")
    overlap = {}
    patterns = {}

    for filename in os.listdir(pattern_dir):
        # Drops the last four characters of the name (presumably ".txt" --
        # TODO confirm against the directory contents).
        relation = filename[:-4]
        counts = overlap[relation] = {}
        # "with" closes the file even when a tree lookup below raises.
        with open(os.path.join(pattern_dir, filename), "r") as f:
            for line in f:
                # The last two characters of each line are discarded
                # (presumably a two-character terminator -- TODO confirm).
                words = line[:-2].lower().split()
                node = treeModel
                for word in words:
                    node = node[word]
                for rls in node["_rls_"]:
                    counts[rls] = counts.get(rls, 0) + 1
                owners = patterns.setdefault(node["_ptn_"], [])
                if relation not in owners:
                    owners.append(relation)

    if len(sys.argv) > 1:
        for relation in overlap:
            over = sorted(overlap[relation].items(),
                          key=lambda x: x[1], reverse=True)
            print(relation)
            if not over:
                # An empty pattern file has nothing to compare; previously
                # over[0] raised IndexError here.
                continue
            total = over[0][1]
            for rela, count in over:
                # p: share relative to the largest count of this relation.
                p = 100 * float(count) / float(total)
                # p2: the reverse overlap relative to rela's own count.
                p2 = 100 * float(overlap[rela][relation]) / float(
                    overlap[rela][rela])
                if p > .0:
                    # The Python 2 print statement writes no space after an
                    # item ending in "\t", hence the tab is glued to the name.
                    print("\t%s %s (%.2f%% / %.2f%% (%d))" % (
                        rela, count, p, p2, overlap[rela][rela]))
                else:
                    break
    else:
        sortedPattern = sorted(patterns.items(),
                               key=lambda x: len(x[1]), reverse=True)
        for ptn, rls in sortedPattern:
            print("%s %s %s" % (ptn, len(rls), rls))
def buildOverlapMatrix():
    """Print relation pairs whose overlap counts equal their own counts.

    A square count matrix is built over the relations named by the files in
    "../naive_model/PbR/": ``overlapMatrix[a][b]`` is the number of lines of
    ``a``'s pattern file whose tree node lists ``b`` in "_rls_". A pair
    ``a <-> b`` is printed when ``overlapMatrix[a][b] == overlapMatrix[a][a]``
    and the same holds with the roles swapped, so each such pair appears once
    per direction.

    Nothing is returned; the result is only printed. ``readModel`` is defined
    elsewhere in the project. A word sequence missing from the tree, or a
    "_rls_" entry that has no pattern file, raises ``KeyError``.
    """
    pattern_dir = "../naive_model/PbR/"
    treeModel = readModel("./patternTree.json")

    # File names minus their last four characters (the ".txt" suffix that is
    # re-appended when the files are opened below).
    keys = [filename[:-4] for filename in os.listdir(pattern_dir)]

    # overlapMatrix [ relaA ] [ relaB ]
    # means that relaA in relaB count (%?)
    overlapMatrix = {}
    for relation in keys:
        overlapMatrix[relation] = dict.fromkeys(keys, 0)

    # read tree
    for ontology in overlapMatrix:
        row = overlapMatrix[ontology]
        # "with" closes the file even when a tree lookup below raises.
        with open("../naive_model/PbR/%s.txt" % (ontology), "r") as f:
            for pattern in f:
                # The last two characters of each line are discarded
                # (presumably a two-character terminator -- TODO confirm).
                words = pattern[:-2].lower().split()
                node = treeModel
                for word in words:
                    node = node[word]
                for relaA in node["_rls_"]:
                    row[relaA] += 1

    # over[a] lists every other relation b with matrix[a][b] == matrix[a][a].
    over = {}
    for ontology in overlapMatrix:
        row = overlapMatrix[ontology]
        thisCount = row[ontology]
        for otherOntology in row:
            if otherOntology == ontology:
                continue
            if row[otherOntology] != thisCount:
                continue
            matches = over.setdefault(ontology, [])
            if otherOntology not in matches:
                matches.append(otherOntology)

    # Report only the pairs where the equality holds in both directions.
    for ontology in over:
        for otherOntology in over[ontology]:
            if otherOntology in over and ontology in over[otherOntology]:
                print("%s <-> %s" % (ontology, otherOntology))
def main(inputModel, inputPath, outputFileName):
    """Reduce every multi-relation leaf of the pattern tree to one relation.

    Relation scores start from ``buildProperties("../naive_model/PbR/")`` and
    are increased by the per-file results of ``findAnwser``, which runs in a
    process pool over every file in ``inputPath`` whose name contains ".json".
    The pruned tree is then written to ``outputFileName`` as JSON.

    Args:
        inputModel: Path handed to ``readModel`` to load the pattern tree.
        inputPath: Directory scanned for ".json" input files.
        outputFileName: Destination file for the pruned tree (overwritten).

    ``buildProperties``, ``readModel`` and ``findAnwser`` are defined
    elsewhere in the project; their contracts are not visible here.
    """
    properties = buildProperties("../naive_model/PbR/")
    treeModel = readModel(inputModel)

    def trace(root):
        """Walk the tree and keep only the best-scoring entry of each "_rls_"."""
        for key in root:
            if key == "_ptn_":
                continue
            if key != "_rls_":
                # Any other key leads to a child node.
                trace(root[key])
                continue
            candidates = root[key]
            if len(candidates) <= 1:
                continue
            # Strict ">" keeps the first candidate on ties; when no candidate
            # scores above 0 the list collapses to [""], as before.
            best_score = 0
            best_relation = ""
            for relation in candidates:
                if properties[relation] > best_score:
                    best_score = properties[relation]
                    best_relation = relation
            # Rebinding an existing key does not resize the dict, so it is
            # safe while iterating over it.
            root[key] = [best_relation]
            print("%s -> %s" % (root["_ptn_"], root[key]))

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    pending = []
    job_id = 0
    for filename in os.listdir(inputPath):
        if ".json" not in filename:
            continue
        # Every job receives its own copy of the baseline scores.
        baseline = copy.deepcopy(properties)
        pending.append(
            pool.apply_async(findAnwser,
                             (job_id, filename, inputPath, baseline)))
        job_id += 1
    pool.close()
    pool.join()

    # Fold every worker's result into the shared scores.
    for job in pending:
        partial = job.get()
        for relation in partial:
            properties[relation] += partial[relation]

    trace(treeModel)

    print("start write out to %s" % (outputFileName))
    # Close the handle deterministically so the JSON is fully flushed.
    with open(outputFileName, "w") as out:
        json.dump(treeModel, out)

    # total_seconds() keeps the fractional part zero-padded; the previous
    # "%d.%d" of (seconds, microseconds) printed 1s + 5000us as "1.5000".
    elapsed = datetime.now() - start_time
    print("Spend %.6f seconds" % elapsed.total_seconds())