def test_ExportArchiFolderModels(cleandir):
    """Export the models under the 'Scenarios' folder and verify that both
    the CSV export and the pickled Concepts file are written."""
    if __name__ == u"__main__":
        cleandir()

    assert os.path.isfile(fileArchimateTest) is True

    al = ArchiLib(fileArchimateTest)

    folder = u"Scenarios"
    logger.info(u"Exporting Folder : %s" % folder)

    modelNames = al.getModelsInFolder(folder)
    assert modelNames is not None
    logger.info(u"len(listMTE) = %d" % len(modelNames))
    assert len(modelNames) == 2

    concepts = Concepts(u"Export", u"Pickle")
    for ModelToExport in modelNames:
        logger.info(u" Model : %s" % ModelToExport)
        concepts.addConceptKeyType(ModelToExport, u"Model")
        al.recurseModel(ModelToExport)

    al.outputCSVtoFile(concepts, fileExport=fileCSVExport)
    assert os.path.isfile(fileCSVExport) is True

    Concepts.saveConcepts(concepts, fileConceptsExport)
    logger.info(u"Save Concepts : %s" % fileConceptsExport)
    assert os.path.isfile(fileConceptsExport) is True
def requirementAnalysis(fileArchimate=None):
    """Collect archimate:Requirement nodes, POS-tag the nouns in every
    requirement sentence into a Concepts tree, pickle it, then chunk it."""
    if fileArchimate is None:
        fileArchimate = u"/Users/morrj140/Documents/SolutionEngineering/Archimate Models/DVC v38.archimate"

    al = ArchiLib(fileArchimate)
    conceptsFile = fileConceptsRequirements

    searchTypes = [u"archimate:Requirement"]
    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words in Requirements...")
    concepts = Concepts(u"Requirement", u"Requirements")

    docNum = 0
    for sentence in nl:
        docNum += 1
        logger.debug(u"%s" % sentence)

        docConcept = concepts.addConceptKeyType(u"Document" + str(docNum), u"Document")
        sentConcept = docConcept.addConceptKeyType(sentence, u"Sentence" + str(docNum))

        if sentence is not None:
            # Drop stopwords before tagging, then keep only nouns.
            cleanSentence = ' '.join(w for w in sentence.split(u" ") if w not in stop)
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                if len(word) > 1 and pos[0] == u"N":
                    wordConcept = sentConcept.addConceptKeyType(word, u"Word")
                    wordConcept.addConceptKeyType(pos, u"POS")

    Concepts.saveConcepts(concepts, conceptsFile)
    logger.info(u"Saved : %s" % conceptsFile)

    chunks = Chunks(concepts)
    chunks.createChunks()
def test_ExportArchi(cleandir):
    """Walk every edge of the test model into a Concepts tree, pickle it,
    and check the expected node/edge/concept counts."""
    if __name__ == u"__main__":
        cleandir()

    logger.info(u"Using : %s" % fileArchimateTest)
    assert os.path.isfile(fileArchimateTest) is True

    al = ArchiLib(fileArchimateTest)
    assert al is not None

    concepts = Concepts(u"Node", u"Nodes")
    assert concepts is not None

    logger.info(u"Found %d Nodes" % len(al.dictNodes))
    logger.info(u"Found %d Edges" % len(al.dictEdges))
    assert len(al.dictNodes) == 45
    assert len(al.dictEdges) == 36

    for edgeKey in al.dictEdges.keys():
        logger.info(u"[%s]=%s" % (al.dictEdges[edgeKey][u"id"], edgeKey))
        if u"source" not in al.dictEdges[edgeKey]:
            continue

        srcID = al.dictEdges[edgeKey][u"source"]
        tgtID = al.dictEdges[edgeKey][u"target"]
        logger.info(u" Rel : %s" % (al.dictEdges[edgeKey][ARCHI_TYPE]))

        srcName = al.getNodeName(srcID)
        tgtName = al.getNodeName(tgtID)
        logger.info(u" %s--%s--%s" % (srcName, al.dictEdges[edgeKey][ARCHI_TYPE][10:], tgtName))

        srcConcept = concepts.addConceptKeyType(srcName, al.dictNodes[srcID][ARCHI_TYPE][10:])
        srcConcept.addConceptKeyType(tgtName, al.dictNodes[tgtID][ARCHI_TYPE][10:])

    Concepts.saveConcepts(concepts, fileConceptsExport)
    assert len(concepts.cd) == 17
    assert os.path.isfile(fileConceptsExport) is True
    assert concepts.typeName == u"Nodes"
def saveTopics(self, topics):
    """Persist (topic, count) pairs as a Concepts tree.

    Each entry of *topics* is indexed positionally: [0] is the topic text,
    [1] its count.  The tree is saved to self.topicsFile and returned.
    """
    wordConcepts = Concepts(u"TopicConcepts", u"Topics")

    for entry in topics:
        logger.debug(u"Topic:" + entry[0])
        topicConcept = wordConcepts.addConceptKeyType(entry[0], u"Topic")
        topicConcept.count = entry[1]

    Concepts.saveConcepts(wordConcepts, self.topicsFile)
    return wordConcepts
def exportArchi(self):
    """Export the Archimate model's nodes and relations into a Concepts tree.

    For every edge that has both endpoints, adds
    source-node -> hashed-relation -> target-node concepts, saves the tree
    to self.fileConceptsExport and returns it.
    """
    concepts = Concepts(u"Node", u"Nodes")

    logger.info(u"Found %d Nodes" % len(self.al.dictNodes))
    logger.info(u"Found %d Edges" % len(self.al.dictEdges))

    for x in self.al.dictEdges.keys():
        logger.debug(u"Edge [%s]=%s" % (self.al.dictEdges[x], x))

        # Skip malformed edges missing either endpoint.
        # (was: dict.has_key(), removed in Python 3 — `in` is the idiom)
        if u"source" not in self.al.dictEdges[x] or u"target" not in self.al.dictEdges[x]:
            continue

        typeEdge = self.al.dictEdges[x][ARCHI_TYPE]
        logger.debug(u"Edge : %s" % typeEdge)

        source = self.al.dictEdges[x][u"source"]
        logger.debug(u"Source : %s" % source)
        target = self.al.dictEdges[x][u"target"]
        logger.debug(u"Target : %s" % target)

        logger.debug(u" Rel : %s" % (self.al.dictEdges[x][ARCHI_TYPE]))

        sourceName = self.al.getNodeName(source)
        targetName = self.al.getNodeName(target)

        logger.debug(u" %s--%s--%s" % (sourceName, self.al.dictEdges[x][ARCHI_TYPE][10:], targetName))

        if source in self.al.dictNodes:
            sc = concepts.addConceptKeyType(sourceName, self.al.dictNodes[source][ARCHI_TYPE][10:])

            # Relation concepts need a unique key: hash the (source, target)
            # pair so parallel relations of the same type do not collide.
            nameEdge = u"(" + sourceName + u"," + targetName + u")"
            logger.debug(u"nameEdge : %s[%d]" % (nameEdge, len(nameEdge)))
            logger.debug(u"typeEdge : %s" % typeEdge[10:])

            ne = str(self.al.cleanString(nameEdge))
            hl = hashlib.sha224(str(ne)).hexdigest()
            logger.debug(u"hash : %s" % hl)

            nh = u"%s-%s" % (typeEdge[10:], hl)
            rc = sc.addConceptKeyType(nh, typeEdge[10:])

            if target in self.al.dictNodes:
                rc.addConceptKeyType(targetName, self.al.dictNodes[target][ARCHI_TYPE][10:])

    Concepts.saveConcepts(concepts, self.fileConceptsExport)

    return concepts
def getOpenXmlText(filename, ftype):
    """Read an OpenXML document (docx/xlsx/pptx) and return a Concepts tree
    containing its document properties and indexable text.

    :param filename: path to the OpenXML file
    :param ftype: type label for the root concept
    :return: Concepts tree with PROPERTIES and TEXT children
    """
    logger.info("OpenXmlText: %s" % filename)

    # Removed dead assignment `document = openxmldoc` — the name was never
    # used and risked a NameError if the global was undefined.
    doc = openxmllib.openXmlDocument(path=filename)

    c = Concepts(filename, ftype)

    logger.debug("%s\n" % (doc.allProperties))
    ap = c.addConceptKeyType("allProperties", "PROPERTIES")
    for x in doc.allProperties:
        logger.info("cp %s:%s" % (x, doc.allProperties[x]))
        ap.addConceptKeyType(doc.allProperties[x], x)

    # Extract once; the original called indexableText() twice.
    text = doc.indexableText(include_properties=True)
    logger.info("it %s\n" % (text))
    c.addConceptKeyType(text, "TEXT")

    return c
def test_ExportArchi(cleandir):
    """Exercise the node/edge export of the test model and assert the
    resulting Concepts tree has the expected size and type."""
    if __name__ == u"__main__":
        cleandir()

    logger.info(u"Using : %s" % fileArchimateTest)
    assert os.path.isfile(fileArchimateTest) is True

    archi = ArchiLib(fileArchimateTest)
    assert archi is not None

    nodeConcepts = Concepts(u"Node", u"Nodes")
    assert nodeConcepts is not None

    logger.info(u"Found %d Nodes" % len(archi.dictNodes))
    logger.info(u"Found %d Edges" % len(archi.dictEdges))
    assert len(archi.dictNodes) == 45
    assert len(archi.dictEdges) == 36

    for x in archi.dictEdges.keys():
        logger.info(u"[%s]=%s" % (archi.dictEdges[x][u"id"], x))

        if u"source" in archi.dictEdges[x]:
            source = archi.dictEdges[x][u"source"]
            target = archi.dictEdges[x][u"target"]
            logger.info(u" Rel : %s" % (archi.dictEdges[x][ARCHI_TYPE]))

            sourceName = archi.getNodeName(source)
            targetName = archi.getNodeName(target)
            logger.info(u" %s--%s--%s" % (sourceName, archi.dictEdges[x][ARCHI_TYPE][10:], targetName))

            sc = nodeConcepts.addConceptKeyType(sourceName, archi.dictNodes[source][ARCHI_TYPE][10:])
            sc.addConceptKeyType(targetName, archi.dictNodes[target][ARCHI_TYPE][10:])

    Concepts.saveConcepts(nodeConcepts, fileConceptsExport)
    assert len(nodeConcepts.cd) == 17
    assert os.path.isfile(fileConceptsExport) is True
    assert nodeConcepts.typeName == u"Nodes"
def getPDFText(filename):
    """Extract the text of every page of a PDF.

    Also builds a Concepts tree of the PDF's document-info entries
    (currently discarded — kept for parity with the original code).

    :param filename: path to the PDF file
    :return: list of page texts, each suffixed with ". "
    """
    logger.info("filename: %s" % filename)

    newparatextlist = []

    # NOTE: the stream must stay open while PdfFileReader lazily reads
    # pages, so it is not closed here.  `file()` replaced with `open()`.
    pdfDoc = PdfFileReader(open(filename, "rb"))
    pdfDict = pdfDoc.getDocumentInfo()

    c = Concepts(filename, "PDF")
    for x in pdfDict.keys():
        try:
            # x is an InfoDict key such as "/Title"; strip the leading "/".
            c.addConceptKeyType(x[1:], pdfDict[x])
        except Exception:
            # Narrowed from a bare except; some info values are unhashable
            # or non-string and are simply skipped.
            logger.warn("ops...")

    # c.logConcepts()

    for page in pdfDoc.pages:
        text = page.extractText()
        logger.info("PDF : %s" % text)
        newparatextlist.append(text + ". ")

    return newparatextlist
def exportArchiFolderModels(self, folder):
    """Export every model found in *folder* to CSV and a pickled Concepts file."""
    logger.info(u"Exporting Folder : %s" % folder)

    modelNames = self.al.getModelsInFolder(folder)

    concepts = Concepts(u"Export", u"Pickle")
    for ModelToExport in modelNames:
        logger.info(u" Model : %s" % ModelToExport)
        modelConcept = concepts.addConceptKeyType(ModelToExport, u"Model")
        self.al.recurseModel(ModelToExport, modelConcept)

    self.al.outputCSVtoFile(concepts, fileCSVExport)

    Concepts.saveConcepts(concepts, self.conceptsFile)
    logger.info(u"Save Concepts : %s" % self.conceptsFile)
def test_RequirementAnalysis(cleandir, fileArchimate):
    """Extract noun concepts from Requirement nodes, pickle them, and chunk them."""
    assert os.path.isfile(filePPTXIn) is True

    al = ArchiLib(fileArchimate)
    conceptsFile = fileConceptsRequirements

    searchTypes = list()
    searchTypes.append(u"archimate:Requirement")
    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words in Requirements...")
    concepts = Concepts(u"Requirement", u"Requirements")

    sentenceNum = 0
    for sentence in nl:
        sentenceNum += 1
        logger.debug(u"%s" % sentence)

        doc = concepts.addConceptKeyType(u"Document" + unicode(sentenceNum), u"Document")
        sent = doc.addConceptKeyType(sentence, u"Sentence" + unicode(sentenceNum))

        if sentence is not None:
            # Strip stopwords, then keep only noun tokens.
            cleanSentence = ' '.join(w for w in sentence.split(" ") if w not in stop)
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                if len(word) > 1 and pos[0] == u"N":
                    wordConcept = sent.addConceptKeyType(word, u"Word")
                    wordConcept.addConceptKeyType(pos, u"POS")

    Concepts.saveConcepts(concepts, conceptsFile)
    logger.info(u"Saved : %s" % conceptsFile)
    assert os.path.isfile(conceptsFile) is True

    chunks = Chunks(concepts)
    chunks.createChunks()
    assert os.path.isfile(fileConceptsChunks) is True
class DocumentsSimilarity(object):
    """Builds a topic model over a Concepts document set and records
    pairwise document similarities, optionally fanning the similarity
    computation out to worker threads via a shared work queue.

    Relies on module-level collaborators: TopicsModel, Concepts, myThread,
    workQueue/queueLock, THREAD/ThreadDepth/QueueDepth/QueueDelta,
    num_topics/num_words/similarity.
    """

    # Class-level defaults; instances assign real values in the methods below.
    concepts = None            # Concepts tree the topics are built from
    conceptsSimilarity = None  # Concepts tree of similarity results
    tm = None                  # TopicsModel instance
    documentsList = None       # list of tokenized documents
    wordcount = None           # total word count across documents
    threads = None             # worker threads when THREAD is enabled
    topics = None              # topics computed by TopicsModel
    topicConcepts = None       # saved topic Concepts
    mapDocumentList = None
    df = None                  # document keys, parallel to documentsList
    mapDocuments = None        # index -> document lookup

    def __init__(self):
        self.threads = list()

    def createTopics(self, conceptsFile, concepts=None):
        """Load (or accept) a Concepts tree, build topics, and cache
        documents/topics on self.  Returns the Concepts tree, or None if
        the document set contained no words."""
        if concepts is None:
            logger.info(u"Load Concepts from " + conceptsFile)
            self.concepts = Concepts.loadConcepts(conceptsFile)
            logger.info(u"Loaded Concepts")
        else:
            self.concepts = concepts

        self.tm = TopicsModel()

        logger.info(u"Load Documents from Concepts")
        self.documentsList, self.wordcount = self.tm.loadConceptsWords(self.concepts)
        # index -> document, so similar documents can be mapped back later.
        self.mapDocuments = {self.documentsList.index(x): x for x in self.documentsList}
        self.df = self.concepts.getConcepts().keys()

        logger.info(u"Read " + str(len(self.documentsList)) + u" Documents, with " + str(self.wordcount) + u" words.")

        if self.wordcount == 0:
            logger.error(u"No topics to use!")
            return None

        logger.info(u"Compute Topics")
        # num_topics / num_words come from module scope.
        self.topics = self.tm.computeTopics(self.documentsList, nt=num_topics, nw=num_words)

        logger.info(u"Log Topics")
        self.tm.logTopics(self.topics)

        # Topic labels with embedded double quotes stripped.
        self.listTopics = [unicode(x[0]).strip().replace(u"\"", u"") for x in self.topics]

        logger.info(u"Saving Topics")
        self.topicConcepts = self.tm.saveTopics(self.topics)

        logger.info(u"Complete createTopics")

        return self.concepts

    def findSimilarties(self, conceptsSimilarityFile):
        """Compute document-to-document similarities and save them.

        When THREAD is set, documents are pushed onto workQueue and
        processed by myThread workers; otherwise doComputation() is called
        inline for each document.
        """
        logger.info(u"Compute Similarity")

        self.conceptsSimilarity = Concepts(u"ConceptsSimilarity", u"Similarities")

        # Compute similarity between documents / concepts
        similarityThreshold = similarity

        if THREAD:
            # startup the threads
            for threadID in range(0, ThreadDepth):
                thread = myThread(threadID, self)
                thread.start()
                self.threads.append(thread)

        for document in self.documentsList:
            indexNum = self.documentsList.index(document)

            logger.info(u"Document %s" % (self.df[indexNum]))
            pj = self.conceptsSimilarity.addConceptKeyType(self.df[indexNum], u"Document")

            logger.debug(u" documentsList[%d] = %s" % (indexNum, str(document)))

            # Show common topics
            d = [unicode(x).strip().replace(u"'", u"") for x in document]
            e = [unicode(y).strip().replace(u"\"", u"") for y in self.listTopics]

            s1 = set(e)
            s2 = set(d)
            common = s1 & s2
            lc = [x for x in common]
            logger.debug(u" Common Topics : %s" % (lc))

            if THREAD is False:
                self.doComputation(document, similarityThreshold, pj, Topics=True)
            else:
                logger.debug(u"npbtAquire Queue Lock")
                queueLock.acquire()
                logger.debug(u"npbtPut Queue ")
                rl = [document, similarityThreshold]
                # Back-pressure: block while the work queue is full.
                while workQueue.qsize() == QueueDepth:
                    time.sleep(1)
                workQueue.put(rl)
                queueLock.release()
                logger.debug(u"npbtRelease Queue Lock")
                qs = workQueue.qsize()
                if qs % QueueDelta == 0:
                    logger.info(u"rQueue Size = %s" % qs)

        # Wait for queue to empty
        qs = workQueue.qsize()
        while qs != 0:
            time.sleep(1)
            qs = workQueue.qsize()
            if qs % QueueDelta == 0:
                logger.info(u"wQueue Size = %s" % qs)

        # Notify threads it's time to exit
        # NOTE(review): this rebinds a LOCAL name; if exitFlag is a
        # module-level flag read by the workers, a `global exitFlag`
        # declaration is needed for this to have any effect — confirm.
        exitFlag = 1

        # Wait for all threads to complete
        for t in self.threads:
            logger.info(u"Waiting for thread %s to end..." % t)
            t.join(0.5)

        Concepts.saveConcepts(self.conceptsSimilarity, conceptsSimilarityFile)
        # Concepts.outputConceptsToCSV(self.conceptsSimilarity, fileExport=u"BusinessRequirements.csv")

        logger.info(u"Complete - findSimilarties")

        return self.conceptsSimilarity

    def doComputation(self, j, similarityThreshold, pj, Topics=False):
        """Attach SimilarDocument/DocumentTopics/CommonTopic concepts under
        *pj* for every document similar to *j* above the threshold.

        :param j: a document (looked up by value in documentsList)
        :param pj: parent concept the results are attached to
        """
        pt = None
        pl = self.tm.computeSimilar(self.documentsList.index(j), self.documentsList, similarityThreshold)

        if len(pl) != 0:
            logger.debug(u" similarity above threshold")
            logger.debug(u" pl:" + str(pl))

            for l in pl:
                # l is (metric, documentA, documentB); skip self-matches.
                if l[1] != l[2]:
                    logger.debug(u" l:" + str(l))
                    ni = self.documentsList.index(l[2])
                    mdl = u",".join([q for q in self.mapDocuments[ni]])
                    dfni = unicode(self.df[ni])
                    logger.info(u" Similar Document : %s" % (dfni))
                    ps = pj.addConceptKeyType(dfni, u"SimilarDocument")
                    # ps.count = TopicsModel.convertMetric(l[0])

                    common = set(l[1]) & set(l[2])
                    lc = [x for x in common]
                    logger.debug(u" l[1] : %s" % (l[1]))
                    logger.debug(u" l[2] : %s" % (l[2]))
                    logger.debug(u" Common : %s" % (lc))

                    pt = ps.addConceptKeyType(mdl, u"DocumentTopics")
                    for x in common:
                        pc = pt.addConceptKeyType(x, u"CommonTopic")
                        pc.count = len(lc)
        else:
            logger.debug(u" similarity below threshold")
class DocumentsSimilarity(object):
    """Topic-model a Concepts tree and collect similar-document pairs.

    Single-threaded variant used for gap analysis: createTopics() builds
    the topic model, findSimilarties() records similarities into a
    Concepts tree and pickles it to self.conceptsSimilarityFile.
    """

    conceptsDoc = None
    conceptsSimilarity = None
    conceptsSimilarityFile = None
    tm = None
    documentsList = None
    wordcount = None
    threads = None
    topics = None
    topicConcepts = None
    lemmatizer = None
    df = None

    def __init__(self, al):
        # Tuning knobs for the topic model and the similarity cutoff.
        self.num_topics = 100
        self.num_words = 50
        self.similarity = 0.95
        self.al = al
        self.conceptsSimilarityFile = u"GapsSimilarity.p"

    def createTopics(self, concepts):
        """Build topics from *concepts*; caches documents and topics on self."""
        self.concepts = concepts
        self.tm = TopicsModel(directory=os.getcwd() + os.sep)

        logger.debug(u"--Load Documents from Concepts")
        self.documentsList, self.wordcount = self.tm.loadConceptsWords(self.concepts)
        logger.info(u"--Read %s Documents, with %s words." %
                    (str(len(self.documentsList)), str(self.wordcount)))

        logger.info(u"--Compute Topics--")
        self.topics = self.tm.computeTopics(self.documentsList, nt=self.num_topics, nw=self.num_words)

        if False:  # verbose topic dump, disabled
            logger.info(u"--Log Topics--")
            self.tm.logTopics(self.topics)

        self.listTopics = [x[0] for x in self.topics]

        logger.info(u"--Saving Topics--")
        self.topicConcepts = self.tm.saveTopics(self.topics)

    def findSimilarties(self):
        """Record documents whose similarity exceeds self.similarity.

        Returns the resulting Concepts tree (also pickled to
        self.conceptsSimilarityFile).
        """
        logger.info(u"Compute Similarity")

        self.conceptsSimilarity = Concepts(u"ConceptsSimilarity", u"Similarities")

        # Compute similarity between documents / concepts
        similarityThreshold = self.similarity

        # Keys are parallel to documentsList; compute once, not per iteration.
        self.df = self.concepts.getConcepts().keys()

        # enumerate() gives the true position; the original .index() lookup
        # was O(n) per pass and mapped duplicate documents to the first hit.
        for indexNum, document in enumerate(self.documentsList):
            logger.info(u"++conceptsDoc %s" % (self.df[indexNum]))
            logger.info(u" documentsList[" + str(indexNum) + u"]=" + u"".join(x + u" " for x in document))

            # Show topics this document shares with the model's topic labels.
            d = [unicode(x).strip().replace(u"'", u"") for x in document]
            e = [unicode(y).strip().replace(u"\"", u"") for y in self.listTopics]
            common = set(e) & set(d)
            lc = [x for x in common]
            logger.info(u" Common Topics : %s{%s}" % (lc, self.al.dictName[document][ARCHI_TYPE]))

            self.doComputation(indexNum, similarityThreshold, tfAddWords=True)

        # BUG FIX: was `conceptsSimilarityFile` — an undefined local name
        # raising NameError; the attribute set in __init__ was intended.
        Concepts.saveConcepts(self.conceptsSimilarity, self.conceptsSimilarityFile)
        logger.info(u"Saved Concepts : %s" % self.conceptsSimilarityFile)

        return self.conceptsSimilarity

    def doComputation(self, j, similarityThreshold, tfAddWords=True):
        """Add Similar/Concept/CommonTopic entries for document index *j*."""
        logger.debug(u"--doComputation--")
        pl = self.tm.computeSimilar(j, self.documentsList, similarityThreshold)

        if len(pl) == 0:
            logger.debug(u" similarity below threshold")
            return

        logger.debug(u" similarity above threshold - %2.3f" % (100.0 * float(pl[0][0])))
        logger.debug(u" pl:" + str(pl))

        for l in pl:
            # l is (metric, documentA, documentB); skip self-matches.
            if l[1] == l[2]:
                continue

            logger.debug(u" l:" + str(l))

            l1 = u"".join(x + u" " for x in l[1])
            ps = self.conceptsSimilarity.addConceptKeyType(l1, u"Similar")
            ps.count = TopicsModel.convertMetric(l[0])

            l2 = u"".join(x + u" " for x in l[2])
            pt = ps.addConceptKeyType(l2, u"Concept")

            common = set(l[1]) & set(l[2])
            lc = [x for x in common]
            logger.debug(u" l : %s" % l)
            logger.debug(u" l[1] : %s" % (l1))
            logger.debug(u" l[2] : %s" % (l2))
            logger.debug(u" Common : %s" % (lc))

            if tfAddWords is True:
                for x in common:
                    if x not in stop:
                        logger.debug(u"word : %s" % x)
                        pc = pt.addConceptKeyType(x, u"CommonTopic")
                        pc.count = len(lc)
def gapSimilarity(fileArchimate, searchTypes):
    """Lemmatise the nouns of nodes matching *searchTypes*, then find
    collocations, topics, and cross-document similarities; the sorted
    topic list is written to topic_sort.txt."""
    lemmatizer = WordNetLemmatizer()

    logger.info(u"Using : %s" % fileArchimate)
    al = ArchiLib(fileArchimate)

    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words...")
    concepts = Concepts(u"Word", u"Topic")

    docNum = 0
    for sentence in nl:
        docNum += 1
        if sentence is None:
            continue

        logger.info(u"%s" % sentence)

        docConcept = concepts.addConceptKeyType(u"Document" + str(docNum), nl[sentence][ARCHI_TYPE])
        sentConcept = docConcept.addConceptKeyType(sentence, nl[sentence][ARCHI_TYPE])

        cleanSentence = u' '.join(w for w in sentence.split(u" ") if w not in stop)
        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
            if len(word) > 1 and pos[0] == u"N":
                lemmaConcept = sentConcept.addConceptKeyType(lemmatizer.lemmatize(word.lower()), u"LemmaWord")
                lemmaConcept.addConceptKeyType(pos, u"POS")

    if False:  # debug dump, disabled
        concepts.logConcepts()

    if True:
        logger.info(u"Find Collocations...")
        fc = Collocations()
        fc.find_collocations(concepts)

    if True:
        npbt = DocumentsSimilarity(al)

        logger.info(u"Create Topics")
        npbt.createTopics(concepts)

        if True:
            logger.info(u"Find Similarities")
            nc = npbt.findSimilarties()

            logger.debug(u"Topics")
            listTopics = list()
            for x in npbt.topicConcepts.getConcepts().values():
                logger.info(u"%s[%d]" % (x.name, x.count))
                listTopics.append((x.name, x.count))

            logger.info(u"Topics Sorted")
            with open(u"topic_sort.txt", "wb") as f:
                for name, count in sorted(listTopics, key=lambda c: abs(c[1]), reverse=False):
                    output = "Topic : %s[%d]" % (name, count)
                    logger.info(output)
                    f.write(output + os.linesep)
class Chunks(object):
    """Derive noun-phrase and subject/verb/object chunk concepts from a
    Concepts tree of documents and sentences, saving the result to
    self.chunkFile."""

    # Default pickle file names used when no Concepts instance is supplied.
    conceptFile = 'documents.p'
    chunkFile = 'chunks.p'
    concepts = None
    chunkConcepts = None

    def __init__(self, concepts=None):
        # Load the document concepts from disk unless the caller passes a tree.
        if concepts == None:
            logger.info("Loading : %s" % self.conceptFile)
            self.concepts = Concepts.loadConcepts(self.conceptFile)
        else:
            logger.info("Using : %s" % concepts.name)
            self.concepts = concepts

        self.chunkConcepts = Concepts("Chunk", "Chunks")

    def getChunkConcepts(self):
        # Accessor for the chunk tree built by createChunks().
        return self.chunkConcepts

    def createChunks(self):
        """Populate self.chunkConcepts with NounSentence and SBJ/VP/OBJ
        concepts for every sentence of every document, then save it."""
        # Stop list: standard English stopwords plus punctuation/noise tokens.
        stop = stopwords.words('english')
        stop.append("This")
        stop.append("The")
        stop.append(",")
        stop.append(".")
        stop.append("..")
        stop.append("...")
        stop.append(".")
        stop.append(";")
        stop.append("and")

        stemmer = PorterStemmer()  # NOTE(review): created but never used below
        lemmatizer = WordNetLemmatizer()
        tokenizer = RegexpTokenizer("[\w]+")  # NOTE(review): created but never used below

        for document in self.concepts.getConcepts().values():
            logger.info("%s" % document.name)
            d = self.chunkConcepts.addConceptKeyType(document.name, "Document")

            for sentence in document.getConcepts().values():
                logger.debug("%s(%s)" % (sentence.name, sentence.typeName))
                cleanSentence = ' '.join([word for word in sentence.name.split() if word not in stop])

                # Collect (word, lemma, POS, root, hypernyms, hyponyms)
                # for every noun token that has a WordNet synset.
                listSentence = list()
                for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                    logger.debug("Word: " + word + " POS: " + pos)
                    if pos[:1] == "N":
                        lemmaWord = lemmatizer.lemmatize(word)
                        logger.debug("Word: " + word + " Lemma: " + lemmaWord)
                        synset = wn.synsets(word, pos='n')
                        logger.debug("synset : %s" % synset)
                        if len(synset) != 0:
                            syn = synset[0]
                            root = syn.root_hypernyms()
                            logger.debug("root : %s" % root)
                            hypernyms = syn.hypernyms()
                            logger.debug("hypernyms : %s" % hypernyms)
                            if len(hypernyms) > 0:
                                hyponyms = syn.hypernyms()[0].hyponyms()
                                logger.debug("hyponyms : %s" % hyponyms)
                            else:
                                hyponyms = None
                            listSentence.append((word, lemmaWord, pos, root, hypernyms, hyponyms))

                # Rebuild a "sentence" out of just the lemmatised nouns.
                nounSentence = ""
                for word in listSentence:
                    nounSentence += word[1] + " "

                if len(nounSentence) > 2:
                    e = d.addConceptKeyType(nounSentence, "NounSentence")
                    for word in listSentence:
                        f = e.addConceptKeyType(word[0], word[2])
                        f.addConceptKeyType(word[1], "Lemma")

                logger.debug("%s = %s" % (cleanSentence, type(cleanSentence)))
                cleanSentence = cleanSentence.decode("utf-8", errors="ignore")

                # Shallow-parse for subject/verb/object chunks (pattern lib).
                # NOTE(review): the loop variable below shadows the outer
                # 'sentence', and 'e' may be unbound here if no NounSentence
                # was added above — confirm intended.
                pt = parsetree(cleanSentence, relations=True, lemmata=True)
                for sentence in pt:
                    logger.debug("relations: %s" % [x for x in sentence.relations])
                    logger.debug("subject : %s" % [x for x in sentence.subjects])
                    logger.debug("verb : %s" % [x for x in sentence.verbs])
                    logger.debug("object : %s" % [x for x in sentence.objects])

                    if sentence.subjects is not None:
                        logger.debug("Sentence : %s" % sentence.chunks)
                        for chunk in sentence.chunks:
                            logger.debug("Chunk : %s" % chunk)
                            relation = str(chunk.relation).encode("ascii", errors="ignore").strip()
                            role = str(chunk.role).encode("ascii", errors="ignore").strip()
                            logger.debug("Relation : %s" % relation)
                            logger.debug("Role : %s" % role)

                        # Link SBJ -> VP -> OBJ concepts that share a relation id.
                        for subject in sentence.subjects:
                            logger.debug("Subject.realtion : %s " % subject.relation)
                            logger.debug("Subject : %s " % subject.string)
                            f = e.addConceptKeyType(subject.string, "SBJ")
                            for verb in sentence.verbs:
                                if verb.relation == subject.relation:
                                    logger.debug("Verb.realtion : %s " % verb.relation)
                                    logger.debug("Verb : %s " % verb.string)
                                    g = f.addConceptKeyType(verb.string, "VP")
                                    for obj in sentence.objects:
                                        if obj.relation == verb.relation:
                                            logger.debug("Obj.realtion : %s " % obj.relation)
                                            logger.debug("Object : %s " % obj.string)
                                            g.addConceptKeyType(obj.string, "OBJ")

        Concepts.saveConcepts(self.chunkConcepts, self.chunkFile)
class PPTXCreateArchil(object):
    """Crawl a PowerPoint deck, harvest shapes (nodes), connectors (edges)
    and text boxes, match text boxes to the geometrically closest
    connector, and build a Concepts tree (plus an optional graph)."""

    graph = None
    dictNodes = None    # shape name -> list of shape ids
    dictEdges = None    # connector id -> list of [conn, target, source] id triples
    dictText = None     # text-box text -> list of shape ids
    dictNodeXY = None   # shape id -> {t, l, h, w} in inches
    dictTextXY = None   # text-box id -> {t, l, h, w}; see NOTE in crawlPPTX

    def __init__(self, fileCrawl, fileArchimate):
        # EMU-per-inch divisor for pptx coordinates.
        self.EMU = 914400.0
        self.fileArchimate = fileArchimate
        self.path_to_presentation = fileCrawl
        self.dictNodes = dict()
        self.dictEdges = dict()
        self.dictText = dict()
        self.dictNodeXY = dict()
        self.dictTextXY = dict()
        self.al = ArchiLib(fileArchimate)

        self.graph = GraphVizGraph()
        # self.graph = NetworkXGraph()
        # self.graph = PatternGraph()

        self.prs = Presentation(self.path_to_presentation)
        self.concepts = Concepts(u"Application", u"Relations")

    def addGraphNodes(self, concepts, n=0):
        """Recursively add Source/Target concepts to the graph.

        NOTE(review): the early `return`s abort the whole sibling loop,
        not just the current concept — `continue` may have been intended.
        """
        n += 1
        for c in concepts.getConcepts().values():
            logger.debug(u"%s[%d]" % (c.name, len(c.name)))
            if len(c.name.strip(u" ")) == 0:
                return
            if not (c.typeName in (u"Source", u"Target")):
                return
            logger.debug(u"%d : %d Node c : %s:%s" % (n, len(c.getConcepts()), c.name, c.typeName))
            self.graph.addConcept(c)
            if len(c.getConcepts()) != 0:
                self.addGraphNodes(c, n)

    def addGraphEdges(self, concepts, n=0):
        """Recursively add edges: the first child at each level becomes the
        parent endpoint for the following siblings.

        NOTE(review): same early-`return` concern as addGraphNodes.
        """
        n += 1
        i = 1
        for c in concepts.getConcepts().values():
            if (c.name in (u"l", u"h", u"t", u"w")):  # skip geometry leaves
                return
            logger.debug(u"%d : %d Edge c : %s:%s" % (n, len(c.getConcepts()), c.name, c.typeName))
            if i == 1:
                p = c
                i += 1
            else:
                self.graph.addEdge(p, c)
            if len(c.getConcepts()) != 0:
                self.addGraphEdges(c, n)

    def graphConcepts(self, concepts, graph=None):
        """Load *concepts* into self.graph, then export per *graph* type.

        NOTE(review): nodes/edges are added to self.graph but the export
        dispatch checks the *graph* parameter (default None) — confirm the
        caller passes the same object.
        """
        logger.info(u"Adding nodes the graph ...")
        self.addGraphNodes(concepts)

        logger.info(u"Adding edges the graph ...")
        self.addGraphEdges(concepts)

        if isinstance(graph, GraphVizGraph):
            filename = u"example.png"
            graph.exportGraph(filename=filename)
            logger.info(u"Saved Graph - %s" % filename)

        if isinstance(graph, Neo4JGraph):
            graph.setNodeLabels()

        if isinstance(graph, NetworkXGraph):
            graph.drawGraph(u"concepts.png")
            filename = u"concepts.net"
            logger.info(u"Saving Pajek - %s" % filename)
            graph.saveGraphPajek(filename)
            graph.saveGraph(u"concepts.gml")
            logger.info(u"Saving Graph - %s" % u"concepts.gml")

        if isinstance(graph, PatternGraph):
            logger.info(u"Exporting Graph")
            graph.exportGraph()

    def findID(self, nid):
        """Return the name whose id list contains *nid*, else None."""
        try:
            for x in self.dictNodes.keys():
                logger.debug(u" dictNodes[%s] : %s" % (self.dictNodes[x], x))
                if nid in self.dictNodes[x]:
                    logger.debug(u"Found %s in %s" % (x, self.dictNodes[x]))
                    return x
        except:
            em = format_exc().split('\n')[-2]
            logger.warn(u"findID : Warning: %s" % (em))
        return None

    def findXY(self, nid, d):
        """Return d[nid] (a geometry dict) or an empty list if absent."""
        ld = list()
        try:
            ld = d[nid]
            logger.debug(u"ld : %s" % ld)
        except:
            pass
        return ld

    def logList(self, l, n=0):
        """Recursively log a nested list/dict/tuple structure, indented by depth."""
        n += 1
        s = " " * n
        logger.info(u"%sn=%d" % (s, n))
        for x in l:
            # logger.info("%sx=%s" % (s, x))
            if isinstance(x, list):
                logger.info(u"%slist: %s" % (s, x))
                self.logList(x, n)
            elif isinstance(x, dict):
                logger.info(u"%sdict: %s" % (s, x))
                self.logList(x, n)
            elif isinstance(x, tuple):
                logger.info(u"%stuple: %s" % (s, x))
                self.logList(x, n)
            else:
                if isinstance(x, str):
                    logger.info(u"%sstr: %s" % (s, x))
                elif isinstance(x, float):
                    logger.info(u"%sfloat: %3.2f" % (s, x))
                elif isinstance(x, int):
                    logger.info(u"%sint: %d" % (s, x))

    def shapeText(self, shape):
        """Concatenate all run texts of a shape's text frame (space-separated)."""
        name = u""
        if shape.has_text_frame:
            text_frame = shape.text_frame
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    logger.debug(u"%s" % run.text)
                    name = name + run.text + u" "
        return name

    def shapeDim(self, shape):
        """Record a shape's top/left/height/width (in inches) in dictNodeXY
        and return (id, t, l, h, w)."""
        t = shape.top / self.EMU
        l = shape.left / self.EMU
        h = shape.height / self.EMU
        w = shape.width / self.EMU
        nid = shape.id
        dictDim = dict()
        dictDim[u"t"] = t
        dictDim[u"l"] = l
        dictDim[u"h"] = h
        dictDim[u"w"] = w
        self.dictNodeXY[nid] = dictDim
        logger.debug(u"shape.top : %3.2f" % (t))
        logger.debug(u"shape.left : %3.2f" % (l))
        logger.debug(u"shape.height : %3.2f" % (h))
        logger.debug(u"shape.width : %3.2f" % (w))
        logger.debug(u"shape.shape_type : %s" % shape.shape_type)
        return nid, t, l, h, w

    def addDictNodes(self, nid, name):
        """Register *nid* under *name* in dictNodes (names may map to
        several shape ids)."""
        name = unicode(name).rstrip(u" ").lstrip(u" ")
        if not (len(name) > 0):
            logger.warn(u"No Name!")
            return
        if name in self.dictNodes:
            nl = self.dictNodes[name]
            nl.append(nid)
            logger.debug(u"Duplicate Keys %s...%s" % (name, self.dictNodes[name]))
        else:
            nl = list()
            nl.append(nid)
            self.dictNodes[name] = nl

    def addDictEdges(self, nid, xl):
        """Store the connector's referenced ids; only 3-id entries
        (connector + two endpoints) are kept.  Returns the id count."""
        nxl = list()
        for x in xl:
            nxl.append(int(x))
            logger.debug(u"%d:%s" % (nid, x))
        lenNXL = len(nxl)
        #
        # Only add connections between two nodes
        #
        if lenNXL == 3:
            if self.dictEdges.has_key(nid):
                nl = self.dictEdges[nid]
                nl.append(nxl)
                logger.debug(u"Duplicate Edges ...%s" % (self.dictEdges[nid]))
            else:
                el = list()
                el.append(nxl)
                self.dictEdges[nid] = el
        else:
            logger.debug(u"Only %d Connectors!" % (len(nxl)))
        return lenNXL

    def showConcepts(self, concepts):
        """Log three levels of the concepts tree, skipping geometry leaves."""
        n = 0
        for x in concepts.getConcepts().values():
            n += 1
            logger.info(u"x %s[%s]" % (x.name, x.typeName))
            for y in x.getConcepts().values():
                logger.info(u" y %s[%s]" % (y.name, y.typeName))
                for z in y.getConcepts().values():
                    if not (z.name in (u"h", u"l", u"t", u"w")):
                        logger.info(u" z %s[%s]" % (z.name, z.typeName))

    def getPoint(self, d):
        """Return the centre point of a {t,l,h,w} geometry dict.

        NOTE(review): px uses h/2 rather than w/2 — possibly intentional
        (text anchors), possibly a bug; confirm.
        """
        t = d[u"t"]
        l = d[u"l"]
        h = d[u"h"]
        w = d[u"w"]
        py = t + (h / 2.0)
        px = l + (h / 2.0)
        return px, py

    def lineMagnitude(self, x1, y1, x2, y2):
        # Euclidean distance between (x1,y1) and (x2,y2).
        lineMagnitude = math.sqrt(math.pow((x2 - x1), 2) + math.pow((y2 - y1), 2))
        return lineMagnitude

    # Calc minimum distance from a point and a line segment (i.e. consecutive vertices in a polyline).
    def DistancePointLine(self, px, py, x1, y1, x2, y2):
        """Minimum distance from point (px,py) to segment (x1,y1)-(x2,y2);
        returns 0 on any numeric error (e.g. zero-length segment)."""
        try:
            # http://local.wasp.uwa.edu.au/~pbourke/geometry/pointline/source.vba
            LineMag = self.lineMagnitude(x1, y1, x2, y2)
            u1 = (((px - x1) * (x2 - x1)) + ((py - y1) * (y2 - y1)))
            u = u1 / (LineMag * LineMag)
            if (u < 0.00001) or (u > 1):
                # closest point does not fall within the line segment, take the shorter distance
                # to an endpoint
                ix = self.lineMagnitude(px, py, x1, y1)
                iy = self.lineMagnitude(px, py, x2, y2)
                if ix > iy:
                    DistancePointLine = iy
                else:
                    DistancePointLine = ix
            else:
                # Intersecting point is on the line, use the formula
                ix = x1 + u * (x2 - x1)
                iy = y1 + u * (y2 - y1)
                DistancePointLine = self.lineMagnitude(px, py, ix, iy)
            return DistancePointLine
        except:
            return 0

    def crawlPPTX(self):
        """Walk every slide: collect node shapes, connectors and text boxes
        into dictionaries and the Concepts tree; then pair each text box
        with the nearest connector.  Returns the Concepts tree."""
        sNum = 0
        for slide in self.prs.slides:
            logger.debug(u"--new slide--")
            logger.debug(u"%s" % slide.partname)
            logger.debug(u"slideName : %s" % slide.name)
            sNum += 1

            #
            # Get Title of Slide
            #
            titleSlide = u""
            for idx, ph in enumerate(slide.shapes.placeholders):
                # logger.debug ("** %s:%s **" % (idx, ph.text))
                if idx == 0:
                    titleSlide = ph.text

            u = self.al.cleanString(titleSlide)
            logger.info(u"%d.%s" % (sNum, u))
            tss = u"%d.%s" % (sNum, u)
            q = self.concepts.addConceptKeyType(tss, u"Slide")

            # showConcepts(concepts)

            #
            # Iterate ihrough slides
            #
            n = 0
            nc = 0
            for shape in slide.shapes:
                logger.debug(u"...%s..." % type(shape))
                logger.debug(u"shape.element.xml : %s" % shape.element.xml)
                logger.debug(u"shape.name : %s[%d]" % (shape.name, shape.id - 1))
                n += 1
                sn = shape.name
                nid = shape.id

                # Get Shape Info — rectangles/rounded/straight shapes are nodes.
                if shape.name[:5] in (u"Recta", u"Round", u"Strai"):
                    nid, t, l, h, w = self.shapeDim(shape)
                    # corner points (unused below)
                    tl = (l, t)
                    tr = (l + w, t)
                    bl = (l, t + h)
                    br = (l + w, t + h)
                    name = self.shapeText(shape)
                    if len(name) > 1:
                        logger.info(u" node : %s[%d] - %s" % (name, nid, shape.name))
                        self.addDictNodes(nid, name)
                        b = q.addConceptKeyType(self.al.cleanString(name), u"Node")
                        b.addConceptKeyType(u"t", str(t))
                        b.addConceptKeyType(u"l", str(l))
                        b.addConceptKeyType(u"h", str(h))
                        b.addConceptKeyType(u"w", str(w))

                #
                # Add in Connections
                #
                elif sn.find(u"Connector") != -1:
                    xmlShape = shape.element.xml
                    logger.debug(u"xmlShape : %s" % xmlShape)
                    tree = etree.fromstring(xmlShape)
                    # All id attributes in the connector XML: [conn, target, source].
                    xl = tree.xpath(u"//@id")
                    logger.debug(u"xl : %s" % xl)
                    numEdges = self.addDictEdges(nid, xl)
                    if numEdges == 3:
                        nc += 1
                        logger.info(u" %d Found Edge %d" % (nc, shape.id))

                #
                # Get Text boxes and associate with Connector
                #
                elif shape.name[:8] in (u"Text Box", u"TextBox "):
                    nid, t, l, h, w = self.shapeDim(shape)
                    name = self.shapeText(shape)
                    if name is not None:
                        nxl = list()
                        nxl.append(nid)
                        self.dictText[name] = nxl
                        logger.info(u" TextBox : %s[%d]" % (name, shape.id))
                else:
                    logger.debug(u"Skipped : %s" % shape.name)

            #
            # Now match the Connector with text
            #
            listEdges = self.dictEdges.values()
            logger.info(u"listEdges : %d" % len(listEdges))
            tbFound = 0
            tbTotal = len(self.dictTextXY)
            logger.info(u"Search for %s Text Box Connector's" % len(self.dictTextXY))

            # NOTE(review): nothing above ever populates self.dictTextXY
            # (shapeDim writes dictNodeXY), so this loop body may never run;
            # bare `dictTextXY[txt]` below also lacks `self.` and
            # `self.findID(txt, self.dictText)` passes an extra argument
            # findID does not accept — all three look like latent bugs.
            for txt in self.dictTextXY.keys():
                logger.debug(u"txt : %s[%s]" % (txt, dictTextXY[txt]))
                searchText = self.findID(txt, self.dictText)
                logger.info(u" Search Text : %s" % (searchText))

                # get text point - middle of node
                px, py = self.getPoint(dictTextXY[txt])

                cDist = 1000.0
                cNode = None
                csn = None
                ctn = None  # NOTE(review): never assigned below; 'tsn' is used instead

                # for each node in dictEdges
                ni = 0
                for edge in listEdges:
                    logger.debug(u" edge: %s" % edge)
                    try:
                        # get source
                        source = edge[0][2]
                        sName = self.findID(source)
                        sl = self.dictNodeXY[source]
                        spx, spy = self.getPoint(sl)

                        # get target
                        target = edge[0][1]
                        tName = self.findID(target)
                        tl = self.dictNodeXY[target]
                        tpx, tpy = self.getPoint(tl)

                        # determine distance between points
                        d = self.DistancePointLine(px, py, spx, spy, tpx, tpy)
                        if d < cDist:
                            cDist = d
                            cNode = edge[0][0]
                            csn = sName
                            tsn = tName
                    except:
                        pass

                if cNode != None:
                    tbFound += 1
                    logger.debug(u" Closest Connector : %s" % cNode)
                    logger.info(u" found(%d:%d] - %s->%s->%s [%2.3f]" % (tbFound, tbTotal, csn, searchText, tsn, cDist))

                    edge = searchText
                    source = sName
                    target = tName
                    dimSource = sl
                    dimTarget = tl
                    if edge is None:
                        edge = u"TBD"

                    d = q.getConcepts()[csn]
                    for ld in dimSource.keys():
                        logger.debug(u"%s %s:%2.3f" % (source, ld, dimSource[ld]))
                        d.addConceptKeyType(ld, str(dimSource[ld]))

                    f = d.addConceptKeyType(target, u"Target")
                    for ld in dimTarget.keys():
                        # NOTE(review): logs dimSource for the target — confirm
                        logger.debug(u"%s %s:%2.3f" % (target, ld, dimSource[ld]))
                        f.addConceptKeyType(ld, str(dimTarget[ld]))

                    f.addConceptKeyType(self.al.cleanString(edge), u"Edge")

            if tbTotal != 0:
                logger.info(u"Found [%3.1f] Text Box Connectors" % ((tbFound / float(tbTotal)) * 100.0))

            # NOTE(review): rebinds a local name; `self.dictTextXY = dict()`
            # (per-slide reset) was probably intended — confirm.
            dictTextXY = dict()

        return self.concepts
class DependancyAnalysis(object):
    """Derive batches of BusinessProcess tasks from an Archimate model.

    Flow relationships between BusinessProcess nodes are collected into
    dependency pairs, then topologically grouped into "batches" of tasks
    that can run together.
    """

    def __init__(self, fileArchimate):
        # Path to the .archimate model file this analysis reads.
        self.fileArchimate = fileArchimate
        logger.info(u"Using : %s" % self.fileArchimate)
        self.al = ArchiLib(fileArchimate)
        # Root of the concept tree built by collectDependancyAnalysisNodes().
        self.concepts = Concepts(u"BusinessProcess", u"archimate:BusinessProcess")

    # "Batches" are sets of tasks that can be run together
    def get_task_batches(self, nodes):
        """Topologically sort Task nodes into batches.

        Each batch is a set of Task instances whose dependencies are all
        satisfied by earlier batches.  Raises ValueError on a dependency
        cycle.  (Py2-only: uses dict.iteritems/itervalues.)
        """
        # Build a map of node names to node instances
        name_to_instance = dict((n.name, n) for n in nodes)
        for x in name_to_instance.keys():
            logger.debug(u"name_to_instance[%s]=%s : %s" % (x, name_to_instance[x].name, name_to_instance[x].depends))

        # Build a map of node names to dependency names
        name_to_deps = dict((n.name, set(n.depends)) for n in nodes)
        for x in name_to_deps.keys():
            logger.debug(u"name_to_deps[%s]=%s" % (x, name_to_deps[x]))

        # This is where we'll store the batches
        batches = []
        n = 0

        # While there are dependencies to solve...
        while name_to_deps:
            logger.info(u"length %d" % len(name_to_deps))

            # Get all nodes with no dependencies
            ready = {name for name, deps in name_to_deps.iteritems() if not deps}
            n += 1
            logger.info(u"iteration : %d" % n)
            for x in ready:
                logger.info(u"No Dep %s" % (x))

            # If there aren't any, we have a loop in the graph
            if not ready:
                msg = u"Circular dependencies found!\n"
                msg += self.format_dependencies(name_to_deps)
                raise ValueError(msg)

            # Remove them from the dependency graph
            for name in ready:
                del name_to_deps[name]
            for deps in name_to_deps.itervalues():
                deps.difference_update(ready)

            # Add the batch to the list
            batches.append({name_to_instance[name] for name in ready})

        # Return the list of batches
        return batches

    # Format a dependency graph for printing
    def format_dependencies(self, name_to_deps):
        """Return a newline-joined "name -> dependency" listing."""
        msg = []
        for name, deps in name_to_deps.iteritems():
            for parent in deps:
                msg.append(u"%s -> %s" % (name, parent))
        return "\n".join(msg)

    # Create and format a dependency graph for printing
    def format_nodes(self, nodes):
        return self.format_dependencies(dict((n.name, n.depends) for n in nodes))

    def findConcept(self, concepts, name, n=0):
        """Depth-limited (3 levels) recursive search for a concept by name.

        NOTE(review): a match found in an earlier child subtree can be
        overwritten by a later subtree returning None, because the loop
        keeps reassigning `c` instead of returning on first hit — confirm
        whether that is intended.
        """
        n += 1
        c = None
        if n == 3:
            return c
        for x in concepts.getConcepts().values():
            if x.name == name:
                return x
            else:
                c = self.findConcept(x, name, n)
        return c

    def getWords(self, s, concepts):
        """Attach lemmatized noun words (with their POS tags) of `s` to `concepts`."""
        lemmatizer = WordNetLemmatizer()
        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(s)):
            # Only keep multi-character words tagged as nouns (NN, NNP, ...).
            if len(word) > 1 and pos[0] == u"N":
                lemmaWord = lemmatizer.lemmatize(word.lower())
                e = concepts.addConceptKeyType(lemmaWord, u"Word")
                f = e.addConceptKeyType(pos, u"POS")

    def collectDependancyAnalysisNodes(self):
        """Collect BusinessProcess->BusinessProcess flow edges.

        Builds the concept tree and a list of [target, source] id pairs,
        saves the tree, runs dependancyAnalysis, and returns
        (concepts, listTSort).
        """
        count = 0
        listTSort = list()
        for x in self.al.dictEdges.keys():
            logger.debug(u"[%s]=%s" % (self.al.dictEdges[x][u"id"], x))

            if u"source" in self.al.dictEdges[x]:
                source = self.al.dictEdges[x][u"source"]
                target = self.al.dictEdges[x][u"target"]

                logger.debug(u" Rel : %s" % (self.al.dictEdges[x][ARCHI_TYPE]))

                # NOTE(review): `in (u"archimate:FlowRelationship")` is a
                # substring test against a plain string (no trailing comma,
                # so the parentheses do not make a tuple).  It works for the
                # exact value but would also match any substring — confirm.
                if self.al.dictEdges[x][ARCHI_TYPE] in (u"archimate:FlowRelationship"):
                    # al.countNodeType(al.dictNodes[source][ARCHI_TYPE])
                    # al.countNodeType(al.dictNodes[target][ARCHI_TYPE])
                    # al.countNodeType(al.dictEdges[x][ARCHI_TYPE])

                    if (self.al.dictNodes[source][ARCHI_TYPE] == u"archimate:BusinessProcess") and \
                            self.al.dictNodes[target][ARCHI_TYPE] == u"archimate:BusinessProcess":

                        sourceName = self.al.getNodeName(source)
                        targetName = self.al.getNodeName(target)

                        # Skip nodes whose names start with a digit.
                        if sourceName[0].isdigit() or targetName[0].isdigit():
                            continue

                        logger.debug(u" %s:%s" % (sourceName, targetName))

                        l = list()

                        sc = self.findConcept(self.concepts, sourceName)
                        if sc is None:
                            logger.debug(u"New Target - %s" % sourceName)
                            sc = self.concepts.addConceptKeyType(self.al.getNodeName(source), u"Source")
                            self.getWords(sourceName, sc)
                        else:
                            logger.debug(u"Prior Target %s" % sourceName)

                        tc = self.findConcept(self.concepts, targetName)
                        if tc is None:
                            logger.debug(u"New Target %s" % targetName)
                            tc = sc.addConceptKeyType(self.al.getNodeName(target), u"Target")
                            # NOTE(review): passes sourceName for the target
                            # concept's words — looks like it should be
                            # targetName; confirm before changing.
                            self.getWords(sourceName, tc)
                        else:
                            logger.debug(u"Prior Target %s" % targetName)
                            sc.addConcept(tc)

                        # Pair is [target, source] — consumed by
                        # dependancyAnalysis below.
                        l.append(target)
                        l.append(source)
                        listTSort.append(l)

        logger.debug(u"Edges = %s" % listTSort)
        Concepts.saveConcepts(self.concepts, fileConceptsTraversal)

        self.dependancyAnalysis(listTSort)

        return self.concepts, listTSort

    def dependancyAnalysis(self, listTSort):
        """Turn edge pairs into Task nodes, batch them, and save the batches
        as archimate:WorkPackage concepts.  Returns the batch Concepts tree.
        """
        index = 0
        for x in listTSort:
            logger.debug(u"%d %s[%s] -%s-> %s[%s]" % (index, self.al.dictNodes[x[0]][u"name"], self.al.dictNodes[x[0]][ARCHI_TYPE], u"UsedBy", self.al.dictNodes[x[1]][u"name"], self.al.dictNodes[x[1]][ARCHI_TYPE]))
            index += 1
            # Register both endpoints as known BusinessProcesses.
            self.al.addToNodeDict(self.al.dictNodes[x[0]][u"name"], self.al.dictBP)
            self.al.addToNodeDict(self.al.dictNodes[x[1]][u"name"], self.al.dictBP)

        logger.info(u"Topic Sort Candidates : %d" % (len(listTSort)))

        nodes = list()
        index = 0
        dictTasks = dict()
        # Group dependent names by source name.
        for x in listTSort:
            sname = self.al.dictNodes[x[0]][u"name"]
            tname = self.al.dictNodes[x[1]][u"name"]
            index += 1
            logger.debug(u"%d %s -%s-> %s" % (index, sname, u"UsedBy", tname))

            if sname in dictTasks:
                ln = dictTasks[sname]
                ln.append(tname)
            else:
                ln = list()
                ln.append(tname)
                dictTasks[sname] = ln

        # One Task per grouped source.
        for x in dictTasks.keys():
            logger.debug(u"dictTasks[%s]=%s" % (x, dictTasks[x]))
            a = Task(x, dictTasks[x])
            nodes.append(a)

        # BusinessProcesses that never appear as a source get an empty Task.
        for x in self.al.dictBP.keys():
            # for x in listBP:
            if x not in dictTasks:
                logger.debug(u"Add %s" % (x))
                a = Task(x, list())
                nodes.append(a)

        self.format_nodes(nodes)

        conceptBatches = Concepts(u"Batch", u"archimate:WorkPackage")
        n = 0
        logger.info(u"Batches:")
        batches = self.get_task_batches(nodes)
        for bundle in batches:
            n += 1
            name = u"Batch %d" % n
            c = conceptBatches.addConceptKeyType(name, u"archimate:WorkPackage")
            for node in bundle:
                c.addConceptKeyType(node.name, u"archimate:BusinessProcess")
            logger.info(u"%d : %s" % (n, ", ".join(node.name.lstrip() for node in bundle)))

        Concepts.saveConcepts(conceptBatches, fileConceptsBatches)

        return conceptBatches
def exportArchi(self): m = hashlib.md5() concepts = Concepts(u"Node", u"Nodes") logger.info(u"Found %d Nodes" % len(self.al.dictNodes)) logger.info(u"Found %d Edges" % len(self.al.dictEdges)) count = 0 listTSort = list() for x in self.al.dictEdges.keys(): logger.debug(u"Edge [%s]=%s" % (self.al.dictEdges[x], x)) if self.al.dictEdges[x].has_key( u"source") and self.al.dictEdges[x].has_key(u"target"): typeEdge = self.al.dictEdges[x][ARCHI_TYPE] logger.debug(u"Edge : %s" % typeEdge) source = self.al.dictEdges[x][u"source"] logger.debug(u"Source : %s" % source) target = self.al.dictEdges[x][u"target"] logger.debug(u"Target : %s" % target) logger.debug(u" Rel : %s" % (self.al.dictEdges[x][ARCHI_TYPE])) sourceName = self.al.getNodeName(source) targetName = self.al.getNodeName(target) logger.debug( u" %s--%s--%s" % (sourceName, self.al.dictEdges[x][ARCHI_TYPE][10:], targetName)) if source in self.al.dictNodes: l = list() sc = concepts.addConceptKeyType( sourceName, self.al.dictNodes[source][ARCHI_TYPE][10:]) # getWords(sourceName, sc) nameEdge = u"(" + sourceName + u"," + targetName + u")" logger.debug(u"nameEdge : %s[%d]" % (nameEdge, len(nameEdge))) logger.debug(u"typeEdge : %s" % typeEdge[10:]) ne = str(self.al.cleanString(nameEdge)) hl = hashlib.sha224(str(ne)).hexdigest() logger.debug(u"hash : %s" % hl) nh = u"%s-%s" % (typeEdge[10:], hl) rc = sc.addConceptKeyType(nh, typeEdge[10:]) if self.al.dictNodes.has_key(target): tc = rc.addConceptKeyType( targetName, self.al.dictNodes[target][ARCHI_TYPE][10:]) # getWords(sourceName, tc) Concepts.saveConcepts(concepts, self.fileConceptsExport) return concepts
class Chunks(object):
    """Extract noun-centric "chunks" from sentence concepts.

    For every sentence in the loaded document concepts, builds a
    NounSentence of lemmatized nouns (with WordNet data) and, via
    pattern's parsetree, attaches subject/verb/object triples.
    """

    # Default load/save locations (module-level constants).
    conceptFile = fileConceptsDocuments
    chunkFile = fileConceptsChunks
    concepts = None       # input Concepts tree (documents -> sentences)
    chunkConcepts = None  # output Concepts tree built by createChunks()

    def __init__(self, concepts=None):
        # Either use the supplied Concepts tree or load the default file.
        if concepts is None:
            logger.info(u"Loading : %s" % self.conceptFile)
            self.concepts = Concepts.loadConcepts(self.conceptFile)
        else:
            logger.info(u"Using : %s" % concepts.name)
            self.concepts = concepts

        self.chunkConcepts = Concepts(u"Chunk", u"Chunks")

    def getChunkConcepts(self):
        return self.chunkConcepts

    def createChunks(self):
        """Populate chunkConcepts from the document/sentence tree and save it."""
        # English stopwords plus ad-hoc noise tokens seen in the corpus.
        stop = stopwords.words(u'english')
        stop.append(u"This")
        stop.append(u"The")
        stop.append(u",")
        stop.append(u".")
        stop.append(u"..")
        stop.append(u"...")
        stop.append(u".")
        stop.append(u";")
        stop.append(u"and")

        # NOTE(review): stemmer and tokenizer are created but never used below.
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        tokenizer = RegexpTokenizer(u"[\w]+")

        for document in self.concepts.getConcepts().values():
            logger.info(u"%s" % document.name)
            d = self.chunkConcepts.addConceptKeyType(document.name, u"Document")

            for sentence in document.getConcepts().values():
                logger.debug(u"%s(%s)" % (sentence.name, sentence.typeName))

                # Drop stopwords before tagging.
                cleanSentence = ' '.join([word for word in sentence.name.split() if word not in stop])

                # Collected per-noun tuples:
                # (word, lemma, POS, root hypernyms, hypernyms, hyponyms)
                listSentence = list()
                for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                    logger.debug(u"Word: " + word + u" POS: " + pos)
                    if pos[:1] == u"N":
                        # if True:
                        lemmaWord = lemmatizer.lemmatize(word)
                        logger.debug(u"Word: " + word + u" Lemma: " + lemmaWord)

                        morphWord = wn.morphy(word, wn.NOUN)
                        if morphWord is not None:
                            logger.debug(u"Word: " + word + u" Morph: " + morphWord)

                        synset = wn.synsets(word, pos=u'n')
                        logger.debug(u"synset : %s" % synset)

                        # Only nouns with at least one WordNet synset are kept.
                        if len(synset) != 0:
                            syn = synset[0]

                            root = syn.root_hypernyms()
                            logger.debug(u"root : %s" % root)

                            mh = syn.member_holonyms()
                            logger.debug(u"member_holonyms : %s" % mh)

                            hypernyms = syn.hypernyms()
                            logger.debug(u"hypernyms : %s" % hypernyms)

                            if len(hypernyms) > 0:
                                hyponyms = syn.hypernyms()[0].hyponyms()
                                logger.debug(u"hyponyms : %s" % hyponyms)
                            else:
                                hyponyms = None

                            listSentence.append((word, lemmaWord, pos, root, hypernyms, hyponyms))

                # Join the lemmas into a "noun sentence".
                nounSentence = u""
                for word in listSentence:
                    nounSentence += word[1] + u" "

                if len(nounSentence) > 2:
                    e = d.addConceptKeyType(nounSentence, u"NounSentence")

                    for word in listSentence:
                        f = e.addConceptKeyType(word[0], word[2])
                        f.addConceptKeyType(word[1], u"Lemma")

                    logger.debug(u"%s = %s" % (cleanSentence, type(cleanSentence)))

                    # pattern.en parsetree: extract SBJ/VP/OBJ triples and
                    # hang them under the NounSentence concept.
                    cleanSentence = unicode(cleanSentence)
                    pt = parsetree(cleanSentence, relations=True, lemmata=True)

                    for sentence in pt:
                        logger.debug(u"relations: %s" % [x for x in sentence.relations])
                        logger.debug(u"subject : %s" % [x for x in sentence.subjects])
                        logger.debug(u"verb : %s" % [x for x in sentence.verbs])
                        logger.debug(u"object : %s" % [x for x in sentence.objects])

                        if sentence.subjects is not None:
                            logger.debug(u"Sentence : %s" % sentence.chunks)
                            for chunk in sentence.chunks:
                                logger.debug(u"Chunk : %s" % chunk)
                                relation = unicode(chunk.relation).strip()
                                role = unicode(chunk.role).strip()
                                logger.debug(u"Relation : %s" % relation)
                                logger.debug(u"Role : %s" % role)

                            # Verbs are matched to subjects, and objects to
                            # verbs, by sharing the same relation index.
                            for subject in sentence.subjects:
                                logger.debug(u" Subject.realtion : %s " % subject.relation)
                                logger.debug(u" Subject : %s " % subject.string)
                                f = e.addConceptKeyType(subject.string, u"SBJ")

                                for verb in sentence.verbs:
                                    if verb.relation == subject.relation:
                                        logger.debug(u" Verb.realtion : %s " % verb.relation)
                                        logger.debug(u" Verb : %s " % verb.string)
                                        g = f.addConceptKeyType(verb.string, u"VP")

                                        for obj in sentence.objects:
                                            if obj.relation == verb.relation:
                                                logger.debug(u" Obj.realtion : %s " % obj.relation)
                                                logger.debug(u" Object : %s " % obj.string)
                                                g.addConceptKeyType(obj.string, u"OBJ")

        Concepts.saveConcepts(self.chunkConcepts, self.chunkFile)
        logger.info(u"Saved : %s" % self.chunkFile)
class PPTXCreateArchil(object):
    """Crawl a PowerPoint deck and reconstruct node/edge concepts from its
    shapes (rectangles = nodes, connectors = edges, text boxes = labels).
    """

    graph = None       # graph backend (GraphVizGraph by default)
    dictNodes = None   # name -> list of shape ids
    dictEdges = None   # connector shape id -> list of [edgeId, targetId, sourceId]
    dictText = None    # text-box text -> list of shape ids
    dictNodeXY = None  # shape id -> {"t","l","h","w"} dims in inches
    dictTextXY = None  # text-box id -> dims (NOTE(review): never populated here)

    def __init__(self, fileCrawl, fileArchimate):
        # EMU per inch — python-pptx positions/sizes are in EMU.
        self.EMU = 914400.0
        self.fileArchimate = fileArchimate
        self.path_to_presentation = fileCrawl
        self.dictNodes = dict()
        self.dictEdges = dict()
        self.dictText = dict()
        self.dictNodeXY = dict()
        self.dictTextXY = dict()
        self.al = ArchiLib(fileArchimate)
        self.graph = GraphVizGraph()
        # self.graph = NetworkXGraph()
        # self.graph = PatternGraph()
        self.prs = Presentation(self.path_to_presentation)
        self.concepts = Concepts(u"Application", u"Relations")

    def addGraphNodes(self, concepts, n=0):
        """Recursively add Source/Target concepts to the graph backend.

        NOTE(review): `return` inside the loop abandons remaining siblings
        when one child has a blank name or a non Source/Target type —
        confirm `continue` was not intended.
        """
        n += 1
        for c in concepts.getConcepts().values():
            logger.debug(u"%s[%d]" % (c.name, len(c.name)))
            if len(c.name.strip(u" ")) == 0:
                return
            if not (c.typeName in (u"Source", u"Target")):
                return
            logger.debug(u"%d : %d Node c : %s:%s" % (n, len(c.getConcepts()), c.name, c.typeName))
            self.graph.addConcept(c)
            if len(c.getConcepts()) != 0:
                self.addGraphNodes(c, n)

    def addGraphEdges(self, concepts, n=0):
        """Recursively add edges: first child at each level becomes the
        parent endpoint for its following siblings."""
        n += 1
        i = 1
        for c in concepts.getConcepts().values():
            # Dimension leaves (t/l/h/w) terminate the recursion.
            if (c.name in (u"l", u"h", u"t", u"w")):
                return
            logger.debug(u"%d : %d Edge c : %s:%s" % (n, len(c.getConcepts()), c.name, c.typeName))
            if i == 1:
                p = c
                i += 1
            else:
                self.graph.addEdge(p, c)
            if len(c.getConcepts()) != 0:
                self.addGraphEdges(c, n)

    def graphConcepts(self, concepts, graph=None):
        """Load concepts into the graph and export it per backend type."""
        logger.info(u"Adding nodes the graph ...")
        self.addGraphNodes(concepts)

        logger.info(u"Adding edges the graph ...")
        self.addGraphEdges(concepts)

        # Export behavior depends on which backend was passed in.
        if isinstance(graph, GraphVizGraph):
            filename = u"example.png"
            graph.exportGraph(filename=filename)
            logger.info(u"Saved Graph - %s" % filename)

        if isinstance(graph, Neo4JGraph):
            graph.setNodeLabels()

        if isinstance(graph, NetworkXGraph):
            graph.drawGraph(u"concepts.png")
            filename = u"concepts.net"
            logger.info(u"Saving Pajek - %s" % filename)
            graph.saveGraphPajek(filename)
            graph.saveGraph(u"concepts.gml")
            logger.info(u"Saving Graph - %s" % u"concepts.gml")

        if isinstance(graph, PatternGraph):
            logger.info(u"Exporting Graph")
            graph.exportGraph()

    def findID(self, nid):
        """Return the node name whose id list contains `nid`, else None."""
        try:
            for x in self.dictNodes.keys():
                logger.debug(u" dictNodes[%s] : %s" % (self.dictNodes[x], x))
                if nid in self.dictNodes[x]:
                    logger.debug(u"Found %s in %s" % (x, self.dictNodes[x]))
                    return x
        # NOTE(review): bare except — logs the traceback tail but hides
        # the exception type; narrow this when refactoring.
        except:
            em = format_exc().split('\n')[-2]
            logger.warn(u"findID : Warning: %s" % (em))
        return None

    def findXY(self, nid, d):
        """Return d[nid] (a dims dict), or an empty list when absent."""
        ld = list()
        try:
            ld = d[nid]
            logger.debug(u"ld : %s" % ld)
        except:
            pass
        return ld

    def logList(self, l, n=0):
        """Recursively log a nested list/dict/tuple structure, indented by depth."""
        n += 1
        s = " " * n
        logger.info(u"%sn=%d" % (s, n))
        for x in l:
            # logger.info("%sx=%s" % (s, x))
            if isinstance(x, list):
                logger.info(u"%slist: %s" % (s, x))
                self.logList(x, n)
            elif isinstance(x, dict):
                logger.info(u"%sdict: %s" % (s, x))
                self.logList(x, n)
            elif isinstance(x, tuple):
                logger.info(u"%stuple: %s" % (s, x))
                self.logList(x, n)
            else:
                if isinstance(x, str):
                    logger.info(u"%sstr: %s" % (s, x))
                elif isinstance(x, float):
                    logger.info(u"%sfloat: %3.2f" % (s, x))
                elif isinstance(x, int):
                    logger.info(u"%sint: %d" % (s, x))

    def shapeText(self, shape):
        """Concatenate all text runs of a shape, space-separated."""
        name = u""
        if shape.has_text_frame:
            text_frame = shape.text_frame
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    logger.debug(u"%s" % run.text)
                    name = name + run.text + u" "
        return name

    def shapeDim(self, shape):
        """Record a shape's top/left/height/width (inches) in dictNodeXY.

        Returns (id, top, left, height, width).
        """
        t = shape.top / self.EMU
        l = shape.left / self.EMU
        h = shape.height / self.EMU
        w = shape.width / self.EMU
        nid = shape.id

        dictDim = dict()
        dictDim[u"t"] = t
        dictDim[u"l"] = l
        dictDim[u"h"] = h
        dictDim[u"w"] = w
        self.dictNodeXY[nid] = dictDim

        logger.debug(u"shape.top : %3.2f" % (t))
        logger.debug(u"shape.left : %3.2f" % (l))
        logger.debug(u"shape.height : %3.2f" % (h))
        logger.debug(u"shape.width : %3.2f" % (w))
        logger.debug(u"shape.shape_type : %s" % shape.shape_type)

        return nid, t, l, h , w

    def addDictNodes(self, nid, name):
        """Map a cleaned shape name to its shape id (appending on duplicates)."""
        name = unicode(name).rstrip(u" ").lstrip(u" ")
        if not (len(name) > 0):
            logger.warn(u"No Name!")
            return
        if name in self.dictNodes:
            nl = self.dictNodes[name]
            nl.append(nid)
            logger.debug(u"Duplicate Keys %s...%s" % (name, self.dictNodes[name]))
        else:
            nl = list()
            nl.append(nid)
            self.dictNodes[name] = nl

    def addDictEdges(self, nid, xl):
        """Record a connector's id references; only 3-id connectors (edge id
        plus two endpoints) are kept.  Returns the reference count."""
        nxl = list()
        for x in xl:
            nxl.append(int(x))
            logger.debug(u"%d:%s" % (nid, x))
        lenNXL = len(nxl)
        #
        # Only add connections between two nodes
        #
        if lenNXL == 3:
            # NOTE(review): has_key() is deprecated — use `nid in self.dictEdges`.
            if self.dictEdges.has_key(nid):
                nl = self.dictEdges[nid]
                nl.append(nxl)
                logger.debug(u"Duplicate Edges ...%s" % (self.dictEdges[nid]))
            else:
                el = list()
                el.append(nxl)
                self.dictEdges[nid] = el
        else:
            logger.debug(u"Only %d Connectors!" % (len(nxl)))
        return lenNXL

    def showConcepts(self, concepts):
        """Log three levels of the concept tree, skipping dimension leaves."""
        n = 0
        for x in concepts.getConcepts().values():
            n += 1
            logger.info(u"x %s[%s]" % (x.name, x.typeName))
            for y in x.getConcepts().values():
                logger.info(u" y %s[%s]" % (y.name, y.typeName))
                for z in y.getConcepts().values():
                    if not (z.name in (u"h", u"l", u"t", u"w")):
                        logger.info(u" z %s[%s]" % (z.name, z.typeName))

    def getPoint(self, d):
        """Return the center point of a dims dict {"t","l","h","w"}.

        NOTE(review): px uses h/2.0 — the horizontal midpoint should
        presumably be l + (w / 2.0); confirm before fixing, since the
        distance matching below depends on it.
        """
        t = d[u"t"]
        l = d[u"l"]
        h = d[u"h"]
        w = d[u"w"]
        py = t + (h / 2.0)
        px = l + (h / 2.0)
        return px, py

    def lineMagnitude (self, x1, y1, x2, y2):
        """Euclidean distance between (x1,y1) and (x2,y2)."""
        lineMagnitude = math.sqrt(math.pow((x2 - x1), 2)+ math.pow((y2 - y1), 2))
        return lineMagnitude

    # Calc minimum distance from a point and a line segment (i.e. consecutive vertices in a polyline).
    def DistancePointLine (self, px, py, x1, y1, x2, y2):
        """Minimum distance from point (px,py) to segment (x1,y1)-(x2,y2).

        Projects the point onto the segment; when the projection falls
        outside it, uses the nearer endpoint.  Returns 0 on any error
        (e.g. a zero-length segment dividing by zero).
        """
        try:
            # http://local.wasp.uwa.edu.au/~pbourke/geometry/pointline/source.vba
            LineMag = self.lineMagnitude(x1, y1, x2, y2)

            u1 = (((px - x1) * (x2 - x1)) + ((py - y1) * (y2 - y1)))
            u = u1 / (LineMag * LineMag)

            if (u < 0.00001) or (u > 1):
                # closest point does not fall within the line segment, take the shorter distance
                # to an endpoint
                ix = self.lineMagnitude(px, py, x1, y1)
                iy = self.lineMagnitude(px, py, x2, y2)
                if ix > iy:
                    DistancePointLine = iy
                else:
                    DistancePointLine = ix
            else:
                # Intersecting point is on the line, use the formula
                ix = x1 + u * (x2 - x1)
                iy = y1 + u * (y2 - y1)
                DistancePointLine = self.lineMagnitude(px, py, ix, iy)
            return DistancePointLine
        # NOTE(review): bare except silently maps all failures to 0.
        except:
            return 0

    def crawlPPTX(self):
        """Walk every slide/shape of the deck, building node, edge and
        text-box dictionaries, then match text boxes to their nearest
        connector.  Returns the populated Concepts tree.
        """
        sNum = 0
        for slide in self.prs.slides:
            logger.debug(u"--new slide--")
            logger.debug(u"%s" % slide.partname)
            logger.debug(u"slideName : %s" % slide.name)
            sNum += 1

            #
            # Get Title of Slide
            #
            titleSlide = u""
            for idx, ph in enumerate(slide.shapes.placeholders):
                # logger.debug ("** %s:%s **" % (idx, ph.text))
                if idx == 0:
                    titleSlide = ph.text

            u = self.al.cleanString(titleSlide)
            logger.info(u"%d.%s" % (sNum, u))
            tss = u"%d.%s" % (sNum, u)
            q = self.concepts.addConceptKeyType(tss, u"Slide")

            # showConcepts(concepts)

            #
            # Iterate ihrough slides
            #
            n = 0
            nc = 0
            for shape in slide.shapes:
                logger.debug(u"...%s..." % type(shape))
                logger.debug(u"shape.element.xml : %s" % shape.element.xml)
                logger.debug(u"shape.name : %s[%d]" % (shape.name, shape.id - 1))
                n += 1
                sn = shape.name
                nid = shape.id

                # Get Shape Info — rectangles/rounded/straight shapes are nodes.
                if shape.name[:5] in (u"Recta", u"Round", u"Strai"):
                    nid, t, l, h, w = self.shapeDim(shape)
                    # Corner points (currently unused).
                    tl = (l, t)
                    tr = (l + w, t)
                    bl = (l, t + h)
                    br = (l + w, t + h)
                    name = self.shapeText(shape)
                    if len(name) > 1:
                        logger.info(u" node : %s[%d] - %s" % (name, nid, shape.name))
                        self.addDictNodes(nid, name)
                        b = q.addConceptKeyType(self.al.cleanString(name), u"Node")
                        b.addConceptKeyType(u"t", str(t))
                        b.addConceptKeyType(u"l", str(l))
                        b.addConceptKeyType(u"h", str(h))
                        b.addConceptKeyType(u"w", str(w))

                #
                # Add in Connections
                #
                elif sn.find(u"Connector") != -1:
                    xmlShape = shape.element.xml
                    logger.debug(u"xmlShape : %s" % xmlShape)
                    tree = etree.fromstring(xmlShape)
                    # All id attributes in the connector XML: edge id + endpoints.
                    xl = tree.xpath(u"//@id")
                    logger.debug(u"xl : %s" % xl)
                    numEdges = self.addDictEdges(nid, xl)
                    if numEdges == 3:
                        nc += 1
                        logger.info(u" %d Found Edge %d" % (nc, shape.id))

                #
                # Get Text boxes and associate with Connector
                #
                elif shape.name[:8] in (u"Text Box", u"TextBox "):
                    nid, t, l, h, w = self.shapeDim(shape)
                    name = self.shapeText(shape)
                    if name is not None:
                        nxl = list()
                        nxl.append(nid)
                        self.dictText[name] = nxl
                        logger.info(u" TextBox : %s[%d]" % (name, shape.id))
                else:
                    logger.debug(u"Skipped : %s" % shape.name)

        #
        # Now match the Connector with text
        #
        listEdges = self.dictEdges.values()
        logger.info(u"listEdges : %d" % len(listEdges))
        tbFound = 0
        tbTotal = len(self.dictTextXY)
        logger.info(u"Search for %s Text Box Connector's" % len(self.dictTextXY))

        # NOTE(review): dictTextXY is never populated in this class, and the
        # loop body below references the bare name `dictTextXY` (missing
        # `self.`) — it would raise NameError if the dict were non-empty.
        for txt in self.dictTextXY.keys():
            logger.debug(u"txt : %s[%s]" % (txt, dictTextXY[txt]))

            # NOTE(review): findID is defined with a single `nid` parameter;
            # this two-argument call would raise TypeError at runtime.
            searchText = self.findID(txt, self.dictText)
            logger.info(u" Search Text : %s" % (searchText))

            # get text point - middle of node
            px, py = self.getPoint(dictTextXY[txt])

            cDist = 1000.0
            cNode = None
            csn = None
            ctn = None

            # for each node in dictEdges
            ni = 0
            for edge in listEdges:
                logger.debug(u" edge: %s" % edge)
                try:
                    # get source
                    source = edge[0][2]
                    sName = self.findID(source)
                    sl = self.dictNodeXY[source]
                    spx, spy = self.getPoint(sl)

                    # get target
                    target = edge[0][1]
                    tName = self.findID(target)
                    tl = self.dictNodeXY[target]
                    tpx, tpy = self.getPoint(tl)

                    # determine distance between points
                    d = self.DistancePointLine (px, py, spx, spy, tpx, tpy)
                    if d < cDist:
                        cDist = d
                        cNode = edge[0][0]
                        csn = sName
                        # NOTE(review): assigns `tsn`, but `ctn` was the
                        # variable initialized above; `tsn` is unbound when
                        # no edge wins the comparison.
                        tsn = tName
                # NOTE(review): bare except/pass hides lookup errors per edge.
                except:
                    pass

            if cNode != None:
                tbFound += 1
                logger.debug(u" Closest Connector : %s" % cNode)
                logger.info(u" found(%d:%d] - %s->%s->%s [%2.3f]" % (tbFound, tbTotal, csn, searchText, tsn, cDist))

                edge = searchText
                source = sName
                target = tName
                dimSource = sl
                dimTarget = tl
                if edge is None:
                    edge = u"TBD"

                d = q.getConcepts()[csn]
                for ld in dimSource.keys():
                    logger.debug(u"%s %s:%2.3f" % (source, ld, dimSource[ld]))
                    d.addConceptKeyType(ld, str(dimSource[ld]))

                f = d.addConceptKeyType(target, u"Target")
                for ld in dimTarget.keys():
                    # NOTE(review): logs dimSource[ld] while adding
                    # dimTarget[ld] — probably a copy/paste slip in the log.
                    logger.debug(u"%s %s:%2.3f" % (target, ld, dimSource[ld]))
                    f.addConceptKeyType(ld, str(dimTarget[ld]))

                f.addConceptKeyType(self.al.cleanString(edge), u"Edge")

        if tbTotal != 0:
            logger.info(u"Found [%3.1f] Text Box Connectors" % ((tbFound / float(tbTotal)) * 100.0))

        # NOTE(review): rebinds a local name; does NOT clear self.dictTextXY.
        dictTextXY = dict()

        return self.concepts
def logSemanticTerms (inputFile, outputFile):
    """Read terms from the first column of a CSV file, collect noun/verb
    words into a Concepts tree, and write each word's WordNet synset data
    (definition, lemma names, examples) to an output CSV.

    When the module-level GRAPH flag is set, also adds hypernym-path edges
    to the module-level pydot `graph`.

    Returns the Concepts tree of collected words.
    """
    logger.info("Input File: " + inputFile)
    logger.info("Output File: " + outputFile)

    wordsConcepts = Concepts("WordsConcepts", "Words")

    # POS tags that qualify a word for collection.
    NOUN_POS = ['N', 'NN', 'NNP', 'NNS']
    VERB_POS = ['V', 'VD', 'VG', 'VN']
    POS = VERB_POS + NOUN_POS

    rownum = 0
    tree = list()

    # Context managers guarantee both files are closed even if parsing raises
    # (the original left them open on any exception).
    with open(inputFile, "r") as iFile, open(outputFile, "w") as oFile:
        writer = csv.writer(oFile)
        reader = csv.reader(iFile)

        writer.writerow(["word", "synset.definition", "synset.lemma_names", "synset.examples"])

        for row in reader:
            logger.debug("row: %s - %s" % (str(rownum), row))
            rownum += 1  # was never incremented before; debug rows all logged as 0

            # Take first column
            term = row[0]
            logger.debug("Term: %s" % term)

            text = nltk.word_tokenize(term)
            posTagged = (nltk.pos_tag(text))
            logger.debug(" POS Text: %s" % posTagged)

            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(term)):
                logger.debug(" Word: " + word + " POS: " + pos)

                if (pos in POS):
                    logger.info("Add POS:" + word)
                    wordsConcepts.addConceptKeyType(word, "WORD")
                else:
                    logger.info("Skip POS:" + word)

                for synset in wn.synsets(word):
                    if GRAPH == True:
                        for i in synset.lemma_names:
                            edge = pydot.Edge(term, i)
                            graph.add_edge(edge)

                    writer.writerow([word, synset.definition, synset.lemma_names, synset.examples])
                    logger.info(" %s\t%s" % (word, synset.lemma_names))
                    # logger.debug("%s\t%s\t%s\t%s" % (word, synset.lemma_names, synset.definition, synset.examples))

                    if GRAPH == True:
                        paths = synset.hypernym_paths()
                        prior = None
                        # NOTE(review): range(0, len(paths)-1) skips the last
                        # hypernym path — confirm whether that is intended.
                        for x in range(0, len(paths) - 1):
                            flag = False
                            for synset in paths[x]:
                                if flag == False:
                                    prior = synset.name
                                    flag = True
                                else:
                                    edge = pydot.Edge(prior, synset.name)
                                    graph.add_edge(edge)
                                    prior = synset.name
                                logger.info("%s" % synset.name)

                            # tie it to the last entry
                            if prior != None:
                                edge = pydot.Edge(prior, term)
                                graph.add_edge(edge)

    wordsConcepts.logConcepts()

    return wordsConcepts
class Collocations(object):
    """Find bigram/trigram collocations across the loaded document concepts
    and store them (plus likelihood-ratio scores and noun subjects) as
    separate Concepts trees, each pickled to its own file.
    """

    concepts = None              # input documents tree
    conceptsNGram = None         # found bigrams/trigrams
    conceptNGramScore = None     # NOTE(review): never assigned; the instance
                                 # attribute used below is conceptsNGramScore
    conceptsNGramSubject = None  # noun -> n-grams containing it

    # Default pickle file names.
    conceptFile = u"documents.p"
    ngramFile = u"ngrams.p"
    ngramScoreFile = u"ngramscore.p"
    ngramSubjectFile = u"ngramsubject.p"

    def __init__(self, conceptFile=None):
        if conceptFile == None:
            conceptFile = u"documents.p"

        logger.info(u"Load Concepts from %s " % (conceptFile))
        self.concepts = Concepts.loadConcepts(conceptFile)
        logger.info(u"Loaded Concepts")

        self.conceptsNGram = Concepts(u"n-gram", u"NGRAM")
        self.conceptsNGramScore = Concepts(u"NGram_Score", u"Score")
        self.conceptsNGramSubject = Concepts(u"Subject", u"Subjects")

    def getCollocationConcepts(self):
        return self.conceptsNGram, self.conceptsNGramScore, self.conceptsNGramSubject

    def find_collocations(self):
        """Tokenize/lemmatize all concept names, run NLTK collocation
        finders, and persist the n-gram, score and subject trees."""
        lemmatizer = WordNetLemmatizer()

        # `stop` is a module-level stopword list; words shorter than 3
        # characters are filtered too.
        stopset = set(stop)
        filter_stops = lambda w: len(w) < 3 or w in stopset

        words = list()
        dictWords = dict()

        for document in self.concepts.getConcepts().values():
            logger.debug(document.name)
            for concept in document.getConcepts().values():
                logger.debug(concept.name)

                for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept.name)):
                    logger.debug(u"Word: " + word + u" POS: " + pos)
                    lemmaWord = lemmatizer.lemmatize(word.lower())
                    logger.debug(u"Word: " + word + u" Lemma: " + lemmaWord)
                    words.append(lemmaWord)

                    if pos[0] == u"N":
                        dictWords[lemmaWord] = word

        for x in dictWords.keys():
            logger.info(u"noun : %s" % x)

        # Collocation finders over the full lemma stream.
        bcf = BigramCollocationFinder.from_words(words)
        tcf = TrigramCollocationFinder.from_words(words)

        bcf.apply_word_filter(filter_stops)
        tcf.apply_word_filter(filter_stops)
        # Trigrams must occur at least 3 times.
        tcf.apply_freq_filter(3)

        # Top 100 bigrams by likelihood ratio.
        listBCF = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 100)

        for bigram in listBCF:
            concept = u' '.join([bg for bg in bigram])
            e = self.conceptsNGram.addConceptKeyType(concept, u"BiGram")
            logger.info(u"Bigram : %s" % concept)
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept)):
                e.addConceptKeyType(word, pos)

        # Top 100 trigrams by likelihood ratio.
        listTCF = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 100)

        for trigram in listTCF:
            concept = u' '.join([bg for bg in trigram])
            e = self.conceptsNGram.addConceptKeyType(concept, u"TriGram")
            logger.info(u"Trigram : %s" % concept)
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept)):
                e.addConceptKeyType(word, pos)

        # Scored bigrams, highest likelihood ratio first.
        bcfscored = bcf.score_ngrams(BigramAssocMeasures.likelihood_ratio)
        lt = sorted(bcfscored, key=lambda c: c[1], reverse=True)
        for score in lt:
            name = ' '.join([w for w in score[0]])
            count = float(score[1])
            e = self.conceptsNGramScore.addConceptKeyType(name, u"BiGram")
            for x in score[0]:
                e.addConceptKeyType(x, u"BWord")
            # The likelihood-ratio score is stored on the concept's count.
            e.count = count
            logger.debug(u"bcfscored: %s=%s" % (name, count))

        # Scored trigrams, highest likelihood ratio first.
        tcfscored = tcf.score_ngrams(TrigramAssocMeasures.likelihood_ratio)
        lt = sorted(tcfscored, key=lambda c: c[1], reverse=True)
        for score in lt:
            name = ' '.join([w for w in score[0]])
            count = float(score[1])
            e = self.conceptsNGramScore.addConceptKeyType(name, u"TriGram")
            for x in score[0]:
                e.addConceptKeyType(x, u"TWord")
            e.count = count
            logger.debug(u"tcfscored: %s = %s" % (name, count))

        Concepts.saveConcepts(self.conceptsNGramScore, self.ngramScoreFile)
        Concepts.saveConcepts(self.conceptsNGram, self.ngramFile)

        # Index the n-grams by the nouns they contain.
        for concept in self.conceptsNGram.getConcepts().values():
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept.name)):
                if pos[0] == u"N":
                    e = self.conceptsNGramSubject.addConceptKeyType(word, pos)
                    e.addConceptKeyType(concept.name, u"NGRAM")

        Concepts.saveConcepts(self.conceptsNGramSubject, self.ngramSubjectFile)
def dependancyAnalysis(self, listTSort): index = 0 for x in listTSort: logger.debug(u"%d %s[%s] -%s-> %s[%s]" % (index, self.al.dictNodes[x[0]][u"name"], self.al.dictNodes[x[0]][ARCHI_TYPE], u"UsedBy", self.al.dictNodes[x[1]][u"name"], self.al.dictNodes[x[1]][ARCHI_TYPE])) index += 1 self.al.addToNodeDict(self.al.dictNodes[x[0]][u"name"], self.al.dictBP) self.al.addToNodeDict(self.al.dictNodes[x[1]][u"name"], self.al.dictBP) logger.info(u"Topic Sort Candidates : %d" % (len(listTSort))) nodes = list() index = 0 dictTasks = dict() for x in listTSort: sname = self.al.dictNodes[x[0]][u"name"] tname = self.al.dictNodes[x[1]][u"name"] index += 1 logger.debug(u"%d %s -%s-> %s" % (index, sname, u"UsedBy", tname)) if sname in dictTasks: ln = dictTasks[sname] ln.append(tname) else: ln = list() ln.append(tname) dictTasks[sname] = ln for x in dictTasks.keys(): logger.debug(u"dictTasks[%s]=%s" % (x, dictTasks[x])) a = Task(x, dictTasks[x]) nodes.append(a) for x in self.al.dictBP.keys(): # for x in listBP: if x not in dictTasks: logger.debug(u"Add %s" % (x)) a = Task(x, list()) nodes.append(a) self.format_nodes(nodes) conceptBatches = Concepts(u"Batch", u"archimate:WorkPackage") n = 0 logger.info(u"Batches:") batches = self.get_task_batches(nodes) for bundle in batches: n += 1 name = u"Batch %d" % n c = conceptBatches.addConceptKeyType(name, u"archimate:WorkPackage") for node in bundle: c.addConceptKeyType(node.name, u"archimate:BusinessProcess") logger.info(u"%d : %s" % (n, ", ".join(node.name.lstrip() for node in bundle))) Concepts.saveConcepts(conceptBatches, fileConceptsBatches) return conceptBatches