def test_ExportArchiFolderModels(cleandir):
    if __name__ == u"__main__":
        cleandir()

    assert (os.path.isfile(fileArchimateTest) is True)

    al = ArchiLib(fileArchimateTest)

    folder = u"Scenarios"

    logger.info(u"Exporting Folder : %s" % folder)

    listMTE = al.getModelsInFolder(folder)

    assert (listMTE is not None)

    logger.info(u"len(listMTE) = %d" % len(listMTE))
    assert (len(listMTE) == 2)

    concepts = Concepts(u"Export", u"Pickle")

    for ModelToExport in listMTE:
        logger.info(u"  Model : %s" % ModelToExport)
        d = concepts.addConceptKeyType(ModelToExport, u"Model")
        al.recurseModel(ModelToExport)

    al.outputCSVtoFile(concepts, fileExport=fileCSVExport)
    assert (os.path.isfile(fileCSVExport) is True)

    Concepts.saveConcepts(concepts, fileConceptsExport)
    logger.info(u"Save Concepts : %s" % fileConceptsExport)
    assert (os.path.isfile(fileConceptsExport) is True)

def collectDependancyAnalysisNodes(self):
    count = 0
    listTSort = list()

    for x in self.al.dictEdges.keys():
        logger.debug(u"[%s]=%s" % (self.al.dictEdges[x][u"id"], x))

        if u"source" in self.al.dictEdges[x]:
            source = self.al.dictEdges[x][u"source"]
            target = self.al.dictEdges[x][u"target"]

            logger.debug(u"  Rel : %s" % (self.al.dictEdges[x][ARCHI_TYPE]))

            if self.al.dictEdges[x][ARCHI_TYPE] in (u"archimate:FlowRelationship",):
                # al.countNodeType(al.dictNodes[source][ARCHI_TYPE])
                # al.countNodeType(al.dictNodes[target][ARCHI_TYPE])
                # al.countNodeType(al.dictEdges[x][ARCHI_TYPE])

                if (self.al.dictNodes[source][ARCHI_TYPE] == u"archimate:BusinessProcess") and \
                        self.al.dictNodes[target][ARCHI_TYPE] == u"archimate:BusinessProcess":

                    sourceName = self.al.getNodeName(source)
                    targetName = self.al.getNodeName(target)

                    # Skip processes whose names start with a digit
                    if sourceName[0].isdigit() or targetName[0].isdigit():
                        continue

                    logger.debug(u"  %s:%s" % (sourceName, targetName))

                    l = list()

                    sc = self.findConcept(self.concepts, sourceName)
                    if sc is None:
                        logger.debug(u"New Source - %s" % sourceName)
                        sc = self.concepts.addConceptKeyType(self.al.getNodeName(source), u"Source")
                        self.getWords(sourceName, sc)
                    else:
                        logger.debug(u"Prior Source %s" % sourceName)

                    tc = self.findConcept(self.concepts, targetName)
                    if tc is None:
                        logger.debug(u"New Target %s" % targetName)
                        tc = sc.addConceptKeyType(self.al.getNodeName(target), u"Target")
                        self.getWords(sourceName, tc)
                    else:
                        logger.debug(u"Prior Target %s" % targetName)
                        sc.addConcept(tc)

                    l.append(target)
                    l.append(source)
                    listTSort.append(l)

    logger.debug(u"Edges = %s" % listTSort)

    Concepts.saveConcepts(self.concepts, fileConceptsTraversal)

    self.dependancyAnalysis(listTSort)

    return self.concepts, listTSort

def exportNeo4JToConcepts(self, concepts, fileNodes=u"nodes.p"):
    qs = u"Match n return n"
    lq, qd = self.cypherQuery(qs)

    for x in lq:
        if len(x) == 2:
            logger.info(u"%s[%s]" % (x[0], x[1]))
            concepts.addConceptKeyType(x[0], x[1])
        else:
            logger.warn(u"Not a standard node : %d : %s" % (len(x), x))

    # Match r relations
    qs = u"match n-[r]-m return n, r, m"
    lq, qd = self.cypherQuery(qs)

    for x in lq:
        if len(x) == 6:
            logger.info(u"%s[%s]" % (x[0], x[1]))
            concepts.addConceptKeyType(x[0], x[1])
        else:
            logger.warn(u"Not a standard node : %d : %s" % (len(x), x))

    Concepts.saveConcepts(concepts, fileNodes)

    return concepts

def findSimilarties(self):
    logger.info(u"Compute Similarity")

    self.conceptsSimilarity = Concepts(u"ConceptsSimilarity", u"Similarities")

    # Compute similarity between documents / concepts
    similarityThreshold = self.similarity

    for document in self.documentsList:
        indexNum = self.documentsList.index(document)

        self.df = self.concepts.getConcepts().keys()

        logger.info(u"++conceptsDoc %s" % (self.df[indexNum]))
        logger.info(u"  documentsList[" + str(indexNum) + u"]=" + u"".join(x + u" " for x in document))

        # Show common topics
        d = [unicode(x).strip().replace(u"'", u"") for x in document]
        e = [unicode(y).strip().replace(u"\"", u"") for y in self.listTopics]
        s1 = set(e)
        s2 = set(d)
        common = s1 & s2
        lc = [x for x in common]
        logger.info(u"  Common Topics : %s{%s}" % (lc, self.al.dictName[document][ARCHI_TYPE]))

        self.doComputation(indexNum, similarityThreshold, tfAddWords=True)

    Concepts.saveConcepts(self.conceptsSimilarity, conceptsSimilarityFile)
    logger.info(u"Saved Concepts : %s" % conceptsSimilarityFile)

    return self.conceptsSimilarity

def requirementAnalysis(fileArchimate=None):
    if fileArchimate is None:
        fileArchimate = u"/Users/morrj140/Documents/SolutionEngineering/Archimate Models/DVC v38.archimate"

    al = ArchiLib(fileArchimate)

    conceptsFile = fileConceptsRequirements

    searchTypes = list()
    searchTypes.append(u"archimate:Requirement")

    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words in Requirements...")
    concepts = Concepts(u"Requirement", u"Requirements")

    n = 0
    for sentence in nl:
        n += 1
        logger.debug(u"%s" % sentence)

        c = concepts.addConceptKeyType(u"Document" + str(n), u"Document")
        d = c.addConceptKeyType(sentence, u"Sentence" + str(n))

        if sentence is not None:
            cleanSentence = ' '.join([word for word in sentence.split(u" ") if word not in stop])
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                if len(word) > 1 and pos[0] == u"N":
                    e = d.addConceptKeyType(word, u"Word")
                    f = e.addConceptKeyType(pos, u"POS")

    Concepts.saveConcepts(concepts, conceptsFile)
    logger.info(u"Saved : %s" % conceptsFile)

    chunks = Chunks(concepts)
    chunks.createChunks()

def test_ExportArchi(cleandir):
    if __name__ == u"__main__":
        cleandir()

    logger.info(u"Using : %s" % fileArchimateTest)
    assert (os.path.isfile(fileArchimateTest) is True)

    al = None
    concepts = None

    al = ArchiLib(fileArchimateTest)
    assert (al is not None)

    concepts = Concepts(u"Node", u"Nodes")
    assert (concepts is not None)

    logger.info(u"Found %d Nodes" % len(al.dictNodes))
    logger.info(u"Found %d Edges" % len(al.dictEdges))

    assert (len(al.dictNodes) == 45)
    assert (len(al.dictEdges) == 36)

    count = 0
    listTSort = list()

    for x in al.dictEdges.keys():
        logger.info(u"[%s]=%s" % (al.dictEdges[x][u"id"], x))

        if u"source" in al.dictEdges[x]:
            source = al.dictEdges[x][u"source"]
            target = al.dictEdges[x][u"target"]

            logger.info(u"  Rel : %s" % (al.dictEdges[x][ARCHI_TYPE]))

            sourceName = al.getNodeName(source)
            targetName = al.getNodeName(target)

            logger.info(u"  %s--%s--%s" % (sourceName, al.dictEdges[x][ARCHI_TYPE][10:], targetName))

            sc = concepts.addConceptKeyType(sourceName, al.dictNodes[source][ARCHI_TYPE][10:])
            # getWords(sourceName, sc)

            tc = sc.addConceptKeyType(targetName, al.dictNodes[target][ARCHI_TYPE][10:])
            # getWords(sourceName, tc)

    Concepts.saveConcepts(concepts, fileConceptsExport)

    assert (len(concepts.cd) == 17)
    assert (os.path.isfile(fileConceptsExport) is True)
    assert (concepts.typeName == u"Nodes")

def saveTopics(self, topics):
    wordConcepts = Concepts(u"TopicConcepts", u"Topics")

    for topic in topics:
        logger.debug(u"Topic:" + topic[0])
        w = wordConcepts.addConceptKeyType(topic[0], u"Topic")
        w.count = topic[1]

    Concepts.saveConcepts(wordConcepts, self.topicsFile)

    return wordConcepts

def _saveConcepts(self):
    logger.info(u"Saving %s" % self.documentsConceptsFile)
    Concepts.saveConcepts(self.documentsConcepts, self.documentsConceptsFile)

    logger.info(u"Saving %s" % self.wordsConceptsFile)
    Concepts.saveConcepts(self.wordsConcepts, self.wordsConceptsFile)

    logger.info(u"Saving Documents %s" % (os.getcwd() + os.sep + self.documentsConceptsFile))
    logger.info(u"Saving Words %s" % (os.getcwd() + os.sep + self.wordsConceptsFile))

def exportArchi(self):
    m = hashlib.md5()

    concepts = Concepts(u"Node", u"Nodes")

    logger.info(u"Found %d Nodes" % len(self.al.dictNodes))
    logger.info(u"Found %d Edges" % len(self.al.dictEdges))

    count = 0
    listTSort = list()

    for x in self.al.dictEdges.keys():
        logger.debug(u"Edge [%s]=%s" % (self.al.dictEdges[x], x))

        if u"source" in self.al.dictEdges[x] and u"target" in self.al.dictEdges[x]:
            typeEdge = self.al.dictEdges[x][ARCHI_TYPE]
            logger.debug(u"Edge   : %s" % typeEdge)

            source = self.al.dictEdges[x][u"source"]
            logger.debug(u"Source : %s" % source)

            target = self.al.dictEdges[x][u"target"]
            logger.debug(u"Target : %s" % target)

            logger.debug(u"  Rel : %s" % (self.al.dictEdges[x][ARCHI_TYPE]))

            sourceName = self.al.getNodeName(source)
            targetName = self.al.getNodeName(target)

            logger.debug(u"  %s--%s--%s" % (sourceName, self.al.dictEdges[x][ARCHI_TYPE][10:], targetName))

            if source in self.al.dictNodes:
                l = list()
                sc = concepts.addConceptKeyType(sourceName, self.al.dictNodes[source][ARCHI_TYPE][10:])
                # getWords(sourceName, sc)

                # Name the relationship concept by its type plus a hash of "(source,target)"
                nameEdge = u"(" + sourceName + u"," + targetName + u")"
                logger.debug(u"nameEdge : %s[%d]" % (nameEdge, len(nameEdge)))
                logger.debug(u"typeEdge : %s" % typeEdge[10:])

                ne = str(self.al.cleanString(nameEdge))
                hl = hashlib.sha224(str(ne)).hexdigest()
                logger.debug(u"hash : %s" % hl)

                nh = u"%s-%s" % (typeEdge[10:], hl)

                rc = sc.addConceptKeyType(nh, typeEdge[10:])

                if target in self.al.dictNodes:
                    tc = rc.addConceptKeyType(targetName, self.al.dictNodes[target][ARCHI_TYPE][10:])
                    # getWords(sourceName, tc)

    Concepts.saveConcepts(concepts, self.fileConceptsExport)

    return concepts

def PPTXCrawl(filePPTX):
    logger.info(u"Using : %s" % filePPTX)

    cpptx = PPTXCreateArchil(filePPTX)

    c = cpptx.crawlPPTX()

    c.logConcepts()

    Concepts.saveConcepts(c, fileConceptsPPTX)

def test_PPTXCrawl(fileArchimate):
    assert (os.path.isfile(filePPTXIn) is True)

    logger.info(u"Using : %s" % filePPTXIn)

    cpptx = PPTXCreateArchil(filePPTXIn, fileArchimate)

    c = cpptx.crawlPPTX()

    Concepts.saveConcepts(c, fileConceptsPPTX)
    assert (os.path.isfile(fileConceptsPPTX) is True)

def PPTXCreateArchi():
    start_time = ArchiLib.startTimer()

    logger.info(u"Using : %s" % filePPTXIn)

    cpptx = PPTXCreateArchil(filePPTXIn, fileArchimateTest)

    c = cpptx.crawlPPTX()

    c.logConcepts()

    Concepts.saveConcepts(c, fileConceptsPPTX)

    ArchiLib.stopTimer(start_time)

def createArchimateConcepts(fileArchimate, fileConceptsArch):
    logger.info(u"Using : %s" % fileArchimate)

    concepts = Concepts(fileArchimateModel, u"Archimate")

    al = ArchiLib(fileArchimate)

    al.logTypeCounts()

    #
    # Create Concepts from Archimate
    #
    al.folderConcepts(concepts)

    Concepts.saveConcepts(concepts, fileConceptsArch)
    logger.info(u"Saved concepts to : %s" % fileConceptsArch)

def exportArchiFolderModels(self, folder):
    logger.info(u"Exporting Folder : %s" % folder)

    listMTE = self.al.getModelsInFolder(folder)

    concepts = Concepts(u"Export", u"Pickle")

    for ModelToExport in listMTE:
        logger.info(u"  Model : %s" % ModelToExport)
        d = concepts.addConceptKeyType(ModelToExport, u"Model")
        self.al.recurseModel(ModelToExport, d)

    self.al.outputCSVtoFile(concepts, fileCSVExport)

    Concepts.saveConcepts(concepts, self.conceptsFile)
    logger.info(u"Save Concepts : %s" % self.conceptsFile)

def test_CreateArchimateConcepts(cleandir, fileArchimate):
    assert (os.path.isfile(fileArchimate) is True)

    logger.info(u"Using : %s" % fileArchimate)

    concepts = Concepts(fileArchimate, u"Archimate")

    al = ArchiLib(fileArchimate)

    lc = al.logTypeCounts()
    assert (len(lc) > 0)

    #
    # Create Concepts from Archimate
    #
    al.folderConcepts(concepts)

    Concepts.saveConcepts(concepts, fileConceptsArch)
    logger.info(u"Saved concepts to : %s" % fileConceptsArch)
    assert (os.path.isfile(fileConceptsArch) is True)

def test_ExportArchiModel(cleandir):
    if __name__ == u"__main__":
        cleandir()

    assert (os.path.isfile(fileArchimateTest) is True)

    al = ArchiLib(fileArchimateTest)

    listMTE = list()
    listMTE.append(u"01. Market to Leads")

    concepts = Concepts(u"Export", u"Model")

    for ModelToExport in listMTE:
        al.recurseModel(ModelToExport)

    Concepts.saveConcepts(concepts, fileConceptsExport)
    assert (os.path.isfile(fileConceptsExport) is True)

    al.outputCSVtoFile(concepts, fileCSVExport)
    assert (os.path.isfile(fileCSVExport) is True)

def test_ArchimateConcepts(cleandir):
    if __name__ == u"__main__":
        cleandir()

    logger.info(u"Using : %s" % fileArchimateTest)
    assert (os.path.isfile(fileArchimateTest) is True)

    concepts = Concepts(fileConceptsArch, u"Archimate")

    al = ArchiLib(fileArchimateTest)

    #
    # Create Concepts from Archimate
    #
    al.folderConcepts(concepts)

    Concepts.saveConcepts(concepts, fileConceptsArch)
    logger.info(u"Saved concepts to : %s" % fileConceptsArch)
    assert (os.path.isfile(fileConceptsArch) is True)

def test_RequirementAnalysis(cleandir, fileArchimate):
    assert (os.path.isfile(filePPTXIn) is True)

    al = ArchiLib(fileArchimate)

    conceptsFile = fileConceptsRequirements

    searchTypes = list()
    searchTypes.append(u"archimate:Requirement")

    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words in Requirements...")
    concepts = Concepts(u"Requirement", u"Requirements")

    n = 0
    for sentence in nl:
        n += 1
        logger.debug(u"%s" % sentence)

        c = concepts.addConceptKeyType(u"Document" + unicode(n), u"Document")
        d = c.addConceptKeyType(sentence, u"Sentence" + unicode(n))

        if sentence is not None:
            cleanSentence = ' '.join([word for word in sentence.split(" ") if word not in stop])
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                if len(word) > 1 and pos[0] == u"N":
                    e = d.addConceptKeyType(word, u"Word")
                    f = e.addConceptKeyType(pos, u"POS")

    Concepts.saveConcepts(concepts, conceptsFile)
    logger.info(u"Saved : %s" % conceptsFile)
    assert (os.path.isfile(conceptsFile) is True)

    chunks = Chunks(concepts)
    chunks.createChunks()
    assert (os.path.isfile(fileConceptsChunks) is True)

def analyzeNamedEntities(self):
    rels = (u"archimate:AccessRelationship",
            u"archimate:SpecialisationRelationship",
            u"archimate:CompositionRelationship",
            u"archimate:AggregationRelationship")

    listType = (u"archimate:Requirement", )
    dictEntities = self.al.getTypeNodes(listType)

    concepts = Concepts(u"Entities", u"BusinessObject")

    for x in self.al.dictEdges.keys():
        try:
            logger.debug(u"[%s]=%s" % (x, self.al.dictEdges[x][ARCHI_TYPE]))
            source = self.al.dictEdges[x][u"source"]
            target = self.al.dictEdges[x][u"target"]
            logger.debug(u"  Source : %s" % source)
            logger.debug(u"  Target : %s" % target)
        except:
            logger.warn(u"[%s] ARCHI_TYPE Exception" % (x))
            continue

        if self.al.dictEdges[x][ARCHI_TYPE] in rels:
            logger.info(u"%s -> [ %s ] -> %s" % (self.al.dictNodes[source][u"name"][:20],
                                                 self.al.dictEdges[x][ARCHI_TYPE],
                                                 self.al.dictNodes[target][u"name"][:20]))

            listNodes = self.al.getEdgesForNode(source, rels)
            for node in listNodes:
                logger.debug(u"  %s" % (node))

    Concepts.saveConcepts(concepts, fileConceptsRelations)

def createChunks(self):
    stop = stopwords.words('english')
    stop.append("This")
    stop.append("The")
    stop.append(",")
    stop.append(".")
    stop.append("..")
    stop.append("...")
    stop.append(".")
    stop.append(";")
    stop.append("and")

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer("[\w]+")

    for document in self.concepts.getConcepts().values():
        logger.info("%s" % document.name)
        d = self.chunkConcepts.addConceptKeyType(document.name, "Document")

        for sentence in document.getConcepts().values():
            logger.debug("%s(%s)" % (sentence.name, sentence.typeName))

            cleanSentence = ' '.join([word for word in sentence.name.split() if word not in stop])

            listSentence = list()
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                logger.debug("Word: " + word + " POS: " + pos)

                if pos[:1] == "N":
                    lemmaWord = lemmatizer.lemmatize(word)
                    logger.debug("Word: " + word + " Lemma: " + lemmaWord)

                    synset = wn.synsets(word, pos='n')
                    logger.debug("synset : %s" % synset)

                    if len(synset) != 0:
                        syn = synset[0]

                        root = syn.root_hypernyms()
                        logger.debug("root : %s" % root)

                        hypernyms = syn.hypernyms()
                        logger.debug("hypernyms : %s" % hypernyms)

                        if len(hypernyms) > 0:
                            hyponyms = syn.hypernyms()[0].hyponyms()
                            logger.debug("hyponyms : %s" % hyponyms)
                        else:
                            hyponyms = None

                        listSentence.append((word, lemmaWord, pos, root, hypernyms, hyponyms))

            nounSentence = ""
            for word in listSentence:
                nounSentence += word[1] + " "

            if len(nounSentence) > 2:
                e = d.addConceptKeyType(nounSentence, "NounSentence")

                for word in listSentence:
                    f = e.addConceptKeyType(word[0], word[2])
                    f.addConceptKeyType(word[1], "Lemma")

                logger.debug("%s = %s" % (cleanSentence, type(cleanSentence)))

                cleanSentence = cleanSentence.decode("utf-8", errors="ignore")

                pt = parsetree(cleanSentence, relations=True, lemmata=True)

                for sentence in pt:
                    logger.debug("relations: %s" % [x for x in sentence.relations])
                    logger.debug("subject  : %s" % [x for x in sentence.subjects])
                    logger.debug("verb     : %s" % [x for x in sentence.verbs])
                    logger.debug("object   : %s" % [x for x in sentence.objects])

                    if sentence.subjects is not None:
                        logger.debug("Sentence : %s" % sentence.chunks)

                        for chunk in sentence.chunks:
                            logger.debug("Chunk : %s" % chunk)
                            relation = str(chunk.relation).encode("ascii", errors="ignore").strip()
                            role = str(chunk.role).encode("ascii", errors="ignore").strip()
                            logger.debug("Relation : %s" % relation)
                            logger.debug("Role     : %s" % role)

                        for subject in sentence.subjects:
                            logger.debug("Subject.relation : %s " % subject.relation)
                            logger.debug("Subject : %s " % subject.string)
                            f = e.addConceptKeyType(subject.string, "SBJ")

                            for verb in sentence.verbs:
                                if verb.relation == subject.relation:
                                    logger.debug("Verb.relation : %s " % verb.relation)
                                    logger.debug("Verb : %s " % verb.string)
                                    g = f.addConceptKeyType(verb.string, "VP")

                                    for obj in sentence.objects:
                                        if obj.relation == verb.relation:
                                            logger.debug("Obj.relation : %s " % obj.relation)
                                            logger.debug("Object : %s " % obj.string)
                                            g.addConceptKeyType(obj.string, "OBJ")

    Concepts.saveConcepts(self.chunkConcepts, self.chunkFile)

def createChunks(self):
    stop = stopwords.words(u'english')
    stop.append(u"This")
    stop.append(u"The")
    stop.append(u",")
    stop.append(u".")
    stop.append(u"..")
    stop.append(u"...")
    stop.append(u".")
    stop.append(u";")
    stop.append(u"and")

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(u"[\w]+")

    for document in self.concepts.getConcepts().values():
        logger.info(u"%s" % document.name)
        d = self.chunkConcepts.addConceptKeyType(document.name, u"Document")

        for sentence in document.getConcepts().values():
            logger.debug(u"%s(%s)" % (sentence.name, sentence.typeName))

            cleanSentence = ' '.join([word for word in sentence.name.split() if word not in stop])

            listSentence = list()
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                logger.debug(u"Word: " + word + u" POS: " + pos)

                if pos[:1] == u"N":  # if True:
                    lemmaWord = lemmatizer.lemmatize(word)
                    logger.debug(u"Word: " + word + u" Lemma: " + lemmaWord)

                    morphWord = wn.morphy(word, wn.NOUN)
                    if morphWord is not None:
                        logger.debug(u"Word: " + word + u" Morph: " + morphWord)

                    synset = wn.synsets(word, pos=u'n')
                    logger.debug(u"synset : %s" % synset)

                    if len(synset) != 0:
                        syn = synset[0]

                        root = syn.root_hypernyms()
                        logger.debug(u"root : %s" % root)

                        mh = syn.member_holonyms()
                        logger.debug(u"member_holonyms : %s" % mh)

                        hypernyms = syn.hypernyms()
                        logger.debug(u"hypernyms : %s" % hypernyms)

                        if len(hypernyms) > 0:
                            hyponyms = syn.hypernyms()[0].hyponyms()
                            logger.debug(u"hyponyms : %s" % hyponyms)
                        else:
                            hyponyms = None

                        listSentence.append((word, lemmaWord, pos, root, hypernyms, hyponyms))

            nounSentence = u""
            for word in listSentence:
                nounSentence += word[1] + u" "

            if len(nounSentence) > 2:
                e = d.addConceptKeyType(nounSentence, u"NounSentence")

                for word in listSentence:
                    f = e.addConceptKeyType(word[0], word[2])
                    f.addConceptKeyType(word[1], u"Lemma")

                logger.debug(u"%s = %s" % (cleanSentence, type(cleanSentence)))

                cleanSentence = unicode(cleanSentence)

                pt = parsetree(cleanSentence, relations=True, lemmata=True)

                for sentence in pt:
                    logger.debug(u"relations: %s" % [x for x in sentence.relations])
                    logger.debug(u"subject  : %s" % [x for x in sentence.subjects])
                    logger.debug(u"verb     : %s" % [x for x in sentence.verbs])
                    logger.debug(u"object   : %s" % [x for x in sentence.objects])

                    if sentence.subjects is not None:
                        logger.debug(u"Sentence : %s" % sentence.chunks)

                        for chunk in sentence.chunks:
                            logger.debug(u"Chunk : %s" % chunk)
                            relation = unicode(chunk.relation).strip()
                            role = unicode(chunk.role).strip()
                            logger.debug(u"Relation : %s" % relation)
                            logger.debug(u"Role     : %s" % role)

                        for subject in sentence.subjects:
                            logger.debug(u"  Subject.relation : %s " % subject.relation)
                            logger.debug(u"  Subject : %s " % subject.string)
                            f = e.addConceptKeyType(subject.string, u"SBJ")

                            for verb in sentence.verbs:
                                if verb.relation == subject.relation:
                                    logger.debug(u"  Verb.relation : %s " % verb.relation)
                                    logger.debug(u"  Verb : %s " % verb.string)
                                    g = f.addConceptKeyType(verb.string, u"VP")

                                    for obj in sentence.objects:
                                        if obj.relation == verb.relation:
                                            logger.debug(u"  Obj.relation : %s " % obj.relation)
                                            logger.debug(u"  Object : %s " % obj.string)
                                            g.addConceptKeyType(obj.string, u"OBJ")

    Concepts.saveConcepts(self.chunkConcepts, self.chunkFile)
    logger.info(u"Saved : %s" % self.chunkFile)

#'BusinessFunction.csv',
#'BusinessProcess.csv',
#'Capability.csv',
#'Entity.csv',
#'Functionality.csv',
#'ITService.csv',
#'OrganizationalProcess.csv',
#'Requirements.csv',
#'SystemProcess.csv'
#]

#for f in fileList:
#    logger.info("File: %s" % f)
#    inputFile = homeDir + "\\Mega\\" + f

wordsConcepts = logSemanticTerms(inputFile, outputFile)

Concepts.saveConcepts(wordsConcepts, wordsFile)

if GRAPH == True:
    graph.write_png('SemanticTerms.png')

topic = "WORD"

tc = TopicCloud(wordsConcepts, os.getcwd() + os.sep)
tc.createCloudImage(topic, size_x=1200, size_y=900, numWords=50, scale=1)

def dependancyAnalysis(self, listTSort):
    index = 0
    for x in listTSort:
        logger.debug(u"%d %s[%s] -%s-> %s[%s]" % (index,
                                                  self.al.dictNodes[x[0]][u"name"],
                                                  self.al.dictNodes[x[0]][ARCHI_TYPE],
                                                  u"UsedBy",
                                                  self.al.dictNodes[x[1]][u"name"],
                                                  self.al.dictNodes[x[1]][ARCHI_TYPE]))
        index += 1

        self.al.addToNodeDict(self.al.dictNodes[x[0]][u"name"], self.al.dictBP)
        self.al.addToNodeDict(self.al.dictNodes[x[1]][u"name"], self.al.dictBP)

    logger.info(u"Topic Sort Candidates : %d" % (len(listTSort)))

    nodes = list()
    index = 0
    dictTasks = dict()
    for x in listTSort:
        sname = self.al.dictNodes[x[0]][u"name"]
        tname = self.al.dictNodes[x[1]][u"name"]

        index += 1
        logger.debug(u"%d %s -%s-> %s" % (index, sname, u"UsedBy", tname))

        if sname in dictTasks:
            ln = dictTasks[sname]
            ln.append(tname)
        else:
            ln = list()
            ln.append(tname)
            dictTasks[sname] = ln

    for x in dictTasks.keys():
        logger.debug(u"dictTasks[%s]=%s" % (x, dictTasks[x]))
        a = Task(x, dictTasks[x])
        nodes.append(a)

    for x in self.al.dictBP.keys():
        # for x in listBP:
        if x not in dictTasks:
            logger.debug(u"Add %s" % (x))
            a = Task(x, list())
            nodes.append(a)

    self.format_nodes(nodes)

    conceptBatches = Concepts(u"Batch", u"archimate:WorkPackage")

    n = 0
    logger.info(u"Batches:")
    batches = self.get_task_batches(nodes)
    for bundle in batches:
        n += 1
        name = u"Batch %d" % n
        c = conceptBatches.addConceptKeyType(name, u"archimate:WorkPackage")
        for node in bundle:
            c.addConceptKeyType(node.name, u"archimate:BusinessProcess")
        logger.info(u"%d : %s" % (n, ", ".join(node.name.lstrip() for node in bundle)))

    Concepts.saveConcepts(conceptBatches, fileConceptsBatches)

    return conceptBatches

def find_collocations(self):
    lemmatizer = WordNetLemmatizer()

    stopset = set(stop)
    filter_stops = lambda w: len(w) < 3 or w in stopset

    words = list()
    dictWords = dict()

    for document in self.concepts.getConcepts().values():
        logger.debug(document.name)
        for concept in document.getConcepts().values():
            logger.debug(concept.name)

            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept.name)):
                logger.debug(u"Word: " + word + u" POS: " + pos)
                lemmaWord = lemmatizer.lemmatize(word.lower())
                logger.debug(u"Word: " + word + u" Lemma: " + lemmaWord)
                words.append(lemmaWord)

                if pos[0] == u"N":
                    dictWords[lemmaWord] = word

    for x in dictWords.keys():
        logger.info(u"noun : %s" % x)

    bcf = BigramCollocationFinder.from_words(words)
    tcf = TrigramCollocationFinder.from_words(words)

    bcf.apply_word_filter(filter_stops)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)

    listBCF = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 100)

    for bigram in listBCF:
        concept = u' '.join([bg for bg in bigram])
        e = self.conceptsNGram.addConceptKeyType(concept, u"BiGram")
        logger.info(u"Bigram : %s" % concept)
        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept)):
            e.addConceptKeyType(word, pos)

    listTCF = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 100)

    for trigram in listTCF:
        concept = u' '.join([bg for bg in trigram])
        e = self.conceptsNGram.addConceptKeyType(concept, u"TriGram")
        logger.info(u"Trigram : %s" % concept)
        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept)):
            e.addConceptKeyType(word, pos)

    bcfscored = bcf.score_ngrams(BigramAssocMeasures.likelihood_ratio)
    lt = sorted(bcfscored, key=lambda c: c[1], reverse=True)
    for score in lt:
        name = ' '.join([w for w in score[0]])
        count = float(score[1])
        e = self.conceptsNGramScore.addConceptKeyType(name, u"BiGram")
        for x in score[0]:
            e.addConceptKeyType(x, u"BWord")
        e.count = count
        logger.debug(u"bcfscored: %s=%s" % (name, count))

    tcfscored = tcf.score_ngrams(TrigramAssocMeasures.likelihood_ratio)
    lt = sorted(tcfscored, key=lambda c: c[1], reverse=True)
    for score in lt:
        name = ' '.join([w for w in score[0]])
        count = float(score[1])
        e = self.conceptsNGramScore.addConceptKeyType(name, u"TriGram")
        for x in score[0]:
            e.addConceptKeyType(x, u"TWord")
        e.count = count
        logger.debug(u"tcfscored: %s = %s" % (name, count))

    Concepts.saveConcepts(self.conceptsNGramScore, self.ngramScoreFile)
    Concepts.saveConcepts(self.conceptsNGram, self.ngramFile)

    for concept in self.conceptsNGram.getConcepts().values():
        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept.name)):
            if pos[0] == u"N":
                e = self.conceptsNGramSubject.addConceptKeyType(word, pos)
                e.addConceptKeyType(concept.name, u"NGRAM")

    Concepts.saveConcepts(self.conceptsNGramSubject, self.ngramSubjectFile)

def findSimilarties(self, conceptsSimilarityFile):
    logger.info(u"Compute Similarity")

    self.conceptsSimilarity = Concepts(u"ConceptsSimilarity", u"Similarities")

    # Compute similarity between documents / concepts
    similarityThreshold = similarity

    if THREAD:
        # Start up the worker threads
        for threadID in range(0, ThreadDepth):
            thread = myThread(threadID, self)
            thread.start()
            self.threads.append(thread)

    for document in self.documentsList:
        indexNum = self.documentsList.index(document)

        logger.info(u"Document %s" % (self.df[indexNum]))
        pj = self.conceptsSimilarity.addConceptKeyType(self.df[indexNum], u"Document")

        logger.debug(u"  documentsList[%d] = %s" % (indexNum, str(document)))

        # Show common topics
        d = [unicode(x).strip().replace(u"'", u"") for x in document]
        e = [unicode(y).strip().replace(u"\"", u"") for y in self.listTopics]
        s1 = set(e)
        s2 = set(d)
        common = s1 & s2
        lc = [x for x in common]
        logger.debug(u"  Common Topics : %s" % (lc))

        if THREAD is False:
            self.doComputation(document, similarityThreshold, pj, Topics=True)
        else:
            logger.debug(u"npbtAcquire Queue Lock")
            queueLock.acquire()

            logger.debug(u"npbtPut Queue ")
            rl = [document, similarityThreshold]

            while workQueue.qsize() == QueueDepth:
                time.sleep(1)

            workQueue.put(rl)
            queueLock.release()
            logger.debug(u"npbtRelease Queue Lock")

            qs = workQueue.qsize()
            if qs % QueueDelta == 0:
                logger.info(u"rQueue Size = %s" % qs)

    # Wait for queue to empty
    qs = workQueue.qsize()
    while qs != 0:
        time.sleep(1)
        qs = workQueue.qsize()
        if qs % QueueDelta == 0:
            logger.info(u"wQueue Size = %s" % qs)

    # Notify threads it's time to exit
    exitFlag = 1

    # Wait for all threads to complete
    for t in self.threads:
        logger.info(u"Waiting for thread %s to end..." % t)
        t.join(0.5)

    Concepts.saveConcepts(self.conceptsSimilarity, conceptsSimilarityFile)
    # Concepts.outputConceptsToCSV(self.conceptsSimilarity, fileExport=u"BusinessRequirements.csv")

    logger.info(u"Complete - findSimilarties")

    return self.conceptsSimilarity