Example #1
def test_ExportArchiFolderModels(cleandir):

    if __name__ == u"__main__":
        cleandir()

    assert (os.path.isfile(fileArchimateTest) is True)
    al = ArchiLib(fileArchimateTest)

    folder = u"Scenarios"

    logger.info(u"Exporting Folder : %s" % folder)
    listMTE = al.getModelsInFolder(folder)
    assert (listMTE is not None)

    logger.info(u"len(listMTE) = %d" % len(listMTE))
    assert (len(listMTE) == 2)

    concepts = Concepts(u"Export", u"Pickle")

    for ModelToExport in listMTE:
        logger.info(u"  Model : %s" % ModelToExport)
        d = concepts.addConceptKeyType(ModelToExport, u"Model")
        al.recurseModel(ModelToExport)

    al.outputCSVtoFile(concepts, fileExport=fileCSVExport)
    assert (os.path.isfile(fileCSVExport) is True)

    Concepts.saveConcepts(concepts, fileConceptsExport)
    logger.info(u"Save Concepts : %s" % fileConceptsExport)

    assert (os.path.isfile(fileConceptsExport) is True)
Example #3
def requirementAnalysis(fileArchimate=None):

    if fileArchimate is None:
        fileArchimate = u"/Users/morrj140/Documents/SolutionEngineering/Archimate Models/DVC v38.archimate"

    al = ArchiLib(fileArchimate)

    conceptsFile = fileConceptsRequirements

    searchTypes = list()
    searchTypes.append(u"archimate:Requirement")
    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words in Requirements...")
    concepts = Concepts(u"Requirement", u"Requirements")
    n = 0
    for sentence in nl:
        n += 1
        logger.debug(u"%s" % sentence)

        c = concepts.addConceptKeyType(u"Document" + str(n), u"Document")
        d = c.addConceptKeyType(sentence, u"Sentence" + str(n))

        if sentence is not None:
            cleanSentence = ' '.join([word for word in sentence.split(u" ") if word not in stop])
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                if len(word) > 1 and pos[0] == u"N":
                    e = d.addConceptKeyType(word, u"Word")
                    f = e.addConceptKeyType(pos, u"POS")

    Concepts.saveConcepts(concepts, conceptsFile)
    logger.info(u"Saved : %s" % conceptsFile)

    chunks = Chunks(concepts)
    chunks.createChunks()
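Example #3 above (and the variants in Examples #12 and #15) follow the same recipe: drop stopwords, POS-tag the sentence with NLTK, and keep only the nouns. A minimal standalone sketch of that recipe, assuming the NLTK stopwords and averaged_perceptron_tagger data are installed; nounsOf is a hypothetical helper name:

import nltk
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def nounsOf(sentence):
    # Drop stopwords, then keep tokens whose POS tag starts with "N" (nouns)
    cleanSentence = " ".join(w for w in sentence.split(" ") if w not in stop)
    return [word for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence))
            if len(word) > 1 and pos[0] == "N"]

# nounsOf(u"The system shall export the models") -> roughly [u"system", u"models"]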
Example #4
def test_ExportArchi(cleandir):

    if __name__ == u"__main__":
        cleandir()

    logger.info(u"Using : %s" % fileArchimateTest)

    assert (os.path.isfile(fileArchimateTest) is True)

    al = None
    concepts = None

    al = ArchiLib(fileArchimateTest)

    assert (al is not None)

    concepts = Concepts(u"Node", u"Nodes")

    assert (concepts is not None)

    logger.info(u"Found %d Nodes" % len(al.dictNodes))
    logger.info(u"Found %d Edges" % len(al.dictEdges))

    assert (len(al.dictNodes) == 45)
    assert (len(al.dictEdges) == 36)

    count = 0
    listTSort = list()
    for x in al.dictEdges.keys():
        logger.info(u"[%s]=%s" % (al.dictEdges[x][u"id"], x))

        if u"source" in al.dictEdges[x]:
            source = al.dictEdges[x][u"source"]
            target = al.dictEdges[x][u"target"]

            logger.info(u"  Rel    : %s" % (al.dictEdges[x][ARCHI_TYPE]))

            sourceName = al.getNodeName(source)
            targetName = al.getNodeName(target)

            logger.info(
                u" %s--%s--%s" %
                (sourceName, al.dictEdges[x][ARCHI_TYPE][10:], targetName))

            sc = concepts.addConceptKeyType(
                sourceName, al.dictNodes[source][ARCHI_TYPE][10:])
            # getWords(sourceName, sc)

            tc = sc.addConceptKeyType(targetName,
                                      al.dictNodes[target][ARCHI_TYPE][10:])
            # getWords(sourceName, tc)

    Concepts.saveConcepts(concepts, fileConceptsExport)

    assert (len(concepts.cd) == 17)

    assert (os.path.isfile(fileConceptsExport) is True)

    assert (concepts.typeName == u"Nodes")
Example #5
    def saveTopics(self, topics):
        wordConcepts = Concepts(u"TopicConcepts", u"Topics")
        for topic in topics:
            logger.debug(u"Topic:" + topic[0])
            w = wordConcepts.addConceptKeyType(topic[0], u"Topic")
            w.count = topic[1]
        Concepts.saveConcepts(wordConcepts, self.topicsFile)
        return wordConcepts
Example #6
    def exportArchi(self):

        concepts = Concepts(u"Node", u"Nodes")

        logger.info(u"Found %d Nodes" % len(self.al.dictNodes))
        logger.info(u"Found %d Edges" % len(self.al.dictEdges))

        count = 0
        listTSort = list()
        for x in self.al.dictEdges.keys():
            logger.debug(u"Edge [%s]=%s" % (self.al.dictEdges[x], x))

            if u"source" in self.al.dictEdges[x] and u"target" in self.al.dictEdges[x]:

                typeEdge = self.al.dictEdges[x][ARCHI_TYPE]
                logger.debug(u"Edge   : %s" % typeEdge)

                source = self.al.dictEdges[x][u"source"]
                logger.debug(u"Source : %s" % source)

                target = self.al.dictEdges[x][u"target"]
                logger.debug(u"Target : %s" % target)

                logger.debug(u"  Rel    : %s" % (self.al.dictEdges[x][ARCHI_TYPE]))

                sourceName = self.al.getNodeName(source)
                targetName = self.al.getNodeName(target)

                logger.debug(u" %s--%s--%s" % (sourceName, self.al.dictEdges[x][ARCHI_TYPE][10:], targetName))

                if source not in self.al.dictNodes:
                    continue

                sc = concepts.addConceptKeyType(sourceName, self.al.dictNodes[source][ARCHI_TYPE][10:])
                # getWords(sourceName, sc)

                nameEdge = u"(" + sourceName + u"," + targetName + u")"
                logger.debug(u"nameEdge : %s[%d]" % (nameEdge, len(nameEdge)))
                logger.debug(u"typeEdge : %s" % typeEdge[10:])

                ne = self.al.cleanString(nameEdge)
                hl = hashlib.sha224(ne.encode("utf-8")).hexdigest()

                logger.debug(u"hash : %s" % hl)

                nh = u"%s-%s" % (typeEdge[10:], hl)

                rc = sc.addConceptKeyType(nh, typeEdge[10:])

                if target in self.al.dictNodes:
                    tc = rc.addConceptKeyType(targetName, self.al.dictNodes[target][ARCHI_TYPE][10:])
                    # getWords(sourceName, tc)

        Concepts.saveConcepts(concepts, self.fileConceptsExport)

        return concepts
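The nh key above appears designed so that repeated relations of the same type stay distinct: a readable type prefix (typeEdge[10:] strips the 10-character "archimate:" namespace) plus a SHA-224 digest of the (source,target) name pair. A minimal sketch of the same construction; note hashlib hashes bytes, hence the encode:

import hashlib

def edgeKey(typeEdge, sourceName, targetName):
    nameEdge = u"(" + sourceName + u"," + targetName + u")"
    # SHA-224 over the UTF-8 bytes of the edge name gives a stable, collision-resistant suffix
    hl = hashlib.sha224(nameEdge.encode("utf-8")).hexdigest()
    return u"%s-%s" % (typeEdge[10:], hl)

# edgeKey(u"archimate:FlowRelationship", u"Order", u"Invoice")
# -> u"FlowRelationship-" followed by 56 hex digits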
Example #7
def getOpenXmlText(filename, ftype):
    logger.info("OpenXmlText: %s" % filename)

    doc = openxmllib.openXmlDocument(path=filename)
    c = Concepts(filename, ftype)

    logger.debug("%s\n" % (doc.allProperties))

    ap = c.addConceptKeyType("allProperties", "PROPERTIES")
    for x in doc.allProperties:
        logger.info("cp %s:%s" % (x, doc.allProperties[x]))
        ap.addConceptKeyType(doc.allProperties[x], x)

    logger.info("it %s\n" % (doc.indexableText(include_properties=True)))
    c.addConceptKeyType(doc.indexableText(include_properties=True), "TEXT")

    return c
Example #9
def getPDFText(filename):
    logger.info("filename: %s" % filename)
    newparatextlist = []

    pdfDoc = PdfFileReader(open(filename, "rb"))

    pdfDict = pdfDoc.getDocumentInfo()
    c = Concepts(filename, "PDF")
    for x in pdfDict.keys():
        try:
            # document-info keys carry a leading "/", e.g. "/Author"
            c.addConceptKeyType(x[1:], pdfDict[x])
        except:
            logger.warn("oops...")

    # c.logConcepts()

    for page in pdfDoc.pages:
        text = page.extractText()
        logger.info("PDF : %s" % text)
        newparatextlist.append(text + ". ")

    return newparatextlist
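getPDFText is written against the legacy PyPDF2 interface (PdfFileReader, getDocumentInfo, extractText); newer pypdf/PyPDF2 3.x releases rename these to PdfReader, .metadata and .extract_text(). A hedged usage sketch against the legacy API; "report.pdf" is a placeholder path:

from PyPDF2 import PdfFileReader  # legacy (pre-3.0) PyPDF2 API

with open("report.pdf", "rb") as fh:
    pdf = PdfFileReader(fh)
    info = pdf.getDocumentInfo()                       # keys like "/Author", "/Title"
    pages = [page.extractText() for page in pdf.pages]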
Example #10
    def exportArchiFolderModels(self, folder):

        logger.info(u"Exporting Folder : %s" % folder)

        listMTE = self.al.getModelsInFolder(folder)

        concepts = Concepts(u"Export", u"Pickle")

        for ModelToExport in listMTE:
            logger.info(u"  Model : %s" % ModelToExport)
            d = concepts.addConceptKeyType(ModelToExport, u"Model")
            self.al.recurseModel(ModelToExport, d)

        self.al.outputCSVtoFile(concepts, fileCSVExport)

        Concepts.saveConcepts(concepts, self.conceptsFile)

        logger.info(u"Save Concepts : %s" % self.conceptsFile)
Example #12
def test_RequirementAnalysis(cleandir, fileArchimate):

    assert (os.path.isfile(filePPTXIn) is True)

    al = ArchiLib(fileArchimate)

    conceptsFile = fileConceptsRequirements

    searchTypes = list()
    searchTypes.append(u"archimate:Requirement")
    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words in Requirements...")
    concepts = Concepts(u"Requirement", u"Requirements")
    n = 0
    for sentence in nl:
        n += 1
        logger.debug(u"%s" % sentence)

        c = concepts.addConceptKeyType(u"Document" + unicode(n), u"Document")
        d = c.addConceptKeyType(sentence, u"Sentence" + unicode(n))

        if sentence is not None:
            cleanSentence = ' '.join([word for word in sentence.split(" ") if word not in stop])
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                if len(word) > 1 and pos[0] == u"N":
                    e = d.addConceptKeyType(word, u"Word")
                    f = e.addConceptKeyType(pos, u"POS")

    Concepts.saveConcepts(concepts, conceptsFile)
    logger.info(u"Saved : %s" % conceptsFile)

    assert (os.path.isfile(conceptsFile) is True)

    chunks = Chunks(concepts)
    chunks.createChunks()

    assert (os.path.isfile(fileConceptsChunks) is True)
Example #13
class DocumentsSimilarity(object):
    concepts = None
    conceptsSimilarity = None
    tm = None
    documentsList = None
    wordcount = None
    threads = None
    topics = None
    topicConcepts = None
    mapDocumentList = None
    df = None
    mapDocuments = None
    
    def __init__(self):
        self.threads = list()
    
    def createTopics(self, conceptsFile, concepts=None):

        if concepts is None:
            logger.info(u"Load Concepts from " + conceptsFile)
            self.concepts = Concepts.loadConcepts(conceptsFile)
            logger.info(u"Loaded Concepts")
        else:
            self.concepts = concepts

        self.tm = TopicsModel()

        logger.info(u"Load Documents from Concepts")

        self.documentsList, self.wordcount = self.tm.loadConceptsWords(self.concepts)

        self.mapDocuments = { self.documentsList.index(x) : x for x in self.documentsList}

        self.df = self.concepts.getConcepts().keys()

        logger.info(u"Read " + str(len(self.documentsList)) + u" Documents, with " + str(self.wordcount) + u" words.")

        if self.wordcount == 0:
            logger.error(u"No topics to use!")
            return None

        logger.info(u"Compute Topics")
        self.topics = self.tm.computeTopics(self.documentsList, nt=num_topics, nw=num_words)

        logger.info(u"Log Topics")
        self.tm.logTopics(self.topics)

        self.listTopics = [unicode(x[0]).strip().replace(u"\"", u"") for x in self.topics]

        logger.info(u"Saving Topics")
        self.topicConcepts = self.tm.saveTopics(self.topics)

        logger.info(u"Complete createTopics")

        return self.concepts

    def findSimilarties(self, conceptsSimilarityFile):

        logger.info(u"Compute Similarity")

        self.conceptsSimilarity = Concepts(u"ConceptsSimilarity", u"Similarities")

        # Compute similarity between documents / concepts
        similarityThreshold = similarity

        if THREAD:
            # startup the threads
            for threadID in range(0,ThreadDepth):
                thread = myThread(threadID, self)
                thread.start()
                self.threads.append(thread)

        for document in self.documentsList:
            indexNum = self.documentsList.index(document)

            logger.info(u"Document %s" % (self.df[indexNum]))

            pj = self.conceptsSimilarity.addConceptKeyType(self.df[indexNum], u"Document")

            logger.debug(u"  documentsList[%d] = %s" % (indexNum, str(document)))

            # Show common topics

            d = [unicode(x).strip().replace(u"'", u"") for x in document]
            e = [unicode(y).strip().replace(u"\"", u"") for y in self.listTopics]

            s1 = set(e)
            s2 = set(d)
            common = s1 & s2
            lc = [x for x in common]
            logger.debug(u"  Common Topics : %s" % (lc))

            if THREAD is False:

                self.doComputation(document, similarityThreshold, pj, Topics=True)

            else:
                logger.debug(u"npbtAquire  Queue Lock")
                queueLock.acquire()
                logger.debug(u"npbtPut     Queue     ")
                rl = [document, similarityThreshold]

                while workQueue.qsize() == QueueDepth:
                    time.sleep(1)

                workQueue.put(rl)
                queueLock.release()
                logger.debug(u"npbtRelease Queue Lock")

                qs = workQueue.qsize()
                if qs % QueueDelta == 0:
                    logger.info(u"rQueue Size = %s" % qs)

                # Wait for queue to empty
                qs = workQueue.qsize()
                while qs != 0:
                    time.sleep(1)
                    qs = workQueue.qsize()
                    if qs % QueueDelta == 0:
                        logger.info(u"wQueue Size = %s" % qs)

                # Notify threads it's time to exit
                exitFlag = 1

                # Wait for all threads to complete
                for t in self.threads:
                    logger.info(u"Waiting for thread %s to end..." % t)
                    t.join(0.5)

        Concepts.saveConcepts(self.conceptsSimilarity, conceptsSimilarityFile)

        # Concepts.outputConceptsToCSV(self.conceptsSimilarity, fileExport=u"BusinessRequirements.csv")

        logger.info(u"Complete - findSimilarties")

        return self.conceptsSimilarity

    def doComputation(self, j, similarityThreshold, pj, Topics=False):

        pt = None

        pl = self.tm.computeSimilar(self.documentsList.index(j), self.documentsList, similarityThreshold)

        if len(pl) != 0:
            logger.debug(u"   similarity above threshold")
            logger.debug(u"   pl:" + str(pl))

            for l in pl:
                if l[1] != l[2]:
                    logger.debug(u"   l:" + str(l))

                    ni = self.documentsList.index(l[2])
                    mdl = u",".join([ q for q in self.mapDocuments[ni]])

                    dfni = unicode(self.df[ni])

                    logger.info(u"    Similar Document : %s" % (dfni))

                    ps = pj.addConceptKeyType(dfni, u"SimilarDocument")
                    # ps.count = TopicsModel.convertMetric(l[0])

                    common = set(l[1]) & set(l[2])
                    lc = [x for x in common]

                    logger.debug(u"  l[1] : %s" % (l[1]))
                    logger.debug(u"  l[2] : %s" % (l[2]))
                    logger.debug(u"  Common : %s" % (lc))

                    pt = ps.addConceptKeyType(mdl, u"DocumentTopics")

                    for x in common:
                        pc = pt.addConceptKeyType(x, u"CommonTopic")
                        pc.count = len(lc)
                
        else:
            logger.debug(u"   similarity below threshold")
Example #14
class DocumentsSimilarity(object):
    conceptsDoc = None
    conceptsSimilarity = None
    conceptsSimilarityFile = None
    tm = None
    documentsList = None
    wordcount = None
    threads = None
    topics = None
    topicConcepts = None
    lemmatizer = None
    df = None

    def __init__(self, al):
        self.num_topics = 100
        self.num_words = 50
        self.similarity = 0.95
        self.al = al

        self.conceptsSimilarityFile = u"GapsSimilarity.p"

    def createTopics(self, concepts):

        self.concepts = concepts

        self.tm = TopicsModel(directory=os.getcwd() + os.sep)

        logger.debug(u"--Load Documents from Concepts")
        self.documentsList, self.wordcount = self.tm.loadConceptsWords(self.concepts)

        logger.info(u"--Read %s Documents, with %s words." % (str(len(self.documentsList)), str(self.wordcount)))

        logger.info(u"--Compute Topics--")
        self.topics = self.tm.computeTopics(self.documentsList, nt=self.num_topics, nw=self.num_words)

        if False:
            logger.info(u"--Log Topics--")
            self.tm.logTopics(self.topics)

        # self.listTopics = [x[0].encode('ascii', errors="ignore").strip() for x in self.topics]
        self.listTopics = [x[0] for x in self.topics]

        logger.info(u"--Saving Topics--")

        self.topicConcepts = self.tm.saveTopics(self.topics)

    def findSimilarties(self):

        logger.info(u"Compute Similarity")

        self.conceptsSimilarity = Concepts(u"ConceptsSimilarity", u"Similarities")

        # Compute similarity between documents / concepts
        similarityThreshold = self.similarity

        for document in self.documentsList:
            indexNum = self.documentsList.index(document)

            self.df = self.concepts.getConcepts().keys()

            logger.info(u"++conceptsDoc %s" % (self.df[indexNum]))
            logger.info(u"  documentsList[" + str(indexNum) + u"]=" + u"".join(x + u" " for x in document))

            # Show common topics
            d = [unicode(x).strip().replace(u"'", u"") for x in document]
            e = [unicode(y).strip().replace(u"\"", u"") for y in self.listTopics]

            s1 = set(e)
            s2 = set(d)
            common = s1 & s2
            lc = [x for x in common]
            logger.info(u"  Common Topics : %s{%s}" % (lc, self.al.dictName[document][ARCHI_TYPE]))

            self.doComputation(indexNum, similarityThreshold, tfAddWords=True)

        Concepts.saveConcepts(self.conceptsSimilarity, self.conceptsSimilarityFile)

        logger.info(u"Saved Concepts : %s" % self.conceptsSimilarityFile)

        return self.conceptsSimilarity

    def doComputation(self, j, similarityThreshold, tfAddWords=True):
        logger.debug(u"--doComputation--")
        pl = self.tm.computeSimilar(j, self.documentsList, similarityThreshold)

        if len(pl) != 0:
            logger.debug(u"   similarity above threshold - %2.3f" % (100.0 * float(pl[0][0])))
            logger.debug(u"   pl:" + str(pl))

            for l in pl:
                if l[1] != l[2]:
                    logger.debug(u"  l:" + str(l))
                    l1 = u"".join(x + u" " for x in l[1])
                    ps = self.conceptsSimilarity.addConceptKeyType(l1, u"Similar")
                    ps.count = TopicsModel.convertMetric(l[0])

                    l2 = u"".join(x + " " for x in l[2])
                    pt = ps.addConceptKeyType(l2, u"Concept")

                    common = set(l[1]) & set(l[2])
                    lc = [x for x in common]

                    logger.debug(u"  l    : %s" % l)
                    logger.debug(u"  l[1] : %s" % (l1))
                    logger.debug(u"  l[2] : %s" % (l2))
                    logger.debug(u"  Common : %s" % (lc))

                    if tfAddWords is True:
                        for x in common:
                            if x not in stop:
                                logger.debug(u"word : %s" % x)
                                pc = pt.addConceptKeyType(x, u"CommonTopic")
                                pc.count = len(lc)

        else:
            logger.debug(u"   similarity below threshold")
Example #15
def gapSimilarity(fileArchimate, searchTypes):

    lemmatizer = WordNetLemmatizer()

    logger.info(u"Using : %s" % fileArchimate)

    al = ArchiLib(fileArchimate)

    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words...")
    concepts = Concepts(u"Word", u"Topic")

    n = 0
    for sentence in nl:
        n += 1

        if sentence is None:
            continue

        logger.info(u"%s" % sentence)

        c = concepts.addConceptKeyType(u"Document" + str(n), nl[sentence][ARCHI_TYPE])
        d = c.addConceptKeyType(sentence, nl[sentence][ARCHI_TYPE])

        cleanSentence = u' '.join([word for word in sentence.split(u" ") if word not in stop])
        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
            if len(word) > 1 and pos[0] == u"N":
                lemmaWord = lemmatizer.lemmatize(word.lower())
                e = d.addConceptKeyType(lemmaWord, u"LemmaWord")
                f = e.addConceptKeyType(pos, u"POS")

    if False:
        concepts.logConcepts()

    if True:
        logger.info(u"Find Collocations...")
        fc = Collocations()
        fc.find_collocations(concepts)

    if True:
        npbt = DocumentsSimilarity(al)

        logger.info(u"Create Topics")
        npbt.createTopics(concepts)

        if True:
            logger.info(u"Find Similarities")

            nc = npbt.findSimilarties()

            logger.debug(u"Topics")
            listTopics = list()
            ncg = npbt.topicConcepts.getConcepts().values()
            for x in ncg:
                logger.info(u"%s[%d]" % (x.name, x.count))
                lt = (x.name, x.count)
                listTopics.append(lt)

            logger.info(u"Topics Sorted")
            with open(u"topic_sort.txt", "wb") as f:
                for x in sorted(listTopics, key=lambda c: abs(c[1]), reverse=False):
                    output = "Topic : %s[%d]" % (x[0], x[1])
                    logger.info(output)
                    f.write(output + os.linesep)
Example #16
class Chunks(object):

    conceptFile = 'documents.p'
    chunkFile = 'chunks.p'
    concepts = None
    chunkConcepts = None

    def __init__(self, concepts=None):
        if concepts is None:
            logger.info("Loading : %s" % self.conceptFile)
            self.concepts = Concepts.loadConcepts(self.conceptFile)
        else:
            logger.info("Using   : %s" % concepts.name)
            self.concepts = concepts

        self.chunkConcepts = Concepts("Chunk", "Chunks")

    def getChunkConcepts(self):
        return self.chunkConcepts

    def createChunks(self):
        stop = stopwords.words('english')
        stop.append("This")
        stop.append("The")
        stop.append(",")
        stop.append(".")
        stop.append("..")
        stop.append("...")
        stop.append(".")
        stop.append(";")
        stop.append("and")

        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        tokenizer = RegexpTokenizer("[\w]+")

        for document in self.concepts.getConcepts().values():
            logger.info("%s" % document.name)
            d = self.chunkConcepts.addConceptKeyType(document.name, "Document")

            for sentence in document.getConcepts().values():
                logger.debug("%s(%s)" % (sentence.name, sentence.typeName))
                cleanSentence = ' '.join([word for word in sentence.name.split() if word not in stop])

                listSentence = list()

                for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                    logger.debug("Word: " + word + " POS: " + pos)

                    if pos[:1] == "N":
                        lemmaWord = lemmatizer.lemmatize(word)
                        logger.debug("Word: " + word + " Lemma: " + lemmaWord)

                        synset = wn.synsets(word, pos='n')
                        logger.debug("synset : %s" %  synset)

                        if len(synset) != 0:
                            syn = synset[0]

                            root = syn.root_hypernyms()
                            logger.debug("root : %s" %  root)

                            hypernyms = syn.hypernyms()
                            logger.debug("hypernyms : %s" %  hypernyms)

                            if len(hypernyms) > 0:
                                hyponyms = syn.hypernyms()[0].hyponyms()
                                logger.debug("hyponyms : %s" %  hyponyms)
                            else:
                                hyponyms = None

                            listSentence.append((word, lemmaWord, pos, root, hypernyms, hyponyms))

                nounSentence = ""
                for word in listSentence:
                    nounSentence += word[1] + " "

                if len(nounSentence) > 2:
                    e = d.addConceptKeyType(nounSentence, "NounSentence")

                    for word in listSentence:
                        f = e.addConceptKeyType(word[0], word[2])
                        f.addConceptKeyType(word[1], "Lemma")

                logger.debug("%s = %s" % (cleanSentence, type(cleanSentence)))
                cleanSentence = cleanSentence.decode("utf-8", errors="ignore")
                pt = parsetree(cleanSentence, relations=True, lemmata=True)

                for sentence in pt:
                    logger.debug("relations: %s" % [x for x in sentence.relations])
                    logger.debug("subject  : %s" % [x for x in sentence.subjects])
                    logger.debug("verb     : %s" % [x for x in sentence.verbs])
                    logger.debug("object   : %s" % [x for x in sentence.objects])

                    if sentence.subjects is not None:
                        logger.debug("Sentence : %s" % sentence.chunks)

                        for chunk in sentence.chunks:
                            logger.debug("Chunk  : %s" % chunk)

                            relation = str(chunk.relation).encode("ascii", errors="ignore").strip()
                            role = str(chunk.role).encode("ascii", errors="ignore").strip()

                            logger.debug("Relation : %s" % relation)
                            logger.debug("Role     : %s" % role)

                            for subject in sentence.subjects:
                                logger.debug("Subject.realtion : %s " % subject.relation)
                                logger.debug("Subject : %s " % subject.string)
                                f = e.addConceptKeyType(subject.string, "SBJ")

                                for verb in sentence.verbs:
                                    if verb.relation == subject.relation:
                                        logger.debug("Verb.realtion : %s " % verb.relation)
                                        logger.debug("Verb   : %s " % verb.string)
                                        g = f.addConceptKeyType(verb.string, "VP")

                                        for obj in sentence.objects:
                                            if obj.relation == verb.relation:
                                                logger.debug("Obj.realtion : %s " % obj.relation)
                                                logger.debug("Object : %s " % obj.string)
                                                g.addConceptKeyType(obj.string, "OBJ")

        Concepts.saveConcepts(self.chunkConcepts, self.chunkFile)
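createChunks walks each noun into WordNet: synsets() looks the word up, hypernyms() climbs one level toward the root, root_hypernyms() jumps to the top, and the hyponyms of the first hypernym are the word's siblings. A small standalone sketch of those lookups; it needs the NLTK wordnet corpus downloaded:

from nltk.corpus import wordnet as wn

synsets = wn.synsets("document", pos="n")
if synsets:
    syn = synsets[0]
    print(syn.hypernyms())        # one level up the is-a hierarchy
    print(syn.root_hypernyms())   # top of the hierarchy
    if syn.hypernyms():
        # children of the parent, i.e. "sibling" concepts
        print(syn.hypernyms()[0].hyponyms())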
Example #17
class PPTXCreateArchil(object):

    graph = None

    dictNodes = None
    dictEdges = None

    dictText = None
    dictNodeXY = None
    dictTextXY = None

    def __init__(self, fileCrawl, fileArchimate):
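        # python-pptx reports shape geometry in English Metric Units: 914400 EMU per inch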
        self.EMU = 914400.0
        self.fileArchimate = fileArchimate

        self.path_to_presentation = fileCrawl

        self.dictNodes = dict()
        self.dictEdges = dict()
        self.dictText = dict()

        self.dictNodeXY = dict()
        self.dictTextXY = dict()

        self.al = ArchiLib(fileArchimate)

        self.graph = GraphVizGraph()
        # self.graph = NetworkXGraph()
        # self.graph = PatternGraph()

        self.prs = Presentation(self.path_to_presentation)

        self.concepts = Concepts(u"Application", u"Relations")

    def addGraphNodes(self, concepts, n=0):
        n += 1
        for c in concepts.getConcepts().values():

            logger.debug(u"%s[%d]" % (c.name, len(c.name)))

            if len(c.name.strip(u" ")) == 0:
                return

            if not (c.typeName in (u"Source", u"Target")):
                return

            logger.debug(u"%d : %d Node c : %s:%s" %
                         (n, len(c.getConcepts()), c.name, c.typeName))

            self.graph.addConcept(c)
            if len(c.getConcepts()) != 0:
                self.addGraphNodes(c, n)

    def addGraphEdges(self, concepts, n=0):
        n += 1
        i = 1
        for c in concepts.getConcepts().values():

            if (c.name in (u"l", u"h", u"t", u"w")):
                return

            logger.debug(u"%d : %d Edge c : %s:%s" %
                         (n, len(c.getConcepts()), c.name, c.typeName))
            if i == 1:
                p = c
                i += 1
            else:
                self.graph.addEdge(p, c)
            if len(c.getConcepts()) != 0:
                self.addGraphEdges(c, n)

    def graphConcepts(self, concepts, graph=None):

        logger.info(u"Adding nodes the graph ...")
        self.addGraphNodes(concepts)

        logger.info(u"Adding edges the graph ...")
        self.addGraphEdges(concepts)

        if isinstance(graph, GraphVizGraph):
            filename = u"example.png"
            graph.exportGraph(filename=filename)
            logger.info(u"Saved Graph - %s" % filename)

        if isinstance(graph, Neo4JGraph):
            graph.setNodeLabels()

        if isinstance(graph, NetworkXGraph):
            graph.drawGraph(u"concepts.png")

            filename = u"concepts.net"
            logger.info(u"Saving Pajek - %s" % filename)
            graph.saveGraphPajek(filename)

            graph.saveGraph(u"concepts.gml")
            logger.info(u"Saving Graph - %s" % u"concepts.gml")

        if isinstance(graph, PatternGraph):
            logger.info(u"Exporting Graph")
            graph.exportGraph()

    def findID(self, nid, d=None):
        # search the node dictionary by default; a text dictionary may be passed in
        if d is None:
            d = self.dictNodes
        try:
            for x in d.keys():
                logger.debug(u"    d[%s] : %s" % (d[x], x))

                if nid in d[x]:
                    logger.debug(u"Found %s in %s" % (x, d[x]))
                    return x
        except:
            em = format_exc().split('\n')[-2]
            logger.warn(u"findID : Warning: %s" % (em))

        return None

    def findXY(self, nid, d):

        ld = list()

        try:
            ld = d[nid]

            logger.debug(u"ld : %s" % ld)

        except:
            pass

        return ld

    def logList(self, l, n=0):

        n += 1
        s = " " * n

        logger.info(u"%sn=%d" % (s, n))

        for x in l:
            # logger.info("%sx=%s" % (s, x))
            if isinstance(x, list):
                logger.info(u"%slist: %s" % (s, x))
                self.logList(x, n)
            elif isinstance(x, dict):
                logger.info(u"%sdict: %s" % (s, x))
                self.logList(x, n)
            elif isinstance(x, tuple):
                logger.info(u"%stuple: %s" % (s, x))
                self.logList(x, n)
            else:
                if isinstance(x, str):
                    logger.info(u"%sstr: %s" % (s, x))
                elif isinstance(x, float):
                    logger.info(u"%sfloat: %3.2f" % (s, x))
                elif isinstance(x, int):
                    logger.info(u"%sint: %d" % (s, x))

    def shapeText(self, shape):
        name = u""
        if shape.has_text_frame:
            text_frame = shape.text_frame

            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    logger.debug(u"%s" % run.text)
                    name = name + run.text + u" "

        return name

    def shapeDim(self, shape):
        t = shape.top / self.EMU
        l = shape.left / self.EMU
        h = shape.height / self.EMU
        w = shape.width / self.EMU

        nid = shape.id

        dictDim = dict()
        dictDim[u"t"] = t
        dictDim[u"l"] = l
        dictDim[u"h"] = h
        dictDim[u"w"] = w

        self.dictNodeXY[nid] = dictDim

        logger.debug(u"shape.top     : %3.2f" % (t))
        logger.debug(u"shape.left    : %3.2f" % (l))
        logger.debug(u"shape.height  : %3.2f" % (h))
        logger.debug(u"shape.width   : %3.2f" % (w))
        logger.debug(u"shape.shape_type    : %s" % shape.shape_type)

        return nid, t, l, h, w

    def addDictNodes(self, nid, name):

        name = unicode(name).rstrip(u" ").lstrip(u" ")

        if not (len(name) > 0):
            logger.warn(u"No Name!")
            return

        if name in self.dictNodes:
            nl = self.dictNodes[name]
            nl.append(nid)
            logger.debug(u"Duplicate Keys %s...%s" %
                         (name, self.dictNodes[name]))
        else:
            nl = list()
            nl.append(nid)
            self.dictNodes[name] = nl

    def addDictEdges(self, nid, xl):
        nxl = list()
        for x in xl:
            nxl.append(int(x))
            logger.debug(u"%d:%s" % (nid, x))

        lenNXL = len(nxl)

        #
        # Only add connections between two nodes
        #
        if lenNXL == 3:
            if nid in self.dictEdges:
                nl = self.dictEdges[nid]
                nl.append(nxl)
                logger.debug(u"Duplicate Edges ...%s" % (self.dictEdges[nid]))
            else:
                el = list()
                el.append(nxl)
                self.dictEdges[nid] = el
        else:
            logger.debug(u"Only %d Connectors!" % (len(nxl)))

        return lenNXL

    def showConcepts(self, concepts):
        n = 0
        for x in concepts.getConcepts().values():
            n += 1
            logger.info(u"x %s[%s]" % (x.name, x.typeName))
            for y in x.getConcepts().values():
                logger.info(u"  y %s[%s]" % (y.name, y.typeName))
                for z in y.getConcepts().values():
                    if not (z.name in (u"h", u"l", u"t", u"w")):
                        logger.info(u"    z  %s[%s]" % (z.name, z.typeName))

    def getPoint(self, d):

        t = d[u"t"]
        l = d[u"l"]
        h = d[u"h"]
        w = d[u"w"]

        py = t + (h / 2.0)
        px = l + (h / 2.0)

        return px, py

    def lineMagnitude(self, x1, y1, x2, y2):
        lineMagnitude = math.sqrt(
            math.pow((x2 - x1), 2) + math.pow((y2 - y1), 2))
        return lineMagnitude

    # Calc minimum distance from a point and a line segment (i.e. consecutive vertices in a polyline).
    def DistancePointLine(self, px, py, x1, y1, x2, y2):
        try:
            # http://local.wasp.uwa.edu.au/~pbourke/geometry/pointline/source.vba
            LineMag = self.lineMagnitude(x1, y1, x2, y2)

            u1 = (((px - x1) * (x2 - x1)) + ((py - y1) * (y2 - y1)))
            u = u1 / (LineMag * LineMag)

            if (u < 0.00001) or (u > 1):
                # closest point does not fall within the line segment, take the shorter distance
                # to an endpoint
                ix = self.lineMagnitude(px, py, x1, y1)
                iy = self.lineMagnitude(px, py, x2, y2)
                if ix > iy:
                    DistancePointLine = iy
                else:
                    DistancePointLine = ix
            else:
                # Intersecting point is on the line, use the formula
                ix = x1 + u * (x2 - x1)
                iy = y1 + u * (y2 - y1)
                DistancePointLine = self.lineMagnitude(px, py, ix, iy)

            return DistancePointLine
        except:
            return 0

    def crawlPPTX(self):

        sNum = 0

        for slide in self.prs.slides:

            logger.debug(u"--new slide--")
            logger.debug(u"%s" % slide.partname)
            logger.debug(u"slideName : %s" % slide.name)

            sNum += 1

            #
            # Get Title of Slide
            #
            titleSlide = u""

            for idx, ph in enumerate(slide.shapes.placeholders):
                # logger.debug ("**    %s:%s    **" % (idx, ph.text))
                if idx == 0:
                    titleSlide = ph.text

            u = self.al.cleanString(titleSlide)

            logger.info(u"%d.%s" % (sNum, u))
            tss = u"%d.%s" % (sNum, u)
            q = self.concepts.addConceptKeyType(tss, u"Slide")

            # showConcepts(concepts)

            #
            # Iterate through slides
            #
            n = 0
            nc = 0
            for shape in slide.shapes:
                logger.debug(u"...%s..." % type(shape))
                logger.debug(u"shape.element.xml : %s" % shape.element.xml)
                logger.debug(u"shape.name : %s[%d]" %
                             (shape.name, shape.id - 1))

                n += 1

                sn = shape.name

                nid = shape.id

                # Get Shape Info
                if shape.name[:5] in (u"Recta", u"Round", u"Strai"):
                    nid, t, l, h, w = self.shapeDim(shape)

                    tl = (l, t)
                    tr = (l + w, t)
                    bl = (l, t + h)
                    br = (l + w, t + h)

                    name = self.shapeText(shape)

                    if len(name) > 1:
                        logger.info(u"  node : %s[%d] - %s" %
                                    (name, nid, shape.name))

                        self.addDictNodes(nid, name)

                        b = q.addConceptKeyType(self.al.cleanString(name),
                                                u"Node")
                        b.addConceptKeyType(u"t", str(t))
                        b.addConceptKeyType(u"l", str(l))
                        b.addConceptKeyType(u"h", str(h))
                        b.addConceptKeyType(u"w", str(w))

                #
                # Add in Connections
                #
                elif sn.find(u"Connector") != -1:

                    xmlShape = shape.element.xml

                    logger.debug(u"xmlShape : %s" % xmlShape)

                    tree = etree.fromstring(xmlShape)

                    xl = tree.xpath(u"//@id")

                    logger.debug(u"xl : %s" % xl)

                    numEdges = self.addDictEdges(nid, xl)

                    if numEdges == 3:
                        nc += 1
                        logger.info(u"  %d Found Edge %d" % (nc, shape.id))

                #
                # Get Text boxes and associate with Connector
                #
                elif shape.name[:8] in (u"Text Box", u"TextBox "):

                    nid, t, l, h, w = self.shapeDim(shape)

                    name = self.shapeText(shape)

                    if name is not None:
                        nxl = list()
                        nxl.append(nid)
                        self.dictText[name] = nxl

                        logger.info(u"  TextBox : %s[%d]" % (name, shape.id))

                else:
                    logger.debug(u"Skipped : %s" % shape.name)

            #
            # Now match the Connector with text
            #

            listEdges = self.dictEdges.values()

            logger.info(u"listEdges : %d" % len(listEdges))

            tbFound = 0
            tbTotal = len(self.dictTextXY)
            logger.info(u"Search for %s Text Box Connector's" %
                        len(self.dictTextXY))

            for txt in self.dictTextXY.keys():
                logger.debug(u"txt : %s[%s]" % (txt, self.dictTextXY[txt]))
                searchText = self.findID(txt, self.dictText)

                logger.info(u"  Search Text : %s" % (searchText))

                # get text point - middle of node
                px, py = self.getPoint(self.dictTextXY[txt])

                cDist = 1000.0
                cNode = None
                csn = None
                ctn = None
                cSl = None
                cTl = None

                # for each node in dictEdges
                ni = 0
                for edge in listEdges:

                    logger.debug(u"  edge: %s" % edge)

                    try:
                        # get source
                        source = edge[0][2]
                        sName = self.findID(source)
                        sl = self.dictNodeXY[source]
                        spx, spy = self.getPoint(sl)

                        # get target
                        target = edge[0][1]
                        tName = self.findID(target)
                        tl = self.dictNodeXY[target]

                        tpx, tpy = self.getPoint(tl)

                        # determine distance between points
                        d = self.DistancePointLine(px, py, spx, spy, tpx, tpy)

                        if d < cDist:
                            cDist = d
                            cNode = edge[0][0]
                            csn = sName
                            ctn = tName
                            cSl = sl
                            cTl = tl

                    except:
                        pass

                if cNode is not None:
                    tbFound += 1
                    logger.debug(u"    Closest Connector : %s" % cNode)
                    logger.info(
                        u"    found[%d:%d] - %s->%s->%s [%2.3f]" %
                        (tbFound, tbTotal, csn, searchText, ctn, cDist))

                    edge = searchText
                    source = csn
                    target = ctn

                    dimSource = cSl
                    dimTarget = cTl

                    if edge is None:
                        edge = u"TBD"

                    d = q.getConcepts()[csn]

                    for ld in dimSource.keys():
                        logger.debug(u"%s %s:%2.3f" %
                                     (source, ld, dimSource[ld]))
                        d.addConceptKeyType(ld, str(dimSource[ld]))

                        f = d.addConceptKeyType(target, u"Target")
                        for ld in dimTarget.keys():
                            logger.debug(u"%s %s:%2.3f" %
                                         (target, ld, dimTarget[ld]))
                            f.addConceptKeyType(ld, str(dimTarget[ld]))

                        f.addConceptKeyType(self.al.cleanString(edge), u"Edge")

            if tbTotal != 0:
                logger.info(u"Found [%3.1f] Text Box Connectors" %
                            ((tbFound / float(tbTotal)) * 100.0))

            self.dictTextXY = dict()  # reset text boxes for the next slide

        return self.concepts
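DistancePointLine above is the classic point-to-segment distance: project the point onto the infinite line, and when the projection parameter u falls outside [0, 1], fall back to the nearer endpoint (the original returns 0 from the bare except, which also swallows zero-length segments). An equivalent standalone version of the same math, with the degenerate case handled explicitly:

import math

def pointSegmentDistance(px, py, x1, y1, x2, y2):
    dx, dy = x2 - x1, y2 - y1
    segLen2 = float(dx * dx + dy * dy)
    if segLen2 == 0.0:                     # degenerate segment: distance to the point
        return math.hypot(px - x1, py - y1)
    u = ((px - x1) * dx + (py - y1) * dy) / segLen2
    u = max(0.0, min(1.0, u))              # clamp the projection onto the segment
    ix, iy = x1 + u * dx, y1 + u * dy
    return math.hypot(px - ix, py - iy)

# pointSegmentDistance(0, 1, -1, 0, 1, 0) == 1.0   (above the middle)
# pointSegmentDistance(3, 0, -1, 0, 1, 0) == 2.0   (beyond the right endpoint)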
Example #18
class DependancyAnalysis(object):

    def __init__(self, fileArchimate):

        self.fileArchimate = fileArchimate

        logger.info(u"Using : %s" % self.fileArchimate)

        self.al = ArchiLib(fileArchimate)

        self.concepts = Concepts(u"BusinessProcess", u"archimate:BusinessProcess")

    # "Batches" are sets of tasks that can be run together
    def get_task_batches(self, nodes):

        # Build a map of node names to node instances
        name_to_instance = dict((n.name, n) for n in nodes )

        for x in name_to_instance.keys():
            logger.debug(u"name_to_instance[%s]=%s : %s" % (x, name_to_instance[x].name, name_to_instance[x].depends))

        # Build a map of node names to dependency names
        name_to_deps = dict( (n.name, set(n.depends)) for n in nodes )

        for x in name_to_deps.keys():
            logger.debug(u"name_to_deps[%s]=%s" % (x, name_to_deps[x]))

        # This is where we'll store the batches
        batches = []

        n = 0
        # While there are dependencies to solve...
        while name_to_deps:
            logger.info(u"length %d" % len(name_to_deps))

            # Get all nodes with no dependencies
            ready = {name for name, deps in name_to_deps.iteritems() if not deps}

            n += 1
            logger.info(u"iteration : %d" % n)
            for x in ready:
                logger.info(u"No Dep  %s" % (x))

            # If there aren't any, we have a loop in the graph
            if not ready:
                msg  = u"Circular dependencies found!\n"
                msg += self.format_dependencies(name_to_deps)
                raise ValueError(msg)

            # Remove them from the dependency graph
            for name in ready:
                del name_to_deps[name]
            for deps in name_to_deps.itervalues():
                deps.difference_update(ready)

            # Add the batch to the list
            batches.append( {name_to_instance[name] for name in ready} )

        # Return the list of batches
        return batches

    # Format a dependency graph for printing
    def format_dependencies(self, name_to_deps):
        msg = []
        for name, deps in name_to_deps.iteritems():
            for parent in deps:
                msg.append(u"%s -> %s" % (name, parent))
        return "\n".join(msg)

    # Create and format a dependency graph for printing
    def format_nodes(self, nodes):
        return self.format_dependencies(dict( (n.name, n.depends) for n in nodes ))


    def findConcept(self, concepts, name, n=0):
        n += 1
        c = None

        if n == 3:
            return c

        for x in concepts.getConcepts().values():
            if x.name == name:
                return x
            else:
                c = self.findConcept(x, name, n)
        return c

    def getWords(self, s, concepts):
        lemmatizer = WordNetLemmatizer()

        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(s)):
            if len(word) > 1 and pos[0] == u"N":
                lemmaWord = lemmatizer.lemmatize(word.lower())
                e = concepts.addConceptKeyType(lemmaWord, u"Word")
                f = e.addConceptKeyType(pos, u"POS")

    def collectDependancyAnalysisNodes(self):

        count = 0
        listTSort = list()
        for x in self.al.dictEdges.keys():
            logger.debug(u"[%s]=%s" % (self.al.dictEdges[x][u"id"], x))

            if u"source" in self.al.dictEdges[x]:
                source = self.al.dictEdges[x][u"source"]
                target = self.al.dictEdges[x][u"target"]

                logger.debug(u"  Rel    : %s" % (self.al.dictEdges[x][ARCHI_TYPE]))

                if self.al.dictEdges[x][ARCHI_TYPE] in (u"archimate:FlowRelationship"):

                    # al.countNodeType(al.dictNodes[source][ARCHI_TYPE])
                    # al.countNodeType(al.dictNodes[target][ARCHI_TYPE])
                    # al.countNodeType(al.dictEdges[x][ARCHI_TYPE])

                    if (self.al.dictNodes[source][ARCHI_TYPE] == u"archimate:BusinessProcess") and \
                            self.al.dictNodes[target][ARCHI_TYPE] == u"archimate:BusinessProcess":

                        sourceName = self.al.getNodeName(source)
                        targetName = self.al.getNodeName(target)

                        if sourceName[0].isdigit() or targetName[0].isdigit():
                            continue

                        logger.debug(u" %s:%s" % (sourceName, targetName))

                        l = list()

                        sc = self.findConcept(self.concepts, sourceName)
                        if sc is None:
                            logger.debug(u"New Target - %s" % sourceName)
                            sc = self.concepts.addConceptKeyType(self.al.getNodeName(source), u"Source")
                            self.getWords(sourceName, sc)
                        else:
                            logger.debug(u"Prior Target %s" % sourceName)

                        tc = self.findConcept(self.concepts, targetName)
                        if tc is None:
                            logger.debug(u"New Target %s" % targetName)
                            tc = sc.addConceptKeyType(self.al.getNodeName(target), u"Target")
                            self.getWords(targetName, tc)
                        else:
                            logger.debug(u"Prior Target %s" % targetName)
                            sc.addConcept(tc)

                        l.append(target)
                        l.append(source)
                        listTSort.append(l)

        logger.debug(u"Edges = %s" % listTSort)

        Concepts.saveConcepts(self.concepts, fileConceptsTraversal)

        self.dependancyAnalysis(listTSort)

        return self.concepts, listTSort

    def dependancyAnalysis(self, listTSort):

        index = 0
        for x in listTSort:
            logger.debug(u"%d %s[%s] -%s-> %s[%s]" % (index, self.al.dictNodes[x[0]][u"name"], self.al.dictNodes[x[0]][ARCHI_TYPE], u"UsedBy",
                                                    self.al.dictNodes[x[1]][u"name"], self.al.dictNodes[x[1]][ARCHI_TYPE]))
            index += 1

            self.al.addToNodeDict(self.al.dictNodes[x[0]][u"name"], self.al.dictBP)
            self.al.addToNodeDict(self.al.dictNodes[x[1]][u"name"], self.al.dictBP)

        logger.info(u"Topic Sort Candidates : %d" % (len(listTSort)))

        nodes = list()
        index = 0
        dictTasks = dict()
        for x in listTSort:
            sname = self.al.dictNodes[x[0]][u"name"]
            tname = self.al.dictNodes[x[1]][u"name"]
            index += 1
            logger.debug(u"%d %s -%s-> %s" % (index, sname, u"UsedBy", tname))

            if sname in dictTasks:
                ln = dictTasks[sname]
                ln.append(tname)
            else:
                ln = list()
                ln.append(tname)
                dictTasks[sname] = ln

        for x in dictTasks.keys():
            logger.debug(u"dictTasks[%s]=%s" % (x, dictTasks[x]))
            a = Task(x, dictTasks[x])
            nodes.append(a)

        for x in self.al.dictBP.keys():
            # for x in listBP:
            if x not in dictTasks:
                logger.debug(u"Add %s" % (x))
                a = Task(x, list())
                nodes.append(a)

        self.format_nodes(nodes)

        conceptBatches = Concepts(u"Batch", u"archimate:WorkPackage")

        n = 0
        logger.info(u"Batches:")
        batches = self.get_task_batches(nodes)
        for bundle in batches:
            n += 1
            name = u"Batch %d" % n
            c = conceptBatches.addConceptKeyType(name, u"archimate:WorkPackage")
            for node in bundle:
                c.addConceptKeyType(node.name, u"archimate:BusinessProcess")

            logger.info(u"%d : %s" % (n, ", ".join(node.name.lstrip() for node in bundle)))

        Concepts.saveConcepts(conceptBatches, fileConceptsBatches)

        return conceptBatches
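get_task_batches above is Kahn's topological sort grouped into levels: repeatedly peel off every node whose dependency set is empty, emit those as one batch, and remove them from the remaining dependency sets; if a pass finds nothing ready, the graph contains a cycle. The same loop over plain dicts, as a minimal standalone sketch:

def batches(name_to_deps):
    # name_to_deps: {task: set of tasks it depends on}
    name_to_deps = {k: set(v) for k, v in name_to_deps.items()}
    result = []
    while name_to_deps:
        ready = {name for name, deps in name_to_deps.items() if not deps}
        if not ready:
            raise ValueError("Circular dependencies found!")
        for name in ready:
            del name_to_deps[name]
        for deps in name_to_deps.values():
            deps.difference_update(ready)
        result.append(ready)
    return result

# batches({"c": {"a", "b"}, "b": {"a"}, "a": set()}) -> [{"a"}, {"b"}, {"c"}]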
Example #19
class Chunks(object):

    conceptFile = fileConceptsDocuments
    chunkFile = fileConceptsChunks
    concepts = None
    chunkConcepts = None

    def __init__(self, concepts=None):
        if concepts is None:
            logger.info(u"Loading : %s" % self.conceptFile)
            self.concepts = Concepts.loadConcepts(self.conceptFile)
        else:
            logger.info(u"Using   : %s" % concepts.name)
            self.concepts = concepts

        self.chunkConcepts = Concepts(u"Chunk", u"Chunks")

    def getChunkConcepts(self):
        return self.chunkConcepts

    def createChunks(self):
        stop = stopwords.words(u'english')
        # extend the standard stop list with tokens common in these documents
        stop.extend([u"This", u"The", u",", u".", u"..", u"...", u";", u"and"])

        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        tokenizer = RegexpTokenizer(u"[\w]+")

        for document in self.concepts.getConcepts().values():
            logger.info(u"%s" % document.name)
            d = self.chunkConcepts.addConceptKeyType(document.name, u"Document")

            for sentence in document.getConcepts().values():
                logger.debug(u"%s(%s)" % (sentence.name, sentence.typeName))
                cleanSentence = ' '.join([word for word in sentence.name.split() if word not in stop])

                listSentence = list()

                for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                    logger.debug(u"Word: " + word + u" POS: " + pos)

                    if pos[:1] == u"N":
                    #if True:
                        lemmaWord = lemmatizer.lemmatize(word)
                        logger.debug(u"Word: " + word + u" Lemma: " + lemmaWord)

                        morphWord = wn.morphy(word, wn.NOUN)
                        if morphWord is not None:
                            logger.debug(u"Word: " + word + u" Morph: " + morphWord)

                        synset = wn.synsets(word, pos=u'n')
                        logger.debug(u"synset : %s" % synset)

                        if len(synset) != 0:
                            syn = synset[0]

                            root = syn.root_hypernyms()
                            logger.debug(u"root : %s" % root)

                            mh = syn.member_holonyms()
                            logger.debug(u"member_holonyms : %s" % mh)

                            hypernyms = syn.hypernyms()
                            logger.debug(u"hypernyms : %s" % hypernyms)

                            if len(hypernyms) > 0:
                                hyponyms = hypernyms[0].hyponyms()
                                logger.debug(u"hyponyms : %s" % hyponyms)
                            else:
                                hyponyms = None

                            listSentence.append((word, lemmaWord, pos, root, hypernyms, hyponyms))

                nounSentence = u""
                for word in listSentence:
                    nounSentence += word[1] + u" "

                if len(nounSentence) > 2:
                    e = d.addConceptKeyType(nounSentence, u"NounSentence")

                    for word in listSentence:
                        f = e.addConceptKeyType(word[0], word[2])
                        f.addConceptKeyType(word[1], u"Lemma")

                logger.debug(u"%s = %s" % (cleanSentence, type(cleanSentence)))
                cleanSentence = unicode(cleanSentence)
                pt = parsetree(cleanSentence, relations=True, lemmata=True)

                for sentence in pt:
                    logger.debug(u"relations: %s" % [x for x in sentence.relations])
                    logger.debug(u"subject  : %s" % [x for x in sentence.subjects])
                    logger.debug(u"verb     : %s" % [x for x in sentence.verbs])
                    logger.debug(u"object   : %s" % [x for x in sentence.objects])

                    if sentence.subjects is not None and e is not None:
                        logger.debug(u"Sentence : %s" % sentence.chunks)

                        for chunk in sentence.chunks:
                            logger.debug(u"Chunk  : %s" % chunk)

                            relation = unicode(chunk.relation).strip()
                            role = unicode(chunk.role).strip()

                            logger.debug(u"Relation : %s" % relation)
                            logger.debug(u"Role     : %s" % role)

                            for subject in sentence.subjects:
                                logger.debug(u" Subject.realtion : %s " % subject.relation)
                                logger.debug(u" Subject : %s " % subject.string)
                                f = e.addConceptKeyType(subject.string, u"SBJ")

                                for verb in sentence.verbs:
                                    if verb.relation == subject.relation:
                                        logger.debug(u"  Verb.realtion : %s " % verb.relation)
                                        logger.debug(u"  Verb   : %s " % verb.string)
                                        g = f.addConceptKeyType(verb.string, u"VP")

                                        for obj in sentence.objects:
                                            if obj.relation == verb.relation:
                                                logger.debug(u"    Obj.realtion : %s " % obj.relation)
                                                logger.debug(u"    Object : %s " % obj.string)
                                                g.addConceptKeyType(obj.string, u"OBJ")

        Concepts.saveConcepts(self.chunkConcepts, self.chunkFile)
        logger.info(u"Saved : %s" % self.chunkFile)
Example #21
class PPTXCreateArchil(object):

    graph      = None

    dictNodes  = None
    dictEdges  = None

    dictText   = None
    dictNodeXY = None
    dictTextXY = None

    def __init__(self, fileCrawl, fileArchimate):
        self.EMU = 914400.0
        self.fileArchimate = fileArchimate

        self.path_to_presentation = fileCrawl

        self.dictNodes = dict()
        self.dictEdges = dict()
        self.dictText  = dict()

        self.dictNodeXY = dict()
        self.dictTextXY = dict()

        self.al = ArchiLib(fileArchimate)

        self.graph = GraphVizGraph()
        # self.graph = NetworkXGraph()
        # self.graph = PatternGraph()

        self.prs = Presentation(self.path_to_presentation)

        self.concepts = Concepts(u"Application", u"Relations")


    def addGraphNodes(self, concepts, n=0):
        n += 1
        for c in concepts.getConcepts().values():

            logger.debug(u"%s[%d]" % (c.name, len(c.name)))

            if len(c.name.strip(u" ")) == 0:
                return

            if not (c.typeName in (u"Source", u"Target")):
                return

            logger.debug(u"%d : %d Node c : %s:%s" % (n, len(c.getConcepts()), c.name, c.typeName))

            self.graph.addConcept(c)
            if len(c.getConcepts()) != 0:
                self.addGraphNodes(c, n)

    def addGraphEdges(self, concepts, n=0):
        n += 1
        i = 1
        for c in concepts.getConcepts().values():

            if (c.name in (u"l", u"h", u"t", u"w")):
                return

            logger.debug(u"%d : %d Edge c : %s:%s" % (n, len(c.getConcepts()), c.name, c.typeName))
            if i == 1:
                p = c
                i += 1
            else:
                self.graph.addEdge(p, c)
            if len(c.getConcepts()) != 0:
                self.addGraphEdges(c, n)

    def graphConcepts(self, concepts, graph=None):

        if graph is None:
            graph = self.graph

        logger.info(u"Adding nodes to the graph ...")
        self.addGraphNodes(concepts)

        logger.info(u"Adding edges to the graph ...")
        self.addGraphEdges(concepts)

        if isinstance(graph, GraphVizGraph):
            filename = u"example.png"
            graph.exportGraph(filename=filename)
            logger.info(u"Saved Graph - %s" % filename)

        if isinstance(graph, Neo4JGraph):
            graph.setNodeLabels()

        if isinstance(graph, NetworkXGraph):
            graph.drawGraph(u"concepts.png")

            filename = u"concepts.net"
            logger.info(u"Saving Pajek - %s" % filename)
            graph.saveGraphPajek(filename)

            graph.saveGraph(u"concepts.gml")
            logger.info(u"Saving Graph - %s" % u"concepts.gml")

        if isinstance(graph, PatternGraph):
            logger.info(u"Exporting Graph")
            graph.exportGraph()

    def findID(self, nid, d=None):
        # search the given name -> [ids] dict; dictNodes by default, but the
        # text-box matching below passes dictText instead
        if d is None:
            d = self.dictNodes
        try:
            for x in d.keys():
                logger.debug(u"    d[%s] : %s" % (d[x], x))

                if nid in d[x]:
                    logger.debug(u"Found %s in %s" % (x, d[x]))
                    return x
        except:
            em = format_exc().split('\n')[-2]
            logger.warn(u"findID : Warning: %s" % (em))

        return None
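
A standalone illustration of the reverse lookup findID performs; dictNodes maps a shape name to the list of shape ids carrying that name, and the sample data here is made up:

dictNodes = {u"Billing": [7, 12], u"CRM": [3]}
match = next((name for name, ids in dictNodes.items() if 7 in ids), None)
assert match == u"Billing"
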

    def findXY(self, nid, d):

        ld = list()

        try:
            ld = d[nid]

            logger.debug(u"ld : %s" % ld)

        except:
            pass

        return ld

    def logList(self, l, n=0):

        n += 1
        s = " " * n

        logger.info(u"%sn=%d" % (s, n))

        for x in l:
            # logger.info("%sx=%s" % (s, x))
            if isinstance(x, list):
                logger.info(u"%slist: %s" % (s, x))
                self.logList(x, n)
            elif isinstance(x, dict):
                logger.info(u"%sdict: %s" % (s, x))
                self.logList(x, n)
            elif isinstance(x, tuple):
                logger.info(u"%stuple: %s" % (s, x))
                self.logList(x, n)
            else:
                if isinstance(x, basestring):
                    logger.info(u"%sstr: %s" % (s, x))
                elif isinstance(x, float):
                    logger.info(u"%sfloat: %3.2f" % (s, x))
                elif isinstance(x, int):
                    logger.info(u"%sint: %d" % (s, x))

    def shapeText(self, shape):
        name = u""
        if shape.has_text_frame:
            text_frame = shape.text_frame

            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    logger.debug(u"%s" % run.text)
                    name = name + run.text + u" "

        return name

    def shapeDim(self, shape):
        t = shape.top / self.EMU
        l = shape.left / self.EMU
        h = shape.height / self.EMU
        w = shape.width / self.EMU

        nid = shape.id

        dictDim = dict()
        dictDim[u"t"] = t
        dictDim[u"l"] = l
        dictDim[u"h"] = h
        dictDim[u"w"] = w

        self.dictNodeXY[nid] = dictDim

        logger.debug(u"shape.top     : %3.2f" % (t))
        logger.debug(u"shape.left    : %3.2f" % (l))
        logger.debug(u"shape.height  : %3.2f" % (h))
        logger.debug(u"shape.width   : %3.2f" % (w))
        logger.debug(u"shape.shape_type    : %s" % shape.shape_type)

        return nid, t, l, h, w

    def addDictNodes(self, nid, name):

        name = unicode(name).rstrip(u" ").lstrip(u" ")

        if len(name) == 0:
            logger.warn(u"No Name!")
            return

        if name in self.dictNodes:
            nl = self.dictNodes[name]
            nl.append(nid)
            logger.debug(u"Duplicate Keys %s...%s" % (name, self.dictNodes[name]))
        else:
            nl = list()
            nl.append(nid)
            self.dictNodes[name] = nl

    def addDictEdges(self, nid, xl):
        nxl = list()
        for x in xl:
            nxl.append(int(x))
            logger.debug(u"%d:%s" % (nid, x))

        lenNXL = len(nxl)

        #
        # Only add connections between two nodes: the connector XML yields
        # three ids - (connector id, target id, source id)
        #
        if lenNXL == 3:
            if nid in self.dictEdges:
                nl = self.dictEdges[nid]
                nl.append(nxl)
                logger.debug(u"Duplicate Edges ...%s" % (self.dictEdges[nid]))
            else:
                el = list()
                el.append(nxl)
                self.dictEdges[nid] = el
        else:
            logger.debug(u"Only %d Connectors!" % lenNXL)

        return lenNXL

    def showConcepts(self, concepts):
        n = 0
        for x in concepts.getConcepts().values():
            n += 1
            logger.info(u"x %s[%s]" % (x.name, x.typeName))
            for y in x.getConcepts().values():
                logger.info(u"  y %s[%s]" % (y.name, y.typeName))
                for z in y.getConcepts().values():
                    if not (z.name in (u"h", u"l", u"t", u"w")):
                        logger.info(u"    z  %s[%s]" % (z.name, z.typeName))

    def getPoint(self, d):

        t = d[u"t"]
        l = d[u"l"]
        h = d[u"h"]
        w = d[u"w"]

        # center of the shape: y comes from the height, x from the width
        py = t + (h / 2.0)
        px = l + (w / 2.0)

        return px, py

    def lineMagnitude(self, x1, y1, x2, y2):
        return math.sqrt(math.pow((x2 - x1), 2) + math.pow((y2 - y1), 2))

    # Calc minimum distance from a point and a line segment (i.e. consecutive vertices in a polyline).
    def DistancePointLine(self, px, py, x1, y1, x2, y2):
        try:
            # http://local.wasp.uwa.edu.au/~pbourke/geometry/pointline/source.vba
            LineMag = self.lineMagnitude(x1, y1, x2, y2)

            if LineMag < 0.00000001:
                # degenerate segment: fall back to point-to-point distance
                return self.lineMagnitude(px, py, x1, y1)

            u1 = (((px - x1) * (x2 - x1)) + ((py - y1) * (y2 - y1)))
            u = u1 / (LineMag * LineMag)

            if (u < 0.00001) or (u > 1):
                # closest point does not fall within the line segment, take the shorter distance
                # to an endpoint
                ix = self.lineMagnitude(px, py, x1, y1)
                iy = self.lineMagnitude(px, py, x2, y2)
                if ix > iy:
                    DistancePointLine = iy
                else:
                    DistancePointLine = ix
            else:
                # Intersecting point is on the line, use the formula
                ix = x1 + u * (x2 - x1)
                iy = y1 + u * (y2 - y1)
                DistancePointLine = self.lineMagnitude(px, py, ix, iy)

            return DistancePointLine
        except:
            return 0
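
A quick standalone check of the point-to-segment distance used above: the point (1, 1) against the horizontal segment (0, 0)-(2, 0) projects onto (1, 0), so the distance should be exactly 1.0.

import math

def segmentDistance(px, py, x1, y1, x2, y2):
    # same algorithm as DistancePointLine, lifted out of the class for testing
    mag = math.hypot(x2 - x1, y2 - y1)
    if mag < 0.00000001:
        return math.hypot(px - x1, py - y1)
    u = ((px - x1) * (x2 - x1) + (py - y1) * (y2 - y1)) / (mag * mag)
    if u < 0.00001 or u > 1:
        # closest point is an endpoint
        return min(math.hypot(px - x1, py - y1), math.hypot(px - x2, py - y2))
    return math.hypot(px - (x1 + u * (x2 - x1)), py - (y1 + u * (y2 - y1)))

assert abs(segmentDistance(1.0, 1.0, 0.0, 0.0, 2.0, 0.0) - 1.0) < 1e-9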

    def crawlPPTX(self):

        sNum = 0

        for slide in self.prs.slides:

            logger.debug(u"--new slide--")
            logger.debug(u"%s" % slide.partname)
            logger.debug(u"slideName : %s" % slide.name)

            sNum += 1

            #
            # Get Title of Slide
            #
            titleSlide = u""

            for idx, ph in enumerate(slide.shapes.placeholders):
                # logger.debug ("**    %s:%s    **" % (idx, ph.text))
                if idx == 0:
                    titleSlide = ph.text

            cleanTitle = self.al.cleanString(titleSlide)

            logger.info(u"%d.%s" % (sNum, cleanTitle))
            tss = u"%d.%s" % (sNum, cleanTitle)
            q = self.concepts.addConceptKeyType(tss, u"Slide")

            # showConcepts(concepts)

            #
            # Iterate through slides
            #
            n = 0
            nc = 0
            for shape in slide.shapes:
                logger.debug(u"...%s..." % type(shape))
                logger.debug(u"shape.element.xml : %s" % shape.element.xml)
                logger.debug(u"shape.name : %s[%d]" % (shape.name,  shape.id - 1))

                n += 1

                sn = shape.name

                nid = shape.id

                # Get Shape Info
                if shape.name[:5] in (u"Recta", u"Round", u"Strai"):
                    nid, t, l, h, w = self.shapeDim(shape)

                    tl = (l, t)
                    tr = (l + w, t)
                    bl = (l, t + h)
                    br = (l + w, t + h)

                    name = self.shapeText(shape)

                    if len(name) > 1:
                        logger.info(u"  node : %s[%d] - %s" % (name, nid, shape.name))

                        self.addDictNodes(nid, name)

                        b = q.addConceptKeyType(self.al.cleanString(name), u"Node")
                        b.addConceptKeyType(u"t", str(t))
                        b.addConceptKeyType(u"l", str(l))
                        b.addConceptKeyType(u"h", str(h))
                        b.addConceptKeyType(u"w", str(w))

                #
                # Add in Connections
                #
                elif sn.find(u"Connector") != -1:

                    xmlShape = shape.element.xml

                    logger.debug(u"xmlShape : %s" % xmlShape)

                    tree = etree.fromstring(xmlShape)

                    xl = tree.xpath(u"//@id")

                    logger.debug(u"xl : %s" % xl)

                    numEdges = self.addDictEdges(nid, xl)

                    if numEdges == 3:
                        nc += 1
                        logger.info(u"  %d Found Edge %d" % (nc, shape.id))

                #
                # Get Text boxes and associate with Connector
                #
                elif shape.name[:8] in (u"Text Box", u"TextBox "):

                    nid, t, l, h, w = self.shapeDim(shape)

                    name = self.shapeText(shape)

                    if len(name.strip()) > 0:
                        nxl = list()
                        nxl.append(nid)
                        self.dictText[name] = nxl

                        logger.info(u"  TextBox : %s[%d]" % (name, shape.id))

                else:
                    logger.debug(u"Skipped : %s" % shape.name)

            #
            # Now match the Connector with text
            #

            listEdges = self.dictEdges.values()

            logger.info(u"listEdges : %d" % len(listEdges))

            tbFound = 0
            tbTotal = len(self.dictTextXY)
            logger.info(u"Search for %d Text Box Connectors" % tbTotal)

            for txt in self.dictTextXY.keys():
                logger.debug(u"txt : %s[%s]" % (txt, self.dictTextXY[txt]))
                searchText = self.findID(txt, self.dictText)

                logger.info(u"  Search Text : %s" % (searchText))

                # get text point - middle of node
                px, py = self.getPoint(self.dictTextXY[txt])

                cDist = 1000.0
                cNode = None
                csn = None
                ctn = None
                csl = None
                ctl = None

                # for each node in dictEdges
                for edge in listEdges:

                    logger.debug(u"  edge: %s" % edge)

                    try:
                        # get source
                        source = edge[0][2]
                        sName = self.findID(source)
                        sl = self.dictNodeXY[source]
                        spx, spy = self.getPoint(sl)

                        # get target
                        target = edge[0][1]
                        tName = self.findID(target)
                        tl = self.dictNodeXY[target]

                        tpx, tpy = self.getPoint(tl)

                        # determine distance between points
                        d = self.DistancePointLine(px, py, spx, spy, tpx, tpy)

                        if d < cDist:
                            # remember the closest edge and both of its endpoints
                            cDist = d
                            cNode = edge[0][0]
                            csn = sName
                            ctn = tName
                            csl = sl
                            ctl = tl

                    except:
                        pass

                if cNode is not None:
                    tbFound += 1
                    logger.debug(u"    Closest Connector : %s" % cNode)
                    logger.info(u"    found (%d:%d) - %s->%s->%s [%2.3f]" % (tbFound, tbTotal, csn, searchText, ctn, cDist))

                    edge = searchText
                    if edge is None:
                        edge = u"TBD"

                    dimSource = csl
                    dimTarget = ctl

                    d = q.getConcepts()[csn]

                    for ld in dimSource.keys():
                        logger.debug(u"%s %s:%2.3f" % (csn, ld, dimSource[ld]))
                        d.addConceptKeyType(ld, str(dimSource[ld]))

                    f = d.addConceptKeyType(ctn, u"Target")
                    for ld in dimTarget.keys():
                        logger.debug(u"%s %s:%2.3f" % (ctn, ld, dimTarget[ld]))
                        f.addConceptKeyType(ld, str(dimTarget[ld]))

                    f.addConceptKeyType(self.al.cleanString(edge), u"Edge")

            if tbTotal != 0:
                logger.info(u"Found [%3.1f%%] Text Box Connectors" % ((tbFound / float(tbTotal)) * 100.0))

            self.dictTextXY = dict()

        return self.concepts
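
A hedged usage sketch for PPTXCreateArchil; the .pptx path and the pickle name are placeholders, while fileArchimateTest is the model file used elsewhere in this listing:

pptx = PPTXCreateArchil(u"architecture.pptx", fileArchimateTest)
concepts = pptx.crawlPPTX()
pptx.graphConcepts(concepts, graph=pptx.graph)
Concepts.saveConcepts(concepts, u"pptx.p")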
Example #22
def logSemanticTerms(inputFile, outputFile):
    logger.info("Input File: " + inputFile)
    iFile = open(inputFile, "r")

    logger.info("Output File: " + outputFile)
    oFile = open(outputFile, "w")

    writer = csv.writer(oFile)
    reader = csv.reader(iFile)

    wordsConcepts = Concepts("WordsConcepts", "Words")

    # Penn Treebank tags as returned by nltk.pos_tag
    NOUN_POS = ['NN', 'NNS', 'NNP', 'NNPS']
    VERB_POS = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    POS = VERB_POS + NOUN_POS
  
    rownum = 0

    tree = list()

    writer.writerow(["word", "synset.definition", "synset.lemma_names", "synset.examples"])
    
    for row in reader:
        logger.debug("row: %s - %s" % (str(rownum), row))

        # Take first column
        term = row[0]
        
        logger.debug("Term: %s" % term)

        text = nltk.word_tokenize(term)
        
        posTagged = (nltk.pos_tag(text))

        logger.debug("  POS Text: %s" % posTagged)

        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(term)):
            logger.debug("   Word: " + word + " POS: " + pos)

            if (pos in POS):
                logger.info("Add  POS:" + word)
                wordsConcepts.addConceptKeyType(word, "WORD")
            else:
                logger.info("Skip POS:" + word)

            for synset in wn.synsets(word):

                if GRAPH:
                    for i in synset.lemma_names:
                        edge = pydot.Edge(term, i)
                        graph.add_edge(edge)
                    
                writer.writerow([word, synset.definition, synset.lemma_names, synset.examples])

                logger.info("    %s\t%s" % (word, synset.lemma_names))
                #logger.debug("%s\t%s\t%s\t%s" % (word, synset.lemma_names, synset.definition, synset.examples))

                if GRAPH:
                    paths = synset.hypernym_paths()

                    prior = None
                    for x in range(0, len(paths) - 1):
                        flag = False
                        # pathSynset avoids shadowing the outer synset loop variable
                        for pathSynset in paths[x]:
                            if not flag:
                                prior = pathSynset.name
                                flag = True
                            else:
                                edge = pydot.Edge(prior, pathSynset.name)
                                graph.add_edge(edge)
                                prior = pathSynset.name
                            logger.info("%s" % pathSynset.name)

                    # tie it to the last entry
                    if prior is not None:
                        edge = pydot.Edge(prior, term)
                        graph.add_edge(edge)

    iFile.close()
    oFile.close()

    wordsConcepts.logConcepts()
    
    return wordsConcepts
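
A hedged usage sketch; both CSV names are placeholders. The input is read one term per row (first column), and each matching word's WordNet synsets are written to the output:

wordsConcepts = logSemanticTerms("terms.csv", "semantic_terms.csv")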
class Collocations(object):
    concepts         = None
    
    conceptsNGram        = None
    conceptNGramScore    = None
    conceptsNGramSubject = None

    conceptFile      = u"documents.p"

    ngramFile        = u"ngrams.p"
    ngramScoreFile   = u"ngramscore.p"
    ngramSubjectFile = u"ngramsubject.p"
    
    def __init__(self, conceptFile=None):
        if conceptFile is None:
            conceptFile = u"documents.p"

        logger.info(u"Load Concepts from %s " % (conceptFile))
        self.concepts = Concepts.loadConcepts(conceptFile)
        logger.info(u"Loaded Concepts")

        self.conceptsNGram = Concepts(u"n-gram", u"NGRAM")
        self.conceptsNGramScore = Concepts(u"NGram_Score", u"Score")
        self.conceptsNGramSubject = Concepts(u"Subject", u"Subjects")

    def getCollocationConcepts(self):
        return self.conceptsNGram, self.conceptsNGramScore, self.conceptsNGramSubject
        
    def find_collocations(self):
        lemmatizer = WordNetLemmatizer()

        stopset = set(stop)
        filter_stops = lambda w: len(w) < 3 or w in stopset

        words = list()

        dictWords = dict()

        for document in self.concepts.getConcepts().values():
            logger.debug(document.name)
            for concept in document.getConcepts().values():
                logger.debug(concept.name)

                for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept.name)):
                    logger.debug(u"Word: " + word + u" POS: " + pos)
                    lemmaWord = lemmatizer.lemmatize(word.lower())
                    logger.debug(u"Word: " + word + u" Lemma: " + lemmaWord)
                    words.append(lemmaWord)

                    if pos[0] == u"N":
                        dictWords[lemmaWord] = word


        for x in dictWords.keys():
            logger.info(u"noun : %s" % x)

        bcf = BigramCollocationFinder.from_words(words)
        tcf = TrigramCollocationFinder.from_words(words)

        bcf.apply_word_filter(filter_stops)
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(3)

        listBCF = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 100)

        for bigram in listBCF:
            concept = u' '.join([bg for bg in bigram])
            e = self.conceptsNGram.addConceptKeyType(concept, u"BiGram")
            logger.info(u"Bigram  : %s" % concept)
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept)):
                e.addConceptKeyType(word, pos)

        listTCF = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 100)

        for trigram in listTCF:
            concept = u' '.join([bg for bg in trigram])
            e = self.conceptsNGram.addConceptKeyType(concept, u"TriGram")
            logger.info(u"Trigram : %s" % concept)
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept)):
                e.addConceptKeyType(word, pos)
            
        bcfscored = bcf.score_ngrams(BigramAssocMeasures.likelihood_ratio)
        lt = sorted(bcfscored, key=lambda c: c[1], reverse=True)
        for score in lt:
            name = ' '.join([w for w in score[0]])
            count = float(score[1])
            e = self.conceptsNGramScore.addConceptKeyType(name, u"BiGram")
            for x in score[0]:
                e.addConceptKeyType(x, u"BWord")
            e.count = count
            logger.debug(u"bcfscored: %s=%s" % (name, count))

        tcfscored = tcf.score_ngrams(TrigramAssocMeasures.likelihood_ratio)
        lt = sorted(tcfscored, key=lambda c: c[1], reverse=True)
        for score in lt:
            name = ' '.join([w for w in score[0]])
            count = float(score[1])
            e = self.conceptsNGramScore.addConceptKeyType(name, u"TriGram")
            for x in score[0]:
                e.addConceptKeyType(x, u"TWord")
            e.count = count
            logger.debug(u"tcfscored: %s = %s" % (name, count))

        Concepts.saveConcepts(self.conceptsNGramScore, self.ngramScoreFile)
        Concepts.saveConcepts(self.conceptsNGram, self.ngramFile)

        for concept in self.conceptsNGram.getConcepts().values():
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(concept.name)):
                if pos[0] == u"N":
                    e = self.conceptsNGramSubject.addConceptKeyType(word, pos)
                    e.addConceptKeyType(concept.name, u"NGRAM")

        Concepts.saveConcepts(self.conceptsNGramSubject, self.ngramSubjectFile)
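
A minimal usage sketch for Collocations, assuming documents.p was written by an earlier step:

collocations = Collocations()
collocations.find_collocations()
ngrams, ngramScores, ngramSubjects = collocations.getCollocationConcepts()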
Example #24
    def dependancyAnalysis(self, listTSort):

        index = 0
        for x in listTSort:
            logger.debug(u"%d %s[%s] -%s-> %s[%s]" % (index, self.al.dictNodes[x[0]][u"name"], self.al.dictNodes[x[0]][ARCHI_TYPE], u"UsedBy",
                                                    self.al.dictNodes[x[1]][u"name"], self.al.dictNodes[x[1]][ARCHI_TYPE]))
            index += 1

            self.al.addToNodeDict(self.al.dictNodes[x[0]][u"name"], self.al.dictBP)
            self.al.addToNodeDict(self.al.dictNodes[x[1]][u"name"], self.al.dictBP)

        logger.info(u"Topic Sort Candidates : %d" % (len(listTSort)))

        nodes = list()
        index = 0
        dictTasks = dict()
        for x in listTSort:
            sname = self.al.dictNodes[x[0]][u"name"]
            tname = self.al.dictNodes[x[1]][u"name"]
            index += 1
            logger.debug(u"%d %s -%s-> %s" % (index, sname, u"UsedBy", tname))

            if sname in dictTasks:
                ln = dictTasks[sname]
                ln.append(tname)
            else:
                ln = list()
                ln.append(tname)
                dictTasks[sname] = ln

        for x in dictTasks.keys():
            logger.debug(u"dictTasks[%s]=%s" % (x, dictTasks[x]))
            a = Task(x, dictTasks[x])
            nodes.append(a)

        for x in self.al.dictBP.keys():
            # for x in listBP:
            if x not in dictTasks:
                logger.debug(u"Add %s" % (x))
                a = Task(x, list())
                nodes.append(a)

        self.format_nodes(nodes)

        conceptBatches = Concepts(u"Batch", u"archimate:WorkPackage")

        n = 0
        logger.info(u"Batches:")
        batches = self.get_task_batches(nodes)
        for bundle in batches:
            n += 1
            name = u"Batch %d" % n
            c = conceptBatches.addConceptKeyType(name, u"archimate:WorkPackage")
            for node in bundle:
                c.addConceptKeyType(node.name, u"archimate:BusinessProcess")

            logger.info(u"%d : %s" % (n, ", ".join(node.name.lstrip() for node in bundle)))

        Concepts.saveConcepts(conceptBatches, fileConceptsBatches)

        return conceptBatches
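
A hedged sketch of consuming the returned batches: each archimate:WorkPackage concept groups the BusinessProcesses whose dependencies are already satisfied, so the members of one batch can proceed together.

for batch in conceptBatches.getConcepts().values():
    logger.info(u"%s : %d processes" % (batch.name, len(batch.getConcepts())))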