# Example 1
def logSemanticTerms(inputFile, outputFile):
    """Log WordNet synset data for each term in a CSV file.

    Reads terms from the first column of ``inputFile``, POS-tags each term,
    collects noun/verb words into a ``Concepts`` container, and writes one
    CSV row per WordNet synset (definition, lemma names, examples) to
    ``outputFile``.  When the module-level ``GRAPH`` flag is set, lemma and
    hypernym edges are also added to the module-level pydot ``graph``.

    :param inputFile:  path to the CSV whose first column holds the terms
    :param outputFile: path of the CSV to write synset rows to
    :return: the populated ``Concepts`` ("WordsConcepts"/"Words") object
    """
    logger.info("Input File: " + inputFile)
    logger.info("Output File: " + outputFile)

    wordsConcepts = Concepts("WordsConcepts", "Words")

    # POS tags worth keeping as concept words (simplified Penn tags).
    NOUN_POS = ['N', 'NN', 'NNP', 'NNS']
    VERB_POS = ['V', 'VD', 'VG', 'VN']
    POS = VERB_POS + NOUN_POS

    # ``with`` guarantees both handles are closed even if a row raises
    # (the original leaked them on any exception).
    with open(inputFile, "r") as iFile, open(outputFile, "w") as oFile:
        reader = csv.reader(iFile)
        writer = csv.writer(oFile)

        writer.writerow(["word", "synset.definition", "synset.lemma_names", "synset.examples"])

        # enumerate fixes the original bug where rownum stayed 0 forever.
        for rownum, row in enumerate(reader):
            logger.debug("row: %s - %s" % (str(rownum), row))

            # The term of interest is in the first column.
            term = row[0]
            logger.debug("Term: %s" % term)

            posTagged = nltk.pos_tag(nltk.word_tokenize(term))
            logger.debug("  POS Text: %s" % posTagged)

            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(term)):
                logger.debug("   Word: " + word + " POS: " + pos)

                # Only nouns/verbs are promoted to concept words.
                if pos in POS:
                    logger.info("Add  POS:" + word)
                    wordsConcepts.addConceptKeyType(word, "WORD")
                else:
                    logger.info("Skip POS:" + word)

                for synset in wn.synsets(word):

                    if GRAPH == True:
                        # Edge from the source term to each lemma name.
                        for lemmaName in synset.lemma_names:
                            graph.add_edge(pydot.Edge(term, lemmaName))

                    writer.writerow([word, synset.definition, synset.lemma_names, synset.examples])

                    logger.info("    %s\t%s" % (word, synset.lemma_names))

                    if GRAPH == True:
                        # Chain each hypernym path into the graph, then tie
                        # the final node back to the original term.
                        paths = synset.hypernym_paths()

                        prior = None
                        # NOTE(review): range stops at len(paths)-1, so the
                        # last hypernym path is skipped — confirm intended.
                        for x in range(0, len(paths) - 1):
                            flag = False
                            # Renamed from "synset" to avoid shadowing the
                            # outer loop variable.
                            for pathSynset in paths[x]:
                                if flag == False:
                                    prior = pathSynset.name
                                    flag = True
                                else:
                                    graph.add_edge(pydot.Edge(prior, pathSynset.name))
                                    prior = pathSynset.name
                                logger.info("%s" % pathSynset.name)

                        # tie it to the last entry
                        if prior != None:
                            graph.add_edge(pydot.Edge(prior, term))

    wordsConcepts.logConcepts()

    return wordsConcepts
# Example 2
def gapSimilarity(fileArchimate, searchTypes):
    """Build word/topic concepts from an Archimate model and report topic similarity.

    Loads ``fileArchimate`` via ArchiLib, extracts nodes of the given
    ``searchTypes``, lemmatizes the nouns of each node sentence into a
    ``Concepts`` tree, then runs collocation finding and document-similarity
    topic creation, finally writing a sorted topic list to ``topic_sort.txt``.

    :param fileArchimate: path to the Archimate model file
    :param searchTypes:   node types to extract (passed to getTypeNodes)
    """

    lemmatizer = WordNetLemmatizer()

    logger.info(u"Using : %s" % fileArchimate)

    al = ArchiLib(fileArchimate)

    # nl maps sentence -> node attributes (indexed by ARCHI_TYPE below).
    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words...")
    concepts = Concepts(u"Word", u"Topic")

    # n numbers the documents 1..N as they are added to the concept tree.
    n = 0
    for sentence in nl:
        n += 1

        if sentence is None:
            continue

        logger.info(u"%s" % sentence)

        # Concept hierarchy: Document<n> -> sentence -> lemma -> POS.
        c = concepts.addConceptKeyType(u"Document" + str(n), nl[sentence][ARCHI_TYPE])
        d = c.addConceptKeyType(sentence, nl[sentence][ARCHI_TYPE])

        # Drop stopwords (module-level ``stop``) before tagging.
        cleanSentence = u' '.join([word for word in sentence.split(u" ") if word not in stop])
        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
            # Keep only multi-character nouns (Penn tags starting with "N").
            if len(word) > 1 and pos[0] == u"N":
                lemmaWord =lemmatizer.lemmatize(word.lower())
                e = d.addConceptKeyType(lemmaWord, u"LemmaWord")
                f = e.addConceptKeyType(pos, u"POS")

    # Debug dump of the concept tree — disabled.
    if False:
        concepts.logConcepts()

    if True:
        logger.info(u"Find Collocations...")
        fc = Collocations()
        fc.find_collocations(concepts)

    if True:
        npbt = DocumentsSimilarity(al)

        logger.info(u"Create Topics")
        npbt.createTopics(concepts)

        if True:
            logger.info(u"Find Similarities")

            nc = npbt.findSimilarties()

            logger.debug(u"Topics")
            # Collect (name, count) pairs for every discovered topic.
            listTopics = list()
            ncg = npbt.topicConcepts.getConcepts().values()
            for x in ncg:
                logger.info(u"%s[%d]" % (x.name, x.count))
                lt = (x.name, x.count)
                listTopics.append(lt)

            logger.info(u"Topics Sorted")
            # NOTE(review): ``f`` shadows the POS concept bound in the loop
            # above; and the file is opened "wb" but a str is written —
            # works on Python 2, raises TypeError on Python 3. Confirm the
            # target interpreter.
            with open(u"topic_sort.txt", "wb") as f:
                # Sorted ascending by topic count.
                for x in sorted(listTopics, key=lambda c: abs(c[1]), reverse=False):
                    output = "Topic : %s[%d]" % (x[0], x[1])
                    logger.info(output)
                    f.write(output + os.linesep)