import csv
import logging

import nltk
import pydot
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# Concepts, ArchiLib, Collocations, DocumentsSimilarity, ARCHI_TYPE, stop,
# GRAPH, and graph are supplied elsewhere in this project.
logger = logging.getLogger(__name__)


def logSemanticTerms(inputFile, outputFile):
    # Read terms from the first column of inputFile and write one CSV row per
    # WordNet synset of each word found in the term.
    logger.info("Input File: " + inputFile)
    iFile = open(inputFile, "r")
    logger.info("Output File: " + outputFile)
    oFile = open(outputFile, "w", newline="")
    writer = csv.writer(oFile)
    reader = csv.reader(iFile)

    wordsConcepts = Concepts("WordsConcepts", "Words")

    # Coarse noun/verb tag whitelist; nltk.pos_tag emits Penn Treebank tags,
    # so in practice the noun tags do most of the matching here.
    NOUN_POS = ['N', 'NN', 'NNP', 'NNS']
    VERB_POS = ['V', 'VD', 'VG', 'VN']
    POS = VERB_POS + NOUN_POS

    rownum = 0
    writer.writerow(["word", "synset.definition", "synset.lemma_names", "synset.examples"])

    for row in reader:
        logger.debug("row: %s - %s" % (str(rownum), row))
        rownum += 1

        # Take first column
        term = row[0]
        logger.debug("Term: %s" % term)

        posTagged = nltk.pos_tag(nltk.word_tokenize(term))
        logger.debug(" POS Text: %s" % posTagged)

        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(term)):
            logger.debug(" Word: " + word + " POS: " + pos)

            if pos in POS:
                logger.info("Add POS:" + word)
                wordsConcepts.addConceptKeyType(word, "WORD")
            else:
                logger.info("Skip POS:" + word)

            # NLTK 3 exposes these as methods (they were attributes in NLTK 2).
            for synset in wn.synsets(word):
                if GRAPH:
                    for lemmaName in synset.lemma_names():
                        graph.add_edge(pydot.Edge(term, lemmaName))

                writer.writerow([word, synset.definition(), synset.lemma_names(), synset.examples()])
                logger.info(" %s\t%s" % (word, synset.lemma_names()))

                if GRAPH:
                    # Chain each hypernym path into the graph, then tie the
                    # last entry back to the original term.
                    prior = None
                    for path in synset.hypernym_paths():
                        flag = False
                        for hyper in path:
                            if not flag:
                                prior = hyper.name()
                                flag = True
                            else:
                                graph.add_edge(pydot.Edge(prior, hyper.name()))
                                prior = hyper.name()
                            logger.info("%s" % hyper.name())

                    if prior is not None:
                        graph.add_edge(pydot.Edge(prior, term))

    iFile.close()
    oFile.close()

    wordsConcepts.logConcepts()
    return wordsConcepts
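# A minimal usage sketch for logSemanticTerms. The CSV file names are
# placeholders (not from the original code), the nltk.download() calls assume
# the standard NLTK 3 resource names for the tokenizer, tagger, and WordNet
# data used above, and the module-level GRAPH flag (plus a pydot graph, if
# GRAPH is True) is assumed to be set elsewhere in this file.
def demoLogSemanticTerms():
    for resource in ("punkt", "averaged_perceptron_tagger", "wordnet"):
        nltk.download(resource, quiet=True)   # no-op if already installed
    # Expects a CSV whose first column holds the terms to expand.
    return logSemanticTerms("terms.csv", "semantic_terms.csv")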
def gapSimilarity(fileArchimate, searchTypes):
    # Build Word/Topic concepts from the Archimate model's nodes, then run
    # collocation and topic-similarity analysis over them.
    lemmatizer = WordNetLemmatizer()

    logger.info(u"Using : %s" % fileArchimate)
    al = ArchiLib(fileArchimate)
    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words...")
    concepts = Concepts(u"Word", u"Topic")

    for n, sentence in enumerate(nl, start=1):
        if sentence is None:
            continue
        logger.info(u"%s" % sentence)

        c = concepts.addConceptKeyType(u"Document" + str(n), nl[sentence][ARCHI_TYPE])
        d = c.addConceptKeyType(sentence, nl[sentence][ARCHI_TYPE])

        # Strip stopwords, then keep lemmatized nouns longer than one character.
        cleanSentence = u' '.join(word for word in sentence.split(u" ") if word not in stop)
        for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
            if len(word) > 1 and pos[0] == u"N":
                lemmaWord = lemmatizer.lemmatize(word.lower())
                e = d.addConceptKeyType(lemmaWord, u"LemmaWord")
                e.addConceptKeyType(pos, u"POS")

    # Debug toggle from the original code:
    # concepts.logConcepts()

    logger.info(u"Find Collocations...")
    fc = Collocations()
    fc.find_collocations(concepts)

    npbt = DocumentsSimilarity(al)

    logger.info(u"Create Topics")
    npbt.createTopics(concepts)

    logger.info(u"Find Similarities")
    npbt.findSimilarties()

    logger.debug(u"Topics")
    listTopics = list()
    for x in npbt.topicConcepts.getConcepts().values():
        logger.info(u"%s[%d]" % (x.name, x.count))
        listTopics.append((x.name, x.count))

    logger.info(u"Topics Sorted")
    with open(u"topic_sort.txt", "w") as f:
        for name, count in sorted(listTopics, key=lambda t: abs(t[1])):
            output = "Topic : %s[%d]" % (name, count)
            logger.info(output)
            f.write(output + "\n")
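# A minimal, hypothetical driver for gapSimilarity. The model path and the
# element-type list below are illustrative only; ArchiLib, Concepts,
# Collocations, DocumentsSimilarity, ARCHI_TYPE, and stop all come from
# elsewhere in this project.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    fileArchimate = u"model.archimate"            # placeholder model file
    searchTypes = [u"archimate:Requirement"]      # placeholder element types
    gapSimilarity(fileArchimate, searchTypes)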