def __buildDics__(self, filename):
    """Parse a bracketed phrase-structure file (Penn-style '(ROOT ...' trees).

    Reads *filename* via Miscelaneous.openFile and, while inside a tree
    (between a '(ROOT' line and the next blank line), splits each line on
    '(' and prints every terminal token found before a ')'.

    NOTE(review): this parser only prints the terminals; the dictionaries
    referenced by the dead code below are never populated here.
    """
    misc = Miscelaneous()
    txtfile = misc.openFile(filename, 'r')
    record_phrase = False  # True while we are inside a '(ROOT ...' tree
    for line in txtfile:
        line = re.sub('\n', '', line)
        if '(ROOT' in line:
            record_phrase = True
        elif line == '':
            # Blank line terminates the current tree.
            record_phrase = False
        elif record_phrase:
            elements = line.split('(')
            for values in elements:
                if ')' in values:
                    # Token text sits between '(' and the first ')'.
                    term = values.split(')')[0]
                    print term
    # Dead code kept from the original (disabled dictionary population);
    # exact original placement inside the method could not be recovered.
    """ self.dic_t[id_t] = {'word':word, 'lemma':lemma, 'pos':pos, 'morph':morph, 'sem':sem, 'extra':extra, 'headof':''} self.dic_nt[id_nt] = {'cat':cat, 'edge':array_edges} self.dic_t[idref]['headof'] = id_nt self.dic_nt[id_nt]['head'] = idref self.dic_nt[id_nt]['edge'] = array_edges """
    txtfile.close()
def writeDicAN(self, filename):
    """Dump the AN (adjective-noun) relation counts to <filename>.txt.

    Each output line has the form 'relation_id#count'. The relations are
    extracted lazily on the first dump request.
    """
    helper = Miscelaneous()
    out = helper.openFile(filename+'.txt', 'w')
    # Materialise the AN relations only once, on first use.
    if self.mountANRelations:
        self.__extractRelations__('AN')
        self.mountANRelations = False
    for relation_id in self.dic_an:
        out.write('%s#%s\n' % (relation_id, str(self.dic_an[relation_id])))
    out.close()
def __init__(self, fileinput):
    """Load lowercased seed terms, one per line, from *fileinput*.

    Blank lines are skipped. Fix: the original tested the raw line against
    '' BEFORE stripping the newline, so a line containing only '\n' passed
    the test and an empty string was appended to the seed list.
    """
    self.list_seeds = []
    misc = Miscelaneous()
    file_seeds = misc.openFile(fileinput, 'r')
    for line in file_seeds:
        # Strip the newline first, then test for emptiness.
        line = line.replace('\n', '')
        if line != '':
            self.list_seeds.append(line.lower())
    file_seeds.close()
def __init__(self, fileinput):
    """Load lowercased seed terms, one per line, from *fileinput*.

    Blank lines are skipped. Fix: the original compared the raw line to ""
    before removing the trailing newline, so blank lines ("\n") slipped
    through and were stored as empty seeds.
    """
    self.list_seeds = []
    misc = Miscelaneous()
    file_seeds = misc.openFile(fileinput, "r")
    for line in file_seeds:
        # Remove the newline before the emptiness check.
        line = line.replace("\n", "")
        if line != "":
            self.list_seeds.append(line.lower())
    file_seeds.close()
def __init__(self, temp_folder, file_input, seedfile, mi_precision):
    """Prepare the mutual-information builder and run it on *file_input*."""
    self.temp_folder = temp_folder
    self.misc = Miscelaneous()
    # The window size is encoded inside the input file name
    # (characters 1..-23 of the name) -- TODO confirm against the producer.
    self.window_size = file_input[1:-23]
    self.list_seeds = Seeds(seedfile).getSeeds()
    self.first_line = ''
    self.dic_tuplas = defaultdict(dict)
    self.dic_terms = OrderedDict()
    self.__buildMI__(file_input, mi_precision)
def __init__(self, temp_folder):
    """Aggregate the per-document relation files for AN, SV and VO and
    write the merged dictionaries back to disk."""
    self.temp_folder = temp_folder
    self.misc = Miscelaneous()
    self.dic_an, self.dic_sv, self.dic_vo = {}, {}, {}
    self.matrix_relations = ['AN', 'SV', 'VO']
    for relation in self.matrix_relations:
        self.__loadTerms__(relation)
    self.__writeDic__()
def __init__(self, seedfile, temp_folder, sim_measure):
    """Set up the similarity tables and build them with *sim_measure*."""
    self.misc = Miscelaneous()
    self.temp_folder = temp_folder
    self.list_seeds = Seeds(seedfile).getSeeds()
    # plain lookup tables
    self.dic_nouns = {}
    self.dic_freqObj = {}
    self.dic_Obj = {}
    # nested tables: outer key = document id, inner key = modifier/noun
    self.dic_seeds = defaultdict(dict)
    self.dic_measure = defaultdict(dict)
    self.dic_Obj2 = defaultdict(dict)
    #self.dic_seeds_freqObj = {}
    #self.dic_seeds_Obj = {}
    self.__buildHashs__(sim_measure)
def __init__(self, input_folder, temp_folder, min_word_size, window_size): try: self.root, self.dirs, self.files = os.walk(input_folder).next()[:3] except IOError: print bcolors.FAIL+'ERROR: It was not possible to open the '+input_folder+' folder'+bcolors.ENDC sys.exit(2) self.min_word_size = int(min_word_size) self.window_size = int(window_size) self.temp_folder = temp_folder self.qty_documents = len(self.files) self.misc = Miscelaneous() if os.path.exists(self.temp_folder+'Statistical_corpus.txt'): os.system('rm '+self.temp_folder+'Statistical_corpus.txt') self.temp_file = self.misc.openFile(self.temp_folder+'Statistical_corpus.txt', 'a')
def __init__(self, input_file, temp_folder, svd_dimension):
    """Build the sparse matrix from *input_file*, apply SVD and persist it."""
    self.misc = Miscelaneous()
    self.temp_folder = temp_folder
    self.svd_dimension = svd_dimension
    # row/column vocabularies (insertion-ordered) plus their index maps
    self.dic_column = OrderedDict()
    self.dic_row = OrderedDict()
    self.dic_column_index = {}
    self.dic_row_index = {}
    # COO-style sparse matrix buffers
    self.array_row = []
    self.array_col = []
    self.array_data = []
    self.dic_matrix = {}
    string_files_matrix = ''
    self.buildMatrixFromFile(input_file)
    self.applySvd()
    self.writeSvd()
def __buildDics__(self, filename):
    """Parse a Tiger/Negra-style XML treebank file line by line.

    Populates:
      self.dic_t  -- terminal id  -> {'word','lemma','pos','morph','sem','extra','headof'}
      self.dic_nt -- nonterminal id -> {'cat','edge','head'}

    The parser is purely string-based (split on attribute markers) and
    relies on document order: <t> lines, then <nt> lines whose <edge>
    children follow before the closing </nt>.
    """
    misc = Miscelaneous()
    xmlfile = misc.openFile(filename, 'r')
    for line in xmlfile:
        if '<t ' in line:
            # Terminal node: extract the attributes by slicing around the
            # attribute="..." markers.
            id_t = (line.split('id=\"')[1]).split('\"')[0]
            word = (line.split('word=\"')[1]).split('\"')[0].lower()
            lemma = ((line.split('lemma=\"')[1]).split('\"')[0]).lower()
            morph = (line.split('morph=\"')[1]).split('\"')[0]
            sem = (line.split('sem=\"')[1]).split('\"')[0]
            extra = (line.split('extra=\"')[1]).split('\"')[0]
            if re.search('%|&', lemma):
                # Lemmas containing escapes get a placeholder POS.
                pos = '--'
            else:
                pos = (line.split('pos=\"')[1]).split('\"')[0]
            self.dic_t[id_t] = {'word':word, 'lemma':lemma, 'pos':pos, 'morph':morph, 'sem':sem, 'extra':extra, 'headof':''}
        elif '<nt ' in line:
            # Nonterminal node: start a fresh edge list for it.
            id_nt = (line.split('id=\"')[1]).split('\"')[0]
            id_nt_number = id_nt.split('_')[1]
            cat = (line.split('cat=\"')[1]).split('\"')[0]
            array_edges = []
            self.dic_nt[id_nt] = {'cat':cat, 'edge':array_edges}
        elif '<edge ' in line:
            # Edge of the most recently seen nonterminal (relies on order).
            idref = (line.split('idref=\"')[1]).split('\"')[0]
            idref_number = idref.split('_')[1]
            label = (line.split('label=\"')[1]).split('\"')[0]
            # Terminals are numbered < 500; nonterminal refs must point to a
            # later nonterminal than the current one -- TODO confirm rationale.
            if int(idref_number) < 500 or int(idref_number) > int(id_nt_number):
                array_edges.append([idref, label])
                if label == 'H':
                    # 'H' marks the head child of this nonterminal.
                    self.dic_t[idref]['headof'] = id_nt
                    self.dic_nt[id_nt]['head'] = idref
        elif '</nt>' in line:
            self.dic_nt[id_nt]['edge'] = array_edges
    xmlfile.close()
def __init__(self, ctx_freq_file, seedfile):
    """Create one nested table per association measure and build them."""
    self.misc = Miscelaneous()
    self.list_seeds = Seeds(seedfile).getSeeds()
    # One defaultdict(dict) per supported similarity/association measure.
    self.dic_baseline = defaultdict(dict)
    self.dic_diceBin = defaultdict(dict)
    self.dic_diceMin = defaultdict(dict)
    self.dic_jaccard = defaultdict(dict)
    self.dic_cosineBin = defaultdict(dict)
    self.dic_cosine = defaultdict(dict)
    self.dic_city = defaultdict(dict)
    self.dic_euclidean = defaultdict(dict)
    self.dic_js = defaultdict(dict)
    self.dic_lin = defaultdict(dict)
    self.dic_jaccardMax = defaultdict(dict)
    self.dic_ctx = defaultdict(dict)
    # Per-noun aggregates used by the measures.
    self.dic_sum_freq_noun = {}
    self.dic_qty_noun = {}
    self.__buildHashs__(ctx_freq_file, seedfile)
def __init__(self, input_folder, temp_folder, stoplist_file, min_word_size, record_intermediate): try: self.root, self.dirs, self.files = os.walk(input_folder).next()[:3] except IOError: print bcolors.FAIL+'ERROR: It was not possible to open the '+input_folder+' folder'+bcolors.ENDC sys.exit(2) self.min_word_size = int(min_word_size) self.temp_folder = temp_folder self.qty_documents = len(self.files) self.misc = Miscelaneous() self.stoplist = self.misc.getStoplist(stoplist_file) self.matrix_relations = ['AN', 'SV', 'VO'] self.dic_an = {} self.dic_sv = {} self.dic_vo = {} command = 'rm -Rf '+self.temp_folder+'; mkdir '+self.temp_folder+' ' if record_intermediate: command += self.temp_folder+'AN/'+' '+self.temp_folder+'AN/2Order/'+self.temp_folder+'AN/3Order/ ' command += self.temp_folder+'SV/'+' '+self.temp_folder+'SV/2Order/'+self.temp_folder+'SV/3Order/ ' command += self.temp_folder+'VO/'+' '+self.temp_folder+'VO/2Order/'+self.temp_folder+'VO/3Order/ ' os.system(command) i = 0 for corpus_file in self.files: self.dic_t = {} self.dic_nt = {} self.dic_an_doc = {} self.dic_sv_doc = {} self.dic_vo_doc = {} #print corpus_file i += 1 if re.match('.*xml$', corpus_file): corpus_filename = corpus_file.split('.')[0] xml = ParseStanfordXml(self.root+''+corpus_file) self.dic_t = xml.getDicTerms() self.dic_nt = xml.getDicNonTerminals() self.__extractRelations__() if record_intermediate: self.__writeDicRelations__(corpus_filename) self.misc.progress_bar(i, self.qty_documents, 100) self.__writeDic__()
def __init__(self, input_file, temp_folder, svd_dimension):
    """Load *input_file* into a sparse matrix, run SVD, write the result."""
    self.misc = Miscelaneous()
    self.temp_folder, self.svd_dimension = temp_folder, svd_dimension
    # Ordered vocabularies and index maps for both matrix axes.
    self.dic_column, self.dic_row = OrderedDict(), OrderedDict()
    self.dic_column_index, self.dic_row_index = {}, {}
    # Parallel COO buffers (row index, column index, value).
    self.array_row, self.array_col, self.array_data = [], [], []
    self.dic_matrix = {}
    string_files_matrix = ""
    self.buildMatrixFromFile(input_file)
    self.applySvd()
    self.writeSvd()
def __init__(self, temp_folder, svd_dimension, record_intermediate):
    """Merge the per-relation matrix rows into one numbered file per noun.

    Fixes: the shell commands were built without a space after the program
    name ('cat'+path and 'rm -Rf'+path), producing invalid commands such
    as 'cat/tmp/...' that always failed.
    """
    self.misc = Miscelaneous()
    self.temp_folder = temp_folder
    self.svd_dimension = svd_dimension
    self.dic_noun = OrderedDict()
    self.dic_noun_index = {}
    self.dic_modifier = OrderedDict()
    self.dic_modifier_index = {}
    self.row = []
    self.col = []
    self.data = []
    self.dic_matrix = {}
    #self.line_data = ''
    list_relations = ['AN', 'SV', 'VO']
    string_files_matrix = ''
    for relation in list_relations:
        self.type_relation = relation
        # Disabled pipeline steps kept for reference:
        #self.buildMatrixFromFile()
        #self.applySvd()
        #if record_intermediate:
        #    logfile.writeLogfile('- Recording SVD matrix to '+relation+' in a file...')
        #    self.writeSvd()
        #self.buildRelationsSvd()
        string_files_matrix += self.temp_folder+''+relation+'/Matrix_row.txt '+self.temp_folder+''+relation+'/Matrix_column.txt '
        file_matrix = self.misc.openFile(self.temp_folder+''+relation+'/Matrix_row.txt', 'r')
        for line in file_matrix:
            self.__loadDicMatrix__(line, relation)
        file_matrix.close()
    # Map each noun to a numbered document file and concatenate its parts.
    file_doc_matrix = self.misc.openFile(self.temp_folder+'/Matrix_nouns.txt', 'w')
    number_document = 0
    for noun in self.dic_matrix:
        file_doc_matrix.write(str(number_document)+' : '+noun+'\n')
        command = 'cat '+self.dic_matrix[noun]+' > '+self.temp_folder+'Matrix/'+str(number_document)+'.txt'
        os.system(command)
        if not record_intermediate:
            command = 'rm -Rf '+self.dic_matrix[noun]+' '+string_files_matrix
            os.system(command)
        number_document += 1
    file_doc_matrix.close()
class Contexts: def __init__(self, temp_folder): self.temp_folder = temp_folder self.misc = Miscelaneous() self.dic_an = {} self.dic_sv = {} self.dic_vo = {} self.matrix_relations = ['AN', 'SV', 'VO'] for type_relation in self.matrix_relations: self.__loadTerms__(type_relation) self.__writeDic__() def __del__(self): pass def __loadTerms__(self, type_relation): try: root, dirs, files = os.walk(self.temp_folder+''+type_relation+'/2Order/').next()[:3] except IOError: print bcolors.FAIL+'ERROR: It was not possible to open the '+self.temp_folder+' folder'+bcolors.ENDC sys.exit(2) qty_documents = len(files) i = 0 for corpus_file in files: i += 1 if re.match('.*txt$', corpus_file): relation_file = self.misc.openFile(root+''+corpus_file, 'r') for line in relation_file: line = re.sub('\n', '', line) relation, noun, frequency = line.split('#') if type_relation == 'AN': self.__addElementDicAN__(relation+'#'+noun, frequency) elif type_relation == 'SV': self.__addElementDicSV__(relation+'#'+noun, frequency) elif type_relation == 'VO': self.__addElementDicVO__(relation+'#'+noun, frequency) self.misc.progress_bar(i, qty_documents, 100) def __addElementDicAN__(self, relation, frequency): if self.dic_an.has_key(relation): self.dic_an[relation] += int(frequency) else: self.dic_an[relation] = int(frequency) def __addElementDicSV__(self, relation, frequency): if self.dic_sv.has_key(relation): self.dic_sv[relation] += int(frequency) else: self.dic_sv[relation] = int(frequency) def __addElementDicVO__(self, relation, frequency): if self.dic_vo.has_key(relation): self.dic_vo[relation] += int(frequency) else: self.dic_vo[relation] = int(frequency) def __writeDic__(self): for type_relation in self.matrix_relations: file_relation = self.misc.openFile(self.temp_folder+''+type_relation+'/Relations.txt', 'w') dic_relation = self.getDic(type_relation) for id_relation in dic_relation: file_relation.write(id_relation+'#'+str(dic_relation[id_relation])+'\n') file_relation.close() """ Get and Print 
methods """ def getDic(self, type_relation): if type_relation == 'AN': return self.dic_an elif type_relation == 'SV': return self.dic_sv elif type_relation == 'VO': return self.dic_vo def printDic (self, type_relation): dic_relation = getDic(type_relation) for id_relation in self.dic_relation: print id_relation+' = '+str(self.dic_relation[id_relation])
class StanfordSyntacticContexts: def __init__(self, input_folder, temp_folder, stoplist_file, min_word_size, record_intermediate): try: self.root, self.dirs, self.files = os.walk(input_folder).next()[:3] except IOError: print bcolors.FAIL+'ERROR: It was not possible to open the '+input_folder+' folder'+bcolors.ENDC sys.exit(2) self.min_word_size = int(min_word_size) self.temp_folder = temp_folder self.qty_documents = len(self.files) self.misc = Miscelaneous() self.stoplist = self.misc.getStoplist(stoplist_file) self.matrix_relations = ['AN', 'SV', 'VO'] self.dic_an = {} self.dic_sv = {} self.dic_vo = {} command = 'rm -Rf '+self.temp_folder+'; mkdir '+self.temp_folder+' ' if record_intermediate: command += self.temp_folder+'AN/'+' '+self.temp_folder+'AN/2Order/'+self.temp_folder+'AN/3Order/ ' command += self.temp_folder+'SV/'+' '+self.temp_folder+'SV/2Order/'+self.temp_folder+'SV/3Order/ ' command += self.temp_folder+'VO/'+' '+self.temp_folder+'VO/2Order/'+self.temp_folder+'VO/3Order/ ' os.system(command) i = 0 for corpus_file in self.files: self.dic_t = {} self.dic_nt = {} self.dic_an_doc = {} self.dic_sv_doc = {} self.dic_vo_doc = {} #print corpus_file i += 1 if re.match('.*xml$', corpus_file): corpus_filename = corpus_file.split('.')[0] xml = ParseStanfordXml(self.root+''+corpus_file) self.dic_t = xml.getDicTerms() self.dic_nt = xml.getDicNonTerminals() self.__extractRelations__() if record_intermediate: self.__writeDicRelations__(corpus_filename) self.misc.progress_bar(i, self.qty_documents, 100) self.__writeDic__() def __del__(self): pass def __extractRelations__(self): i = 0 #print self.dic_t for id_nt in self.dic_nt: if self.dic_nt[id_nt]['cat'] == 'nn': noun = self.dic_t[self.dic_nt[id_nt]['gov']]['lemma'].lower() #noun = re.sub('-', '_', noun) context = self.dic_t[self.dic_nt[id_nt]['dep']]['lemma'].lower() context = re.sub('-', '_', context) if len(noun) >= self.min_word_size and len(context) >= self.min_word_size: 
self.__addElementDicAN__(context+'#'+noun) self.__addElementDicAN__(noun+'#'+context) self.__addElementDicDocAN__(context+'#'+noun) self.__addElementDicDocAN__(noun+'#'+context) elif self.dic_nt[id_nt]['cat'] == 'amod': noun = self.dic_t[self.dic_nt[id_nt]['gov']]['lemma'].lower() #noun = re.sub('-', '_', noun) context = self.dic_t[self.dic_nt[id_nt]['dep']]['lemma'].lower() context = re.sub('-', '_', context) if len(noun) >= self.min_word_size and len(context) >= self.min_word_size and context not in self.stoplist: self.__addElementDicAN__(context+'#'+noun) self.__addElementDicDocAN__(context+'#'+noun) if re.match('prep_', self.dic_nt[id_nt]['cat']) \ and re.match('^NN', self.dic_t[self.dic_nt[id_nt]['dep']]['pos']) \ and re.match('^NN', self.dic_t[self.dic_nt[id_nt]['gov']]['pos']): noun = self.dic_t[self.dic_nt[id_nt]['gov']]['lemma'].lower() #noun = re.sub('-', '_', noun) context = self.dic_t[self.dic_nt[id_nt]['dep']]['lemma'].lower() context = re.sub('-', '_', context) prep = self.dic_nt[id_nt]['cat'].split('_')[1] if len(noun) >= self.min_word_size and len(context) >= self.min_word_size: self.__addElementDicAN__(prep+'_'+context+'#'+noun) self.__addElementDicAN__(prep+'_'+noun+'#'+context) self.__addElementDicDocAN__(prep+'_'+context+'#'+noun) self.__addElementDicDocAN__(prep+'_'+noun+'#'+context) elif re.match('^(nsubjpass|nsubj|xsubj|agent)$', self.dic_nt[id_nt]['cat']): #gov = verb if re.match('V', self.dic_t[self.dic_nt[id_nt]['gov']]['pos']): verb = self.dic_t[self.dic_nt[id_nt]['gov']]['lemma'].lower() #verb = re.sub('-', '_', verb) contexts = self.dic_t[self.dic_nt[id_nt]['dep']]['nps'] for context in contexts: if len(verb) >= self.min_word_size and len(context) >= self.min_word_size: self.__addElementDicSV__('sub_'+verb+'#'+context) self.__addElementDicDocSV__('sub_'+verb+'#'+context) elif re.match('^(dobj|iobj)$', self.dic_nt[id_nt]['cat']): if re.match('V', self.dic_t[self.dic_nt[id_nt]['gov']]['pos']): verb = 
self.dic_t[self.dic_nt[id_nt]['gov']]['lemma'].lower() #verb = re.sub('-', '_', verb) contexts = self.dic_t[self.dic_nt[id_nt]['dep']]['nps'] for context in contexts: if len(verb) >= self.min_word_size and len(context) >= self.min_word_size: self.__addElementDicVO__('obj_'+verb+'#'+context) self.__addElementDicDocVO__('obj_'+verb+'#'+context) def __addElementDicAN__(self, relation): if self.dic_an.has_key(relation): self.dic_an[relation] += 1 else: self.dic_an[relation] = 1 def __addElementDicDocAN__(self, relation): if self.dic_an_doc.has_key(relation): self.dic_an_doc[relation] += 1 else: self.dic_an_doc[relation] = 1 def __addElementDicSV__(self, relation): if self.dic_sv.has_key(relation): self.dic_sv[relation] += 1 else: self.dic_sv[relation] = 1 def __addElementDicDocSV__(self, relation): if self.dic_sv_doc.has_key(relation): self.dic_sv_doc[relation] += 1 else: self.dic_sv_doc[relation] = 1 def __addElementDicVO__(self, relation): if self.dic_vo.has_key(relation): self.dic_vo[relation] += 1 else: self.dic_vo[relation] = 1 def __addElementDicDocVO__(self, relation): if self.dic_vo_doc.has_key(relation): self.dic_vo_doc[relation] += 1 else: self.dic_vo_doc[relation] = 1 def __writeDicRelations__(self, corpus_filename): file_relation_an = self.misc.openFile(self.temp_folder+'AN/2Order/AN_'+corpus_filename+'.txt', 'w') for id_relation in self.dic_an_doc: file_relation_an.write(id_relation+'#'+str(self.dic_an_doc[id_relation])+'\n') file_relation_an.close() file_relation_sv = self.misc.openFile(self.temp_folder+'SV/2Order/SV_'+corpus_filename+'.txt', 'w') for id_relation in self.dic_sv_doc: file_relation_sv.write(id_relation+'#'+str(self.dic_sv_doc[id_relation])+'\n') file_relation_sv.close() file_relation_vo = self.misc.openFile(self.temp_folder+'VO/2Order/VO_'+corpus_filename+'.txt', 'w') for id_relation in self.dic_vo_doc: file_relation_vo.write(id_relation+'#'+str(self.dic_vo_doc[id_relation])+'\n') file_relation_vo.close() def __writeDic__(self): for 
type_relation in self.matrix_relations: file_relation = self.misc.openFile(self.temp_folder+''+type_relation+'/2Order/Relations.txt', 'w') dic_relation = self.getDic(type_relation) for id_relation in dic_relation: file_relation.write(id_relation+'#'+str(dic_relation[id_relation])+'\n') file_relation.close() """ Get and Print methods """ def getDic(self, type_relation): if type_relation == 'AN': return self.dic_an elif type_relation == 'SV': return self.dic_sv elif type_relation == 'VO': return self.dic_vo def printDic (self, type_relation): dic_relation = getDic(type_relation) for id_relation in self.dic_relation: print id_relation+' = '+str(self.dic_relation[id_relation])
class StatisticalCorpus: def __init__(self, input_folder, temp_folder, min_word_size, window_size): try: self.root, self.dirs, self.files = os.walk(input_folder).next()[:3] except IOError: print bcolors.FAIL+'ERROR: It was not possible to open the '+input_folder+' folder'+bcolors.ENDC sys.exit(2) self.min_word_size = int(min_word_size) self.window_size = int(window_size) self.temp_folder = temp_folder self.qty_documents = len(self.files) self.misc = Miscelaneous() if os.path.exists(self.temp_folder+'Statistical_corpus.txt'): os.system('rm '+self.temp_folder+'Statistical_corpus.txt') self.temp_file = self.misc.openFile(self.temp_folder+'Statistical_corpus.txt', 'a') def __del__(self): pass def buildCorpus_pt(self): i = 0 for corpus_file in self.files: i += 1 if re.match('.*xml$', corpus_file): corpus_filename = corpus_file.split('.')[0] xmlfile = ParsePalavrasXml(self.root+''+corpus_file) dic_terms = xmlfile.getDicTerms() dic_nouns = xmlfile.getNouns() #dic_verbs = xmlfile.getVerbs() id_sentence = 1 id_word = 1 id_t = 's'+str(id_sentence)+'_'+str(id_word) string_corpus = '' while dic_terms.has_key(id_t): while dic_terms.has_key(id_t): lemma = re.sub('(--|/|,|;|\(|\)|\$|\+|\')', '', dic_terms[id_t]['lemma']) lemma = re.sub('-', '_', lemma) lemma = re.sub('_$', '', lemma) if not re.match('^(pu|num|conj|art|prp|spec)', dic_terms[id_t]['pos']) and (len(lemma) >= self.min_word_size): if dic_nouns.has_key(id_t): string_corpus += lemma+'__N ' #elif dic_verbs.has_key(id_t): # string_corpus += lemma+'__V ' else: string_corpus += lemma+'__O ' id_word += 1 id_t = 's'+str(id_sentence)+'_'+str(id_word) id_word = 1 id_sentence += 1 id_t = 's'+str(id_sentence)+'_'+str(id_word) #print string_corpus #print string_corpus self.temp_file.write(string_corpus) self.misc.progress_bar(i, self.qty_documents, 100) self.temp_file.close() def buildCorpus_en(self): i = 0 for corpus_file in self.files: i += 1 if re.match('.*xml$', corpus_file): corpus_filename = corpus_file.split('.')[0] xmlfile 
= ParseStanfordXml(self.root+''+corpus_file) dic_terms = xmlfile.getDicTerms() self.__getRelationsInAWindow__(dic_terms, self.window_size) self.misc.progress_bar(i, self.qty_documents, 100) self.temp_file.close() """ GET RELATIONS IN A WINDOW """ def __getRelationsInAWindow__(self, dic_terms, window_size): i = 0 id_sentence = 1 id_word = 1 id_t = 's'+str(id_sentence)+'_'+str(id_word) string_corpus = '' while dic_terms.has_key(id_t): while dic_terms.has_key(id_t): lemma = re.sub('(--|/|,|;|\(|\)|\$|\+|\'|[.])', '', dic_terms[id_t]['lemma']).lower() lemma = re.sub('-', '_', lemma) lemma = re.sub('_$', '', lemma) if len(lemma) >= self.min_word_size: if re.match('^NN', dic_terms[id_t]['pos']): string_corpus += lemma+'__N ' elif re.match('^(AMOD|JJ|VB|MD|RB|RP)', dic_terms[id_t]['pos']): string_corpus += lemma+'__O ' id_word += 1 id_t = 's'+str(id_sentence)+'_'+str(id_word) id_word = 1 id_sentence += 1 id_t = 's'+str(id_sentence)+'_'+str(id_word) self.temp_file.write(string_corpus) def buildSTRelations(self, file_input, seeds_file): seeds = Seeds(seeds_file) list_seeds = seeds.getSeeds() dic_tuplas = {} file_bigrams = self.misc.openFile(self.temp_folder+''+file_input, 'r') first_line = '' for line in file_bigrams: if first_line != '': part = line.split('<>') term_type1 = part[0] term_type2 = part[1] term1, type1 = term_type1.split('__') term2, type2 = term_type2.split('__') freq_tupla = part[2].split(' ')[0] freq_term1 = part[2].split(' ')[1] freq_term2 = part[2].split(' ')[2] if type1 == 'N' and term1 != term2: if dic_tuplas.has_key(term2+'#'+term1+'#'): dic_tuplas[term2+'#'+term1+'#'] += int(freq_tupla) else: dic_tuplas[term2+'#'+term1+'#'] = int(freq_tupla) if type2 == 'N' and term1 != term2: if dic_tuplas.has_key(term1+'#'+term2+'#'): dic_tuplas[term1+'#'+term2+'#'] += int(freq_tupla) else: dic_tuplas[term1+'#'+term2+'#'] = int(freq_tupla) else: first_line = line file_bigrams.close() file_relations = 
self.misc.openFile(self.temp_folder+'W'+str(self.window_size)+'_Relations.txt', 'w') for tupla in dic_tuplas: file_relations.write(tupla+''+str(dic_tuplas[tupla])+'\n') file_relations.close()
def __buildDics__(self, filename):
    """Parse Stanford CoreNLP XML output into token and dependency tables.

    Populates:
      self.dic_t  -- 's<sent>_<tok>' -> {'word','lemma','pos','ner','nps'}
      self.dic_nt -- 's<sent>_<500+n>' -> {'cat','gov','dep'} (one entry per
                     unique dependency; ids start at 500 per sentence block)

    After parsing, a second pass over dic_nt builds noun-phrase strings
    ('nps') from nn/amod chains and from 'prep_of' links.

    The parser is a line-oriented state machine that relies on document
    order (token fields before </token>, dep children before </dep>).
    """
    misc = Miscelaneous()
    xmlfile = misc.openFile(filename, 'r')
    self.stoplist = misc.getStoplist('../misc/stoplist.txt')
    record_dependencies = False  # inside <basic-dependencies>
    record_collapsed = False     # inside <collapsed-ccprocessed-dependencies>
    for line in xmlfile:
        line = re.sub('\n', '', line)
        if '<sentence ' in line:
            id_s = (line.split('id=\"')[1]).split('\"')[0]
            # Dedup list of 'cat#gov#dep' triples seen in this sentence.
            array_rec_dep = []
        elif '<token ' in line:
            id_t = 's'+id_s+'_'+(line.split('id=\"')[1]).split('\"')[0]
        elif '<word>' in line:
            word = (line.split('<word>')[1]).split('</word>')[0]
        elif '<lemma>' in line:
            lemma = (line.split('<lemma>')[1]).split('</lemma>')[0]
        elif '<POS>' in line:
            pos = (line.split('<POS>')[1]).split('</POS>')[0]
        elif '<NER>' in line:
            ner = (line.split('<NER>')[1]).split('</NER>')[0]
        elif '</token>' in line:
            # All token fields collected -- commit the terminal entry.
            self.dic_t[id_t] = {'word':word, 'lemma':lemma, 'pos':pos, 'ner':ner, 'nps':[]}
            if re.match('NN|NNP', pos):
                # Nouns seed their own noun-phrase list with the bare lemma.
                array_nps = [lemma.lower()]
                self.dic_t[id_t]['nps'] = array_nps
        elif '<basic-dependencies>' in line:
            record_dependencies = True
            # Dependency pseudo-node ids start at 500 to stay clear of tokens.
            index_nt = 500
        elif '</basic-dependencies>' in line:
            record_dependencies = False
        elif '<collapsed-ccprocessed-dependencies>' in line:
            record_collapsed = True
        elif '</collapsed-ccprocessed-dependencies>' in line:
            record_collapsed = False
        if record_dependencies or record_collapsed:
            if '<dep type=' in line:
                cat = (line.split('type=\"')[1]).split('\"')[0]
            elif '<governor ' in line:
                idx_gov = (line.split('idx=\"')[1]).split('\"')[0]
                id_t_gov = 's'+id_s+'_'+idx_gov
            elif '<dependent ' in line:
                idx_dep = (line.split('idx=\"')[1]).split('\"')[0]
                id_t_dep = 's'+id_s+'_'+idx_dep
            elif (record_dependencies or record_collapsed) and '</dep>' in line:
                # Commit the dependency once per unique (cat, gov, dep).
                if cat+'#'+id_t_gov+'#'+id_t_dep not in array_rec_dep:
                    array_rec_dep.append(cat+'#'+id_t_gov+'#'+id_t_dep)
                    self.dic_nt['s'+id_s+'_'+str(index_nt)] = {'cat':cat, 'gov':id_t_gov, 'dep':id_t_dep}
                    index_nt += 1
    xmlfile.close()
    # Second pass: derive multi-word noun phrases from the dependencies.
    for id_nt in self.dic_nt:
        if re.match("(nn|amod)", self.dic_nt[id_nt]['cat']):
            # Join the NN/JJ tokens between dependent and governor into one
            # underscore-separated phrase attached to the governor.
            array_nps = self.dic_t[self.dic_nt[id_nt]['gov']]['nps']
            string = ''
            id_gov = self.dic_nt[id_nt]['gov'].split('_')[1]
            id_dep = self.dic_nt[id_nt]['dep'].split('_')[1]
            id_s = self.dic_nt[id_nt]['dep'].split('_')[0]
            for i in range(int(id_dep), int(id_gov)):
                id_next = id_s+'_'+str(i)
                if re.match("(NN|JJ)", self.dic_t[id_next]['pos']) and self.dic_t[id_next]['lemma'] not in self.stoplist:
                    string += self.dic_t[id_next]['lemma']+'_'
            string += self.dic_t[id_s+'_'+id_gov]['lemma']
            if len(string.split('_')) > 1 and string.lower() not in array_nps:
                array_nps.append(string.lower())
            self.dic_t[self.dic_nt[id_nt]['gov']]['nps'] = array_nps
        elif re.match("prep_of", self.dic_nt[id_nt]['cat']):
            # 'X of Y' between two nouns: record 'X_of_Y' on both tokens.
            id_gov = self.dic_nt[id_nt]['gov']
            id_dep = self.dic_nt[id_nt]['dep']
            if re.match("NN", self.dic_t[id_dep]['pos']) and re.match("NN", self.dic_t[id_gov]['pos']):
                array_nps = self.dic_t[id_dep]['nps']
                string = self.dic_t[id_gov]['lemma']+'_of_'+self.dic_t[id_dep]['lemma']
                array_nps.append(string.lower())
                self.dic_t[self.dic_nt[id_nt]['dep']]['nps'] = array_nps
                array_nps = self.dic_t[id_gov]['nps']
                string = self.dic_t[id_gov]['lemma']+'_of_'+self.dic_t[id_dep]['lemma']
                array_nps.append(string.lower())
                self.dic_t[self.dic_nt[id_nt]['gov']]['nps'] = array_nps
class Similarities: def __init__(self, seedfile, temp_folder, sim_measure): self.misc = Miscelaneous() seeds_file = Seeds(seedfile) self.temp_folder = temp_folder self.dic_nouns = {} self.dic_seeds = defaultdict(dict) #self.dic_seeds_freqObj = {} #self.dic_seeds_Obj = {} self.list_seeds = seeds_file.getSeeds() self.dic_measure = defaultdict(dict) self.dic_Obj2 = defaultdict(dict) self.dic_freqObj = {} self.dic_Obj = {} self.__buildHashs__(sim_measure) def __del__(self): pass def __buildHashs__(self, sim_measure): file_nouns = self.misc.openFile(self.temp_folder+'Matrix_nouns.txt', 'r') for line in file_nouns: line = re.sub('\n', '', line) doc, noun = line.split(' : ') self.dic_nouns[doc] = noun if noun in self.list_seeds: file_doc_seed = self.misc.openFile(self.temp_folder+'Matrix/'+doc+'.txt', 'r') self.dic_freqObj[doc] = 0 self.dic_Obj[doc] = 0 for line in file_doc_seed: line = re.sub('\n', '', line) modifier, noun, freq = line.split('#') self.dic_seeds[doc][modifier] = float(freq) #self.dic_seeds_freqObj[doc] += float(freq) #self.dic_seeds_Obj[doc] += 1 file_doc_seed.close() for doc_noun in self.dic_nouns: file_doc_nouns = self.misc.openFile(self.temp_folder+'Matrix/'+doc_noun+'.txt', 'r') for line in file_doc_nouns: line = re.sub('\n', '', line) modifier, noun, freq = line.split('#') self.dic_Obj2[doc_noun][modifier] = float(freq) #self.dic_freqObj[doc] += float(freq) #self.dic_Obj[doc] += 1 file_doc_nouns.close() # Colocar o limitador do array para n valores, não consumindo muita memória OU imprimir a lista em um arquivo for doc_seed in self.dic_seeds: if doc_noun != doc_seed: if sim_measure == 'jaccardMax': self.dic_measure[doc_seed][doc_noun] = self.getJaccardMaxMeasure(doc_seed, doc_noun) elif sim_measure == 'cosine': self.dic_measure[doc_seed][doc_noun] = self.getCosineMeasure(doc_seed, doc_noun) del self.dic_Obj2[doc_noun] # Deletar a hash criada Obj2 def getTopNOrderedDic(self, n): dic_measure_ordered = self.__sortTopNFromAllDic__(self.dic_measure, n) 
return dic_measure_ordered def getJaccardMaxMeasure(self, doc_seed, doc_noun): minimum = 0 maximum = 0 for attr in self.dic_seeds[doc_seed]: if self.dic_Obj2[doc_noun].has_key(attr): assoc1 = self.dic_seeds[doc_seed][attr] assoc2 = self.dic_Obj2[doc_noun][attr] minimum += min(assoc1, assoc2) maximum += max(assoc1, assoc2) elif self.dic_seeds[doc_seed].has_key(attr): maximum += self.dic_seeds[doc_seed][attr] for attr2 in self.dic_Obj2[doc_noun]: if not self.dic_seeds[doc_seed].has_key(attr2): maximum += self.dic_Obj2[doc_noun][attr2] if maximum > 0: return minimum/maximum else: return -1 def getCosineMeasure(self, doc_seed, doc_noun): intersection = 0 o1 = 0 o2 = 0 for attr in self.dic_seeds[doc_seed]: if self.dic_Obj2[doc_noun].has_key(attr): assoc1 = self.dic_seeds[doc_seed][attr] assoc2 = self.dic_Obj2[doc_noun][attr] intersection += assoc1 * assoc2 o1 += assoc1**2 o2 += assoc2**2 elif self.dic_seeds[doc_seed].has_key(attr): o1 += self.dic_seeds[doc_seed][attr]**2 for attr2 in self.dic_Obj2[doc_noun]: if not self.dic_seeds[doc_seed].has_key(attr2): o2 += self.dic_Obj2[doc_noun][attr2]**2 if o1 > 0 and o2 > 0: return intersection/math.sqrt(float(o1 * o2)) else: return -1 def __sortTopNFromAllDic__(self, dic, n): dic_terms = OrderedDict() self.dic_seeds = sorted(self.dic_seedsp) for doc in self.dic_seeds: if self.__existKeyInDic__(doc, dic): seed = self.dic_nouns[doc] dic_terms[seed] = {'terms': []} dic_related = {} for related_term in dic[doc]: dic_related[related_term] = dic[doc][related_term] if n == 0: n = None dic_ordered = sorted(dic_related.items(), key=itemgetter(1), reverse=True)[0:n] for list_ordered in dic_ordered: dic_terms[seed]['terms'].append({self.dic_nouns[list_ordered[0]]:str(list_ordered[1])}) return dic_terms def __existKeyInDic__(self, key, dic): if dic.has_key(key): return dic else: print bcolors.WARNING+'WARNING: System cannot found the term "'+key+'" in corpus'+bcolors.ENDC print '' return False def __printDic__(self, dic_terms): for seed in 
dic_terms: print 'Seed: '+seed for index_related_term in dic_terms[seed]['terms']: similarity = index_related_term[index_related_term.keys()[0]] term = index_related_term.keys()[0] print 'Related term: '+term+'\nSimilarity : '+similarity print ''
class Contexts: def __init__(self, temp_folder): self.temp_folder = temp_folder self.misc = Miscelaneous() self.dic_an = {} self.dic_sv = {} self.dic_vo = {} self.matrix_relations = ['AN', 'SV', 'VO'] for type_relation in self.matrix_relations: self.__loadTerms__(type_relation) self.__writeDic__() def __del__(self): pass def __loadTerms__(self, type_relation): try: root, dirs, files = os.walk(self.temp_folder + '' + type_relation + '/2Order/').next()[:3] except IOError: print bcolors.FAIL + 'ERROR: It was not possible to open the ' + self.temp_folder + ' folder' + bcolors.ENDC sys.exit(2) qty_documents = len(files) i = 0 for corpus_file in files: i += 1 if re.match('.*txt$', corpus_file): relation_file = self.misc.openFile(root + '' + corpus_file, 'r') for line in relation_file: line = re.sub('\n', '', line) relation, noun, frequency = line.split('#') if type_relation == 'AN': self.__addElementDicAN__(relation + '#' + noun, frequency) elif type_relation == 'SV': self.__addElementDicSV__(relation + '#' + noun, frequency) elif type_relation == 'VO': self.__addElementDicVO__(relation + '#' + noun, frequency) self.misc.progress_bar(i, qty_documents, 100) def __addElementDicAN__(self, relation, frequency): if self.dic_an.has_key(relation): self.dic_an[relation] += int(frequency) else: self.dic_an[relation] = int(frequency) def __addElementDicSV__(self, relation, frequency): if self.dic_sv.has_key(relation): self.dic_sv[relation] += int(frequency) else: self.dic_sv[relation] = int(frequency) def __addElementDicVO__(self, relation, frequency): if self.dic_vo.has_key(relation): self.dic_vo[relation] += int(frequency) else: self.dic_vo[relation] = int(frequency) def __writeDic__(self): for type_relation in self.matrix_relations: file_relation = self.misc.openFile( self.temp_folder + '' + type_relation + '/Relations.txt', 'w') dic_relation = self.getDic(type_relation) for id_relation in dic_relation: file_relation.write(id_relation + '#' + str(dic_relation[id_relation]) + 
'\n') file_relation.close() """ Get and Print methods """ def getDic(self, type_relation): if type_relation == 'AN': return self.dic_an elif type_relation == 'SV': return self.dic_sv elif type_relation == 'VO': return self.dic_vo def printDic(self, type_relation): dic_relation = getDic(type_relation) for id_relation in self.dic_relation: print id_relation + ' = ' + str(self.dic_relation[id_relation])
def __buildDics__(self, filename):
    """Populates self.dic_t (tokens) and self.dic_nt (dependencies) from a
    parser XML file — the tag names match Stanford CoreNLP XML output
    (TODO confirm), then derives multi-word noun phrases ('nps').

    dic_t[id_t]  -> {'word','lemma','pos','ner','nps'} per token, keyed 's<sent>_<tok>'.
    dic_nt[id]   -> {'cat','gov','dep'} per dependency, ids counted from 500 per sentence.
    """
    misc = Miscelaneous()
    xmlfile = misc.openFile(filename, "r")
    self.stoplist = misc.getStoplist("../misc/stoplist.txt")
    record_dependencies = False
    record_collapsed = False
    for line in xmlfile:
        line = re.sub("\n", "", line)
        if "<sentence " in line:
            id_s = (line.split('id="')[1]).split('"')[0]
            array_rec_dep = []  # dependencies already recorded for this sentence
        elif "<token " in line:
            id_t = "s" + id_s + "_" + (line.split('id="')[1]).split('"')[0]
        elif "<word>" in line:
            word = (line.split("<word>")[1]).split("</word>")[0]
        elif "<lemma>" in line:
            lemma = (line.split("<lemma>")[1]).split("</lemma>")[0]
        elif "<POS>" in line:
            pos = (line.split("<POS>")[1]).split("</POS>")[0]
        elif "<NER>" in line:
            ner = (line.split("<NER>")[1]).split("</NER>")[0]
        elif "</token>" in line:
            # Token complete: store it; nouns seed their own 'nps' list.
            self.dic_t[id_t] = {"word": word, "lemma": lemma, "pos": pos, "ner": ner, "nps": []}
            if re.match("NN|NNP", pos):
                array_nps = [lemma.lower()]
                self.dic_t[id_t]["nps"] = array_nps
        elif "<basic-dependencies>" in line:
            record_dependencies = True
            index_nt = 500  # dependency ids start at 500 to avoid token-id clashes
        elif "</basic-dependencies>" in line:
            record_dependencies = False
        elif "<collapsed-ccprocessed-dependencies>" in line:
            record_collapsed = True
        elif "</collapsed-ccprocessed-dependencies>" in line:
            record_collapsed = False
        if record_dependencies or record_collapsed:
            if "<dep type=" in line:
                cat = (line.split('type="')[1]).split('"')[0]
            elif "<governor " in line:
                idx_gov = (line.split('idx="')[1]).split('"')[0]
                id_t_gov = "s" + id_s + "_" + idx_gov
            elif "<dependent " in line:
                idx_dep = (line.split('idx="')[1]).split('"')[0]
                id_t_dep = "s" + id_s + "_" + idx_dep
            elif (record_dependencies or record_collapsed) and "</dep>" in line:
                # Deduplicate: the same dependency can appear in both the
                # basic and the collapsed-ccprocessed sections.
                if cat + "#" + id_t_gov + "#" + id_t_dep not in array_rec_dep:
                    array_rec_dep.append(cat + "#" + id_t_gov + "#" + id_t_dep)
                    self.dic_nt["s" + id_s + "_" + str(index_nt)] = {"cat": cat, "gov": id_t_gov, "dep": id_t_dep}
                    index_nt += 1
    xmlfile.close()
    # Second pass: build noun phrases from nn/amod and prep_of dependencies.
    for id_nt in self.dic_nt:
        if re.match("(nn|amod)", self.dic_nt[id_nt]["cat"]):
            array_nps = self.dic_t[self.dic_nt[id_nt]["gov"]]["nps"]
            string = ""
            id_gov = self.dic_nt[id_nt]["gov"].split("_")[1]
            id_dep = self.dic_nt[id_nt]["dep"].split("_")[1]
            id_s = self.dic_nt[id_nt]["dep"].split("_")[0]
            # Concatenate every noun/adjective lemma between dependent and
            # governor positions — assumes dependent precedes governor
            # (TODO confirm for amod).
            for i in range(int(id_dep), int(id_gov)):
                id_next = id_s + "_" + str(i)
                if (
                    re.match("(NN|JJ)", self.dic_t[id_next]["pos"])
                    and self.dic_t[id_next]["lemma"] not in self.stoplist
                ):
                    string += self.dic_t[id_next]["lemma"] + "_"
            string += self.dic_t[id_s + "_" + id_gov]["lemma"]
            # Record only genuine multi-word phrases, once.
            if len(string.split("_")) > 1 and string.lower() not in array_nps:
                array_nps.append(string.lower())
                self.dic_t[self.dic_nt[id_nt]["gov"]]["nps"] = array_nps
        elif re.match("prep_of", self.dic_nt[id_nt]["cat"]):
            # 'X of Y' phrases are attached to both the governor and dependent tokens.
            id_gov = self.dic_nt[id_nt]["gov"]
            id_dep = self.dic_nt[id_nt]["dep"]
            if re.match("NN", self.dic_t[id_dep]["pos"]) and re.match("NN", self.dic_t[id_gov]["pos"]):
                array_nps = self.dic_t[id_dep]["nps"]
                string = self.dic_t[id_gov]["lemma"] + "_of_" + self.dic_t[id_dep]["lemma"]
                array_nps.append(string.lower())
                self.dic_t[self.dic_nt[id_nt]["dep"]]["nps"] = array_nps
                array_nps = self.dic_t[id_gov]["nps"]
                string = self.dic_t[id_gov]["lemma"] + "_of_" + self.dic_t[id_dep]["lemma"]
                array_nps.append(string.lower())
                self.dic_t[self.dic_nt[id_nt]["gov"]]["nps"] = array_nps
def __init__(self, output_file, max_qty_terms):
    """Remember the output target and term limit, and open the thesaurus
    file for writing."""
    self.max_qty_terms = max_qty_terms
    self.output_file = output_file
    file_helper = Miscelaneous()
    self.thesaurus_file = file_helper.openFile(output_file, 'w')
class MutualInformation:
    # Collects seed-centred noun-noun bigram counts from a bigram frequency
    # file and shells out to the external 'statistic.pl' (NSP) script to score
    # them with true mutual information (TMI).

    def __init__(self, temp_folder, file_input, seedfile, mi_precision):
        # The window size is encoded in the input file name between a 1-char
        # prefix and a 23-char suffix — TODO confirm the naming scheme.
        self.window_size = file_input[1:-23]
        self.temp_folder = temp_folder
        self.misc = Miscelaneous()
        seeds_file = Seeds(seedfile)
        self.list_seeds = seeds_file.getSeeds()
        self.first_line = ''
        self.dic_tuplas = defaultdict(dict)
        self.dic_terms = OrderedDict()
        self.__buildMI__(file_input, mi_precision)

    def __del__(self):
        pass

    def __buildMI__(self, file_input, mi_precision):
        """Builds dic_tuplas ('seed<>term<>' -> frequency dict), writes it to
        <input>_to_MI.txt and runs statistic.pl over it."""
        filename_input = file_input[:-4]
        file_bigrams = self.misc.openFile(self.temp_folder+''+file_input, 'r')
        for line in file_bigrams:
            if self.first_line != '':
                # Expected layout: term1__T<>term2__T<>pair_freq f1 f2
                part = line.split('<>')
                term_type1 = part[0]
                term_type2 = part[1]
                term1, type1 = term_type1.split('__')
                term2, type2 = term_type2.split('__')
                freq_tupla = part[2].split(' ')[0]
                freq_term1 = part[2].split(' ')[1]
                freq_term2 = part[2].split(' ')[2]
                # Only noun-noun pairs of distinct terms are kept.
                if type1 == 'N' and type2 == 'N' and term1 != term2:
                    if term1 in self.list_seeds:
                        # NOTE(review): repeated pairs overwrite rather than
                        # accumulate in this branch — confirm that is intended.
                        self.dic_tuplas[term1+'<>'+term2+'<>']['freq_tupla'] = int(freq_tupla)
                        self.dic_tuplas[term1+'<>'+term2+'<>']['freq_term1'] = int(freq_term1)
                        self.dic_tuplas[term1+'<>'+term2+'<>']['freq_term2'] = int(freq_term2)
                    elif term2 in self.list_seeds:
                        # NOTE(review): keys are 'seed<>term<>' composites, so
                        # has_key(term2) looks like it can never be true and the
                        # '+=' branch appears unreachable — verify; the check
                        # probably should use term2+'<>'+term1+'<>'.
                        if self.dic_tuplas.has_key(term2):
                            self.dic_tuplas[term2+'<>'+term1+'<>']['freq_tupla'] += int(freq_tupla)
                        else:
                            self.dic_tuplas[term2+'<>'+term1+'<>']['freq_tupla'] = int(freq_tupla)
                        # Seed occurs second in the bigram, so the per-term
                        # frequencies are swapped when stored.
                        self.dic_tuplas[term2+'<>'+term1+'<>']['freq_term1'] = int(freq_term2)
                        self.dic_tuplas[term2+'<>'+term1+'<>']['freq_term2'] = int(freq_term1)
            else:
                # First line of the bigram file is a header kept verbatim.
                self.first_line = line
        file_bigrams.close()
        file_relations = self.misc.openFile(self.temp_folder+''+filename_input+'_to_MI.txt', 'w')
        file_relations.write(self.first_line)
        for tupla in self.dic_tuplas:
            file_relations.write(tupla+''+str(self.dic_tuplas[tupla]['freq_tupla'])+' '+str(self.dic_tuplas[tupla]['freq_term1'])+' '+str(self.dic_tuplas[tupla]['freq_term2'])+'\n')
        file_relations.close()
        # External call; results land in IM<window>_SecondOrder.txt.
        command = "statistic.pl tmi.pm -precision "+mi_precision+' '+self.temp_folder+'IM'+self.window_size+'_SecondOrder.txt '+self.temp_folder+''+filename_input+'_to_MI.txt'
        os.system(command)

    def getDicMI(self):
        """Parses the statistic.pl output into
        {seed: {'terms': [{related_term: tmi_score}, ...]}}."""
        file_mi = self.misc.openFile(self.temp_folder+'IM'+self.window_size+'_SecondOrder.txt', 'r')
        first_line = False
        list_used_seeds = []
        for line in file_mi:
            if first_line:
                # Fields are separated by spaces, '|' or the '<>' delimiters.
                seed, none, term, none, rank, true_mi, freq_1, freq_2, freq_3 = re.split(r'[ |<>]', line)
                if seed in self.list_seeds and seed not in list_used_seeds:
                    list_used_seeds.append(seed)
                    self.dic_terms[seed] = {'terms': []}
                if seed in self.list_seeds:
                    self.dic_terms[seed]['terms'].append({term:true_mi})
            else:
                # Skip the header line of the statistic.pl output.
                first_line = True
        return self.dic_terms

    def getDicBigrams(self):
        return self.dic_tuplas

    def printDicBigrams(self):
        # Debug dump of the raw bigram dictionary.
        print self.first_line,
        for tupla in self.dic_tuplas:
            print tupla,self.dic_tuplas[tupla]['freq_tupla'],self.dic_tuplas[tupla]['freq_term1'],self.dic_tuplas[tupla]['freq_term2']
class Matrix:
    # Merges the per-relation (AN/SV/VO) 3Order row files into per-noun
    # "document" files under <temp>/Matrix/.  The build/SVD steps are present
    # but currently disabled (commented out) in __init__.

    def __init__(self, temp_folder, svd_dimension, record_intermediate):
        self.misc = Miscelaneous()
        self.temp_folder = temp_folder
        self.svd_dimension = svd_dimension
        self.dic_noun = OrderedDict()
        self.dic_noun_index = {}
        self.dic_modifier = OrderedDict()
        self.dic_modifier_index = {}
        self.row = []
        self.col = []
        self.data = []
        self.dic_matrix = {}
        #self.line_data = ''
        list_relations = ['AN', 'SV', 'VO']
        string_files_matrix = ''
        for relation in list_relations:
            self.type_relation = relation
            #self.buildMatrixFromFile()
            #self.applySvd()
            #if record_intermediate:
            #    logfile.writeLogfile('- Recording SVD matrix to '+relation+' in a file...')
            #    self.writeSvd()
            #self.buildRelationsSvd()
            string_files_matrix += self.temp_folder+''+relation+'/Matrix_row.txt '+self.temp_folder+''+relation+'/Matrix_column.txt '
            file_matrix = self.misc.openFile(self.temp_folder+''+relation+'/Matrix_row.txt', 'r')
            for line in file_matrix:
                self.__loadDicMatrix__(line, relation)
            file_matrix.close()
        file_doc_matrix = self.misc.openFile(self.temp_folder+'/Matrix_nouns.txt', 'w')
        number_document = 0
        for noun in self.dic_matrix:
            file_doc_matrix.write(str(number_document)+' : '+noun+'\n')
            # dic_matrix values always start with a space (see
            # __loadDicMatrix__), which separates them from 'cat'/'rm -Rf'.
            command = 'cat'+self.dic_matrix[noun]+' > '+self.temp_folder+'Matrix/'+str(number_document)+'.txt'
            os.system(command)
            if not record_intermediate:
                command = 'rm -Rf'+self.dic_matrix[noun]+' '+string_files_matrix
                os.system(command)
            number_document += 1
        file_doc_matrix.close()

    def __del__(self):
        pass

    def __loadDicMatrix__(self, line, relation):
        # Maps noun -> space-separated list of its 3Order row files, appending
        # across relations; each line is '<row_index> : <noun>'.
        line = re.sub('\n', '', line)
        row, noun = line.split(' : ')
        if self.dic_matrix.has_key(noun):
            self.dic_matrix[noun] = self.dic_matrix[noun]+' '+self.temp_folder+''+relation+'/3Order/'+row+'.txt'
        else:
            self.dic_matrix[noun] = ' '+self.temp_folder+''+relation+'/3Order/'+row+'.txt'

    def buildMatrixFromFile(self):
        """Reads <relation>/Relations.txt ('modifier#noun#frequency') into a
        sparse (noun x modifier) structure with log(freq+1) weights and records
        the row/column index-to-label mappings."""
        index_modifier = 0
        index_noun = 0
        line_row = ''
        line_column = ''
        file_relations = self.misc.openFile(self.temp_folder+''+self.type_relation+'/Relations.txt', 'r')
        file_row = self.misc.openFile(self.temp_folder+''+self.type_relation+'/Matrix_row.txt', 'w')
        file_column = self.misc.openFile(self.temp_folder+''+self.type_relation+'/Matrix_column.txt', 'w')
        #file_data = self.misc.openFile(self.temp_folder+''+self.type_relation+'/Matrix_data.txt', 'w')
        for line in file_relations:
            line = re.sub('\n', '', line)
            modifier, noun, frequency = line.split('#')
            # Assign a dense index to each new modifier / noun label.
            if self.dic_modifier.has_key(modifier):
                index_m = self.dic_modifier[modifier]
            else:
                self.dic_modifier[modifier] = index_modifier
                self.dic_modifier_index[index_modifier] = modifier
                index_m = index_modifier
                index_modifier = index_modifier + 1
            if self.dic_noun.has_key(noun):
                index_n = self.dic_noun[noun]
            else:
                self.dic_noun[noun] = index_noun
                self.dic_noun_index[index_noun] = noun
                index_n = index_noun
                index_noun = index_noun + 1
            self.row.append(int(index_n))
            self.col.append(int(index_m))
            # Natural-log smoothing of the raw counts.
            self.data.append(math.log(float(frequency)+1, e))
            line_row += str(index_n)+' '
            line_column += str(index_m)+' '
            #self.line_data += str(frequency)+' '
        file_relations.close()
        for modifier in self.dic_modifier:
            file_column.write(str(self.dic_modifier[modifier])+' : '+modifier+'\n')
        for noun in self.dic_noun:
            file_row.write(str(self.dic_noun[noun])+' : '+noun+'\n')
        #file_data.write('<row>\n')
        #file_data.write(line_row[0:-1]+'\n')
        #file_data.write('<column>\n')
        #file_data.write(line_column[0:-1]+'\n')
        #file_data.write('<data>\n')
        #file_data.write(self.line_data[0:-1]+'\n')
        file_row.close()
        file_column.close()
        #file_data.close()

    def applySvd(self):
        """Runs sparse SVD (sparsesvd) over the collected counts and rebuilds
        the rank-reduced dense matrix as self.svd_matrix."""
        len_row = max(self.row)+1
        len_col = max(self.col)+1
        print 'Applying SVD with ROW: '+str(len_row)+' and COL: '+str(len_col)
        sparse_matrix = scipy.sparse.csc_matrix( (self.data,(self.row,self.col)), shape=(len_row,len_col) )
        print 'sparsed matrix'
        Ut, Sigma, Vt = sparsesvd(sparse_matrix, self.svd_dimension)
        print 'Ut Sigma Vt done!'
        # Release the sparse matrix before building the dense reconstruction.
        sparse_matrix = array(0)
        print 'Mounting Matrix SVD'
        self.svd_matrix = numpy.dot(Ut.T, numpy.dot(numpy.diag(Sigma), Vt))
        print 'Done!'
        print Ut
        print '\n'
        print Sigma
        print '\n'
        print Vt
        Ut = None
        Sigma = None
        Vt = None
        #Ut = array(0)
        #Sigma = array(0)
        #Vt = array(0)

    def buildRelationsSvd(self):
        """Writes each reduced matrix row to <relation>/3Order/<n>.txt as
        'modifier#noun#value' lines, one file per noun index."""
        index_noun = 0
        for row_data in self.svd_matrix:
            index_modifier = 0
            file_relations_svd = self.misc.openFile(self.temp_folder+''+self.type_relation+'/3Order/'+str(index_noun)+'.txt', 'w')
            for value in row_data:
                file_relations_svd.write(self.dic_modifier_index[index_modifier]+'#'+self.dic_noun_index[index_noun]+'#'+str(value)+'\n')
                index_modifier += 1
            index_noun += 1
            file_relations_svd.close()
        # Free the dense matrix once everything is on disk.
        self.svd_matrix = array(0)

    def writeSvd(self):
        # Raw dump of the reduced matrix, space-separated values per row.
        file_matrix_svd = self.misc.openFile(self.temp_folder+''+self.type_relation+'/MatrixDataSvd.txt', 'w')
        for row_data in self.svd_matrix:
            for value in row_data:
                file_matrix_svd.write(str(value)+' ')
            file_matrix_svd.write('\n');
        file_matrix_svd.close()
class Parameters: def __init__(self, type_atc, argv): self.input_folder = '../Data/Corpus/' self.output_folder = '../Data/Output/' self.temp_folder = '../Data/Temp/' self.seeds_file = '../misc/seeds.txt' self.stoplist_file = '../misc/stoplist.txt' self.misc = Miscelaneous() file_parameters = self.misc.openFile('../misc/parameters.cfg', 'r') for line in file_parameters: if re.match('contexts', line): contexts = line.split('=')[1].replace('\n','') if contexts == 'On': self.contexts = True else: self.contexts = False if re.match('language', line): self.language = line.split('=')[1].replace('\n','') if re.match('max_qty_terms', line): self.max_qty_terms = line.split('=')[1].replace('\n','') if re.match('mi_precision', line): self.mi_precision = line.split('=')[1].replace('\n','') if re.match('min_word_size', line): self.min_word_size = line.split('=')[1].replace('\n','') if re.match('sim_measure', line): self.sim_measure = line.split('=')[1].replace('\n','') if re.match('svd_dimension', line): self.svd_dimension = line.split('=')[1].replace('\n','') if re.match('window_size', line): self.window_size = line.split('=')[1].replace('\n','') if re.match('record_log', line): record_log = line.split('=')[1].replace('\n','') if record_log == 'On': self.record_log = True else: self.record_log = False if re.match('record_intermediate', line): record_intermediate = line.split('=')[1].replace('\n','') if record_intermediate == 'On': self.record_intermediate = True else: self.record_intermediate = False file_parameters.close() try: opts, args = getopt.getopt(argv,\ "h:c:i:o:m:M:p:w:d:t:l:L:r:R:s:S:", \ ["help", "contexts=", "input=", "output=", "min_size=", "max_terms=", "mi_precision=", "window_size=", "svd_dimension=", "temp=", "language=", "record_log=", "record_intermediate=", "seeds=", "sim_measure=", "stoplist="]) except getopt.GetoptError: self.usage(type_atc) sys.exit(2) for opt, arg in opts: if opt in ("-h", "--help"): self.help() sys.exit(0) elif opt in ("-c", 
"--contexts"): if arg == 'On': self.contexts = True elif arg == 'Off': self.contexts = False elif opt in ("-i", "--input"): if os.path.isdir(arg): self.input_folder = arg else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.input_folder+' as input folder'+bcolors.ENDC elif opt in ("-o", "--output"): if os.path.isdir(arg): self.output_folder = arg else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.output_folder+' as output folder'+bcolors.ENDC elif opt in ("-t", "--temp"): if os.path.isdir(arg): self.temp_folder = arg else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.temp_folder+' as temporary folder'+bcolors.ENDC elif opt in ("-m", "--min_size"): self.min_word_size = arg elif opt in ("-M", "--max_terms"): self.max_qty_terms = arg elif opt in ("-l", "--language"): if arg == 'en' or arg == 'pt': self.language = arg else: print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported language, setting to "'+self.language+'" as language'+bcolors.ENDC elif opt in ("-r", "--record_log"): if arg == 'On': self.record_log = True elif arg == 'Off': self.record_log = False elif opt in ("-R", "--record_intermediate"): if arg == 'On': self.record_intermediate = True elif arg == 'Off': self.record_intermediate = False else: print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported option to log recording, setting to "'+record_log+'" as default option'+bcolors.ENDC elif opt in ("-s", "--seeds"): if os.path.isfile(arg): self.seeds_file = arg else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a file, setting '+self.seeds+' as seeds file'+bcolors.ENDC elif opt in ("-S", "--sim_measure"): if arg == 'mutual_information' \ or arg == 'baseline' \ or arg == 'dicebin' \ or arg == 'dicemin' \ or arg == 'jaccard' \ or arg == 'cosinebin' \ or arg == 'cosine' \ or arg == 'city' \ or arg == 'euclidean' \ or arg == 'js' \ or arg == 'lin' \ or arg == 'jaccardmax': self.sim_measure = 
arg else: print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported similarity measure, setting to "'+self.sim_measure+'" as default similarity measure. \nSimilarity measures supported by the system:\n - mutual_information [used only in First Order construction]\n - baseline\n - dicebin\n - dicemin\n - jaccard\n - cosinebin\n - cosine\n - city\n - euclidean\n - js\n - lin\n - jaccardmax'+bcolors.ENDC elif opt in ("-L", "--stoplist"): if os.path.isfile(arg): self.stoplist_file = arg else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a file, setting '+self.seeds+' as stoplist file'+bcolors.ENDC if type_atc == 'FirstOrder': if opt in ("-p", "--mi_precision"): self.mi_precision = arg elif opt in ("-w", "--window_size"): self.window_size = arg elif type_atc == 'HigherOrder': if opt in ("-d", "--svd_dimension"): self.svd_dimension = arg def __del__(self): pass def getContexts(self): return self.contexts def getInputFolder(self): return self.input_folder def getLanguage(self): return self.language def getMinWordSize(self): return self.min_word_size def getMaxQtyTerms(self): return self.max_qty_terms def getMIPrecision(self): return self.mi_precision def getOutputFolder(self): return self.output_folder def getRecordLog(self): return self.record_log def getRecordIntermediate(self): return self.record_intermediate def getSeedsFile(self): return self.seeds_file def getSimilarityMeasure(self): return self.sim_measure def getStoplistFile(self): return self.stoplist_file def getSvdDimension(self): return self.svd_dimension def getWindowSize(self): return self.window_size def getTempFolder(self): return self.temp_folder def usage(self, type_atc): if type_atc == 'FirstOrder': usage = """ Usage: python main_FirstOrder.py [OPTION] [FOLDER]... 
[OPTION] [PARAMETER]...\n -c --contexts= Input folder containing the syntactic context files -i --input= Input folder containing the corpus -l --language= Language of the corpus data -L --stoplist= File containing a list of stopwords -m --min_size= Minimum size of a word to be computed -M --max_terms= Max number of similar terms recorded in the XML file -o --output= Output folder to receive the data -p --mi_precision= Precision of the Mutual Information result -r --record_log= Enable/Disable log file recording -R --record_intermediate= Enable/Disable intermediate files recording -s --seeds= File containing seeds to the thesaurus -S --sim_measure= Metric to compute the similarity between seed and related terms -w --window_size= Size of the window to compute the correlation analysis -t --temp= Temp folder to receive temporary data -h --help Display this help and exit """ elif type_atc == 'HigherOrder': usage = """ Usage: python main_HigherOrder.py [OPTION] [FOLDER]... [OPTION] [PARAMETER]...\n -c --contexts= Input folder containing the syntactic context files -d --svd_dimension= Number of dimensions to reduce the SVD -i --input= Input folder containing the corpus -l --language= Language of the corpus data -L --stoplist= File containing a list of stopwords -m --min_size= Minimum size of a word to be computed -M --max_terms= Max number of similar terms recorded in the XML file -o --output= Output folder to receive the corpus -r --record_log= Enable/Disable log file recording -R --record_intermediate= Enable/Disable intermediate files recording -s --seeds= File containing seeds to the thesaurus -S --sim_measure= Metric to compute the similarity between seed and related terms -t --temp= Temp folder to receive temporary data -h --help Display this help and exit """ else: usage = """ Usage: python main_SecondOrder.py [OPTION] [FOLDER]... 
[OPTION] [PARAMETER]...\n -c --contexts= Input folder containing the syntactic context files -i --input= Input folder containing the corpus -l --language= Language of the corpus data -L --stoplist= File containing a list of stopwords -m --min_size= Minimum size of a word to be computed -M --max_terms= Max number of similar terms recorded in the XML file -o --output= Output folder to receive the corpus -r --record_log= Enable/Disable log file recording -R --record_intermediate= Enable/Disable intermediate files recording -s --seeds= File containing seeds to the thesaurus -S --sim_measure= Metric to compute the similarity between seed and related terms -t --temp= Temp folder to receive temporary data -h --help Display this help and exit """ print usage def help(self): help = """ HELP FILE: -----------------------------------------------------------------------------------------------\n [COMMAND] $python ['main' program].py [OPTION] [FOLDER]... [OPTION] [PARAMETER]...\n [OPTION] [FOLDER] ... 
[OPTION] [PARAMETER] -c --contexts= Input folder containing the sybtactic context files Default option: 'Off' [The system loads the corpus folder] Supported options: 'On' and 'Off' -d --svd_dimension= Number of dimensions to reduce the SVD [Used only in main_HigherOrder.py]\n -i --input= Input folder containing the corpus Default folder: '../Data/Corpus/'\n -l --language= Language of the corpus data Default language: 'en' Supported languages: 'en' [English] and 'pt' [Portuguese]\n -L --stoplist= File containing a list of stopwords Default file: '../misc/stoplist.txt'\n -m --min_size= Minimum size of a word to be computed Default size: '3' letters\n -M --max_terms= Max number of similar terms recorded in the XML file Default max: '10' related terms\n -o --output= Output folder to receive the data Default output: '../Data/Output/'\n -p --mi_precision= Precision of the Mutual Information result [Used only in main_FirstOrder.py with --sim_measure=mi_information] Default precision: 10\n -r --record_log= Enable/Disable log file recording Default option: 'Off' [Log file is recorded in ../misc/application.log] Supported options: 'On' and 'Off' -R --record_intermediate= Enable/Disable intermediate files recording Default option: 'Off' Supported options: 'On' and 'Off' [Intermediate files are recorded in '../Temp/AN/', '../Temp/SV/', and '../Temp/VO/'] -s --seeds= File containing seeds to the thesaurus Default file: '../misc/seeds.txt'\n -S --sim_measure= Metric to compute the similarity between seed and related terms Default measure: 'jaccardmax' Supported measures: 'mutual_information', 'baseline', 'dicebin' 'dicemin', 'jaccard', 'cosinebin', 'cosine' 'city', 'euclidean', 'js', 'lin', 'jaccardmax'\n -w --window_size= Size of the window to compute the correlation analysis [Used only in main_FirstOrder.py] Default size: '20'\n -t --temp= Temp folder to receive temporary data Default folder: '../Data/Temp/'\n -h --help Display this help and exit\n """ print help
class Matrix: def __init__(self, input_file, temp_folder, svd_dimension): self.misc = Miscelaneous() self.temp_folder = temp_folder self.svd_dimension = svd_dimension self.dic_column = OrderedDict() self.dic_column_index = {} self.dic_row = OrderedDict() self.dic_row_index = {} self.array_row = [] self.array_col = [] self.array_data = [] self.dic_matrix = {} string_files_matrix = "" self.buildMatrixFromFile(input_file) self.applySvd() self.writeSvd() def __del__(self): pass def buildMatrixFromFile(self, input_file): index_row = 0 index_column = 0 line_row = "" line_column = "" line_data = "" file_input = self.misc.openFile(input_file, "r") file_row = self.misc.openFile(self.temp_folder + "Matrix_row.txt", "w") file_column = self.misc.openFile(self.temp_folder + "Matrix_column.txt", "w") file_data = self.misc.openFile(self.temp_folder + "Matrix_data.txt", "w") for line in file_input: line = re.sub("\n", "", line) row, column, frequency = line.split("#") if self.dic_row.has_key(row): index_m = self.dic_row[row] else: self.dic_row[row] = index_row self.dic_row_index[index_row] = row index_m = index_row index_row = index_row + 1 if self.dic_column.has_key(column): index_n = self.dic_column[column] else: self.dic_column[column] = index_column self.dic_column_index[index_column] = column index_n = index_column index_column = index_column + 1 self.array_row.append(int(index_n)) self.array_col.append(int(index_m)) log_frequency = math.log(float(frequency) + 1, e) self.array_data.append(float(frequency)) line_row += str(index_n) + " " line_column += str(index_m) + " " line_data += str(frequency) + " " file_input.close() for row in self.dic_row: file_row.write(str(self.dic_row[row]) + " : " + row + "\n") for column in self.dic_column: file_column.write(str(self.dic_column[column]) + " : " + column + "\n") file_data.write("<row>\n") file_data.write(line_row[0:-1] + "\n") file_data.write("<column>\n") file_data.write(line_column[0:-1] + "\n") file_data.write("<data>\n") 
file_data.write(line_data[0:-1] + "\n") file_row.close() file_column.close() file_data.close() def applySvd(self): len_row = max(self.array_row) + 1 len_col = max(self.array_col) + 1 print "Applying SVD with ROW: " + str(len_row) + " and COL: " + str(len_col) sparse_matrix = scipy.sparse.csc_matrix( (self.array_data, (self.array_row, self.array_col)), shape=(len_row, len_col) ) print "sparsed matrix" Ut, Sigma, Vt = sparsesvd(sparse_matrix, self.svd_dimension) print "U Sigma Vt done!" sparse_matrix = array(0) print "Mounting Matrix SVD" self.svd_matrix = numpy.dot(Ut.T, numpy.dot(numpy.diag(Sigma), Vt)) print "Done!" print Ut.T print "\n" print Sigma print "\n" print Vt print "\n" print self.svd_matrix.T print "\n" Ut = None Sigma = None Vt = None # Ut = array(0) # Sigma = array(0) # Vt = array(0) def writeSvd(self): file_matrix_svd = self.misc.openFile(self.temp_folder + "/MatrixDataSvd.txt", "w") row_number = 0 for row_data in self.svd_matrix.T: column_number = 0 for value in row_data: file_matrix_svd.write( self.dic_row_index[row_number] + "#" + self.dic_column_index[column_number] + "#" + str(value) + "\n" ) column_number += 1 row_number += 1 file_matrix_svd.close()
def __init__(self, type_atc, argv): self.input_folder = '../Data/Corpus/' self.output_folder = '../Data/Output/' self.temp_folder = '../Data/Temp/' self.seeds_file = '../misc/seeds.txt' self.stoplist_file = '../misc/stoplist.txt' self.misc = Miscelaneous() file_parameters = self.misc.openFile('../misc/parameters.cfg', 'r') for line in file_parameters: if re.match('contexts', line): contexts = line.split('=')[1].replace('\n','') if contexts == 'On': self.contexts = True else: self.contexts = False if re.match('language', line): self.language = line.split('=')[1].replace('\n','') if re.match('max_qty_terms', line): self.max_qty_terms = line.split('=')[1].replace('\n','') if re.match('mi_precision', line): self.mi_precision = line.split('=')[1].replace('\n','') if re.match('min_word_size', line): self.min_word_size = line.split('=')[1].replace('\n','') if re.match('sim_measure', line): self.sim_measure = line.split('=')[1].replace('\n','') if re.match('svd_dimension', line): self.svd_dimension = line.split('=')[1].replace('\n','') if re.match('window_size', line): self.window_size = line.split('=')[1].replace('\n','') if re.match('record_log', line): record_log = line.split('=')[1].replace('\n','') if record_log == 'On': self.record_log = True else: self.record_log = False if re.match('record_intermediate', line): record_intermediate = line.split('=')[1].replace('\n','') if record_intermediate == 'On': self.record_intermediate = True else: self.record_intermediate = False file_parameters.close() try: opts, args = getopt.getopt(argv,\ "h:c:i:o:m:M:p:w:d:t:l:L:r:R:s:S:", \ ["help", "contexts=", "input=", "output=", "min_size=", "max_terms=", "mi_precision=", "window_size=", "svd_dimension=", "temp=", "language=", "record_log=", "record_intermediate=", "seeds=", "sim_measure=", "stoplist="]) except getopt.GetoptError: self.usage(type_atc) sys.exit(2) for opt, arg in opts: if opt in ("-h", "--help"): self.help() sys.exit(0) elif opt in ("-c", "--contexts"): if arg == 
'On': self.contexts = True elif arg == 'Off': self.contexts = False elif opt in ("-i", "--input"): if os.path.isdir(arg): self.input_folder = arg else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.input_folder+' as input folder'+bcolors.ENDC elif opt in ("-o", "--output"): if os.path.isdir(arg): self.output_folder = arg else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.output_folder+' as output folder'+bcolors.ENDC elif opt in ("-t", "--temp"): if os.path.isdir(arg): self.temp_folder = arg else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.temp_folder+' as temporary folder'+bcolors.ENDC elif opt in ("-m", "--min_size"): self.min_word_size = arg elif opt in ("-M", "--max_terms"): self.max_qty_terms = arg elif opt in ("-l", "--language"): if arg == 'en' or arg == 'pt': self.language = arg else: print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported language, setting to "'+self.language+'" as language'+bcolors.ENDC elif opt in ("-r", "--record_log"): if arg == 'On': self.record_log = True elif arg == 'Off': self.record_log = False elif opt in ("-R", "--record_intermediate"): if arg == 'On': self.record_intermediate = True elif arg == 'Off': self.record_intermediate = False else: print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported option to log recording, setting to "'+record_log+'" as default option'+bcolors.ENDC elif opt in ("-s", "--seeds"): if os.path.isfile(arg): self.seeds_file = arg else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a file, setting '+self.seeds+' as seeds file'+bcolors.ENDC elif opt in ("-S", "--sim_measure"): if arg == 'mutual_information' \ or arg == 'baseline' \ or arg == 'dicebin' \ or arg == 'dicemin' \ or arg == 'jaccard' \ or arg == 'cosinebin' \ or arg == 'cosine' \ or arg == 'city' \ or arg == 'euclidean' \ or arg == 'js' \ or arg == 'lin' \ or arg == 'jaccardmax': self.sim_measure = arg else: print 
bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported similarity measure, setting to "'+self.sim_measure+'" as default similarity measure. \nSimilarity measures supported by the system:\n - mutual_information [used only in First Order construction]\n - baseline\n - dicebin\n - dicemin\n - jaccard\n - cosinebin\n - cosine\n - city\n - euclidean\n - js\n - lin\n - jaccardmax'+bcolors.ENDC elif opt in ("-L", "--stoplist"): if os.path.isfile(arg): self.stoplist_file = arg else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a file, setting '+self.seeds+' as stoplist file'+bcolors.ENDC if type_atc == 'FirstOrder': if opt in ("-p", "--mi_precision"): self.mi_precision = arg elif opt in ("-w", "--window_size"): self.window_size = arg elif type_atc == 'HigherOrder': if opt in ("-d", "--svd_dimension"): self.svd_dimension = arg
class Matrix:
    # Builds a sparse matrix from a 'row#column#frequency' relations file,
    # reduces it with sparse SVD and writes the reduced matrix back to disk
    # as 'row#column#value' lines.

    def __init__(self, input_file, temp_folder, svd_dimension):
        self.misc = Miscelaneous()
        self.temp_folder = temp_folder
        self.svd_dimension = svd_dimension
        self.dic_column = OrderedDict()
        self.dic_column_index = {}
        self.dic_row = OrderedDict()
        self.dic_row_index = {}
        self.array_row = []
        self.array_col = []
        self.array_data = []
        self.dic_matrix = {}
        string_files_matrix = ''  # NOTE(review): unused local — dead code?
        self.buildMatrixFromFile(input_file)
        self.applySvd()
        self.writeSvd()

    def __del__(self):
        pass

    def buildMatrixFromFile(self, input_file):
        """Reads 'row#column#frequency' lines, assigns a dense index to every
        distinct row/column label, collects the raw frequencies and writes the
        index maps plus a data dump to Matrix_{row,column,data}.txt."""
        index_row = 0
        index_column = 0
        line_row = ''
        line_column = ''
        line_data = ''
        file_input = self.misc.openFile(input_file, 'r')
        file_row = self.misc.openFile(self.temp_folder + 'Matrix_row.txt', 'w')
        file_column = self.misc.openFile(
            self.temp_folder + 'Matrix_column.txt', 'w')
        file_data = self.misc.openFile(self.temp_folder + 'Matrix_data.txt', 'w')
        for line in file_input:
            line = re.sub('\n', '', line)
            row, column, frequency = line.split('#')
            if self.dic_row.has_key(row):
                index_m = self.dic_row[row]
            else:
                self.dic_row[row] = index_row
                self.dic_row_index[index_row] = row
                index_m = index_row
                index_row = index_row + 1
            if self.dic_column.has_key(column):
                index_n = self.dic_column[column]
            else:
                self.dic_column[column] = index_column
                self.dic_column_index[index_column] = column
                index_n = index_column
                index_column = index_column + 1
            # NOTE(review): row labels feed array_col and column labels feed
            # array_row; this appears compensated by the .T in writeSvd —
            # verify before changing.
            self.array_row.append(int(index_n))
            self.array_col.append(int(index_m))
            # NOTE(review): log_frequency is computed but never used — the raw
            # frequency is appended instead.  Confirm which weighting is meant.
            log_frequency = math.log(float(frequency) + 1, e)
            self.array_data.append(float(frequency))
            line_row += str(index_n) + ' '
            line_column += str(index_m) + ' '
            line_data += str(frequency) + ' '
        file_input.close()
        for row in self.dic_row:
            file_row.write(str(self.dic_row[row]) + ' : ' + row + '\n')
        for column in self.dic_column:
            file_column.write(
                str(self.dic_column[column]) + ' : ' + column + '\n')
        file_data.write('<row>\n')
        file_data.write(line_row[0:-1] + '\n')
        file_data.write('<column>\n')
        file_data.write(line_column[0:-1] + '\n')
        file_data.write('<data>\n')
        file_data.write(line_data[0:-1] + '\n')
        file_row.close()
        file_column.close()
        file_data.close()

    def applySvd(self):
        """Runs sparse SVD (sparsesvd) over the collected counts and rebuilds
        the rank-reduced dense matrix as self.svd_matrix."""
        len_row = max(self.array_row) + 1
        len_col = max(self.array_col) + 1
        print 'Applying SVD with ROW: ' + str(len_row) + ' and COL: ' + str(
            len_col)
        sparse_matrix = scipy.sparse.csc_matrix(
            (self.array_data, (self.array_row, self.array_col)),
            shape=(len_row, len_col))
        print 'sparsed matrix'
        Ut, Sigma, Vt = sparsesvd(sparse_matrix, self.svd_dimension)
        print 'U Sigma Vt done!'
        # Release the sparse matrix before building the dense reconstruction.
        sparse_matrix = array(0)
        print 'Mounting Matrix SVD'
        self.svd_matrix = numpy.dot(Ut.T, numpy.dot(numpy.diag(Sigma), Vt))
        print 'Done!'
        print Ut.T
        print '\n'
        print Sigma
        print '\n'
        print Vt
        print '\n'
        print self.svd_matrix.T
        print '\n'
        Ut = None
        Sigma = None
        Vt = None
        #Ut = array(0)
        #Sigma = array(0)
        #Vt = array(0)

    def writeSvd(self):
        """Writes the reduced matrix (transposed back to row-label order) as
        'row#column#value' lines to MatrixDataSvd.txt."""
        file_matrix_svd = self.misc.openFile(
            self.temp_folder + '/MatrixDataSvd.txt', 'w')
        row_number = 0
        for row_data in self.svd_matrix.T:
            column_number = 0
            for value in row_data:
                file_matrix_svd.write(self.dic_row_index[row_number] + '#' +
                                      self.dic_column_index[column_number] +
                                      '#' + str(value) + '\n')
                column_number += 1
            row_number += 1
        file_matrix_svd.close()
class Measures:
    """Distributional similarity between seed words and corpus nouns.

    Reads a 'modifier#noun#frequency' context file and, for every
    (seed, noun) pair sharing at least one context, stores eleven
    similarity scores (baseline overlap, binary/min Dice, Jaccard,
    binary/weighted cosine, city-block, Euclidean, Jensen-Shannon,
    Lin, Jaccard-max) in per-measure dictionaries keyed as
    dic[seed][related].
    """

    def __init__(self, ctx_freq_file, seedfile):
        """Load the seeds from ``seedfile`` and compute every measure
        from the context-frequency file ``ctx_freq_file``."""
        self.misc = Miscelaneous()
        seeds_file = Seeds(seedfile)
        self.list_seeds = seeds_file.getSeeds()
        # One dict per measure: dic[seed][related_noun] = score.
        self.dic_baseline = defaultdict(dict)
        self.dic_diceBin = defaultdict(dict)
        self.dic_diceMin = defaultdict(dict)
        self.dic_jaccard = defaultdict(dict)
        self.dic_cosineBin = defaultdict(dict)
        self.dic_cosine = defaultdict(dict)
        self.dic_city = defaultdict(dict)
        self.dic_euclidean = defaultdict(dict)
        self.dic_js = defaultdict(dict)
        self.dic_lin = defaultdict(dict)
        self.dic_jaccardMax = defaultdict(dict)
        self.dic_ctx = defaultdict(dict)  # dic_ctx[noun][modifier] = frequency
        self.dic_sum_freq_noun = {}       # noun -> summed context frequency
        self.dic_qty_noun = {}            # noun -> number of context lines seen
        self.__buildHashs__(ctx_freq_file, seedfile)

    def __del__(self):
        pass

    def __buildHashs__(self, ctx_freq_file, seedfile):
        """Read 'modifier#noun#frequency' lines and fill the per-measure
        dictionaries for every seed/noun pair.

        NOTE(review): the ``seedfile`` parameter is never used here —
        the seeds were already loaded in __init__.
        """
        list_nouns = []
        ctxfreqfile = self.misc.openFile(ctx_freq_file, 'r')
        for line in ctxfreqfile:
            modifier, noun, freq = line.split('#')
            # One entry per input line, so nouns may repeat in list_nouns.
            list_nouns.append(noun)
            freq = freq.replace('\n', '')
            self.dic_ctx[noun][modifier] = float(freq)
            if self.dic_sum_freq_noun.has_key(noun):
                self.dic_sum_freq_noun[noun] += float(freq)
            else:
                self.dic_sum_freq_noun[noun] = float(freq)
            if self.dic_qty_noun.has_key(noun):
                self.dic_qty_noun[noun] += 1
            else:
                self.dic_qty_noun[noun] = 1
        for seed in self.list_seeds:
            print 'Seed: '+seed
            i = 0
            qty_related = len(list_nouns)
            for related in list_nouns:
                if seed != related:
                    i += 1
                    # Accumulators for the current (seed, related) pair.
                    baseline = 0
                    diceBin = 0
                    diceMin = 0
                    jaccard = 0
                    cosineBin = 0
                    cosine = 0
                    city = 0
                    euclidean = 0
                    js = 0
                    lin = 0
                    jaccardMax = 0
                    sun_min = 0
                    sun_max = 0
                    sum_intersection = 0
                    intersection = 0
                    square_freq_seed = 0
                    square_freq_related = 0
                    d_seed = 0
                    d_related = 0
                    for modifier in self.dic_ctx[seed]:
                        freq_seed_modifier = 0
                        freq_related_modifier = 0
                        if self.dic_ctx[related].has_key(modifier):
                            # Context shared by both the seed and the noun.
                            baseline += 1
                            freq_seed_modifier = self.dic_ctx[seed][modifier]
                            freq_related_modifier = self.dic_ctx[related][modifier]
                            sun_min += min(freq_seed_modifier, freq_related_modifier)
                            sun_max += max(freq_seed_modifier, freq_related_modifier)
                            city += abs(freq_seed_modifier - freq_related_modifier)
                            euclidean += (freq_seed_modifier - freq_related_modifier)**2
                            relative_freq_seed = float(freq_seed_modifier) / self.dic_sum_freq_noun[seed]
                            if self.dic_sum_freq_noun[related] == 0:
                                print bcolors.FAIL+'ERROR: Frequency of '+related+' is zero.'+bcolors.ENDC
                            else:
                                # Jensen-Shannon divergence terms against the
                                # averaged relative-frequency distribution.
                                relative_freq_related = float(freq_related_modifier) / self.dic_sum_freq_noun[related]
                                relative_freq_seed_related = float(relative_freq_seed + relative_freq_related) / 2
                                if relative_freq_seed > 0.0 and relative_freq_seed_related > 0.0:
                                    d_seed += relative_freq_seed * math.log(float(relative_freq_seed / relative_freq_seed_related))
                                if relative_freq_related > 0.0 and relative_freq_seed_related > 0.0:
                                    d_related += relative_freq_related * math.log(float(relative_freq_related / relative_freq_seed_related))
                            intersection += freq_seed_modifier * freq_related_modifier
                            sum_intersection += freq_seed_modifier + freq_related_modifier
                            square_freq_seed += freq_seed_modifier**2
                            square_freq_related += freq_related_modifier**2
                        elif self.dic_ctx[seed].has_key(modifier):
                            # Context seen only with the seed.
                            # NOTE(review): since ``modifier`` iterates
                            # dic_ctx[seed], this elif condition is always
                            # true when the shared branch fails.
                            freq_seed_modifier = self.dic_ctx[seed][modifier]
                            sun_max += freq_seed_modifier
                            city += freq_seed_modifier
                            euclidean += freq_seed_modifier**2
                            square_freq_seed += freq_seed_modifier**2
                    for modifier in self.dic_ctx[related]:
                        if not self.dic_ctx[seed].has_key(modifier):
                            # Context seen only with the related noun.
                            freq_related_modifier = self.dic_ctx[related][modifier]
                            sun_max += freq_related_modifier
                            city += freq_related_modifier
                            euclidean += freq_related_modifier**2
                            square_freq_related += freq_related_modifier**2
                    if sun_max > 0:
                        jaccardMax = float(sun_min) / sun_max
                    if self.dic_qty_noun.has_key(seed) and self.dic_qty_noun.has_key(related):
                        # Binary (presence-based) measures over context counts.
                        diceBin = float(2*baseline) / (self.dic_qty_noun[seed] + self.dic_qty_noun[related])
                        cosineBin = baseline / math.sqrt(float(self.dic_qty_noun[seed] * self.dic_qty_noun[related]))
                        jaccard = float(baseline) / (self.dic_qty_noun[seed] + self.dic_qty_noun[related] - baseline)
                    if self.dic_sum_freq_noun.has_key(seed) and self.dic_sum_freq_noun.has_key(related):
                        # Frequency-weighted measures.
                        diceMin = float((2*sun_min)) / (self.dic_sum_freq_noun[seed] + self.dic_sum_freq_noun[related])
                        lin = float(sum_intersection) / (self.dic_sum_freq_noun[seed] + self.dic_sum_freq_noun[related])
                    if square_freq_seed > 0 and square_freq_related > 0:
                        cosine = intersection / (math.sqrt(float(square_freq_seed * square_freq_related)))
                    euclidean = math.sqrt(float(euclidean))
                    js = float(d_seed + d_related) / 2
                    if baseline >= 1:
                        # Keep only pairs with at least one shared context.
                        self.dic_baseline[seed][related] = baseline
                        self.dic_diceBin[seed][related] = diceBin
                        self.dic_diceMin[seed][related] = diceMin
                        self.dic_jaccard[seed][related] = jaccard
                        self.dic_cosineBin[seed][related] = cosineBin
                        self.dic_cosine[seed][related] = cosine
                        self.dic_city[seed][related] = city
                        self.dic_euclidean[seed][related] = euclidean
                        self.dic_js[seed][related] = js
                        self.dic_lin[seed][related] = lin
                        self.dic_jaccardMax[seed][related] = jaccardMax
                self.misc.progress_bar(i, qty_related, 100)
            print ''

    """ Methods to get the entire dictionaries """
    def getDic(self, sim_measure):
        """Return every related term for every seed, sorted by score."""
        dic_measure = self.__verifyMeasure__(sim_measure)
        return self.__sortTopNFromAllDic__(dic_measure, 0)

    """ Methods to get the DICs to a specific seed """
    def getDicToSeed(self, sim_measure, seed):
        """Return all related terms of ``seed``, sorted by score."""
        dic_measure = self.__verifyMeasure__(sim_measure)
        return self.__sortTopNFromDic__(dic_measure, seed, 0)

    """ Methods to get the TOP N to a specific seed """
    def getTopNToSeed(self, sim_measure, seed, n):
        """Return the ``n`` highest-scoring related terms of ``seed``."""
        dic_measure = self.__verifyMeasure__(sim_measure)
        return self.__sortTopNFromDic__(dic_measure, seed, n)

    """ Methods to get the TOP N to ALL seeds """
    def getTopNToAllSeeds(self, sim_measure, n):
        """Return the ``n`` highest-scoring related terms per seed."""
        dic_measure = self.__verifyMeasure__(sim_measure)
        return self.__sortTopNFromAllDic__(dic_measure, n)

    """ Methods to print the TOP N to a specific seed """
    def printTopNToSeed(self, sim_measure, seed, n):
        """Print the top ``n`` related terms of ``seed``."""
        dic_terms = self.getTopNToSeed(sim_measure, seed, n)
        self.__printDic__(dic_terms)

    """ Methods to print the TOP N to ALL seeds """
    def printTopNToAllSeeds(self, sim_measure, n):
        """Print the top ``n`` related terms for every seed."""
        dic_terms = self.getTopNToAllSeeds(sim_measure, n)
        self.__printDic__(dic_terms)

    """ Internal methods """
    def __verifyMeasure__(self, sim_measure):
        """Map a measure name to its score dictionary.

        NOTE(review): an unrecognized name leaves ``dic_measure``
        unbound and raises UnboundLocalError on return.
        """
        if sim_measure == 'baseline':
            dic_measure = self.dic_baseline
        elif sim_measure == 'dicebin':
            dic_measure = self.dic_diceBin
        elif sim_measure == 'dicemin':
            dic_measure = self.dic_diceMin
        elif sim_measure == 'jaccard':
            dic_measure = self.dic_jaccard
        elif sim_measure == 'cosinebin':
            dic_measure = self.dic_cosineBin
        elif sim_measure == 'cosine':
            dic_measure = self.dic_cosine
        elif sim_measure == 'city':
            dic_measure = self.dic_city
        elif sim_measure == 'euclidean':
            dic_measure = self.dic_euclidean
        elif sim_measure == 'js':
            dic_measure = self.dic_js
        elif sim_measure == 'lin':
            dic_measure = self.dic_lin
        elif sim_measure == 'jaccardmax':
            dic_measure = self.dic_jaccardMax
        return dic_measure

    def __sortTopNFromDic__(self, dic, seed, n):
        """Return OrderedDict {seed: {'terms': [{term: score_str}, ...]}}
        with the top ``n`` related terms of ``seed`` (n == 0 means all)."""
        dic_terms = OrderedDict()
        if self.__existKeyInDic__(seed, dic):
            dic_related = {}
            dic_terms[seed] = {'terms': []}
            for related_term in dic[seed]:
                dic_related[related_term] = dic[seed][related_term]
            if n == 0:
                n = None  # slice [0:None] keeps everything
            dic_ordered = sorted(dic_related.items(), key=itemgetter(1), reverse=True)[0:n]
            for list_ordered in dic_ordered:
                dic_terms[seed]['terms'].append({list_ordered[0]:str(list_ordered[1])})
        return dic_terms

    def __sortTopNFromAllDic__(self, dic, n):
        """Like __sortTopNFromDic__ but covering every seed in
        self.list_seeds (n == 0 means all related terms)."""
        dic_terms = OrderedDict()
        for seed in self.list_seeds:
            if self.__existKeyInDic__(seed, dic):
                dic_terms[seed] = {'terms': []}
                dic_related = {}
                for related_term in dic[seed]:
                    dic_related[related_term] = dic[seed][related_term]
                if n == 0:
                    n = None  # slice [0:None] keeps everything
                dic_ordered = sorted(dic_related.items(), key=itemgetter(1), reverse=True)[0:n]
                for list_ordered in dic_ordered:
                    dic_terms[seed]['terms'].append({list_ordered[0]:str(list_ordered[1])})
        return dic_terms

    def __existKeyInDic__(self, key, dic):
        """Truthiness check for ``key`` in ``dic``.

        NOTE(review): returns the dict itself (truthy) when the key is
        present and False otherwise — callers only test truthiness.
        """
        if dic.has_key(key):
            return dic
        else:
            print bcolors.WARNING+'WARNING: System cannot found the term "'+key+'" in corpus'+bcolors.ENDC
            print ''
            return False

    def __printDic__(self, dic_terms):
        """Pretty-print the {seed: {'terms': [{term: score_str}]}}
        structure produced by the sort helpers."""
        for seed in dic_terms:
            print 'Seed: '+seed
            for index_related_term in dic_terms[seed]['terms']:
                similarity = index_related_term[index_related_term.keys()[0]]
                term = index_related_term.keys()[0]
                print 'Related term: '+term+'\nSimilarity : '+similarity
            print ''