def getWordVec(self, model, wordStr):
    '''
    look up one word in the w2v model and hand back what the model
    stores for it

    model: object indexed with the word (model[wordStr])
    wordStr: the word used as the lookup key, passed through unchanged
    return: whatever model[wordStr] yields (described by the original
        author as the word's vector)
    '''
    # NOTE(review): presumably resets the interpreter's default encoding;
    # the helper is defined outside this block - confirm
    ExtraSegOpt().reLoadEncoding()

    wordVec = model[wordStr]
    return wordVec
def copeMSimilarVecsbtwWordLists(self, model, wordStrList1, wordStrList2, topN_rev=20, topN=20):
    '''
    express the relationship from a source word list to a target word
    list as a ranked list of similar words

    step 1: query with no positive words and the target words as the
            negative words, keeping topN_rev results (the rev-wordList)
    step 2: query again with the source words as positive words and the
            rev-wordList as negative words, keeping topN results

    wordStrList1: source words, each decoded from utf-8 before use
    wordStrList2: target words, each decoded from utf-8 before use
    topN_rev: number of results kept for the intermediate rev-wordList
    topN: number of results kept in the final answer
    return: the result of the second queryMSimilarVecswithPosNeg call
    '''
    ExtraSegOpt().reLoadEncoding()

    sourceWords = [word.decode('utf-8') for word in wordStrList1]
    targetWords = [word.decode('utf-8') for word in wordStrList2]

    # first pass: words pushed away from the target list
    reversePairs = self.queryMSimilarVecswithPosNeg(
        model, [], targetWords, topN_rev)
    # only the word (index 0) of every returned pair is carried forward
    reverseWords = [pair[0].decode('utf-8') for pair in reversePairs]

    # second pass: source words as positive, reversed words as negative
    return self.queryMSimilarVecswithPosNeg(
        model, sourceWords, reverseWords, topN)
def culSimBtwWordVecs(self, model, wordStr1, wordStr2):
    '''
    basic similarity query between two words

    wordStr1, wordStr2: the two words, each decoded from utf-8 before
        being passed to the model
    return: the value produced by model.similarity for the decoded
        pair (described by the original author as a double-prob)
    '''
    ExtraSegOpt().reLoadEncoding()

    firstWord = wordStr1.decode('utf-8')
    secondWord = wordStr2.decode('utf-8')
    return model.similarity(firstWord, secondWord)
def queryMostSimilarWordVec(self, model, wordStr, topN=20):
    '''
    basic most-similar-words query for a single word

    wordStr: the query word, decoded from utf-8 before the lookup
    topN: forwarded to model.most_similar as topn
    return: the result of model.most_similar (described by the original
        author as a 2-dim list, [0] is the word and [1] is the
        double-prob)
    '''
    ExtraSegOpt().reLoadEncoding()

    queryWord = wordStr.decode('utf-8')
    return model.most_similar(queryWord, topn=topN)
def listAllFilePathInDirectory(dirPath):
    '''
    list the path of every entry found directly inside a directory

    dirPath: directory handed to os.listdir; a trailing path separator
        is optional
    return: list holding dirPath joined with each entry name, in the
        order os.listdir reports them
    '''
    ExtraSegOpt().reLoadEncoding()

    # os.path.join inserts the separator only when dirPath lacks one, so a
    # dirPath that already ends with a separator still gives dirPath + name
    return [os.path.join(dirPath, entryName) for entryName in os.listdir(dirPath)]
def queryMSimilarVecswithPosNeg(self, model, posWordStrList, negWordStrList, topN=20):
    '''
    basic most-similar-words query driven by positive and negative
    word lists

    posWordStrList: words passed as positive, each decoded from utf-8
    negWordStrList: words passed as negative, each decoded from utf-8
    topN: forwarded to model.most_similar as topn
    return: the result of model.most_similar (described by the original
        author as a 2-dim list, [0] is the word and [1] is the
        double-prob)
    '''
    ExtraSegOpt().reLoadEncoding()

    positiveWords = [word.decode('utf-8') for word in posWordStrList]
    negativeWords = [word.decode('utf-8') for word in negWordStrList]

    return model.most_similar(
        positive=positiveWords, negative=negativeWords, topn=topN)
def folderFilesNameEntities(corpusDirPath, userDictPath=None, dictRewrite=False):
    '''
    collect entity names from the file names found in a corpus folder
    and optionally write them into a user dictionary file

    every file name is cut at the u'(seg)' marker when it has one; if
    the remaining name holds a non-empty u'(...)' group, a group at the
    very start is dropped together with its parentheses, otherwise
    everything from the u'(' onwards is dropped

    corpusDirPath: folder whose entries are listed with os.listdir
    userDictPath: when not None, the entities are written to this path,
        one per line in the form u'<entity> n'
    dictRewrite: only selects between the 'w' and 'w+' open modes
    return: list of unique entity names in first-seen order
    '''
    ExtraSegOpt().reLoadEncoding()

    entities = []
    seenEntities = set()  # constant-time duplicate check, order kept by the list
    for entryName in os.listdir(corpusDirPath):
        # cut at the marker only when it is present: slicing with the -1
        # that find() returns on a miss would drop the last character
        segIdx = entryName.find(u'(seg)')
        fileName = entryName[:segIdx] if segIdx != -1 else entryName

        # work from the parenthesis positions themselves, so a name that
        # repeats the bracketed text earlier on is still cut at the bracket
        openIdx = fileName.find(u'(')
        closeIdx = fileName.find(u')')
        if openIdx != -1 and closeIdx != -1:
            extra = fileName[openIdx + 1:closeIdx]
            if extra:
                if openIdx == 0:
                    # '(extra)name' -> 'name'
                    fileName = fileName[closeIdx + 1:]
                else:
                    # 'name(extra)...' -> 'name'
                    fileName = fileName[:openIdx]

        if fileName not in seenEntities:
            seenEntities.add(fileName)
            entities.append(fileName)

    # write user's word directory
    if userDictPath is not None:
        # NOTE(review): 'w' and 'w+' both truncate the file, so dictRewrite
        # never switches to appending - confirm the intended behaviour
        mode = 'w+' if dictRewrite else 'w'
        entitiesFwStr = u'\n'.join(entity + u' n' for entity in entities)
        # the with-block closes the file even when write() raises
        with open(userDictPath, mode) as fw:
            fw.write(entitiesFwStr)

    return entities
def updateWord2VecModel(self, corpusFilePath, modelFilePath=None):
    '''
    update a w2v model stored on disk with new corpus data

    (the original author notes that corpusFilePath and safe_model follow
    initTrainWord2VecModel, with safe_model == True as the default; that
    function is not visible from here)

    corpusFilePath: corpus location; localFileOptUnit.checkFileState
        classifies it as u'error', u'file', u'opened' or u'directory'
    modelFilePath: model file to load; self.modelPath is used when None
    return: None in every case; a warning is issued when the corpus
        state is u'error' and nothing is loaded or updated
    '''
    ExtraSegOpt().reLoadEncoding()

    fileType = localFileOptUnit.checkFileState(corpusFilePath)
    if fileType == u'error':
        warnings.warn('load file error!')
        return None

    if modelFilePath is None:
        modelFilePath = self.modelPath
    model = self.loadModelfromFile(modelFilePath)

    # TODO add safe_model == False
    # a membership test is required here: "fileType == u'file' or u'opened'"
    # is always truthy and would leave the directory branch unreachable
    if fileType in (u'file', u'opened'):
        self.updateW2VModelUnit(model, corpusFilePath)
    elif fileType == u'directory':
        corpusFiles = localFileOptUnit.listAllFilePathInDirectory(
            corpusFilePath)
        for corpusFile in corpusFiles:
            self.updateW2VModelUnit(model, corpusFile)