import os
import codecs


class segment:
    def __init__(self):
        LTP_DATA_DIR = 'resources/ltp_data_v3.4.0/'
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        from pyltp import Segmentor
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(cws_model_path, '/path/to/your/lexicon')

    def seg(self, text):
        words = self.segmentor.segment(text)
        return words

    def destroy(self):
        self.segmentor.release()

    def segFile(self, infile, outfile):
        data = codecs.open(infile, 'r')
        out = codecs.open(outfile, 'w')  # , 'utf-8'
        for line in data:
            fields = line.strip().split('\t')
            out.write(fields[0] + '\t' + '\t'.join(
                [' '.join(self.seg(fields[i])) for i in range(1, len(fields))]) + '\n')
        data.close()
        out.close()
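# Usage sketch (added for illustration, not from the original source): assumes
# pyltp is installed, the LTP 3.4.0 models live under resources/ltp_data_v3.4.0/,
# and a real lexicon file replaces the '/path/to/your/lexicon' placeholder above.
s = segment()
print(' '.join(s.seg('我爱北京天安门')))
s.destroy()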
def namedEntityRecognize(sentence):
    '''
    Perform named entity recognition with the pyltp module.
    Returns: 1) a list of (named entity, tag) tuples; 2) a list of entity tags.
    '''
    namedEntityTagTupleList = []

    segmentor = Segmentor()
    # segmentor.load(inout.getLTPPath(index.CWS))
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                inout.getResourcePath('userDic.txt'))
    words = segmentor.segment(sentence)
    segmentor.release()

    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = postagger.postag(words)
    postagger.release()

    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = recognizer.recognize(words, postags)
    recognizer.release()

    # Wrap the results as (word, tag) tuples
    for word, netag in zip(words, netags):
        namedEntityTagTupleList.append((word, netag))
    neTagList = '\t'.join(netags).split('\t')
    return namedEntityTagTupleList, neTagList
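# Usage sketch (hypothetical): assumes the project's `inout` and `index`
# helpers resolve to valid LTP model paths and that userDic.txt exists.
tuples, tags = namedEntityRecognize('李克强访问华中科技大学')
for word, netag in tuples:
    print(word, netag)  # LTP NE tags look like O, S-Nh, B-Ni, ...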
class LTP:
    def __init__(self):
        self.segmentor = Segmentor()  # word segmenter
        self.segmentor.load_with_lexicon(
            Config.SEGMENTOR_PATH, Config.PERSONAL_SEGMENTOR_PATH)  # load model with user lexicon
        self.postagger = Postagger()  # part-of-speech tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)
        self.parser.load(Config.PARSER_PATH)  # load model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """Release resources."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()  # was missing in the original release list
        self.parser.release()
        self.labeller.release()
def run():
    # Word segmentation + word selection
    cont = open('key/pinglun_filter_all1.txt', 'r', encoding='utf-8')
    segmentor = Segmentor()  # initialize instance
    # segmentor.load('cws.model')  # load model without a user dictionary
    segmentor.load_with_lexicon('cws.model', 'userdict.txt')  # load model with user dictionary
    postagger = Postagger()  # initialize instance
    postagger.load('pos.model')  # load model
    nwordall = []
    for sentence in cont:
        nword = ['']
        words = segmentor.segment(sentence)  # segmentation
        # default output:
        # print(' '.join(words))
        postags = postagger.postag(words)  # POS tagging
        for word, tag in zip(words, postags):
            # choose which POS tags to keep:
            # print(word + '/' + tag)
            # keep only adverbs:
            # if tag == 'd':
            # filter out single-character words:
            # if (tag == 'n' or tag == 'd' or tag == 'a') and len(word) > 1:
            # use word2vec similarity to find adjectives close to nouns:
            # if (tag == 'a' or tag == 'n') and len(word) > 1:
            if tag == 'n' and len(word) > 1:
                # print(word + tag)
                nword.append(word)
        nwordall.append(nword)
    # size is the word-vector dimensionality, window is the context window,
    # min_count drops words below the frequency threshold, workers is the
    # thread count; a high dimensionality can cause problems
    model = models.word2vec.Word2Vec(nwordall, size=10, window=5, min_count=100, workers=80)
    print('#############################################')
    sim = model.most_similar(positive=[u'餐饮'])
    for s in sim:
        print("word:%s,similar:%s " % (s[0], s[1]))
def feature_about():
    # Get the feature list
    feature_dict = NewsUtil.get_feature()
    # For each feature occurrence in the news, collect the feature word and the
    # five words that follow it
    logger.info("In Prepare Raw News...")
    raw_news_data = CommonUtil.read_excel(RAW_NEWS_DEMO_PATH)
    raw_news_table = raw_news_data.sheet_by_index(0)
    raw_news_rows = raw_news_table.nrows
    segmentor = Segmentor()  # initialize instance
    segmentor.load_with_lexicon(cws_model_path, CFETSFX_LEXICON_PATH)  # load model; the second argument is the external lexicon path
    feature_about_list = list()
    for rowN in range(0, raw_news_rows):
        news_content = raw_news_table.cell_value(rowN, 2)
        sentences = SentenceSplitter.split(news_content)
        for sentence in sentences:
            print(sentence)
            # segmentation
            words = segmentor.segment(sentence)
            print(list(words))
            for word_index in range(0, len(words)):
                word = words[word_index]
                for feature_word in feature_dict.values():
                    if feature_word in word:
                        about_list = list()
                        count = 0
                        while word_index < len(words) and count < 6:
                            about_list.append(words[word_index])
                            count += 1
                            word_index += 1
                        feature_about_list.append(about_list)
                        print(about_list)
                        break
    segmentor.release()
    CommonUtil.write_csv(FEATURE_ABOUT_PATH, feature_about_list)
def init_pyltp(model_dir, dict_file=None):
    '''
    Initialize the pyltp modules.
    :param model_dir: path to the model directory
    :param dict_file: external lexicon for segmentation
    :return: segmentor, postagger, parser, ner
    '''
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()
    ner = NamedEntityRecognizer()
    cws_model = os.path.join(model_dir, 'cws.model')
    pos_model = os.path.join(model_dir, 'pos.model')
    parser_model = os.path.join(model_dir, 'parser.model')
    ner_model = os.path.join(model_dir, 'ner.model')
    if dict_file:
        segmentor.load_with_lexicon(cws_model, dict_file)
    else:
        segmentor.load(cws_model)
    postagger.load(pos_model)
    ner.load(ner_model)
    parser.load(parser_model)
    return segmentor, postagger, parser, ner
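# Usage sketch (hypothetical model directory): run the full pipeline returned
# by init_pyltp, then release every model.
segmentor, postagger, parser, ner = init_pyltp('ltp_data_v3.4.0', dict_file=None)
words = segmentor.segment('他叫汤姆去拿外衣。')
postags = postagger.postag(words)
netags = ner.recognize(words, postags)
arcs = parser.parse(words, postags)
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))
for model in (segmentor, postagger, parser, ner):
    model.release()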
def get_tfidf_feature():
    '''Compute tf-idf features.

    First segment each sample, replacing every company name in the
    segmentation result with the short company name from the previous step.

    Returns:
        The tf-idf matrix for the whole sample set.
    '''
    segmentor = Segmentor()  # initialize instance
    segmentor.load_with_lexicon('e:/ltp_data_v3.4.0/cws.model', '../data/user_dict.txt')  # load model
    text = []
    for i, row in sample_data.iterrows():
        words = []
        sentence = row['sentence']
        start = 0
        end = 0
        for entity in row['ner']:
            end = entity[0]
            words.extend(segmentor.segment(sentence[start:end]))
            words.append(entity[3])
            start = entity[1] - 1
        if end < len(sentence):
            words.extend(segmentor.segment(sentence[start:len(sentence)]))
        text.append(' '.join(words))
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(text)
    transformer = TfidfTransformer(smooth_idf=False)
    tfidf_feature = transformer.fit_transform(X.toarray())
    segmentor.release()
    return tfidf_feature
def load_all_model():
    """Return instances for segmentation, POS tagging, NER, dependency parsing, and word vectors."""
    LTP_DATA_DIR = 'E:/MYGIT/Project/ltp_data'  # path to the LTP model directory

    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model, named `cws.model`
    segmentor = Segmentor()  # initialize instance
    segmentor.load_with_lexicon(cws_model_path, './temp_file/cut_external_dict/cut_external_dict')  # load model

    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, named `pos.model`
    postagger = Postagger()  # initialize instance
    postagger.load_with_lexicon(pos_model_path, './temp_file/pos_external_dict/pos_external_dict')  # load model

    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # NER model, named `ner.model`
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load(ner_model_path)  # load model

    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, named `parser.model`
    parser = Parser()  # initialize instance
    parser.load(par_model_path)  # load model

    fname = r"E:/MYGIT/model/wiki_stopwords/wiki_word2vec.kv"
    # model_wv.save(fname)
    model_wv = KeyedVectors.load(fname, mmap='r')
    return [segmentor, postagger, recognizer, parser, model_wv]
class ModelLoader:
    __instance = None

    def __new__(cls):
        if cls.__instance is None:
            cls.__instance = super(ModelLoader, cls).__new__(cls)
            cls.__instance.__initialized = False
        return cls.__instance

    def __init__(self):
        if self.__initialized:
            return
        self.__initialized = True
        LTP_DIR = "./ltp_data"
        # Customized segmentation; POS tags are adjusted in post-processing
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(LTP_DIR, "cws.model"),
            os.path.join(LTP_DIR, 'customized.txt'))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        self.sentenceSplitter = SentenceSplitter()
class LtpTree(DepTree):
    def __init__(self, dict_path=None):
        super(LtpTree, self).__init__()  # the original passed DepTree here, skipping its own parent's __init__
        print("Loading LTP models... ...")
        self.segmentor = Segmentor()
        if dict_path is None:
            self.segmentor.load(os.path.join(MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), dict_path)
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, "parser.model"))
        print("Models loaded.")

    def parse(self, sentence):
        self.words = self.segmentor.segment(sentence)
        self.postags = self.postagger.postag(self.words)
        self.arcs = self.parser.parse(self.words, self.postags)
        for i in range(len(self.words)):
            if self.arcs[i].head == 0:
                self.arcs[i].relation = "ROOT"

    def release_model(self):
        # Release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
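# Usage sketch (hypothetical): assumes MODELDIR points at the LTP models and
# that DepTree's __init__ takes no arguments.
tree = LtpTree()
tree.parse('他叫汤姆去拿外衣。')
for word, arc in zip(tree.words, tree.arcs):
    print(word, arc.head, arc.relation)
tree.release_model()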
def words_split():
    """
    Segment the sentences into words.
    :return:
    """
    segmentor = Segmentor()
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    segmentor.load_with_lexicon(cws_model_path, '../data/all_word_dict.txt')
    for sentence in sentences:
        words = segmentor.segment(sentence)
        postags = postaggers(words)
        for word, postag in zip(words, postags):
            if postag == 'v':
                relation_words.append(word)
                # print(word)
        all_words.append(words)
    relation_words_file = open('relation_words.txt', 'w+', encoding='utf8')
    for word in relation_words:
        relation_words_file.write(word + '\n')
    relation_words_file.close()
    # Write all words from the current scan to file
    all_words_file = open('all_words.txt', 'w+', encoding='utf8')
    for words in all_words:
        temp_words = '\t'.join(words)
        all_words_file.write(temp_words + '\n')
    all_words_file.close()
    segmentor.release()
def cut_words():
    # Word segmentation + dropping empty lines
    # POS tag set: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    cont = open('resource_new.txt', 'r', encoding='utf-8')
    f = open('key/cut_resouce.txt', 'w', encoding='utf-8')
    segmentor = Segmentor()  # initialize instance
    # segmentor.load('cws.model')  # load model without a user dictionary
    segmentor.load_with_lexicon('module/cws.model', 'userdict.txt')  # load model with user dictionary
    postagger = Postagger()  # initialize instance
    postagger.load('module/pos.model')  # load model
    for sentence in cont:
        if sentence.strip() != '':
            words = segmentor.segment(sentence)  # segmentation
            pos_tags = postagger.postag(words)  # POS tagging
            for word, tag in zip(words, pos_tags):
                if tag != 'wp':
                    f.write(word)
                else:
                    f.write('\n')
            f.write('\n')
        else:
            continue
    f.close()
    segmentor.release()
    postagger.release()
class LTP_word():
    """docstring for parser_word
    deal: process text and return five values: word list, POS tags,
    dependency arcs, semantic roles, and named entities
    release: free the loaded models"""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # segmentation instance
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'),
                                         path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagging instance
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load model
        self.recognizer = NamedEntityRecognizer()  # NER instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parsing instance
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load model
        self.labeller = SementicRoleLabeller()  # semantic role labelling instance
        self.labeller.load(path.join(self.model_path, 'srl'))

    def deal(self, text):
        # Extract everything we need from the text
        words = self.segmentor.segment(text)  # segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labelling
        return words, postags, arcs, roles, netags

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
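# Usage sketch (hypothetical): model_path must contain cws.model, pos.model,
# ner.model, parser.model, the srl model directory, and dictionary_kfc.txt.
ltp = LTP_word('ltp_data_v3.4.0')
words, postags, arcs, roles, netags = ltp.deal('国务院总理李克强调研上海外高桥。')
print(list(words))
print(list(postags))
ltp.release()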
def read_and_seg_pos(file_dir):
    segmentor = Segmentor()
    # postagger = Postagger()
    segmentor.load_with_lexicon("f:\\NLPJP\\xlbz\\LTP\\ltp_data_v3.4.0\\cws.model", "ceshi.txt")
    # postagger.load_with_lexicon("f:/NLPJP/xlbz/LTP/ltp_data_v3.4.0/pos.model", "ceshi.txt")
    file_read = open(file_dir, "r")
    texts = file_read.readlines()
    file_write_seg = open(file_dir + "_seg", "w")
    # file_write_pos = open(file_dir + "_pos", "w")
    for text in texts:
        words = segmentor.segment(text)
        file_write_seg.write(" ".join(words) + "\n")
        # postags = postagger.postag(words)
        # words_and_pos.append('$', '$')
        # for word, pos in words_and_pos:
        #     if word != '$':
        #         file_write_pos.write(word + " " + pos + " ")
        #     else:
        #         file_write_pos.write('\n')
    file_read.close()
    file_write_seg.close()
    # file_write_pos.close()
    segmentor.release()
    # postagger.release()  # the postagger is commented out above, so it cannot be released here

# The original module-level call passed no argument, which raises a TypeError;
# supply a real file path when invoking:
# read_and_seg_pos('some_file.txt')
class Word():
    def __init__(self, dictDir):
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(f'{LTP_DATA_DIR}/cws.model', f'{dictDir}/dict.txt')

    def split(self, myStr):
        return list(self.segmentor.segment(myStr))
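# Usage sketch (hypothetical): LTP_DATA_DIR must already be defined, and the
# directory passed in must contain a dict.txt lexicon.
w = Word('data')
print(w.split('我爱北京天安门'))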
def segment(self, texts, use_tag_filter=True):
    # Initialize the instances
    # global word_list, netags, postags, relation, heads
    words = []
    pos = []
    ner = []
    rel = []
    hea = []
    segmentor = Segmentor()
    segmentor.load_with_lexicon(self.cws_model_path, './dict/user_recg.dic')  # load model; the argument is the custom lexicon path (self.dic_list)
    postagger = Postagger()
    postagger.load(self.pos_model_path)
    recognizer = NamedEntityRecognizer()
    recognizer.load(self.ner_model_path)
    parser = Parser()
    parser.load(self.pas_model_path)
    for text in texts:
        text = text.lower()
        word_list = segmentor.segment(text)
        word_list = [word for word in word_list if len(word) > 1]
        # keep only Chinese and English tokens:
        # word_list = [word for word in word_list if re.match("[\u0041-\u005a\u4e00-\u9fa5]+", word) is not None]
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]  # remove stop words
        # POS tagging
        posttags = postagger.postag(word_list)
        postags = list(posttags)
        # NER
        netags = recognizer.recognize(word_list, postags)
        # dependency parsing
        arcs = parser.parse(word_list, postags)
        rely_id = [arc.head for arc in arcs]  # dependency head ids
        relation = [arc.relation for arc in arcs]  # dependency relations
        heads = ['Root' if id == 0 else word_list[id - 1] for id in rely_id]  # head words
        if use_tag_filter:
            dic = dict(zip(word_list, postags))
            word_list = [x for x in dic.keys() if dic[x] in self.tags_filter]
        words.append(word_list)
        pos.append(postags)
        ner.append(netags)
        rel.append(relation)
        hea.append(heads)
    segmentor.release()
    postagger.release()
    recognizer.release()
    parser.release()
    return words, pos, ner, rel, hea
def ltp_segmentor(LTP_DATA_DIR, sentence):
    # Segmentation model path; the model is named `cws.model`
    cws_model_path = os.path.join(LTP_DATA_DIR, "cws.model")
    segmentor = Segmentor()  # initialize instance
    # segmentor.load(cws_model_path)  # load model without a lexicon
    segmentor.load_with_lexicon(cws_model_path, "ltp_data/dict/school")
    words = list(segmentor.segment(sentence))  # copy the result before releasing the model
    segmentor.release()  # release model
    return words
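# Usage sketch (hypothetical): assumes the model directory and the
# ltp_data/dict/school lexicon referenced above both exist.
print(ltp_segmentor('ltp_data_v3.4.0', '西安交通大学在哪里'))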
def seg_test(filepath, cwspath, dictpath):
    from pyltp import Segmentor
    segmentor = Segmentor()
    segmentor.load_with_lexicon(cwspath, dictpath)
    text = open(filepath).read()
    words = segmentor.segment(text)
    print('\t'.join(words))
    segmentor.release()
def segment(text):
    global segmentor
    if segmentor is None:
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model path; the model is named `cws.model`
        segmentor = Segmentor()  # initialize instance
        segmentor.load_with_lexicon(cws_model_path, 'dict/lexicon.txt')  # load model; the second argument is the external lexicon path
    words = segmentor.segment(text)  # segmentation
    # print(list(words))
    return list(words)
def test_dict():
    # self.segmentor.load(os.path.join(MODELDIR, "cws.model"))
    dictf = resource_path('dict_zh.txt')
    segmentor = Segmentor()  # initialize instance
    segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), dictf)
    # segmentor.load(os.path.join(MODELDIR, "cws.model"))
    words = segmentor.segment('列出派工单')  # fails on: '列出所有的采购订单'
    print('\t'.join(words))
    segmentor.release()
def segmentor(sentence):
    segmentor = Segmentor()  # initialize instance
    segmentor.load_with_lexicon(cws_model_path, r'D:\ltp\ltp_data_v3.4.0\user.dict')  # load model
    words = segmentor.segment(sentence)  # segmentation
    # default output:
    print('\t'.join(words))
    # can also be converted to a list:
    words_list = list(words)
    segmentor.release()  # release model
    return words_list
def cut(str):
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model path; the model is named `cws.model`
    segmentor = Segmentor()  # initialize instance
    # segmentor.load(cws_model_path)  # load model without a lexicon
    segmentor.load_with_lexicon(
        cws_model_path, 'lexicon')  # custom words allowed; the second argument is the custom lexicon path
    words = list(segmentor.segment(str))  # segment, e.g. str = "你好,我是大王"; copy before releasing
    # print(' '.join(words))
    segmentor.release()  # release model
    return words
def cut(sent):
    segmentor = Segmentor()
    model_path = 'D:/app/ltp_data_v3.4.0/cws.model'
    user_dict = 'ds_dict.txt'
    segmentor.load_with_lexicon(model_path, user_dict)
    words = segmentor.segment(sent)
    print(words)
    array_str = "|".join(words)
    print(array_str)
    segmentor.release()  # release the model
    return
def cut_sentence(input_sentence):
    segmentor = Segmentor()
    # segmentor.load_with_lexicon("../../../../../ltp_data/cws.model", "../../../../../ltp_data/fulluserdict")
    segmentor.load_with_lexicon("/home/liu/ltp_data/cws.model", "/home/liu/ltp_data/fulluserdict")
    # Segmentation module: takes a sentence and returns the segmented words;
    # a user dictionary can be added.
    # input_sentence = "王老师的办公室在哪里"
    words = segmentor.segment(input_sentence)
    result = ' '.join(words)
    result = 'BOS ' + result + ' EOS'
    return [result]
class MyPyLtp:
    # Local LTP model path
    LTP_DATA_DIR = '/Users/yuanjin/PycharmProjects/ltp_data_v3.4.0'  # path to the LTP model directory
    # LTP model path on the Linux server:
    # LTP_DATA_DIR = '/home/student/project/project-01/ltp_data'
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, named `pos.model`
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    special_word_path = os.path.join(LTP_DATA_DIR, 'special_word.txt')
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    parser_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')

    # Initialize the models
    def __init__(self):
        # Initialize the instances and load the models
        self.segmentor = Segmentor()  # initialize instance
        self.segmentor.load_with_lexicon(
            self.cws_model_path, self.special_word_path)  # the second argument is the external lexicon path
        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)
        self.parser = Parser()
        self.parser.load(self.parser_model_path)

    # Sentence splitting
    def split_sentence(self, text):
        return SentenceSplitter.split(text)

    # Word segmentation
    def split_word(self, sentence):
        self.words = list(self.segmentor.segment(sentence))
        return self.words

    # POS tagging
    def tagging_word(self):
        self.tagging = self.postagger.postag(self.words)
        return self.tagging

    # Named entity recognition
    def name_recognizer(self):
        self.names = self.recognizer.recognize(self.words, self.tagging)
        return self.names

    # Dependency parsing
    def relation_analysis(self, sentence):
        self.split_word(sentence)
        self.tagging_word()
        self.name_recognizer()
        arcs = self.parser.parse(self.words, self.tagging)
        return [[arc.head, arc.relation] for arc in arcs]
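# Usage sketch (hypothetical): exercises the whole MyPyLtp pipeline,
# one sentence at a time.
ltp = MyPyLtp()
for sent in ltp.split_sentence('他叫汤姆去拿外衣。你好!'):
    print(ltp.relation_analysis(sent))  # e.g. [[2, 'SBV'], [0, 'HED'], ...]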
def get_cut_words(self, text, dict_path=None):
    # Word segmentation
    segmentor = Segmentor()  # initialize instance
    if dict_path is None:
        segmentor.load(self.cws_model_path)  # load model
    else:
        segmentor.load_with_lexicon(self.cws_model_path, dict_path)  # load model; the second argument is the external lexicon path
    words = list(segmentor.segment(text))  # copy the result before releasing the model
    print('\t'.join(words))
    segmentor.release()
    return words
def get_words_by_pyltp(self, sent):
    # Segmentation model path; the model is named 'cws.model'
    cws_model_path = os.path.join(self.ltp_dir_path, "cws.model")
    # dict.txt is the custom lexicon file
    dict_path = os.path.join(self.ltp_dir_path, "dict.txt")
    segmentor = Segmentor()
    segmentor.load_with_lexicon(cws_model_path, dict_path)
    words_list = list(segmentor.segment(sent))  # copy the result before releasing the model
    segmentor.release()
    return words_list
def segmentor(self, sentence):
    segmentor = Segmentor()
    # segmentor.load(cws_model_path)  # load model without a lexicon
    segmentor.load_with_lexicon(cws_model_path, user_dict_path)  # load model; the second argument is the external lexicon path
    words = segmentor.segment(sentence)  # segmentation
    # default output:
    # print('\t'.join(words))
    # convert to a list before releasing the model:
    word_list = list(words)
    segmentor.release()  # release model
    return word_list
def ltp_segment(sent):
    # Load the model files
    cws_model_path = os.path.join(
        'ltp_data_v3.4.0/cws.model')  # segmentation model, named `cws.model`
    lexicon_path = os.path.join(
        'ltp_data_v3.4.0/lexicon.txt')  # lexicon is the custom lexicon file path
    segmentor = Segmentor()
    segmentor.load_with_lexicon(cws_model_path, lexicon_path)
    words = list(segmentor.segment(sent))
    segmentor.release()
    return words
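# Usage sketch (hypothetical): assumes ltp_data_v3.4.0/ with cws.model and
# lexicon.txt sits in the working directory.
print(ltp_segment('元芳你怎么看'))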
def words_split():
    """
    Segment the sentences into words.
    :return:
    """
    segmentor = Segmentor()
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    segmentor.load_with_lexicon(cws_model_path, 'data/all_word_dict.txt')
    for sequence in sentences:
        words = segmentor.segment(sequence)
        tuple_get(words, sequence)
    segmentor.release()
def seg_initialize(model_path, lexicon_path):
    print("load segment data...")
    segmentor = Segmentor()
    segmentor.load_with_lexicon(model_path, lexicon_path)
    return segmentor
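# Usage sketch (hypothetical paths): initialize once, reuse the segmentor,
# release it when done.
segmentor = seg_initialize('ltp_data_v3.4.0/cws.model', 'lexicon.txt')
print('\t'.join(segmentor.segment('我爱北京天安门')))
segmentor.release()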
def fun_emotion_set(path):
    # Reconstructed header: the original snippet was truncated mid-function;
    # the name and return value follow from the call site below.
    emotion_set = []
    for line in path.readlines():
        emotion_set.append(line.strip().split('\t')[0])
    return emotion_set

def sortByPMI(coPMI):
    sorted_tuple = []
    for item in coPMI:
        items = item.split('\001')
        # print 'item:', items, type(items)
        # print coPMI[item], type(coPMI[item])
        sorted_tuple.append((items[0], items[1], coPMI[item]))
    return sorted(sorted_tuple, key=itemgetter(0, 2)), sorted(sorted_tuple, key=itemgetter(1, 2))

segmentor = Segmentor()
segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), "/data0/dm/dict/dict.txt")

if __name__ == "__main__":
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    path_property = open(path + "/car_entity_property.txt", 'r')
    pro_words = fun_property_set(path_property)
    path_sentiment = open(path + "/car_sentiment_dic.txt", 'r')
    sen_words = fun_emotion_set(path_sentiment)
    path_corpus = path + "/car_pmi_corpus.txt"
    path_out1 = open(path + "/pro_sen_pmi_corpus_sort1.txt", 'w')
    path_out2 = open(path + "/pro_sen_pmi_corpus_sort2.txt", 'w')
    posPmi = getPMI(path_corpus, pro_words, sen_words)
    sorted_tuple1, sorted_tuple2 = sortByPMI(posPmi)