def Begin_to_identify(request):
    """Render the index page, annotating submitted text when there is any.

    When the request carries POST data with a ``user_text`` field, the text
    is segmented with the preloaded THULAC model (``pre_load_thu``) and
    scanned for entities with ``get_NE``.  Two HTML fragments are stored in
    the template context:

    * ``rlt`` -- the text with every recognised entity wrapped in an ``<a>``
      tag (a popover placeholder when ``temporaryok`` accepts the entity's
      label, otherwise a link to the hexun.com stock page built from it);
    * ``seg_word`` -- every segmented word followed by its part-of-speech
      tag.

    Any other request renders ``index.html`` with an empty context.

    Args:
        request: Request object; only ``request.POST`` is read here.

    Returns:
        Whatever ``render(request, "index.html", ctx)`` returns.
    """
    # Function-scope imports keep this block self-contained.
    from html import escape
    from urllib.parse import quote

    ctx = {}
    # A POST that lacks ``user_text`` is treated like a plain page load
    # instead of raising a key error.
    if request.POST and "user_text" in request.POST:
        key = request.POST["user_text"].strip()

        # cut(..., text=False) yields [word, part-of-speech] pairs.
        tag_list = pre_load_thu.cut(key, text=False)

        # Everything interpolated below originates from user input, so it is
        # HTML-escaped before being placed in text or attribute position.
        # NOTE(review): presumably index.html outputs ``rlt`` / ``seg_word``
        # without auto-escaping (they contain markup) -- confirm.
        pieces = []
        for pair in get_NE(key):
            word = escape(str(pair[0]))
            if pair[1] == 0:
                # Label 0 marks a stretch that is not an entity: plain text.
                pieces.append(word)
                continue

            if temporaryok(pair[1]):
                # Entity without a target page: popover only, no navigation.
                label = escape(str(pair[1]))
                pieces.append(
                    "<a href='#' data-original-title='" + label
                    + "(暂无资料)' data-placement='top' data-trigger='hover'"
                    + " data-content='" + label
                    + "' class='popovers'>" + word + "</a>"
                )
                continue

            # The label is used as part of the URL path, so percent-encode it.
            pieces.append(
                "<a href='http://stockdata.stock.hexun.com/gszl/s"
                + quote(str(pair[1]), safe="")
                + ".shtml'>" + word + "</a>"
            )
        ctx['rlt'] = "".join(pieces)

        ctx['seg_word'] = "".join(
            escape(str(t[0])) + " <strong><small>[" + escape(str(t[1]))
            + "]</small></strong> "
            for t in tag_list
        )

    return render(request, "index.html", ctx)
def ER_post(request):
    """Render the index page, annotating submitted text when there is any.

    When the request carries POST data with a ``user_text`` field, the text
    is segmented with the preloaded THULAC model (``pre_load_thu``) and
    scanned for entities with ``get_NE``.  Two HTML fragments are stored in
    the template context:

    * ``rlt`` -- the text with every recognised entity wrapped in an ``<a>``
      popover tag whose title/content come from ``get_explain`` and
      ``get_detail_explain``; entities rejected by ``temporaryok`` also link
      to ``detail.html?title=<entity>``;
    * ``seg_word`` -- every segmented word followed by its part-of-speech
      tag.

    Any other request renders ``index.html`` with an empty context.

    Args:
        request: Request object; only ``request.POST`` is read here.

    Returns:
        Whatever ``render(request, "index.html", ctx)`` returns.
    """
    # Function-scope imports keep this block self-contained.
    from html import escape
    from urllib.parse import quote

    ctx = {}
    # A POST that lacks ``user_text`` is treated like a plain page load
    # instead of raising a key error.
    if request.POST and 'user_text' in request.POST:
        key = request.POST['user_text'].strip()

        # cut(..., text=False) yields [word, part-of-speech] pairs.
        tag_list = pre_load_thu.cut(key, text=False)

        # Entity text originates from user input, so everything interpolated
        # below is HTML-escaped (attribute values included).
        # NOTE(review): presumably index.html outputs ``rlt`` / ``seg_word``
        # without auto-escaping (they contain markup) -- confirm.
        pieces = []
        for pair in get_NE(key):
            word = escape(pair[0])
            if pair[1] == 0:
                # Label 0 marks a stretch that is not an entity: plain text.
                pieces.append(word)
                continue

            title = escape(get_explain(pair[1]))
            detail = escape(get_detail_explain(pair[1]))
            if temporaryok(pair[1]):
                # Entity without a detail page: popover only, no navigation.
                pieces.append(
                    "<a href='#' data-original-title='" + title
                    + "(暂无资料)' data-placement='top' data-trigger='hover'"
                    + " data-content='" + detail
                    + "' class='popovers'>" + word + "</a>"
                )
                continue

            # Percent-encode the query value so characters such as '&', '#'
            # or quotes cannot break the URL or the attribute.
            pieces.append(
                "<a href='detail.html?title=" + quote(pair[0], safe="")
                + "' data-original-title='" + title
                + "' data-placement='top' data-trigger='hover'"
                + " data-content='" + detail
                + "' class='popovers'>" + word + "</a>"
            )
        ctx['rlt'] = "".join(pieces)

        ctx['seg_word'] = "".join(
            escape(t[0]) + " <strong><small>[" + escape(t[1])
            + "]</small></strong> "
            for t in tag_list
        )

    return render(request, "index.html", ctx)
def ER_post(request):
    """Render the index page, annotating submitted text when there is any.

    When the request carries POST data with a ``user_text`` field, the text
    is segmented with the preloaded THULAC model (``pre_load_thu``) and
    scanned for entities with ``get_NE``.  Two HTML fragments are stored in
    the template context:

    * ``rlt`` -- the text with every recognised entity wrapped in an ``<a>``
      popover tag whose title/content come from ``get_explain`` and
      ``get_detail_explain``; entities rejected by ``temporaryok`` also link
      to ``detail.html?title=<entity>``;
    * ``seg_word`` -- every segmented word followed by its part-of-speech
      tag.

    Any other request renders ``index.html`` with an empty context.

    Args:
        request: Request object; only ``request.POST`` is read here.

    Returns:
        Whatever ``render(request, "index.html", ctx)`` returns.
    """
    # Function-scope imports keep this block self-contained.
    from html import escape
    from urllib.parse import quote

    ctx = {}
    # A POST that lacks ``user_text`` is treated like a plain page load
    # instead of raising a key error.
    if request.POST and 'user_text' in request.POST:
        # The value is kept as text: the former ``.encode('utf-8')`` turned it
        # into bytes, which cannot be mixed with the str literals used below.
        # NOTE(review): assumes a Python 3 runtime -- confirm.
        key = request.POST['user_text'].strip()

        # cut(..., text=False) yields [word, part-of-speech] pairs.
        tag_list = pre_load_thu.cut(key, text=False)

        # Entity text originates from user input, so everything interpolated
        # below is HTML-escaped (attribute values included).
        # NOTE(review): presumably index.html outputs ``rlt`` / ``seg_word``
        # without auto-escaping (they contain markup) -- confirm.
        pieces = []
        for pair in get_NE(key):
            word = escape(pair[0])
            if pair[1] == 0:
                # Label 0 marks a stretch that is not an entity: plain text.
                pieces.append(word)
                continue

            title = escape(get_explain(pair[1]))
            detail = escape(get_detail_explain(pair[1]))
            if temporaryok(pair[1]):
                # Entity without a detail page: popover only, no navigation.
                pieces.append(
                    "<a href='#' data-original-title='" + title
                    + "(暂无资料)' data-placement='top' data-trigger='hover'"
                    + " data-content='" + detail
                    + "' class='popovers'>" + word + "</a>"
                )
                continue

            # Percent-encode the query value so characters such as '&', '#'
            # or quotes cannot break the URL or the attribute.
            pieces.append(
                "<a href='detail.html?title=" + quote(pair[0], safe="")
                + "' data-original-title='" + title
                + "' data-placement='top' data-trigger='hover'"
                + " data-content='" + detail
                + "' class='popovers'>" + word + "</a>"
            )
        ctx['rlt'] = "".join(pieces)

        ctx['seg_word'] = "".join(
            escape(t[0]) + " <strong><small>[" + escape(t[1])
            + "]</small></strong> "
            for t in tag_list
        )

    return render(request, "index.html", ctx)
def ER_post(request):
    """Render the index page, annotating submitted text when there is any.

    When the request carries POST data with a ``user_text`` field, the text
    is segmented with the preloaded THULAC model (``pre_load_thu``) and
    scanned for entities with ``get_NE``.  Two HTML fragments are stored in
    the template context:

    * ``rlt`` -- the text with every recognised entity wrapped in an ``<a>``
      popover tag whose title/content come from ``get_explain`` and
      ``get_detail_explain``; entities rejected by ``temporaryok`` also link
      to ``detail.html?title=<entity>``;
    * ``seg_word`` -- every segmented word followed by its part-of-speech
      tag.

    Any other request renders ``index.html`` with an empty context.

    Args:
        request: Request object; only ``request.POST`` is read here.

    Returns:
        Whatever ``render(request, "index.html", ctx)`` returns.
    """
    # Function-scope imports keep this block self-contained.
    from html import escape
    from urllib.parse import quote

    ctx = {}
    # A POST that lacks ``user_text`` is treated like a plain page load
    # instead of raising a key error.
    if request.POST and 'user_text' in request.POST:
        key = request.POST['user_text'].strip()

        # cut(..., text=False) yields [word, part-of-speech] pairs.
        tag_list = pre_load_thu.cut(key, text=False)

        # Entity text originates from user input, so everything interpolated
        # below is HTML-escaped (attribute values included).
        # NOTE(review): presumably index.html outputs ``rlt`` / ``seg_word``
        # without auto-escaping (they contain markup) -- confirm.
        pieces = []
        for pair in get_NE(key):
            word = escape(pair[0])
            if pair[1] == 0:
                # Label 0 marks a stretch that is not an entity: plain text.
                pieces.append(word)
                continue

            title = escape(get_explain(pair[1]))
            detail = escape(get_detail_explain(pair[1]))
            if temporaryok(pair[1]):
                # Entity without a detail page: popover only, no navigation.
                pieces.append(
                    "<a href='#' data-original-title='" + title
                    + "(暂无资料)' data-placement='top' data-trigger='hover'"
                    + " data-content='" + detail
                    + "' class='popovers'>" + word + "</a>"
                )
                continue

            # Percent-encode the query value so characters such as '&', '#'
            # or quotes cannot break the URL or the attribute.
            pieces.append(
                "<a href='detail.html?title=" + quote(pair[0], safe="")
                + "' data-original-title='" + title
                + "' data-placement='top' data-trigger='hover'"
                + " data-content='" + detail
                + "' class='popovers'>" + word + "</a>"
            )
        ctx['rlt'] = "".join(pieces)

        ctx['seg_word'] = "".join(
            escape(t[0]) + " <strong><small>[" + escape(t[1])
            + "]</small></strong> "
            for t in tag_list
        )

    return render(request, "index.html", ctx)
if (len(file) > 7 and file[-7:] == 'zh_hans'): with open(filePath, 'r') as fr: count = 0 for line in fr: count += 1 if (count % 100 == 0): print(filePath + " " + str(count)) #过滤掉<doc > </doc> 等无用行 if (len(line) < 2 or line[0:4] == '<doc' or line[0:6] == "</doc>"): continue #分句 statements = CutStatements(line) for statement in statements: #分词 cutResult = get_NE(statement.strip()) #得到每句话的实体列表后,两两匹配查询是否具有某种关系,如果有的话就写到文件中 #entityList 存储实体列表和实体出现的位置,entity1存储实体名称,entity1Index存储实体位置 entityList = [] nowIndex = -1 for word in cutResult: if (word[1] != 0 and not temporaryok(word[1])): entity1Index = statement.index( word[0], nowIndex + 1) entityList.append({ 'entity1': word[0], 'entity1Index': entity1Index })