def getSplitData(self, node):
    """Split one question node into its attributes, body and comment nodes.

    Reads the ``QID``, ``QCATEGORY``, ``QUSERID``, ``QTYPE`` and ``QGOLD_YN``
    attributes of ``node`` through ``DataExtrator.getAttrValue``, takes the
    value of the first ``QBody`` child, and collects the ``Comment`` children.

    Returns:
        A 7-tuple ``(qid, qcategory, quserid, qtype, qgold_yn, qBody,
        commentNodeList)``.

    Raises:
        IndexError: if ``DataExtrator.getXMLNode(node, 'QBody')`` yields an
            empty sequence (the first element is taken unconditionally).
    """
    # Attribute names in the exact order they appear in the returned tuple.
    attribute_names = ('QID', 'QCATEGORY', 'QUSERID', 'QTYPE', 'QGOLD_YN')
    attribute_values = [
        DataExtrator.getAttrValue(node, attribute_name)
        for attribute_name in attribute_names
    ]

    body_node = DataExtrator.getXMLNode(node, 'QBody')[0]
    question_body = DataExtrator.getNodeValue(body_node)
    comment_nodes = DataExtrator.getXMLNode(node, 'Comment')

    return tuple(attribute_values) + (question_body, comment_nodes)
def getTestData(self, testQtype, testQBody, testCommentNodeList, trainTopNWords):
    """Build one feature entry per test comment using the training top-N words.

    For every comment node, the question body and the comment body are
    concatenated, passed to ``self.selectTopNWords``, and the resulting top-N
    words are turned into features by ``self.getFeatures`` together with
    ``trainTopNWords``.

    Args:
        testQtype: Question type. Kept for interface compatibility; this
            method does not read it.
        testQBody: Body of the test question (concatenated with each comment
            body using ``+``; presumably a string -- confirm with caller).
        testCommentNodeList: Iterable of comment XML nodes. Each node is read
            for its ``CID`` attribute and its first ``CBody`` child.
        trainTopNWords: Top-N words from training, forwarded unchanged to
            ``self.getFeatures``.

    Returns:
        A ``(testData, cidList)`` tuple of two parallel lists: the feature
        entry of each comment and the ``CID`` of that comment.

    Raises:
        IndexError: if a comment node has no ``CBody`` child (the first
            element is taken unconditionally).
    """
    testData = []
    cidList = []
    for commentNode in testCommentNodeList:
        # Only CID and CBody are needed here; the CUSERID / CGOLD / CGOLD_YN
        # attributes were previously fetched but never used.
        cid = DataExtrator.getAttrValue(commentNode, 'CID')
        cBody = DataExtrator.getNodeValue(
            DataExtrator.getXMLNode(commentNode, 'CBody')[0])

        # NOTE(review): the two bodies are joined without a separator, so the
        # last word of the question may fuse with the first word of the
        # comment -- confirm this is what selectTopNWords expects.
        testTopNWords, _ = self.selectTopNWords(testQBody + cBody)

        # The double list wrapping is only a structural change so that the
        # existing getFeatures method can be reused for a single comment.
        allFeatures = self.getFeatures(trainTopNWords, [[testTopNWords]])
        testData.append(allFeatures[0][0])
        cidList.append(cid)
    return testData, cidList
def generateFeatures(self, qtype, qBody, commentNodeList):
    """Group comment bodies by gold label and compute their features.

    Each comment body is placed into a bucket chosen by comparing its gold
    label against the module-level ``questionMarks`` sequence:

    * when ``qtype == general`` the ``CGOLD`` attribute is compared with
      ``questionMarks[0]`` .. ``questionMarks[5]``;
    * when ``qtype == yesno`` the ``CGOLD_YN`` attribute is compared with
      ``questionMarks[6]`` .. ``questionMarks[8]``.

    A comment whose label matches none of the candidates for its question
    type goes into the "no display" bucket. Comments of any other question
    type are not collected. N/A labels (``questionMarks[9]``) get no bucket
    of their own because they need no similarity computation.

    Args:
        qtype: Question type, compared against the module-level ``general``
            and ``yesno`` values.
        qBody: Question body; becomes the single element of the first group.
        commentNodeList: Iterable of comment XML nodes, each read for its
            ``CGOLD`` / ``CGOLD_YN`` attributes and first ``CBody`` child.

    Returns:
        A ``(allFeatures, topNWords)`` tuple: the result of
        ``self.getFeatures`` and the top-N words from
        ``self.selectTopNWords``.

    Raises:
        IndexError: if a comment node has no ``CBody`` child.
    """
    goodComments, badComments, potentialComments = [], [], []
    dialogueComments, nonEnglishComments, otherComments = [], [], []
    yesComments, noComments, unsureComments = [], [], []
    noDisplayComments = []

    # Bucket i of generalBuckets collects comments labelled questionMarks[i].
    generalBuckets = [
        goodComments,
        badComments,
        potentialComments,
        dialogueComments,
        nonEnglishComments,
        otherComments,
    ]
    # Bucket i of yesNoBuckets collects comments labelled questionMarks[6 + i].
    yesNoFirstMark = len(generalBuckets)
    yesNoBuckets = [yesComments, noComments, unsureComments]

    for node in commentNodeList:
        cgold = DataExtrator.getAttrValue(node, 'CGOLD')
        cgold_yn = DataExtrator.getAttrValue(node, 'CGOLD_YN')
        cBody = DataExtrator.getNodeValue(
            DataExtrator.getXMLNode(node, 'CBody')[0])

        if qtype == general:
            label, firstMark, buckets = cgold, 0, generalBuckets
        elif qtype == yesno:
            # ctype is not yet taken into account for yes/no questions.
            label, firstMark, buckets = cgold_yn, yesNoFirstMark, yesNoBuckets
        else:
            # Comments of any other question type are not collected.
            continue

        # First matching label wins; questionMarks is indexed lazily so only
        # the entries actually compared are accessed.
        for offset, bucket in enumerate(buckets):
            if label == questionMarks[firstMark + offset]:
                bucket.append(cBody)
                break
        else:
            noDisplayComments.append(cBody)

    # Group order: question body, the six general buckets, the three yes/no
    # buckets, then the "no display" bucket. N/A answers are left out on
    # purpose since they need no similarity computation.
    allList = [[qBody]] + generalBuckets + yesNoBuckets + [noDisplayComments]

    topNWords, splitWords = self.selectTopNWords(allList)
    # Drop the first entry before feature extraction -- presumably the one
    # corresponding to the question body group; confirm against
    # selectTopNWords.
    splitWords.pop(0)
    allFeatures = self.getFeatures(topNWords, splitWords)
    return allFeatures, topNWords