def remove_concepts_from_denied_questions(self):
    """
    Filter concepts and tokens using the denial cues.

    Intended behaviour: when a question is answered with a denial
    (e.g. 'No' or 'Wnl', within normal limits) the concepts detected in
    that question are ignored; when the answer is "yes" or more elaborate
    they are kept. The actual filtering is delegated to
    ``processConceptsOnContext`` and ``processWordsOnContext``; their
    results replace ``self.concepts`` and ``self.tokens``.
    """
    text_bits = TextBits()
    self.concepts = self.processConceptsOnContext(text_bits.denials)
    self.tokens = self.processWordsOnContext(text_bits.denials)
def remove_concepts_from_uncertain_questions(self):
    """
    Filter concepts using the 'conflicted' (uncertain-answer) cues.

    Intended behaviour: concepts detected in a question are ignored when
    the answer is uncertain and kept when the answer is "yes" or more
    elaborate. The filtering itself is delegated to
    ``processConceptsOnContext``; its result replaces ``self.concepts``.

    Tokens are handled differently: they are processed against the denial
    cues in modify mode with the prefix 'DEN_', and the result replaces
    ``self.tokens``.
    """
    text_bits = TextBits()
    self.concepts = self.processConceptsOnContext(text_bits.conflicted)
    # NOTE(review): tokens use the denial cues (not the conflicted cues) and
    # are modified rather than removed - confirm this asymmetry is intended.
    self.tokens = self.processWordsOnContext(
        text_bits.denials,
        modifyInsteadOfRemove=True,
        prefix='DEN_',
    )
def separate_concepts_from_uncertain_questions(self):
    """
    Mark, rather than drop, concepts from uncertainly answered questions.

    Concepts are processed against the 'conflicted' cues in modify mode
    with the prefix 'UNC_'; tokens are processed against the denial cues
    in modify mode with the prefix 'DEN_'. The results replace
    ``self.concepts`` and ``self.tokens`` respectively. How the prefixes
    are applied is decided by ``processConceptsOnContext`` and
    ``processWordsOnContext``.
    """
    text_bits = TextBits()
    self.concepts = self.processConceptsOnContext(
        text_bits.conflicted,
        modifyInsteadOfRemove=True,
        prefix='UNC_',
    )
    self.tokens = self.processWordsOnContext(
        text_bits.denials,
        modifyInsteadOfRemove=True,
        prefix='DEN_',
    )
def separate_concepts_from_family_questions(self):
    """
    Tag concepts from family-related questions with a 'FAM_' prefix.

    The text is split into question/answer segments. Segments without an
    answer contribute nothing. For an answered segment:

    * if none of the ``TextBits().family`` cues occurs in the lower-cased
      question, the concepts and words of both question and answer are
      kept unchanged;
    * otherwise a shallow copy of every question and answer concept is
      kept with 'FAM_' prepended to its ``ide``; the words of such a
      segment are not retained.

    The collected lists replace ``self.concepts`` and ``self.tokens``.
    """
    text_bits = TextBits()
    family_prefix = 'FAM_'
    kept_concepts = []
    kept_words = []
    splitter = Segment()
    for segment in splitter.segment(self):
        question_concepts = self.get_covered_concepts_annots(
            segment.begQue, segment.endQue)
        answer_concepts = self.get_covered_concepts_annots(
            segment.begAns, segment.endAns)
        question_words = self.get_covered_words_annots(
            segment.begQue, segment.endQue)
        answer_words = self.get_covered_words_annots(
            segment.begAns, segment.endAns)

        # Unanswered segments are skipped entirely.
        if not segment.answers:
            continue

        about_family = any(
            cue in segment.question.lower() for cue in text_bits.family)

        if about_family:
            # Keep the concepts, but under a prefixed id so they stay
            # distinguishable from concepts outside family questions.
            for group in (question_concepts, answer_concepts):
                for concept in group:
                    tagged = copy.copy(concept)
                    tagged.ide = family_prefix + concept.ide
                    kept_concepts.append(tagged)
        else:
            kept_concepts.extend(question_concepts)
            kept_concepts.extend(answer_concepts)
            kept_words.extend(question_words)
            kept_words.extend(answer_words)

    self.concepts = kept_concepts
    self.tokens = kept_words
def get_non_denied_text(self):
    """
    Collect the tokens of this text that are not part of a denied answer.

    The text is split into question/answer segments. For a segment with
    answers, the answer tokens are kept only when the first answer shares
    no (lower-cased, whitespace-split) word with ``TextBits().denials``.
    For a segment without answers, the question tokens are kept.

    :return: list of the collected tokens, in segment order
    """
    words = []
    segmenter = Segment()
    tb = TextBits()
    for segment in segmenter.segment(self):
        q_words = self.get_covered_tokens(segment.begQue, segment.endQue)
        a_words = self.get_covered_tokens(segment.begAns, segment.endAns)
        if segment.answers:
            # Only the first answer is inspected for denial words.
            if not set(segment.answers[0].lower().split()).intersection(
                    tb.denials):
                words.extend(a_words)
        # NOTE(review): this else is read as belonging to
        # "if segment.answers" (unanswered segment -> keep the question
        # text); confirm against the intended behaviour.
        else:
            words.extend(q_words)
    return words
def remove_concepts_from_family_questions(self):
    """
    Drop concepts and words that belong to family-related questions.

    The text is split into question/answer segments. Only answered
    segments whose lower-cased question contains none of the
    ``TextBits().family`` cues are retained; for those, the concepts and
    words of both question and answer are kept unchanged. Everything else
    (unanswered segments and family-related segments) is discarded.

    The collected lists replace ``self.concepts`` and ``self.tokens``.
    """
    text_bits = TextBits()
    kept_concepts = []
    kept_words = []
    splitter = Segment()
    for segment in splitter.segment(self):
        question_concepts = self.get_covered_concepts_annots(
            segment.begQue, segment.endQue)
        answer_concepts = self.get_covered_concepts_annots(
            segment.begAns, segment.endAns)
        question_words = self.get_covered_words_annots(
            segment.begQue, segment.endQue)
        answer_words = self.get_covered_words_annots(
            segment.begAns, segment.endAns)

        # Unanswered segments are skipped entirely.
        if not segment.answers:
            continue

        # Family-related questions are removed: nothing is retained.
        if any(cue in segment.question.lower() for cue in text_bits.family):
            continue

        kept_concepts.extend(question_concepts)
        kept_concepts.extend(answer_concepts)
        kept_words.extend(question_words)
        kept_words.extend(answer_words)

    self.concepts = kept_concepts
    self.tokens = kept_words
def __init__(self, expSet, trainSet, groupTrainQues=False, findSimTestQ=False):
    """
    Class for featurization based on Q-and-A.

    :param expSet: Experiment settings. ``expSet.featTypes`` lists the
        names of the features to apply; names are matched
        case-insensitively against the supported features and unknown
        names are ignored.
    :param trainSet: An iterable of training documents (each providing
        ``getTextObject()``) used to determine the questions and/or
        question clusters.
    :param groupTrainQues: When true, similar training questions are
        grouped and a question type is assigned to each group.
    :param findSimTestQ: Stored as ``self.retrieveSimilarQuestions``.
    """
    self.groupTrainQues = groupTrainQues
    self.retrieveSimilarQuestions = findSimTestQ
    self.segmenter = Segment()
    self.textBits = TextBits()
    self.expSet = expSet

    featuredict = {
        "BOW_ANSWERS": self.get_bow_answers,
        "CATEGORICAL_QUESTIONSET": self.get_categorical_questionset,
        "QUESTIONSET": self.get_questionset,
        "PREAMBLE_CLUSTERS": self.get_preamble_clusters,
        "CONCEPT_CLUSTERS": self.get_concept_clusters,
        "LONG_QUESTIONSET": self.get_long_questionset,
        "CONCEPTS_FROM_SUMMARY": self.get_concepts_from_summary,
    }
    # questionSet -> yes/no/uncertain/else QSet
    # descQuesSet add

    # Determine features. Keys keep the caller's original spelling.
    self.featList = {
        f: featuredict[f.upper()]
        for f in expSet.featTypes if f.upper() in featuredict
    }
    # Upper-cased view of the selected features, so the checks below are
    # case-insensitive just like the selection above.
    selectedFeatures = {f.upper() for f in self.featList}

    # Make a list of segments found in the entire training set.
    segments = []
    # Used for the concept-based question clusterer: maps a question to
    # the (prefix-stripped) concepts that occur in it.
    self.segConPairs = dict()
    for d in trainSet:
        currSegments = self.segmenter.segment(d.getTextObject())
        for currSegment in currSegments:
            currConInSegment = set()
            for concept in d.getTextObject().get_covered_concepts_annots(
                    currSegment.begQue, currSegment.endQue):
                nc = copy.copy(concept)
                # Remove a prefix (everything up to the first '_') if any.
                if '_' in nc.ide:
                    nc.ide = nc.ide.partition('_')[2]
                currConInSegment.add(nc)
            self.segConPairs[currSegment.question] = currConInSegment
        segments.extend(currSegments)

    # Questions from the training set: {QUESTION: frequency in train data}.
    self.questionSet = Counter([seg.question.upper() for seg in segments])

    # For each question in the question set assign categories
    # (short/long etc.).
    (self.all_unique_questions, self.all_questions_type,
     self.all_questions_cat) = self.identify_ques_type(
         trainSet, thresholdYNU=0.3, thresholdCat=0.3, thresholdLong=0.3)

    # Now, doing the same for groups of questions occurring in train.
    if self.groupTrainQues:
        utils.out("Grouping questions")
        (self.grouped_questions, self.questions_type,
         self.grouped_questions_cat) = self.get_grouped_questions(
             trainSet, simThreshold=0.9)
        utils.out("Assigning question type to grouped questions")
        # 0.14 (14%): smallest data set size.
        self.grouped_questions_type = self.assign_ques_type(
            self.questions_type,
            0.14,
            len(trainSet),
            thresholdYNU=0.3,
            thresholdCat=0.3,
            thresholdLong=0.3)

    utils.out("Done preprocessing questions, now clustering")

    # Getting clusters based on the preamble.
    if 'PREAMBLE_CLUSTERS' in selectedFeatures:
        # NOTE(review): method name kept verbatim from the original call;
        # it looks like a mangled "calculatePreambleClusters" - confirm
        # against the method definition before renaming.
        self.calcFalseulatePreambleClusters()

    # Getting clusters based on the concepts in the questions.
    if 'CONCEPT_CLUSTERS' in selectedFeatures:
        self.calculateCommonConceptClusters()