Python TextBits 예제들, preprocessing.Segment.TextBits Python 예제들

예제 #1

0

파일 보기

파일: Text.py 프로젝트: aascode/rdocChallenge

 def remove_concepts_from_denied_questions(self):
     """
     Re-filter ``self.concepts`` and ``self.tokens`` with the denial cues.

     Both attributes are overwritten with the results of
     ``processConceptsOnContext`` and ``processWordsOnContext``, each called
     with ``TextBits().denials`` and otherwise default arguments.

     Intended behaviour, as described by the original author (the helpers
     that implement it are defined elsewhere): when the answer is "yes" or
     complicated, the concepts found in the question are kept; when the
     answer is 'No' or 'Wnl' (within normal limits), concepts detected in
     the question are ignored (a long answer is still kept).
     """
     cues = TextBits()
     # Concepts are reassigned first; the token pass runs afterwards.
     self.concepts = self.processConceptsOnContext(cues.denials)
     self.tokens = self.processWordsOnContext(cues.denials)

예제 #2

0

파일 보기

파일: Text.py 프로젝트: aascode/rdocChallenge

 def remove_concepts_from_uncertain_questions(self):
     """
     Re-filter concepts with the "conflicted" cues and tokens with the
     denial cues.

     ``self.concepts`` is replaced by
     ``processConceptsOnContext(TextBits().conflicted)`` (default options),
     and ``self.tokens`` by ``processWordsOnContext`` called with the denial
     cues in modify mode and the prefix ``'DEN_'``.

     Intended behaviour, as described by the original author: when the
     answer is "yes" or complicated, the concepts in the question are kept;
     when the answer is uncertain, concepts detected in the question are
     ignored (a long answer is still kept).
     """
     cues = TextBits()
     self.concepts = self.processConceptsOnContext(cues.conflicted)
     # NOTE(review): tokens are processed with the *denial* cues in modify
     # mode, while concepts use the *conflicted* cues in remove mode --
     # confirm this asymmetry is intentional.
     self.tokens = self.processWordsOnContext(
         cues.denials, prefix='DEN_', modifyInsteadOfRemove=True)

예제 #3

0

파일 보기

파일: Text.py 프로젝트: aascode/rdocChallenge

 def separate_concepts_from_uncertain_questions(self):
     """
     Mark, rather than drop, concepts and tokens based on context cues.

     ``self.concepts`` is replaced by the result of
     ``processConceptsOnContext`` called with the "conflicted" cues in
     modify mode and the prefix ``'UNC_'``; ``self.tokens`` is replaced by
     the result of ``processWordsOnContext`` called with the denial cues in
     modify mode and the prefix ``'DEN_'``.

     Intended behaviour, as described by the original author: when the
     answer is "yes" or complicated, the concepts in the question are kept
     unchanged; when the answer is uncertain, the concept id receives a
     prefix.
     """
     cues = TextBits()
     self.concepts = self.processConceptsOnContext(
         cues.conflicted, prefix='UNC_', modifyInsteadOfRemove=True)
     self.tokens = self.processWordsOnContext(
         cues.denials, prefix='DEN_', modifyInsteadOfRemove=True)

예제 #4

0

파일 보기

파일: Text.py 프로젝트: aascode/rdocChallenge

    def separate_concepts_from_family_questions(self):
        """
        Prefix the ids of concepts that occur in family-related segments.

        The text is split into question/answer segments. Segments without
        an answer are skipped. For every remaining segment:

        * if none of the ``TextBits().family`` cues occurs in the
          lower-cased question, the concepts and words covered by the
          question and by the answer are retained unchanged;
        * otherwise a shallow copy of every covered concept (question and
          answer) is retained with ``'FAM_'`` prepended to its ``ide``.

        ``self.concepts`` and ``self.tokens`` are overwritten with the
        retained concepts and words.
        """
        tb = TextBits()
        prefix = 'FAM_'
        retainedConcepts = []
        retainedWords = []
        segmenter = Segment()

        for segment in segmenter.segment(self):
            if not segment.answers:
                # Unanswered segments contribute neither concepts nor words.
                continue

            qCon = self.get_covered_concepts_annots(segment.begQue,
                                                    segment.endQue)
            aCon = self.get_covered_concepts_annots(segment.begAns,
                                                    segment.endAns)

            question = segment.question.lower()
            if any(cue in question for cue in tb.family):
                # Family-related segment: keep question AND answer concepts
                # under a prefixed id. Each list is walked exactly once; the
                # answer concepts must not be nested inside the question
                # loop, or they get duplicated per question concept and
                # lost when the question has no concepts.
                for con in (*qCon, *aCon):
                    nc = copy.copy(con)  # leave the original annotation intact
                    nc.ide = prefix + con.ide
                    retainedConcepts.append(nc)
                # NOTE(review): words of family-related segments are not
                # retained here (unchanged from before) -- confirm intended.
            else:
                retainedConcepts.extend(qCon)
                retainedConcepts.extend(aCon)
                retainedWords.extend(
                    self.get_covered_words_annots(segment.begQue,
                                                  segment.endQue))
                retainedWords.extend(
                    self.get_covered_words_annots(segment.begAns,
                                                  segment.endAns))

        self.concepts = retainedConcepts
        self.tokens = retainedWords

예제 #5

0

파일 보기

파일: Text.py 프로젝트: aascode/rdocChallenge

    def get_non_denied_text(self):
        """
        Collect tokens from answered segments, depending on denial cues.

        For every segment that has at least one answer, the first answer is
        lower-cased and split on whitespace. If none of the resulting words
        is among ``TextBits().denials``, the tokens covered by the answer
        are collected; otherwise the tokens covered by the question are
        collected. Segments without answers contribute nothing.

        :return: list of the collected tokens, in segment order.
        """
        collected = []
        splitter = Segment()
        bits = TextBits()

        for seg in splitter.segment(self):
            question_tokens = self.get_covered_tokens(seg.begQue, seg.endQue)
            answer_tokens = self.get_covered_tokens(seg.begAns, seg.endAns)
            if not seg.answers:
                continue

            first_answer_words = set(seg.answers[0].lower().split())
            # NOTE(review): a denied answer keeps the *question* tokens and
            # a non-denied answer keeps only the *answer* tokens -- confirm
            # this is the intended meaning of "non-denied text".
            if first_answer_words.isdisjoint(bits.denials):
                collected.extend(answer_tokens)
            else:
                collected.extend(question_tokens)
        return collected

예제 #6

0

파일 보기

파일: Text.py 프로젝트: aascode/rdocChallenge

    def remove_concepts_from_family_questions(self):
        """
        Drop concepts and words that belong to family-related segments.

        The text is split into question/answer segments. A segment's
        concepts and words (question and answer parts) are retained only if
        the segment has an answer and none of the ``TextBits().family`` cues
        occurs in its lower-cased question. Everything else is discarded.

        ``self.concepts`` and ``self.tokens`` are overwritten with the
        retained concepts and words.
        """
        bits = TextBits()
        kept_concepts = []
        kept_words = []
        splitter = Segment()

        for seg in splitter.segment(self):
            question_concepts = self.get_covered_concepts_annots(
                seg.begQue, seg.endQue)
            answer_concepts = self.get_covered_concepts_annots(
                seg.begAns, seg.endAns)

            question_words = self.get_covered_words_annots(
                seg.begQue, seg.endQue)
            answer_words = self.get_covered_words_annots(
                seg.begAns, seg.endAns)

            if not seg.answers:
                # Unanswered segments are never retained.
                continue
            if any(cue in seg.question.lower() for cue in bits.family):
                # Family-related question: remove mode, so keep nothing.
                continue

            for group in (question_concepts, answer_concepts):
                kept_concepts.extend(group)
            for group in (question_words, answer_words):
                kept_words.extend(group)

        self.concepts = kept_concepts
        self.tokens = kept_words

예제 #7

0

파일 보기

파일: QandAFeatures.py 프로젝트: aascode/rdocChallenge

    def __init__(self,
                 expSet,
                 trainSet,
                 groupTrainQues=False,
                 findSimTestQ=False):
        """
        Class for featurization based on Q-and-A

        :param expSet: Experiment settings. Its ``featTypes`` lists the
            feature names to apply; names are matched case-insensitively
            against the supported features and unknown names are ignored.
        :param trainSet: A list of training that can be used to determine the questions and/or question clusters used
        :param groupTrainQues: When true, questions of the training set are
            additionally grouped and question types are assigned per group.
        :param findSimTestQ: Stored as ``self.retrieveSimilarQuestions``.
        """
        self.groupTrainQues = groupTrainQues
        self.retrieveSimilarQuestions = findSimTestQ

        self.segmenter = Segment()
        self.textBits = TextBits()

        self.expSet = expSet

        featuredict = {
            "BOW_ANSWERS": self.get_bow_answers,
            "CATEGORICAL_QUESTIONSET": self.get_categorical_questionset,
            "QUESTIONSET": self.get_questionset,
            "PREAMBLE_CLUSTERS": self.get_preamble_clusters,
            "CONCEPT_CLUSTERS": self.get_concept_clusters,
            "LONG_QUESTIONSET": self.get_long_questionset,
            "CONCEPTS_FROM_SUMMARY": self.get_concepts_from_summary
        }

        #questionSet - > yes/no/uncertain/else QSet
        #descQuesSet add
        # Keys keep the caller's original spelling; only the lookup is
        # case-insensitive.
        self.featList = {
            f: featuredict[f.upper()]
            for f in expSet.featTypes if f.upper() in featuredict
        }

        # make a list of segments found in the entire trainingset
        segments = []
        # used for concept-based question clusterer (a reconstructed list of
        # concepts that occur in a question)
        self.segConPairs = dict()
        for d in trainSet:
            textObject = d.getTextObject()
            currSegments = self.segmenter.segment(textObject)
            for currSegment in currSegments:
                currConInSegment = set()
                for concept in textObject.get_covered_concepts_annots(
                        currSegment.begQue, currSegment.endQue):
                    nc = copy.copy(concept)
                    # removing prefixes if there are any: everything up to
                    # and including the first underscore is dropped
                    _, separator, remainder = nc.ide.partition('_')
                    if separator:
                        nc.ide = remainder
                    currConInSegment.add(nc)
                self.segConPairs[currSegment.question] = currConInSegment
            segments.extend(currSegments)

        # Determine features.
        # Gets questions from trainingset. {Questions:freq in train data}
        self.questionSet = Counter([seg.question.upper() for seg in segments])

        #for each q in qset assign categories - short/long etc
        (self.all_unique_questions, self.all_questions_type,
         self.all_questions_cat) = self.identify_ques_type(
             trainSet, thresholdYNU=0.3, thresholdCat=0.3, thresholdLong=0.3)

        ## Now, doing the same for groups of questions occurring in train
        if self.groupTrainQues:
            utils.out("Grouping questions")
            (self.grouped_questions, self.questions_type,
             self.grouped_questions_cat) = self.get_grouped_questions(
                 trainSet, simThreshold=0.9)
            utils.out("Assigning question type to grouped questions")
            self.grouped_questions_type = self.assign_ques_type(
                self.questions_type,
                0.14,
                len(trainSet),
                thresholdYNU=0.3,
                thresholdCat=0.3,
                thresholdLong=0.3)  #14%: smallest data set size

            utils.out("Done preprocessing questions, now clustering")

        # featList is keyed by the caller's spelling, so compare upper-cased
        # names; otherwise a feature requested as e.g. 'preamble_clusters'
        # would be registered above but never get its clusters computed.
        enabledFeatures = {f.upper() for f in self.featList}

        ## Getting clusters based on the preamble
        if 'PREAMBLE_CLUSTERS' in enabledFeatures:
            # NOTE(review): the method name looks like a corrupted
            # 'calculatePreambleClusters' -- confirm against its definition.
            self.calcFalseulatePreambleClusters()

        ## Getting clusters based on the concepts in the questions
        if 'CONCEPT_CLUSTERS' in enabledFeatures:
            self.calculateCommonConceptClusters()