Example #1
import pickle
from collections import Counter

# Project-specific dependencies (import paths depend on the surrounding
# codebase): AbstractFragmentIterator, POSListIndex, FormalLanguagesMatcher,
# DefisWordsBuilder, Fragment, TYPE_SIGN.


class CollocationBuilder(AbstractFragmentIterator):
    def __init__(self, accessor, indexPrefix, configuration=None):
        super(CollocationBuilder, self).__init__(accessor, indexPrefix,
                                                 configuration)
        self.posListIndex = POSListIndex(accessor, indexPrefix)
        self.flMatcher = FormalLanguagesMatcher()
        self.defisWordsBuilder = DefisWordsBuilder()

    def preProcess(self):
        self.fragments = {}
        self.totalCounts = Counter()
        self.totalLen = 0
        self.fragmentLen = {}

    def postProcess(self):
        grammars = {}
        for fType in self.fragments:
            grammars[fType] = []
            for fragment in sorted(self.fragments[fType],
                                   key=self.fragments[fType].get,
                                   reverse=True):
                grammars[fType].append({
                    'name': str(fragment),
                    'freq': self.fragments[fType][fragment] / self.fragmentLen[fType],
                    'total_freq': self.totalCounts[fragment] / self.totalLen,
                    'grammar': fragment.genGrammar(),
                })
        with open(self.accessor.directory + self.prefix + 'collocations.pcl',
                  'wb') as f:
            pickle.dump(grammars, f, pickle.HIGHEST_PROTOCOL)
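
    # The pickle written above maps fragment type -> list of collocation
    # records ({'name', 'freq', 'total_freq', 'grammar'}), ordered by
    # decreasing frequency within the fragment type.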

    def processFragmentStart(self, fType):
        self.fragments[fType] = Counter()
        self.fragmentLen[fType] = 0
        self.goodWords = self.posListIndex.getFunctionalNouns(fType, 0.5)

    def processFragmentEnd(self, fType):
        pass

    def processDocument(self, fType, headerId, docId):
        # Tokenize the document section, tag parts of speech, and merge
        # multi-word tokens before scanning for collocation candidates.
        text = self.headerIndex.getDocSection(docId, headerId)
        self.tokenSplitter.split(text)
        tokens = self.tokenSplitter.getTokenArray()
        self.posTagger.posTagging(tokens)
        self.flMatcher.combineTokens(tokens)
        self.defisWordsBuilder.combineTokens(tokens)
        fragmentStart = -1
        fragmentEnd = -1
        self.fragmentLen[fType] += len(tokens)
        self.totalLen += len(tokens)
        signCount = 0
        for i in range(len(tokens)):
            # Count the span accumulated so far when it is long enough and is
            # not punctuation only (a single-token span must not be a sign).
            if ((fragmentEnd - fragmentStart >= 2
                 and signCount < fragmentEnd - fragmentStart)
                    or (fragmentEnd - fragmentStart == 1
                        and tokens[fragmentStart].tokenType != TYPE_SIGN)):
                fragment = Fragment(tokens[fragmentStart:fragmentEnd],
                                    self.goodWords)
                self.fragments[fType][fragment] += 1
                self.totalCounts[fragment] += 1
                signCount = 0
                # "Общий" - существительное из предметной области
            if Fragment.isCommon(tokens[i], self.goodWords):
                fragmentStart = -1
                fragmentEnd = -1
                signCount = 0
            else:
                if fragmentStart == -1:
                    fragmentStart = i
                if tokens[i].tokenType == TYPE_SIGN:
                    signCount += 1
                fragmentEnd = i

    def printFragments(self, onlyGood=False):
        for fType in self.fragments:
            print(fType)
            for fragment in sorted(self.fragments[fType],
                                   key=self.fragments[fType].get,
                                   reverse=True):
                if onlyGood and not fragment.isGood():
                    continue
                # Sorted by decreasing count, so stop at the first rare one.
                if self.fragments[fType][fragment] < 10:
                    break
                print("\t{}:{}".format(str(fragment),
                                       self.fragments[fType][fragment]))
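
A minimal driver sketch for Example #1. The accessor construction, the fragment-type names, and the assumption that the callbacks run in this order under AbstractFragmentIterator are all hypothetical, not confirmed by the source:

# Hypothetical usage sketch for CollocationBuilder; 'accessor', the fragment
# types, and corpus_sections() are stand-ins for project-specific objects.
builder = CollocationBuilder(accessor, 'demo_')
builder.preProcess()
for fType in ('introduction', 'conclusion'):        # hypothetical types
    builder.processFragmentStart(fType)
    for headerId, docId in corpus_sections(fType):  # hypothetical iterator
        builder.processDocument(fType, headerId, docId)
    builder.processFragmentEnd(fType)
builder.postProcess()   # writes <directory><prefix>collocations.pcl
builder.printFragments(onlyGood=True)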
Example #2
from collections import Counter
from math import sqrt

import numpy as np

# Project-specific dependencies (import paths depend on the surrounding
# codebase): TokenSplitter, POSTagger, FormalLanguagesMatcher,
# DefisWordsBuilder, InitialsWordsBuilder, HeaderMatcher, SentenceSplitter,
# POSListIndex, CollocationGrammars, PatternMatcher, calcHist.


class TextFragmentator:
    def __init__(self, accessor, prefix):
        #self.hists
        #self.tfidf
        #self.patterns

        self.tokenSplitter = TokenSplitter()
        self.posTagger = POSTagger()
        self.flSelector = FormalLanguagesMatcher()
        self.defisWordsBuilder = DefisWordsBuilder()
        self.initialsWordsBuilder = InitialsWordsBuilder()
        self.formalLanguagesMatcher = FormalLanguagesMatcher()
        self.headersMatcher = HeaderMatcher()
        self.sentenceSplitter = SentenceSplitter()

        self.posListIndex = POSListIndex(accessor, prefix)
        self.collocationsGrammars = CollocationGrammars(accessor, prefix)
        self.fragmentTypes = self.collocationsGrammars.getFunctionalTypes()
        self.verbs = self.posListIndex.getVerbsHistsForAllTypes()
        self.patternMatcher = PatternMatcher()

        # Cache the norm of each type's verb histogram; it is the constant
        # denominator of the cosine similarity in estimateLexicalSimilarity.
        self.sums = {}
        for fType in self.verbs:
            self.sums[fType] = self.module(self.verbs[fType])

    def module(self, data):
        # Euclidean (L2) norm of a histogram's values.
        return sqrt(sum(x * x for x in data.values()))

    def findPatterns(self, fType, sentence):
        patternsWeight = 0
        if not sentence.internalTokens:
            return patternsWeight
        patterns = self.collocationsGrammars.getGrammars(fType, border=0.00005)
        for pattern in patterns:
            # Skip patterns that are not over-represented in this fragment type.
            if pattern['freq'] / pattern['total_freq'] <= 1:
                continue
            self.patternMatcher.setParameters(pattern['grammar'], fType)
            self.patternMatcher.combineTokens(sentence.internalTokens, False)
            if len(self.patternMatcher.newTokens) > 0:
                sentence.setFlag(fType, True)
                patternsWeight += pattern['freq'] / pattern['total_freq']
        return patternsWeight

    def estimateLexicalSimilarity(self, fType, hists):
        typeVerbs = self.verbs[fType]
        est = 0
        for v in hists:
            est += hists[v] * typeVerbs.get(v, 0)
        module = self.module(hists)
        if module == 0:
            return 0
        # Cosine similarity between the sentence's verb histogram and the
        # fragment type's verb histogram.
        est /= self.sums[fType] * module
        return est
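
    # Worked example (hypothetical numbers): with hists = {'be': 2, 'use': 1}
    # and typeVerbs = {'be': 3, 'use': 4}, est = 2*3 + 1*4 = 10,
    # |hists| = sqrt(5), |typeVerbs| = 5, so the similarity is
    # 10 / (5 * sqrt(5)) ~= 0.894.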

    def estimate(self, fType, sentence, hists):
        patternsWeight = self.findPatterns(fType, sentence)
        lexicalSimilarity = self.estimateLexicalSimilarity(fType, hists)
        return {
            'pattern': patternsWeight,
            'lexical': lexicalSimilarity,
        }

    def genFragments(self, text, border=0.1):
        self.tokenSplitter.split(text)
        tokens = self.tokenSplitter.getTokenArray()
        self.newTokens = []
        self.posTagger.posTagging(tokens)
        self.defisWordsBuilder.combineTokens(tokens)
        self.initialsWordsBuilder.combineTokens(tokens)
        self.headersMatcher.combineTokens(tokens)
        self.formalLanguagesMatcher.combineTokens(tokens)
        self.sentenceSplitter.combineTokens(tokens)

        lexicalEstimations = np.zeros((len(tokens), len(self.fragmentTypes)),
                                      dtype=float)
        patternEstimations = np.zeros((len(tokens), len(self.fragmentTypes)),
                                      dtype=float)
        totalEstimations = np.zeros((len(tokens), len(self.fragmentTypes)),
                                    dtype=float)

        for ind in range(len(tokens)):
            if not tokens[ind].internalTokens:
                hists = calcHist([tokens[ind]])['VERB']
            else:
                hists = calcHist(tokens[ind].internalTokens)['VERB']
            for fTypeInd in range(len(self.fragmentTypes)):
                fType = self.fragmentTypes[fTypeInd]
                oneEstimation = self.estimate(fType, tokens[ind], hists)
                lexicalEstimations[ind][fTypeInd] = oneEstimation['lexical']
                patternEstimations[ind][fTypeInd] = oneEstimation['pattern']
        # Normalise each criterion to [0, 1], then combine them with pattern
        # evidence weighted twice as much as lexical similarity.
        lexicalEstimations = lexicalEstimations / np.amax(lexicalEstimations)
        patternEstimations = patternEstimations / np.amax(patternEstimations)
        totalEstimations = patternEstimations + 0.5 * lexicalEstimations

        totalEstimations = totalEstimations / np.amax(totalEstimations)
        orderedLexicalTypes = np.argsort(lexicalEstimations, axis=1)
        orderedPatternTypes = np.argsort(patternEstimations, axis=1)
        orderedTotalTypes = np.argsort(totalEstimations, axis=1)

        self.combineTokens(
            tokens, {
                'lexical': {
                    'values': lexicalEstimations,
                    'order': orderedLexicalTypes
                },
                'pattern': {
                    'values': patternEstimations,
                    'order': orderedPatternTypes
                },
                'total': {
                    'values': totalEstimations,
                    'order': orderedTotalTypes
                }
            }, border)
        return tokens

    def calcType(self, border, estC, estL=None, estR=None):
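        # Voting scheme (as implemented below): for each of the three criteria
        # (lexical, pattern, total) whose best score exceeds the border, the
        # token's best-scoring type gets 6 votes and its runner-up 4 votes
        # (2 votes when the runner-up scores under 70% of the best); the best
        # types of the left/right neighbours get 2 votes each.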
        votes = Counter()
        weight = Counter()

        for parameter in range(len(estC)):
            estCBest = estC[parameter]['order'][-1]
            estC2Best = estC[parameter]['order'][-2]
            if estC[parameter]['value'][estCBest] > border:
                votes[self.fragmentTypes[estCBest]] += 6
                weight[self.fragmentTypes[estCBest]] += \
                    estC[parameter]['value'][estCBest]
                if (estC[parameter]['value'][estC2Best]
                        > 0.7 * estC[parameter]['value'][estCBest]):
                    votes[self.fragmentTypes[estC2Best]] += 4
                else:
                    votes[self.fragmentTypes[estC2Best]] += 2
                weight[self.fragmentTypes[estC2Best]] += \
                    estC[parameter]['value'][estC2Best]
            if estL:
                estLBest = estL[parameter]['order'][-1]
                if estL[parameter]['value'][estLBest] > border:
                    votes[self.fragmentTypes[estLBest]] += 2
            if estR:
                estRBest = estR[parameter]['order'][-1]
                if estR[parameter]['value'][estRBest] > border:
                    votes[self.fragmentTypes[estRBest]] += 2
        commons = votes.most_common(2)
        commonWeights = weight.most_common(2)
        if len(commonWeights) == 0:
            return None
        if len(commons) == 0 or commons[0][1] == 0:
            return None
        if (len(commons) == 1
                or (len(commonWeights) > 1
                    and commonWeights[1][1] < 0.9 * commonWeights[0][1])
                or commons[1][1] != commons[0][1]):
            return commons[0][0]
        return None

    def __getOneEstimation(self, estimations, indToken):
        return [
            {
                'order': estimations['lexical']['order'][indToken],
                'value': estimations['lexical']['values'][indToken]
            },
            {
                'order': estimations['pattern']['order'][indToken],
                'value': estimations['pattern']['values'][indToken]
            },
            {
                'order': estimations['total']['order'][indToken],
                'value': estimations['total']['values'][indToken]
            },
        ]

    def __convertEstimationToNativeFormat(self, estimation):
        res = [['lexical', []], ['pattern', []], ['total', []]]
        for criteria in range(len(res)):
            for fTypeInd in estimation[criteria]['order']:
                res[criteria][1].append(
                    (self.fragmentTypes[fTypeInd],
                     estimation[criteria]['value'][fTypeInd]))
        return res

    def combineTokens(self, tokens, estimations, border):
        tokenTypes = []
        for indToken in range(len(tokens)):
            estC = self.__getOneEstimation(estimations, indToken)

            estL = None
            estR = None
            if indToken > 0:
                estL = self.__getOneEstimation(estimations, indToken - 1)
            if indToken < len(tokens) - 1:
                estR = self.__getOneEstimation(estimations, indToken + 1)
            tokenTypes.append(self.calcType(border, estC, estL, estR))
            tokens[indToken].setAdditionalInfo('functionalType',
                                               tokenTypes[indToken])
            if tokenTypes[indToken]:
                self.newTokens.append(tokens[indToken])
                tokens[indToken].setAdditionalInfo(
                    'ft_estimations',
                    self.__convertEstimationToNativeFormat(estC))
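
A minimal usage sketch for Example #2. The accessor and prefix values, the input file, and the getAdditionalInfo accessor are assumptions; genFragments annotates the tokens in place and returns them:

# Hypothetical usage sketch for TextFragmentator; 'accessor', 'demo_', the
# input file, and getAdditionalInfo() are stand-ins / assumed accessors.
fragmentator = TextFragmentator(accessor, 'demo_')
with open('paper.txt', encoding='utf-8') as f:
    tokens = fragmentator.genFragments(f.read(), border=0.1)
for token in fragmentator.newTokens:   # tokens that received a functional type
    print(token.token, token.getAdditionalInfo('functionalType'))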