import pickle
from collections import Counter
from math import sqrt

import numpy as np

# Project-internal dependencies (AbstractFragmentIterator, POSListIndex,
# Fragment, TokenSplitter, the *Matcher/*Builder classes, calcHist, TYPE_SIGN,
# etc.) are assumed to be imported from the surrounding package.


class CollocationBuilder(AbstractFragmentIterator):
    def __init__(self, accessor, indexPrefix, configuration=None):
        super(CollocationBuilder, self).__init__(accessor, indexPrefix,
                                                 configuration)
        self.posListIndex = POSListIndex(accessor, indexPrefix)
        self.flMatcher = FormalLanguagesMatcher()
        self.defisWordsBuilder = DefisWordsBuilder()

    def preProcess(self):
        self.fragments = {}
        self.totalCounts = Counter()
        self.totalLen = 0
        self.fragmentLen = {}

    def postProcess(self):
        # Serialize the per-type collocation grammars, ordered by descending
        # in-type frequency.
        grammars = {}
        for fType in self.fragments:
            grammars[fType] = []
            for fragment in sorted(self.fragments[fType],
                                   key=self.fragments[fType].get,
                                   reverse=True):
                grammars[fType].append({
                    'name': str(fragment),
                    'freq': self.fragments[fType][fragment] / self.fragmentLen[fType],
                    'total_freq': self.totalCounts[fragment] / self.totalLen,
                    'grammar': fragment.genGrammar(),
                })
        with open(self.accessor.directory + self.prefix + 'collocations.pcl',
                  'wb') as f:
            pickle.dump(grammars, f, pickle.HIGHEST_PROTOCOL)

    def processFragmentStart(self, fType):
        self.fragments[fType] = Counter()
        self.fragmentLen[fType] = 0
        self.goodWords = self.posListIndex.getFunctionalNouns(fType, 0.5)

    def processFragmentEnd(self, fType):
        pass

    def processDocument(self, fType, headerId, docId):
        text = self.headerIndex.getDocSection(docId, headerId)
        self.tokenSplitter.split(text)
        tokens = self.tokenSplitter.getTokenArray()
        self.posTagger.posTagging(tokens)
        self.flMatcher.combineTokens(tokens)
        self.defisWordsBuilder.combineTokens(tokens)
        fragmentStart = -1
        fragmentEnd = -1
        self.fragmentLen[fType] += len(tokens)
        self.totalLen += len(tokens)
        signCount = 0
        for i in range(len(tokens)):
            # Count the current candidate span if it is long enough and not
            # dominated by punctuation tokens.
            if ((fragmentEnd - fragmentStart >= 2
                 and signCount < fragmentEnd - fragmentStart)
                    or (fragmentEnd - fragmentStart == 1
                        and tokens[fragmentStart].tokenType != TYPE_SIGN)):
                fragment = Fragment(tokens[fragmentStart:fragmentEnd],
                                    self.goodWords)
                self.fragments[fType][fragment] += 1
                self.totalCounts[fragment] += 1
                signCount = 0
            # "Common" here means a noun from the subject domain.
            if Fragment.isCommon(tokens[i], self.goodWords):
                fragmentStart = -1
                fragmentEnd = -1
                signCount = 0
            else:
                if fragmentStart == -1:
                    fragmentStart = i
                if tokens[i].tokenType == TYPE_SIGN:
                    signCount += 1
                fragmentEnd = i

    def printFragments(self, onlyGood=False):
        for fType in self.fragments:
            print(fType)
            for fragment in sorted(self.fragments[fType],
                                   key=self.fragments[fType].get,
                                   reverse=True):
                if onlyGood and not fragment.isGood():
                    continue
                if self.fragments[fType][fragment] < 10:
                    break
                print("\t{}:{}".format(str(fragment),
                                       self.fragments[fType][fragment]))
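
# A minimal driving sketch, not part of the original pipeline: it exercises
# the iterator hooks above by hand. The `accessor` object, the 'definition'
# fragment type, and the header/document ids are hypothetical placeholders
# for whatever AbstractFragmentIterator normally supplies.
def _collocation_builder_demo(accessor, indexPrefix='index_'):
    builder = CollocationBuilder(accessor, indexPrefix)
    builder.preProcess()
    builder.processFragmentStart('definition')      # hypothetical fragment type
    builder.processDocument('definition', headerId=0, docId=0)
    builder.processFragmentEnd('definition')
    builder.postProcess()                           # writes <prefix>collocations.pcl
    builder.printFragments(onlyGood=True)
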
class TextFragmentator:
    def __init__(self, accessor, prefix):
        # self.hists, self.tfidf, self.patterns
        self.tokenSplitter = TokenSplitter()
        self.posTagger = POSTagger()
        self.flSelector = FormalLanguagesMatcher()
        self.defisWordsBuilder = DefisWordsBuilder()
        self.initialsWordsBuilder = InitialsWordsBuilder()
        self.formalLanguagesMatcher = FormalLanguagesMatcher()
        self.headersMatcher = HeaderMatcher()
        self.sentenceSplitter = SentenceSplitter()
        self.posListIndex = POSListIndex(accessor, prefix)
        self.collocationsGrammars = CollocationGrammars(accessor, prefix)
        self.fragmentTypes = self.collocationsGrammars.getFunctionalTypes()
        self.verbs = self.posListIndex.getVerbsHistsForAllTypes()
        self.patternMatcher = PatternMatcher()
        self.sq = lambda x: x * x
        # Precompute the Euclidean norm of every type's verb histogram.
        self.sums = {}
        for fType in self.verbs:
            self.sums[fType] = self.module(self.verbs[fType])

    def module(self, data):
        # Euclidean norm of a histogram's values.
        return sqrt(sum(map(self.sq, data.values())))

    def findPatterns(self, fType, sentence):
        patternsWeight = 0
        if not sentence.internalTokens:
            return patternsWeight
        patterns = self.collocationsGrammars.getGrammars(fType, border=0.00005)
        for pattern in patterns:
            # Skip patterns that are no more frequent inside this fragment
            # type than in the corpus as a whole.
            if pattern['freq'] / pattern['total_freq'] <= 1:
                continue
            self.patternMatcher.setParameters(pattern['grammar'], fType)
            self.patternMatcher.combineTokens(sentence.internalTokens, False)
            if len(self.patternMatcher.newTokens) > 0:
                sentence.setFlag(fType, True)
                patternsWeight += pattern['freq'] / pattern['total_freq']
        return patternsWeight

    def estimateLexicalSimilarity(self, fType, hists):
        # Cosine similarity between the sentence's verb histogram and the
        # reference verb histogram of this fragment type.
        typeVerbs = self.verbs[fType]
        est = 0
        for v in hists:
            est += hists[v] * typeVerbs[v]
        module = self.module(hists)
        if module == 0:
            return 0
        est /= self.sums[fType] * module
        return est

    def estimate(self, fType, sentence, hists):
        patternsWeight = self.findPatterns(fType, sentence)
        lexicalSimilarity = self.estimateLexicalSimilarity(fType, hists)
        return {
            'pattern': patternsWeight,
            'lexical': lexicalSimilarity,
        }

    def genFragments(self, text, border=0.1):
        tokens = self.tokenSplitter.split(text)
        self.newTokens = []
        self.posTagger.posTagging(tokens)
        self.defisWordsBuilder.combineTokens(tokens)
        self.initialsWordsBuilder.combineTokens(tokens)
        self.headersMatcher.combineTokens(tokens)
        self.formalLanguagesMatcher.combineTokens(tokens)
        self.sentenceSplitter.combineTokens(tokens)
        # np.float is deprecated; the builtin float is equivalent here.
        lexicalEstimations = np.zeros((len(tokens), len(self.fragmentTypes)),
                                      dtype=float)
        patternEstimations = np.zeros((len(tokens), len(self.fragmentTypes)),
                                      dtype=float)
        for ind in range(len(tokens)):
            if not tokens[ind].internalTokens:
                hists = calcHist([tokens[ind]])['VERB']
            else:
                hists = calcHist(tokens[ind].internalTokens)['VERB']
            for fTypeInd in range(len(self.fragmentTypes)):
                fType = self.fragmentTypes[fTypeInd]
                oneEstimation = self.estimate(fType, tokens[ind], hists)
                lexicalEstimations[ind][fTypeInd] = oneEstimation['lexical']
                patternEstimations[ind][fTypeInd] = oneEstimation['pattern']
        # Normalize each criterion to [0, 1]; pattern evidence gets full
        # weight, lexical similarity half weight.
        lexicalEstimations = lexicalEstimations / np.amax(lexicalEstimations)
        patternEstimations = patternEstimations / np.amax(patternEstimations)
        totalEstimations = patternEstimations + 0.5 * lexicalEstimations
        totalEstimations = totalEstimations / np.amax(totalEstimations)
        orderedLexicalTypes = np.argsort(lexicalEstimations, axis=1)
        orderedPatternTypes = np.argsort(patternEstimations, axis=1)
        orderedTotalTypes = np.argsort(totalEstimations, axis=1)
        self.combineTokens(
            tokens, {
                'lexical': {
                    'values': lexicalEstimations,
                    'order': orderedLexicalTypes
                },
                'pattern': {
                    'values': patternEstimations,
                    'order': orderedPatternTypes
                },
                'total': {
                    'values': totalEstimations,
                    'order': orderedTotalTypes
                }
            }, border)
        return tokens

    def calcType(self, border, estC, estL=None, estR=None):
        # Vote across the three criteria (lexical, pattern, total): the best
        # type of the current sentence gets 6 votes, its runner-up 2 or 4,
        # and the best types of the neighbouring sentences 2 each.
        votes = Counter()
        weight = Counter()
        for parameter in range(len(estC)):
            estCBest = estC[parameter]['order'][-1]
            estC2Best = estC[parameter]['order'][-2]
            if estC[parameter]['value'][estCBest] > border:
                votes[self.fragmentTypes[estCBest]] += 6
                weight[self.fragmentTypes[estCBest]] += estC[parameter]['value'][estCBest]
                if estC[parameter]['value'][estC2Best] > 0.7 * estC[parameter]['value'][estCBest]:
                    votes[self.fragmentTypes[estC2Best]] += 4
                else:
                    votes[self.fragmentTypes[estC2Best]] += 2
                weight[self.fragmentTypes[estC2Best]] += estC[parameter]['value'][estC2Best]
            if estL:
                estLBest = estL[parameter]['order'][-1]
                if estL[parameter]['value'][estLBest] > border:
                    votes[self.fragmentTypes[estLBest]] += 2
            if estR:
                estRBest = estR[parameter]['order'][-1]
                if estR[parameter]['value'][estRBest] > border:
                    votes[self.fragmentTypes[estRBest]] += 2
        commons = votes.most_common(2)
        commonWeights = weight.most_common(2)
        if len(commonWeights) == 0:
            return None
        if len(commons) == 0 or commons[0][1] == 0:
            return None
        # Accept the winner unless it is tied with the runner-up on votes and
        # the accumulated weights are within 10% of each other.
        if (len(commons) == 1
                or (len(commonWeights) > 1
                    and commonWeights[1][1] < 0.9 * commonWeights[0][1])
                or commons[1][1] != commons[0][1]):
            return commons[0][0]
        return None

    def __getOneEstimation(self, estimations, indToken):
        return [
            {
                'order': estimations['lexical']['order'][indToken],
                'value': estimations['lexical']['values'][indToken]
            },
            {
                'order': estimations['pattern']['order'][indToken],
                'value': estimations['pattern']['values'][indToken]
            },
            {
                'order': estimations['total']['order'][indToken],
                'value': estimations['total']['values'][indToken]
            },
        ]

    def __convertEstimationToNativeFormat(self, estimation):
        res = [['lexical', []], ['pattern', []], ['total', []]]
        for criteria in range(len(res)):
            for fTypeInd in estimation[criteria]['order']:
                res[criteria][1].append(
                    (self.fragmentTypes[fTypeInd],
                     estimation[criteria]['value'][fTypeInd]))
        return res

    def combineTokens(self, tokens, estimations, border):
        tokenTypes = []
        for indToken in range(len(tokens)):
            estC = self.__getOneEstimation(estimations, indToken)
            estL = None
            estR = None
            if indToken > 0:
                estL = self.__getOneEstimation(estimations, indToken - 1)
            if indToken < len(tokens) - 1:
                # Was `indToken - 1`: the right neighbour, not the left one.
                estR = self.__getOneEstimation(estimations, indToken + 1)
            tokenTypes.append(self.calcType(border, estC, estL, estR))
            tokens[indToken].setAdditionalInfo('functionalType',
                                               tokenTypes[indToken])
            if tokenTypes[indToken]:
                self.newTokens.append(tokens[indToken])
                tokens[indToken].setAdditionalInfo(
                    'ft_estimations',
                    self.__convertEstimationToNativeFormat(estC))
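
# A minimal usage sketch for TextFragmentator, assuming the same
# accessor/prefix pair that POSListIndex and CollocationGrammars expect; the
# `getAdditionalInfo` call is an assumed getter symmetric to the
# setAdditionalInfo used in combineTokens above.
def _fragmentator_demo(accessor, prefix, text):
    fragmentator = TextFragmentator(accessor, prefix)
    sentences = fragmentator.genFragments(text, border=0.1)
    for sentence in sentences:
        # Each sentence-level token now carries its predicted functional type.
        print(sentence.token, sentence.getAdditionalInfo('functionalType'))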