Example #1

import math

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# `Utilities` is a project-local helper used throughout this class
# (ComputeVector, processText, getNamedEntityChunks, getDates);
# import it from the project's own module.

class ProcessContext:
    def __init__(self, contextParas, remove_stopwords=True):
        self.utl = Utilities()
        self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        self.remove_stopwords = remove_stopwords
        self.stopwords = stopwords.words('english')
        self.stemmer = PorterStemmer()
        self.numOfParas = len(contextParas)
        self.paraInfo, self.vocab, self.processed_vocab = self.processParas(
            contextParas)
        del contextParas

    def processParas(self, paras):
        idf = {}
        docs = {}
        vocab = set()
        processed_vocab = set()

        for index in range(self.numOfParas):

            docs[index] = {}
            docs[index]['para'] = paras[index]
            docs[index]['paraWords'] = word_tokenize(paras[index])
            vocab.update(docs[index]['paraWords'])
            docs[index]['paraSentences'] = self.sent_tokenizer.tokenize(
                paras[index])
            wf, processed_sentences, pv = self.processSentences(
                docs[index]['paraSentences'])
            docs[index]['paraWF'] = wf
            docs[index]['paraProcessedSentences'] = processed_sentences
            docs[index]['paraPV'] = pv
            processed_vocab.update(pv)

            for word in pv:
                if idf.get(word, 0) == 0:
                    idf[word] = 1
                else:
                    idf[word] += 1

        self.contextIDF = {}
        for word in idf:
            # Laplace smoothing
            self.contextIDF[word] = math.log((self.numOfParas + 1) / idf[word])

        for index in range(self.numOfParas):
            docs[index]['paraVector'] = self.utl.ComputeVector(
                docs[index]['paraWF'], self.contextIDF)

        return docs, vocab, processed_vocab

    def processSentences(self, sentences):
        wf = {}
        processed_sentences = []
        processed_vocab = set()
        for sent in sentences:
            words = word_tokenize(sent)
            # processText stems the tokens (optionally dropping stopwords),
            # updates the running word-frequency dict and returns the
            # processed tokens for this sentence.
            wf, processed_sentence = self.utl.processText(
                words, self.remove_stopwords, self.stemmer, wf)
            processed_vocab.update(processed_sentence)
            processed_sentences.append(" ".join(processed_sentence))
        return wf, processed_sentences, processed_vocab

    def getResults(self, PQ):
        PQ.questionDoc['questionVector'] = self.utl.ComputeVector(
            PQ.questionDoc['questionWF'], self.contextIDF)
        # print(PQ.questionDoc['questionVector'])
        simParas = self.getSimilarParas(PQ.questionDoc)
        allSentences = []
        if simParas is not None:
            for i in simParas:
                allSentences.extend(sent_tokenize(i[0]))

        if len(allSentences) == 0:
            return "Oops! Unable to find answer"

        relevantSentencesWithScores = self.getMostRelevantSentences(
            allSentences, PQ, 1)
        sentences = [
            sentencewithscore[0]
            for sentencewithscore in relevantSentencesWithScores
        ]
        # print(relevantSentences)

        answerType = PQ.questionDoc['Atype']
        print(answerType)
        answer = " ".join(sentences[:2])

        if answerType in ["GPE", "PERSON", "ORGANIZATION"]:
            entities = self.utl.getNamedEntityChunks(sentences)
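            # Pick the first entity of the expected type whose tokens do not
            # simply repeat words from the question; entities that only echo
            # the question are skipped.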
            for entity in entities:
                if entity[0] == answerType:
                    answer = entity[1]
                    # print(entities)
                    ansTokens = [
                        self.stemmer.stem(word)
                        for word in word_tokenize(answer)
                    ]
                    if any(token in PQ.questionDoc['processedQuestion']
                           for token in ansTokens):
                        continue
                    break
        elif answerType == "DEFINITION":
            answer = " ".join(sentences[:4])

        elif answerType == "DATE":
            dates = self.utl.getDates(" ".join(sentences))
            # print(dates)
            if len(dates) > 0:
                answer = dates[0]

        elif answerType == "QUANTITY":
            answer = "Work In Progress"
        elif answerType == "YESNO":
            answer = "Work In Progress"
        else:
            if len(sentences) > 5:
                answer = " ".join(sentences[:5])

        return answer

    def getSimilarParas(self, questionVector):
        if questionVector['questionVector'] == 0:
            return None

        qv = questionVector['questionWF']

        rankedParas = []
        for index in range(self.numOfParas):
            dotProduct = 0
            paraInfo = self.paraInfo[index]
            for word in qv.keys():
                if word in paraInfo['paraWF']:
                    dotProduct += qv[word] * paraInfo['paraWF'][
                        word] * self.contextIDF[word] * self.contextIDF[word]

            sim = dotProduct / (paraInfo['paraVector'] *
                                questionVector['questionVector'])
            rankedParas.append((paraInfo['para'], sim))

        return sorted(rankedParas, key=lambda x: (x[1], x[0]),
                      reverse=True)[:4]

    def getMostRelevantSentences(self, sentences, pQ, nGram=3):
        relevantSentences = []
        for sent in sentences:
            sim = 0
            if (len(word_tokenize(pQ.question)) > nGram + 1):
                sim = self.sim_ngram_sentence(pQ.question, sent, nGram)
            else:
                sim = self.sim_sentence(pQ.qVector, sent)
            relevantSentences.append((sent, sim))

        return sorted(relevantSentences,
                      key=lambda tup: (tup[1], tup[0]),
                      reverse=True)

    def sim_ngram_sentence(self, question, sentence, nGram):
        # stop words are kept for this n-gram comparison
        ps = PorterStemmer()
        getToken = lambda question: [
            ps.stem(w.lower()) for w in word_tokenize(question)
        ]
        getNGram = lambda tokens, n: [
            " ".join([tokens[index + i] for i in range(0, n)])
            for index in range(0,
                               len(tokens) - n + 1)
        ]
        qToken = getToken(question)
        sToken = getToken(sentence)

        if (len(qToken) > nGram):
            q3gram = set(getNGram(qToken, nGram))
            s3gram = set(getNGram(sToken, nGram))
            if (len(s3gram) < nGram):
                return 0
            sim = len(q3gram.intersection(s3gram)) / len(q3gram.union(s3gram))
            return sim
        else:
            return 0

    def sim_sentence(self, queryVector, sentence):
        sentToken = word_tokenize(sentence)
        ps = PorterStemmer()
        for index in range(0, len(sentToken)):
            sentToken[index] = ps.stem(sentToken[index])
        sim = 0
        for word in queryVector.keys():
            w = ps.stem(word)
            if w in sentToken:
                sim += 1
        return sim / (len(sentToken) * len(queryVector.keys()))
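The n-gram scoring in sim_ngram_sentence above is essentially a Jaccard overlap of stemmed token n-grams. A minimal standalone sketch of that idea (plain NLTK, assuming the punkt tokenizer data is installed; the function name and edge-case handling here are illustrative, not part of the original project):

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


def ngram_jaccard(question, sentence, n=3):
    # Stemmed, lower-cased token n-grams of both strings, scored by
    # intersection over union.
    ps = PorterStemmer()
    tokens = lambda text: [ps.stem(w.lower()) for w in word_tokenize(text)]
    ngrams = lambda toks: {" ".join(toks[i:i + n])
                           for i in range(len(toks) - n + 1)}
    q, s = ngrams(tokens(question)), ngrams(tokens(sentence))
    if not q or not s:
        return 0.0
    return len(q & s) / len(q | s)


print(ngram_jaccard("Who wrote the play Hamlet?",
                    "The play Hamlet was written by Shakespeare.", n=2))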
Example #2

import math

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# `Utilities` is again the project-local helper class.

class ProcessContext:
    def __init__(self, contextParas, remove_stopwords, lemm_or_stemm,
                 use_stemmer_lemm, which_stemmer, sim_func, sent_t):
        self.utl = Utilities()
        self.sent_tokenizer = sent_tokenize
        self.sim_func = sim_func
        if sent_t == "Punkt":
            self.sent_tokenizer = nltk.data.load(
                'tokenizers/punkt/english.pickle').tokenize
        self.remove_stopwords = remove_stopwords
        self.stopwords = stopwords.words('english')
        self.stemmer = lambda x: x.lower()
        if lemm_or_stemm == "Stemming" and which_stemmer == "PorterStemmer" and use_stemmer_lemm:
            self.stemmer = PorterStemmer().stem
        elif lemm_or_stemm == "Stemming" and which_stemmer == "SnowBallStemmer" and use_stemmer_lemm:
            self.stemmer = SnowballStemmer(language='english').stem
        elif lemm_or_stemm == "Lemmanization" and use_stemmer_lemm:
            self.stemmer = WordNetLemmatizer().lemmatize
        self.numOfParas = len(contextParas)
        self.paraInfo, self.vocab, self.processed_vocab = self.processParas(
            contextParas)
        del contextParas

    def processParas(self, paras):
        idf = {}
        docs = {}
        vocab = set()
        processed_vocab = set()

        for index in range(self.numOfParas):
            docs[index] = {}
            docs[index]['para'] = paras[index]
            docs[index]['paraWords'] = word_tokenize(paras[index])
            vocab.update(docs[index]['paraWords'])
            docs[index]['paraSentences'] = self.sent_tokenizer(paras[index])
            wf, processed_sentences, pv = self.processSentences(
                docs[index]['paraSentences'])
            docs[index]['paraWF'] = wf
            docs[index]['paraProcessedSentences'] = processed_sentences
            docs[index]['paraPV'] = pv
            processed_vocab.update(pv)

            for word in pv:
                if idf.get(word, 0) == 0:
                    idf[word] = 1
                else:
                    idf[word] += 1

        self.contextIDF = {}
        for word in idf:
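            # Laplace smoothing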
            self.contextIDF[word] = math.log((self.numOfParas + 1) / idf[word])

        for index in range(self.numOfParas):
            docs[index]['paraVector'] = self.utl.ComputeVector(
                docs[index]['paraWF'], self.contextIDF)

        return docs, vocab, processed_vocab

    def processSentences(self, sentences):
        wf = {}
        processed_sentences = []
        processed_vocab = set()
        for sent in sentences:
            processed_sentence = []
            words = word_tokenize(sent)
            wf, processed_sentence = self.utl.processText(
                words, self.remove_stopwords, self.stemmer, wf)
            processed_vocab.update(processed_sentence)
            processed_sentences.append(" ".join(processed_sentence))
        return wf, processed_sentences, processed_vocab

    def getResults(self, PQ):
        PQ.questionDoc['questionVector'] = self.utl.ComputeVector(
            PQ.questionDoc['questionWF'], self.contextIDF)
        # print(PQ.questionDoc['questionVector'])
        simParas = self.getSimilarParas(PQ.questionDoc)
        allSentences = []
        if simParas is not None:
            for i in simParas:
                allSentences.extend(sent_tokenize(i[0]))

        if len(allSentences) == 0:
            return "Oops! Unable to find answer"

        relevantSentencesWithScores = self.getMostRelevantSentences(
            allSentences, PQ, 1)
        if self.sim_func in ["SkLearn", "Gensim"]:
            # Seed with sentences similar to the question, then keep expanding
            # with sentences similar to the current best hit until at least
            # five are collected or nothing new turns up. The best hit is
            # picked via max() on the score, since a set cannot be indexed.
            sentencesWithSimScores = set(
                self.getMostSimilarSentences(PQ.question, allSentences))
            while 0 < len(sentencesWithSimScores) < 5:
                best = max(sentencesWithSimScores, key=lambda x: x[1])
                newScores = set(
                    self.getMostSimilarSentences(best[0], allSentences, True))
                if newScores.issubset(sentencesWithSimScores):
                    break
                sentencesWithSimScores.update(newScores)
            relevantSentencesWithScores = sorted(sentencesWithSimScores,
                                                 key=lambda x: x[1],
                                                 reverse=True)

        sentences = [
            sentencewithscore[0]
            for sentencewithscore in relevantSentencesWithScores
        ]
        # print(relevantSentences)

        answerType = PQ.questionDoc['Atype']
        # print(answerType)
        answer = " ".join(sentences[:2])

        if answerType in ["GPE", "PERSON", "ORGANIZATION"]:
            entities = self.utl.getNamedEntityChunks(sentences)
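            # Pick the first entity of the expected type whose tokens do not
            # simply repeat words from the question; entities that only echo
            # the question are skipped.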
            for entity in entities:
                if entity[0] == answerType:
                    answer = entity[1]
                    # print(entities)
                    ansTokens = [
                        self.stemmer(word) for word in word_tokenize(answer)
                    ]
                    if any(token in PQ.questionDoc['processedQuestion']
                           for token in ansTokens):
                        continue
                    break
        elif answerType == "DEFINITION":
            answer = " ".join(sentences[:4])

        elif answerType == "DATE":
            dates = self.utl.getDates(" ".join(sentences))
            # print(dates)
            if len(dates) > 0:
                answer = dates[0]

        elif answerType == "QUANTITY":
            answer = "Work In Progress"
        elif answerType == "YESNO":
            answer = "Work In Progress"
        else:
            if len(sentences) > 5:
                answer = " ".join(sentences[:5])

        return answer

    def getSimilarParas(self, questionVector):
        if questionVector['questionVector'] == 0:
            return None

        qv = questionVector['questionWF']

        rankedParas = []
        for index in range(self.numOfParas):
            dotProduct = 0
            paraInfo = self.paraInfo[index]
            for word in qv.keys():
                if word in paraInfo['paraWF']:
                    dotProduct += qv[word] * paraInfo['paraWF'][
                        word] * self.contextIDF[word] * self.contextIDF[word]

            sim = dotProduct / (paraInfo['paraVector'] *
                                questionVector['questionVector'])
            rankedParas.append((paraInfo['para'], sim))

        return sorted(rankedParas, key=lambda x: (x[1], x[0]),
                      reverse=True)[:4]

    def getMostSimilarSentences(self, question, sentences, second_time=False):
        documents = [question]
        documents.extend(sentences)
        df = self.getsimilarityDF(documents)
        res = cosine_similarity(df, df)
        z = res[0, :]
        y = []
        for idx, i in enumerate(z):
            if i > 0:
                y.append((documents[idx], i))
        arr = sorted(y, key=lambda x: (x[1], x[0]), reverse=True)
        return arr[2:] if second_time else arr[1:]

    def getsimilarityDF(self, documents):
        count_vectorizer = CountVectorizer()
        sparse_matrix = count_vectorizer.fit_transform(documents)
        # toarray() gives a plain ndarray; the np.matrix returned by todense()
        # is rejected by recent scikit-learn versions.
        doc_term_matrix = sparse_matrix.toarray()
        return doc_term_matrix

    # Thanks to vaibhav's project for the methods below.
    def getMostRelevantSentences(self, sentences, pQ, nGram=3):
        relevantSentences = []
        for sent in sentences:
            sim = 0
            if (len(word_tokenize(pQ.question)) > nGram + 1):
                sim = self.sim_ngram_sentence(pQ.question, sent, nGram)
            else:
                sim = self.sim_sentence(pQ.qVector, sent)
            relevantSentences.append((sent, sim))

        return sorted(relevantSentences,
                      key=lambda tup: (tup[1], tup[0]),
                      reverse=True)

    def sim_ngram_sentence(self, question, sentence, nGram):
        # stop words are kept for this n-gram comparison
        ps = PorterStemmer()
        getToken = lambda question: [
            ps.stem(w.lower()) for w in word_tokenize(question)
        ]
        getNGram = lambda tokens, n: [
            " ".join([tokens[index + i] for i in range(0, n)])
            for index in range(0,
                               len(tokens) - n + 1)
        ]
        qToken = getToken(question)
        sToken = getToken(sentence)

        if (len(qToken) > nGram):
            q3gram = set(getNGram(qToken, nGram))
            s3gram = set(getNGram(sToken, nGram))
            if (len(s3gram) < nGram):
                return 0
            sim = len(q3gram.intersection(s3gram)) / len(q3gram.union(s3gram))
            return sim
        else:
            return 0

    def sim_sentence(self, queryVector, sentence):
        sentToken = word_tokenize(sentence)
        ps = PorterStemmer()
        for index in range(0, len(sentToken)):
            sentToken[index] = ps.stem(sentToken[index])
        sim = 0
        for word in queryVector.keys():
            w = ps.stem(word)
            if w in sentToken:
                sim += 1
        return sim / (len(sentToken) * len(queryVector.keys()))
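Example #2's getMostSimilarSentences/getsimilarityDF pair ranks sentences with a bag-of-words count matrix and cosine similarity. A self-contained sketch of that scoring step with scikit-learn (names are illustrative; the class above additionally re-queries with its best hit):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def rank_by_cosine(question, sentences):
    # Row 0 of the count matrix is the question; the remaining rows are the
    # candidate sentences, each scored by cosine similarity to the question.
    matrix = CountVectorizer().fit_transform([question] + list(sentences))
    scores = cosine_similarity(matrix[0], matrix[1:]).ravel()
    return sorted(zip(sentences, scores), key=lambda p: p[1], reverse=True)


candidates = ["The cat sat on the mat.",
              "Dogs bark at night.",
              "A cat was sitting on the mat."]
print(rank_by_cosine("Where did the cat sit?", candidates))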