def run(self, query, page=Page(0, 20)):
    '''
    Theory:
        t -> term / token
        r -> relevant
        D -> document
        Q -> query
        sum -> sum over all terms from the query
        w_t -> weight of term t
        sum(w_t) -> sum of all term weights from the query

        document weight = sum(log(p(D_t|Q,r) / p(D_t|Q,not r)))
        where
            p(D_t|Q,r) = 0.5
            p(D_t|Q,not r) = n_t / N_d
        where
            n_t = number of documents containing term t
            N_d = total number of documents

    Args:
        query: query string
        page: user defined page
    '''
    tokens = tokenize_text(query)
    if len(tokens) <= 0:
        return []

    results = []
    for doc_key, document in self.documents.items():
        result = 0
        for token in tokens:
            # skip terms that do not occur in this document
            if token not in document.bag:
                continue
            token_docs = self.docs_bag.get(token, {})
            token_docs_len = len(token_docs)
            if token_docs_len <= 0:
                continue
            # n_t / N_d, the probability of the term in a non-relevant document
            ratio = 1.0 * token_docs_len / self.docs_no
            # guard against a near-zero ratio before taking the logarithm
            if abs(ratio) < 10e-8:
                continue
            result += math.log(0.5 / ratio)
        if result > 0:
            results.append((doc_key, result))

    # sort by score (descending) and keep only the document keys
    results = sorted(results, key=lambda x: x[1], reverse=True)
    results = list(map(lambda x: x[0], results))
    return results[page.start_index:page.end_index]
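
# A minimal, self-contained sketch of the scoring rule above, run on a toy
# corpus of three documents. It only illustrates sum(log(0.5 / (n_t / N_d)));
# the real method iterates over self.documents and self.docs_bag, which are
# replaced here by local dictionaries for readability.
import math

def toy_score(query_tokens, doc_tokens, docs_with_term, total_docs):
    score = 0
    for token in query_tokens:
        if token not in doc_tokens:
            continue
        n_t = docs_with_term.get(token, 0)
        if n_t <= 0:
            continue
        ratio = n_t / total_docs          # p(D_t|Q, not r)
        score += math.log(0.5 / ratio)    # log(p(D_t|Q, r) / p(D_t|Q, not r))
    return score

# "search" occurs in 1 of 3 documents, so its contribution is log(0.5 / (1/3)).
print(toy_score(['search'], {'search', 'engine'}, {'search': 1}, 3))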
def run(self, query, page=Page(0, 20)):
    '''
    Procedure:
        1. tokenize the query
        2. calculate query weight
        3. calculate all distances
        4. sort distances
        5. return sorted result

    Args:
        query: query string
        page: page size and offset

    Returns:
        list of identifiers
    '''
    # tokenize query
    tokens = tokenize_text(query)
    if len(tokens) <= 0:
        return []

    # calculate query weight (tf-idf vector of the query terms)
    bag_of_words = bag(tokens)
    max_freq = bag_of_words[max(bag_of_words, key=bag_of_words.get)]
    query_tf = lil_matrix((1, self.tokens_no))
    for token, freq in bag_of_words.items():
        if token not in self.tokens:
            continue
        index = self.tokens[token]
        query_tf[0, index] = tf(freq, max_freq)
    query_tf = csr_matrix(query_tf)
    query_w = csr_matrix(query_tf.multiply(self.idf))

    # calculate distances between all documents and the query
    distances = self.distance(self.tf_idf, query_w)

    # sort results and return the requested page
    distances = distances[:, 0]
    sorted_indices = np.argsort(distances)
    top = sorted_indices[page.start_index:page.end_index]
    f = np.vectorize(lambda x: self.iterative_docs[x])
    result = list(f(top))
    return result
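
# A minimal sketch of step 2 (query weighting) with concrete numbers. The
# tf(freq, max_freq) helper and the idf vector are assumptions here: tf is
# taken to be freq / max_freq, and idf is a hand-written dense row rather
# than the values stored on self.idf.
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix

vocabulary = {'search': 0, 'engine': 1, 'index': 2}    # token -> column index
idf = np.array([[1.10, 0.40, 0.92]])                   # assumed idf per token

query_bag = {'search': 2, 'engine': 1}                 # token -> frequency in the query
max_freq = max(query_bag.values())

query_tf = lil_matrix((1, len(vocabulary)))
for token, freq in query_bag.items():
    query_tf[0, vocabulary[token]] = freq / max_freq   # assumed tf definition

query_w = csr_matrix(csr_matrix(query_tf).multiply(idf))
print(query_w.toarray())   # tf-idf weights of the query, one column per token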
def preprocess_one(raw_file):
    '''
    Procedure:
        1. create document
        2. fill it with the data

    Args:
        raw_file: text (string)

    Returns:
        Document
    '''
    document = Document()
    document.text = raw_file
    document.identifier = text_hash(raw_file)
    document.content_hash = document.identifier
    document.tokens = tokenize_text(raw_file)
    document.bag = bag(document.tokens)
    return document
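
# The helpers used above (text_hash, tokenize_text, bag) are defined elsewhere
# in the project; the sketch below only shows plausible minimal versions so the
# preprocessing step can be read end to end. The exact hashing and tokenization
# rules are assumptions, not the project's actual implementations.
import hashlib
import re
from collections import Counter

def text_hash(text):
    # stable identifier for the raw text (assumed to be a content hash)
    return hashlib.sha1(text.encode('utf-8')).hexdigest()

def tokenize_text(text):
    # lowercase word tokenizer (assumed; the project may also strip stop words)
    return re.findall(r'\w+', text.lower())

def bag(tokens):
    # bag of words: token -> number of occurrences
    return dict(Counter(tokens))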
def run(self, query, page=Page(0, 20)):
    '''
    Procedure:
        1. tokenize the query
        2. for each token, get a normalized score for each document that
           contains the token
        3. sum all normalized scores

    Why normalization:
        E.g. if the query is "test", the document "Test one more time." is
        more relevant than the document "Test one more time because something
        could go wrong.", because "test" has a bigger impact in the shorter
        document.

    Args:
        query: query string
        page: page size and offset

    Returns:
        list of identifiers
    '''
    tokens = tokenize_text(query)
    if len(tokens) <= 0:
        return []

    data = {}
    for token in tokens:
        if token not in self.docs_bag:
            continue
        docs_bag_item = self.docs_bag[token]
        for doc_key, token_occurrence in docs_bag_item.items():
            doc_size = len(self.documents[doc_key].tokens)
            if doc_size <= 0:
                normalized = 0
            else:
                # normalize the term count by the document length
                normalized = token_occurrence / doc_size
            data[doc_key] = data.get(doc_key, 0) + normalized

    # sort document keys by their accumulated score (descending)
    data = sorted(data, key=data.get, reverse=True)
    return data[page.start_index:page.end_index]
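
# A standalone illustration of the length normalization above, using the two
# example documents from the docstring. The token counts are computed inline
# instead of coming from self.docs_bag / self.documents, so the numbers are
# easy to check by hand.
doc_a = "test one more time".split()
doc_b = "test one more time because something could go wrong".split()

# "test" occurs once in each document, but the shorter one scores higher
score_a = doc_a.count("test") / len(doc_a)   # 1 / 4  = 0.25
score_b = doc_b.count("test") / len(doc_b)   # 1 / 9  ~ 0.11
print(score_a > score_b)                     # True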