Example #1
    def _crawl_tree (self, parse_node):

        """
        Recursively crawls the tree building a unique list of terms and the
        node 
        """

        # Build our list of term contexts that point to a term, then uniqify
        # the list by looking at the term values.
        #
        # NOTE: There is probably a better way to do this that would also let
        # us create more contexts without a large number of modifications.
        terms = [
            [TermModel.get_word_id(token), context_id_map['docstring']]
            for token in set(tokenize(parse_node.docstring))
        ]
        terms.extend([
            [TermModel.get_word_id(token), context_id_map['comment']]
            for token in set(tokenize(parse_node.comments))
        ])
        terms.extend([
            [TermModel.get_word_id(token), context_id_map['library']]
            for token in parse_node.imports
        ])
        
        if parse_node.type == 'Class':
            # Index the tokens of the class name under the 'class' context.
            terms.extend([
                [TermModel.get_word_id(token), context_id_map['class']]
                for token in tokenize(parse_node.name)
            ])

        terms = uniqify_terms(terms)

        # NOTE: We copy the list of terms so that we don't modify this tree
        # node's list of terms as we navigate upwards. Therefore, unique
        # terms from other nodes won't end up in the unique terms for this
        # node.
        node = dict(
            name=parse_node.name,
            type=parse_node.type,
            source=parse_node.source,
            signals=parse_node.signals,
            terms=terms[:],
            children=[],
        )

        for child in parse_node.children:
            child_node, child_terms = self._crawl_tree(child)
            node['children'].append(child_node)
            terms.extend(child_terms)

        return node, uniqify_terms(terms)
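
The uniqify_terms helper used above is not shown in these examples. A minimal
sketch, assuming it simply de-duplicates the [word_id, context_id] pairs while
preserving order:

def uniqify_terms(terms):
    """De-duplicate [word_id, context_id] pairs, keeping first-seen order."""
    seen = set()
    unique = []
    for word_id, context_id in terms:
        key = (word_id, context_id)
        if key not in seen:
            seen.add(key)
            unique.append([word_id, context_id])
    return unique
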
Example #2
def process (query):
        
    """
    Returns a list of tokenized strings that have been processed and
    classified (tagged).

    Returns a list of pairs (token-type, token-value) where token type
    can be keyword, library, lang.
    """
    
    # Map the match function over each token of the query, then filter the
    # results of that mapping so no invalid tokens are included in the list
    # of (token_type, sanitized_token) pairs.
    return list(
        filter(lambda x: x[0] != 'invalid', map(match, tokenize(query)))
    )
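
The match classifier used by process is not included in these examples. A
rough sketch, assuming it checks each token against small keyword, library,
and language vocabularies (the sets below are placeholders, not the real
vocabularies):

KEYWORDS = {'class', 'def', 'import', 'lambda'}     # placeholder vocabulary
LIBRARIES = {'os', 'sys', 're', 'json'}             # placeholder vocabulary
LANGUAGES = {'python', 'javascript', 'ruby'}        # placeholder vocabulary

def match(token):
    """Tag a single token; anything unrecognized is tagged 'invalid'."""
    token = token.lower()
    if token in KEYWORDS:
        return ('keyword', token)
    if token in LIBRARIES:
        return ('library', token)
    if token in LANGUAGES:
        return ('lang', token)
    return ('invalid', token)
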
Example #3
    def _summarize (self, doc, query_tfidf):
        
        """
        Returns a dynamic summary based upon the given query tfidf. The
        returned summary also has the query terms highlighted.

        Returns a summary using the top 3 most relevant terms.
        """
        
        blocks = [line for line in doc['text'].split('\n') if len(line) > 0]
        rankings = []

        for block in blocks:
            
            block_terms = tokenize(block)
            block_term_occurences = term_occurences(block_terms)
            block_term_table = self._find_terms(block_terms)

            block_tfidf = self._generate_tfidf(
                block_term_occurences, 
                block_term_table
            )
            
            terms = self._vector_term_unison(block_tfidf, query_tfidf)
            block_vector = []
            query_vector = []
            
            for term in terms:
                block_term_tfidf = block_tfidf.get(term, 0.0)
                query_term_tfidf = query_tfidf.get(term, 0.0)

                block_vector.append(block_term_tfidf)
                query_vector.append(query_term_tfidf)

            similarity = self._calculate_similarity(block_vector, query_vector)
            rankings.append((similarity, block))
        
        rankings = sorted(rankings, key=itemgetter(0), reverse=True)
        summary = ' '.join([ranking[1] for ranking in rankings[:2]])

        return highlight(summary, query_tfidf.keys())
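
The _calculate_similarity helper is not shown here; the comments in Example #4
mention cosine similarity, so a plausible sketch is plain cosine similarity
over the two aligned tf-idf vectors:

import math

def _calculate_similarity(self, block_vector, query_vector):
    """Cosine similarity between two equal-length tf-idf vectors."""
    dot = sum(b * q for b, q in zip(block_vector, query_vector))
    block_norm = math.sqrt(sum(b * b for b in block_vector))
    query_norm = math.sqrt(sum(q * q for q in query_vector))
    if block_norm == 0.0 or query_norm == 0.0:
        return 0.0
    return dot / (block_norm * query_norm)
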
Example #4
    def search (self, query, page=1, num_page=10):

        """
        Performs a search for the given query on our database.

        Returns a tuple with the number of results and a list of the actual
        results.
        """
       
        #   First we need to tokenize our given query into terms
        logging.info('Performing Query: %s' % (query))
        query_terms = tokenize(query)
        query_term_occurences = term_occurences(query_terms)
        logging.debug('Tokenized Query: %s' % (query_terms))

        #   Okay, now construct our tfidf vector for the query for cosine
        #   similarity comparison when required.
        term_table = self._find_terms(query_terms)
        query_tfidf = self._generate_tfidf(query_term_occurences, term_table)
        logging.debug('Query TFIDF: %s' % (query_tfidf))

        document_rankings = self._perform_search(query_tfidf, term_table)
        documents = []
        for rank, doc_id in document_rankings[(page - 1)*num_page:page*num_page]:
            
            start_time = time.time() 
            doc = self.documents[doc_id]
            documents.append(dict(
                similarity=rank,
                text=self._summarize(doc, query_tfidf),
                title=highlight(doc['title'], query_tfidf.keys()),
                doc_id=doc_id
            ))

            logging.debug('%.5fms to summarize document id: %s' % (
                (time.time() - start_time) * 1000,
                doc_id
            ))
        
        return len(document_rankings), documents
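
The term_occurences and _generate_tfidf helpers used in Examples #3 and #4 are
not shown. A rough sketch of one way they could fit together, assuming the
term_table maps each term to a record with a 'doc_count' field and that the
searcher tracks a total_documents count (both names are assumptions, not taken
from the original code):

import math
from collections import Counter

def term_occurences(terms):
    """Count how many times each term appears in a token list."""
    return dict(Counter(terms))

def _generate_tfidf(self, occurences, term_table):
    """Build a {term: tf-idf weight} dict for one list of tokens."""
    total = sum(occurences.values()) or 1
    tfidf = {}
    for term, count in occurences.items():
        if term not in term_table:
            # Terms unknown to the index contribute nothing.
            continue
        doc_count = term_table[term]['doc_count']       # assumed field name
        idf = math.log(float(self.total_documents) / (1.0 + doc_count))
        tfidf[term] = (count / float(total)) * idf
    return tfidf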