Example #1
    def _crawl_tree (self, parse_node):

        """
        Recursively crawls the tree, building the node structure and a
        unique list of terms for this subtree.
        """

        # Build our list of term contexts that point to a term. Once done,
        # uniqify the list by looking at the term values.
        #
        # NOTE: There must be a better way to do this, also, so we can
        # create more contexts easily without making a large amount of
        # modifications.
        terms = [
            [TermModel.get_word_id(token), context_id_map['docstring']] \
            for token in list(set(tokenize(parse_node.docstring)))
        ]
        terms.extend([
            [TermModel.get_word_id(token), context_id_map['comment']] \
            for token in list(set(tokenize(parse_node.comments)))
        ])
        terms.extend([
            [TermModel.get_word_id(token), context_id_map['library']] \
            for token in parse_node.imports
        ])
        
        if parse_node.type == 'Class':
            # Index the class name itself under the 'class' context.
            terms.append([
                TermModel.get_word_id(parse_node.name),
                context_id_map['class']
            ])

        terms = uniqify_terms(terms)

        # NOTE: We copy the list of terms so that we don't modify the list
        # of terms for this tree node as we navigate upwards. Therefore,
        # unique terms from other nodes won't end up in the unique terms for
        # this node.
        node = dict(
            name=parse_node.name,
            type=parse_node.type,
            source=parse_node.source,
            signals=parse_node.signals,
            terms=terms[:],
            children=[],
        )

        for child in parse_node.children:
            child_node, child_terms = self._crawl_tree(child)
            node['children'].append(child_node)
            terms.extend(child_terms)

        return node, uniqify_terms(terms)
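
The uniqify_terms helper is not shown in this example. A minimal sketch of what it could look like, assuming terms is a list of [word_id, context_id] pairs and that ordering does not need to be preserved:

def uniqify_terms(terms):
    # Convert each pair to a tuple so it is hashable, dedupe through a set,
    # then convert back to lists to keep the [word_id, context_id] shape.
    return [list(pair) for pair in set(tuple(pair) for pair in terms)]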
Example #2
    def update (self):

        """
        The final phase is to save all relationships between the created
        documents and terms.

        NOTE: We don't have transactional support, so, if this fails, we
        will have a bunch of documents with missing Term->Document
        pointers.
        """

        for term, docs in self.terms.iteritems():
            TermModel.update_doclist(term, docs)
            logging.debug('Updated doc_list for term id %s with %d docs' % (
                term,
                len(docs)
            ))
        
        logging.info('Completed updating Term->Document list')
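
A hypothetical illustration of the mapping update() expects: self.terms maps a term id to the document ids that contain it, and each entry is flushed through TermModel.update_doclist. The indexer object and the literal values below are made up for illustration only.

# Hypothetical data; the real mapping is built earlier in the indexing run.
indexer.terms = {
    42: [1001, 1002],   # term 42 occurs in documents 1001 and 1002
    77: [1002],
}
indexer.update()        # writes one Term->Document list per term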
Example #3
    def _get_documents (self, tokens):

        """
        Returns a list of unique documents that are sorted by their score
        from highest to lowest.
        """
        
        # Transform our list of token pairs (context_id, term_id) into a
        # map of term_id -> context_id. We do this for optimizations
        # further down the road.
        token_map = dict()
        for context_id, term_id in tokens:
            token_map[term_id] = context_id

        # Retrieve our dictionary of Term -> [(doc,context), ...] from the
        # database
        start_time = time.time()
        term_doc_map = TermModel.get_term_doc_map(token_map.keys())
        logging.debug('Took %.4fs to retrieve data structure for %d terms' % (
            time.time() - start_time,
            len(term_doc_map)
        ))

        # Once we have our dictionary mapping we need to group based on
        # document ids and their terms. We will build a dictionary mapping
        # document id to a list of terms and the context in which those
        # terms occur. map[DOC_ID] = [[context_id, term_id], ...]
        #
        # At this point we will also remove all document ids if given
        # contexts are specified in our token list. This will handle the
        # case where a user has specified that a term must occur in a
        # certain context.
        start_time = time.time()
        doc_term_map = self._organize(token_map, term_doc_map)
        logging.debug('Took %.4fs to rearrange data into structure for %d docs' % (
            time.time() - start_time,
            len(doc_term_map)
        ))
        
        # Retrieve the document data for each document relevant to our
        # query and build a map structure of doc_id -> doc_data
        start_time = time.time()
        docs = self._retrieve_documents(doc_term_map)
        logging.debug('Took %.4fs to retrieve document data for %d docs from database' % (
            time.time() - start_time,
            len(docs)
        ))
        return docs
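
The _organize helper is not shown here. A minimal sketch of the inversion it is described as doing, assuming token_map maps term_id -> context_id (with None standing in for "any context") and term_doc_map maps term_id -> [(doc_id, context_id), ...]; the real method may drop whole documents rather than individual postings when a required context is missing:

def _organize(self, token_map, term_doc_map):
    # Invert term -> [(doc, context), ...] into doc -> [[context, term], ...],
    # skipping postings whose context does not match the context requested
    # for that term.
    doc_term_map = {}
    for term_id, postings in term_doc_map.items():
        required_context = token_map.get(term_id)
        for doc_id, context_id in postings:
            if required_context is not None and context_id != required_context:
                continue
            doc_term_map.setdefault(doc_id, []).append([context_id, term_id])
    return doc_term_map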
Example #4
def translate (query):
    
    """
    Returns a list of pairs (token-type-id, token-id) using the lexicon
    provided through the TermModel class. Takes in a non-sanitized query
    string.

    Uses process and match functionality of this module.
    """
    
    # Convert the query string into a list of term types and their
    # sanitized values
    sanitized = process(query)
    logging.debug('Tokenized query "%s" into %s' % (
        query,
        sanitized
    ))

    # Converts term types and terms into their corresponding integer values
    # from the database and lexicon.
    return map(
        lambda x: (context_id_map[x[0]], TermModel.get_word_id(x[1])),
        sanitized
    )
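
A hypothetical call showing the shape of the result; the exact query syntax accepted by process() is not part of this example, and the numeric ids below are made up:

pairs = translate('tree parser')
# pairs is a list of (token-type-id, token-id) tuples, e.g. [(3, 128), (3, 541)],
# the same (context_id, term_id) shape that _get_documents() above unpacks.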