Example #1
0
    def _crawl_tree(self, parse_node):

        """
        Recursively crawl the parse tree rooted at *parse_node*.

        Returns a ``(node, terms)`` tuple where ``node`` is a dict
        describing this parse node (name, type, source, signals, its own
        unique terms, and recursively-built children) and ``terms`` is the
        uniquified list of ``[term_id, context_id]`` pairs collected from
        this node and all of its descendants.
        """

        # Build our list of term contexts that point to a term. Once done,
        # uniqify the list by looking at the term values.
        #
        # NOTE: There must be a better way to do this, also, so we can
        # create more contexts easily without making a large amount of
        # modifications.
        terms = [
            [TermModel.get_word_id(token), context_id_map['docstring']]
            for token in set(tokenize(parse_node.docstring))
        ]
        terms.extend(
            [TermModel.get_word_id(token), context_id_map['comment']]
            for token in set(tokenize(parse_node.comments))
        )
        terms.extend(
            [TermModel.get_word_id(token), context_id_map['library']]
            for token in parse_node.imports
        )

        if parse_node.type == 'Class':
            # BUG FIX: the original referenced `token` here, which is not
            # bound in this scope (a NameError on Python 3). Index the
            # class's own name under the 'class' context instead.
            # NOTE(review): assumes the intent was to index the class
            # name — confirm against the indexing pipeline.
            terms.append([
                TermModel.get_word_id(parse_node.name),
                context_id_map['class']
            ])

        terms = uniqify_terms(terms)

        # NOTE: We copy the list of terms so that we dont modify the list
        # of terms for this treenode as we navigate upwards. Therefore,
        # unique terms from other nodes wont end up in the unique terms for
        # this node.
        node = dict(
            name=parse_node.name,
            type=parse_node.type,
            source=parse_node.source,
            signals=parse_node.signals,
            terms=terms[:],
            children=[],
        )

        # Recurse into children, accumulating their terms into this
        # subtree's running list (node['terms'] stays node-local).
        for child in parse_node.children:
            child_node, child_terms = self._crawl_tree(child)
            node['children'].append(child_node)
            terms.extend(child_terms)

        return node, uniqify_terms(terms)
Example #2
0
def translate(query):

    """
    Returns a list of pairs (token-type-id, token-id) using the lexicon
    provided through the TermModel class. Takes in a non-sanitized query
    string.

    Uses process and match functionality of this module.
    """

    # Convert query string into a list of term types and their sanitized
    # values.
    sanitized = process(query)
    # Lazy %-style args: the message is only formatted if DEBUG is enabled.
    logging.debug('Tokenized query "%s" into %s', query, sanitized)

    # Converts term types and terms into their corresponding integer values
    # from the database and lexicon.
    #
    # BUG FIX: the original returned map(...), which is a lazy iterator on
    # Python 3 and contradicts the documented "list of pairs" contract; a
    # list comprehension always yields a list.
    return [
        (context_id_map[term_type], TermModel.get_word_id(term))
        for term_type, term in sanitized
    ]