Example #1
def prepare_language_vocabulary(args):
    language, (tokens_key, vocab_size, type_) = args
    print(f'Building vocabulary for {language} {type_}')

    docs = utils.load_cached_docs(language, 'train')
    tokens = utils.flatten(preprocess_query_tokens(utils.flatten(doc[tokens_key] for doc in docs)))
    vocabulary = BpeVocabulary(vocab_size=vocab_size, pct_bpe=shared.VOCABULARY_PCT_BPE)
    vocabulary.fit(Counter(tokens))
    utils.cache_vocabulary(vocabulary, language, type_)

    print(f'Done building vocabulary for {language} {type_}')
Example #2
    def prepare_query_vocabulary(self, vocabulary_size: int, pct_bpe: float):
        if self.verbose:
            print('Preparing query vocabulary')

        corpora = utils.flatten(
            self.data_manager.get_preprocessed_language_corpus(
                language, set_=shared.DataSet.TRAIN)
            for language in self.languages)
        tokens = utils.flatten(doc['query_tokens'] for doc in corpora)
        vocabulary = build_vocabulary(tokens, vocabulary_size, pct_bpe)
        self.data_manager.save_query_vocabulary(vocabulary)
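
Examples #2 and #7 call a build_vocabulary helper that is not shown on this page. Judging from Example #1, a minimal sketch could look like the following (an assumption about the project's code, not its actual implementation):

from collections import Counter

def build_vocabulary(tokens, vocabulary_size: int, pct_bpe: float) -> BpeVocabulary:
    # Fit a byte-pair-encoding vocabulary on the token counts, mirroring Example #1.
    vocabulary = BpeVocabulary(vocab_size=vocabulary_size, pct_bpe=pct_bpe)
    vocabulary.fit(Counter(tokens))
    return vocabulary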
Example #3
def extract_sub_tokens(token):
    # Split on dots and underscores, then split camelCase identifiers into words;
    # pieces that do not look like identifiers are kept as-is.
    sub_tokens = re.split('[._]', token)
    sub_tokens = [IDENTIFIER_CAMEL_CASE_SPLIT.sub(r' \1', sub_token).split(' ')
                  if IDENTIFIER_TOKEN_REGEX.match(sub_token) else [sub_token]
                  for sub_token in sub_tokens]

    return [token.lower() for token in utils.flatten(sub_tokens) if len(token.strip()) > 0]
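
For reference, a hedged illustration of what Example #3 might return, assuming IDENTIFIER_CAMEL_CASE_SPLIT inserts a space before each camel-case word and IDENTIFIER_TOKEN_REGEX matches plain identifiers (neither regex is defined on this page):

extract_sub_tokens('data_loader.getBatchSize')
# -> ['data', 'loader', 'get', 'batch', 'size']  (pieces lower-cased, empty pieces dropped)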
Example #4
def pad_encode_seqs(seqs: shared.TokensGenerator, max_length: int,
                    vocabulary: BpeVocabulary,
                    preprocess_tokens_fn) -> np.ndarray:
    encoded_seqs = vocabulary.transform(
        (utils.flatten(preprocess_tokens_fn(seq)) for seq in seqs),
        fixed_length=max_length)
    return np.array(list(encoded_seqs))
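
A possible call site for Example #4, with hypothetical names (docs and query_vocabulary are not from the examples; preprocess_query_tokens and shared.QUERY_MAX_SEQ_LENGTH appear in Examples #1, #5 and #11):

query_seqs = pad_encode_seqs(
    (doc['query_tokens'] for doc in docs),
    max_length=shared.QUERY_MAX_SEQ_LENGTH,
    vocabulary=query_vocabulary,
    preprocess_tokens_fn=preprocess_query_tokens)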
Example #5
def get_query_tokens(docstring_tokens: List[str], identifier: str):
    query_tokens = list(
        utils.flatten(preprocess_query_tokens(docstring_tokens)))
    if len(query_tokens) > 0:
        return query_tokens
    elif identifier and len(identifier) >= shared.MIN_FUNC_NAME_QUERY_LENGTH:
        return extract_sub_tokens(identifier)

    return []
Example #6
def pad_encode_seqs(
        preprocess_tokens_fn: Callable[[Iterable[str]], shared.TokensGenerator],
        seqs: shared.TokensGenerator,
        max_length: int,
        language: str,
        type_: str) -> np.ndarray:
    bpe = utils.load_cached_vocabulary(language, type_)
    encoded_seqs = bpe.transform(
        (utils.flatten(preprocess_tokens_fn(seq)) for seq in seqs), fixed_length=max_length)
    return np.array(list(encoded_seqs))
Example #7
    def prepare_language_vocabulary(self, language: str, vocabulary_size: int,
                                    pct_bpe: float):
        if self.verbose:
            print(f'Preparing language vocabulary: {language}')

        corpus = self.data_manager.get_preprocessed_language_corpus(
            language, set_=shared.DataSet.TRAIN)
        tokens = utils.flatten(doc['code_tokens'] for doc in corpus)
        vocabulary = build_vocabulary(tokens, vocabulary_size, pct_bpe)
        self.data_manager.save_language_vocabulary(vocabulary, language)
Example #8
def evaluate_mrr(model: CodeSearchNN,
                 language_code_seqs: Dict[str, np.ndarray],
                 language_query_seqs: Dict[str, np.ndarray],
                 device: torch.device,
                 batch_size: int = 1000):
    mrrs_per_language = {}
    for language in language_code_seqs.keys():
        code_seqs = np_to_torch(language_code_seqs[language], device)
        query_seqs = np_to_torch(language_query_seqs[language], device)
        mrrs_per_language[language] = get_language_mrrs(model, language, code_seqs, query_seqs, batch_size=batch_size)

    mean_mrr = np.mean(list(utils.flatten(mrrs_per_language.values())))
    mean_mrr_per_language = {language: np.mean(values) for language, values in mrrs_per_language.items()}
    return mean_mrr, mean_mrr_per_language
Example #9
def preprocess_doc(doc, language: str):
    identifier = doc['identifier']
    docstring_tokens = doc['docstring_tokens']
    code_tokens = doc['code_tokens']

    return {
        # identifier and url are kept because they are needed for evaluation
        'identifier': identifier,
        'url': doc.get('url'),
        'query_tokens': get_query_tokens(docstring_tokens, identifier),
        'code_tokens': list(utils.flatten(preprocess_code_tokens(language, code_tokens))),
    }
Example #10
def extract_sub_tokens(token):
    # Skip strings
    if len(token) > 0 and (token[0] in ['\'', '"']
                           or token[:2] in ['r\'', 'r"', 'f\'', 'f"']):
        return [token]

    sub_tokens = re.split('[._]', token)
    sub_tokens = [
        IDENTIFIER_CAMEL_CASE_SPLIT.sub(r' \1', sub_token).split(' ')
        if IDENTIFIER_TOKEN_REGEX.match(sub_token) else [sub_token]
        for sub_token in sub_tokens
    ]

    return [
        token.strip() for token in utils.flatten(sub_tokens)
        if len(token.strip()) > 0
    ]
Example #11
def pad_encode_query(query: str, language: str) -> np.ndarray:
    seq = query.split(' ')
    bpe = utils.load_cached_vocabulary(language, 'query')
    encoded_seq = bpe.transform(
        (utils.flatten(preprocess_query_tokens(seq_)) for seq_ in [seq]),
        fixed_length=shared.QUERY_MAX_SEQ_LENGTH)
    return np.array(list(encoded_seq)[0])
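
All of the examples above go through utils.flatten, whose definition is not shown here. A minimal sketch consistent with how it is used (one level of lazy flattening) might be the following; this is an assumption, not the project's actual helper:

from itertools import chain
from typing import Iterable, Iterator, TypeVar

T = TypeVar('T')

def flatten(iterables: Iterable[Iterable[T]]) -> Iterator[T]:
    # Lazily yield the items of each inner iterable in order.
    return chain.from_iterable(iterables)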