Example #1
def build_annoy_indices(data_manager: DataManager,
                        languages: List[str],
                        n_trees=200,
                        build_on_disk=False,
                        verbose=True):
    for language in languages:
        if verbose:
            print(f'Building {language} AnnoyIndex')

        code_embeddings = data_manager.get_language_code_embeddings(language)
        n_samples, embedding_size = code_embeddings.shape
        annoy_index = get_annoy_index(embedding_size)

        if build_on_disk:
            annoy_index.on_disk_build(
                add_extension(
                    'annoy',
                    data_manager.get_language_annoy_index_path(language)))

        for i in range(n_samples):
            annoy_index.add_item(i, code_embeddings[i, :])
        annoy_index.build(n_trees)

        if not build_on_disk:
            data_manager.save_language_annoy_index(annoy_index, language)
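
A minimal usage sketch; the DataManager constructor argument is hypothetical, since the examples only show how an existing instance is consumed:

data_manager = DataManager('data/')  # hypothetical constructor argument

# Build one Annoy index per language. With build_on_disk=True the index is
# written straight to the path from get_language_annoy_index_path instead of
# being held fully in memory, which helps with large corpora.
build_annoy_indices(data_manager, ['python', 'java'],
                    n_trees=200, build_on_disk=True)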
Example #2
def parse_dedupe_definitions(data_manager: DataManager, language: str):
    dedupe_definitions_pkl_path = os.path.join(
        shared.CODESEARCHNET_DATA_DIR, f'{language}_dedupe_definitions_v2')
    dedupe_definitions = serialize.load('pickle', dedupe_definitions_pkl_path)

    corpus = (rename_dedupe_definitions_keys(doc)
              for doc in dedupe_definitions)
    data_manager.save_language_corpus(corpus, language, shared.DataSet.ALL)
Example #3
def get_ndcg_predictions(
        queries,
        model: CodeSearchNN,
        data_manager: DataManager,
        device: torch.device,
        nn_lib: str = 'scikit',
        n_neighbors: int = 150,
        search_k: int = -1):
    predictions = []
    for language in shared.LANGUAGES:
        print(f'Evaluating {language}')

        evaluation_docs = [{'url': doc['url'], 'identifier': doc['identifier']}
                           for doc in data_manager.get_language_corpus(language, shared.DataSet.ALL)]

        with torch.no_grad():
            query_seqs = prepare_data.pad_encode_seqs(
                (line.split(' ') for line in queries),
                shared.QUERY_MAX_SEQ_LENGTH,
                data_manager.get_query_vocabulary(),
                preprocessing_tokens.preprocess_query_tokens)
            query_embeddings = torch_gpu_to_np(model.encode_query(np_to_torch(query_seqs, device)))

        if nn_lib == 'scikit':
            nn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', n_jobs=-1)
            nn.fit(data_manager.get_language_code_embeddings(language))
            _, nearest_neighbor_indices_per_query = nn.kneighbors(query_embeddings)

            for query_idx, query in enumerate(queries):
                for query_nearest_code_idx in nearest_neighbor_indices_per_query[query_idx, :]:
                    predictions.append({
                        'query': query,
                        'language': language,
                        'identifier': evaluation_docs[query_nearest_code_idx]['identifier'],
                        'url': evaluation_docs[query_nearest_code_idx]['url'],
                    })
        elif nn_lib == 'annoy':
            annoy_index = data_manager.get_language_annoy_index(get_annoy_index(query_embeddings.shape[1]), language)
            for query_idx, query in enumerate(queries):
                nearest_neighbor_indices = annoy_index.get_nns_by_vector(
                    query_embeddings[query_idx, :], n_neighbors, search_k=search_k)

                for query_nearest_code_idx in nearest_neighbor_indices:
                    predictions.append({
                        'query': query,
                        'language': language,
                        'identifier': evaluation_docs[query_nearest_code_idx]['identifier'],
                        'url': evaluation_docs[query_nearest_code_idx]['url'],
                    })
        else:
            raise ValueError(f"Unknown nearest neighbors library: '{nn_lib}'.")

        del evaluation_docs
        gc.collect()

    return predictions
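
Each prediction is a flat dict, so the result converts directly into a table for downstream NDCG scoring. A sketch assuming pandas and a hypothetical annotations file keyed on query, language, and url:

import pandas as pd

predictions = get_ndcg_predictions(queries, model, data_manager, device,
                                   nn_lib='annoy', n_neighbors=150)
predictions_df = pd.DataFrame(predictions)
# Hypothetical relevance annotations with 'query', 'language' and 'url' columns.
annotations_df = pd.read_csv('annotations.csv')
merged = predictions_df.merge(annotations_df,
                              on=['query', 'language', 'url'], how='left')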
Example #4
def load_language_set_seqs(data_manager: DataManager, languages: List[str],
                           set_: shared.DataSet):
    language_code_seqs = OrderedDict()
    language_query_seqs = OrderedDict()
    for language in languages:
        language_code_seqs[language] = data_manager.get_language_seqs(
            language, shared.DataType.CODE, set_)
        language_query_seqs[language] = data_manager.get_language_seqs(
            language, shared.DataType.QUERY, set_)
    return language_code_seqs, language_query_seqs
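
A small sketch of the return value; that the stored sequences are padded integer matrices is an assumption, since the example only shows them being fetched per language:

code_seqs, query_seqs = load_language_set_seqs(
    data_manager, ['python', 'java'], shared.DataSet.TRAIN)

# Both results are OrderedDicts keyed by language, in the order given above.
for language, seqs in code_seqs.items():
    print(language, seqs.shape)  # assumed: (n_samples, CODE_MAX_SEQ_LENGTH)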
Example #5
def extract_repository_language_corpus(data_manager: DataManager, repository_dir: str, language: str):
    code = set()
    code_documents = []
    for code_document in extract(repository_dir, language):
        if code_document['code'] in code:
            continue

        code_documents.append(code_document)
        code.add(code_document['code'])

    data_manager.save_language_corpus(code_documents, language, shared.DataSet.ALL)
Example #6
def build_code_embeddings(model: CodeSearchNN,
                          data_manager: DataManager,
                          languages: List[str],
                          device: torch.device,
                          verbose=True):
    for language in languages:
        if verbose:
            print(f'Building {language} code embeddings')

        code_seqs = data_manager.get_language_seqs(language,
                                                   shared.DataType.CODE,
                                                   shared.DataSet.ALL)
        code_embeddings = batch_encode_code_seqs(model, language, code_seqs,
                                                 device)
        data_manager.save_language_code_embeddings(code_embeddings, language)
Example #7
def train(model: CodeSearchNN, data_manager: DataManager, languages: List[str],
          device: torch.device, **kwargs):
    train_language_seqs = load_language_set_seqs(data_manager, languages,
                                                 shared.DataSet.TRAIN)
    valid_language_seqs = load_language_set_seqs(data_manager, languages,
                                                 shared.DataSet.VALID)

    train_model(model, train_language_seqs, valid_language_seqs, data_manager,
                device, **kwargs)

    test_language_code_seqs, test_language_query_seqs = load_language_set_seqs(
        data_manager, languages, shared.DataSet.TEST)

    best_model = data_manager.get_torch_model(model)
    best_model.eval()
    with torch.no_grad():
        test_mean_mrr, test_mean_mrr_per_language = evaluate_mrr(
            best_model,
            test_language_code_seqs,
            test_language_query_seqs,
            device,
            batch_size=kwargs.get('mrr_eval_batch_size', 1000))

        if kwargs.get('verbose', True):
            print(f'Test MRR: {test_mean_mrr:.4f}')
            print(f'Test MRR per language: {test_mean_mrr_per_language}')
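
A usage sketch of the kwargs contract: train reads verbose and mrr_eval_batch_size itself and forwards everything to train_model, whose remaining options are not shown here:

train(model, data_manager, shared.LANGUAGES, device,
      verbose=True,               # read by train() for the test MRR printout
      mrr_eval_batch_size=1000)   # batch size of the final MRR evaluation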
Example #8
def get_repository_model_for_evaluation(
        data_manager: DataManager,
        languages: List[str],
        device: Optional[torch.device] = None) -> CodeSearchNN:
    model = data_manager.get_torch_model(
        get_repository_model(data_manager, languages, device))
    model.eval()
    return model
Example #9
def get_repository_model(data_manager: DataManager,
                         languages: List[str],
                         device: Optional[torch.device] = None):
    query_vocabulary_size = data_manager.get_query_vocabulary().vocab_size
    code_vocabulary_size = {
        language: data_manager.get_language_vocabulary(language).vocab_size
        for language in languages
    }

    model = get_model(languages, shared.EMBEDDING_SIZE, code_vocabulary_size,
                      shared.CODE_MAX_SEQ_LENGTH, query_vocabulary_size,
                      shared.QUERY_MAX_SEQ_LENGTH)

    if device is not None:
        model = model.to(device)

    return model
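
A sketch of the usual device-selection pattern around this helper; torch.cuda.is_available() is standard PyTorch, the rest comes from the example:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = get_repository_model(data_manager, ['python'], device=device)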
Example #10
def build_repository_model(
        repository_data_manager: DataManager,
        base_data_manager: DataManager,
        languages: List[str]):
    base_model = get_base_language_model_for_evaluation(base_data_manager)
    repository_model = get_repository_model(repository_data_manager, languages)

    repository_model.set_query_embedding_weights(
        torch_utils.np_to_torch(repository_data_manager.get_query_embedding_weights()))
    repository_model.set_query_weights_layer(base_model.get_query_weights_layer())

    for language in languages:
        repository_model.set_language_embedding_weights(
            language,
            torch_utils.np_to_torch(repository_data_manager.get_language_embedding_weights(language)))
        repository_model.set_language_weights_layer(language, base_model.get_language_weights_layer(language))

    repository_data_manager.save_torch_model(repository_model)
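
The setters above are project-specific, but underneath they amount to the standard PyTorch pattern of copying parameter tensors between modules. A self-contained sketch of that pattern:

import numpy as np
import torch

# Copy an embedding matrix that arrives as a NumPy array, without tracking
# gradients for the copy itself.
np_weights = np.random.rand(1000, 128).astype(np.float32)  # stand-in weights
embedding = torch.nn.Embedding(1000, 128)
with torch.no_grad():
    embedding.weight.copy_(torch.from_numpy(np_weights))

# Transfer a whole layer's parameters from one module to another.
source_layer = torch.nn.Linear(128, 128)
target_layer = torch.nn.Linear(128, 128)
target_layer.load_state_dict(source_layer.state_dict())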
Example #11
def get_nearest_code_neighbors(
        data_manager: DataManager,
        language: str,
        embedding_row_index: int,
        embedding_size: int,
        n_results: int = 20) -> Tuple[List[int], List[float]]:
    ann = data_manager.get_language_annoy_index(
        get_annoy_index(embedding_size), language)
    indices, distances = ann.get_nns_by_item(embedding_row_index,
                                             n_results + 1,
                                             include_distances=True)
    # Exclude the first result since it is the embedding row itself
    return indices[1:], distances[1:]
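
A usage sketch; whether the returned distances are angular or Euclidean depends on how get_annoy_index configures the metric, which these examples do not show:

indices, distances = get_nearest_code_neighbors(
    data_manager, 'python',
    embedding_row_index=42,               # hypothetical corpus row
    embedding_size=shared.EMBEDDING_SIZE,
    n_results=10)
for row_index, distance in zip(indices, distances):
    print(row_index, distance)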
Example #12
def get_nearest_embedding_neighbors_per_language(
    data_manager: DataManager,
    languages: List[str],
    embedding: np.ndarray,
    results_per_language: int = 20
) -> Dict[str, Tuple[List[int], List[float]]]:
    nearest_neighbors_per_language = {}

    for language in languages:
        embedding_size = embedding.shape[0]
        ann = data_manager.get_language_annoy_index(
            get_annoy_index(embedding_size), language)
        nearest_neighbors_per_language[language] = ann.get_nns_by_vector(
            embedding, results_per_language, include_distances=True)

    return nearest_neighbors_per_language
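
A sketch of cross-language search with a single query embedding, reusing the encoder output from Example #3:

embedding = query_embeddings[0]  # one row from Example #3, shape (embedding_size,)
results = get_nearest_embedding_neighbors_per_language(
    data_manager, shared.LANGUAGES, embedding, results_per_language=5)
for language, (indices, distances) in results.items():
    print(language, indices, distances)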
Example #13
def import_corpus(
        data_manager: DataManager,
        repository: models.CodeRepository,
        language: str,
        commit_hash: str,
        batch_size: int = 500):
    corpus = data_manager.get_language_corpus(language, shared.DataSet.ALL)
    code_docs = []
    organization, name = repository.organization, repository.name
    code_language = models.CodeLanguage.objects.get(name=language)
    for idx, doc in enumerate(corpus):
        code_doc = models.CodeDocument(
            url=get_code_document_url(organization, name, commit_hash, doc['path'], doc['start_line'], doc['end_line']),
            path=doc['path'],
            identifier=doc['identifier'],
            code=doc['code'],
            code_hash=hashlib.sha1(doc['code'].encode('utf-8')).hexdigest(),
            embedded_row_index=idx,
            language=code_language,
            repository=repository,
        )
        code_docs.append(code_doc)

    models.CodeDocument.objects.bulk_create(code_docs, batch_size=batch_size)
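
A usage sketch; the repository lookup and commit hash are hypothetical. batch_size chunks the bulk_create INSERT into batches of 500 rows, Django's standard guard against a single oversized query:

repository = models.CodeRepository.objects.get(  # hypothetical existing row
    organization='org', name='repo')
import_corpus(data_manager, repository, 'python',
              commit_hash='0123abc',  # hypothetical commit
              batch_size=500)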
Example #14
def pad_encode_query(data_manager: DataManager, query: str,
                     max_query_seq_length: int):
    return pad_encode_seqs(
        (seq.split(' ') for seq in [query]), max_query_seq_length,
        data_manager.get_query_vocabulary(), preprocess_query_tokens)
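
A sketch feeding the result into the same encode_query path used in Example #3; model, device, np_to_torch and torch_gpu_to_np all come from the earlier examples:

query_seqs = pad_encode_query(data_manager, 'parse a json file',
                              shared.QUERY_MAX_SEQ_LENGTH)
with torch.no_grad():
    query_embedding = torch_gpu_to_np(
        model.encode_query(np_to_torch(query_seqs, device)))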
Example #15
def get_base_language_model_for_evaluation(data_manager: DataManager,
                                           device: Optional[
                                               torch.device] = None):
    model = data_manager.get_torch_model(get_base_language_model(device))
    model.eval()
    return model
Example #16
def pad_encode_code_tokens(data_manager: DataManager, tokens: List[str],
                           language: str, max_code_seq_length: int):
    return pad_encode_seqs((_ for _ in [tokens]), max_code_seq_length,
                           data_manager.get_language_vocabulary(language),
                           functools.partial(preprocess_code_tokens, language))
Example #17
def combine_language_set_corpus(data_manager: DataManager, language: str,
                                set_: shared.DataSet):
    corpus = (rename_set_doc_keys(doc)
              for doc in get_codesearchnet_language_set_corpus(language, set_))
    data_manager.save_language_corpus(corpus, language, set_)