Example No. 1
def main(script_folder, topics, vocab_pickle_filename, model_pickle_filename,
         max_script_count, use_binary, n_jobs):

    # Retrieve existing vocabulary
    if vocab_pickle_filename is not None:
        vocab = pickle.load(open(vocab_pickle_filename, "rb"))
    else:
        logger.warning("Pickle file containing bag of words vocabulary required")
        quit()

    code_scripts_list = list()
    counter = 0

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                counter += 1
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) == 0:
                    continue
                else:
                    normalized_code = normalize_text(code,
                                                     remove_stop_words=True,
                                                     only_letters=False,
                                                     return_list=False,
                                                     remove_one_char_words=True)
                    code_scripts_list.append(normalized_code)

    lda_model = build_lda_model(code_scripts_list, topics, vocab, use_binary, n_jobs)

    #logger.info("Saving LDA model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(lda_model, open(model_pickle_filename, "wb"))
    logger.info("LDA model pickle file saved at %s" % model_pickle_filename)
Example No. 2
def get_closest_docs(uri):
    #user_doc = requests.get(uri).text
    r = requests.get(uri)
    if r.status_code == 200:
        user_doc = r.text
        print("URI content length", len(user_doc))
        code, _ = separate_code_and_comments(user_doc, "user doc")
        normalized_code = normalize_text(code,
                                         remove_stop_words=False,
                                         only_letters=False,
                                         return_list=True)
        model.random.seed(0)
        user_vector = model.infer_vector(normalized_code)
        print("finding similar...")
        sys.stdout.flush()
        stored_urls = list()
        stored_vectors = list()
        for url in vectors:
            stored_urls.append(url)
            stored_vectors.append(vectors[url])
        pair_sims = cosine_similarity(user_vector.reshape(1, -1),
                                      stored_vectors)
        indices = (-pair_sims[0]).argsort()[:5]
        return [(stored_urls[index], round(float(pair_sims[0][index]), 2))
                for index in indices]
    else:
        print("URL returned status code", r.status_code)
        raise ValueError('URL error')
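The function above relies on two module-level objects that are not shown here: model (a trained gensim Doc2Vec model) and vectors (a dict mapping each stored script URL to its previously inferred vector). A minimal setup sketch, with hypothetical pickle filenames:

import pickle

# Hypothetical filenames; the originals are not given in these examples
model = pickle.load(open("doc2vec_model.pkl", "rb"))    # trained gensim Doc2Vec model
vectors = pickle.load(open("code_vectors.pkl", "rb"))   # dict: script URL -> inferred vector

# Example call (the URL is a placeholder):
# closest = get_closest_docs("https://example.com/raw/some_script.py")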
def vectorize_code(code_vectors, model, code_urls, vector_file):
    # code_urls is a set, make it a list to allow skipping some entries
    code_urls_list = list(code_urls)
    # start at a point close to where the previous code_vectors file left off
    starting_point = len(code_vectors)
    for i in range(starting_point, len(code_urls_list)):
        url = code_urls_list[i]
        if i > starting_point and i % 5000 == 0:
            intermediate_save(code_vectors, vector_file)
        # Fetch the URL once and reuse the response for both the status check and the body
        r = requests.get(url)
        if r.status_code == 200:
            try:
                code = r.text
                parsed_code, _ = separate_code_and_comments(code, "code")
                normalized_code = normalize_text(parsed_code,
                                                 remove_stop_words=False,
                                                 only_letters=False,
                                                 return_list=True)
                if len(normalized_code) > 1:
                    model.random.seed(0)
                    vector = model.infer_vector(normalized_code)
                    code_vectors[url] = vector
                else:
                    print("Parsing resulted in empty list for", url)
                    continue
            except Exception:
                print("Unexpected error:", sys.exc_info()[0])
                continue
        else:
            print("Error code {} for url: {}".format(r.status_code, url))
    return code_vectors
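A usage sketch for vectorize_code above; the URLs, filenames, and the empty starting dict are assumptions, not part of the original example:

# Vectorize a collection of raw script URLs, saving intermediate results to a pickle file
code_urls = {"https://example.com/raw/script_a.py",
             "https://example.com/raw/script_b.py"}
code_vectors = vectorize_code({}, model, code_urls, "code_vectors.pkl")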
Example No. 4
def vectorize_multi(self, documents):
    vectorized = []
    for document in documents:
        # Doc2Vec expects a list of words that includes stop words
        normalized_doc = normalize_text(document, remove_stop_words=False, only_letters=False, return_list=True, **self.normalizer_kwargs)
        # Doc2Vec requires a defined seed for deterministic results when calling infer_vector
        self.model.random.seed(0)
        vectorized.append(self.model.infer_vector(normalized_doc, **self.infer_kwargs))
    return vectorized
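vectorize_multi is a method of a class that is not shown in these examples; a hypothetical minimal wrapper it could belong to, inferred from the attributes it uses:

class Doc2VecVectorizer:
    # Hypothetical class (assumption): holds a trained gensim Doc2Vec model plus extra
    # keyword arguments forwarded to normalize_text and to model.infer_vector
    def __init__(self, model, normalizer_kwargs=None, infer_kwargs=None):
        self.model = model
        self.normalizer_kwargs = normalizer_kwargs or {}
        self.infer_kwargs = infer_kwargs or {}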
def build_bow_script_vocabulary(script_folder,
                                max_script_count=10000,
                                max_vocab_size=5000,
                                min_word_count=2):
    '''
    Generates the list of words to be used as the vocabulary in techniques that utilize bag of words.
    Args:
        script_folder (str): folder location of the corpus containing script files
        max_script_count (int): the maximum number of code scripts to process in script_folder
        max_vocab_size (int): the maximum number of words in the vocabulary (the dimension of the bag of words vector)
        min_word_count (int): a word is included in the vocabulary only if it appears more than min_word_count times in the corpus
    Returns:
        words_ordered_by_count (list): a list of at most max_vocab_size of the most frequent
        normalized words in the corpus, ordered by descending count
    '''
    word_count = defaultdict(int)
    counter = 0

    # Read file contents, extract code, normalize contents and count resulting tokens
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                counter += 1
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(
                    parsed_json['content'], py_file)
                normalized_script = normalize_text(code,
                                                   remove_stop_words=True,
                                                   only_letters=False,
                                                   return_list=True,
                                                   remove_one_char_words=True)
                for token in normalized_script:
                    word_count[token] += 1
                if counter >= max_script_count: break

    # Order vocabulary words by descending count, keeping only words above the min_word_count threshold
    words_ordered_by_count = [
        i[0] for i in sorted(
            word_count.items(), key=lambda x: (x[1], x[0]), reverse=True)
        if i[1] > min_word_count
    ]

    # Trim the vocabulary to the requested vocab_size
    if len(words_ordered_by_count) >= max_vocab_size:
        words_ordered_by_count = words_ordered_by_count[:max_vocab_size]
    else:
        logger.warning("Only %d words were observed using max_script_count=%d, max_vocab_size=%d and min_word_count=%d" % \
                       (len(words_ordered_by_count),max_script_count, max_vocab_size,min_word_count))

    return words_ordered_by_count
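A usage sketch connecting this function to Example No. 1, which loads the vocabulary from a pickle file; the folder and file names are assumptions:

# Build a bag-of-words vocabulary and pickle it for later use (hypothetical paths)
vocab = build_bow_script_vocabulary("scripts/", max_script_count=10000,
                                    max_vocab_size=5000, min_word_count=2)
pickle.dump(vocab, open("bow_vocab.pkl", "wb"))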
def main(script_folder, output_folder, min_script_len, max_total_files,
         max_per_pkl):

    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")
    just_started = True

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_total_files: break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                if counter >= max_total_files: break
                if counter != 0 and counter % 50000 == 0:
                    logger.info("processed %d files" % counter)
                if not just_started and counter % max_per_pkl == 0:
                    logger.info(
                        "Saving pickle file of tagged documents for size %d",
                        max_per_pkl)
                    pickle.dump(
                        doc2vec_tagged_documents,
                        open(
                            os.path.join(output_folder,
                                         "training" + str(counter) + ".pkl"),
                            "wb"))
                    doc2vec_tagged_documents = list()
                    just_started = True
                parsed_json = json.loads(line)
                code, _ = separate_code_and_comments(parsed_json['content'],
                                                     py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code,
                                                    remove_stop_words=False,
                                                    only_letters=False,
                                                    return_list=True,
                                                    remove_one_char_words=True)
                    if len(tokenized_code) > 1:
                        doc2vec_tagged_documents.append(
                            doc2vec.TaggedDocument(tokenized_code, [counter]))
                        counter += 1
                        just_started = False

    logger.info("Saving final pickle file of tagged documents for size %d",
                max_per_pkl)
    pickle.dump(
        doc2vec_tagged_documents,
        open(os.path.join(output_folder, "training" + str(counter) + ".pkl"),
             "wb"))
Example No. 7
def main(script_folder, model_pickle_filename, training_algorithm, num_cores,
         epochs, vector_size, window, min_count, alpha, max_script_count,
         min_script_len, negative):

    doc2vec_tagged_documents = list()
    counter = 0

    logger.info("retrieving files")

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        if counter % 100000 == 0: logger.info("processed %d files" % counter)
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(
                    parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code,
                                                    remove_stop_words=False,
                                                    only_letters=False,
                                                    return_list=True,
                                                    remove_one_char_words=True)
                    doc2vec_tagged_documents.append(
                        doc2vec.TaggedDocument(tokenized_code, [counter]))
                    counter += 1

    doc2vec_model = build_doc2vec_model(doc2vec_tagged_documents,
                                        training_algorithm, num_cores, epochs,
                                        vector_size, window, min_count, alpha,
                                        negative)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to False, most_similar, similarity, and sims are no longer available
    # If keep_inference is set to False, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False,
                                                 keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)

    #logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" %
                model_pickle_filename)
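build_doc2vec_model is not shown in these examples. A rough sketch of what such a helper could look like with gensim 3.x; the mapping of training_algorithm to the dm flag is an assumption:

from gensim.models import doc2vec

def build_doc2vec_model_sketch(tagged_documents, training_algorithm, num_cores, epochs,
                               vector_size, window, min_count, alpha, negative):
    # Assumed convention: "PV-DM" selects distributed memory (dm=1), anything else PV-DBOW (dm=0)
    dm = 1 if training_algorithm == "PV-DM" else 0
    # Passing documents to the constructor builds the vocabulary and trains in one step (gensim 3.x)
    return doc2vec.Doc2Vec(documents=tagged_documents,
                           dm=dm,
                           workers=num_cores,
                           epochs=epochs,
                           vector_size=vector_size,
                           window=window,
                           min_count=min_count,
                           alpha=alpha,
                           negative=negative)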
def vectorize_code(current_script_fullpath, model, remove_comments):
    with open(current_script_fullpath, "r") as input_file:
        code = input_file.read()
    if remove_comments:
        parsed_code, _ = separate_code_and_comments(code, "code")
    else:
        parsed_code = code
    normalized_code = normalize_text(parsed_code, remove_stop_words=False, only_letters=False, return_list=True)
    if len(normalized_code) > 1:
        model.random.seed(0)
        return model.infer_vector(normalized_code)
    else:
        print("Warning - Parsing resulted in empty list for", current_script_fullpath)
        return None
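A usage sketch for the function above, comparing two local scripts with the same cosine_similarity helper used in the other examples; the file paths are assumptions:

# model is a trained gensim Doc2Vec model, as in the surrounding examples
vec_a = vectorize_code("script_a.py", model, remove_comments=True)
vec_b = vectorize_code("script_b.py", model, remove_comments=True)
if vec_a is not None and vec_b is not None:
    similarity = cosine_similarity(vec_a.reshape(1, -1), vec_b.reshape(1, -1))[0][0]
    print("cosine similarity:", round(float(similarity), 2))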
Example No. 9
def get_closest_docs(uri):
    user_doc = requests.get(uri).text
    code, _ = separate_code_and_comments(user_doc, "user doc")
    normalized_code = normalize_text(code,
                                     remove_stop_words=False,
                                     only_letters=False,
                                     return_list=True)
    model.random.seed(0)
    user_vector = model.infer_vector(normalized_code)
    print("finding similar...")
    stored_urls = list()
    stored_vectors = list()
    for url in vectors:
        stored_urls.append(url)
        stored_vectors.append(vectors[url])
    pair_sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)
    indices = (-pair_sims[0]).argsort()[:5]
    return [(stored_urls[index], round(float(pair_sims[0][index]), 2))
            for index in indices]