def main(script_folder, topics, vocab_pickle_filename, model_pickle_filename, max_script_count, use_binary, n_jobs):

    # Retrieve existing vocabulary
    if vocab_pickle_filename is not None:
        vocab = pickle.load(open(vocab_pickle_filename, "rb"))
    else:
        logger.warning("Pickle file containing bag of words vocabulary required")
        quit()

    code_scripts_list = list()
    counter = 0

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count:
            break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                counter += 1
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) == 0:
                    continue
                else:
                    normalized_code = normalize_text(code, remove_stop_words=True, only_letters=False,
                                                     return_list=False, remove_one_char_words=True)
                    code_scripts_list.append(normalized_code)

    lda_model = build_lda_model(code_scripts_list, topics, vocab, use_binary, n_jobs)

    #logger.info("Saving LDA model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(lda_model, open(model_pickle_filename, "wb"))
    logger.info("LDA model pickle file saved at %s" % model_pickle_filename)
def get_closest_docs(uri):
    #user_doc = requests.get(uri).text
    r = requests.get(uri)
    if r.status_code == 200:
        user_doc = r.text
        print("URI content length", len(user_doc))
        code, _ = separate_code_and_comments(user_doc, "user doc")
        normalized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True)
        model.random.seed(0)
        user_vector = model.infer_vector(normalized_code)
        print("finding similar...")
        sys.stdout.flush()
        stored_urls = list()
        stored_vectors = list()
        for url in vectors:
            stored_urls.append(url)
            stored_vectors.append(vectors[url])
        pair_sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)
        indices = (-pair_sims[0]).argsort()[:5]
        return [(stored_urls[index], round(float(pair_sims[0][index]), 2)) for index in indices]
    else:
        print("URL returned status code", r.status_code)
        raise ValueError('URL error')
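# Usage sketch (not part of the original source): get_closest_docs relies on two
# module-level globals, a trained gensim Doc2Vec `model` and a `vectors` dict
# mapping script URLs to previously inferred vectors. Assuming both were pickled
# by the training and vectorizing steps below, a query might look like this; the
# file names and URL are hypothetical placeholders.
import pickle

with open("doc2vec_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
with open("code_vectors.pkl", "rb") as vectors_file:
    vectors = pickle.load(vectors_file)

for url, score in get_closest_docs("https://example.com/raw/user_script.py"):
    print(url, score)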
def vectorize_code(code_vectors, model, code_urls, vector_file):
    # code_urls is a set, make it a list to allow skipping some entries
    code_urls_list = list(code_urls)
    # start at a point close to where the previous code_vectors file left off
    starting_point = len(code_vectors)
    for i in range(starting_point, len(code_urls_list)):
        url = code_urls_list[i]
        if i > starting_point and i % 5000 == 0:
            intermediate_save(code_vectors, vector_file)
        # fetch the script once and reuse the response for the status check and the body
        response = requests.get(url)
        if response.status_code == 200:
            try:
                code = response.text
                parsed_code, _ = separate_code_and_comments(code, "code")
                normalized_code = normalize_text(parsed_code, remove_stop_words=False, only_letters=False, return_list=True)
                if len(normalized_code) > 1:
                    model.random.seed(0)
                    vector = model.infer_vector(normalized_code)
                    code_vectors[url] = vector
                else:
                    print("Parsing resulted in empty list for", url)
                    continue
            except Exception:
                print("Unexpected error:", sys.exc_info()[0])
                continue
        else:
            print("Error code {} for url: {}".format(response.status_code, url))
    return code_vectors
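# intermediate_save is called above but not defined in this excerpt. A minimal
# sketch, assuming it simply checkpoints the url -> vector dict to vector_file
# so an interrupted run can resume from len(code_vectors):
import pickle

def intermediate_save(code_vectors, vector_file):
    # Overwrite the checkpoint pickle with the vectors collected so far
    with open(vector_file, "wb") as out_file:
        pickle.dump(code_vectors, out_file)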
def vectorize_multi(self, documents):
    vectorized = []
    for document in documents:
        # Doc2Vec expects a list of words that includes stop words
        normalized_doc = normalize_text(document, remove_stop_words=False, only_letters=False,
                                        return_list=True, **self.normalizer_kwargs)
        # Doc2Vec requires a defined seed for deterministic results when calling infer_vector
        self.model.random.seed(0)
        vectorized.append(self.model.infer_vector(normalized_doc, **self.infer_kwargs))
    return vectorized
def build_bow_script_vocabulary(script_folder, max_script_count=10000, max_vocab_size=5000, min_word_count=2):
    '''
    Generates a list of words to be used as the vocabulary in techniques that utilize bag of words.

    Args:
        script_folder (str): folder location of the corpus containing script files
        max_script_count (int): the maximum number of code scripts to process in script_folder
        max_vocab_size (int): the maximum number of words in the vocabulary (dimension of the bag of words vector)
        min_word_count (int): a word is included in the vocabulary only if it appears more than min_word_count times in the corpus

    Returns:
        words_ordered_by_count (list): a list of at most max_vocab_size of the most frequent normalized words in the corpus
    '''
    word_count = defaultdict(int)
    counter = 0

    # Read file contents, extract code, normalize contents and count resulting tokens
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count:
            break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                counter += 1
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                normalized_script = normalize_text(code, remove_stop_words=True, only_letters=False,
                                                   return_list=True, remove_one_char_words=True)
                for token in normalized_script:
                    word_count[token] += 1
                if counter >= max_script_count:
                    break

    # Order the vocabulary by descending count, restricted by the min_word_count threshold
    words_ordered_by_count = [i[0] for i in sorted(word_count.items(), key=lambda x: (x[1], x[0]), reverse=True)
                              if i[1] > min_word_count]

    # Trim the vocabulary to the requested max_vocab_size
    if len(words_ordered_by_count) >= max_vocab_size:
        words_ordered_by_count = words_ordered_by_count[:max_vocab_size]
    else:
        logger.warning("Only %d words were observed using max_script_count=%d, max_vocab_size=%d and min_word_count=%d" % \
                       (len(words_ordered_by_count), max_script_count, max_vocab_size, min_word_count))

    return words_ordered_by_count
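# Usage sketch (an assumption, not in the original source): build a bag-of-words
# vocabulary from a folder of Altair-format JSON script files and pickle it so
# the LDA training entry point above can load it via vocab_pickle_filename.
# The paths are hypothetical placeholders.
import pickle

vocab = build_bow_script_vocabulary("data/scripts",
                                    max_script_count=10000,
                                    max_vocab_size=5000,
                                    min_word_count=2)
with open("bow_vocab.pkl", "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)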
def main(script_folder, output_folder, min_script_len, max_total_files, max_per_pkl):
    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")
    just_started = True

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_total_files:
            break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                if counter >= max_total_files:
                    break
                if counter != 0 and counter % 50000 == 0:
                    logger.info("processed %d files" % counter)
                if not just_started and counter % max_per_pkl == 0:
                    logger.info("Saving pickle file of tagged documents for size %d", max_per_pkl)
                    pickle.dump(doc2vec_tagged_documents,
                                open(os.path.join(output_folder, "training" + str(counter) + ".pkl"), "wb"))
                    doc2vec_tagged_documents = list()
                    just_started = True
                parsed_json = json.loads(line)
                code, _ = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False,
                                                    return_list=True, remove_one_char_words=True)
                    if len(tokenized_code) > 1:
                        doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                        counter += 1
                        just_started = False

    logger.info("Saving final pickle file of tagged documents for size %d", max_per_pkl)
    pickle.dump(doc2vec_tagged_documents,
                open(os.path.join(output_folder, "training" + str(counter) + ".pkl"), "wb"))
def main(script_folder, model_pickle_filename, training_algorithm, num_cores, epochs, vector_size,
         window, min_count, alpha, max_script_count, min_script_len, negative):
    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count:
            break
        if counter % 100000 == 0:
            logger.info("processed %d files" % counter)
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False,
                                                    return_list=True, remove_one_char_words=True)
                    doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                    counter += 1

    doc2vec_model = build_doc2vec_model(doc2vec_tagged_documents, training_algorithm, num_cores, epochs,
                                        vector_size, window, min_count, alpha, negative)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, sims is no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)
    #logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" % model_pickle_filename)
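# Usage sketch (an assumption, not in the original source): reload the pickled
# Doc2Vec model and infer a vector for a new script, mirroring how the
# vectorizing functions below use it. The model path and code snippet are
# hypothetical placeholders.
import pickle

with open("doc2vec_model.pkl", "rb") as model_file:
    doc2vec_model = pickle.load(model_file)

snippet = "import numpy as np\nprint(np.arange(10))"
tokens = normalize_text(snippet, remove_stop_words=False, only_letters=False, return_list=True)
doc2vec_model.random.seed(0)  # fixed seed so infer_vector is deterministic
vector = doc2vec_model.infer_vector(tokens)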
def vectorize_code(current_script_fullpath, model, remove_comments):
    with open(current_script_fullpath, "r") as input_file:
        code = input_file.read()
    if remove_comments:
        parsed_code, _ = separate_code_and_comments(code, "code")
    else:
        parsed_code = code
    normalized_code = normalize_text(parsed_code, remove_stop_words=False, only_letters=False, return_list=True)
    if len(normalized_code) > 1:
        model.random.seed(0)
        return model.infer_vector(normalized_code)
    else:
        print("Warning - Parsing resulted in empty list for", current_script_fullpath)
        return None
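# Usage sketch (an assumption): vectorize a local script file with a previously
# loaded Doc2Vec model; the path below is a hypothetical placeholder.
vector = vectorize_code("examples/plot_histogram.py", model, remove_comments=True)
if vector is not None:
    print("Inferred vector of length", len(vector))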
def get_closest_docs(uri):
    user_doc = requests.get(uri).text
    code, _ = separate_code_and_comments(user_doc, "user doc")
    normalized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True)
    model.random.seed(0)
    user_vector = model.infer_vector(normalized_code)
    print("finding similar...")
    stored_urls = list()
    stored_vectors = list()
    for url in vectors:
        stored_urls.append(url)
        stored_vectors.append(vectors[url])
    pair_sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)
    indices = (-pair_sims[0]).argsort()[:5]
    return [(stored_urls[index], round(float(pair_sims[0][index]), 2)) for index in indices]