def transform_and_filter(elem): """ Cleaning is performed to reduce sparsity: - pylab.xxx --> plt.xxx (if the function exists in pyplot) - various add_subplot.xxx --> plt.gca.xxx (see stype.tsv) - only plt.* are kept Returns: cleaned elem or None """ elem = enormer.simplify(elem) if elem.startswith('plt.'): return elem else: return None if __name__ == '__main__': coke_counts = defaultdict(int) count = 0 for code in code_examples(): count += 1 if count % 1000 == 0: print '%d ... unique_cokes=%d'%(count, len(coke_counts)) for x, y in get_cokes(code): coke_counts[x,y] += 1 bh = BackupHandler(relative_path('models/output/backup')) bh.save('coke_0329', coke_counts)
# NOTE(review): collapsed chunk — steps (3) and (4) of building an element
# index (attach argument-value children from fa_v_counts, then sort each
# element's children by descending count and then value), followed by the
# OPENING of a triple-quoted log blob whose closing quotes lie beyond this
# chunk. Kept verbatim: the original line structure and the string
# terminator are not visible here, so reformatting could change the string.
# (3). add children to function, argument # Coming from experimental/code_suggest/mine_argvs.py for (f, a), v_counts in fa_v_counts.items(): if not (f, a) in element_index: continue for v, count in v_counts.items(): element_index[f, a, v] = Element(v, '', count, element_index[f, a]) # (4). Sort all children by count then by value for elem in element_index.values(): elem.children = sorted(elem.children, key=lambda x: (-x.count, x.val)) print '%d total entries in element index'%len(element_index) bh2 = BackupHandler(relative_path('demo/data')) bh2.save('element_index', element_index) """ There are 15770 code examples in total Example f counts (plot) 4228 Example fa counts: {'mfc': 28, 'xlim': 1, 'markeredgewidth': 11, 'markeredgecolor': 20, 'linewidth': 169, 'rot': 4, 'style': 20, 'layout': 1, 'lc': 1, 'title': 14, 'lw': 183, 'ls': 34, 'yerr': 5, 'markersize': 117, 'grid': 1, 'xdata': 1, 'ys': 1, 'rasterized': 3, 'drawstyle': 2, 'x_compat': 2, 'dashes': 3, 'x': 29, 'picker': 13, 'edgecolor': 2, 'table': 3, 'edge_labels': 1, 'whis': 1, 'zs': 11, 'latlon': 3, 'sharey': 1, 'sharex': 2, 'markerfacecolor': 25, 'label': 527, 'colormap': 4, 'mec': 19, 'mew': 19, 'antialiased': 3, 'sym': 1, 'startangle': 1, 'legend': 17, 'c': 112, 's': 2, 'markeresize': 1, 'autopct': 1, 'clip_on': 25, 'color': 526, 'xerr': 2, 'scaley': 1, 'visible': 6,
fu, fau = get_fu_fau() bh = BackupHandler(relative_path('experimental/code_suggest')) all_codes = bh.load('all_codes') print 'There are %d code examples in total'%len(all_codes) pos_sum = defaultdict(float) # [f] = sum pos_cnt = defaultdict(int) # [f] = count for code in all_codes: try: node = ast.parse(code) except SyntaxError: continue calls = findCallNodes(node) called_funcs = [extractCallComponents(x)[0] for x in calls] called_funcs = filter(lambda x: x in fu, called_funcs) if len(calls) < 3: continue for i, f in enumerate(called_funcs): pos_sum[f] += float(i) / len(called_funcs) pos_cnt[f] += 1 pos_ave = {} for f in pos_sum: pos_ave[f] = pos_sum[f] / pos_cnt[f] print 'Extracted average positions for %d functions'%len(pos_ave) bh2 = BackupHandler(relative_path('demo/data')) bh2.save('pos_ave', pos_ave)
# NOTE(review): collapsed chunk — the tail of a notebook-harvesting loop
# (extract each cell's source, parse-and-count elements, collect useful
# cells into all_codes) plus the final counter report and saves. The chunk
# starts mid-loop ('cell', 'is_useful', 'counters', 'bh' are defined outside
# this view) and the nesting of the trailing if/else branches cannot be
# recovered reliably from the collapsed line, so it is kept verbatim.
# Presumably cell['source'] vs cell['input'] handles two notebook format
# versions — TODO confirm against the caller.
if 'source' in cell: src = ''.join(cell['source']) else: src = ''.join(cell['input']) if not src: continue try: is_useful |= parseAndCount(src, elem_counts, fu) except SyntaxError: counters['count_not_parseable_cell'] += 1 continue notebook_code_cells.append(src) if is_useful: counters['count_useful_notebook_file'] += 1 if notebook_code_cells: all_codes.append('\n'.join(notebook_code_cells)) else: counters['count_bad_suffix'] += 1 if is_useful: counters['count_useful_files'] += 1 bh.save('elem_counts_0322', elem_counts) for cnt_key in sorted(counters.keys()): print '%s: %d' % (cnt_key, counters[cnt_key]) bh.save('all_codes_github_1k_repo_0322', all_codes)
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
    # Load the cached matplotlib StackOverflow threads when available;
    # otherwise query them fresh and cache the result for next time.
    backup_dir = relative_path('models/output/backup')
    bh = BackupHandler(backup_dir)
    try:
        threads = bh.load('mpl_threads')
    except AssertionError:
        # bh.load raises AssertionError here — presumably when the backup
        # is absent; confirm in BackupHandler.
        threads = list(load_threads(
            qfilter="Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0",
            afilter="Score >= 0 ORDER BY Score DESC LIMIT 3"))
        bh.save('mpl_threads', threads)

    # Dump one "<qid>\t<title>" line per thread for manual inspection.
    with open(relative_path('models/output/mpl_so_titles.txt'), 'w') as writer:
        for t in threads:
            writer.write('%d\t%s\n' % (t.qid, t.qtitle.encode('utf-8')))
# NOTE(review): collapsed chunk — the dangling 'return unique_train_pairs'
# is the tail of a function whose body lies outside this view, followed by
# the __main__ driver that builds training pairs from the SO code blocks
# dump. Kept verbatim because the return statement cannot be re-indented
# without its enclosing function.
# NOTE(review): the three content.replace(...) calls are no-ops (they
# replace a string with itself) — presumably they were originally HTML
# unescaping of &lt;/&gt;/&amp; that got mangled by extraction; confirm
# against the upstream source before relying on this output.
return unique_train_pairs if __name__ == '__main__': bh = BackupHandler(relative_path('models/output/backup')) # Step 1 fu, fau = get_fu_fau() # Step 2 with open(relative_path('models/output/mpl_code_blocks.txt')) as reader: content = reader.read() content = content.decode('utf-8') content = content.replace("<", "<") content = content.replace(">", ">") content = content.replace("&", "&") blocks = content.split('\n\n\n') assert len(blocks) > 100 train_pairs = get_train_pairs(fu, fau, blocks, include_arguments=False) bh.save('train_pairs_0204', train_pairs) train_pairs_with_args = get_train_pairs(fu, fau, blocks, include_arguments=True) bh.save('train_pairs_0204_with_args', train_pairs_with_args)
def transform_and_filter(elem): """ Cleaning is performed to reduce sparsity: - pylab.xxx --> plt.xxx (if the function exists in pyplot) - various add_subplot.xxx --> plt.gca.xxx (see stype.tsv) - only plt.* are kept Returns: cleaned elem or None """ elem = enormer.simplify(elem) if elem.startswith('plt.'): return elem else: return None if __name__ == '__main__': coke_counts = defaultdict(int) count = 0 for code in code_examples(): count += 1 if count % 1000 == 0: print '%d ... unique_cokes=%d' % (count, len(coke_counts)) for x, y in get_cokes(code): coke_counts[x, y] += 1 bh = BackupHandler(relative_path('models/output/backup')) bh.save('coke_0329', coke_counts)
def __init__(self, w2v_model, all_elem_counts, maxngram=1, name=None, use_lemma=True, heuristic=False, use_coke=False): """ w2v_model can be a binary vectors file, or a loaded gensim model instance. """ self.maxngram = maxngram self.name = name self.use_lemma = use_lemma assert isinstance(all_elem_counts, dict) self.all_elem_counts = all_elem_counts self.heuristic = heuristic self.use_coke = use_coke if isinstance(w2v_model, basestring): self.model = load_gensim_from_binary_file(w2v_model) self.model.filename = w2v_model.split('/')[-1] if not self.name: self.name = self.model.filename else: assert isinstance(w2v_model, Word2Vec) self.model = w2v_model if not self.name: if hasattr(self.model, 'filename'): self.name = self.model.filename self.model.init_sims() # normalize the vectors self.enormer = ElementNormalizer() if self.use_coke: bh = BackupHandler(relative_path('models/output/backup')) coke_file = 'coke_0329' if not bh.exists(coke_file): raise ValueError('Coke file does not exist: %s'%coke_file) self.coke = bh.load(coke_file) print 'Trying to load element indexes from cache ...' bh = BackupHandler(relative_path('models/output/backup')) elem_index_backup_name = self.model.filename + '_elem_index' if bh.exists(elem_index_backup_name): self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(elem_index_backup_name) else: print 'Word2vecBaseline building element indexes...' fu, fau = get_fu_fau() self.idfs = self.get_idf(fu.values() + fau.values()) self.elems = sorted(self.all_elem_counts.keys()) self.elem_lookup = dict((y,x) for (x,y) in enumerate(self.elems)) vecs = [] for e in self.elems: u = doc_serve.get_training_doc(e, True) v = self.get_bow_representation(u) vecs.append(v) self.vecmat = np.array(vecs) assert self.vecmat.shape == (len(self.elems), self.model.vector_size) bh.save(elem_index_backup_name, (self.idfs, self.elems, self.elem_lookup, self.vecmat)) print 'Finished building indexes.'
# NOTE(review): collapsed chunk — tail of a notebook-harvesting loop
# (per-cell source extraction and element counting) plus the final counter
# report and backup saves. It begins mid-loop ('cell', 'is_useful',
# 'counters', 'bh', 'fu' are bound outside this view) and the indentation
# of the trailing if/else branches is not recoverable from the collapsed
# line, so the code is kept verbatim rather than re-nested by guesswork.
# Presumably cell['source'] vs cell['input'] covers two notebook format
# versions — TODO confirm.
if 'source' in cell: src = ''.join(cell['source']) else: src = ''.join(cell['input']) if not src: continue try: is_useful |= parseAndCount(src, elem_counts, fu) except SyntaxError: counters['count_not_parseable_cell'] += 1 continue notebook_code_cells.append(src) if is_useful: counters['count_useful_notebook_file'] += 1 if notebook_code_cells: all_codes.append('\n'.join(notebook_code_cells)) else: counters['count_bad_suffix'] += 1 if is_useful: counters['count_useful_files'] += 1 bh.save('elem_counts_0322', elem_counts) for cnt_key in sorted(counters.keys()): print '%s: %d'%(cnt_key, counters[cnt_key]) bh.save('all_codes_github_1k_repo_0322', all_codes)
def __init__(self, w2v_model, all_elem_counts, maxngram=1, name=None, use_lemma=True, heuristic=False, use_coke=False): """ w2v_model can be a binary vectors file, or a loaded gensim model instance. """ self.maxngram = maxngram self.name = name self.use_lemma = use_lemma assert isinstance(all_elem_counts, dict) self.all_elem_counts = all_elem_counts self.heuristic = heuristic self.use_coke = use_coke if isinstance(w2v_model, basestring): self.model = load_gensim_from_binary_file(w2v_model) self.model.filename = w2v_model.split('/')[-1] if not self.name: self.name = self.model.filename else: assert isinstance(w2v_model, Word2Vec) self.model = w2v_model if not self.name: if hasattr(self.model, 'filename'): self.name = self.model.filename self.model.init_sims() # normalize the vectors self.enormer = ElementNormalizer() if self.use_coke: bh = BackupHandler(relative_path('models/output/backup')) coke_file = 'coke_0329' if not bh.exists(coke_file): raise ValueError('Coke file does not exist: %s' % coke_file) self.coke = bh.load(coke_file) print 'Trying to load element indexes from cache ...' bh = BackupHandler(relative_path('models/output/backup')) elem_index_backup_name = self.model.filename + '_elem_index' if bh.exists(elem_index_backup_name): self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load( elem_index_backup_name) else: print 'Word2vecBaseline building element indexes...' fu, fau = get_fu_fau() self.idfs = self.get_idf(fu.values() + fau.values()) self.elems = sorted(self.all_elem_counts.keys()) self.elem_lookup = dict((y, x) for (x, y) in enumerate(self.elems)) vecs = [] for e in self.elems: u = doc_serve.get_training_doc(e, True) v = self.get_bow_representation(u) vecs.append(v) self.vecmat = np.array(vecs) assert self.vecmat.shape == (len(self.elems), self.model.vector_size) bh.save(elem_index_backup_name, (self.idfs, self.elems, self.elem_lookup, self.vecmat)) print 'Finished building indexes.'
for i in xrange(len(all_codes)): if not svgs[i]: continue code = all_codes[i].strip() if code in seen_code_set: # Dedupe count_dupe += 1 continue else: seen_code_set.add(code) node = ast.parse(code) calls = findCallNodes(node) for call in calls: func_name, keywords = extractCallComponents(call) if func_name in plot_commands_set: examples[func_name].add(i) print 'There are %d duplicates'%count_dupe print '"Scoring" code examples.' # Sorting function: number of chars in the code example examples = dict(examples) for func, idxs in examples.items(): examples[func] = sorted(idxs, key=lambda x: get_effective_code_len(all_codes[x])) bh.save('plotcommands_examples', examples)
# NOTE(review): collapsed chunk — begins mid-expression (the tail of the
# counter_names list literal, whose opening bracket lies outside this
# view), then creates one shared counter per outcome, renders every code
# example through a thread pool with run_with_timeout, prints the tallies,
# and saves the SVGs alongside the codes. Kept verbatim because the
# truncated list head makes any reformatting syntactically unanchored.
'syntax_errors', 'unsafes', 'timeouts', 'exec_errors', 'nofigures', 'savefig_errors', 'empty_svgs', 'successes' ] for name in counter_names: counters[name] = shared_counter.Counter(name=name) pool = ThreadPool(processes=4) # all_codes = all_codes[:1000] # DEBUG svgs = pool.map(partial(run_with_timeout, 3, get_svg, counters), all_codes) for counter in counters.values(): print counter bh.save('svgs', svgs) bh.save('all_codes', all_codes) # LOG: # There are 15582 code examples from mpl stackoverflow # Restored from ./cookbook_segs.pickle # There are 174 code examples from matplotlib cookbook # There are 15756 code blocks in total # timeouts: 514 # empty_svgs: 92 # unsafes: 1420 # exec_errors: 6165 # savefig_errors: 68 # successes: 2582 # syntax_errors: 3223 # nofigures: 1691
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
    # Use the cached matplotlib SO threads if present; on a cache miss,
    # fetch them from the database dump and store the backup.
    bh = BackupHandler(relative_path('models/output/backup'))
    try:
        threads = bh.load('mpl_threads')
    except AssertionError:
        # bh.load raises AssertionError here — presumably the "no such
        # backup" signal; verify in BackupHandler.
        threads = list(load_threads(
            qfilter="Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0",
            afilter="Score >= 0 ORDER BY Score DESC LIMIT 3"))
        bh.save('mpl_threads', threads)

    # Write a tab-separated (question id, title) listing.
    titles_path = relative_path('models/output/mpl_so_titles.txt')
    with open(titles_path, 'w') as writer:
        for thread in threads:
            writer.write('%d\t%s\n' % (thread.qid, thread.qtitle.encode('utf-8')))
# NOTE(review): collapsed chunk — steps (3) and (4) of element-index
# construction (attach argument-value children, sort children by count then
# value, save to demo/data), followed by the OPENING of a triple-quoted log
# blob that is truncated at the end of this chunk (no closing quotes
# visible). Kept verbatim so the string contents and terminator position
# are not disturbed.
# (3). add children to function, argument # Coming from experimental/code_suggest/mine_argvs.py for (f, a), v_counts in fa_v_counts.items(): if not (f, a) in element_index: continue for v, count in v_counts.items(): element_index[f, a, v] = Element(v, '', count, element_index[f, a]) # (4). Sort all children by count then by value for elem in element_index.values(): elem.children = sorted(elem.children, key=lambda x: (-x.count, x.val)) print '%d total entries in element index' % len(element_index) bh2 = BackupHandler(relative_path('demo/data')) bh2.save('element_index', element_index) """ There are 15770 code examples in total Example f counts (plot) 4228 Example fa counts: {'mfc': 28, 'xlim': 1, 'markeredgewidth': 11, 'markeredgecolor': 20, 'linewidth': 169, 'rot': 4, 'style': 20, 'layout': 1, 'lc': 1, 'title': 14, 'lw': 183, 'ls': 34, 'yerr': 5, 'markersize': 117, 'grid': 1, 'xdata': 1, 'ys': 1, 'rasterized': 3, 'drawstyle': 2, 'x_compat': 2, 'dashes': 3, 'x': 29, 'picker': 13, 'edgecolor': 2, 'table': 3, 'edge_labels': 1, 'whis': 1, 'zs': 11, 'latlon': 3, 'sharey': 1, 'sharex': 2, 'markerfacecolor': 25, 'label': 527, 'colormap': 4, 'mec': 19, 'mew': 19, 'antialiased': 3, 'sym': 1, 'startangle': 1, 'legend': 17, 'c': 112, 's': 2, 'markeresize': 1, 'autopct': 1, 'clip_on': 25, 'color': 526, 'xerr': 2, 'scaley': 1, 'visible': 6, 'marker': 191, 'xs': 1, 'markeredecolor': 1, 'transform': 24, 'xticks': 6,
seen_code_set = set() count_dupe = 0 for i in xrange(len(all_codes)): if not svgs[i]: continue code = all_codes[i].strip() if code in seen_code_set: # Dedupe count_dupe += 1 continue else: seen_code_set.add(code) node = ast.parse(code) calls = findCallNodes(node) for call in calls: func_name, keywords = extractCallComponents(call) if func_name in plot_commands_set: examples[func_name].add(i) print 'There are %d duplicates' % count_dupe print '"Scoring" code examples.' # Sorting function: number of chars in the code example examples = dict(examples) for func, idxs in examples.items(): examples[func] = sorted( idxs, key=lambda x: get_effective_code_len(all_codes[x])) bh.save('plotcommands_examples', examples)
# NOTE(review): collapsed chunk — final reporting/saving stage of the
# element-counting pipeline: prints summary statistics, dumps three count
# dictionaries to the backup directory (with an inline change log for the
# 0322/0327/0404 revisions), then OPENS a triple-quoted log blob that is
# truncated at the end of this chunk. Kept verbatim: the unterminated
# string makes safe reformatting impossible from this view. Uses 'count',
# 'element_counts', 'element_pyplot_counts', 'element_pyplot_value_counts',
# and 'counters' bound outside this chunk.
print 'Processed %d code examples'%count print 'There are %d unique elements'%len(element_counts) print 'There are %d unique pyplot elements'%len(element_pyplot_counts) for k in counters: print '%s: %d'%(k, counters[k]) bh = BackupHandler(relative_path('experimental/code_suggest/output/backup')) # Change logs: # - 0322: using raw format # - 0327: using Element, tracking return type and variable assignments and # import aliases. # - 0404: fixed issue with dict as positional argument; # added element_value_counts; # added Shiyan's example. bh.save('elem_counts_0404', element_counts) bh.save('elem_pyplot_counts_0404', element_pyplot_counts) bh.save('elem_pyplot_value_counts_0404', element_pyplot_value_counts) """ Log: # 0327 Processed 24502 code examples There are 144898 unique elements There are 7741 unique pyplot elements Saved to /Users/ronxin/Dropbox/git/codemend/codemend/experimental/code_suggest/output/backup/elem_counts_0327.pickle Saved to /Users/ronxin/Dropbox/git/codemend/codemend/experimental/code_suggest/output/backup/elem_pyplot_counts_0327.pickle synax_error_files: 3223 # 0404
# NOTE(review): collapsed chunk — two summary prints and a dangling
# 'return unique_train_pairs' belonging to a function whose body lies
# outside this view, followed by the __main__ driver that parses the SO
# code-blocks dump into training pairs (with and without arguments). Kept
# verbatim: the dangling return cannot be re-indented without its
# enclosing function.
# NOTE(review): the three content.replace(...) calls are no-ops (each
# replaces a string with itself) — presumably originally the HTML
# unescaping of &lt;/&gt;/&amp;, mangled during extraction; confirm
# against the upstream source.
print 'total_matched_args', total_matched_args print 'total_unique_train_pairs', len(unique_train_pairs) return unique_train_pairs if __name__ == '__main__': bh = BackupHandler(relative_path('models/output/backup')) # Step 1 fu, fau = get_fu_fau() # Step 2 with open(relative_path('models/output/mpl_code_blocks.txt')) as reader: content = reader.read() content = content.decode('utf-8') content = content.replace("<", "<") content = content.replace(">", ">") content = content.replace("&", "&") blocks = content.split('\n\n\n') assert len(blocks) > 100 train_pairs = get_train_pairs(fu, fau, blocks, include_arguments=False) bh.save('train_pairs_0204', train_pairs) train_pairs_with_args = get_train_pairs(fu, fau, blocks, include_arguments=True) bh.save('train_pairs_0204_with_args', train_pairs_with_args)
counters = {} counter_names = ['syntax_errors', 'unsafes', 'timeouts', 'exec_errors', 'nofigures', 'savefig_errors', 'empty_svgs', 'successes'] for name in counter_names: counters[name] = shared_counter.Counter(name=name) pool = ThreadPool(processes=4) # all_codes = all_codes[:1000] # DEBUG svgs = pool.map(partial(run_with_timeout, 3, get_svg, counters), all_codes) for counter in counters.values(): print counter bh.save('svgs', svgs) bh.save('all_codes', all_codes) # LOG: # There are 15582 code examples from mpl stackoverflow # Restored from ./cookbook_segs.pickle # There are 174 code examples from matplotlib cookbook # There are 15756 code blocks in total # timeouts: 514 # empty_svgs: 92 # unsafes: 1420 # exec_errors: 6165 # savefig_errors: 68 # successes: 2582 # syntax_errors: 3223 # nofigures: 1691