def __init__(self):
    """Load the type-simplification and rtype mappings, plus fu/fau docs."""
    # load simplification mapping: raw type string -> simplified type string
    self.stype_map = {}
    with open(relative_path('docstring_parse/annotation/stype_map.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        for fields in reader:
            # Each row must be a non-empty (raw, simplified) pair.
            assert len(fields) == 2
            assert fields[0]
            assert fields[1]
            self.stype_map[fields[0]] = fields[1]

    # load rtype mapping; both columns are passed through self.simplify()
    # first so the keys agree with simplified type names.
    self.rtype_map = {}
    with open(relative_path('docstring_parse/annotation/rtype_map.csv'), 'r') as csvfile:
        reader = csv.reader(csvfile)
        for fields in reader:
            assert len(fields) == 2
            assert fields[0]
            assert fields[1]
            fields = map(self.simplify, fields)
            self.rtype_map[fields[0]] = fields[1]

    # fu: [func_name] = most popular utterance; fau: [func_name, arg] = same.
    self.fu, self.fau = get_fu_fau()
def get_fu_fau(omit_module=True, truncate=True):
    """
    Do step 1. Returns fu, fau

    fu: [func_name] = most_popular_utter
    fau: [func_name, arg] = most_popular_utter

    Parameters
    ----------
    omit_module: if True, the func_name will be the last part only.
    truncate: if True, each utterance is capped at its first 15 tokens.
    """
    def get_func_name(fullName):
        # e.g. 'matplotlib.pyplot.bar' -> 'bar' when omit_module is True.
        if omit_module:
            return fullName.split('.')[-1]
        else:
            return fullName

    fu = defaultdict(lambda: defaultdict(int))  # [func_name][utter] = count
    with open(relative_path('docstring_parse/fu.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, u in reader:
            u = u.split('|||')[0]  # see consolidate.py for meaning of |||
            if truncate:
                u = ' '.join(
                    u.split()[:15])  # limit the maximum number of tokens
            fu[get_func_name(f)][u] += 1

    fau = defaultdict(
        lambda: defaultdict(int))  # [func_name, arg][utter] = count
    with open(relative_path('docstring_parse/fau.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, a, u in reader:
            u = u.split('|||')[0]  # see consolidate.py for meaning of |||
            if truncate:
                u = ' '.join(
                    u.split()[:15])  # limit the maximum number of tokens
            fau[get_func_name(f), a][u] += 1

    # consolidate the fu and fau mappings: keep only the single most
    # popular (lowercased) utterance per key.
    for f in fu:
        fu[f] = get_most_popular_lowered(fu[f])
    fu = dict(fu)
    for fa in fau:
        fau[fa] = get_most_popular_lowered(fau[fa])
    fau = dict(fau)

    return fu, fau
def get_fu_fau(omit_module=True, truncate=True):
    """
    Do step 1. Returns fu, fau

    fu: [func_name] = most_popular_utter
    fau: [func_name, arg] = most_popular_utter

    Parameters
    ----------
    omit_module: if True, the func_name will be the last part only.
    truncate: if True, each utterance is capped at its first 15 tokens.
    """
    def get_func_name(fullName):
        # e.g. 'matplotlib.pyplot.bar' -> 'bar' when omit_module is True.
        if omit_module:
            return fullName.split('.')[-1]
        else:
            return fullName

    fu = defaultdict(lambda: defaultdict(int))  # [func_name][utter] = count
    with open(relative_path('docstring_parse/fu.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, u in reader:
            u = u.split('|||')[0]  # see consolidate.py for meaning of |||
            if truncate:
                u = ' '.join(u.split()[:15])  # limit the maximum number of tokens
            fu[get_func_name(f)][u] += 1

    fau = defaultdict(lambda: defaultdict(int))  # [func_name, arg][utter] = count
    with open(relative_path('docstring_parse/fau.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, a, u in reader:
            u = u.split('|||')[0]  # see consolidate.py for meaning of |||
            if truncate:
                u = ' '.join(u.split()[:15])  # limit the maximum number of tokens
            fau[get_func_name(f), a][u] += 1

    # consolidate the fu and fau mappings: keep only the single most
    # popular (lowercased) utterance per key.
    for f in fu:
        fu[f] = get_most_popular_lowered(fu[f])
    fu = dict(fu)
    for fa in fau:
        fau[fa] = get_most_popular_lowered(fau[fa])
    fau = dict(fau)

    return fu, fau
def __init__(self):
    # Element occurrence counts precomputed over pyplot code examples
    # (backup produced under experimental/code_suggest).
    bh = BackupHandler(
        relative_path('experimental/code_suggest/output/backup'))
    elem_counts = bh.load('elem_pyplot_counts_0404')
    self.all_elems = set(elem_counts.keys())  # set for O(1) membership tests
    self.all_elem_counts = elem_counts  # [elem] = count
    self.enormer = ElementNormalizer()
def __init__(self): plot_commands = get_plot_commands() pyplot_fu = get_pyplot_fu() self.plot_commands = plot_commands self.nonplot_commands = [ f for f in pyplot_fu.keys() if not f in plot_commands ] print 'CodeSuggest: extracted %d plot commands' % len(plot_commands) # Load all code examples of plotting commands from db into memory # These are generated by index_examples.py print 'CodeSuggest: Loading code examples and pregenerated SVGs...' db = sqlite3.connect(relative_path('demo/data/code.sqlite3')) cursor = db.cursor() cursor.execute("SELECT func_id, code, svg FROM example") code_example_lookup = {} # [func_id] = [(code, svg)] count_code_examples = 0 for func_id, code, svg in cursor.fetchall(): count_code_examples += 1 if not func_id in code_example_lookup: code_example_lookup[func_id] = [] code_example_lookup[func_id].append({'code': code, 'svg': svg}) # Sort it again for func_id in code_example_lookup: code_example_lookup[func_id] = sorted( code_example_lookup[func_id], key=lambda x: get_effective_code_len(x['code'])) self.code_example_lookup = code_example_lookup print 'CodeSuggest: Loaded %d code examples (with svgs)...' % count_code_examples db.close() # Load element_index generated by experimental/code_suggest/mine_argvs.py # bh = BackupHandler(relative_path('demo/data')) # self.element_index = bh.load('element_index') # print 'Loaded element_index with %d keys'%len(self.element_index) # Load element value counts bh2 = BackupHandler( relative_path('experimental/code_suggest/output/backup')) self.elem_val_counts = bh2.load( 'elem_pyplot_value_counts_0404') # [elem][val] = count for elem_id in self.elem_val_counts: self.elem_val_counts[elem_id] = sorted( self.elem_val_counts[elem_id].items(), key=lambda x: -x[1]) self.func_position_finder = FuncPositionFinder()
def handleRequest(self, params, server_handle=None): request_type = '' try: experiment_mode = 'no_mode' try: with open(relative_path('demo/log-mode.txt')) as reader: experiment_mode = reader.read().strip() except: pass request_type = params['type'] if 'type' in params else '' ret = {'type': request_type} code = params['code'] if 'code' in params else '' query = params['query'].strip().lower() if 'query' in params else '' self.current_code = code self.prepare_current_code() control_group = False if experiment_mode[0] == 'g': control_group = True if request_type == 'nlp': if not control_group: matches = self.get_matches(query) ret['matches'] = matches elif request_type in ('summary', 'suggest'): if not control_group: cursor_line = int(params['cursor_line']) + 1 cursor_ch = int(params['cursor_ch']) if request_type == 'summary': ret['summary_groups'] = self.cs.get_summary(query, code, cursor_line, cursor_ch, self) else: elem_id = params['elem_id'] ret['suggest'] = self.cs.get_suggest(query, code, cursor_line, cursor_ch, self, elem_id) elif request_type == 'google': pass elif request_type == 'experiment_mode': ret['mode'] = experiment_mode else: raise ValueError('Unrecognized request type: "%s"'%request_type) return ret except SyntaxError: return {'error': 'syntax error', 'type': request_type} except Exception as e: print '\n\nBrain Error:' print traceback.format_exc() + '\n\n' return {'error': server_util.pack_exception_for_html(e, 'Brain Error'), 'type': request_type }
def main(): from codemend import relative_path bh = BackupHandler(relative_path('utils/output/backup')) a = [1,2,3,{4:[5,6]}] bh.save('backup_test_a', a) assert bh.exists('backup_test_a') b = bh.load('backup_test_a') assert a == b print 'All test passed.'
def main(): from codemend import relative_path bh = BackupHandler(relative_path('utils/output/backup')) a = [1, 2, 3, {4: [5, 6]}] bh.save('backup_test_a', a) assert bh.exists('backup_test_a') b = bh.load('backup_test_a') assert a == b print 'All test passed.'
def __init__(self): plot_commands = get_plot_commands() pyplot_fu = get_pyplot_fu() self.plot_commands = plot_commands self.nonplot_commands = [f for f in pyplot_fu.keys() if not f in plot_commands] print 'CodeSuggest: extracted %d plot commands'%len(plot_commands) # Load all code examples of plotting commands from db into memory # These are generated by index_examples.py print 'CodeSuggest: Loading code examples and pregenerated SVGs...' db = sqlite3.connect(relative_path('demo/data/code.sqlite3')) cursor = db.cursor() cursor.execute("SELECT func_id, code, svg FROM example") code_example_lookup = {} # [func_id] = [(code, svg)] count_code_examples = 0 for func_id, code, svg in cursor.fetchall(): count_code_examples += 1 if not func_id in code_example_lookup: code_example_lookup[func_id] = [] code_example_lookup[func_id].append({'code': code, 'svg':svg}) # Sort it again for func_id in code_example_lookup: code_example_lookup[func_id] = sorted( code_example_lookup[func_id], key=lambda x:get_effective_code_len(x['code'])) self.code_example_lookup = code_example_lookup print 'CodeSuggest: Loaded %d code examples (with svgs)...'%count_code_examples db.close() # Load element_index generated by experimental/code_suggest/mine_argvs.py # bh = BackupHandler(relative_path('demo/data')) # self.element_index = bh.load('element_index') # print 'Loaded element_index with %d keys'%len(self.element_index) # Load element value counts bh2 = BackupHandler(relative_path('experimental/code_suggest/output/backup')) self.elem_val_counts = bh2.load('elem_pyplot_value_counts_0404') # [elem][val] = count for elem_id in self.elem_val_counts: self.elem_val_counts[elem_id] = sorted( self.elem_val_counts[elem_id].items(), key=lambda x:-x[1]) self.func_position_finder = FuncPositionFinder()
def code_examples():
    """
    Yield code examples.

    Lazily loads three corpora (SO, GitHub, Web) into module-level globals
    so repeated calls do not reload the backups, then yields every example.
    """
    global all_codes1, all_codes2, all_codes3

    # 15770 code examples mined from SO answers in threads that are tagged
    # "matplotlib".
    if not all_codes1:
        print 'Loading SO code examples...'
        bh1 = BackupHandler(relative_path('experimental/code_suggest'))
        all_codes1 = bh1.load('all_codes')
        print '%d examples from SO' % len(all_codes1)
    for code in all_codes1:
        yield code

    # print 'WARNING: mine_element.py ignoring all GitHub code examples...'
    # """
    if not all_codes2:
        # 8732 code examples (including 395 IPython Notebook files) mined from
        # GitHub repositories that contain "matplotlib".
        print 'Loading GitHub code examples...'
        bh2 = BackupHandler(
            relative_path('experimental/code_suggest/output/backup'))
        all_codes2 = bh2.load('all_codes_github_1k_repo_0322')
        print '%d examples from GitHub' % len(all_codes2)
    for code in all_codes2:
        yield code
    # """

    if not all_codes3:
        # 21993 code examples extracted by Shiyan from the Web
        print 'Loading Web code examples'
        bh3 = BackupHandler(relative_path('experimental/mining/output'))
        all_codes3 = bh3.load('codes_shiyan_0331_web')
        print '%d examples from Web Shiyan' % len(all_codes3)
    for code in all_codes3:
        yield code
def __init__(self): print 'Brain initializing ...' self.fu, self.fau = get_fu_fau() self.cb = ContextBuilder() self.w2v = load_gensim_from_binary_file( relative_path('models/output/vectors-so-text-python-lemma.bin')) self.current_code = None self.current_code_hash = None self.bimodal = BiModalBaseline('bimodal', relative_path('models/output/bi2-0410-d.model'), self.w2v) self.cs = CodeSuggest() self.gws_cache = {} print 'Brain intialized.'
def load_model(model_id=None):
    """Load a trained BiModal model and its companion word2vec model.

    model_id: optional suffix selecting a model variant; '-s' and '-t'
    additionally switch the word2vec vectors file (win3 / win5).
    Returns (w2v_model, model).
    """
    # model_file_name = 'models/output/bi2-test.model'
    model_file_name = 'models/output/bi2-0410-d.model'
    if model_id:
        print 'Using customized model_id'
        model_file_name = 'models/output/bi2-0410' + model_id + '.model'

    w2v_model_file = 'vectors-so-text-python-lemma.bin'
    if model_id == '-s':
        w2v_model_file = 'vectors-so-text-python-lemma-win3.bin'
    elif model_id == '-t':
        w2v_model_file = 'vectors-so-text-python-lemma-win5.bin'

    w2v_model = load_gensim_from_binary_file(
        relative_path('models/output/' + w2v_model_file))
    model = BiModal.load(relative_path(model_file_name))
    print "@@@ PLEASE CHECK WHICH FILE IS BEING TESTED ... @@@"
    print '@@@ MODEL_FILE: %s @@@' % model_file_name

    # Attach the word2vec model; syn0l aliases the w2v input vectors.
    model.w2v_model = w2v_model
    model.syn0l = w2v_model.syn0
    return w2v_model, model
def get_pyplot_funcs():  # used by other part as well
    """Return the names of all functions defined in matplotlib's pyplot.py.

    Scans the source text line by line for 'def ' headers and extracts
    the identifier preceding the '('.
    """
    with open(relative_path('lib/matplotlib/pyplot.py')) as reader:
        source_text = reader.read()

    func_names = []
    for raw_line in source_text.split('\n'):
        stripped = raw_line.strip()
        if not stripped.startswith('def '):
            continue
        remainder = stripped[len('def '):]
        func_names.append(remainder.split('(')[0])
    return func_names
def get_pyplot_funcs():  # used by other part as well
    """Return the names of all functions defined in matplotlib's pyplot.py.

    Scans the source text line by line for 'def ' headers and extracts
    the identifier preceding the '('.
    """
    with open(relative_path('lib/matplotlib/pyplot.py')) as reader:
        src = reader.read()

    names = []
    for candidate in src.split('\n'):
        candidate = candidate.strip()
        if candidate.startswith('def '):
            header = candidate[len('def '):]
            names.append(header.split('(')[0])
    return names
def __init__(self):
    """Load the type-simplification and rtype mappings, plus fu/fau docs."""
    # load simplification mapping: raw type string -> simplified type string
    self.stype_map = {}
    with open(relative_path('docstring_parse/annotation/stype_map.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        for fields in reader:
            # Each row must be a non-empty (raw, simplified) pair.
            assert len(fields) == 2
            assert fields[0]
            assert fields[1]
            self.stype_map[fields[0]] = fields[1]

    # load rtype mapping; both columns are passed through self.simplify()
    # first so the keys agree with simplified type names.
    self.rtype_map = {}
    with open(relative_path('docstring_parse/annotation/rtype_map.csv'), 'r') as csvfile:
        reader = csv.reader(csvfile)
        for fields in reader:
            assert len(fields) == 2
            assert fields[0]
            assert fields[1]
            fields = map(self.simplify, fields)
            self.rtype_map[fields[0]] = fields[1]

    # fu: [func_name] = most popular utterance; fau: [func_name, arg] = same.
    self.fu, self.fau = get_fu_fau()
def get_pyplot_fu():
    """Load the pyplot function -> summary-utterance map from csv.

    Rows with an empty utterance are skipped. Each utterance is
    lowercased and tokenized before being returned.
    """
    # Load csv file of pyplot summary
    pyplot_fu = {}  # [func] = utter
    print 'CodeSuggest: Loading pyplot fu...'
    with open(relative_path('docstring_parse/pyplot_fu.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, u in reader:
            if not u:
                continue
            pyplot_fu[f] = u
    print 'CodeSuggest: read %d fu pairs' % len(pyplot_fu)

    # lowercase and tokenization of u's
    for f in pyplot_fu:
        pyplot_fu[f] = ' '.join(pattern.en.tokenize(pyplot_fu[f].lower()))
    return pyplot_fu
def get_pyplot_fu():
    """Load the pyplot function -> summary-utterance map from csv.

    Rows with an empty utterance are skipped. Each utterance is
    lowercased and tokenized before being returned.
    """
    # Load csv file of pyplot summary
    pyplot_fu = {}  # [func] = utter
    print 'CodeSuggest: Loading pyplot fu...'
    with open(relative_path('docstring_parse/pyplot_fu.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, u in reader:
            if not u:
                continue
            pyplot_fu[f] = u
    print 'CodeSuggest: read %d fu pairs' % len(pyplot_fu)

    # lowercase and tokenization of u's
    for f in pyplot_fu:
        pyplot_fu[f] = ' '.join(pattern.en.tokenize(pyplot_fu[f].lower()))
    return pyplot_fu
def anything_to_used_elems(anything):
    """Resolve `anything` into a list of used code elements.

    - str: a key naming a bundled sample program ('bar', 'pie', ...); the
      file is parsed and its used elements extracted via the context
      builder. An unknown key raises KeyError.
    - list: assumed to already be a list of elements; returned unchanged.
    - anything else: raises TypeError.
    """
    if isinstance(anything, basestring):
        filename = {
            'bar': relative_path('demo/code-samples/user-study/task1.py'),
            'pie': relative_path('demo/code-samples/user-study/task2.py'),
            'line': relative_path('demo/code-samples/user-study/practice.py'),
            'empty': relative_path('demo/code-samples/empty.py'),
            'eval3': relative_path('demo/code-samples/eval3.py'),
            'line_video': relative_path('demo/code-samples/demo_video_linechart.py')
        }[anything]
        with open(filename) as reader:
            code = reader.read()
        if not code.strip():
            return []  # empty sample file -> nothing used
        node = ast.parse(code)
        # Annotate AST nodes with source text ranges for the context builder.
        ast_utils.mark_text_ranges(node, unicode(code))
        context = cb.getContext(node)
        return context.used_elems()
    elif isinstance(anything, list):
        return anything
    else:
        raise TypeError(type(anything))
def lemmaWithVocab(token, vocab):
    """
    First try PORTER. If not in vocab, then lemma.
    """
    # Porter-stem first; accept the stem only if the vocabulary knows it.
    out = pattern.vector.stem(token, stemmer=pattern.vector.PORTER)
    if out in vocab:
        return out
    # Heuristic fallback: strip a trailing '-ing' and retry against vocab.
    if token.endswith('ing'):
        out = token[:-3]
        if out in vocab:
            return out
    return lemma(token)


from codemend import relative_path

# Build the stopword set from the English stopwords file (lines are
# comma-separated word lists; '#' lines are comments), plus punctuation.
stopwords = set()
with open(relative_path('models/stopwords-en.txt')) as reader:
    for line in reader:
        if line.startswith('#'):
            continue
        line = line.strip()
        words = line.split(', ')
        stopwords |= set(words)
stopwords |= set(string.punctuation)

if __name__ == '__main__':
    # Quick manual check of vocabulary-guided lemmatization.
    s = 'hatching-hatches color" colors interesting flying flies'
    vocab = dict(hatch=1, color=2, interest=3, fly=4)
    tokens = tokenize(s)
    lemmas = [lemmaWithVocab(x, vocab) for x in tokens]
    print lemmas
train_pairs.append((merged_utter, astunparse.unparse(call_node))) unique_train_pairs = list(set(train_pairs)) print 'total_block', total_block print 'total_grammatical', total_grammatical print 'total_call_nodes', total_call_nodes print 'total_matched_funcs', total_matched_funcs, '(total train pairs)' print 'total_matched_args', total_matched_args print 'total_unique_train_pairs', len(unique_train_pairs) return unique_train_pairs if __name__ == '__main__': bh = BackupHandler(relative_path('models/output/backup')) # Step 1 fu, fau = get_fu_fau() # Step 2 with open(relative_path('models/output/mpl_code_blocks.txt')) as reader: content = reader.read() content = content.decode('utf-8') content = content.replace("<", "<") content = content.replace(">", ">") content = content.replace("&", "&") blocks = content.split('\n\n\n')
this code 3. take average per function Output: - a dictionary: [function] = average_position average position: between 0 (beginning of code) and 1 (end of code). """ import ast from collections import defaultdict from codemend import BackupHandler, relative_path from codemend.models.annotate_code_with_api import get_fu_fau, findCallNodes, extractCallComponents fu, fau = get_fu_fau() bh = BackupHandler(relative_path('experimental/code_suggest')) all_codes = bh.load('all_codes') print 'There are %d code examples in total'%len(all_codes) pos_sum = defaultdict(float) # [f] = sum pos_cnt = defaultdict(int) # [f] = count for code in all_codes: try: node = ast.parse(code) except SyntaxError: continue calls = findCallNodes(node) called_funcs = [extractCallComponents(x)[0] for x in calls] called_funcs = filter(lambda x: x in fu, called_funcs) if len(calls) < 3: continue
def __init__(self):
    # Load precomputed average code positions per function
    # (0 = beginning of code, 1 = end of code).
    bh = BackupHandler(relative_path('demo/data'))
    self.pos_ave = bh.load('pos_ave')
    print 'FuncPositionFinder: loaded %d average positions for functions' % len(
        self.pos_ave)
import csv
import pydoc
from itertools import imap
from collections import defaultdict, namedtuple
from recordclass import recordclass
import funcsigs
import string

from codemend import BackupHandler, relative_path
from codemend.models.element import ElementNormalizer
from codemend.docstring_parse.elemdoc import ElemDoc
from codemend.docstring_parse.consolidate import is_setXXX, get_class
from codemend.models.annotate_code_with_api import get_fu_fau

# Load all input sources
bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))

# Function/arg -> utterance maps; the *_t variants are truncated (15 tokens).
fu_t, fau_t = get_fu_fau(omit_module=False, truncate=True)
fu, fau = get_fu_fau(omit_module=False, truncate=False)

fa_lookup = defaultdict(list)  # [function] = [argument]
for f, a in fau.keys():
    fa_lookup[f].append(a)
fa_lookup = dict(fa_lookup)

cf_lookup = defaultdict(list)  # [class] = [function]
for f in fu.keys():
    cf_lookup[get_class(f)].append(f)

elem_counts = bh.load('elem_pyplot_counts_0404')
enormer = ElementNormalizer()
""" def do_mh_handle_requests(self, params_list): """This callback function is to be used by brain, so that it can take advantage of the worker pool to do multithreaded plotting.""" return plotter_pool.map(mh_handle_request, params_list) def jedi_lock(self): """To be used by brain to protect non-thread-safe jedi.""" return jedi_lock class ThreadedTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer): pass if __name__ == '__main__': os.chdir(relative_path('demo')) if len(sys.argv) > 1 and sys.argv[1] != '-': port = int(sys.argv[1]) else: port = PORT_NUMBER if len(sys.argv) > 2 and sys.argv[2] != '-': host_name = sys.argv[2] else: host_name = HOST_NAME server_util.port_available_or_die(port) plotter_pool = multiprocessing.Pool()
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
    bh_dir = relative_path('models/output/backup')
    bh = BackupHandler(bh_dir)
    # Reuse the cached thread dump when available; presumably
    # BackupHandler.load raises AssertionError for a missing backup -
    # TODO confirm against BackupHandler.
    try:
        threads = bh.load('mpl_threads')
    except AssertionError:
        threads = list(load_threads(
            qfilter="Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0",
            afilter="Score >= 0 ORDER BY Score DESC LIMIT 3"))
        bh.save('mpl_threads', threads)

    # Write one "qid<TAB>title" line per thread.
    with open(relative_path('models/output/mpl_so_titles.txt'), 'w') as writer:
        for t in threads:
            writer.write('%d\t%s\n' % (t.qid, t.qtitle.encode('utf-8')))
import re

from codemend import relative_path

# Read "id<TAB>title" lines produced by the title-dump script.
with open(relative_path('models/output/mpl_so_titles.txt')) as reader:
    id_titles = [
        x.split('\t') for x in reader.read().decode('utf-8').split('\n')
    ]
print 'There are %d threads in total' % len(id_titles)

count = 0
# Prefixes stripped from how-questions; longer, more specific prefixes
# must precede shorter ones so e.g. 'how can i' is removed before 'how'.
how_prefixes = [
    'how can i', 'how do i', 'how do you', 'how would one', 'how to',
    'how should i', 'how do we', 'how is it possible to', 'how does one',
    'how i can', 'how could i', 'how can one', 'how we can', 'how can you',
    'do i have to', 'how i', 'how do', 'how would you', 'how would i',
    'how would', 'how should', 'how can', 'how are', 'how / where to', 'how'
]  # order is important

goals = []
for x in id_titles:
    if not len(x) == 2:
        continue  # skip malformed or empty lines
    id_, title = x
    title = title.lower()
    if title.startswith('how'):
        # Strip every how-prefix occurrence, leaving the bare goal phrase.
        for hp in how_prefixes:
            title = title.replace(hp, '').strip()
        goals.append((id_, title))
print 'There are %d how-questions' % len(goals)
def __init__(self, w2v_model, all_elem_counts, maxngram=1, name=None,
             use_lemma=True, heuristic=False, use_coke=False):
    """
    w2v_model can be a binary vectors file, or a loaded gensim model
    instance.

    all_elem_counts: dict [elem] = count of candidate elements.
    heuristic / use_coke: behavior flags stored on the instance; use_coke
    additionally loads element co-occurrence ("coke") statistics.
    """
    self.maxngram = maxngram
    self.name = name
    self.use_lemma = use_lemma
    assert isinstance(all_elem_counts, dict)
    self.all_elem_counts = all_elem_counts
    self.heuristic = heuristic
    self.use_coke = use_coke

    if isinstance(w2v_model, basestring):
        # A path was given: load the binary vectors file.
        self.model = load_gensim_from_binary_file(w2v_model)
        self.model.filename = w2v_model.split('/')[-1]
        if not self.name:
            self.name = self.model.filename
    else:
        assert isinstance(w2v_model, Word2Vec)
        self.model = w2v_model
        if not self.name:
            if hasattr(self.model, 'filename'):
                self.name = self.model.filename

    self.model.init_sims()  # normalize the vectors
    self.enormer = ElementNormalizer()

    if self.use_coke:
        bh = BackupHandler(relative_path('models/output/backup'))
        coke_file = 'coke_0329'
        if not bh.exists(coke_file):
            raise ValueError('Coke file does not exist: %s'%coke_file)
        self.coke = bh.load(coke_file)

    # Element indexes (idf table, element list/lookup, vector matrix) are
    # expensive to build, so they are cached per w2v-model filename.
    print 'Trying to load element indexes from cache ...'
    bh = BackupHandler(relative_path('models/output/backup'))
    elem_index_backup_name = self.model.filename + '_elem_index'
    if bh.exists(elem_index_backup_name):
        self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(elem_index_backup_name)
    else:
        print 'Word2vecBaseline building element indexes...'
        fu, fau = get_fu_fau()
        self.idfs = self.get_idf(fu.values() + fau.values())
        self.elems = sorted(self.all_elem_counts.keys())
        self.elem_lookup = dict((y,x) for (x,y) in enumerate(self.elems))
        vecs = []
        for e in self.elems:
            # Bag-of-words vector of each element's training document.
            u = doc_serve.get_training_doc(e, True)
            v = self.get_bow_representation(u)
            vecs.append(v)
        self.vecmat = np.array(vecs)
        assert self.vecmat.shape == (len(self.elems), self.model.vector_size)
        bh.save(elem_index_backup_name, (self.idfs, self.elems, self.elem_lookup, self.vecmat))
    print 'Finished building indexes.'
('plt.grid', 'add grid lines', 'bar'), ('plt.grid', 'add grid', 'bar'), ('plt.grid', 'add grids', 'bar'), ('plt.grid', 'add gridlines', 'bar'), ('plt.plot@linewidth', 'thickness', 'line'), ('plt.plot@linewidth', 'line thickness', 'line'), ('plt.plot@linewidth', 'thick', 'line'), ('plt.plot@linewidth', 'wide', 'line'), # ('plt.xticks@rotation', 'change the style of x-axis label', 'bar'), ('plt.xkcd', 'fancy style', 'line'), ] gt_set = set(small_gt2) small_gt3 = [] with open(relative_path('models/data/gt-0924.csv'), 'rb') as csvfile: reader = csv.reader(csvfile) next(reader, None) # skip the header for file_,query,expected,remark in reader: combined = (expected, query, file_) if combined not in gt_set: small_gt3.append(combined) else: print 'duplicated: %s %s %s'%combined cb = ContextBuilder() def anything_to_used_elems(anything): if isinstance(anything, basestring): filename = { 'bar': relative_path('demo/code-samples/user-study/task1.py'),
"""
Generates default_varmap.py.

Needs to be run only once.

Needs to see a matplotlib repository.
"""
from codemend import relative_path
from element import get_pyplot_funcs
from element_extract import extract_varmap_elems

if __name__ == '__main__':
    # Extract variable maps from the real pyplot and pylab sources.
    with open(relative_path('lib/matplotlib/pyplot.py')) as reader:
        pyplot_src = reader.read()
    with open(relative_path('lib/matplotlib/pylab.py')) as reader:
        pylab_src = reader.read()

    var_map1, _ = extract_varmap_elems(pyplot_src, False, True)
    print len(var_map1)
    var_map2, _ = extract_varmap_elems(pylab_src, False, True)
    print len(var_map2)

    # Build a synthetic module importing every pyplot function, one per line.
    lines = []
    pyplot_funcs = get_pyplot_funcs()
    for f in pyplot_funcs:
        lines.append('from matplotlib.pyplot import %s'%f)
    fake_pyplot_src = '\n'.join(lines)
elem_counts[f,a] += 1 elem_counts[f,a,v] += 1 return is_useful if __name__ == '__main__': counters = defaultdict(int) md5s = set() all_codes = [] fu, _ = get_fu_fau() elem_counts = defaultdict(int) # [elem] = count bh = BackupHandler(relative_path('experimental/code_suggest/output/backup')) for root, dirs, files in os.walk( relative_path('mining/output/github-matplotlib-repos')): if '.git' in root: continue for file_name in files: counters['count_file'] += 1 if counters['count_file'] % 1000 == 0: print 'Processed %d files - Useful files: %d'%( counters['count_file'], counters['count_useful_files']) file_path = os.path.join(root, file_name)
"""
Generates default_varmap.py.

Needs to be run only once.

Needs to see a matplotlib repository.
"""
from codemend import relative_path
from element import get_pyplot_funcs
from element_extract import extract_varmap_elems

if __name__ == '__main__':
    # Extract variable maps from the real pyplot and pylab sources.
    with open(relative_path('lib/matplotlib/pyplot.py')) as reader:
        pyplot_src = reader.read()
    with open(relative_path('lib/matplotlib/pylab.py')) as reader:
        pylab_src = reader.read()

    var_map1, _ = extract_varmap_elems(pyplot_src, False, True)
    print len(var_map1)
    var_map2, _ = extract_varmap_elems(pylab_src, False, True)
    print len(var_map2)

    # Build a synthetic module importing every pyplot function, one per line.
    lines = []
    pyplot_funcs = get_pyplot_funcs()
    for f in pyplot_funcs:
        lines.append('from matplotlib.pyplot import %s' % f)
    fake_pyplot_src = '\n'.join(lines)
import csv
from itertools import imap
from collections import defaultdict
import string

from codemend.docstring_parse.elemdoc import ElemDoc
from codemend.docstring_parse.polish import \
    create_new_element_doc, fu,fau, fu_t, fau_t, enormer
from codemend import relative_path

elem_lookup = {}  # [elem_id] = elem_doc

# Import pre-computed elem docs
with open(relative_path(
        'docstring_parse/doc_polished/elem_docs.csv'  # this file is generated by docstring_parse/polish.py
        ), 'rb') as csvfile:
    reader = csv.reader(csvfile)
    # Sanity check: the csv header must match ElemDoc's fields exactly,
    # so ElemDoc._make can consume the rows positionally.
    columns = tuple(next(reader))
    assert ElemDoc._fields == columns
    for elem in imap(ElemDoc._make, reader):
        elem_id = elem.elem_id
        elem_lookup[elem_id] = elem

# Create children_lookup
children_lookup = defaultdict(list)  # [parent_elem_id] = [elems]
for elem in elem_lookup.values():
    if elem.parent_id:
        children_lookup[elem.parent_id].append(elem)
def __init__(self):
    # Element occurrence counts precomputed over pyplot code examples
    # (backup produced under experimental/code_suggest).
    bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
    elem_counts = bh.load('elem_pyplot_counts_0404')
    self.all_elems = set(elem_counts.keys())  # set for O(1) membership tests
    self.all_elem_counts = elem_counts  # [elem] = count
    self.enormer = ElementNormalizer()
from codemend import BackupHandler, relative_path from codemend.models.element import ElementNormalizer from codemend.models.word2vec_util import load_gensim_from_binary_file from codemend.models.bimodal2 import BiModal from codemend.experimental.code_suggest.mine_element import code_examples if __name__ == '__main__': bh = BackupHandler( relative_path('experimental/code_suggest/output/backup')) elem_counts = bh.load('elem_pyplot_counts_0404') all_elems = sorted(elem_counts.keys()) all_elems_counts = elem_counts enormer = ElementNormalizer() w2v_model = load_gensim_from_binary_file( relative_path('models/output/vectors-so-text-python-lemma-win5.bin') ) # <-- note the change here!! model = BiModal(all_elems, all_elems_counts, w2v_model, code_examples, enormer, threads=None, alpha=0.05, window=5, negative=20, additive=0, multiply=0, concat=1, epoch=1, rand_parent_doc=True,
def __init__(self): self.expected_set_cache = {} self.queries = [] # load the ground-truth file with open('eval-gt-2.csv', 'rb') as csv_file: reader = csv.reader(csv_file) # columns: # case_study_no,answer_func,answer_arg,query Query = recordclass('Query', next(reader)) for query in imap(Query._make, reader): assert query.case_study_no.startswith('example-') query.case_study_no = int(query.case_study_no.replace('example-', '')) query.answer = query.answer.strip() query.query = query.query.strip() query.query_source = query.query_source.strip() self.queries.append(query) print 'Loading the code samples...' self.code_samples = [] fnames = [relative_path('demo/code-samples/before_afters/before%d.py'%x) for x in [1,2,3,4,5]] for f in fnames: with open(f) as reader: code = reader.read().strip() self.code_samples.append(code) print 'Initializing context builder...' self.cb = ContextBuilder() print 'Initializing element normalizer...' self.enormer = ElementNormalizer() print 'Instantiating baselines...' 
self.baselines = [] self.baselines.append(RandomBaseline(self.cb.getAllElements())) w2vb1 = Word2vecBaseline( relative_path('models/output/vectors-so-text-python-lemma.bin'), self.cb.getAllElementCounts(), 1, 'w2v') w2vb2 = Word2vecBaseline(w2vb1.model, self.cb.getAllElementCounts(), 1, 'w2v-heuristic', heuristic=True) w2vb3 = Word2vecBaseline(w2vb1.model, self.cb.getAllElementCounts(), 1, 'w2v-cooccur', use_coke=True) w2vb4 = Word2vecBaseline(w2vb1.model, self.cb.getAllElementCounts(), 1, 'w2v-hc', heuristic=True, use_coke=True) self.baselines += [w2vb1, w2vb2, w2vb3, w2vb4] # bimodal = BiModalBaseline('bimodal-concat-10epoch', # relative_path('models/output/bi2-test-ggg.model'), # relative_path('models/output/vectors-flat-mpl-0205.bin')) # bimodal_ids = list('denopq') bimodal_ids = list('d') for id_ in bimodal_ids: bimodal = BiModalBaseline('bimodal-'+id_, relative_path('models/output/bi2-0410-%s.model'%id_), w2vb1.model) self.baselines.append(bimodal) print 'Starts evaluating...' metric_names = ['MRR', 'P@1', 'P@5', 'P@10'] results = np.zeros((len(self.baselines), len(metric_names)), dtype=float) result_log = [] # for diagnosis count_query = 0 for idx, code in enumerate(self.code_samples): # triple-for-loop structure: {code-sample -> gt -> baseline}. print 'Processing code sample %d'%(idx + 1) current_queries = filter(lambda x: int(x.case_study_no) == int(idx + 1), self.queries) assert current_queries context = self.cb.getContext(code) for query in current_queries: # "query" = "ground truth" count_query += 1 assert query.answer for b_idx, b in enumerate(self.baselines): suggested_items = b.suggest(query.query, context) answer_rank = self.getRankOfExpectedItem( suggested_items, code, query.answer) mrr_idx = metric_names.index('MRR') p1_idx = metric_names.index('P@1') p5_idx = metric_names.index('P@5') p10_idx = metric_names.index('P@10') if answer_rank > 0: results[b_idx, mrr_idx] += 1. 
/ answer_rank if answer_rank == 1: results[b_idx, p1_idx] += 1 if answer_rank <= 5: results[b_idx, p5_idx] += 1 if answer_rank <= 10: results[b_idx, p10_idx] += 1 self.updateResultLog(result_log, idx + 1, query.query, b, suggested_items, code, query.answer, answer_rank) assert count_query > 0 for metric_idx, metric in enumerate(metric_names): if metric == 'MRR' or metric.startswith('P@'): results[:, metric_idx] /= count_query # output print 'Writing outputs...' with open(relative_path('models/output/eval-result-0413.csv'), 'wb') as csv_file: writer = csv.writer(csv_file) writer.writerow(['Baseline'] + metric_names) for b_idx, b in enumerate(self.baselines): writer.writerow([b.__repr__()] + results[b_idx].tolist()) with open(relative_path('models/output/eval-log-0413.csv'), 'wb') as csv_file: writer = csv.writer(csv_file) writer.writerow(ResultLogEntry._fields) for row in result_log: writer.writerow(row) # close resources print 'Closing resources' # whoosh_baseline.close() print 'Done'
def __init__(self, w2v_model, all_elem_counts, maxngram=1, name=None,
             use_lemma=True, heuristic=False, use_coke=False):
    """
    Word2vec-based suggestion baseline.

    Parameters
    ----------
    w2v_model: either a path to a binary word-vectors file, or an
        already-loaded gensim Word2Vec model instance.
    all_elem_counts: dict of element -> occurrence count.
    maxngram: maximum n-gram size (stored; used elsewhere in the class).
    name: display name of this baseline; defaults to the model's filename.
    use_lemma: flag stored for later use (presumably lemmatizes query
        tokens -- confirm in suggest()).
    heuristic: enables the heuristic variant (stored flag).
    use_coke: enables the co-occurrence ("coke") variant; requires the
        pre-computed 'coke_0329' backup to exist.
    """
    self.maxngram = maxngram
    self.name = name
    self.use_lemma = use_lemma
    assert isinstance(all_elem_counts, dict)
    self.all_elem_counts = all_elem_counts
    self.heuristic = heuristic
    self.use_coke = use_coke
    if isinstance(w2v_model, basestring):
        # Given a path: load the binary vectors and remember the file name
        # (the file name is also used as the cache key suffix below).
        self.model = load_gensim_from_binary_file(w2v_model)
        self.model.filename = w2v_model.split('/')[-1]
        if not self.name:
            self.name = self.model.filename
    else:
        # Given a loaded model: reuse it (allows several baselines to
        # share one in-memory model).
        assert isinstance(w2v_model, Word2Vec)
        self.model = w2v_model
        if not self.name:
            if hasattr(self.model, 'filename'):
                self.name = self.model.filename
    self.model.init_sims()  # normalize the vectors
    self.enormer = ElementNormalizer()
    if self.use_coke:
        # Load pre-computed element co-occurrence counts; fail fast if the
        # backup is missing rather than silently degrading.
        bh = BackupHandler(relative_path('models/output/backup'))
        coke_file = 'coke_0329'
        if not bh.exists(coke_file):
            raise ValueError('Coke file does not exist: %s' % coke_file)
        self.coke = bh.load(coke_file)
    print 'Trying to load element indexes from cache ...'
    bh = BackupHandler(relative_path('models/output/backup'))
    # Cache key is tied to the vectors file, so different models get
    # different cached indexes.
    elem_index_backup_name = self.model.filename + '_elem_index'
    if bh.exists(elem_index_backup_name):
        self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(
            elem_index_backup_name)
    else:
        print 'Word2vecBaseline building element indexes...'
        fu, fau = get_fu_fau()
        # IDF weights are estimated from the pooled utterances of both
        # function-level and argument-level docstring mappings.
        self.idfs = self.get_idf(fu.values() + fau.values())
        self.elems = sorted(self.all_elem_counts.keys())
        # elem_lookup: element -> row index into vecmat.
        self.elem_lookup = dict((y, x) for (x, y) in enumerate(self.elems))
        vecs = []
        for e in self.elems:
            # One bag-of-words vector per element, built from its
            # training document.
            u = doc_serve.get_training_doc(e, True)
            v = self.get_bow_representation(u)
            vecs.append(v)
        self.vecmat = np.array(vecs)
        assert self.vecmat.shape == (len(self.elems), self.model.vector_size)
        bh.save(elem_index_backup_name,
                (self.idfs, self.elems, self.elem_lookup, self.vecmat))
    print 'Finished building indexes.'
count += 1 if count % 1000 == 0: print count qid, qtitle, qbody, qscore = row q2 = "SELECT Id, Body, Score from posts WHERE ParentID = %d"%qid if afilter: q2 = '%s AND %s'%(q2, afilter) answers = [] cursor2 = db.cursor() cursor2 = cursor2.execute(q2) answers = map(Answer._make, cursor2.fetchall()) if not answers: continue answers = tuple(answers) yield Thread._make(row + (answers,)) if __name__ == '__main__': bh_dir = relative_path('models/output/backup') bh = BackupHandler(bh_dir) try: threads = bh.load('mpl_threads') except AssertionError: threads = list(load_threads( qfilter="Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0", afilter="Score >= 0 ORDER BY Score DESC LIMIT 3")) bh.save('mpl_threads', threads) # Dump the code blocks extracted from the threads print 'Extracting and dumping code blocks ...' outdir = 'output/' if not os.path.exists(outdir): os.makedirs(outdir)
def transform_and_filter(elem): """ Cleaning is performed to reduce sparsity: - pylab.xxx --> plt.xxx (if the function exists in pyplot) - various add_subplot.xxx --> plt.gca.xxx (see stype.tsv) - only plt.* are kept Returns: cleaned elem or None """ elem = enormer.simplify(elem) if elem.startswith('plt.'): return elem else: return None if __name__ == '__main__': coke_counts = defaultdict(int) count = 0 for code in code_examples(): count += 1 if count % 1000 == 0: print '%d ... unique_cokes=%d' % (count, len(coke_counts)) for x, y in get_cokes(code): coke_counts[x, y] += 1 bh = BackupHandler(relative_path('models/output/backup')) bh.save('coke_0329', coke_counts)
""" List all argkeys for human inspection. The purpose is to check every argkey and manually link proper documentations to them. """ import csv from itertools import imap from codemend.docstring_parse.elemdoc import ElemDoc from codemend import relative_path with open(relative_path( 'docstring_parse/doc_polished/elem_docs.csv' # this file is generated by docstring_parse/polish.py ), 'rb') as csvfile: reader = csv.reader(csvfile) columns = tuple(next(reader)) assert ElemDoc._fields == columns outset = set() for elem in imap(ElemDoc._make, reader): elem_id = elem.elem_id if elem.type == 'argkey': fields = elem_id.split('@') key = '@'.join(fields[:2]) outset.add(key) for x in sorted(outset):
""" Testing word2vec baseline """ TEST_VECTOR_BIN_FILE = 'output/vectors-flat-mpl-0205.bin' # TEST_VECTOR_BIN_FILE = 'output/vectors-so-text-python-5gram.bin' # TEST_VECTOR_BIN_FILE = 'output/vectors-so-text-python-stem-3gram.bin' from codemend.models.eval2 import ContextBuilder cb = ContextBuilder() MAXNGRAM = 1 wb = Word2vecBaseline(TEST_VECTOR_BIN_FILE, cb.getAllElementCounts(), MAXNGRAM, 'test-w2v', use_lemma=True, heuristic=False, use_coke=True) query = 'add legend' with open(relative_path( 'demo/code-samples/before_afters/before1.py')) as reader: code = reader.read() context = cb.getContext(code) results = wb.suggest(query, context) for r in results: assert isinstance(r, SuggestItem) print '%.3f\t%s\t%d' % (r.score, r.elem, wb.all_elem_counts[r.elem])
def __init__(self): bh = BackupHandler(relative_path('demo/data')) self.pos_ave = bh.load('pos_ave') print 'FuncPositionFinder: loaded %d average positions for functions'%len(self.pos_ave)
def __init__(self):
    """
    Load the ground truth and code samples, instantiate all baselines,
    run the full evaluation, and write the result/log CSV files.

    NOTE(review): this constructor performs the entire evaluation as a
    side effect rather than just initializing state.
    """
    self.expected_set_cache = {}
    self.queries = []
    # load the ground-truth file
    # NOTE(review): opened relative to the current working directory,
    # unlike the relative_path() calls below -- confirm this is intended.
    with open('eval-gt-2.csv', 'rb') as csv_file:
        reader = csv.reader(csv_file)
        # columns:
        # case_study_no,answer_func,answer_arg,query
        # (a query_source column is also accessed below -- the header row
        # read here is the authoritative column list)
        Query = recordclass('Query', next(reader))
        for query in imap(Query._make, reader):
            assert query.case_study_no.startswith('example-')
            # 'example-3' -> 3
            query.case_study_no = int(
                query.case_study_no.replace('example-', ''))
            query.answer = query.answer.strip()
            query.query = query.query.strip()
            query.query_source = query.query_source.strip()
            self.queries.append(query)
    print 'Loading the code samples...'
    self.code_samples = []
    fnames = [
        relative_path('demo/code-samples/before_afters/before%d.py' % x)
        for x in [1, 2, 3, 4, 5]
    ]
    for f in fnames:
        with open(f) as reader:
            code = reader.read().strip()
            self.code_samples.append(code)
    print 'Initializing context builder...'
    self.cb = ContextBuilder()
    print 'Initializing element normalizer...'
    self.enormer = ElementNormalizer()
    print 'Instantiating baselines...'
    self.baselines = []
    self.baselines.append(RandomBaseline(self.cb.getAllElements()))
    # Four word2vec variants sharing one loaded model (w2vb1.model):
    # plain, heuristic, co-occurrence, and heuristic+co-occurrence.
    w2vb1 = Word2vecBaseline(
        relative_path('models/output/vectors-so-text-python-lemma.bin'),
        self.cb.getAllElementCounts(), 1, 'w2v')
    w2vb2 = Word2vecBaseline(w2vb1.model, self.cb.getAllElementCounts(), 1,
                             'w2v-heuristic', heuristic=True)
    w2vb3 = Word2vecBaseline(w2vb1.model, self.cb.getAllElementCounts(), 1,
                             'w2v-cooccur', use_coke=True)
    w2vb4 = Word2vecBaseline(w2vb1.model, self.cb.getAllElementCounts(), 1,
                             'w2v-hc', heuristic=True, use_coke=True)
    self.baselines += [w2vb1, w2vb2, w2vb3, w2vb4]
    # bimodal = BiModalBaseline('bimodal-concat-10epoch',
    #     relative_path('models/output/bi2-test-ggg.model'),
    #     relative_path('models/output/vectors-flat-mpl-0205.bin'))
    # bimodal_ids = list('denopq')
    # One baseline per trained bimodal model variant (see the training
    # script's change log for what each letter id means).
    bimodal_ids = list('d')
    for id_ in bimodal_ids:
        bimodal = BiModalBaseline(
            'bimodal-' + id_,
            relative_path('models/output/bi2-0410-%s.model' % id_),
            w2vb1.model)
        self.baselines.append(bimodal)
    print 'Starts evaluating...'
    metric_names = ['MRR', 'P@1', 'P@5', 'P@10']
    # results[b_idx, metric_idx] accumulates raw sums; normalized by the
    # query count after the loops.
    results = np.zeros((len(self.baselines), len(metric_names)),
                       dtype=float)
    result_log = []  # for diagnosis
    count_query = 0
    for idx, code in enumerate(self.code_samples):
        # triple-for-loop structure: {code-sample -> gt -> baseline}.
        print 'Processing code sample %d' % (idx + 1)
        # Ground-truth queries are matched to code samples by 1-based
        # sample number.
        current_queries = filter(
            lambda x: int(x.case_study_no) == int(idx + 1), self.queries)
        assert current_queries
        context = self.cb.getContext(code)
        for query in current_queries:
            # "query" = "ground truth"
            count_query += 1
            assert query.answer
            for b_idx, b in enumerate(self.baselines):
                suggested_items = b.suggest(query.query, context)
                # 1-based rank of the expected answer; non-positive means
                # the answer was not found in the suggestions.
                answer_rank = self.getRankOfExpectedItem(
                    suggested_items, code, query.answer)
                mrr_idx = metric_names.index('MRR')
                p1_idx = metric_names.index('P@1')
                p5_idx = metric_names.index('P@5')
                p10_idx = metric_names.index('P@10')
                if answer_rank > 0:
                    # Reciprocal rank; P@k counters are cumulative over k.
                    results[b_idx, mrr_idx] += 1. / answer_rank
                    if answer_rank == 1:
                        results[b_idx, p1_idx] += 1
                    if answer_rank <= 5:
                        results[b_idx, p5_idx] += 1
                    if answer_rank <= 10:
                        results[b_idx, p10_idx] += 1
                self.updateResultLog(result_log, idx + 1, query.query, b,
                                     suggested_items, code, query.answer,
                                     answer_rank)
    assert count_query > 0
    # Normalize every metric by the total number of queries.
    for metric_idx, metric in enumerate(metric_names):
        if metric == 'MRR' or metric.startswith('P@'):
            results[:, metric_idx] /= count_query
    # output
    print 'Writing outputs...'
    with open(relative_path('models/output/eval-result-0413.csv'),
              'wb') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Baseline'] + metric_names)
        for b_idx, b in enumerate(self.baselines):
            writer.writerow([b.__repr__()] + results[b_idx].tolist())
    with open(relative_path('models/output/eval-log-0413.csv'),
              'wb') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(ResultLogEntry._fields)
        for row in result_log:
            writer.writerow(row)
    # close resources
    print 'Closing resources'
    # whoosh_baseline.close()
    print 'Done'
# Dump the question titles of the cached matplotlib StackOverflow threads
# to a tab-separated text file.
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
    bh_dir = relative_path('models/output/backup')
    bh = BackupHandler(bh_dir)
    try:
        threads = bh.load('mpl_threads')
    except AssertionError:
        # Presumably raised when the backup does not exist yet; in that
        # case mine the threads from the posts database and cache them.
        threads = list(
            load_threads(
                qfilter=
                "Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0",
                afilter="Score >= 0 ORDER BY Score DESC LIMIT 3"))
        bh.save('mpl_threads', threads)
    with open(relative_path('models/output/mpl_so_titles.txt'),
              'w') as writer:
        for t in threads:
            # One line per thread: question id <TAB> utf-8 encoded title.
            writer.write('%d\t%s\n' % (t.qid, t.qtitle.encode('utf-8')))
def lemmaWithVocab(token, vocab):
    """
    Reduce a token to a base form, preferring forms present in vocab.

    First try the PORTER stem. If that is not in vocab, try stripping a
    trailing 'ing'. Fall back to plain lemmatization.
    """
    out = pattern.vector.stem(token, stemmer=pattern.vector.PORTER)
    if out in vocab:
        return out
    if token.endswith('ing'):
        # e.g. 'hatching' -> 'hatch'; only accepted if it is in vocab.
        out = token[:-3]
        if out in vocab:
            return out
    return lemma(token)

from codemend import relative_path

# Module-level stopword set: words from the stopword file (comma-separated
# per line, '#' lines are comments) plus all ASCII punctuation.
stopwords = set()
with open(relative_path('models/stopwords-en.txt')) as reader:
    for line in reader:
        if line.startswith('#'):
            continue
        line = line.strip()
        words = line.split(', ')
        stopwords |= set(words)
stopwords |= set(string.punctuation)

if __name__ == '__main__':
    # Quick manual smoke test of lemmaWithVocab.
    s = 'hatching-hatches color" colors interesting flying flies'
    vocab = dict(hatch=1, color=2, interest=3, fly=4)
    tokens = tokenize(s)
    lemmas = [lemmaWithVocab(x, vocab) for x in tokens]
    print lemmas
called. - #3: Not recommending elements that occur too infrequently. - #4: When a function is not used before, and its argv is recommended, we strip the "@", and recommend the function first, followed by the argv. e.g. [pie@0, pie] => [pie, pie@0]. """ from codemend import BackupHandler, relative_path from codemend.demo.code_suggest import get_plot_commands from codemend.models.baseline2 import SuggestItem plot_commands = get_plot_commands() plot_commands_set = set(plot_commands) bh = BackupHandler(relative_path('experimental/code_suggest/output/backup')) elem_counts = bh.load('elem_pyplot_counts_0404') def prune(used_elems, suggest_elems): for elem in used_elems: assert isinstance(elem, basestring) for elem in suggest_elems: assert isinstance(elem, SuggestItem), type(elem) used_elems_set = set(used_elems) used_funcs = map(get_func_name, used_elems) used_funcs_set = set(used_funcs) has_used_plot_commands = any( map(lambda x: x in plot_commands_set, used_funcs))
return is_useful if __name__ == '__main__': counters = defaultdict(int) md5s = set() all_codes = [] fu, _ = get_fu_fau() elem_counts = defaultdict(int) # [elem] = count bh = BackupHandler( relative_path('experimental/code_suggest/output/backup')) for root, dirs, files in os.walk( relative_path('mining/output/github-matplotlib-repos')): if '.git' in root: continue for file_name in files: counters['count_file'] += 1 if counters['count_file'] % 1000 == 0: print 'Processed %d files - Useful files: %d' % ( counters['count_file'], counters['count_useful_files']) file_path = os.path.join(root, file_name)
def transform_and_filter(elem):
    """
    Cleaning is performed to reduce sparsity:
    - pylab.xxx --> plt.xxx (if the function exists in pyplot)
    - various add_subplot.xxx --> plt.gca.xxx (see stype.tsv)
    - only plt.* are kept

    Returns: cleaned elem or None
    """
    elem = enormer.simplify(elem)
    if elem.startswith('plt.'):
        return elem
    else:
        return None

if __name__ == '__main__':
    # Count co-occurring element pairs ("cokes") across all code examples
    # and persist the counts for the co-occurrence baseline.
    coke_counts = defaultdict(int)
    count = 0
    for code in code_examples():
        count += 1
        if count % 1000 == 0:
            # periodic progress report
            print '%d ... unique_cokes=%d' % (count, len(coke_counts))
        for x, y in get_cokes(code):
            coke_counts[x, y] += 1
    bh = BackupHandler(relative_path('models/output/backup'))
    bh.save('coke_0329', coke_counts)
import csv
import pydoc
from itertools import imap
from collections import defaultdict, namedtuple
from recordclass import recordclass
import funcsigs
import string

from codemend import BackupHandler, relative_path
from codemend.models.element import ElementNormalizer
from codemend.docstring_parse.elemdoc import ElemDoc
from codemend.docstring_parse.consolidate import is_setXXX, get_class
from codemend.models.annotate_code_with_api import get_fu_fau

# Load all input sources
bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
# Truncated (fu_t/fau_t) and untruncated (fu/fau) docstring mappings,
# keeping the full module path in function names (omit_module=False).
fu_t, fau_t = get_fu_fau(omit_module=False, truncate=True)
fu, fau = get_fu_fau(omit_module=False, truncate=False)
fa_lookup = defaultdict(list)  # [function] = [argument]
for f, a in fau.keys():
    fa_lookup[f].append(a)
fa_lookup = dict(fa_lookup)
cf_lookup = defaultdict(list)  # [class] = [function]
for f in fu.keys():
    cf_lookup[get_class(f)].append(f)
# Pre-mined pyplot element occurrence counts.
elem_counts = bh.load('elem_pyplot_counts_0404')
enormer = ElementNormalizer()
self.model = BiModal.load(model_file) if isinstance(w2v_model, basestring): w2v_model = load_gensim_from_binary_file(w2v_model) else: assert isinstance(w2v_model, Word2Vec) self.model.w2v_model = w2v_model self.model.syn0l = self.model.w2v_model.syn0 def suggest(self, query, context): used_elems = context.used_elems() scores = self.model.score_all(query, used_elems) elems_sorted = sorted(zip(scores, self.model.all_elems), reverse=True) suggest_sorted = [SuggestItem(elem=elem, score=score) for (score, elem) in elems_sorted] suggest_pruned = prune(used_elems, suggest_sorted) return suggest_pruned[:50] def __repr__(self): return self.name if __name__ == '__main__': from codemend import relative_path bmb = BiModalBaseline('tmp', relative_path('models/output/bi2-test.model'), relative_path('models/output/vectors-flat-mpl-0205.bin') ) print bmb, 'initialized.'
import csv from itertools import imap from collections import defaultdict import string from codemend.docstring_parse.elemdoc import ElemDoc from codemend.docstring_parse.polish import \ create_new_element_doc, fu,fau, fu_t, fau_t, enormer from codemend import relative_path elem_lookup = {} # [elem_id] = elem_doc # Import pre-computed elem docs with open( relative_path('docstring_parse/doc_polished/elem_docs.csv' # this file is generated by docstring_parse/polish.py ), 'rb') as csvfile: reader = csv.reader(csvfile) columns = tuple(next(reader)) assert ElemDoc._fields == columns for elem in imap(ElemDoc._make, reader): elem_id = elem.elem_id elem_lookup[elem_id] = elem # Create children_lookup children_lookup = defaultdict(list) # [parent_elem_id] = [elems] for elem in elem_lookup.values(): if elem.parent_id:
train_pairs.append((merged_utter, astunparse.unparse(call_node))) unique_train_pairs = list(set(train_pairs)) print 'total_block', total_block print 'total_grammatical', total_grammatical print 'total_call_nodes', total_call_nodes print 'total_matched_funcs', total_matched_funcs, '(total train pairs)' print 'total_matched_args', total_matched_args print 'total_unique_train_pairs', len(unique_train_pairs) return unique_train_pairs if __name__ == '__main__': bh = BackupHandler(relative_path('models/output/backup')) # Step 1 fu, fau = get_fu_fau() # Step 2 with open(relative_path('models/output/mpl_code_blocks.txt')) as reader: content = reader.read() content = content.decode('utf-8') content = content.replace("<", "<") content = content.replace(">", ">") content = content.replace("&", "&") blocks = content.split('\n\n\n')
if elem_id.startswith('plt.'): element_pyplot_counts[elem_id] += 1 val = get_countable_value(e.val_node, varmap, enormer) if val: element_pyplot_value_counts[elem_id][val] += 1 for elem_id in element_pyplot_value_counts: element_pyplot_value_counts[elem_id] = dict(element_pyplot_value_counts[elem_id]) element_pyplot_value_counts = dict(element_pyplot_value_counts) print 'Processed %d code examples'%count print 'There are %d unique elements'%len(element_counts) print 'There are %d unique pyplot elements'%len(element_pyplot_counts) for k in counters: print '%s: %d'%(k, counters[k]) bh = BackupHandler(relative_path('experimental/code_suggest/output/backup')) # Change logs: # - 0322: using raw format # - 0327: using Element, tracking return type and variable assignments and # import aliases. # - 0404: fixed issue with dict as positional argument; # added element_value_counts; # added Shiyan's example. bh.save('elem_counts_0404', element_counts) bh.save('elem_pyplot_counts_0404', element_pyplot_counts) bh.save('elem_pyplot_value_counts_0404', element_pyplot_value_counts) """ Log: # 0327
# Train a BiModal model over the mined pyplot elements and save it.
from codemend import BackupHandler, relative_path
from codemend.models.element import ElementNormalizer
from codemend.models.word2vec_util import load_gensim_from_binary_file
from codemend.models.bimodal2 import BiModal
from codemend.experimental.code_suggest.mine_element import code_examples

if __name__ == '__main__':
    bh = BackupHandler(
        relative_path('experimental/code_suggest/output/backup'))
    elem_counts = bh.load('elem_pyplot_counts_0404')
    all_elems = sorted(elem_counts.keys())
    all_elems_counts = elem_counts
    enormer = ElementNormalizer()
    w2v_model = load_gensim_from_binary_file(
        relative_path('models/output/vectors-so-text-python-lemma-win5.bin')
    )  # <-- note the change here!!
    model = BiModal(all_elems, all_elems_counts, w2v_model, code_examples,
                    enormer, threads=None, alpha=0.05, window=5,
                    negative=20, additive=0, multiply=0, concat=1, epoch=1,
                    rand_parent_doc=True, hint_pvecs_init=True,
                    hint_rvecs_init=False, neg_sample_used_elem=False)
    model.save(relative_path('models/output/bi2-0410-t.model'))

# Changes:
# bi2-test -- lastest gold version for user study
# bi2-0410-a -- epoch=10, fixed stopwords (e.g., excluding bar from stopwords) -- this is vanilla
# bi2-0410-b -- epoch=1, quick check if setting is all right.
# bi2-0410-c -- epoch=10, replicating bi2-0410-a
# bi2-0410-d -- epoch=1, randomly with-parent doc
# bi2-0410-e -- epoch=5, randomly with-parent doc
def do_mh_handle_requests(self, params_list): """This callback function is to be used by brain, so that it can take advantage of the worker pool to do multithreaded plotting.""" return plotter_pool.map(mh_handle_request, params_list) def jedi_lock(self): """To be used by brain to protect non-thread-safe jedi.""" return jedi_lock class ThreadedTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer): pass if __name__ == '__main__': os.chdir(relative_path('demo')) if len(sys.argv) > 1 and sys.argv[1] != '-': port = int(sys.argv[1]) else: port = PORT_NUMBER if len(sys.argv) > 2 and sys.argv[2] != '-': host_name = sys.argv[2] else: host_name = HOST_NAME server_util.port_available_or_die(port) plotter_pool = multiprocessing.Pool()