Пример #1
0
def transform_and_filter(elem):
  """
  Normalize an element name, keeping only pyplot elements.

  Cleaning is performed to reduce sparsity:
    - pylab.xxx --> plt.xxx (if the function exists in pyplot)
    - various add_subplot.xxx --> plt.gca.xxx (see stype.tsv)
    - only plt.* are kept

  Returns: cleaned elem or None

  """
  cleaned = enormer.simplify(elem)
  if not cleaned.startswith('plt.'):
    return None
  return cleaned

if __name__ == '__main__':
  coke_counts = defaultdict(int)
  count = 0
  for code in code_examples():
    count += 1
    if count % 1000 == 0:
      print '%d ... unique_cokes=%d'%(count, len(coke_counts))

    for x, y in get_cokes(code):
      coke_counts[x,y] += 1

  bh = BackupHandler(relative_path('models/output/backup'))
  bh.save('coke_0329', coke_counts)
Пример #2
0
  # (3). add children to function, argument
  # Coming from experimental/code_suggest/mine_argvs.py
  for (f, a), v_counts in fa_v_counts.items():
    if not (f, a) in element_index: continue
    for v, count in v_counts.items():
      element_index[f, a, v] = Element(v, '', count, element_index[f, a])

  # (4). Sort all children by count then by value
  for elem in element_index.values():
    elem.children = sorted(elem.children, key=lambda x: (-x.count, x.val))

  print '%d total entries in element index'%len(element_index)

  bh2 = BackupHandler(relative_path('demo/data'))
  bh2.save('element_index', element_index)

  """
There are 15770 code examples in total
Example f counts (plot)
4228
Example fa counts:
{'mfc': 28, 'xlim': 1, 'markeredgewidth': 11, 'markeredgecolor': 20,
'linewidth': 169, 'rot': 4, 'style': 20, 'layout': 1, 'lc': 1, 'title': 14,
'lw': 183, 'ls': 34, 'yerr': 5, 'markersize': 117, 'grid': 1, 'xdata': 1,
'ys': 1, 'rasterized': 3, 'drawstyle': 2, 'x_compat': 2, 'dashes': 3, 'x': 29,
'picker': 13, 'edgecolor': 2, 'table': 3, 'edge_labels': 1, 'whis': 1, 'zs':
11, 'latlon': 3, 'sharey': 1, 'sharex': 2, 'markerfacecolor': 25, 'label':
527, 'colormap': 4, 'mec': 19, 'mew': 19, 'antialiased': 3, 'sym': 1,
'startangle': 1, 'legend': 17, 'c': 112, 's': 2, 'markeresize': 1, 'autopct':
1, 'clip_on': 25, 'color': 526, 'xerr': 2, 'scaley': 1, 'visible': 6,
Пример #3
0
fu, fau = get_fu_fau()
bh = BackupHandler(relative_path('experimental/code_suggest'))
all_codes = bh.load('all_codes')
print 'There are %d code examples in total'%len(all_codes)

pos_sum = defaultdict(float)  # [f] = sum
pos_cnt = defaultdict(int)  # [f] = count
for code in all_codes:
  try:
    node = ast.parse(code)
  except SyntaxError:
    continue
  calls = findCallNodes(node)
  called_funcs = [extractCallComponents(x)[0] for x in calls]
  called_funcs = filter(lambda x: x in fu, called_funcs)
  if len(calls) < 3:
    continue
  for i, f in enumerate(called_funcs):
    pos_sum[f] += float(i) / len(called_funcs)
    pos_cnt[f] += 1

pos_ave = {}
for f in pos_sum:
  pos_ave[f] = pos_sum[f] / pos_cnt[f]

print 'Extracted average positions for %d functions'%len(pos_ave)

bh2 = BackupHandler(relative_path('demo/data'))
bh2.save('pos_ave', pos_ave)
Пример #4
0
                        if 'source' in cell:
                            src = ''.join(cell['source'])
                        else:
                            src = ''.join(cell['input'])
                        if not src:
                            continue
                        try:
                            is_useful |= parseAndCount(src, elem_counts, fu)
                        except SyntaxError:
                            counters['count_not_parseable_cell'] += 1
                            continue
                        notebook_code_cells.append(src)

                if is_useful:
                    counters['count_useful_notebook_file'] += 1

                if notebook_code_cells:
                    all_codes.append('\n'.join(notebook_code_cells))

            else:
                counters['count_bad_suffix'] += 1

            if is_useful:
                counters['count_useful_files'] += 1

    bh.save('elem_counts_0322', elem_counts)
    for cnt_key in sorted(counters.keys()):
        print '%s: %d' % (cnt_key, counters[cnt_key])

    bh.save('all_codes_github_1k_repo_0322', all_codes)
Пример #5
0
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
  backup = BackupHandler(relative_path('models/output/backup'))

  # Load cached threads if available; otherwise mine them and cache.
  try:
    threads = backup.load('mpl_threads')
  except AssertionError:
    q_filter = "Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0"
    a_filter = "Score >= 0 ORDER BY Score DESC LIMIT 3"
    threads = list(load_threads(qfilter=q_filter, afilter=a_filter))
    backup.save('mpl_threads', threads)

  # Dump one "qid<TAB>title" line per thread.
  titles_path = relative_path('models/output/mpl_so_titles.txt')
  with open(titles_path, 'w') as writer:
    for thread in threads:
      writer.write('%d\t%s\n'%(thread.qid, thread.qtitle.encode('utf-8')))
    return unique_train_pairs


if __name__ == '__main__':
    bh = BackupHandler(relative_path('models/output/backup'))

    # Step 1: load function / function-argument utterances.
    fu, fau = get_fu_fau()

    # Step 2: read the mined code blocks.
    with open(relative_path('models/output/mpl_code_blocks.txt')) as reader:
        content = reader.read()

    # Undo the HTML entity escaping (&amp; last so it is not double-decoded).
    content = content.decode('utf-8')
    for escaped, plain in (("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")):
        content = content.replace(escaped, plain)

    # Blocks are separated by a double blank line.
    blocks = content.split('\n\n\n')

    assert len(blocks) > 100

    train_pairs = get_train_pairs(fu, fau, blocks, include_arguments=False)
    bh.save('train_pairs_0204', train_pairs)

    train_pairs_with_args = get_train_pairs(fu, fau, blocks,
                                            include_arguments=True)
    bh.save('train_pairs_0204_with_args', train_pairs_with_args)
Пример #7
0
def transform_and_filter(elem):
    """Simplify an element name and keep only pyplot (plt.*) elements.

    Cleaning is performed to reduce sparsity:
      - pylab.xxx --> plt.xxx (if the function exists in pyplot)
      - various add_subplot.xxx --> plt.gca.xxx (see stype.tsv)
      - only plt.* are kept

    Returns the cleaned elem, or None for non-pyplot elements.
    """
    cleaned = enormer.simplify(elem)
    return cleaned if cleaned.startswith('plt.') else None


if __name__ == '__main__':
    coke_counts = defaultdict(int)
    count = 0
    for code in code_examples():
        count += 1
        if count % 1000 == 0:
            print '%d ... unique_cokes=%d' % (count, len(coke_counts))

        for x, y in get_cokes(code):
            coke_counts[x, y] += 1

    bh = BackupHandler(relative_path('models/output/backup'))
    bh.save('coke_0329', coke_counts)
Пример #8
0
  def __init__(self, w2v_model, all_elem_counts, maxngram=1,
               name=None, use_lemma=True,
               heuristic=False, use_coke=False):
    """
    Build the word2vec-based baseline over code elements.

    w2v_model can be a binary vectors file, or a loaded gensim model instance.

    Args:
      w2v_model: path to a binary word2vec vectors file, or a gensim
        Word2Vec instance.
      all_elem_counts: dict of element -> count; its keys define which
        elements receive vector representations.
      maxngram: stored on the instance; not otherwise used here.
      name: display name; falls back to the model's filename when unset.
      use_lemma: stored on the instance; not otherwise used here.
      heuristic: stored on the instance; not otherwise used here.
      use_coke: if True, load co-occurrence ("coke") counts from backup.

    Raises:
      ValueError: if use_coke is True but the coke backup file is missing.
    """
    self.maxngram = maxngram
    self.name = name
    self.use_lemma = use_lemma
    assert isinstance(all_elem_counts, dict)
    self.all_elem_counts = all_elem_counts
    self.heuristic = heuristic
    self.use_coke = use_coke

    # Accept either a path to a binary vectors file or an in-memory model.
    if isinstance(w2v_model, basestring):
      self.model = load_gensim_from_binary_file(w2v_model)
      self.model.filename = w2v_model.split('/')[-1]
      if not self.name:
        self.name = self.model.filename
    else:
      assert isinstance(w2v_model, Word2Vec)
      self.model = w2v_model
      if not self.name:
        if hasattr(self.model, 'filename'):
          self.name = self.model.filename
    # NOTE(review): if a Word2Vec instance has no `filename` attribute, the
    # `self.model.filename` read below will raise AttributeError — confirm
    # callers always set it.


    self.model.init_sims()  # normalize the vectors

    self.enormer = ElementNormalizer()

    if self.use_coke:
      bh = BackupHandler(relative_path('models/output/backup'))
      coke_file = 'coke_0329'
      if not bh.exists(coke_file):
        raise ValueError('Coke file does not exist: %s'%coke_file)
      self.coke = bh.load(coke_file)

    # The element index (idf weights, element list, reverse lookup and the
    # vector matrix) is cached per model filename.
    print 'Trying to load element indexes from cache ...'
    bh = BackupHandler(relative_path('models/output/backup'))
    elem_index_backup_name = self.model.filename + '_elem_index'
    if bh.exists(elem_index_backup_name):
      self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(elem_index_backup_name)

    else:
      print 'Word2vecBaseline building element indexes...'

      fu, fau = get_fu_fau()
      self.idfs = self.get_idf(fu.values() + fau.values())

      # One bag-of-words vector per element, in sorted element order.
      self.elems = sorted(self.all_elem_counts.keys())
      self.elem_lookup = dict((y,x) for (x,y) in enumerate(self.elems))
      vecs = []
      for e in self.elems:
        u = doc_serve.get_training_doc(e, True)
        v = self.get_bow_representation(u)
        vecs.append(v)
      self.vecmat = np.array(vecs)
      assert self.vecmat.shape == (len(self.elems), self.model.vector_size)

      bh.save(elem_index_backup_name, (self.idfs, self.elems, self.elem_lookup, self.vecmat))

      print 'Finished building indexes.'
Пример #9
0
            if 'source' in cell:
              src = ''.join(cell['source'])
            else:
              src = ''.join(cell['input'])
            if not src:
              continue
            try:
              is_useful |= parseAndCount(src, elem_counts, fu)
            except SyntaxError:
              counters['count_not_parseable_cell'] += 1
              continue
            notebook_code_cells.append(src)

        if is_useful:
          counters['count_useful_notebook_file'] += 1

        if notebook_code_cells:
          all_codes.append('\n'.join(notebook_code_cells))

      else:
        counters['count_bad_suffix'] += 1

      if is_useful:
        counters['count_useful_files'] += 1

  bh.save('elem_counts_0322', elem_counts)
  for cnt_key in sorted(counters.keys()):
    print '%s: %d'%(cnt_key, counters[cnt_key])

  bh.save('all_codes_github_1k_repo_0322', all_codes)
Пример #10
0
    def __init__(self,
                 w2v_model,
                 all_elem_counts,
                 maxngram=1,
                 name=None,
                 use_lemma=True,
                 heuristic=False,
                 use_coke=False):
        """
    Build the word2vec-based baseline over code elements.

    w2v_model can be a binary vectors file, or a loaded gensim model instance.

    Args:
      w2v_model: path to a binary word2vec vectors file, or a gensim
        Word2Vec instance.
      all_elem_counts: dict of element -> count; its keys define which
        elements receive vector representations.
      maxngram: stored on the instance; not otherwise used here.
      name: display name; falls back to the model's filename when unset.
      use_lemma: stored on the instance; not otherwise used here.
      heuristic: stored on the instance; not otherwise used here.
      use_coke: if True, load co-occurrence ("coke") counts from backup.

    Raises:
      ValueError: if use_coke is True but the coke backup file is missing.

    """
        self.maxngram = maxngram
        self.name = name
        self.use_lemma = use_lemma
        assert isinstance(all_elem_counts, dict)
        self.all_elem_counts = all_elem_counts
        self.heuristic = heuristic
        self.use_coke = use_coke

        # Accept either a path to a binary vectors file or an in-memory model.
        if isinstance(w2v_model, basestring):
            self.model = load_gensim_from_binary_file(w2v_model)
            self.model.filename = w2v_model.split('/')[-1]
            if not self.name:
                self.name = self.model.filename
        else:
            assert isinstance(w2v_model, Word2Vec)
            self.model = w2v_model
            if not self.name:
                if hasattr(self.model, 'filename'):
                    self.name = self.model.filename
        # NOTE(review): a Word2Vec instance without a `filename` attribute
        # makes the `self.model.filename` read below raise AttributeError —
        # confirm callers always set it.

        self.model.init_sims()  # normalize the vectors

        self.enormer = ElementNormalizer()

        if self.use_coke:
            bh = BackupHandler(relative_path('models/output/backup'))
            coke_file = 'coke_0329'
            if not bh.exists(coke_file):
                raise ValueError('Coke file does not exist: %s' % coke_file)
            self.coke = bh.load(coke_file)

        # The element index (idf weights, element list, reverse lookup and
        # the vector matrix) is cached per model filename.
        print 'Trying to load element indexes from cache ...'
        bh = BackupHandler(relative_path('models/output/backup'))
        elem_index_backup_name = self.model.filename + '_elem_index'
        if bh.exists(elem_index_backup_name):
            self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(
                elem_index_backup_name)

        else:
            print 'Word2vecBaseline building element indexes...'

            fu, fau = get_fu_fau()
            self.idfs = self.get_idf(fu.values() + fau.values())

            # One bag-of-words vector per element, in sorted element order.
            self.elems = sorted(self.all_elem_counts.keys())
            self.elem_lookup = dict((y, x) for (x, y) in enumerate(self.elems))
            vecs = []
            for e in self.elems:
                u = doc_serve.get_training_doc(e, True)
                v = self.get_bow_representation(u)
                vecs.append(v)
            self.vecmat = np.array(vecs)
            assert self.vecmat.shape == (len(self.elems),
                                         self.model.vector_size)

            bh.save(elem_index_backup_name,
                    (self.idfs, self.elems, self.elem_lookup, self.vecmat))

            print 'Finished building indexes.'
Пример #11
0
  for i in xrange(len(all_codes)):
    if not svgs[i]: continue
    code = all_codes[i].strip()

    if code in seen_code_set:
      # Dedupe
      count_dupe += 1
      continue
    else:
      seen_code_set.add(code)

    node = ast.parse(code)
    calls = findCallNodes(node)
    for call in calls:
      func_name, keywords = extractCallComponents(call)
      if func_name in plot_commands_set:
        examples[func_name].add(i)

  print 'There are %d duplicates'%count_dupe



  print '"Scoring" code examples.'
  # Sorting function: number of chars in the code example
  examples = dict(examples)
  for func, idxs in examples.items():
    examples[func] = sorted(idxs, key=lambda x: get_effective_code_len(all_codes[x]))


  bh.save('plotcommands_examples', examples)
Пример #12
0
        'syntax_errors', 'unsafes', 'timeouts', 'exec_errors', 'nofigures',
        'savefig_errors', 'empty_svgs', 'successes'
    ]
    for name in counter_names:
        counters[name] = shared_counter.Counter(name=name)

    pool = ThreadPool(processes=4)

    # all_codes = all_codes[:1000]  # DEBUG

    svgs = pool.map(partial(run_with_timeout, 3, get_svg, counters), all_codes)

    for counter in counters.values():
        print counter

    bh.save('svgs', svgs)
    bh.save('all_codes', all_codes)

    # LOG:
    # There are 15582 code examples from mpl stackoverflow
    # Restored from ./cookbook_segs.pickle
    # There are 174 code examples from matplotlib cookbook
    # There are 15756 code blocks in total
    # timeouts: 514
    # empty_svgs: 92
    # unsafes: 1420
    # exec_errors: 6165
    # savefig_errors: 68
    # successes: 2582
    # syntax_errors: 3223
    # nofigures: 1691
Пример #13
0
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
    backup = BackupHandler(relative_path('models/output/backup'))

    # Reuse cached threads when present; otherwise mine and cache them.
    try:
        threads = backup.load('mpl_threads')
    except AssertionError:
        question_filter = (
            "Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0")
        answer_filter = "Score >= 0 ORDER BY Score DESC LIMIT 3"
        threads = list(load_threads(qfilter=question_filter,
                                    afilter=answer_filter))
        backup.save('mpl_threads', threads)

    # Emit one "qid<TAB>title" line per thread.
    out_path = relative_path('models/output/mpl_so_titles.txt')
    with open(out_path, 'w') as writer:
        for thread in threads:
            writer.write('%d\t%s\n' % (thread.qid,
                                       thread.qtitle.encode('utf-8')))
Пример #14
0
    # (3). add children to function, argument
    # Coming from experimental/code_suggest/mine_argvs.py
    for (f, a), v_counts in fa_v_counts.items():
        if not (f, a) in element_index: continue
        for v, count in v_counts.items():
            element_index[f, a, v] = Element(v, '', count, element_index[f, a])

    # (4). Sort all children by count then by value
    for elem in element_index.values():
        elem.children = sorted(elem.children, key=lambda x: (-x.count, x.val))

    print '%d total entries in element index' % len(element_index)

    bh2 = BackupHandler(relative_path('demo/data'))
    bh2.save('element_index', element_index)
    """
There are 15770 code examples in total
Example f counts (plot)
4228
Example fa counts:
{'mfc': 28, 'xlim': 1, 'markeredgewidth': 11, 'markeredgecolor': 20,
'linewidth': 169, 'rot': 4, 'style': 20, 'layout': 1, 'lc': 1, 'title': 14,
'lw': 183, 'ls': 34, 'yerr': 5, 'markersize': 117, 'grid': 1, 'xdata': 1,
'ys': 1, 'rasterized': 3, 'drawstyle': 2, 'x_compat': 2, 'dashes': 3, 'x': 29,
'picker': 13, 'edgecolor': 2, 'table': 3, 'edge_labels': 1, 'whis': 1, 'zs':
11, 'latlon': 3, 'sharey': 1, 'sharex': 2, 'markerfacecolor': 25, 'label':
527, 'colormap': 4, 'mec': 19, 'mew': 19, 'antialiased': 3, 'sym': 1,
'startangle': 1, 'legend': 17, 'c': 112, 's': 2, 'markeresize': 1, 'autopct':
1, 'clip_on': 25, 'color': 526, 'xerr': 2, 'scaley': 1, 'visible': 6,
'marker': 191, 'xs': 1, 'markeredecolor': 1, 'transform': 24, 'xticks': 6,
Пример #15
0
    seen_code_set = set()
    count_dupe = 0
    for i in xrange(len(all_codes)):
        if not svgs[i]: continue
        code = all_codes[i].strip()

        if code in seen_code_set:
            # Dedupe
            count_dupe += 1
            continue
        else:
            seen_code_set.add(code)

        node = ast.parse(code)
        calls = findCallNodes(node)
        for call in calls:
            func_name, keywords = extractCallComponents(call)
            if func_name in plot_commands_set:
                examples[func_name].add(i)

    print 'There are %d duplicates' % count_dupe

    print '"Scoring" code examples.'
    # Sorting function: number of chars in the code example
    examples = dict(examples)
    for func, idxs in examples.items():
        examples[func] = sorted(
            idxs, key=lambda x: get_effective_code_len(all_codes[x]))

    bh.save('plotcommands_examples', examples)
Пример #16
0
  print 'Processed %d code examples'%count
  print 'There are %d unique elements'%len(element_counts)
  print 'There are %d unique pyplot elements'%len(element_pyplot_counts)
  for k in counters:
    print '%s: %d'%(k, counters[k])

  bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
  # Change logs:
  # - 0322: using raw format
  # - 0327: using Element, tracking return type and variable assignments and
  #   import aliases.
  # - 0404: fixed issue with dict as positional argument;
  #         added element_value_counts;
  #         added Shiyan's example.
  bh.save('elem_counts_0404', element_counts)
  bh.save('elem_pyplot_counts_0404', element_pyplot_counts)
  bh.save('elem_pyplot_value_counts_0404', element_pyplot_value_counts)

  """
  Log:

  # 0327
  Processed 24502 code examples
  There are 144898 unique elements
  There are 7741 unique pyplot elements
  Saved to /Users/ronxin/Dropbox/git/codemend/codemend/experimental/code_suggest/output/backup/elem_counts_0327.pickle
  Saved to /Users/ronxin/Dropbox/git/codemend/codemend/experimental/code_suggest/output/backup/elem_pyplot_counts_0327.pickle
  synax_error_files: 3223

  # 0404
Пример #17
0
  print 'total_matched_args', total_matched_args
  print 'total_unique_train_pairs', len(unique_train_pairs)

  return unique_train_pairs


if __name__ == '__main__':
  bh = BackupHandler(relative_path('models/output/backup'))

  # Step 1: load function / function-argument utterances.
  fu, fau = get_fu_fau()

  # Step 2: read the mined code blocks.
  with open(relative_path('models/output/mpl_code_blocks.txt')) as reader:
    content = reader.read()

  # Undo HTML entity escaping (&amp; last so it is not double-decoded).
  content = content.decode('utf-8')
  for escaped, plain in (("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")):
    content = content.replace(escaped, plain)

  # Blocks are separated by a double blank line.
  blocks = content.split('\n\n\n')

  assert len(blocks) > 100

  # Save pairs without and with argument information.
  bh.save('train_pairs_0204',
          get_train_pairs(fu, fau, blocks, include_arguments=False))
  bh.save('train_pairs_0204_with_args',
          get_train_pairs(fu, fau, blocks, include_arguments=True))
Пример #18
0
  counters = {}
  counter_names = ['syntax_errors', 'unsafes', 'timeouts', 'exec_errors',
                   'nofigures', 'savefig_errors', 'empty_svgs', 'successes']
  for name in counter_names:
    counters[name] = shared_counter.Counter(name=name)

  pool = ThreadPool(processes=4)

  # all_codes = all_codes[:1000]  # DEBUG

  svgs = pool.map(partial(run_with_timeout, 3, get_svg, counters), all_codes)

  for counter in counters.values():
    print counter

  bh.save('svgs', svgs)
  bh.save('all_codes', all_codes)

  # LOG:
  # There are 15582 code examples from mpl stackoverflow
  # Restored from ./cookbook_segs.pickle
  # There are 174 code examples from matplotlib cookbook
  # There are 15756 code blocks in total
  # timeouts: 514
  # empty_svgs: 92
  # unsafes: 1420
  # exec_errors: 6165
  # savefig_errors: 68
  # successes: 2582
  # syntax_errors: 3223
  # nofigures: 1691