Example #1
    def __init__(self):
        # load simplification mapping
        self.stype_map = {}
        with open(relative_path('docstring_parse/annotation/stype_map.csv'),
                  'rb') as csvfile:
            reader = csv.reader(csvfile)
            for fields in reader:
                assert len(fields) == 2
                assert fields[0]
                assert fields[1]
                self.stype_map[fields[0]] = fields[1]

        # load rtype mapping
        self.rtype_map = {}
        with open(relative_path('docstring_parse/annotation/rtype_map.csv'),
                  'r') as csvfile:
            reader = csv.reader(csvfile)
            for fields in reader:
                assert len(fields) == 2
                assert fields[0]
                assert fields[1]
                fields = map(self.simplify, fields)
                self.rtype_map[fields[0]] = fields[1]

        self.fu, self.fau = get_fu_fau()
Example #2
def get_fu_fau(omit_module=True, truncate=True):
    """
  Do step 1.

  Returns fu, fau
  fu: [func_name] = most_popular_utter
  fau: [func_name, arg] = most_popular_utter

  Parameters
  ----------
  omit_module: if True, the func_name will be the last part only.

  """
    def get_func_name(fullName):
        if omit_module:
            return fullName.split('.')[-1]
        else:
            return fullName

    fu = defaultdict(lambda: defaultdict(int))  # [func_name][utter] = count
    with open(relative_path('docstring_parse/fu.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, u in reader:
            u = u.split('|||')[0]  # see consolidate.py for meaning of |||
            if truncate:
                u = ' '.join(
                    u.split()[:15])  # limit the maximum number of tokens
            fu[get_func_name(f)][u] += 1

    fau = defaultdict(
        lambda: defaultdict(int))  # [func_name, arg][utter] = count
    with open(relative_path('docstring_parse/fau.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, a, u in reader:
            u = u.split('|||')[0]  # see consolidate.py for meaning of |||
            if truncate:
                u = ' '.join(
                    u.split()[:15])  # limit the maximum number of tokens
            fau[get_func_name(f), a][u] += 1

    # consolidate the fu and fau mappings
    for f in fu:
        fu[f] = get_most_popular_lowered(fu[f])
    fu = dict(fu)

    for fa in fau:
        fau[fa] = get_most_popular_lowered(fau[fa])
    fau = dict(fau)

    return fu, fau
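
A minimal usage sketch of get_fu_fau() (the 'bar'/'color' keys below are illustrative, not taken from the real fu.csv/fau.csv):

    fu, fau = get_fu_fau()
    print fu.get('bar')              # most popular utterance for bar()
    print fau.get(('bar', 'color'))  # fau is keyed by (function, argument) tuples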
Example #3
def get_fu_fau(omit_module=True, truncate=True):
  """
  Do step 1.

  Returns fu, fau
  fu: [func_name] = most_popular_utter
  fau: [func_name, arg] = most_popular_utter

  Parameters
  ----------
  omit_module: if True, the func_name will be the last part only.
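  truncate: if True, each utterance is truncated to at most 15 tokens.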

  """

  def get_func_name(fullName):
    if omit_module:
      return fullName.split('.')[-1]
    else:
      return fullName


  fu = defaultdict(lambda: defaultdict(int))  # [func_name][utter] = count
  with open(relative_path('docstring_parse/fu.csv'), 'rb') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header
    for f, u in reader:
      u = u.split('|||')[0]  # see consolidate.py for meaning of |||
      if truncate: u = ' '.join(u.split()[:15])  # limit the maximum number of tokens
      fu[get_func_name(f)][u] += 1

  fau = defaultdict(lambda: defaultdict(int))  # [func_name, arg][utter] = count
  with open(relative_path('docstring_parse/fau.csv'), 'rb') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header
    for f, a, u in reader:
      u = u.split('|||')[0]  # see consolidate.py for meaning of |||
      if truncate: u = ' '.join(u.split()[:15])  # limit the maximum number of tokens
      fau[get_func_name(f), a][u] += 1

  # consolidate the fu and fau mappings
  for f in fu:
    fu[f] = get_most_popular_lowered(fu[f])
  fu = dict(fu)

  for fa in fau:
    fau[fa] = get_most_popular_lowered(fau[fa])
  fau = dict(fau)

  return fu, fau
Example #4
    def __init__(self):
        bh = BackupHandler(
            relative_path('experimental/code_suggest/output/backup'))
        elem_counts = bh.load('elem_pyplot_counts_0404')
        self.all_elems = set(elem_counts.keys())
        self.all_elem_counts = elem_counts
        self.enormer = ElementNormalizer()
Example #5
    def __init__(self):
        plot_commands = get_plot_commands()
        pyplot_fu = get_pyplot_fu()
        self.plot_commands = plot_commands
        self.nonplot_commands = [
            f for f in pyplot_fu.keys() if not f in plot_commands
        ]
        print 'CodeSuggest: extracted %d plot commands' % len(plot_commands)

        # Load all code examples of plotting commands from db into memory
        # These are generated by index_examples.py
        print 'CodeSuggest: Loading code examples and pregenerated SVGs...'
        db = sqlite3.connect(relative_path('demo/data/code.sqlite3'))
        cursor = db.cursor()
        cursor.execute("SELECT func_id, code, svg FROM example")
        code_example_lookup = {}  # [func_id] = [(code, svg)]
        count_code_examples = 0
        for func_id, code, svg in cursor.fetchall():
            count_code_examples += 1
            if not func_id in code_example_lookup:
                code_example_lookup[func_id] = []
            code_example_lookup[func_id].append({'code': code, 'svg': svg})
        # Sort it again
        for func_id in code_example_lookup:
            code_example_lookup[func_id] = sorted(
                code_example_lookup[func_id],
                key=lambda x: get_effective_code_len(x['code']))
        self.code_example_lookup = code_example_lookup
        print 'CodeSuggest: Loaded %d code examples (with svgs)...' % count_code_examples
        db.close()

        # Load element_index generated by experimental/code_suggest/mine_argvs.py
        # bh = BackupHandler(relative_path('demo/data'))
        # self.element_index = bh.load('element_index')
        # print 'Loaded element_index with %d keys'%len(self.element_index)

        # Load element value counts
        bh2 = BackupHandler(
            relative_path('experimental/code_suggest/output/backup'))
        self.elem_val_counts = bh2.load(
            'elem_pyplot_value_counts_0404')  # [elem][val] = count
        for elem_id in self.elem_val_counts:
            self.elem_val_counts[elem_id] = sorted(
                self.elem_val_counts[elem_id].items(), key=lambda x: -x[1])

        self.func_position_finder = FuncPositionFinder()
Example #6
  def handleRequest(self, params, server_handle=None):
    request_type = ''

    try:
      experiment_mode = 'no_mode'
      try:
        with open(relative_path('demo/log-mode.txt')) as reader:
          experiment_mode = reader.read().strip()
      except:
        pass

      request_type = params['type'] if 'type' in params else ''
      ret = {'type': request_type}

      code = params['code'] if 'code' in params else ''
      query = params['query'].strip().lower() if 'query' in params else ''

      self.current_code = code
      self.prepare_current_code()

      control_group = False
      if experiment_mode[0] == 'g':
        control_group = True

      if request_type == 'nlp':
        if not control_group:
          matches = self.get_matches(query)
          ret['matches'] = matches

      elif request_type in ('summary', 'suggest'):
        if not control_group:
          cursor_line = int(params['cursor_line']) + 1
          cursor_ch = int(params['cursor_ch'])
          if request_type == 'summary':
            ret['summary_groups'] = self.cs.get_summary(query, code, cursor_line, cursor_ch, self)
          else:
            elem_id = params['elem_id']
            ret['suggest'] = self.cs.get_suggest(query, code, cursor_line, cursor_ch, self, elem_id)

      elif request_type == 'google':
        pass

      elif request_type == 'experiment_mode':
        ret['mode'] = experiment_mode

      else:
        raise ValueError('Unrecognized request type: "%s"'%request_type)

      return ret

    except SyntaxError:
      return {'error': 'syntax error', 'type': request_type}
    except Exception as e:
      print '\n\nBrain Error:'
      print traceback.format_exc() + '\n\n'
      return {'error': server_util.pack_exception_for_html(e, 'Brain Error'),
              'type': request_type
             }
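
A hedged call sketch (handler stands in for an instance of the class above; the params keys mirror the reads in handleRequest, and 'nlp' is one of the accepted request types):

    ret = handler.handleRequest({'type': 'nlp',
                                 'query': 'add legend',
                                 'code': 'import matplotlib.pyplot as plt'})
    print ret.get('matches')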
Example #7
def main():
  from codemend import relative_path
  bh = BackupHandler(relative_path('utils/output/backup'))
  a = [1,2,3,{4:[5,6]}]
  bh.save('backup_test_a', a)
  assert bh.exists('backup_test_a')
  b = bh.load('backup_test_a')
  assert a == b
  print 'All test passed.'
Example #8
def main():
    from codemend import relative_path
    bh = BackupHandler(relative_path('utils/output/backup'))
    a = [1, 2, 3, {4: [5, 6]}]
    bh.save('backup_test_a', a)
    assert bh.exists('backup_test_a')
    b = bh.load('backup_test_a')
    assert a == b
    print 'All test passed.'
Example #9
  def __init__(self):
    plot_commands = get_plot_commands()
    pyplot_fu = get_pyplot_fu()
    self.plot_commands = plot_commands
    self.nonplot_commands = [f for f in pyplot_fu.keys() if not f in plot_commands]
    print 'CodeSuggest: extracted %d plot commands'%len(plot_commands)

    # Load all code examples of plotting commands from db into memory
    # These are generated by index_examples.py
    print 'CodeSuggest: Loading code examples and pregenerated SVGs...'
    db = sqlite3.connect(relative_path('demo/data/code.sqlite3'))
    cursor = db.cursor()
    cursor.execute("SELECT func_id, code, svg FROM example")
    code_example_lookup = {}  # [func_id] = [(code, svg)]
    count_code_examples = 0
    for func_id, code, svg in cursor.fetchall():
      count_code_examples += 1
      if not func_id in code_example_lookup:
        code_example_lookup[func_id] = []
      code_example_lookup[func_id].append({'code': code, 'svg':svg})
    # Sort it again
    for func_id in code_example_lookup:
      code_example_lookup[func_id] = sorted(
        code_example_lookup[func_id],
        key=lambda x:get_effective_code_len(x['code']))
    self.code_example_lookup = code_example_lookup
    print 'CodeSuggest: Loaded %d code examples (with svgs)...'%count_code_examples
    db.close()

    # Load element_index generated by experimental/code_suggest/mine_argvs.py
    # bh = BackupHandler(relative_path('demo/data'))
    # self.element_index = bh.load('element_index')
    # print 'Loaded element_index with %d keys'%len(self.element_index)

    # Load element value counts
    bh2 = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
    self.elem_val_counts = bh2.load('elem_pyplot_value_counts_0404')  # [elem][val] = count
    for elem_id in self.elem_val_counts:
      self.elem_val_counts[elem_id] = sorted(
        self.elem_val_counts[elem_id].items(), key=lambda x:-x[1])

    self.func_position_finder = FuncPositionFinder()
Example #10
def code_examples():
  """
  Yield code examples.

  """

  global all_codes1, all_codes2, all_codes3

  # 15770 code examples mined from SO answers in threads that are tagged
  # "matplotlib".
  if not all_codes1:
    print 'Loading SO code examples...'
    bh1 = BackupHandler(relative_path('experimental/code_suggest'))
    all_codes1 = bh1.load('all_codes')
    print '%d examples from SO'%len(all_codes1)

  for code in all_codes1:
    yield code

  # print 'WARNING: mine_element.py ignoring all GitHub code examples...'
  # """
  if not all_codes2:
    # 8732 code examples (including 395 IPython Notebook files) mined from
    # GitHub repositories that contain "matplotlib".
    print 'Loading GitHub code examples...'
    bh2 = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
    all_codes2 = bh2.load('all_codes_github_1k_repo_0322')
    print '%d examples from GitHub'%len(all_codes2)

  for code in all_codes2:
    yield code
  # """

  if not all_codes3:
    # 21993 code examples extracted by Shiyan from the Web
    print 'Loading Web code examples'
    bh3 = BackupHandler(relative_path('experimental/mining/output'))
    all_codes3 = bh3.load('codes_shiyan_0331_web')
    print '%d examples from Web Shiyan'%len(all_codes3)

  for code in all_codes3:
    yield code
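
code_examples() is a generator that lazily loads the three backups on first use, so a consumer can stream it without holding everything in memory (the cap of 5 below is arbitrary):

    for i, code in enumerate(code_examples()):
        if i >= 5:
            break
        print code[:80]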
Example #11
  def __init__(self):
    print 'Brain initializing ...'
    self.fu, self.fau = get_fu_fau()

    self.cb = ContextBuilder()
    self.w2v = load_gensim_from_binary_file(
      relative_path('models/output/vectors-so-text-python-lemma.bin'))

    self.current_code = None
    self.current_code_hash = None

    self.bimodal = BiModalBaseline('bimodal',
      relative_path('models/output/bi2-0410-d.model'),
      self.w2v)

    self.cs = CodeSuggest()

    self.gws_cache = {}

    print 'Brain intialized.'
Example #12
def load_model(model_id=None):
  # model_file_name = 'models/output/bi2-test.model'
  model_file_name = 'models/output/bi2-0410-d.model'
  if model_id:
    print 'Using customized model_id'
    model_file_name = 'models/output/bi2-0410' + model_id + '.model'

  w2v_model_file = 'vectors-so-text-python-lemma.bin'
  if model_id == '-s':
    w2v_model_file = 'vectors-so-text-python-lemma-win3.bin'
  elif model_id == '-t':
    w2v_model_file = 'vectors-so-text-python-lemma-win5.bin'
  w2v_model = load_gensim_from_binary_file(
    relative_path('models/output/' + w2v_model_file))

  model = BiModal.load(relative_path(model_file_name))
  print "@@@ PLEASE CHECK WHICH FILE IS BEING TESTED ... @@@"
  print '@@@ MODEL_FILE: %s @@@'%model_file_name

  model.w2v_model = w2v_model
  model.syn0l = w2v_model.syn0
  return w2v_model, model
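
A usage sketch ('-s' is one of the model ids handled above; whether the corresponding model file exists depends on the local setup):

    w2v_model, model = load_model('-s')
    # model.w2v_model and model.syn0l are now wired to the word vectors.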
Example #13
def get_pyplot_funcs():
  # used by other part as well
  with open(relative_path('lib/matplotlib/pyplot.py')) as reader:
    pyplot_src = reader.read()
  pyplot_funcs = []
  for line in pyplot_src.split('\n'):
    line = line.strip()
    if line.startswith('def '):
      line = line[len('def '):]
      fields = line.split('(')
      func_name = fields[0]
      pyplot_funcs.append(func_name)
  return pyplot_funcs
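
A quick sanity check of the scraper above (assuming a matplotlib checkout is visible at lib/matplotlib/pyplot.py, as relative_path requires):

    funcs = get_pyplot_funcs()
    print '%d pyplot functions found' % len(funcs)
    assert 'plot' in funcs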
Example #14
def get_pyplot_funcs():
    # used by other part as well
    with open(relative_path('lib/matplotlib/pyplot.py')) as reader:
        pyplot_src = reader.read()
    pyplot_funcs = []
    for line in pyplot_src.split('\n'):
        line = line.strip()
        if line.startswith('def '):
            line = line[len('def '):]
            fields = line.split('(')
            func_name = fields[0]
            pyplot_funcs.append(func_name)
    return pyplot_funcs
Example #15
  def __init__(self):
    # load simplification mapping
    self.stype_map = {}
    with open(relative_path('docstring_parse/annotation/stype_map.csv'), 'rb') as csvfile:
      reader = csv.reader(csvfile)
      for fields in reader:
        assert len(fields) == 2
        assert fields[0]
        assert fields[1]
        self.stype_map[fields[0]] = fields[1]

    # load rtype mapping
    self.rtype_map = {}
    with open(relative_path('docstring_parse/annotation/rtype_map.csv'), 'r') as csvfile:
      reader = csv.reader(csvfile)
      for fields in reader:
        assert len(fields) == 2
        assert fields[0]
        assert fields[1]
        fields = map(self.simplify, fields)
        self.rtype_map[fields[0]] = fields[1]

    self.fu, self.fau = get_fu_fau()
Example #16
def get_pyplot_fu():
    # Load csv file of pyplot summary
    pyplot_fu = {}  # [func] = utter
    print 'CodeSuggest: Loading pyplot fu...'
    with open(relative_path('docstring_parse/pyplot_fu.csv'), 'rb') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header
        for f, u in reader:
            if not u:
                continue
            pyplot_fu[f] = u
    print 'CodeSuggest: read %d fu pairs' % len(pyplot_fu)

    # lowercase and tokenization of u's
    for f in pyplot_fu:
        pyplot_fu[f] = ' '.join(pattern.en.tokenize(pyplot_fu[f].lower()))
    return pyplot_fu
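
Usage sketch (the 'bar' key is illustrative):

    pyplot_fu = get_pyplot_fu()
    print pyplot_fu.get('bar')  # lowercased, tokenized one-line summary of bar()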
Example #17
def get_pyplot_fu():
  # Load csv file of pyplot summary
  pyplot_fu = {}  # [func] = utter
  print 'CodeSuggest: Loading pyplot fu...'
  with open(relative_path('docstring_parse/pyplot_fu.csv'), 'rb') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header
    for f, u in reader:
      if not u:
        continue
      pyplot_fu[f] = u
  print 'CodeSuggest: read %d fu pairs'%len(pyplot_fu)

  # lowercase and tokenization of u's
  for f in pyplot_fu:
    pyplot_fu[f] = ' '.join(pattern.en.tokenize(pyplot_fu[f].lower()))
  return pyplot_fu
Example #18
def anything_to_used_elems(anything):
  if isinstance(anything, basestring):
    filename = {
      'bar': relative_path('demo/code-samples/user-study/task1.py'),
      'pie': relative_path('demo/code-samples/user-study/task2.py'),
      'line': relative_path('demo/code-samples/user-study/practice.py'),
      'empty': relative_path('demo/code-samples/empty.py'),
      'eval3': relative_path('demo/code-samples/eval3.py'),
      'line_video': relative_path('demo/code-samples/demo_video_linechart.py')
    }[anything]
    with open(filename) as reader:
      code = reader.read()
    if not code.strip(): return []
    node = ast.parse(code)
    ast_utils.mark_text_ranges(node, unicode(code))
    context = cb.getContext(node)
    return context.used_elems()
  elif isinstance(anything, list):
    return anything
  else:
    raise TypeError(type(anything))
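
The function accepts either a preset name or an already-built element list; a sketch of both call forms (the outputs depend on the sample files being present):

    print anything_to_used_elems(['plt.plot'])  # lists pass through unchanged
    print anything_to_used_elems('bar')         # parses task1.py, extracts used elements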
Example #19
def lemmaWithVocab(token, vocab):
  """
  First try PORTER. If not in vocab, then lemma.

  """
  out = pattern.vector.stem(token, stemmer=pattern.vector.PORTER)
  if out in vocab: return out
  if token.endswith('ing'): out = token[:-3]
  if out in vocab: return out
  return lemma(token)

from codemend import relative_path

stopwords = set()
with open(relative_path('models/stopwords-en.txt')) as reader:
  for line in reader:
    if line.startswith('#'): continue
    line = line.strip()
    words = line.split(', ')
    stopwords |= set(words)
  stopwords |= set(string.punctuation)


if __name__ == '__main__':
  s = 'hatching-hatches color" colors interesting flying flies'
  vocab = dict(hatch=1, color=2, interest=3, fly=4)
  tokens = tokenize(s)
  lemmas = [lemmaWithVocab(x, vocab) for x in tokens]
  print lemmas
Example #20
            train_pairs.append((merged_utter, astunparse.unparse(call_node)))

    unique_train_pairs = list(set(train_pairs))

    print 'total_block', total_block
    print 'total_grammatical', total_grammatical
    print 'total_call_nodes', total_call_nodes
    print 'total_matched_funcs', total_matched_funcs, '(total train pairs)'
    print 'total_matched_args', total_matched_args
    print 'total_unique_train_pairs', len(unique_train_pairs)

    return unique_train_pairs


if __name__ == '__main__':
    bh = BackupHandler(relative_path('models/output/backup'))

    # Step 1
    fu, fau = get_fu_fau()

    # Step 2
    with open(relative_path('models/output/mpl_code_blocks.txt')) as reader:
        content = reader.read()

    content = content.decode('utf-8')
    content = content.replace("&lt;", "<")
    content = content.replace("&gt;", ">")
    content = content.replace("&amp;", "&")

    blocks = content.split('\n\n\n')
Example #21
   this code
3. take average per function

Output:
 - a dictionary: [function] = average_position
   average position: between 0 (beginning of code) and 1 (end of code).
"""

import ast
from collections import defaultdict

from codemend import BackupHandler, relative_path
from codemend.models.annotate_code_with_api import get_fu_fau, findCallNodes, extractCallComponents

fu, fau = get_fu_fau()
bh = BackupHandler(relative_path('experimental/code_suggest'))
all_codes = bh.load('all_codes')
print 'There are %d code examples in total'%len(all_codes)

pos_sum = defaultdict(float)  # [f] = sum
pos_cnt = defaultdict(int)  # [f] = count
for code in all_codes:
  try:
    node = ast.parse(code)
  except SyntaxError:
    continue
  calls = findCallNodes(node)
  called_funcs = [extractCallComponents(x)[0] for x in calls]
  called_funcs = filter(lambda x: x in fu, called_funcs)
  if len(calls) < 3:
    continue
Example #22
    def __init__(self):
        bh = BackupHandler(relative_path('demo/data'))
        self.pos_ave = bh.load('pos_ave')
        print 'FuncPositionFinder: loaded %d average positions for functions' % len(
            self.pos_ave)
Example #23
import csv
import pydoc
from itertools import imap
from collections import defaultdict, namedtuple
from recordclass import recordclass
import funcsigs
import string

from codemend import BackupHandler, relative_path
from codemend.models.element import ElementNormalizer
from codemend.docstring_parse.elemdoc import ElemDoc
from codemend.docstring_parse.consolidate import is_setXXX, get_class
from codemend.models.annotate_code_with_api import get_fu_fau

# Load all input sources
bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))

fu_t, fau_t = get_fu_fau(omit_module=False, truncate=True)
fu, fau = get_fu_fau(omit_module=False, truncate=False)

fa_lookup = defaultdict(list)  # [function] = [argument]
for f, a in fau.keys():
    fa_lookup[f].append(a)
fa_lookup = dict(fa_lookup)

cf_lookup = defaultdict(list)  # [class] = [function]
for f in fu.keys():
    cf_lookup[get_class(f)].append(f)

elem_counts = bh.load('elem_pyplot_counts_0404')
enormer = ElementNormalizer()
Example #24
    """

  def do_mh_handle_requests(self, params_list):
    """This callback function is to be used by brain, so that it can take
    advantage of the worker pool to do multithreaded plotting."""
    return plotter_pool.map(mh_handle_request, params_list)

  def jedi_lock(self):
    """To be used by brain to protect non-thread-safe jedi."""
    return jedi_lock

class ThreadedTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer):
  pass

if __name__ == '__main__':
  os.chdir(relative_path('demo'))

  if len(sys.argv) > 1 and sys.argv[1] != '-':
    port = int(sys.argv[1])
  else:
    port = PORT_NUMBER

  if len(sys.argv) > 2 and sys.argv[2] != '-':
    host_name = sys.argv[2]
  else:
    host_name = HOST_NAME


  server_util.port_available_or_die(port)

  plotter_pool = multiprocessing.Pool()
Example #25
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
  bh_dir = relative_path('models/output/backup')
  bh = BackupHandler(bh_dir)

  try:
    threads = bh.load('mpl_threads')
  except AssertionError:
    threads = list(load_threads(
      qfilter="Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0",
      afilter="Score >= 0 ORDER BY Score DESC LIMIT 3"))
    bh.save('mpl_threads', threads)

  with open(relative_path('models/output/mpl_so_titles.txt'), 'w') as writer:
    for t in threads:
      writer.write('%d\t%s\n'%(t.qid, t.qtitle.encode('utf-8')))
Example #26
import re

from codemend import relative_path

with open(relative_path('models/output/mpl_so_titles.txt')) as reader:
    id_titles = [
        x.split('\t') for x in reader.read().decode('utf-8').split('\n')
    ]

print 'There are %d threads in total' % len(id_titles)

count = 0
how_prefixes = [
    'how can i', 'how do i', 'how do you', 'how would one', 'how to',
    'how should i', 'how do we', 'how is it possible to', 'how does one',
    'how i can', 'how could i', 'how can one', 'how we can', 'how can you',
    'do i have to', 'how i', 'how do', 'how would you', 'how would i',
    'how would', 'how should', 'how can', 'how are', 'how / where to', 'how'
]  # order is important
goals = []
for x in id_titles:
    if not len(x) == 2: continue
    id_, title = x
    title = title.lower()
    if title.startswith('how'):
        for hp in how_prefixes:
            title = title.replace(hp, '').strip()
        goals.append((id_, title))

print 'There are %d how-questions' % len(goals)
Example #27
  def __init__(self, w2v_model, all_elem_counts, maxngram=1,
               name=None, use_lemma=True,
               heuristic=False, use_coke=False):
    """
    w2v_model can be a binary vectors file, or a loaded gensim model instance.

    """
    self.maxngram = maxngram
    self.name = name
    self.use_lemma = use_lemma
    assert isinstance(all_elem_counts, dict)
    self.all_elem_counts = all_elem_counts
    self.heuristic = heuristic
    self.use_coke = use_coke

    if isinstance(w2v_model, basestring):
      self.model = load_gensim_from_binary_file(w2v_model)
      self.model.filename = w2v_model.split('/')[-1]
      if not self.name:
        self.name = self.model.filename
    else:
      assert isinstance(w2v_model, Word2Vec)
      self.model = w2v_model
      if not self.name:
        if hasattr(self.model, 'filename'):
          self.name = self.model.filename


    self.model.init_sims()  # normalize the vectors

    self.enormer = ElementNormalizer()

    if self.use_coke:
      bh = BackupHandler(relative_path('models/output/backup'))
      coke_file = 'coke_0329'
      if not bh.exists(coke_file):
        raise ValueError('Coke file does not exist: %s'%coke_file)
      self.coke = bh.load(coke_file)

    print 'Trying to load element indexes from cache ...'
    bh = BackupHandler(relative_path('models/output/backup'))
    elem_index_backup_name = self.model.filename + '_elem_index'
    if bh.exists(elem_index_backup_name):
      self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(elem_index_backup_name)

    else:
      print 'Word2vecBaseline building element indexes...'

      fu, fau = get_fu_fau()
      self.idfs = self.get_idf(fu.values() + fau.values())

      self.elems = sorted(self.all_elem_counts.keys())
      self.elem_lookup = dict((y,x) for (x,y) in enumerate(self.elems))
      vecs = []
      for e in self.elems:
        u = doc_serve.get_training_doc(e, True)
        v = self.get_bow_representation(u)
        vecs.append(v)
      self.vecmat = np.array(vecs)
      assert self.vecmat.shape == (len(self.elems), self.model.vector_size)

      bh.save(elem_index_backup_name, (self.idfs, self.elems, self.elem_lookup, self.vecmat))

      print 'Finished building indexes.'
Example #28
  ('plt.grid', 'add grid lines', 'bar'),
  ('plt.grid', 'add grid', 'bar'),
  ('plt.grid', 'add grids', 'bar'),
  ('plt.grid', 'add gridlines', 'bar'),
  ('plt.plot@linewidth', 'thickness', 'line'),
  ('plt.plot@linewidth', 'line thickness', 'line'),
  ('plt.plot@linewidth', 'thick', 'line'),
  ('plt.plot@linewidth', 'wide', 'line'),
  # ('plt.xticks@rotation', 'change the style of x-axis label', 'bar'),
  ('plt.xkcd', 'fancy style', 'line'),
]

gt_set = set(small_gt2)

small_gt3 = []
with open(relative_path('models/data/gt-0924.csv'), 'rb') as csvfile:
  reader = csv.reader(csvfile)
  next(reader, None)  # skip the header
  for file_,query,expected,remark in reader:
    combined = (expected, query, file_)
    if combined not in gt_set:
      small_gt3.append(combined)
    else:
      print 'duplicated: %s %s %s'%combined

cb = ContextBuilder()

def anything_to_used_elems(anything):
  if isinstance(anything, basestring):
    filename = {
      'bar': relative_path('demo/code-samples/user-study/task1.py'),
Example #29
"""
Generates default_varmap.py.

Needs to be run only once. Needs to see a matplotlib repository.

"""

from codemend import relative_path

from element import get_pyplot_funcs
from element_extract import extract_varmap_elems

if __name__ == '__main__':
  with open(relative_path('lib/matplotlib/pyplot.py')) as reader:
    pyplot_src = reader.read()

  with open(relative_path('lib/matplotlib/pylab.py')) as reader:
    pylab_src = reader.read()

  var_map1, _ = extract_varmap_elems(pyplot_src, False, True)
  print len(var_map1)

  var_map2, _ = extract_varmap_elems(pylab_src, False, True)
  print len(var_map2)

  lines = []
  pyplot_funcs = get_pyplot_funcs()
  for f in pyplot_funcs:
    lines.append('from matplotlib.pyplot import %s'%f)

  fake_pyplot_src = '\n'.join(lines)
Example #30
      elem_counts[f,a] += 1
      elem_counts[f,a,v] += 1
  return is_useful

if __name__ == '__main__':
  counters = defaultdict(int)

  md5s = set()

  all_codes = []

  fu, _ = get_fu_fau()

  elem_counts = defaultdict(int)  # [elem] = count

  bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))

  for root, dirs, files in os.walk(
      relative_path('mining/output/github-matplotlib-repos')):

    if '.git' in root: continue

    for file_name in files:
      counters['count_file'] += 1

      if counters['count_file'] % 1000 == 0:
        print 'Processed %d files - Useful files: %d'%(
          counters['count_file'], counters['count_useful_files'])

      file_path = os.path.join(root, file_name)
Example #31
"""
Generates default_varmap.py.

Needs to be run only once. Needs to see a matplotlib repository.

"""

from codemend import relative_path

from element import get_pyplot_funcs
from element_extract import extract_varmap_elems

if __name__ == '__main__':
    with open(relative_path('lib/matplotlib/pyplot.py')) as reader:
        pyplot_src = reader.read()

    with open(relative_path('lib/matplotlib/pylab.py')) as reader:
        pylab_src = reader.read()

    var_map1, _ = extract_varmap_elems(pyplot_src, False, True)
    print len(var_map1)

    var_map2, _ = extract_varmap_elems(pylab_src, False, True)
    print len(var_map2)

    lines = []
    pyplot_funcs = get_pyplot_funcs()
    for f in pyplot_funcs:
        lines.append('from matplotlib.pyplot import %s' % f)

    fake_pyplot_src = '\n'.join(lines)
Example #32
import csv
from itertools import imap
from collections import defaultdict
import string

from codemend.docstring_parse.elemdoc import ElemDoc
from codemend.docstring_parse.polish import \
    create_new_element_doc, fu, fau, fu_t, fau_t, enormer
from codemend import relative_path

elem_lookup = {}  # [elem_id] = elem_doc

# Import pre-computed elem docs
with open(relative_path(
      'docstring_parse/doc_polished/elem_docs.csv'
      # this file is generated by docstring_parse/polish.py
      ), 'rb') as csvfile:
  reader = csv.reader(csvfile)

  columns = tuple(next(reader))
  assert ElemDoc._fields == columns

  for elem in imap(ElemDoc._make, reader):
    elem_id = elem.elem_id
    elem_lookup[elem_id] = elem

# Create children_lookup
children_lookup = defaultdict(list)  # [parent_elem_id] = [elems]
for elem in elem_lookup.values():
  if elem.parent_id:
    children_lookup[elem.parent_id].append(elem)
Example #33
  def __init__(self):
    bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
    elem_counts = bh.load('elem_pyplot_counts_0404')
    self.all_elems = set(elem_counts.keys())
    self.all_elem_counts = elem_counts
    self.enormer = ElementNormalizer()
Example #34
from codemend import BackupHandler, relative_path
from codemend.models.element import ElementNormalizer
from codemend.models.word2vec_util import load_gensim_from_binary_file
from codemend.models.bimodal2 import BiModal
from codemend.experimental.code_suggest.mine_element import code_examples

if __name__ == '__main__':
    bh = BackupHandler(
        relative_path('experimental/code_suggest/output/backup'))
    elem_counts = bh.load('elem_pyplot_counts_0404')
    all_elems = sorted(elem_counts.keys())
    all_elems_counts = elem_counts
    enormer = ElementNormalizer()
    w2v_model = load_gensim_from_binary_file(
        relative_path('models/output/vectors-so-text-python-lemma-win5.bin')
    )  # <-- note the change here!!

    model = BiModal(all_elems,
                    all_elems_counts,
                    w2v_model,
                    code_examples,
                    enormer,
                    threads=None,
                    alpha=0.05,
                    window=5,
                    negative=20,
                    additive=0,
                    multiply=0,
                    concat=1,
                    epoch=1,
                    rand_parent_doc=True,
Example #35
  def __init__(self):
    self.expected_set_cache = {}

    self.queries = []
    # load the ground-truth file
    with open('eval-gt-2.csv', 'rb') as csv_file:
      reader = csv.reader(csv_file)
      # columns (per the attribute accesses below):
      # case_study_no, answer, query, query_source
      Query = recordclass('Query', next(reader))
      for query in imap(Query._make, reader):
        assert query.case_study_no.startswith('example-')
        query.case_study_no = int(query.case_study_no.replace('example-', ''))
        query.answer = query.answer.strip()
        query.query = query.query.strip()
        query.query_source = query.query_source.strip()
        self.queries.append(query)

    print 'Loading the code samples...'
    self.code_samples = []
    fnames = [relative_path('demo/code-samples/before_afters/before%d.py'%x)
              for x in [1,2,3,4,5]]
    for f in fnames:
      with open(f) as reader:
        code = reader.read().strip()
        self.code_samples.append(code)

    print 'Initializing context builder...'
    self.cb = ContextBuilder()

    print 'Initializing element normalizer...'
    self.enormer = ElementNormalizer()

    print 'Instantiating baselines...'
    self.baselines = []
    self.baselines.append(RandomBaseline(self.cb.getAllElements()))

    w2vb1 = Word2vecBaseline(
        relative_path('models/output/vectors-so-text-python-lemma.bin'),
        self.cb.getAllElementCounts(), 1, 'w2v')

    w2vb2 = Word2vecBaseline(w2vb1.model,
        self.cb.getAllElementCounts(), 1, 'w2v-heuristic', heuristic=True)

    w2vb3 = Word2vecBaseline(w2vb1.model,
        self.cb.getAllElementCounts(), 1, 'w2v-cooccur', use_coke=True)

    w2vb4 = Word2vecBaseline(w2vb1.model,
        self.cb.getAllElementCounts(), 1, 'w2v-hc', heuristic=True, use_coke=True)

    self.baselines += [w2vb1, w2vb2, w2vb3, w2vb4]

    # bimodal = BiModalBaseline('bimodal-concat-10epoch',
    #     relative_path('models/output/bi2-test-ggg.model'),
    #     relative_path('models/output/vectors-flat-mpl-0205.bin'))
    # bimodal_ids = list('denopq')
    bimodal_ids = list('d')
    for id_  in bimodal_ids:
      bimodal = BiModalBaseline('bimodal-'+id_,
          relative_path('models/output/bi2-0410-%s.model'%id_),
          w2vb1.model)
      self.baselines.append(bimodal)

    print 'Starts evaluating...'
    metric_names = ['MRR', 'P@1', 'P@5', 'P@10']
    results = np.zeros((len(self.baselines), len(metric_names)), dtype=float)
    result_log = []  # for diagnosis

    count_query = 0
    for idx, code in enumerate(self.code_samples):
      # triple-for-loop structure: {code-sample -> gt -> baseline}.
      print 'Processing code sample %d'%(idx + 1)

      current_queries = filter(lambda x: int(x.case_study_no) == int(idx + 1), self.queries)
      assert current_queries

      context = self.cb.getContext(code)

      for query in current_queries:  # "query" = "ground truth"
        count_query += 1
        assert query.answer

        for b_idx, b in enumerate(self.baselines):
          suggested_items = b.suggest(query.query, context)
          answer_rank = self.getRankOfExpectedItem(
              suggested_items, code, query.answer)

          mrr_idx = metric_names.index('MRR')
          p1_idx = metric_names.index('P@1')
          p5_idx = metric_names.index('P@5')
          p10_idx = metric_names.index('P@10')
          if answer_rank > 0:
            results[b_idx, mrr_idx] += 1. / answer_rank
            if answer_rank == 1:
              results[b_idx, p1_idx] += 1
            if answer_rank <= 5:
              results[b_idx, p5_idx] += 1
            if answer_rank <= 10:
              results[b_idx, p10_idx] += 1


          self.updateResultLog(result_log, idx + 1, query.query, b,
                               suggested_items, code, query.answer,
                               answer_rank)

    assert count_query > 0
    for metric_idx, metric in enumerate(metric_names):
      if metric == 'MRR' or metric.startswith('P@'):
        results[:, metric_idx] /= count_query

    # output
    print 'Writing outputs...'
    with open(relative_path('models/output/eval-result-0413.csv'), 'wb') as csv_file:
      writer = csv.writer(csv_file)
      writer.writerow(['Baseline'] + metric_names)
      for b_idx, b in enumerate(self.baselines):
        writer.writerow([b.__repr__()] + results[b_idx].tolist())

    with open(relative_path('models/output/eval-log-0413.csv'), 'wb') as csv_file:
      writer = csv.writer(csv_file)
      writer.writerow(ResultLogEntry._fields)
      for row in result_log:
        writer.writerow(row)

    # close resources
    print 'Closing resources'
    # whoosh_baseline.close()

    print 'Done'
Example #36
    def __init__(self,
                 w2v_model,
                 all_elem_counts,
                 maxngram=1,
                 name=None,
                 use_lemma=True,
                 heuristic=False,
                 use_coke=False):
        """
    w2v_model can be a binary vectors file, or a loaded gensim model instance.

    """
        self.maxngram = maxngram
        self.name = name
        self.use_lemma = use_lemma
        assert isinstance(all_elem_counts, dict)
        self.all_elem_counts = all_elem_counts
        self.heuristic = heuristic
        self.use_coke = use_coke

        if isinstance(w2v_model, basestring):
            self.model = load_gensim_from_binary_file(w2v_model)
            self.model.filename = w2v_model.split('/')[-1]
            if not self.name:
                self.name = self.model.filename
        else:
            assert isinstance(w2v_model, Word2Vec)
            self.model = w2v_model
            if not self.name:
                if hasattr(self.model, 'filename'):
                    self.name = self.model.filename

        self.model.init_sims()  # normalize the vectors

        self.enormer = ElementNormalizer()

        if self.use_coke:
            bh = BackupHandler(relative_path('models/output/backup'))
            coke_file = 'coke_0329'
            if not bh.exists(coke_file):
                raise ValueError('Coke file does not exist: %s' % coke_file)
            self.coke = bh.load(coke_file)

        print 'Trying to load element indexes from cache ...'
        bh = BackupHandler(relative_path('models/output/backup'))
        elem_index_backup_name = self.model.filename + '_elem_index'
        if bh.exists(elem_index_backup_name):
            self.idfs, self.elems, self.elem_lookup, self.vecmat = bh.load(
                elem_index_backup_name)

        else:
            print 'Word2vecBaseline building element indexes...'

            fu, fau = get_fu_fau()
            self.idfs = self.get_idf(fu.values() + fau.values())

            self.elems = sorted(self.all_elem_counts.keys())
            self.elem_lookup = dict((y, x) for (x, y) in enumerate(self.elems))
            vecs = []
            for e in self.elems:
                u = doc_serve.get_training_doc(e, True)
                v = self.get_bow_representation(u)
                vecs.append(v)
            self.vecmat = np.array(vecs)
            assert self.vecmat.shape == (len(self.elems),
                                         self.model.vector_size)

            bh.save(elem_index_backup_name,
                    (self.idfs, self.elems, self.elem_lookup, self.vecmat))

            print 'Finished building indexes.'
Example #37
    count += 1
    if count % 1000 == 0: print count
    qid, qtitle, qbody, qscore = row
    q2 = "SELECT Id, Body, Score from posts WHERE ParentID = %d"%qid
    if afilter:
      q2 = '%s AND %s'%(q2, afilter)
    answers = []
    cursor2 = db.cursor()
    cursor2 = cursor2.execute(q2)
    answers = map(Answer._make, cursor2.fetchall())
    if not answers: continue
    answers = tuple(answers)
    yield Thread._make(row + (answers,))

if __name__ == '__main__':
  bh_dir = relative_path('models/output/backup')
  bh = BackupHandler(bh_dir)

  try:
    threads = bh.load('mpl_threads')
  except AssertionError:
    threads = list(load_threads(
      qfilter="Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0",
      afilter="Score >= 0 ORDER BY Score DESC LIMIT 3"))
    bh.save('mpl_threads', threads)

  # Dump the code blocks extracted from the threads
  print 'Extracting and dumping code blocks ...'
  outdir = 'output/'
  if not os.path.exists(outdir):
    os.makedirs(outdir)
Example #38
def transform_and_filter(elem):
    """
  Cleaning is performed to reduce sparsity:
    - pylab.xxx --> plt.xxx (if the function exists in pyplot)
    - various add_subplot.xxx --> plt.gca.xxx (see stype.tsv)
    - only plt.* are kept

  Returns: cleaned elem or None

  """
    elem = enormer.simplify(elem)
    if elem.startswith('plt.'):
        return elem
    else:
        return None


if __name__ == '__main__':
    coke_counts = defaultdict(int)
    count = 0
    for code in code_examples():
        count += 1
        if count % 1000 == 0:
            print '%d ... unique_cokes=%d' % (count, len(coke_counts))

        for x, y in get_cokes(code):
            coke_counts[x, y] += 1

    bh = BackupHandler(relative_path('models/output/backup'))
    bh.save('coke_0329', coke_counts)
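
A hedged sketch of the filter above (the pylab-to-plt mapping is what the docstring promises; exact outputs depend on ElementNormalizer):

    print transform_and_filter('pylab.plot')   # expected: 'plt.plot'
    print transform_and_filter('numpy.zeros')  # expected: None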
Example #39
"""
List all argkeys for human inspection. The purpose is to check every argkey
and manually link proper documentations to them.

"""
import csv
from itertools import imap

from codemend.docstring_parse.elemdoc import ElemDoc
from codemend import relative_path

with open(relative_path(
      'docstring_parse/doc_polished/elem_docs.csv'
      # this file is generated by docstring_parse/polish.py
      ), 'rb') as csvfile:
  reader = csv.reader(csvfile)

  columns = tuple(next(reader))
  assert ElemDoc._fields == columns

  outset = set()
  for elem in imap(ElemDoc._make, reader):
    elem_id = elem.elem_id

    if elem.type == 'argkey':

      fields = elem_id.split('@')
      key = '@'.join(fields[:2])
      outset.add(key)

  for x in sorted(outset):
Example #40
    """
  Testing word2vec baseline

  """
    TEST_VECTOR_BIN_FILE = 'output/vectors-flat-mpl-0205.bin'
    # TEST_VECTOR_BIN_FILE = 'output/vectors-so-text-python-5gram.bin'
    # TEST_VECTOR_BIN_FILE = 'output/vectors-so-text-python-stem-3gram.bin'

    from codemend.models.eval2 import ContextBuilder

    cb = ContextBuilder()

    MAXNGRAM = 1
    wb = Word2vecBaseline(TEST_VECTOR_BIN_FILE,
                          cb.getAllElementCounts(),
                          MAXNGRAM,
                          'test-w2v',
                          use_lemma=True,
                          heuristic=False,
                          use_coke=True)

    query = 'add legend'
    with open(relative_path(
            'demo/code-samples/before_afters/before1.py')) as reader:
        code = reader.read()
    context = cb.getContext(code)
    results = wb.suggest(query, context)
    for r in results:
        assert isinstance(r, SuggestItem)
        print '%.3f\t%s\t%d' % (r.score, r.elem, wb.all_elem_counts[r.elem])
Example #41
  def __init__(self):
    bh = BackupHandler(relative_path('demo/data'))
    self.pos_ave = bh.load('pos_ave')
    print 'FuncPositionFinder: loaded %d average positions for functions'%len(self.pos_ave)
Example #42
    def __init__(self):
        self.expected_set_cache = {}

        self.queries = []
        # load the ground-truth file
        with open('eval-gt-2.csv', 'rb') as csv_file:
            reader = csv.reader(csv_file)
            # columns (per the attribute accesses below):
            # case_study_no, answer, query, query_source
            Query = recordclass('Query', next(reader))
            for query in imap(Query._make, reader):
                assert query.case_study_no.startswith('example-')
                query.case_study_no = int(
                    query.case_study_no.replace('example-', ''))
                query.answer = query.answer.strip()
                query.query = query.query.strip()
                query.query_source = query.query_source.strip()
                self.queries.append(query)

        print 'Loading the code samples...'
        self.code_samples = []
        fnames = [
            relative_path('demo/code-samples/before_afters/before%d.py' % x)
            for x in [1, 2, 3, 4, 5]
        ]
        for f in fnames:
            with open(f) as reader:
                code = reader.read().strip()
                self.code_samples.append(code)

        print 'Initializing context builder...'
        self.cb = ContextBuilder()

        print 'Initializing element normalizer...'
        self.enormer = ElementNormalizer()

        print 'Instantiating baselines...'
        self.baselines = []
        self.baselines.append(RandomBaseline(self.cb.getAllElements()))

        w2vb1 = Word2vecBaseline(
            relative_path('models/output/vectors-so-text-python-lemma.bin'),
            self.cb.getAllElementCounts(), 1, 'w2v')

        w2vb2 = Word2vecBaseline(w2vb1.model,
                                 self.cb.getAllElementCounts(),
                                 1,
                                 'w2v-heuristic',
                                 heuristic=True)

        w2vb3 = Word2vecBaseline(w2vb1.model,
                                 self.cb.getAllElementCounts(),
                                 1,
                                 'w2v-cooccur',
                                 use_coke=True)

        w2vb4 = Word2vecBaseline(w2vb1.model,
                                 self.cb.getAllElementCounts(),
                                 1,
                                 'w2v-hc',
                                 heuristic=True,
                                 use_coke=True)

        self.baselines += [w2vb1, w2vb2, w2vb3, w2vb4]

        # bimodal = BiModalBaseline('bimodal-concat-10epoch',
        #     relative_path('models/output/bi2-test-ggg.model'),
        #     relative_path('models/output/vectors-flat-mpl-0205.bin'))
        # bimodal_ids = list('denopq')
        bimodal_ids = list('d')
        for id_ in bimodal_ids:
            bimodal = BiModalBaseline(
                'bimodal-' + id_,
                relative_path('models/output/bi2-0410-%s.model' % id_),
                w2vb1.model)
            self.baselines.append(bimodal)

        print 'Starts evaluating...'
        metric_names = ['MRR', 'P@1', 'P@5', 'P@10']
        results = np.zeros((len(self.baselines), len(metric_names)),
                           dtype=float)
        result_log = []  # for diagnosis

        count_query = 0
        for idx, code in enumerate(self.code_samples):
            # triple-for-loop structure: {code-sample -> gt -> baseline}.
            print 'Processing code sample %d' % (idx + 1)

            current_queries = filter(
                lambda x: int(x.case_study_no) == int(idx + 1), self.queries)
            assert current_queries

            context = self.cb.getContext(code)

            for query in current_queries:  # "query" = "ground truth"
                count_query += 1
                assert query.answer

                for b_idx, b in enumerate(self.baselines):
                    suggested_items = b.suggest(query.query, context)
                    answer_rank = self.getRankOfExpectedItem(
                        suggested_items, code, query.answer)

                    mrr_idx = metric_names.index('MRR')
                    p1_idx = metric_names.index('P@1')
                    p5_idx = metric_names.index('P@5')
                    p10_idx = metric_names.index('P@10')
                    if answer_rank > 0:
                        results[b_idx, mrr_idx] += 1. / answer_rank
                        if answer_rank == 1:
                            results[b_idx, p1_idx] += 1
                        if answer_rank <= 5:
                            results[b_idx, p5_idx] += 1
                        if answer_rank <= 10:
                            results[b_idx, p10_idx] += 1

                    self.updateResultLog(result_log, idx + 1, query.query, b,
                                         suggested_items, code, query.answer,
                                         answer_rank)

        assert count_query > 0
        for metric_idx, metric in enumerate(metric_names):
            if metric == 'MRR' or metric.startswith('P@'):
                results[:, metric_idx] /= count_query

        # output
        print 'Writing outputs...'
        with open(relative_path('models/output/eval-result-0413.csv'),
                  'wb') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['Baseline'] + metric_names)
            for b_idx, b in enumerate(self.baselines):
                writer.writerow([b.__repr__()] + results[b_idx].tolist())

        with open(relative_path('models/output/eval-log-0413.csv'),
                  'wb') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(ResultLogEntry._fields)
            for row in result_log:
                writer.writerow(row)

        # close resources
        print 'Closing resources'
        # whoosh_baseline.close()

        print 'Done'
Example #43
from codemend.models.extract_so_code import load_threads, Thread, Answer
from codemend import BackupHandler, relative_path

if __name__ == '__main__':
    bh_dir = relative_path('models/output/backup')
    bh = BackupHandler(bh_dir)

    try:
        threads = bh.load('mpl_threads')
    except AssertionError:
        threads = list(
            load_threads(
                qfilter=
                "Tags LIKE '%<matplotlib>%' AND AnswerCount > 0 AND Score >= 0",
                afilter="Score >= 0 ORDER BY Score DESC LIMIT 3"))
        bh.save('mpl_threads', threads)

    with open(relative_path('models/output/mpl_so_titles.txt'), 'w') as writer:
        for t in threads:
            writer.write('%d\t%s\n' % (t.qid, t.qtitle.encode('utf-8')))
Example #44
def lemmaWithVocab(token, vocab):
    """
  First try PORTER. If not in vocab, then lemma.

  """
    out = pattern.vector.stem(token, stemmer=pattern.vector.PORTER)
    if out in vocab: return out
    if token.endswith('ing'): out = token[:-3]
    if out in vocab: return out
    return lemma(token)


from codemend import relative_path

stopwords = set()
with open(relative_path('models/stopwords-en.txt')) as reader:
    for line in reader:
        if line.startswith('#'): continue
        line = line.strip()
        words = line.split(', ')
        stopwords |= set(words)
    stopwords |= set(string.punctuation)

if __name__ == '__main__':
    s = 'hatching-hatches color" colors interesting flying flies'
    vocab = dict(hatch=1, color=2, interest=3, fly=4)
    tokens = tokenize(s)
    lemmas = [lemmaWithVocab(x, vocab) for x in tokens]
    print lemmas
Example #45
  called.
- #3: Not recommending elements that occur too infrequently.
- #4: When a function is not used before, and its argv is recommended, we strip
  the "@", and recommend the function first, followed by the argv. e.g.
  [pie@0, pie] => [pie, pie@0].

"""

from codemend import BackupHandler, relative_path
from codemend.demo.code_suggest import get_plot_commands
from codemend.models.baseline2 import SuggestItem

plot_commands = get_plot_commands()
plot_commands_set = set(plot_commands)

bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
elem_counts = bh.load('elem_pyplot_counts_0404')


def prune(used_elems, suggest_elems):
    for elem in used_elems:
        assert isinstance(elem, basestring)
    for elem in suggest_elems:
        assert isinstance(elem, SuggestItem), type(elem)

    used_elems_set = set(used_elems)
    used_funcs = map(get_func_name, used_elems)
    used_funcs_set = set(used_funcs)
    has_used_plot_commands = any(
        map(lambda x: x in plot_commands_set, used_funcs))
Example #46
    return is_useful


if __name__ == '__main__':
    counters = defaultdict(int)

    md5s = set()

    all_codes = []

    fu, _ = get_fu_fau()

    elem_counts = defaultdict(int)  # [elem] = count

    bh = BackupHandler(
        relative_path('experimental/code_suggest/output/backup'))

    for root, dirs, files in os.walk(
            relative_path('mining/output/github-matplotlib-repos')):

        if '.git' in root: continue

        for file_name in files:
            counters['count_file'] += 1

            if counters['count_file'] % 1000 == 0:
                print 'Processed %d files - Useful files: %d' % (
                    counters['count_file'], counters['count_useful_files'])

            file_path = os.path.join(root, file_name)
Example #47
def transform_and_filter(elem):
  """
  Cleaning is performed to reduce sparsity:
    - pylab.xxx --> plt.xxx (if the function exists in pyplot)
    - various add_subplot.xxx --> plt.gca.xxx (see stype.tsv)
    - only plt.* are kept

  Returns: cleaned elem or None

  """
  elem = enormer.simplify(elem)
  if elem.startswith('plt.'):
    return elem
  else:
    return None

if __name__ == '__main__':
  coke_counts = defaultdict(int)
  count = 0
  for code in code_examples():
    count += 1
    if count % 1000 == 0:
      print '%d ... unique_cokes=%d'%(count, len(coke_counts))

    for x, y in get_cokes(code):
      coke_counts[x,y] += 1

  bh = BackupHandler(relative_path('models/output/backup'))
  bh.save('coke_0329', coke_counts)
Example #48
import csv
import pydoc
from itertools import imap
from collections import defaultdict, namedtuple
from recordclass import recordclass
import funcsigs
import string

from codemend import BackupHandler, relative_path
from codemend.models.element import ElementNormalizer
from codemend.docstring_parse.elemdoc import ElemDoc
from codemend.docstring_parse.consolidate import is_setXXX, get_class
from codemend.models.annotate_code_with_api import get_fu_fau

# Load all input sources
bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))

fu_t, fau_t = get_fu_fau(omit_module=False, truncate=True)
fu, fau = get_fu_fau(omit_module=False, truncate=False)

fa_lookup = defaultdict(list)  # [function] = [argument]
for f, a in fau.keys():
  fa_lookup[f].append(a)
fa_lookup = dict(fa_lookup)

cf_lookup = defaultdict(list)  # [class] = [function]
for f in fu.keys():
  cf_lookup[get_class(f)].append(f)

elem_counts = bh.load('elem_pyplot_counts_0404')
enormer = ElementNormalizer()
Example #49
    self.model = BiModal.load(model_file)

    if isinstance(w2v_model, basestring):
      w2v_model = load_gensim_from_binary_file(w2v_model)
    else:
      assert isinstance(w2v_model, Word2Vec)
    self.model.w2v_model = w2v_model

    self.model.syn0l = self.model.w2v_model.syn0

  def suggest(self, query, context):
    used_elems = context.used_elems()
    scores = self.model.score_all(query, used_elems)
    elems_sorted = sorted(zip(scores, self.model.all_elems), reverse=True)
    suggest_sorted = [SuggestItem(elem=elem, score=score)
                      for (score, elem) in elems_sorted]
    suggest_pruned = prune(used_elems, suggest_sorted)
    return suggest_pruned[:50]

  def __repr__(self):
    return self.name

if __name__ == '__main__':
  from codemend import relative_path

  bmb = BiModalBaseline('tmp',
      relative_path('models/output/bi2-test.model'),
      relative_path('models/output/vectors-flat-mpl-0205.bin')
    )
  print bmb, 'initialized.'
Example #50
import csv
from itertools import imap
from collections import defaultdict
import string

from codemend.docstring_parse.elemdoc import ElemDoc
from codemend.docstring_parse.polish import \
    create_new_element_doc, fu, fau, fu_t, fau_t, enormer
from codemend import relative_path

elem_lookup = {}  # [elem_id] = elem_doc

# Import pre-computed elem docs
with open(
        relative_path('docstring_parse/doc_polished/elem_docs.csv'
                      # this file is generated by docstring_parse/polish.py
                      ),
        'rb') as csvfile:
    reader = csv.reader(csvfile)

    columns = tuple(next(reader))
    assert ElemDoc._fields == columns

    for elem in imap(ElemDoc._make, reader):
        elem_id = elem.elem_id
        elem_lookup[elem_id] = elem

# Create children_lookup
children_lookup = defaultdict(list)  # [parent_elem_id] = [elems]
for elem in elem_lookup.values():
    if elem.parent_id:
Example #51
      train_pairs.append((merged_utter, astunparse.unparse(call_node)))

  unique_train_pairs = list(set(train_pairs))

  print 'total_block', total_block
  print 'total_grammatical', total_grammatical
  print 'total_call_nodes', total_call_nodes
  print 'total_matched_funcs', total_matched_funcs, '(total train pairs)'
  print 'total_matched_args', total_matched_args
  print 'total_unique_train_pairs', len(unique_train_pairs)

  return unique_train_pairs


if __name__ == '__main__':
  bh = BackupHandler(relative_path('models/output/backup'))

  # Step 1
  fu, fau = get_fu_fau()

  # Step 2
  with open(relative_path('models/output/mpl_code_blocks.txt')) as reader:
    content = reader.read()

  content = content.decode('utf-8')
  content = content.replace("&lt;", "<")
  content = content.replace("&gt;", ">")
  content = content.replace("&amp;", "&")

  blocks = content.split('\n\n\n')
Example #52
      if elem_id.startswith('plt.'):
        element_pyplot_counts[elem_id] += 1
        val = get_countable_value(e.val_node, varmap, enormer)
        if val: element_pyplot_value_counts[elem_id][val] += 1

  for elem_id in element_pyplot_value_counts:
    element_pyplot_value_counts[elem_id] = dict(element_pyplot_value_counts[elem_id])
  element_pyplot_value_counts = dict(element_pyplot_value_counts)

  print 'Processed %d code examples'%count
  print 'There are %d unique elements'%len(element_counts)
  print 'There are %d unique pyplot elements'%len(element_pyplot_counts)
  for k in counters:
    print '%s: %d'%(k, counters[k])

  bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
  # Change logs:
  # - 0322: using raw format
  # - 0327: using Element, tracking return type and variable assignments and
  #   import aliases.
  # - 0404: fixed issue with dict as positional argument;
  #         added element_value_counts;
  #         added Shiyan's example.
  bh.save('elem_counts_0404', element_counts)
  bh.save('elem_pyplot_counts_0404', element_pyplot_counts)
  bh.save('elem_pyplot_value_counts_0404', element_pyplot_value_counts)

  """
  Log:

  # 0327
Example #53
from codemend import BackupHandler, relative_path
from codemend.models.element import ElementNormalizer
from codemend.models.word2vec_util import load_gensim_from_binary_file
from codemend.models.bimodal2 import BiModal
from codemend.experimental.code_suggest.mine_element import code_examples

if __name__ == '__main__':
  bh = BackupHandler(relative_path('experimental/code_suggest/output/backup'))
  elem_counts = bh.load('elem_pyplot_counts_0404')
  all_elems = sorted(elem_counts.keys())
  all_elems_counts = elem_counts
  enormer = ElementNormalizer()
  w2v_model = load_gensim_from_binary_file(
    relative_path('models/output/vectors-so-text-python-lemma-win5.bin'))  # <-- note the change here!!

  model = BiModal(all_elems, all_elems_counts, w2v_model, code_examples, enormer,
                  threads=None, alpha=0.05, window=5, negative=20,
                  additive=0, multiply=0, concat=1,
                  epoch=1, rand_parent_doc=True,
                  hint_pvecs_init=True, hint_rvecs_init=False,
                  neg_sample_used_elem=False)

  model.save(relative_path('models/output/bi2-0410-t.model'))

  # Changes:
  # bi2-test -- latest gold version for user study
  # bi2-0410-a -- epoch=10, fixed stopwords (e.g., excluding bar from stopwords) -- this is vanilla
  # bi2-0410-b -- epoch=1, quick check if setting is all right.
  # bi2-0410-c -- epoch=10, replicating bi2-0410-a
  # bi2-0410-d -- epoch=1, randomly with-parent doc
  # bi2-0410-e -- epoch=5, randomly with-parent doc
Example #54
    def do_mh_handle_requests(self, params_list):
        """This callback function is to be used by brain, so that it can take
    advantage of the worker pool to do multithreaded plotting."""
        return plotter_pool.map(mh_handle_request, params_list)

    def jedi_lock(self):
        """To be used by brain to protect non-thread-safe jedi."""
        return jedi_lock


class ThreadedTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer):
    pass


if __name__ == '__main__':
    os.chdir(relative_path('demo'))

    if len(sys.argv) > 1 and sys.argv[1] != '-':
        port = int(sys.argv[1])
    else:
        port = PORT_NUMBER

    if len(sys.argv) > 2 and sys.argv[2] != '-':
        host_name = sys.argv[2]
    else:
        host_name = HOST_NAME

    server_util.port_available_or_die(port)

    plotter_pool = multiprocessing.Pool()