예제 #1
0
def tokenizeCode(code, lang):
  """Split a source snippet into a list of whitespace-normalized tokens.

  Args:
    code: Snippet as bytes (decoded as UTF-8) or as a text string.
      Characters outside ASCII are replaced with '?'.
    lang: Language of the snippet: "sql", "csharp" or "python".

  Returns:
    A list of tokens, each stripped and with every internal run of
    whitespace collapsed to a single space.

  Raises:
    ValueError: If `lang` is not one of the supported languages.
  """
  # Normalize to a native ASCII-only `str`. Only byte strings have .decode(),
  # so text input is encoded directly instead of being decoded first.
  code = code.strip()
  if isinstance(code, bytes):
    code = code.decode('utf-8')
  code = code.encode('ascii', 'replace')
  if not isinstance(code, str):
    code = code.decode('ascii')

  if lang == "sql":
    query = SqlTemplate(code, regex=True)
    typedCode = query.parseSql()
  elif lang == "csharp":
    typedCode = parseCSharp(code)
  elif lang == "python":
    # Previously referenced an undefined name `q` and split on the literal
    # two-character string "\s"; plain split() splits on whitespace runs.
    typedCode = code.split()
  else:
    # Fail with a clear message instead of iterating over None below.
    raise ValueError("unsupported language: %r" % (lang,))

  tokens = [re.sub(r'\s+', ' ', x.strip()) for x in typedCode]
  return tokens
예제 #2
0
def tokenizeCode(code, lang):
    """Tokenize a code snippet written in `lang`.

    Args:
        code: The snippet, given as UTF-8 bytes or as text. Any character
            that is not ASCII ends up as '?'.
        lang: "sql", "csharp" or "python".

    Returns:
        List of tokens; each token is stripped and has runs of whitespace
        squeezed to one space.

    Raises:
        ValueError: If `lang` names a language this function does not handle.
    """
    code = code.strip()
    # .decode() exists only on byte strings; text input skips straight to
    # the ASCII down-conversion.
    if isinstance(code, bytes):
        code = code.decode('utf-8')
    ascii_code = code.encode('ascii', 'replace')
    # Keep working with the native string type so the str regex below applies.
    if not isinstance(ascii_code, str):
        ascii_code = ascii_code.decode('ascii')

    if lang == "sql":
        typedCode = SqlTemplate(ascii_code, regex=True).parseSql()
    elif lang == "csharp":
        typedCode = parseCSharp(ascii_code)
    elif lang == "python":
        # The old code used an undefined variable `q` and split on the
        # literal text "\s" rather than on whitespace.
        typedCode = ascii_code.split()
    else:
        raise ValueError("unsupported language: %r" % (lang,))

    return [re.sub(r'\s+', ' ', tok.strip()) for tok in typedCode]
예제 #3
0
def tokenizeCode(code, lang):
    """Tokenize a code snippet according to its language.

    For "python", the characters ':', '(' and ')' are split off as their own
    tokens and the rest is split on whitespace. For "sql" and "csharp" the
    snippet is handed to SqlTemplate(...).parseSql() and parseCSharp()
    respectively.

    Args:
        code: Snippet as UTF-8 bytes or text; non-ASCII characters are
            replaced with '?'.
        lang: "sql", "csharp" or "python".

    Returns:
        A list of tokens, stripped and with internal whitespace runs
        collapsed to a single space.

    Raises:
        ValueError: If `lang` is not a supported language.
    """
    # Convert once to a native ASCII `str`. Byte strings are decoded first;
    # text has no .decode() and is encoded directly.
    code = code.strip()
    if isinstance(code, bytes):
        code = code.decode('utf-8')
    code = code.encode('ascii', 'replace')
    if not isinstance(code, str):
        code = code.decode('ascii')

    if lang == "sql":
        query = SqlTemplate(code, regex=True)
        typedCode = query.parseSql()
    elif lang == "csharp":
        typedCode = parseCSharp(code)
    elif lang == "python":
        # `code` is already stripped ASCII, so the second strip/decode/encode
        # pass the original did here was redundant.
        typedCode = code
        # Pad structural punctuation with spaces so split() isolates it.
        for char in (':', '(', ')'):
            typedCode = typedCode.replace(char, ' ' + char + ' ')
        typedCode = typedCode.split()
    else:
        # Fail explicitly rather than trying to iterate over None below.
        raise ValueError("unsupported language: %r" % (lang,))

    tokens = [re.sub(r'\s+', ' ', x.strip()) for x in typedCode]
    return tokens
예제 #4
0
        # Filter one candidate snippet and, if it survives, append it as a
        # tab-separated row to the output file `f`.
        # Only snippets longer than 6 and at most 1000 characters are considered.
        if (len(code) > 6 and len(code) <= 1000):                   # Length must be in the range 7..1000 chars
          code = code.replace('\n', '\\n').replace('\t', '')        # Newlines become a literal backslash-n (kept so comments can be removed later); tabs are dropped
          


          # Filter out these weird code snippets: anything starting with
          # '<', '=', '@' or '$', or (case-insensitively) with "select ",
          # "update ", "alter ", "c:", "http" or "hkey", or consisting only
          # of letters, digits and underscores.
          if code[0] == "<" or code[0] == "=" or code[0] == "@" or code[0] == "$" or \
            code[0:7].lower() == "select " or code[0:7].lower() == "update " or code[0:6].lower() == "alter " or \
            code[0:2].lower() == "c:" or code[0:4].lower() == "http" or code[0:4].lower() == "hkey" or \
                  re.match(r"^[a-zA-Z0-9_]*$", code) is not None: # last one is single word answers
            pass
          else:
          
            # Now also make sure it passes the lexer: parseCSharp is called
            # only to see whether it raises; its return value is discarded.
            try:
              parseCSharp(code)
              try:
                # Row layout: rid, accepted answer id, title, code, "0".
                f.write('\t'.join([str(rid), str(acceptedAnswers[rid]['id']), acceptedAnswers[rid]['title'], code, "0"]) + '\n')
              except:
                # Any failure while building or writing the row is reported
                # with a bare "error" message and the snippet is skipped.
                print("error")
            except:
              # Snippets that parseCSharp rejects are silently dropped.
              # NOTE(review): a bare except also catches KeyboardInterrupt
              # and SystemExit -- confirm that is intended.
              pass


# Close the file handle `f` that the snippet rows above were written to.
f.close()





# Create training, validation, and test sets