def tokenizeCode(code, lang):
    """Split a code snippet into a list of whitespace-normalised tokens.

    The snippet is stripped and reduced to ASCII (characters outside ASCII
    become ``?``) before it is tokenised according to ``lang``:

    * ``"sql"``    -- ``SqlTemplate(code, regex=True).parseSql()``
    * ``"csharp"`` -- ``parseCSharp(code)``
    * ``"python"`` -- plain split on runs of whitespace

    Args:
        code: The snippet, as a byte string (decoded as UTF-8) or as text.
        lang: One of ``"sql"``, ``"csharp"`` or ``"python"``.

    Returns:
        A list of tokens; inside each token every run of whitespace is
        collapsed to a single space.

    Raises:
        UnicodeDecodeError: If ``code`` is a byte string that is not valid
            UTF-8.
        TypeError: If ``lang`` is not one of the values above (no branch
            produces a token sequence, so there is nothing to iterate).
    """
    code = code.strip()
    # Only byte strings need decoding. Calling .decode() on text either does
    # not exist (Python 3) or triggers an implicit ASCII encode that fails on
    # non-ASCII characters (Python 2).
    if isinstance(code, bytes):
        code = code.decode('utf-8')
    code = code.encode('ascii', 'replace')
    if not isinstance(code, str):
        # On Python 3 encode() yields bytes; go back to the native str type so
        # the str-pattern regex below can be applied to the tokens.
        code = code.decode('ascii')

    typedCode = None
    if lang == "sql":
        query = SqlTemplate(code, regex=True)
        typedCode = query.parseSql()
    elif lang == "csharp":
        typedCode = parseCSharp(code)
    elif lang == "python":
        # Previously read an undefined name `q` and split on the literal
        # two-character string "\s"; split on real whitespace instead.
        typedCode = code.split()

    return [re.sub(r'\s+', ' ', x.strip()) for x in typedCode]
def tokenizeCode(code, lang):
    """Tokenise a code snippet for the given language.

    The input is stripped, converted to ASCII (anything outside ASCII is
    replaced by ``?``) and then handed to the language-specific tokeniser:
    ``SqlTemplate(...).parseSql()`` for ``"sql"``, ``parseCSharp`` for
    ``"csharp"``, and a whitespace split for ``"python"``.

    Args:
        code: Snippet to tokenise; either a UTF-8 byte string or text.
        lang: ``"sql"``, ``"csharp"`` or ``"python"``.

    Returns:
        List of tokens with internal whitespace runs collapsed to one space.

    Raises:
        UnicodeDecodeError: For byte-string input that is not valid UTF-8.
        TypeError: For any other ``lang`` value, because no token sequence
            is produced and iterating ``None`` fails.
    """
    code = code.strip()
    if isinstance(code, bytes):
        # Text input must not be decoded again: .decode() is missing on
        # Python 3 text and implicitly ASCII-encodes on Python 2.
        code = code.decode('utf-8')
    code = code.encode('ascii', 'replace')
    if not isinstance(code, str):
        # Python 3 only: turn the ASCII bytes back into native str so that
        # the regex below (a str pattern) matches the token type.
        code = code.decode('ascii')

    typedCode = None
    if lang == "sql":
        query = SqlTemplate(code, regex=True)
        typedCode = query.parseSql()
    elif lang == "csharp":
        typedCode = parseCSharp(code)
    elif lang == "python":
        # The earlier code referenced an undefined `q` and used
        # split("\\s"), which splits on a literal backslash-s, not whitespace.
        typedCode = code.split()

    return [re.sub(r'\s+', ' ', x.strip()) for x in typedCode]
def tokenizeCode(code, lang):
    """Split a code snippet into whitespace-normalised tokens.

    The snippet is stripped and reduced to ASCII (non-ASCII characters
    become ``?``), then tokenised according to ``lang``:

    * ``"sql"``    -- ``SqlTemplate(code, regex=True).parseSql()``
    * ``"csharp"`` -- ``parseCSharp(code)``
    * ``"python"`` -- ``:``, ``(`` and ``)`` are padded with spaces so they
      become tokens of their own, then the text is split on whitespace

    Args:
        code: The snippet, as a UTF-8 byte string or as text.
        lang: One of ``"sql"``, ``"csharp"`` or ``"python"``.

    Returns:
        A list of tokens; every run of whitespace inside a token is
        collapsed to a single space.

    Raises:
        UnicodeDecodeError: If ``code`` is a byte string that is not valid
            UTF-8.
        TypeError: If ``lang`` is none of the supported values (no token
            sequence is produced, so iteration over ``None`` fails).
    """
    code = code.strip()
    # Decode byte strings only; text is already decoded, and calling
    # .decode() on it fails on Python 3 and on non-ASCII text under Python 2.
    if isinstance(code, bytes):
        code = code.decode('utf-8')
    code = code.encode('ascii', 'replace')
    if not isinstance(code, str):
        # Python 3: encode() returned bytes; convert back to native str so
        # str.replace and the str-pattern regex below work on it.
        code = code.decode('ascii')

    typedCode = None
    if lang == "sql":
        query = SqlTemplate(code, regex=True)
        typedCode = query.parseSql()
    elif lang == "csharp":
        typedCode = parseCSharp(code)
    elif lang == "python":
        # `code` is already stripped ASCII, so no second normalisation pass
        # is needed before padding the punctuation.
        typedCode = code
        for char in (':', '(', ')'):
            typedCode = typedCode.replace(char, ' ' + char + ' ')
        typedCode = typedCode.split()

    return [re.sub(r'\s+', ' ', x.strip()) for x in typedCode]
# Keep a snippet only if it is longer than 6 and at most 1000 characters, does
# not look like markup / SQL / a path / a URL / a registry key / a single
# word, and is accepted by parseCSharp; kept snippets are appended to `f` as
# one tab-separated line.
# NOTE(review): `code`, `rid`, `acceptedAnswers` and `f` are defined outside
# this fragment; presumably the guard runs once per answer inside a loop and
# f.close() belongs after that loop -- confirm the indentation against the
# full file.
if 6 < len(code) <= 1000:  # Code must be at most 1000 chars
    # Newlines are important to remove comments later on but get rid of tabs
    code = code.replace('\n', '\\n').replace('\t', '')

    # Prefix tests use startswith() rather than code[0], so a snippet that
    # consisted only of tabs (now empty) no longer raises IndexError; the
    # single-word regex below matches the empty string and filters it out.
    head = code[:7].lower()
    # Filter out these weird code snippets
    is_noise = (
        code.startswith(("<", "=", "@", "$"))
        or head.startswith(("select ", "update ", "alter ", "c:", "http", "hkey"))
        or re.match(r"^[a-zA-Z0-9_]*$", code) is not None  # single word answers
    )

    if not is_noise:
        # Now also make sure it passes the lexer
        try:
            parseCSharp(code)
        except Exception:
            # Best effort: snippets the lexer rejects are skipped silently.
            # `except Exception` (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            pass
        else:
            try:
                f.write('\t'.join([str(rid),
                                   str(acceptedAnswers[rid]['id']),
                                   acceptedAnswers[rid]['title'],
                                   code,
                                   "0"]) + '\n')
            except Exception:
                print("error")
f.close()
# Create training and validation and test sets