def generate_indices_from_benchmark(writer, counter):
    """Index every Java file found under the module-level ``source_path``.

    Each file is read (UTF-8, errors ignored), parsed into an AST, and --
    when add_code_keyword_into_document() extracts usable keywords -- the
    resulting Lucene Document (with a stored, unindexed "file" field) is
    added to ``writer``.  Progress is printed every 1000 files.

    Args:
        writer: Lucene IndexWriter receiving the documents.
        counter: stats object whose typed_method_call_count is reported.
    """
    javafiles = java_files_from_dir(source_path)  # NOTE(review): module-level global
    total = 0    # files seen
    indexed = 0  # files actually written to the index
    for javafile in javafiles:
        total += 1
        if total % 1000 == 0:  # progress heartbeat every 1000 files
            print("Counter: %s" % total)
            print("typed_method_call" + str(counter.typed_method_call_count))
        document = Document()
        # Store the path so search hits can display it; not indexed for search.
        document.add(Field("file", javafile, Field.Store.YES, Field.Index.NO))
        try:
            with codecs.open(javafile, "r", encoding='utf-8', errors='ignore') as f:
                file_content = f.read().encode("utf-8", errors='ignore')
            # BUGFIX: dropped the redundant f.close() -- the `with` block
            # already closes the file on exit.
            ast = parse(file_content, resolve=False)
            if add_code_keyword_into_document(document, file_content, ast, counter):
                writer.addDocument(document)
                indexed += 1
                if indexed % 1000 == 0:
                    print("Wrote:: %s files" % indexed)
        except Exception as e:
            # Best-effort indexing: skip files that fail to read or parse.
            print("Error: %s" % e)
            continue
    print("Number of files: %s" % total)
    print("Number of duplicates: %s" % len(hashes))  # `hashes` is a module-level global
    print("%s files has been indexed" % indexed)
def transform_body(body):
    """Extract parsed code snippets and short code hints from an HTML body.

    The body is split on ``</code>`` closing tags; every ``<code>`` fragment
    is HTML-unescaped and classified:

      * fragments containing both '.' and '(' look like real code and are
        collected for AST parsing; short ones outside a ``<pre>`` block are
        additionally treated as inline-code hints;
      * other short fragments (< 25 chars) become hints directly.

    Returns:
        (asts, code_hints): list of successfully parsed ASTs, and a set of
        tokens produced by tokenizing every hint fragment.
    """
    open_tag = "<code>"
    snippets = []
    hint_fragments = []
    for chunk in body.split("</code>"):
        if open_tag not in chunk:
            continue
        fragment = MyUtils.unescape_html(chunk[chunk.find(open_tag) + len(open_tag):])
        if "." in fragment and "(" in fragment:
            snippets.append(fragment)
            # Heuristic to determine if the fragment is enclosed in an
            # inline code block rather than a <pre> listing.
            if "<pre" not in chunk and len(fragment) < 25:
                hint_fragments.append(fragment)
        elif len(fragment) < 25:
            hint_fragments.append(fragment)
    tokens = []
    for hint in hint_fragments:
        tokens.extend(MyUtils.tokenize(hint))
    code_hints = set(tokens)
    asts = []
    for snippet in snippets:
        tree = parse(snippet, resolve=True)
        if tree:
            asts.append(tree)
    return asts, code_hints
def Generator(code):
    """Turn raw Java code into a keyword search query, printing each stage.

    Parses ``code``, extracts code keywords, then strips the unified stop
    words; the original input and both intermediate results are printed
    for debugging.

    Args:
        code: Java source text to transform.
    Returns:
        The final query after stop-word removal.
    """
    file_content = code
    # BUGFIX: corrected "Origianl" typo in the debug message.
    print("1. Original Query : %s" % file_content)
    ast = parse(file_content, resolve=False)
    query = add_code_keyword_into_document(file_content, ast)
    print("2. Right after alternation & before the removing stop words : %s" % query)
    query = remove_unified_stop_lists(query)
    print("3. Right after the stop words removing : %s" % query)
    return query
def Generator(code):
    """Silent variant: turn raw Java code into a keyword search query.

    Same pipeline as the earlier ``Generator`` definition (parse, extract
    code keywords, strip unified stop words) but with no debug output.

    NOTE(review): this redefines ``Generator`` from earlier in the file;
    only this later definition is visible to importers.  The commented-out
    debug/logging lines from the original have been removed as dead code.

    Args:
        code: Java source text to transform.
    Returns:
        The final query after stop-word removal.
    """
    file_content = code
    ast = parse(file_content, resolve=False)
    query = add_code_keyword_into_document(file_content, ast)
    query = remove_unified_stop_lists(query)
    return query
def generate_indices_from_projects(writer, counter):
    """Index all Java files under the hard-coded 2014 GCJ repository.

    For each file, a Lucene Document is created whose stored "file" field
    is the path re-anchored at HOME (the first six path components are
    dropped and HOME is prepended).  Files whose keywords can be extracted
    are added to ``writer``.  Progress is printed every 1000 files.

    Args:
        writer: Lucene IndexWriter receiving the documents.
        counter: stats object whose typed_method_call_count is reported.
    """
    HOME = "/Users/Falcon/Downloads/GCJ_Repository/2014"
    javafiles = java_files_from_dir(HOME)  # collect only the .java files
    total = 0    # files seen
    indexed = 0  # files actually written to the index
    for javafile in javafiles:
        print(javafile)
        total += 1
        if total % 1000 == 0:  # progress heartbeat every 1000 files
            print("Counter: %s" % total)
            print("typed_method_call" + str(counter.typed_method_call_count))
        document = Document()  # Lucene Document object
        # Rebase the path: drop the first 6 components, re-anchor at HOME.
        # (join replaces the original quadratic += string concatenation;
        # result is byte-identical.)
        project_path = "".join("/" + part for part in javafile.split("/")[6:])
        changed_path = HOME + project_path
        document.add(
            Field("file", changed_path, Field.Store.YES, Field.Index.NO))
        try:
            with codecs.open(javafile, "r", encoding='utf-8', errors='ignore') as f:
                file_content = f.read().encode("utf-8", errors='ignore')
            ast = parse(file_content, resolve=False)  # parse the Java source
            if add_code_keyword_into_document(document, file_content, ast, counter):
                writer.addDocument(document)
                indexed += 1
                if indexed % 1000 == 0:
                    print("Wrote:: %s files" % indexed)
        except Exception as e:
            # Best-effort indexing: skip files that fail to read or parse.
            print("Error: %s" % e)
            continue
    print("Number of files: %s" % total)
    print("Number of duplicates: %s" % len(hashes))  # `hashes` is a module-level global
    print("%s files has been indexed" % indexed)
def generate_indices_from_projects(writer, counter):
    """Index all Java files under the hard-coded IJA dataset directory.

    NOTE(review): this redefines ``generate_indices_from_projects`` from
    earlier in the file; only this later definition is visible to callers.

    For each file, a Lucene Document is created whose stored "file" field
    is the path re-anchored at the dataset root (the first six path
    components are dropped).  Files whose keywords can be extracted are
    added to ``writer``.  Progress is printed every 1000 files.

    Args:
        writer: Lucene IndexWriter receiving the documents.
        counter: stats object whose typed_method_call_count is reported.
    """
    HOME = "/Users/Falcon/Desktop/IJA/dataset/"
    javafiles = java_files_from_dir(HOME)
    total = 0    # files seen
    indexed = 0  # files actually written to the index
    for javafile in javafiles:
        total += 1
        if total % 1000 == 0:  # progress heartbeat every 1000 files
            print("Counter: %s" % total)
            print("typed_method_call" + str(counter.typed_method_call_count))
        document = Document()
        # Rebase the path: drop the first 6 components and re-anchor at the
        # dataset root.  HOME.rstrip("/") reproduces the original hard-coded
        # "/Users/Falcon/Desktop/IJA/dataset" prefix (DRY: single source of
        # truth for the dataset location).  join replaces the original
        # quadratic += string concatenation; result is byte-identical.
        project_path = "".join("/" + part for part in javafile.split("/")[6:])
        changed_path = HOME.rstrip("/") + project_path
        document.add(Field("file", changed_path, Field.Store.YES, Field.Index.NO))
        try:
            with codecs.open(javafile, "r", encoding='utf-8', errors='ignore') as f:
                file_content = f.read().encode("utf-8", errors='ignore')
            ast = parse(file_content, resolve=False)
            if add_code_keyword_into_document(document, file_content, ast, counter):
                writer.addDocument(document)
                indexed += 1
                if indexed % 1000 == 0:
                    print("Wrote:: %s files" % indexed)
        except Exception as e:
            # Best-effort indexing: skip files that fail to read or parse.
            print("Error: %s" % e)
            continue
    print("Number of files: %s" % total)
    print("Number of duplicates: %s" % len(hashes))  # `hashes` is a module-level global
    print("%s files has been indexed" % indexed)
def __init__(self, snippet, sources):
    """Parse ``snippet`` (with resolution enabled) and initialize the
    partially-qualified-name -> fully-qualified-name lookup tables."""
    def _method_entry():
        # Each partially-qualified method name maps to its candidate FQNs
        # plus the classes those candidates belong to.
        return {"fqn": [], "class": []}

    self.sources = sources
    self.snippet = parse(snippet, resolve=True)
    self.class_PQN_to_FQN = defaultdict(list)
    self.method_PQN_to_FQN = defaultdict(_method_entry)