def process_clone(self):
    """Load the Repertoire output and fill the file and clone tree widgets.

    Reads ``self.repOutputPath`` with ``RepertoireOutput.loadFromFile``.
    Every file name is appended to ``self.fileList`` and shown as a row
    (index, name) in ``self.ui.fileList``. Every clone whose integer metric
    is non-zero is shown as a row (index, location pair, metric) in
    ``self.ui.cloneList``.
    """
    rep_output = RepertoireOutput()
    print(self.repOutputPath)
    rep_output.loadFromFile(self.repOutputPath, 1)

    for index, fileName in rep_output.getFileIter():
        if config.DEBUG == 2:
            print("fileName =" + fileName)
        self.fileList.append(fileName)
        listItem = QtGui.QTreeWidgetItem(self.ui.fileList)
        listItem.setText(0, str(index))
        listItem.setText(1, fileName)

    for indx, (cl1, cl2, metric) in rep_output.getCloneIter():
        fidx1, start1, end1 = cl1
        fidx2, start2, end2 = cl2
        # The metric is taken from the output file, not recomputed from
        # the start/end spans.
        metric = int(metric)
        if config.DEBUG == 2:
            # Previously this printed an undefined name (`line`) and
            # concatenated non-string values with str, so enabling debug
            # crashed the method. Format the values instead.
            print("{0} {1} {2}".format(indx, cl1, cl2))
            print(metric)
        if metric:
            # Clones with a zero metric are not listed.
            listItem = QtGui.QTreeWidgetItem(self.ui.cloneList)
            listItem.setText(0, str(indx))
            listItem.setText(
                1,
                "{0}.{1}-{2}\t{3}.{4}-{5}".format(
                    fidx1, start1, end1, fidx2, start2, end2))
            listItem.setText(2, str(metric))
def process_clone(self):
    """Populate the UI file list and clone list from the Repertoire output.

    The output file at ``self.repOutputPath`` is loaded through
    ``RepertoireOutput.loadFromFile``. Each file name is appended to
    ``self.fileList`` and added as a row to ``self.ui.fileList``; each clone
    with a non-zero integer metric is added as a row to
    ``self.ui.cloneList``.
    """
    rep_output = RepertoireOutput()
    print(self.repOutputPath)
    rep_output.loadFromFile(self.repOutputPath, 1)

    for index, fileName in rep_output.getFileIter():
        if config.DEBUG == 2:
            print("fileName =" + fileName)
        self.fileList.append(fileName)
        listItem = QtGui.QTreeWidgetItem(self.ui.fileList)
        listItem.setText(0, str(index))
        listItem.setText(1, fileName)

    for indx, (cl1, cl2, metric) in rep_output.getCloneIter():
        fidx1, start1, end1 = cl1
        fidx2, start2, end2 = cl2
        # Use the metric stored in the output file as-is (as an int)
        # instead of deriving it from the span lengths.
        metric = int(metric)
        if config.DEBUG == 2:
            # The earlier debug code referenced an undefined `line` and
            # added str to non-str values, which raised as soon as debug
            # level 2 was enabled. str.format handles any value type.
            print("{0} {1} {2}".format(indx, cl1, cl2))
            print(metric)
        if not metric:
            # Zero-metric clones are not shown.
            continue
        listItem = QtGui.QTreeWidgetItem(self.ui.cloneList)
        listItem.setText(0, str(indx))
        listItem.setText(
            1,
            "{0}.{1}-{2}\t{3}.{4}-{5}".format(
                fidx1, start1, end1, fidx2, start2, end2))
        listItem.setText(2, str(metric))
def file_dist(rep_out_path, conv_dir1, conv_dir2):
    """Sum clone metrics per pair of source directories.

    Loads the Repertoire output at ``rep_out_path`` and, for every clone,
    looks up the source file of each side in the ``diff2file`` mapping built
    by ``DiffToFileMapping`` for ``conv_dir1`` / ``conv_dir2``. The lookup key
    is ``(diff file base name without extension, start line as str)``.
    Clones for which either side has no mapping are skipped.

    Args:
        rep_out_path: path of the Repertoire output file to load.
        conv_dir1: directory handed to ``DiffToFileMapping`` for the first
            side of each clone.
        conv_dir2: directory handed to ``DiffToFileMapping`` for the second
            side of each clone.

    Returns:
        dict mapping ``(src_dir1, src_dir2)`` to the summed integer metric,
        where each directory is built from at most the first three
        ``_``-separated pieces of the mapped source file name.
    """
    def _dir_prefix(src_file, depth=3):
        # Join the leading '_'-separated pieces of the name into a path.
        # Slicing (instead of indexing [0], [1], [2]) keeps names with fewer
        # than `depth` pieces from raising IndexError.
        return os.sep.join(src_file.split("_")[:depth])

    def _diff_name(path):
        # Base name of the diff file with its extension removed.
        return os.path.splitext(os.path.basename(path))[0]

    rep_out = RepertoireOutput()
    convDir1 = DiffToFileMapping(conv_dir1)
    convDir2 = DiffToFileMapping(conv_dir2)
    convDir1.walk_dir()
    convDir2.walk_dir()

    fileDist = {}
    rep_out.loadFromFile(rep_out_path, 1)

    for _, (clone1, clone2, metric) in rep_out.getCloneIter():
        fidx1, start1, _end1 = clone1
        fidx2, start2, _end2 = clone2

        key1 = (_diff_name(rep_out.getFilePath(fidx1)), str(start1))
        key2 = (_diff_name(rep_out.getFilePath(fidx2)), str(start2))
        src_file1 = convDir1.diff2file.get(key1, -1)
        src_file2 = convDir2.diff2file.get(key2, -1)
        if src_file1 == -1 or src_file2 == -1:
            continue

        # Aggregate at directory level (up to depth 3), not per file.
        key = (_dir_prefix(src_file1), _dir_prefix(src_file2))

        # The metric is a string; only the part before the first ':' is
        # used, with any leading '(' stripped, and parsed as an int.
        m = metric.partition(':')[0].lstrip('(')
        fileDist[key] = fileDist.get(key, 0) + int(m)

    return fileDist
def generateDB(self, vcs1, vcs2):
    """Build a ``RepDB`` from all existing Repertoire output files.

    For both values of ``is_new`` and each of the languages ``cxx``, ``hxx``
    and ``java``, the output file named by ``self.pb`` is loaded if it exists
    on disk. Each loaded output goes through ``self.firstPass``, then both
    version-control objects populate the commit tables, and finally every
    clone gets its lhs/rhs commit id.

    Args:
        vcs1: first object whose ``populateDB`` is called.
        vcs2: second object whose ``populateDB`` is called.

    Returns:
        ``RepDB(commits, clones)``.

    Raises:
        KeyError: if a clone refers to a file id with no commit id.
    """
    # ccfx input file path -> file idx. Indices are rewritten because many
    # output files are merged here.
    path_to_fidx = {}
    # fidx -> number of ported lines in the corresponding file
    ports_per_fidx = {}
    # list of CloneMeta, also with rewritten indices
    clone_metas = []

    loaded_outputs = []
    for is_new in [False, True]:
        for lang in ['cxx', 'hxx', 'java']:
            out_dir = self.pb.getRepertoireOutputPath(lang, is_new)
            out_name = self.pb.getRepertoireOutputFileName(lang, is_new)
            out_path = out_dir + out_name
            if os.path.exists(out_path):
                rep = RepertoireOutput()
                rep.loadFromFile(out_path, True)
                loaded_outputs.append(OutputTuple(lang, is_new, rep))

    for entry in loaded_outputs:
        self.firstPass(entry.output, clone_metas, path_to_fidx, ports_per_fidx)

    # commit id -> CommitMeta
    commit_table = {}
    commit_of_fidx = {}
    for repo in (vcs1, vcs2):
        repo.populateDB(commit_table, commit_of_fidx, path_to_fidx,
                        ports_per_fidx)

    # The clones are filled out except for the commit ids of the files
    # involved; attach those now.
    for meta in clone_metas:
        for side in (meta.lhs, meta.rhs):
            if side.fileId not in commit_of_fidx:
                # Diagnostic printed before the lookup below fails.
                # NOTE(review): path_to_fidx is described as path -> fidx
                # but is indexed by a file id here; confirm against
                # firstPass.
                print("Going down looking for fidx {0} ({1})".format(
                    side.fileId, path_to_fidx[side.fileId]))
        meta.lhsCommitId = commit_of_fidx[meta.lhs.fileId]
        meta.rhsCommitId = commit_of_fidx[meta.rhs.fileId]

    return RepDB(commit_table, clone_metas)
def dumpRepOut(rep_out_path, pickle_output_path):
    """Pickle the clones of a Repertoire output file as ``CloneMeta`` objects.

    Args:
        rep_out_path: path of the Repertoire output file to load.
        pickle_output_path: path of the pickle file to write (binary mode,
            overwritten if it exists).
    """
    repOut = RepertoireOutput()
    repOut.loadFromFile(rep_out_path, 1)

    clones = []
    for cloneIdx, (clone1, clone2, metric) in repOut.getCloneIter():
        fidx1, start1, end1 = clone1
        fidx2, start2, end2 = clone2
        lhs = SideOfClone(fidx1, start1, end1)
        rhs = SideOfClone(fidx2, start2, end2)
        # For the time being the file index doubles as the commit id.
        clones.append(CloneMeta(cloneIdx, lhs, fidx1, rhs, fidx2, metric))

    # Use a context manager so the file is flushed and closed even if
    # pickling fails; the handle was previously never closed.
    with open(pickle_output_path, "wb") as pickle_file:
        pickle.dump(clones, pickle_file)
def convert_ccfx_output(pb, proj, lang, is_new):
    """Rewrite ccfx clone output for one project in terms of filter-output lines.

    Steps:
      1. Register a ``CCFXMetaData`` for every file in the project's filter
         output directory.
      2. For each registered file, read its ccfx prep file and its line-map
         ("conv") file and build ``prepIdx2OrigIdx`` (1-based prep line ->
         original line, -1 when unknown) and ``line2op`` (original line ->
         operation string).
      3. Load the ccfx output and translate every clone's start/end through
         those maps, attaching a list of ``(line, op)`` tuples to each side.

    Args:
        pb: path builder supplying all directory and file names.
        proj: project identifier passed through to ``pb``.
        lang: language identifier passed through to ``pb``.
        is_new: flag passed through to ``pb``.

    Returns:
        A ``RepertoireOutput`` loaded from the translated files and clones.

    Raises:
        Exception: if the ccfx output names a file with no meta information.
    """
    # maps from ccfx input paths to meta objects representing the files
    metaDB = CCFXMetaMapping()

    filter_path = pb.getFilterOutputPath(proj, lang)
    conv_path = pb.getLineMapPath(proj, lang, is_new)
    ccfx_i_path = pb.getCCFXInputPath(proj, lang, is_new)
    ccfx_p_path = pb.getCCXFPrepPath(proj, lang, is_new)
    print("filter_path = " + filter_path)
    print("conv_path = " + conv_path)
    print("ccfx_i_path = " + ccfx_i_path)
    print("ccfx_p_path = " + ccfx_p_path)

    for name in os.listdir(filter_path):
        meta = CCFXMetaData(
            ccfx_i_path + name,
            ccfx_p_path + pb.findPrepFileFor(ccfx_p_path, name),
            conv_path + pb.makeLineMapFileName(name),
            filter_path + name)
        metaDB.addFile(meta)
    print(metaDB)

    # we have our files, now map line numbers in the prep files to input files
    for meta in metaDB.getMetas():
        # NOTE(review): this prints when DEBUG is exactly False, which looks
        # inverted for a debug message; left as-is, confirm the intent.
        if config.DEBUG is False:
            print("prep file = " + meta.ccfxPrep)
            print("conv file = " + meta.filterConv)

        # Context managers close the files even if reading raises.
        with open(meta.ccfxPrep, 'r') as prepHandler:
            prep = prepHandler.readlines()
        with open(meta.filterConv, 'r') as convHandler:
            conv = convHandler.readlines()

        input2orig = {}
        pidx2orig = {}
        origline2op = {}

        # build a map of line numbers in ccfx_input to filtered diff line
        for i, cline in enumerate(conv):
            if i < 2:
                # The first two lines of the line-map file are never parsed.
                continue
            if cline.rstrip().startswith('"'):
                # a quoted line is a file name, not a mapping row
                continue
            dstIdx, srcIdx, op, changId = cline.split(',')
            input2orig[int(dstIdx)] = int(srcIdx)
            origline2op[int(srcIdx)] = op

        for pidx, pline in enumerate(prep):
            # The text before the first '.' is a hexadecimal input line number.
            inputIdx = int(pline.partition(".")[0], 16)
            # ccfx numbers from 1, but pidx is from 0
            pidx2orig[pidx + 1] = input2orig.get(inputIdx, -1)

        meta.prepIdx2OrigIdx = pidx2orig
        meta.line2op = origline2op

    ccfx_out_path = pb.getCCFXOutputPath() + pb.getCCFXOutputFileName(
        lang, is_new, is_tmp=False)
    ccfx_out = RepertoireOutput()
    ccfx_out.loadFromFile(ccfx_out_path)

    files = {}
    for fileIdx, path in ccfx_out.getFileIter():
        print(fileIdx)
        print(path)
        if not metaDB.hasInputPath(path):
            raise Exception(
                "Couldn't find meta information for file: {0}".format(path))
        print(">>>>>>> " + path)
        meta = metaDB.getMetaForPath(path)
        files[fileIdx] = meta.filterOutput

    clones = {}
    for cloneIdx, (clone1, clone2) in ccfx_out.getCloneIter():
        fidx1, start1, end1 = clone1
        fidx2, start2, end2 = clone2

        meta1 = metaDB.getMetaForPath(ccfx_out.getFilePath(fidx1))
        meta2 = metaDB.getMetaForPath(ccfx_out.getFilePath(fidx2))

        # Translate prep indices to original line numbers (-1 if unmapped).
        # NOTE(review): unmapped (-1) endpoints are not filtered out here.
        start1 = meta1.prepIdx2OrigIdx.get(start1 + 1, -1)
        end1 = meta1.prepIdx2OrigIdx.get(end1, -1)
        start2 = meta2.prepIdx2OrigIdx.get(start2 + 1, -1)
        end2 = meta2.prepIdx2OrigIdx.get(end2, -1)

        # One (line, op) entry per line of the span; "X" marks lines with
        # no recorded operation.
        op1 = [(i, meta1.line2op.get(i, "X"))
               for i in range(start1, end1 + 1)]
        op2 = [(i, meta2.line2op.get(i, "X"))
               for i in range(start2, end2 + 1)]

        clone1 = (fidx1, start1, end1, op1)
        clone2 = (fidx2, start2, end2, op2)
        # Store the side with the smaller file index first.
        if clone1[0] < clone2[0]:
            clones[cloneIdx] = (clone1, clone2)
        else:
            clones[cloneIdx] = (clone2, clone1)

    rep_out = RepertoireOutput()
    rep_out.loadFromData(files, clones)
    return rep_out
def convert_ccfx_output(pb, proj, lang, is_new):
    """Translate one project's ccfx clone output into filter-output line numbers.

    A ``CCFXMetaData`` is registered for every file in the project's filter
    output directory. For each of them the ccfx prep file and the line-map
    ("conv") file are read to build ``prepIdx2OrigIdx`` (1-based prep line ->
    original line, -1 when unknown) and ``line2op`` (original line ->
    operation string). The ccfx output is then loaded and each clone's
    start/end is translated through those maps, with a list of ``(line, op)``
    tuples attached to each side.

    Args:
        pb: path builder supplying all directory and file names.
        proj: project identifier passed through to ``pb``.
        lang: language identifier passed through to ``pb``.
        is_new: flag passed through to ``pb``.

    Returns:
        A ``RepertoireOutput`` loaded from the translated files and clones.

    Raises:
        Exception: if the ccfx output names a file with no meta information.
    """
    # maps from ccfx input paths to meta objects representing the files
    metaDB = CCFXMetaMapping()

    filter_path = pb.getFilterOutputPath(proj, lang)
    conv_path = pb.getLineMapPath(proj, lang, is_new)
    ccfx_i_path = pb.getCCFXInputPath(proj, lang, is_new)
    ccfx_p_path = pb.getCCXFPrepPath(proj, lang, is_new)
    print("filter_path = " + filter_path)
    print("conv_path = " + conv_path)
    print("ccfx_i_path = " + ccfx_i_path)
    print("ccfx_p_path = " + ccfx_p_path)

    for name in os.listdir(filter_path):
        meta = CCFXMetaData(
            ccfx_i_path + name,
            ccfx_p_path + pb.findPrepFileFor(ccfx_p_path, name),
            conv_path + pb.makeLineMapFileName(name),
            filter_path + name)
        metaDB.addFile(meta)
    print(metaDB)

    # we have our files, now map line numbers in the prep files to input files
    for meta in metaDB.getMetas():
        # NOTE(review): the message is emitted when DEBUG is exactly False,
        # which looks inverted; behavior kept, confirm the intent.
        if config.DEBUG is False:
            print("prep file = " + meta.ccfxPrep)
            print("conv file = " + meta.filterConv)

        # `with` guarantees the handles are closed even when reading fails.
        with open(meta.ccfxPrep, 'r') as prepHandler:
            prep = prepHandler.readlines()
        with open(meta.filterConv, 'r') as convHandler:
            conv = convHandler.readlines()

        input2orig = {}
        pidx2orig = {}
        origline2op = {}

        # build a map of line numbers in ccfx_input to filtered diff line
        for i, cline in enumerate(conv):
            if i < 2:
                # The first two lines of the line-map file are never parsed.
                continue
            if cline.rstrip().startswith('"'):
                # a quoted line is a file name, not a mapping row
                continue
            dstIdx, srcIdx, op, changId = cline.split(',')
            input2orig[int(dstIdx)] = int(srcIdx)
            origline2op[int(srcIdx)] = op

        for pidx, pline in enumerate(prep):
            # The text before the first '.' is a hexadecimal input line number.
            inputIdx = int(pline.partition(".")[0], 16)
            # ccfx numbers from 1, but pidx is from 0
            pidx2orig[pidx + 1] = input2orig.get(inputIdx, -1)

        meta.prepIdx2OrigIdx = pidx2orig
        meta.line2op = origline2op

    ccfx_out_path = pb.getCCFXOutputPath() + pb.getCCFXOutputFileName(
        lang, is_new, is_tmp=False)
    ccfx_out = RepertoireOutput()
    ccfx_out.loadFromFile(ccfx_out_path)

    files = {}
    for fileIdx, path in ccfx_out.getFileIter():
        print(fileIdx)
        print(path)
        if not metaDB.hasInputPath(path):
            raise Exception(
                "Couldn't find meta information for file: {0}".format(path))
        print(">>>>>>> " + path)
        meta = metaDB.getMetaForPath(path)
        files[fileIdx] = meta.filterOutput

    clones = {}
    for cloneIdx, (clone1, clone2) in ccfx_out.getCloneIter():
        fidx1, start1, end1 = clone1
        fidx2, start2, end2 = clone2

        meta1 = metaDB.getMetaForPath(ccfx_out.getFilePath(fidx1))
        meta2 = metaDB.getMetaForPath(ccfx_out.getFilePath(fidx2))

        # Translate prep indices to original line numbers (-1 if unmapped).
        # NOTE(review): unmapped (-1) endpoints are not filtered out here.
        start1 = meta1.prepIdx2OrigIdx.get(start1 + 1, -1)
        end1 = meta1.prepIdx2OrigIdx.get(end1, -1)
        start2 = meta2.prepIdx2OrigIdx.get(start2 + 1, -1)
        end2 = meta2.prepIdx2OrigIdx.get(end2, -1)

        # One (line, op) entry per line of the span; "X" marks lines with
        # no recorded operation.
        op1 = []
        for i in range(start1, end1 + 1):
            op1.append((i, meta1.line2op.get(i, "X")))
        op2 = []
        for i in range(start2, end2 + 1):
            op2.append((i, meta2.line2op.get(i, "X")))

        clone1 = (fidx1, start1, end1, op1)
        clone2 = (fidx2, start2, end2, op2)
        # Store the side with the smaller file index first.
        if clone1[0] < clone2[0]:
            clone = (clone1, clone2)
        else:
            clone = (clone2, clone1)
        clones[cloneIdx] = clone

    rep_out = RepertoireOutput()
    rep_out.loadFromData(files, clones)
    return rep_out
def convert_ccfx_output(pb, lang, is_new, debug = False):
    """Translate ccfx clone output for both projects into filter-output hunks.

    Steps:
      1. Register a ``CCFXMetaData`` for every file in the filter output
         directory of ``PathBuilder.PROJ0`` and ``PathBuilder.PROJ1``.
      2. For each registered file, read its ccfx prep file and its line-map
         ("conv") file and build ``prepIdx2OrigIdx`` (0-based prep line ->
         original line, -1 when unknown) and ``line2op`` (original line ->
         operation string). One extra entry past the last mapped line is
         added with operation ``"NOCHANGE"``.
      3. Load the ccfx output, translate each clone's start/end through those
         maps, append an ``Operation`` per line to each side, order the pair
         by file index, and split it with ``split_clone_into_hunks``.

    Args:
        pb: path builder supplying all directory and file names.
        lang: language identifier passed through to ``pb``.
        is_new: flag passed through to ``pb``.
        debug: when true, print progress and translation failures.

    Returns:
        A ``RepertoireOutput`` loaded from the translated files and the
        resulting clone pairs, keyed 0..n-1 in insertion order.

    Raises:
        Exception: if the ccfx output names a file with no meta information.
    """
    # maps from ccfx input paths to meta objects representing the files
    metaDB = CCFXMetaMapping()
    for proj in [PathBuilder.PROJ0, PathBuilder.PROJ1]:
        filter_path = pb.getFilterOutputPath(proj, lang)
        conv_path = pb.getLineMapPath(proj, lang, is_new)
        ccfx_i_path = pb.getCCFXInputPath(proj, lang, is_new)
        ccfx_p_path = pb.getCCXFPrepPath(proj, lang, is_new)
        for name in os.listdir(filter_path):
            meta = CCFXMetaData(
                ccfx_i_path + name,
                ccfx_p_path + pb.findPrepFileFor(ccfx_p_path, name),
                conv_path + pb.makeLineMapFileName(name),
                filter_path + name)
            metaDB.addFile(meta)

    # we have our files, now map line numbers in the prep files to input files
    for meta in metaDB.getMetas():
        if config.DEBUG is True:
            print("prep file = " + meta.ccfxPrep)
            print("conv file = " + meta.filterConv)

        # Context managers close the files even if reading raises.
        with open(meta.ccfxPrep, 'r') as prepHandler:
            prep = prepHandler.readlines()
        with open(meta.filterConv, 'r') as convHandler:
            conv = convHandler.readlines()

        input2orig = {}
        pidx2orig = {}
        origline2op = {}

        # build a map of line numbers in ccfx_input to filtered diff line
        last_dst = last_src = 0
        for i, cline in enumerate(conv):
            if i < 2:
                # The first two lines of the line-map file are never parsed.
                continue
            if cline.rstrip().startswith('"'):
                # a quoted line is a file name, not a mapping row
                continue
            dstIdx, srcIdx, op, changId = cline.split(',')
            dst = int(dstIdx)
            src = int(srcIdx)
            input2orig[dst] = src
            origline2op[src] = op
            last_dst = dst + 1
            last_src = src + 1

        # ccfx cares about the end of file, which isn't represented by our
        # mappings
        input2orig[last_dst] = last_src
        origline2op[last_src] = "NOCHANGE"

        for pidx, pline in enumerate(prep):
            # The text before the first '.' is a hexadecimal input line number.
            inputIdx = int(pline.partition(".")[0], 16)
            # ccfx output has numbers like 0-131, meaning that pidx
            # is meant to be taken from 0
            origIdx = input2orig.get(inputIdx, -1)
            pidx2orig[pidx] = origIdx
            if debug and origIdx == -1:
                print("failed to translate from pidx to original: {0} -> {1}".format(pidx, inputIdx))
                print(" file: " + meta.ccfxInput)

        meta.prepIdx2OrigIdx = pidx2orig
        meta.line2op = origline2op

    ccfx_out_path = pb.getCCFXOutputPath() + pb.getCCFXOutputFileName(
        lang, is_new, is_tmp = False)
    ccfx_out = RepertoireOutput()
    if debug:
        print('loading from ccfx output file: {0}'.format(ccfx_out_path))
    ccfx_out.loadFromFile(ccfx_out_path)
    if debug:
        print("finished loading ccfx output.")

    files = {}
    for fileIdx, path in ccfx_out.getFileIter():
        if not metaDB.hasInputPath(path):
            raise Exception(
                "Couldn't find meta information for file: {0}".format(path))
        meta = metaDB.getMetaForPath(path)
        files[fileIdx] = meta.filterOutput

    clones = {}
    # rewrite the line numbers to index into filter_output files
    for clone_idx, clone_pair in ccfx_out.getCloneIter():
        fidx1, start1, end1, op1 = clone_pair.clone1
        fidx2, start2, end2, op2 = clone_pair.clone2
        metric = clone_pair.metric

        meta1 = metaDB.getMetaForPath(ccfx_out.getFilePath(fidx1))
        meta2 = metaDB.getMetaForPath(ccfx_out.getFilePath(fidx2))

        start1 = meta1.prepIdx2OrigIdx.get(start1 + 1, -1)
        end1 = meta1.prepIdx2OrigIdx.get(end1, -1)
        start2 = meta2.prepIdx2OrigIdx.get(start2 + 1, -1)
        end2 = meta2.prepIdx2OrigIdx.get(end2, -1)

        if start1 == -1 or start2 == -1 or end1 == -1 or end2 == -1:
            if debug:
                print('line translation failed for ' + str(clone_pair))
            # don't even try to translate a clone with bad indices
            # this usually means we somehow dumped an empty file on
            # ccfx and we can't translate the eof token correctly
            # enabling debug should verify this
            continue

        # op1/op2 are the lists unpacked from the loaded clone and are
        # extended in place; "X" marks lines with no recorded operation.
        for i in range(start1, end1 + 1):
            op1.append(Operation(i, meta1.line2op.get(i, "X")))
        for i in range(start2, end2 + 1):
            op2.append(Operation(i, meta2.line2op.get(i, "X")))

        clone1 = Clone(fidx1, start1, end1, op1)
        clone2 = Clone(fidx2, start2, end2, op2)
        # Store the side with the smaller file index first.
        if clone1.fidx < clone2.fidx:
            unsplit_clone = ClonePair(clone1, clone2, metric)
        else:
            unsplit_clone = ClonePair(clone2, clone1, metric)

        # split into hunks, add those hunks into our final output
        # (a separate loop name avoids shadowing the outer `clone_pair`)
        for hunk_pair in split_clone_into_hunks(unsplit_clone, debug):
            clones[len(clones)] = hunk_pair

    rep_out = RepertoireOutput()
    rep_out.loadFromData(files, clones)
    return rep_out
meta.prepIdx2OrigIdx = pidx2orig meta.line2op = origline2op print 'map line number finished' # write all keys in metaDB to a new file metaDB_key_file = 'metaDB_key_file.txt' with open(metaDB_key_file, 'w') as f: for key in metaDB.name2meta.keys(): f.write(key+'\n') print 'write key finished' ccfx_out_path = 'cross_file.txt' ccfx_out = RepertoireOutput() print 'load clone data from file starts...' ccfx_out.loadFromFile(ccfx_out_path) print 'load clone data from file ends...' print 'parsed files: ', len(ccfx_out.files) print 'parsed clones: ', len(ccfx_out.clones) files = {} for fileIdx, path in ccfx_out.getFileIter(): # print fileIdx # print path if not metaDB.hasInputPath(path): print "Couldn't find meta information for file: ", path sys.exit(-1) meta = metaDB.getMetaForPath(path) files[fileIdx] = meta.filterOutput