# Assumed module-level dependencies for the readers below (not shown in this excerpt):
# os, json, re, xml.etree.ElementTree as ET, plus DATA_DIR, ArtifactPair, LinkSet and
# Dataset from the project itself.
def readData(self):
    file_path = os.path.join(DATA_DIR, "dronology", "dronologydataset.json")
    with open(file_path, encoding='utf8') as fin:
        content = fin.read()
    jsonObj = json.loads(content)

    re_dict = dict()   # requirement id -> text
    dd_dict = dict()   # design-definition id -> text
    links = []
    for entry in jsonObj["entries"]:
        id = entry["issueid"]
        attrib_dict = entry["attributes"]
        issueType = attrib_dict['issuetype']
        summary = attrib_dict["summary"].lower()
        description = attrib_dict["description"].lower()
        if issueType == "Design Definition":
            dd_dict[id] = summary + "," + description
        if issueType == "Requirement":
            re_dict[id] = summary + "," + description
        if "children" in entry:
            children_dict = entry["children"]
            if "refinedby" in children_dict:
                refined_by_list = children_dict["refinedby"]
                for dd in refined_by_list:
                    if dd.startswith("DD"):
                        links.append((id, dd))

    artif_pair = ArtifactPair(re_dict, "re", dd_dict, "dd")
    # Sanity check: report link endpoints missing from the artifact dictionaries.
    for link in links:
        if link[0] not in re_dict:
            print(link[0])
    for link in links:
        if link[1] not in dd_dict:
            print(link[1])
    return Dataset([LinkSet(artif_pair, links)])
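# A minimal sketch (not part of the original code) of the JSON entry shape the dronology
# reader above assumes, inferred purely from its parsing logic; the ids and texts below
# are hypothetical.
_example_dronology_entry = {
    "issueid": "RE-8",
    "attributes": {
        "issuetype": "Requirement",           # or "Design Definition"
        "summary": "Example summary",
        "description": "Example description",
    },
    "children": {
        "refinedby": ["DD-12", "DD-13"],      # only ids starting with "DD" become links
    },
}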
def __readData(self, issue_path, commit_path, link_path, do_filter=True):
    def all_english(content: str) -> bool:
        # True when every whitespace-separated token is a purely alphabetic word.
        def get_en(doc):
            pattern = re.compile("[a-zA-Z]+")
            res = pattern.findall(doc)
            return res

        return len(get_en(content)) == len(content.split())

    issues = dict()
    commits = dict()
    issue_close_time_dict = dict()
    commit_time_dict = dict()
    MIN_DOC_SIZE = 15
    filtered_issued = 0
    filtered_commit = 0

    with open(issue_path, encoding='utf8') as fin:
        for i, line in enumerate(fin):
            if i == 0:  # skip the header row
                continue
            id, content, close_time = line.strip("\n\t\r").split(",")
            if (len(content.split()) < MIN_DOC_SIZE) and do_filter:
                filtered_issued += 1
                continue
            issues[id] = content
            issue_close_time_dict[id] = close_time
    # print("{} issues are filtered with minimal length {}, {} remain...".format(filtered_issued, MIN_DOC_SIZE,
    #                                                                            len(issues)))

    with open(commit_path, encoding='utf8') as fin:
        for i, line in enumerate(fin):
            if i == 0:
                continue
            id, summary, content, commit_time = line.strip("\n\t\r").split(",")
            commit_content = summary + content
            if (len(commit_content.split()) < MIN_DOC_SIZE) and do_filter:
                filtered_commit += 1
                continue
            commits[id] = commit_content
            commit_time_dict[id] = commit_time
    # print("{} commits are filtered with minimal length {}, {} remain".format(filtered_commit, MIN_DOC_SIZE,
    #                                                                          len(commits)))

    artif_pair = ArtifactPair(issues, "issues", commits, "commits")
    links = []
    origin_link_cnt = 0
    with open(link_path) as fin:
        for i, line in enumerate(fin):
            if i == 0:
                continue
            origin_link_cnt = i
            issue_id, commit_id = line.split(",")
            issue_id = issue_id.strip("\n\t\r")
            commit_id = commit_id.strip("\n\t\r")
            # Drop gold links whose endpoints were filtered out above.
            if issue_id not in issues or commit_id not in commits:
                continue
            link = (issue_id, commit_id)
            links.append(link)
    # print("Link size:{}/{}".format(len(links), origin_link_cnt))
    link_set = LinkSet(artif_pair, links)
    return Dataset([link_set])
def readData(self): data_dir_path = os.path.join(DATA_DIR, "maven") artifact_bug = self.read_csv(os.path.join(data_dir_path, "bug.csv"), 0, 3) artifact_commit = self.read_csv(os.path.join(data_dir_path, "commits.csv"), 0, 2) artifact_improvement = self.read_csv(os.path.join(data_dir_path, "improvement.csv"), 0, 3) artifact_code = self.__read_code(data_dir_path) bug_commit_links = self.read_link(os.path.join(data_dir_path, "bugCommitLinks.csv")) commit_code_links = self.read_link(os.path.join(data_dir_path, "CommitCodeLinks.csv")) improvement_commit_links = self.read_link(os.path.join(data_dir_path, "improvementCommitLinks.csv")) bug_commit_pair = ArtifactPair(artifact_bug, "bug", artifact_commit, "commit") commit_code_pair = ArtifactPair(artifact_commit, "commit", artifact_code, "code") improvement_commit_pair = ArtifactPair(artifact_improvement, "improvement", artifact_commit, "commit") bug_commit_set = LinkSet(bug_commit_pair, bug_commit_links) commit_code_set = LinkSet(commit_code_pair, commit_code_links) improvement_commit_set = LinkSet(improvement_commit_pair, improvement_commit_links) link_sets = [bug_commit_set, commit_code_set, improvement_commit_set] return Dataset(link_sets)
def limit_artifacts_in_links(self, dataset: Dataset, origin_dataset: Dataset):
    """
    Synchronize the artifacts and links of a translated dataset with the original multilingual dataset.
    :param dataset: the translated dataset to restrict
    :param origin_dataset: the original (untranslated) dataset
    :return: the restricted Dataset and a summary string
    """
    modified_link_sets = []
    data_set_infos = []
    for linkset_id in dataset.gold_link_sets:
        link_set: LinkSet = dataset.gold_link_sets[linkset_id]
        source_dict: dict = link_set.artiPair.source_artif
        target_dict: dict = link_set.artiPair.target_artif
        links = link_set.links
        gold_artif_set = set()

        # Fix for the bug when reading translated data: the translated data contain no Chinese at all.
        origin_source_dict = origin_dataset.gold_link_sets[linkset_id].artiPair.source_artif
        origin_target_dict = origin_dataset.gold_link_sets[linkset_id].artiPair.target_artif
        # links = [x for x in links if (self.isIL(origin_source_dict[x[0]], origin_target_dict[x[1]]))]
        print("links = {}".format(len(links)))

        for (s, t) in links:
            gold_artif_set.add(s)
            gold_artif_set.add(t)

        limited_source_dict = dict()
        for s_art in source_dict.keys():
            if s_art in gold_artif_set:
                limited_source_dict[s_art] = source_dict[s_art]

        limited_target_dict = dict()
        for t_art in target_dict.keys():
            if t_art in gold_artif_set:
                limited_target_dict[t_art] = target_dict[t_art]

        modified_artif_pair = ArtifactPair(limited_source_dict, link_set.artiPair.source_name,
                                           limited_target_dict, link_set.artiPair.target_name)
        # Keep the extra information
        modified_artif_pair.source_artif_extra_info = link_set.artiPair.source_artif_extra_info
        modified_artif_pair.target_artif_extra_info = link_set.artiPair.target_artif_extra_info
        modified_link_sets.append(LinkSet(modified_artif_pair, links))

        issue_num = len(modified_artif_pair.source_artif)
        commit_num = len(modified_artif_pair.target_artif)
        issue_commit_info = "{} issues and {} commits remain after limiting artifacts to links...".format(
            issue_num, commit_num)
        data_set_infos.append(issue_commit_info)
        # print(issue_commit_info)
        candidate_num = issue_num * commit_num
        base_accuracy = 0
        if candidate_num > 0:
            base_accuracy = len(links) / candidate_num
        # print("Baseline accuracy is {}/{} = {}".format(len(links), candidate_num, base_accuracy))
    return Dataset(modified_link_sets), "\n".join(data_set_infos)
def readData(self):
    source_path = os.path.join(DATA_DIR, "cm1", "CM1-sourceArtifacts.xml")
    target_path = os.path.join(DATA_DIR, "cm1", "CM1-targetArtifacts.xml")
    answer_path = os.path.join(DATA_DIR, "cm1", "CM1-answerSet.xml")

    sourceArtifact = self.__read_artifact(source_path)
    targetArtifact = self.__read_artifact(target_path)

    links = set()
    tree = ET.parse(answer_path)
    root = tree.getroot()
    for artifact in root.iter('link'):
        source_id = artifact.find("source_artifact_id").text
        target_id = artifact.find("target_artifact_id").text
        links.add((source_id, target_id))

    artif_pair = ArtifactPair(sourceArtifact, "cm1Source", targetArtifact, "cm1Target")
    link_set = LinkSet(artif_pair, links)
    return Dataset([link_set])
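# Sketch (an assumption, inferred only from the ElementTree queries above) of the CM1
# answer-set XML the reader consumes: a sequence of <link> elements, each holding a
# <source_artifact_id> and a <target_artifact_id>; the ids below are hypothetical.
#
#   <link>
#     <source_artifact_id>SRC-1</source_artifact_id>
#     <target_artifact_id>TGT-7</target_artifact_id>
#   </link>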
def limit_artifacts_in_links(self, dataset: Dataset):
    """
    Remove the artifacts that do not appear in the gold links.
    :param dataset:
    :return: the restricted Dataset and a summary string
    """
    modified_link_sets = []
    data_set_infos = []
    for linkset_id in dataset.gold_link_sets:
        link_set: LinkSet = dataset.gold_link_sets[linkset_id]
        source_dict: dict = link_set.artiPair.source_artif
        target_dict: dict = link_set.artiPair.target_artif
        links = link_set.links

        gold_artif_set = set()
        for (s, t) in links:
            gold_artif_set.add(s)
            gold_artif_set.add(t)

        limited_source_dict = dict()
        for s_art in source_dict.keys():
            if s_art in gold_artif_set:
                limited_source_dict[s_art] = source_dict[s_art]

        limited_target_dict = dict()
        for t_art in target_dict.keys():
            if t_art in gold_artif_set:
                limited_target_dict[t_art] = target_dict[t_art]

        modified_artif_pair = ArtifactPair(limited_source_dict, link_set.artiPair.source_name,
                                           limited_target_dict, link_set.artiPair.target_name)
        # Keep the extra information
        modified_link_sets.append(LinkSet(modified_artif_pair, links))

        issue_num = len(modified_artif_pair.source_artif)
        commit_num = len(modified_artif_pair.target_artif)
        issue_commit_info = "{} issues and {} commits remain after limiting artifacts to links...".format(
            issue_num, commit_num)
        data_set_infos.append(issue_commit_info)
        # print(issue_commit_info)
        candidate_num = issue_num * commit_num
        base_accuracy = 0
        if candidate_num > 0:
            base_accuracy = len(links) / candidate_num
        # print("Baseline accuracy is {}/{} = {}".format(len(links), candidate_num, base_accuracy))
    return Dataset(modified_link_sets), "\n".join(data_set_infos)
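# Hypothetical usage sketch (the reader class and variable names are illustrative, not
# from the original code): any of the readData() methods above yields a Dataset, which
# limit_artifacts_in_links() can then prune before computing baseline statistics.
#
#   reader = SomeProjectReader()                       # hypothetical reader class
#   dataset = reader.readData()
#   pruned_dataset, info = reader.limit_artifacts_in_links(dataset)
#   print(info)                                        # per-link-set artifact counts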
def readData(self): data_dir_path = os.path.join(DATA_DIR, "EasyClinicDataset", "2 - docs (English)") aritfacts_dirs = os.listdir(data_dir_path) arti_name_convert = {"1 - use cases": "UC", "2 - Interaction diagrams": "ID", "3 - test cases": "TC", "4 - class description": "CC"} artifact_dict = dict() link_sets = [] for artifact_dir in aritfacts_dirs: artif_code = arti_name_convert[artifact_dir] artifact_dir = os.path.join(data_dir_path, artifact_dir) tmp_dict = dict() for artifact_file in os.listdir(artifact_dir): file_path = os.path.join(data_dir_path, artifact_dir, artifact_file) with open(file_path, encoding='utf8', errors="ignore") as fin: content = fin.read() id = artifact_file.replace(".txt", "") tmp_dict[id] = content artifact_dict[artif_code] = tmp_dict oracle_dir = os.path.join(data_dir_path, "..", "oracle") for link_file in os.listdir(oracle_dir): artif_type_codes = link_file.strip(".txt").split("_") source_code = artif_type_codes[0] target_code = artif_type_codes[1] links = [] with open(os.path.join(oracle_dir, link_file)) as fin: for line in fin: parts = line.split(":") from_artif = parts[0] to_artifs = parts[1].split() for to_artif in to_artifs: from_artif_id = from_artif.strip(".txt") to_artif_id = to_artif.strip(".txt") links.append((from_artif_id, to_artif_id)) artif_pair = ArtifactPair(artifact_dict[source_code], source_code, artifact_dict[target_code], target_code) link_set = LinkSet(artif_pair, links) link_sets.append(link_set) return Dataset(link_sets)
def __readData(self, issue_path, commit_path, link_path, min_doc_length=3, do_filter=True):
    def all_english(content: str) -> bool:
        # True when every whitespace-separated token is purely alphanumeric ASCII.
        def get_en(doc):
            pattern = re.compile("[a-zA-Z0-9]+")
            res = pattern.findall(doc)
            return res

        return len(get_en(content)) == len(content.split())

    issues = dict()
    commits = dict()
    issue_close_time_dict = dict()
    issue_create_time_dict = dict()
    commit_time_dict = dict()
    filtered_issued = 0
    filtered_commit = 0

    with open(issue_path, encoding='utf8') as fin:
        for i, line in enumerate(fin):
            if i == 0:  # skip the header row
                continue
            id, content, close_time, create_time = line.strip("\n\t\r").split(",")
            if len(content.split()) < min_doc_length:
                filtered_issued += 1
                continue
            issues[id] = content
            issue_close_time_dict[id] = close_time
            issue_create_time_dict[id] = create_time

    with open(commit_path, encoding='utf8') as fin:
        for i, line in enumerate(fin):
            if i == 0:
                continue
            id, summary, content, commit_time = line.strip("\n\t\r").split(",")
            commit_content = summary + content
            if len(commit_content.split()) < min_doc_length:
                filtered_commit += 1
                continue
            commits[id] = commit_content
            commit_time_dict[id] = commit_time

    artif_pair = ArtifactPair(issues, "issues", commits, "commits")
    issue_time_info = {"create": issue_create_time_dict, "close": issue_close_time_dict}
    commit_time_info = {"create": commit_time_dict}
    artif_pair.source_artif_extra_info = issue_time_info
    artif_pair.target_artif_extra_info = commit_time_info

    links = []
    origin_link_cnt = 0
    with open(link_path) as fin:
        for i, line in enumerate(fin):
            if i == 0:
                continue
            origin_link_cnt = i
            issue_id, commit_id = line.split(",")
            issue_id = issue_id.strip("\n\t\r")
            commit_id = commit_id.strip("\n\t\r")
            if do_filter:
                if issue_id not in issues or commit_id not in commits:
                    continue
                issue_content = issues[issue_id]
                commit_content = commits[commit_id]
                # Keep only links where at least one side contains non-English text.
                if all_english(issue_content) and all_english(commit_content):
                    continue
            link = (issue_id, commit_id)
            links.append(link)
    print("Link size:{}/{}".format(len(links), origin_link_cnt))
    link_set = LinkSet(artif_pair, links)
    return Dataset([link_set])
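# A minimal sketch (an assumption, inferred only from the split(",") calls above) of the
# CSV layouts the two __readData variants expect; the first row is a header and fields
# must not themselves contain commas, since lines are split naively on ",":
#
#   issue CSV  : id,content,close_time[,create_time]
#   commit CSV : id,summary,content,commit_time
#   link CSV   : issue_id,commit_id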